{ "best_metric": 0.85768563, "best_model_checkpoint": "/home/ubuntu/s2/output_qwen72B_2_epochs_b/qwen2_5-72b/v6-20250307-183758/checkpoint-38500", "epoch": 0.9766631415892022, "eval_steps": 500, "global_step": 38500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.7252503, "epoch": 2.5367873807511746e-05, "grad_norm": 4.875, "learning_rate": 2.536783358701167e-09, "loss": 1.10256255, "memory(GiB)": 684.38, "step": 1, "train_speed(iter/s)": 0.033782 }, { "acc": 0.70898664, "epoch": 0.00012683936903755871, "grad_norm": 5.03125, "learning_rate": 1.2683916793505834e-08, "loss": 1.20267498, "memory(GiB)": 688.22, "step": 5, "train_speed(iter/s)": 0.059465 }, { "acc": 0.69412413, "epoch": 0.00025367873807511743, "grad_norm": 4.21875, "learning_rate": 2.536783358701167e-08, "loss": 1.26536179, "memory(GiB)": 688.24, "step": 10, "train_speed(iter/s)": 0.066837 }, { "acc": 0.68325424, "epoch": 0.00038051810711267614, "grad_norm": 4.90625, "learning_rate": 3.80517503805175e-08, "loss": 1.34663887, "memory(GiB)": 690.83, "step": 15, "train_speed(iter/s)": 0.070364 }, { "acc": 0.70039992, "epoch": 0.0005073574761502349, "grad_norm": 4.875, "learning_rate": 5.073566717402334e-08, "loss": 1.23108244, "memory(GiB)": 703.19, "step": 20, "train_speed(iter/s)": 0.070125 }, { "acc": 0.69260392, "epoch": 0.0006341968451877936, "grad_norm": 5.21875, "learning_rate": 6.341958396752917e-08, "loss": 1.29314003, "memory(GiB)": 703.19, "step": 25, "train_speed(iter/s)": 0.072523 }, { "acc": 0.70214748, "epoch": 0.0007610362142253523, "grad_norm": 6.21875, "learning_rate": 7.6103500761035e-08, "loss": 1.27978735, "memory(GiB)": 703.21, "step": 30, "train_speed(iter/s)": 0.072655 }, { "acc": 0.69119005, "epoch": 0.000887875583262911, "grad_norm": 5.3125, "learning_rate": 8.878741755454084e-08, "loss": 1.28390646, "memory(GiB)": 703.21, "step": 35, "train_speed(iter/s)": 0.073113 }, { "acc": 0.7086309, "epoch": 0.0010147149523004697, "grad_norm": 5.03125, "learning_rate": 1.0147133434804667e-07, "loss": 1.25399752, "memory(GiB)": 703.21, "step": 40, "train_speed(iter/s)": 0.072843 }, { "acc": 0.70717978, "epoch": 0.0011415543213380286, "grad_norm": 5.125, "learning_rate": 1.1415525114155251e-07, "loss": 1.26872826, "memory(GiB)": 703.21, "step": 45, "train_speed(iter/s)": 0.073419 }, { "acc": 0.70300112, "epoch": 0.0012683936903755872, "grad_norm": 4.65625, "learning_rate": 1.2683916793505834e-07, "loss": 1.26245632, "memory(GiB)": 703.21, "step": 50, "train_speed(iter/s)": 0.073785 }, { "acc": 0.69998479, "epoch": 0.001395233059413146, "grad_norm": 5.59375, "learning_rate": 1.3952308472856418e-07, "loss": 1.34806995, "memory(GiB)": 703.21, "step": 55, "train_speed(iter/s)": 0.074101 }, { "acc": 0.6732316, "epoch": 0.0015220724284507046, "grad_norm": 5.21875, "learning_rate": 1.5220700152207e-07, "loss": 1.33423166, "memory(GiB)": 703.21, "step": 60, "train_speed(iter/s)": 0.073939 }, { "acc": 0.69136062, "epoch": 0.0016489117974882634, "grad_norm": 4.875, "learning_rate": 1.6489091831557585e-07, "loss": 1.29400597, "memory(GiB)": 703.21, "step": 65, "train_speed(iter/s)": 0.074087 }, { "acc": 0.70776587, "epoch": 0.001775751166525822, "grad_norm": 4.4375, "learning_rate": 1.7757483510908168e-07, "loss": 1.25748501, "memory(GiB)": 703.21, "step": 70, "train_speed(iter/s)": 0.074644 }, { "acc": 0.69060044, "epoch": 0.0019025905355633808, "grad_norm": 4.53125, "learning_rate": 1.9025875190258752e-07, "loss": 1.30145187, "memory(GiB)": 703.21, "step": 75, "train_speed(iter/s)": 0.074318 }, { "acc": 0.71234188, "epoch": 0.0020294299046009394, "grad_norm": 11.25, "learning_rate": 2.0294266869609335e-07, "loss": 1.19062223, "memory(GiB)": 703.21, "step": 80, "train_speed(iter/s)": 0.073952 }, { "acc": 0.6821022, "epoch": 0.0021562692736384983, "grad_norm": 4.84375, "learning_rate": 2.1562658548959918e-07, "loss": 1.36011, "memory(GiB)": 703.21, "step": 85, "train_speed(iter/s)": 0.07419 }, { "acc": 0.69667435, "epoch": 0.002283108642676057, "grad_norm": 5.0, "learning_rate": 2.2831050228310502e-07, "loss": 1.25699472, "memory(GiB)": 703.21, "step": 90, "train_speed(iter/s)": 0.074315 }, { "acc": 0.68051744, "epoch": 0.0024099480117136156, "grad_norm": 4.21875, "learning_rate": 2.409944190766109e-07, "loss": 1.37008066, "memory(GiB)": 703.21, "step": 95, "train_speed(iter/s)": 0.074389 }, { "acc": 0.69806561, "epoch": 0.0025367873807511745, "grad_norm": 5.03125, "learning_rate": 2.536783358701167e-07, "loss": 1.24697285, "memory(GiB)": 703.21, "step": 100, "train_speed(iter/s)": 0.074303 }, { "acc": 0.68684988, "epoch": 0.0026636267497887334, "grad_norm": 4.78125, "learning_rate": 2.6636225266362255e-07, "loss": 1.30004454, "memory(GiB)": 703.21, "step": 105, "train_speed(iter/s)": 0.074595 }, { "acc": 0.71229615, "epoch": 0.002790466118826292, "grad_norm": 3.875, "learning_rate": 2.7904616945712836e-07, "loss": 1.22484961, "memory(GiB)": 703.21, "step": 110, "train_speed(iter/s)": 0.074407 }, { "acc": 0.70204368, "epoch": 0.0029173054878638507, "grad_norm": 4.75, "learning_rate": 2.917300862506342e-07, "loss": 1.29973726, "memory(GiB)": 705.92, "step": 115, "train_speed(iter/s)": 0.074628 }, { "acc": 0.70110397, "epoch": 0.003044144856901409, "grad_norm": 4.28125, "learning_rate": 3.0441400304414e-07, "loss": 1.27111044, "memory(GiB)": 705.92, "step": 120, "train_speed(iter/s)": 0.074518 }, { "acc": 0.70062938, "epoch": 0.003170984225938968, "grad_norm": 5.90625, "learning_rate": 3.170979198376459e-07, "loss": 1.21185141, "memory(GiB)": 705.92, "step": 125, "train_speed(iter/s)": 0.07454 }, { "acc": 0.6938592, "epoch": 0.003297823594976527, "grad_norm": 4.875, "learning_rate": 3.297818366311517e-07, "loss": 1.2761632, "memory(GiB)": 705.92, "step": 130, "train_speed(iter/s)": 0.074596 }, { "acc": 0.71001849, "epoch": 0.0034246629640140853, "grad_norm": 4.28125, "learning_rate": 3.4246575342465755e-07, "loss": 1.22871561, "memory(GiB)": 705.92, "step": 135, "train_speed(iter/s)": 0.074544 }, { "acc": 0.6992981, "epoch": 0.003551502333051644, "grad_norm": 5.0625, "learning_rate": 3.5514967021816336e-07, "loss": 1.25103512, "memory(GiB)": 705.92, "step": 140, "train_speed(iter/s)": 0.074622 }, { "acc": 0.69908299, "epoch": 0.003678341702089203, "grad_norm": 5.09375, "learning_rate": 3.678335870116692e-07, "loss": 1.2659831, "memory(GiB)": 705.92, "step": 145, "train_speed(iter/s)": 0.074566 }, { "acc": 0.711797, "epoch": 0.0038051810711267615, "grad_norm": 6.09375, "learning_rate": 3.8051750380517503e-07, "loss": 1.24269857, "memory(GiB)": 705.92, "step": 150, "train_speed(iter/s)": 0.074525 }, { "acc": 0.69584384, "epoch": 0.00393202044016432, "grad_norm": 5.59375, "learning_rate": 3.932014205986809e-07, "loss": 1.24790506, "memory(GiB)": 705.92, "step": 155, "train_speed(iter/s)": 0.074813 }, { "acc": 0.70516539, "epoch": 0.004058859809201879, "grad_norm": 4.375, "learning_rate": 4.058853373921867e-07, "loss": 1.28211327, "memory(GiB)": 705.92, "step": 160, "train_speed(iter/s)": 0.074786 }, { "acc": 0.69989386, "epoch": 0.004185699178239438, "grad_norm": 5.875, "learning_rate": 4.1856925418569256e-07, "loss": 1.27263384, "memory(GiB)": 705.92, "step": 165, "train_speed(iter/s)": 0.074643 }, { "acc": 0.68829498, "epoch": 0.004312538547276997, "grad_norm": 5.03125, "learning_rate": 4.3125317097919837e-07, "loss": 1.29005661, "memory(GiB)": 705.92, "step": 170, "train_speed(iter/s)": 0.074562 }, { "acc": 0.69468913, "epoch": 0.004439377916314555, "grad_norm": 5.34375, "learning_rate": 4.4393708777270423e-07, "loss": 1.30272627, "memory(GiB)": 705.92, "step": 175, "train_speed(iter/s)": 0.074663 }, { "acc": 0.68903208, "epoch": 0.004566217285352114, "grad_norm": 5.125, "learning_rate": 4.5662100456621004e-07, "loss": 1.36622458, "memory(GiB)": 705.92, "step": 180, "train_speed(iter/s)": 0.074742 }, { "acc": 0.70514245, "epoch": 0.004693056654389673, "grad_norm": 5.65625, "learning_rate": 4.693049213597159e-07, "loss": 1.23907776, "memory(GiB)": 705.92, "step": 185, "train_speed(iter/s)": 0.074943 }, { "acc": 0.69755521, "epoch": 0.004819896023427231, "grad_norm": 4.0, "learning_rate": 4.819888381532218e-07, "loss": 1.22573414, "memory(GiB)": 705.92, "step": 190, "train_speed(iter/s)": 0.074929 }, { "acc": 0.70192838, "epoch": 0.0049467353924647905, "grad_norm": 6.1875, "learning_rate": 4.946727549467275e-07, "loss": 1.29649677, "memory(GiB)": 705.92, "step": 195, "train_speed(iter/s)": 0.074755 }, { "acc": 0.69601464, "epoch": 0.005073574761502349, "grad_norm": 4.8125, "learning_rate": 5.073566717402334e-07, "loss": 1.27929068, "memory(GiB)": 705.92, "step": 200, "train_speed(iter/s)": 0.074605 }, { "acc": 0.69486694, "epoch": 0.0052004141305399074, "grad_norm": 5.0625, "learning_rate": 5.200405885337392e-07, "loss": 1.2940012, "memory(GiB)": 705.92, "step": 205, "train_speed(iter/s)": 0.074496 }, { "acc": 0.70043459, "epoch": 0.005327253499577467, "grad_norm": 5.0, "learning_rate": 5.327245053272451e-07, "loss": 1.28847551, "memory(GiB)": 705.93, "step": 210, "train_speed(iter/s)": 0.074595 }, { "acc": 0.69270186, "epoch": 0.005454092868615025, "grad_norm": 5.0625, "learning_rate": 5.454084221207509e-07, "loss": 1.28030043, "memory(GiB)": 705.93, "step": 215, "train_speed(iter/s)": 0.074613 }, { "acc": 0.70389848, "epoch": 0.005580932237652584, "grad_norm": 4.625, "learning_rate": 5.580923389142567e-07, "loss": 1.20271158, "memory(GiB)": 705.93, "step": 220, "train_speed(iter/s)": 0.074538 }, { "acc": 0.70287404, "epoch": 0.005707771606690143, "grad_norm": 4.53125, "learning_rate": 5.707762557077626e-07, "loss": 1.21137466, "memory(GiB)": 705.93, "step": 225, "train_speed(iter/s)": 0.074462 }, { "acc": 0.70215244, "epoch": 0.005834610975727701, "grad_norm": 4.46875, "learning_rate": 5.834601725012684e-07, "loss": 1.27294455, "memory(GiB)": 705.93, "step": 230, "train_speed(iter/s)": 0.074478 }, { "acc": 0.70949712, "epoch": 0.00596145034476526, "grad_norm": 3.984375, "learning_rate": 5.961440892947743e-07, "loss": 1.20199633, "memory(GiB)": 705.93, "step": 235, "train_speed(iter/s)": 0.074453 }, { "acc": 0.69587889, "epoch": 0.006088289713802818, "grad_norm": 4.5625, "learning_rate": 6.0882800608828e-07, "loss": 1.31174927, "memory(GiB)": 705.93, "step": 240, "train_speed(iter/s)": 0.074366 }, { "acc": 0.70396123, "epoch": 0.006215129082840378, "grad_norm": 4.625, "learning_rate": 6.215119228817859e-07, "loss": 1.244203, "memory(GiB)": 705.93, "step": 245, "train_speed(iter/s)": 0.074313 }, { "acc": 0.69503036, "epoch": 0.006341968451877936, "grad_norm": 5.40625, "learning_rate": 6.341958396752918e-07, "loss": 1.33607941, "memory(GiB)": 705.93, "step": 250, "train_speed(iter/s)": 0.074337 }, { "acc": 0.67675338, "epoch": 0.0064688078209154945, "grad_norm": 4.34375, "learning_rate": 6.468797564687976e-07, "loss": 1.33707571, "memory(GiB)": 705.93, "step": 255, "train_speed(iter/s)": 0.07435 }, { "acc": 0.70271873, "epoch": 0.006595647189953054, "grad_norm": 4.875, "learning_rate": 6.595636732623034e-07, "loss": 1.25247669, "memory(GiB)": 705.93, "step": 260, "train_speed(iter/s)": 0.074375 }, { "acc": 0.71135869, "epoch": 0.006722486558990612, "grad_norm": 4.625, "learning_rate": 6.722475900558092e-07, "loss": 1.22958012, "memory(GiB)": 705.93, "step": 265, "train_speed(iter/s)": 0.074396 }, { "acc": 0.70565267, "epoch": 0.006849325928028171, "grad_norm": 5.6875, "learning_rate": 6.849315068493151e-07, "loss": 1.23322773, "memory(GiB)": 705.93, "step": 270, "train_speed(iter/s)": 0.074423 }, { "acc": 0.69605665, "epoch": 0.00697616529706573, "grad_norm": 4.75, "learning_rate": 6.97615423642821e-07, "loss": 1.2605278, "memory(GiB)": 705.93, "step": 275, "train_speed(iter/s)": 0.074447 }, { "acc": 0.69835491, "epoch": 0.007103004666103288, "grad_norm": 4.5625, "learning_rate": 7.102993404363267e-07, "loss": 1.27609825, "memory(GiB)": 705.93, "step": 280, "train_speed(iter/s)": 0.074516 }, { "acc": 0.71606073, "epoch": 0.007229844035140847, "grad_norm": 4.3125, "learning_rate": 7.229832572298326e-07, "loss": 1.20699759, "memory(GiB)": 705.93, "step": 285, "train_speed(iter/s)": 0.074507 }, { "acc": 0.71897593, "epoch": 0.007356683404178406, "grad_norm": 5.53125, "learning_rate": 7.356671740233384e-07, "loss": 1.2386878, "memory(GiB)": 705.93, "step": 290, "train_speed(iter/s)": 0.074491 }, { "acc": 0.70688396, "epoch": 0.007483522773215965, "grad_norm": 4.625, "learning_rate": 7.483510908168443e-07, "loss": 1.24334478, "memory(GiB)": 705.93, "step": 295, "train_speed(iter/s)": 0.0746 }, { "acc": 0.70727439, "epoch": 0.007610362142253523, "grad_norm": 4.5625, "learning_rate": 7.610350076103501e-07, "loss": 1.21690493, "memory(GiB)": 705.93, "step": 300, "train_speed(iter/s)": 0.074469 }, { "acc": 0.70311933, "epoch": 0.007737201511291082, "grad_norm": 4.96875, "learning_rate": 7.737189244038559e-07, "loss": 1.26911039, "memory(GiB)": 705.93, "step": 305, "train_speed(iter/s)": 0.074498 }, { "acc": 0.71290689, "epoch": 0.00786404088032864, "grad_norm": 3.96875, "learning_rate": 7.864028411973618e-07, "loss": 1.15569429, "memory(GiB)": 705.93, "step": 310, "train_speed(iter/s)": 0.074434 }, { "acc": 0.69536786, "epoch": 0.0079908802493662, "grad_norm": 4.34375, "learning_rate": 7.990867579908676e-07, "loss": 1.29864721, "memory(GiB)": 705.93, "step": 315, "train_speed(iter/s)": 0.074411 }, { "acc": 0.71117964, "epoch": 0.008117719618403758, "grad_norm": 4.4375, "learning_rate": 8.117706747843734e-07, "loss": 1.19631577, "memory(GiB)": 705.93, "step": 320, "train_speed(iter/s)": 0.074295 }, { "acc": 0.68551359, "epoch": 0.008244558987441317, "grad_norm": 4.84375, "learning_rate": 8.244545915778793e-07, "loss": 1.30613565, "memory(GiB)": 705.93, "step": 325, "train_speed(iter/s)": 0.074234 }, { "acc": 0.69570723, "epoch": 0.008371398356478876, "grad_norm": 4.5, "learning_rate": 8.371385083713851e-07, "loss": 1.29529123, "memory(GiB)": 705.93, "step": 330, "train_speed(iter/s)": 0.074243 }, { "acc": 0.70422235, "epoch": 0.008498237725516434, "grad_norm": 6.90625, "learning_rate": 8.49822425164891e-07, "loss": 1.25490093, "memory(GiB)": 705.93, "step": 335, "train_speed(iter/s)": 0.074311 }, { "acc": 0.7074904, "epoch": 0.008625077094553993, "grad_norm": 4.09375, "learning_rate": 8.625063419583967e-07, "loss": 1.21646786, "memory(GiB)": 705.93, "step": 340, "train_speed(iter/s)": 0.074285 }, { "acc": 0.70522237, "epoch": 0.008751916463591553, "grad_norm": 4.1875, "learning_rate": 8.751902587519026e-07, "loss": 1.26807604, "memory(GiB)": 705.93, "step": 345, "train_speed(iter/s)": 0.074222 }, { "acc": 0.69780521, "epoch": 0.00887875583262911, "grad_norm": 4.3125, "learning_rate": 8.878741755454085e-07, "loss": 1.23584023, "memory(GiB)": 705.93, "step": 350, "train_speed(iter/s)": 0.074189 }, { "acc": 0.69765496, "epoch": 0.00900559520166667, "grad_norm": 4.78125, "learning_rate": 9.005580923389143e-07, "loss": 1.28085318, "memory(GiB)": 705.93, "step": 355, "train_speed(iter/s)": 0.074193 }, { "acc": 0.70572066, "epoch": 0.009132434570704229, "grad_norm": 4.5625, "learning_rate": 9.132420091324201e-07, "loss": 1.17473335, "memory(GiB)": 718.79, "step": 360, "train_speed(iter/s)": 0.074188 }, { "acc": 0.71008086, "epoch": 0.009259273939741786, "grad_norm": 5.3125, "learning_rate": 9.259259259259259e-07, "loss": 1.24739075, "memory(GiB)": 718.79, "step": 365, "train_speed(iter/s)": 0.074225 }, { "acc": 0.69661398, "epoch": 0.009386113308779346, "grad_norm": 6.125, "learning_rate": 9.386098427194318e-07, "loss": 1.27091541, "memory(GiB)": 718.79, "step": 370, "train_speed(iter/s)": 0.074323 }, { "acc": 0.69881306, "epoch": 0.009512952677816905, "grad_norm": 4.125, "learning_rate": 9.512937595129377e-07, "loss": 1.3132082, "memory(GiB)": 718.79, "step": 375, "train_speed(iter/s)": 0.074312 }, { "acc": 0.70737042, "epoch": 0.009639792046854462, "grad_norm": 5.15625, "learning_rate": 9.639776763064435e-07, "loss": 1.21749325, "memory(GiB)": 718.79, "step": 380, "train_speed(iter/s)": 0.074229 }, { "acc": 0.70235839, "epoch": 0.009766631415892022, "grad_norm": 5.5625, "learning_rate": 9.766615930999493e-07, "loss": 1.22765017, "memory(GiB)": 718.79, "step": 385, "train_speed(iter/s)": 0.074116 }, { "acc": 0.70292387, "epoch": 0.009893470784929581, "grad_norm": 4.09375, "learning_rate": 9.89345509893455e-07, "loss": 1.21197252, "memory(GiB)": 718.79, "step": 390, "train_speed(iter/s)": 0.074113 }, { "acc": 0.70379477, "epoch": 0.010020310153967139, "grad_norm": 4.59375, "learning_rate": 1.002029426686961e-06, "loss": 1.27991791, "memory(GiB)": 718.79, "step": 395, "train_speed(iter/s)": 0.074103 }, { "acc": 0.6988462, "epoch": 0.010147149523004698, "grad_norm": 5.0, "learning_rate": 1.0147133434804667e-06, "loss": 1.26324711, "memory(GiB)": 718.79, "step": 400, "train_speed(iter/s)": 0.074191 }, { "acc": 0.69844575, "epoch": 0.010273988892042257, "grad_norm": 4.3125, "learning_rate": 1.0273972602739727e-06, "loss": 1.25943871, "memory(GiB)": 718.79, "step": 405, "train_speed(iter/s)": 0.074129 }, { "acc": 0.70567522, "epoch": 0.010400828261079815, "grad_norm": 5.0625, "learning_rate": 1.0400811770674785e-06, "loss": 1.28413792, "memory(GiB)": 718.79, "step": 410, "train_speed(iter/s)": 0.073992 }, { "acc": 0.71318221, "epoch": 0.010527667630117374, "grad_norm": 4.65625, "learning_rate": 1.0527650938609842e-06, "loss": 1.18144283, "memory(GiB)": 718.79, "step": 415, "train_speed(iter/s)": 0.074008 }, { "acc": 0.71340609, "epoch": 0.010654506999154933, "grad_norm": 5.125, "learning_rate": 1.0654490106544902e-06, "loss": 1.2107976, "memory(GiB)": 718.79, "step": 420, "train_speed(iter/s)": 0.073922 }, { "acc": 0.70895333, "epoch": 0.010781346368192491, "grad_norm": 4.25, "learning_rate": 1.078132927447996e-06, "loss": 1.20121546, "memory(GiB)": 718.79, "step": 425, "train_speed(iter/s)": 0.073912 }, { "acc": 0.70498199, "epoch": 0.01090818573723005, "grad_norm": 4.5, "learning_rate": 1.0908168442415017e-06, "loss": 1.24450054, "memory(GiB)": 718.79, "step": 430, "train_speed(iter/s)": 0.073938 }, { "acc": 0.69542966, "epoch": 0.01103502510626761, "grad_norm": 4.5, "learning_rate": 1.1035007610350077e-06, "loss": 1.25221138, "memory(GiB)": 718.79, "step": 435, "train_speed(iter/s)": 0.073937 }, { "acc": 0.71139903, "epoch": 0.011161864475305167, "grad_norm": 4.5, "learning_rate": 1.1161846778285134e-06, "loss": 1.18420372, "memory(GiB)": 718.79, "step": 440, "train_speed(iter/s)": 0.074021 }, { "acc": 0.71367621, "epoch": 0.011288703844342727, "grad_norm": 3.84375, "learning_rate": 1.1288685946220194e-06, "loss": 1.19961557, "memory(GiB)": 718.79, "step": 445, "train_speed(iter/s)": 0.074046 }, { "acc": 0.70494995, "epoch": 0.011415543213380286, "grad_norm": 3.984375, "learning_rate": 1.1415525114155251e-06, "loss": 1.23290558, "memory(GiB)": 718.79, "step": 450, "train_speed(iter/s)": 0.074006 }, { "acc": 0.70404992, "epoch": 0.011542382582417843, "grad_norm": 4.625, "learning_rate": 1.154236428209031e-06, "loss": 1.27957067, "memory(GiB)": 718.79, "step": 455, "train_speed(iter/s)": 0.074065 }, { "acc": 0.71185393, "epoch": 0.011669221951455403, "grad_norm": 3.984375, "learning_rate": 1.1669203450025369e-06, "loss": 1.2261385, "memory(GiB)": 718.79, "step": 460, "train_speed(iter/s)": 0.073995 }, { "acc": 0.71031203, "epoch": 0.01179606132049296, "grad_norm": 4.40625, "learning_rate": 1.1796042617960426e-06, "loss": 1.25894651, "memory(GiB)": 718.79, "step": 465, "train_speed(iter/s)": 0.073996 }, { "acc": 0.69609122, "epoch": 0.01192290068953052, "grad_norm": 4.34375, "learning_rate": 1.1922881785895486e-06, "loss": 1.3383832, "memory(GiB)": 718.79, "step": 470, "train_speed(iter/s)": 0.074019 }, { "acc": 0.71629095, "epoch": 0.012049740058568079, "grad_norm": 3.640625, "learning_rate": 1.2049720953830543e-06, "loss": 1.18814011, "memory(GiB)": 718.79, "step": 475, "train_speed(iter/s)": 0.074031 }, { "acc": 0.70650764, "epoch": 0.012176579427605637, "grad_norm": 4.59375, "learning_rate": 1.21765601217656e-06, "loss": 1.23158607, "memory(GiB)": 718.79, "step": 480, "train_speed(iter/s)": 0.074004 }, { "acc": 0.70237837, "epoch": 0.012303418796643196, "grad_norm": 3.390625, "learning_rate": 1.230339928970066e-06, "loss": 1.19678068, "memory(GiB)": 718.79, "step": 485, "train_speed(iter/s)": 0.073997 }, { "acc": 0.72122169, "epoch": 0.012430258165680755, "grad_norm": 3.890625, "learning_rate": 1.2430238457635718e-06, "loss": 1.1495203, "memory(GiB)": 718.79, "step": 490, "train_speed(iter/s)": 0.073988 }, { "acc": 0.71461229, "epoch": 0.012557097534718313, "grad_norm": 5.21875, "learning_rate": 1.2557077625570776e-06, "loss": 1.25303793, "memory(GiB)": 718.79, "step": 495, "train_speed(iter/s)": 0.073986 }, { "acc": 0.7040134, "epoch": 0.012683936903755872, "grad_norm": 5.03125, "learning_rate": 1.2683916793505835e-06, "loss": 1.21581659, "memory(GiB)": 718.79, "step": 500, "train_speed(iter/s)": 0.074024 }, { "epoch": 0.012683936903755872, "eval_acc": 0.7012420639877477, "eval_loss": 1.2403513193130493, "eval_runtime": 1149.4796, "eval_samples_per_second": 5.542, "eval_steps_per_second": 5.542, "step": 500 }, { "acc": 0.69238873, "epoch": 0.012810776272793431, "grad_norm": 4.78125, "learning_rate": 1.2810755961440893e-06, "loss": 1.2383461, "memory(GiB)": 693.27, "step": 505, "train_speed(iter/s)": 1.092316 }, { "acc": 0.71393585, "epoch": 0.012937615641830989, "grad_norm": 4.84375, "learning_rate": 1.2937595129375953e-06, "loss": 1.21627216, "memory(GiB)": 693.27, "step": 510, "train_speed(iter/s)": 0.954203 }, { "acc": 0.70454078, "epoch": 0.013064455010868548, "grad_norm": 4.53125, "learning_rate": 1.306443429731101e-06, "loss": 1.26664505, "memory(GiB)": 693.27, "step": 515, "train_speed(iter/s)": 0.852878 }, { "acc": 0.72112861, "epoch": 0.013191294379906108, "grad_norm": 3.40625, "learning_rate": 1.3191273465246068e-06, "loss": 1.15431147, "memory(GiB)": 703.87, "step": 520, "train_speed(iter/s)": 0.773944 }, { "acc": 0.70371137, "epoch": 0.013318133748943665, "grad_norm": 4.53125, "learning_rate": 1.3318112633181127e-06, "loss": 1.1782011, "memory(GiB)": 703.87, "step": 525, "train_speed(iter/s)": 0.709198 }, { "acc": 0.73194542, "epoch": 0.013444973117981224, "grad_norm": 4.25, "learning_rate": 1.3444951801116185e-06, "loss": 1.12573624, "memory(GiB)": 703.87, "step": 530, "train_speed(iter/s)": 0.654442 }, { "acc": 0.71719723, "epoch": 0.013571812487018784, "grad_norm": 4.65625, "learning_rate": 1.3571790969051243e-06, "loss": 1.14439783, "memory(GiB)": 703.87, "step": 535, "train_speed(iter/s)": 0.612931 }, { "acc": 0.71094031, "epoch": 0.013698651856056341, "grad_norm": 4.40625, "learning_rate": 1.3698630136986302e-06, "loss": 1.25032654, "memory(GiB)": 703.87, "step": 540, "train_speed(iter/s)": 0.575403 }, { "acc": 0.7056006, "epoch": 0.0138254912250939, "grad_norm": 4.125, "learning_rate": 1.382546930492136e-06, "loss": 1.24478254, "memory(GiB)": 703.87, "step": 545, "train_speed(iter/s)": 0.5439 }, { "acc": 0.70396962, "epoch": 0.01395233059413146, "grad_norm": 5.3125, "learning_rate": 1.395230847285642e-06, "loss": 1.26985121, "memory(GiB)": 716.04, "step": 550, "train_speed(iter/s)": 0.5125 }, { "acc": 0.71781588, "epoch": 0.014079169963169018, "grad_norm": 4.40625, "learning_rate": 1.4079147640791477e-06, "loss": 1.19301729, "memory(GiB)": 716.04, "step": 555, "train_speed(iter/s)": 0.48656 }, { "acc": 0.71928558, "epoch": 0.014206009332206577, "grad_norm": 4.34375, "learning_rate": 1.4205986808726534e-06, "loss": 1.1350502, "memory(GiB)": 716.04, "step": 560, "train_speed(iter/s)": 0.464071 }, { "acc": 0.71445975, "epoch": 0.014332848701244136, "grad_norm": 7.0625, "learning_rate": 1.4332825976661594e-06, "loss": 1.20523424, "memory(GiB)": 716.04, "step": 565, "train_speed(iter/s)": 0.44581 }, { "acc": 0.71506991, "epoch": 0.014459688070281694, "grad_norm": 6.0, "learning_rate": 1.4459665144596652e-06, "loss": 1.17690716, "memory(GiB)": 716.04, "step": 570, "train_speed(iter/s)": 0.426657 }, { "acc": 0.72804146, "epoch": 0.014586527439319253, "grad_norm": 4.46875, "learning_rate": 1.458650431253171e-06, "loss": 1.1356019, "memory(GiB)": 716.05, "step": 575, "train_speed(iter/s)": 0.406058 }, { "acc": 0.70740824, "epoch": 0.014713366808356812, "grad_norm": 5.375, "learning_rate": 1.4713343480466769e-06, "loss": 1.26851826, "memory(GiB)": 716.05, "step": 580, "train_speed(iter/s)": 0.391442 }, { "acc": 0.70877681, "epoch": 0.01484020617739437, "grad_norm": 4.78125, "learning_rate": 1.4840182648401826e-06, "loss": 1.24639091, "memory(GiB)": 716.07, "step": 585, "train_speed(iter/s)": 0.379618 }, { "acc": 0.70566602, "epoch": 0.01496704554643193, "grad_norm": 3.625, "learning_rate": 1.4967021816336886e-06, "loss": 1.15318823, "memory(GiB)": 716.07, "step": 590, "train_speed(iter/s)": 0.365777 }, { "acc": 0.70103335, "epoch": 0.015093884915469489, "grad_norm": 5.40625, "learning_rate": 1.5093860984271944e-06, "loss": 1.20599718, "memory(GiB)": 716.07, "step": 595, "train_speed(iter/s)": 0.354653 }, { "acc": 0.72306404, "epoch": 0.015220724284507046, "grad_norm": 3.578125, "learning_rate": 1.5220700152207001e-06, "loss": 1.12358561, "memory(GiB)": 716.07, "step": 600, "train_speed(iter/s)": 0.343837 }, { "acc": 0.72534409, "epoch": 0.015347563653544605, "grad_norm": 4.3125, "learning_rate": 1.534753932014206e-06, "loss": 1.16072998, "memory(GiB)": 716.07, "step": 605, "train_speed(iter/s)": 0.33414 }, { "acc": 0.7108871, "epoch": 0.015474403022582165, "grad_norm": 3.9375, "learning_rate": 1.5474378488077118e-06, "loss": 1.19201441, "memory(GiB)": 716.07, "step": 610, "train_speed(iter/s)": 0.325193 }, { "acc": 0.71111226, "epoch": 0.015601242391619722, "grad_norm": 4.5625, "learning_rate": 1.5601217656012176e-06, "loss": 1.1451581, "memory(GiB)": 716.07, "step": 615, "train_speed(iter/s)": 0.316636 }, { "acc": 0.71744499, "epoch": 0.01572808176065728, "grad_norm": 3.78125, "learning_rate": 1.5728056823947236e-06, "loss": 1.1465745, "memory(GiB)": 716.07, "step": 620, "train_speed(iter/s)": 0.308462 }, { "acc": 0.71238351, "epoch": 0.01585492112969484, "grad_norm": 4.09375, "learning_rate": 1.5854895991882293e-06, "loss": 1.20283289, "memory(GiB)": 716.07, "step": 625, "train_speed(iter/s)": 0.30123 }, { "acc": 0.7059279, "epoch": 0.0159817604987324, "grad_norm": 4.0625, "learning_rate": 1.5981735159817353e-06, "loss": 1.20629864, "memory(GiB)": 716.07, "step": 630, "train_speed(iter/s)": 0.294774 }, { "acc": 0.7183167, "epoch": 0.016108599867769958, "grad_norm": 5.09375, "learning_rate": 1.610857432775241e-06, "loss": 1.19549456, "memory(GiB)": 716.07, "step": 635, "train_speed(iter/s)": 0.288649 }, { "acc": 0.70174546, "epoch": 0.016235439236807515, "grad_norm": 8.75, "learning_rate": 1.6235413495687468e-06, "loss": 1.18085556, "memory(GiB)": 716.07, "step": 640, "train_speed(iter/s)": 0.283239 }, { "acc": 0.71624722, "epoch": 0.016362278605845076, "grad_norm": 3.90625, "learning_rate": 1.6362252663622528e-06, "loss": 1.18583794, "memory(GiB)": 716.07, "step": 645, "train_speed(iter/s)": 0.276183 }, { "acc": 0.70312815, "epoch": 0.016489117974882634, "grad_norm": 4.3125, "learning_rate": 1.6489091831557585e-06, "loss": 1.1900631, "memory(GiB)": 716.07, "step": 650, "train_speed(iter/s)": 0.270599 }, { "acc": 0.70930433, "epoch": 0.01661595734392019, "grad_norm": 3.84375, "learning_rate": 1.6615930999492643e-06, "loss": 1.21291971, "memory(GiB)": 716.08, "step": 655, "train_speed(iter/s)": 0.26502 }, { "acc": 0.7094172, "epoch": 0.016742796712957753, "grad_norm": 4.0, "learning_rate": 1.6742770167427702e-06, "loss": 1.15495892, "memory(GiB)": 716.08, "step": 660, "train_speed(iter/s)": 0.260404 }, { "acc": 0.71294503, "epoch": 0.01686963608199531, "grad_norm": 3.53125, "learning_rate": 1.686960933536276e-06, "loss": 1.15456524, "memory(GiB)": 716.08, "step": 665, "train_speed(iter/s)": 0.255147 }, { "acc": 0.72149391, "epoch": 0.016996475451032868, "grad_norm": 3.953125, "learning_rate": 1.699644850329782e-06, "loss": 1.14795704, "memory(GiB)": 716.08, "step": 670, "train_speed(iter/s)": 0.250096 }, { "acc": 0.7228827, "epoch": 0.01712331482007043, "grad_norm": 4.3125, "learning_rate": 1.7123287671232877e-06, "loss": 1.14000387, "memory(GiB)": 716.08, "step": 675, "train_speed(iter/s)": 0.245119 }, { "acc": 0.70801902, "epoch": 0.017250154189107986, "grad_norm": 4.21875, "learning_rate": 1.7250126839167935e-06, "loss": 1.2399003, "memory(GiB)": 716.08, "step": 680, "train_speed(iter/s)": 0.240931 }, { "acc": 0.71511226, "epoch": 0.017376993558145544, "grad_norm": 3.90625, "learning_rate": 1.7376966007102994e-06, "loss": 1.17621012, "memory(GiB)": 716.08, "step": 685, "train_speed(iter/s)": 0.236659 }, { "acc": 0.71745572, "epoch": 0.017503832927183105, "grad_norm": 3.640625, "learning_rate": 1.7503805175038052e-06, "loss": 1.19633217, "memory(GiB)": 716.08, "step": 690, "train_speed(iter/s)": 0.232434 }, { "acc": 0.71388378, "epoch": 0.017630672296220663, "grad_norm": 4.75, "learning_rate": 1.7630644342973112e-06, "loss": 1.15443182, "memory(GiB)": 716.08, "step": 695, "train_speed(iter/s)": 0.229161 }, { "acc": 0.71587558, "epoch": 0.01775751166525822, "grad_norm": 3.53125, "learning_rate": 1.775748351090817e-06, "loss": 1.17089396, "memory(GiB)": 716.08, "step": 700, "train_speed(iter/s)": 0.225759 }, { "acc": 0.71538463, "epoch": 0.01788435103429578, "grad_norm": 3.859375, "learning_rate": 1.7884322678843227e-06, "loss": 1.12359343, "memory(GiB)": 716.08, "step": 705, "train_speed(iter/s)": 0.222669 }, { "acc": 0.7066607, "epoch": 0.01801119040333334, "grad_norm": 3.71875, "learning_rate": 1.8011161846778286e-06, "loss": 1.18892384, "memory(GiB)": 716.08, "step": 710, "train_speed(iter/s)": 0.219329 }, { "acc": 0.70282207, "epoch": 0.018138029772370896, "grad_norm": 3.46875, "learning_rate": 1.8138001014713344e-06, "loss": 1.2215683, "memory(GiB)": 716.08, "step": 715, "train_speed(iter/s)": 0.216561 }, { "acc": 0.71406026, "epoch": 0.018264869141408457, "grad_norm": 4.0, "learning_rate": 1.8264840182648401e-06, "loss": 1.15411272, "memory(GiB)": 716.08, "step": 720, "train_speed(iter/s)": 0.214036 }, { "acc": 0.70888257, "epoch": 0.018391708510446015, "grad_norm": 3.75, "learning_rate": 1.8391679350583461e-06, "loss": 1.20209866, "memory(GiB)": 716.08, "step": 725, "train_speed(iter/s)": 0.21136 }, { "acc": 0.72232623, "epoch": 0.018518547879483573, "grad_norm": 4.0625, "learning_rate": 1.8518518518518519e-06, "loss": 1.09846487, "memory(GiB)": 716.08, "step": 730, "train_speed(iter/s)": 0.20886 }, { "acc": 0.70227747, "epoch": 0.018645387248521134, "grad_norm": 4.125, "learning_rate": 1.8645357686453578e-06, "loss": 1.20243912, "memory(GiB)": 716.08, "step": 735, "train_speed(iter/s)": 0.206542 }, { "acc": 0.71964135, "epoch": 0.01877222661755869, "grad_norm": 4.15625, "learning_rate": 1.8772196854388636e-06, "loss": 1.15801373, "memory(GiB)": 716.08, "step": 740, "train_speed(iter/s)": 0.204168 }, { "acc": 0.72229791, "epoch": 0.01889906598659625, "grad_norm": 3.6875, "learning_rate": 1.8899036022323693e-06, "loss": 1.1230051, "memory(GiB)": 716.08, "step": 745, "train_speed(iter/s)": 0.20172 }, { "acc": 0.71375947, "epoch": 0.01902590535563381, "grad_norm": 3.9375, "learning_rate": 1.9025875190258753e-06, "loss": 1.12471981, "memory(GiB)": 716.08, "step": 750, "train_speed(iter/s)": 0.199092 }, { "acc": 0.71728702, "epoch": 0.019152744724671367, "grad_norm": 3.65625, "learning_rate": 1.915271435819381e-06, "loss": 1.17249479, "memory(GiB)": 716.08, "step": 755, "train_speed(iter/s)": 0.196972 }, { "acc": 0.70999818, "epoch": 0.019279584093708925, "grad_norm": 3.59375, "learning_rate": 1.927955352612887e-06, "loss": 1.17380257, "memory(GiB)": 716.08, "step": 760, "train_speed(iter/s)": 0.194562 }, { "acc": 0.72474732, "epoch": 0.019406423462746486, "grad_norm": 3.546875, "learning_rate": 1.9406392694063926e-06, "loss": 1.17787828, "memory(GiB)": 716.08, "step": 765, "train_speed(iter/s)": 0.192477 }, { "acc": 0.72535315, "epoch": 0.019533262831784044, "grad_norm": 4.21875, "learning_rate": 1.9533231861998985e-06, "loss": 1.15834141, "memory(GiB)": 716.08, "step": 770, "train_speed(iter/s)": 0.190833 }, { "acc": 0.71626344, "epoch": 0.0196601022008216, "grad_norm": 3.96875, "learning_rate": 1.9660071029934045e-06, "loss": 1.16199188, "memory(GiB)": 716.08, "step": 775, "train_speed(iter/s)": 0.188479 }, { "acc": 0.69535327, "epoch": 0.019786941569859162, "grad_norm": 4.28125, "learning_rate": 1.97869101978691e-06, "loss": 1.22800732, "memory(GiB)": 716.08, "step": 780, "train_speed(iter/s)": 0.186594 }, { "acc": 0.73215113, "epoch": 0.01991378093889672, "grad_norm": 4.375, "learning_rate": 1.991374936580416e-06, "loss": 1.07615404, "memory(GiB)": 716.08, "step": 785, "train_speed(iter/s)": 0.184646 }, { "acc": 0.71870198, "epoch": 0.020040620307934277, "grad_norm": 4.375, "learning_rate": 2.004058853373922e-06, "loss": 1.16565113, "memory(GiB)": 716.08, "step": 790, "train_speed(iter/s)": 0.182939 }, { "acc": 0.71637645, "epoch": 0.02016745967697184, "grad_norm": 11.75, "learning_rate": 2.016742770167428e-06, "loss": 1.10822697, "memory(GiB)": 716.08, "step": 795, "train_speed(iter/s)": 0.181261 }, { "acc": 0.71879053, "epoch": 0.020294299046009396, "grad_norm": 3.90625, "learning_rate": 2.0294266869609335e-06, "loss": 1.07904673, "memory(GiB)": 716.08, "step": 800, "train_speed(iter/s)": 0.179662 }, { "acc": 0.71296697, "epoch": 0.020421138415046954, "grad_norm": 4.34375, "learning_rate": 2.0421106037544395e-06, "loss": 1.20433559, "memory(GiB)": 716.08, "step": 805, "train_speed(iter/s)": 0.178276 }, { "acc": 0.73687682, "epoch": 0.020547977784084515, "grad_norm": 4.4375, "learning_rate": 2.0547945205479454e-06, "loss": 1.06286173, "memory(GiB)": 716.08, "step": 810, "train_speed(iter/s)": 0.176887 }, { "acc": 0.73427939, "epoch": 0.020674817153122072, "grad_norm": 4.84375, "learning_rate": 2.067478437341451e-06, "loss": 1.17185049, "memory(GiB)": 716.08, "step": 815, "train_speed(iter/s)": 0.175582 }, { "acc": 0.71840081, "epoch": 0.02080165652215963, "grad_norm": 4.1875, "learning_rate": 2.080162354134957e-06, "loss": 1.13183632, "memory(GiB)": 716.08, "step": 820, "train_speed(iter/s)": 0.173893 }, { "acc": 0.70150599, "epoch": 0.02092849589119719, "grad_norm": 3.703125, "learning_rate": 2.092846270928463e-06, "loss": 1.19761896, "memory(GiB)": 716.08, "step": 825, "train_speed(iter/s)": 0.172303 }, { "acc": 0.71073704, "epoch": 0.02105533526023475, "grad_norm": 3.84375, "learning_rate": 2.1055301877219685e-06, "loss": 1.18635216, "memory(GiB)": 716.08, "step": 830, "train_speed(iter/s)": 0.1712 }, { "acc": 0.72272239, "epoch": 0.021182174629272306, "grad_norm": 3.21875, "learning_rate": 2.1182141045154744e-06, "loss": 1.13622894, "memory(GiB)": 716.08, "step": 835, "train_speed(iter/s)": 0.169851 }, { "acc": 0.72596316, "epoch": 0.021309013998309867, "grad_norm": 5.25, "learning_rate": 2.1308980213089804e-06, "loss": 1.08603525, "memory(GiB)": 716.08, "step": 840, "train_speed(iter/s)": 0.168258 }, { "acc": 0.72033401, "epoch": 0.021435853367347425, "grad_norm": 3.84375, "learning_rate": 2.143581938102486e-06, "loss": 1.09301767, "memory(GiB)": 716.08, "step": 845, "train_speed(iter/s)": 0.167022 }, { "acc": 0.7393373, "epoch": 0.021562692736384982, "grad_norm": 4.0625, "learning_rate": 2.156265854895992e-06, "loss": 1.10431881, "memory(GiB)": 716.08, "step": 850, "train_speed(iter/s)": 0.166005 }, { "acc": 0.70286732, "epoch": 0.021689532105422543, "grad_norm": 4.25, "learning_rate": 2.168949771689498e-06, "loss": 1.16547337, "memory(GiB)": 716.08, "step": 855, "train_speed(iter/s)": 0.164782 }, { "acc": 0.72485232, "epoch": 0.0218163714744601, "grad_norm": 3.6875, "learning_rate": 2.1816336884830034e-06, "loss": 1.13430386, "memory(GiB)": 716.08, "step": 860, "train_speed(iter/s)": 0.163615 }, { "acc": 0.70873051, "epoch": 0.02194321084349766, "grad_norm": 5.6875, "learning_rate": 2.1943176052765094e-06, "loss": 1.12950029, "memory(GiB)": 716.08, "step": 865, "train_speed(iter/s)": 0.1627 }, { "acc": 0.71550512, "epoch": 0.02207005021253522, "grad_norm": 4.59375, "learning_rate": 2.2070015220700153e-06, "loss": 1.20582247, "memory(GiB)": 716.08, "step": 870, "train_speed(iter/s)": 0.161699 }, { "acc": 0.71554365, "epoch": 0.022196889581572777, "grad_norm": 4.375, "learning_rate": 2.2196854388635213e-06, "loss": 1.1712286, "memory(GiB)": 716.08, "step": 875, "train_speed(iter/s)": 0.160699 }, { "acc": 0.72741241, "epoch": 0.022323728950610335, "grad_norm": 4.53125, "learning_rate": 2.232369355657027e-06, "loss": 1.08568029, "memory(GiB)": 716.08, "step": 880, "train_speed(iter/s)": 0.159521 }, { "acc": 0.71931663, "epoch": 0.022450568319647896, "grad_norm": 4.25, "learning_rate": 2.245053272450533e-06, "loss": 1.20063105, "memory(GiB)": 716.08, "step": 885, "train_speed(iter/s)": 0.158336 }, { "acc": 0.71398335, "epoch": 0.022577407688685453, "grad_norm": 5.53125, "learning_rate": 2.2577371892440388e-06, "loss": 1.15531969, "memory(GiB)": 716.08, "step": 890, "train_speed(iter/s)": 0.157416 }, { "acc": 0.72847533, "epoch": 0.02270424705772301, "grad_norm": 3.375, "learning_rate": 2.2704211060375443e-06, "loss": 1.13619661, "memory(GiB)": 716.08, "step": 895, "train_speed(iter/s)": 0.156327 }, { "acc": 0.72854233, "epoch": 0.022831086426760572, "grad_norm": 4.4375, "learning_rate": 2.2831050228310503e-06, "loss": 1.17181969, "memory(GiB)": 716.08, "step": 900, "train_speed(iter/s)": 0.155319 }, { "acc": 0.72286367, "epoch": 0.02295792579579813, "grad_norm": 3.921875, "learning_rate": 2.2957889396245563e-06, "loss": 1.14308462, "memory(GiB)": 716.08, "step": 905, "train_speed(iter/s)": 0.154399 }, { "acc": 0.71818662, "epoch": 0.023084765164835687, "grad_norm": 3.984375, "learning_rate": 2.308472856418062e-06, "loss": 1.12757072, "memory(GiB)": 716.08, "step": 910, "train_speed(iter/s)": 0.153588 }, { "acc": 0.72144122, "epoch": 0.023211604533873248, "grad_norm": 3.34375, "learning_rate": 2.3211567732115678e-06, "loss": 1.12636938, "memory(GiB)": 716.08, "step": 915, "train_speed(iter/s)": 0.152518 }, { "acc": 0.7466135, "epoch": 0.023338443902910806, "grad_norm": 3.171875, "learning_rate": 2.3338406900050737e-06, "loss": 1.05186653, "memory(GiB)": 716.08, "step": 920, "train_speed(iter/s)": 0.15169 }, { "acc": 0.72062955, "epoch": 0.023465283271948363, "grad_norm": 3.65625, "learning_rate": 2.3465246067985793e-06, "loss": 1.15060568, "memory(GiB)": 716.08, "step": 925, "train_speed(iter/s)": 0.150919 }, { "acc": 0.72815747, "epoch": 0.02359212264098592, "grad_norm": 5.625, "learning_rate": 2.3592085235920852e-06, "loss": 1.12048998, "memory(GiB)": 716.08, "step": 930, "train_speed(iter/s)": 0.150279 }, { "acc": 0.72536459, "epoch": 0.02371896201002348, "grad_norm": 4.125, "learning_rate": 2.371892440385591e-06, "loss": 1.11704025, "memory(GiB)": 716.08, "step": 935, "train_speed(iter/s)": 0.149418 }, { "acc": 0.72390294, "epoch": 0.02384580137906104, "grad_norm": 3.984375, "learning_rate": 2.384576357179097e-06, "loss": 1.10539808, "memory(GiB)": 716.08, "step": 940, "train_speed(iter/s)": 0.148782 }, { "acc": 0.72730379, "epoch": 0.023972640748098597, "grad_norm": 3.796875, "learning_rate": 2.3972602739726027e-06, "loss": 1.07713203, "memory(GiB)": 716.08, "step": 945, "train_speed(iter/s)": 0.148148 }, { "acc": 0.70982289, "epoch": 0.024099480117136158, "grad_norm": 4.90625, "learning_rate": 2.4099441907661087e-06, "loss": 1.20057573, "memory(GiB)": 716.08, "step": 950, "train_speed(iter/s)": 0.147535 }, { "acc": 0.73517013, "epoch": 0.024226319486173716, "grad_norm": 5.125, "learning_rate": 2.4226281075596147e-06, "loss": 1.15255814, "memory(GiB)": 716.08, "step": 955, "train_speed(iter/s)": 0.146776 }, { "acc": 0.72606153, "epoch": 0.024353158855211273, "grad_norm": 4.59375, "learning_rate": 2.43531202435312e-06, "loss": 1.13639002, "memory(GiB)": 716.08, "step": 960, "train_speed(iter/s)": 0.146107 }, { "acc": 0.71093731, "epoch": 0.024479998224248834, "grad_norm": 4.5, "learning_rate": 2.447995941146626e-06, "loss": 1.20805645, "memory(GiB)": 716.08, "step": 965, "train_speed(iter/s)": 0.145442 }, { "acc": 0.71310081, "epoch": 0.02460683759328639, "grad_norm": 3.5625, "learning_rate": 2.460679857940132e-06, "loss": 1.16273155, "memory(GiB)": 716.08, "step": 970, "train_speed(iter/s)": 0.144698 }, { "acc": 0.7224586, "epoch": 0.02473367696232395, "grad_norm": 4.375, "learning_rate": 2.4733637747336377e-06, "loss": 1.18405285, "memory(GiB)": 716.08, "step": 975, "train_speed(iter/s)": 0.144062 }, { "acc": 0.73900285, "epoch": 0.02486051633136151, "grad_norm": 5.5625, "learning_rate": 2.4860476915271436e-06, "loss": 1.08678923, "memory(GiB)": 716.08, "step": 980, "train_speed(iter/s)": 0.143315 }, { "acc": 0.72145615, "epoch": 0.024987355700399068, "grad_norm": 4.03125, "learning_rate": 2.4987316083206496e-06, "loss": 1.15568199, "memory(GiB)": 716.08, "step": 985, "train_speed(iter/s)": 0.142611 }, { "acc": 0.71068401, "epoch": 0.025114195069436625, "grad_norm": 4.21875, "learning_rate": 2.511415525114155e-06, "loss": 1.18622751, "memory(GiB)": 716.08, "step": 990, "train_speed(iter/s)": 0.142035 }, { "acc": 0.71146727, "epoch": 0.025241034438474187, "grad_norm": 3.640625, "learning_rate": 2.5240994419076615e-06, "loss": 1.12192678, "memory(GiB)": 716.08, "step": 995, "train_speed(iter/s)": 0.141456 }, { "acc": 0.73246889, "epoch": 0.025367873807511744, "grad_norm": 4.96875, "learning_rate": 2.536783358701167e-06, "loss": 1.08213797, "memory(GiB)": 716.08, "step": 1000, "train_speed(iter/s)": 0.140899 }, { "epoch": 0.025367873807511744, "eval_acc": 0.7120242204775331, "eval_loss": 1.1089752912521362, "eval_runtime": 1151.2781, "eval_samples_per_second": 5.533, "eval_steps_per_second": 5.533, "step": 1000 }, { "acc": 0.71823125, "epoch": 0.0254947131765493, "grad_norm": 4.4375, "learning_rate": 2.549467275494673e-06, "loss": 1.11135616, "memory(GiB)": 716.08, "step": 1005, "train_speed(iter/s)": 0.112263 }, { "acc": 0.71357379, "epoch": 0.025621552545586863, "grad_norm": 4.34375, "learning_rate": 2.5621511922881786e-06, "loss": 1.20040264, "memory(GiB)": 716.08, "step": 1010, "train_speed(iter/s)": 0.111972 }, { "acc": 0.71337662, "epoch": 0.02574839191462442, "grad_norm": 5.59375, "learning_rate": 2.5748351090816846e-06, "loss": 1.17563696, "memory(GiB)": 716.08, "step": 1015, "train_speed(iter/s)": 0.111616 }, { "acc": 0.72939906, "epoch": 0.025875231283661978, "grad_norm": 4.84375, "learning_rate": 2.5875190258751905e-06, "loss": 1.12692509, "memory(GiB)": 716.08, "step": 1020, "train_speed(iter/s)": 0.111378 }, { "acc": 0.73126035, "epoch": 0.02600207065269954, "grad_norm": 3.65625, "learning_rate": 2.6002029426686965e-06, "loss": 1.12436371, "memory(GiB)": 716.08, "step": 1025, "train_speed(iter/s)": 0.111028 }, { "acc": 0.72617035, "epoch": 0.026128910021737096, "grad_norm": 3.453125, "learning_rate": 2.612886859462202e-06, "loss": 1.0834898, "memory(GiB)": 716.08, "step": 1030, "train_speed(iter/s)": 0.110729 }, { "acc": 0.73020062, "epoch": 0.026255749390774654, "grad_norm": 3.9375, "learning_rate": 2.625570776255708e-06, "loss": 1.14709311, "memory(GiB)": 716.08, "step": 1035, "train_speed(iter/s)": 0.110407 }, { "acc": 0.71293941, "epoch": 0.026382588759812215, "grad_norm": 5.40625, "learning_rate": 2.6382546930492135e-06, "loss": 1.14082327, "memory(GiB)": 716.08, "step": 1040, "train_speed(iter/s)": 0.110197 }, { "acc": 0.72533112, "epoch": 0.026509428128849773, "grad_norm": 4.125, "learning_rate": 2.65093860984272e-06, "loss": 1.1423316, "memory(GiB)": 716.08, "step": 1045, "train_speed(iter/s)": 0.109816 }, { "acc": 0.72878118, "epoch": 0.02663626749788733, "grad_norm": 4.34375, "learning_rate": 2.6636225266362255e-06, "loss": 1.16954851, "memory(GiB)": 716.08, "step": 1050, "train_speed(iter/s)": 0.109619 }, { "acc": 0.72693763, "epoch": 0.02676310686692489, "grad_norm": 4.375, "learning_rate": 2.6763064434297314e-06, "loss": 1.11993637, "memory(GiB)": 716.08, "step": 1055, "train_speed(iter/s)": 0.109433 }, { "acc": 0.72025924, "epoch": 0.02688994623596245, "grad_norm": 3.5, "learning_rate": 2.688990360223237e-06, "loss": 1.13372126, "memory(GiB)": 716.08, "step": 1060, "train_speed(iter/s)": 0.109159 }, { "acc": 0.72486877, "epoch": 0.027016785605000006, "grad_norm": 4.21875, "learning_rate": 2.701674277016743e-06, "loss": 1.11448812, "memory(GiB)": 716.08, "step": 1065, "train_speed(iter/s)": 0.108959 }, { "acc": 0.72403121, "epoch": 0.027143624974037567, "grad_norm": 4.25, "learning_rate": 2.7143581938102485e-06, "loss": 1.15187216, "memory(GiB)": 716.08, "step": 1070, "train_speed(iter/s)": 0.108675 }, { "acc": 0.71573958, "epoch": 0.027270464343075125, "grad_norm": 4.0, "learning_rate": 2.727042110603755e-06, "loss": 1.13157339, "memory(GiB)": 716.08, "step": 1075, "train_speed(iter/s)": 0.108381 }, { "acc": 0.70645237, "epoch": 0.027397303712112683, "grad_norm": 3.59375, "learning_rate": 2.7397260273972604e-06, "loss": 1.21420641, "memory(GiB)": 716.08, "step": 1080, "train_speed(iter/s)": 0.108186 }, { "acc": 0.73235698, "epoch": 0.027524143081150244, "grad_norm": 3.90625, "learning_rate": 2.7524099441907664e-06, "loss": 1.12557955, "memory(GiB)": 716.08, "step": 1085, "train_speed(iter/s)": 0.108009 }, { "acc": 0.73463359, "epoch": 0.0276509824501878, "grad_norm": 3.171875, "learning_rate": 2.765093860984272e-06, "loss": 1.10591593, "memory(GiB)": 716.08, "step": 1090, "train_speed(iter/s)": 0.107794 }, { "acc": 0.71664882, "epoch": 0.02777782181922536, "grad_norm": 3.921875, "learning_rate": 2.7777777777777783e-06, "loss": 1.12714472, "memory(GiB)": 716.08, "step": 1095, "train_speed(iter/s)": 0.10761 }, { "acc": 0.72901711, "epoch": 0.02790466118826292, "grad_norm": 4.40625, "learning_rate": 2.790461694571284e-06, "loss": 1.06854181, "memory(GiB)": 716.08, "step": 1100, "train_speed(iter/s)": 0.1074 }, { "acc": 0.72950463, "epoch": 0.028031500557300477, "grad_norm": 4.34375, "learning_rate": 2.80314561136479e-06, "loss": 1.10076494, "memory(GiB)": 716.08, "step": 1105, "train_speed(iter/s)": 0.107183 }, { "acc": 0.72246914, "epoch": 0.028158339926338035, "grad_norm": 4.3125, "learning_rate": 2.8158295281582954e-06, "loss": 1.1285141, "memory(GiB)": 716.08, "step": 1110, "train_speed(iter/s)": 0.106991 }, { "acc": 0.71768341, "epoch": 0.028285179295375596, "grad_norm": 4.28125, "learning_rate": 2.8285134449518014e-06, "loss": 1.15132465, "memory(GiB)": 716.08, "step": 1115, "train_speed(iter/s)": 0.106783 }, { "acc": 0.72595868, "epoch": 0.028412018664413154, "grad_norm": 3.875, "learning_rate": 2.841197361745307e-06, "loss": 1.12989979, "memory(GiB)": 716.08, "step": 1120, "train_speed(iter/s)": 0.106576 }, { "acc": 0.7158792, "epoch": 0.02853885803345071, "grad_norm": 3.578125, "learning_rate": 2.8538812785388133e-06, "loss": 1.10597734, "memory(GiB)": 716.08, "step": 1125, "train_speed(iter/s)": 0.106391 }, { "acc": 0.72832103, "epoch": 0.028665697402488272, "grad_norm": 5.03125, "learning_rate": 2.866565195332319e-06, "loss": 1.11254539, "memory(GiB)": 716.08, "step": 1130, "train_speed(iter/s)": 0.10622 }, { "acc": 0.72651081, "epoch": 0.02879253677152583, "grad_norm": 3.3125, "learning_rate": 2.879249112125825e-06, "loss": 1.11735687, "memory(GiB)": 716.08, "step": 1135, "train_speed(iter/s)": 0.105958 }, { "acc": 0.72205391, "epoch": 0.028919376140563387, "grad_norm": 3.765625, "learning_rate": 2.8919330289193303e-06, "loss": 1.14999619, "memory(GiB)": 716.08, "step": 1140, "train_speed(iter/s)": 0.105761 }, { "acc": 0.71791921, "epoch": 0.02904621550960095, "grad_norm": 3.34375, "learning_rate": 2.9046169457128363e-06, "loss": 1.12417316, "memory(GiB)": 716.08, "step": 1145, "train_speed(iter/s)": 0.105574 }, { "acc": 0.72393527, "epoch": 0.029173054878638506, "grad_norm": 4.125, "learning_rate": 2.917300862506342e-06, "loss": 1.11965208, "memory(GiB)": 716.08, "step": 1150, "train_speed(iter/s)": 0.105426 }, { "acc": 0.73083835, "epoch": 0.029299894247676064, "grad_norm": 4.125, "learning_rate": 2.9299847792998482e-06, "loss": 1.13393221, "memory(GiB)": 716.08, "step": 1155, "train_speed(iter/s)": 0.105289 }, { "acc": 0.73041697, "epoch": 0.029426733616713625, "grad_norm": 4.15625, "learning_rate": 2.9426686960933538e-06, "loss": 1.15275049, "memory(GiB)": 716.08, "step": 1160, "train_speed(iter/s)": 0.105149 }, { "acc": 0.72296844, "epoch": 0.029553572985751182, "grad_norm": 4.21875, "learning_rate": 2.9553526128868598e-06, "loss": 1.11409264, "memory(GiB)": 716.08, "step": 1165, "train_speed(iter/s)": 0.104921 }, { "acc": 0.71729097, "epoch": 0.02968041235478874, "grad_norm": 3.734375, "learning_rate": 2.9680365296803653e-06, "loss": 1.1603075, "memory(GiB)": 716.08, "step": 1170, "train_speed(iter/s)": 0.104752 }, { "acc": 0.72649264, "epoch": 0.0298072517238263, "grad_norm": 4.6875, "learning_rate": 2.9807204464738717e-06, "loss": 1.13994551, "memory(GiB)": 716.08, "step": 1175, "train_speed(iter/s)": 0.104554 }, { "acc": 0.73480854, "epoch": 0.02993409109286386, "grad_norm": 3.484375, "learning_rate": 2.9934043632673772e-06, "loss": 1.06608372, "memory(GiB)": 716.08, "step": 1180, "train_speed(iter/s)": 0.104365 }, { "acc": 0.71536102, "epoch": 0.030060930461901416, "grad_norm": 3.6875, "learning_rate": 3.006088280060883e-06, "loss": 1.12518406, "memory(GiB)": 716.09, "step": 1185, "train_speed(iter/s)": 0.104157 }, { "acc": 0.71552415, "epoch": 0.030187769830938977, "grad_norm": 4.375, "learning_rate": 3.0187721968543887e-06, "loss": 1.1488308, "memory(GiB)": 716.09, "step": 1190, "train_speed(iter/s)": 0.103957 }, { "acc": 0.71360922, "epoch": 0.030314609199976535, "grad_norm": 4.25, "learning_rate": 3.0314561136478947e-06, "loss": 1.18062773, "memory(GiB)": 716.09, "step": 1195, "train_speed(iter/s)": 0.103765 }, { "acc": 0.74917727, "epoch": 0.030441448569014092, "grad_norm": 4.09375, "learning_rate": 3.0441400304414002e-06, "loss": 1.07541142, "memory(GiB)": 716.09, "step": 1200, "train_speed(iter/s)": 0.103609 }, { "acc": 0.71424084, "epoch": 0.030568287938051653, "grad_norm": 4.25, "learning_rate": 3.0568239472349066e-06, "loss": 1.19573317, "memory(GiB)": 716.09, "step": 1205, "train_speed(iter/s)": 0.103466 }, { "acc": 0.71354327, "epoch": 0.03069512730708921, "grad_norm": 4.90625, "learning_rate": 3.069507864028412e-06, "loss": 1.1080409, "memory(GiB)": 716.09, "step": 1210, "train_speed(iter/s)": 0.103274 }, { "acc": 0.7261847, "epoch": 0.03082196667612677, "grad_norm": 3.671875, "learning_rate": 3.082191780821918e-06, "loss": 1.14090605, "memory(GiB)": 716.09, "step": 1215, "train_speed(iter/s)": 0.103098 }, { "acc": 0.72223349, "epoch": 0.03094880604516433, "grad_norm": 4.4375, "learning_rate": 3.0948756976154237e-06, "loss": 1.12770157, "memory(GiB)": 716.09, "step": 1220, "train_speed(iter/s)": 0.102913 }, { "acc": 0.73490996, "epoch": 0.031075645414201887, "grad_norm": 3.828125, "learning_rate": 3.1075596144089297e-06, "loss": 1.09932175, "memory(GiB)": 716.09, "step": 1225, "train_speed(iter/s)": 0.102752 }, { "acc": 0.73099222, "epoch": 0.031202484783239445, "grad_norm": 3.796875, "learning_rate": 3.120243531202435e-06, "loss": 1.1075407, "memory(GiB)": 716.09, "step": 1230, "train_speed(iter/s)": 0.102574 }, { "acc": 0.7195909, "epoch": 0.031329324152277, "grad_norm": 4.5, "learning_rate": 3.1329274479959416e-06, "loss": 1.16610346, "memory(GiB)": 716.09, "step": 1235, "train_speed(iter/s)": 0.102423 }, { "acc": 0.72939353, "epoch": 0.03145616352131456, "grad_norm": 4.28125, "learning_rate": 3.145611364789447e-06, "loss": 1.14630384, "memory(GiB)": 716.09, "step": 1240, "train_speed(iter/s)": 0.10225 }, { "acc": 0.72962475, "epoch": 0.031583002890352124, "grad_norm": 5.34375, "learning_rate": 3.158295281582953e-06, "loss": 1.19253006, "memory(GiB)": 716.09, "step": 1245, "train_speed(iter/s)": 0.102085 }, { "acc": 0.72520251, "epoch": 0.03170984225938968, "grad_norm": 3.5625, "learning_rate": 3.1709791983764586e-06, "loss": 1.10104227, "memory(GiB)": 716.09, "step": 1250, "train_speed(iter/s)": 0.101931 }, { "acc": 0.72946897, "epoch": 0.03183668162842724, "grad_norm": 3.6875, "learning_rate": 3.183663115169965e-06, "loss": 1.16023941, "memory(GiB)": 716.09, "step": 1255, "train_speed(iter/s)": 0.101795 }, { "acc": 0.73677502, "epoch": 0.0319635209974648, "grad_norm": 3.203125, "learning_rate": 3.1963470319634706e-06, "loss": 1.11934443, "memory(GiB)": 716.09, "step": 1260, "train_speed(iter/s)": 0.101639 }, { "acc": 0.72850318, "epoch": 0.032090360366502355, "grad_norm": 3.359375, "learning_rate": 3.2090309487569765e-06, "loss": 1.13090591, "memory(GiB)": 716.09, "step": 1265, "train_speed(iter/s)": 0.101496 }, { "acc": 0.7251842, "epoch": 0.032217199735539916, "grad_norm": 4.21875, "learning_rate": 3.221714865550482e-06, "loss": 1.10692616, "memory(GiB)": 716.09, "step": 1270, "train_speed(iter/s)": 0.101307 }, { "acc": 0.71733479, "epoch": 0.03234403910457748, "grad_norm": 3.9375, "learning_rate": 3.234398782343988e-06, "loss": 1.1800293, "memory(GiB)": 716.09, "step": 1275, "train_speed(iter/s)": 0.101104 }, { "acc": 0.7245748, "epoch": 0.03247087847361503, "grad_norm": 3.78125, "learning_rate": 3.2470826991374936e-06, "loss": 1.10834522, "memory(GiB)": 716.09, "step": 1280, "train_speed(iter/s)": 0.101003 }, { "acc": 0.72692299, "epoch": 0.03259771784265259, "grad_norm": 3.921875, "learning_rate": 3.259766615931e-06, "loss": 1.11341286, "memory(GiB)": 716.09, "step": 1285, "train_speed(iter/s)": 0.100864 }, { "acc": 0.72953873, "epoch": 0.03272455721169015, "grad_norm": 3.6875, "learning_rate": 3.2724505327245055e-06, "loss": 1.11483431, "memory(GiB)": 716.09, "step": 1290, "train_speed(iter/s)": 0.100709 }, { "acc": 0.71973414, "epoch": 0.03285139658072771, "grad_norm": 3.984375, "learning_rate": 3.2851344495180115e-06, "loss": 1.15957565, "memory(GiB)": 716.09, "step": 1295, "train_speed(iter/s)": 0.100528 }, { "acc": 0.72765436, "epoch": 0.03297823594976527, "grad_norm": 3.546875, "learning_rate": 3.297818366311517e-06, "loss": 1.1660078, "memory(GiB)": 716.09, "step": 1300, "train_speed(iter/s)": 0.100342 }, { "acc": 0.72894225, "epoch": 0.03310507531880283, "grad_norm": 5.75, "learning_rate": 3.310502283105023e-06, "loss": 1.11197119, "memory(GiB)": 716.09, "step": 1305, "train_speed(iter/s)": 0.100215 }, { "acc": 0.7269526, "epoch": 0.03323191468784038, "grad_norm": 3.875, "learning_rate": 3.3231861998985286e-06, "loss": 1.10187292, "memory(GiB)": 716.09, "step": 1310, "train_speed(iter/s)": 0.100114 }, { "acc": 0.74278431, "epoch": 0.033358754056877944, "grad_norm": 4.59375, "learning_rate": 3.335870116692035e-06, "loss": 1.04058161, "memory(GiB)": 716.09, "step": 1315, "train_speed(iter/s)": 0.100021 }, { "acc": 0.72256579, "epoch": 0.033485593425915505, "grad_norm": 4.375, "learning_rate": 3.3485540334855405e-06, "loss": 1.14832373, "memory(GiB)": 716.09, "step": 1320, "train_speed(iter/s)": 0.099887 }, { "acc": 0.74053731, "epoch": 0.03361243279495306, "grad_norm": 4.71875, "learning_rate": 3.3612379502790465e-06, "loss": 1.0542223, "memory(GiB)": 716.09, "step": 1325, "train_speed(iter/s)": 0.099774 }, { "acc": 0.73818202, "epoch": 0.03373927216399062, "grad_norm": 4.8125, "learning_rate": 3.373921867072552e-06, "loss": 1.1176693, "memory(GiB)": 718.79, "step": 1330, "train_speed(iter/s)": 0.099648 }, { "acc": 0.72819867, "epoch": 0.03386611153302818, "grad_norm": 3.734375, "learning_rate": 3.3866057838660584e-06, "loss": 1.15310555, "memory(GiB)": 718.79, "step": 1335, "train_speed(iter/s)": 0.099511 }, { "acc": 0.72802615, "epoch": 0.033992950902065736, "grad_norm": 4.46875, "learning_rate": 3.399289700659564e-06, "loss": 1.09178839, "memory(GiB)": 718.79, "step": 1340, "train_speed(iter/s)": 0.099377 }, { "acc": 0.73928475, "epoch": 0.0341197902711033, "grad_norm": 4.46875, "learning_rate": 3.41197361745307e-06, "loss": 1.08111343, "memory(GiB)": 718.79, "step": 1345, "train_speed(iter/s)": 0.099233 }, { "acc": 0.72034421, "epoch": 0.03424662964014086, "grad_norm": 3.8125, "learning_rate": 3.4246575342465754e-06, "loss": 1.10428448, "memory(GiB)": 718.79, "step": 1350, "train_speed(iter/s)": 0.099076 }, { "acc": 0.74723248, "epoch": 0.03437346900917841, "grad_norm": 7.96875, "learning_rate": 3.4373414510400814e-06, "loss": 1.04754629, "memory(GiB)": 718.79, "step": 1355, "train_speed(iter/s)": 0.098976 }, { "acc": 0.72803183, "epoch": 0.03450030837821597, "grad_norm": 4.1875, "learning_rate": 3.450025367833587e-06, "loss": 1.08613291, "memory(GiB)": 718.79, "step": 1360, "train_speed(iter/s)": 0.098835 }, { "acc": 0.7227026, "epoch": 0.034627147747253534, "grad_norm": 3.84375, "learning_rate": 3.4627092846270933e-06, "loss": 1.10562115, "memory(GiB)": 718.79, "step": 1365, "train_speed(iter/s)": 0.098728 }, { "acc": 0.7225667, "epoch": 0.03475398711629109, "grad_norm": 4.1875, "learning_rate": 3.475393201420599e-06, "loss": 1.13411131, "memory(GiB)": 718.79, "step": 1370, "train_speed(iter/s)": 0.09862 }, { "acc": 0.72657218, "epoch": 0.03488082648532865, "grad_norm": 3.84375, "learning_rate": 3.488077118214105e-06, "loss": 1.11886721, "memory(GiB)": 718.79, "step": 1375, "train_speed(iter/s)": 0.098496 }, { "acc": 0.73108273, "epoch": 0.03500766585436621, "grad_norm": 3.59375, "learning_rate": 3.5007610350076104e-06, "loss": 1.10787992, "memory(GiB)": 718.79, "step": 1380, "train_speed(iter/s)": 0.098383 }, { "acc": 0.73188643, "epoch": 0.035134505223403764, "grad_norm": 3.96875, "learning_rate": 3.5134449518011164e-06, "loss": 1.09342623, "memory(GiB)": 718.79, "step": 1385, "train_speed(iter/s)": 0.098292 }, { "acc": 0.72065315, "epoch": 0.035261344592441325, "grad_norm": 3.84375, "learning_rate": 3.5261288685946223e-06, "loss": 1.12419987, "memory(GiB)": 718.79, "step": 1390, "train_speed(iter/s)": 0.09818 }, { "acc": 0.71385336, "epoch": 0.035388183961478886, "grad_norm": 3.78125, "learning_rate": 3.5388127853881283e-06, "loss": 1.17531729, "memory(GiB)": 718.79, "step": 1395, "train_speed(iter/s)": 0.098025 }, { "acc": 0.72281761, "epoch": 0.03551502333051644, "grad_norm": 3.6875, "learning_rate": 3.551496702181634e-06, "loss": 1.07453766, "memory(GiB)": 718.79, "step": 1400, "train_speed(iter/s)": 0.097941 }, { "acc": 0.72399907, "epoch": 0.035641862699554, "grad_norm": 5.0, "learning_rate": 3.56418061897514e-06, "loss": 1.17504416, "memory(GiB)": 718.79, "step": 1405, "train_speed(iter/s)": 0.097851 }, { "acc": 0.72053843, "epoch": 0.03576870206859156, "grad_norm": 4.03125, "learning_rate": 3.5768645357686453e-06, "loss": 1.19518795, "memory(GiB)": 718.79, "step": 1410, "train_speed(iter/s)": 0.097778 }, { "acc": 0.73095679, "epoch": 0.03589554143762912, "grad_norm": 3.765625, "learning_rate": 3.5895484525621517e-06, "loss": 1.13578186, "memory(GiB)": 718.79, "step": 1415, "train_speed(iter/s)": 0.097646 }, { "acc": 0.74252267, "epoch": 0.03602238080666668, "grad_norm": 4.125, "learning_rate": 3.6022323693556573e-06, "loss": 1.0720561, "memory(GiB)": 718.79, "step": 1420, "train_speed(iter/s)": 0.097491 }, { "acc": 0.72270479, "epoch": 0.03614922017570424, "grad_norm": 4.03125, "learning_rate": 3.6149162861491632e-06, "loss": 1.12422981, "memory(GiB)": 718.79, "step": 1425, "train_speed(iter/s)": 0.097393 }, { "acc": 0.74172378, "epoch": 0.03627605954474179, "grad_norm": 7.875, "learning_rate": 3.6276002029426688e-06, "loss": 1.049687, "memory(GiB)": 718.79, "step": 1430, "train_speed(iter/s)": 0.09725 }, { "acc": 0.71376433, "epoch": 0.036402898913779354, "grad_norm": 3.78125, "learning_rate": 3.6402841197361748e-06, "loss": 1.12558098, "memory(GiB)": 718.79, "step": 1435, "train_speed(iter/s)": 0.097148 }, { "acc": 0.72436094, "epoch": 0.036529738282816915, "grad_norm": 4.5, "learning_rate": 3.6529680365296803e-06, "loss": 1.14405594, "memory(GiB)": 718.79, "step": 1440, "train_speed(iter/s)": 0.097077 }, { "acc": 0.74326386, "epoch": 0.03665657765185447, "grad_norm": 3.65625, "learning_rate": 3.6656519533231867e-06, "loss": 1.09956245, "memory(GiB)": 718.79, "step": 1445, "train_speed(iter/s)": 0.096944 }, { "acc": 0.72299342, "epoch": 0.03678341702089203, "grad_norm": 3.671875, "learning_rate": 3.6783358701166922e-06, "loss": 1.16495848, "memory(GiB)": 718.79, "step": 1450, "train_speed(iter/s)": 0.096857 }, { "acc": 0.73660994, "epoch": 0.03691025638992959, "grad_norm": 3.546875, "learning_rate": 3.691019786910198e-06, "loss": 1.1454978, "memory(GiB)": 718.79, "step": 1455, "train_speed(iter/s)": 0.096751 }, { "acc": 0.74425159, "epoch": 0.037037095758967145, "grad_norm": 3.90625, "learning_rate": 3.7037037037037037e-06, "loss": 1.02513485, "memory(GiB)": 718.79, "step": 1460, "train_speed(iter/s)": 0.09666 }, { "acc": 0.72722569, "epoch": 0.037163935128004706, "grad_norm": 3.96875, "learning_rate": 3.7163876204972097e-06, "loss": 1.11825933, "memory(GiB)": 718.79, "step": 1465, "train_speed(iter/s)": 0.096518 }, { "acc": 0.72045307, "epoch": 0.03729077449704227, "grad_norm": 3.3125, "learning_rate": 3.7290715372907157e-06, "loss": 1.11089067, "memory(GiB)": 718.79, "step": 1470, "train_speed(iter/s)": 0.096393 }, { "acc": 0.72192326, "epoch": 0.03741761386607982, "grad_norm": 4.15625, "learning_rate": 3.7417554540842216e-06, "loss": 1.0924159, "memory(GiB)": 718.79, "step": 1475, "train_speed(iter/s)": 0.09632 }, { "acc": 0.741365, "epoch": 0.03754445323511738, "grad_norm": 3.5625, "learning_rate": 3.754439370877727e-06, "loss": 1.09033852, "memory(GiB)": 718.79, "step": 1480, "train_speed(iter/s)": 0.09622 }, { "acc": 0.72691522, "epoch": 0.03767129260415494, "grad_norm": 3.96875, "learning_rate": 3.767123287671233e-06, "loss": 1.0580699, "memory(GiB)": 718.79, "step": 1485, "train_speed(iter/s)": 0.09608 }, { "acc": 0.71948462, "epoch": 0.0377981319731925, "grad_norm": 4.125, "learning_rate": 3.7798072044647387e-06, "loss": 1.13767204, "memory(GiB)": 718.79, "step": 1490, "train_speed(iter/s)": 0.095975 }, { "acc": 0.70682645, "epoch": 0.03792497134223006, "grad_norm": 6.09375, "learning_rate": 3.792491121258245e-06, "loss": 1.20163145, "memory(GiB)": 718.79, "step": 1495, "train_speed(iter/s)": 0.095875 }, { "acc": 0.71535783, "epoch": 0.03805181071126762, "grad_norm": 3.65625, "learning_rate": 3.8051750380517506e-06, "loss": 1.15468054, "memory(GiB)": 718.79, "step": 1500, "train_speed(iter/s)": 0.095784 }, { "epoch": 0.03805181071126762, "eval_acc": 0.7180045595124154, "eval_loss": 1.0711435079574585, "eval_runtime": 1150.9869, "eval_samples_per_second": 5.534, "eval_steps_per_second": 5.534, "step": 1500 }, { "acc": 0.7246347, "epoch": 0.038178650080305174, "grad_norm": 3.40625, "learning_rate": 3.817858954845256e-06, "loss": 1.10216627, "memory(GiB)": 718.79, "step": 1505, "train_speed(iter/s)": 0.08583 }, { "acc": 0.72538137, "epoch": 0.038305489449342735, "grad_norm": 3.953125, "learning_rate": 3.830542871638762e-06, "loss": 1.10698538, "memory(GiB)": 718.79, "step": 1510, "train_speed(iter/s)": 0.085787 }, { "acc": 0.7315701, "epoch": 0.038432328818380296, "grad_norm": 5.71875, "learning_rate": 3.843226788432268e-06, "loss": 1.06786518, "memory(GiB)": 718.79, "step": 1515, "train_speed(iter/s)": 0.085701 }, { "acc": 0.73528957, "epoch": 0.03855916818741785, "grad_norm": 4.15625, "learning_rate": 3.855910705225774e-06, "loss": 1.10022621, "memory(GiB)": 718.79, "step": 1520, "train_speed(iter/s)": 0.085674 }, { "acc": 0.72468395, "epoch": 0.03868600755645541, "grad_norm": 3.625, "learning_rate": 3.86859462201928e-06, "loss": 1.14012966, "memory(GiB)": 718.79, "step": 1525, "train_speed(iter/s)": 0.085618 }, { "acc": 0.72148342, "epoch": 0.03881284692549297, "grad_norm": 3.6875, "learning_rate": 3.881278538812785e-06, "loss": 1.13102236, "memory(GiB)": 718.79, "step": 1530, "train_speed(iter/s)": 0.085572 }, { "acc": 0.73412213, "epoch": 0.038939686294530526, "grad_norm": 3.546875, "learning_rate": 3.893962455606292e-06, "loss": 1.08934793, "memory(GiB)": 718.79, "step": 1535, "train_speed(iter/s)": 0.085514 }, { "acc": 0.74264526, "epoch": 0.03906652566356809, "grad_norm": 4.625, "learning_rate": 3.906646372399797e-06, "loss": 1.04649611, "memory(GiB)": 718.79, "step": 1540, "train_speed(iter/s)": 0.085466 }, { "acc": 0.71356602, "epoch": 0.03919336503260565, "grad_norm": 4.28125, "learning_rate": 3.919330289193303e-06, "loss": 1.14192724, "memory(GiB)": 718.79, "step": 1545, "train_speed(iter/s)": 0.085419 }, { "acc": 0.71243205, "epoch": 0.0393202044016432, "grad_norm": 4.15625, "learning_rate": 3.932014205986809e-06, "loss": 1.11165705, "memory(GiB)": 718.79, "step": 1550, "train_speed(iter/s)": 0.085414 }, { "acc": 0.74409561, "epoch": 0.03944704377068076, "grad_norm": 4.3125, "learning_rate": 3.944698122780315e-06, "loss": 1.09080343, "memory(GiB)": 718.79, "step": 1555, "train_speed(iter/s)": 0.085381 }, { "acc": 0.72377443, "epoch": 0.039573883139718324, "grad_norm": 5.3125, "learning_rate": 3.95738203957382e-06, "loss": 1.14687262, "memory(GiB)": 718.79, "step": 1560, "train_speed(iter/s)": 0.085342 }, { "acc": 0.73633537, "epoch": 0.03970072250875588, "grad_norm": 4.375, "learning_rate": 3.970065956367327e-06, "loss": 1.11694841, "memory(GiB)": 718.79, "step": 1565, "train_speed(iter/s)": 0.085304 }, { "acc": 0.72283545, "epoch": 0.03982756187779344, "grad_norm": 3.671875, "learning_rate": 3.982749873160832e-06, "loss": 1.12590218, "memory(GiB)": 718.79, "step": 1570, "train_speed(iter/s)": 0.085286 }, { "acc": 0.7166234, "epoch": 0.039954401246831, "grad_norm": 4.875, "learning_rate": 3.995433789954338e-06, "loss": 1.1604516, "memory(GiB)": 718.79, "step": 1575, "train_speed(iter/s)": 0.085275 }, { "acc": 0.72920585, "epoch": 0.040081240615868555, "grad_norm": 3.296875, "learning_rate": 4.008117706747844e-06, "loss": 1.12943373, "memory(GiB)": 718.79, "step": 1580, "train_speed(iter/s)": 0.085214 }, { "acc": 0.73364539, "epoch": 0.040208079984906116, "grad_norm": 4.5, "learning_rate": 4.02080162354135e-06, "loss": 1.09394379, "memory(GiB)": 718.79, "step": 1585, "train_speed(iter/s)": 0.085208 }, { "acc": 0.73656225, "epoch": 0.04033491935394368, "grad_norm": 4.15625, "learning_rate": 4.033485540334856e-06, "loss": 1.09071598, "memory(GiB)": 718.79, "step": 1590, "train_speed(iter/s)": 0.085174 }, { "acc": 0.72590566, "epoch": 0.04046175872298123, "grad_norm": 4.0, "learning_rate": 4.046169457128362e-06, "loss": 1.1107914, "memory(GiB)": 718.79, "step": 1595, "train_speed(iter/s)": 0.085122 }, { "acc": 0.71310315, "epoch": 0.04058859809201879, "grad_norm": 3.765625, "learning_rate": 4.058853373921867e-06, "loss": 1.15837984, "memory(GiB)": 718.79, "step": 1600, "train_speed(iter/s)": 0.08511 }, { "acc": 0.74315119, "epoch": 0.04071543746105635, "grad_norm": 3.484375, "learning_rate": 4.071537290715373e-06, "loss": 1.02837381, "memory(GiB)": 718.79, "step": 1605, "train_speed(iter/s)": 0.085071 }, { "acc": 0.72865486, "epoch": 0.04084227683009391, "grad_norm": 5.125, "learning_rate": 4.084221207508879e-06, "loss": 1.09325609, "memory(GiB)": 718.79, "step": 1610, "train_speed(iter/s)": 0.085013 }, { "acc": 0.72813187, "epoch": 0.04096911619913147, "grad_norm": 4.0, "learning_rate": 4.096905124302385e-06, "loss": 1.13726902, "memory(GiB)": 718.79, "step": 1615, "train_speed(iter/s)": 0.084996 }, { "acc": 0.73846812, "epoch": 0.04109595556816903, "grad_norm": 3.625, "learning_rate": 4.109589041095891e-06, "loss": 1.05812473, "memory(GiB)": 718.79, "step": 1620, "train_speed(iter/s)": 0.084936 }, { "acc": 0.72395558, "epoch": 0.04122279493720658, "grad_norm": 4.15625, "learning_rate": 4.122272957889397e-06, "loss": 1.18141155, "memory(GiB)": 718.79, "step": 1625, "train_speed(iter/s)": 0.084887 }, { "acc": 0.72486963, "epoch": 0.041349634306244144, "grad_norm": 5.375, "learning_rate": 4.134956874682902e-06, "loss": 1.10160503, "memory(GiB)": 718.79, "step": 1630, "train_speed(iter/s)": 0.084858 }, { "acc": 0.73921947, "epoch": 0.041476473675281705, "grad_norm": 3.421875, "learning_rate": 4.147640791476408e-06, "loss": 1.06409588, "memory(GiB)": 718.79, "step": 1635, "train_speed(iter/s)": 0.084817 }, { "acc": 0.72238622, "epoch": 0.04160331304431926, "grad_norm": 3.53125, "learning_rate": 4.160324708269914e-06, "loss": 1.15455198, "memory(GiB)": 718.79, "step": 1640, "train_speed(iter/s)": 0.08478 }, { "acc": 0.74868407, "epoch": 0.04173015241335682, "grad_norm": 3.234375, "learning_rate": 4.17300862506342e-06, "loss": 1.03081617, "memory(GiB)": 718.79, "step": 1645, "train_speed(iter/s)": 0.084729 }, { "acc": 0.73257599, "epoch": 0.04185699178239438, "grad_norm": 4.15625, "learning_rate": 4.185692541856926e-06, "loss": 1.04626379, "memory(GiB)": 718.79, "step": 1650, "train_speed(iter/s)": 0.084669 }, { "acc": 0.73046741, "epoch": 0.041983831151431936, "grad_norm": 5.625, "learning_rate": 4.198376458650432e-06, "loss": 1.08477097, "memory(GiB)": 718.79, "step": 1655, "train_speed(iter/s)": 0.084641 }, { "acc": 0.74434614, "epoch": 0.0421106705204695, "grad_norm": 4.96875, "learning_rate": 4.211060375443937e-06, "loss": 1.04569769, "memory(GiB)": 718.79, "step": 1660, "train_speed(iter/s)": 0.084618 }, { "acc": 0.71682549, "epoch": 0.04223750988950706, "grad_norm": 3.53125, "learning_rate": 4.223744292237444e-06, "loss": 1.18480291, "memory(GiB)": 718.79, "step": 1665, "train_speed(iter/s)": 0.084603 }, { "acc": 0.72557116, "epoch": 0.04236434925854461, "grad_norm": 3.765625, "learning_rate": 4.236428209030949e-06, "loss": 1.12748051, "memory(GiB)": 718.79, "step": 1670, "train_speed(iter/s)": 0.084562 }, { "acc": 0.71331043, "epoch": 0.04249118862758217, "grad_norm": 3.953125, "learning_rate": 4.249112125824455e-06, "loss": 1.13741865, "memory(GiB)": 718.79, "step": 1675, "train_speed(iter/s)": 0.084536 }, { "acc": 0.72983394, "epoch": 0.042618027996619734, "grad_norm": 51.5, "learning_rate": 4.261796042617961e-06, "loss": 1.08446884, "memory(GiB)": 718.79, "step": 1680, "train_speed(iter/s)": 0.084498 }, { "acc": 0.74365244, "epoch": 0.04274486736565729, "grad_norm": 3.875, "learning_rate": 4.274479959411467e-06, "loss": 1.05496893, "memory(GiB)": 718.79, "step": 1685, "train_speed(iter/s)": 0.084481 }, { "acc": 0.74298811, "epoch": 0.04287170673469485, "grad_norm": 3.609375, "learning_rate": 4.287163876204972e-06, "loss": 1.01561308, "memory(GiB)": 718.79, "step": 1690, "train_speed(iter/s)": 0.084464 }, { "acc": 0.73061461, "epoch": 0.04299854610373241, "grad_norm": 3.734375, "learning_rate": 4.299847792998479e-06, "loss": 1.03858109, "memory(GiB)": 718.79, "step": 1695, "train_speed(iter/s)": 0.084461 }, { "acc": 0.72672672, "epoch": 0.043125385472769964, "grad_norm": 3.96875, "learning_rate": 4.312531709791984e-06, "loss": 1.10237541, "memory(GiB)": 718.79, "step": 1700, "train_speed(iter/s)": 0.084428 }, { "acc": 0.7149848, "epoch": 0.043252224841807525, "grad_norm": 5.03125, "learning_rate": 4.32521562658549e-06, "loss": 1.09635391, "memory(GiB)": 718.79, "step": 1705, "train_speed(iter/s)": 0.084401 }, { "acc": 0.73536196, "epoch": 0.043379064210845086, "grad_norm": 15.375, "learning_rate": 4.337899543378996e-06, "loss": 1.09021912, "memory(GiB)": 718.79, "step": 1710, "train_speed(iter/s)": 0.084385 }, { "acc": 0.71080928, "epoch": 0.04350590357988264, "grad_norm": 3.4375, "learning_rate": 4.350583460172502e-06, "loss": 1.15741005, "memory(GiB)": 718.79, "step": 1715, "train_speed(iter/s)": 0.084359 }, { "acc": 0.74639778, "epoch": 0.0436327429489202, "grad_norm": 4.1875, "learning_rate": 4.363267376966007e-06, "loss": 1.02539768, "memory(GiB)": 718.79, "step": 1720, "train_speed(iter/s)": 0.084341 }, { "acc": 0.72667909, "epoch": 0.04375958231795776, "grad_norm": 4.09375, "learning_rate": 4.375951293759514e-06, "loss": 1.13467817, "memory(GiB)": 718.79, "step": 1725, "train_speed(iter/s)": 0.084318 }, { "acc": 0.73781705, "epoch": 0.04388642168699532, "grad_norm": 5.03125, "learning_rate": 4.388635210553019e-06, "loss": 1.07571278, "memory(GiB)": 718.79, "step": 1730, "train_speed(iter/s)": 0.084282 }, { "acc": 0.73761697, "epoch": 0.04401326105603288, "grad_norm": 3.625, "learning_rate": 4.401319127346525e-06, "loss": 1.08938627, "memory(GiB)": 718.79, "step": 1735, "train_speed(iter/s)": 0.084225 }, { "acc": 0.73465357, "epoch": 0.04414010042507044, "grad_norm": 6.78125, "learning_rate": 4.414003044140031e-06, "loss": 1.06743126, "memory(GiB)": 718.79, "step": 1740, "train_speed(iter/s)": 0.08419 }, { "acc": 0.73911104, "epoch": 0.04426693979410799, "grad_norm": 4.65625, "learning_rate": 4.426686960933537e-06, "loss": 1.07912788, "memory(GiB)": 718.79, "step": 1745, "train_speed(iter/s)": 0.084122 }, { "acc": 0.72774343, "epoch": 0.044393779163145554, "grad_norm": 3.15625, "learning_rate": 4.439370877727043e-06, "loss": 1.12549067, "memory(GiB)": 718.79, "step": 1750, "train_speed(iter/s)": 0.084115 }, { "acc": 0.72437081, "epoch": 0.044520618532183115, "grad_norm": 4.375, "learning_rate": 4.4520547945205486e-06, "loss": 1.08264847, "memory(GiB)": 718.79, "step": 1755, "train_speed(iter/s)": 0.08408 }, { "acc": 0.74658637, "epoch": 0.04464745790122067, "grad_norm": 4.59375, "learning_rate": 4.464738711314054e-06, "loss": 1.04440289, "memory(GiB)": 718.79, "step": 1760, "train_speed(iter/s)": 0.084031 }, { "acc": 0.73288293, "epoch": 0.04477429727025823, "grad_norm": 3.84375, "learning_rate": 4.47742262810756e-06, "loss": 1.10686588, "memory(GiB)": 718.79, "step": 1765, "train_speed(iter/s)": 0.084008 }, { "acc": 0.73172789, "epoch": 0.04490113663929579, "grad_norm": 3.46875, "learning_rate": 4.490106544901066e-06, "loss": 1.03324766, "memory(GiB)": 718.79, "step": 1770, "train_speed(iter/s)": 0.083967 }, { "acc": 0.71975641, "epoch": 0.045027976008333345, "grad_norm": 4.4375, "learning_rate": 4.502790461694572e-06, "loss": 1.13407097, "memory(GiB)": 718.79, "step": 1775, "train_speed(iter/s)": 0.083936 }, { "acc": 0.73276114, "epoch": 0.045154815377370906, "grad_norm": 4.1875, "learning_rate": 4.5154743784880776e-06, "loss": 1.12307205, "memory(GiB)": 718.79, "step": 1780, "train_speed(iter/s)": 0.083926 }, { "acc": 0.73779793, "epoch": 0.04528165474640847, "grad_norm": 3.4375, "learning_rate": 4.5281582952815835e-06, "loss": 1.09218092, "memory(GiB)": 718.79, "step": 1785, "train_speed(iter/s)": 0.083867 }, { "acc": 0.73650222, "epoch": 0.04540849411544602, "grad_norm": 3.8125, "learning_rate": 4.540842212075089e-06, "loss": 1.06910944, "memory(GiB)": 718.79, "step": 1790, "train_speed(iter/s)": 0.083823 }, { "acc": 0.73312602, "epoch": 0.04553533348448358, "grad_norm": 4.28125, "learning_rate": 4.553526128868595e-06, "loss": 1.08117952, "memory(GiB)": 718.79, "step": 1795, "train_speed(iter/s)": 0.083806 }, { "acc": 0.7252749, "epoch": 0.045662172853521144, "grad_norm": 3.03125, "learning_rate": 4.566210045662101e-06, "loss": 1.09482632, "memory(GiB)": 718.79, "step": 1800, "train_speed(iter/s)": 0.08375 }, { "acc": 0.75420914, "epoch": 0.0457890122225587, "grad_norm": 4.125, "learning_rate": 4.5788939624556065e-06, "loss": 0.98805285, "memory(GiB)": 718.79, "step": 1805, "train_speed(iter/s)": 0.08372 }, { "acc": 0.73276443, "epoch": 0.04591585159159626, "grad_norm": 3.921875, "learning_rate": 4.5915778792491125e-06, "loss": 1.10571651, "memory(GiB)": 718.79, "step": 1810, "train_speed(iter/s)": 0.083678 }, { "acc": 0.70546103, "epoch": 0.04604269096063382, "grad_norm": 5.0, "learning_rate": 4.6042617960426185e-06, "loss": 1.1994751, "memory(GiB)": 718.79, "step": 1815, "train_speed(iter/s)": 0.083681 }, { "acc": 0.72367401, "epoch": 0.046169530329671374, "grad_norm": 4.3125, "learning_rate": 4.616945712836124e-06, "loss": 1.0912178, "memory(GiB)": 718.79, "step": 1820, "train_speed(iter/s)": 0.083675 }, { "acc": 0.73504739, "epoch": 0.046296369698708935, "grad_norm": 4.15625, "learning_rate": 4.62962962962963e-06, "loss": 1.12059212, "memory(GiB)": 718.79, "step": 1825, "train_speed(iter/s)": 0.083627 }, { "acc": 0.7331521, "epoch": 0.046423209067746496, "grad_norm": 6.34375, "learning_rate": 4.6423135464231355e-06, "loss": 1.15598421, "memory(GiB)": 718.79, "step": 1830, "train_speed(iter/s)": 0.083607 }, { "acc": 0.74487777, "epoch": 0.04655004843678405, "grad_norm": 5.40625, "learning_rate": 4.6549974632166415e-06, "loss": 1.07054653, "memory(GiB)": 718.79, "step": 1835, "train_speed(iter/s)": 0.083593 }, { "acc": 0.74779167, "epoch": 0.04667688780582161, "grad_norm": 3.765625, "learning_rate": 4.6676813800101475e-06, "loss": 1.02127342, "memory(GiB)": 718.79, "step": 1840, "train_speed(iter/s)": 0.083555 }, { "acc": 0.73100944, "epoch": 0.04680372717485917, "grad_norm": 4.25, "learning_rate": 4.6803652968036534e-06, "loss": 1.10492773, "memory(GiB)": 718.79, "step": 1845, "train_speed(iter/s)": 0.083539 }, { "acc": 0.74110584, "epoch": 0.046930566543896726, "grad_norm": 4.5625, "learning_rate": 4.6930492135971586e-06, "loss": 1.07143106, "memory(GiB)": 718.79, "step": 1850, "train_speed(iter/s)": 0.083494 }, { "acc": 0.73770719, "epoch": 0.04705740591293429, "grad_norm": 4.4375, "learning_rate": 4.705733130390665e-06, "loss": 1.06165476, "memory(GiB)": 718.79, "step": 1855, "train_speed(iter/s)": 0.083464 }, { "acc": 0.71670842, "epoch": 0.04718424528197184, "grad_norm": 3.09375, "learning_rate": 4.7184170471841705e-06, "loss": 1.09230862, "memory(GiB)": 718.79, "step": 1860, "train_speed(iter/s)": 0.08342 }, { "acc": 0.72829442, "epoch": 0.0473110846510094, "grad_norm": 3.765625, "learning_rate": 4.7311009639776765e-06, "loss": 1.09811954, "memory(GiB)": 718.79, "step": 1865, "train_speed(iter/s)": 0.08337 }, { "acc": 0.7211596, "epoch": 0.04743792402004696, "grad_norm": 4.25, "learning_rate": 4.743784880771182e-06, "loss": 1.10092802, "memory(GiB)": 718.79, "step": 1870, "train_speed(iter/s)": 0.08335 }, { "acc": 0.72994251, "epoch": 0.04756476338908452, "grad_norm": 3.734375, "learning_rate": 4.756468797564688e-06, "loss": 1.05655842, "memory(GiB)": 718.79, "step": 1875, "train_speed(iter/s)": 0.083345 }, { "acc": 0.73486295, "epoch": 0.04769160275812208, "grad_norm": 4.0, "learning_rate": 4.769152714358194e-06, "loss": 1.13491545, "memory(GiB)": 718.79, "step": 1880, "train_speed(iter/s)": 0.083304 }, { "acc": 0.73768172, "epoch": 0.04781844212715964, "grad_norm": 3.203125, "learning_rate": 4.7818366311517e-06, "loss": 1.09551935, "memory(GiB)": 718.79, "step": 1885, "train_speed(iter/s)": 0.083264 }, { "acc": 0.73570943, "epoch": 0.047945281496197194, "grad_norm": 3.53125, "learning_rate": 4.7945205479452054e-06, "loss": 1.08589182, "memory(GiB)": 718.79, "step": 1890, "train_speed(iter/s)": 0.083252 }, { "acc": 0.74790888, "epoch": 0.048072120865234755, "grad_norm": 4.03125, "learning_rate": 4.807204464738711e-06, "loss": 1.05189209, "memory(GiB)": 718.79, "step": 1895, "train_speed(iter/s)": 0.083219 }, { "acc": 0.73690128, "epoch": 0.048198960234272316, "grad_norm": 4.78125, "learning_rate": 4.819888381532217e-06, "loss": 1.1037364, "memory(GiB)": 718.79, "step": 1900, "train_speed(iter/s)": 0.083192 }, { "acc": 0.73878284, "epoch": 0.04832579960330987, "grad_norm": 4.9375, "learning_rate": 4.832572298325723e-06, "loss": 1.06208582, "memory(GiB)": 718.79, "step": 1905, "train_speed(iter/s)": 0.083136 }, { "acc": 0.73142195, "epoch": 0.04845263897234743, "grad_norm": 4.625, "learning_rate": 4.845256215119229e-06, "loss": 1.08186474, "memory(GiB)": 718.79, "step": 1910, "train_speed(iter/s)": 0.083093 }, { "acc": 0.73630753, "epoch": 0.04857947834138499, "grad_norm": 3.578125, "learning_rate": 4.857940131912735e-06, "loss": 1.07219143, "memory(GiB)": 718.79, "step": 1915, "train_speed(iter/s)": 0.083054 }, { "acc": 0.74507456, "epoch": 0.048706317710422546, "grad_norm": 3.8125, "learning_rate": 4.87062404870624e-06, "loss": 1.00137577, "memory(GiB)": 718.79, "step": 1920, "train_speed(iter/s)": 0.083008 }, { "acc": 0.71761727, "epoch": 0.04883315707946011, "grad_norm": 4.09375, "learning_rate": 4.883307965499746e-06, "loss": 1.09504614, "memory(GiB)": 718.79, "step": 1925, "train_speed(iter/s)": 0.082986 }, { "acc": 0.72379365, "epoch": 0.04895999644849767, "grad_norm": 4.03125, "learning_rate": 4.895991882293252e-06, "loss": 1.14927769, "memory(GiB)": 718.79, "step": 1930, "train_speed(iter/s)": 0.082967 }, { "acc": 0.73000317, "epoch": 0.04908683581753522, "grad_norm": 4.25, "learning_rate": 4.908675799086758e-06, "loss": 1.03359871, "memory(GiB)": 718.79, "step": 1935, "train_speed(iter/s)": 0.082953 }, { "acc": 0.73239641, "epoch": 0.04921367518657278, "grad_norm": 3.609375, "learning_rate": 4.921359715880264e-06, "loss": 1.09363413, "memory(GiB)": 718.79, "step": 1940, "train_speed(iter/s)": 0.082936 }, { "acc": 0.73723631, "epoch": 0.049340514555610344, "grad_norm": 3.5, "learning_rate": 4.93404363267377e-06, "loss": 1.0690979, "memory(GiB)": 718.79, "step": 1945, "train_speed(iter/s)": 0.082902 }, { "acc": 0.75831723, "epoch": 0.0494673539246479, "grad_norm": 3.015625, "learning_rate": 4.946727549467275e-06, "loss": 1.01846361, "memory(GiB)": 718.79, "step": 1950, "train_speed(iter/s)": 0.082888 }, { "acc": 0.74462595, "epoch": 0.04959419329368546, "grad_norm": 3.640625, "learning_rate": 4.959411466260781e-06, "loss": 1.00498743, "memory(GiB)": 718.79, "step": 1955, "train_speed(iter/s)": 0.082879 }, { "acc": 0.74097805, "epoch": 0.04972103266272302, "grad_norm": 4.28125, "learning_rate": 4.972095383054287e-06, "loss": 1.09568939, "memory(GiB)": 718.79, "step": 1960, "train_speed(iter/s)": 0.082878 }, { "acc": 0.73649545, "epoch": 0.049847872031760575, "grad_norm": 4.15625, "learning_rate": 4.984779299847793e-06, "loss": 1.02083015, "memory(GiB)": 718.79, "step": 1965, "train_speed(iter/s)": 0.082865 }, { "acc": 0.73449421, "epoch": 0.049974711400798136, "grad_norm": 4.9375, "learning_rate": 4.997463216641299e-06, "loss": 1.10768023, "memory(GiB)": 718.79, "step": 1970, "train_speed(iter/s)": 0.082844 }, { "acc": 0.73845868, "epoch": 0.0501015507698357, "grad_norm": 4.3125, "learning_rate": 5.010147133434805e-06, "loss": 1.06854124, "memory(GiB)": 718.79, "step": 1975, "train_speed(iter/s)": 0.082831 }, { "acc": 0.7425838, "epoch": 0.05022839013887325, "grad_norm": 4.375, "learning_rate": 5.02283105022831e-06, "loss": 1.03619995, "memory(GiB)": 718.79, "step": 1980, "train_speed(iter/s)": 0.082808 }, { "acc": 0.71783519, "epoch": 0.05035522950791081, "grad_norm": 4.4375, "learning_rate": 5.035514967021817e-06, "loss": 1.14234018, "memory(GiB)": 718.79, "step": 1985, "train_speed(iter/s)": 0.082781 }, { "acc": 0.73760967, "epoch": 0.05048206887694837, "grad_norm": 4.40625, "learning_rate": 5.048198883815323e-06, "loss": 1.06194096, "memory(GiB)": 718.79, "step": 1990, "train_speed(iter/s)": 0.082763 }, { "acc": 0.72205534, "epoch": 0.05060890824598593, "grad_norm": 4.53125, "learning_rate": 5.060882800608828e-06, "loss": 1.12164936, "memory(GiB)": 718.79, "step": 1995, "train_speed(iter/s)": 0.082746 }, { "acc": 0.73288484, "epoch": 0.05073574761502349, "grad_norm": 3.65625, "learning_rate": 5.073566717402334e-06, "loss": 1.07282648, "memory(GiB)": 718.79, "step": 2000, "train_speed(iter/s)": 0.08274 }, { "epoch": 0.05073574761502349, "eval_acc": 0.7228656999252614, "eval_loss": 1.0433900356292725, "eval_runtime": 1150.1664, "eval_samples_per_second": 5.538, "eval_steps_per_second": 5.538, "step": 2000 }, { "acc": 0.7355454, "epoch": 0.05086258698406105, "grad_norm": 4.40625, "learning_rate": 5.086250634195841e-06, "loss": 1.04103813, "memory(GiB)": 680.93, "step": 2005, "train_speed(iter/s)": 4.461309 }, { "acc": 0.72398758, "epoch": 0.0509894263530986, "grad_norm": 3.734375, "learning_rate": 5.098934550989346e-06, "loss": 1.08804283, "memory(GiB)": 683.64, "step": 2010, "train_speed(iter/s)": 3.949305 }, { "acc": 0.74594688, "epoch": 0.051116265722136164, "grad_norm": 4.84375, "learning_rate": 5.111618467782852e-06, "loss": 1.04766636, "memory(GiB)": 683.64, "step": 2015, "train_speed(iter/s)": 3.490683 }, { "acc": 0.73466501, "epoch": 0.051243105091173725, "grad_norm": 3.53125, "learning_rate": 5.124302384576357e-06, "loss": 1.06179695, "memory(GiB)": 686.41, "step": 2020, "train_speed(iter/s)": 3.136949 }, { "acc": 0.7349719, "epoch": 0.05136994446021128, "grad_norm": 3.6875, "learning_rate": 5.136986301369864e-06, "loss": 1.06118565, "memory(GiB)": 686.41, "step": 2025, "train_speed(iter/s)": 2.841685 }, { "acc": 0.73467398, "epoch": 0.05149678382924884, "grad_norm": 4.6875, "learning_rate": 5.149670218163369e-06, "loss": 1.03996954, "memory(GiB)": 686.43, "step": 2030, "train_speed(iter/s)": 2.618843 }, { "acc": 0.74474463, "epoch": 0.0516236231982864, "grad_norm": 4.65625, "learning_rate": 5.162354134956875e-06, "loss": 1.10850763, "memory(GiB)": 686.43, "step": 2035, "train_speed(iter/s)": 2.42797 }, { "acc": 0.74719334, "epoch": 0.051750462567323956, "grad_norm": 4.5625, "learning_rate": 5.175038051750381e-06, "loss": 1.04095182, "memory(GiB)": 686.43, "step": 2040, "train_speed(iter/s)": 2.254371 }, { "acc": 0.73669128, "epoch": 0.05187730193636152, "grad_norm": 5.0625, "learning_rate": 5.187721968543887e-06, "loss": 1.08177261, "memory(GiB)": 686.43, "step": 2045, "train_speed(iter/s)": 2.115123 }, { "acc": 0.73551726, "epoch": 0.05200414130539908, "grad_norm": 8.875, "learning_rate": 5.200405885337393e-06, "loss": 1.00123072, "memory(GiB)": 698.16, "step": 2050, "train_speed(iter/s)": 1.979157 }, { "acc": 0.73991194, "epoch": 0.05213098067443663, "grad_norm": 4.28125, "learning_rate": 5.213089802130898e-06, "loss": 1.0586195, "memory(GiB)": 698.16, "step": 2055, "train_speed(iter/s)": 1.86477 }, { "acc": 0.74162564, "epoch": 0.05225782004347419, "grad_norm": 3.359375, "learning_rate": 5.225773718924404e-06, "loss": 1.04727993, "memory(GiB)": 698.16, "step": 2060, "train_speed(iter/s)": 1.754588 }, { "acc": 0.71180758, "epoch": 0.052384659412511754, "grad_norm": 4.09375, "learning_rate": 5.238457635717911e-06, "loss": 1.18983078, "memory(GiB)": 698.16, "step": 2065, "train_speed(iter/s)": 1.666914 }, { "acc": 0.7354475, "epoch": 0.05251149878154931, "grad_norm": 4.125, "learning_rate": 5.251141552511416e-06, "loss": 1.08162603, "memory(GiB)": 698.16, "step": 2070, "train_speed(iter/s)": 1.584066 }, { "acc": 0.71916766, "epoch": 0.05263833815058687, "grad_norm": 3.453125, "learning_rate": 5.263825469304922e-06, "loss": 1.12984715, "memory(GiB)": 698.16, "step": 2075, "train_speed(iter/s)": 1.510185 }, { "acc": 0.71549516, "epoch": 0.05276517751962443, "grad_norm": 4.03125, "learning_rate": 5.276509386098427e-06, "loss": 1.13867826, "memory(GiB)": 700.87, "step": 2080, "train_speed(iter/s)": 1.445301 }, { "acc": 0.73147478, "epoch": 0.052892016888661984, "grad_norm": 4.125, "learning_rate": 5.289193302891934e-06, "loss": 1.05333605, "memory(GiB)": 700.87, "step": 2085, "train_speed(iter/s)": 1.380471 }, { "acc": 0.74165578, "epoch": 0.053018856257699545, "grad_norm": 3.265625, "learning_rate": 5.30187721968544e-06, "loss": 1.03108768, "memory(GiB)": 700.87, "step": 2090, "train_speed(iter/s)": 1.327641 }, { "acc": 0.72636805, "epoch": 0.053145695626737106, "grad_norm": 3.671875, "learning_rate": 5.314561136478945e-06, "loss": 1.09756832, "memory(GiB)": 700.87, "step": 2095, "train_speed(iter/s)": 1.274489 }, { "acc": 0.73040671, "epoch": 0.05327253499577466, "grad_norm": 4.875, "learning_rate": 5.327245053272451e-06, "loss": 1.07995081, "memory(GiB)": 700.87, "step": 2100, "train_speed(iter/s)": 1.226829 }, { "acc": 0.75579438, "epoch": 0.05339937436481222, "grad_norm": 3.984375, "learning_rate": 5.339928970065957e-06, "loss": 0.99694605, "memory(GiB)": 700.87, "step": 2105, "train_speed(iter/s)": 1.186683 }, { "acc": 0.74447131, "epoch": 0.05352621373384978, "grad_norm": 4.96875, "learning_rate": 5.352612886859463e-06, "loss": 1.07987146, "memory(GiB)": 700.87, "step": 2110, "train_speed(iter/s)": 1.147424 }, { "acc": 0.73772626, "epoch": 0.05365305310288734, "grad_norm": 3.796875, "learning_rate": 5.365296803652969e-06, "loss": 1.04535637, "memory(GiB)": 700.87, "step": 2115, "train_speed(iter/s)": 1.112323 }, { "acc": 0.73245783, "epoch": 0.0537798924719249, "grad_norm": 3.984375, "learning_rate": 5.377980720446474e-06, "loss": 1.0343647, "memory(GiB)": 700.87, "step": 2120, "train_speed(iter/s)": 1.072331 }, { "acc": 0.72603068, "epoch": 0.05390673184096246, "grad_norm": 4.59375, "learning_rate": 5.390664637239981e-06, "loss": 1.08113022, "memory(GiB)": 700.87, "step": 2125, "train_speed(iter/s)": 1.039739 }, { "acc": 0.73231473, "epoch": 0.05403357121000001, "grad_norm": 3.828125, "learning_rate": 5.403348554033486e-06, "loss": 1.0825552, "memory(GiB)": 700.87, "step": 2130, "train_speed(iter/s)": 1.011207 }, { "acc": 0.74060464, "epoch": 0.054160410579037574, "grad_norm": 3.75, "learning_rate": 5.416032470826992e-06, "loss": 1.02191248, "memory(GiB)": 700.87, "step": 2135, "train_speed(iter/s)": 0.983049 }, { "acc": 0.74956303, "epoch": 0.054287249948075135, "grad_norm": 5.15625, "learning_rate": 5.428716387620497e-06, "loss": 1.04850273, "memory(GiB)": 700.87, "step": 2140, "train_speed(iter/s)": 0.954975 }, { "acc": 0.73633666, "epoch": 0.05441408931711269, "grad_norm": 4.09375, "learning_rate": 5.441400304414004e-06, "loss": 1.09051123, "memory(GiB)": 700.87, "step": 2145, "train_speed(iter/s)": 0.932134 }, { "acc": 0.7308331, "epoch": 0.05454092868615025, "grad_norm": 3.46875, "learning_rate": 5.45408422120751e-06, "loss": 1.08491688, "memory(GiB)": 700.87, "step": 2150, "train_speed(iter/s)": 0.90534 }, { "acc": 0.72409673, "epoch": 0.05466776805518781, "grad_norm": 3.515625, "learning_rate": 5.466768138001015e-06, "loss": 1.13955355, "memory(GiB)": 700.87, "step": 2155, "train_speed(iter/s)": 0.883814 }, { "acc": 0.74231524, "epoch": 0.054794607424225365, "grad_norm": 3.484375, "learning_rate": 5.479452054794521e-06, "loss": 1.07483091, "memory(GiB)": 700.87, "step": 2160, "train_speed(iter/s)": 0.862482 }, { "acc": 0.72512736, "epoch": 0.054921446793262926, "grad_norm": 4.34375, "learning_rate": 5.492135971588028e-06, "loss": 1.07431698, "memory(GiB)": 700.87, "step": 2165, "train_speed(iter/s)": 0.843341 }, { "acc": 0.71340313, "epoch": 0.05504828616230049, "grad_norm": 3.3125, "learning_rate": 5.504819888381533e-06, "loss": 1.14585171, "memory(GiB)": 700.87, "step": 2170, "train_speed(iter/s)": 0.824528 }, { "acc": 0.73598676, "epoch": 0.05517512553133804, "grad_norm": 3.984375, "learning_rate": 5.517503805175039e-06, "loss": 1.05876732, "memory(GiB)": 700.87, "step": 2175, "train_speed(iter/s)": 0.804011 }, { "acc": 0.73373427, "epoch": 0.0553019649003756, "grad_norm": 4.125, "learning_rate": 5.530187721968544e-06, "loss": 1.04140282, "memory(GiB)": 700.87, "step": 2180, "train_speed(iter/s)": 0.785292 }, { "acc": 0.73256855, "epoch": 0.055428804269413164, "grad_norm": 3.84375, "learning_rate": 5.542871638762051e-06, "loss": 1.08922949, "memory(GiB)": 700.87, "step": 2185, "train_speed(iter/s)": 0.768485 }, { "acc": 0.72948971, "epoch": 0.05555564363845072, "grad_norm": 3.859375, "learning_rate": 5.555555555555557e-06, "loss": 1.12244968, "memory(GiB)": 700.87, "step": 2190, "train_speed(iter/s)": 0.75036 }, { "acc": 0.74479337, "epoch": 0.05568248300748828, "grad_norm": 3.015625, "learning_rate": 5.568239472349062e-06, "loss": 1.07049351, "memory(GiB)": 700.87, "step": 2195, "train_speed(iter/s)": 0.735817 }, { "acc": 0.73332505, "epoch": 0.05580932237652584, "grad_norm": 4.8125, "learning_rate": 5.580923389142568e-06, "loss": 1.08581324, "memory(GiB)": 700.87, "step": 2200, "train_speed(iter/s)": 0.721931 }, { "acc": 0.73061824, "epoch": 0.055936161745563394, "grad_norm": 3.6875, "learning_rate": 5.593607305936074e-06, "loss": 1.07395506, "memory(GiB)": 700.87, "step": 2205, "train_speed(iter/s)": 0.706303 }, { "acc": 0.73722563, "epoch": 0.056063001114600955, "grad_norm": 3.5, "learning_rate": 5.60629122272958e-06, "loss": 1.06038399, "memory(GiB)": 700.87, "step": 2210, "train_speed(iter/s)": 0.693077 }, { "acc": 0.72916126, "epoch": 0.056189840483638516, "grad_norm": 4.625, "learning_rate": 5.618975139523085e-06, "loss": 1.12785168, "memory(GiB)": 700.87, "step": 2215, "train_speed(iter/s)": 0.680522 }, { "acc": 0.73601155, "epoch": 0.05631667985267607, "grad_norm": 3.53125, "learning_rate": 5.631659056316591e-06, "loss": 1.0048377, "memory(GiB)": 700.87, "step": 2220, "train_speed(iter/s)": 0.668869 }, { "acc": 0.73946404, "epoch": 0.05644351922171363, "grad_norm": 4.03125, "learning_rate": 5.644342973110098e-06, "loss": 1.0624465, "memory(GiB)": 700.87, "step": 2225, "train_speed(iter/s)": 0.657785 }, { "acc": 0.74476805, "epoch": 0.05657035859075119, "grad_norm": 4.125, "learning_rate": 5.657026889903603e-06, "loss": 1.04915304, "memory(GiB)": 700.87, "step": 2230, "train_speed(iter/s)": 0.645161 }, { "acc": 0.73302474, "epoch": 0.056697197959788746, "grad_norm": 3.03125, "learning_rate": 5.669710806697109e-06, "loss": 1.08960724, "memory(GiB)": 700.87, "step": 2235, "train_speed(iter/s)": 0.633926 }, { "acc": 0.73430719, "epoch": 0.05682403732882631, "grad_norm": 3.78125, "learning_rate": 5.682394723490614e-06, "loss": 1.08016138, "memory(GiB)": 700.87, "step": 2240, "train_speed(iter/s)": 0.62405 }, { "acc": 0.72835646, "epoch": 0.05695087669786387, "grad_norm": 3.5, "learning_rate": 5.695078640284121e-06, "loss": 1.08384943, "memory(GiB)": 700.87, "step": 2245, "train_speed(iter/s)": 0.614653 }, { "acc": 0.73662305, "epoch": 0.05707771606690142, "grad_norm": 3.921875, "learning_rate": 5.7077625570776266e-06, "loss": 1.02084293, "memory(GiB)": 700.87, "step": 2250, "train_speed(iter/s)": 0.606191 }, { "acc": 0.7323246, "epoch": 0.057204555435938984, "grad_norm": 3.53125, "learning_rate": 5.720446473871132e-06, "loss": 1.11828623, "memory(GiB)": 700.87, "step": 2255, "train_speed(iter/s)": 0.597395 }, { "acc": 0.7449183, "epoch": 0.057331394804976545, "grad_norm": 3.84375, "learning_rate": 5.733130390664638e-06, "loss": 1.01307898, "memory(GiB)": 700.87, "step": 2260, "train_speed(iter/s)": 0.588252 }, { "acc": 0.73773422, "epoch": 0.0574582341740141, "grad_norm": 4.5625, "learning_rate": 5.7458143074581445e-06, "loss": 1.06351242, "memory(GiB)": 700.87, "step": 2265, "train_speed(iter/s)": 0.5798 }, { "acc": 0.73245668, "epoch": 0.05758507354305166, "grad_norm": 3.828125, "learning_rate": 5.75849822425165e-06, "loss": 1.08009129, "memory(GiB)": 700.87, "step": 2270, "train_speed(iter/s)": 0.571911 }, { "acc": 0.72299724, "epoch": 0.05771191291208922, "grad_norm": 3.984375, "learning_rate": 5.7711821410451556e-06, "loss": 1.1220315, "memory(GiB)": 700.87, "step": 2275, "train_speed(iter/s)": 0.562695 }, { "acc": 0.74474459, "epoch": 0.057838752281126775, "grad_norm": 14.8125, "learning_rate": 5.783866057838661e-06, "loss": 1.02579641, "memory(GiB)": 700.87, "step": 2280, "train_speed(iter/s)": 0.554916 }, { "acc": 0.71685443, "epoch": 0.057965591650164336, "grad_norm": 4.28125, "learning_rate": 5.7965499746321675e-06, "loss": 1.13489838, "memory(GiB)": 700.87, "step": 2285, "train_speed(iter/s)": 0.547775 }, { "acc": 0.73445511, "epoch": 0.0580924310192019, "grad_norm": 4.15625, "learning_rate": 5.809233891425673e-06, "loss": 1.08155212, "memory(GiB)": 700.87, "step": 2290, "train_speed(iter/s)": 0.539676 }, { "acc": 0.74162884, "epoch": 0.05821927038823945, "grad_norm": 3.921875, "learning_rate": 5.821917808219179e-06, "loss": 1.01798477, "memory(GiB)": 700.87, "step": 2295, "train_speed(iter/s)": 0.532626 }, { "acc": 0.74246216, "epoch": 0.05834610975727701, "grad_norm": 3.671875, "learning_rate": 5.834601725012684e-06, "loss": 1.0343708, "memory(GiB)": 700.87, "step": 2300, "train_speed(iter/s)": 0.525317 }, { "acc": 0.73538294, "epoch": 0.05847294912631457, "grad_norm": 4.28125, "learning_rate": 5.8472856418061905e-06, "loss": 1.09610291, "memory(GiB)": 700.87, "step": 2305, "train_speed(iter/s)": 0.518611 }, { "acc": 0.74390526, "epoch": 0.05859978849535213, "grad_norm": 4.0, "learning_rate": 5.8599695585996965e-06, "loss": 1.04676008, "memory(GiB)": 700.87, "step": 2310, "train_speed(iter/s)": 0.512165 }, { "acc": 0.74160547, "epoch": 0.05872662786438969, "grad_norm": 3.484375, "learning_rate": 5.872653475393202e-06, "loss": 1.0305625, "memory(GiB)": 700.87, "step": 2315, "train_speed(iter/s)": 0.50546 }, { "acc": 0.74237084, "epoch": 0.05885346723342725, "grad_norm": 3.796875, "learning_rate": 5.8853373921867076e-06, "loss": 1.11864405, "memory(GiB)": 700.87, "step": 2320, "train_speed(iter/s)": 0.499406 }, { "acc": 0.75012746, "epoch": 0.0589803066024648, "grad_norm": 3.9375, "learning_rate": 5.898021308980214e-06, "loss": 1.01910572, "memory(GiB)": 700.87, "step": 2325, "train_speed(iter/s)": 0.493161 }, { "acc": 0.71654973, "epoch": 0.059107145971502364, "grad_norm": 4.25, "learning_rate": 5.9107052257737195e-06, "loss": 1.16439257, "memory(GiB)": 700.87, "step": 2330, "train_speed(iter/s)": 0.486665 }, { "acc": 0.73143096, "epoch": 0.059233985340539926, "grad_norm": 3.53125, "learning_rate": 5.9233891425672255e-06, "loss": 1.12133656, "memory(GiB)": 700.87, "step": 2335, "train_speed(iter/s)": 0.48097 }, { "acc": 0.74867058, "epoch": 0.05936082470957748, "grad_norm": 3.671875, "learning_rate": 5.936073059360731e-06, "loss": 0.99707088, "memory(GiB)": 700.87, "step": 2340, "train_speed(iter/s)": 0.475972 }, { "acc": 0.74474683, "epoch": 0.05948766407861504, "grad_norm": 3.765625, "learning_rate": 5.948756976154237e-06, "loss": 1.04889841, "memory(GiB)": 700.87, "step": 2345, "train_speed(iter/s)": 0.470426 }, { "acc": 0.74111743, "epoch": 0.0596145034476526, "grad_norm": 3.796875, "learning_rate": 5.961440892947743e-06, "loss": 1.05445337, "memory(GiB)": 700.87, "step": 2350, "train_speed(iter/s)": 0.465105 }, { "acc": 0.74705968, "epoch": 0.059741342816690156, "grad_norm": 4.0625, "learning_rate": 5.9741248097412485e-06, "loss": 1.01334209, "memory(GiB)": 700.87, "step": 2355, "train_speed(iter/s)": 0.460325 }, { "acc": 0.74455948, "epoch": 0.05986818218572772, "grad_norm": 4.53125, "learning_rate": 5.9868087265347545e-06, "loss": 1.0563612, "memory(GiB)": 700.87, "step": 2360, "train_speed(iter/s)": 0.455518 }, { "acc": 0.74513974, "epoch": 0.05999502155476528, "grad_norm": 4.15625, "learning_rate": 5.99949264332826e-06, "loss": 1.01707058, "memory(GiB)": 700.87, "step": 2365, "train_speed(iter/s)": 0.45044 }, { "acc": 0.73526735, "epoch": 0.06012186092380283, "grad_norm": 3.296875, "learning_rate": 6.012176560121766e-06, "loss": 1.05965252, "memory(GiB)": 700.87, "step": 2370, "train_speed(iter/s)": 0.445338 }, { "acc": 0.74766822, "epoch": 0.06024870029284039, "grad_norm": 3.96875, "learning_rate": 6.0248604769152715e-06, "loss": 1.07013502, "memory(GiB)": 700.87, "step": 2375, "train_speed(iter/s)": 0.440601 }, { "acc": 0.74243588, "epoch": 0.060375539661877954, "grad_norm": 4.625, "learning_rate": 6.0375443937087775e-06, "loss": 0.98550501, "memory(GiB)": 700.87, "step": 2380, "train_speed(iter/s)": 0.436213 }, { "acc": 0.74141808, "epoch": 0.06050237903091551, "grad_norm": 3.765625, "learning_rate": 6.050228310502284e-06, "loss": 1.13836946, "memory(GiB)": 700.87, "step": 2385, "train_speed(iter/s)": 0.431323 }, { "acc": 0.73937325, "epoch": 0.06062921839995307, "grad_norm": 3.796875, "learning_rate": 6.062912227295789e-06, "loss": 1.04344378, "memory(GiB)": 700.87, "step": 2390, "train_speed(iter/s)": 0.426792 }, { "acc": 0.73284411, "epoch": 0.06075605776899063, "grad_norm": 3.8125, "learning_rate": 6.075596144089295e-06, "loss": 1.01525679, "memory(GiB)": 700.87, "step": 2395, "train_speed(iter/s)": 0.422365 }, { "acc": 0.73770318, "epoch": 0.060882897138028184, "grad_norm": 4.0, "learning_rate": 6.0882800608828005e-06, "loss": 1.10316944, "memory(GiB)": 700.87, "step": 2400, "train_speed(iter/s)": 0.41871 }, { "acc": 0.72508645, "epoch": 0.061009736507065745, "grad_norm": 3.46875, "learning_rate": 6.100963977676307e-06, "loss": 1.11522112, "memory(GiB)": 700.87, "step": 2405, "train_speed(iter/s)": 0.414659 }, { "acc": 0.73386221, "epoch": 0.061136575876103307, "grad_norm": 3.484375, "learning_rate": 6.113647894469813e-06, "loss": 1.04839478, "memory(GiB)": 700.87, "step": 2410, "train_speed(iter/s)": 0.410915 }, { "acc": 0.75192337, "epoch": 0.06126341524514086, "grad_norm": 3.296875, "learning_rate": 6.126331811263318e-06, "loss": 1.01782618, "memory(GiB)": 700.87, "step": 2415, "train_speed(iter/s)": 0.406722 }, { "acc": 0.75525494, "epoch": 0.06139025461417842, "grad_norm": 3.8125, "learning_rate": 6.139015728056824e-06, "loss": 1.02150507, "memory(GiB)": 700.87, "step": 2420, "train_speed(iter/s)": 0.403131 }, { "acc": 0.73428736, "epoch": 0.06151709398321598, "grad_norm": 3.640625, "learning_rate": 6.151699644850331e-06, "loss": 1.05219364, "memory(GiB)": 700.87, "step": 2425, "train_speed(iter/s)": 0.399433 }, { "acc": 0.74979439, "epoch": 0.06164393335225354, "grad_norm": 3.40625, "learning_rate": 6.164383561643836e-06, "loss": 1.01416903, "memory(GiB)": 700.87, "step": 2430, "train_speed(iter/s)": 0.395688 }, { "acc": 0.73622847, "epoch": 0.0617707727212911, "grad_norm": 4.125, "learning_rate": 6.177067478437342e-06, "loss": 1.05061922, "memory(GiB)": 712.83, "step": 2435, "train_speed(iter/s)": 0.391764 }, { "acc": 0.72560949, "epoch": 0.06189761209032866, "grad_norm": 3.65625, "learning_rate": 6.189751395230847e-06, "loss": 1.06969719, "memory(GiB)": 712.83, "step": 2440, "train_speed(iter/s)": 0.388082 }, { "acc": 0.73002267, "epoch": 0.06202445145936621, "grad_norm": 5.375, "learning_rate": 6.202435312024354e-06, "loss": 1.10300655, "memory(GiB)": 712.83, "step": 2445, "train_speed(iter/s)": 0.384928 }, { "acc": 0.75374389, "epoch": 0.062151290828403774, "grad_norm": 3.90625, "learning_rate": 6.215119228817859e-06, "loss": 0.98489933, "memory(GiB)": 712.83, "step": 2450, "train_speed(iter/s)": 0.381864 }, { "acc": 0.72771401, "epoch": 0.062278130197441335, "grad_norm": 3.828125, "learning_rate": 6.227803145611365e-06, "loss": 1.10539951, "memory(GiB)": 712.83, "step": 2455, "train_speed(iter/s)": 0.378815 }, { "acc": 0.72952728, "epoch": 0.06240496956647889, "grad_norm": 3.53125, "learning_rate": 6.24048706240487e-06, "loss": 1.08509541, "memory(GiB)": 712.83, "step": 2460, "train_speed(iter/s)": 0.375887 }, { "acc": 0.74388204, "epoch": 0.06253180893551645, "grad_norm": 4.6875, "learning_rate": 6.253170979198377e-06, "loss": 1.02197189, "memory(GiB)": 712.83, "step": 2465, "train_speed(iter/s)": 0.372477 }, { "acc": 0.72946029, "epoch": 0.062658648304554, "grad_norm": 3.53125, "learning_rate": 6.265854895991883e-06, "loss": 1.06045599, "memory(GiB)": 712.83, "step": 2470, "train_speed(iter/s)": 0.369194 }, { "acc": 0.75154672, "epoch": 0.06278548767359157, "grad_norm": 3.859375, "learning_rate": 6.278538812785388e-06, "loss": 1.03968582, "memory(GiB)": 712.83, "step": 2475, "train_speed(iter/s)": 0.366507 }, { "acc": 0.73219714, "epoch": 0.06291232704262913, "grad_norm": 4.71875, "learning_rate": 6.291222729578894e-06, "loss": 1.05832453, "memory(GiB)": 712.83, "step": 2480, "train_speed(iter/s)": 0.363203 }, { "acc": 0.73614001, "epoch": 0.06303916641166668, "grad_norm": 4.1875, "learning_rate": 6.303906646372401e-06, "loss": 1.09530144, "memory(GiB)": 712.83, "step": 2485, "train_speed(iter/s)": 0.360337 }, { "acc": 0.74804697, "epoch": 0.06316600578070425, "grad_norm": 3.828125, "learning_rate": 6.316590563165906e-06, "loss": 0.99869356, "memory(GiB)": 712.83, "step": 2490, "train_speed(iter/s)": 0.35758 }, { "acc": 0.7306551, "epoch": 0.0632928451497418, "grad_norm": 4.09375, "learning_rate": 6.329274479959412e-06, "loss": 1.08893976, "memory(GiB)": 712.83, "step": 2495, "train_speed(iter/s)": 0.355262 }, { "acc": 0.73677764, "epoch": 0.06341968451877936, "grad_norm": 3.578125, "learning_rate": 6.341958396752917e-06, "loss": 1.09019766, "memory(GiB)": 712.83, "step": 2500, "train_speed(iter/s)": 0.352811 }, { "epoch": 0.06341968451877936, "eval_acc": 0.7267906237976135, "eval_loss": 1.0218573808670044, "eval_runtime": 1151.4998, "eval_samples_per_second": 5.532, "eval_steps_per_second": 5.532, "step": 2500 }, { "acc": 0.73941984, "epoch": 0.06354652388781692, "grad_norm": 4.53125, "learning_rate": 6.354642313546424e-06, "loss": 1.08443918, "memory(GiB)": 712.85, "step": 2505, "train_speed(iter/s)": 0.280184 }, { "acc": 0.73856373, "epoch": 0.06367336325685448, "grad_norm": 3.21875, "learning_rate": 6.36732623033993e-06, "loss": 1.10211744, "memory(GiB)": 712.85, "step": 2510, "train_speed(iter/s)": 0.278502 }, { "acc": 0.74905643, "epoch": 0.06380020262589203, "grad_norm": 3.53125, "learning_rate": 6.380010147133435e-06, "loss": 0.97853765, "memory(GiB)": 712.85, "step": 2515, "train_speed(iter/s)": 0.276826 }, { "acc": 0.74244866, "epoch": 0.0639270419949296, "grad_norm": 3.59375, "learning_rate": 6.392694063926941e-06, "loss": 1.04342461, "memory(GiB)": 712.85, "step": 2520, "train_speed(iter/s)": 0.275407 }, { "acc": 0.72467942, "epoch": 0.06405388136396716, "grad_norm": 4.0, "learning_rate": 6.405377980720447e-06, "loss": 1.0771102, "memory(GiB)": 712.85, "step": 2525, "train_speed(iter/s)": 0.273741 }, { "acc": 0.75735292, "epoch": 0.06418072073300471, "grad_norm": 4.90625, "learning_rate": 6.418061897513953e-06, "loss": 0.95681753, "memory(GiB)": 712.85, "step": 2530, "train_speed(iter/s)": 0.272324 }, { "acc": 0.74181938, "epoch": 0.06430756010204228, "grad_norm": 4.3125, "learning_rate": 6.430745814307458e-06, "loss": 1.02523499, "memory(GiB)": 712.85, "step": 2535, "train_speed(iter/s)": 0.271029 }, { "acc": 0.73091626, "epoch": 0.06443439947107983, "grad_norm": 3.578125, "learning_rate": 6.443429731100964e-06, "loss": 1.06887321, "memory(GiB)": 712.85, "step": 2540, "train_speed(iter/s)": 0.269595 }, { "acc": 0.75726256, "epoch": 0.06456123884011739, "grad_norm": 4.875, "learning_rate": 6.456113647894471e-06, "loss": 1.0179143, "memory(GiB)": 712.85, "step": 2545, "train_speed(iter/s)": 0.268277 }, { "acc": 0.74385161, "epoch": 0.06468807820915495, "grad_norm": 2.96875, "learning_rate": 6.468797564687976e-06, "loss": 0.99797049, "memory(GiB)": 712.85, "step": 2550, "train_speed(iter/s)": 0.266782 }, { "acc": 0.75496349, "epoch": 0.06481491757819251, "grad_norm": 4.3125, "learning_rate": 6.481481481481482e-06, "loss": 0.96146011, "memory(GiB)": 712.85, "step": 2555, "train_speed(iter/s)": 0.265489 }, { "acc": 0.73853374, "epoch": 0.06494175694723006, "grad_norm": 3.8125, "learning_rate": 6.494165398274987e-06, "loss": 1.04984941, "memory(GiB)": 712.85, "step": 2560, "train_speed(iter/s)": 0.264227 }, { "acc": 0.7221128, "epoch": 0.06506859631626763, "grad_norm": 3.875, "learning_rate": 6.506849315068494e-06, "loss": 1.10938053, "memory(GiB)": 712.85, "step": 2565, "train_speed(iter/s)": 0.262889 }, { "acc": 0.74085445, "epoch": 0.06519543568530518, "grad_norm": 3.953125, "learning_rate": 6.519533231862e-06, "loss": 1.07764053, "memory(GiB)": 712.85, "step": 2570, "train_speed(iter/s)": 0.261706 }, { "acc": 0.75078063, "epoch": 0.06532227505434274, "grad_norm": 3.75, "learning_rate": 6.532217148655505e-06, "loss": 0.98579454, "memory(GiB)": 712.85, "step": 2575, "train_speed(iter/s)": 0.260279 }, { "acc": 0.73388495, "epoch": 0.0654491144233803, "grad_norm": 3.640625, "learning_rate": 6.544901065449011e-06, "loss": 1.14492016, "memory(GiB)": 712.85, "step": 2580, "train_speed(iter/s)": 0.259071 }, { "acc": 0.73719635, "epoch": 0.06557595379241786, "grad_norm": 5.03125, "learning_rate": 6.557584982242518e-06, "loss": 1.11883011, "memory(GiB)": 712.85, "step": 2585, "train_speed(iter/s)": 0.257923 }, { "acc": 0.7385664, "epoch": 0.06570279316145541, "grad_norm": 3.421875, "learning_rate": 6.570268899036023e-06, "loss": 1.05879765, "memory(GiB)": 712.85, "step": 2590, "train_speed(iter/s)": 0.256693 }, { "acc": 0.74173932, "epoch": 0.06582963253049298, "grad_norm": 3.515625, "learning_rate": 6.582952815829529e-06, "loss": 1.06894321, "memory(GiB)": 712.85, "step": 2595, "train_speed(iter/s)": 0.255582 }, { "acc": 0.73548675, "epoch": 0.06595647189953054, "grad_norm": 4.3125, "learning_rate": 6.595636732623034e-06, "loss": 1.12644081, "memory(GiB)": 712.85, "step": 2600, "train_speed(iter/s)": 0.254455 }, { "acc": 0.74300318, "epoch": 0.06608331126856809, "grad_norm": 4.40625, "learning_rate": 6.608320649416541e-06, "loss": 0.99693995, "memory(GiB)": 712.85, "step": 2605, "train_speed(iter/s)": 0.253249 }, { "acc": 0.73517036, "epoch": 0.06621015063760566, "grad_norm": 5.0625, "learning_rate": 6.621004566210046e-06, "loss": 1.06798601, "memory(GiB)": 712.85, "step": 2610, "train_speed(iter/s)": 0.252204 }, { "acc": 0.73838415, "epoch": 0.06633699000664321, "grad_norm": 3.8125, "learning_rate": 6.633688483003552e-06, "loss": 1.02430286, "memory(GiB)": 712.85, "step": 2615, "train_speed(iter/s)": 0.250921 }, { "acc": 0.73383999, "epoch": 0.06646382937568077, "grad_norm": 4.8125, "learning_rate": 6.646372399797057e-06, "loss": 1.04852753, "memory(GiB)": 712.85, "step": 2620, "train_speed(iter/s)": 0.249711 }, { "acc": 0.73751726, "epoch": 0.06659066874471833, "grad_norm": 4.46875, "learning_rate": 6.659056316590564e-06, "loss": 1.08480577, "memory(GiB)": 712.85, "step": 2625, "train_speed(iter/s)": 0.24864 }, { "acc": 0.74238749, "epoch": 0.06671750811375589, "grad_norm": 4.15625, "learning_rate": 6.67174023338407e-06, "loss": 1.09210796, "memory(GiB)": 712.85, "step": 2630, "train_speed(iter/s)": 0.247675 }, { "acc": 0.74454141, "epoch": 0.06684434748279344, "grad_norm": 5.09375, "learning_rate": 6.684424150177575e-06, "loss": 1.0669054, "memory(GiB)": 712.86, "step": 2635, "train_speed(iter/s)": 0.246539 }, { "acc": 0.74092984, "epoch": 0.06697118685183101, "grad_norm": 4.28125, "learning_rate": 6.697108066971081e-06, "loss": 1.0999836, "memory(GiB)": 712.86, "step": 2640, "train_speed(iter/s)": 0.245564 }, { "acc": 0.73976254, "epoch": 0.06709802622086856, "grad_norm": 3.734375, "learning_rate": 6.709791983764588e-06, "loss": 1.02645082, "memory(GiB)": 712.86, "step": 2645, "train_speed(iter/s)": 0.244465 }, { "acc": 0.73439779, "epoch": 0.06722486558990612, "grad_norm": 4.09375, "learning_rate": 6.722475900558093e-06, "loss": 1.05226164, "memory(GiB)": 712.86, "step": 2650, "train_speed(iter/s)": 0.243454 }, { "acc": 0.7532732, "epoch": 0.06735170495894369, "grad_norm": 4.09375, "learning_rate": 6.735159817351599e-06, "loss": 1.03699322, "memory(GiB)": 712.86, "step": 2655, "train_speed(iter/s)": 0.242302 }, { "acc": 0.74196596, "epoch": 0.06747854432798124, "grad_norm": 4.4375, "learning_rate": 6.747843734145104e-06, "loss": 1.02373457, "memory(GiB)": 712.86, "step": 2660, "train_speed(iter/s)": 0.241311 }, { "acc": 0.73736491, "epoch": 0.0676053836970188, "grad_norm": 4.96875, "learning_rate": 6.760527650938611e-06, "loss": 0.99768305, "memory(GiB)": 712.86, "step": 2665, "train_speed(iter/s)": 0.240079 }, { "acc": 0.73809638, "epoch": 0.06773222306605636, "grad_norm": 3.28125, "learning_rate": 6.773211567732117e-06, "loss": 1.03850145, "memory(GiB)": 712.86, "step": 2670, "train_speed(iter/s)": 0.239163 }, { "acc": 0.74410691, "epoch": 0.06785906243509392, "grad_norm": 3.421875, "learning_rate": 6.785895484525622e-06, "loss": 1.10494757, "memory(GiB)": 712.86, "step": 2675, "train_speed(iter/s)": 0.238189 }, { "acc": 0.74412398, "epoch": 0.06798590180413147, "grad_norm": 3.9375, "learning_rate": 6.798579401319128e-06, "loss": 1.00959387, "memory(GiB)": 712.86, "step": 2680, "train_speed(iter/s)": 0.237288 }, { "acc": 0.74229331, "epoch": 0.06811274117316904, "grad_norm": 3.5625, "learning_rate": 6.811263318112634e-06, "loss": 1.08812885, "memory(GiB)": 712.86, "step": 2685, "train_speed(iter/s)": 0.236401 }, { "acc": 0.72512851, "epoch": 0.0682395805422066, "grad_norm": 3.671875, "learning_rate": 6.82394723490614e-06, "loss": 1.05420189, "memory(GiB)": 712.86, "step": 2690, "train_speed(iter/s)": 0.235603 }, { "acc": 0.74012742, "epoch": 0.06836641991124415, "grad_norm": 3.765625, "learning_rate": 6.836631151699645e-06, "loss": 1.08686771, "memory(GiB)": 712.86, "step": 2695, "train_speed(iter/s)": 0.234609 }, { "acc": 0.72109275, "epoch": 0.06849325928028172, "grad_norm": 3.8125, "learning_rate": 6.849315068493151e-06, "loss": 1.10834608, "memory(GiB)": 712.86, "step": 2700, "train_speed(iter/s)": 0.233626 }, { "acc": 0.7333076, "epoch": 0.06862009864931927, "grad_norm": 4.21875, "learning_rate": 6.861998985286658e-06, "loss": 1.0789094, "memory(GiB)": 712.86, "step": 2705, "train_speed(iter/s)": 0.23278 }, { "acc": 0.74078126, "epoch": 0.06874693801835682, "grad_norm": 4.21875, "learning_rate": 6.874682902080163e-06, "loss": 1.13162394, "memory(GiB)": 712.86, "step": 2710, "train_speed(iter/s)": 0.231998 }, { "acc": 0.74010892, "epoch": 0.06887377738739439, "grad_norm": 4.25, "learning_rate": 6.887366818873669e-06, "loss": 1.01550131, "memory(GiB)": 712.86, "step": 2715, "train_speed(iter/s)": 0.231036 }, { "acc": 0.73923855, "epoch": 0.06900061675643195, "grad_norm": 3.828125, "learning_rate": 6.900050735667174e-06, "loss": 1.02979288, "memory(GiB)": 712.86, "step": 2720, "train_speed(iter/s)": 0.230264 }, { "acc": 0.74587674, "epoch": 0.0691274561254695, "grad_norm": 3.21875, "learning_rate": 6.912734652460681e-06, "loss": 1.01869745, "memory(GiB)": 712.86, "step": 2725, "train_speed(iter/s)": 0.229503 }, { "acc": 0.7465179, "epoch": 0.06925429549450707, "grad_norm": 4.78125, "learning_rate": 6.925418569254187e-06, "loss": 0.99871855, "memory(GiB)": 712.86, "step": 2730, "train_speed(iter/s)": 0.228757 }, { "acc": 0.7407052, "epoch": 0.06938113486354462, "grad_norm": 3.5, "learning_rate": 6.938102486047692e-06, "loss": 0.98910847, "memory(GiB)": 712.86, "step": 2735, "train_speed(iter/s)": 0.227816 }, { "acc": 0.74335132, "epoch": 0.06950797423258218, "grad_norm": 3.59375, "learning_rate": 6.950786402841198e-06, "loss": 1.02721691, "memory(GiB)": 712.86, "step": 2740, "train_speed(iter/s)": 0.22701 }, { "acc": 0.75367599, "epoch": 0.06963481360161974, "grad_norm": 3.921875, "learning_rate": 6.9634703196347046e-06, "loss": 1.00117378, "memory(GiB)": 712.86, "step": 2745, "train_speed(iter/s)": 0.226211 }, { "acc": 0.73998494, "epoch": 0.0697616529706573, "grad_norm": 3.8125, "learning_rate": 6.97615423642821e-06, "loss": 1.06691408, "memory(GiB)": 712.86, "step": 2750, "train_speed(iter/s)": 0.225488 }, { "acc": 0.73603945, "epoch": 0.06988849233969485, "grad_norm": 3.890625, "learning_rate": 6.988838153221716e-06, "loss": 1.05119514, "memory(GiB)": 712.86, "step": 2755, "train_speed(iter/s)": 0.224611 }, { "acc": 0.73085127, "epoch": 0.07001533170873242, "grad_norm": 4.25, "learning_rate": 7.001522070015221e-06, "loss": 1.05533676, "memory(GiB)": 712.86, "step": 2760, "train_speed(iter/s)": 0.223615 }, { "acc": 0.74050226, "epoch": 0.07014217107776997, "grad_norm": 3.859375, "learning_rate": 7.014205986808728e-06, "loss": 1.10257654, "memory(GiB)": 712.86, "step": 2765, "train_speed(iter/s)": 0.222838 }, { "acc": 0.73884907, "epoch": 0.07026901044680753, "grad_norm": 3.5625, "learning_rate": 7.026889903602233e-06, "loss": 1.03619003, "memory(GiB)": 712.86, "step": 2770, "train_speed(iter/s)": 0.22206 }, { "acc": 0.74189234, "epoch": 0.0703958498158451, "grad_norm": 4.375, "learning_rate": 7.039573820395739e-06, "loss": 1.09212751, "memory(GiB)": 712.86, "step": 2775, "train_speed(iter/s)": 0.221288 }, { "acc": 0.72997556, "epoch": 0.07052268918488265, "grad_norm": 3.71875, "learning_rate": 7.052257737189245e-06, "loss": 1.0832859, "memory(GiB)": 712.86, "step": 2780, "train_speed(iter/s)": 0.220591 }, { "acc": 0.73762894, "epoch": 0.0706495285539202, "grad_norm": 4.21875, "learning_rate": 7.064941653982751e-06, "loss": 1.05854664, "memory(GiB)": 712.86, "step": 2785, "train_speed(iter/s)": 0.219699 }, { "acc": 0.72688179, "epoch": 0.07077636792295777, "grad_norm": 3.6875, "learning_rate": 7.077625570776257e-06, "loss": 1.10451975, "memory(GiB)": 712.86, "step": 2790, "train_speed(iter/s)": 0.218894 }, { "acc": 0.74227581, "epoch": 0.07090320729199533, "grad_norm": 12.0625, "learning_rate": 7.090309487569762e-06, "loss": 1.02827902, "memory(GiB)": 712.86, "step": 2795, "train_speed(iter/s)": 0.218081 }, { "acc": 0.75992136, "epoch": 0.07103004666103288, "grad_norm": 4.1875, "learning_rate": 7.102993404363268e-06, "loss": 1.01788273, "memory(GiB)": 712.86, "step": 2800, "train_speed(iter/s)": 0.217261 }, { "acc": 0.75193658, "epoch": 0.07115688603007045, "grad_norm": 4.0625, "learning_rate": 7.1156773211567745e-06, "loss": 1.01184187, "memory(GiB)": 712.86, "step": 2805, "train_speed(iter/s)": 0.216469 }, { "acc": 0.75154963, "epoch": 0.071283725399108, "grad_norm": 4.125, "learning_rate": 7.12836123795028e-06, "loss": 1.04269762, "memory(GiB)": 712.86, "step": 2810, "train_speed(iter/s)": 0.215784 }, { "acc": 0.72939191, "epoch": 0.07141056476814556, "grad_norm": 3.6875, "learning_rate": 7.1410451547437856e-06, "loss": 1.06388893, "memory(GiB)": 712.86, "step": 2815, "train_speed(iter/s)": 0.215052 }, { "acc": 0.74546738, "epoch": 0.07153740413718312, "grad_norm": 3.84375, "learning_rate": 7.153729071537291e-06, "loss": 1.00280027, "memory(GiB)": 712.86, "step": 2820, "train_speed(iter/s)": 0.214331 }, { "acc": 0.74872088, "epoch": 0.07166424350622068, "grad_norm": 3.390625, "learning_rate": 7.1664129883307975e-06, "loss": 0.98580322, "memory(GiB)": 712.86, "step": 2825, "train_speed(iter/s)": 0.213653 }, { "acc": 0.75020013, "epoch": 0.07179108287525823, "grad_norm": 3.4375, "learning_rate": 7.1790969051243035e-06, "loss": 0.99759197, "memory(GiB)": 712.86, "step": 2830, "train_speed(iter/s)": 0.212971 }, { "acc": 0.72837143, "epoch": 0.0719179222442958, "grad_norm": 4.5625, "learning_rate": 7.191780821917809e-06, "loss": 1.08650703, "memory(GiB)": 712.86, "step": 2835, "train_speed(iter/s)": 0.212313 }, { "acc": 0.73942428, "epoch": 0.07204476161333336, "grad_norm": 4.0625, "learning_rate": 7.2044647387113146e-06, "loss": 1.07045317, "memory(GiB)": 712.86, "step": 2840, "train_speed(iter/s)": 0.211668 }, { "acc": 0.74549274, "epoch": 0.07217160098237091, "grad_norm": 3.453125, "learning_rate": 7.2171486555048205e-06, "loss": 1.01102924, "memory(GiB)": 712.86, "step": 2845, "train_speed(iter/s)": 0.210955 }, { "acc": 0.7416441, "epoch": 0.07229844035140848, "grad_norm": 3.4375, "learning_rate": 7.2298325722983265e-06, "loss": 1.01254959, "memory(GiB)": 712.86, "step": 2850, "train_speed(iter/s)": 0.210349 }, { "acc": 0.74920001, "epoch": 0.07242527972044603, "grad_norm": 4.375, "learning_rate": 7.242516489091832e-06, "loss": 1.05326624, "memory(GiB)": 712.86, "step": 2855, "train_speed(iter/s)": 0.20971 }, { "acc": 0.74601579, "epoch": 0.07255211908948359, "grad_norm": 4.15625, "learning_rate": 7.2552004058853376e-06, "loss": 1.01814995, "memory(GiB)": 712.86, "step": 2860, "train_speed(iter/s)": 0.208942 }, { "acc": 0.74433851, "epoch": 0.07267895845852115, "grad_norm": 3.84375, "learning_rate": 7.267884322678844e-06, "loss": 1.10652027, "memory(GiB)": 712.86, "step": 2865, "train_speed(iter/s)": 0.208349 }, { "acc": 0.73895044, "epoch": 0.07280579782755871, "grad_norm": 7.625, "learning_rate": 7.2805682394723495e-06, "loss": 1.07226782, "memory(GiB)": 712.86, "step": 2870, "train_speed(iter/s)": 0.207653 }, { "acc": 0.73754616, "epoch": 0.07293263719659626, "grad_norm": 3.625, "learning_rate": 7.2932521562658555e-06, "loss": 1.06369514, "memory(GiB)": 712.86, "step": 2875, "train_speed(iter/s)": 0.207069 }, { "acc": 0.73590159, "epoch": 0.07305947656563383, "grad_norm": 2.875, "learning_rate": 7.305936073059361e-06, "loss": 1.04320183, "memory(GiB)": 712.86, "step": 2880, "train_speed(iter/s)": 0.206417 }, { "acc": 0.7603518, "epoch": 0.07318631593467138, "grad_norm": 3.09375, "learning_rate": 7.318619989852867e-06, "loss": 1.06569614, "memory(GiB)": 712.86, "step": 2885, "train_speed(iter/s)": 0.205756 }, { "acc": 0.74738789, "epoch": 0.07331315530370894, "grad_norm": 4.59375, "learning_rate": 7.331303906646373e-06, "loss": 1.03869829, "memory(GiB)": 712.86, "step": 2890, "train_speed(iter/s)": 0.205213 }, { "acc": 0.74970798, "epoch": 0.0734399946727465, "grad_norm": 3.796875, "learning_rate": 7.3439878234398785e-06, "loss": 1.01421814, "memory(GiB)": 712.86, "step": 2895, "train_speed(iter/s)": 0.20456 }, { "acc": 0.72605772, "epoch": 0.07356683404178406, "grad_norm": 4.46875, "learning_rate": 7.3566717402333845e-06, "loss": 1.1174202, "memory(GiB)": 712.86, "step": 2900, "train_speed(iter/s)": 0.203949 }, { "acc": 0.7348217, "epoch": 0.07369367341082161, "grad_norm": 4.09375, "learning_rate": 7.369355657026891e-06, "loss": 1.05403404, "memory(GiB)": 712.86, "step": 2905, "train_speed(iter/s)": 0.203329 }, { "acc": 0.72840309, "epoch": 0.07382051277985918, "grad_norm": 3.234375, "learning_rate": 7.382039573820396e-06, "loss": 1.11284361, "memory(GiB)": 712.86, "step": 2910, "train_speed(iter/s)": 0.20278 }, { "acc": 0.73993363, "epoch": 0.07394735214889674, "grad_norm": 4.65625, "learning_rate": 7.394723490613902e-06, "loss": 1.0389164, "memory(GiB)": 712.86, "step": 2915, "train_speed(iter/s)": 0.202194 }, { "acc": 0.73437357, "epoch": 0.07407419151793429, "grad_norm": 3.546875, "learning_rate": 7.4074074074074075e-06, "loss": 1.06261711, "memory(GiB)": 712.86, "step": 2920, "train_speed(iter/s)": 0.20154 }, { "acc": 0.73155131, "epoch": 0.07420103088697186, "grad_norm": 3.984375, "learning_rate": 7.420091324200914e-06, "loss": 1.0855484, "memory(GiB)": 712.86, "step": 2925, "train_speed(iter/s)": 0.200982 }, { "acc": 0.73189301, "epoch": 0.07432787025600941, "grad_norm": 4.09375, "learning_rate": 7.432775240994419e-06, "loss": 1.09842205, "memory(GiB)": 712.86, "step": 2930, "train_speed(iter/s)": 0.200458 }, { "acc": 0.74786272, "epoch": 0.07445470962504697, "grad_norm": 4.09375, "learning_rate": 7.445459157787925e-06, "loss": 1.02163324, "memory(GiB)": 712.86, "step": 2935, "train_speed(iter/s)": 0.199888 }, { "acc": 0.73966484, "epoch": 0.07458154899408453, "grad_norm": 3.84375, "learning_rate": 7.458143074581431e-06, "loss": 1.01137133, "memory(GiB)": 712.86, "step": 2940, "train_speed(iter/s)": 0.199325 }, { "acc": 0.75620112, "epoch": 0.07470838836312209, "grad_norm": 3.90625, "learning_rate": 7.470826991374937e-06, "loss": 0.97116442, "memory(GiB)": 712.86, "step": 2945, "train_speed(iter/s)": 0.198721 }, { "acc": 0.72623677, "epoch": 0.07483522773215964, "grad_norm": 4.4375, "learning_rate": 7.483510908168443e-06, "loss": 1.10625401, "memory(GiB)": 712.86, "step": 2950, "train_speed(iter/s)": 0.198195 }, { "acc": 0.74675727, "epoch": 0.07496206710119721, "grad_norm": 3.328125, "learning_rate": 7.496194824961948e-06, "loss": 0.98624315, "memory(GiB)": 712.86, "step": 2955, "train_speed(iter/s)": 0.197501 }, { "acc": 0.74540873, "epoch": 0.07508890647023476, "grad_norm": 3.40625, "learning_rate": 7.508878741755454e-06, "loss": 1.03904848, "memory(GiB)": 712.86, "step": 2960, "train_speed(iter/s)": 0.196881 }, { "acc": 0.73268886, "epoch": 0.07521574583927232, "grad_norm": 3.59375, "learning_rate": 7.521562658548961e-06, "loss": 1.02398787, "memory(GiB)": 712.86, "step": 2965, "train_speed(iter/s)": 0.196272 }, { "acc": 0.74799376, "epoch": 0.07534258520830989, "grad_norm": 3.578125, "learning_rate": 7.534246575342466e-06, "loss": 0.99966927, "memory(GiB)": 712.86, "step": 2970, "train_speed(iter/s)": 0.195764 }, { "acc": 0.74372325, "epoch": 0.07546942457734744, "grad_norm": 3.46875, "learning_rate": 7.546930492135972e-06, "loss": 1.00786362, "memory(GiB)": 712.86, "step": 2975, "train_speed(iter/s)": 0.195249 }, { "acc": 0.74997015, "epoch": 0.075596263946385, "grad_norm": 6.96875, "learning_rate": 7.559614408929477e-06, "loss": 0.99255104, "memory(GiB)": 712.86, "step": 2980, "train_speed(iter/s)": 0.194687 }, { "acc": 0.74056463, "epoch": 0.07572310331542256, "grad_norm": 3.765625, "learning_rate": 7.572298325722984e-06, "loss": 1.04237261, "memory(GiB)": 712.86, "step": 2985, "train_speed(iter/s)": 0.1942 }, { "acc": 0.75241714, "epoch": 0.07584994268446012, "grad_norm": 3.53125, "learning_rate": 7.58498224251649e-06, "loss": 1.01268606, "memory(GiB)": 712.86, "step": 2990, "train_speed(iter/s)": 0.193636 }, { "acc": 0.73697963, "epoch": 0.07597678205349767, "grad_norm": 3.84375, "learning_rate": 7.597666159309995e-06, "loss": 1.07770824, "memory(GiB)": 712.86, "step": 2995, "train_speed(iter/s)": 0.193134 }, { "acc": 0.73533049, "epoch": 0.07610362142253524, "grad_norm": 3.78125, "learning_rate": 7.610350076103501e-06, "loss": 1.0706131, "memory(GiB)": 712.86, "step": 3000, "train_speed(iter/s)": 0.192592 }, { "epoch": 0.07610362142253524, "eval_acc": 0.7298374007854861, "eval_loss": 1.005064606666565, "eval_runtime": 1150.8001, "eval_samples_per_second": 5.535, "eval_steps_per_second": 5.535, "step": 3000 }, { "acc": 0.75979223, "epoch": 0.0762304607915728, "grad_norm": 2.9375, "learning_rate": 7.623033992897007e-06, "loss": 0.97026253, "memory(GiB)": 712.86, "step": 3005, "train_speed(iter/s)": 0.172241 }, { "acc": 0.74157848, "epoch": 0.07635730016061035, "grad_norm": 4.25, "learning_rate": 7.635717909690512e-06, "loss": 1.02482281, "memory(GiB)": 712.86, "step": 3010, "train_speed(iter/s)": 0.171879 }, { "acc": 0.74563398, "epoch": 0.07648413952964792, "grad_norm": 5.53125, "learning_rate": 7.648401826484018e-06, "loss": 1.06076841, "memory(GiB)": 712.86, "step": 3015, "train_speed(iter/s)": 0.171579 }, { "acc": 0.75483041, "epoch": 0.07661097889868547, "grad_norm": 4.0625, "learning_rate": 7.661085743277524e-06, "loss": 0.9966692, "memory(GiB)": 712.86, "step": 3020, "train_speed(iter/s)": 0.171219 }, { "acc": 0.74570794, "epoch": 0.07673781826772302, "grad_norm": 3.140625, "learning_rate": 7.67376966007103e-06, "loss": 1.01437006, "memory(GiB)": 712.86, "step": 3025, "train_speed(iter/s)": 0.170887 }, { "acc": 0.7509366, "epoch": 0.07686465763676059, "grad_norm": 3.671875, "learning_rate": 7.686453576864536e-06, "loss": 0.98657579, "memory(GiB)": 712.86, "step": 3030, "train_speed(iter/s)": 0.170465 }, { "acc": 0.73626823, "epoch": 0.07699149700579815, "grad_norm": 4.09375, "learning_rate": 7.699137493658042e-06, "loss": 1.04591904, "memory(GiB)": 712.86, "step": 3035, "train_speed(iter/s)": 0.170075 }, { "acc": 0.74100199, "epoch": 0.0771183363748357, "grad_norm": 3.46875, "learning_rate": 7.711821410451548e-06, "loss": 0.98191643, "memory(GiB)": 712.86, "step": 3040, "train_speed(iter/s)": 0.169691 }, { "acc": 0.74500542, "epoch": 0.07724517574387327, "grad_norm": 3.78125, "learning_rate": 7.724505327245054e-06, "loss": 1.0112915, "memory(GiB)": 712.86, "step": 3045, "train_speed(iter/s)": 0.169386 }, { "acc": 0.73908501, "epoch": 0.07737201511291082, "grad_norm": 4.1875, "learning_rate": 7.73718924403856e-06, "loss": 1.03803358, "memory(GiB)": 712.86, "step": 3050, "train_speed(iter/s)": 0.169042 }, { "acc": 0.74339809, "epoch": 0.07749885448194838, "grad_norm": 3.46875, "learning_rate": 7.749873160832066e-06, "loss": 1.0345665, "memory(GiB)": 712.86, "step": 3055, "train_speed(iter/s)": 0.168693 }, { "acc": 0.7480711, "epoch": 0.07762569385098594, "grad_norm": 3.421875, "learning_rate": 7.76255707762557e-06, "loss": 0.97479153, "memory(GiB)": 712.86, "step": 3060, "train_speed(iter/s)": 0.168353 }, { "acc": 0.74977193, "epoch": 0.0777525332200235, "grad_norm": 3.859375, "learning_rate": 7.775240994419078e-06, "loss": 0.99698706, "memory(GiB)": 712.86, "step": 3065, "train_speed(iter/s)": 0.167979 }, { "acc": 0.74214401, "epoch": 0.07787937258906105, "grad_norm": 3.765625, "learning_rate": 7.787924911212584e-06, "loss": 1.02702379, "memory(GiB)": 712.86, "step": 3070, "train_speed(iter/s)": 0.16759 }, { "acc": 0.74774446, "epoch": 0.07800621195809862, "grad_norm": 4.21875, "learning_rate": 7.800608828006088e-06, "loss": 1.03893309, "memory(GiB)": 712.86, "step": 3075, "train_speed(iter/s)": 0.167327 }, { "acc": 0.73598084, "epoch": 0.07813305132713617, "grad_norm": 3.5, "learning_rate": 7.813292744799594e-06, "loss": 1.02898722, "memory(GiB)": 712.86, "step": 3080, "train_speed(iter/s)": 0.167013 }, { "acc": 0.73516078, "epoch": 0.07825989069617373, "grad_norm": 3.9375, "learning_rate": 7.8259766615931e-06, "loss": 1.0013134, "memory(GiB)": 712.86, "step": 3085, "train_speed(iter/s)": 0.166564 }, { "acc": 0.73109941, "epoch": 0.0783867300652113, "grad_norm": 4.21875, "learning_rate": 7.838660578386606e-06, "loss": 1.06972713, "memory(GiB)": 712.86, "step": 3090, "train_speed(iter/s)": 0.166273 }, { "acc": 0.74395905, "epoch": 0.07851356943424885, "grad_norm": 3.3125, "learning_rate": 7.851344495180112e-06, "loss": 1.02143135, "memory(GiB)": 712.86, "step": 3095, "train_speed(iter/s)": 0.165934 }, { "acc": 0.73886971, "epoch": 0.0786404088032864, "grad_norm": 3.578125, "learning_rate": 7.864028411973618e-06, "loss": 1.05474262, "memory(GiB)": 712.86, "step": 3100, "train_speed(iter/s)": 0.165564 }, { "acc": 0.73085713, "epoch": 0.07876724817232397, "grad_norm": 4.09375, "learning_rate": 7.876712328767124e-06, "loss": 1.0593298, "memory(GiB)": 712.86, "step": 3105, "train_speed(iter/s)": 0.165245 }, { "acc": 0.74066849, "epoch": 0.07889408754136153, "grad_norm": 3.703125, "learning_rate": 7.88939624556063e-06, "loss": 1.07523575, "memory(GiB)": 712.86, "step": 3110, "train_speed(iter/s)": 0.164925 }, { "acc": 0.74822536, "epoch": 0.07902092691039908, "grad_norm": 3.796875, "learning_rate": 7.902080162354136e-06, "loss": 0.97126989, "memory(GiB)": 712.86, "step": 3115, "train_speed(iter/s)": 0.16464 }, { "acc": 0.76483145, "epoch": 0.07914776627943665, "grad_norm": 4.125, "learning_rate": 7.91476407914764e-06, "loss": 0.96388378, "memory(GiB)": 712.86, "step": 3120, "train_speed(iter/s)": 0.164313 }, { "acc": 0.75258322, "epoch": 0.0792746056484742, "grad_norm": 3.78125, "learning_rate": 7.927447995941148e-06, "loss": 1.00401297, "memory(GiB)": 712.86, "step": 3125, "train_speed(iter/s)": 0.164007 }, { "acc": 0.73388195, "epoch": 0.07940144501751176, "grad_norm": 3.875, "learning_rate": 7.940131912734654e-06, "loss": 1.0399539, "memory(GiB)": 712.86, "step": 3130, "train_speed(iter/s)": 0.163748 }, { "acc": 0.75992727, "epoch": 0.07952828438654932, "grad_norm": 3.890625, "learning_rate": 7.952815829528158e-06, "loss": 0.9760993, "memory(GiB)": 712.86, "step": 3135, "train_speed(iter/s)": 0.163437 }, { "acc": 0.74468346, "epoch": 0.07965512375558688, "grad_norm": 4.15625, "learning_rate": 7.965499746321664e-06, "loss": 1.0373498, "memory(GiB)": 712.86, "step": 3140, "train_speed(iter/s)": 0.163114 }, { "acc": 0.73826385, "epoch": 0.07978196312462443, "grad_norm": 4.0, "learning_rate": 7.978183663115172e-06, "loss": 1.09880123, "memory(GiB)": 712.86, "step": 3145, "train_speed(iter/s)": 0.16285 }, { "acc": 0.74062409, "epoch": 0.079908802493662, "grad_norm": 2.984375, "learning_rate": 7.990867579908676e-06, "loss": 1.05002842, "memory(GiB)": 712.86, "step": 3150, "train_speed(iter/s)": 0.162534 }, { "acc": 0.73696413, "epoch": 0.08003564186269956, "grad_norm": 4.625, "learning_rate": 8.003551496702182e-06, "loss": 1.06571522, "memory(GiB)": 712.86, "step": 3155, "train_speed(iter/s)": 0.162225 }, { "acc": 0.73647451, "epoch": 0.08016248123173711, "grad_norm": 3.828125, "learning_rate": 8.016235413495688e-06, "loss": 1.02954578, "memory(GiB)": 712.86, "step": 3160, "train_speed(iter/s)": 0.161908 }, { "acc": 0.74248118, "epoch": 0.08028932060077468, "grad_norm": 4.96875, "learning_rate": 8.028919330289194e-06, "loss": 1.06327133, "memory(GiB)": 712.86, "step": 3165, "train_speed(iter/s)": 0.161563 }, { "acc": 0.75167189, "epoch": 0.08041615996981223, "grad_norm": 3.265625, "learning_rate": 8.0416032470827e-06, "loss": 0.9998889, "memory(GiB)": 712.86, "step": 3170, "train_speed(iter/s)": 0.161272 }, { "acc": 0.73922911, "epoch": 0.08054299933884979, "grad_norm": 4.34375, "learning_rate": 8.054287163876206e-06, "loss": 1.04964743, "memory(GiB)": 712.86, "step": 3175, "train_speed(iter/s)": 0.160941 }, { "acc": 0.7515255, "epoch": 0.08066983870788735, "grad_norm": 4.5625, "learning_rate": 8.066971080669712e-06, "loss": 0.98818817, "memory(GiB)": 712.86, "step": 3180, "train_speed(iter/s)": 0.160692 }, { "acc": 0.73134961, "epoch": 0.08079667807692491, "grad_norm": 3.84375, "learning_rate": 8.079654997463218e-06, "loss": 1.10216141, "memory(GiB)": 712.86, "step": 3185, "train_speed(iter/s)": 0.160444 }, { "acc": 0.74619994, "epoch": 0.08092351744596246, "grad_norm": 11.3125, "learning_rate": 8.092338914256724e-06, "loss": 0.99366808, "memory(GiB)": 712.86, "step": 3190, "train_speed(iter/s)": 0.160136 }, { "acc": 0.73170948, "epoch": 0.08105035681500003, "grad_norm": 4.46875, "learning_rate": 8.105022831050228e-06, "loss": 1.01828842, "memory(GiB)": 712.86, "step": 3195, "train_speed(iter/s)": 0.15985 }, { "acc": 0.74330025, "epoch": 0.08117719618403758, "grad_norm": 3.59375, "learning_rate": 8.117706747843734e-06, "loss": 1.01840935, "memory(GiB)": 712.86, "step": 3200, "train_speed(iter/s)": 0.15955 }, { "acc": 0.74462204, "epoch": 0.08130403555307514, "grad_norm": 3.703125, "learning_rate": 8.130390664637242e-06, "loss": 1.0255085, "memory(GiB)": 712.86, "step": 3205, "train_speed(iter/s)": 0.159238 }, { "acc": 0.75661931, "epoch": 0.0814308749221127, "grad_norm": 3.625, "learning_rate": 8.143074581430746e-06, "loss": 0.98465443, "memory(GiB)": 712.86, "step": 3210, "train_speed(iter/s)": 0.158919 }, { "acc": 0.75253773, "epoch": 0.08155771429115026, "grad_norm": 3.765625, "learning_rate": 8.155758498224252e-06, "loss": 1.02504807, "memory(GiB)": 712.86, "step": 3215, "train_speed(iter/s)": 0.158642 }, { "acc": 0.75237379, "epoch": 0.08168455366018781, "grad_norm": 3.828125, "learning_rate": 8.168442415017758e-06, "loss": 1.01671095, "memory(GiB)": 712.86, "step": 3220, "train_speed(iter/s)": 0.158369 }, { "acc": 0.73394365, "epoch": 0.08181139302922538, "grad_norm": 4.34375, "learning_rate": 8.181126331811264e-06, "loss": 1.06683931, "memory(GiB)": 712.86, "step": 3225, "train_speed(iter/s)": 0.158109 }, { "acc": 0.74555378, "epoch": 0.08193823239826294, "grad_norm": 3.328125, "learning_rate": 8.19381024860477e-06, "loss": 0.99379778, "memory(GiB)": 712.86, "step": 3230, "train_speed(iter/s)": 0.157901 }, { "acc": 0.74124413, "epoch": 0.08206507176730049, "grad_norm": 3.71875, "learning_rate": 8.206494165398276e-06, "loss": 0.97062759, "memory(GiB)": 712.86, "step": 3235, "train_speed(iter/s)": 0.157664 }, { "acc": 0.73749018, "epoch": 0.08219191113633806, "grad_norm": 3.875, "learning_rate": 8.219178082191782e-06, "loss": 1.0017561, "memory(GiB)": 712.86, "step": 3240, "train_speed(iter/s)": 0.157384 }, { "acc": 0.74744635, "epoch": 0.08231875050537561, "grad_norm": 3.734375, "learning_rate": 8.231861998985288e-06, "loss": 1.00319672, "memory(GiB)": 712.86, "step": 3245, "train_speed(iter/s)": 0.157116 }, { "acc": 0.75339308, "epoch": 0.08244558987441317, "grad_norm": 3.796875, "learning_rate": 8.244545915778794e-06, "loss": 0.96730051, "memory(GiB)": 712.86, "step": 3250, "train_speed(iter/s)": 0.156839 }, { "acc": 0.73677135, "epoch": 0.08257242924345073, "grad_norm": 3.0625, "learning_rate": 8.2572298325723e-06, "loss": 1.01608162, "memory(GiB)": 712.86, "step": 3255, "train_speed(iter/s)": 0.156548 }, { "acc": 0.74584975, "epoch": 0.08269926861248829, "grad_norm": 3.390625, "learning_rate": 8.269913749365804e-06, "loss": 0.97694054, "memory(GiB)": 712.86, "step": 3260, "train_speed(iter/s)": 0.156299 }, { "acc": 0.74107103, "epoch": 0.08282610798152584, "grad_norm": 4.09375, "learning_rate": 8.282597666159312e-06, "loss": 1.00809908, "memory(GiB)": 712.86, "step": 3265, "train_speed(iter/s)": 0.156065 }, { "acc": 0.73910408, "epoch": 0.08295294735056341, "grad_norm": 3.9375, "learning_rate": 8.295281582952816e-06, "loss": 1.04563932, "memory(GiB)": 712.86, "step": 3270, "train_speed(iter/s)": 0.155826 }, { "acc": 0.74723768, "epoch": 0.08307978671960096, "grad_norm": 5.53125, "learning_rate": 8.307965499746322e-06, "loss": 0.9926672, "memory(GiB)": 725.1, "step": 3275, "train_speed(iter/s)": 0.155481 }, { "acc": 0.73783526, "epoch": 0.08320662608863852, "grad_norm": 3.34375, "learning_rate": 8.320649416539828e-06, "loss": 1.08285818, "memory(GiB)": 725.1, "step": 3280, "train_speed(iter/s)": 0.155272 }, { "acc": 0.75411696, "epoch": 0.08333346545767609, "grad_norm": 4.21875, "learning_rate": 8.333333333333334e-06, "loss": 0.95669899, "memory(GiB)": 725.1, "step": 3285, "train_speed(iter/s)": 0.155034 }, { "acc": 0.74923158, "epoch": 0.08346030482671364, "grad_norm": 4.15625, "learning_rate": 8.34601725012684e-06, "loss": 0.95244007, "memory(GiB)": 725.1, "step": 3290, "train_speed(iter/s)": 0.154798 }, { "acc": 0.75026278, "epoch": 0.0835871441957512, "grad_norm": 5.09375, "learning_rate": 8.358701166920346e-06, "loss": 1.03908396, "memory(GiB)": 725.1, "step": 3295, "train_speed(iter/s)": 0.154535 }, { "acc": 0.72459912, "epoch": 0.08371398356478876, "grad_norm": 3.5625, "learning_rate": 8.371385083713852e-06, "loss": 1.13757792, "memory(GiB)": 725.1, "step": 3300, "train_speed(iter/s)": 0.154292 }, { "acc": 0.73599329, "epoch": 0.08384082293382632, "grad_norm": 4.25, "learning_rate": 8.384069000507358e-06, "loss": 1.03929024, "memory(GiB)": 725.1, "step": 3305, "train_speed(iter/s)": 0.154076 }, { "acc": 0.7270957, "epoch": 0.08396766230286387, "grad_norm": 3.46875, "learning_rate": 8.396752917300864e-06, "loss": 1.03871651, "memory(GiB)": 725.1, "step": 3310, "train_speed(iter/s)": 0.153848 }, { "acc": 0.73062596, "epoch": 0.08409450167190144, "grad_norm": 3.4375, "learning_rate": 8.40943683409437e-06, "loss": 1.01835394, "memory(GiB)": 725.1, "step": 3315, "train_speed(iter/s)": 0.153589 }, { "acc": 0.73999295, "epoch": 0.084221341040939, "grad_norm": 3.6875, "learning_rate": 8.422120750887874e-06, "loss": 1.07226906, "memory(GiB)": 725.1, "step": 3320, "train_speed(iter/s)": 0.153283 }, { "acc": 0.73936572, "epoch": 0.08434818040997655, "grad_norm": 3.453125, "learning_rate": 8.434804667681381e-06, "loss": 1.04194069, "memory(GiB)": 725.1, "step": 3325, "train_speed(iter/s)": 0.153054 }, { "acc": 0.74366288, "epoch": 0.08447501977901412, "grad_norm": 3.421875, "learning_rate": 8.447488584474887e-06, "loss": 1.03205614, "memory(GiB)": 725.1, "step": 3330, "train_speed(iter/s)": 0.152775 }, { "acc": 0.74873748, "epoch": 0.08460185914805167, "grad_norm": 4.28125, "learning_rate": 8.460172501268392e-06, "loss": 1.01290073, "memory(GiB)": 725.1, "step": 3335, "train_speed(iter/s)": 0.152505 }, { "acc": 0.74574733, "epoch": 0.08472869851708922, "grad_norm": 3.390625, "learning_rate": 8.472856418061898e-06, "loss": 1.01715107, "memory(GiB)": 725.1, "step": 3340, "train_speed(iter/s)": 0.152276 }, { "acc": 0.75098009, "epoch": 0.08485553788612679, "grad_norm": 3.953125, "learning_rate": 8.485540334855404e-06, "loss": 1.01160402, "memory(GiB)": 737.53, "step": 3345, "train_speed(iter/s)": 0.151999 }, { "acc": 0.75636835, "epoch": 0.08498237725516435, "grad_norm": 3.65625, "learning_rate": 8.49822425164891e-06, "loss": 1.0057826, "memory(GiB)": 737.53, "step": 3350, "train_speed(iter/s)": 0.151718 }, { "acc": 0.73685312, "epoch": 0.0851092166242019, "grad_norm": 3.296875, "learning_rate": 8.510908168442416e-06, "loss": 1.02014723, "memory(GiB)": 737.53, "step": 3355, "train_speed(iter/s)": 0.151421 }, { "acc": 0.72669983, "epoch": 0.08523605599323947, "grad_norm": 4.34375, "learning_rate": 8.523592085235922e-06, "loss": 1.03470345, "memory(GiB)": 737.53, "step": 3360, "train_speed(iter/s)": 0.151248 }, { "acc": 0.73357959, "epoch": 0.08536289536227702, "grad_norm": 3.78125, "learning_rate": 8.536276002029428e-06, "loss": 1.02725592, "memory(GiB)": 737.53, "step": 3365, "train_speed(iter/s)": 0.151015 }, { "acc": 0.74428148, "epoch": 0.08548973473131458, "grad_norm": 4.5, "learning_rate": 8.548959918822933e-06, "loss": 1.04656973, "memory(GiB)": 737.53, "step": 3370, "train_speed(iter/s)": 0.150802 }, { "acc": 0.73577332, "epoch": 0.08561657410035214, "grad_norm": 3.765625, "learning_rate": 8.56164383561644e-06, "loss": 1.04299755, "memory(GiB)": 737.53, "step": 3375, "train_speed(iter/s)": 0.150565 }, { "acc": 0.75590944, "epoch": 0.0857434134693897, "grad_norm": 4.46875, "learning_rate": 8.574327752409944e-06, "loss": 0.98649263, "memory(GiB)": 737.53, "step": 3380, "train_speed(iter/s)": 0.150295 }, { "acc": 0.7454843, "epoch": 0.08587025283842725, "grad_norm": 3.890625, "learning_rate": 8.587011669203451e-06, "loss": 1.00970383, "memory(GiB)": 737.53, "step": 3385, "train_speed(iter/s)": 0.150048 }, { "acc": 0.75881181, "epoch": 0.08599709220746482, "grad_norm": 3.875, "learning_rate": 8.599695585996957e-06, "loss": 0.98785906, "memory(GiB)": 737.53, "step": 3390, "train_speed(iter/s)": 0.149854 }, { "acc": 0.73549542, "epoch": 0.08612393157650237, "grad_norm": 4.34375, "learning_rate": 8.612379502790462e-06, "loss": 1.08945827, "memory(GiB)": 737.53, "step": 3395, "train_speed(iter/s)": 0.149603 }, { "acc": 0.75274725, "epoch": 0.08625077094553993, "grad_norm": 4.0625, "learning_rate": 8.625063419583968e-06, "loss": 1.01588573, "memory(GiB)": 737.53, "step": 3400, "train_speed(iter/s)": 0.149383 }, { "acc": 0.74035139, "epoch": 0.0863776103145775, "grad_norm": 3.53125, "learning_rate": 8.637747336377475e-06, "loss": 1.0308527, "memory(GiB)": 737.53, "step": 3405, "train_speed(iter/s)": 0.149178 }, { "acc": 0.75327072, "epoch": 0.08650444968361505, "grad_norm": 4.3125, "learning_rate": 8.65043125317098e-06, "loss": 1.06523609, "memory(GiB)": 737.53, "step": 3410, "train_speed(iter/s)": 0.149018 }, { "acc": 0.73736567, "epoch": 0.0866312890526526, "grad_norm": 3.921875, "learning_rate": 8.663115169964485e-06, "loss": 1.00675535, "memory(GiB)": 737.53, "step": 3415, "train_speed(iter/s)": 0.148827 }, { "acc": 0.72853551, "epoch": 0.08675812842169017, "grad_norm": 4.21875, "learning_rate": 8.675799086757991e-06, "loss": 1.08170986, "memory(GiB)": 737.53, "step": 3420, "train_speed(iter/s)": 0.14861 }, { "acc": 0.74386406, "epoch": 0.08688496779072773, "grad_norm": 3.84375, "learning_rate": 8.688483003551497e-06, "loss": 1.0507349, "memory(GiB)": 737.53, "step": 3425, "train_speed(iter/s)": 0.148375 }, { "acc": 0.74743357, "epoch": 0.08701180715976528, "grad_norm": 3.5625, "learning_rate": 8.701166920345003e-06, "loss": 1.03419323, "memory(GiB)": 737.53, "step": 3430, "train_speed(iter/s)": 0.148132 }, { "acc": 0.75259447, "epoch": 0.08713864652880285, "grad_norm": 4.4375, "learning_rate": 8.71385083713851e-06, "loss": 1.01483126, "memory(GiB)": 737.53, "step": 3435, "train_speed(iter/s)": 0.147912 }, { "acc": 0.72126675, "epoch": 0.0872654858978404, "grad_norm": 4.25, "learning_rate": 8.726534753932014e-06, "loss": 1.13936214, "memory(GiB)": 737.53, "step": 3440, "train_speed(iter/s)": 0.14773 }, { "acc": 0.74562607, "epoch": 0.08739232526687796, "grad_norm": 4.34375, "learning_rate": 8.739218670725521e-06, "loss": 1.00316896, "memory(GiB)": 737.53, "step": 3445, "train_speed(iter/s)": 0.147527 }, { "acc": 0.74423041, "epoch": 0.08751916463591553, "grad_norm": 3.703125, "learning_rate": 8.751902587519027e-06, "loss": 0.98094015, "memory(GiB)": 737.53, "step": 3450, "train_speed(iter/s)": 0.147331 }, { "acc": 0.73170328, "epoch": 0.08764600400495308, "grad_norm": 4.40625, "learning_rate": 8.764586504312532e-06, "loss": 1.08793011, "memory(GiB)": 737.53, "step": 3455, "train_speed(iter/s)": 0.147118 }, { "acc": 0.74760985, "epoch": 0.08777284337399063, "grad_norm": 3.671875, "learning_rate": 8.777270421106037e-06, "loss": 1.05466089, "memory(GiB)": 737.53, "step": 3460, "train_speed(iter/s)": 0.146874 }, { "acc": 0.75005431, "epoch": 0.0878996827430282, "grad_norm": 3.421875, "learning_rate": 8.789954337899545e-06, "loss": 1.01000834, "memory(GiB)": 737.53, "step": 3465, "train_speed(iter/s)": 0.146674 }, { "acc": 0.74119024, "epoch": 0.08802652211206576, "grad_norm": 4.25, "learning_rate": 8.80263825469305e-06, "loss": 1.0565897, "memory(GiB)": 737.53, "step": 3470, "train_speed(iter/s)": 0.146492 }, { "acc": 0.75601597, "epoch": 0.08815336148110331, "grad_norm": 3.859375, "learning_rate": 8.815322171486555e-06, "loss": 0.97477751, "memory(GiB)": 737.53, "step": 3475, "train_speed(iter/s)": 0.146288 }, { "acc": 0.74294767, "epoch": 0.08828020085014088, "grad_norm": 3.734375, "learning_rate": 8.828006088280061e-06, "loss": 1.02450895, "memory(GiB)": 737.53, "step": 3480, "train_speed(iter/s)": 0.146122 }, { "acc": 0.75352597, "epoch": 0.08840704021917843, "grad_norm": 4.125, "learning_rate": 8.840690005073567e-06, "loss": 0.99151192, "memory(GiB)": 737.53, "step": 3485, "train_speed(iter/s)": 0.145908 }, { "acc": 0.73837776, "epoch": 0.08853387958821599, "grad_norm": 3.984375, "learning_rate": 8.853373921867073e-06, "loss": 1.01401043, "memory(GiB)": 737.53, "step": 3490, "train_speed(iter/s)": 0.145704 }, { "acc": 0.74483185, "epoch": 0.08866071895725355, "grad_norm": 3.453125, "learning_rate": 8.86605783866058e-06, "loss": 1.03784657, "memory(GiB)": 737.53, "step": 3495, "train_speed(iter/s)": 0.145499 }, { "acc": 0.75128078, "epoch": 0.08878755832629111, "grad_norm": 3.140625, "learning_rate": 8.878741755454085e-06, "loss": 0.9529952, "memory(GiB)": 737.53, "step": 3500, "train_speed(iter/s)": 0.145307 }, { "epoch": 0.08878755832629111, "eval_acc": 0.7324739302754716, "eval_loss": 0.990281879901886, "eval_runtime": 1150.4332, "eval_samples_per_second": 5.537, "eval_steps_per_second": 5.537, "step": 3500 }, { "acc": 0.74731226, "epoch": 0.08891439769532866, "grad_norm": 3.359375, "learning_rate": 8.891425672247591e-06, "loss": 0.97767258, "memory(GiB)": 737.53, "step": 3505, "train_speed(iter/s)": 0.134587 }, { "acc": 0.74541616, "epoch": 0.08904123706436623, "grad_norm": 3.34375, "learning_rate": 8.904109589041097e-06, "loss": 1.01794033, "memory(GiB)": 737.53, "step": 3510, "train_speed(iter/s)": 0.134432 }, { "acc": 0.73357639, "epoch": 0.08916807643340378, "grad_norm": 5.15625, "learning_rate": 8.916793505834601e-06, "loss": 1.07534943, "memory(GiB)": 737.53, "step": 3515, "train_speed(iter/s)": 0.134307 }, { "acc": 0.74227777, "epoch": 0.08929491580244134, "grad_norm": 3.71875, "learning_rate": 8.929477422628107e-06, "loss": 1.0228507, "memory(GiB)": 737.53, "step": 3520, "train_speed(iter/s)": 0.134172 }, { "acc": 0.7491303, "epoch": 0.0894217551714789, "grad_norm": 4.75, "learning_rate": 8.942161339421615e-06, "loss": 1.04462328, "memory(GiB)": 737.53, "step": 3525, "train_speed(iter/s)": 0.134009 }, { "acc": 0.74492898, "epoch": 0.08954859454051646, "grad_norm": 4.78125, "learning_rate": 8.95484525621512e-06, "loss": 1.02567501, "memory(GiB)": 737.53, "step": 3530, "train_speed(iter/s)": 0.13386 }, { "acc": 0.73820953, "epoch": 0.08967543390955401, "grad_norm": 3.84375, "learning_rate": 8.967529173008625e-06, "loss": 1.00099554, "memory(GiB)": 737.53, "step": 3535, "train_speed(iter/s)": 0.133709 }, { "acc": 0.74376373, "epoch": 0.08980227327859158, "grad_norm": 3.28125, "learning_rate": 8.980213089802131e-06, "loss": 0.96745014, "memory(GiB)": 737.53, "step": 3540, "train_speed(iter/s)": 0.133561 }, { "acc": 0.75709805, "epoch": 0.08992911264762914, "grad_norm": 3.5, "learning_rate": 8.992897006595637e-06, "loss": 0.97940092, "memory(GiB)": 737.53, "step": 3545, "train_speed(iter/s)": 0.133377 }, { "acc": 0.74229579, "epoch": 0.09005595201666669, "grad_norm": 3.5, "learning_rate": 9.005580923389143e-06, "loss": 1.00915432, "memory(GiB)": 737.53, "step": 3550, "train_speed(iter/s)": 0.133196 }, { "acc": 0.73166947, "epoch": 0.09018279138570426, "grad_norm": 3.375, "learning_rate": 9.01826484018265e-06, "loss": 1.01725092, "memory(GiB)": 737.53, "step": 3555, "train_speed(iter/s)": 0.133064 }, { "acc": 0.73444114, "epoch": 0.09030963075474181, "grad_norm": 4.875, "learning_rate": 9.030948756976155e-06, "loss": 1.07198925, "memory(GiB)": 737.53, "step": 3560, "train_speed(iter/s)": 0.132945 }, { "acc": 0.73994389, "epoch": 0.09043647012377937, "grad_norm": 3.4375, "learning_rate": 9.043632673769661e-06, "loss": 1.05429831, "memory(GiB)": 737.53, "step": 3565, "train_speed(iter/s)": 0.132796 }, { "acc": 0.74092727, "epoch": 0.09056330949281693, "grad_norm": 3.765625, "learning_rate": 9.056316590563167e-06, "loss": 0.99938164, "memory(GiB)": 737.53, "step": 3570, "train_speed(iter/s)": 0.132668 }, { "acc": 0.74497414, "epoch": 0.09069014886185449, "grad_norm": 3.234375, "learning_rate": 9.069000507356673e-06, "loss": 0.987257, "memory(GiB)": 737.53, "step": 3575, "train_speed(iter/s)": 0.132547 }, { "acc": 0.74285455, "epoch": 0.09081698823089204, "grad_norm": 3.953125, "learning_rate": 9.081684424150177e-06, "loss": 1.02823792, "memory(GiB)": 737.53, "step": 3580, "train_speed(iter/s)": 0.132423 }, { "acc": 0.7390224, "epoch": 0.09094382759992961, "grad_norm": 3.59375, "learning_rate": 9.094368340943685e-06, "loss": 0.98630085, "memory(GiB)": 737.53, "step": 3585, "train_speed(iter/s)": 0.132294 }, { "acc": 0.73801417, "epoch": 0.09107066696896716, "grad_norm": 4.625, "learning_rate": 9.10705225773719e-06, "loss": 0.94992638, "memory(GiB)": 737.53, "step": 3590, "train_speed(iter/s)": 0.13219 }, { "acc": 0.74053278, "epoch": 0.09119750633800472, "grad_norm": 5.71875, "learning_rate": 9.119736174530695e-06, "loss": 1.07699223, "memory(GiB)": 737.53, "step": 3595, "train_speed(iter/s)": 0.132089 }, { "acc": 0.74614739, "epoch": 0.09132434570704229, "grad_norm": 3.71875, "learning_rate": 9.132420091324201e-06, "loss": 0.99928226, "memory(GiB)": 737.53, "step": 3600, "train_speed(iter/s)": 0.131903 }, { "acc": 0.763939, "epoch": 0.09145118507607984, "grad_norm": 3.59375, "learning_rate": 9.145104008117707e-06, "loss": 0.94673748, "memory(GiB)": 737.53, "step": 3605, "train_speed(iter/s)": 0.131738 }, { "acc": 0.73564816, "epoch": 0.0915780244451174, "grad_norm": 3.09375, "learning_rate": 9.157787924911213e-06, "loss": 1.05662947, "memory(GiB)": 737.53, "step": 3610, "train_speed(iter/s)": 0.131616 }, { "acc": 0.74418159, "epoch": 0.09170486381415496, "grad_norm": 3.328125, "learning_rate": 9.170471841704719e-06, "loss": 1.03677139, "memory(GiB)": 737.53, "step": 3615, "train_speed(iter/s)": 0.131479 }, { "acc": 0.73897696, "epoch": 0.09183170318319252, "grad_norm": 4.40625, "learning_rate": 9.183155758498225e-06, "loss": 1.0220293, "memory(GiB)": 737.53, "step": 3620, "train_speed(iter/s)": 0.131341 }, { "acc": 0.73506179, "epoch": 0.09195854255223007, "grad_norm": 3.390625, "learning_rate": 9.195839675291731e-06, "loss": 1.05671482, "memory(GiB)": 737.53, "step": 3625, "train_speed(iter/s)": 0.131204 }, { "acc": 0.74403915, "epoch": 0.09208538192126764, "grad_norm": 3.84375, "learning_rate": 9.208523592085237e-06, "loss": 0.99435663, "memory(GiB)": 737.53, "step": 3630, "train_speed(iter/s)": 0.131066 }, { "acc": 0.75708375, "epoch": 0.0922122212903052, "grad_norm": 4.1875, "learning_rate": 9.221207508878743e-06, "loss": 0.95827208, "memory(GiB)": 737.53, "step": 3635, "train_speed(iter/s)": 0.130946 }, { "acc": 0.74024777, "epoch": 0.09233906065934275, "grad_norm": 3.96875, "learning_rate": 9.233891425672247e-06, "loss": 1.04295397, "memory(GiB)": 737.53, "step": 3640, "train_speed(iter/s)": 0.130809 }, { "acc": 0.73554759, "epoch": 0.09246590002838032, "grad_norm": 3.890625, "learning_rate": 9.246575342465755e-06, "loss": 1.05170507, "memory(GiB)": 737.53, "step": 3645, "train_speed(iter/s)": 0.130677 }, { "acc": 0.74252329, "epoch": 0.09259273939741787, "grad_norm": 3.734375, "learning_rate": 9.25925925925926e-06, "loss": 1.03678904, "memory(GiB)": 737.53, "step": 3650, "train_speed(iter/s)": 0.130505 }, { "acc": 0.75265646, "epoch": 0.09271957876645542, "grad_norm": 3.53125, "learning_rate": 9.271943176052765e-06, "loss": 0.97542276, "memory(GiB)": 737.53, "step": 3655, "train_speed(iter/s)": 0.130375 }, { "acc": 0.76769323, "epoch": 0.09284641813549299, "grad_norm": 4.53125, "learning_rate": 9.284627092846271e-06, "loss": 0.9608077, "memory(GiB)": 737.53, "step": 3660, "train_speed(iter/s)": 0.130279 }, { "acc": 0.7554141, "epoch": 0.09297325750453055, "grad_norm": 4.03125, "learning_rate": 9.297311009639777e-06, "loss": 1.04752588, "memory(GiB)": 737.53, "step": 3665, "train_speed(iter/s)": 0.13017 }, { "acc": 0.74331608, "epoch": 0.0931000968735681, "grad_norm": 3.984375, "learning_rate": 9.309994926433283e-06, "loss": 1.06705046, "memory(GiB)": 737.53, "step": 3670, "train_speed(iter/s)": 0.130028 }, { "acc": 0.73426709, "epoch": 0.09322693624260567, "grad_norm": 3.703125, "learning_rate": 9.322678843226789e-06, "loss": 1.08360577, "memory(GiB)": 737.53, "step": 3675, "train_speed(iter/s)": 0.129924 }, { "acc": 0.7441246, "epoch": 0.09335377561164322, "grad_norm": 4.75, "learning_rate": 9.335362760020295e-06, "loss": 1.06544094, "memory(GiB)": 737.53, "step": 3680, "train_speed(iter/s)": 0.1298 }, { "acc": 0.73959866, "epoch": 0.09348061498068078, "grad_norm": 3.953125, "learning_rate": 9.348046676813801e-06, "loss": 1.00209017, "memory(GiB)": 737.53, "step": 3685, "train_speed(iter/s)": 0.129678 }, { "acc": 0.73663335, "epoch": 0.09360745434971834, "grad_norm": 3.6875, "learning_rate": 9.360730593607307e-06, "loss": 1.03826342, "memory(GiB)": 737.53, "step": 3690, "train_speed(iter/s)": 0.129557 }, { "acc": 0.75873685, "epoch": 0.0937342937187559, "grad_norm": 4.21875, "learning_rate": 9.373414510400813e-06, "loss": 0.95116854, "memory(GiB)": 737.53, "step": 3695, "train_speed(iter/s)": 0.129419 }, { "acc": 0.73513823, "epoch": 0.09386113308779345, "grad_norm": 4.0625, "learning_rate": 9.386098427194317e-06, "loss": 1.10369635, "memory(GiB)": 737.53, "step": 3700, "train_speed(iter/s)": 0.12931 }, { "acc": 0.73579359, "epoch": 0.09398797245683102, "grad_norm": 3.765625, "learning_rate": 9.398782343987825e-06, "loss": 1.06466475, "memory(GiB)": 737.53, "step": 3705, "train_speed(iter/s)": 0.129187 }, { "acc": 0.74712749, "epoch": 0.09411481182586857, "grad_norm": 3.75, "learning_rate": 9.41146626078133e-06, "loss": 0.95465021, "memory(GiB)": 737.53, "step": 3710, "train_speed(iter/s)": 0.129072 }, { "acc": 0.75587149, "epoch": 0.09424165119490613, "grad_norm": 4.28125, "learning_rate": 9.424150177574835e-06, "loss": 1.01661139, "memory(GiB)": 737.53, "step": 3715, "train_speed(iter/s)": 0.128952 }, { "acc": 0.73995361, "epoch": 0.09436849056394368, "grad_norm": 4.25, "learning_rate": 9.436834094368341e-06, "loss": 1.0141489, "memory(GiB)": 737.53, "step": 3720, "train_speed(iter/s)": 0.12882 }, { "acc": 0.74958839, "epoch": 0.09449532993298125, "grad_norm": 4.4375, "learning_rate": 9.449518011161849e-06, "loss": 0.99550219, "memory(GiB)": 737.53, "step": 3725, "train_speed(iter/s)": 0.128676 }, { "acc": 0.75727515, "epoch": 0.0946221693020188, "grad_norm": 3.5, "learning_rate": 9.462201927955353e-06, "loss": 0.965205, "memory(GiB)": 737.53, "step": 3730, "train_speed(iter/s)": 0.128573 }, { "acc": 0.73767533, "epoch": 0.09474900867105636, "grad_norm": 3.46875, "learning_rate": 9.474885844748859e-06, "loss": 1.03593235, "memory(GiB)": 737.53, "step": 3735, "train_speed(iter/s)": 0.128455 }, { "acc": 0.74304743, "epoch": 0.09487584804009393, "grad_norm": 3.40625, "learning_rate": 9.487569761542365e-06, "loss": 0.97849922, "memory(GiB)": 737.53, "step": 3740, "train_speed(iter/s)": 0.128299 }, { "acc": 0.74662423, "epoch": 0.09500268740913148, "grad_norm": 3.84375, "learning_rate": 9.50025367833587e-06, "loss": 1.02517424, "memory(GiB)": 737.53, "step": 3745, "train_speed(iter/s)": 0.128172 }, { "acc": 0.73667212, "epoch": 0.09512952677816904, "grad_norm": 3.703125, "learning_rate": 9.512937595129377e-06, "loss": 1.04222202, "memory(GiB)": 737.53, "step": 3750, "train_speed(iter/s)": 0.128071 }, { "acc": 0.74892964, "epoch": 0.0952563661472066, "grad_norm": 3.96875, "learning_rate": 9.525621511922883e-06, "loss": 1.06781025, "memory(GiB)": 737.53, "step": 3755, "train_speed(iter/s)": 0.127946 }, { "acc": 0.73083372, "epoch": 0.09538320551624416, "grad_norm": 3.546875, "learning_rate": 9.538305428716389e-06, "loss": 1.0062604, "memory(GiB)": 737.53, "step": 3760, "train_speed(iter/s)": 0.127814 }, { "acc": 0.74352846, "epoch": 0.09551004488528171, "grad_norm": 4.5625, "learning_rate": 9.550989345509895e-06, "loss": 1.07082024, "memory(GiB)": 737.53, "step": 3765, "train_speed(iter/s)": 0.127708 }, { "acc": 0.73775387, "epoch": 0.09563688425431928, "grad_norm": 3.984375, "learning_rate": 9.5636732623034e-06, "loss": 1.04446716, "memory(GiB)": 737.53, "step": 3770, "train_speed(iter/s)": 0.127584 }, { "acc": 0.74858227, "epoch": 0.09576372362335683, "grad_norm": 3.578125, "learning_rate": 9.576357179096905e-06, "loss": 0.99251537, "memory(GiB)": 737.53, "step": 3775, "train_speed(iter/s)": 0.127431 }, { "acc": 0.73514485, "epoch": 0.09589056299239439, "grad_norm": 4.03125, "learning_rate": 9.589041095890411e-06, "loss": 1.05437489, "memory(GiB)": 737.53, "step": 3780, "train_speed(iter/s)": 0.127282 }, { "acc": 0.73243518, "epoch": 0.09601740236143196, "grad_norm": 4.3125, "learning_rate": 9.601725012683919e-06, "loss": 1.05542841, "memory(GiB)": 737.53, "step": 3785, "train_speed(iter/s)": 0.127172 }, { "acc": 0.73991098, "epoch": 0.09614424173046951, "grad_norm": 3.671875, "learning_rate": 9.614408929477423e-06, "loss": 1.05121651, "memory(GiB)": 737.53, "step": 3790, "train_speed(iter/s)": 0.127067 }, { "acc": 0.73265309, "epoch": 0.09627108109950706, "grad_norm": 3.03125, "learning_rate": 9.627092846270929e-06, "loss": 1.0782445, "memory(GiB)": 737.53, "step": 3795, "train_speed(iter/s)": 0.126943 }, { "acc": 0.73672719, "epoch": 0.09639792046854463, "grad_norm": 3.921875, "learning_rate": 9.639776763064435e-06, "loss": 1.02827711, "memory(GiB)": 737.53, "step": 3800, "train_speed(iter/s)": 0.12685 }, { "acc": 0.73653293, "epoch": 0.09652475983758219, "grad_norm": 3.375, "learning_rate": 9.65246067985794e-06, "loss": 1.03888741, "memory(GiB)": 737.53, "step": 3805, "train_speed(iter/s)": 0.126716 }, { "acc": 0.74384699, "epoch": 0.09665159920661974, "grad_norm": 3.53125, "learning_rate": 9.665144596651447e-06, "loss": 1.01756573, "memory(GiB)": 737.53, "step": 3810, "train_speed(iter/s)": 0.126596 }, { "acc": 0.71868787, "epoch": 0.09677843857565731, "grad_norm": 5.0625, "learning_rate": 9.677828513444953e-06, "loss": 1.12379274, "memory(GiB)": 737.53, "step": 3815, "train_speed(iter/s)": 0.126519 }, { "acc": 0.7255054, "epoch": 0.09690527794469486, "grad_norm": 4.1875, "learning_rate": 9.690512430238459e-06, "loss": 1.10287561, "memory(GiB)": 737.53, "step": 3820, "train_speed(iter/s)": 0.126395 }, { "acc": 0.76430387, "epoch": 0.09703211731373242, "grad_norm": 3.484375, "learning_rate": 9.703196347031965e-06, "loss": 0.92424393, "memory(GiB)": 737.53, "step": 3825, "train_speed(iter/s)": 0.126237 }, { "acc": 0.7523262, "epoch": 0.09715895668276998, "grad_norm": 3.6875, "learning_rate": 9.71588026382547e-06, "loss": 0.97552509, "memory(GiB)": 737.53, "step": 3830, "train_speed(iter/s)": 0.126101 }, { "acc": 0.72883034, "epoch": 0.09728579605180754, "grad_norm": 4.4375, "learning_rate": 9.728564180618977e-06, "loss": 1.06751661, "memory(GiB)": 737.53, "step": 3835, "train_speed(iter/s)": 0.126005 }, { "acc": 0.73827252, "epoch": 0.09741263542084509, "grad_norm": 3.75, "learning_rate": 9.74124809741248e-06, "loss": 1.06239748, "memory(GiB)": 737.53, "step": 3840, "train_speed(iter/s)": 0.125893 }, { "acc": 0.74080424, "epoch": 0.09753947478988266, "grad_norm": 3.578125, "learning_rate": 9.753932014205988e-06, "loss": 1.03517647, "memory(GiB)": 737.53, "step": 3845, "train_speed(iter/s)": 0.125797 }, { "acc": 0.75311518, "epoch": 0.09766631415892021, "grad_norm": 3.453125, "learning_rate": 9.766615930999493e-06, "loss": 0.98373175, "memory(GiB)": 737.53, "step": 3850, "train_speed(iter/s)": 0.125687 }, { "acc": 0.74976845, "epoch": 0.09779315352795777, "grad_norm": 3.78125, "learning_rate": 9.779299847792999e-06, "loss": 1.00783777, "memory(GiB)": 737.53, "step": 3855, "train_speed(iter/s)": 0.125536 }, { "acc": 0.74650168, "epoch": 0.09791999289699534, "grad_norm": 4.75, "learning_rate": 9.791983764586505e-06, "loss": 0.95917559, "memory(GiB)": 737.53, "step": 3860, "train_speed(iter/s)": 0.125418 }, { "acc": 0.75365, "epoch": 0.09804683226603289, "grad_norm": 14.3125, "learning_rate": 9.80466768138001e-06, "loss": 1.01990499, "memory(GiB)": 737.53, "step": 3865, "train_speed(iter/s)": 0.12532 }, { "acc": 0.73771019, "epoch": 0.09817367163507044, "grad_norm": 3.8125, "learning_rate": 9.817351598173517e-06, "loss": 1.05276308, "memory(GiB)": 737.53, "step": 3870, "train_speed(iter/s)": 0.125233 }, { "acc": 0.73925796, "epoch": 0.09830051100410801, "grad_norm": 3.171875, "learning_rate": 9.830035514967023e-06, "loss": 1.0564785, "memory(GiB)": 737.53, "step": 3875, "train_speed(iter/s)": 0.125108 }, { "acc": 0.73643689, "epoch": 0.09842735037314557, "grad_norm": 3.6875, "learning_rate": 9.842719431760529e-06, "loss": 1.09437885, "memory(GiB)": 737.53, "step": 3880, "train_speed(iter/s)": 0.12501 }, { "acc": 0.74427118, "epoch": 0.09855418974218312, "grad_norm": 4.15625, "learning_rate": 9.855403348554034e-06, "loss": 1.00005941, "memory(GiB)": 737.53, "step": 3885, "train_speed(iter/s)": 0.124905 }, { "acc": 0.72647791, "epoch": 0.09868102911122069, "grad_norm": 4.03125, "learning_rate": 9.86808726534754e-06, "loss": 1.04868336, "memory(GiB)": 737.53, "step": 3890, "train_speed(iter/s)": 0.124822 }, { "acc": 0.73806906, "epoch": 0.09880786848025824, "grad_norm": 3.75, "learning_rate": 9.880771182141046e-06, "loss": 1.09265776, "memory(GiB)": 737.53, "step": 3895, "train_speed(iter/s)": 0.12473 }, { "acc": 0.7514565, "epoch": 0.0989347078492958, "grad_norm": 3.765625, "learning_rate": 9.89345509893455e-06, "loss": 1.00374784, "memory(GiB)": 737.53, "step": 3900, "train_speed(iter/s)": 0.124626 }, { "acc": 0.74350233, "epoch": 0.09906154721833337, "grad_norm": 3.84375, "learning_rate": 9.906139015728058e-06, "loss": 1.05138121, "memory(GiB)": 737.53, "step": 3905, "train_speed(iter/s)": 0.124517 }, { "acc": 0.73173933, "epoch": 0.09918838658737092, "grad_norm": 3.375, "learning_rate": 9.918822932521563e-06, "loss": 1.06723261, "memory(GiB)": 737.53, "step": 3910, "train_speed(iter/s)": 0.124375 }, { "acc": 0.74421134, "epoch": 0.09931522595640847, "grad_norm": 3.65625, "learning_rate": 9.931506849315069e-06, "loss": 1.01362972, "memory(GiB)": 737.53, "step": 3915, "train_speed(iter/s)": 0.124285 }, { "acc": 0.73994265, "epoch": 0.09944206532544604, "grad_norm": 3.390625, "learning_rate": 9.944190766108575e-06, "loss": 1.04115791, "memory(GiB)": 737.53, "step": 3920, "train_speed(iter/s)": 0.124192 }, { "acc": 0.74426379, "epoch": 0.0995689046944836, "grad_norm": 4.25, "learning_rate": 9.95687468290208e-06, "loss": 0.95658569, "memory(GiB)": 737.53, "step": 3925, "train_speed(iter/s)": 0.124089 }, { "acc": 0.74618554, "epoch": 0.09969574406352115, "grad_norm": 3.953125, "learning_rate": 9.969558599695586e-06, "loss": 1.0586195, "memory(GiB)": 737.53, "step": 3930, "train_speed(iter/s)": 0.123959 }, { "acc": 0.73725777, "epoch": 0.09982258343255872, "grad_norm": 3.890625, "learning_rate": 9.982242516489092e-06, "loss": 1.04030504, "memory(GiB)": 737.53, "step": 3935, "train_speed(iter/s)": 0.123853 }, { "acc": 0.74806242, "epoch": 0.09994942280159627, "grad_norm": 4.90625, "learning_rate": 9.994926433282598e-06, "loss": 0.98998432, "memory(GiB)": 737.53, "step": 3940, "train_speed(iter/s)": 0.123725 }, { "acc": 0.76303473, "epoch": 0.10007626217063383, "grad_norm": 3.546875, "learning_rate": 9.999999960411868e-06, "loss": 0.97619619, "memory(GiB)": 737.53, "step": 3945, "train_speed(iter/s)": 0.123623 }, { "acc": 0.74294982, "epoch": 0.1002031015396714, "grad_norm": 4.5625, "learning_rate": 9.999999718484394e-06, "loss": 1.05751095, "memory(GiB)": 737.53, "step": 3950, "train_speed(iter/s)": 0.123518 }, { "acc": 0.74796538, "epoch": 0.10032994090870895, "grad_norm": 3.8125, "learning_rate": 9.999999256622863e-06, "loss": 1.04737921, "memory(GiB)": 737.53, "step": 3955, "train_speed(iter/s)": 0.123433 }, { "acc": 0.76085348, "epoch": 0.1004567802777465, "grad_norm": 4.03125, "learning_rate": 9.999998574827297e-06, "loss": 0.97217741, "memory(GiB)": 737.53, "step": 3960, "train_speed(iter/s)": 0.12334 }, { "acc": 0.74643993, "epoch": 0.10058361964678407, "grad_norm": 4.375, "learning_rate": 9.999997673097723e-06, "loss": 1.04031982, "memory(GiB)": 737.53, "step": 3965, "train_speed(iter/s)": 0.123227 }, { "acc": 0.74225101, "epoch": 0.10071045901582162, "grad_norm": 3.75, "learning_rate": 9.999996551434183e-06, "loss": 0.99924231, "memory(GiB)": 737.53, "step": 3970, "train_speed(iter/s)": 0.123135 }, { "acc": 0.74231324, "epoch": 0.10083729838485918, "grad_norm": 4.09375, "learning_rate": 9.999995209836724e-06, "loss": 1.03034077, "memory(GiB)": 737.53, "step": 3975, "train_speed(iter/s)": 0.123052 }, { "acc": 0.737889, "epoch": 0.10096413775389675, "grad_norm": 4.78125, "learning_rate": 9.999993648305408e-06, "loss": 1.00398788, "memory(GiB)": 737.54, "step": 3980, "train_speed(iter/s)": 0.12297 }, { "acc": 0.74601383, "epoch": 0.1010909771229343, "grad_norm": 4.40625, "learning_rate": 9.999991866840304e-06, "loss": 1.02922716, "memory(GiB)": 737.54, "step": 3985, "train_speed(iter/s)": 0.122859 }, { "acc": 0.74904871, "epoch": 0.10121781649197185, "grad_norm": 4.125, "learning_rate": 9.999989865441486e-06, "loss": 0.95392981, "memory(GiB)": 737.54, "step": 3990, "train_speed(iter/s)": 0.122766 }, { "acc": 0.72981577, "epoch": 0.10134465586100942, "grad_norm": 4.875, "learning_rate": 9.999987644109046e-06, "loss": 1.00337715, "memory(GiB)": 737.54, "step": 3995, "train_speed(iter/s)": 0.122664 }, { "acc": 0.74311113, "epoch": 0.10147149523004698, "grad_norm": 3.9375, "learning_rate": 9.999985202843081e-06, "loss": 1.02529831, "memory(GiB)": 737.54, "step": 4000, "train_speed(iter/s)": 0.122569 }, { "epoch": 0.10147149523004698, "eval_acc": 0.734929566520462, "eval_loss": 0.9776066541671753, "eval_runtime": 1151.1455, "eval_samples_per_second": 5.534, "eval_steps_per_second": 5.534, "step": 4000 }, { "acc": 0.74224234, "epoch": 0.10159833459908453, "grad_norm": 4.6875, "learning_rate": 9.999982541643699e-06, "loss": 0.98954687, "memory(GiB)": 737.54, "step": 4005, "train_speed(iter/s)": 0.115847 }, { "acc": 0.74518986, "epoch": 0.1017251739681221, "grad_norm": 4.9375, "learning_rate": 9.999979660511012e-06, "loss": 1.01344576, "memory(GiB)": 737.54, "step": 4010, "train_speed(iter/s)": 0.115762 }, { "acc": 0.74665093, "epoch": 0.10185201333715965, "grad_norm": 3.6875, "learning_rate": 9.999976559445153e-06, "loss": 0.9854188, "memory(GiB)": 737.54, "step": 4015, "train_speed(iter/s)": 0.115669 }, { "acc": 0.75183759, "epoch": 0.1019788527061972, "grad_norm": 3.875, "learning_rate": 9.999973238446256e-06, "loss": 1.03382301, "memory(GiB)": 737.54, "step": 4020, "train_speed(iter/s)": 0.115583 }, { "acc": 0.75165744, "epoch": 0.10210569207523477, "grad_norm": 4.15625, "learning_rate": 9.999969697514467e-06, "loss": 0.98098173, "memory(GiB)": 737.54, "step": 4025, "train_speed(iter/s)": 0.115514 }, { "acc": 0.73423505, "epoch": 0.10223253144427233, "grad_norm": 3.125, "learning_rate": 9.999965936649943e-06, "loss": 1.03824015, "memory(GiB)": 737.54, "step": 4030, "train_speed(iter/s)": 0.115434 }, { "acc": 0.74402456, "epoch": 0.10235937081330988, "grad_norm": 4.21875, "learning_rate": 9.999961955852846e-06, "loss": 1.0250226, "memory(GiB)": 737.54, "step": 4035, "train_speed(iter/s)": 0.115365 }, { "acc": 0.75260425, "epoch": 0.10248621018234745, "grad_norm": 4.59375, "learning_rate": 9.999957755123355e-06, "loss": 1.02280025, "memory(GiB)": 737.54, "step": 4040, "train_speed(iter/s)": 0.115281 }, { "acc": 0.75083389, "epoch": 0.102613049551385, "grad_norm": 4.75, "learning_rate": 9.999953334461653e-06, "loss": 0.9787159, "memory(GiB)": 752.16, "step": 4045, "train_speed(iter/s)": 0.115179 }, { "acc": 0.74181952, "epoch": 0.10273988892042256, "grad_norm": 3.75, "learning_rate": 9.999948693867934e-06, "loss": 1.03843117, "memory(GiB)": 752.16, "step": 4050, "train_speed(iter/s)": 0.11512 }, { "acc": 0.73973885, "epoch": 0.10286672828946013, "grad_norm": 3.5625, "learning_rate": 9.999943833342403e-06, "loss": 1.01250944, "memory(GiB)": 752.16, "step": 4055, "train_speed(iter/s)": 0.115051 }, { "acc": 0.76136117, "epoch": 0.10299356765849768, "grad_norm": 3.84375, "learning_rate": 9.999938752885274e-06, "loss": 0.94831896, "memory(GiB)": 752.16, "step": 4060, "train_speed(iter/s)": 0.114954 }, { "acc": 0.74817305, "epoch": 0.10312040702753524, "grad_norm": 4.625, "learning_rate": 9.99993345249677e-06, "loss": 1.03629456, "memory(GiB)": 752.16, "step": 4065, "train_speed(iter/s)": 0.114859 }, { "acc": 0.73544407, "epoch": 0.1032472463965728, "grad_norm": 3.546875, "learning_rate": 9.999927932177124e-06, "loss": 1.05702333, "memory(GiB)": 752.16, "step": 4070, "train_speed(iter/s)": 0.114788 }, { "acc": 0.75057802, "epoch": 0.10337408576561036, "grad_norm": 3.859375, "learning_rate": 9.999922191926579e-06, "loss": 0.97013502, "memory(GiB)": 752.16, "step": 4075, "train_speed(iter/s)": 0.114724 }, { "acc": 0.75275688, "epoch": 0.10350092513464791, "grad_norm": 3.46875, "learning_rate": 9.999916231745388e-06, "loss": 0.96876688, "memory(GiB)": 752.16, "step": 4080, "train_speed(iter/s)": 0.11464 }, { "acc": 0.74430819, "epoch": 0.10362776450368548, "grad_norm": 3.9375, "learning_rate": 9.999910051633812e-06, "loss": 1.01441736, "memory(GiB)": 752.16, "step": 4085, "train_speed(iter/s)": 0.114596 }, { "acc": 0.74345551, "epoch": 0.10375460387272303, "grad_norm": 3.078125, "learning_rate": 9.999903651592123e-06, "loss": 1.01581144, "memory(GiB)": 752.16, "step": 4090, "train_speed(iter/s)": 0.114526 }, { "acc": 0.74728513, "epoch": 0.10388144324176059, "grad_norm": 3.921875, "learning_rate": 9.999897031620605e-06, "loss": 0.96141424, "memory(GiB)": 752.16, "step": 4095, "train_speed(iter/s)": 0.114451 }, { "acc": 0.74901695, "epoch": 0.10400828261079816, "grad_norm": 3.625, "learning_rate": 9.999890191719546e-06, "loss": 1.05359631, "memory(GiB)": 752.16, "step": 4100, "train_speed(iter/s)": 0.114381 }, { "acc": 0.73080978, "epoch": 0.10413512197983571, "grad_norm": 5.21875, "learning_rate": 9.999883131889248e-06, "loss": 1.03021717, "memory(GiB)": 752.16, "step": 4105, "train_speed(iter/s)": 0.114321 }, { "acc": 0.75052996, "epoch": 0.10426196134887326, "grad_norm": 4.03125, "learning_rate": 9.999875852130021e-06, "loss": 0.99726877, "memory(GiB)": 752.16, "step": 4110, "train_speed(iter/s)": 0.114252 }, { "acc": 0.74044747, "epoch": 0.10438880071791083, "grad_norm": 3.890625, "learning_rate": 9.999868352442186e-06, "loss": 1.07566948, "memory(GiB)": 752.16, "step": 4115, "train_speed(iter/s)": 0.114174 }, { "acc": 0.74456038, "epoch": 0.10451564008694839, "grad_norm": 3.625, "learning_rate": 9.999860632826073e-06, "loss": 1.02970791, "memory(GiB)": 752.16, "step": 4120, "train_speed(iter/s)": 0.114105 }, { "acc": 0.75181894, "epoch": 0.10464247945598594, "grad_norm": 3.515625, "learning_rate": 9.999852693282021e-06, "loss": 1.00275707, "memory(GiB)": 752.16, "step": 4125, "train_speed(iter/s)": 0.114037 }, { "acc": 0.7526216, "epoch": 0.10476931882502351, "grad_norm": 4.90625, "learning_rate": 9.999844533810381e-06, "loss": 1.01284971, "memory(GiB)": 752.16, "step": 4130, "train_speed(iter/s)": 0.113959 }, { "acc": 0.73741188, "epoch": 0.10489615819406106, "grad_norm": 3.203125, "learning_rate": 9.999836154411508e-06, "loss": 1.03886728, "memory(GiB)": 752.16, "step": 4135, "train_speed(iter/s)": 0.113886 }, { "acc": 0.76328688, "epoch": 0.10502299756309862, "grad_norm": 4.8125, "learning_rate": 9.999827555085775e-06, "loss": 0.92552795, "memory(GiB)": 752.16, "step": 4140, "train_speed(iter/s)": 0.113811 }, { "acc": 0.75002608, "epoch": 0.10514983693213618, "grad_norm": 3.875, "learning_rate": 9.999818735833559e-06, "loss": 0.97192049, "memory(GiB)": 752.16, "step": 4145, "train_speed(iter/s)": 0.113734 }, { "acc": 0.74204612, "epoch": 0.10527667630117374, "grad_norm": 3.78125, "learning_rate": 9.999809696655244e-06, "loss": 1.03400059, "memory(GiB)": 752.16, "step": 4150, "train_speed(iter/s)": 0.113653 }, { "acc": 0.76085434, "epoch": 0.10540351567021129, "grad_norm": 4.1875, "learning_rate": 9.999800437551234e-06, "loss": 0.96207151, "memory(GiB)": 752.16, "step": 4155, "train_speed(iter/s)": 0.113602 }, { "acc": 0.74788475, "epoch": 0.10553035503924886, "grad_norm": 3.78125, "learning_rate": 9.999790958521932e-06, "loss": 0.97378445, "memory(GiB)": 752.16, "step": 4160, "train_speed(iter/s)": 0.113525 }, { "acc": 0.74507089, "epoch": 0.10565719440828641, "grad_norm": 4.1875, "learning_rate": 9.999781259567756e-06, "loss": 1.02241802, "memory(GiB)": 752.16, "step": 4165, "train_speed(iter/s)": 0.113463 }, { "acc": 0.75409489, "epoch": 0.10578403377732397, "grad_norm": 3.421875, "learning_rate": 9.999771340689133e-06, "loss": 0.97503538, "memory(GiB)": 752.16, "step": 4170, "train_speed(iter/s)": 0.113411 }, { "acc": 0.75267701, "epoch": 0.10591087314636154, "grad_norm": 3.421875, "learning_rate": 9.999761201886497e-06, "loss": 0.93355923, "memory(GiB)": 752.16, "step": 4175, "train_speed(iter/s)": 0.113331 }, { "acc": 0.75358357, "epoch": 0.10603771251539909, "grad_norm": 3.40625, "learning_rate": 9.9997508431603e-06, "loss": 0.95598936, "memory(GiB)": 752.16, "step": 4180, "train_speed(iter/s)": 0.113255 }, { "acc": 0.75756726, "epoch": 0.10616455188443664, "grad_norm": 3.578125, "learning_rate": 9.99974026451099e-06, "loss": 0.94137573, "memory(GiB)": 752.16, "step": 4185, "train_speed(iter/s)": 0.113204 }, { "acc": 0.74566541, "epoch": 0.10629139125347421, "grad_norm": 2.84375, "learning_rate": 9.999729465939036e-06, "loss": 0.99793549, "memory(GiB)": 752.16, "step": 4190, "train_speed(iter/s)": 0.113133 }, { "acc": 0.75591035, "epoch": 0.10641823062251177, "grad_norm": 3.828125, "learning_rate": 9.999718447444915e-06, "loss": 1.00127048, "memory(GiB)": 752.16, "step": 4195, "train_speed(iter/s)": 0.113064 }, { "acc": 0.76030197, "epoch": 0.10654506999154932, "grad_norm": 10.5, "learning_rate": 9.999707209029108e-06, "loss": 0.96431332, "memory(GiB)": 752.16, "step": 4200, "train_speed(iter/s)": 0.113001 }, { "acc": 0.75733862, "epoch": 0.10667190936058689, "grad_norm": 4.15625, "learning_rate": 9.99969575069211e-06, "loss": 0.97548399, "memory(GiB)": 752.16, "step": 4205, "train_speed(iter/s)": 0.112947 }, { "acc": 0.74261155, "epoch": 0.10679874872962444, "grad_norm": 3.109375, "learning_rate": 9.999684072434429e-06, "loss": 1.00569582, "memory(GiB)": 752.16, "step": 4210, "train_speed(iter/s)": 0.112893 }, { "acc": 0.74866962, "epoch": 0.106925588098662, "grad_norm": 3.4375, "learning_rate": 9.999672174256573e-06, "loss": 1.00119839, "memory(GiB)": 752.16, "step": 4215, "train_speed(iter/s)": 0.112818 }, { "acc": 0.74894967, "epoch": 0.10705242746769957, "grad_norm": 3.640625, "learning_rate": 9.99966005615907e-06, "loss": 0.94909086, "memory(GiB)": 752.16, "step": 4220, "train_speed(iter/s)": 0.112724 }, { "acc": 0.74780436, "epoch": 0.10717926683673712, "grad_norm": 3.03125, "learning_rate": 9.99964771814245e-06, "loss": 1.03638916, "memory(GiB)": 752.16, "step": 4225, "train_speed(iter/s)": 0.112643 }, { "acc": 0.75929885, "epoch": 0.10730610620577467, "grad_norm": 3.703125, "learning_rate": 9.999635160207257e-06, "loss": 0.98652191, "memory(GiB)": 752.16, "step": 4230, "train_speed(iter/s)": 0.112581 }, { "acc": 0.74700942, "epoch": 0.10743294557481224, "grad_norm": 3.421875, "learning_rate": 9.999622382354042e-06, "loss": 1.03639078, "memory(GiB)": 752.16, "step": 4235, "train_speed(iter/s)": 0.112525 }, { "acc": 0.7442492, "epoch": 0.1075597849438498, "grad_norm": 4.3125, "learning_rate": 9.999609384583368e-06, "loss": 1.01263981, "memory(GiB)": 752.16, "step": 4240, "train_speed(iter/s)": 0.112465 }, { "acc": 0.75172, "epoch": 0.10768662431288735, "grad_norm": 3.78125, "learning_rate": 9.999596166895808e-06, "loss": 0.94851637, "memory(GiB)": 752.16, "step": 4245, "train_speed(iter/s)": 0.112399 }, { "acc": 0.75359545, "epoch": 0.10781346368192492, "grad_norm": 3.0625, "learning_rate": 9.999582729291943e-06, "loss": 0.97741489, "memory(GiB)": 752.16, "step": 4250, "train_speed(iter/s)": 0.112314 }, { "acc": 0.74646292, "epoch": 0.10794030305096247, "grad_norm": 3.9375, "learning_rate": 9.999569071772361e-06, "loss": 1.06008091, "memory(GiB)": 752.16, "step": 4255, "train_speed(iter/s)": 0.11226 }, { "acc": 0.75752134, "epoch": 0.10806714242000003, "grad_norm": 3.265625, "learning_rate": 9.999555194337668e-06, "loss": 0.97554636, "memory(GiB)": 752.16, "step": 4260, "train_speed(iter/s)": 0.112197 }, { "acc": 0.74950771, "epoch": 0.1081939817890376, "grad_norm": 3.84375, "learning_rate": 9.99954109698847e-06, "loss": 0.99295578, "memory(GiB)": 752.16, "step": 4265, "train_speed(iter/s)": 0.112143 }, { "acc": 0.74162297, "epoch": 0.10832082115807515, "grad_norm": 4.875, "learning_rate": 9.999526779725388e-06, "loss": 1.02023163, "memory(GiB)": 752.16, "step": 4270, "train_speed(iter/s)": 0.112078 }, { "acc": 0.73535676, "epoch": 0.1084476605271127, "grad_norm": 3.84375, "learning_rate": 9.999512242549054e-06, "loss": 1.05113916, "memory(GiB)": 752.16, "step": 4275, "train_speed(iter/s)": 0.112002 }, { "acc": 0.75518146, "epoch": 0.10857449989615027, "grad_norm": 3.515625, "learning_rate": 9.999497485460106e-06, "loss": 1.00629511, "memory(GiB)": 752.16, "step": 4280, "train_speed(iter/s)": 0.11194 }, { "acc": 0.76201296, "epoch": 0.10870133926518782, "grad_norm": 4.90625, "learning_rate": 9.999482508459191e-06, "loss": 0.98338518, "memory(GiB)": 752.16, "step": 4285, "train_speed(iter/s)": 0.11189 }, { "acc": 0.75419092, "epoch": 0.10882817863422538, "grad_norm": 3.703125, "learning_rate": 9.999467311546971e-06, "loss": 0.96835251, "memory(GiB)": 752.16, "step": 4290, "train_speed(iter/s)": 0.111822 }, { "acc": 0.75740447, "epoch": 0.10895501800326295, "grad_norm": 3.890625, "learning_rate": 9.999451894724113e-06, "loss": 0.97330647, "memory(GiB)": 752.16, "step": 4295, "train_speed(iter/s)": 0.11176 }, { "acc": 0.74405251, "epoch": 0.1090818573723005, "grad_norm": 3.703125, "learning_rate": 9.999436257991295e-06, "loss": 0.97453966, "memory(GiB)": 752.16, "step": 4300, "train_speed(iter/s)": 0.111674 }, { "acc": 0.74079027, "epoch": 0.10920869674133805, "grad_norm": 4.5, "learning_rate": 9.999420401349207e-06, "loss": 1.01113234, "memory(GiB)": 752.16, "step": 4305, "train_speed(iter/s)": 0.111601 }, { "acc": 0.75193615, "epoch": 0.10933553611037562, "grad_norm": 3.765625, "learning_rate": 9.999404324798543e-06, "loss": 0.96562471, "memory(GiB)": 752.16, "step": 4310, "train_speed(iter/s)": 0.111526 }, { "acc": 0.74738445, "epoch": 0.10946237547941318, "grad_norm": 3.96875, "learning_rate": 9.999388028340011e-06, "loss": 1.03437538, "memory(GiB)": 752.16, "step": 4315, "train_speed(iter/s)": 0.111469 }, { "acc": 0.7476191, "epoch": 0.10958921484845073, "grad_norm": 3.796875, "learning_rate": 9.999371511974332e-06, "loss": 1.02468367, "memory(GiB)": 752.16, "step": 4320, "train_speed(iter/s)": 0.111425 }, { "acc": 0.73226857, "epoch": 0.1097160542174883, "grad_norm": 4.28125, "learning_rate": 9.999354775702226e-06, "loss": 1.01945171, "memory(GiB)": 752.16, "step": 4325, "train_speed(iter/s)": 0.111348 }, { "acc": 0.73976922, "epoch": 0.10984289358652585, "grad_norm": 4.03125, "learning_rate": 9.999337819524433e-06, "loss": 1.00922403, "memory(GiB)": 752.16, "step": 4330, "train_speed(iter/s)": 0.111281 }, { "acc": 0.75826507, "epoch": 0.1099697329555634, "grad_norm": 3.421875, "learning_rate": 9.9993206434417e-06, "loss": 0.97276258, "memory(GiB)": 752.16, "step": 4335, "train_speed(iter/s)": 0.111232 }, { "acc": 0.75154748, "epoch": 0.11009657232460097, "grad_norm": 3.5, "learning_rate": 9.999303247454778e-06, "loss": 0.97340717, "memory(GiB)": 752.16, "step": 4340, "train_speed(iter/s)": 0.111176 }, { "acc": 0.76054435, "epoch": 0.11022341169363853, "grad_norm": 3.1875, "learning_rate": 9.999285631564437e-06, "loss": 0.97558393, "memory(GiB)": 752.16, "step": 4345, "train_speed(iter/s)": 0.111114 }, { "acc": 0.76035628, "epoch": 0.11035025106267608, "grad_norm": 3.859375, "learning_rate": 9.999267795771447e-06, "loss": 0.92237177, "memory(GiB)": 752.16, "step": 4350, "train_speed(iter/s)": 0.11106 }, { "acc": 0.75089111, "epoch": 0.11047709043171365, "grad_norm": 4.375, "learning_rate": 9.999249740076598e-06, "loss": 0.96350327, "memory(GiB)": 752.16, "step": 4355, "train_speed(iter/s)": 0.110983 }, { "acc": 0.74806981, "epoch": 0.1106039298007512, "grad_norm": 3.21875, "learning_rate": 9.999231464480682e-06, "loss": 0.98374777, "memory(GiB)": 752.16, "step": 4360, "train_speed(iter/s)": 0.110916 }, { "acc": 0.73510838, "epoch": 0.11073076916978876, "grad_norm": 3.390625, "learning_rate": 9.999212968984499e-06, "loss": 1.03198795, "memory(GiB)": 752.16, "step": 4365, "train_speed(iter/s)": 0.110856 }, { "acc": 0.74243994, "epoch": 0.11085760853882633, "grad_norm": 3.203125, "learning_rate": 9.999194253588868e-06, "loss": 1.00246334, "memory(GiB)": 752.16, "step": 4370, "train_speed(iter/s)": 0.110782 }, { "acc": 0.75513783, "epoch": 0.11098444790786388, "grad_norm": 4.71875, "learning_rate": 9.999175318294611e-06, "loss": 0.97760515, "memory(GiB)": 752.16, "step": 4375, "train_speed(iter/s)": 0.110734 }, { "acc": 0.74865522, "epoch": 0.11111128727690144, "grad_norm": 3.671875, "learning_rate": 9.999156163102559e-06, "loss": 0.99429569, "memory(GiB)": 752.16, "step": 4380, "train_speed(iter/s)": 0.110684 }, { "acc": 0.74945931, "epoch": 0.111238126645939, "grad_norm": 4.125, "learning_rate": 9.999136788013557e-06, "loss": 0.98678761, "memory(GiB)": 752.16, "step": 4385, "train_speed(iter/s)": 0.110623 }, { "acc": 0.7395905, "epoch": 0.11136496601497656, "grad_norm": 3.109375, "learning_rate": 9.999117193028455e-06, "loss": 1.03640137, "memory(GiB)": 752.16, "step": 4390, "train_speed(iter/s)": 0.110565 }, { "acc": 0.73678107, "epoch": 0.11149180538401411, "grad_norm": 5.28125, "learning_rate": 9.999097378148116e-06, "loss": 1.09470806, "memory(GiB)": 752.16, "step": 4395, "train_speed(iter/s)": 0.110529 }, { "acc": 0.74549088, "epoch": 0.11161864475305168, "grad_norm": 3.9375, "learning_rate": 9.99907734337341e-06, "loss": 1.00027628, "memory(GiB)": 752.16, "step": 4400, "train_speed(iter/s)": 0.110468 }, { "acc": 0.75642076, "epoch": 0.11174548412208923, "grad_norm": 7.1875, "learning_rate": 9.999057088705222e-06, "loss": 1.00004358, "memory(GiB)": 752.16, "step": 4405, "train_speed(iter/s)": 0.110405 }, { "acc": 0.75230813, "epoch": 0.11187232349112679, "grad_norm": 3.734375, "learning_rate": 9.999036614144439e-06, "loss": 0.97716055, "memory(GiB)": 752.16, "step": 4410, "train_speed(iter/s)": 0.110352 }, { "acc": 0.75491962, "epoch": 0.11199916286016436, "grad_norm": 3.609375, "learning_rate": 9.999015919691963e-06, "loss": 0.98622313, "memory(GiB)": 752.16, "step": 4415, "train_speed(iter/s)": 0.110277 }, { "acc": 0.75359869, "epoch": 0.11212600222920191, "grad_norm": 6.25, "learning_rate": 9.998995005348706e-06, "loss": 1.03141556, "memory(GiB)": 752.16, "step": 4420, "train_speed(iter/s)": 0.110202 }, { "acc": 0.74053874, "epoch": 0.11225284159823946, "grad_norm": 4.09375, "learning_rate": 9.998973871115586e-06, "loss": 1.00296354, "memory(GiB)": 752.16, "step": 4425, "train_speed(iter/s)": 0.110127 }, { "acc": 0.74584208, "epoch": 0.11237968096727703, "grad_norm": 3.578125, "learning_rate": 9.998952516993533e-06, "loss": 0.99996538, "memory(GiB)": 752.16, "step": 4430, "train_speed(iter/s)": 0.110065 }, { "acc": 0.72905068, "epoch": 0.11250652033631459, "grad_norm": 3.84375, "learning_rate": 9.998930942983486e-06, "loss": 1.07936735, "memory(GiB)": 752.16, "step": 4435, "train_speed(iter/s)": 0.110028 }, { "acc": 0.74475665, "epoch": 0.11263335970535214, "grad_norm": 3.71875, "learning_rate": 9.998909149086396e-06, "loss": 0.9908514, "memory(GiB)": 752.16, "step": 4440, "train_speed(iter/s)": 0.109984 }, { "acc": 0.73757906, "epoch": 0.11276019907438971, "grad_norm": 3.765625, "learning_rate": 9.998887135303216e-06, "loss": 1.07004633, "memory(GiB)": 752.16, "step": 4445, "train_speed(iter/s)": 0.109932 }, { "acc": 0.7531682, "epoch": 0.11288703844342726, "grad_norm": 3.6875, "learning_rate": 9.998864901634922e-06, "loss": 0.97439899, "memory(GiB)": 752.16, "step": 4450, "train_speed(iter/s)": 0.109851 }, { "acc": 0.7412569, "epoch": 0.11301387781246482, "grad_norm": 3.6875, "learning_rate": 9.998842448082489e-06, "loss": 1.04324284, "memory(GiB)": 752.16, "step": 4455, "train_speed(iter/s)": 0.109796 }, { "acc": 0.74663997, "epoch": 0.11314071718150238, "grad_norm": 4.28125, "learning_rate": 9.998819774646903e-06, "loss": 1.02018948, "memory(GiB)": 752.16, "step": 4460, "train_speed(iter/s)": 0.10974 }, { "acc": 0.75740047, "epoch": 0.11326755655053994, "grad_norm": 5.0625, "learning_rate": 9.99879688132916e-06, "loss": 0.99686823, "memory(GiB)": 752.16, "step": 4465, "train_speed(iter/s)": 0.109679 }, { "acc": 0.75029693, "epoch": 0.11339439591957749, "grad_norm": 3.640625, "learning_rate": 9.99877376813027e-06, "loss": 0.96644659, "memory(GiB)": 752.16, "step": 4470, "train_speed(iter/s)": 0.109629 }, { "acc": 0.7580296, "epoch": 0.11352123528861506, "grad_norm": 3.6875, "learning_rate": 9.998750435051251e-06, "loss": 0.93504868, "memory(GiB)": 752.16, "step": 4475, "train_speed(iter/s)": 0.109571 }, { "acc": 0.7502986, "epoch": 0.11364807465765261, "grad_norm": 3.671875, "learning_rate": 9.998726882093126e-06, "loss": 1.01808805, "memory(GiB)": 752.16, "step": 4480, "train_speed(iter/s)": 0.109513 }, { "acc": 0.75042324, "epoch": 0.11377491402669017, "grad_norm": 4.15625, "learning_rate": 9.998703109256933e-06, "loss": 1.04668398, "memory(GiB)": 752.16, "step": 4485, "train_speed(iter/s)": 0.109465 }, { "acc": 0.74276819, "epoch": 0.11390175339572774, "grad_norm": 4.15625, "learning_rate": 9.998679116543716e-06, "loss": 0.98354435, "memory(GiB)": 752.16, "step": 4490, "train_speed(iter/s)": 0.109404 }, { "acc": 0.74212914, "epoch": 0.11402859276476529, "grad_norm": 4.09375, "learning_rate": 9.998654903954533e-06, "loss": 0.97885075, "memory(GiB)": 752.16, "step": 4495, "train_speed(iter/s)": 0.109356 }, { "acc": 0.74902201, "epoch": 0.11415543213380284, "grad_norm": 3.484375, "learning_rate": 9.998630471490448e-06, "loss": 0.97831688, "memory(GiB)": 752.16, "step": 4500, "train_speed(iter/s)": 0.109311 }, { "epoch": 0.11415543213380284, "eval_acc": 0.7364824076097151, "eval_loss": 0.9674122333526611, "eval_runtime": 1152.4171, "eval_samples_per_second": 5.528, "eval_steps_per_second": 5.528, "step": 4500 }, { "acc": 0.75655599, "epoch": 0.11428227150284041, "grad_norm": 3.109375, "learning_rate": 9.998605819152534e-06, "loss": 0.96861229, "memory(GiB)": 752.16, "step": 4505, "train_speed(iter/s)": 0.104504 }, { "acc": 0.74822845, "epoch": 0.11440911087187797, "grad_norm": 3.640625, "learning_rate": 9.998580946941876e-06, "loss": 1.01846972, "memory(GiB)": 752.16, "step": 4510, "train_speed(iter/s)": 0.104456 }, { "acc": 0.7424737, "epoch": 0.11453595024091552, "grad_norm": 3.71875, "learning_rate": 9.998555854859569e-06, "loss": 1.05881052, "memory(GiB)": 752.16, "step": 4515, "train_speed(iter/s)": 0.104382 }, { "acc": 0.74131165, "epoch": 0.11466278960995309, "grad_norm": 3.359375, "learning_rate": 9.998530542906716e-06, "loss": 0.98642349, "memory(GiB)": 752.16, "step": 4520, "train_speed(iter/s)": 0.10432 }, { "acc": 0.75210967, "epoch": 0.11478962897899064, "grad_norm": 3.1875, "learning_rate": 9.998505011084432e-06, "loss": 1.00484571, "memory(GiB)": 752.16, "step": 4525, "train_speed(iter/s)": 0.104278 }, { "acc": 0.760008, "epoch": 0.1149164683480282, "grad_norm": 3.625, "learning_rate": 9.998479259393837e-06, "loss": 0.97556133, "memory(GiB)": 752.16, "step": 4530, "train_speed(iter/s)": 0.104243 }, { "acc": 0.75428686, "epoch": 0.11504330771706577, "grad_norm": 3.171875, "learning_rate": 9.998453287836067e-06, "loss": 0.9485363, "memory(GiB)": 752.16, "step": 4535, "train_speed(iter/s)": 0.104203 }, { "acc": 0.73581429, "epoch": 0.11517014708610332, "grad_norm": 4.25, "learning_rate": 9.998427096412263e-06, "loss": 1.06327467, "memory(GiB)": 752.16, "step": 4540, "train_speed(iter/s)": 0.104137 }, { "acc": 0.74285254, "epoch": 0.11529698645514087, "grad_norm": 3.734375, "learning_rate": 9.998400685123574e-06, "loss": 1.03099499, "memory(GiB)": 752.16, "step": 4545, "train_speed(iter/s)": 0.104094 }, { "acc": 0.75701032, "epoch": 0.11542382582417844, "grad_norm": 3.546875, "learning_rate": 9.998374053971167e-06, "loss": 0.98667955, "memory(GiB)": 752.16, "step": 4550, "train_speed(iter/s)": 0.104058 }, { "acc": 0.73546305, "epoch": 0.115550665193216, "grad_norm": 3.484375, "learning_rate": 9.998347202956212e-06, "loss": 1.01833258, "memory(GiB)": 752.16, "step": 4555, "train_speed(iter/s)": 0.104015 }, { "acc": 0.75785637, "epoch": 0.11567750456225355, "grad_norm": 3.515625, "learning_rate": 9.998320132079889e-06, "loss": 0.9983696, "memory(GiB)": 752.16, "step": 4560, "train_speed(iter/s)": 0.103976 }, { "acc": 0.72295065, "epoch": 0.11580434393129112, "grad_norm": 4.65625, "learning_rate": 9.998292841343387e-06, "loss": 1.12023916, "memory(GiB)": 752.16, "step": 4565, "train_speed(iter/s)": 0.103943 }, { "acc": 0.74308062, "epoch": 0.11593118330032867, "grad_norm": 3.609375, "learning_rate": 9.998265330747909e-06, "loss": 1.03777208, "memory(GiB)": 752.16, "step": 4570, "train_speed(iter/s)": 0.1039 }, { "acc": 0.74852285, "epoch": 0.11605802266936623, "grad_norm": 5.0625, "learning_rate": 9.998237600294667e-06, "loss": 0.99675465, "memory(GiB)": 752.16, "step": 4575, "train_speed(iter/s)": 0.103861 }, { "acc": 0.75376573, "epoch": 0.1161848620384038, "grad_norm": 4.1875, "learning_rate": 9.998209649984876e-06, "loss": 0.94726553, "memory(GiB)": 752.16, "step": 4580, "train_speed(iter/s)": 0.10382 }, { "acc": 0.75058179, "epoch": 0.11631170140744135, "grad_norm": 3.46875, "learning_rate": 9.998181479819768e-06, "loss": 1.01577826, "memory(GiB)": 752.16, "step": 4585, "train_speed(iter/s)": 0.103775 }, { "acc": 0.75974236, "epoch": 0.1164385407764789, "grad_norm": 3.15625, "learning_rate": 9.998153089800583e-06, "loss": 0.91112318, "memory(GiB)": 752.16, "step": 4590, "train_speed(iter/s)": 0.103728 }, { "acc": 0.75096498, "epoch": 0.11656538014551647, "grad_norm": 3.578125, "learning_rate": 9.998124479928569e-06, "loss": 0.96490622, "memory(GiB)": 752.16, "step": 4595, "train_speed(iter/s)": 0.103683 }, { "acc": 0.7544312, "epoch": 0.11669221951455402, "grad_norm": 3.0625, "learning_rate": 9.998095650204982e-06, "loss": 1.01440954, "memory(GiB)": 752.16, "step": 4600, "train_speed(iter/s)": 0.103639 }, { "acc": 0.73815861, "epoch": 0.11681905888359158, "grad_norm": 2.953125, "learning_rate": 9.998066600631094e-06, "loss": 0.99812269, "memory(GiB)": 752.16, "step": 4605, "train_speed(iter/s)": 0.103602 }, { "acc": 0.74432735, "epoch": 0.11694589825262915, "grad_norm": 4.3125, "learning_rate": 9.998037331208181e-06, "loss": 1.00787649, "memory(GiB)": 752.16, "step": 4610, "train_speed(iter/s)": 0.103566 }, { "acc": 0.74547324, "epoch": 0.1170727376216667, "grad_norm": 3.640625, "learning_rate": 9.99800784193753e-06, "loss": 0.9641923, "memory(GiB)": 752.16, "step": 4615, "train_speed(iter/s)": 0.103514 }, { "acc": 0.74724116, "epoch": 0.11719957699070425, "grad_norm": 3.46875, "learning_rate": 9.997978132820437e-06, "loss": 0.99503689, "memory(GiB)": 752.16, "step": 4620, "train_speed(iter/s)": 0.103483 }, { "acc": 0.74720478, "epoch": 0.11732641635974182, "grad_norm": 4.28125, "learning_rate": 9.997948203858212e-06, "loss": 1.01521282, "memory(GiB)": 752.16, "step": 4625, "train_speed(iter/s)": 0.103438 }, { "acc": 0.75069752, "epoch": 0.11745325572877938, "grad_norm": 3.640625, "learning_rate": 9.99791805505217e-06, "loss": 1.02976007, "memory(GiB)": 752.16, "step": 4630, "train_speed(iter/s)": 0.103401 }, { "acc": 0.74651785, "epoch": 0.11758009509781693, "grad_norm": 3.28125, "learning_rate": 9.997887686403637e-06, "loss": 0.98568392, "memory(GiB)": 752.16, "step": 4635, "train_speed(iter/s)": 0.103353 }, { "acc": 0.73894367, "epoch": 0.1177069344668545, "grad_norm": 3.40625, "learning_rate": 9.99785709791395e-06, "loss": 0.99165764, "memory(GiB)": 752.16, "step": 4640, "train_speed(iter/s)": 0.103312 }, { "acc": 0.75009871, "epoch": 0.11783377383589205, "grad_norm": 3.25, "learning_rate": 9.997826289584453e-06, "loss": 0.99496832, "memory(GiB)": 752.16, "step": 4645, "train_speed(iter/s)": 0.103275 }, { "acc": 0.76386623, "epoch": 0.1179606132049296, "grad_norm": 3.5625, "learning_rate": 9.997795261416501e-06, "loss": 0.88284941, "memory(GiB)": 752.16, "step": 4650, "train_speed(iter/s)": 0.103222 }, { "acc": 0.72487745, "epoch": 0.11808745257396717, "grad_norm": 3.78125, "learning_rate": 9.997764013411458e-06, "loss": 1.07046318, "memory(GiB)": 752.16, "step": 4655, "train_speed(iter/s)": 0.103188 }, { "acc": 0.74561038, "epoch": 0.11821429194300473, "grad_norm": 4.25, "learning_rate": 9.997732545570703e-06, "loss": 1.04524145, "memory(GiB)": 752.16, "step": 4660, "train_speed(iter/s)": 0.103158 }, { "acc": 0.73919764, "epoch": 0.11834113131204228, "grad_norm": 3.59375, "learning_rate": 9.997700857895614e-06, "loss": 1.01681013, "memory(GiB)": 752.16, "step": 4665, "train_speed(iter/s)": 0.103133 }, { "acc": 0.75343585, "epoch": 0.11846797068107985, "grad_norm": 3.78125, "learning_rate": 9.997668950387589e-06, "loss": 0.96315393, "memory(GiB)": 752.16, "step": 4670, "train_speed(iter/s)": 0.103093 }, { "acc": 0.75248737, "epoch": 0.1185948100501174, "grad_norm": 3.875, "learning_rate": 9.997636823048031e-06, "loss": 1.00509033, "memory(GiB)": 752.16, "step": 4675, "train_speed(iter/s)": 0.103042 }, { "acc": 0.74621258, "epoch": 0.11872164941915496, "grad_norm": 3.53125, "learning_rate": 9.997604475878353e-06, "loss": 1.04012814, "memory(GiB)": 752.16, "step": 4680, "train_speed(iter/s)": 0.103011 }, { "acc": 0.72870412, "epoch": 0.11884848878819253, "grad_norm": 3.78125, "learning_rate": 9.997571908879976e-06, "loss": 1.0702774, "memory(GiB)": 752.16, "step": 4685, "train_speed(iter/s)": 0.102967 }, { "acc": 0.74634085, "epoch": 0.11897532815723008, "grad_norm": 4.0625, "learning_rate": 9.997539122054334e-06, "loss": 1.01406097, "memory(GiB)": 752.16, "step": 4690, "train_speed(iter/s)": 0.102918 }, { "acc": 0.74766312, "epoch": 0.11910216752626764, "grad_norm": 3.5625, "learning_rate": 9.99750611540287e-06, "loss": 1.00270319, "memory(GiB)": 752.16, "step": 4695, "train_speed(iter/s)": 0.10288 }, { "acc": 0.76039348, "epoch": 0.1192290068953052, "grad_norm": 4.34375, "learning_rate": 9.997472888927036e-06, "loss": 1.03004503, "memory(GiB)": 752.16, "step": 4700, "train_speed(iter/s)": 0.102845 }, { "acc": 0.73990617, "epoch": 0.11935584626434276, "grad_norm": 3.296875, "learning_rate": 9.997439442628292e-06, "loss": 1.03703632, "memory(GiB)": 752.16, "step": 4705, "train_speed(iter/s)": 0.102805 }, { "acc": 0.74242711, "epoch": 0.11948268563338031, "grad_norm": 3.359375, "learning_rate": 9.997405776508107e-06, "loss": 1.00356293, "memory(GiB)": 752.16, "step": 4710, "train_speed(iter/s)": 0.102762 }, { "acc": 0.7579917, "epoch": 0.11960952500241788, "grad_norm": 4.0625, "learning_rate": 9.997371890567968e-06, "loss": 0.95205622, "memory(GiB)": 752.16, "step": 4715, "train_speed(iter/s)": 0.102711 }, { "acc": 0.75941219, "epoch": 0.11973636437145543, "grad_norm": 3.46875, "learning_rate": 9.99733778480936e-06, "loss": 0.94908867, "memory(GiB)": 752.16, "step": 4720, "train_speed(iter/s)": 0.102669 }, { "acc": 0.73999887, "epoch": 0.11986320374049299, "grad_norm": 3.9375, "learning_rate": 9.997303459233788e-06, "loss": 1.01623049, "memory(GiB)": 752.16, "step": 4725, "train_speed(iter/s)": 0.102622 }, { "acc": 0.76459441, "epoch": 0.11999004310953056, "grad_norm": 3.96875, "learning_rate": 9.997268913842756e-06, "loss": 0.94577379, "memory(GiB)": 752.16, "step": 4730, "train_speed(iter/s)": 0.102574 }, { "acc": 0.7466713, "epoch": 0.12011688247856811, "grad_norm": 4.1875, "learning_rate": 9.997234148637788e-06, "loss": 1.03604479, "memory(GiB)": 752.16, "step": 4735, "train_speed(iter/s)": 0.102549 }, { "acc": 0.7475369, "epoch": 0.12024372184760566, "grad_norm": 5.0625, "learning_rate": 9.997199163620413e-06, "loss": 0.99787531, "memory(GiB)": 752.16, "step": 4740, "train_speed(iter/s)": 0.102498 }, { "acc": 0.76194997, "epoch": 0.12037056121664323, "grad_norm": 3.609375, "learning_rate": 9.997163958792167e-06, "loss": 0.90208397, "memory(GiB)": 752.16, "step": 4745, "train_speed(iter/s)": 0.102463 }, { "acc": 0.75383925, "epoch": 0.12049740058568079, "grad_norm": 4.3125, "learning_rate": 9.997128534154602e-06, "loss": 0.97083187, "memory(GiB)": 752.16, "step": 4750, "train_speed(iter/s)": 0.102426 }, { "acc": 0.76673875, "epoch": 0.12062423995471834, "grad_norm": 3.859375, "learning_rate": 9.997092889709275e-06, "loss": 0.98251266, "memory(GiB)": 752.16, "step": 4755, "train_speed(iter/s)": 0.102389 }, { "acc": 0.7551239, "epoch": 0.12075107932375591, "grad_norm": 3.90625, "learning_rate": 9.99705702545775e-06, "loss": 0.94976053, "memory(GiB)": 752.16, "step": 4760, "train_speed(iter/s)": 0.102348 }, { "acc": 0.75013175, "epoch": 0.12087791869279346, "grad_norm": 4.4375, "learning_rate": 9.997020941401612e-06, "loss": 0.97356129, "memory(GiB)": 752.16, "step": 4765, "train_speed(iter/s)": 0.102313 }, { "acc": 0.75459833, "epoch": 0.12100475806183102, "grad_norm": 3.640625, "learning_rate": 9.996984637542442e-06, "loss": 0.93677044, "memory(GiB)": 752.16, "step": 4770, "train_speed(iter/s)": 0.102271 }, { "acc": 0.74295726, "epoch": 0.12113159743086858, "grad_norm": 4.3125, "learning_rate": 9.99694811388184e-06, "loss": 1.01171265, "memory(GiB)": 752.16, "step": 4775, "train_speed(iter/s)": 0.102237 }, { "acc": 0.74877625, "epoch": 0.12125843679990614, "grad_norm": 3.765625, "learning_rate": 9.996911370421412e-06, "loss": 1.03503504, "memory(GiB)": 752.16, "step": 4780, "train_speed(iter/s)": 0.102201 }, { "acc": 0.75407557, "epoch": 0.12138527616894369, "grad_norm": 3.765625, "learning_rate": 9.996874407162773e-06, "loss": 1.00580301, "memory(GiB)": 752.16, "step": 4785, "train_speed(iter/s)": 0.102156 }, { "acc": 0.74918756, "epoch": 0.12151211553798126, "grad_norm": 2.859375, "learning_rate": 9.99683722410755e-06, "loss": 0.99955511, "memory(GiB)": 752.16, "step": 4790, "train_speed(iter/s)": 0.102123 }, { "acc": 0.75499868, "epoch": 0.12163895490701881, "grad_norm": 4.09375, "learning_rate": 9.996799821257378e-06, "loss": 0.98588972, "memory(GiB)": 752.16, "step": 4795, "train_speed(iter/s)": 0.102089 }, { "acc": 0.75317125, "epoch": 0.12176579427605637, "grad_norm": 4.1875, "learning_rate": 9.996762198613905e-06, "loss": 1.00483751, "memory(GiB)": 752.16, "step": 4800, "train_speed(iter/s)": 0.102062 }, { "acc": 0.75305796, "epoch": 0.12189263364509394, "grad_norm": 3.265625, "learning_rate": 9.996724356178782e-06, "loss": 0.95304375, "memory(GiB)": 752.16, "step": 4805, "train_speed(iter/s)": 0.102029 }, { "acc": 0.75623436, "epoch": 0.12201947301413149, "grad_norm": 4.03125, "learning_rate": 9.996686293953674e-06, "loss": 0.98308544, "memory(GiB)": 752.16, "step": 4810, "train_speed(iter/s)": 0.101997 }, { "acc": 0.75411277, "epoch": 0.12214631238316905, "grad_norm": 3.5, "learning_rate": 9.996648011940259e-06, "loss": 0.93584023, "memory(GiB)": 752.16, "step": 4815, "train_speed(iter/s)": 0.101968 }, { "acc": 0.75710387, "epoch": 0.12227315175220661, "grad_norm": 3.984375, "learning_rate": 9.996609510140215e-06, "loss": 0.96444979, "memory(GiB)": 752.16, "step": 4820, "train_speed(iter/s)": 0.101939 }, { "acc": 0.75967565, "epoch": 0.12239999112124417, "grad_norm": 3.734375, "learning_rate": 9.996570788555242e-06, "loss": 0.96572151, "memory(GiB)": 752.16, "step": 4825, "train_speed(iter/s)": 0.101909 }, { "acc": 0.73076024, "epoch": 0.12252683049028172, "grad_norm": 4.09375, "learning_rate": 9.99653184718704e-06, "loss": 1.05108452, "memory(GiB)": 752.16, "step": 4830, "train_speed(iter/s)": 0.101869 }, { "acc": 0.74501381, "epoch": 0.12265366985931929, "grad_norm": 6.0625, "learning_rate": 9.996492686037318e-06, "loss": 1.0257637, "memory(GiB)": 752.16, "step": 4835, "train_speed(iter/s)": 0.101826 }, { "acc": 0.75450878, "epoch": 0.12278050922835684, "grad_norm": 4.1875, "learning_rate": 9.996453305107806e-06, "loss": 0.96025753, "memory(GiB)": 752.16, "step": 4840, "train_speed(iter/s)": 0.101786 }, { "acc": 0.75046601, "epoch": 0.1229073485973944, "grad_norm": 4.375, "learning_rate": 9.996413704400233e-06, "loss": 1.0217391, "memory(GiB)": 752.16, "step": 4845, "train_speed(iter/s)": 0.101757 }, { "acc": 0.74974484, "epoch": 0.12303418796643197, "grad_norm": 4.1875, "learning_rate": 9.996373883916339e-06, "loss": 0.97907734, "memory(GiB)": 752.16, "step": 4850, "train_speed(iter/s)": 0.101715 }, { "acc": 0.76258039, "epoch": 0.12316102733546952, "grad_norm": 3.59375, "learning_rate": 9.996333843657876e-06, "loss": 0.93298359, "memory(GiB)": 752.16, "step": 4855, "train_speed(iter/s)": 0.101681 }, { "acc": 0.76279583, "epoch": 0.12328786670450707, "grad_norm": 4.84375, "learning_rate": 9.99629358362661e-06, "loss": 0.93154068, "memory(GiB)": 752.16, "step": 4860, "train_speed(iter/s)": 0.101648 }, { "acc": 0.7465941, "epoch": 0.12341470607354464, "grad_norm": 3.578125, "learning_rate": 9.996253103824306e-06, "loss": 0.95391207, "memory(GiB)": 752.16, "step": 4865, "train_speed(iter/s)": 0.101621 }, { "acc": 0.7529007, "epoch": 0.1235415454425822, "grad_norm": 4.625, "learning_rate": 9.996212404252748e-06, "loss": 0.94577894, "memory(GiB)": 752.16, "step": 4870, "train_speed(iter/s)": 0.101586 }, { "acc": 0.75715299, "epoch": 0.12366838481161975, "grad_norm": 3.609375, "learning_rate": 9.996171484913725e-06, "loss": 1.00711946, "memory(GiB)": 752.16, "step": 4875, "train_speed(iter/s)": 0.101549 }, { "acc": 0.75582981, "epoch": 0.12379522418065732, "grad_norm": 3.453125, "learning_rate": 9.996130345809037e-06, "loss": 0.96841545, "memory(GiB)": 752.16, "step": 4880, "train_speed(iter/s)": 0.101513 }, { "acc": 0.74929113, "epoch": 0.12392206354969487, "grad_norm": 3.4375, "learning_rate": 9.996088986940492e-06, "loss": 0.92891188, "memory(GiB)": 752.16, "step": 4885, "train_speed(iter/s)": 0.101462 }, { "acc": 0.74627237, "epoch": 0.12404890291873243, "grad_norm": 4.0625, "learning_rate": 9.996047408309914e-06, "loss": 0.9967227, "memory(GiB)": 752.16, "step": 4890, "train_speed(iter/s)": 0.101415 }, { "acc": 0.74715157, "epoch": 0.12417574228777, "grad_norm": 3.84375, "learning_rate": 9.996005609919129e-06, "loss": 0.97716761, "memory(GiB)": 752.16, "step": 4895, "train_speed(iter/s)": 0.101379 }, { "acc": 0.74936247, "epoch": 0.12430258165680755, "grad_norm": 4.46875, "learning_rate": 9.995963591769974e-06, "loss": 0.98215828, "memory(GiB)": 752.16, "step": 4900, "train_speed(iter/s)": 0.101348 }, { "acc": 0.74838262, "epoch": 0.1244294210258451, "grad_norm": 3.515625, "learning_rate": 9.995921353864298e-06, "loss": 0.96148338, "memory(GiB)": 752.16, "step": 4905, "train_speed(iter/s)": 0.101291 }, { "acc": 0.73778849, "epoch": 0.12455626039488267, "grad_norm": 4.59375, "learning_rate": 9.99587889620396e-06, "loss": 1.02277622, "memory(GiB)": 752.16, "step": 4910, "train_speed(iter/s)": 0.101259 }, { "acc": 0.7596837, "epoch": 0.12468309976392022, "grad_norm": 3.59375, "learning_rate": 9.995836218790828e-06, "loss": 0.94523649, "memory(GiB)": 752.16, "step": 4915, "train_speed(iter/s)": 0.101213 }, { "acc": 0.75462875, "epoch": 0.12480993913295778, "grad_norm": 3.703125, "learning_rate": 9.995793321626778e-06, "loss": 0.91677475, "memory(GiB)": 752.16, "step": 4920, "train_speed(iter/s)": 0.101178 }, { "acc": 0.75017915, "epoch": 0.12493677850199535, "grad_norm": 4.09375, "learning_rate": 9.9957502047137e-06, "loss": 0.98129711, "memory(GiB)": 752.16, "step": 4925, "train_speed(iter/s)": 0.101146 }, { "acc": 0.75492878, "epoch": 0.1250636178710329, "grad_norm": 4.15625, "learning_rate": 9.995706868053482e-06, "loss": 0.98893394, "memory(GiB)": 752.16, "step": 4930, "train_speed(iter/s)": 0.101118 }, { "acc": 0.74878106, "epoch": 0.12519045724007047, "grad_norm": 3.34375, "learning_rate": 9.99566331164804e-06, "loss": 0.99787674, "memory(GiB)": 752.16, "step": 4935, "train_speed(iter/s)": 0.101087 }, { "acc": 0.75867982, "epoch": 0.125317296609108, "grad_norm": 3.640625, "learning_rate": 9.995619535499288e-06, "loss": 0.97289009, "memory(GiB)": 752.16, "step": 4940, "train_speed(iter/s)": 0.10105 }, { "acc": 0.74322863, "epoch": 0.12544413597814558, "grad_norm": 4.3125, "learning_rate": 9.995575539609145e-06, "loss": 1.01749001, "memory(GiB)": 752.16, "step": 4945, "train_speed(iter/s)": 0.101013 }, { "acc": 0.76369638, "epoch": 0.12557097534718314, "grad_norm": 3.9375, "learning_rate": 9.995531323979554e-06, "loss": 0.98480167, "memory(GiB)": 752.16, "step": 4950, "train_speed(iter/s)": 0.100963 }, { "acc": 0.76020536, "epoch": 0.12569781471622068, "grad_norm": 4.21875, "learning_rate": 9.995486888612456e-06, "loss": 0.94628887, "memory(GiB)": 752.16, "step": 4955, "train_speed(iter/s)": 0.100924 }, { "acc": 0.75865121, "epoch": 0.12582465408525825, "grad_norm": 3.546875, "learning_rate": 9.995442233509807e-06, "loss": 0.97654161, "memory(GiB)": 752.16, "step": 4960, "train_speed(iter/s)": 0.100877 }, { "acc": 0.75113387, "epoch": 0.12595149345429582, "grad_norm": 3.140625, "learning_rate": 9.99539735867357e-06, "loss": 0.9601409, "memory(GiB)": 752.16, "step": 4965, "train_speed(iter/s)": 0.100846 }, { "acc": 0.74876742, "epoch": 0.12607833282333336, "grad_norm": 3.5, "learning_rate": 9.99535226410572e-06, "loss": 1.01792173, "memory(GiB)": 752.16, "step": 4970, "train_speed(iter/s)": 0.100802 }, { "acc": 0.75631413, "epoch": 0.12620517219237093, "grad_norm": 4.5625, "learning_rate": 9.99530694980824e-06, "loss": 0.98233175, "memory(GiB)": 752.16, "step": 4975, "train_speed(iter/s)": 0.100764 }, { "acc": 0.7497674, "epoch": 0.1263320115614085, "grad_norm": 3.828125, "learning_rate": 9.995261415783121e-06, "loss": 0.92036467, "memory(GiB)": 752.16, "step": 4980, "train_speed(iter/s)": 0.100732 }, { "acc": 0.76099453, "epoch": 0.12645885093044604, "grad_norm": 13.0625, "learning_rate": 9.995215662032371e-06, "loss": 0.95832729, "memory(GiB)": 752.16, "step": 4985, "train_speed(iter/s)": 0.10069 }, { "acc": 0.73373742, "epoch": 0.1265856902994836, "grad_norm": 3.296875, "learning_rate": 9.995169688557998e-06, "loss": 0.97786474, "memory(GiB)": 752.16, "step": 4990, "train_speed(iter/s)": 0.100651 }, { "acc": 0.74838552, "epoch": 0.12671252966852117, "grad_norm": 3.859375, "learning_rate": 9.995123495362027e-06, "loss": 0.99101439, "memory(GiB)": 752.16, "step": 4995, "train_speed(iter/s)": 0.100613 }, { "acc": 0.74706802, "epoch": 0.1268393690375587, "grad_norm": 5.1875, "learning_rate": 9.995077082446488e-06, "loss": 0.97952814, "memory(GiB)": 752.16, "step": 5000, "train_speed(iter/s)": 0.100577 }, { "epoch": 0.1268393690375587, "eval_acc": 0.7382015200881322, "eval_loss": 0.9582358002662659, "eval_runtime": 1150.5655, "eval_samples_per_second": 5.536, "eval_steps_per_second": 5.536, "step": 5000 }, { "acc": 0.74380736, "epoch": 0.12696620840659628, "grad_norm": 3.890625, "learning_rate": 9.995030449813425e-06, "loss": 1.03213043, "memory(GiB)": 685.94, "step": 5005, "train_speed(iter/s)": 11.224222 }, { "acc": 0.73832207, "epoch": 0.12709304777563385, "grad_norm": 3.875, "learning_rate": 9.994983597464886e-06, "loss": 1.02516899, "memory(GiB)": 685.94, "step": 5010, "train_speed(iter/s)": 9.766327 }, { "acc": 0.75005326, "epoch": 0.1272198871446714, "grad_norm": 4.3125, "learning_rate": 9.994936525402934e-06, "loss": 0.98979521, "memory(GiB)": 685.94, "step": 5015, "train_speed(iter/s)": 8.619609 }, { "acc": 0.75587425, "epoch": 0.12734672651370896, "grad_norm": 3.234375, "learning_rate": 9.99488923362964e-06, "loss": 0.95403843, "memory(GiB)": 695.51, "step": 5020, "train_speed(iter/s)": 7.698297 }, { "acc": 0.75424728, "epoch": 0.12747356588274653, "grad_norm": 3.5625, "learning_rate": 9.994841722147082e-06, "loss": 1.01266899, "memory(GiB)": 695.51, "step": 5025, "train_speed(iter/s)": 6.994585 }, { "acc": 0.75497198, "epoch": 0.12760040525178407, "grad_norm": 3.125, "learning_rate": 9.994793990957352e-06, "loss": 1.00433655, "memory(GiB)": 695.51, "step": 5030, "train_speed(iter/s)": 6.309634 }, { "acc": 0.74364724, "epoch": 0.12772724462082163, "grad_norm": 4.8125, "learning_rate": 9.994746040062548e-06, "loss": 1.01294498, "memory(GiB)": 695.51, "step": 5035, "train_speed(iter/s)": 5.853006 }, { "acc": 0.74793839, "epoch": 0.1278540839898592, "grad_norm": 3.640625, "learning_rate": 9.994697869464781e-06, "loss": 1.00968323, "memory(GiB)": 695.51, "step": 5040, "train_speed(iter/s)": 5.460047 }, { "acc": 0.75869646, "epoch": 0.12798092335889674, "grad_norm": 4.125, "learning_rate": 9.994649479166167e-06, "loss": 0.93388243, "memory(GiB)": 695.51, "step": 5045, "train_speed(iter/s)": 5.117368 }, { "acc": 0.74759712, "epoch": 0.1281077627279343, "grad_norm": 3.796875, "learning_rate": 9.99460086916884e-06, "loss": 0.97734404, "memory(GiB)": 695.51, "step": 5050, "train_speed(iter/s)": 4.804677 }, { "acc": 0.73673806, "epoch": 0.12823460209697188, "grad_norm": 3.546875, "learning_rate": 9.99455203947493e-06, "loss": 1.02416534, "memory(GiB)": 695.51, "step": 5055, "train_speed(iter/s)": 4.526519 }, { "acc": 0.73931127, "epoch": 0.12836144146600942, "grad_norm": 3.109375, "learning_rate": 9.994502990086591e-06, "loss": 0.99528847, "memory(GiB)": 706.35, "step": 5060, "train_speed(iter/s)": 4.26963 }, { "acc": 0.73404751, "epoch": 0.128488280835047, "grad_norm": 3.28125, "learning_rate": 9.994453721005982e-06, "loss": 1.02900867, "memory(GiB)": 706.36, "step": 5065, "train_speed(iter/s)": 4.018349 }, { "acc": 0.75139446, "epoch": 0.12861512020408455, "grad_norm": 3.3125, "learning_rate": 9.994404232235265e-06, "loss": 0.959023, "memory(GiB)": 706.36, "step": 5070, "train_speed(iter/s)": 3.807155 }, { "acc": 0.76602216, "epoch": 0.1287419595731221, "grad_norm": 4.0625, "learning_rate": 9.994354523776617e-06, "loss": 0.9421195, "memory(GiB)": 706.36, "step": 5075, "train_speed(iter/s)": 3.633812 }, { "acc": 0.74199934, "epoch": 0.12886879894215966, "grad_norm": 3.9375, "learning_rate": 9.994304595632228e-06, "loss": 1.00199604, "memory(GiB)": 706.36, "step": 5080, "train_speed(iter/s)": 3.463102 }, { "acc": 0.73691249, "epoch": 0.12899563831119723, "grad_norm": 5.46875, "learning_rate": 9.994254447804292e-06, "loss": 1.04193735, "memory(GiB)": 706.36, "step": 5085, "train_speed(iter/s)": 3.325508 }, { "acc": 0.75552731, "epoch": 0.12912247768023477, "grad_norm": 3.9375, "learning_rate": 9.994204080295018e-06, "loss": 0.95514536, "memory(GiB)": 706.36, "step": 5090, "train_speed(iter/s)": 3.187816 }, { "acc": 0.74837279, "epoch": 0.12924931704927234, "grad_norm": 4.0625, "learning_rate": 9.994153493106615e-06, "loss": 1.00321941, "memory(GiB)": 706.36, "step": 5095, "train_speed(iter/s)": 3.066068 }, { "acc": 0.75356746, "epoch": 0.1293761564183099, "grad_norm": 3.75, "learning_rate": 9.994102686241317e-06, "loss": 0.91721354, "memory(GiB)": 706.36, "step": 5100, "train_speed(iter/s)": 2.945296 }, { "acc": 0.75584321, "epoch": 0.12950299578734745, "grad_norm": 4.1875, "learning_rate": 9.994051659701349e-06, "loss": 0.96614437, "memory(GiB)": 706.36, "step": 5105, "train_speed(iter/s)": 2.847326 }, { "acc": 0.73338623, "epoch": 0.12962983515638501, "grad_norm": 3.515625, "learning_rate": 9.994000413488963e-06, "loss": 1.05536346, "memory(GiB)": 706.36, "step": 5110, "train_speed(iter/s)": 2.74862 }, { "acc": 0.75367103, "epoch": 0.12975667452542258, "grad_norm": 3.0625, "learning_rate": 9.993948947606411e-06, "loss": 0.97288752, "memory(GiB)": 706.36, "step": 5115, "train_speed(iter/s)": 2.650674 }, { "acc": 0.77712193, "epoch": 0.12988351389446012, "grad_norm": 3.5, "learning_rate": 9.993897262055956e-06, "loss": 0.93294344, "memory(GiB)": 706.36, "step": 5120, "train_speed(iter/s)": 2.572269 }, { "acc": 0.75017996, "epoch": 0.1300103532634977, "grad_norm": 3.5625, "learning_rate": 9.993845356839872e-06, "loss": 0.97371922, "memory(GiB)": 706.36, "step": 5125, "train_speed(iter/s)": 2.489387 }, { "acc": 0.72993417, "epoch": 0.13013719263253526, "grad_norm": 4.4375, "learning_rate": 9.993793231960442e-06, "loss": 1.02742968, "memory(GiB)": 706.36, "step": 5130, "train_speed(iter/s)": 2.413955 }, { "acc": 0.75048513, "epoch": 0.1302640320015728, "grad_norm": 3.234375, "learning_rate": 9.99374088741996e-06, "loss": 0.9793952, "memory(GiB)": 706.36, "step": 5135, "train_speed(iter/s)": 2.341687 }, { "acc": 0.74271703, "epoch": 0.13039087137061037, "grad_norm": 3.765625, "learning_rate": 9.993688323220725e-06, "loss": 0.99770088, "memory(GiB)": 706.36, "step": 5140, "train_speed(iter/s)": 2.271825 }, { "acc": 0.7685226, "epoch": 0.13051771073964794, "grad_norm": 4.09375, "learning_rate": 9.993635539365054e-06, "loss": 0.93323622, "memory(GiB)": 706.36, "step": 5145, "train_speed(iter/s)": 2.197533 }, { "acc": 0.7546526, "epoch": 0.13064455010868548, "grad_norm": 3.90625, "learning_rate": 9.993582535855265e-06, "loss": 0.96185789, "memory(GiB)": 706.36, "step": 5150, "train_speed(iter/s)": 2.140917 }, { "acc": 0.7396543, "epoch": 0.13077138947772304, "grad_norm": 3.859375, "learning_rate": 9.993529312693691e-06, "loss": 0.99631157, "memory(GiB)": 706.36, "step": 5155, "train_speed(iter/s)": 2.086276 }, { "acc": 0.75716028, "epoch": 0.1308982288467606, "grad_norm": 3.671875, "learning_rate": 9.993475869882672e-06, "loss": 0.97877178, "memory(GiB)": 709.07, "step": 5160, "train_speed(iter/s)": 2.036574 }, { "acc": 0.76204162, "epoch": 0.13102506821579815, "grad_norm": 3.734375, "learning_rate": 9.99342220742456e-06, "loss": 0.96487474, "memory(GiB)": 709.07, "step": 5165, "train_speed(iter/s)": 1.989754 }, { "acc": 0.75560422, "epoch": 0.13115190758483572, "grad_norm": 3.203125, "learning_rate": 9.993368325321717e-06, "loss": 0.95793638, "memory(GiB)": 720.92, "step": 5170, "train_speed(iter/s)": 1.937228 }, { "acc": 0.7545177, "epoch": 0.1312787469538733, "grad_norm": 3.46875, "learning_rate": 9.99331422357651e-06, "loss": 0.95265474, "memory(GiB)": 720.92, "step": 5175, "train_speed(iter/s)": 1.890466 }, { "acc": 0.74717793, "epoch": 0.13140558632291083, "grad_norm": 4.9375, "learning_rate": 9.99325990219132e-06, "loss": 0.98266573, "memory(GiB)": 720.92, "step": 5180, "train_speed(iter/s)": 1.841776 }, { "acc": 0.73658118, "epoch": 0.1315324256919484, "grad_norm": 4.28125, "learning_rate": 9.993205361168536e-06, "loss": 1.0161088, "memory(GiB)": 720.92, "step": 5185, "train_speed(iter/s)": 1.801471 }, { "acc": 0.76326227, "epoch": 0.13165926506098596, "grad_norm": 5.4375, "learning_rate": 9.993150600510557e-06, "loss": 0.94549856, "memory(GiB)": 720.92, "step": 5190, "train_speed(iter/s)": 1.758217 }, { "acc": 0.73670673, "epoch": 0.1317861044300235, "grad_norm": 3.796875, "learning_rate": 9.993095620219793e-06, "loss": 1.01097326, "memory(GiB)": 720.92, "step": 5195, "train_speed(iter/s)": 1.721243 }, { "acc": 0.74235926, "epoch": 0.13191294379906107, "grad_norm": 3.421875, "learning_rate": 9.993040420298663e-06, "loss": 1.02001362, "memory(GiB)": 720.92, "step": 5200, "train_speed(iter/s)": 1.685297 }, { "acc": 0.75692391, "epoch": 0.13203978316809864, "grad_norm": 4.1875, "learning_rate": 9.992985000749592e-06, "loss": 1.00827961, "memory(GiB)": 720.92, "step": 5205, "train_speed(iter/s)": 1.647633 }, { "acc": 0.74145336, "epoch": 0.13216662253713618, "grad_norm": 3.828125, "learning_rate": 9.99292936157502e-06, "loss": 0.98924131, "memory(GiB)": 720.92, "step": 5210, "train_speed(iter/s)": 1.618131 }, { "acc": 0.74740767, "epoch": 0.13229346190617375, "grad_norm": 3.84375, "learning_rate": 9.992873502777394e-06, "loss": 1.00679178, "memory(GiB)": 720.92, "step": 5215, "train_speed(iter/s)": 1.585603 }, { "acc": 0.74466586, "epoch": 0.13242030127521132, "grad_norm": 3.046875, "learning_rate": 9.99281742435917e-06, "loss": 1.00212078, "memory(GiB)": 720.92, "step": 5220, "train_speed(iter/s)": 1.557401 }, { "acc": 0.75231166, "epoch": 0.13254714064424886, "grad_norm": 3.6875, "learning_rate": 9.992761126322816e-06, "loss": 1.01863003, "memory(GiB)": 720.92, "step": 5225, "train_speed(iter/s)": 1.530118 }, { "acc": 0.74967875, "epoch": 0.13267398001328642, "grad_norm": 3.796875, "learning_rate": 9.992704608670808e-06, "loss": 0.95891924, "memory(GiB)": 720.92, "step": 5230, "train_speed(iter/s)": 1.500588 }, { "acc": 0.73157883, "epoch": 0.132800819382324, "grad_norm": 3.734375, "learning_rate": 9.992647871405633e-06, "loss": 1.0294549, "memory(GiB)": 720.92, "step": 5235, "train_speed(iter/s)": 1.474578 }, { "acc": 0.745366, "epoch": 0.13292765875136153, "grad_norm": 3.421875, "learning_rate": 9.992590914529783e-06, "loss": 1.03484535, "memory(GiB)": 720.92, "step": 5240, "train_speed(iter/s)": 1.448698 }, { "acc": 0.74465008, "epoch": 0.1330544981203991, "grad_norm": 3.84375, "learning_rate": 9.992533738045768e-06, "loss": 0.96093769, "memory(GiB)": 720.92, "step": 5245, "train_speed(iter/s)": 1.423782 }, { "acc": 0.75215673, "epoch": 0.13318133748943667, "grad_norm": 3.625, "learning_rate": 9.9924763419561e-06, "loss": 0.97072792, "memory(GiB)": 720.92, "step": 5250, "train_speed(iter/s)": 1.397811 }, { "acc": 0.76497703, "epoch": 0.1333081768584742, "grad_norm": 3.0625, "learning_rate": 9.992418726263307e-06, "loss": 0.94634705, "memory(GiB)": 720.92, "step": 5255, "train_speed(iter/s)": 1.37251 }, { "acc": 0.75926399, "epoch": 0.13343501622751178, "grad_norm": 3.671875, "learning_rate": 9.992360890969918e-06, "loss": 0.91400871, "memory(GiB)": 720.92, "step": 5260, "train_speed(iter/s)": 1.351282 }, { "acc": 0.74869723, "epoch": 0.13356185559654934, "grad_norm": 4.1875, "learning_rate": 9.992302836078482e-06, "loss": 1.00202579, "memory(GiB)": 720.92, "step": 5265, "train_speed(iter/s)": 1.329316 }, { "acc": 0.76227064, "epoch": 0.13368869496558689, "grad_norm": 4.09375, "learning_rate": 9.992244561591551e-06, "loss": 0.95735264, "memory(GiB)": 720.92, "step": 5270, "train_speed(iter/s)": 1.309269 }, { "acc": 0.74665456, "epoch": 0.13381553433462445, "grad_norm": 3.4375, "learning_rate": 9.992186067511688e-06, "loss": 1.01034842, "memory(GiB)": 720.92, "step": 5275, "train_speed(iter/s)": 1.288782 }, { "acc": 0.75827289, "epoch": 0.13394237370366202, "grad_norm": 4.21875, "learning_rate": 9.992127353841465e-06, "loss": 0.93456907, "memory(GiB)": 720.92, "step": 5280, "train_speed(iter/s)": 1.269925 }, { "acc": 0.74000421, "epoch": 0.13406921307269956, "grad_norm": 3.953125, "learning_rate": 9.992068420583466e-06, "loss": 1.09607973, "memory(GiB)": 720.92, "step": 5285, "train_speed(iter/s)": 1.251258 }, { "acc": 0.7467957, "epoch": 0.13419605244173713, "grad_norm": 10.625, "learning_rate": 9.992009267740282e-06, "loss": 1.02662802, "memory(GiB)": 720.92, "step": 5290, "train_speed(iter/s)": 1.232414 }, { "acc": 0.75270038, "epoch": 0.1343228918107747, "grad_norm": 4.875, "learning_rate": 9.991949895314517e-06, "loss": 1.01215591, "memory(GiB)": 720.92, "step": 5295, "train_speed(iter/s)": 1.214217 }, { "acc": 0.74388995, "epoch": 0.13444973117981224, "grad_norm": 3.703125, "learning_rate": 9.991890303308781e-06, "loss": 0.95126848, "memory(GiB)": 720.92, "step": 5300, "train_speed(iter/s)": 1.195544 }, { "acc": 0.74746695, "epoch": 0.1345765705488498, "grad_norm": 4.0625, "learning_rate": 9.991830491725696e-06, "loss": 1.01871986, "memory(GiB)": 720.92, "step": 5305, "train_speed(iter/s)": 1.178356 }, { "acc": 0.75427232, "epoch": 0.13470340991788737, "grad_norm": 3.53125, "learning_rate": 9.991770460567893e-06, "loss": 0.94578743, "memory(GiB)": 720.92, "step": 5310, "train_speed(iter/s)": 1.1615 }, { "acc": 0.7509994, "epoch": 0.1348302492869249, "grad_norm": 4.625, "learning_rate": 9.99171020983801e-06, "loss": 0.9731636, "memory(GiB)": 720.92, "step": 5315, "train_speed(iter/s)": 1.147091 }, { "acc": 0.74613676, "epoch": 0.13495708865596248, "grad_norm": 3.546875, "learning_rate": 9.991649739538701e-06, "loss": 0.99395113, "memory(GiB)": 720.92, "step": 5320, "train_speed(iter/s)": 1.132257 }, { "acc": 0.75032306, "epoch": 0.13508392802500005, "grad_norm": 3.953125, "learning_rate": 9.991589049672625e-06, "loss": 0.94492264, "memory(GiB)": 720.92, "step": 5325, "train_speed(iter/s)": 1.117273 }, { "acc": 0.76288409, "epoch": 0.1352107673940376, "grad_norm": 3.90625, "learning_rate": 9.99152814024245e-06, "loss": 0.96170311, "memory(GiB)": 720.92, "step": 5330, "train_speed(iter/s)": 1.103375 }, { "acc": 0.73930154, "epoch": 0.13533760676307516, "grad_norm": 3.9375, "learning_rate": 9.991467011250856e-06, "loss": 1.06535454, "memory(GiB)": 720.92, "step": 5335, "train_speed(iter/s)": 1.089098 }, { "acc": 0.75519352, "epoch": 0.13546444613211273, "grad_norm": 4.0625, "learning_rate": 9.991405662700532e-06, "loss": 0.97965117, "memory(GiB)": 720.92, "step": 5340, "train_speed(iter/s)": 1.075353 }, { "acc": 0.74159508, "epoch": 0.13559128550115027, "grad_norm": 3.921875, "learning_rate": 9.991344094594177e-06, "loss": 0.99276648, "memory(GiB)": 720.92, "step": 5345, "train_speed(iter/s)": 1.0633 }, { "acc": 0.76205759, "epoch": 0.13571812487018783, "grad_norm": 3.28125, "learning_rate": 9.991282306934497e-06, "loss": 0.93948565, "memory(GiB)": 720.92, "step": 5350, "train_speed(iter/s)": 1.050732 }, { "acc": 0.75011287, "epoch": 0.1358449642392254, "grad_norm": 3.8125, "learning_rate": 9.991220299724214e-06, "loss": 0.96336689, "memory(GiB)": 720.92, "step": 5355, "train_speed(iter/s)": 1.038878 }, { "acc": 0.75802174, "epoch": 0.13597180360826294, "grad_norm": 3.515625, "learning_rate": 9.991158072966053e-06, "loss": 0.96565104, "memory(GiB)": 720.92, "step": 5360, "train_speed(iter/s)": 1.027291 }, { "acc": 0.76896944, "epoch": 0.1360986429773005, "grad_norm": 3.78125, "learning_rate": 9.991095626662748e-06, "loss": 0.96900511, "memory(GiB)": 720.92, "step": 5365, "train_speed(iter/s)": 1.016521 }, { "acc": 0.75249057, "epoch": 0.13622548234633808, "grad_norm": 4.46875, "learning_rate": 9.99103296081705e-06, "loss": 0.97000227, "memory(GiB)": 720.92, "step": 5370, "train_speed(iter/s)": 1.006035 }, { "acc": 0.73969774, "epoch": 0.13635232171537562, "grad_norm": 3.640625, "learning_rate": 9.990970075431715e-06, "loss": 1.01554508, "memory(GiB)": 720.92, "step": 5375, "train_speed(iter/s)": 0.994642 }, { "acc": 0.75958853, "epoch": 0.1364791610844132, "grad_norm": 3.90625, "learning_rate": 9.990906970509512e-06, "loss": 0.95686626, "memory(GiB)": 720.92, "step": 5380, "train_speed(iter/s)": 0.982908 }, { "acc": 0.74268661, "epoch": 0.13660600045345075, "grad_norm": 3.234375, "learning_rate": 9.99084364605321e-06, "loss": 1.02057381, "memory(GiB)": 720.92, "step": 5385, "train_speed(iter/s)": 0.971385 }, { "acc": 0.76423988, "epoch": 0.1367328398224883, "grad_norm": 3.578125, "learning_rate": 9.990780102065598e-06, "loss": 0.95466022, "memory(GiB)": 720.92, "step": 5390, "train_speed(iter/s)": 0.960375 }, { "acc": 0.75069275, "epoch": 0.13685967919152586, "grad_norm": 3.234375, "learning_rate": 9.990716338549472e-06, "loss": 0.93844833, "memory(GiB)": 720.92, "step": 5395, "train_speed(iter/s)": 0.949121 }, { "acc": 0.74169111, "epoch": 0.13698651856056343, "grad_norm": 3.703125, "learning_rate": 9.990652355507634e-06, "loss": 0.98303709, "memory(GiB)": 720.92, "step": 5400, "train_speed(iter/s)": 0.939193 }, { "acc": 0.74389811, "epoch": 0.13711335792960097, "grad_norm": 3.71875, "learning_rate": 9.990588152942901e-06, "loss": 0.9930378, "memory(GiB)": 720.92, "step": 5405, "train_speed(iter/s)": 0.928908 }, { "acc": 0.76002069, "epoch": 0.13724019729863854, "grad_norm": 4.0625, "learning_rate": 9.990523730858094e-06, "loss": 0.98007107, "memory(GiB)": 720.92, "step": 5410, "train_speed(iter/s)": 0.919766 }, { "acc": 0.75955682, "epoch": 0.1373670366676761, "grad_norm": 3.828125, "learning_rate": 9.990459089256053e-06, "loss": 0.92423115, "memory(GiB)": 720.92, "step": 5415, "train_speed(iter/s)": 0.911294 }, { "acc": 0.74874148, "epoch": 0.13749387603671365, "grad_norm": 3.125, "learning_rate": 9.990394228139616e-06, "loss": 0.91347198, "memory(GiB)": 720.92, "step": 5420, "train_speed(iter/s)": 0.900312 }, { "acc": 0.75045867, "epoch": 0.13762071540575121, "grad_norm": 4.03125, "learning_rate": 9.990329147511636e-06, "loss": 1.0071455, "memory(GiB)": 720.92, "step": 5425, "train_speed(iter/s)": 0.891717 }, { "acc": 0.75431638, "epoch": 0.13774755477478878, "grad_norm": 3.84375, "learning_rate": 9.990263847374976e-06, "loss": 0.93601027, "memory(GiB)": 720.92, "step": 5430, "train_speed(iter/s)": 0.882484 }, { "acc": 0.74735575, "epoch": 0.13787439414382632, "grad_norm": 3.140625, "learning_rate": 9.99019832773251e-06, "loss": 0.98336821, "memory(GiB)": 720.92, "step": 5435, "train_speed(iter/s)": 0.873879 }, { "acc": 0.75198207, "epoch": 0.1380012335128639, "grad_norm": 3.46875, "learning_rate": 9.99013258858712e-06, "loss": 0.95224972, "memory(GiB)": 720.92, "step": 5440, "train_speed(iter/s)": 0.865317 }, { "acc": 0.75303159, "epoch": 0.13812807288190146, "grad_norm": 3.96875, "learning_rate": 9.990066629941698e-06, "loss": 1.00533361, "memory(GiB)": 733.25, "step": 5445, "train_speed(iter/s)": 0.856097 }, { "acc": 0.7451786, "epoch": 0.138254912250939, "grad_norm": 3.21875, "learning_rate": 9.990000451799142e-06, "loss": 1.03218346, "memory(GiB)": 733.25, "step": 5450, "train_speed(iter/s)": 0.847599 }, { "acc": 0.74803524, "epoch": 0.13838175161997657, "grad_norm": 3.34375, "learning_rate": 9.989934054162367e-06, "loss": 0.99784346, "memory(GiB)": 733.25, "step": 5455, "train_speed(iter/s)": 0.839724 }, { "acc": 0.73315706, "epoch": 0.13850859098901414, "grad_norm": 4.25, "learning_rate": 9.989867437034291e-06, "loss": 1.0524704, "memory(GiB)": 733.25, "step": 5460, "train_speed(iter/s)": 0.831566 }, { "acc": 0.7532445, "epoch": 0.13863543035805168, "grad_norm": 4.0, "learning_rate": 9.989800600417845e-06, "loss": 0.96512794, "memory(GiB)": 733.25, "step": 5465, "train_speed(iter/s)": 0.823676 }, { "acc": 0.75845385, "epoch": 0.13876226972708924, "grad_norm": 4.0, "learning_rate": 9.98973354431597e-06, "loss": 0.99316053, "memory(GiB)": 733.25, "step": 5470, "train_speed(iter/s)": 0.816441 }, { "acc": 0.74239817, "epoch": 0.1388891090961268, "grad_norm": 3.453125, "learning_rate": 9.989666268731613e-06, "loss": 0.99130278, "memory(GiB)": 733.25, "step": 5475, "train_speed(iter/s)": 0.80971 }, { "acc": 0.74633904, "epoch": 0.13901594846516435, "grad_norm": 4.65625, "learning_rate": 9.989598773667737e-06, "loss": 1.01893578, "memory(GiB)": 733.25, "step": 5480, "train_speed(iter/s)": 0.802585 }, { "acc": 0.75186906, "epoch": 0.13914278783420192, "grad_norm": 3.171875, "learning_rate": 9.989531059127307e-06, "loss": 0.95224333, "memory(GiB)": 733.25, "step": 5485, "train_speed(iter/s)": 0.794661 }, { "acc": 0.77075977, "epoch": 0.1392696272032395, "grad_norm": 4.15625, "learning_rate": 9.989463125113304e-06, "loss": 0.89247227, "memory(GiB)": 733.25, "step": 5490, "train_speed(iter/s)": 0.787567 }, { "acc": 0.76524277, "epoch": 0.13939646657227703, "grad_norm": 4.28125, "learning_rate": 9.989394971628717e-06, "loss": 0.97145214, "memory(GiB)": 733.25, "step": 5495, "train_speed(iter/s)": 0.781207 }, { "acc": 0.74325719, "epoch": 0.1395233059413146, "grad_norm": 3.421875, "learning_rate": 9.989326598676542e-06, "loss": 0.9871562, "memory(GiB)": 733.25, "step": 5500, "train_speed(iter/s)": 0.773811 }, { "epoch": 0.1395233059413146, "eval_acc": 0.7397167621195341, "eval_loss": 0.950545608997345, "eval_runtime": 1151.7409, "eval_samples_per_second": 5.531, "eval_steps_per_second": 5.531, "step": 5500 }, { "acc": 0.75052838, "epoch": 0.13965014531035216, "grad_norm": 3.515625, "learning_rate": 9.989258006259783e-06, "loss": 1.00677967, "memory(GiB)": 733.25, "step": 5505, "train_speed(iter/s)": 0.613207 }, { "acc": 0.75000763, "epoch": 0.1397769846793897, "grad_norm": 3.421875, "learning_rate": 9.989189194381465e-06, "loss": 0.94798746, "memory(GiB)": 733.25, "step": 5510, "train_speed(iter/s)": 0.609193 }, { "acc": 0.75012617, "epoch": 0.13990382404842727, "grad_norm": 3.703125, "learning_rate": 9.98912016304461e-06, "loss": 0.98472176, "memory(GiB)": 733.25, "step": 5515, "train_speed(iter/s)": 0.60511 }, { "acc": 0.74240603, "epoch": 0.14003066341746484, "grad_norm": 4.9375, "learning_rate": 9.989050912252255e-06, "loss": 1.06186218, "memory(GiB)": 733.25, "step": 5520, "train_speed(iter/s)": 0.601816 }, { "acc": 0.73991513, "epoch": 0.14015750278650238, "grad_norm": 3.953125, "learning_rate": 9.988981442007445e-06, "loss": 1.02157602, "memory(GiB)": 733.25, "step": 5525, "train_speed(iter/s)": 0.597506 }, { "acc": 0.74704614, "epoch": 0.14028434215553995, "grad_norm": 4.09375, "learning_rate": 9.98891175231324e-06, "loss": 1.01559668, "memory(GiB)": 733.28, "step": 5530, "train_speed(iter/s)": 0.593824 }, { "acc": 0.7552865, "epoch": 0.14041118152457752, "grad_norm": 3.25, "learning_rate": 9.9888418431727e-06, "loss": 0.9573123, "memory(GiB)": 733.28, "step": 5535, "train_speed(iter/s)": 0.589902 }, { "acc": 0.74690185, "epoch": 0.14053802089361506, "grad_norm": 4.21875, "learning_rate": 9.988771714588904e-06, "loss": 0.98733311, "memory(GiB)": 733.28, "step": 5540, "train_speed(iter/s)": 0.586325 }, { "acc": 0.75068641, "epoch": 0.14066486026265262, "grad_norm": 3.9375, "learning_rate": 9.988701366564937e-06, "loss": 0.96340561, "memory(GiB)": 733.28, "step": 5545, "train_speed(iter/s)": 0.583166 }, { "acc": 0.75370517, "epoch": 0.1407916996316902, "grad_norm": 2.9375, "learning_rate": 9.988630799103891e-06, "loss": 0.97464914, "memory(GiB)": 733.28, "step": 5550, "train_speed(iter/s)": 0.579619 }, { "acc": 0.75431943, "epoch": 0.14091853900072773, "grad_norm": 3.375, "learning_rate": 9.98856001220887e-06, "loss": 0.95243969, "memory(GiB)": 733.28, "step": 5555, "train_speed(iter/s)": 0.575919 }, { "acc": 0.76401701, "epoch": 0.1410453783697653, "grad_norm": 3.546875, "learning_rate": 9.988489005882989e-06, "loss": 0.94929447, "memory(GiB)": 733.28, "step": 5560, "train_speed(iter/s)": 0.572594 }, { "acc": 0.76718712, "epoch": 0.14117221773880287, "grad_norm": 3.234375, "learning_rate": 9.988417780129371e-06, "loss": 0.91205168, "memory(GiB)": 733.28, "step": 5565, "train_speed(iter/s)": 0.569397 }, { "acc": 0.73946424, "epoch": 0.1412990571078404, "grad_norm": 4.15625, "learning_rate": 9.988346334951149e-06, "loss": 1.0705409, "memory(GiB)": 733.28, "step": 5570, "train_speed(iter/s)": 0.566416 }, { "acc": 0.75224075, "epoch": 0.14142589647687798, "grad_norm": 3.421875, "learning_rate": 9.988274670351466e-06, "loss": 0.98892832, "memory(GiB)": 733.28, "step": 5575, "train_speed(iter/s)": 0.563279 }, { "acc": 0.7600666, "epoch": 0.14155273584591554, "grad_norm": 3.25, "learning_rate": 9.988202786333473e-06, "loss": 0.91135988, "memory(GiB)": 733.28, "step": 5580, "train_speed(iter/s)": 0.559664 }, { "acc": 0.75451241, "epoch": 0.14167957521495309, "grad_norm": 10.6875, "learning_rate": 9.988130682900333e-06, "loss": 0.97192049, "memory(GiB)": 733.28, "step": 5585, "train_speed(iter/s)": 0.55662 }, { "acc": 0.74731193, "epoch": 0.14180641458399065, "grad_norm": 3.609375, "learning_rate": 9.988058360055217e-06, "loss": 0.97573347, "memory(GiB)": 733.28, "step": 5590, "train_speed(iter/s)": 0.553092 }, { "acc": 0.74450898, "epoch": 0.14193325395302822, "grad_norm": 4.21875, "learning_rate": 9.987985817801307e-06, "loss": 1.04594336, "memory(GiB)": 733.28, "step": 5595, "train_speed(iter/s)": 0.550309 }, { "acc": 0.74998822, "epoch": 0.14206009332206576, "grad_norm": 4.34375, "learning_rate": 9.987913056141793e-06, "loss": 0.9941927, "memory(GiB)": 733.28, "step": 5600, "train_speed(iter/s)": 0.547253 }, { "acc": 0.74467316, "epoch": 0.14218693269110333, "grad_norm": 3.421875, "learning_rate": 9.987840075079878e-06, "loss": 1.00092945, "memory(GiB)": 733.28, "step": 5605, "train_speed(iter/s)": 0.544337 }, { "acc": 0.74542952, "epoch": 0.1423137720601409, "grad_norm": 3.953125, "learning_rate": 9.987766874618769e-06, "loss": 0.96608763, "memory(GiB)": 733.28, "step": 5610, "train_speed(iter/s)": 0.541052 }, { "acc": 0.75067, "epoch": 0.14244061142917844, "grad_norm": 3.484375, "learning_rate": 9.987693454761688e-06, "loss": 0.98531923, "memory(GiB)": 733.28, "step": 5615, "train_speed(iter/s)": 0.538045 }, { "acc": 0.76413145, "epoch": 0.142567450798216, "grad_norm": 3.3125, "learning_rate": 9.987619815511862e-06, "loss": 0.95331888, "memory(GiB)": 733.28, "step": 5620, "train_speed(iter/s)": 0.535177 }, { "acc": 0.76008906, "epoch": 0.14269429016725357, "grad_norm": 3.359375, "learning_rate": 9.987545956872533e-06, "loss": 0.92839928, "memory(GiB)": 733.28, "step": 5625, "train_speed(iter/s)": 0.532031 }, { "acc": 0.74695983, "epoch": 0.1428211295362911, "grad_norm": 3.8125, "learning_rate": 9.987471878846949e-06, "loss": 1.01008148, "memory(GiB)": 733.28, "step": 5630, "train_speed(iter/s)": 0.529272 }, { "acc": 0.74549341, "epoch": 0.14294796890532868, "grad_norm": 4.0625, "learning_rate": 9.987397581438367e-06, "loss": 1.00769367, "memory(GiB)": 733.28, "step": 5635, "train_speed(iter/s)": 0.526581 }, { "acc": 0.75065703, "epoch": 0.14307480827436625, "grad_norm": 3.609375, "learning_rate": 9.987323064650057e-06, "loss": 0.99777994, "memory(GiB)": 733.28, "step": 5640, "train_speed(iter/s)": 0.523855 }, { "acc": 0.76536608, "epoch": 0.1432016476434038, "grad_norm": 5.4375, "learning_rate": 9.987248328485295e-06, "loss": 0.90497112, "memory(GiB)": 733.28, "step": 5645, "train_speed(iter/s)": 0.521041 }, { "acc": 0.75962267, "epoch": 0.14332848701244136, "grad_norm": 4.5, "learning_rate": 9.987173372947373e-06, "loss": 0.95565796, "memory(GiB)": 733.28, "step": 5650, "train_speed(iter/s)": 0.518393 }, { "acc": 0.74397845, "epoch": 0.14345532638147893, "grad_norm": 4.03125, "learning_rate": 9.98709819803958e-06, "loss": 0.9805213, "memory(GiB)": 733.28, "step": 5655, "train_speed(iter/s)": 0.515725 }, { "acc": 0.73542495, "epoch": 0.14358216575051647, "grad_norm": 3.765625, "learning_rate": 9.98702280376523e-06, "loss": 1.03051519, "memory(GiB)": 733.28, "step": 5660, "train_speed(iter/s)": 0.512765 }, { "acc": 0.75391417, "epoch": 0.14370900511955403, "grad_norm": 4.59375, "learning_rate": 9.986947190127634e-06, "loss": 0.99182072, "memory(GiB)": 733.28, "step": 5665, "train_speed(iter/s)": 0.51045 }, { "acc": 0.74624019, "epoch": 0.1438358444885916, "grad_norm": 3.609375, "learning_rate": 9.98687135713012e-06, "loss": 1.03355713, "memory(GiB)": 733.28, "step": 5670, "train_speed(iter/s)": 0.507173 }, { "acc": 0.75272374, "epoch": 0.14396268385762914, "grad_norm": 3.765625, "learning_rate": 9.986795304776028e-06, "loss": 0.93574829, "memory(GiB)": 733.28, "step": 5675, "train_speed(iter/s)": 0.504671 }, { "acc": 0.75965385, "epoch": 0.1440895232266667, "grad_norm": 3.0625, "learning_rate": 9.986719033068697e-06, "loss": 0.89824772, "memory(GiB)": 733.28, "step": 5680, "train_speed(iter/s)": 0.502268 }, { "acc": 0.7592061, "epoch": 0.14421636259570428, "grad_norm": 3.9375, "learning_rate": 9.986642542011484e-06, "loss": 0.9566803, "memory(GiB)": 733.28, "step": 5685, "train_speed(iter/s)": 0.49997 }, { "acc": 0.75305099, "epoch": 0.14434320196474182, "grad_norm": 3.484375, "learning_rate": 9.986565831607755e-06, "loss": 0.97858696, "memory(GiB)": 733.28, "step": 5690, "train_speed(iter/s)": 0.497775 }, { "acc": 0.76204009, "epoch": 0.1444700413337794, "grad_norm": 4.0625, "learning_rate": 9.986488901860884e-06, "loss": 0.95111465, "memory(GiB)": 733.28, "step": 5695, "train_speed(iter/s)": 0.494854 }, { "acc": 0.75567436, "epoch": 0.14459688070281695, "grad_norm": 3.59375, "learning_rate": 9.986411752774252e-06, "loss": 0.92374325, "memory(GiB)": 733.28, "step": 5700, "train_speed(iter/s)": 0.492616 }, { "acc": 0.75533519, "epoch": 0.1447237200718545, "grad_norm": 4.59375, "learning_rate": 9.986334384351257e-06, "loss": 0.96552954, "memory(GiB)": 733.28, "step": 5705, "train_speed(iter/s)": 0.490315 }, { "acc": 0.75985918, "epoch": 0.14485055944089206, "grad_norm": 3.515625, "learning_rate": 9.986256796595297e-06, "loss": 0.92455654, "memory(GiB)": 733.28, "step": 5710, "train_speed(iter/s)": 0.488045 }, { "acc": 0.7456202, "epoch": 0.14497739880992963, "grad_norm": 3.65625, "learning_rate": 9.986178989509789e-06, "loss": 0.94999084, "memory(GiB)": 733.28, "step": 5715, "train_speed(iter/s)": 0.485746 }, { "acc": 0.7365231, "epoch": 0.14510423817896717, "grad_norm": 4.625, "learning_rate": 9.986100963098156e-06, "loss": 1.03201962, "memory(GiB)": 733.28, "step": 5720, "train_speed(iter/s)": 0.483193 }, { "acc": 0.74314446, "epoch": 0.14523107754800474, "grad_norm": 3.34375, "learning_rate": 9.986022717363825e-06, "loss": 1.01528502, "memory(GiB)": 733.28, "step": 5725, "train_speed(iter/s)": 0.480507 }, { "acc": 0.75699129, "epoch": 0.1453579169170423, "grad_norm": 3.453125, "learning_rate": 9.985944252310245e-06, "loss": 0.95386639, "memory(GiB)": 733.28, "step": 5730, "train_speed(iter/s)": 0.47832 }, { "acc": 0.7570735, "epoch": 0.14548475628607985, "grad_norm": 4.75, "learning_rate": 9.98586556794086e-06, "loss": 1.00440903, "memory(GiB)": 733.28, "step": 5735, "train_speed(iter/s)": 0.476173 }, { "acc": 0.75115309, "epoch": 0.14561159565511742, "grad_norm": 4.75, "learning_rate": 9.985786664259136e-06, "loss": 0.9905406, "memory(GiB)": 733.28, "step": 5740, "train_speed(iter/s)": 0.473866 }, { "acc": 0.75747099, "epoch": 0.14573843502415498, "grad_norm": 7.84375, "learning_rate": 9.985707541268543e-06, "loss": 1.02919407, "memory(GiB)": 733.28, "step": 5745, "train_speed(iter/s)": 0.471612 }, { "acc": 0.75352902, "epoch": 0.14586527439319252, "grad_norm": 3.390625, "learning_rate": 9.98562819897256e-06, "loss": 0.99633913, "memory(GiB)": 733.28, "step": 5750, "train_speed(iter/s)": 0.469671 }, { "acc": 0.75445051, "epoch": 0.1459921137622301, "grad_norm": 4.40625, "learning_rate": 9.985548637374679e-06, "loss": 0.95945129, "memory(GiB)": 733.28, "step": 5755, "train_speed(iter/s)": 0.467238 }, { "acc": 0.73866854, "epoch": 0.14611895313126766, "grad_norm": 3.453125, "learning_rate": 9.985468856478395e-06, "loss": 1.037887, "memory(GiB)": 733.28, "step": 5760, "train_speed(iter/s)": 0.465047 }, { "acc": 0.74267216, "epoch": 0.1462457925003052, "grad_norm": 3.375, "learning_rate": 9.985388856287224e-06, "loss": 1.00412626, "memory(GiB)": 733.28, "step": 5765, "train_speed(iter/s)": 0.462877 }, { "acc": 0.74767861, "epoch": 0.14637263186934277, "grad_norm": 4.0625, "learning_rate": 9.98530863680468e-06, "loss": 0.97153769, "memory(GiB)": 733.28, "step": 5770, "train_speed(iter/s)": 0.460724 }, { "acc": 0.75791283, "epoch": 0.14649947123838034, "grad_norm": 3.6875, "learning_rate": 9.985228198034294e-06, "loss": 0.99684229, "memory(GiB)": 733.28, "step": 5775, "train_speed(iter/s)": 0.458817 }, { "acc": 0.75485692, "epoch": 0.14662631060741788, "grad_norm": 3.421875, "learning_rate": 9.985147539979603e-06, "loss": 0.96192303, "memory(GiB)": 733.28, "step": 5780, "train_speed(iter/s)": 0.456763 }, { "acc": 0.76672421, "epoch": 0.14675314997645544, "grad_norm": 4.125, "learning_rate": 9.985066662644155e-06, "loss": 0.95805397, "memory(GiB)": 733.28, "step": 5785, "train_speed(iter/s)": 0.454894 }, { "acc": 0.75643101, "epoch": 0.146879989345493, "grad_norm": 7.75, "learning_rate": 9.98498556603151e-06, "loss": 0.97905846, "memory(GiB)": 733.28, "step": 5790, "train_speed(iter/s)": 0.452905 }, { "acc": 0.75485559, "epoch": 0.14700682871453055, "grad_norm": 3.640625, "learning_rate": 9.98490425014523e-06, "loss": 0.95546284, "memory(GiB)": 733.28, "step": 5795, "train_speed(iter/s)": 0.451066 }, { "acc": 0.72558632, "epoch": 0.14713366808356812, "grad_norm": 3.859375, "learning_rate": 9.984822714988896e-06, "loss": 1.02929335, "memory(GiB)": 733.28, "step": 5800, "train_speed(iter/s)": 0.449325 }, { "acc": 0.74472766, "epoch": 0.1472605074526057, "grad_norm": 3.375, "learning_rate": 9.984740960566095e-06, "loss": 1.01449881, "memory(GiB)": 733.28, "step": 5805, "train_speed(iter/s)": 0.447423 }, { "acc": 0.73379211, "epoch": 0.14738734682164323, "grad_norm": 3.40625, "learning_rate": 9.98465898688042e-06, "loss": 1.02026882, "memory(GiB)": 733.28, "step": 5810, "train_speed(iter/s)": 0.445592 }, { "acc": 0.75707035, "epoch": 0.1475141861906808, "grad_norm": 3.328125, "learning_rate": 9.98457679393548e-06, "loss": 0.98314037, "memory(GiB)": 733.28, "step": 5815, "train_speed(iter/s)": 0.443622 }, { "acc": 0.7465857, "epoch": 0.14764102555971836, "grad_norm": 4.375, "learning_rate": 9.984494381734885e-06, "loss": 1.01894722, "memory(GiB)": 733.28, "step": 5820, "train_speed(iter/s)": 0.441927 }, { "acc": 0.74804506, "epoch": 0.1477678649287559, "grad_norm": 4.4375, "learning_rate": 9.984411750282266e-06, "loss": 1.01231375, "memory(GiB)": 733.28, "step": 5825, "train_speed(iter/s)": 0.440232 }, { "acc": 0.75132923, "epoch": 0.14789470429779347, "grad_norm": 4.0625, "learning_rate": 9.984328899581255e-06, "loss": 0.9289115, "memory(GiB)": 733.28, "step": 5830, "train_speed(iter/s)": 0.438481 }, { "acc": 0.73721414, "epoch": 0.14802154366683104, "grad_norm": 3.59375, "learning_rate": 9.984245829635497e-06, "loss": 0.99680471, "memory(GiB)": 733.28, "step": 5835, "train_speed(iter/s)": 0.436544 }, { "acc": 0.75196781, "epoch": 0.14814838303586858, "grad_norm": 3.75, "learning_rate": 9.984162540448646e-06, "loss": 0.97238951, "memory(GiB)": 733.28, "step": 5840, "train_speed(iter/s)": 0.434808 }, { "acc": 0.75241737, "epoch": 0.14827522240490615, "grad_norm": 4.03125, "learning_rate": 9.984079032024365e-06, "loss": 0.98890753, "memory(GiB)": 733.28, "step": 5845, "train_speed(iter/s)": 0.433226 }, { "acc": 0.74966216, "epoch": 0.14840206177394372, "grad_norm": 3.828125, "learning_rate": 9.983995304366327e-06, "loss": 0.9518652, "memory(GiB)": 733.28, "step": 5850, "train_speed(iter/s)": 0.431341 }, { "acc": 0.75837283, "epoch": 0.14852890114298126, "grad_norm": 2.9375, "learning_rate": 9.983911357478217e-06, "loss": 0.89550667, "memory(GiB)": 733.28, "step": 5855, "train_speed(iter/s)": 0.429542 }, { "acc": 0.75168357, "epoch": 0.14865574051201882, "grad_norm": 4.4375, "learning_rate": 9.983827191363726e-06, "loss": 1.03048925, "memory(GiB)": 733.28, "step": 5860, "train_speed(iter/s)": 0.427792 }, { "acc": 0.75339718, "epoch": 0.1487825798810564, "grad_norm": 3.859375, "learning_rate": 9.983742806026555e-06, "loss": 1.00587215, "memory(GiB)": 733.28, "step": 5865, "train_speed(iter/s)": 0.425855 }, { "acc": 0.7611371, "epoch": 0.14890941925009393, "grad_norm": 5.375, "learning_rate": 9.98365820147042e-06, "loss": 0.95586872, "memory(GiB)": 733.28, "step": 5870, "train_speed(iter/s)": 0.424405 }, { "acc": 0.74961905, "epoch": 0.1490362586191315, "grad_norm": 4.46875, "learning_rate": 9.983573377699037e-06, "loss": 0.99970446, "memory(GiB)": 733.28, "step": 5875, "train_speed(iter/s)": 0.422823 }, { "acc": 0.76440015, "epoch": 0.14916309798816907, "grad_norm": 3.09375, "learning_rate": 9.983488334716139e-06, "loss": 0.95284243, "memory(GiB)": 733.28, "step": 5880, "train_speed(iter/s)": 0.421231 }, { "acc": 0.76123362, "epoch": 0.1492899373572066, "grad_norm": 3.296875, "learning_rate": 9.983403072525468e-06, "loss": 0.96045752, "memory(GiB)": 733.28, "step": 5885, "train_speed(iter/s)": 0.419769 }, { "acc": 0.7459178, "epoch": 0.14941677672624418, "grad_norm": 3.734375, "learning_rate": 9.983317591130775e-06, "loss": 1.08631649, "memory(GiB)": 733.28, "step": 5890, "train_speed(iter/s)": 0.418148 }, { "acc": 0.75005875, "epoch": 0.14954361609528175, "grad_norm": 3.375, "learning_rate": 9.983231890535818e-06, "loss": 1.00282097, "memory(GiB)": 733.28, "step": 5895, "train_speed(iter/s)": 0.416561 }, { "acc": 0.73885193, "epoch": 0.14967045546431929, "grad_norm": 4.21875, "learning_rate": 9.98314597074437e-06, "loss": 1.00325413, "memory(GiB)": 733.28, "step": 5900, "train_speed(iter/s)": 0.415062 }, { "acc": 0.75007477, "epoch": 0.14979729483335685, "grad_norm": 32.0, "learning_rate": 9.983059831760205e-06, "loss": 1.02001371, "memory(GiB)": 733.28, "step": 5905, "train_speed(iter/s)": 0.413626 }, { "acc": 0.76178975, "epoch": 0.14992413420239442, "grad_norm": 4.28125, "learning_rate": 9.982973473587117e-06, "loss": 0.93752584, "memory(GiB)": 733.28, "step": 5910, "train_speed(iter/s)": 0.412031 }, { "acc": 0.75406981, "epoch": 0.15005097357143196, "grad_norm": 3.921875, "learning_rate": 9.982886896228903e-06, "loss": 0.98652935, "memory(GiB)": 733.28, "step": 5915, "train_speed(iter/s)": 0.410409 }, { "acc": 0.75283527, "epoch": 0.15017781294046953, "grad_norm": 3.171875, "learning_rate": 9.98280009968937e-06, "loss": 0.99986782, "memory(GiB)": 733.28, "step": 5920, "train_speed(iter/s)": 0.408868 }, { "acc": 0.74817348, "epoch": 0.1503046523095071, "grad_norm": 3.5625, "learning_rate": 9.982713083972335e-06, "loss": 0.98670292, "memory(GiB)": 733.28, "step": 5925, "train_speed(iter/s)": 0.407266 }, { "acc": 0.74876943, "epoch": 0.15043149167854464, "grad_norm": 3.359375, "learning_rate": 9.98262584908163e-06, "loss": 0.97073936, "memory(GiB)": 733.28, "step": 5930, "train_speed(iter/s)": 0.40562 }, { "acc": 0.74198127, "epoch": 0.1505583310475822, "grad_norm": 3.390625, "learning_rate": 9.982538395021088e-06, "loss": 0.97874565, "memory(GiB)": 733.28, "step": 5935, "train_speed(iter/s)": 0.404114 }, { "acc": 0.74680223, "epoch": 0.15068517041661977, "grad_norm": 4.84375, "learning_rate": 9.982450721794558e-06, "loss": 1.0378829, "memory(GiB)": 733.28, "step": 5940, "train_speed(iter/s)": 0.402702 }, { "acc": 0.76224866, "epoch": 0.1508120097856573, "grad_norm": 3.921875, "learning_rate": 9.982362829405896e-06, "loss": 0.97271881, "memory(GiB)": 733.28, "step": 5945, "train_speed(iter/s)": 0.401287 }, { "acc": 0.75076108, "epoch": 0.15093884915469488, "grad_norm": 3.734375, "learning_rate": 9.982274717858966e-06, "loss": 1.01043615, "memory(GiB)": 733.28, "step": 5950, "train_speed(iter/s)": 0.39965 }, { "acc": 0.75363326, "epoch": 0.15106568852373245, "grad_norm": 3.953125, "learning_rate": 9.982186387157648e-06, "loss": 0.93773403, "memory(GiB)": 733.28, "step": 5955, "train_speed(iter/s)": 0.398261 }, { "acc": 0.73748126, "epoch": 0.15119252789277, "grad_norm": 3.3125, "learning_rate": 9.982097837305825e-06, "loss": 0.99364977, "memory(GiB)": 733.28, "step": 5960, "train_speed(iter/s)": 0.396897 }, { "acc": 0.74658904, "epoch": 0.15131936726180756, "grad_norm": 3.984375, "learning_rate": 9.98200906830739e-06, "loss": 1.01888905, "memory(GiB)": 733.28, "step": 5965, "train_speed(iter/s)": 0.395347 }, { "acc": 0.75300841, "epoch": 0.15144620663084513, "grad_norm": 4.40625, "learning_rate": 9.981920080166252e-06, "loss": 0.9220994, "memory(GiB)": 733.28, "step": 5970, "train_speed(iter/s)": 0.393909 }, { "acc": 0.75440702, "epoch": 0.15157304599988267, "grad_norm": 3.578125, "learning_rate": 9.981830872886323e-06, "loss": 0.95081768, "memory(GiB)": 733.28, "step": 5975, "train_speed(iter/s)": 0.392647 }, { "acc": 0.74606218, "epoch": 0.15169988536892023, "grad_norm": 3.5, "learning_rate": 9.981741446471524e-06, "loss": 1.00480738, "memory(GiB)": 733.28, "step": 5980, "train_speed(iter/s)": 0.391312 }, { "acc": 0.75563245, "epoch": 0.1518267247379578, "grad_norm": 3.53125, "learning_rate": 9.981651800925794e-06, "loss": 0.94699335, "memory(GiB)": 733.28, "step": 5985, "train_speed(iter/s)": 0.390128 }, { "acc": 0.76710815, "epoch": 0.15195356410699534, "grad_norm": 4.34375, "learning_rate": 9.981561936253073e-06, "loss": 0.94009237, "memory(GiB)": 733.28, "step": 5990, "train_speed(iter/s)": 0.388852 }, { "acc": 0.73774152, "epoch": 0.1520804034760329, "grad_norm": 3.59375, "learning_rate": 9.981471852457316e-06, "loss": 1.03294706, "memory(GiB)": 733.28, "step": 5995, "train_speed(iter/s)": 0.387402 }, { "acc": 0.74578142, "epoch": 0.15220724284507048, "grad_norm": 3.921875, "learning_rate": 9.981381549542483e-06, "loss": 1.03127499, "memory(GiB)": 733.28, "step": 6000, "train_speed(iter/s)": 0.385846 }, { "epoch": 0.15220724284507048, "eval_acc": 0.7411313222293567, "eval_loss": 0.9432640671730042, "eval_runtime": 1147.6345, "eval_samples_per_second": 5.551, "eval_steps_per_second": 5.551, "step": 6000 }, { "acc": 0.74667768, "epoch": 0.15233408221410802, "grad_norm": 4.15625, "learning_rate": 9.981291027512547e-06, "loss": 1.00049992, "memory(GiB)": 733.28, "step": 6005, "train_speed(iter/s)": 0.344767 }, { "acc": 0.74779501, "epoch": 0.1524609215831456, "grad_norm": 3.921875, "learning_rate": 9.98120028637149e-06, "loss": 0.98892403, "memory(GiB)": 733.28, "step": 6010, "train_speed(iter/s)": 0.343737 }, { "acc": 0.74822383, "epoch": 0.15258776095218315, "grad_norm": 3.703125, "learning_rate": 9.981109326123305e-06, "loss": 1.01399574, "memory(GiB)": 733.28, "step": 6015, "train_speed(iter/s)": 0.342821 }, { "acc": 0.76016836, "epoch": 0.1527146003212207, "grad_norm": 3.90625, "learning_rate": 9.98101814677199e-06, "loss": 0.9492301, "memory(GiB)": 733.28, "step": 6020, "train_speed(iter/s)": 0.341995 }, { "acc": 0.7488502, "epoch": 0.15284143969025826, "grad_norm": 3.3125, "learning_rate": 9.980926748321558e-06, "loss": 0.9372159, "memory(GiB)": 733.28, "step": 6025, "train_speed(iter/s)": 0.340957 }, { "acc": 0.75986738, "epoch": 0.15296827905929583, "grad_norm": 3.359375, "learning_rate": 9.980835130776029e-06, "loss": 0.92164421, "memory(GiB)": 733.28, "step": 6030, "train_speed(iter/s)": 0.339951 }, { "acc": 0.76253128, "epoch": 0.15309511842833337, "grad_norm": 3.703125, "learning_rate": 9.980743294139432e-06, "loss": 0.96275539, "memory(GiB)": 733.28, "step": 6035, "train_speed(iter/s)": 0.338879 }, { "acc": 0.74217606, "epoch": 0.15322195779737094, "grad_norm": 3.515625, "learning_rate": 9.980651238415805e-06, "loss": 1.05777569, "memory(GiB)": 733.28, "step": 6040, "train_speed(iter/s)": 0.337891 }, { "acc": 0.75920153, "epoch": 0.1533487971664085, "grad_norm": 3.234375, "learning_rate": 9.980558963609203e-06, "loss": 0.98766975, "memory(GiB)": 733.28, "step": 6045, "train_speed(iter/s)": 0.336891 }, { "acc": 0.73601098, "epoch": 0.15347563653544605, "grad_norm": 3.765625, "learning_rate": 9.98046646972368e-06, "loss": 1.0340435, "memory(GiB)": 733.28, "step": 6050, "train_speed(iter/s)": 0.335979 }, { "acc": 0.73280487, "epoch": 0.15360247590448362, "grad_norm": 4.40625, "learning_rate": 9.980373756763304e-06, "loss": 0.98169832, "memory(GiB)": 746.04, "step": 6055, "train_speed(iter/s)": 0.334967 }, { "acc": 0.75084038, "epoch": 0.15372931527352118, "grad_norm": 4.65625, "learning_rate": 9.980280824732156e-06, "loss": 1.00089655, "memory(GiB)": 746.04, "step": 6060, "train_speed(iter/s)": 0.334077 }, { "acc": 0.74956126, "epoch": 0.15385615464255872, "grad_norm": 3.46875, "learning_rate": 9.980187673634323e-06, "loss": 0.98124809, "memory(GiB)": 746.04, "step": 6065, "train_speed(iter/s)": 0.333175 }, { "acc": 0.76056967, "epoch": 0.1539829940115963, "grad_norm": 3.703125, "learning_rate": 9.980094303473902e-06, "loss": 0.93299255, "memory(GiB)": 746.04, "step": 6070, "train_speed(iter/s)": 0.332205 }, { "acc": 0.75733962, "epoch": 0.15410983338063386, "grad_norm": 3.96875, "learning_rate": 9.980000714255001e-06, "loss": 0.98405399, "memory(GiB)": 746.04, "step": 6075, "train_speed(iter/s)": 0.331273 }, { "acc": 0.74732099, "epoch": 0.1542366727496714, "grad_norm": 3.859375, "learning_rate": 9.979906905981733e-06, "loss": 0.97986355, "memory(GiB)": 746.04, "step": 6080, "train_speed(iter/s)": 0.330418 }, { "acc": 0.75562444, "epoch": 0.15436351211870897, "grad_norm": 4.0625, "learning_rate": 9.979812878658229e-06, "loss": 0.94972048, "memory(GiB)": 746.04, "step": 6085, "train_speed(iter/s)": 0.329545 }, { "acc": 0.74398723, "epoch": 0.15449035148774654, "grad_norm": 4.28125, "learning_rate": 9.979718632288624e-06, "loss": 0.9478158, "memory(GiB)": 746.04, "step": 6090, "train_speed(iter/s)": 0.328637 }, { "acc": 0.75158129, "epoch": 0.15461719085678408, "grad_norm": 3.53125, "learning_rate": 9.979624166877062e-06, "loss": 1.03568621, "memory(GiB)": 746.04, "step": 6095, "train_speed(iter/s)": 0.32763 }, { "acc": 0.74333029, "epoch": 0.15474403022582164, "grad_norm": 4.28125, "learning_rate": 9.979529482427699e-06, "loss": 1.03074188, "memory(GiB)": 746.04, "step": 6100, "train_speed(iter/s)": 0.326769 }, { "acc": 0.75715327, "epoch": 0.1548708695948592, "grad_norm": 4.09375, "learning_rate": 9.9794345789447e-06, "loss": 0.92855873, "memory(GiB)": 746.04, "step": 6105, "train_speed(iter/s)": 0.325771 }, { "acc": 0.75217085, "epoch": 0.15499770896389675, "grad_norm": 3.46875, "learning_rate": 9.979339456432238e-06, "loss": 0.93017721, "memory(GiB)": 746.04, "step": 6110, "train_speed(iter/s)": 0.324797 }, { "acc": 0.74324374, "epoch": 0.15512454833293432, "grad_norm": 3.0625, "learning_rate": 9.9792441148945e-06, "loss": 0.98471146, "memory(GiB)": 746.04, "step": 6115, "train_speed(iter/s)": 0.323963 }, { "acc": 0.74107456, "epoch": 0.1552513877019719, "grad_norm": 3.4375, "learning_rate": 9.979148554335676e-06, "loss": 1.00685854, "memory(GiB)": 746.04, "step": 6120, "train_speed(iter/s)": 0.323148 }, { "acc": 0.74294257, "epoch": 0.15537822707100943, "grad_norm": 3.859375, "learning_rate": 9.979052774759974e-06, "loss": 1.00761442, "memory(GiB)": 746.04, "step": 6125, "train_speed(iter/s)": 0.322305 }, { "acc": 0.74747128, "epoch": 0.155505066440047, "grad_norm": 3.375, "learning_rate": 9.978956776171603e-06, "loss": 1.02359238, "memory(GiB)": 746.04, "step": 6130, "train_speed(iter/s)": 0.321456 }, { "acc": 0.76041489, "epoch": 0.15563190580908456, "grad_norm": 3.90625, "learning_rate": 9.978860558574789e-06, "loss": 0.95023117, "memory(GiB)": 746.04, "step": 6135, "train_speed(iter/s)": 0.320597 }, { "acc": 0.74935222, "epoch": 0.1557587451781221, "grad_norm": 4.5, "learning_rate": 9.97876412197376e-06, "loss": 0.99190149, "memory(GiB)": 746.04, "step": 6140, "train_speed(iter/s)": 0.319698 }, { "acc": 0.75636611, "epoch": 0.15588558454715967, "grad_norm": 3.390625, "learning_rate": 9.978667466372764e-06, "loss": 0.95491619, "memory(GiB)": 746.04, "step": 6145, "train_speed(iter/s)": 0.318914 }, { "acc": 0.76299605, "epoch": 0.15601242391619724, "grad_norm": 4.09375, "learning_rate": 9.978570591776046e-06, "loss": 0.96408386, "memory(GiB)": 746.04, "step": 6150, "train_speed(iter/s)": 0.318034 }, { "acc": 0.73896151, "epoch": 0.15613926328523478, "grad_norm": 3.40625, "learning_rate": 9.978473498187873e-06, "loss": 0.96658049, "memory(GiB)": 746.04, "step": 6155, "train_speed(iter/s)": 0.317141 }, { "acc": 0.75253973, "epoch": 0.15626610265427235, "grad_norm": 4.15625, "learning_rate": 9.978376185612512e-06, "loss": 0.97212467, "memory(GiB)": 746.04, "step": 6160, "train_speed(iter/s)": 0.316046 }, { "acc": 0.77233868, "epoch": 0.15639294202330992, "grad_norm": 4.28125, "learning_rate": 9.978278654054245e-06, "loss": 0.94327412, "memory(GiB)": 746.04, "step": 6165, "train_speed(iter/s)": 0.315263 }, { "acc": 0.76666417, "epoch": 0.15651978139234746, "grad_norm": 3.328125, "learning_rate": 9.978180903517362e-06, "loss": 0.93931408, "memory(GiB)": 746.04, "step": 6170, "train_speed(iter/s)": 0.314404 }, { "acc": 0.73833399, "epoch": 0.15664662076138502, "grad_norm": 3.03125, "learning_rate": 9.97808293400616e-06, "loss": 1.05496845, "memory(GiB)": 746.04, "step": 6175, "train_speed(iter/s)": 0.313505 }, { "acc": 0.75602994, "epoch": 0.1567734601304226, "grad_norm": 4.25, "learning_rate": 9.977984745524955e-06, "loss": 0.9375515, "memory(GiB)": 746.04, "step": 6180, "train_speed(iter/s)": 0.312721 }, { "acc": 0.74517913, "epoch": 0.15690029949946013, "grad_norm": 3.921875, "learning_rate": 9.97788633807806e-06, "loss": 1.02068243, "memory(GiB)": 746.04, "step": 6185, "train_speed(iter/s)": 0.311941 }, { "acc": 0.73888669, "epoch": 0.1570271388684977, "grad_norm": 3.1875, "learning_rate": 9.977787711669805e-06, "loss": 1.05766401, "memory(GiB)": 746.04, "step": 6190, "train_speed(iter/s)": 0.311178 }, { "acc": 0.75078254, "epoch": 0.15715397823753527, "grad_norm": 3.875, "learning_rate": 9.977688866304529e-06, "loss": 0.99435625, "memory(GiB)": 746.04, "step": 6195, "train_speed(iter/s)": 0.31037 }, { "acc": 0.75112638, "epoch": 0.1572808176065728, "grad_norm": 4.46875, "learning_rate": 9.97758980198658e-06, "loss": 0.97208939, "memory(GiB)": 746.04, "step": 6200, "train_speed(iter/s)": 0.309497 }, { "acc": 0.74617739, "epoch": 0.15740765697561038, "grad_norm": 3.59375, "learning_rate": 9.977490518720315e-06, "loss": 0.97860975, "memory(GiB)": 746.04, "step": 6205, "train_speed(iter/s)": 0.308619 }, { "acc": 0.74776192, "epoch": 0.15753449634464795, "grad_norm": 3.6875, "learning_rate": 9.977391016510101e-06, "loss": 0.99683065, "memory(GiB)": 746.04, "step": 6210, "train_speed(iter/s)": 0.307845 }, { "acc": 0.75554657, "epoch": 0.15766133571368549, "grad_norm": 3.75, "learning_rate": 9.977291295360316e-06, "loss": 0.93762722, "memory(GiB)": 746.04, "step": 6215, "train_speed(iter/s)": 0.307082 }, { "acc": 0.73784771, "epoch": 0.15778817508272305, "grad_norm": 4.0625, "learning_rate": 9.977191355275344e-06, "loss": 1.03510666, "memory(GiB)": 746.04, "step": 6220, "train_speed(iter/s)": 0.306341 }, { "acc": 0.75600634, "epoch": 0.15791501445176062, "grad_norm": 4.0625, "learning_rate": 9.977091196259584e-06, "loss": 0.98927498, "memory(GiB)": 746.04, "step": 6225, "train_speed(iter/s)": 0.305705 }, { "acc": 0.75087337, "epoch": 0.15804185382079816, "grad_norm": 4.03125, "learning_rate": 9.97699081831744e-06, "loss": 1.02526894, "memory(GiB)": 746.04, "step": 6230, "train_speed(iter/s)": 0.304862 }, { "acc": 0.75455117, "epoch": 0.15816869318983573, "grad_norm": 3.21875, "learning_rate": 9.976890221453327e-06, "loss": 0.96489334, "memory(GiB)": 746.04, "step": 6235, "train_speed(iter/s)": 0.304192 }, { "acc": 0.76661305, "epoch": 0.1582955325588733, "grad_norm": 3.78125, "learning_rate": 9.97678940567167e-06, "loss": 0.90802078, "memory(GiB)": 746.04, "step": 6240, "train_speed(iter/s)": 0.30347 }, { "acc": 0.75832205, "epoch": 0.15842237192791084, "grad_norm": 3.421875, "learning_rate": 9.976688370976906e-06, "loss": 0.9270113, "memory(GiB)": 746.04, "step": 6245, "train_speed(iter/s)": 0.302614 }, { "acc": 0.74880052, "epoch": 0.1585492112969484, "grad_norm": 5.4375, "learning_rate": 9.976587117373476e-06, "loss": 1.01334391, "memory(GiB)": 746.04, "step": 6250, "train_speed(iter/s)": 0.301946 }, { "acc": 0.75074248, "epoch": 0.15867605066598597, "grad_norm": 3.921875, "learning_rate": 9.976485644865835e-06, "loss": 0.94647579, "memory(GiB)": 746.04, "step": 6255, "train_speed(iter/s)": 0.30115 }, { "acc": 0.75355024, "epoch": 0.15880289003502351, "grad_norm": 3.6875, "learning_rate": 9.976383953458446e-06, "loss": 0.95775318, "memory(GiB)": 746.04, "step": 6260, "train_speed(iter/s)": 0.300479 }, { "acc": 0.75452971, "epoch": 0.15892972940406108, "grad_norm": 3.828125, "learning_rate": 9.976282043155785e-06, "loss": 1.01098557, "memory(GiB)": 746.04, "step": 6265, "train_speed(iter/s)": 0.299863 }, { "acc": 0.75601797, "epoch": 0.15905656877309865, "grad_norm": 4.3125, "learning_rate": 9.97617991396233e-06, "loss": 0.98445692, "memory(GiB)": 746.04, "step": 6270, "train_speed(iter/s)": 0.299256 }, { "acc": 0.75660553, "epoch": 0.1591834081421362, "grad_norm": 2.953125, "learning_rate": 9.976077565882576e-06, "loss": 0.96159124, "memory(GiB)": 746.04, "step": 6275, "train_speed(iter/s)": 0.298493 }, { "acc": 0.73819265, "epoch": 0.15931024751117376, "grad_norm": 3.34375, "learning_rate": 9.975974998921025e-06, "loss": 1.08361082, "memory(GiB)": 746.04, "step": 6280, "train_speed(iter/s)": 0.297658 }, { "acc": 0.74130526, "epoch": 0.15943708688021133, "grad_norm": 3.859375, "learning_rate": 9.975872213082189e-06, "loss": 1.04125471, "memory(GiB)": 746.04, "step": 6285, "train_speed(iter/s)": 0.296955 }, { "acc": 0.74992671, "epoch": 0.15956392624924887, "grad_norm": 4.125, "learning_rate": 9.975769208370587e-06, "loss": 0.99230909, "memory(GiB)": 746.04, "step": 6290, "train_speed(iter/s)": 0.296241 }, { "acc": 0.75768638, "epoch": 0.15969076561828643, "grad_norm": 3.671875, "learning_rate": 9.975665984790753e-06, "loss": 0.93900518, "memory(GiB)": 746.04, "step": 6295, "train_speed(iter/s)": 0.295533 }, { "acc": 0.74551992, "epoch": 0.159817604987324, "grad_norm": 4.5625, "learning_rate": 9.975562542347223e-06, "loss": 1.00750904, "memory(GiB)": 746.04, "step": 6300, "train_speed(iter/s)": 0.29488 }, { "acc": 0.75232887, "epoch": 0.15994444435636154, "grad_norm": 3.5625, "learning_rate": 9.975458881044554e-06, "loss": 0.93622036, "memory(GiB)": 746.04, "step": 6305, "train_speed(iter/s)": 0.294122 }, { "acc": 0.75141711, "epoch": 0.1600712837253991, "grad_norm": 4.28125, "learning_rate": 9.975355000887297e-06, "loss": 1.01006632, "memory(GiB)": 746.04, "step": 6310, "train_speed(iter/s)": 0.293482 }, { "acc": 0.74520168, "epoch": 0.16019812309443668, "grad_norm": 3.453125, "learning_rate": 9.975250901880029e-06, "loss": 0.96835003, "memory(GiB)": 746.04, "step": 6315, "train_speed(iter/s)": 0.292673 }, { "acc": 0.7437952, "epoch": 0.16032496246347422, "grad_norm": 3.296875, "learning_rate": 9.975146584027324e-06, "loss": 0.97352695, "memory(GiB)": 746.04, "step": 6320, "train_speed(iter/s)": 0.292041 }, { "acc": 0.75789866, "epoch": 0.1604518018325118, "grad_norm": 3.703125, "learning_rate": 9.975042047333773e-06, "loss": 0.96337366, "memory(GiB)": 746.04, "step": 6325, "train_speed(iter/s)": 0.291366 }, { "acc": 0.75992475, "epoch": 0.16057864120154935, "grad_norm": 3.609375, "learning_rate": 9.974937291803975e-06, "loss": 0.93397503, "memory(GiB)": 746.04, "step": 6330, "train_speed(iter/s)": 0.290708 }, { "acc": 0.76007104, "epoch": 0.1607054805705869, "grad_norm": 3.125, "learning_rate": 9.974832317442537e-06, "loss": 0.99259348, "memory(GiB)": 746.04, "step": 6335, "train_speed(iter/s)": 0.29008 }, { "acc": 0.75002313, "epoch": 0.16083231993962446, "grad_norm": 4.78125, "learning_rate": 9.974727124254075e-06, "loss": 0.93126707, "memory(GiB)": 746.04, "step": 6340, "train_speed(iter/s)": 0.289513 }, { "acc": 0.7493587, "epoch": 0.16095915930866203, "grad_norm": 3.265625, "learning_rate": 9.974621712243217e-06, "loss": 0.98997869, "memory(GiB)": 746.04, "step": 6345, "train_speed(iter/s)": 0.288751 }, { "acc": 0.76479664, "epoch": 0.16108599867769957, "grad_norm": 3.390625, "learning_rate": 9.9745160814146e-06, "loss": 0.925739, "memory(GiB)": 746.04, "step": 6350, "train_speed(iter/s)": 0.28814 }, { "acc": 0.75061941, "epoch": 0.16121283804673714, "grad_norm": 4.53125, "learning_rate": 9.974410231772871e-06, "loss": 0.99884729, "memory(GiB)": 746.04, "step": 6355, "train_speed(iter/s)": 0.287399 }, { "acc": 0.74617219, "epoch": 0.1613396774157747, "grad_norm": 4.09375, "learning_rate": 9.974304163322685e-06, "loss": 0.98507099, "memory(GiB)": 746.04, "step": 6360, "train_speed(iter/s)": 0.286761 }, { "acc": 0.76068797, "epoch": 0.16146651678481225, "grad_norm": 3.5, "learning_rate": 9.97419787606871e-06, "loss": 0.96495504, "memory(GiB)": 746.04, "step": 6365, "train_speed(iter/s)": 0.286139 }, { "acc": 0.75384526, "epoch": 0.16159335615384982, "grad_norm": 3.6875, "learning_rate": 9.974091370015617e-06, "loss": 0.95059795, "memory(GiB)": 746.04, "step": 6370, "train_speed(iter/s)": 0.285518 }, { "acc": 0.75936747, "epoch": 0.16172019552288738, "grad_norm": 3.515625, "learning_rate": 9.973984645168093e-06, "loss": 0.96988478, "memory(GiB)": 746.04, "step": 6375, "train_speed(iter/s)": 0.28485 }, { "acc": 0.76375499, "epoch": 0.16184703489192492, "grad_norm": 3.46875, "learning_rate": 9.973877701530832e-06, "loss": 0.92991829, "memory(GiB)": 746.04, "step": 6380, "train_speed(iter/s)": 0.284103 }, { "acc": 0.75417619, "epoch": 0.1619738742609625, "grad_norm": 4.46875, "learning_rate": 9.97377053910854e-06, "loss": 0.98734779, "memory(GiB)": 746.04, "step": 6385, "train_speed(iter/s)": 0.28356 }, { "acc": 0.74135985, "epoch": 0.16210071363000006, "grad_norm": 4.0625, "learning_rate": 9.973663157905928e-06, "loss": 1.02208357, "memory(GiB)": 746.04, "step": 6390, "train_speed(iter/s)": 0.282864 }, { "acc": 0.74713187, "epoch": 0.1622275529990376, "grad_norm": 4.53125, "learning_rate": 9.973555557927723e-06, "loss": 1.00926313, "memory(GiB)": 746.04, "step": 6395, "train_speed(iter/s)": 0.282294 }, { "acc": 0.75148339, "epoch": 0.16235439236807517, "grad_norm": 3.796875, "learning_rate": 9.973447739178655e-06, "loss": 0.96272688, "memory(GiB)": 746.04, "step": 6400, "train_speed(iter/s)": 0.281687 }, { "acc": 0.76285372, "epoch": 0.16248123173711274, "grad_norm": 3.53125, "learning_rate": 9.973339701663465e-06, "loss": 0.95139961, "memory(GiB)": 746.04, "step": 6405, "train_speed(iter/s)": 0.281124 }, { "acc": 0.7662024, "epoch": 0.16260807110615028, "grad_norm": 3.421875, "learning_rate": 9.973231445386911e-06, "loss": 0.90011292, "memory(GiB)": 746.04, "step": 6410, "train_speed(iter/s)": 0.280525 }, { "acc": 0.74945259, "epoch": 0.16273491047518784, "grad_norm": 3.515625, "learning_rate": 9.973122970353748e-06, "loss": 1.00152769, "memory(GiB)": 746.04, "step": 6415, "train_speed(iter/s)": 0.279816 }, { "acc": 0.74457784, "epoch": 0.1628617498442254, "grad_norm": 3.640625, "learning_rate": 9.973014276568754e-06, "loss": 1.04544315, "memory(GiB)": 746.04, "step": 6420, "train_speed(iter/s)": 0.279119 }, { "acc": 0.7564837, "epoch": 0.16298858921326295, "grad_norm": 4.0, "learning_rate": 9.972905364036705e-06, "loss": 0.95050755, "memory(GiB)": 746.04, "step": 6425, "train_speed(iter/s)": 0.278502 }, { "acc": 0.75613632, "epoch": 0.16311542858230052, "grad_norm": 3.65625, "learning_rate": 9.972796232762394e-06, "loss": 0.94878035, "memory(GiB)": 746.04, "step": 6430, "train_speed(iter/s)": 0.277898 }, { "acc": 0.75726399, "epoch": 0.1632422679513381, "grad_norm": 3.359375, "learning_rate": 9.972686882750621e-06, "loss": 0.93431273, "memory(GiB)": 746.04, "step": 6435, "train_speed(iter/s)": 0.277235 }, { "acc": 0.75727406, "epoch": 0.16336910732037563, "grad_norm": 4.09375, "learning_rate": 9.972577314006195e-06, "loss": 0.98566275, "memory(GiB)": 746.04, "step": 6440, "train_speed(iter/s)": 0.276635 }, { "acc": 0.75229044, "epoch": 0.1634959466894132, "grad_norm": 3.90625, "learning_rate": 9.972467526533938e-06, "loss": 0.9460165, "memory(GiB)": 746.04, "step": 6445, "train_speed(iter/s)": 0.276055 }, { "acc": 0.75258017, "epoch": 0.16362278605845076, "grad_norm": 3.5625, "learning_rate": 9.972357520338677e-06, "loss": 0.92934036, "memory(GiB)": 746.04, "step": 6450, "train_speed(iter/s)": 0.275479 }, { "acc": 0.75669885, "epoch": 0.1637496254274883, "grad_norm": 4.34375, "learning_rate": 9.972247295425252e-06, "loss": 0.99679403, "memory(GiB)": 746.04, "step": 6455, "train_speed(iter/s)": 0.274952 }, { "acc": 0.74419408, "epoch": 0.16387646479652587, "grad_norm": 5.71875, "learning_rate": 9.972136851798513e-06, "loss": 1.0175004, "memory(GiB)": 746.04, "step": 6460, "train_speed(iter/s)": 0.274366 }, { "acc": 0.75644455, "epoch": 0.16400330416556344, "grad_norm": 3.484375, "learning_rate": 9.972026189463313e-06, "loss": 0.98167048, "memory(GiB)": 746.04, "step": 6465, "train_speed(iter/s)": 0.273807 }, { "acc": 0.74075623, "epoch": 0.16413014353460098, "grad_norm": 3.1875, "learning_rate": 9.971915308424525e-06, "loss": 1.06470652, "memory(GiB)": 746.04, "step": 6470, "train_speed(iter/s)": 0.273237 }, { "acc": 0.74561338, "epoch": 0.16425698290363855, "grad_norm": 4.15625, "learning_rate": 9.971804208687023e-06, "loss": 0.99688482, "memory(GiB)": 746.04, "step": 6475, "train_speed(iter/s)": 0.272662 }, { "acc": 0.75205231, "epoch": 0.16438382227267612, "grad_norm": 3.171875, "learning_rate": 9.971692890255695e-06, "loss": 0.93190098, "memory(GiB)": 746.04, "step": 6480, "train_speed(iter/s)": 0.27201 }, { "acc": 0.75070143, "epoch": 0.16451066164171366, "grad_norm": 3.203125, "learning_rate": 9.971581353135436e-06, "loss": 0.9813386, "memory(GiB)": 746.04, "step": 6485, "train_speed(iter/s)": 0.271353 }, { "acc": 0.75473809, "epoch": 0.16463750101075122, "grad_norm": 3.171875, "learning_rate": 9.971469597331156e-06, "loss": 0.94590998, "memory(GiB)": 746.04, "step": 6490, "train_speed(iter/s)": 0.270846 }, { "acc": 0.75057726, "epoch": 0.1647643403797888, "grad_norm": 4.625, "learning_rate": 9.971357622847768e-06, "loss": 0.95873375, "memory(GiB)": 746.04, "step": 6495, "train_speed(iter/s)": 0.270379 }, { "acc": 0.75686345, "epoch": 0.16489117974882633, "grad_norm": 3.6875, "learning_rate": 9.971245429690197e-06, "loss": 0.94789505, "memory(GiB)": 746.04, "step": 6500, "train_speed(iter/s)": 0.269793 }, { "epoch": 0.16489117974882633, "eval_acc": 0.7417395914319267, "eval_loss": 0.9389312863349915, "eval_runtime": 1150.3164, "eval_samples_per_second": 5.538, "eval_steps_per_second": 5.538, "step": 6500 }, { "acc": 0.75876842, "epoch": 0.1650180191178639, "grad_norm": 5.34375, "learning_rate": 9.97113301786338e-06, "loss": 0.96014519, "memory(GiB)": 746.04, "step": 6505, "train_speed(iter/s)": 0.249786 }, { "acc": 0.75505919, "epoch": 0.16514485848690147, "grad_norm": 3.625, "learning_rate": 9.97102038737226e-06, "loss": 0.94769945, "memory(GiB)": 746.04, "step": 6510, "train_speed(iter/s)": 0.249358 }, { "acc": 0.76722436, "epoch": 0.165271697855939, "grad_norm": 4.5, "learning_rate": 9.970907538221793e-06, "loss": 0.93168287, "memory(GiB)": 746.04, "step": 6515, "train_speed(iter/s)": 0.248963 }, { "acc": 0.74569755, "epoch": 0.16539853722497658, "grad_norm": 3.4375, "learning_rate": 9.970794470416938e-06, "loss": 1.0036315, "memory(GiB)": 746.04, "step": 6520, "train_speed(iter/s)": 0.248532 }, { "acc": 0.76110029, "epoch": 0.16552537659401415, "grad_norm": 4.3125, "learning_rate": 9.970681183962677e-06, "loss": 0.96381016, "memory(GiB)": 746.04, "step": 6525, "train_speed(iter/s)": 0.248085 }, { "acc": 0.75611682, "epoch": 0.16565221596305169, "grad_norm": 3.15625, "learning_rate": 9.970567678863985e-06, "loss": 0.93363314, "memory(GiB)": 746.04, "step": 6530, "train_speed(iter/s)": 0.247626 }, { "acc": 0.76495633, "epoch": 0.16577905533208925, "grad_norm": 3.34375, "learning_rate": 9.970453955125858e-06, "loss": 0.91556034, "memory(GiB)": 746.04, "step": 6535, "train_speed(iter/s)": 0.247178 }, { "acc": 0.75666213, "epoch": 0.16590589470112682, "grad_norm": 3.96875, "learning_rate": 9.9703400127533e-06, "loss": 0.94611473, "memory(GiB)": 746.04, "step": 6540, "train_speed(iter/s)": 0.246737 }, { "acc": 0.74496722, "epoch": 0.16603273407016436, "grad_norm": 4.1875, "learning_rate": 9.97022585175132e-06, "loss": 0.98028154, "memory(GiB)": 746.04, "step": 6545, "train_speed(iter/s)": 0.246287 }, { "acc": 0.75789824, "epoch": 0.16615957343920193, "grad_norm": 3.4375, "learning_rate": 9.970111472124941e-06, "loss": 0.97502508, "memory(GiB)": 746.04, "step": 6550, "train_speed(iter/s)": 0.245811 }, { "acc": 0.74735723, "epoch": 0.1662864128082395, "grad_norm": 3.578125, "learning_rate": 9.969996873879196e-06, "loss": 0.99212732, "memory(GiB)": 746.04, "step": 6555, "train_speed(iter/s)": 0.245334 }, { "acc": 0.7664536, "epoch": 0.16641325217727704, "grad_norm": 4.03125, "learning_rate": 9.969882057019122e-06, "loss": 0.91447916, "memory(GiB)": 746.04, "step": 6560, "train_speed(iter/s)": 0.244887 }, { "acc": 0.7653059, "epoch": 0.1665400915463146, "grad_norm": 4.0, "learning_rate": 9.969767021549772e-06, "loss": 0.9399394, "memory(GiB)": 746.04, "step": 6565, "train_speed(iter/s)": 0.244504 }, { "acc": 0.77788591, "epoch": 0.16666693091535217, "grad_norm": 2.984375, "learning_rate": 9.969651767476206e-06, "loss": 0.90305481, "memory(GiB)": 746.04, "step": 6570, "train_speed(iter/s)": 0.244072 }, { "acc": 0.76419425, "epoch": 0.16679377028438971, "grad_norm": 3.546875, "learning_rate": 9.969536294803492e-06, "loss": 0.96879654, "memory(GiB)": 746.04, "step": 6575, "train_speed(iter/s)": 0.243651 }, { "acc": 0.74189377, "epoch": 0.16692060965342728, "grad_norm": 3.375, "learning_rate": 9.96942060353671e-06, "loss": 1.01339054, "memory(GiB)": 746.04, "step": 6580, "train_speed(iter/s)": 0.243223 }, { "acc": 0.76930146, "epoch": 0.16704744902246485, "grad_norm": 3.265625, "learning_rate": 9.96930469368095e-06, "loss": 0.91458702, "memory(GiB)": 746.04, "step": 6585, "train_speed(iter/s)": 0.242759 }, { "acc": 0.74102554, "epoch": 0.1671742883915024, "grad_norm": 4.1875, "learning_rate": 9.969188565241308e-06, "loss": 1.0295804, "memory(GiB)": 746.04, "step": 6590, "train_speed(iter/s)": 0.242378 }, { "acc": 0.74993949, "epoch": 0.16730112776053996, "grad_norm": 3.90625, "learning_rate": 9.969072218222895e-06, "loss": 1.00491199, "memory(GiB)": 746.04, "step": 6595, "train_speed(iter/s)": 0.241994 }, { "acc": 0.74039865, "epoch": 0.16742796712957753, "grad_norm": 3.421875, "learning_rate": 9.968955652630828e-06, "loss": 1.03865852, "memory(GiB)": 746.04, "step": 6600, "train_speed(iter/s)": 0.241636 }, { "acc": 0.75159526, "epoch": 0.16755480649861507, "grad_norm": 4.34375, "learning_rate": 9.968838868470234e-06, "loss": 1.02050562, "memory(GiB)": 746.04, "step": 6605, "train_speed(iter/s)": 0.241241 }, { "acc": 0.73892102, "epoch": 0.16768164586765263, "grad_norm": 4.28125, "learning_rate": 9.968721865746249e-06, "loss": 1.00847006, "memory(GiB)": 746.04, "step": 6610, "train_speed(iter/s)": 0.240835 }, { "acc": 0.74795313, "epoch": 0.1678084852366902, "grad_norm": 4.875, "learning_rate": 9.968604644464022e-06, "loss": 1.02207136, "memory(GiB)": 746.04, "step": 6615, "train_speed(iter/s)": 0.240391 }, { "acc": 0.74421182, "epoch": 0.16793532460572774, "grad_norm": 4.3125, "learning_rate": 9.968487204628707e-06, "loss": 1.03072186, "memory(GiB)": 746.04, "step": 6620, "train_speed(iter/s)": 0.239976 }, { "acc": 0.75521603, "epoch": 0.1680621639747653, "grad_norm": 3.25, "learning_rate": 9.968369546245469e-06, "loss": 0.9702507, "memory(GiB)": 746.04, "step": 6625, "train_speed(iter/s)": 0.239592 }, { "acc": 0.75641842, "epoch": 0.16818900334380288, "grad_norm": 3.640625, "learning_rate": 9.968251669319487e-06, "loss": 0.96216021, "memory(GiB)": 746.04, "step": 6630, "train_speed(iter/s)": 0.239203 }, { "acc": 0.75488248, "epoch": 0.16831584271284042, "grad_norm": 3.546875, "learning_rate": 9.968133573855942e-06, "loss": 0.90729866, "memory(GiB)": 746.04, "step": 6635, "train_speed(iter/s)": 0.238725 }, { "acc": 0.74140468, "epoch": 0.168442682081878, "grad_norm": 4.09375, "learning_rate": 9.968015259860031e-06, "loss": 0.96801291, "memory(GiB)": 746.04, "step": 6640, "train_speed(iter/s)": 0.238346 }, { "acc": 0.76571794, "epoch": 0.16856952145091555, "grad_norm": 3.671875, "learning_rate": 9.967896727336957e-06, "loss": 0.90658455, "memory(GiB)": 746.04, "step": 6645, "train_speed(iter/s)": 0.237943 }, { "acc": 0.74908195, "epoch": 0.1686963608199531, "grad_norm": 6.53125, "learning_rate": 9.967777976291937e-06, "loss": 0.99491053, "memory(GiB)": 746.04, "step": 6650, "train_speed(iter/s)": 0.23752 }, { "acc": 0.75336361, "epoch": 0.16882320018899066, "grad_norm": 3.5625, "learning_rate": 9.96765900673019e-06, "loss": 0.96741247, "memory(GiB)": 746.04, "step": 6655, "train_speed(iter/s)": 0.237128 }, { "acc": 0.75111294, "epoch": 0.16895003955802823, "grad_norm": 3.546875, "learning_rate": 9.967539818656953e-06, "loss": 0.95507078, "memory(GiB)": 746.04, "step": 6660, "train_speed(iter/s)": 0.236747 }, { "acc": 0.76656942, "epoch": 0.16907687892706577, "grad_norm": 3.703125, "learning_rate": 9.967420412077465e-06, "loss": 0.94250984, "memory(GiB)": 746.04, "step": 6665, "train_speed(iter/s)": 0.236425 }, { "acc": 0.75043283, "epoch": 0.16920371829610334, "grad_norm": 3.375, "learning_rate": 9.96730078699698e-06, "loss": 0.96228142, "memory(GiB)": 746.04, "step": 6670, "train_speed(iter/s)": 0.236046 }, { "acc": 0.75571218, "epoch": 0.1693305576651409, "grad_norm": 3.71875, "learning_rate": 9.967180943420762e-06, "loss": 0.96306772, "memory(GiB)": 746.04, "step": 6675, "train_speed(iter/s)": 0.235682 }, { "acc": 0.76652169, "epoch": 0.16945739703417845, "grad_norm": 3.328125, "learning_rate": 9.96706088135408e-06, "loss": 0.944065, "memory(GiB)": 746.04, "step": 6680, "train_speed(iter/s)": 0.235276 }, { "acc": 0.75361395, "epoch": 0.16958423640321602, "grad_norm": 4.125, "learning_rate": 9.966940600802217e-06, "loss": 0.95191641, "memory(GiB)": 746.04, "step": 6685, "train_speed(iter/s)": 0.234906 }, { "acc": 0.74697037, "epoch": 0.16971107577225358, "grad_norm": 3.65625, "learning_rate": 9.966820101770462e-06, "loss": 0.9916172, "memory(GiB)": 746.04, "step": 6690, "train_speed(iter/s)": 0.234565 }, { "acc": 0.7557476, "epoch": 0.16983791514129112, "grad_norm": 4.03125, "learning_rate": 9.966699384264114e-06, "loss": 0.9796401, "memory(GiB)": 746.04, "step": 6695, "train_speed(iter/s)": 0.234174 }, { "acc": 0.76530509, "epoch": 0.1699647545103287, "grad_norm": 4.28125, "learning_rate": 9.966578448288486e-06, "loss": 0.95276651, "memory(GiB)": 746.04, "step": 6700, "train_speed(iter/s)": 0.233806 }, { "acc": 0.75756092, "epoch": 0.17009159387936626, "grad_norm": 4.28125, "learning_rate": 9.966457293848898e-06, "loss": 0.97989693, "memory(GiB)": 746.04, "step": 6705, "train_speed(iter/s)": 0.233442 }, { "acc": 0.7436923, "epoch": 0.1702184332484038, "grad_norm": 3.328125, "learning_rate": 9.966335920950677e-06, "loss": 1.00374565, "memory(GiB)": 746.04, "step": 6710, "train_speed(iter/s)": 0.233098 }, { "acc": 0.7462081, "epoch": 0.17034527261744137, "grad_norm": 3.796875, "learning_rate": 9.966214329599162e-06, "loss": 1.00944529, "memory(GiB)": 746.04, "step": 6715, "train_speed(iter/s)": 0.232747 }, { "acc": 0.74208546, "epoch": 0.17047211198647894, "grad_norm": 3.984375, "learning_rate": 9.966092519799702e-06, "loss": 0.97827282, "memory(GiB)": 746.04, "step": 6720, "train_speed(iter/s)": 0.232368 }, { "acc": 0.74334216, "epoch": 0.17059895135551648, "grad_norm": 3.640625, "learning_rate": 9.965970491557655e-06, "loss": 1.02943983, "memory(GiB)": 746.04, "step": 6725, "train_speed(iter/s)": 0.231969 }, { "acc": 0.76986747, "epoch": 0.17072579072455404, "grad_norm": 3.84375, "learning_rate": 9.965848244878389e-06, "loss": 0.88666925, "memory(GiB)": 746.04, "step": 6730, "train_speed(iter/s)": 0.231662 }, { "acc": 0.74996705, "epoch": 0.1708526300935916, "grad_norm": 3.421875, "learning_rate": 9.965725779767281e-06, "loss": 0.96027269, "memory(GiB)": 746.04, "step": 6735, "train_speed(iter/s)": 0.231348 }, { "acc": 0.75762987, "epoch": 0.17097946946262915, "grad_norm": 3.40625, "learning_rate": 9.965603096229717e-06, "loss": 0.99684181, "memory(GiB)": 746.06, "step": 6740, "train_speed(iter/s)": 0.230945 }, { "acc": 0.75110049, "epoch": 0.17110630883166672, "grad_norm": 3.5625, "learning_rate": 9.965480194271096e-06, "loss": 0.96454668, "memory(GiB)": 746.06, "step": 6745, "train_speed(iter/s)": 0.230586 }, { "acc": 0.75645103, "epoch": 0.1712331482007043, "grad_norm": 3.640625, "learning_rate": 9.96535707389682e-06, "loss": 0.9842555, "memory(GiB)": 746.06, "step": 6750, "train_speed(iter/s)": 0.230263 }, { "acc": 0.77018204, "epoch": 0.17135998756974183, "grad_norm": 3.640625, "learning_rate": 9.965233735112308e-06, "loss": 0.93736801, "memory(GiB)": 746.06, "step": 6755, "train_speed(iter/s)": 0.229903 }, { "acc": 0.76025929, "epoch": 0.1714868269387794, "grad_norm": 2.828125, "learning_rate": 9.965110177922983e-06, "loss": 0.97130136, "memory(GiB)": 746.06, "step": 6760, "train_speed(iter/s)": 0.229539 }, { "acc": 0.7544138, "epoch": 0.17161366630781696, "grad_norm": 6.71875, "learning_rate": 9.964986402334282e-06, "loss": 0.99113741, "memory(GiB)": 746.06, "step": 6765, "train_speed(iter/s)": 0.22919 }, { "acc": 0.75101385, "epoch": 0.1717405056768545, "grad_norm": 3.421875, "learning_rate": 9.96486240835165e-06, "loss": 0.95344629, "memory(GiB)": 746.06, "step": 6770, "train_speed(iter/s)": 0.228832 }, { "acc": 0.75633349, "epoch": 0.17186734504589207, "grad_norm": 3.984375, "learning_rate": 9.964738195980537e-06, "loss": 0.96050634, "memory(GiB)": 746.06, "step": 6775, "train_speed(iter/s)": 0.228468 }, { "acc": 0.74450493, "epoch": 0.17199418441492964, "grad_norm": 4.0625, "learning_rate": 9.964613765226412e-06, "loss": 1.01304083, "memory(GiB)": 746.06, "step": 6780, "train_speed(iter/s)": 0.228143 }, { "acc": 0.76004629, "epoch": 0.17212102378396718, "grad_norm": 4.03125, "learning_rate": 9.964489116094745e-06, "loss": 0.92363186, "memory(GiB)": 746.06, "step": 6785, "train_speed(iter/s)": 0.227816 }, { "acc": 0.75922446, "epoch": 0.17224786315300475, "grad_norm": 3.625, "learning_rate": 9.964364248591018e-06, "loss": 0.9650033, "memory(GiB)": 746.06, "step": 6790, "train_speed(iter/s)": 0.227462 }, { "acc": 0.76506991, "epoch": 0.17237470252204232, "grad_norm": 3.140625, "learning_rate": 9.964239162720727e-06, "loss": 0.91653805, "memory(GiB)": 746.06, "step": 6795, "train_speed(iter/s)": 0.227125 }, { "acc": 0.76149945, "epoch": 0.17250154189107986, "grad_norm": 3.359375, "learning_rate": 9.96411385848937e-06, "loss": 0.96418285, "memory(GiB)": 746.06, "step": 6800, "train_speed(iter/s)": 0.226845 }, { "acc": 0.75169549, "epoch": 0.17262838126011743, "grad_norm": 3.296875, "learning_rate": 9.963988335902462e-06, "loss": 0.97607374, "memory(GiB)": 746.06, "step": 6805, "train_speed(iter/s)": 0.22652 }, { "acc": 0.76494193, "epoch": 0.172755220629155, "grad_norm": 4.28125, "learning_rate": 9.963862594965524e-06, "loss": 0.93213301, "memory(GiB)": 746.06, "step": 6810, "train_speed(iter/s)": 0.226194 }, { "acc": 0.75627093, "epoch": 0.17288205999819253, "grad_norm": 3.375, "learning_rate": 9.963736635684088e-06, "loss": 0.99318981, "memory(GiB)": 746.06, "step": 6815, "train_speed(iter/s)": 0.225872 }, { "acc": 0.75740318, "epoch": 0.1730088993672301, "grad_norm": 4.46875, "learning_rate": 9.963610458063688e-06, "loss": 0.97778454, "memory(GiB)": 746.06, "step": 6820, "train_speed(iter/s)": 0.225526 }, { "acc": 0.73893619, "epoch": 0.17313573873626767, "grad_norm": 3.921875, "learning_rate": 9.963484062109883e-06, "loss": 1.00900221, "memory(GiB)": 746.06, "step": 6825, "train_speed(iter/s)": 0.225251 }, { "acc": 0.75731354, "epoch": 0.1732625781053052, "grad_norm": 3.625, "learning_rate": 9.963357447828228e-06, "loss": 0.95983162, "memory(GiB)": 746.06, "step": 6830, "train_speed(iter/s)": 0.224927 }, { "acc": 0.75526481, "epoch": 0.17338941747434278, "grad_norm": 4.5625, "learning_rate": 9.963230615224292e-06, "loss": 1.01115799, "memory(GiB)": 746.06, "step": 6835, "train_speed(iter/s)": 0.224609 }, { "acc": 0.745541, "epoch": 0.17351625684338035, "grad_norm": 3.390625, "learning_rate": 9.963103564303656e-06, "loss": 1.02827854, "memory(GiB)": 746.06, "step": 6840, "train_speed(iter/s)": 0.224253 }, { "acc": 0.74767027, "epoch": 0.17364309621241789, "grad_norm": 4.375, "learning_rate": 9.962976295071907e-06, "loss": 1.00990381, "memory(GiB)": 746.06, "step": 6845, "train_speed(iter/s)": 0.223936 }, { "acc": 0.74773574, "epoch": 0.17376993558145545, "grad_norm": 3.09375, "learning_rate": 9.962848807534644e-06, "loss": 0.93410778, "memory(GiB)": 746.06, "step": 6850, "train_speed(iter/s)": 0.223584 }, { "acc": 0.74901776, "epoch": 0.17389677495049302, "grad_norm": 3.484375, "learning_rate": 9.962721101697474e-06, "loss": 0.94320564, "memory(GiB)": 746.06, "step": 6855, "train_speed(iter/s)": 0.223294 }, { "acc": 0.74831018, "epoch": 0.17402361431953056, "grad_norm": 3.484375, "learning_rate": 9.962593177566016e-06, "loss": 0.97264462, "memory(GiB)": 746.06, "step": 6860, "train_speed(iter/s)": 0.22299 }, { "acc": 0.762673, "epoch": 0.17415045368856813, "grad_norm": 3.484375, "learning_rate": 9.962465035145895e-06, "loss": 0.87191029, "memory(GiB)": 746.06, "step": 6865, "train_speed(iter/s)": 0.222647 }, { "acc": 0.75297203, "epoch": 0.1742772930576057, "grad_norm": 3.359375, "learning_rate": 9.96233667444275e-06, "loss": 0.96599827, "memory(GiB)": 746.06, "step": 6870, "train_speed(iter/s)": 0.222326 }, { "acc": 0.74595351, "epoch": 0.17440413242664324, "grad_norm": 3.3125, "learning_rate": 9.962208095462225e-06, "loss": 1.02472763, "memory(GiB)": 746.06, "step": 6875, "train_speed(iter/s)": 0.222047 }, { "acc": 0.74794936, "epoch": 0.1745309717956808, "grad_norm": 4.78125, "learning_rate": 9.962079298209975e-06, "loss": 0.99292355, "memory(GiB)": 746.06, "step": 6880, "train_speed(iter/s)": 0.221688 }, { "acc": 0.76474619, "epoch": 0.17465781116471837, "grad_norm": 3.890625, "learning_rate": 9.961950282691668e-06, "loss": 0.93523312, "memory(GiB)": 759.12, "step": 6885, "train_speed(iter/s)": 0.221361 }, { "acc": 0.75892248, "epoch": 0.17478465053375591, "grad_norm": 3.796875, "learning_rate": 9.961821048912977e-06, "loss": 0.94608345, "memory(GiB)": 759.12, "step": 6890, "train_speed(iter/s)": 0.220993 }, { "acc": 0.7618876, "epoch": 0.17491148990279348, "grad_norm": 3.359375, "learning_rate": 9.961691596879588e-06, "loss": 0.89516277, "memory(GiB)": 759.12, "step": 6895, "train_speed(iter/s)": 0.220673 }, { "acc": 0.75556917, "epoch": 0.17503832927183105, "grad_norm": 4.21875, "learning_rate": 9.961561926597194e-06, "loss": 0.98355055, "memory(GiB)": 759.12, "step": 6900, "train_speed(iter/s)": 0.220333 }, { "acc": 0.75219903, "epoch": 0.1751651686408686, "grad_norm": 3.890625, "learning_rate": 9.961432038071502e-06, "loss": 0.99649477, "memory(GiB)": 759.12, "step": 6905, "train_speed(iter/s)": 0.220067 }, { "acc": 0.75665922, "epoch": 0.17529200800990616, "grad_norm": 2.953125, "learning_rate": 9.961301931308221e-06, "loss": 0.9206398, "memory(GiB)": 759.12, "step": 6910, "train_speed(iter/s)": 0.219711 }, { "acc": 0.76179409, "epoch": 0.17541884737894373, "grad_norm": 3.75, "learning_rate": 9.961171606313073e-06, "loss": 0.9441761, "memory(GiB)": 759.12, "step": 6915, "train_speed(iter/s)": 0.219395 }, { "acc": 0.74877648, "epoch": 0.17554568674798127, "grad_norm": 3.53125, "learning_rate": 9.961041063091797e-06, "loss": 0.95610867, "memory(GiB)": 759.12, "step": 6920, "train_speed(iter/s)": 0.219071 }, { "acc": 0.74629793, "epoch": 0.17567252611701883, "grad_norm": 4.15625, "learning_rate": 9.96091030165013e-06, "loss": 1.01607828, "memory(GiB)": 759.12, "step": 6925, "train_speed(iter/s)": 0.218699 }, { "acc": 0.75383396, "epoch": 0.1757993654860564, "grad_norm": 3.5625, "learning_rate": 9.960779321993826e-06, "loss": 0.93264647, "memory(GiB)": 759.12, "step": 6930, "train_speed(iter/s)": 0.218394 }, { "acc": 0.75300894, "epoch": 0.17592620485509394, "grad_norm": 4.5625, "learning_rate": 9.960648124128645e-06, "loss": 0.98736649, "memory(GiB)": 759.12, "step": 6935, "train_speed(iter/s)": 0.218092 }, { "acc": 0.76473088, "epoch": 0.1760530442241315, "grad_norm": 4.03125, "learning_rate": 9.960516708060358e-06, "loss": 0.90912647, "memory(GiB)": 759.12, "step": 6940, "train_speed(iter/s)": 0.217722 }, { "acc": 0.74573007, "epoch": 0.17617988359316908, "grad_norm": 3.71875, "learning_rate": 9.960385073794746e-06, "loss": 0.96849508, "memory(GiB)": 759.12, "step": 6945, "train_speed(iter/s)": 0.217443 }, { "acc": 0.75282598, "epoch": 0.17630672296220662, "grad_norm": 3.5, "learning_rate": 9.960253221337602e-06, "loss": 0.93515501, "memory(GiB)": 759.12, "step": 6950, "train_speed(iter/s)": 0.217102 }, { "acc": 0.76936822, "epoch": 0.1764335623312442, "grad_norm": 3.484375, "learning_rate": 9.96012115069472e-06, "loss": 0.94642467, "memory(GiB)": 759.12, "step": 6955, "train_speed(iter/s)": 0.216839 }, { "acc": 0.74082031, "epoch": 0.17656040170028175, "grad_norm": 4.21875, "learning_rate": 9.959988861871914e-06, "loss": 0.96683102, "memory(GiB)": 759.12, "step": 6960, "train_speed(iter/s)": 0.216538 }, { "acc": 0.75377579, "epoch": 0.1766872410693193, "grad_norm": 4.96875, "learning_rate": 9.959856354875001e-06, "loss": 0.94503126, "memory(GiB)": 759.12, "step": 6965, "train_speed(iter/s)": 0.216253 }, { "acc": 0.76600718, "epoch": 0.17681408043835686, "grad_norm": 2.90625, "learning_rate": 9.95972362970981e-06, "loss": 0.9551981, "memory(GiB)": 759.12, "step": 6970, "train_speed(iter/s)": 0.215976 }, { "acc": 0.75375953, "epoch": 0.17694091980739443, "grad_norm": 4.28125, "learning_rate": 9.959590686382181e-06, "loss": 1.00945883, "memory(GiB)": 759.12, "step": 6975, "train_speed(iter/s)": 0.215689 }, { "acc": 0.74059367, "epoch": 0.17706775917643197, "grad_norm": 4.625, "learning_rate": 9.959457524897958e-06, "loss": 1.01100101, "memory(GiB)": 759.12, "step": 6980, "train_speed(iter/s)": 0.215414 }, { "acc": 0.74029608, "epoch": 0.17719459854546954, "grad_norm": 3.703125, "learning_rate": 9.959324145263002e-06, "loss": 0.98887434, "memory(GiB)": 759.12, "step": 6985, "train_speed(iter/s)": 0.215112 }, { "acc": 0.75037947, "epoch": 0.1773214379145071, "grad_norm": 4.3125, "learning_rate": 9.959190547483175e-06, "loss": 0.97410269, "memory(GiB)": 759.12, "step": 6990, "train_speed(iter/s)": 0.214837 }, { "acc": 0.74187198, "epoch": 0.17744827728354465, "grad_norm": 3.796875, "learning_rate": 9.959056731564358e-06, "loss": 1.06635857, "memory(GiB)": 759.12, "step": 6995, "train_speed(iter/s)": 0.214583 }, { "acc": 0.7383461, "epoch": 0.17757511665258222, "grad_norm": 4.0625, "learning_rate": 9.958922697512437e-06, "loss": 1.0163559, "memory(GiB)": 759.12, "step": 7000, "train_speed(iter/s)": 0.214291 }, { "epoch": 0.17757511665258222, "eval_acc": 0.7425876590701251, "eval_loss": 0.9343035817146301, "eval_runtime": 1148.8309, "eval_samples_per_second": 5.545, "eval_steps_per_second": 5.545, "step": 7000 }, { "acc": 0.77021036, "epoch": 0.17770195602161978, "grad_norm": 3.71875, "learning_rate": 9.958788445333307e-06, "loss": 0.86479177, "memory(GiB)": 759.12, "step": 7005, "train_speed(iter/s)": 0.202322 }, { "acc": 0.74691978, "epoch": 0.17782879539065732, "grad_norm": 2.984375, "learning_rate": 9.958653975032873e-06, "loss": 0.93190899, "memory(GiB)": 759.12, "step": 7010, "train_speed(iter/s)": 0.202018 }, { "acc": 0.75930538, "epoch": 0.1779556347596949, "grad_norm": 3.25, "learning_rate": 9.958519286617047e-06, "loss": 0.94886112, "memory(GiB)": 759.12, "step": 7015, "train_speed(iter/s)": 0.201786 }, { "acc": 0.75853214, "epoch": 0.17808247412873246, "grad_norm": 3.671875, "learning_rate": 9.95838438009176e-06, "loss": 0.9878664, "memory(GiB)": 759.12, "step": 7020, "train_speed(iter/s)": 0.201522 }, { "acc": 0.77465038, "epoch": 0.17820931349777, "grad_norm": 3.703125, "learning_rate": 9.95824925546294e-06, "loss": 0.91505327, "memory(GiB)": 759.12, "step": 7025, "train_speed(iter/s)": 0.20126 }, { "acc": 0.74934731, "epoch": 0.17833615286680757, "grad_norm": 3.25, "learning_rate": 9.958113912736533e-06, "loss": 1.03381653, "memory(GiB)": 759.12, "step": 7030, "train_speed(iter/s)": 0.201014 }, { "acc": 0.74649754, "epoch": 0.17846299223584514, "grad_norm": 3.75, "learning_rate": 9.957978351918495e-06, "loss": 0.98932457, "memory(GiB)": 759.12, "step": 7035, "train_speed(iter/s)": 0.200807 }, { "acc": 0.76697555, "epoch": 0.17858983160488268, "grad_norm": 3.65625, "learning_rate": 9.957842573014785e-06, "loss": 0.90514936, "memory(GiB)": 759.12, "step": 7040, "train_speed(iter/s)": 0.200559 }, { "acc": 0.76347795, "epoch": 0.17871667097392024, "grad_norm": 3.359375, "learning_rate": 9.957706576031375e-06, "loss": 0.88085041, "memory(GiB)": 759.12, "step": 7045, "train_speed(iter/s)": 0.200322 }, { "acc": 0.75533071, "epoch": 0.1788435103429578, "grad_norm": 3.9375, "learning_rate": 9.957570360974253e-06, "loss": 1.04336739, "memory(GiB)": 759.12, "step": 7050, "train_speed(iter/s)": 0.200091 }, { "acc": 0.74642344, "epoch": 0.17897034971199535, "grad_norm": 4.125, "learning_rate": 9.957433927849403e-06, "loss": 1.00772295, "memory(GiB)": 759.12, "step": 7055, "train_speed(iter/s)": 0.199867 }, { "acc": 0.75515471, "epoch": 0.17909718908103292, "grad_norm": 4.0625, "learning_rate": 9.957297276662831e-06, "loss": 0.98432169, "memory(GiB)": 759.12, "step": 7060, "train_speed(iter/s)": 0.199644 }, { "acc": 0.75675821, "epoch": 0.1792240284500705, "grad_norm": 4.0625, "learning_rate": 9.957160407420548e-06, "loss": 0.9155879, "memory(GiB)": 759.12, "step": 7065, "train_speed(iter/s)": 0.199404 }, { "acc": 0.76296506, "epoch": 0.17935086781910803, "grad_norm": 3.703125, "learning_rate": 9.957023320128572e-06, "loss": 0.94162045, "memory(GiB)": 759.12, "step": 7070, "train_speed(iter/s)": 0.199149 }, { "acc": 0.76021395, "epoch": 0.1794777071881456, "grad_norm": 3.3125, "learning_rate": 9.956886014792935e-06, "loss": 0.92753649, "memory(GiB)": 759.12, "step": 7075, "train_speed(iter/s)": 0.198921 }, { "acc": 0.75403442, "epoch": 0.17960454655718316, "grad_norm": 4.03125, "learning_rate": 9.956748491419675e-06, "loss": 0.97569971, "memory(GiB)": 759.12, "step": 7080, "train_speed(iter/s)": 0.198697 }, { "acc": 0.74833555, "epoch": 0.1797313859262207, "grad_norm": 3.09375, "learning_rate": 9.956610750014842e-06, "loss": 1.01864767, "memory(GiB)": 759.12, "step": 7085, "train_speed(iter/s)": 0.198475 }, { "acc": 0.75717931, "epoch": 0.17985822529525827, "grad_norm": 2.40625, "learning_rate": 9.956472790584495e-06, "loss": 0.91843939, "memory(GiB)": 759.12, "step": 7090, "train_speed(iter/s)": 0.198238 }, { "acc": 0.75786543, "epoch": 0.17998506466429584, "grad_norm": 3.609375, "learning_rate": 9.956334613134702e-06, "loss": 0.97101994, "memory(GiB)": 759.12, "step": 7095, "train_speed(iter/s)": 0.197999 }, { "acc": 0.76152325, "epoch": 0.18011190403333338, "grad_norm": 3.453125, "learning_rate": 9.95619621767154e-06, "loss": 0.91727247, "memory(GiB)": 759.12, "step": 7100, "train_speed(iter/s)": 0.197752 }, { "acc": 0.72724123, "epoch": 0.18023874340237095, "grad_norm": 3.375, "learning_rate": 9.956057604201099e-06, "loss": 1.03145514, "memory(GiB)": 759.12, "step": 7105, "train_speed(iter/s)": 0.197527 }, { "acc": 0.75510502, "epoch": 0.18036558277140852, "grad_norm": 3.453125, "learning_rate": 9.955918772729476e-06, "loss": 0.94548359, "memory(GiB)": 759.12, "step": 7110, "train_speed(iter/s)": 0.197334 }, { "acc": 0.75768313, "epoch": 0.18049242214044606, "grad_norm": 3.953125, "learning_rate": 9.955779723262775e-06, "loss": 0.94893999, "memory(GiB)": 759.12, "step": 7115, "train_speed(iter/s)": 0.197067 }, { "acc": 0.7591094, "epoch": 0.18061926150948363, "grad_norm": 3.90625, "learning_rate": 9.955640455807116e-06, "loss": 0.95295067, "memory(GiB)": 759.12, "step": 7120, "train_speed(iter/s)": 0.196818 }, { "acc": 0.76924524, "epoch": 0.1807461008785212, "grad_norm": 3.296875, "learning_rate": 9.955500970368622e-06, "loss": 0.90469198, "memory(GiB)": 759.12, "step": 7125, "train_speed(iter/s)": 0.196542 }, { "acc": 0.75693359, "epoch": 0.18087294024755873, "grad_norm": 3.765625, "learning_rate": 9.955361266953429e-06, "loss": 0.92874479, "memory(GiB)": 759.12, "step": 7130, "train_speed(iter/s)": 0.196241 }, { "acc": 0.74578404, "epoch": 0.1809997796165963, "grad_norm": 4.03125, "learning_rate": 9.955221345567682e-06, "loss": 1.00890656, "memory(GiB)": 759.12, "step": 7135, "train_speed(iter/s)": 0.195993 }, { "acc": 0.75672946, "epoch": 0.18112661898563387, "grad_norm": 3.46875, "learning_rate": 9.955081206217539e-06, "loss": 0.95592813, "memory(GiB)": 759.12, "step": 7140, "train_speed(iter/s)": 0.195754 }, { "acc": 0.73540144, "epoch": 0.1812534583546714, "grad_norm": 3.75, "learning_rate": 9.954940848909159e-06, "loss": 1.03051386, "memory(GiB)": 759.12, "step": 7145, "train_speed(iter/s)": 0.195524 }, { "acc": 0.74990311, "epoch": 0.18138029772370898, "grad_norm": 3.578125, "learning_rate": 9.95480027364872e-06, "loss": 0.99484634, "memory(GiB)": 759.12, "step": 7150, "train_speed(iter/s)": 0.195293 }, { "acc": 0.76184402, "epoch": 0.18150713709274655, "grad_norm": 3.546875, "learning_rate": 9.954659480442403e-06, "loss": 0.92264919, "memory(GiB)": 759.12, "step": 7155, "train_speed(iter/s)": 0.195072 }, { "acc": 0.74856062, "epoch": 0.18163397646178409, "grad_norm": 3.859375, "learning_rate": 9.9545184692964e-06, "loss": 0.98566198, "memory(GiB)": 759.12, "step": 7160, "train_speed(iter/s)": 0.194883 }, { "acc": 0.75536966, "epoch": 0.18176081583082165, "grad_norm": 3.234375, "learning_rate": 9.954377240216919e-06, "loss": 0.98789215, "memory(GiB)": 759.12, "step": 7165, "train_speed(iter/s)": 0.194637 }, { "acc": 0.74957356, "epoch": 0.18188765519985922, "grad_norm": 3.53125, "learning_rate": 9.95423579321017e-06, "loss": 0.98715982, "memory(GiB)": 759.12, "step": 7170, "train_speed(iter/s)": 0.194412 }, { "acc": 0.75699139, "epoch": 0.18201449456889676, "grad_norm": 4.4375, "learning_rate": 9.95409412828237e-06, "loss": 0.9721386, "memory(GiB)": 759.12, "step": 7175, "train_speed(iter/s)": 0.194159 }, { "acc": 0.74603963, "epoch": 0.18214133393793433, "grad_norm": 3.5625, "learning_rate": 9.953952245439755e-06, "loss": 0.98992434, "memory(GiB)": 759.12, "step": 7180, "train_speed(iter/s)": 0.193955 }, { "acc": 0.75272479, "epoch": 0.1822681733069719, "grad_norm": 3.03125, "learning_rate": 9.953810144688566e-06, "loss": 0.98233681, "memory(GiB)": 759.12, "step": 7185, "train_speed(iter/s)": 0.193724 }, { "acc": 0.76296339, "epoch": 0.18239501267600944, "grad_norm": 4.0, "learning_rate": 9.953667826035052e-06, "loss": 0.97459478, "memory(GiB)": 759.12, "step": 7190, "train_speed(iter/s)": 0.19355 }, { "acc": 0.74267588, "epoch": 0.182521852045047, "grad_norm": 4.28125, "learning_rate": 9.953525289485472e-06, "loss": 0.9824501, "memory(GiB)": 759.12, "step": 7195, "train_speed(iter/s)": 0.193308 }, { "acc": 0.77093496, "epoch": 0.18264869141408457, "grad_norm": 3.4375, "learning_rate": 9.953382535046099e-06, "loss": 0.89636402, "memory(GiB)": 759.12, "step": 7200, "train_speed(iter/s)": 0.193072 }, { "acc": 0.75744295, "epoch": 0.18277553078312211, "grad_norm": 3.734375, "learning_rate": 9.953239562723211e-06, "loss": 0.96280909, "memory(GiB)": 759.12, "step": 7205, "train_speed(iter/s)": 0.192865 }, { "acc": 0.74472709, "epoch": 0.18290237015215968, "grad_norm": 4.0625, "learning_rate": 9.953096372523097e-06, "loss": 0.99588099, "memory(GiB)": 759.12, "step": 7210, "train_speed(iter/s)": 0.192689 }, { "acc": 0.75883417, "epoch": 0.18302920952119725, "grad_norm": 3.5625, "learning_rate": 9.952952964452054e-06, "loss": 0.93244915, "memory(GiB)": 759.12, "step": 7215, "train_speed(iter/s)": 0.192457 }, { "acc": 0.74579902, "epoch": 0.1831560488902348, "grad_norm": 3.84375, "learning_rate": 9.95280933851639e-06, "loss": 1.02535057, "memory(GiB)": 759.12, "step": 7220, "train_speed(iter/s)": 0.192256 }, { "acc": 0.7421948, "epoch": 0.18328288825927236, "grad_norm": 4.4375, "learning_rate": 9.952665494722427e-06, "loss": 1.00577803, "memory(GiB)": 759.12, "step": 7225, "train_speed(iter/s)": 0.192021 }, { "acc": 0.75496664, "epoch": 0.18340972762830993, "grad_norm": 3.921875, "learning_rate": 9.952521433076486e-06, "loss": 0.96006107, "memory(GiB)": 759.12, "step": 7230, "train_speed(iter/s)": 0.191835 }, { "acc": 0.75145826, "epoch": 0.18353656699734747, "grad_norm": 3.515625, "learning_rate": 9.952377153584909e-06, "loss": 0.9687582, "memory(GiB)": 759.12, "step": 7235, "train_speed(iter/s)": 0.191633 }, { "acc": 0.75812469, "epoch": 0.18366340636638503, "grad_norm": 4.125, "learning_rate": 9.95223265625404e-06, "loss": 0.91827192, "memory(GiB)": 759.12, "step": 7240, "train_speed(iter/s)": 0.191405 }, { "acc": 0.76283703, "epoch": 0.1837902457354226, "grad_norm": 3.703125, "learning_rate": 9.952087941090234e-06, "loss": 0.94235983, "memory(GiB)": 759.12, "step": 7245, "train_speed(iter/s)": 0.191199 }, { "acc": 0.75417218, "epoch": 0.18391708510446014, "grad_norm": 3.96875, "learning_rate": 9.951943008099857e-06, "loss": 0.96952095, "memory(GiB)": 759.12, "step": 7250, "train_speed(iter/s)": 0.191018 }, { "acc": 0.74702196, "epoch": 0.1840439244734977, "grad_norm": 3.875, "learning_rate": 9.951797857289287e-06, "loss": 1.0186348, "memory(GiB)": 759.12, "step": 7255, "train_speed(iter/s)": 0.190816 }, { "acc": 0.74627752, "epoch": 0.18417076384253528, "grad_norm": 4.09375, "learning_rate": 9.951652488664907e-06, "loss": 1.01089134, "memory(GiB)": 759.12, "step": 7260, "train_speed(iter/s)": 0.190626 }, { "acc": 0.7452642, "epoch": 0.18429760321157282, "grad_norm": 3.484375, "learning_rate": 9.95150690223311e-06, "loss": 1.00305996, "memory(GiB)": 759.12, "step": 7265, "train_speed(iter/s)": 0.190406 }, { "acc": 0.7407836, "epoch": 0.1844244425806104, "grad_norm": 3.953125, "learning_rate": 9.9513610980003e-06, "loss": 0.99406986, "memory(GiB)": 759.12, "step": 7270, "train_speed(iter/s)": 0.190193 }, { "acc": 0.75329566, "epoch": 0.18455128194964796, "grad_norm": 3.796875, "learning_rate": 9.951215075972892e-06, "loss": 0.98202772, "memory(GiB)": 759.12, "step": 7275, "train_speed(iter/s)": 0.19001 }, { "acc": 0.74883213, "epoch": 0.1846781213186855, "grad_norm": 3.546875, "learning_rate": 9.95106883615731e-06, "loss": 0.95020437, "memory(GiB)": 764.53, "step": 7280, "train_speed(iter/s)": 0.189815 }, { "acc": 0.75098143, "epoch": 0.18480496068772306, "grad_norm": 3.609375, "learning_rate": 9.950922378559981e-06, "loss": 0.96779881, "memory(GiB)": 764.53, "step": 7285, "train_speed(iter/s)": 0.1896 }, { "acc": 0.75484567, "epoch": 0.18493180005676063, "grad_norm": 3.703125, "learning_rate": 9.950775703187354e-06, "loss": 0.97316952, "memory(GiB)": 764.53, "step": 7290, "train_speed(iter/s)": 0.189396 }, { "acc": 0.77013526, "epoch": 0.18505863942579817, "grad_norm": 3.234375, "learning_rate": 9.950628810045879e-06, "loss": 0.93428144, "memory(GiB)": 764.53, "step": 7295, "train_speed(iter/s)": 0.189211 }, { "acc": 0.75550275, "epoch": 0.18518547879483574, "grad_norm": 4.3125, "learning_rate": 9.950481699142013e-06, "loss": 0.9523097, "memory(GiB)": 764.53, "step": 7300, "train_speed(iter/s)": 0.188985 }, { "acc": 0.76036167, "epoch": 0.1853123181638733, "grad_norm": 3.671875, "learning_rate": 9.950334370482233e-06, "loss": 0.95483971, "memory(GiB)": 764.53, "step": 7305, "train_speed(iter/s)": 0.188783 }, { "acc": 0.75079317, "epoch": 0.18543915753291085, "grad_norm": 3.234375, "learning_rate": 9.950186824073016e-06, "loss": 0.96359653, "memory(GiB)": 764.53, "step": 7310, "train_speed(iter/s)": 0.188579 }, { "acc": 0.75427465, "epoch": 0.18556599690194842, "grad_norm": 3.390625, "learning_rate": 9.950039059920854e-06, "loss": 0.99444866, "memory(GiB)": 764.53, "step": 7315, "train_speed(iter/s)": 0.188364 }, { "acc": 0.74916916, "epoch": 0.18569283627098598, "grad_norm": 3.3125, "learning_rate": 9.949891078032243e-06, "loss": 0.98114004, "memory(GiB)": 764.53, "step": 7320, "train_speed(iter/s)": 0.188165 }, { "acc": 0.75313382, "epoch": 0.18581967564002352, "grad_norm": 3.78125, "learning_rate": 9.949742878413696e-06, "loss": 1.02315207, "memory(GiB)": 764.53, "step": 7325, "train_speed(iter/s)": 0.18796 }, { "acc": 0.74782481, "epoch": 0.1859465150090611, "grad_norm": 3.078125, "learning_rate": 9.94959446107173e-06, "loss": 1.00162792, "memory(GiB)": 764.53, "step": 7330, "train_speed(iter/s)": 0.187764 }, { "acc": 0.74774394, "epoch": 0.18607335437809866, "grad_norm": 3.609375, "learning_rate": 9.949445826012876e-06, "loss": 1.03086004, "memory(GiB)": 764.53, "step": 7335, "train_speed(iter/s)": 0.187568 }, { "acc": 0.7534184, "epoch": 0.1862001937471362, "grad_norm": 4.46875, "learning_rate": 9.949296973243667e-06, "loss": 0.9718935, "memory(GiB)": 764.53, "step": 7340, "train_speed(iter/s)": 0.18739 }, { "acc": 0.76207304, "epoch": 0.18632703311617377, "grad_norm": 3.6875, "learning_rate": 9.949147902770656e-06, "loss": 0.91498013, "memory(GiB)": 764.53, "step": 7345, "train_speed(iter/s)": 0.187201 }, { "acc": 0.76442165, "epoch": 0.18645387248521134, "grad_norm": 3.5, "learning_rate": 9.948998614600397e-06, "loss": 0.92040567, "memory(GiB)": 764.53, "step": 7350, "train_speed(iter/s)": 0.18702 }, { "acc": 0.73943267, "epoch": 0.18658071185424888, "grad_norm": 3.953125, "learning_rate": 9.948849108739458e-06, "loss": 1.05349102, "memory(GiB)": 764.53, "step": 7355, "train_speed(iter/s)": 0.186857 }, { "acc": 0.74437833, "epoch": 0.18670755122328644, "grad_norm": 3.453125, "learning_rate": 9.948699385194413e-06, "loss": 0.98740501, "memory(GiB)": 764.53, "step": 7360, "train_speed(iter/s)": 0.18666 }, { "acc": 0.76975317, "epoch": 0.186834390592324, "grad_norm": 3.515625, "learning_rate": 9.94854944397185e-06, "loss": 0.91446028, "memory(GiB)": 764.53, "step": 7365, "train_speed(iter/s)": 0.18644 }, { "acc": 0.76530409, "epoch": 0.18696122996136155, "grad_norm": 3.296875, "learning_rate": 9.948399285078366e-06, "loss": 0.91540899, "memory(GiB)": 764.53, "step": 7370, "train_speed(iter/s)": 0.186242 }, { "acc": 0.74687924, "epoch": 0.18708806933039912, "grad_norm": 3.515625, "learning_rate": 9.948248908520562e-06, "loss": 1.00255966, "memory(GiB)": 764.53, "step": 7375, "train_speed(iter/s)": 0.186048 }, { "acc": 0.74496837, "epoch": 0.1872149086994367, "grad_norm": 3.359375, "learning_rate": 9.948098314305056e-06, "loss": 0.93127203, "memory(GiB)": 764.53, "step": 7380, "train_speed(iter/s)": 0.185846 }, { "acc": 0.75722976, "epoch": 0.18734174806847423, "grad_norm": 3.328125, "learning_rate": 9.947947502438469e-06, "loss": 0.93731146, "memory(GiB)": 764.53, "step": 7385, "train_speed(iter/s)": 0.185693 }, { "acc": 0.75325394, "epoch": 0.1874685874375118, "grad_norm": 3.609375, "learning_rate": 9.947796472927438e-06, "loss": 0.98726816, "memory(GiB)": 764.53, "step": 7390, "train_speed(iter/s)": 0.18552 }, { "acc": 0.76214004, "epoch": 0.18759542680654936, "grad_norm": 3.390625, "learning_rate": 9.947645225778605e-06, "loss": 0.94401321, "memory(GiB)": 764.53, "step": 7395, "train_speed(iter/s)": 0.185375 }, { "acc": 0.7605052, "epoch": 0.1877222661755869, "grad_norm": 3.75, "learning_rate": 9.947493760998622e-06, "loss": 0.97562761, "memory(GiB)": 764.53, "step": 7400, "train_speed(iter/s)": 0.185174 }, { "acc": 0.75400305, "epoch": 0.18784910554462447, "grad_norm": 3.5, "learning_rate": 9.947342078594151e-06, "loss": 0.99117708, "memory(GiB)": 764.53, "step": 7405, "train_speed(iter/s)": 0.185003 }, { "acc": 0.73721371, "epoch": 0.18797594491366204, "grad_norm": 4.375, "learning_rate": 9.947190178571867e-06, "loss": 1.02597065, "memory(GiB)": 764.53, "step": 7410, "train_speed(iter/s)": 0.184819 }, { "acc": 0.74860806, "epoch": 0.18810278428269958, "grad_norm": 3.828125, "learning_rate": 9.94703806093845e-06, "loss": 0.97386227, "memory(GiB)": 764.53, "step": 7415, "train_speed(iter/s)": 0.18461 }, { "acc": 0.76122098, "epoch": 0.18822962365173715, "grad_norm": 3.796875, "learning_rate": 9.946885725700589e-06, "loss": 0.9401124, "memory(GiB)": 764.53, "step": 7420, "train_speed(iter/s)": 0.18443 }, { "acc": 0.7573813, "epoch": 0.18835646302077472, "grad_norm": 3.453125, "learning_rate": 9.946733172864987e-06, "loss": 0.92917843, "memory(GiB)": 764.53, "step": 7425, "train_speed(iter/s)": 0.184262 }, { "acc": 0.75859776, "epoch": 0.18848330238981226, "grad_norm": 3.484375, "learning_rate": 9.946580402438354e-06, "loss": 0.94347858, "memory(GiB)": 764.53, "step": 7430, "train_speed(iter/s)": 0.184092 }, { "acc": 0.75696468, "epoch": 0.18861014175884983, "grad_norm": 3.265625, "learning_rate": 9.946427414427411e-06, "loss": 0.91114187, "memory(GiB)": 764.53, "step": 7435, "train_speed(iter/s)": 0.183916 }, { "acc": 0.76023726, "epoch": 0.18873698112788737, "grad_norm": 3.546875, "learning_rate": 9.946274208838886e-06, "loss": 0.96526403, "memory(GiB)": 764.53, "step": 7440, "train_speed(iter/s)": 0.183716 }, { "acc": 0.75595212, "epoch": 0.18886382049692493, "grad_norm": 3.375, "learning_rate": 9.946120785679518e-06, "loss": 0.94986124, "memory(GiB)": 764.53, "step": 7445, "train_speed(iter/s)": 0.18349 }, { "acc": 0.75315857, "epoch": 0.1889906598659625, "grad_norm": 3.890625, "learning_rate": 9.945967144956057e-06, "loss": 0.93962374, "memory(GiB)": 764.53, "step": 7450, "train_speed(iter/s)": 0.18331 }, { "acc": 0.75513358, "epoch": 0.18911749923500004, "grad_norm": 4.5625, "learning_rate": 9.94581328667526e-06, "loss": 0.93926897, "memory(GiB)": 764.53, "step": 7455, "train_speed(iter/s)": 0.183116 }, { "acc": 0.74275002, "epoch": 0.1892443386040376, "grad_norm": 3.453125, "learning_rate": 9.945659210843892e-06, "loss": 1.01111822, "memory(GiB)": 764.53, "step": 7460, "train_speed(iter/s)": 0.18292 }, { "acc": 0.7602417, "epoch": 0.18937117797307518, "grad_norm": 4.03125, "learning_rate": 9.945504917468735e-06, "loss": 0.94951496, "memory(GiB)": 764.53, "step": 7465, "train_speed(iter/s)": 0.182741 }, { "acc": 0.76095767, "epoch": 0.18949801734211272, "grad_norm": 4.15625, "learning_rate": 9.945350406556575e-06, "loss": 0.93798962, "memory(GiB)": 764.53, "step": 7470, "train_speed(iter/s)": 0.182549 }, { "acc": 0.74294825, "epoch": 0.18962485671115029, "grad_norm": 3.5625, "learning_rate": 9.945195678114207e-06, "loss": 0.96564245, "memory(GiB)": 764.53, "step": 7475, "train_speed(iter/s)": 0.182398 }, { "acc": 0.73848157, "epoch": 0.18975169608018785, "grad_norm": 3.390625, "learning_rate": 9.945040732148437e-06, "loss": 1.00191813, "memory(GiB)": 764.53, "step": 7480, "train_speed(iter/s)": 0.182211 }, { "acc": 0.75406799, "epoch": 0.1898785354492254, "grad_norm": 3.140625, "learning_rate": 9.94488556866608e-06, "loss": 0.95315914, "memory(GiB)": 764.53, "step": 7485, "train_speed(iter/s)": 0.181989 }, { "acc": 0.76182065, "epoch": 0.19000537481826296, "grad_norm": 4.0, "learning_rate": 9.944730187673965e-06, "loss": 0.97146816, "memory(GiB)": 764.53, "step": 7490, "train_speed(iter/s)": 0.181819 }, { "acc": 0.7570435, "epoch": 0.19013221418730053, "grad_norm": 3.8125, "learning_rate": 9.944574589178922e-06, "loss": 0.94102345, "memory(GiB)": 764.53, "step": 7495, "train_speed(iter/s)": 0.181645 }, { "acc": 0.75002413, "epoch": 0.19025905355633807, "grad_norm": 3.765625, "learning_rate": 9.944418773187797e-06, "loss": 0.95416012, "memory(GiB)": 764.53, "step": 7500, "train_speed(iter/s)": 0.181497 }, { "epoch": 0.19025905355633807, "eval_acc": 0.7436291529726023, "eval_loss": 0.9297406077384949, "eval_runtime": 1147.0769, "eval_samples_per_second": 5.553, "eval_steps_per_second": 5.553, "step": 7500 }, { "acc": 0.74038606, "epoch": 0.19038589292537564, "grad_norm": 3.546875, "learning_rate": 9.944262739707444e-06, "loss": 1.03262596, "memory(GiB)": 764.53, "step": 7505, "train_speed(iter/s)": 0.173424 }, { "acc": 0.75851178, "epoch": 0.1905127322944132, "grad_norm": 3.28125, "learning_rate": 9.944106488744727e-06, "loss": 0.92950354, "memory(GiB)": 764.53, "step": 7510, "train_speed(iter/s)": 0.173259 }, { "acc": 0.75709491, "epoch": 0.19063957166345075, "grad_norm": 2.984375, "learning_rate": 9.943950020306519e-06, "loss": 0.95745649, "memory(GiB)": 764.53, "step": 7515, "train_speed(iter/s)": 0.173101 }, { "acc": 0.76084824, "epoch": 0.19076641103248831, "grad_norm": 4.0625, "learning_rate": 9.943793334399702e-06, "loss": 0.93439493, "memory(GiB)": 764.53, "step": 7520, "train_speed(iter/s)": 0.172931 }, { "acc": 0.76394277, "epoch": 0.19089325040152588, "grad_norm": 3.359375, "learning_rate": 9.943636431031168e-06, "loss": 0.92641945, "memory(GiB)": 764.53, "step": 7525, "train_speed(iter/s)": 0.172785 }, { "acc": 0.76440091, "epoch": 0.19102008977056342, "grad_norm": 3.765625, "learning_rate": 9.943479310207817e-06, "loss": 0.93161297, "memory(GiB)": 764.53, "step": 7530, "train_speed(iter/s)": 0.172625 }, { "acc": 0.74378128, "epoch": 0.191146929139601, "grad_norm": 4.0, "learning_rate": 9.943321971936564e-06, "loss": 0.96909952, "memory(GiB)": 764.53, "step": 7535, "train_speed(iter/s)": 0.17249 }, { "acc": 0.76072226, "epoch": 0.19127376850863856, "grad_norm": 3.71875, "learning_rate": 9.943164416224326e-06, "loss": 0.93828259, "memory(GiB)": 764.53, "step": 7540, "train_speed(iter/s)": 0.172327 }, { "acc": 0.76801906, "epoch": 0.1914006078776761, "grad_norm": 3.5625, "learning_rate": 9.943006643078036e-06, "loss": 0.92284126, "memory(GiB)": 764.53, "step": 7545, "train_speed(iter/s)": 0.172173 }, { "acc": 0.76418657, "epoch": 0.19152744724671367, "grad_norm": 3.34375, "learning_rate": 9.942848652504635e-06, "loss": 0.96384029, "memory(GiB)": 764.53, "step": 7550, "train_speed(iter/s)": 0.172031 }, { "acc": 0.77341132, "epoch": 0.19165428661575123, "grad_norm": 3.5, "learning_rate": 9.94269044451107e-06, "loss": 0.90356417, "memory(GiB)": 764.53, "step": 7555, "train_speed(iter/s)": 0.171873 }, { "acc": 0.76694226, "epoch": 0.19178112598478878, "grad_norm": 3.28125, "learning_rate": 9.942532019104299e-06, "loss": 0.91862898, "memory(GiB)": 764.53, "step": 7560, "train_speed(iter/s)": 0.171696 }, { "acc": 0.74491301, "epoch": 0.19190796535382634, "grad_norm": 3.421875, "learning_rate": 9.942373376291293e-06, "loss": 0.95804968, "memory(GiB)": 764.53, "step": 7565, "train_speed(iter/s)": 0.171544 }, { "acc": 0.77384806, "epoch": 0.1920348047228639, "grad_norm": 3.34375, "learning_rate": 9.942214516079029e-06, "loss": 0.87012882, "memory(GiB)": 764.53, "step": 7570, "train_speed(iter/s)": 0.171396 }, { "acc": 0.74085879, "epoch": 0.19216164409190145, "grad_norm": 3.203125, "learning_rate": 9.942055438474497e-06, "loss": 0.99100065, "memory(GiB)": 764.53, "step": 7575, "train_speed(iter/s)": 0.171243 }, { "acc": 0.74484606, "epoch": 0.19228848346093902, "grad_norm": 3.328125, "learning_rate": 9.941896143484692e-06, "loss": 0.97350388, "memory(GiB)": 764.53, "step": 7580, "train_speed(iter/s)": 0.171094 }, { "acc": 0.73977823, "epoch": 0.1924153228299766, "grad_norm": 4.1875, "learning_rate": 9.94173663111662e-06, "loss": 0.9862793, "memory(GiB)": 764.53, "step": 7585, "train_speed(iter/s)": 0.170953 }, { "acc": 0.76206326, "epoch": 0.19254216219901413, "grad_norm": 3.8125, "learning_rate": 9.941576901377301e-06, "loss": 0.91066151, "memory(GiB)": 764.53, "step": 7590, "train_speed(iter/s)": 0.170768 }, { "acc": 0.73538108, "epoch": 0.1926690015680517, "grad_norm": 4.09375, "learning_rate": 9.941416954273759e-06, "loss": 1.02986422, "memory(GiB)": 764.53, "step": 7595, "train_speed(iter/s)": 0.170649 }, { "acc": 0.76647573, "epoch": 0.19279584093708926, "grad_norm": 4.0625, "learning_rate": 9.941256789813028e-06, "loss": 0.90714903, "memory(GiB)": 764.53, "step": 7600, "train_speed(iter/s)": 0.170517 }, { "acc": 0.75368309, "epoch": 0.1929226803061268, "grad_norm": 5.5625, "learning_rate": 9.941096408002155e-06, "loss": 0.97560902, "memory(GiB)": 764.53, "step": 7605, "train_speed(iter/s)": 0.170398 }, { "acc": 0.75299067, "epoch": 0.19304951967516437, "grad_norm": 3.625, "learning_rate": 9.940935808848195e-06, "loss": 0.94932699, "memory(GiB)": 764.53, "step": 7610, "train_speed(iter/s)": 0.170223 }, { "acc": 0.7498714, "epoch": 0.19317635904420194, "grad_norm": 3.796875, "learning_rate": 9.94077499235821e-06, "loss": 0.96842928, "memory(GiB)": 764.53, "step": 7615, "train_speed(iter/s)": 0.170075 }, { "acc": 0.75152926, "epoch": 0.19330319841323948, "grad_norm": 3.40625, "learning_rate": 9.940613958539276e-06, "loss": 0.96406832, "memory(GiB)": 764.53, "step": 7620, "train_speed(iter/s)": 0.169941 }, { "acc": 0.76748781, "epoch": 0.19343003778227705, "grad_norm": 3.921875, "learning_rate": 9.940452707398476e-06, "loss": 0.88696661, "memory(GiB)": 764.53, "step": 7625, "train_speed(iter/s)": 0.169816 }, { "acc": 0.75672216, "epoch": 0.19355687715131462, "grad_norm": 3.734375, "learning_rate": 9.940291238942904e-06, "loss": 0.9002409, "memory(GiB)": 764.53, "step": 7630, "train_speed(iter/s)": 0.169648 }, { "acc": 0.7390245, "epoch": 0.19368371652035216, "grad_norm": 3.875, "learning_rate": 9.94012955317966e-06, "loss": 1.02038612, "memory(GiB)": 764.53, "step": 7635, "train_speed(iter/s)": 0.169502 }, { "acc": 0.76033778, "epoch": 0.19381055588938972, "grad_norm": 3.78125, "learning_rate": 9.939967650115857e-06, "loss": 0.912673, "memory(GiB)": 764.53, "step": 7640, "train_speed(iter/s)": 0.169346 }, { "acc": 0.74076033, "epoch": 0.1939373952584273, "grad_norm": 3.734375, "learning_rate": 9.939805529758616e-06, "loss": 1.0374074, "memory(GiB)": 764.53, "step": 7645, "train_speed(iter/s)": 0.169214 }, { "acc": 0.75307665, "epoch": 0.19406423462746483, "grad_norm": 3.515625, "learning_rate": 9.93964319211507e-06, "loss": 0.923388, "memory(GiB)": 764.53, "step": 7650, "train_speed(iter/s)": 0.16908 }, { "acc": 0.76106753, "epoch": 0.1941910739965024, "grad_norm": 3.8125, "learning_rate": 9.939480637192358e-06, "loss": 0.92712126, "memory(GiB)": 764.53, "step": 7655, "train_speed(iter/s)": 0.168939 }, { "acc": 0.75409069, "epoch": 0.19431791336553997, "grad_norm": 3.71875, "learning_rate": 9.939317864997631e-06, "loss": 0.92849274, "memory(GiB)": 764.53, "step": 7660, "train_speed(iter/s)": 0.168801 }, { "acc": 0.75340018, "epoch": 0.1944447527345775, "grad_norm": 3.1875, "learning_rate": 9.939154875538051e-06, "loss": 0.96949577, "memory(GiB)": 764.53, "step": 7665, "train_speed(iter/s)": 0.168648 }, { "acc": 0.75155759, "epoch": 0.19457159210361508, "grad_norm": 3.78125, "learning_rate": 9.938991668820782e-06, "loss": 0.9590888, "memory(GiB)": 764.53, "step": 7670, "train_speed(iter/s)": 0.168514 }, { "acc": 0.76591949, "epoch": 0.19469843147265264, "grad_norm": 3.484375, "learning_rate": 9.938828244853007e-06, "loss": 0.89724169, "memory(GiB)": 764.53, "step": 7675, "train_speed(iter/s)": 0.168405 }, { "acc": 0.75412884, "epoch": 0.19482527084169018, "grad_norm": 3.59375, "learning_rate": 9.938664603641914e-06, "loss": 0.92203264, "memory(GiB)": 764.53, "step": 7680, "train_speed(iter/s)": 0.168222 }, { "acc": 0.77065248, "epoch": 0.19495211021072775, "grad_norm": 3.421875, "learning_rate": 9.938500745194701e-06, "loss": 0.90653143, "memory(GiB)": 764.53, "step": 7685, "train_speed(iter/s)": 0.168074 }, { "acc": 0.74889035, "epoch": 0.19507894957976532, "grad_norm": 3.046875, "learning_rate": 9.938336669518575e-06, "loss": 0.97292023, "memory(GiB)": 764.53, "step": 7690, "train_speed(iter/s)": 0.167915 }, { "acc": 0.73876147, "epoch": 0.19520578894880286, "grad_norm": 3.40625, "learning_rate": 9.938172376620753e-06, "loss": 1.01267204, "memory(GiB)": 764.53, "step": 7695, "train_speed(iter/s)": 0.167782 }, { "acc": 0.74891067, "epoch": 0.19533262831784043, "grad_norm": 3.640625, "learning_rate": 9.938007866508462e-06, "loss": 0.95528755, "memory(GiB)": 764.53, "step": 7700, "train_speed(iter/s)": 0.167652 }, { "acc": 0.75196042, "epoch": 0.195459467686878, "grad_norm": 6.1875, "learning_rate": 9.937843139188939e-06, "loss": 0.96606321, "memory(GiB)": 764.53, "step": 7705, "train_speed(iter/s)": 0.167509 }, { "acc": 0.75411611, "epoch": 0.19558630705591554, "grad_norm": 4.46875, "learning_rate": 9.93767819466943e-06, "loss": 0.9310813, "memory(GiB)": 764.53, "step": 7710, "train_speed(iter/s)": 0.167359 }, { "acc": 0.76135411, "epoch": 0.1957131464249531, "grad_norm": 3.65625, "learning_rate": 9.937513032957186e-06, "loss": 0.92245827, "memory(GiB)": 764.53, "step": 7715, "train_speed(iter/s)": 0.167203 }, { "acc": 0.73942857, "epoch": 0.19583998579399067, "grad_norm": 4.0625, "learning_rate": 9.93734765405948e-06, "loss": 1.00804243, "memory(GiB)": 764.53, "step": 7720, "train_speed(iter/s)": 0.167054 }, { "acc": 0.75295863, "epoch": 0.1959668251630282, "grad_norm": 3.28125, "learning_rate": 9.937182057983579e-06, "loss": 0.95706301, "memory(GiB)": 764.53, "step": 7725, "train_speed(iter/s)": 0.16693 }, { "acc": 0.74413075, "epoch": 0.19609366453206578, "grad_norm": 3.28125, "learning_rate": 9.937016244736772e-06, "loss": 1.00663424, "memory(GiB)": 764.53, "step": 7730, "train_speed(iter/s)": 0.166792 }, { "acc": 0.74067893, "epoch": 0.19622050390110335, "grad_norm": 3.859375, "learning_rate": 9.936850214326349e-06, "loss": 0.95291052, "memory(GiB)": 764.53, "step": 7735, "train_speed(iter/s)": 0.166657 }, { "acc": 0.74860106, "epoch": 0.1963473432701409, "grad_norm": 4.375, "learning_rate": 9.936683966759615e-06, "loss": 0.97767782, "memory(GiB)": 764.53, "step": 7740, "train_speed(iter/s)": 0.166533 }, { "acc": 0.75600643, "epoch": 0.19647418263917846, "grad_norm": 3.34375, "learning_rate": 9.936517502043884e-06, "loss": 0.92711229, "memory(GiB)": 764.53, "step": 7745, "train_speed(iter/s)": 0.166405 }, { "acc": 0.74236612, "epoch": 0.19660102200821603, "grad_norm": 3.6875, "learning_rate": 9.936350820186475e-06, "loss": 1.02211657, "memory(GiB)": 764.53, "step": 7750, "train_speed(iter/s)": 0.166263 }, { "acc": 0.74620724, "epoch": 0.19672786137725357, "grad_norm": 3.1875, "learning_rate": 9.936183921194724e-06, "loss": 1.00245457, "memory(GiB)": 764.53, "step": 7755, "train_speed(iter/s)": 0.166126 }, { "acc": 0.74255466, "epoch": 0.19685470074629113, "grad_norm": 3.375, "learning_rate": 9.936016805075968e-06, "loss": 1.00122232, "memory(GiB)": 764.53, "step": 7760, "train_speed(iter/s)": 0.165988 }, { "acc": 0.75814528, "epoch": 0.1969815401153287, "grad_norm": 3.5, "learning_rate": 9.935849471837562e-06, "loss": 0.95238037, "memory(GiB)": 764.53, "step": 7765, "train_speed(iter/s)": 0.165866 }, { "acc": 0.74265957, "epoch": 0.19710837948436624, "grad_norm": 4.75, "learning_rate": 9.935681921486862e-06, "loss": 0.96199598, "memory(GiB)": 764.53, "step": 7770, "train_speed(iter/s)": 0.165735 }, { "acc": 0.75318646, "epoch": 0.1972352188534038, "grad_norm": 3.546875, "learning_rate": 9.93551415403124e-06, "loss": 0.94837246, "memory(GiB)": 764.53, "step": 7775, "train_speed(iter/s)": 0.165599 }, { "acc": 0.76046696, "epoch": 0.19736205822244138, "grad_norm": 3.515625, "learning_rate": 9.935346169478078e-06, "loss": 0.93166723, "memory(GiB)": 764.53, "step": 7780, "train_speed(iter/s)": 0.165452 }, { "acc": 0.76198168, "epoch": 0.19748889759147892, "grad_norm": 3.484375, "learning_rate": 9.935177967834762e-06, "loss": 0.9484951, "memory(GiB)": 764.53, "step": 7785, "train_speed(iter/s)": 0.165337 }, { "acc": 0.74941645, "epoch": 0.19761573696051649, "grad_norm": 4.15625, "learning_rate": 9.935009549108692e-06, "loss": 1.01360931, "memory(GiB)": 764.53, "step": 7790, "train_speed(iter/s)": 0.165198 }, { "acc": 0.76021571, "epoch": 0.19774257632955405, "grad_norm": 4.21875, "learning_rate": 9.934840913307275e-06, "loss": 0.91453524, "memory(GiB)": 764.53, "step": 7795, "train_speed(iter/s)": 0.165083 }, { "acc": 0.75420151, "epoch": 0.1978694156985916, "grad_norm": 4.0, "learning_rate": 9.934672060437929e-06, "loss": 0.93171387, "memory(GiB)": 764.53, "step": 7800, "train_speed(iter/s)": 0.164963 }, { "acc": 0.76617117, "epoch": 0.19799625506762916, "grad_norm": 4.5, "learning_rate": 9.934502990508084e-06, "loss": 0.90660801, "memory(GiB)": 764.53, "step": 7805, "train_speed(iter/s)": 0.164834 }, { "acc": 0.76066999, "epoch": 0.19812309443666673, "grad_norm": 3.5625, "learning_rate": 9.934333703525174e-06, "loss": 0.96534567, "memory(GiB)": 764.53, "step": 7810, "train_speed(iter/s)": 0.164724 }, { "acc": 0.75678864, "epoch": 0.19824993380570427, "grad_norm": 4.15625, "learning_rate": 9.934164199496646e-06, "loss": 0.97277107, "memory(GiB)": 764.53, "step": 7815, "train_speed(iter/s)": 0.164618 }, { "acc": 0.75857606, "epoch": 0.19837677317474184, "grad_norm": 4.03125, "learning_rate": 9.933994478429955e-06, "loss": 0.9691308, "memory(GiB)": 764.53, "step": 7820, "train_speed(iter/s)": 0.164476 }, { "acc": 0.75407252, "epoch": 0.1985036125437794, "grad_norm": 3.75, "learning_rate": 9.933824540332568e-06, "loss": 1.01544809, "memory(GiB)": 764.53, "step": 7825, "train_speed(iter/s)": 0.164361 }, { "acc": 0.76207252, "epoch": 0.19863045191281695, "grad_norm": 3.53125, "learning_rate": 9.93365438521196e-06, "loss": 0.93298254, "memory(GiB)": 764.53, "step": 7830, "train_speed(iter/s)": 0.164248 }, { "acc": 0.75052147, "epoch": 0.19875729128185451, "grad_norm": 3.421875, "learning_rate": 9.933484013075616e-06, "loss": 0.96948757, "memory(GiB)": 764.53, "step": 7835, "train_speed(iter/s)": 0.164116 }, { "acc": 0.75695472, "epoch": 0.19888413065089208, "grad_norm": 3.65625, "learning_rate": 9.933313423931028e-06, "loss": 1.03020258, "memory(GiB)": 764.53, "step": 7840, "train_speed(iter/s)": 0.163989 }, { "acc": 0.77336516, "epoch": 0.19901097001992962, "grad_norm": 3.3125, "learning_rate": 9.9331426177857e-06, "loss": 0.87818012, "memory(GiB)": 764.53, "step": 7845, "train_speed(iter/s)": 0.163851 }, { "acc": 0.76131382, "epoch": 0.1991378093889672, "grad_norm": 3.296875, "learning_rate": 9.932971594647148e-06, "loss": 0.97006197, "memory(GiB)": 764.53, "step": 7850, "train_speed(iter/s)": 0.163713 }, { "acc": 0.75562263, "epoch": 0.19926464875800476, "grad_norm": 3.96875, "learning_rate": 9.932800354522893e-06, "loss": 0.93896475, "memory(GiB)": 764.53, "step": 7855, "train_speed(iter/s)": 0.163586 }, { "acc": 0.7504847, "epoch": 0.1993914881270423, "grad_norm": 4.40625, "learning_rate": 9.932628897420466e-06, "loss": 1.01467581, "memory(GiB)": 764.53, "step": 7860, "train_speed(iter/s)": 0.163461 }, { "acc": 0.74892197, "epoch": 0.19951832749607987, "grad_norm": 3.59375, "learning_rate": 9.932457223347412e-06, "loss": 0.9586936, "memory(GiB)": 764.53, "step": 7865, "train_speed(iter/s)": 0.163319 }, { "acc": 0.74859638, "epoch": 0.19964516686511743, "grad_norm": 3.6875, "learning_rate": 9.932285332311278e-06, "loss": 1.03595839, "memory(GiB)": 764.53, "step": 7870, "train_speed(iter/s)": 0.163207 }, { "acc": 0.75374336, "epoch": 0.19977200623415498, "grad_norm": 3.921875, "learning_rate": 9.93211322431963e-06, "loss": 0.91836033, "memory(GiB)": 764.53, "step": 7875, "train_speed(iter/s)": 0.163096 }, { "acc": 0.75731502, "epoch": 0.19989884560319254, "grad_norm": 4.65625, "learning_rate": 9.931940899380034e-06, "loss": 0.9815484, "memory(GiB)": 764.53, "step": 7880, "train_speed(iter/s)": 0.162972 }, { "acc": 0.75694232, "epoch": 0.2000256849722301, "grad_norm": 3.875, "learning_rate": 9.931768357500074e-06, "loss": 0.88329725, "memory(GiB)": 764.53, "step": 7885, "train_speed(iter/s)": 0.162825 }, { "acc": 0.755827, "epoch": 0.20015252434126765, "grad_norm": 2.90625, "learning_rate": 9.931595598687335e-06, "loss": 0.97636375, "memory(GiB)": 764.53, "step": 7890, "train_speed(iter/s)": 0.162716 }, { "acc": 0.75550027, "epoch": 0.20027936371030522, "grad_norm": 4.125, "learning_rate": 9.93142262294942e-06, "loss": 0.94412308, "memory(GiB)": 764.53, "step": 7895, "train_speed(iter/s)": 0.162604 }, { "acc": 0.74968352, "epoch": 0.2004062030793428, "grad_norm": 3.421875, "learning_rate": 9.931249430293936e-06, "loss": 0.96547852, "memory(GiB)": 764.53, "step": 7900, "train_speed(iter/s)": 0.162484 }, { "acc": 0.75854568, "epoch": 0.20053304244838033, "grad_norm": 2.84375, "learning_rate": 9.931076020728502e-06, "loss": 0.93544817, "memory(GiB)": 764.53, "step": 7905, "train_speed(iter/s)": 0.162371 }, { "acc": 0.76866441, "epoch": 0.2006598818174179, "grad_norm": 3.0625, "learning_rate": 9.930902394260746e-06, "loss": 0.9457139, "memory(GiB)": 764.53, "step": 7910, "train_speed(iter/s)": 0.162239 }, { "acc": 0.7713172, "epoch": 0.20078672118645546, "grad_norm": 3.296875, "learning_rate": 9.930728550898304e-06, "loss": 0.90232592, "memory(GiB)": 764.53, "step": 7915, "train_speed(iter/s)": 0.162092 }, { "acc": 0.74842672, "epoch": 0.200913560555493, "grad_norm": 4.3125, "learning_rate": 9.930554490648822e-06, "loss": 1.0063365, "memory(GiB)": 764.53, "step": 7920, "train_speed(iter/s)": 0.161971 }, { "acc": 0.75644021, "epoch": 0.20104039992453057, "grad_norm": 3.09375, "learning_rate": 9.930380213519958e-06, "loss": 1.02486258, "memory(GiB)": 764.53, "step": 7925, "train_speed(iter/s)": 0.16186 }, { "acc": 0.74742951, "epoch": 0.20116723929356814, "grad_norm": 3.890625, "learning_rate": 9.930205719519378e-06, "loss": 0.99170904, "memory(GiB)": 764.53, "step": 7930, "train_speed(iter/s)": 0.161743 }, { "acc": 0.74935222, "epoch": 0.20129407866260568, "grad_norm": 3.1875, "learning_rate": 9.930031008654757e-06, "loss": 0.9817523, "memory(GiB)": 764.53, "step": 7935, "train_speed(iter/s)": 0.161635 }, { "acc": 0.75828838, "epoch": 0.20142091803164325, "grad_norm": 3.578125, "learning_rate": 9.929856080933779e-06, "loss": 0.91875172, "memory(GiB)": 764.53, "step": 7940, "train_speed(iter/s)": 0.161522 }, { "acc": 0.76449094, "epoch": 0.20154775740068082, "grad_norm": 3.703125, "learning_rate": 9.929680936364141e-06, "loss": 0.88240929, "memory(GiB)": 764.53, "step": 7945, "train_speed(iter/s)": 0.161369 }, { "acc": 0.74715343, "epoch": 0.20167459676971836, "grad_norm": 3.546875, "learning_rate": 9.929505574953544e-06, "loss": 0.95937948, "memory(GiB)": 764.53, "step": 7950, "train_speed(iter/s)": 0.161252 }, { "acc": 0.7458746, "epoch": 0.20180143613875592, "grad_norm": 3.25, "learning_rate": 9.929329996709704e-06, "loss": 0.98643951, "memory(GiB)": 764.53, "step": 7955, "train_speed(iter/s)": 0.161149 }, { "acc": 0.74406624, "epoch": 0.2019282755077935, "grad_norm": 3.59375, "learning_rate": 9.929154201640346e-06, "loss": 0.98837214, "memory(GiB)": 764.53, "step": 7960, "train_speed(iter/s)": 0.161049 }, { "acc": 0.7533, "epoch": 0.20205511487683103, "grad_norm": 4.28125, "learning_rate": 9.928978189753196e-06, "loss": 0.98862762, "memory(GiB)": 764.53, "step": 7965, "train_speed(iter/s)": 0.160934 }, { "acc": 0.76319385, "epoch": 0.2021819542458686, "grad_norm": 3.125, "learning_rate": 9.928801961056001e-06, "loss": 0.8955677, "memory(GiB)": 764.53, "step": 7970, "train_speed(iter/s)": 0.160805 }, { "acc": 0.75241818, "epoch": 0.20230879361490617, "grad_norm": 3.84375, "learning_rate": 9.928625515556512e-06, "loss": 0.92324772, "memory(GiB)": 764.53, "step": 7975, "train_speed(iter/s)": 0.160699 }, { "acc": 0.75532646, "epoch": 0.2024356329839437, "grad_norm": 4.09375, "learning_rate": 9.928448853262491e-06, "loss": 0.98412666, "memory(GiB)": 764.53, "step": 7980, "train_speed(iter/s)": 0.160569 }, { "acc": 0.74668021, "epoch": 0.20256247235298128, "grad_norm": 3.484375, "learning_rate": 9.928271974181707e-06, "loss": 1.01107845, "memory(GiB)": 764.53, "step": 7985, "train_speed(iter/s)": 0.160433 }, { "acc": 0.76616063, "epoch": 0.20268931172201884, "grad_norm": 3.546875, "learning_rate": 9.92809487832194e-06, "loss": 0.93093042, "memory(GiB)": 764.53, "step": 7990, "train_speed(iter/s)": 0.16031 }, { "acc": 0.7532423, "epoch": 0.20281615109105638, "grad_norm": 4.3125, "learning_rate": 9.927917565690983e-06, "loss": 0.97821484, "memory(GiB)": 764.53, "step": 7995, "train_speed(iter/s)": 0.160197 }, { "acc": 0.76397719, "epoch": 0.20294299046009395, "grad_norm": 3.859375, "learning_rate": 9.927740036296633e-06, "loss": 0.97450285, "memory(GiB)": 764.53, "step": 8000, "train_speed(iter/s)": 0.160065 }, { "epoch": 0.20294299046009395, "eval_acc": 0.7442182048789373, "eval_loss": 0.9259604811668396, "eval_runtime": 1151.0221, "eval_samples_per_second": 5.534, "eval_steps_per_second": 5.534, "step": 8000 }, { "acc": 0.76598964, "epoch": 0.20306982982913152, "grad_norm": 3.34375, "learning_rate": 9.927562290146702e-06, "loss": 0.93027821, "memory(GiB)": 764.53, "step": 8005, "train_speed(iter/s)": 0.154071 }, { "acc": 0.74863, "epoch": 0.20319666919816906, "grad_norm": 3.90625, "learning_rate": 9.927384327249003e-06, "loss": 0.95269842, "memory(GiB)": 764.53, "step": 8010, "train_speed(iter/s)": 0.153957 }, { "acc": 0.75393, "epoch": 0.20332350856720663, "grad_norm": 4.0, "learning_rate": 9.927206147611368e-06, "loss": 0.94970036, "memory(GiB)": 764.53, "step": 8015, "train_speed(iter/s)": 0.153853 }, { "acc": 0.75527182, "epoch": 0.2034503479362442, "grad_norm": 3.984375, "learning_rate": 9.927027751241634e-06, "loss": 0.98228273, "memory(GiB)": 764.53, "step": 8020, "train_speed(iter/s)": 0.153759 }, { "acc": 0.75463352, "epoch": 0.20357718730528174, "grad_norm": 4.125, "learning_rate": 9.926849138147647e-06, "loss": 0.95153217, "memory(GiB)": 764.53, "step": 8025, "train_speed(iter/s)": 0.153666 }, { "acc": 0.76198535, "epoch": 0.2037040266743193, "grad_norm": 4.125, "learning_rate": 9.926670308337264e-06, "loss": 1.01287756, "memory(GiB)": 764.53, "step": 8030, "train_speed(iter/s)": 0.153555 }, { "acc": 0.75733461, "epoch": 0.20383086604335687, "grad_norm": 3.1875, "learning_rate": 9.926491261818353e-06, "loss": 0.92819033, "memory(GiB)": 764.53, "step": 8035, "train_speed(iter/s)": 0.153465 }, { "acc": 0.75061679, "epoch": 0.2039577054123944, "grad_norm": 3.921875, "learning_rate": 9.926311998598785e-06, "loss": 0.97459412, "memory(GiB)": 764.53, "step": 8040, "train_speed(iter/s)": 0.153363 }, { "acc": 0.75045123, "epoch": 0.20408454478143198, "grad_norm": 3.390625, "learning_rate": 9.926132518686452e-06, "loss": 0.97426863, "memory(GiB)": 764.53, "step": 8045, "train_speed(iter/s)": 0.15326 }, { "acc": 0.76605463, "epoch": 0.20421138415046955, "grad_norm": 3.421875, "learning_rate": 9.925952822089243e-06, "loss": 0.94705, "memory(GiB)": 764.53, "step": 8050, "train_speed(iter/s)": 0.153174 }, { "acc": 0.75370593, "epoch": 0.2043382235195071, "grad_norm": 3.296875, "learning_rate": 9.925772908815065e-06, "loss": 0.94059534, "memory(GiB)": 764.53, "step": 8055, "train_speed(iter/s)": 0.153082 }, { "acc": 0.74940104, "epoch": 0.20446506288854466, "grad_norm": 3.5, "learning_rate": 9.925592778871832e-06, "loss": 0.99492025, "memory(GiB)": 764.53, "step": 8060, "train_speed(iter/s)": 0.152992 }, { "acc": 0.76142068, "epoch": 0.20459190225758223, "grad_norm": 3.40625, "learning_rate": 9.925412432267465e-06, "loss": 0.93552055, "memory(GiB)": 764.53, "step": 8065, "train_speed(iter/s)": 0.152896 }, { "acc": 0.75788183, "epoch": 0.20471874162661977, "grad_norm": 3.53125, "learning_rate": 9.925231869009899e-06, "loss": 0.95566235, "memory(GiB)": 764.53, "step": 8070, "train_speed(iter/s)": 0.152806 }, { "acc": 0.74633007, "epoch": 0.20484558099565733, "grad_norm": 3.25, "learning_rate": 9.925051089107077e-06, "loss": 0.9996316, "memory(GiB)": 764.53, "step": 8075, "train_speed(iter/s)": 0.152712 }, { "acc": 0.75825243, "epoch": 0.2049724203646949, "grad_norm": 3.921875, "learning_rate": 9.924870092566949e-06, "loss": 0.92673645, "memory(GiB)": 764.53, "step": 8080, "train_speed(iter/s)": 0.152598 }, { "acc": 0.75974441, "epoch": 0.20509925973373244, "grad_norm": 4.53125, "learning_rate": 9.924688879397475e-06, "loss": 0.92979555, "memory(GiB)": 764.53, "step": 8085, "train_speed(iter/s)": 0.152514 }, { "acc": 0.75907249, "epoch": 0.20522609910277, "grad_norm": 3.4375, "learning_rate": 9.92450744960663e-06, "loss": 0.90264482, "memory(GiB)": 764.53, "step": 8090, "train_speed(iter/s)": 0.152415 }, { "acc": 0.75542378, "epoch": 0.20535293847180758, "grad_norm": 3.75, "learning_rate": 9.924325803202395e-06, "loss": 0.94214125, "memory(GiB)": 764.53, "step": 8095, "train_speed(iter/s)": 0.152309 }, { "acc": 0.75622725, "epoch": 0.20547977784084512, "grad_norm": 3.8125, "learning_rate": 9.924143940192754e-06, "loss": 0.95501356, "memory(GiB)": 764.53, "step": 8100, "train_speed(iter/s)": 0.152201 }, { "acc": 0.76367321, "epoch": 0.20560661720988269, "grad_norm": 3.59375, "learning_rate": 9.923961860585713e-06, "loss": 0.9358757, "memory(GiB)": 764.53, "step": 8105, "train_speed(iter/s)": 0.152072 }, { "acc": 0.74188075, "epoch": 0.20573345657892025, "grad_norm": 7.0, "learning_rate": 9.923779564389278e-06, "loss": 1.03976011, "memory(GiB)": 764.53, "step": 8110, "train_speed(iter/s)": 0.151981 }, { "acc": 0.76300683, "epoch": 0.2058602959479578, "grad_norm": 3.453125, "learning_rate": 9.923597051611467e-06, "loss": 0.95200558, "memory(GiB)": 764.53, "step": 8115, "train_speed(iter/s)": 0.151884 }, { "acc": 0.73648691, "epoch": 0.20598713531699536, "grad_norm": 3.359375, "learning_rate": 9.923414322260309e-06, "loss": 0.98372183, "memory(GiB)": 764.53, "step": 8120, "train_speed(iter/s)": 0.151776 }, { "acc": 0.75048137, "epoch": 0.20611397468603293, "grad_norm": 3.921875, "learning_rate": 9.923231376343843e-06, "loss": 0.92567711, "memory(GiB)": 764.53, "step": 8125, "train_speed(iter/s)": 0.15168 }, { "acc": 0.75764971, "epoch": 0.20624081405507047, "grad_norm": 4.1875, "learning_rate": 9.923048213870116e-06, "loss": 0.95160141, "memory(GiB)": 764.53, "step": 8130, "train_speed(iter/s)": 0.15157 }, { "acc": 0.75245185, "epoch": 0.20636765342410804, "grad_norm": 3.09375, "learning_rate": 9.922864834847182e-06, "loss": 0.9353528, "memory(GiB)": 764.53, "step": 8135, "train_speed(iter/s)": 0.151471 }, { "acc": 0.75278206, "epoch": 0.2064944927931456, "grad_norm": 3.71875, "learning_rate": 9.92268123928311e-06, "loss": 0.93366547, "memory(GiB)": 764.53, "step": 8140, "train_speed(iter/s)": 0.151363 }, { "acc": 0.75758328, "epoch": 0.20662133216218315, "grad_norm": 3.96875, "learning_rate": 9.922497427185975e-06, "loss": 0.93526182, "memory(GiB)": 764.53, "step": 8145, "train_speed(iter/s)": 0.151278 }, { "acc": 0.75196948, "epoch": 0.20674817153122071, "grad_norm": 3.109375, "learning_rate": 9.922313398563863e-06, "loss": 0.94056358, "memory(GiB)": 764.53, "step": 8150, "train_speed(iter/s)": 0.151164 }, { "acc": 0.76763744, "epoch": 0.20687501090025828, "grad_norm": 3.53125, "learning_rate": 9.922129153424867e-06, "loss": 0.91658669, "memory(GiB)": 764.53, "step": 8155, "train_speed(iter/s)": 0.151082 }, { "acc": 0.75027986, "epoch": 0.20700185026929582, "grad_norm": 4.84375, "learning_rate": 9.921944691777093e-06, "loss": 0.99037752, "memory(GiB)": 764.53, "step": 8160, "train_speed(iter/s)": 0.150993 }, { "acc": 0.76430874, "epoch": 0.2071286896383334, "grad_norm": 3.96875, "learning_rate": 9.921760013628651e-06, "loss": 0.91370049, "memory(GiB)": 764.53, "step": 8165, "train_speed(iter/s)": 0.150913 }, { "acc": 0.76212201, "epoch": 0.20725552900737096, "grad_norm": 4.34375, "learning_rate": 9.921575118987672e-06, "loss": 0.96633549, "memory(GiB)": 764.53, "step": 8170, "train_speed(iter/s)": 0.150785 }, { "acc": 0.74523735, "epoch": 0.2073823683764085, "grad_norm": 3.765625, "learning_rate": 9.921390007862282e-06, "loss": 0.96598167, "memory(GiB)": 764.53, "step": 8175, "train_speed(iter/s)": 0.150705 }, { "acc": 0.75270152, "epoch": 0.20750920774544607, "grad_norm": 3.25, "learning_rate": 9.921204680260627e-06, "loss": 0.95573149, "memory(GiB)": 764.53, "step": 8180, "train_speed(iter/s)": 0.150589 }, { "acc": 0.75517917, "epoch": 0.20763604711448364, "grad_norm": 4.09375, "learning_rate": 9.921019136190859e-06, "loss": 0.93855686, "memory(GiB)": 764.53, "step": 8185, "train_speed(iter/s)": 0.15047 }, { "acc": 0.76094427, "epoch": 0.20776288648352118, "grad_norm": 3.71875, "learning_rate": 9.920833375661137e-06, "loss": 0.910952, "memory(GiB)": 764.53, "step": 8190, "train_speed(iter/s)": 0.15038 }, { "acc": 0.76251888, "epoch": 0.20788972585255874, "grad_norm": 3.375, "learning_rate": 9.920647398679634e-06, "loss": 0.99198895, "memory(GiB)": 764.53, "step": 8195, "train_speed(iter/s)": 0.150294 }, { "acc": 0.75394049, "epoch": 0.2080165652215963, "grad_norm": 3.984375, "learning_rate": 9.92046120525453e-06, "loss": 0.96777697, "memory(GiB)": 764.53, "step": 8200, "train_speed(iter/s)": 0.150221 }, { "acc": 0.75064101, "epoch": 0.20814340459063385, "grad_norm": 3.34375, "learning_rate": 9.920274795394017e-06, "loss": 0.97246046, "memory(GiB)": 764.53, "step": 8205, "train_speed(iter/s)": 0.150125 }, { "acc": 0.75882063, "epoch": 0.20827024395967142, "grad_norm": 3.4375, "learning_rate": 9.92008816910629e-06, "loss": 0.94977798, "memory(GiB)": 764.53, "step": 8210, "train_speed(iter/s)": 0.150031 }, { "acc": 0.75860114, "epoch": 0.208397083328709, "grad_norm": 3.75, "learning_rate": 9.919901326399561e-06, "loss": 0.94299049, "memory(GiB)": 764.53, "step": 8215, "train_speed(iter/s)": 0.149917 }, { "acc": 0.74950433, "epoch": 0.20852392269774653, "grad_norm": 4.28125, "learning_rate": 9.91971426728205e-06, "loss": 0.93250771, "memory(GiB)": 764.53, "step": 8220, "train_speed(iter/s)": 0.14983 }, { "acc": 0.75042315, "epoch": 0.2086507620667841, "grad_norm": 3.609375, "learning_rate": 9.919526991761982e-06, "loss": 0.98768892, "memory(GiB)": 764.53, "step": 8225, "train_speed(iter/s)": 0.149737 }, { "acc": 0.75796866, "epoch": 0.20877760143582166, "grad_norm": 5.28125, "learning_rate": 9.919339499847598e-06, "loss": 0.89001932, "memory(GiB)": 764.53, "step": 8230, "train_speed(iter/s)": 0.149643 }, { "acc": 0.76703887, "epoch": 0.2089044408048592, "grad_norm": 2.96875, "learning_rate": 9.919151791547142e-06, "loss": 0.86979141, "memory(GiB)": 764.53, "step": 8235, "train_speed(iter/s)": 0.149555 }, { "acc": 0.76283231, "epoch": 0.20903128017389677, "grad_norm": 4.03125, "learning_rate": 9.918963866868873e-06, "loss": 0.91069822, "memory(GiB)": 764.53, "step": 8240, "train_speed(iter/s)": 0.149455 }, { "acc": 0.76013513, "epoch": 0.20915811954293434, "grad_norm": 3.90625, "learning_rate": 9.918775725821054e-06, "loss": 0.96807241, "memory(GiB)": 764.53, "step": 8245, "train_speed(iter/s)": 0.149359 }, { "acc": 0.75165858, "epoch": 0.20928495891197188, "grad_norm": 3.671875, "learning_rate": 9.918587368411967e-06, "loss": 0.9358779, "memory(GiB)": 764.53, "step": 8250, "train_speed(iter/s)": 0.149248 }, { "acc": 0.74954524, "epoch": 0.20941179828100945, "grad_norm": 3.828125, "learning_rate": 9.91839879464989e-06, "loss": 0.94853477, "memory(GiB)": 764.53, "step": 8255, "train_speed(iter/s)": 0.149159 }, { "acc": 0.74939885, "epoch": 0.20953863765004702, "grad_norm": 3.21875, "learning_rate": 9.918210004543122e-06, "loss": 0.94367647, "memory(GiB)": 764.53, "step": 8260, "train_speed(iter/s)": 0.149071 }, { "acc": 0.7550571, "epoch": 0.20966547701908456, "grad_norm": 6.21875, "learning_rate": 9.918020998099966e-06, "loss": 0.9339716, "memory(GiB)": 764.53, "step": 8265, "train_speed(iter/s)": 0.148968 }, { "acc": 0.75960622, "epoch": 0.20979231638812212, "grad_norm": 3.375, "learning_rate": 9.917831775328737e-06, "loss": 0.93351889, "memory(GiB)": 764.53, "step": 8270, "train_speed(iter/s)": 0.14888 }, { "acc": 0.75798969, "epoch": 0.2099191557571597, "grad_norm": 3.328125, "learning_rate": 9.917642336237756e-06, "loss": 0.98031082, "memory(GiB)": 764.53, "step": 8275, "train_speed(iter/s)": 0.148781 }, { "acc": 0.75558629, "epoch": 0.21004599512619723, "grad_norm": 4.03125, "learning_rate": 9.917452680835357e-06, "loss": 0.95503082, "memory(GiB)": 764.53, "step": 8280, "train_speed(iter/s)": 0.148697 }, { "acc": 0.75512547, "epoch": 0.2101728344952348, "grad_norm": 3.359375, "learning_rate": 9.917262809129883e-06, "loss": 0.97080107, "memory(GiB)": 764.53, "step": 8285, "train_speed(iter/s)": 0.148617 }, { "acc": 0.7502564, "epoch": 0.21029967386427237, "grad_norm": 3.1875, "learning_rate": 9.917072721129687e-06, "loss": 0.91660614, "memory(GiB)": 764.53, "step": 8290, "train_speed(iter/s)": 0.148533 }, { "acc": 0.75426402, "epoch": 0.2104265132333099, "grad_norm": 3.53125, "learning_rate": 9.916882416843127e-06, "loss": 0.94831829, "memory(GiB)": 764.53, "step": 8295, "train_speed(iter/s)": 0.148429 }, { "acc": 0.77038317, "epoch": 0.21055335260234748, "grad_norm": 3.859375, "learning_rate": 9.916691896278577e-06, "loss": 0.96550303, "memory(GiB)": 764.53, "step": 8300, "train_speed(iter/s)": 0.148327 }, { "acc": 0.75201588, "epoch": 0.21068019197138504, "grad_norm": 3.734375, "learning_rate": 9.916501159444417e-06, "loss": 0.92474527, "memory(GiB)": 764.53, "step": 8305, "train_speed(iter/s)": 0.148242 }, { "acc": 0.75227585, "epoch": 0.21080703134042258, "grad_norm": 3.984375, "learning_rate": 9.916310206349033e-06, "loss": 0.95551767, "memory(GiB)": 764.53, "step": 8310, "train_speed(iter/s)": 0.148159 }, { "acc": 0.75742941, "epoch": 0.21093387070946015, "grad_norm": 3.328125, "learning_rate": 9.916119037000827e-06, "loss": 0.97605429, "memory(GiB)": 764.53, "step": 8315, "train_speed(iter/s)": 0.148071 }, { "acc": 0.76346803, "epoch": 0.21106071007849772, "grad_norm": 3.984375, "learning_rate": 9.91592765140821e-06, "loss": 0.91273603, "memory(GiB)": 764.53, "step": 8320, "train_speed(iter/s)": 0.147976 }, { "acc": 0.75365992, "epoch": 0.21118754944753526, "grad_norm": 3.5, "learning_rate": 9.915736049579599e-06, "loss": 0.97765541, "memory(GiB)": 764.53, "step": 8325, "train_speed(iter/s)": 0.147891 }, { "acc": 0.7570838, "epoch": 0.21131438881657283, "grad_norm": 3.546875, "learning_rate": 9.915544231523421e-06, "loss": 0.96863384, "memory(GiB)": 764.53, "step": 8330, "train_speed(iter/s)": 0.147808 }, { "acc": 0.77476869, "epoch": 0.2114412281856104, "grad_norm": 4.125, "learning_rate": 9.915352197248114e-06, "loss": 0.90350189, "memory(GiB)": 764.53, "step": 8335, "train_speed(iter/s)": 0.147709 }, { "acc": 0.75070343, "epoch": 0.21156806755464794, "grad_norm": 3.84375, "learning_rate": 9.915159946762124e-06, "loss": 0.94438295, "memory(GiB)": 764.53, "step": 8340, "train_speed(iter/s)": 0.147606 }, { "acc": 0.76185346, "epoch": 0.2116949069236855, "grad_norm": 4.09375, "learning_rate": 9.914967480073911e-06, "loss": 0.95214014, "memory(GiB)": 764.53, "step": 8345, "train_speed(iter/s)": 0.147497 }, { "acc": 0.74739642, "epoch": 0.21182174629272307, "grad_norm": 3.8125, "learning_rate": 9.914774797191937e-06, "loss": 1.00357361, "memory(GiB)": 764.53, "step": 8350, "train_speed(iter/s)": 0.147408 }, { "acc": 0.74426579, "epoch": 0.2119485856617606, "grad_norm": 3.5, "learning_rate": 9.91458189812468e-06, "loss": 0.9967967, "memory(GiB)": 764.53, "step": 8355, "train_speed(iter/s)": 0.147317 }, { "acc": 0.76237011, "epoch": 0.21207542503079818, "grad_norm": 3.75, "learning_rate": 9.914388782880623e-06, "loss": 0.96926012, "memory(GiB)": 764.53, "step": 8360, "train_speed(iter/s)": 0.147233 }, { "acc": 0.75567484, "epoch": 0.21220226439983575, "grad_norm": 3.609375, "learning_rate": 9.914195451468263e-06, "loss": 0.96054516, "memory(GiB)": 764.53, "step": 8365, "train_speed(iter/s)": 0.147142 }, { "acc": 0.75746112, "epoch": 0.2123291037688733, "grad_norm": 3.5625, "learning_rate": 9.914001903896102e-06, "loss": 0.92924109, "memory(GiB)": 764.53, "step": 8370, "train_speed(iter/s)": 0.147066 }, { "acc": 0.74555068, "epoch": 0.21245594313791086, "grad_norm": 3.34375, "learning_rate": 9.913808140172655e-06, "loss": 0.9582016, "memory(GiB)": 764.53, "step": 8375, "train_speed(iter/s)": 0.146972 }, { "acc": 0.76432233, "epoch": 0.21258278250694843, "grad_norm": 4.375, "learning_rate": 9.913614160306445e-06, "loss": 0.95940151, "memory(GiB)": 764.53, "step": 8380, "train_speed(iter/s)": 0.146902 }, { "acc": 0.76080923, "epoch": 0.21270962187598597, "grad_norm": 3.734375, "learning_rate": 9.913419964306001e-06, "loss": 0.89115639, "memory(GiB)": 764.53, "step": 8385, "train_speed(iter/s)": 0.146821 }, { "acc": 0.75896034, "epoch": 0.21283646124502353, "grad_norm": 3.90625, "learning_rate": 9.913225552179869e-06, "loss": 0.97203302, "memory(GiB)": 764.53, "step": 8390, "train_speed(iter/s)": 0.146722 }, { "acc": 0.7372375, "epoch": 0.2129633006140611, "grad_norm": 4.375, "learning_rate": 9.9130309239366e-06, "loss": 0.98686848, "memory(GiB)": 764.53, "step": 8395, "train_speed(iter/s)": 0.146644 }, { "acc": 0.76071978, "epoch": 0.21309013998309864, "grad_norm": 3.46875, "learning_rate": 9.912836079584754e-06, "loss": 0.91786852, "memory(GiB)": 764.53, "step": 8400, "train_speed(iter/s)": 0.146559 }, { "acc": 0.77455072, "epoch": 0.2132169793521362, "grad_norm": 3.21875, "learning_rate": 9.912641019132903e-06, "loss": 0.8836668, "memory(GiB)": 764.53, "step": 8405, "train_speed(iter/s)": 0.146485 }, { "acc": 0.7463222, "epoch": 0.21334381872117378, "grad_norm": 4.46875, "learning_rate": 9.912445742589624e-06, "loss": 1.00355682, "memory(GiB)": 764.53, "step": 8410, "train_speed(iter/s)": 0.146421 }, { "acc": 0.76223011, "epoch": 0.21347065809021132, "grad_norm": 3.296875, "learning_rate": 9.91225024996351e-06, "loss": 0.92290964, "memory(GiB)": 764.53, "step": 8415, "train_speed(iter/s)": 0.14635 }, { "acc": 0.75452266, "epoch": 0.2135974974592489, "grad_norm": 3.84375, "learning_rate": 9.912054541263161e-06, "loss": 0.99366693, "memory(GiB)": 764.53, "step": 8420, "train_speed(iter/s)": 0.146274 }, { "acc": 0.75609508, "epoch": 0.21372433682828645, "grad_norm": 3.484375, "learning_rate": 9.91185861649718e-06, "loss": 0.9340251, "memory(GiB)": 764.53, "step": 8425, "train_speed(iter/s)": 0.14619 }, { "acc": 0.73619285, "epoch": 0.213851176197324, "grad_norm": 3.34375, "learning_rate": 9.91166247567419e-06, "loss": 1.00694485, "memory(GiB)": 764.53, "step": 8430, "train_speed(iter/s)": 0.146117 }, { "acc": 0.76767826, "epoch": 0.21397801556636156, "grad_norm": 3.34375, "learning_rate": 9.911466118802819e-06, "loss": 0.90827179, "memory(GiB)": 764.53, "step": 8435, "train_speed(iter/s)": 0.146037 }, { "acc": 0.74853282, "epoch": 0.21410485493539913, "grad_norm": 3.546875, "learning_rate": 9.911269545891699e-06, "loss": 0.95501051, "memory(GiB)": 764.53, "step": 8440, "train_speed(iter/s)": 0.145948 }, { "acc": 0.76795259, "epoch": 0.21423169430443667, "grad_norm": 4.0, "learning_rate": 9.911072756949483e-06, "loss": 0.9478117, "memory(GiB)": 764.53, "step": 8445, "train_speed(iter/s)": 0.145861 }, { "acc": 0.75353556, "epoch": 0.21435853367347424, "grad_norm": 3.890625, "learning_rate": 9.910875751984822e-06, "loss": 0.92457771, "memory(GiB)": 764.53, "step": 8450, "train_speed(iter/s)": 0.145778 }, { "acc": 0.75958772, "epoch": 0.2144853730425118, "grad_norm": 3.40625, "learning_rate": 9.910678531006386e-06, "loss": 0.93757257, "memory(GiB)": 764.53, "step": 8455, "train_speed(iter/s)": 0.145713 }, { "acc": 0.74540749, "epoch": 0.21461221241154935, "grad_norm": 3.46875, "learning_rate": 9.910481094022845e-06, "loss": 0.95585051, "memory(GiB)": 764.53, "step": 8460, "train_speed(iter/s)": 0.145646 }, { "acc": 0.75233049, "epoch": 0.21473905178058691, "grad_norm": 3.59375, "learning_rate": 9.910283441042889e-06, "loss": 0.98558483, "memory(GiB)": 764.53, "step": 8465, "train_speed(iter/s)": 0.145571 }, { "acc": 0.75199122, "epoch": 0.21486589114962448, "grad_norm": 3.453125, "learning_rate": 9.910085572075208e-06, "loss": 0.98177681, "memory(GiB)": 764.53, "step": 8470, "train_speed(iter/s)": 0.145479 }, { "acc": 0.75910373, "epoch": 0.21499273051866202, "grad_norm": 3.84375, "learning_rate": 9.909887487128507e-06, "loss": 0.95633841, "memory(GiB)": 764.53, "step": 8475, "train_speed(iter/s)": 0.145414 }, { "acc": 0.7643805, "epoch": 0.2151195698876996, "grad_norm": 3.921875, "learning_rate": 9.9096891862115e-06, "loss": 0.9464262, "memory(GiB)": 764.53, "step": 8480, "train_speed(iter/s)": 0.145325 }, { "acc": 0.76353331, "epoch": 0.21524640925673716, "grad_norm": 8.0, "learning_rate": 9.909490669332909e-06, "loss": 0.8960928, "memory(GiB)": 764.53, "step": 8485, "train_speed(iter/s)": 0.145226 }, { "acc": 0.75855021, "epoch": 0.2153732486257747, "grad_norm": 3.390625, "learning_rate": 9.909291936501465e-06, "loss": 0.91588345, "memory(GiB)": 764.53, "step": 8490, "train_speed(iter/s)": 0.145136 }, { "acc": 0.76673198, "epoch": 0.21550008799481227, "grad_norm": 4.40625, "learning_rate": 9.90909298772591e-06, "loss": 0.91584272, "memory(GiB)": 764.53, "step": 8495, "train_speed(iter/s)": 0.14505 }, { "acc": 0.76800332, "epoch": 0.21562692736384984, "grad_norm": 17.0, "learning_rate": 9.908893823014998e-06, "loss": 0.96532784, "memory(GiB)": 764.53, "step": 8500, "train_speed(iter/s)": 0.144976 }, { "epoch": 0.21562692736384984, "eval_acc": 0.7449472088339404, "eval_loss": 0.9230129718780518, "eval_runtime": 1148.1179, "eval_samples_per_second": 5.548, "eval_steps_per_second": 5.548, "step": 8500 }, { "acc": 0.76322365, "epoch": 0.21575376673288738, "grad_norm": 3.453125, "learning_rate": 9.908694442377487e-06, "loss": 0.9432435, "memory(GiB)": 681.73, "step": 8505, "train_speed(iter/s)": 17.914163 }, { "acc": 0.74194589, "epoch": 0.21588060610192494, "grad_norm": 4.0625, "learning_rate": 9.908494845822148e-06, "loss": 1.00527925, "memory(GiB)": 684.05, "step": 8510, "train_speed(iter/s)": 15.88846 }, { "acc": 0.75993061, "epoch": 0.2160074454709625, "grad_norm": 4.0, "learning_rate": 9.908295033357758e-06, "loss": 0.94629097, "memory(GiB)": 684.05, "step": 8515, "train_speed(iter/s)": 14.087472 }, { "acc": 0.75293474, "epoch": 0.21613428484000005, "grad_norm": 3.40625, "learning_rate": 9.90809500499311e-06, "loss": 0.96797476, "memory(GiB)": 684.05, "step": 8520, "train_speed(iter/s)": 12.695829 }, { "acc": 0.75380268, "epoch": 0.21626112420903762, "grad_norm": 3.96875, "learning_rate": 9.907894760737001e-06, "loss": 0.95908527, "memory(GiB)": 686.76, "step": 8525, "train_speed(iter/s)": 11.52242 }, { "acc": 0.7489881, "epoch": 0.2163879635780752, "grad_norm": 3.15625, "learning_rate": 9.907694300598237e-06, "loss": 0.98274794, "memory(GiB)": 686.76, "step": 8530, "train_speed(iter/s)": 10.509644 }, { "acc": 0.75150461, "epoch": 0.21651480294711273, "grad_norm": 3.75, "learning_rate": 9.90749362458564e-06, "loss": 0.94533205, "memory(GiB)": 686.76, "step": 8535, "train_speed(iter/s)": 9.675249 }, { "acc": 0.76125498, "epoch": 0.2166416423161503, "grad_norm": 4.40625, "learning_rate": 9.907292732708032e-06, "loss": 0.94804239, "memory(GiB)": 686.76, "step": 8540, "train_speed(iter/s)": 8.983103 }, { "acc": 0.74844561, "epoch": 0.21676848168518786, "grad_norm": 3.359375, "learning_rate": 9.907091624974254e-06, "loss": 0.95122938, "memory(GiB)": 686.76, "step": 8545, "train_speed(iter/s)": 8.399832 }, { "acc": 0.7587996, "epoch": 0.2168953210542254, "grad_norm": 3.578125, "learning_rate": 9.90689030139315e-06, "loss": 0.91159592, "memory(GiB)": 686.76, "step": 8550, "train_speed(iter/s)": 7.901192 }, { "acc": 0.76420336, "epoch": 0.21702216042326297, "grad_norm": 3.140625, "learning_rate": 9.906688761973577e-06, "loss": 0.89710817, "memory(GiB)": 689.46, "step": 8555, "train_speed(iter/s)": 7.437043 }, { "acc": 0.75905175, "epoch": 0.21714899979230054, "grad_norm": 3.546875, "learning_rate": 9.906487006724398e-06, "loss": 0.91394682, "memory(GiB)": 689.46, "step": 8560, "train_speed(iter/s)": 7.045827 }, { "acc": 0.74041061, "epoch": 0.21727583916133808, "grad_norm": 4.28125, "learning_rate": 9.90628503565449e-06, "loss": 0.98557768, "memory(GiB)": 698.87, "step": 8565, "train_speed(iter/s)": 6.666341 }, { "acc": 0.75434852, "epoch": 0.21740267853037565, "grad_norm": 3.671875, "learning_rate": 9.906082848772733e-06, "loss": 0.96588345, "memory(GiB)": 698.87, "step": 8570, "train_speed(iter/s)": 6.364285 }, { "acc": 0.75254269, "epoch": 0.21752951789941322, "grad_norm": 3.390625, "learning_rate": 9.905880446088026e-06, "loss": 0.96604271, "memory(GiB)": 698.87, "step": 8575, "train_speed(iter/s)": 6.052445 }, { "acc": 0.75550256, "epoch": 0.21765635726845076, "grad_norm": 3.703125, "learning_rate": 9.905677827609268e-06, "loss": 0.95950212, "memory(GiB)": 698.87, "step": 8580, "train_speed(iter/s)": 5.796503 }, { "acc": 0.75557709, "epoch": 0.21778319663748832, "grad_norm": 3.5625, "learning_rate": 9.905474993345372e-06, "loss": 0.97650366, "memory(GiB)": 698.87, "step": 8585, "train_speed(iter/s)": 5.554586 }, { "acc": 0.74899402, "epoch": 0.2179100360065259, "grad_norm": 3.671875, "learning_rate": 9.90527194330526e-06, "loss": 0.97532425, "memory(GiB)": 698.87, "step": 8590, "train_speed(iter/s)": 5.328337 }, { "acc": 0.75594845, "epoch": 0.21803687537556343, "grad_norm": 3.609375, "learning_rate": 9.905068677497868e-06, "loss": 0.91825504, "memory(GiB)": 698.87, "step": 8595, "train_speed(iter/s)": 5.127417 }, { "acc": 0.74050303, "epoch": 0.218163714744601, "grad_norm": 4.125, "learning_rate": 9.904865195932133e-06, "loss": 0.96310682, "memory(GiB)": 698.87, "step": 8600, "train_speed(iter/s)": 4.93997 }, { "acc": 0.74993138, "epoch": 0.21829055411363857, "grad_norm": 3.96875, "learning_rate": 9.904661498617003e-06, "loss": 0.95386515, "memory(GiB)": 698.87, "step": 8605, "train_speed(iter/s)": 4.761935 }, { "acc": 0.74109197, "epoch": 0.2184173934826761, "grad_norm": 4.03125, "learning_rate": 9.904457585561443e-06, "loss": 0.99795218, "memory(GiB)": 709.9, "step": 8610, "train_speed(iter/s)": 4.591165 }, { "acc": 0.7450294, "epoch": 0.21854423285171368, "grad_norm": 3.609375, "learning_rate": 9.904253456774417e-06, "loss": 1.0170557, "memory(GiB)": 709.9, "step": 8615, "train_speed(iter/s)": 4.439846 }, { "acc": 0.75317616, "epoch": 0.21867107222075124, "grad_norm": 3.28125, "learning_rate": 9.904049112264911e-06, "loss": 0.95989065, "memory(GiB)": 709.9, "step": 8620, "train_speed(iter/s)": 4.297984 }, { "acc": 0.75508862, "epoch": 0.21879791158978878, "grad_norm": 3.234375, "learning_rate": 9.903844552041908e-06, "loss": 0.9360425, "memory(GiB)": 709.91, "step": 8625, "train_speed(iter/s)": 4.164569 }, { "acc": 0.75259442, "epoch": 0.21892475095882635, "grad_norm": 3.71875, "learning_rate": 9.903639776114409e-06, "loss": 0.9206769, "memory(GiB)": 709.91, "step": 8630, "train_speed(iter/s)": 4.044768 }, { "acc": 0.74823537, "epoch": 0.21905159032786392, "grad_norm": 3.234375, "learning_rate": 9.90343478449142e-06, "loss": 0.93771219, "memory(GiB)": 722.93, "step": 8635, "train_speed(iter/s)": 3.905903 }, { "acc": 0.75685968, "epoch": 0.21917842969690146, "grad_norm": 3.359375, "learning_rate": 9.903229577181957e-06, "loss": 0.95620022, "memory(GiB)": 722.93, "step": 8640, "train_speed(iter/s)": 3.78766 }, { "acc": 0.73214765, "epoch": 0.21930526906593903, "grad_norm": 3.875, "learning_rate": 9.903024154195049e-06, "loss": 1.03000107, "memory(GiB)": 722.93, "step": 8645, "train_speed(iter/s)": 3.683832 }, { "acc": 0.76958938, "epoch": 0.2194321084349766, "grad_norm": 3.1875, "learning_rate": 9.90281851553973e-06, "loss": 0.89609556, "memory(GiB)": 722.93, "step": 8650, "train_speed(iter/s)": 3.589279 }, { "acc": 0.76234641, "epoch": 0.21955894780401414, "grad_norm": 3.625, "learning_rate": 9.902612661225044e-06, "loss": 0.904634, "memory(GiB)": 722.93, "step": 8655, "train_speed(iter/s)": 3.498131 }, { "acc": 0.76311588, "epoch": 0.2196857871730517, "grad_norm": 3.28125, "learning_rate": 9.90240659126005e-06, "loss": 0.91615734, "memory(GiB)": 722.93, "step": 8660, "train_speed(iter/s)": 3.410641 }, { "acc": 0.7618906, "epoch": 0.21981262654208927, "grad_norm": 3.34375, "learning_rate": 9.902200305653809e-06, "loss": 0.94792461, "memory(GiB)": 722.93, "step": 8665, "train_speed(iter/s)": 3.32531 }, { "acc": 0.74429712, "epoch": 0.2199394659111268, "grad_norm": 3.65625, "learning_rate": 9.901993804415396e-06, "loss": 1.00037804, "memory(GiB)": 722.93, "step": 8670, "train_speed(iter/s)": 3.2497 }, { "acc": 0.74851975, "epoch": 0.22006630528016438, "grad_norm": 3.875, "learning_rate": 9.901787087553894e-06, "loss": 0.95962505, "memory(GiB)": 722.93, "step": 8675, "train_speed(iter/s)": 3.171519 }, { "acc": 0.76350069, "epoch": 0.22019314464920195, "grad_norm": 3.234375, "learning_rate": 9.901580155078397e-06, "loss": 0.87991972, "memory(GiB)": 722.93, "step": 8680, "train_speed(iter/s)": 3.089727 }, { "acc": 0.75055761, "epoch": 0.2203199840182395, "grad_norm": 3.34375, "learning_rate": 9.901373006998006e-06, "loss": 1.00595798, "memory(GiB)": 722.93, "step": 8685, "train_speed(iter/s)": 3.023027 }, { "acc": 0.76683722, "epoch": 0.22044682338727706, "grad_norm": 4.25, "learning_rate": 9.901165643321832e-06, "loss": 0.92090101, "memory(GiB)": 722.93, "step": 8690, "train_speed(iter/s)": 2.94672 }, { "acc": 0.74236937, "epoch": 0.22057366275631463, "grad_norm": 3.203125, "learning_rate": 9.900958064059e-06, "loss": 1.02273569, "memory(GiB)": 722.93, "step": 8695, "train_speed(iter/s)": 2.879532 }, { "acc": 0.7552906, "epoch": 0.22070050212535217, "grad_norm": 5.375, "learning_rate": 9.900750269218636e-06, "loss": 0.98757877, "memory(GiB)": 722.93, "step": 8700, "train_speed(iter/s)": 2.817934 }, { "acc": 0.76812997, "epoch": 0.22082734149438973, "grad_norm": 3.796875, "learning_rate": 9.900542258809883e-06, "loss": 0.90321293, "memory(GiB)": 722.93, "step": 8705, "train_speed(iter/s)": 2.75632 }, { "acc": 0.75600066, "epoch": 0.2209541808634273, "grad_norm": 3.359375, "learning_rate": 9.90033403284189e-06, "loss": 0.98412294, "memory(GiB)": 722.93, "step": 8710, "train_speed(iter/s)": 2.701045 }, { "acc": 0.75571704, "epoch": 0.22108102023246484, "grad_norm": 3.890625, "learning_rate": 9.900125591323818e-06, "loss": 0.98207865, "memory(GiB)": 722.93, "step": 8715, "train_speed(iter/s)": 2.648703 }, { "acc": 0.74385982, "epoch": 0.2212078596015024, "grad_norm": 3.9375, "learning_rate": 9.899916934264832e-06, "loss": 1.02330914, "memory(GiB)": 722.93, "step": 8720, "train_speed(iter/s)": 2.598706 }, { "acc": 0.75844407, "epoch": 0.22133469897053998, "grad_norm": 3.015625, "learning_rate": 9.899708061674112e-06, "loss": 0.9168457, "memory(GiB)": 722.93, "step": 8725, "train_speed(iter/s)": 2.552236 }, { "acc": 0.76357379, "epoch": 0.22146153833957752, "grad_norm": 3.828125, "learning_rate": 9.899498973560845e-06, "loss": 0.94934845, "memory(GiB)": 722.93, "step": 8730, "train_speed(iter/s)": 2.508261 }, { "acc": 0.75039091, "epoch": 0.2215883777086151, "grad_norm": 3.5, "learning_rate": 9.899289669934232e-06, "loss": 0.99897957, "memory(GiB)": 722.93, "step": 8735, "train_speed(iter/s)": 2.458313 }, { "acc": 0.74942818, "epoch": 0.22171521707765265, "grad_norm": 3.5, "learning_rate": 9.899080150803474e-06, "loss": 0.93919621, "memory(GiB)": 722.93, "step": 8740, "train_speed(iter/s)": 2.414148 }, { "acc": 0.76600389, "epoch": 0.2218420564466902, "grad_norm": 3.234375, "learning_rate": 9.898870416177791e-06, "loss": 0.93634396, "memory(GiB)": 722.93, "step": 8745, "train_speed(iter/s)": 2.372307 }, { "acc": 0.75152216, "epoch": 0.22196889581572776, "grad_norm": 4.15625, "learning_rate": 9.898660466066406e-06, "loss": 0.99190407, "memory(GiB)": 722.93, "step": 8750, "train_speed(iter/s)": 2.332653 }, { "acc": 0.75540361, "epoch": 0.22209573518476533, "grad_norm": 4.40625, "learning_rate": 9.898450300478557e-06, "loss": 0.98315868, "memory(GiB)": 722.93, "step": 8755, "train_speed(iter/s)": 2.294001 }, { "acc": 0.75072918, "epoch": 0.22222257455380287, "grad_norm": 3.390625, "learning_rate": 9.898239919423484e-06, "loss": 0.97209387, "memory(GiB)": 722.93, "step": 8760, "train_speed(iter/s)": 2.253044 }, { "acc": 0.73930583, "epoch": 0.22234941392284044, "grad_norm": 3.390625, "learning_rate": 9.898029322910445e-06, "loss": 0.97084608, "memory(GiB)": 722.93, "step": 8765, "train_speed(iter/s)": 2.216889 }, { "acc": 0.75061169, "epoch": 0.222476253291878, "grad_norm": 3.8125, "learning_rate": 9.897818510948703e-06, "loss": 1.0080411, "memory(GiB)": 722.93, "step": 8770, "train_speed(iter/s)": 2.182977 }, { "acc": 0.75104828, "epoch": 0.22260309266091555, "grad_norm": 3.921875, "learning_rate": 9.897607483547529e-06, "loss": 1.01808891, "memory(GiB)": 722.93, "step": 8775, "train_speed(iter/s)": 2.150775 }, { "acc": 0.75420747, "epoch": 0.22272993202995311, "grad_norm": 3.546875, "learning_rate": 9.897396240716206e-06, "loss": 0.93694, "memory(GiB)": 722.93, "step": 8780, "train_speed(iter/s)": 2.116784 }, { "acc": 0.76089644, "epoch": 0.22285677139899068, "grad_norm": 4.5, "learning_rate": 9.897184782464029e-06, "loss": 0.95530901, "memory(GiB)": 722.93, "step": 8785, "train_speed(iter/s)": 2.081552 }, { "acc": 0.74775777, "epoch": 0.22298361076802822, "grad_norm": 3.296875, "learning_rate": 9.896973108800294e-06, "loss": 0.97890663, "memory(GiB)": 722.93, "step": 8790, "train_speed(iter/s)": 2.050707 }, { "acc": 0.74998703, "epoch": 0.2231104501370658, "grad_norm": 3.671875, "learning_rate": 9.896761219734317e-06, "loss": 0.96600828, "memory(GiB)": 722.93, "step": 8795, "train_speed(iter/s)": 2.021221 }, { "acc": 0.75519428, "epoch": 0.22323728950610336, "grad_norm": 3.859375, "learning_rate": 9.896549115275414e-06, "loss": 0.96935034, "memory(GiB)": 722.93, "step": 8800, "train_speed(iter/s)": 1.993514 }, { "acc": 0.76167426, "epoch": 0.2233641288751409, "grad_norm": 4.0625, "learning_rate": 9.896336795432919e-06, "loss": 0.9649147, "memory(GiB)": 722.93, "step": 8805, "train_speed(iter/s)": 1.963801 }, { "acc": 0.76540632, "epoch": 0.22349096824417847, "grad_norm": 4.6875, "learning_rate": 9.896124260216167e-06, "loss": 0.94020834, "memory(GiB)": 722.93, "step": 8810, "train_speed(iter/s)": 1.93604 }, { "acc": 0.76080275, "epoch": 0.22361780761321604, "grad_norm": 3.359375, "learning_rate": 9.895911509634508e-06, "loss": 0.94243145, "memory(GiB)": 722.93, "step": 8815, "train_speed(iter/s)": 1.910373 }, { "acc": 0.74824252, "epoch": 0.22374464698225358, "grad_norm": 4.78125, "learning_rate": 9.895698543697303e-06, "loss": 0.97851315, "memory(GiB)": 722.93, "step": 8820, "train_speed(iter/s)": 1.884393 }, { "acc": 0.76589775, "epoch": 0.22387148635129114, "grad_norm": 3.296875, "learning_rate": 9.895485362413917e-06, "loss": 0.93145351, "memory(GiB)": 722.93, "step": 8825, "train_speed(iter/s)": 1.858653 }, { "acc": 0.76607161, "epoch": 0.2239983257203287, "grad_norm": 3.4375, "learning_rate": 9.89527196579373e-06, "loss": 0.86707897, "memory(GiB)": 722.93, "step": 8830, "train_speed(iter/s)": 1.831587 }, { "acc": 0.74184637, "epoch": 0.22412516508936625, "grad_norm": 3.8125, "learning_rate": 9.895058353846123e-06, "loss": 1.00271931, "memory(GiB)": 722.93, "step": 8835, "train_speed(iter/s)": 1.806077 }, { "acc": 0.74594755, "epoch": 0.22425200445840382, "grad_norm": 2.84375, "learning_rate": 9.894844526580498e-06, "loss": 0.97692728, "memory(GiB)": 722.93, "step": 8840, "train_speed(iter/s)": 1.77958 }, { "acc": 0.75953879, "epoch": 0.2243788438274414, "grad_norm": 3.234375, "learning_rate": 9.894630484006256e-06, "loss": 0.93702774, "memory(GiB)": 722.93, "step": 8845, "train_speed(iter/s)": 1.755658 }, { "acc": 0.75534282, "epoch": 0.22450568319647893, "grad_norm": 3.8125, "learning_rate": 9.894416226132816e-06, "loss": 0.95038319, "memory(GiB)": 722.93, "step": 8850, "train_speed(iter/s)": 1.732136 }, { "acc": 0.77929649, "epoch": 0.2246325225655165, "grad_norm": 3.875, "learning_rate": 9.894201752969602e-06, "loss": 0.88132496, "memory(GiB)": 722.93, "step": 8855, "train_speed(iter/s)": 1.711436 }, { "acc": 0.75207701, "epoch": 0.22475936193455406, "grad_norm": 4.25, "learning_rate": 9.893987064526043e-06, "loss": 0.93583927, "memory(GiB)": 722.93, "step": 8860, "train_speed(iter/s)": 1.690736 }, { "acc": 0.7585187, "epoch": 0.2248862013035916, "grad_norm": 3.390625, "learning_rate": 9.893772160811589e-06, "loss": 0.96948719, "memory(GiB)": 722.93, "step": 8865, "train_speed(iter/s)": 1.67048 }, { "acc": 0.75375476, "epoch": 0.22501304067262917, "grad_norm": 5.03125, "learning_rate": 9.893557041835688e-06, "loss": 0.99425936, "memory(GiB)": 722.93, "step": 8870, "train_speed(iter/s)": 1.650607 }, { "acc": 0.76451764, "epoch": 0.22513988004166674, "grad_norm": 3.046875, "learning_rate": 9.893341707607808e-06, "loss": 0.92241001, "memory(GiB)": 722.93, "step": 8875, "train_speed(iter/s)": 1.633061 }, { "acc": 0.76680627, "epoch": 0.22526671941070428, "grad_norm": 4.5, "learning_rate": 9.893126158137415e-06, "loss": 0.92329912, "memory(GiB)": 722.93, "step": 8880, "train_speed(iter/s)": 1.615147 }, { "acc": 0.75540047, "epoch": 0.22539355877974185, "grad_norm": 3.515625, "learning_rate": 9.892910393433994e-06, "loss": 0.96882067, "memory(GiB)": 722.93, "step": 8885, "train_speed(iter/s)": 1.594208 }, { "acc": 0.76458416, "epoch": 0.22552039814877942, "grad_norm": 3.484375, "learning_rate": 9.892694413507034e-06, "loss": 0.96409769, "memory(GiB)": 722.93, "step": 8890, "train_speed(iter/s)": 1.575577 }, { "acc": 0.7671114, "epoch": 0.22564723751781696, "grad_norm": 3.5625, "learning_rate": 9.892478218366034e-06, "loss": 0.93876562, "memory(GiB)": 722.93, "step": 8895, "train_speed(iter/s)": 1.556844 }, { "acc": 0.74667306, "epoch": 0.22577407688685452, "grad_norm": 4.28125, "learning_rate": 9.892261808020507e-06, "loss": 0.95504961, "memory(GiB)": 722.93, "step": 8900, "train_speed(iter/s)": 1.538323 }, { "acc": 0.7647398, "epoch": 0.2259009162558921, "grad_norm": 4.21875, "learning_rate": 9.892045182479971e-06, "loss": 0.92596178, "memory(GiB)": 722.93, "step": 8905, "train_speed(iter/s)": 1.519953 }, { "acc": 0.75570593, "epoch": 0.22602775562492963, "grad_norm": 4.0, "learning_rate": 9.891828341753954e-06, "loss": 0.96732101, "memory(GiB)": 722.93, "step": 8910, "train_speed(iter/s)": 1.503157 }, { "acc": 0.74280109, "epoch": 0.2261545949939672, "grad_norm": 3.328125, "learning_rate": 9.891611285851997e-06, "loss": 1.0059556, "memory(GiB)": 722.93, "step": 8915, "train_speed(iter/s)": 1.485613 }, { "acc": 0.75220089, "epoch": 0.22628143436300477, "grad_norm": 3.890625, "learning_rate": 9.891394014783642e-06, "loss": 0.95760603, "memory(GiB)": 722.93, "step": 8920, "train_speed(iter/s)": 1.470261 }, { "acc": 0.75370917, "epoch": 0.2264082737320423, "grad_norm": 3.609375, "learning_rate": 9.891176528558451e-06, "loss": 1.02342787, "memory(GiB)": 722.93, "step": 8925, "train_speed(iter/s)": 1.45602 }, { "acc": 0.75638428, "epoch": 0.22653511310107988, "grad_norm": 3.65625, "learning_rate": 9.890958827185989e-06, "loss": 0.99515915, "memory(GiB)": 722.93, "step": 8930, "train_speed(iter/s)": 1.440251 }, { "acc": 0.75290918, "epoch": 0.22666195247011744, "grad_norm": 3.234375, "learning_rate": 9.89074091067583e-06, "loss": 0.97623892, "memory(GiB)": 722.93, "step": 8935, "train_speed(iter/s)": 1.425618 }, { "acc": 0.75485601, "epoch": 0.22678879183915499, "grad_norm": 3.765625, "learning_rate": 9.890522779037562e-06, "loss": 0.91798878, "memory(GiB)": 722.93, "step": 8940, "train_speed(iter/s)": 1.410477 }, { "acc": 0.75928445, "epoch": 0.22691563120819255, "grad_norm": 3.53125, "learning_rate": 9.890304432280778e-06, "loss": 0.94353485, "memory(GiB)": 722.93, "step": 8945, "train_speed(iter/s)": 1.397211 }, { "acc": 0.73858509, "epoch": 0.22704247057723012, "grad_norm": 3.484375, "learning_rate": 9.890085870415084e-06, "loss": 1.00670853, "memory(GiB)": 725.64, "step": 8950, "train_speed(iter/s)": 1.383018 }, { "acc": 0.75379772, "epoch": 0.22716930994626766, "grad_norm": 3.765625, "learning_rate": 9.889867093450094e-06, "loss": 0.96849594, "memory(GiB)": 725.64, "step": 8955, "train_speed(iter/s)": 1.368873 }, { "acc": 0.75420899, "epoch": 0.22729614931530523, "grad_norm": 4.3125, "learning_rate": 9.88964810139543e-06, "loss": 0.96078663, "memory(GiB)": 725.64, "step": 8960, "train_speed(iter/s)": 1.356862 }, { "acc": 0.75352945, "epoch": 0.2274229886843428, "grad_norm": 3.390625, "learning_rate": 9.889428894260727e-06, "loss": 0.93522472, "memory(GiB)": 725.64, "step": 8965, "train_speed(iter/s)": 1.344636 }, { "acc": 0.75590286, "epoch": 0.22754982805338034, "grad_norm": 3.40625, "learning_rate": 9.889209472055624e-06, "loss": 0.97111597, "memory(GiB)": 725.64, "step": 8970, "train_speed(iter/s)": 1.33106 }, { "acc": 0.74466181, "epoch": 0.2276766674224179, "grad_norm": 3.734375, "learning_rate": 9.888989834789775e-06, "loss": 1.01364412, "memory(GiB)": 725.64, "step": 8975, "train_speed(iter/s)": 1.318088 }, { "acc": 0.75846424, "epoch": 0.22780350679145547, "grad_norm": 3.609375, "learning_rate": 9.88876998247284e-06, "loss": 0.89570284, "memory(GiB)": 725.64, "step": 8980, "train_speed(iter/s)": 1.306296 }, { "acc": 0.75332799, "epoch": 0.227930346160493, "grad_norm": 4.1875, "learning_rate": 9.888549915114492e-06, "loss": 0.98063755, "memory(GiB)": 725.64, "step": 8985, "train_speed(iter/s)": 1.294447 }, { "acc": 0.74860129, "epoch": 0.22805718552953058, "grad_norm": 3.515625, "learning_rate": 9.888329632724405e-06, "loss": 0.97061644, "memory(GiB)": 725.64, "step": 8990, "train_speed(iter/s)": 1.281913 }, { "acc": 0.75830874, "epoch": 0.22818402489856815, "grad_norm": 3.4375, "learning_rate": 9.888109135312276e-06, "loss": 0.88901958, "memory(GiB)": 725.64, "step": 8995, "train_speed(iter/s)": 1.270844 }, { "acc": 0.75413299, "epoch": 0.2283108642676057, "grad_norm": 3.578125, "learning_rate": 9.887888422887798e-06, "loss": 0.96113024, "memory(GiB)": 725.64, "step": 9000, "train_speed(iter/s)": 1.260104 }, { "epoch": 0.2283108642676057, "eval_acc": 0.7453963086916071, "eval_loss": 0.9197199940681458, "eval_runtime": 1152.24, "eval_samples_per_second": 5.528, "eval_steps_per_second": 5.528, "step": 9000 }, { "acc": 0.75143375, "epoch": 0.22843770363664326, "grad_norm": 4.34375, "learning_rate": 9.887667495460686e-06, "loss": 0.96113472, "memory(GiB)": 725.64, "step": 9005, "train_speed(iter/s)": 0.998825 }, { "acc": 0.75853782, "epoch": 0.22856454300568083, "grad_norm": 3.546875, "learning_rate": 9.887446353040651e-06, "loss": 0.97348175, "memory(GiB)": 725.64, "step": 9010, "train_speed(iter/s)": 0.992161 }, { "acc": 0.74635191, "epoch": 0.22869138237471837, "grad_norm": 3.546875, "learning_rate": 9.887224995637424e-06, "loss": 0.98462353, "memory(GiB)": 725.64, "step": 9015, "train_speed(iter/s)": 0.984869 }, { "acc": 0.76049566, "epoch": 0.22881822174375593, "grad_norm": 3.515625, "learning_rate": 9.887003423260741e-06, "loss": 0.90797091, "memory(GiB)": 725.64, "step": 9020, "train_speed(iter/s)": 0.978072 }, { "acc": 0.75202231, "epoch": 0.2289450611127935, "grad_norm": 4.03125, "learning_rate": 9.886781635920349e-06, "loss": 0.91685123, "memory(GiB)": 725.64, "step": 9025, "train_speed(iter/s)": 0.971013 }, { "acc": 0.75260987, "epoch": 0.22907190048183104, "grad_norm": 3.421875, "learning_rate": 9.886559633626002e-06, "loss": 0.94911928, "memory(GiB)": 725.64, "step": 9030, "train_speed(iter/s)": 0.964246 }, { "acc": 0.76232128, "epoch": 0.2291987398508686, "grad_norm": 4.125, "learning_rate": 9.886337416387467e-06, "loss": 0.95066051, "memory(GiB)": 725.64, "step": 9035, "train_speed(iter/s)": 0.957849 }, { "acc": 0.7576601, "epoch": 0.22932557921990618, "grad_norm": 3.5625, "learning_rate": 9.886114984214517e-06, "loss": 0.94195538, "memory(GiB)": 725.64, "step": 9040, "train_speed(iter/s)": 0.951776 }, { "acc": 0.76017065, "epoch": 0.22945241858894372, "grad_norm": 3.453125, "learning_rate": 9.885892337116937e-06, "loss": 0.9354147, "memory(GiB)": 725.64, "step": 9045, "train_speed(iter/s)": 0.945595 }, { "acc": 0.76255078, "epoch": 0.2295792579579813, "grad_norm": 4.3125, "learning_rate": 9.885669475104522e-06, "loss": 0.99689817, "memory(GiB)": 725.64, "step": 9050, "train_speed(iter/s)": 0.939814 }, { "acc": 0.75931187, "epoch": 0.22970609732701885, "grad_norm": 3.75, "learning_rate": 9.885446398187073e-06, "loss": 0.89556656, "memory(GiB)": 725.64, "step": 9055, "train_speed(iter/s)": 0.934605 }, { "acc": 0.75152555, "epoch": 0.2298329366960564, "grad_norm": 3.9375, "learning_rate": 9.885223106374402e-06, "loss": 0.94068375, "memory(GiB)": 725.64, "step": 9060, "train_speed(iter/s)": 0.928896 }, { "acc": 0.75422044, "epoch": 0.22995977606509396, "grad_norm": 2.953125, "learning_rate": 9.884999599676331e-06, "loss": 0.92827625, "memory(GiB)": 725.64, "step": 9065, "train_speed(iter/s)": 0.922462 }, { "acc": 0.75978751, "epoch": 0.23008661543413153, "grad_norm": 3.8125, "learning_rate": 9.884775878102694e-06, "loss": 0.89530706, "memory(GiB)": 725.64, "step": 9070, "train_speed(iter/s)": 0.915744 }, { "acc": 0.7579258, "epoch": 0.23021345480316907, "grad_norm": 3.453125, "learning_rate": 9.88455194166333e-06, "loss": 0.9168354, "memory(GiB)": 725.64, "step": 9075, "train_speed(iter/s)": 0.909694 }, { "acc": 0.77167249, "epoch": 0.23034029417220664, "grad_norm": 3.40625, "learning_rate": 9.884327790368088e-06, "loss": 0.87756472, "memory(GiB)": 725.64, "step": 9080, "train_speed(iter/s)": 0.903915 }, { "acc": 0.75423598, "epoch": 0.2304671335412442, "grad_norm": 3.46875, "learning_rate": 9.88410342422683e-06, "loss": 0.95669966, "memory(GiB)": 725.64, "step": 9085, "train_speed(iter/s)": 0.89909 }, { "acc": 0.75303273, "epoch": 0.23059397291028175, "grad_norm": 3.453125, "learning_rate": 9.883878843249422e-06, "loss": 0.99434242, "memory(GiB)": 725.64, "step": 9090, "train_speed(iter/s)": 0.894146 }, { "acc": 0.76841507, "epoch": 0.23072081227931932, "grad_norm": 3.15625, "learning_rate": 9.883654047445748e-06, "loss": 0.87140884, "memory(GiB)": 725.64, "step": 9095, "train_speed(iter/s)": 0.888626 }, { "acc": 0.75315924, "epoch": 0.23084765164835688, "grad_norm": 3.359375, "learning_rate": 9.88342903682569e-06, "loss": 0.95910835, "memory(GiB)": 725.64, "step": 9100, "train_speed(iter/s)": 0.883494 }, { "acc": 0.77172766, "epoch": 0.23097449101739442, "grad_norm": 3.3125, "learning_rate": 9.883203811399149e-06, "loss": 0.89308615, "memory(GiB)": 725.64, "step": 9105, "train_speed(iter/s)": 0.8782 }, { "acc": 0.75633321, "epoch": 0.231101330386432, "grad_norm": 3.796875, "learning_rate": 9.882978371176031e-06, "loss": 0.96831741, "memory(GiB)": 725.64, "step": 9110, "train_speed(iter/s)": 0.873333 }, { "acc": 0.75689769, "epoch": 0.23122816975546956, "grad_norm": 3.515625, "learning_rate": 9.882752716166254e-06, "loss": 0.97885828, "memory(GiB)": 725.64, "step": 9115, "train_speed(iter/s)": 0.867966 }, { "acc": 0.75473089, "epoch": 0.2313550091245071, "grad_norm": 3.75, "learning_rate": 9.882526846379741e-06, "loss": 0.92667828, "memory(GiB)": 725.64, "step": 9120, "train_speed(iter/s)": 0.862667 }, { "acc": 0.75560293, "epoch": 0.23148184849354467, "grad_norm": 4.9375, "learning_rate": 9.882300761826429e-06, "loss": 0.92909317, "memory(GiB)": 725.64, "step": 9125, "train_speed(iter/s)": 0.858042 }, { "acc": 0.74882851, "epoch": 0.23160868786258224, "grad_norm": 3.078125, "learning_rate": 9.882074462516263e-06, "loss": 1.02265539, "memory(GiB)": 725.64, "step": 9130, "train_speed(iter/s)": 0.853382 }, { "acc": 0.75936246, "epoch": 0.23173552723161978, "grad_norm": 3.25, "learning_rate": 9.881847948459196e-06, "loss": 0.9411602, "memory(GiB)": 725.64, "step": 9135, "train_speed(iter/s)": 0.848417 }, { "acc": 0.7564909, "epoch": 0.23186236660065734, "grad_norm": 3.625, "learning_rate": 9.881621219665192e-06, "loss": 0.92903576, "memory(GiB)": 725.64, "step": 9140, "train_speed(iter/s)": 0.843536 }, { "acc": 0.73975773, "epoch": 0.2319892059696949, "grad_norm": 3.4375, "learning_rate": 9.881394276144225e-06, "loss": 1.018468, "memory(GiB)": 725.64, "step": 9145, "train_speed(iter/s)": 0.839029 }, { "acc": 0.75743027, "epoch": 0.23211604533873245, "grad_norm": 3.40625, "learning_rate": 9.881167117906276e-06, "loss": 0.94112377, "memory(GiB)": 725.64, "step": 9150, "train_speed(iter/s)": 0.834614 }, { "acc": 0.76782289, "epoch": 0.23224288470777002, "grad_norm": 3.0625, "learning_rate": 9.880939744961337e-06, "loss": 0.88700142, "memory(GiB)": 725.64, "step": 9155, "train_speed(iter/s)": 0.830189 }, { "acc": 0.73903327, "epoch": 0.2323697240768076, "grad_norm": 3.4375, "learning_rate": 9.880712157319412e-06, "loss": 1.02850294, "memory(GiB)": 725.64, "step": 9160, "train_speed(iter/s)": 0.82553 }, { "acc": 0.74587946, "epoch": 0.23249656344584513, "grad_norm": 3.515625, "learning_rate": 9.880484354990511e-06, "loss": 0.9540616, "memory(GiB)": 725.64, "step": 9165, "train_speed(iter/s)": 0.821007 }, { "acc": 0.75163469, "epoch": 0.2326234028148827, "grad_norm": 3.375, "learning_rate": 9.880256337984652e-06, "loss": 0.94113617, "memory(GiB)": 725.64, "step": 9170, "train_speed(iter/s)": 0.816416 }, { "acc": 0.74865217, "epoch": 0.23275024218392026, "grad_norm": 3.671875, "learning_rate": 9.880028106311866e-06, "loss": 0.96099796, "memory(GiB)": 725.64, "step": 9175, "train_speed(iter/s)": 0.811629 }, { "acc": 0.74674654, "epoch": 0.2328770815529578, "grad_norm": 4.46875, "learning_rate": 9.879799659982193e-06, "loss": 0.94593735, "memory(GiB)": 725.64, "step": 9180, "train_speed(iter/s)": 0.807314 }, { "acc": 0.75664568, "epoch": 0.23300392092199537, "grad_norm": 3.296875, "learning_rate": 9.87957099900568e-06, "loss": 0.95762444, "memory(GiB)": 725.64, "step": 9185, "train_speed(iter/s)": 0.803103 }, { "acc": 0.74685302, "epoch": 0.23313076029103294, "grad_norm": 3.515625, "learning_rate": 9.879342123392385e-06, "loss": 0.96823101, "memory(GiB)": 725.64, "step": 9190, "train_speed(iter/s)": 0.799146 }, { "acc": 0.75219951, "epoch": 0.23325759966007048, "grad_norm": 2.8125, "learning_rate": 9.87911303315238e-06, "loss": 0.94153194, "memory(GiB)": 725.64, "step": 9195, "train_speed(iter/s)": 0.79525 }, { "acc": 0.76744032, "epoch": 0.23338443902910805, "grad_norm": 3.21875, "learning_rate": 9.878883728295736e-06, "loss": 0.91340942, "memory(GiB)": 725.64, "step": 9200, "train_speed(iter/s)": 0.791033 }, { "acc": 0.76279097, "epoch": 0.23351127839814562, "grad_norm": 3.484375, "learning_rate": 9.878654208832543e-06, "loss": 0.93283558, "memory(GiB)": 725.64, "step": 9205, "train_speed(iter/s)": 0.786588 }, { "acc": 0.75902753, "epoch": 0.23363811776718316, "grad_norm": 4.4375, "learning_rate": 9.878424474772894e-06, "loss": 0.92217798, "memory(GiB)": 725.64, "step": 9210, "train_speed(iter/s)": 0.782993 }, { "acc": 0.74827976, "epoch": 0.23376495713622072, "grad_norm": 4.21875, "learning_rate": 9.878194526126899e-06, "loss": 0.98233538, "memory(GiB)": 725.64, "step": 9215, "train_speed(iter/s)": 0.779132 }, { "acc": 0.77071657, "epoch": 0.2338917965052583, "grad_norm": 3.78125, "learning_rate": 9.877964362904667e-06, "loss": 0.9173152, "memory(GiB)": 725.64, "step": 9220, "train_speed(iter/s)": 0.775321 }, { "acc": 0.74808173, "epoch": 0.23401863587429583, "grad_norm": 3.84375, "learning_rate": 9.877733985116325e-06, "loss": 0.96578035, "memory(GiB)": 725.64, "step": 9225, "train_speed(iter/s)": 0.770422 }, { "acc": 0.75438542, "epoch": 0.2341454752433334, "grad_norm": 3.765625, "learning_rate": 9.877503392772008e-06, "loss": 0.91406479, "memory(GiB)": 725.64, "step": 9230, "train_speed(iter/s)": 0.766844 }, { "acc": 0.76815801, "epoch": 0.23427231461237097, "grad_norm": 3.625, "learning_rate": 9.877272585881856e-06, "loss": 0.86340647, "memory(GiB)": 725.64, "step": 9235, "train_speed(iter/s)": 0.762745 }, { "acc": 0.75705285, "epoch": 0.2343991539814085, "grad_norm": 3.6875, "learning_rate": 9.877041564456023e-06, "loss": 0.89258709, "memory(GiB)": 725.64, "step": 9240, "train_speed(iter/s)": 0.758821 }, { "acc": 0.75520625, "epoch": 0.23452599335044608, "grad_norm": 3.890625, "learning_rate": 9.876810328504672e-06, "loss": 0.96995525, "memory(GiB)": 725.64, "step": 9245, "train_speed(iter/s)": 0.754528 }, { "acc": 0.75583429, "epoch": 0.23465283271948364, "grad_norm": 2.875, "learning_rate": 9.876578878037971e-06, "loss": 0.96339703, "memory(GiB)": 725.64, "step": 9250, "train_speed(iter/s)": 0.750674 }, { "acc": 0.75768967, "epoch": 0.23477967208852119, "grad_norm": 3.59375, "learning_rate": 9.876347213066104e-06, "loss": 0.98674498, "memory(GiB)": 725.64, "step": 9255, "train_speed(iter/s)": 0.746604 }, { "acc": 0.74808846, "epoch": 0.23490651145755875, "grad_norm": 4.28125, "learning_rate": 9.87611533359926e-06, "loss": 1.01132698, "memory(GiB)": 725.64, "step": 9260, "train_speed(iter/s)": 0.743017 }, { "acc": 0.75282764, "epoch": 0.23503335082659632, "grad_norm": 3.578125, "learning_rate": 9.875883239647638e-06, "loss": 1.00073338, "memory(GiB)": 725.64, "step": 9265, "train_speed(iter/s)": 0.739726 }, { "acc": 0.74938226, "epoch": 0.23516019019563386, "grad_norm": 3.125, "learning_rate": 9.87565093122145e-06, "loss": 0.94863443, "memory(GiB)": 725.64, "step": 9270, "train_speed(iter/s)": 0.735533 }, { "acc": 0.746527, "epoch": 0.23528702956467143, "grad_norm": 3.6875, "learning_rate": 9.87541840833091e-06, "loss": 0.95073681, "memory(GiB)": 725.64, "step": 9275, "train_speed(iter/s)": 0.732164 }, { "acc": 0.7722342, "epoch": 0.235413868933709, "grad_norm": 3.546875, "learning_rate": 9.87518567098625e-06, "loss": 0.93177605, "memory(GiB)": 725.64, "step": 9280, "train_speed(iter/s)": 0.728616 }, { "acc": 0.76375923, "epoch": 0.23554070830274654, "grad_norm": 3.875, "learning_rate": 9.874952719197706e-06, "loss": 0.90163612, "memory(GiB)": 725.64, "step": 9285, "train_speed(iter/s)": 0.725152 }, { "acc": 0.7465817, "epoch": 0.2356675476717841, "grad_norm": 3.5625, "learning_rate": 9.874719552975523e-06, "loss": 0.97739639, "memory(GiB)": 725.64, "step": 9290, "train_speed(iter/s)": 0.722216 }, { "acc": 0.75617623, "epoch": 0.23579438704082167, "grad_norm": 3.984375, "learning_rate": 9.874486172329958e-06, "loss": 0.9005331, "memory(GiB)": 725.64, "step": 9295, "train_speed(iter/s)": 0.718956 }, { "acc": 0.75837374, "epoch": 0.2359212264098592, "grad_norm": 3.390625, "learning_rate": 9.874252577271278e-06, "loss": 0.98159733, "memory(GiB)": 725.64, "step": 9300, "train_speed(iter/s)": 0.715515 }, { "acc": 0.7640132, "epoch": 0.23604806577889678, "grad_norm": 3.28125, "learning_rate": 9.874018767809759e-06, "loss": 0.88462782, "memory(GiB)": 725.64, "step": 9305, "train_speed(iter/s)": 0.712242 }, { "acc": 0.74749541, "epoch": 0.23617490514793435, "grad_norm": 3.640625, "learning_rate": 9.873784743955681e-06, "loss": 0.95759573, "memory(GiB)": 725.64, "step": 9310, "train_speed(iter/s)": 0.70865 }, { "acc": 0.75500779, "epoch": 0.2363017445169719, "grad_norm": 4.3125, "learning_rate": 9.873550505719344e-06, "loss": 0.93826199, "memory(GiB)": 725.64, "step": 9315, "train_speed(iter/s)": 0.705439 }, { "acc": 0.75495119, "epoch": 0.23642858388600946, "grad_norm": 3.921875, "learning_rate": 9.873316053111046e-06, "loss": 0.92849112, "memory(GiB)": 725.64, "step": 9320, "train_speed(iter/s)": 0.702108 }, { "acc": 0.75307388, "epoch": 0.23655542325504703, "grad_norm": 3.734375, "learning_rate": 9.873081386141104e-06, "loss": 0.96532402, "memory(GiB)": 725.64, "step": 9325, "train_speed(iter/s)": 0.698596 }, { "acc": 0.73986144, "epoch": 0.23668226262408457, "grad_norm": 3.375, "learning_rate": 9.872846504819838e-06, "loss": 1.00911989, "memory(GiB)": 725.64, "step": 9330, "train_speed(iter/s)": 0.69523 }, { "acc": 0.75837317, "epoch": 0.23680910199312213, "grad_norm": 3.546875, "learning_rate": 9.872611409157579e-06, "loss": 0.93429489, "memory(GiB)": 725.64, "step": 9335, "train_speed(iter/s)": 0.692167 }, { "acc": 0.7327724, "epoch": 0.2369359413621597, "grad_norm": 3.671875, "learning_rate": 9.87237609916467e-06, "loss": 1.01419058, "memory(GiB)": 725.64, "step": 9340, "train_speed(iter/s)": 0.688986 }, { "acc": 0.74743814, "epoch": 0.23706278073119724, "grad_norm": 3.5, "learning_rate": 9.87214057485146e-06, "loss": 0.98154001, "memory(GiB)": 725.64, "step": 9345, "train_speed(iter/s)": 0.685877 }, { "acc": 0.75320415, "epoch": 0.2371896201002348, "grad_norm": 3.765625, "learning_rate": 9.87190483622831e-06, "loss": 0.97353659, "memory(GiB)": 725.64, "step": 9350, "train_speed(iter/s)": 0.683183 }, { "acc": 0.75308084, "epoch": 0.23731645946927238, "grad_norm": 2.921875, "learning_rate": 9.87166888330559e-06, "loss": 0.94490261, "memory(GiB)": 725.64, "step": 9355, "train_speed(iter/s)": 0.680133 }, { "acc": 0.76555719, "epoch": 0.23744329883830992, "grad_norm": 4.03125, "learning_rate": 9.871432716093677e-06, "loss": 0.95327845, "memory(GiB)": 725.64, "step": 9360, "train_speed(iter/s)": 0.67725 }, { "acc": 0.75712814, "epoch": 0.2375701382073475, "grad_norm": 3.421875, "learning_rate": 9.87119633460296e-06, "loss": 0.95896015, "memory(GiB)": 725.64, "step": 9365, "train_speed(iter/s)": 0.674392 }, { "acc": 0.77636471, "epoch": 0.23769697757638505, "grad_norm": 4.21875, "learning_rate": 9.870959738843837e-06, "loss": 0.95288897, "memory(GiB)": 725.64, "step": 9370, "train_speed(iter/s)": 0.671757 }, { "acc": 0.76769266, "epoch": 0.2378238169454226, "grad_norm": 3.890625, "learning_rate": 9.870722928826716e-06, "loss": 0.92881212, "memory(GiB)": 725.64, "step": 9375, "train_speed(iter/s)": 0.66875 }, { "acc": 0.75489888, "epoch": 0.23795065631446016, "grad_norm": 4.21875, "learning_rate": 9.870485904562011e-06, "loss": 0.92549343, "memory(GiB)": 725.64, "step": 9380, "train_speed(iter/s)": 0.666079 }, { "acc": 0.75902057, "epoch": 0.23807749568349773, "grad_norm": 5.71875, "learning_rate": 9.87024866606015e-06, "loss": 0.88195095, "memory(GiB)": 725.64, "step": 9385, "train_speed(iter/s)": 0.66332 }, { "acc": 0.77578154, "epoch": 0.23820433505253527, "grad_norm": 4.03125, "learning_rate": 9.870011213331567e-06, "loss": 0.90000887, "memory(GiB)": 725.64, "step": 9390, "train_speed(iter/s)": 0.660617 }, { "acc": 0.76824541, "epoch": 0.23833117442157284, "grad_norm": 3.71875, "learning_rate": 9.86977354638671e-06, "loss": 0.88298187, "memory(GiB)": 725.64, "step": 9395, "train_speed(iter/s)": 0.657808 }, { "acc": 0.76469893, "epoch": 0.2384580137906104, "grad_norm": 3.015625, "learning_rate": 9.86953566523603e-06, "loss": 0.91971903, "memory(GiB)": 725.64, "step": 9400, "train_speed(iter/s)": 0.65471 }, { "acc": 0.74740977, "epoch": 0.23858485315964795, "grad_norm": 4.53125, "learning_rate": 9.869297569889988e-06, "loss": 1.00008345, "memory(GiB)": 725.64, "step": 9405, "train_speed(iter/s)": 0.651969 }, { "acc": 0.75538816, "epoch": 0.23871169252868552, "grad_norm": 3.265625, "learning_rate": 9.869059260359064e-06, "loss": 0.98680592, "memory(GiB)": 725.64, "step": 9410, "train_speed(iter/s)": 0.649229 }, { "acc": 0.75526872, "epoch": 0.23883853189772308, "grad_norm": 3.03125, "learning_rate": 9.868820736653735e-06, "loss": 0.9510828, "memory(GiB)": 725.64, "step": 9415, "train_speed(iter/s)": 0.646281 }, { "acc": 0.73786631, "epoch": 0.23896537126676062, "grad_norm": 3.421875, "learning_rate": 9.868581998784496e-06, "loss": 1.03495512, "memory(GiB)": 725.64, "step": 9420, "train_speed(iter/s)": 0.643703 }, { "acc": 0.74685526, "epoch": 0.2390922106357982, "grad_norm": 3.640625, "learning_rate": 9.868343046761846e-06, "loss": 1.00433092, "memory(GiB)": 725.64, "step": 9425, "train_speed(iter/s)": 0.64122 }, { "acc": 0.75240378, "epoch": 0.23921905000483576, "grad_norm": 4.21875, "learning_rate": 9.868103880596299e-06, "loss": 0.95621099, "memory(GiB)": 725.64, "step": 9430, "train_speed(iter/s)": 0.638721 }, { "acc": 0.78228335, "epoch": 0.2393458893738733, "grad_norm": 3.859375, "learning_rate": 9.86786450029837e-06, "loss": 0.89327431, "memory(GiB)": 725.64, "step": 9435, "train_speed(iter/s)": 0.636422 }, { "acc": 0.76086597, "epoch": 0.23947272874291087, "grad_norm": 3.484375, "learning_rate": 9.867624905878594e-06, "loss": 0.93630753, "memory(GiB)": 725.64, "step": 9440, "train_speed(iter/s)": 0.63414 }, { "acc": 0.74681907, "epoch": 0.23959956811194844, "grad_norm": 3.296875, "learning_rate": 9.867385097347506e-06, "loss": 0.98030939, "memory(GiB)": 725.64, "step": 9445, "train_speed(iter/s)": 0.631798 }, { "acc": 0.75468006, "epoch": 0.23972640748098598, "grad_norm": 3.796875, "learning_rate": 9.867145074715655e-06, "loss": 0.94345703, "memory(GiB)": 725.64, "step": 9450, "train_speed(iter/s)": 0.629367 }, { "acc": 0.75604162, "epoch": 0.23985324685002354, "grad_norm": 3.40625, "learning_rate": 9.866904837993603e-06, "loss": 0.93358688, "memory(GiB)": 725.64, "step": 9455, "train_speed(iter/s)": 0.626855 }, { "acc": 0.75426149, "epoch": 0.2399800862190611, "grad_norm": 3.703125, "learning_rate": 9.866664387191913e-06, "loss": 0.92472048, "memory(GiB)": 725.64, "step": 9460, "train_speed(iter/s)": 0.624535 }, { "acc": 0.76894574, "epoch": 0.24010692558809865, "grad_norm": 3.390625, "learning_rate": 9.866423722321161e-06, "loss": 0.9051672, "memory(GiB)": 725.64, "step": 9465, "train_speed(iter/s)": 0.621936 }, { "acc": 0.76855655, "epoch": 0.24023376495713622, "grad_norm": 3.53125, "learning_rate": 9.866182843391937e-06, "loss": 0.89465122, "memory(GiB)": 725.64, "step": 9470, "train_speed(iter/s)": 0.619367 }, { "acc": 0.74940362, "epoch": 0.2403606043261738, "grad_norm": 4.25, "learning_rate": 9.865941750414833e-06, "loss": 0.95500908, "memory(GiB)": 725.64, "step": 9475, "train_speed(iter/s)": 0.61715 }, { "acc": 0.76247888, "epoch": 0.24048744369521133, "grad_norm": 4.0, "learning_rate": 9.865700443400456e-06, "loss": 0.95265408, "memory(GiB)": 725.64, "step": 9480, "train_speed(iter/s)": 0.615036 }, { "acc": 0.75788865, "epoch": 0.2406142830642489, "grad_norm": 3.90625, "learning_rate": 9.865458922359419e-06, "loss": 0.95931396, "memory(GiB)": 725.64, "step": 9485, "train_speed(iter/s)": 0.612448 }, { "acc": 0.77483082, "epoch": 0.24074112243328646, "grad_norm": 3.265625, "learning_rate": 9.865217187302347e-06, "loss": 0.89772005, "memory(GiB)": 725.64, "step": 9490, "train_speed(iter/s)": 0.610201 }, { "acc": 0.74846792, "epoch": 0.240867961802324, "grad_norm": 3.578125, "learning_rate": 9.864975238239871e-06, "loss": 0.99680748, "memory(GiB)": 725.64, "step": 9495, "train_speed(iter/s)": 0.607997 }, { "acc": 0.7568975, "epoch": 0.24099480117136157, "grad_norm": 3.515625, "learning_rate": 9.864733075182637e-06, "loss": 0.95156956, "memory(GiB)": 725.64, "step": 9500, "train_speed(iter/s)": 0.605556 }, { "epoch": 0.24099480117136157, "eval_acc": 0.7461173750677306, "eval_loss": 0.9163406491279602, "eval_runtime": 1149.2345, "eval_samples_per_second": 5.543, "eval_steps_per_second": 5.543, "step": 9500 }, { "acc": 0.75830774, "epoch": 0.24112164054039914, "grad_norm": 3.328125, "learning_rate": 9.864490698141294e-06, "loss": 0.93687963, "memory(GiB)": 725.64, "step": 9505, "train_speed(iter/s)": 0.541422 }, { "acc": 0.76644616, "epoch": 0.24124847990943668, "grad_norm": 3.859375, "learning_rate": 9.864248107126504e-06, "loss": 0.9181654, "memory(GiB)": 725.64, "step": 9510, "train_speed(iter/s)": 0.539673 }, { "acc": 0.74961295, "epoch": 0.24137531927847425, "grad_norm": 3.921875, "learning_rate": 9.864005302148939e-06, "loss": 0.97270842, "memory(GiB)": 725.64, "step": 9515, "train_speed(iter/s)": 0.538121 }, { "acc": 0.74838696, "epoch": 0.24150215864751182, "grad_norm": 3.421875, "learning_rate": 9.863762283219277e-06, "loss": 0.96978245, "memory(GiB)": 725.64, "step": 9520, "train_speed(iter/s)": 0.536516 }, { "acc": 0.77173533, "epoch": 0.24162899801654936, "grad_norm": 3.546875, "learning_rate": 9.863519050348211e-06, "loss": 0.8802722, "memory(GiB)": 725.64, "step": 9525, "train_speed(iter/s)": 0.534885 }, { "acc": 0.74988718, "epoch": 0.24175583738558692, "grad_norm": 5.03125, "learning_rate": 9.863275603546438e-06, "loss": 1.02002449, "memory(GiB)": 725.64, "step": 9530, "train_speed(iter/s)": 0.533297 }, { "acc": 0.75776205, "epoch": 0.2418826767546245, "grad_norm": 3.5625, "learning_rate": 9.863031942824666e-06, "loss": 0.94022388, "memory(GiB)": 725.64, "step": 9535, "train_speed(iter/s)": 0.531569 }, { "acc": 0.75577426, "epoch": 0.24200951612366203, "grad_norm": 3.484375, "learning_rate": 9.862788068193612e-06, "loss": 0.91213903, "memory(GiB)": 725.64, "step": 9540, "train_speed(iter/s)": 0.529789 }, { "acc": 0.76834111, "epoch": 0.2421363554926996, "grad_norm": 3.5625, "learning_rate": 9.862543979664007e-06, "loss": 0.91515799, "memory(GiB)": 725.64, "step": 9545, "train_speed(iter/s)": 0.528063 }, { "acc": 0.76314616, "epoch": 0.24226319486173717, "grad_norm": 3.4375, "learning_rate": 9.862299677246583e-06, "loss": 0.90561705, "memory(GiB)": 725.64, "step": 9550, "train_speed(iter/s)": 0.526264 }, { "acc": 0.75189357, "epoch": 0.2423900342307747, "grad_norm": 4.28125, "learning_rate": 9.862055160952091e-06, "loss": 0.97451687, "memory(GiB)": 725.64, "step": 9555, "train_speed(iter/s)": 0.52472 }, { "acc": 0.76228604, "epoch": 0.24251687359981228, "grad_norm": 3.5625, "learning_rate": 9.86181043079128e-06, "loss": 0.91260262, "memory(GiB)": 725.64, "step": 9560, "train_speed(iter/s)": 0.523272 }, { "acc": 0.75031204, "epoch": 0.24264371296884985, "grad_norm": 4.625, "learning_rate": 9.861565486774922e-06, "loss": 0.95672426, "memory(GiB)": 725.64, "step": 9565, "train_speed(iter/s)": 0.521862 }, { "acc": 0.75635352, "epoch": 0.24277055233788739, "grad_norm": 3.25, "learning_rate": 9.861320328913788e-06, "loss": 0.93554125, "memory(GiB)": 725.64, "step": 9570, "train_speed(iter/s)": 0.520079 }, { "acc": 0.7537106, "epoch": 0.24289739170692495, "grad_norm": 3.71875, "learning_rate": 9.86107495721866e-06, "loss": 0.98105888, "memory(GiB)": 725.64, "step": 9575, "train_speed(iter/s)": 0.51859 }, { "acc": 0.7639452, "epoch": 0.24302423107596252, "grad_norm": 2.96875, "learning_rate": 9.860829371700336e-06, "loss": 0.93254976, "memory(GiB)": 725.64, "step": 9580, "train_speed(iter/s)": 0.516928 }, { "acc": 0.75595355, "epoch": 0.24315107044500006, "grad_norm": 3.515625, "learning_rate": 9.860583572369614e-06, "loss": 0.96531715, "memory(GiB)": 725.64, "step": 9585, "train_speed(iter/s)": 0.515294 }, { "acc": 0.76105356, "epoch": 0.24327790981403763, "grad_norm": 3.90625, "learning_rate": 9.860337559237307e-06, "loss": 0.95480328, "memory(GiB)": 725.64, "step": 9590, "train_speed(iter/s)": 0.513886 }, { "acc": 0.75423002, "epoch": 0.2434047491830752, "grad_norm": 3.40625, "learning_rate": 9.860091332314237e-06, "loss": 0.98509884, "memory(GiB)": 725.64, "step": 9595, "train_speed(iter/s)": 0.512366 }, { "acc": 0.75892172, "epoch": 0.24353158855211274, "grad_norm": 2.8125, "learning_rate": 9.859844891611235e-06, "loss": 0.98403578, "memory(GiB)": 725.64, "step": 9600, "train_speed(iter/s)": 0.510846 }, { "acc": 0.76132674, "epoch": 0.2436584279211503, "grad_norm": 3.90625, "learning_rate": 9.85959823713914e-06, "loss": 0.99134769, "memory(GiB)": 725.64, "step": 9605, "train_speed(iter/s)": 0.509192 }, { "acc": 0.78098359, "epoch": 0.24378526729018787, "grad_norm": 3.75, "learning_rate": 9.859351368908802e-06, "loss": 0.8694047, "memory(GiB)": 725.64, "step": 9610, "train_speed(iter/s)": 0.507771 }, { "acc": 0.75656195, "epoch": 0.24391210665922541, "grad_norm": 3.5, "learning_rate": 9.85910428693108e-06, "loss": 0.91573467, "memory(GiB)": 725.64, "step": 9615, "train_speed(iter/s)": 0.5063 }, { "acc": 0.75957069, "epoch": 0.24403894602826298, "grad_norm": 3.828125, "learning_rate": 9.858856991216843e-06, "loss": 0.94594603, "memory(GiB)": 725.64, "step": 9620, "train_speed(iter/s)": 0.504904 }, { "acc": 0.7453444, "epoch": 0.24416578539730055, "grad_norm": 3.984375, "learning_rate": 9.858609481776969e-06, "loss": 0.98936939, "memory(GiB)": 725.64, "step": 9625, "train_speed(iter/s)": 0.503616 }, { "acc": 0.75013847, "epoch": 0.2442926247663381, "grad_norm": 3.8125, "learning_rate": 9.858361758622342e-06, "loss": 0.95050793, "memory(GiB)": 725.64, "step": 9630, "train_speed(iter/s)": 0.502255 }, { "acc": 0.75991039, "epoch": 0.24441946413537566, "grad_norm": 3.3125, "learning_rate": 9.858113821763863e-06, "loss": 0.93999567, "memory(GiB)": 725.64, "step": 9635, "train_speed(iter/s)": 0.50069 }, { "acc": 0.75460262, "epoch": 0.24454630350441323, "grad_norm": 3.953125, "learning_rate": 9.857865671212435e-06, "loss": 1.01307421, "memory(GiB)": 725.64, "step": 9640, "train_speed(iter/s)": 0.499401 }, { "acc": 0.75157909, "epoch": 0.24467314287345077, "grad_norm": 3.03125, "learning_rate": 9.857617306978975e-06, "loss": 0.9222086, "memory(GiB)": 725.64, "step": 9645, "train_speed(iter/s)": 0.497839 }, { "acc": 0.76951556, "epoch": 0.24479998224248833, "grad_norm": 3.734375, "learning_rate": 9.857368729074406e-06, "loss": 0.85178823, "memory(GiB)": 725.64, "step": 9650, "train_speed(iter/s)": 0.496281 }, { "acc": 0.7618845, "epoch": 0.2449268216115259, "grad_norm": 3.171875, "learning_rate": 9.857119937509662e-06, "loss": 0.97753096, "memory(GiB)": 725.64, "step": 9655, "train_speed(iter/s)": 0.494848 }, { "acc": 0.75437875, "epoch": 0.24505366098056344, "grad_norm": 3.765625, "learning_rate": 9.85687093229569e-06, "loss": 0.95256348, "memory(GiB)": 725.64, "step": 9660, "train_speed(iter/s)": 0.493112 }, { "acc": 0.76570268, "epoch": 0.245180500349601, "grad_norm": 3.46875, "learning_rate": 9.856621713443441e-06, "loss": 0.90984163, "memory(GiB)": 725.64, "step": 9665, "train_speed(iter/s)": 0.491546 }, { "acc": 0.75496464, "epoch": 0.24530733971863858, "grad_norm": 3.484375, "learning_rate": 9.856372280963876e-06, "loss": 0.95151758, "memory(GiB)": 725.64, "step": 9670, "train_speed(iter/s)": 0.490212 }, { "acc": 0.75613213, "epoch": 0.24543417908767612, "grad_norm": 3.03125, "learning_rate": 9.856122634867966e-06, "loss": 0.96124115, "memory(GiB)": 725.64, "step": 9675, "train_speed(iter/s)": 0.488878 }, { "acc": 0.75621414, "epoch": 0.2455610184567137, "grad_norm": 3.125, "learning_rate": 9.855872775166696e-06, "loss": 0.92500954, "memory(GiB)": 725.64, "step": 9680, "train_speed(iter/s)": 0.48767 }, { "acc": 0.76135106, "epoch": 0.24568785782575125, "grad_norm": 3.78125, "learning_rate": 9.855622701871054e-06, "loss": 0.9407546, "memory(GiB)": 725.64, "step": 9685, "train_speed(iter/s)": 0.486311 }, { "acc": 0.75703182, "epoch": 0.2458146971947888, "grad_norm": 3.46875, "learning_rate": 9.85537241499204e-06, "loss": 0.96614103, "memory(GiB)": 725.64, "step": 9690, "train_speed(iter/s)": 0.484878 }, { "acc": 0.75847998, "epoch": 0.24594153656382636, "grad_norm": 3.46875, "learning_rate": 9.855121914540664e-06, "loss": 0.93665428, "memory(GiB)": 725.64, "step": 9695, "train_speed(iter/s)": 0.483586 }, { "acc": 0.76203885, "epoch": 0.24606837593286393, "grad_norm": 5.375, "learning_rate": 9.854871200527944e-06, "loss": 0.99455233, "memory(GiB)": 725.64, "step": 9700, "train_speed(iter/s)": 0.482415 }, { "acc": 0.7463645, "epoch": 0.24619521530190147, "grad_norm": 4.625, "learning_rate": 9.854620272964907e-06, "loss": 1.01751223, "memory(GiB)": 725.64, "step": 9705, "train_speed(iter/s)": 0.481157 }, { "acc": 0.7474122, "epoch": 0.24632205467093904, "grad_norm": 3.734375, "learning_rate": 9.854369131862595e-06, "loss": 0.97924633, "memory(GiB)": 725.64, "step": 9710, "train_speed(iter/s)": 0.479935 }, { "acc": 0.75224838, "epoch": 0.2464488940399766, "grad_norm": 4.34375, "learning_rate": 9.85411777723205e-06, "loss": 0.97130833, "memory(GiB)": 725.64, "step": 9715, "train_speed(iter/s)": 0.47873 }, { "acc": 0.75553408, "epoch": 0.24657573340901415, "grad_norm": 4.15625, "learning_rate": 9.853866209084331e-06, "loss": 0.93139896, "memory(GiB)": 725.64, "step": 9720, "train_speed(iter/s)": 0.477478 }, { "acc": 0.75505643, "epoch": 0.24670257277805172, "grad_norm": 3.609375, "learning_rate": 9.853614427430501e-06, "loss": 1.00020294, "memory(GiB)": 725.65, "step": 9725, "train_speed(iter/s)": 0.476167 }, { "acc": 0.75823741, "epoch": 0.24682941214708928, "grad_norm": 3.25, "learning_rate": 9.853362432281639e-06, "loss": 0.93426523, "memory(GiB)": 725.65, "step": 9730, "train_speed(iter/s)": 0.474842 }, { "acc": 0.77503581, "epoch": 0.24695625151612682, "grad_norm": 3.453125, "learning_rate": 9.853110223648826e-06, "loss": 0.88234196, "memory(GiB)": 725.65, "step": 9735, "train_speed(iter/s)": 0.473268 }, { "acc": 0.76055341, "epoch": 0.2470830908851644, "grad_norm": 4.09375, "learning_rate": 9.852857801543157e-06, "loss": 0.97339935, "memory(GiB)": 725.65, "step": 9740, "train_speed(iter/s)": 0.472024 }, { "acc": 0.74972496, "epoch": 0.24720993025420196, "grad_norm": 2.921875, "learning_rate": 9.852605165975737e-06, "loss": 0.96044369, "memory(GiB)": 725.65, "step": 9745, "train_speed(iter/s)": 0.470726 }, { "acc": 0.76964769, "epoch": 0.2473367696232395, "grad_norm": 3.5625, "learning_rate": 9.852352316957677e-06, "loss": 0.93381586, "memory(GiB)": 725.65, "step": 9750, "train_speed(iter/s)": 0.469514 }, { "acc": 0.75509391, "epoch": 0.24746360899227707, "grad_norm": 3.453125, "learning_rate": 9.852099254500098e-06, "loss": 0.95430031, "memory(GiB)": 725.65, "step": 9755, "train_speed(iter/s)": 0.468232 }, { "acc": 0.77578692, "epoch": 0.24759044836131464, "grad_norm": 3.359375, "learning_rate": 9.851845978614131e-06, "loss": 0.90843115, "memory(GiB)": 725.65, "step": 9760, "train_speed(iter/s)": 0.467028 }, { "acc": 0.76493154, "epoch": 0.24771728773035218, "grad_norm": 4.15625, "learning_rate": 9.851592489310922e-06, "loss": 0.90427647, "memory(GiB)": 725.65, "step": 9765, "train_speed(iter/s)": 0.465914 }, { "acc": 0.76299629, "epoch": 0.24784412709938974, "grad_norm": 3.765625, "learning_rate": 9.851338786601616e-06, "loss": 0.94065971, "memory(GiB)": 725.65, "step": 9770, "train_speed(iter/s)": 0.46472 }, { "acc": 0.75070634, "epoch": 0.2479709664684273, "grad_norm": 2.921875, "learning_rate": 9.851084870497373e-06, "loss": 0.96149406, "memory(GiB)": 725.65, "step": 9775, "train_speed(iter/s)": 0.463501 }, { "acc": 0.76090765, "epoch": 0.24809780583746485, "grad_norm": 4.125, "learning_rate": 9.850830741009363e-06, "loss": 0.94778452, "memory(GiB)": 725.65, "step": 9780, "train_speed(iter/s)": 0.46235 }, { "acc": 0.76364431, "epoch": 0.24822464520650242, "grad_norm": 4.5, "learning_rate": 9.850576398148766e-06, "loss": 0.92817202, "memory(GiB)": 725.65, "step": 9785, "train_speed(iter/s)": 0.461197 }, { "acc": 0.74268889, "epoch": 0.24835148457554, "grad_norm": 3.3125, "learning_rate": 9.850321841926767e-06, "loss": 0.94740763, "memory(GiB)": 725.65, "step": 9790, "train_speed(iter/s)": 0.460031 }, { "acc": 0.75469513, "epoch": 0.24847832394457753, "grad_norm": 3.25, "learning_rate": 9.850067072354566e-06, "loss": 0.96257029, "memory(GiB)": 725.65, "step": 9795, "train_speed(iter/s)": 0.45893 }, { "acc": 0.76358376, "epoch": 0.2486051633136151, "grad_norm": 3.421875, "learning_rate": 9.849812089443365e-06, "loss": 0.96290016, "memory(GiB)": 725.65, "step": 9800, "train_speed(iter/s)": 0.45771 }, { "acc": 0.7510025, "epoch": 0.24873200268265266, "grad_norm": 4.15625, "learning_rate": 9.849556893204384e-06, "loss": 0.97774734, "memory(GiB)": 725.65, "step": 9805, "train_speed(iter/s)": 0.456461 }, { "acc": 0.75844936, "epoch": 0.2488588420516902, "grad_norm": 3.296875, "learning_rate": 9.849301483648847e-06, "loss": 0.95429468, "memory(GiB)": 725.65, "step": 9810, "train_speed(iter/s)": 0.455257 }, { "acc": 0.75643849, "epoch": 0.24898568142072777, "grad_norm": 3.875, "learning_rate": 9.84904586078799e-06, "loss": 0.94761953, "memory(GiB)": 725.65, "step": 9815, "train_speed(iter/s)": 0.454045 }, { "acc": 0.75846047, "epoch": 0.24911252078976534, "grad_norm": 3.234375, "learning_rate": 9.848790024633052e-06, "loss": 0.97447252, "memory(GiB)": 725.65, "step": 9820, "train_speed(iter/s)": 0.453029 }, { "acc": 0.76430836, "epoch": 0.24923936015880288, "grad_norm": 3.796875, "learning_rate": 9.848533975195293e-06, "loss": 0.96283369, "memory(GiB)": 725.65, "step": 9825, "train_speed(iter/s)": 0.451748 }, { "acc": 0.77743521, "epoch": 0.24936619952784045, "grad_norm": 3.53125, "learning_rate": 9.848277712485972e-06, "loss": 0.87934551, "memory(GiB)": 725.65, "step": 9830, "train_speed(iter/s)": 0.450549 }, { "acc": 0.75483274, "epoch": 0.24949303889687802, "grad_norm": 3.796875, "learning_rate": 9.848021236516361e-06, "loss": 0.95502081, "memory(GiB)": 725.65, "step": 9835, "train_speed(iter/s)": 0.449323 }, { "acc": 0.76303811, "epoch": 0.24961987826591556, "grad_norm": 4.28125, "learning_rate": 9.847764547297744e-06, "loss": 0.91951532, "memory(GiB)": 725.65, "step": 9840, "train_speed(iter/s)": 0.448194 }, { "acc": 0.75388293, "epoch": 0.24974671763495312, "grad_norm": 3.4375, "learning_rate": 9.84750764484141e-06, "loss": 0.96296349, "memory(GiB)": 725.65, "step": 9845, "train_speed(iter/s)": 0.4471 }, { "acc": 0.76039996, "epoch": 0.2498735570039907, "grad_norm": 3.34375, "learning_rate": 9.84725052915866e-06, "loss": 0.96554718, "memory(GiB)": 725.65, "step": 9850, "train_speed(iter/s)": 0.445877 }, { "acc": 0.73640981, "epoch": 0.25000039637302823, "grad_norm": 3.296875, "learning_rate": 9.846993200260803e-06, "loss": 0.98753271, "memory(GiB)": 725.65, "step": 9855, "train_speed(iter/s)": 0.44463 }, { "acc": 0.76293893, "epoch": 0.2501272357420658, "grad_norm": 3.34375, "learning_rate": 9.84673565815916e-06, "loss": 0.90208292, "memory(GiB)": 725.65, "step": 9860, "train_speed(iter/s)": 0.443521 }, { "acc": 0.74780674, "epoch": 0.25025407511110337, "grad_norm": 3.703125, "learning_rate": 9.846477902865055e-06, "loss": 0.98316231, "memory(GiB)": 725.65, "step": 9865, "train_speed(iter/s)": 0.442295 }, { "acc": 0.75512495, "epoch": 0.25038091448014094, "grad_norm": 3.109375, "learning_rate": 9.846219934389831e-06, "loss": 0.9246664, "memory(GiB)": 725.65, "step": 9870, "train_speed(iter/s)": 0.441235 }, { "acc": 0.74677582, "epoch": 0.2505077538491785, "grad_norm": 3.703125, "learning_rate": 9.845961752744833e-06, "loss": 1.0304225, "memory(GiB)": 725.65, "step": 9875, "train_speed(iter/s)": 0.440155 }, { "acc": 0.75553484, "epoch": 0.250634593218216, "grad_norm": 5.3125, "learning_rate": 9.845703357941417e-06, "loss": 0.96622801, "memory(GiB)": 725.65, "step": 9880, "train_speed(iter/s)": 0.438988 }, { "acc": 0.76229601, "epoch": 0.2507614325872536, "grad_norm": 3.359375, "learning_rate": 9.84544474999095e-06, "loss": 0.90798512, "memory(GiB)": 725.65, "step": 9885, "train_speed(iter/s)": 0.43787 }, { "acc": 0.76221533, "epoch": 0.25088827195629115, "grad_norm": 3.609375, "learning_rate": 9.845185928904806e-06, "loss": 0.92468081, "memory(GiB)": 725.65, "step": 9890, "train_speed(iter/s)": 0.43682 }, { "acc": 0.75258737, "epoch": 0.2510151113253287, "grad_norm": 3.25, "learning_rate": 9.844926894694374e-06, "loss": 0.93070221, "memory(GiB)": 725.65, "step": 9895, "train_speed(iter/s)": 0.435741 }, { "acc": 0.73670554, "epoch": 0.2511419506943663, "grad_norm": 3.359375, "learning_rate": 9.844667647371041e-06, "loss": 1.017694, "memory(GiB)": 725.65, "step": 9900, "train_speed(iter/s)": 0.43458 }, { "acc": 0.74972162, "epoch": 0.2512687900634038, "grad_norm": 3.3125, "learning_rate": 9.844408186946216e-06, "loss": 0.97069721, "memory(GiB)": 725.65, "step": 9905, "train_speed(iter/s)": 0.433395 }, { "acc": 0.75887961, "epoch": 0.25139562943244137, "grad_norm": 3.390625, "learning_rate": 9.84414851343131e-06, "loss": 0.91470156, "memory(GiB)": 725.65, "step": 9910, "train_speed(iter/s)": 0.432405 }, { "acc": 0.76457472, "epoch": 0.25152246880147894, "grad_norm": 3.71875, "learning_rate": 9.843888626837745e-06, "loss": 0.94082642, "memory(GiB)": 725.65, "step": 9915, "train_speed(iter/s)": 0.431352 }, { "acc": 0.75916371, "epoch": 0.2516493081705165, "grad_norm": 3.328125, "learning_rate": 9.843628527176954e-06, "loss": 0.94995127, "memory(GiB)": 725.65, "step": 9920, "train_speed(iter/s)": 0.43037 }, { "acc": 0.75879397, "epoch": 0.2517761475395541, "grad_norm": 3.703125, "learning_rate": 9.843368214460374e-06, "loss": 0.92467365, "memory(GiB)": 725.65, "step": 9925, "train_speed(iter/s)": 0.429381 }, { "acc": 0.76795549, "epoch": 0.25190298690859164, "grad_norm": 2.71875, "learning_rate": 9.84310768869946e-06, "loss": 0.88705111, "memory(GiB)": 725.65, "step": 9930, "train_speed(iter/s)": 0.428378 }, { "acc": 0.7588594, "epoch": 0.25202982627762915, "grad_norm": 3.96875, "learning_rate": 9.84284694990567e-06, "loss": 0.937918, "memory(GiB)": 725.65, "step": 9935, "train_speed(iter/s)": 0.427357 }, { "acc": 0.75044398, "epoch": 0.2521566656466667, "grad_norm": 4.21875, "learning_rate": 9.842585998090472e-06, "loss": 1.01130905, "memory(GiB)": 725.65, "step": 9940, "train_speed(iter/s)": 0.426373 }, { "acc": 0.76604137, "epoch": 0.2522835050157043, "grad_norm": 3.65625, "learning_rate": 9.842324833265348e-06, "loss": 0.90806723, "memory(GiB)": 725.65, "step": 9945, "train_speed(iter/s)": 0.42535 }, { "acc": 0.76795435, "epoch": 0.25241034438474186, "grad_norm": 3.96875, "learning_rate": 9.84206345544178e-06, "loss": 0.8800189, "memory(GiB)": 725.65, "step": 9950, "train_speed(iter/s)": 0.424372 }, { "acc": 0.75466757, "epoch": 0.2525371837537794, "grad_norm": 3.484375, "learning_rate": 9.84180186463127e-06, "loss": 0.96069279, "memory(GiB)": 725.65, "step": 9955, "train_speed(iter/s)": 0.423408 }, { "acc": 0.7606688, "epoch": 0.252664023122817, "grad_norm": 3.234375, "learning_rate": 9.841540060845323e-06, "loss": 0.9454277, "memory(GiB)": 725.65, "step": 9960, "train_speed(iter/s)": 0.422376 }, { "acc": 0.75588183, "epoch": 0.2527908624918545, "grad_norm": 3.9375, "learning_rate": 9.841278044095454e-06, "loss": 0.95712252, "memory(GiB)": 725.65, "step": 9965, "train_speed(iter/s)": 0.421427 }, { "acc": 0.76387682, "epoch": 0.2529177018608921, "grad_norm": 3.65625, "learning_rate": 9.841015814393189e-06, "loss": 0.90752058, "memory(GiB)": 725.65, "step": 9970, "train_speed(iter/s)": 0.420375 }, { "acc": 0.7583642, "epoch": 0.25304454122992964, "grad_norm": 3.484375, "learning_rate": 9.840753371750064e-06, "loss": 0.93238087, "memory(GiB)": 725.65, "step": 9975, "train_speed(iter/s)": 0.419536 }, { "acc": 0.75476913, "epoch": 0.2531713805989672, "grad_norm": 4.25, "learning_rate": 9.84049071617762e-06, "loss": 0.96089029, "memory(GiB)": 725.65, "step": 9980, "train_speed(iter/s)": 0.418643 }, { "acc": 0.75048037, "epoch": 0.2532982199680048, "grad_norm": 3.609375, "learning_rate": 9.840227847687414e-06, "loss": 0.93940496, "memory(GiB)": 725.65, "step": 9985, "train_speed(iter/s)": 0.417652 }, { "acc": 0.76046329, "epoch": 0.25342505933704235, "grad_norm": 3.0, "learning_rate": 9.839964766291006e-06, "loss": 0.93258152, "memory(GiB)": 725.65, "step": 9990, "train_speed(iter/s)": 0.416644 }, { "acc": 0.77813253, "epoch": 0.25355189870607986, "grad_norm": 3.15625, "learning_rate": 9.83970147199997e-06, "loss": 0.89718552, "memory(GiB)": 725.65, "step": 9995, "train_speed(iter/s)": 0.415719 }, { "acc": 0.76638446, "epoch": 0.2536787380751174, "grad_norm": 3.203125, "learning_rate": 9.839437964825884e-06, "loss": 0.90833445, "memory(GiB)": 725.65, "step": 10000, "train_speed(iter/s)": 0.414761 }, { "epoch": 0.2536787380751174, "eval_acc": 0.746709769112541, "eval_loss": 0.9142379760742188, "eval_runtime": 1151.1001, "eval_samples_per_second": 5.534, "eval_steps_per_second": 5.534, "step": 10000 }, { "acc": 0.75456343, "epoch": 0.253805577444155, "grad_norm": 3.515625, "learning_rate": 9.839174244780343e-06, "loss": 0.89657822, "memory(GiB)": 725.65, "step": 10005, "train_speed(iter/s)": 0.383745 }, { "acc": 0.76099577, "epoch": 0.25393241681319256, "grad_norm": 3.578125, "learning_rate": 9.838910311874945e-06, "loss": 0.90029669, "memory(GiB)": 725.65, "step": 10010, "train_speed(iter/s)": 0.382905 }, { "acc": 0.76183052, "epoch": 0.25405925618223013, "grad_norm": 3.515625, "learning_rate": 9.838646166121301e-06, "loss": 0.89279509, "memory(GiB)": 725.65, "step": 10015, "train_speed(iter/s)": 0.382154 }, { "acc": 0.76128011, "epoch": 0.2541860955512677, "grad_norm": 3.453125, "learning_rate": 9.838381807531027e-06, "loss": 0.93741789, "memory(GiB)": 725.65, "step": 10020, "train_speed(iter/s)": 0.38142 }, { "acc": 0.75016618, "epoch": 0.2543129349203052, "grad_norm": 4.0, "learning_rate": 9.838117236115757e-06, "loss": 0.96110525, "memory(GiB)": 725.65, "step": 10025, "train_speed(iter/s)": 0.38063 }, { "acc": 0.75898895, "epoch": 0.2544397742893428, "grad_norm": 3.390625, "learning_rate": 9.837852451887122e-06, "loss": 0.96353531, "memory(GiB)": 725.65, "step": 10030, "train_speed(iter/s)": 0.379893 }, { "acc": 0.75922513, "epoch": 0.25456661365838035, "grad_norm": 3.359375, "learning_rate": 9.837587454856772e-06, "loss": 0.89244375, "memory(GiB)": 725.65, "step": 10035, "train_speed(iter/s)": 0.378956 }, { "acc": 0.76163559, "epoch": 0.2546934530274179, "grad_norm": 4.125, "learning_rate": 9.837322245036366e-06, "loss": 0.93757057, "memory(GiB)": 725.65, "step": 10040, "train_speed(iter/s)": 0.378219 }, { "acc": 0.75543137, "epoch": 0.2548202923964555, "grad_norm": 3.484375, "learning_rate": 9.837056822437562e-06, "loss": 0.97284985, "memory(GiB)": 725.65, "step": 10045, "train_speed(iter/s)": 0.377485 }, { "acc": 0.7523025, "epoch": 0.25494713176549305, "grad_norm": 3.65625, "learning_rate": 9.836791187072045e-06, "loss": 0.9527957, "memory(GiB)": 725.65, "step": 10050, "train_speed(iter/s)": 0.376684 }, { "acc": 0.75685215, "epoch": 0.25507397113453056, "grad_norm": 3.4375, "learning_rate": 9.836525338951492e-06, "loss": 0.9310812, "memory(GiB)": 725.65, "step": 10055, "train_speed(iter/s)": 0.37599 }, { "acc": 0.7529511, "epoch": 0.25520081050356813, "grad_norm": 3.46875, "learning_rate": 9.8362592780876e-06, "loss": 0.94542971, "memory(GiB)": 725.65, "step": 10060, "train_speed(iter/s)": 0.375216 }, { "acc": 0.74850345, "epoch": 0.2553276498726057, "grad_norm": 3.484375, "learning_rate": 9.835993004492073e-06, "loss": 0.99299097, "memory(GiB)": 725.65, "step": 10065, "train_speed(iter/s)": 0.374385 }, { "acc": 0.75441437, "epoch": 0.25545448924164327, "grad_norm": 3.734375, "learning_rate": 9.835726518176621e-06, "loss": 0.91581659, "memory(GiB)": 725.65, "step": 10070, "train_speed(iter/s)": 0.37369 }, { "acc": 0.75673389, "epoch": 0.25558132861068084, "grad_norm": 3.25, "learning_rate": 9.835459819152967e-06, "loss": 0.94498138, "memory(GiB)": 725.65, "step": 10075, "train_speed(iter/s)": 0.372913 }, { "acc": 0.75178747, "epoch": 0.2557081679797184, "grad_norm": 3.453125, "learning_rate": 9.835192907432842e-06, "loss": 0.97077618, "memory(GiB)": 725.65, "step": 10080, "train_speed(iter/s)": 0.372151 }, { "acc": 0.74695687, "epoch": 0.2558350073487559, "grad_norm": 3.984375, "learning_rate": 9.834925783027988e-06, "loss": 0.93371248, "memory(GiB)": 725.65, "step": 10085, "train_speed(iter/s)": 0.371459 }, { "acc": 0.75995431, "epoch": 0.2559618467177935, "grad_norm": 3.71875, "learning_rate": 9.834658445950154e-06, "loss": 0.95230503, "memory(GiB)": 725.65, "step": 10090, "train_speed(iter/s)": 0.370778 }, { "acc": 0.75678949, "epoch": 0.25608868608683105, "grad_norm": 3.140625, "learning_rate": 9.834390896211097e-06, "loss": 0.93176012, "memory(GiB)": 725.65, "step": 10095, "train_speed(iter/s)": 0.37007 }, { "acc": 0.75518866, "epoch": 0.2562155254558686, "grad_norm": 3.796875, "learning_rate": 9.834123133822589e-06, "loss": 0.92428398, "memory(GiB)": 725.65, "step": 10100, "train_speed(iter/s)": 0.369329 }, { "acc": 0.76400361, "epoch": 0.2563423648249062, "grad_norm": 4.09375, "learning_rate": 9.833855158796407e-06, "loss": 0.91967688, "memory(GiB)": 725.65, "step": 10105, "train_speed(iter/s)": 0.368609 }, { "acc": 0.7701344, "epoch": 0.25646920419394376, "grad_norm": 3.578125, "learning_rate": 9.833586971144338e-06, "loss": 0.93293877, "memory(GiB)": 725.65, "step": 10110, "train_speed(iter/s)": 0.367971 }, { "acc": 0.7611279, "epoch": 0.25659604356298127, "grad_norm": 3.5625, "learning_rate": 9.833318570878179e-06, "loss": 0.95223455, "memory(GiB)": 725.65, "step": 10115, "train_speed(iter/s)": 0.367176 }, { "acc": 0.74725819, "epoch": 0.25672288293201884, "grad_norm": 3.78125, "learning_rate": 9.833049958009735e-06, "loss": 0.9789628, "memory(GiB)": 725.65, "step": 10120, "train_speed(iter/s)": 0.366424 }, { "acc": 0.75444126, "epoch": 0.2568497223010564, "grad_norm": 4.125, "learning_rate": 9.832781132550824e-06, "loss": 0.94320412, "memory(GiB)": 725.65, "step": 10125, "train_speed(iter/s)": 0.365674 }, { "acc": 0.76805019, "epoch": 0.256976561670094, "grad_norm": 3.84375, "learning_rate": 9.832512094513268e-06, "loss": 0.90172491, "memory(GiB)": 725.65, "step": 10130, "train_speed(iter/s)": 0.365007 }, { "acc": 0.76839762, "epoch": 0.25710340103913154, "grad_norm": 3.421875, "learning_rate": 9.832242843908904e-06, "loss": 0.9059536, "memory(GiB)": 725.65, "step": 10135, "train_speed(iter/s)": 0.364256 }, { "acc": 0.76023264, "epoch": 0.2572302404081691, "grad_norm": 4.0625, "learning_rate": 9.831973380749571e-06, "loss": 0.90989685, "memory(GiB)": 725.65, "step": 10140, "train_speed(iter/s)": 0.363599 }, { "acc": 0.76187673, "epoch": 0.2573570797772066, "grad_norm": 3.359375, "learning_rate": 9.831703705047126e-06, "loss": 0.91041842, "memory(GiB)": 725.65, "step": 10145, "train_speed(iter/s)": 0.362876 }, { "acc": 0.77849364, "epoch": 0.2574839191462442, "grad_norm": 3.75, "learning_rate": 9.83143381681343e-06, "loss": 0.84196596, "memory(GiB)": 725.65, "step": 10150, "train_speed(iter/s)": 0.362053 }, { "acc": 0.77387047, "epoch": 0.25761075851528176, "grad_norm": 2.875, "learning_rate": 9.831163716060355e-06, "loss": 0.91718845, "memory(GiB)": 725.65, "step": 10155, "train_speed(iter/s)": 0.361216 }, { "acc": 0.77559924, "epoch": 0.2577375978843193, "grad_norm": 3.625, "learning_rate": 9.83089340279978e-06, "loss": 0.91441813, "memory(GiB)": 725.65, "step": 10160, "train_speed(iter/s)": 0.360519 }, { "acc": 0.76750917, "epoch": 0.2578644372533569, "grad_norm": 3.234375, "learning_rate": 9.830622877043596e-06, "loss": 0.8997632, "memory(GiB)": 725.65, "step": 10165, "train_speed(iter/s)": 0.359823 }, { "acc": 0.76816092, "epoch": 0.25799127662239446, "grad_norm": 4.0, "learning_rate": 9.830352138803704e-06, "loss": 0.92539883, "memory(GiB)": 725.65, "step": 10170, "train_speed(iter/s)": 0.35921 }, { "acc": 0.75003376, "epoch": 0.258118115991432, "grad_norm": 3.171875, "learning_rate": 9.830081188092012e-06, "loss": 0.96734056, "memory(GiB)": 725.65, "step": 10175, "train_speed(iter/s)": 0.358584 }, { "acc": 0.75828414, "epoch": 0.25824495536046954, "grad_norm": 3.953125, "learning_rate": 9.829810024920437e-06, "loss": 0.93619595, "memory(GiB)": 725.65, "step": 10180, "train_speed(iter/s)": 0.357896 }, { "acc": 0.77307744, "epoch": 0.2583717947295071, "grad_norm": 3.40625, "learning_rate": 9.829538649300907e-06, "loss": 0.88514795, "memory(GiB)": 725.65, "step": 10185, "train_speed(iter/s)": 0.357232 }, { "acc": 0.74386592, "epoch": 0.2584986340985447, "grad_norm": 4.21875, "learning_rate": 9.829267061245362e-06, "loss": 1.00026388, "memory(GiB)": 725.65, "step": 10190, "train_speed(iter/s)": 0.356542 }, { "acc": 0.7507175, "epoch": 0.25862547346758225, "grad_norm": 4.0, "learning_rate": 9.828995260765745e-06, "loss": 0.94614305, "memory(GiB)": 725.65, "step": 10195, "train_speed(iter/s)": 0.355931 }, { "acc": 0.75170999, "epoch": 0.2587523128366198, "grad_norm": 2.953125, "learning_rate": 9.828723247874013e-06, "loss": 0.98201914, "memory(GiB)": 725.65, "step": 10200, "train_speed(iter/s)": 0.355333 }, { "acc": 0.74801216, "epoch": 0.2588791522056573, "grad_norm": 4.78125, "learning_rate": 9.828451022582129e-06, "loss": 0.98305979, "memory(GiB)": 725.65, "step": 10205, "train_speed(iter/s)": 0.354727 }, { "acc": 0.73998661, "epoch": 0.2590059915746949, "grad_norm": 3.765625, "learning_rate": 9.828178584902071e-06, "loss": 0.96892462, "memory(GiB)": 725.65, "step": 10210, "train_speed(iter/s)": 0.354094 }, { "acc": 0.76854134, "epoch": 0.25913283094373246, "grad_norm": 3.375, "learning_rate": 9.827905934845818e-06, "loss": 0.91610203, "memory(GiB)": 725.65, "step": 10215, "train_speed(iter/s)": 0.353446 }, { "acc": 0.76440911, "epoch": 0.25925967031277003, "grad_norm": 3.6875, "learning_rate": 9.827633072425367e-06, "loss": 0.91592007, "memory(GiB)": 739.27, "step": 10220, "train_speed(iter/s)": 0.352795 }, { "acc": 0.75217662, "epoch": 0.2593865096818076, "grad_norm": 4.21875, "learning_rate": 9.827359997652718e-06, "loss": 0.99215794, "memory(GiB)": 739.27, "step": 10225, "train_speed(iter/s)": 0.352132 }, { "acc": 0.74884338, "epoch": 0.25951334905084517, "grad_norm": 4.1875, "learning_rate": 9.827086710539886e-06, "loss": 0.99708672, "memory(GiB)": 739.27, "step": 10230, "train_speed(iter/s)": 0.351528 }, { "acc": 0.77280931, "epoch": 0.2596401884198827, "grad_norm": 3.90625, "learning_rate": 9.826813211098889e-06, "loss": 0.86041069, "memory(GiB)": 739.27, "step": 10235, "train_speed(iter/s)": 0.350897 }, { "acc": 0.7662992, "epoch": 0.25976702778892025, "grad_norm": 3.1875, "learning_rate": 9.826539499341757e-06, "loss": 0.87459526, "memory(GiB)": 739.27, "step": 10240, "train_speed(iter/s)": 0.350222 }, { "acc": 0.75499845, "epoch": 0.2598938671579578, "grad_norm": 3.84375, "learning_rate": 9.826265575280531e-06, "loss": 0.90154486, "memory(GiB)": 739.27, "step": 10245, "train_speed(iter/s)": 0.349545 }, { "acc": 0.75970888, "epoch": 0.2600207065269954, "grad_norm": 3.34375, "learning_rate": 9.82599143892726e-06, "loss": 0.96217537, "memory(GiB)": 739.27, "step": 10250, "train_speed(iter/s)": 0.348973 }, { "acc": 0.76162558, "epoch": 0.26014754589603295, "grad_norm": 3.765625, "learning_rate": 9.825717090294e-06, "loss": 0.95020132, "memory(GiB)": 739.27, "step": 10255, "train_speed(iter/s)": 0.348346 }, { "acc": 0.75693946, "epoch": 0.2602743852650705, "grad_norm": 3.515625, "learning_rate": 9.825442529392823e-06, "loss": 0.92784538, "memory(GiB)": 739.27, "step": 10260, "train_speed(iter/s)": 0.347726 }, { "acc": 0.75413651, "epoch": 0.26040122463410803, "grad_norm": 4.125, "learning_rate": 9.825167756235805e-06, "loss": 0.94456663, "memory(GiB)": 739.27, "step": 10265, "train_speed(iter/s)": 0.347132 }, { "acc": 0.74835815, "epoch": 0.2605280640031456, "grad_norm": 3.46875, "learning_rate": 9.82489277083503e-06, "loss": 0.98641224, "memory(GiB)": 739.27, "step": 10270, "train_speed(iter/s)": 0.346527 }, { "acc": 0.76502123, "epoch": 0.26065490337218317, "grad_norm": 4.03125, "learning_rate": 9.824617573202595e-06, "loss": 0.95417328, "memory(GiB)": 739.27, "step": 10275, "train_speed(iter/s)": 0.345953 }, { "acc": 0.74959993, "epoch": 0.26078174274122073, "grad_norm": 3.40625, "learning_rate": 9.824342163350604e-06, "loss": 0.91525116, "memory(GiB)": 739.27, "step": 10280, "train_speed(iter/s)": 0.345265 }, { "acc": 0.76358619, "epoch": 0.2609085821102583, "grad_norm": 3.828125, "learning_rate": 9.824066541291175e-06, "loss": 0.91386757, "memory(GiB)": 739.27, "step": 10285, "train_speed(iter/s)": 0.344673 }, { "acc": 0.7675272, "epoch": 0.26103542147929587, "grad_norm": 3.984375, "learning_rate": 9.823790707036428e-06, "loss": 0.9551302, "memory(GiB)": 739.27, "step": 10290, "train_speed(iter/s)": 0.344064 }, { "acc": 0.76247158, "epoch": 0.2611622608483334, "grad_norm": 4.625, "learning_rate": 9.823514660598497e-06, "loss": 0.97464857, "memory(GiB)": 739.27, "step": 10295, "train_speed(iter/s)": 0.343478 }, { "acc": 0.75536733, "epoch": 0.26128910021737095, "grad_norm": 3.40625, "learning_rate": 9.823238401989524e-06, "loss": 0.91848631, "memory(GiB)": 739.27, "step": 10300, "train_speed(iter/s)": 0.342888 }, { "acc": 0.75339575, "epoch": 0.2614159395864085, "grad_norm": 3.96875, "learning_rate": 9.822961931221663e-06, "loss": 0.94591646, "memory(GiB)": 739.27, "step": 10305, "train_speed(iter/s)": 0.342352 }, { "acc": 0.77337937, "epoch": 0.2615427789554461, "grad_norm": 3.328125, "learning_rate": 9.822685248307074e-06, "loss": 0.89312181, "memory(GiB)": 739.27, "step": 10310, "train_speed(iter/s)": 0.341719 }, { "acc": 0.75670686, "epoch": 0.26166961832448365, "grad_norm": 3.796875, "learning_rate": 9.822408353257924e-06, "loss": 0.94860992, "memory(GiB)": 739.27, "step": 10315, "train_speed(iter/s)": 0.341114 }, { "acc": 0.74723983, "epoch": 0.2617964576935212, "grad_norm": 3.34375, "learning_rate": 9.822131246086399e-06, "loss": 0.98127012, "memory(GiB)": 739.27, "step": 10320, "train_speed(iter/s)": 0.340465 }, { "acc": 0.75763273, "epoch": 0.26192329706255874, "grad_norm": 3.671875, "learning_rate": 9.821853926804684e-06, "loss": 0.93778419, "memory(GiB)": 739.27, "step": 10325, "train_speed(iter/s)": 0.339855 }, { "acc": 0.76736627, "epoch": 0.2620501364315963, "grad_norm": 3.453125, "learning_rate": 9.821576395424977e-06, "loss": 0.94875431, "memory(GiB)": 739.27, "step": 10330, "train_speed(iter/s)": 0.339216 }, { "acc": 0.75202837, "epoch": 0.26217697580063387, "grad_norm": 3.328125, "learning_rate": 9.821298651959487e-06, "loss": 0.96716433, "memory(GiB)": 739.27, "step": 10335, "train_speed(iter/s)": 0.338644 }, { "acc": 0.75515738, "epoch": 0.26230381516967144, "grad_norm": 3.71875, "learning_rate": 9.821020696420431e-06, "loss": 0.95930691, "memory(GiB)": 739.27, "step": 10340, "train_speed(iter/s)": 0.338043 }, { "acc": 0.7572742, "epoch": 0.262430654538709, "grad_norm": 3.71875, "learning_rate": 9.820742528820035e-06, "loss": 0.92723656, "memory(GiB)": 739.27, "step": 10345, "train_speed(iter/s)": 0.337433 }, { "acc": 0.75137873, "epoch": 0.2625574939077466, "grad_norm": 3.5, "learning_rate": 9.820464149170537e-06, "loss": 0.90797606, "memory(GiB)": 739.27, "step": 10350, "train_speed(iter/s)": 0.336722 }, { "acc": 0.76596413, "epoch": 0.2626843332767841, "grad_norm": 4.0, "learning_rate": 9.820185557484177e-06, "loss": 0.92928362, "memory(GiB)": 739.27, "step": 10355, "train_speed(iter/s)": 0.336139 }, { "acc": 0.75195975, "epoch": 0.26281117264582166, "grad_norm": 3.140625, "learning_rate": 9.819906753773214e-06, "loss": 0.92101622, "memory(GiB)": 739.27, "step": 10360, "train_speed(iter/s)": 0.3356 }, { "acc": 0.74485135, "epoch": 0.2629380120148592, "grad_norm": 5.6875, "learning_rate": 9.819627738049911e-06, "loss": 0.98608999, "memory(GiB)": 739.27, "step": 10365, "train_speed(iter/s)": 0.335014 }, { "acc": 0.75736094, "epoch": 0.2630648513838968, "grad_norm": 3.328125, "learning_rate": 9.819348510326538e-06, "loss": 0.89008598, "memory(GiB)": 739.27, "step": 10370, "train_speed(iter/s)": 0.334381 }, { "acc": 0.74541874, "epoch": 0.26319169075293436, "grad_norm": 3.8125, "learning_rate": 9.81906907061538e-06, "loss": 1.01412802, "memory(GiB)": 739.27, "step": 10375, "train_speed(iter/s)": 0.333851 }, { "acc": 0.74404049, "epoch": 0.2633185301219719, "grad_norm": 3.5625, "learning_rate": 9.81878941892873e-06, "loss": 1.00570936, "memory(GiB)": 739.27, "step": 10380, "train_speed(iter/s)": 0.333333 }, { "acc": 0.75877438, "epoch": 0.26344536949100944, "grad_norm": 3.15625, "learning_rate": 9.818509555278885e-06, "loss": 0.93515024, "memory(GiB)": 739.27, "step": 10385, "train_speed(iter/s)": 0.332787 }, { "acc": 0.75752649, "epoch": 0.263572208860047, "grad_norm": 3.453125, "learning_rate": 9.81822947967816e-06, "loss": 0.93600492, "memory(GiB)": 739.27, "step": 10390, "train_speed(iter/s)": 0.332291 }, { "acc": 0.75348053, "epoch": 0.2636990482290846, "grad_norm": 3.5, "learning_rate": 9.817949192138869e-06, "loss": 0.97208652, "memory(GiB)": 739.27, "step": 10395, "train_speed(iter/s)": 0.331763 }, { "acc": 0.75543036, "epoch": 0.26382588759812214, "grad_norm": 3.125, "learning_rate": 9.817668692673344e-06, "loss": 0.96873341, "memory(GiB)": 739.27, "step": 10400, "train_speed(iter/s)": 0.331226 }, { "acc": 0.74919124, "epoch": 0.2639527269671597, "grad_norm": 3.890625, "learning_rate": 9.817387981293924e-06, "loss": 1.02502203, "memory(GiB)": 739.27, "step": 10405, "train_speed(iter/s)": 0.330718 }, { "acc": 0.74558964, "epoch": 0.2640795663361973, "grad_norm": 3.390625, "learning_rate": 9.817107058012955e-06, "loss": 0.9783637, "memory(GiB)": 739.27, "step": 10410, "train_speed(iter/s)": 0.33013 }, { "acc": 0.76958323, "epoch": 0.2642064057052348, "grad_norm": 3.390625, "learning_rate": 9.816825922842796e-06, "loss": 0.87089777, "memory(GiB)": 739.27, "step": 10415, "train_speed(iter/s)": 0.329653 }, { "acc": 0.76073542, "epoch": 0.26433324507427236, "grad_norm": 3.9375, "learning_rate": 9.816544575795812e-06, "loss": 0.93182678, "memory(GiB)": 739.27, "step": 10420, "train_speed(iter/s)": 0.329064 }, { "acc": 0.75927515, "epoch": 0.26446008444330993, "grad_norm": 3.6875, "learning_rate": 9.816263016884379e-06, "loss": 0.93276224, "memory(GiB)": 739.27, "step": 10425, "train_speed(iter/s)": 0.328545 }, { "acc": 0.76041317, "epoch": 0.2645869238123475, "grad_norm": 3.796875, "learning_rate": 9.81598124612088e-06, "loss": 0.92413034, "memory(GiB)": 739.27, "step": 10430, "train_speed(iter/s)": 0.328004 }, { "acc": 0.75254488, "epoch": 0.26471376318138506, "grad_norm": 3.421875, "learning_rate": 9.815699263517712e-06, "loss": 1.00746183, "memory(GiB)": 739.27, "step": 10435, "train_speed(iter/s)": 0.327496 }, { "acc": 0.75142145, "epoch": 0.26484060255042263, "grad_norm": 3.015625, "learning_rate": 9.815417069087276e-06, "loss": 0.98633356, "memory(GiB)": 739.27, "step": 10440, "train_speed(iter/s)": 0.326921 }, { "acc": 0.76450701, "epoch": 0.26496744191946014, "grad_norm": 3.34375, "learning_rate": 9.815134662841987e-06, "loss": 0.90104733, "memory(GiB)": 739.27, "step": 10445, "train_speed(iter/s)": 0.326388 }, { "acc": 0.76225996, "epoch": 0.2650942812884977, "grad_norm": 3.53125, "learning_rate": 9.814852044794265e-06, "loss": 0.90472326, "memory(GiB)": 739.27, "step": 10450, "train_speed(iter/s)": 0.325836 }, { "acc": 0.75900984, "epoch": 0.2652211206575353, "grad_norm": 3.890625, "learning_rate": 9.814569214956545e-06, "loss": 0.89215918, "memory(GiB)": 739.27, "step": 10455, "train_speed(iter/s)": 0.325334 }, { "acc": 0.75883675, "epoch": 0.26534796002657285, "grad_norm": 3.671875, "learning_rate": 9.814286173341263e-06, "loss": 0.9406703, "memory(GiB)": 739.27, "step": 10460, "train_speed(iter/s)": 0.324818 }, { "acc": 0.74088755, "epoch": 0.2654747993956104, "grad_norm": 3.296875, "learning_rate": 9.814002919960873e-06, "loss": 0.98304129, "memory(GiB)": 739.27, "step": 10465, "train_speed(iter/s)": 0.324335 }, { "acc": 0.75075679, "epoch": 0.265601638764648, "grad_norm": 3.75, "learning_rate": 9.81371945482783e-06, "loss": 0.93093071, "memory(GiB)": 739.27, "step": 10470, "train_speed(iter/s)": 0.323715 }, { "acc": 0.74512067, "epoch": 0.2657284781336855, "grad_norm": 3.828125, "learning_rate": 9.813435777954607e-06, "loss": 0.9764823, "memory(GiB)": 739.27, "step": 10475, "train_speed(iter/s)": 0.32324 }, { "acc": 0.75364952, "epoch": 0.26585531750272307, "grad_norm": 3.921875, "learning_rate": 9.813151889353682e-06, "loss": 0.93950281, "memory(GiB)": 739.27, "step": 10480, "train_speed(iter/s)": 0.322716 }, { "acc": 0.75809641, "epoch": 0.26598215687176063, "grad_norm": 3.859375, "learning_rate": 9.812867789037542e-06, "loss": 0.95455551, "memory(GiB)": 739.27, "step": 10485, "train_speed(iter/s)": 0.322155 }, { "acc": 0.74940681, "epoch": 0.2661089962407982, "grad_norm": 4.5625, "learning_rate": 9.81258347701868e-06, "loss": 0.99520922, "memory(GiB)": 739.27, "step": 10490, "train_speed(iter/s)": 0.321681 }, { "acc": 0.74760351, "epoch": 0.26623583560983577, "grad_norm": 3.1875, "learning_rate": 9.812298953309606e-06, "loss": 1.01061926, "memory(GiB)": 739.27, "step": 10495, "train_speed(iter/s)": 0.321179 }, { "acc": 0.75689144, "epoch": 0.26636267497887334, "grad_norm": 3.703125, "learning_rate": 9.812014217922834e-06, "loss": 0.99006319, "memory(GiB)": 739.27, "step": 10500, "train_speed(iter/s)": 0.320686 }, { "epoch": 0.26636267497887334, "eval_acc": 0.7469604294982155, "eval_loss": 0.9123309850692749, "eval_runtime": 1153.964, "eval_samples_per_second": 5.52, "eval_steps_per_second": 5.52, "step": 10500 }, { "acc": 0.75727386, "epoch": 0.26648951434791085, "grad_norm": 3.765625, "learning_rate": 9.811729270870888e-06, "loss": 0.9456543, "memory(GiB)": 739.27, "step": 10505, "train_speed(iter/s)": 0.302752 }, { "acc": 0.76474271, "epoch": 0.2666163537169484, "grad_norm": 3.046875, "learning_rate": 9.811444112166301e-06, "loss": 0.93404999, "memory(GiB)": 739.27, "step": 10510, "train_speed(iter/s)": 0.302259 }, { "acc": 0.74897895, "epoch": 0.266743193085986, "grad_norm": 3.515625, "learning_rate": 9.81115874182162e-06, "loss": 0.95728579, "memory(GiB)": 739.27, "step": 10515, "train_speed(iter/s)": 0.301812 }, { "acc": 0.75298295, "epoch": 0.26687003245502355, "grad_norm": 3.71875, "learning_rate": 9.810873159849394e-06, "loss": 0.95541697, "memory(GiB)": 739.27, "step": 10520, "train_speed(iter/s)": 0.301339 }, { "acc": 0.76424246, "epoch": 0.2669968718240611, "grad_norm": 3.359375, "learning_rate": 9.810587366262188e-06, "loss": 0.87505398, "memory(GiB)": 739.27, "step": 10525, "train_speed(iter/s)": 0.300884 }, { "acc": 0.75497055, "epoch": 0.2671237111930987, "grad_norm": 3.609375, "learning_rate": 9.810301361072569e-06, "loss": 0.96433458, "memory(GiB)": 739.27, "step": 10530, "train_speed(iter/s)": 0.300441 }, { "acc": 0.76924171, "epoch": 0.2672505505621362, "grad_norm": 3.578125, "learning_rate": 9.810015144293122e-06, "loss": 0.92902775, "memory(GiB)": 739.27, "step": 10535, "train_speed(iter/s)": 0.300008 }, { "acc": 0.72730117, "epoch": 0.26737738993117377, "grad_norm": 4.625, "learning_rate": 9.809728715936433e-06, "loss": 1.04589157, "memory(GiB)": 739.27, "step": 10540, "train_speed(iter/s)": 0.299556 }, { "acc": 0.75815544, "epoch": 0.26750422930021134, "grad_norm": 3.5625, "learning_rate": 9.809442076015103e-06, "loss": 0.93697424, "memory(GiB)": 739.27, "step": 10545, "train_speed(iter/s)": 0.299137 }, { "acc": 0.75298567, "epoch": 0.2676310686692489, "grad_norm": 3.3125, "learning_rate": 9.809155224541739e-06, "loss": 0.97217712, "memory(GiB)": 739.27, "step": 10550, "train_speed(iter/s)": 0.298716 }, { "acc": 0.77458286, "epoch": 0.2677579080382865, "grad_norm": 3.890625, "learning_rate": 9.808868161528959e-06, "loss": 0.92996712, "memory(GiB)": 739.27, "step": 10555, "train_speed(iter/s)": 0.298285 }, { "acc": 0.75547891, "epoch": 0.26788474740732404, "grad_norm": 5.3125, "learning_rate": 9.808580886989392e-06, "loss": 0.95863714, "memory(GiB)": 739.27, "step": 10560, "train_speed(iter/s)": 0.297845 }, { "acc": 0.75215507, "epoch": 0.26801158677636155, "grad_norm": 3.625, "learning_rate": 9.808293400935673e-06, "loss": 0.97073841, "memory(GiB)": 739.27, "step": 10565, "train_speed(iter/s)": 0.297377 }, { "acc": 0.76175318, "epoch": 0.2681384261453991, "grad_norm": 4.21875, "learning_rate": 9.808005703380447e-06, "loss": 0.95764561, "memory(GiB)": 739.27, "step": 10570, "train_speed(iter/s)": 0.296992 }, { "acc": 0.76214242, "epoch": 0.2682652655144367, "grad_norm": 3.703125, "learning_rate": 9.807717794336369e-06, "loss": 0.91137877, "memory(GiB)": 739.27, "step": 10575, "train_speed(iter/s)": 0.296587 }, { "acc": 0.75532842, "epoch": 0.26839210488347426, "grad_norm": 3.8125, "learning_rate": 9.807429673816105e-06, "loss": 0.94314709, "memory(GiB)": 739.27, "step": 10580, "train_speed(iter/s)": 0.296182 }, { "acc": 0.74958153, "epoch": 0.2685189442525118, "grad_norm": 3.921875, "learning_rate": 9.807141341832325e-06, "loss": 0.97599945, "memory(GiB)": 739.27, "step": 10585, "train_speed(iter/s)": 0.295772 }, { "acc": 0.75251083, "epoch": 0.2686457836215494, "grad_norm": 3.4375, "learning_rate": 9.806852798397714e-06, "loss": 0.98892117, "memory(GiB)": 739.27, "step": 10590, "train_speed(iter/s)": 0.29536 }, { "acc": 0.75521011, "epoch": 0.2687726229905869, "grad_norm": 3.390625, "learning_rate": 9.806564043524966e-06, "loss": 0.9669776, "memory(GiB)": 739.27, "step": 10595, "train_speed(iter/s)": 0.294944 }, { "acc": 0.75940175, "epoch": 0.2688994623596245, "grad_norm": 4.09375, "learning_rate": 9.806275077226778e-06, "loss": 0.9397419, "memory(GiB)": 739.27, "step": 10600, "train_speed(iter/s)": 0.294553 }, { "acc": 0.7676343, "epoch": 0.26902630172866204, "grad_norm": 3.671875, "learning_rate": 9.805985899515864e-06, "loss": 0.90158033, "memory(GiB)": 739.27, "step": 10605, "train_speed(iter/s)": 0.294183 }, { "acc": 0.74774113, "epoch": 0.2691531410976996, "grad_norm": 3.46875, "learning_rate": 9.805696510404945e-06, "loss": 0.94891415, "memory(GiB)": 739.27, "step": 10610, "train_speed(iter/s)": 0.293754 }, { "acc": 0.75249352, "epoch": 0.2692799804667372, "grad_norm": 3.921875, "learning_rate": 9.805406909906745e-06, "loss": 0.94496307, "memory(GiB)": 739.27, "step": 10615, "train_speed(iter/s)": 0.293384 }, { "acc": 0.75154219, "epoch": 0.26940681983577475, "grad_norm": 3.484375, "learning_rate": 9.805117098034008e-06, "loss": 0.97207565, "memory(GiB)": 739.27, "step": 10620, "train_speed(iter/s)": 0.292981 }, { "acc": 0.74728632, "epoch": 0.26953365920481226, "grad_norm": 3.21875, "learning_rate": 9.804827074799479e-06, "loss": 0.9686223, "memory(GiB)": 739.27, "step": 10625, "train_speed(iter/s)": 0.292545 }, { "acc": 0.76673737, "epoch": 0.2696604985738498, "grad_norm": 3.53125, "learning_rate": 9.804536840215917e-06, "loss": 0.96513548, "memory(GiB)": 739.27, "step": 10630, "train_speed(iter/s)": 0.292098 }, { "acc": 0.75689888, "epoch": 0.2697873379428874, "grad_norm": 3.625, "learning_rate": 9.804246394296088e-06, "loss": 0.93769321, "memory(GiB)": 739.27, "step": 10635, "train_speed(iter/s)": 0.291687 }, { "acc": 0.75790501, "epoch": 0.26991417731192496, "grad_norm": 3.65625, "learning_rate": 9.803955737052766e-06, "loss": 0.91334295, "memory(GiB)": 739.27, "step": 10640, "train_speed(iter/s)": 0.291242 }, { "acc": 0.74893389, "epoch": 0.27004101668096253, "grad_norm": 3.578125, "learning_rate": 9.803664868498738e-06, "loss": 1.00288143, "memory(GiB)": 739.27, "step": 10645, "train_speed(iter/s)": 0.290847 }, { "acc": 0.75716066, "epoch": 0.2701678560500001, "grad_norm": 4.0625, "learning_rate": 9.803373788646798e-06, "loss": 0.98118439, "memory(GiB)": 739.27, "step": 10650, "train_speed(iter/s)": 0.290465 }, { "acc": 0.75448756, "epoch": 0.2702946954190376, "grad_norm": 4.21875, "learning_rate": 9.80308249750975e-06, "loss": 0.96654444, "memory(GiB)": 739.27, "step": 10655, "train_speed(iter/s)": 0.290141 }, { "acc": 0.76700063, "epoch": 0.2704215347880752, "grad_norm": 3.53125, "learning_rate": 9.802790995100405e-06, "loss": 0.91079054, "memory(GiB)": 739.27, "step": 10660, "train_speed(iter/s)": 0.289775 }, { "acc": 0.75589852, "epoch": 0.27054837415711275, "grad_norm": 4.625, "learning_rate": 9.802499281431588e-06, "loss": 0.93940468, "memory(GiB)": 739.27, "step": 10665, "train_speed(iter/s)": 0.2893 }, { "acc": 0.74588757, "epoch": 0.2706752135261503, "grad_norm": 2.75, "learning_rate": 9.802207356516128e-06, "loss": 0.993085, "memory(GiB)": 739.27, "step": 10670, "train_speed(iter/s)": 0.28893 }, { "acc": 0.75564513, "epoch": 0.2708020528951879, "grad_norm": 3.984375, "learning_rate": 9.80191522036687e-06, "loss": 0.93480549, "memory(GiB)": 739.27, "step": 10675, "train_speed(iter/s)": 0.288528 }, { "acc": 0.7584137, "epoch": 0.27092889226422545, "grad_norm": 4.0625, "learning_rate": 9.801622872996658e-06, "loss": 0.95455923, "memory(GiB)": 739.27, "step": 10680, "train_speed(iter/s)": 0.288118 }, { "acc": 0.76630974, "epoch": 0.27105573163326296, "grad_norm": 3.921875, "learning_rate": 9.801330314418356e-06, "loss": 0.94893026, "memory(GiB)": 739.27, "step": 10685, "train_speed(iter/s)": 0.287723 }, { "acc": 0.77443142, "epoch": 0.27118257100230053, "grad_norm": 4.09375, "learning_rate": 9.80103754464483e-06, "loss": 0.89420595, "memory(GiB)": 739.27, "step": 10690, "train_speed(iter/s)": 0.287354 }, { "acc": 0.74963751, "epoch": 0.2713094103713381, "grad_norm": 3.390625, "learning_rate": 9.800744563688964e-06, "loss": 0.93897142, "memory(GiB)": 739.27, "step": 10695, "train_speed(iter/s)": 0.28697 }, { "acc": 0.7559875, "epoch": 0.27143624974037567, "grad_norm": 3.625, "learning_rate": 9.800451371563638e-06, "loss": 0.94671545, "memory(GiB)": 739.27, "step": 10700, "train_speed(iter/s)": 0.286551 }, { "acc": 0.75456462, "epoch": 0.27156308910941324, "grad_norm": 3.90625, "learning_rate": 9.80015796828175e-06, "loss": 0.93368969, "memory(GiB)": 739.27, "step": 10705, "train_speed(iter/s)": 0.28616 }, { "acc": 0.76921082, "epoch": 0.2716899284784508, "grad_norm": 3.484375, "learning_rate": 9.799864353856207e-06, "loss": 0.90657253, "memory(GiB)": 739.27, "step": 10710, "train_speed(iter/s)": 0.285812 }, { "acc": 0.7631557, "epoch": 0.2718167678474883, "grad_norm": 3.984375, "learning_rate": 9.799570528299926e-06, "loss": 0.93864765, "memory(GiB)": 739.27, "step": 10715, "train_speed(iter/s)": 0.285454 }, { "acc": 0.76600614, "epoch": 0.2719436072165259, "grad_norm": 3.046875, "learning_rate": 9.799276491625831e-06, "loss": 0.90924902, "memory(GiB)": 739.27, "step": 10720, "train_speed(iter/s)": 0.285051 }, { "acc": 0.76435852, "epoch": 0.27207044658556345, "grad_norm": 3.59375, "learning_rate": 9.798982243846853e-06, "loss": 0.88290319, "memory(GiB)": 739.27, "step": 10725, "train_speed(iter/s)": 0.284655 }, { "acc": 0.75816302, "epoch": 0.272197285954601, "grad_norm": 3.625, "learning_rate": 9.798687784975935e-06, "loss": 0.95803118, "memory(GiB)": 739.27, "step": 10730, "train_speed(iter/s)": 0.284241 }, { "acc": 0.75191827, "epoch": 0.2723241253236386, "grad_norm": 3.671875, "learning_rate": 9.798393115026034e-06, "loss": 0.9625927, "memory(GiB)": 739.27, "step": 10735, "train_speed(iter/s)": 0.283799 }, { "acc": 0.76657648, "epoch": 0.27245096469267616, "grad_norm": 3.375, "learning_rate": 9.798098234010107e-06, "loss": 0.87425871, "memory(GiB)": 739.27, "step": 10740, "train_speed(iter/s)": 0.283399 }, { "acc": 0.76077304, "epoch": 0.27257780406171367, "grad_norm": 3.4375, "learning_rate": 9.797803141941125e-06, "loss": 0.8914259, "memory(GiB)": 739.27, "step": 10745, "train_speed(iter/s)": 0.283027 }, { "acc": 0.76994081, "epoch": 0.27270464343075124, "grad_norm": 3.5625, "learning_rate": 9.797507838832072e-06, "loss": 0.84405241, "memory(GiB)": 739.27, "step": 10750, "train_speed(iter/s)": 0.282664 }, { "acc": 0.76112766, "epoch": 0.2728314827997888, "grad_norm": 3.0625, "learning_rate": 9.797212324695934e-06, "loss": 0.92920637, "memory(GiB)": 739.27, "step": 10755, "train_speed(iter/s)": 0.282276 }, { "acc": 0.75494981, "epoch": 0.2729583221688264, "grad_norm": 3.15625, "learning_rate": 9.79691659954571e-06, "loss": 0.94099922, "memory(GiB)": 739.27, "step": 10760, "train_speed(iter/s)": 0.281953 }, { "acc": 0.76326275, "epoch": 0.27308516153786394, "grad_norm": 3.40625, "learning_rate": 9.796620663394409e-06, "loss": 0.92479496, "memory(GiB)": 739.27, "step": 10765, "train_speed(iter/s)": 0.281578 }, { "acc": 0.74388723, "epoch": 0.2732120009069015, "grad_norm": 3.890625, "learning_rate": 9.796324516255049e-06, "loss": 0.93401108, "memory(GiB)": 739.27, "step": 10770, "train_speed(iter/s)": 0.281212 }, { "acc": 0.76403937, "epoch": 0.273338840275939, "grad_norm": 3.984375, "learning_rate": 9.796028158140656e-06, "loss": 0.93635492, "memory(GiB)": 739.27, "step": 10775, "train_speed(iter/s)": 0.280847 }, { "acc": 0.76297231, "epoch": 0.2734656796449766, "grad_norm": 3.5625, "learning_rate": 9.795731589064264e-06, "loss": 0.9385375, "memory(GiB)": 739.27, "step": 10780, "train_speed(iter/s)": 0.28048 }, { "acc": 0.74168224, "epoch": 0.27359251901401416, "grad_norm": 3.234375, "learning_rate": 9.795434809038918e-06, "loss": 1.01726494, "memory(GiB)": 739.27, "step": 10785, "train_speed(iter/s)": 0.280099 }, { "acc": 0.75144262, "epoch": 0.2737193583830517, "grad_norm": 3.21875, "learning_rate": 9.795137818077677e-06, "loss": 0.95988979, "memory(GiB)": 739.27, "step": 10790, "train_speed(iter/s)": 0.279763 }, { "acc": 0.75025201, "epoch": 0.2738461977520893, "grad_norm": 3.71875, "learning_rate": 9.7948406161936e-06, "loss": 0.95623026, "memory(GiB)": 739.27, "step": 10795, "train_speed(iter/s)": 0.279441 }, { "acc": 0.75124636, "epoch": 0.27397303712112686, "grad_norm": 3.90625, "learning_rate": 9.794543203399762e-06, "loss": 0.92160931, "memory(GiB)": 739.27, "step": 10800, "train_speed(iter/s)": 0.279118 }, { "acc": 0.7510262, "epoch": 0.2740998764901644, "grad_norm": 3.140625, "learning_rate": 9.794245579709244e-06, "loss": 0.89907494, "memory(GiB)": 739.27, "step": 10805, "train_speed(iter/s)": 0.278776 }, { "acc": 0.75858822, "epoch": 0.27422671585920194, "grad_norm": 3.984375, "learning_rate": 9.79394774513514e-06, "loss": 0.97534294, "memory(GiB)": 739.27, "step": 10810, "train_speed(iter/s)": 0.278434 }, { "acc": 0.76700592, "epoch": 0.2743535552282395, "grad_norm": 3.1875, "learning_rate": 9.793649699690548e-06, "loss": 0.92563028, "memory(GiB)": 739.27, "step": 10815, "train_speed(iter/s)": 0.278066 }, { "acc": 0.75732112, "epoch": 0.2744803945972771, "grad_norm": 3.3125, "learning_rate": 9.79335144338858e-06, "loss": 0.94132957, "memory(GiB)": 739.27, "step": 10820, "train_speed(iter/s)": 0.277719 }, { "acc": 0.75634098, "epoch": 0.27460723396631465, "grad_norm": 3.8125, "learning_rate": 9.793052976242351e-06, "loss": 0.93372803, "memory(GiB)": 739.27, "step": 10825, "train_speed(iter/s)": 0.277324 }, { "acc": 0.76942692, "epoch": 0.2747340733353522, "grad_norm": 2.796875, "learning_rate": 9.792754298264996e-06, "loss": 0.88528614, "memory(GiB)": 739.27, "step": 10830, "train_speed(iter/s)": 0.276945 }, { "acc": 0.75643258, "epoch": 0.2748609127043897, "grad_norm": 3.40625, "learning_rate": 9.792455409469652e-06, "loss": 0.9467989, "memory(GiB)": 739.27, "step": 10835, "train_speed(iter/s)": 0.276616 }, { "acc": 0.76190548, "epoch": 0.2749877520734273, "grad_norm": 3.3125, "learning_rate": 9.792156309869462e-06, "loss": 0.92346983, "memory(GiB)": 739.27, "step": 10840, "train_speed(iter/s)": 0.276274 }, { "acc": 0.76648841, "epoch": 0.27511459144246486, "grad_norm": 3.171875, "learning_rate": 9.791856999477585e-06, "loss": 0.93466072, "memory(GiB)": 739.27, "step": 10845, "train_speed(iter/s)": 0.275937 }, { "acc": 0.75819283, "epoch": 0.27524143081150243, "grad_norm": 4.28125, "learning_rate": 9.791557478307186e-06, "loss": 0.97443695, "memory(GiB)": 739.27, "step": 10850, "train_speed(iter/s)": 0.275628 }, { "acc": 0.76006384, "epoch": 0.27536827018054, "grad_norm": 4.0625, "learning_rate": 9.791257746371441e-06, "loss": 0.93257113, "memory(GiB)": 739.27, "step": 10855, "train_speed(iter/s)": 0.275254 }, { "acc": 0.76071734, "epoch": 0.27549510954957757, "grad_norm": 3.71875, "learning_rate": 9.790957803683535e-06, "loss": 0.92267418, "memory(GiB)": 739.27, "step": 10860, "train_speed(iter/s)": 0.274937 }, { "acc": 0.7628521, "epoch": 0.2756219489186151, "grad_norm": 4.0, "learning_rate": 9.790657650256658e-06, "loss": 0.94869337, "memory(GiB)": 739.27, "step": 10865, "train_speed(iter/s)": 0.274615 }, { "acc": 0.75457959, "epoch": 0.27574878828765265, "grad_norm": 3.21875, "learning_rate": 9.790357286104015e-06, "loss": 0.99849367, "memory(GiB)": 739.27, "step": 10870, "train_speed(iter/s)": 0.274299 }, { "acc": 0.75251641, "epoch": 0.2758756276566902, "grad_norm": 2.90625, "learning_rate": 9.790056711238818e-06, "loss": 0.8971221, "memory(GiB)": 739.27, "step": 10875, "train_speed(iter/s)": 0.27392 }, { "acc": 0.75615034, "epoch": 0.2760024670257278, "grad_norm": 3.40625, "learning_rate": 9.789755925674289e-06, "loss": 0.91096563, "memory(GiB)": 739.27, "step": 10880, "train_speed(iter/s)": 0.27356 }, { "acc": 0.76182528, "epoch": 0.27612930639476535, "grad_norm": 3.765625, "learning_rate": 9.789454929423658e-06, "loss": 0.94627819, "memory(GiB)": 739.27, "step": 10885, "train_speed(iter/s)": 0.273193 }, { "acc": 0.77241116, "epoch": 0.2762561457638029, "grad_norm": 4.65625, "learning_rate": 9.789153722500163e-06, "loss": 0.92222681, "memory(GiB)": 739.27, "step": 10890, "train_speed(iter/s)": 0.272873 }, { "acc": 0.76848903, "epoch": 0.27638298513284043, "grad_norm": 3.640625, "learning_rate": 9.788852304917057e-06, "loss": 0.89529409, "memory(GiB)": 739.27, "step": 10895, "train_speed(iter/s)": 0.272515 }, { "acc": 0.75587125, "epoch": 0.276509824501878, "grad_norm": 3.53125, "learning_rate": 9.788550676687598e-06, "loss": 0.95443401, "memory(GiB)": 739.27, "step": 10900, "train_speed(iter/s)": 0.272214 }, { "acc": 0.76052461, "epoch": 0.27663666387091557, "grad_norm": 4.09375, "learning_rate": 9.788248837825049e-06, "loss": 0.98829517, "memory(GiB)": 739.27, "step": 10905, "train_speed(iter/s)": 0.271897 }, { "acc": 0.74542589, "epoch": 0.27676350323995313, "grad_norm": 3.875, "learning_rate": 9.78794678834269e-06, "loss": 0.98473873, "memory(GiB)": 739.27, "step": 10910, "train_speed(iter/s)": 0.271564 }, { "acc": 0.76311107, "epoch": 0.2768903426089907, "grad_norm": 5.84375, "learning_rate": 9.787644528253808e-06, "loss": 0.89876881, "memory(GiB)": 739.27, "step": 10915, "train_speed(iter/s)": 0.271233 }, { "acc": 0.77087841, "epoch": 0.27701718197802827, "grad_norm": 3.8125, "learning_rate": 9.787342057571698e-06, "loss": 0.8826066, "memory(GiB)": 739.27, "step": 10920, "train_speed(iter/s)": 0.270924 }, { "acc": 0.75206494, "epoch": 0.2771440213470658, "grad_norm": 3.671875, "learning_rate": 9.787039376309664e-06, "loss": 0.97557154, "memory(GiB)": 739.27, "step": 10925, "train_speed(iter/s)": 0.270593 }, { "acc": 0.7449173, "epoch": 0.27727086071610335, "grad_norm": 3.546875, "learning_rate": 9.786736484481019e-06, "loss": 0.97903566, "memory(GiB)": 739.27, "step": 10930, "train_speed(iter/s)": 0.270256 }, { "acc": 0.76805058, "epoch": 0.2773977000851409, "grad_norm": 3.46875, "learning_rate": 9.786433382099089e-06, "loss": 0.917383, "memory(GiB)": 739.27, "step": 10935, "train_speed(iter/s)": 0.269963 }, { "acc": 0.74460349, "epoch": 0.2775245394541785, "grad_norm": 3.421875, "learning_rate": 9.786130069177204e-06, "loss": 0.98090544, "memory(GiB)": 739.27, "step": 10940, "train_speed(iter/s)": 0.269641 }, { "acc": 0.77197099, "epoch": 0.27765137882321606, "grad_norm": 3.375, "learning_rate": 9.785826545728708e-06, "loss": 0.8910821, "memory(GiB)": 739.27, "step": 10945, "train_speed(iter/s)": 0.269316 }, { "acc": 0.75893626, "epoch": 0.2777782181922536, "grad_norm": 3.71875, "learning_rate": 9.78552281176695e-06, "loss": 0.94822407, "memory(GiB)": 739.27, "step": 10950, "train_speed(iter/s)": 0.268983 }, { "acc": 0.75453668, "epoch": 0.27790505756129114, "grad_norm": 3.640625, "learning_rate": 9.78521886730529e-06, "loss": 0.95044336, "memory(GiB)": 739.27, "step": 10955, "train_speed(iter/s)": 0.268634 }, { "acc": 0.75020099, "epoch": 0.2780318969303287, "grad_norm": 3.25, "learning_rate": 9.7849147123571e-06, "loss": 0.94988852, "memory(GiB)": 739.27, "step": 10960, "train_speed(iter/s)": 0.268353 }, { "acc": 0.76946406, "epoch": 0.27815873629936627, "grad_norm": 3.109375, "learning_rate": 9.784610346935757e-06, "loss": 0.91804352, "memory(GiB)": 739.27, "step": 10965, "train_speed(iter/s)": 0.268032 }, { "acc": 0.75349054, "epoch": 0.27828557566840384, "grad_norm": 3.5625, "learning_rate": 9.78430577105465e-06, "loss": 0.98783369, "memory(GiB)": 739.27, "step": 10970, "train_speed(iter/s)": 0.267716 }, { "acc": 0.75435333, "epoch": 0.2784124150374414, "grad_norm": 3.015625, "learning_rate": 9.784000984727177e-06, "loss": 0.95857506, "memory(GiB)": 739.27, "step": 10975, "train_speed(iter/s)": 0.267353 }, { "acc": 0.75666986, "epoch": 0.278539254406479, "grad_norm": 3.265625, "learning_rate": 9.783695987966742e-06, "loss": 0.96058531, "memory(GiB)": 739.27, "step": 10980, "train_speed(iter/s)": 0.267042 }, { "acc": 0.75216289, "epoch": 0.2786660937755165, "grad_norm": 4.375, "learning_rate": 9.783390780786761e-06, "loss": 0.87837143, "memory(GiB)": 739.27, "step": 10985, "train_speed(iter/s)": 0.266711 }, { "acc": 0.75045218, "epoch": 0.27879293314455406, "grad_norm": 3.046875, "learning_rate": 9.783085363200662e-06, "loss": 0.94387589, "memory(GiB)": 739.27, "step": 10990, "train_speed(iter/s)": 0.266399 }, { "acc": 0.77792826, "epoch": 0.2789197725135916, "grad_norm": 3.59375, "learning_rate": 9.782779735221879e-06, "loss": 0.82673206, "memory(GiB)": 739.27, "step": 10995, "train_speed(iter/s)": 0.266106 }, { "acc": 0.75743341, "epoch": 0.2790466118826292, "grad_norm": 2.96875, "learning_rate": 9.782473896863853e-06, "loss": 0.91449671, "memory(GiB)": 739.27, "step": 11000, "train_speed(iter/s)": 0.265805 }, { "epoch": 0.2790466118826292, "eval_acc": 0.747469687848444, "eval_loss": 0.908851683139801, "eval_runtime": 1150.4752, "eval_samples_per_second": 5.537, "eval_steps_per_second": 5.537, "step": 11000 }, { "acc": 0.76991682, "epoch": 0.27917345125166676, "grad_norm": 3.484375, "learning_rate": 9.782167848140037e-06, "loss": 0.96267147, "memory(GiB)": 739.27, "step": 11005, "train_speed(iter/s)": 0.253852 }, { "acc": 0.747685, "epoch": 0.27930029062070433, "grad_norm": 3.65625, "learning_rate": 9.781861589063895e-06, "loss": 0.94446535, "memory(GiB)": 739.27, "step": 11010, "train_speed(iter/s)": 0.253591 }, { "acc": 0.75766315, "epoch": 0.27942712998974184, "grad_norm": 3.296875, "learning_rate": 9.781555119648898e-06, "loss": 0.89937325, "memory(GiB)": 739.27, "step": 11015, "train_speed(iter/s)": 0.253324 }, { "acc": 0.75673728, "epoch": 0.2795539693587794, "grad_norm": 3.734375, "learning_rate": 9.781248439908526e-06, "loss": 0.97184725, "memory(GiB)": 739.27, "step": 11020, "train_speed(iter/s)": 0.252999 }, { "acc": 0.76538796, "epoch": 0.279680808727817, "grad_norm": 3.03125, "learning_rate": 9.78094154985627e-06, "loss": 0.91359949, "memory(GiB)": 739.27, "step": 11025, "train_speed(iter/s)": 0.252714 }, { "acc": 0.75321794, "epoch": 0.27980764809685454, "grad_norm": 4.0, "learning_rate": 9.780634449505629e-06, "loss": 0.95090103, "memory(GiB)": 739.27, "step": 11030, "train_speed(iter/s)": 0.252454 }, { "acc": 0.76349897, "epoch": 0.2799344874658921, "grad_norm": 3.5, "learning_rate": 9.780327138870107e-06, "loss": 0.97426701, "memory(GiB)": 739.27, "step": 11035, "train_speed(iter/s)": 0.252191 }, { "acc": 0.75261512, "epoch": 0.2800613268349297, "grad_norm": 3.78125, "learning_rate": 9.780019617963227e-06, "loss": 0.96150341, "memory(GiB)": 739.27, "step": 11040, "train_speed(iter/s)": 0.2519 }, { "acc": 0.76372933, "epoch": 0.2801881662039672, "grad_norm": 5.09375, "learning_rate": 9.779711886798516e-06, "loss": 0.97256937, "memory(GiB)": 739.27, "step": 11045, "train_speed(iter/s)": 0.251683 }, { "acc": 0.75563149, "epoch": 0.28031500557300476, "grad_norm": 3.890625, "learning_rate": 9.779403945389507e-06, "loss": 0.96190929, "memory(GiB)": 739.27, "step": 11050, "train_speed(iter/s)": 0.25143 }, { "acc": 0.76360292, "epoch": 0.28044184494204233, "grad_norm": 3.609375, "learning_rate": 9.779095793749744e-06, "loss": 0.95153666, "memory(GiB)": 739.27, "step": 11055, "train_speed(iter/s)": 0.251155 }, { "acc": 0.7698523, "epoch": 0.2805686843110799, "grad_norm": 3.59375, "learning_rate": 9.778787431892787e-06, "loss": 0.89945612, "memory(GiB)": 739.27, "step": 11060, "train_speed(iter/s)": 0.250868 }, { "acc": 0.753438, "epoch": 0.28069552368011746, "grad_norm": 3.09375, "learning_rate": 9.778478859832197e-06, "loss": 0.97692099, "memory(GiB)": 739.27, "step": 11065, "train_speed(iter/s)": 0.250564 }, { "acc": 0.7403244, "epoch": 0.28082236304915503, "grad_norm": 3.28125, "learning_rate": 9.778170077581545e-06, "loss": 0.96876526, "memory(GiB)": 739.27, "step": 11070, "train_speed(iter/s)": 0.250292 }, { "acc": 0.76418629, "epoch": 0.28094920241819255, "grad_norm": 4.375, "learning_rate": 9.77786108515442e-06, "loss": 0.93062572, "memory(GiB)": 739.27, "step": 11075, "train_speed(iter/s)": 0.250011 }, { "acc": 0.77708259, "epoch": 0.2810760417872301, "grad_norm": 3.296875, "learning_rate": 9.777551882564404e-06, "loss": 0.86761742, "memory(GiB)": 739.27, "step": 11080, "train_speed(iter/s)": 0.249767 }, { "acc": 0.75695167, "epoch": 0.2812028811562677, "grad_norm": 3.8125, "learning_rate": 9.777242469825105e-06, "loss": 0.91010818, "memory(GiB)": 739.27, "step": 11085, "train_speed(iter/s)": 0.249473 }, { "acc": 0.7701786, "epoch": 0.28132972052530525, "grad_norm": 3.390625, "learning_rate": 9.776932846950132e-06, "loss": 0.93254528, "memory(GiB)": 739.27, "step": 11090, "train_speed(iter/s)": 0.249178 }, { "acc": 0.75893836, "epoch": 0.2814565598943428, "grad_norm": 4.0, "learning_rate": 9.776623013953101e-06, "loss": 0.92729397, "memory(GiB)": 739.27, "step": 11095, "train_speed(iter/s)": 0.248935 }, { "acc": 0.7529676, "epoch": 0.2815833992633804, "grad_norm": 3.46875, "learning_rate": 9.776312970847644e-06, "loss": 0.96719685, "memory(GiB)": 739.27, "step": 11100, "train_speed(iter/s)": 0.248646 }, { "acc": 0.75688863, "epoch": 0.2817102386324179, "grad_norm": 3.28125, "learning_rate": 9.776002717647397e-06, "loss": 0.9344677, "memory(GiB)": 739.27, "step": 11105, "train_speed(iter/s)": 0.248379 }, { "acc": 0.7564548, "epoch": 0.28183707800145547, "grad_norm": 3.75, "learning_rate": 9.77569225436601e-06, "loss": 0.91740379, "memory(GiB)": 739.27, "step": 11110, "train_speed(iter/s)": 0.248095 }, { "acc": 0.76277885, "epoch": 0.28196391737049303, "grad_norm": 3.484375, "learning_rate": 9.775381581017135e-06, "loss": 0.91687918, "memory(GiB)": 739.27, "step": 11115, "train_speed(iter/s)": 0.247849 }, { "acc": 0.75487423, "epoch": 0.2820907567395306, "grad_norm": 3.828125, "learning_rate": 9.77507069761444e-06, "loss": 0.94182539, "memory(GiB)": 739.27, "step": 11120, "train_speed(iter/s)": 0.24762 }, { "acc": 0.76915216, "epoch": 0.28221759610856817, "grad_norm": 3.90625, "learning_rate": 9.774759604171599e-06, "loss": 0.89514589, "memory(GiB)": 739.27, "step": 11125, "train_speed(iter/s)": 0.247376 }, { "acc": 0.75180869, "epoch": 0.28234443547760574, "grad_norm": 3.546875, "learning_rate": 9.774448300702297e-06, "loss": 0.94967613, "memory(GiB)": 739.27, "step": 11130, "train_speed(iter/s)": 0.247115 }, { "acc": 0.7586019, "epoch": 0.28247127484664325, "grad_norm": 3.90625, "learning_rate": 9.774136787220226e-06, "loss": 0.92872496, "memory(GiB)": 739.27, "step": 11135, "train_speed(iter/s)": 0.246823 }, { "acc": 0.75592504, "epoch": 0.2825981142156808, "grad_norm": 3.9375, "learning_rate": 9.773825063739088e-06, "loss": 0.98710518, "memory(GiB)": 739.27, "step": 11140, "train_speed(iter/s)": 0.246578 }, { "acc": 0.74453564, "epoch": 0.2827249535847184, "grad_norm": 3.875, "learning_rate": 9.773513130272598e-06, "loss": 0.98894501, "memory(GiB)": 739.27, "step": 11145, "train_speed(iter/s)": 0.24633 }, { "acc": 0.75349975, "epoch": 0.28285179295375595, "grad_norm": 2.96875, "learning_rate": 9.773200986834473e-06, "loss": 0.96305065, "memory(GiB)": 739.27, "step": 11150, "train_speed(iter/s)": 0.246079 }, { "acc": 0.7433475, "epoch": 0.2829786323227935, "grad_norm": 3.546875, "learning_rate": 9.772888633438446e-06, "loss": 0.99985161, "memory(GiB)": 739.27, "step": 11155, "train_speed(iter/s)": 0.245837 }, { "acc": 0.75861754, "epoch": 0.2831054716918311, "grad_norm": 3.203125, "learning_rate": 9.772576070098256e-06, "loss": 0.92042494, "memory(GiB)": 739.27, "step": 11160, "train_speed(iter/s)": 0.245609 }, { "acc": 0.76832991, "epoch": 0.2832323110608686, "grad_norm": 3.609375, "learning_rate": 9.772263296827648e-06, "loss": 0.95657787, "memory(GiB)": 739.27, "step": 11165, "train_speed(iter/s)": 0.24538 }, { "acc": 0.7633626, "epoch": 0.28335915042990617, "grad_norm": 3.59375, "learning_rate": 9.771950313640385e-06, "loss": 0.93867216, "memory(GiB)": 739.27, "step": 11170, "train_speed(iter/s)": 0.24513 }, { "acc": 0.76910515, "epoch": 0.28348598979894374, "grad_norm": 3.671875, "learning_rate": 9.771637120550233e-06, "loss": 0.85239172, "memory(GiB)": 739.27, "step": 11175, "train_speed(iter/s)": 0.244865 }, { "acc": 0.74115219, "epoch": 0.2836128291679813, "grad_norm": 3.90625, "learning_rate": 9.771323717570967e-06, "loss": 0.95782671, "memory(GiB)": 739.27, "step": 11180, "train_speed(iter/s)": 0.244627 }, { "acc": 0.7644732, "epoch": 0.2837396685370189, "grad_norm": 3.765625, "learning_rate": 9.771010104716372e-06, "loss": 0.90194912, "memory(GiB)": 739.27, "step": 11185, "train_speed(iter/s)": 0.244366 }, { "acc": 0.7561892, "epoch": 0.28386650790605644, "grad_norm": 3.375, "learning_rate": 9.770696282000245e-06, "loss": 0.98952436, "memory(GiB)": 739.27, "step": 11190, "train_speed(iter/s)": 0.244126 }, { "acc": 0.74352641, "epoch": 0.28399334727509395, "grad_norm": 3.734375, "learning_rate": 9.770382249436388e-06, "loss": 0.96977854, "memory(GiB)": 739.27, "step": 11195, "train_speed(iter/s)": 0.243885 }, { "acc": 0.76568894, "epoch": 0.2841201866441315, "grad_norm": 3.375, "learning_rate": 9.770068007038616e-06, "loss": 0.88590326, "memory(GiB)": 739.27, "step": 11200, "train_speed(iter/s)": 0.243648 }, { "acc": 0.76063418, "epoch": 0.2842470260131691, "grad_norm": 3.21875, "learning_rate": 9.76975355482075e-06, "loss": 0.91701517, "memory(GiB)": 739.27, "step": 11205, "train_speed(iter/s)": 0.243408 }, { "acc": 0.76348772, "epoch": 0.28437386538220666, "grad_norm": 3.796875, "learning_rate": 9.769438892796624e-06, "loss": 0.90628366, "memory(GiB)": 739.27, "step": 11210, "train_speed(iter/s)": 0.243186 }, { "acc": 0.75111203, "epoch": 0.2845007047512442, "grad_norm": 4.0, "learning_rate": 9.769124020980076e-06, "loss": 0.98351355, "memory(GiB)": 739.27, "step": 11215, "train_speed(iter/s)": 0.242923 }, { "acc": 0.75514522, "epoch": 0.2846275441202818, "grad_norm": 4.0625, "learning_rate": 9.768808939384957e-06, "loss": 0.94141178, "memory(GiB)": 739.27, "step": 11220, "train_speed(iter/s)": 0.242716 }, { "acc": 0.7588479, "epoch": 0.2847543834893193, "grad_norm": 3.5, "learning_rate": 9.768493648025128e-06, "loss": 0.90218391, "memory(GiB)": 739.27, "step": 11225, "train_speed(iter/s)": 0.242478 }, { "acc": 0.75875144, "epoch": 0.2848812228583569, "grad_norm": 3.78125, "learning_rate": 9.768178146914457e-06, "loss": 0.93685608, "memory(GiB)": 739.27, "step": 11230, "train_speed(iter/s)": 0.242236 }, { "acc": 0.76591935, "epoch": 0.28500806222739444, "grad_norm": 3.203125, "learning_rate": 9.767862436066821e-06, "loss": 0.89398251, "memory(GiB)": 739.27, "step": 11235, "train_speed(iter/s)": 0.241983 }, { "acc": 0.76207657, "epoch": 0.285134901596432, "grad_norm": 13.5, "learning_rate": 9.767546515496108e-06, "loss": 0.90170927, "memory(GiB)": 739.27, "step": 11240, "train_speed(iter/s)": 0.241749 }, { "acc": 0.75455484, "epoch": 0.2852617409654696, "grad_norm": 3.515625, "learning_rate": 9.767230385216216e-06, "loss": 0.9045743, "memory(GiB)": 739.27, "step": 11245, "train_speed(iter/s)": 0.241511 }, { "acc": 0.74687643, "epoch": 0.28538858033450715, "grad_norm": 3.890625, "learning_rate": 9.766914045241046e-06, "loss": 1.0063324, "memory(GiB)": 739.27, "step": 11250, "train_speed(iter/s)": 0.241291 }, { "acc": 0.77647481, "epoch": 0.28551541970354466, "grad_norm": 3.921875, "learning_rate": 9.766597495584516e-06, "loss": 0.86864548, "memory(GiB)": 739.27, "step": 11255, "train_speed(iter/s)": 0.241043 }, { "acc": 0.76380811, "epoch": 0.2856422590725822, "grad_norm": 3.65625, "learning_rate": 9.76628073626055e-06, "loss": 0.95636692, "memory(GiB)": 739.27, "step": 11260, "train_speed(iter/s)": 0.240806 }, { "acc": 0.76178188, "epoch": 0.2857690984416198, "grad_norm": 3.28125, "learning_rate": 9.76596376728308e-06, "loss": 0.95464935, "memory(GiB)": 739.27, "step": 11265, "train_speed(iter/s)": 0.240561 }, { "acc": 0.7551353, "epoch": 0.28589593781065736, "grad_norm": 2.765625, "learning_rate": 9.765646588666051e-06, "loss": 0.89828939, "memory(GiB)": 739.27, "step": 11270, "train_speed(iter/s)": 0.240312 }, { "acc": 0.75252752, "epoch": 0.28602277717969493, "grad_norm": 3.34375, "learning_rate": 9.765329200423412e-06, "loss": 0.96414871, "memory(GiB)": 739.27, "step": 11275, "train_speed(iter/s)": 0.240073 }, { "acc": 0.76006594, "epoch": 0.2861496165487325, "grad_norm": 3.390625, "learning_rate": 9.765011602569124e-06, "loss": 0.95142498, "memory(GiB)": 739.27, "step": 11280, "train_speed(iter/s)": 0.239853 }, { "acc": 0.75568361, "epoch": 0.28627645591777, "grad_norm": 3.265625, "learning_rate": 9.764693795117157e-06, "loss": 0.93686314, "memory(GiB)": 739.27, "step": 11285, "train_speed(iter/s)": 0.239626 }, { "acc": 0.7565784, "epoch": 0.2864032952868076, "grad_norm": 3.4375, "learning_rate": 9.764375778081495e-06, "loss": 0.92226429, "memory(GiB)": 739.27, "step": 11290, "train_speed(iter/s)": 0.239378 }, { "acc": 0.74499116, "epoch": 0.28653013465584515, "grad_norm": 4.21875, "learning_rate": 9.764057551476119e-06, "loss": 0.96200056, "memory(GiB)": 739.27, "step": 11295, "train_speed(iter/s)": 0.239132 }, { "acc": 0.75094581, "epoch": 0.2866569740248827, "grad_norm": 2.765625, "learning_rate": 9.763739115315033e-06, "loss": 0.93946409, "memory(GiB)": 739.27, "step": 11300, "train_speed(iter/s)": 0.238888 }, { "acc": 0.76335974, "epoch": 0.2867838133939203, "grad_norm": 4.03125, "learning_rate": 9.763420469612239e-06, "loss": 0.93376017, "memory(GiB)": 739.27, "step": 11305, "train_speed(iter/s)": 0.238666 }, { "acc": 0.75621157, "epoch": 0.28691065276295785, "grad_norm": 3.625, "learning_rate": 9.763101614381756e-06, "loss": 0.98332291, "memory(GiB)": 739.27, "step": 11310, "train_speed(iter/s)": 0.238433 }, { "acc": 0.74849434, "epoch": 0.28703749213199536, "grad_norm": 3.421875, "learning_rate": 9.762782549637611e-06, "loss": 0.96468611, "memory(GiB)": 739.27, "step": 11315, "train_speed(iter/s)": 0.238203 }, { "acc": 0.76680336, "epoch": 0.28716433150103293, "grad_norm": 3.71875, "learning_rate": 9.762463275393836e-06, "loss": 0.92260475, "memory(GiB)": 739.27, "step": 11320, "train_speed(iter/s)": 0.237963 }, { "acc": 0.76465535, "epoch": 0.2872911708700705, "grad_norm": 3.0625, "learning_rate": 9.762143791664476e-06, "loss": 0.87470083, "memory(GiB)": 739.27, "step": 11325, "train_speed(iter/s)": 0.237726 }, { "acc": 0.73741307, "epoch": 0.28741801023910807, "grad_norm": 3.734375, "learning_rate": 9.761824098463583e-06, "loss": 0.96248856, "memory(GiB)": 739.27, "step": 11330, "train_speed(iter/s)": 0.237479 }, { "acc": 0.74692273, "epoch": 0.28754484960814564, "grad_norm": 3.46875, "learning_rate": 9.76150419580522e-06, "loss": 0.96208344, "memory(GiB)": 739.27, "step": 11335, "train_speed(iter/s)": 0.237287 }, { "acc": 0.76739178, "epoch": 0.2876716889771832, "grad_norm": 3.546875, "learning_rate": 9.761184083703459e-06, "loss": 0.94882555, "memory(GiB)": 739.27, "step": 11340, "train_speed(iter/s)": 0.23708 }, { "acc": 0.76033888, "epoch": 0.2877985283462207, "grad_norm": 3.578125, "learning_rate": 9.76086376217238e-06, "loss": 0.93825197, "memory(GiB)": 739.27, "step": 11345, "train_speed(iter/s)": 0.236841 }, { "acc": 0.7592495, "epoch": 0.2879253677152583, "grad_norm": 3.703125, "learning_rate": 9.760543231226071e-06, "loss": 0.92472744, "memory(GiB)": 739.27, "step": 11350, "train_speed(iter/s)": 0.236578 }, { "acc": 0.7644145, "epoch": 0.28805220708429585, "grad_norm": 3.40625, "learning_rate": 9.760222490878638e-06, "loss": 0.92390642, "memory(GiB)": 739.27, "step": 11355, "train_speed(iter/s)": 0.236363 }, { "acc": 0.75008469, "epoch": 0.2881790464533334, "grad_norm": 3.828125, "learning_rate": 9.759901541144182e-06, "loss": 0.95813208, "memory(GiB)": 739.27, "step": 11360, "train_speed(iter/s)": 0.236142 }, { "acc": 0.7577909, "epoch": 0.288305885822371, "grad_norm": 3.453125, "learning_rate": 9.759580382036822e-06, "loss": 0.93638849, "memory(GiB)": 739.27, "step": 11365, "train_speed(iter/s)": 0.235908 }, { "acc": 0.76293716, "epoch": 0.28843272519140856, "grad_norm": 3.515625, "learning_rate": 9.759259013570687e-06, "loss": 0.92311497, "memory(GiB)": 739.27, "step": 11370, "train_speed(iter/s)": 0.235675 }, { "acc": 0.74839406, "epoch": 0.28855956456044607, "grad_norm": 3.453125, "learning_rate": 9.758937435759912e-06, "loss": 0.97529297, "memory(GiB)": 739.27, "step": 11375, "train_speed(iter/s)": 0.235417 }, { "acc": 0.76858597, "epoch": 0.28868640392948364, "grad_norm": 3.125, "learning_rate": 9.758615648618642e-06, "loss": 0.87906675, "memory(GiB)": 739.27, "step": 11380, "train_speed(iter/s)": 0.235205 }, { "acc": 0.75666475, "epoch": 0.2888132432985212, "grad_norm": 3.484375, "learning_rate": 9.758293652161032e-06, "loss": 0.94756556, "memory(GiB)": 739.27, "step": 11385, "train_speed(iter/s)": 0.234975 }, { "acc": 0.75329838, "epoch": 0.2889400826675588, "grad_norm": 3.921875, "learning_rate": 9.757971446401245e-06, "loss": 0.91921253, "memory(GiB)": 739.27, "step": 11390, "train_speed(iter/s)": 0.234762 }, { "acc": 0.76494856, "epoch": 0.28906692203659634, "grad_norm": 3.875, "learning_rate": 9.757649031353453e-06, "loss": 0.89500198, "memory(GiB)": 739.27, "step": 11395, "train_speed(iter/s)": 0.234546 }, { "acc": 0.75325146, "epoch": 0.2891937614056339, "grad_norm": 3.6875, "learning_rate": 9.75732640703184e-06, "loss": 0.98611784, "memory(GiB)": 739.27, "step": 11400, "train_speed(iter/s)": 0.234369 }, { "acc": 0.747609, "epoch": 0.2893206007746714, "grad_norm": 3.078125, "learning_rate": 9.757003573450597e-06, "loss": 0.96531086, "memory(GiB)": 739.27, "step": 11405, "train_speed(iter/s)": 0.234113 }, { "acc": 0.74940643, "epoch": 0.289447440143709, "grad_norm": 3.34375, "learning_rate": 9.756680530623922e-06, "loss": 0.96038952, "memory(GiB)": 739.27, "step": 11410, "train_speed(iter/s)": 0.233898 }, { "acc": 0.75493779, "epoch": 0.28957427951274656, "grad_norm": 3.65625, "learning_rate": 9.756357278566025e-06, "loss": 1.00141582, "memory(GiB)": 739.27, "step": 11415, "train_speed(iter/s)": 0.23367 }, { "acc": 0.76280088, "epoch": 0.2897011188817841, "grad_norm": 4.0, "learning_rate": 9.756033817291128e-06, "loss": 0.90463257, "memory(GiB)": 739.27, "step": 11420, "train_speed(iter/s)": 0.233472 }, { "acc": 0.75048065, "epoch": 0.2898279582508217, "grad_norm": 3.625, "learning_rate": 9.755710146813456e-06, "loss": 0.98039961, "memory(GiB)": 739.27, "step": 11425, "train_speed(iter/s)": 0.233265 }, { "acc": 0.77005959, "epoch": 0.28995479761985926, "grad_norm": 3.734375, "learning_rate": 9.755386267147247e-06, "loss": 0.89451723, "memory(GiB)": 739.27, "step": 11430, "train_speed(iter/s)": 0.23304 }, { "acc": 0.75990095, "epoch": 0.2900816369888968, "grad_norm": 4.34375, "learning_rate": 9.75506217830675e-06, "loss": 0.91594048, "memory(GiB)": 739.27, "step": 11435, "train_speed(iter/s)": 0.232832 }, { "acc": 0.75498333, "epoch": 0.29020847635793434, "grad_norm": 3.546875, "learning_rate": 9.754737880306215e-06, "loss": 0.92854958, "memory(GiB)": 739.27, "step": 11440, "train_speed(iter/s)": 0.232589 }, { "acc": 0.75511441, "epoch": 0.2903353157269719, "grad_norm": 3.4375, "learning_rate": 9.754413373159912e-06, "loss": 0.94631195, "memory(GiB)": 739.27, "step": 11445, "train_speed(iter/s)": 0.232376 }, { "acc": 0.75495086, "epoch": 0.2904621550960095, "grad_norm": 3.640625, "learning_rate": 9.754088656882114e-06, "loss": 0.95799818, "memory(GiB)": 739.27, "step": 11450, "train_speed(iter/s)": 0.232201 }, { "acc": 0.74286761, "epoch": 0.29058899446504705, "grad_norm": 3.859375, "learning_rate": 9.753763731487103e-06, "loss": 0.95456839, "memory(GiB)": 739.27, "step": 11455, "train_speed(iter/s)": 0.232014 }, { "acc": 0.75834804, "epoch": 0.2907158338340846, "grad_norm": 3.3125, "learning_rate": 9.753438596989171e-06, "loss": 0.93889389, "memory(GiB)": 739.27, "step": 11460, "train_speed(iter/s)": 0.231816 }, { "acc": 0.75713806, "epoch": 0.2908426732031221, "grad_norm": 3.53125, "learning_rate": 9.753113253402619e-06, "loss": 0.97872038, "memory(GiB)": 739.27, "step": 11465, "train_speed(iter/s)": 0.231607 }, { "acc": 0.77934475, "epoch": 0.2909695125721597, "grad_norm": 3.203125, "learning_rate": 9.752787700741762e-06, "loss": 0.87809038, "memory(GiB)": 739.27, "step": 11470, "train_speed(iter/s)": 0.231407 }, { "acc": 0.76623845, "epoch": 0.29109635194119726, "grad_norm": 3.65625, "learning_rate": 9.752461939020917e-06, "loss": 0.91076632, "memory(GiB)": 739.27, "step": 11475, "train_speed(iter/s)": 0.231172 }, { "acc": 0.76200395, "epoch": 0.29122319131023483, "grad_norm": 3.109375, "learning_rate": 9.752135968254412e-06, "loss": 0.90769005, "memory(GiB)": 739.27, "step": 11480, "train_speed(iter/s)": 0.230963 }, { "acc": 0.76670194, "epoch": 0.2913500306792724, "grad_norm": 4.0, "learning_rate": 9.75180978845659e-06, "loss": 0.89075537, "memory(GiB)": 739.27, "step": 11485, "train_speed(iter/s)": 0.230766 }, { "acc": 0.75685792, "epoch": 0.29147687004830997, "grad_norm": 4.15625, "learning_rate": 9.751483399641793e-06, "loss": 0.86552591, "memory(GiB)": 739.27, "step": 11490, "train_speed(iter/s)": 0.230583 }, { "acc": 0.75334148, "epoch": 0.2916037094173475, "grad_norm": 3.53125, "learning_rate": 9.75115680182438e-06, "loss": 0.90958061, "memory(GiB)": 739.27, "step": 11495, "train_speed(iter/s)": 0.230377 }, { "acc": 0.7671248, "epoch": 0.29173054878638505, "grad_norm": 3.34375, "learning_rate": 9.75082999501872e-06, "loss": 0.90801706, "memory(GiB)": 739.27, "step": 11500, "train_speed(iter/s)": 0.230176 }, { "epoch": 0.29173054878638505, "eval_acc": 0.7480896545356788, "eval_loss": 0.9068630933761597, "eval_runtime": 1149.5546, "eval_samples_per_second": 5.541, "eval_steps_per_second": 5.541, "step": 11500 }, { "acc": 0.76470566, "epoch": 0.2918573881554226, "grad_norm": 2.671875, "learning_rate": 9.750502979239183e-06, "loss": 0.93083525, "memory(GiB)": 739.27, "step": 11505, "train_speed(iter/s)": 0.221593 }, { "acc": 0.76507459, "epoch": 0.2919842275244602, "grad_norm": 2.65625, "learning_rate": 9.750175754500158e-06, "loss": 0.91680679, "memory(GiB)": 739.27, "step": 11510, "train_speed(iter/s)": 0.22139 }, { "acc": 0.75457187, "epoch": 0.29211106689349775, "grad_norm": 4.09375, "learning_rate": 9.749848320816035e-06, "loss": 0.97736607, "memory(GiB)": 739.27, "step": 11515, "train_speed(iter/s)": 0.221202 }, { "acc": 0.7471951, "epoch": 0.2922379062625353, "grad_norm": 3.921875, "learning_rate": 9.749520678201219e-06, "loss": 1.00023947, "memory(GiB)": 739.27, "step": 11520, "train_speed(iter/s)": 0.221031 }, { "acc": 0.76864934, "epoch": 0.29236474563157283, "grad_norm": 4.09375, "learning_rate": 9.74919282667012e-06, "loss": 0.90233116, "memory(GiB)": 739.27, "step": 11525, "train_speed(iter/s)": 0.220881 }, { "acc": 0.74615974, "epoch": 0.2924915850006104, "grad_norm": 3.65625, "learning_rate": 9.748864766237164e-06, "loss": 0.986376, "memory(GiB)": 739.27, "step": 11530, "train_speed(iter/s)": 0.220688 }, { "acc": 0.75865254, "epoch": 0.29261842436964797, "grad_norm": 3.78125, "learning_rate": 9.748536496916774e-06, "loss": 0.98787689, "memory(GiB)": 739.27, "step": 11535, "train_speed(iter/s)": 0.220514 }, { "acc": 0.76059356, "epoch": 0.29274526373868553, "grad_norm": 3.421875, "learning_rate": 9.748208018723393e-06, "loss": 0.95426807, "memory(GiB)": 739.27, "step": 11540, "train_speed(iter/s)": 0.220342 }, { "acc": 0.75745425, "epoch": 0.2928721031077231, "grad_norm": 3.625, "learning_rate": 9.747879331671473e-06, "loss": 0.90573311, "memory(GiB)": 739.27, "step": 11545, "train_speed(iter/s)": 0.220164 }, { "acc": 0.75625238, "epoch": 0.29299894247676067, "grad_norm": 3.875, "learning_rate": 9.747550435775468e-06, "loss": 0.96197214, "memory(GiB)": 739.27, "step": 11550, "train_speed(iter/s)": 0.220008 }, { "acc": 0.76015353, "epoch": 0.2931257818457982, "grad_norm": 3.296875, "learning_rate": 9.747221331049844e-06, "loss": 0.89834824, "memory(GiB)": 739.27, "step": 11555, "train_speed(iter/s)": 0.219823 }, { "acc": 0.75820732, "epoch": 0.29325262121483575, "grad_norm": 3.421875, "learning_rate": 9.74689201750908e-06, "loss": 0.91910095, "memory(GiB)": 739.27, "step": 11560, "train_speed(iter/s)": 0.219648 }, { "acc": 0.75008726, "epoch": 0.2933794605838733, "grad_norm": 3.8125, "learning_rate": 9.746562495167662e-06, "loss": 0.96296682, "memory(GiB)": 739.27, "step": 11565, "train_speed(iter/s)": 0.21947 }, { "acc": 0.74959083, "epoch": 0.2935062999529109, "grad_norm": 3.515625, "learning_rate": 9.746232764040082e-06, "loss": 0.97749548, "memory(GiB)": 739.27, "step": 11570, "train_speed(iter/s)": 0.219282 }, { "acc": 0.76257572, "epoch": 0.29363313932194846, "grad_norm": 3.1875, "learning_rate": 9.745902824140847e-06, "loss": 0.89127798, "memory(GiB)": 739.27, "step": 11575, "train_speed(iter/s)": 0.219083 }, { "acc": 0.76156468, "epoch": 0.293759978690986, "grad_norm": 4.71875, "learning_rate": 9.745572675484466e-06, "loss": 0.94091043, "memory(GiB)": 739.27, "step": 11580, "train_speed(iter/s)": 0.218894 }, { "acc": 0.76055727, "epoch": 0.29388681806002354, "grad_norm": 4.71875, "learning_rate": 9.745242318085464e-06, "loss": 1.00114574, "memory(GiB)": 739.27, "step": 11585, "train_speed(iter/s)": 0.218728 }, { "acc": 0.75451531, "epoch": 0.2940136574290611, "grad_norm": 3.703125, "learning_rate": 9.744911751958372e-06, "loss": 0.96457787, "memory(GiB)": 739.27, "step": 11590, "train_speed(iter/s)": 0.218538 }, { "acc": 0.74559698, "epoch": 0.29414049679809867, "grad_norm": 3.34375, "learning_rate": 9.74458097711773e-06, "loss": 0.9503624, "memory(GiB)": 739.27, "step": 11595, "train_speed(iter/s)": 0.218354 }, { "acc": 0.75400395, "epoch": 0.29426733616713624, "grad_norm": 5.125, "learning_rate": 9.744249993578088e-06, "loss": 0.94838104, "memory(GiB)": 739.27, "step": 11600, "train_speed(iter/s)": 0.218151 }, { "acc": 0.76579466, "epoch": 0.2943941755361738, "grad_norm": 3.84375, "learning_rate": 9.743918801354005e-06, "loss": 0.91421089, "memory(GiB)": 739.27, "step": 11605, "train_speed(iter/s)": 0.217973 }, { "acc": 0.74953289, "epoch": 0.2945210149052114, "grad_norm": 3.1875, "learning_rate": 9.743587400460048e-06, "loss": 0.95262136, "memory(GiB)": 739.27, "step": 11610, "train_speed(iter/s)": 0.217803 }, { "acc": 0.76205306, "epoch": 0.2946478542742489, "grad_norm": 3.515625, "learning_rate": 9.743255790910798e-06, "loss": 0.95007429, "memory(GiB)": 739.27, "step": 11615, "train_speed(iter/s)": 0.217635 }, { "acc": 0.76305332, "epoch": 0.29477469364328646, "grad_norm": 4.0625, "learning_rate": 9.742923972720837e-06, "loss": 0.90266294, "memory(GiB)": 739.27, "step": 11620, "train_speed(iter/s)": 0.217459 }, { "acc": 0.75131865, "epoch": 0.294901533012324, "grad_norm": 4.53125, "learning_rate": 9.74259194590476e-06, "loss": 0.92742243, "memory(GiB)": 739.27, "step": 11625, "train_speed(iter/s)": 0.217279 }, { "acc": 0.7631896, "epoch": 0.2950283723813616, "grad_norm": 4.34375, "learning_rate": 9.742259710477178e-06, "loss": 0.976266, "memory(GiB)": 739.27, "step": 11630, "train_speed(iter/s)": 0.217118 }, { "acc": 0.76524944, "epoch": 0.29515521175039916, "grad_norm": 3.296875, "learning_rate": 9.741927266452697e-06, "loss": 0.96166821, "memory(GiB)": 739.27, "step": 11635, "train_speed(iter/s)": 0.216938 }, { "acc": 0.7719759, "epoch": 0.29528205111943673, "grad_norm": 3.1875, "learning_rate": 9.741594613845948e-06, "loss": 0.87937717, "memory(GiB)": 739.27, "step": 11640, "train_speed(iter/s)": 0.216732 }, { "acc": 0.76327901, "epoch": 0.29540889048847424, "grad_norm": 3.375, "learning_rate": 9.741261752671559e-06, "loss": 0.94851885, "memory(GiB)": 739.27, "step": 11645, "train_speed(iter/s)": 0.216556 }, { "acc": 0.77187433, "epoch": 0.2955357298575118, "grad_norm": 3.78125, "learning_rate": 9.740928682944171e-06, "loss": 0.86348801, "memory(GiB)": 739.27, "step": 11650, "train_speed(iter/s)": 0.216356 }, { "acc": 0.75514007, "epoch": 0.2956625692265494, "grad_norm": 3.46875, "learning_rate": 9.740595404678435e-06, "loss": 0.93219347, "memory(GiB)": 739.27, "step": 11655, "train_speed(iter/s)": 0.216169 }, { "acc": 0.76216655, "epoch": 0.29578940859558694, "grad_norm": 3.8125, "learning_rate": 9.740261917889014e-06, "loss": 0.93734112, "memory(GiB)": 739.27, "step": 11660, "train_speed(iter/s)": 0.215988 }, { "acc": 0.76290083, "epoch": 0.2959162479646245, "grad_norm": 3.5625, "learning_rate": 9.739928222590571e-06, "loss": 0.88629856, "memory(GiB)": 739.27, "step": 11665, "train_speed(iter/s)": 0.215821 }, { "acc": 0.76162214, "epoch": 0.2960430873336621, "grad_norm": 3.046875, "learning_rate": 9.739594318797791e-06, "loss": 0.90609951, "memory(GiB)": 739.27, "step": 11670, "train_speed(iter/s)": 0.215599 }, { "acc": 0.75489984, "epoch": 0.2961699267026996, "grad_norm": 3.203125, "learning_rate": 9.739260206525358e-06, "loss": 0.95003128, "memory(GiB)": 739.27, "step": 11675, "train_speed(iter/s)": 0.215416 }, { "acc": 0.76002922, "epoch": 0.29629676607173716, "grad_norm": 3.515625, "learning_rate": 9.738925885787968e-06, "loss": 0.93572311, "memory(GiB)": 739.27, "step": 11680, "train_speed(iter/s)": 0.215252 }, { "acc": 0.76773071, "epoch": 0.29642360544077473, "grad_norm": 2.90625, "learning_rate": 9.738591356600326e-06, "loss": 0.921737, "memory(GiB)": 739.27, "step": 11685, "train_speed(iter/s)": 0.215073 }, { "acc": 0.75919657, "epoch": 0.2965504448098123, "grad_norm": 2.96875, "learning_rate": 9.73825661897715e-06, "loss": 0.94152536, "memory(GiB)": 739.27, "step": 11690, "train_speed(iter/s)": 0.2149 }, { "acc": 0.75223875, "epoch": 0.29667728417884986, "grad_norm": 4.15625, "learning_rate": 9.737921672933163e-06, "loss": 0.96951284, "memory(GiB)": 739.27, "step": 11695, "train_speed(iter/s)": 0.214728 }, { "acc": 0.76056576, "epoch": 0.29680412354788743, "grad_norm": 3.65625, "learning_rate": 9.737586518483097e-06, "loss": 0.93124647, "memory(GiB)": 739.27, "step": 11700, "train_speed(iter/s)": 0.214572 }, { "acc": 0.75250125, "epoch": 0.29693096291692495, "grad_norm": 3.875, "learning_rate": 9.737251155641694e-06, "loss": 0.95738382, "memory(GiB)": 739.27, "step": 11705, "train_speed(iter/s)": 0.214412 }, { "acc": 0.75871229, "epoch": 0.2970578022859625, "grad_norm": 3.6875, "learning_rate": 9.736915584423707e-06, "loss": 0.94996576, "memory(GiB)": 739.27, "step": 11710, "train_speed(iter/s)": 0.21423 }, { "acc": 0.76853752, "epoch": 0.2971846416550001, "grad_norm": 4.09375, "learning_rate": 9.736579804843895e-06, "loss": 0.93511353, "memory(GiB)": 739.27, "step": 11715, "train_speed(iter/s)": 0.214083 }, { "acc": 0.76783357, "epoch": 0.29731148102403765, "grad_norm": 3.21875, "learning_rate": 9.73624381691703e-06, "loss": 0.92975063, "memory(GiB)": 739.27, "step": 11720, "train_speed(iter/s)": 0.213932 }, { "acc": 0.76956348, "epoch": 0.2974383203930752, "grad_norm": 3.78125, "learning_rate": 9.735907620657892e-06, "loss": 0.9381156, "memory(GiB)": 739.27, "step": 11725, "train_speed(iter/s)": 0.213778 }, { "acc": 0.76406708, "epoch": 0.2975651597621128, "grad_norm": 3.40625, "learning_rate": 9.735571216081265e-06, "loss": 0.92615604, "memory(GiB)": 739.27, "step": 11730, "train_speed(iter/s)": 0.213621 }, { "acc": 0.75381436, "epoch": 0.2976919991311503, "grad_norm": 4.03125, "learning_rate": 9.735234603201948e-06, "loss": 0.95473318, "memory(GiB)": 739.27, "step": 11735, "train_speed(iter/s)": 0.213466 }, { "acc": 0.75776124, "epoch": 0.29781883850018787, "grad_norm": 3.21875, "learning_rate": 9.734897782034752e-06, "loss": 0.92842216, "memory(GiB)": 739.27, "step": 11740, "train_speed(iter/s)": 0.213278 }, { "acc": 0.75650554, "epoch": 0.29794567786922543, "grad_norm": 3.265625, "learning_rate": 9.734560752594485e-06, "loss": 1.03853245, "memory(GiB)": 739.27, "step": 11745, "train_speed(iter/s)": 0.213127 }, { "acc": 0.7640626, "epoch": 0.298072517238263, "grad_norm": 5.1875, "learning_rate": 9.734223514895976e-06, "loss": 0.94307518, "memory(GiB)": 739.27, "step": 11750, "train_speed(iter/s)": 0.212963 }, { "acc": 0.76027889, "epoch": 0.29819935660730057, "grad_norm": 3.5, "learning_rate": 9.73388606895406e-06, "loss": 0.88321705, "memory(GiB)": 739.27, "step": 11755, "train_speed(iter/s)": 0.212784 }, { "acc": 0.75902381, "epoch": 0.29832619597633814, "grad_norm": 3.59375, "learning_rate": 9.73354841478358e-06, "loss": 0.9303751, "memory(GiB)": 739.27, "step": 11760, "train_speed(iter/s)": 0.212616 }, { "acc": 0.75265632, "epoch": 0.29845303534537565, "grad_norm": 3.953125, "learning_rate": 9.733210552399385e-06, "loss": 0.92026863, "memory(GiB)": 739.27, "step": 11765, "train_speed(iter/s)": 0.212437 }, { "acc": 0.75778236, "epoch": 0.2985798747144132, "grad_norm": 4.125, "learning_rate": 9.73287248181634e-06, "loss": 0.9069005, "memory(GiB)": 739.27, "step": 11770, "train_speed(iter/s)": 0.21226 }, { "acc": 0.76870012, "epoch": 0.2987067140834508, "grad_norm": 4.0625, "learning_rate": 9.732534203049313e-06, "loss": 0.8899971, "memory(GiB)": 739.27, "step": 11775, "train_speed(iter/s)": 0.212076 }, { "acc": 0.75667911, "epoch": 0.29883355345248835, "grad_norm": 3.03125, "learning_rate": 9.732195716113186e-06, "loss": 0.87998924, "memory(GiB)": 739.27, "step": 11780, "train_speed(iter/s)": 0.211869 }, { "acc": 0.75555468, "epoch": 0.2989603928215259, "grad_norm": 3.625, "learning_rate": 9.731857021022848e-06, "loss": 0.90526571, "memory(GiB)": 739.27, "step": 11785, "train_speed(iter/s)": 0.21172 }, { "acc": 0.77400432, "epoch": 0.2990872321905635, "grad_norm": 4.34375, "learning_rate": 9.731518117793195e-06, "loss": 0.92797718, "memory(GiB)": 739.27, "step": 11790, "train_speed(iter/s)": 0.211572 }, { "acc": 0.7737802, "epoch": 0.299214071559601, "grad_norm": 3.453125, "learning_rate": 9.731179006439136e-06, "loss": 0.89805632, "memory(GiB)": 739.27, "step": 11795, "train_speed(iter/s)": 0.211379 }, { "acc": 0.75632658, "epoch": 0.29934091092863857, "grad_norm": 3.3125, "learning_rate": 9.730839686975589e-06, "loss": 0.92337503, "memory(GiB)": 739.27, "step": 11800, "train_speed(iter/s)": 0.211206 }, { "acc": 0.76205511, "epoch": 0.29946775029767614, "grad_norm": 3.84375, "learning_rate": 9.730500159417473e-06, "loss": 0.92757616, "memory(GiB)": 739.27, "step": 11805, "train_speed(iter/s)": 0.211048 }, { "acc": 0.75867982, "epoch": 0.2995945896667137, "grad_norm": 5.0625, "learning_rate": 9.73016042377973e-06, "loss": 0.93268538, "memory(GiB)": 739.27, "step": 11810, "train_speed(iter/s)": 0.210853 }, { "acc": 0.7617094, "epoch": 0.2997214290357513, "grad_norm": 3.234375, "learning_rate": 9.729820480077303e-06, "loss": 0.87682724, "memory(GiB)": 739.27, "step": 11815, "train_speed(iter/s)": 0.210686 }, { "acc": 0.7517086, "epoch": 0.29984826840478884, "grad_norm": 3.125, "learning_rate": 9.72948032832514e-06, "loss": 1.00082207, "memory(GiB)": 739.27, "step": 11820, "train_speed(iter/s)": 0.210531 }, { "acc": 0.75816751, "epoch": 0.29997510777382635, "grad_norm": 3.25, "learning_rate": 9.729139968538209e-06, "loss": 0.9225771, "memory(GiB)": 739.27, "step": 11825, "train_speed(iter/s)": 0.210373 }, { "acc": 0.75120225, "epoch": 0.3001019471428639, "grad_norm": 5.03125, "learning_rate": 9.728799400731477e-06, "loss": 0.99966297, "memory(GiB)": 739.27, "step": 11830, "train_speed(iter/s)": 0.210189 }, { "acc": 0.77218885, "epoch": 0.3002287865119015, "grad_norm": 3.015625, "learning_rate": 9.728458624919926e-06, "loss": 0.88492451, "memory(GiB)": 739.27, "step": 11835, "train_speed(iter/s)": 0.209996 }, { "acc": 0.7593123, "epoch": 0.30035562588093906, "grad_norm": 3.140625, "learning_rate": 9.728117641118546e-06, "loss": 0.85113459, "memory(GiB)": 739.27, "step": 11840, "train_speed(iter/s)": 0.209846 }, { "acc": 0.77109556, "epoch": 0.3004824652499766, "grad_norm": 3.25, "learning_rate": 9.727776449342337e-06, "loss": 0.92963905, "memory(GiB)": 739.27, "step": 11845, "train_speed(iter/s)": 0.209683 }, { "acc": 0.76583066, "epoch": 0.3006093046190142, "grad_norm": 3.078125, "learning_rate": 9.727435049606303e-06, "loss": 0.88998604, "memory(GiB)": 739.27, "step": 11850, "train_speed(iter/s)": 0.209492 }, { "acc": 0.75796285, "epoch": 0.3007361439880517, "grad_norm": 4.03125, "learning_rate": 9.727093441925467e-06, "loss": 0.96937275, "memory(GiB)": 739.27, "step": 11855, "train_speed(iter/s)": 0.209356 }, { "acc": 0.76223507, "epoch": 0.3008629833570893, "grad_norm": 3.765625, "learning_rate": 9.72675162631485e-06, "loss": 0.90555878, "memory(GiB)": 739.27, "step": 11860, "train_speed(iter/s)": 0.209195 }, { "acc": 0.75516262, "epoch": 0.30098982272612684, "grad_norm": 4.1875, "learning_rate": 9.726409602789491e-06, "loss": 0.92302761, "memory(GiB)": 739.27, "step": 11865, "train_speed(iter/s)": 0.209024 }, { "acc": 0.76476579, "epoch": 0.3011166620951644, "grad_norm": 3.28125, "learning_rate": 9.726067371364431e-06, "loss": 0.91360178, "memory(GiB)": 739.27, "step": 11870, "train_speed(iter/s)": 0.208883 }, { "acc": 0.78028073, "epoch": 0.301243501464202, "grad_norm": 3.671875, "learning_rate": 9.725724932054726e-06, "loss": 0.91675224, "memory(GiB)": 739.27, "step": 11875, "train_speed(iter/s)": 0.208729 }, { "acc": 0.75950623, "epoch": 0.30137034083323955, "grad_norm": 3.203125, "learning_rate": 9.725382284875438e-06, "loss": 0.95822134, "memory(GiB)": 739.27, "step": 11880, "train_speed(iter/s)": 0.208571 }, { "acc": 0.75433574, "epoch": 0.30149718020227706, "grad_norm": 3.53125, "learning_rate": 9.72503942984164e-06, "loss": 0.99382, "memory(GiB)": 739.27, "step": 11885, "train_speed(iter/s)": 0.208419 }, { "acc": 0.7516326, "epoch": 0.3016240195713146, "grad_norm": 3.21875, "learning_rate": 9.724696366968411e-06, "loss": 0.98238506, "memory(GiB)": 739.27, "step": 11890, "train_speed(iter/s)": 0.208247 }, { "acc": 0.760816, "epoch": 0.3017508589403522, "grad_norm": 3.28125, "learning_rate": 9.724353096270844e-06, "loss": 0.92701292, "memory(GiB)": 739.27, "step": 11895, "train_speed(iter/s)": 0.208093 }, { "acc": 0.77687273, "epoch": 0.30187769830938976, "grad_norm": 3.4375, "learning_rate": 9.724009617764037e-06, "loss": 0.87537069, "memory(GiB)": 739.27, "step": 11900, "train_speed(iter/s)": 0.207932 }, { "acc": 0.75471725, "epoch": 0.30200453767842733, "grad_norm": 3.65625, "learning_rate": 9.723665931463098e-06, "loss": 0.91742439, "memory(GiB)": 739.27, "step": 11905, "train_speed(iter/s)": 0.207748 }, { "acc": 0.75736809, "epoch": 0.3021313770474649, "grad_norm": 4.1875, "learning_rate": 9.723322037383146e-06, "loss": 0.9594367, "memory(GiB)": 739.27, "step": 11910, "train_speed(iter/s)": 0.207598 }, { "acc": 0.75834718, "epoch": 0.3022582164165024, "grad_norm": 3.3125, "learning_rate": 9.722977935539307e-06, "loss": 0.93497858, "memory(GiB)": 739.27, "step": 11915, "train_speed(iter/s)": 0.207436 }, { "acc": 0.77578049, "epoch": 0.30238505578554, "grad_norm": 3.609375, "learning_rate": 9.722633625946715e-06, "loss": 0.84598122, "memory(GiB)": 739.27, "step": 11920, "train_speed(iter/s)": 0.207274 }, { "acc": 0.76526294, "epoch": 0.30251189515457755, "grad_norm": 3.703125, "learning_rate": 9.72228910862052e-06, "loss": 0.92968149, "memory(GiB)": 739.27, "step": 11925, "train_speed(iter/s)": 0.20711 }, { "acc": 0.75686345, "epoch": 0.3026387345236151, "grad_norm": 4.09375, "learning_rate": 9.72194438357587e-06, "loss": 0.97395134, "memory(GiB)": 739.27, "step": 11930, "train_speed(iter/s)": 0.206935 }, { "acc": 0.77564855, "epoch": 0.3027655738926527, "grad_norm": 4.25, "learning_rate": 9.721599450827934e-06, "loss": 0.89491835, "memory(GiB)": 739.27, "step": 11935, "train_speed(iter/s)": 0.206794 }, { "acc": 0.76935892, "epoch": 0.30289241326169025, "grad_norm": 4.09375, "learning_rate": 9.721254310391882e-06, "loss": 0.94490204, "memory(GiB)": 739.27, "step": 11940, "train_speed(iter/s)": 0.206654 }, { "acc": 0.75592484, "epoch": 0.30301925263072776, "grad_norm": 3.015625, "learning_rate": 9.720908962282893e-06, "loss": 0.89983377, "memory(GiB)": 739.27, "step": 11945, "train_speed(iter/s)": 0.206502 }, { "acc": 0.74642005, "epoch": 0.30314609199976533, "grad_norm": 2.96875, "learning_rate": 9.720563406516163e-06, "loss": 0.94695673, "memory(GiB)": 739.27, "step": 11950, "train_speed(iter/s)": 0.206341 }, { "acc": 0.76885772, "epoch": 0.3032729313688029, "grad_norm": 3.84375, "learning_rate": 9.720217643106889e-06, "loss": 0.93357973, "memory(GiB)": 739.27, "step": 11955, "train_speed(iter/s)": 0.206165 }, { "acc": 0.76300335, "epoch": 0.30339977073784047, "grad_norm": 3.546875, "learning_rate": 9.71987167207028e-06, "loss": 0.92367268, "memory(GiB)": 739.27, "step": 11960, "train_speed(iter/s)": 0.206021 }, { "acc": 0.76487193, "epoch": 0.30352661010687804, "grad_norm": 3.875, "learning_rate": 9.719525493421555e-06, "loss": 0.88316936, "memory(GiB)": 739.27, "step": 11965, "train_speed(iter/s)": 0.205848 }, { "acc": 0.76643896, "epoch": 0.3036534494759156, "grad_norm": 3.796875, "learning_rate": 9.71917910717594e-06, "loss": 0.95532198, "memory(GiB)": 739.27, "step": 11970, "train_speed(iter/s)": 0.2057 }, { "acc": 0.74603443, "epoch": 0.3037802888449531, "grad_norm": 3.421875, "learning_rate": 9.718832513348673e-06, "loss": 0.99139099, "memory(GiB)": 739.27, "step": 11975, "train_speed(iter/s)": 0.205545 }, { "acc": 0.76129375, "epoch": 0.3039071282139907, "grad_norm": 3.5625, "learning_rate": 9.718485711955e-06, "loss": 0.95750036, "memory(GiB)": 739.27, "step": 11980, "train_speed(iter/s)": 0.205406 }, { "acc": 0.74966578, "epoch": 0.30403396758302825, "grad_norm": 3.96875, "learning_rate": 9.718138703010173e-06, "loss": 0.93806715, "memory(GiB)": 739.27, "step": 11985, "train_speed(iter/s)": 0.205266 }, { "acc": 0.74996252, "epoch": 0.3041608069520658, "grad_norm": 3.4375, "learning_rate": 9.717791486529458e-06, "loss": 0.97519941, "memory(GiB)": 739.27, "step": 11990, "train_speed(iter/s)": 0.20508 }, { "acc": 0.76366816, "epoch": 0.3042876463211034, "grad_norm": 4.8125, "learning_rate": 9.717444062528128e-06, "loss": 0.93377399, "memory(GiB)": 739.27, "step": 11995, "train_speed(iter/s)": 0.204935 }, { "acc": 0.75699382, "epoch": 0.30441448569014096, "grad_norm": 3.921875, "learning_rate": 9.717096431021463e-06, "loss": 0.9068943, "memory(GiB)": 739.27, "step": 12000, "train_speed(iter/s)": 0.204803 }, { "epoch": 0.30441448569014096, "eval_acc": 0.7482241756093241, "eval_loss": 0.9046736359596252, "eval_runtime": 1152.5259, "eval_samples_per_second": 5.527, "eval_steps_per_second": 5.527, "step": 12000 }, { "acc": 0.74636865, "epoch": 0.30454132505917847, "grad_norm": 3.296875, "learning_rate": 9.716748592024757e-06, "loss": 0.98919744, "memory(GiB)": 681.43, "step": 12005, "train_speed(iter/s)": 25.745468 }, { "acc": 0.77174287, "epoch": 0.30466816442821604, "grad_norm": 3.3125, "learning_rate": 9.716400545553307e-06, "loss": 0.86848116, "memory(GiB)": 690.99, "step": 12010, "train_speed(iter/s)": 22.29205 }, { "acc": 0.75972152, "epoch": 0.3047950037972536, "grad_norm": 3.359375, "learning_rate": 9.716052291622424e-06, "loss": 0.90193148, "memory(GiB)": 690.99, "step": 12015, "train_speed(iter/s)": 19.790312 }, { "acc": 0.73755016, "epoch": 0.3049218431662912, "grad_norm": 4.1875, "learning_rate": 9.71570383024743e-06, "loss": 0.99364862, "memory(GiB)": 701.93, "step": 12020, "train_speed(iter/s)": 17.724902 }, { "acc": 0.75685906, "epoch": 0.30504868253532874, "grad_norm": 3.234375, "learning_rate": 9.715355161443647e-06, "loss": 0.9203186, "memory(GiB)": 701.93, "step": 12025, "train_speed(iter/s)": 16.053497 }, { "acc": 0.75000658, "epoch": 0.3051755219043663, "grad_norm": 3.578125, "learning_rate": 9.715006285226416e-06, "loss": 0.96080618, "memory(GiB)": 701.93, "step": 12030, "train_speed(iter/s)": 14.748922 }, { "acc": 0.76195226, "epoch": 0.3053023612734038, "grad_norm": 3.78125, "learning_rate": 9.714657201611081e-06, "loss": 0.9569459, "memory(GiB)": 701.93, "step": 12035, "train_speed(iter/s)": 13.657077 }, { "acc": 0.75045171, "epoch": 0.3054292006424414, "grad_norm": 3.875, "learning_rate": 9.714307910612997e-06, "loss": 0.96764374, "memory(GiB)": 701.93, "step": 12040, "train_speed(iter/s)": 12.656318 }, { "acc": 0.75459805, "epoch": 0.30555604001147896, "grad_norm": 3.796875, "learning_rate": 9.713958412247528e-06, "loss": 0.91255531, "memory(GiB)": 701.93, "step": 12045, "train_speed(iter/s)": 11.68271 }, { "acc": 0.76102252, "epoch": 0.3056828793805165, "grad_norm": 3.578125, "learning_rate": 9.71360870653005e-06, "loss": 0.89570198, "memory(GiB)": 701.94, "step": 12050, "train_speed(iter/s)": 10.999982 }, { "acc": 0.76737733, "epoch": 0.3058097187495541, "grad_norm": 3.703125, "learning_rate": 9.713258793475942e-06, "loss": 0.92242174, "memory(GiB)": 701.94, "step": 12055, "train_speed(iter/s)": 10.346345 }, { "acc": 0.76286845, "epoch": 0.30593655811859166, "grad_norm": 3.140625, "learning_rate": 9.712908673100598e-06, "loss": 0.89301329, "memory(GiB)": 701.94, "step": 12060, "train_speed(iter/s)": 9.80999 }, { "acc": 0.75490041, "epoch": 0.3060633974876292, "grad_norm": 4.15625, "learning_rate": 9.712558345419418e-06, "loss": 0.94499559, "memory(GiB)": 701.94, "step": 12065, "train_speed(iter/s)": 9.335 }, { "acc": 0.7420938, "epoch": 0.30619023685666674, "grad_norm": 4.09375, "learning_rate": 9.71220781044781e-06, "loss": 1.00611382, "memory(GiB)": 701.94, "step": 12070, "train_speed(iter/s)": 8.896271 }, { "acc": 0.76860628, "epoch": 0.3063170762257043, "grad_norm": 3.90625, "learning_rate": 9.711857068201196e-06, "loss": 0.91457872, "memory(GiB)": 701.94, "step": 12075, "train_speed(iter/s)": 8.483207 }, { "acc": 0.757445, "epoch": 0.3064439155947419, "grad_norm": 3.671875, "learning_rate": 9.711506118695003e-06, "loss": 0.95674906, "memory(GiB)": 701.94, "step": 12080, "train_speed(iter/s)": 8.091659 }, { "acc": 0.75584455, "epoch": 0.30657075496377945, "grad_norm": 3.890625, "learning_rate": 9.711154961944664e-06, "loss": 0.95805311, "memory(GiB)": 701.94, "step": 12085, "train_speed(iter/s)": 7.745571 }, { "acc": 0.75608764, "epoch": 0.306697594332817, "grad_norm": 3.578125, "learning_rate": 9.710803597965632e-06, "loss": 0.91645327, "memory(GiB)": 714.35, "step": 12090, "train_speed(iter/s)": 7.390007 }, { "acc": 0.76442485, "epoch": 0.3068244337018545, "grad_norm": 3.625, "learning_rate": 9.710452026773358e-06, "loss": 0.92629833, "memory(GiB)": 714.35, "step": 12095, "train_speed(iter/s)": 7.081314 }, { "acc": 0.75234098, "epoch": 0.3069512730708921, "grad_norm": 3.234375, "learning_rate": 9.71010024838331e-06, "loss": 0.94452467, "memory(GiB)": 714.35, "step": 12100, "train_speed(iter/s)": 6.820763 }, { "acc": 0.75644503, "epoch": 0.30707811243992966, "grad_norm": 3.859375, "learning_rate": 9.709748262810956e-06, "loss": 0.9564621, "memory(GiB)": 714.35, "step": 12105, "train_speed(iter/s)": 6.586182 }, { "acc": 0.75673013, "epoch": 0.30720495180896723, "grad_norm": 3.140625, "learning_rate": 9.709396070071784e-06, "loss": 0.85984221, "memory(GiB)": 714.35, "step": 12110, "train_speed(iter/s)": 6.322467 }, { "acc": 0.76935062, "epoch": 0.3073317911780048, "grad_norm": 4.40625, "learning_rate": 9.709043670181284e-06, "loss": 0.8614707, "memory(GiB)": 714.35, "step": 12115, "train_speed(iter/s)": 6.077823 }, { "acc": 0.75877857, "epoch": 0.30745863054704237, "grad_norm": 3.859375, "learning_rate": 9.708691063154956e-06, "loss": 0.9515727, "memory(GiB)": 714.35, "step": 12120, "train_speed(iter/s)": 5.885294 }, { "acc": 0.75026708, "epoch": 0.3075854699160799, "grad_norm": 3.796875, "learning_rate": 9.708338249008312e-06, "loss": 0.97174435, "memory(GiB)": 714.35, "step": 12125, "train_speed(iter/s)": 5.705789 }, { "acc": 0.75476165, "epoch": 0.30771230928511745, "grad_norm": 3.953125, "learning_rate": 9.707985227756868e-06, "loss": 0.97880745, "memory(GiB)": 714.35, "step": 12130, "train_speed(iter/s)": 5.543779 }, { "acc": 0.75918331, "epoch": 0.307839148654155, "grad_norm": 3.703125, "learning_rate": 9.707631999416158e-06, "loss": 0.94738417, "memory(GiB)": 714.35, "step": 12135, "train_speed(iter/s)": 5.400955 }, { "acc": 0.75224471, "epoch": 0.3079659880231926, "grad_norm": 3.765625, "learning_rate": 9.707278564001714e-06, "loss": 0.91490746, "memory(GiB)": 714.35, "step": 12140, "train_speed(iter/s)": 5.245115 }, { "acc": 0.76055732, "epoch": 0.30809282739223015, "grad_norm": 3.25, "learning_rate": 9.706924921529082e-06, "loss": 0.99226723, "memory(GiB)": 714.35, "step": 12145, "train_speed(iter/s)": 5.09113 }, { "acc": 0.7470356, "epoch": 0.3082196667612677, "grad_norm": 3.28125, "learning_rate": 9.706571072013823e-06, "loss": 0.98430271, "memory(GiB)": 714.35, "step": 12150, "train_speed(iter/s)": 4.954927 }, { "acc": 0.75587211, "epoch": 0.30834650613030523, "grad_norm": 3.796875, "learning_rate": 9.706217015471497e-06, "loss": 0.93747025, "memory(GiB)": 714.35, "step": 12155, "train_speed(iter/s)": 4.818329 }, { "acc": 0.7788856, "epoch": 0.3084733454993428, "grad_norm": 4.0625, "learning_rate": 9.70586275191768e-06, "loss": 0.87296782, "memory(GiB)": 714.35, "step": 12160, "train_speed(iter/s)": 4.700473 }, { "acc": 0.75812726, "epoch": 0.30860018486838037, "grad_norm": 3.984375, "learning_rate": 9.705508281367954e-06, "loss": 0.92681999, "memory(GiB)": 714.35, "step": 12165, "train_speed(iter/s)": 4.586865 }, { "acc": 0.75190468, "epoch": 0.30872702423741794, "grad_norm": 4.46875, "learning_rate": 9.70515360383791e-06, "loss": 0.99338255, "memory(GiB)": 714.35, "step": 12170, "train_speed(iter/s)": 4.462721 }, { "acc": 0.77738862, "epoch": 0.3088538636064555, "grad_norm": 3.46875, "learning_rate": 9.704798719343154e-06, "loss": 0.90482407, "memory(GiB)": 714.35, "step": 12175, "train_speed(iter/s)": 4.356831 }, { "acc": 0.76987214, "epoch": 0.30898070297549307, "grad_norm": 3.6875, "learning_rate": 9.704443627899292e-06, "loss": 0.90258598, "memory(GiB)": 714.35, "step": 12180, "train_speed(iter/s)": 4.249402 }, { "acc": 0.74643812, "epoch": 0.3091075423445306, "grad_norm": 3.09375, "learning_rate": 9.704088329521942e-06, "loss": 0.93671131, "memory(GiB)": 714.35, "step": 12185, "train_speed(iter/s)": 4.15522 }, { "acc": 0.76365724, "epoch": 0.30923438171356815, "grad_norm": 3.671875, "learning_rate": 9.703732824226735e-06, "loss": 0.88685503, "memory(GiB)": 714.35, "step": 12190, "train_speed(iter/s)": 4.052205 }, { "acc": 0.77812986, "epoch": 0.3093612210826057, "grad_norm": 3.703125, "learning_rate": 9.703377112029309e-06, "loss": 0.89453945, "memory(GiB)": 714.35, "step": 12195, "train_speed(iter/s)": 3.958891 }, { "acc": 0.75640917, "epoch": 0.3094880604516433, "grad_norm": 3.734375, "learning_rate": 9.703021192945309e-06, "loss": 0.97895403, "memory(GiB)": 714.35, "step": 12200, "train_speed(iter/s)": 3.879186 }, { "acc": 0.77636414, "epoch": 0.30961489982068086, "grad_norm": 3.984375, "learning_rate": 9.702665066990391e-06, "loss": 0.89035463, "memory(GiB)": 714.35, "step": 12205, "train_speed(iter/s)": 3.793606 }, { "acc": 0.7615911, "epoch": 0.3097417391897184, "grad_norm": 3.390625, "learning_rate": 9.70230873418022e-06, "loss": 0.93544693, "memory(GiB)": 714.35, "step": 12210, "train_speed(iter/s)": 3.71274 }, { "acc": 0.75618477, "epoch": 0.30986857855875594, "grad_norm": 3.546875, "learning_rate": 9.701952194530473e-06, "loss": 0.94164753, "memory(GiB)": 714.35, "step": 12215, "train_speed(iter/s)": 3.640132 }, { "acc": 0.76621718, "epoch": 0.3099954179277935, "grad_norm": 3.953125, "learning_rate": 9.701595448056829e-06, "loss": 0.92784777, "memory(GiB)": 714.35, "step": 12220, "train_speed(iter/s)": 3.570169 }, { "acc": 0.75793638, "epoch": 0.31012225729683107, "grad_norm": 3.390625, "learning_rate": 9.70123849477498e-06, "loss": 0.89849625, "memory(GiB)": 714.35, "step": 12225, "train_speed(iter/s)": 3.505498 }, { "acc": 0.76546774, "epoch": 0.31024909666586864, "grad_norm": 3.46875, "learning_rate": 9.70088133470063e-06, "loss": 0.93396873, "memory(GiB)": 714.35, "step": 12230, "train_speed(iter/s)": 3.439373 }, { "acc": 0.75439873, "epoch": 0.3103759360349062, "grad_norm": 3.453125, "learning_rate": 9.700523967849488e-06, "loss": 0.98493595, "memory(GiB)": 714.35, "step": 12235, "train_speed(iter/s)": 3.379491 }, { "acc": 0.75589123, "epoch": 0.3105027754039438, "grad_norm": 3.390625, "learning_rate": 9.700166394237273e-06, "loss": 0.92788029, "memory(GiB)": 714.35, "step": 12240, "train_speed(iter/s)": 3.321498 }, { "acc": 0.74634128, "epoch": 0.3106296147729813, "grad_norm": 3.453125, "learning_rate": 9.699808613879714e-06, "loss": 0.96644859, "memory(GiB)": 714.35, "step": 12245, "train_speed(iter/s)": 3.267702 }, { "acc": 0.74561992, "epoch": 0.31075645414201886, "grad_norm": 3.96875, "learning_rate": 9.699450626792549e-06, "loss": 1.01061974, "memory(GiB)": 714.35, "step": 12250, "train_speed(iter/s)": 3.210852 }, { "acc": 0.77733417, "epoch": 0.3108832935110564, "grad_norm": 3.75, "learning_rate": 9.699092432991523e-06, "loss": 0.87949629, "memory(GiB)": 714.35, "step": 12255, "train_speed(iter/s)": 3.15453 }, { "acc": 0.74466968, "epoch": 0.311010132880094, "grad_norm": 3.25, "learning_rate": 9.698734032492394e-06, "loss": 0.97275019, "memory(GiB)": 714.35, "step": 12260, "train_speed(iter/s)": 3.102642 }, { "acc": 0.75539122, "epoch": 0.31113697224913156, "grad_norm": 4.15625, "learning_rate": 9.698375425310927e-06, "loss": 0.99256744, "memory(GiB)": 714.35, "step": 12265, "train_speed(iter/s)": 3.055398 }, { "acc": 0.75866995, "epoch": 0.31126381161816913, "grad_norm": 3.28125, "learning_rate": 9.698016611462893e-06, "loss": 0.9080307, "memory(GiB)": 714.35, "step": 12270, "train_speed(iter/s)": 3.007887 }, { "acc": 0.76025672, "epoch": 0.31139065098720664, "grad_norm": 3.671875, "learning_rate": 9.697657590964079e-06, "loss": 0.93272104, "memory(GiB)": 714.35, "step": 12275, "train_speed(iter/s)": 2.963925 }, { "acc": 0.74871311, "epoch": 0.3115174903562442, "grad_norm": 5.34375, "learning_rate": 9.697298363830274e-06, "loss": 0.96110468, "memory(GiB)": 714.35, "step": 12280, "train_speed(iter/s)": 2.917121 }, { "acc": 0.76057491, "epoch": 0.3116443297252818, "grad_norm": 3.734375, "learning_rate": 9.69693893007728e-06, "loss": 0.95644722, "memory(GiB)": 714.35, "step": 12285, "train_speed(iter/s)": 2.874159 }, { "acc": 0.76060719, "epoch": 0.31177116909431934, "grad_norm": 3.234375, "learning_rate": 9.696579289720908e-06, "loss": 0.93175573, "memory(GiB)": 714.35, "step": 12290, "train_speed(iter/s)": 2.829668 }, { "acc": 0.76899071, "epoch": 0.3118980084633569, "grad_norm": 4.46875, "learning_rate": 9.69621944277698e-06, "loss": 0.95185747, "memory(GiB)": 714.35, "step": 12295, "train_speed(iter/s)": 2.792736 }, { "acc": 0.75042815, "epoch": 0.3120248478323945, "grad_norm": 3.8125, "learning_rate": 9.695859389261319e-06, "loss": 0.95863323, "memory(GiB)": 714.35, "step": 12300, "train_speed(iter/s)": 2.756245 }, { "acc": 0.77072163, "epoch": 0.312151687201432, "grad_norm": 4.46875, "learning_rate": 9.695499129189764e-06, "loss": 0.93208828, "memory(GiB)": 714.35, "step": 12305, "train_speed(iter/s)": 2.718437 }, { "acc": 0.76386485, "epoch": 0.31227852657046956, "grad_norm": 3.765625, "learning_rate": 9.695138662578166e-06, "loss": 0.93492975, "memory(GiB)": 714.35, "step": 12310, "train_speed(iter/s)": 2.68116 }, { "acc": 0.75385723, "epoch": 0.31240536593950713, "grad_norm": 3.90625, "learning_rate": 9.694777989442377e-06, "loss": 0.96310244, "memory(GiB)": 714.35, "step": 12315, "train_speed(iter/s)": 2.644257 }, { "acc": 0.75754204, "epoch": 0.3125322053085447, "grad_norm": 3.03125, "learning_rate": 9.694417109798263e-06, "loss": 0.92461205, "memory(GiB)": 714.35, "step": 12320, "train_speed(iter/s)": 2.609204 }, { "acc": 0.75493765, "epoch": 0.31265904467758227, "grad_norm": 3.515625, "learning_rate": 9.694056023661698e-06, "loss": 0.97744045, "memory(GiB)": 714.35, "step": 12325, "train_speed(iter/s)": 2.573044 }, { "acc": 0.75867782, "epoch": 0.31278588404661983, "grad_norm": 3.65625, "learning_rate": 9.693694731048563e-06, "loss": 0.96505032, "memory(GiB)": 714.35, "step": 12330, "train_speed(iter/s)": 2.540092 }, { "acc": 0.77897339, "epoch": 0.31291272341565735, "grad_norm": 3.625, "learning_rate": 9.693333231974753e-06, "loss": 0.88929691, "memory(GiB)": 726.89, "step": 12335, "train_speed(iter/s)": 2.505835 }, { "acc": 0.76895704, "epoch": 0.3130395627846949, "grad_norm": 3.390625, "learning_rate": 9.692971526456167e-06, "loss": 0.89437189, "memory(GiB)": 726.89, "step": 12340, "train_speed(iter/s)": 2.470732 }, { "acc": 0.76002707, "epoch": 0.3131664021537325, "grad_norm": 3.875, "learning_rate": 9.692609614508718e-06, "loss": 0.91689062, "memory(GiB)": 726.89, "step": 12345, "train_speed(iter/s)": 2.438315 }, { "acc": 0.76796637, "epoch": 0.31329324152277005, "grad_norm": 3.875, "learning_rate": 9.692247496148323e-06, "loss": 0.91795111, "memory(GiB)": 726.89, "step": 12350, "train_speed(iter/s)": 2.408421 }, { "acc": 0.7610023, "epoch": 0.3134200808918076, "grad_norm": 3.078125, "learning_rate": 9.691885171390912e-06, "loss": 0.92265806, "memory(GiB)": 726.89, "step": 12355, "train_speed(iter/s)": 2.379051 }, { "acc": 0.75623021, "epoch": 0.3135469202608452, "grad_norm": 3.328125, "learning_rate": 9.691522640252419e-06, "loss": 0.94626379, "memory(GiB)": 726.89, "step": 12360, "train_speed(iter/s)": 2.352632 }, { "acc": 0.76019669, "epoch": 0.3136737596298827, "grad_norm": 3.1875, "learning_rate": 9.691159902748795e-06, "loss": 0.96556969, "memory(GiB)": 726.89, "step": 12365, "train_speed(iter/s)": 2.322929 }, { "acc": 0.75595231, "epoch": 0.31380059899892027, "grad_norm": 3.484375, "learning_rate": 9.690796958895992e-06, "loss": 0.91918983, "memory(GiB)": 726.89, "step": 12370, "train_speed(iter/s)": 2.297283 }, { "acc": 0.75444679, "epoch": 0.31392743836795783, "grad_norm": 3.421875, "learning_rate": 9.690433808709978e-06, "loss": 0.94066696, "memory(GiB)": 726.89, "step": 12375, "train_speed(iter/s)": 2.270241 }, { "acc": 0.74488435, "epoch": 0.3140542777369954, "grad_norm": 3.375, "learning_rate": 9.690070452206725e-06, "loss": 0.94125471, "memory(GiB)": 726.89, "step": 12380, "train_speed(iter/s)": 2.245341 }, { "acc": 0.74056311, "epoch": 0.31418111710603297, "grad_norm": 3.25, "learning_rate": 9.689706889402217e-06, "loss": 0.96008215, "memory(GiB)": 726.89, "step": 12385, "train_speed(iter/s)": 2.217039 }, { "acc": 0.76759381, "epoch": 0.31430795647507054, "grad_norm": 3.671875, "learning_rate": 9.689343120312446e-06, "loss": 0.9201355, "memory(GiB)": 726.89, "step": 12390, "train_speed(iter/s)": 2.19287 }, { "acc": 0.76621799, "epoch": 0.31443479584410805, "grad_norm": 3.5, "learning_rate": 9.68897914495341e-06, "loss": 0.91438084, "memory(GiB)": 726.89, "step": 12395, "train_speed(iter/s)": 2.170287 }, { "acc": 0.76918392, "epoch": 0.3145616352131456, "grad_norm": 3.796875, "learning_rate": 9.688614963341122e-06, "loss": 0.88287992, "memory(GiB)": 726.89, "step": 12400, "train_speed(iter/s)": 2.146054 }, { "acc": 0.76873732, "epoch": 0.3146884745821832, "grad_norm": 3.9375, "learning_rate": 9.688250575491603e-06, "loss": 0.89112825, "memory(GiB)": 726.89, "step": 12405, "train_speed(iter/s)": 2.122465 }, { "acc": 0.75291071, "epoch": 0.31481531395122075, "grad_norm": 3.53125, "learning_rate": 9.687885981420875e-06, "loss": 0.98830404, "memory(GiB)": 726.89, "step": 12410, "train_speed(iter/s)": 2.09889 }, { "acc": 0.75637455, "epoch": 0.3149421533202583, "grad_norm": 3.28125, "learning_rate": 9.687521181144983e-06, "loss": 0.91533403, "memory(GiB)": 726.89, "step": 12415, "train_speed(iter/s)": 2.078245 }, { "acc": 0.74881139, "epoch": 0.3150689926892959, "grad_norm": 3.859375, "learning_rate": 9.687156174679967e-06, "loss": 0.94414444, "memory(GiB)": 726.89, "step": 12420, "train_speed(iter/s)": 2.054301 }, { "acc": 0.73542895, "epoch": 0.3151958320583334, "grad_norm": 2.9375, "learning_rate": 9.686790962041886e-06, "loss": 0.99535513, "memory(GiB)": 726.89, "step": 12425, "train_speed(iter/s)": 2.031497 }, { "acc": 0.76169357, "epoch": 0.31532267142737097, "grad_norm": 3.875, "learning_rate": 9.686425543246803e-06, "loss": 0.92269497, "memory(GiB)": 726.89, "step": 12430, "train_speed(iter/s)": 2.01171 }, { "acc": 0.74079571, "epoch": 0.31544951079640854, "grad_norm": 3.671875, "learning_rate": 9.686059918310793e-06, "loss": 0.95180454, "memory(GiB)": 726.89, "step": 12435, "train_speed(iter/s)": 1.990978 }, { "acc": 0.75448694, "epoch": 0.3155763501654461, "grad_norm": 3.21875, "learning_rate": 9.685694087249938e-06, "loss": 0.93843508, "memory(GiB)": 726.89, "step": 12440, "train_speed(iter/s)": 1.970249 }, { "acc": 0.75809755, "epoch": 0.3157031895344837, "grad_norm": 3.734375, "learning_rate": 9.685328050080329e-06, "loss": 0.91769438, "memory(GiB)": 726.89, "step": 12445, "train_speed(iter/s)": 1.951867 }, { "acc": 0.75792718, "epoch": 0.31583002890352124, "grad_norm": 3.828125, "learning_rate": 9.684961806818067e-06, "loss": 0.95424347, "memory(GiB)": 726.89, "step": 12450, "train_speed(iter/s)": 1.932304 }, { "acc": 0.76383762, "epoch": 0.31595686827255876, "grad_norm": 3.296875, "learning_rate": 9.684595357479263e-06, "loss": 0.90512676, "memory(GiB)": 740.26, "step": 12455, "train_speed(iter/s)": 1.909842 }, { "acc": 0.75258861, "epoch": 0.3160837076415963, "grad_norm": 3.296875, "learning_rate": 9.684228702080036e-06, "loss": 0.94817114, "memory(GiB)": 740.26, "step": 12460, "train_speed(iter/s)": 1.891458 }, { "acc": 0.75915837, "epoch": 0.3162105470106339, "grad_norm": 3.875, "learning_rate": 9.68386184063651e-06, "loss": 0.94664297, "memory(GiB)": 740.26, "step": 12465, "train_speed(iter/s)": 1.874221 }, { "acc": 0.75921597, "epoch": 0.31633738637967146, "grad_norm": 4.15625, "learning_rate": 9.68349477316483e-06, "loss": 0.91069098, "memory(GiB)": 740.26, "step": 12470, "train_speed(iter/s)": 1.857752 }, { "acc": 0.75956926, "epoch": 0.316464225748709, "grad_norm": 3.765625, "learning_rate": 9.683127499681135e-06, "loss": 0.90083618, "memory(GiB)": 740.26, "step": 12475, "train_speed(iter/s)": 1.837577 }, { "acc": 0.75951576, "epoch": 0.3165910651177466, "grad_norm": 4.1875, "learning_rate": 9.682760020201582e-06, "loss": 0.92615805, "memory(GiB)": 740.26, "step": 12480, "train_speed(iter/s)": 1.820405 }, { "acc": 0.74868288, "epoch": 0.3167179044867841, "grad_norm": 3.75, "learning_rate": 9.682392334742337e-06, "loss": 0.94194918, "memory(GiB)": 740.26, "step": 12485, "train_speed(iter/s)": 1.802096 }, { "acc": 0.74533434, "epoch": 0.3168447438558217, "grad_norm": 3.265625, "learning_rate": 9.682024443319572e-06, "loss": 0.92447309, "memory(GiB)": 740.26, "step": 12490, "train_speed(iter/s)": 1.785408 }, { "acc": 0.76000166, "epoch": 0.31697158322485924, "grad_norm": 3.78125, "learning_rate": 9.681656345949471e-06, "loss": 0.93439236, "memory(GiB)": 740.26, "step": 12495, "train_speed(iter/s)": 1.770599 }, { "acc": 0.7681314, "epoch": 0.3170984225938968, "grad_norm": 3.265625, "learning_rate": 9.681288042648221e-06, "loss": 0.88914747, "memory(GiB)": 740.26, "step": 12500, "train_speed(iter/s)": 1.754471 }, { "epoch": 0.3170984225938968, "eval_acc": 0.748611445905191, "eval_loss": 0.903021514415741, "eval_runtime": 1152.439, "eval_samples_per_second": 5.527, "eval_steps_per_second": 5.527, "step": 12500 }, { "acc": 0.75621033, "epoch": 0.3172252619629344, "grad_norm": 3.625, "learning_rate": 9.680919533432027e-06, "loss": 1.02733364, "memory(GiB)": 740.26, "step": 12505, "train_speed(iter/s)": 1.390941 }, { "acc": 0.76098895, "epoch": 0.31735210133197195, "grad_norm": 3.578125, "learning_rate": 9.680550818317099e-06, "loss": 0.93606262, "memory(GiB)": 740.26, "step": 12510, "train_speed(iter/s)": 1.380559 }, { "acc": 0.76355515, "epoch": 0.31747894070100946, "grad_norm": 3.890625, "learning_rate": 9.68018189731965e-06, "loss": 0.94707012, "memory(GiB)": 740.26, "step": 12515, "train_speed(iter/s)": 1.370395 }, { "acc": 0.75054431, "epoch": 0.31760578007004703, "grad_norm": 3.59375, "learning_rate": 9.679812770455913e-06, "loss": 0.92232523, "memory(GiB)": 740.26, "step": 12520, "train_speed(iter/s)": 1.360757 }, { "acc": 0.77215958, "epoch": 0.3177326194390846, "grad_norm": 3.28125, "learning_rate": 9.679443437742124e-06, "loss": 0.91561041, "memory(GiB)": 740.28, "step": 12525, "train_speed(iter/s)": 1.351546 }, { "acc": 0.75383015, "epoch": 0.31785945880812216, "grad_norm": 3.296875, "learning_rate": 9.679073899194527e-06, "loss": 0.95084915, "memory(GiB)": 740.28, "step": 12530, "train_speed(iter/s)": 1.342704 }, { "acc": 0.74931345, "epoch": 0.31798629817715973, "grad_norm": 4.0, "learning_rate": 9.678704154829376e-06, "loss": 0.95761213, "memory(GiB)": 740.28, "step": 12535, "train_speed(iter/s)": 1.333969 }, { "acc": 0.7416162, "epoch": 0.3181131375461973, "grad_norm": 3.671875, "learning_rate": 9.67833420466294e-06, "loss": 0.97970285, "memory(GiB)": 740.28, "step": 12540, "train_speed(iter/s)": 1.32528 }, { "acc": 0.75939207, "epoch": 0.3182399769152348, "grad_norm": 4.09375, "learning_rate": 9.677964048711485e-06, "loss": 0.93996487, "memory(GiB)": 740.28, "step": 12545, "train_speed(iter/s)": 1.315818 }, { "acc": 0.7474246, "epoch": 0.3183668162842724, "grad_norm": 3.96875, "learning_rate": 9.677593686991298e-06, "loss": 0.99436178, "memory(GiB)": 740.28, "step": 12550, "train_speed(iter/s)": 1.306248 }, { "acc": 0.73475132, "epoch": 0.31849365565330995, "grad_norm": 4.46875, "learning_rate": 9.677223119518669e-06, "loss": 1.02159662, "memory(GiB)": 740.28, "step": 12555, "train_speed(iter/s)": 1.298161 }, { "acc": 0.75749679, "epoch": 0.3186204950223475, "grad_norm": 4.15625, "learning_rate": 9.676852346309896e-06, "loss": 0.95077257, "memory(GiB)": 740.28, "step": 12560, "train_speed(iter/s)": 1.288662 }, { "acc": 0.75401149, "epoch": 0.3187473343913851, "grad_norm": 3.625, "learning_rate": 9.676481367381289e-06, "loss": 0.98502169, "memory(GiB)": 740.28, "step": 12565, "train_speed(iter/s)": 1.28087 }, { "acc": 0.76156888, "epoch": 0.31887417376042265, "grad_norm": 3.390625, "learning_rate": 9.67611018274917e-06, "loss": 0.95360203, "memory(GiB)": 740.28, "step": 12570, "train_speed(iter/s)": 1.2725 }, { "acc": 0.75063353, "epoch": 0.31900101312946016, "grad_norm": 3.671875, "learning_rate": 9.675738792429861e-06, "loss": 0.97811527, "memory(GiB)": 740.28, "step": 12575, "train_speed(iter/s)": 1.263584 }, { "acc": 0.76869121, "epoch": 0.31912785249849773, "grad_norm": 3.453125, "learning_rate": 9.6753671964397e-06, "loss": 0.91561403, "memory(GiB)": 740.28, "step": 12580, "train_speed(iter/s)": 1.254763 }, { "acc": 0.75455236, "epoch": 0.3192546918675353, "grad_norm": 3.640625, "learning_rate": 9.674995394795034e-06, "loss": 0.96511145, "memory(GiB)": 740.28, "step": 12585, "train_speed(iter/s)": 1.247184 }, { "acc": 0.75748291, "epoch": 0.31938153123657287, "grad_norm": 3.828125, "learning_rate": 9.674623387512215e-06, "loss": 0.98412828, "memory(GiB)": 740.28, "step": 12590, "train_speed(iter/s)": 1.239776 }, { "acc": 0.75657911, "epoch": 0.31950837060561044, "grad_norm": 4.40625, "learning_rate": 9.674251174607608e-06, "loss": 0.99423933, "memory(GiB)": 740.28, "step": 12595, "train_speed(iter/s)": 1.232126 }, { "acc": 0.75925541, "epoch": 0.319635209974648, "grad_norm": 4.65625, "learning_rate": 9.673878756097584e-06, "loss": 0.93268995, "memory(GiB)": 740.28, "step": 12600, "train_speed(iter/s)": 1.22387 }, { "acc": 0.76706319, "epoch": 0.3197620493436855, "grad_norm": 3.890625, "learning_rate": 9.673506131998527e-06, "loss": 0.91011934, "memory(GiB)": 740.28, "step": 12605, "train_speed(iter/s)": 1.216681 }, { "acc": 0.75089412, "epoch": 0.3198888887127231, "grad_norm": 4.6875, "learning_rate": 9.673133302326826e-06, "loss": 0.93604946, "memory(GiB)": 740.28, "step": 12610, "train_speed(iter/s)": 1.209883 }, { "acc": 0.74969335, "epoch": 0.32001572808176065, "grad_norm": 3.625, "learning_rate": 9.672760267098877e-06, "loss": 0.9873085, "memory(GiB)": 740.28, "step": 12615, "train_speed(iter/s)": 1.20219 }, { "acc": 0.76627932, "epoch": 0.3201425674507982, "grad_norm": 3.3125, "learning_rate": 9.672387026331095e-06, "loss": 0.92843142, "memory(GiB)": 740.28, "step": 12620, "train_speed(iter/s)": 1.194772 }, { "acc": 0.76596165, "epoch": 0.3202694068198358, "grad_norm": 3.625, "learning_rate": 9.672013580039896e-06, "loss": 0.90036144, "memory(GiB)": 740.28, "step": 12625, "train_speed(iter/s)": 1.187099 }, { "acc": 0.75837183, "epoch": 0.32039624618887336, "grad_norm": 3.390625, "learning_rate": 9.671639928241703e-06, "loss": 1.02114353, "memory(GiB)": 740.28, "step": 12630, "train_speed(iter/s)": 1.179731 }, { "acc": 0.7666863, "epoch": 0.32052308555791087, "grad_norm": 3.40625, "learning_rate": 9.671266070952955e-06, "loss": 0.91697569, "memory(GiB)": 740.28, "step": 12635, "train_speed(iter/s)": 1.172976 }, { "acc": 0.75864472, "epoch": 0.32064992492694844, "grad_norm": 4.03125, "learning_rate": 9.670892008190098e-06, "loss": 0.93136387, "memory(GiB)": 740.28, "step": 12640, "train_speed(iter/s)": 1.166661 }, { "acc": 0.75733557, "epoch": 0.320776764295986, "grad_norm": 3.078125, "learning_rate": 9.670517739969582e-06, "loss": 0.96247272, "memory(GiB)": 740.28, "step": 12645, "train_speed(iter/s)": 1.159524 }, { "acc": 0.76093302, "epoch": 0.3209036036650236, "grad_norm": 3.6875, "learning_rate": 9.670143266307872e-06, "loss": 0.94876652, "memory(GiB)": 740.28, "step": 12650, "train_speed(iter/s)": 1.153021 }, { "acc": 0.75275979, "epoch": 0.32103044303406114, "grad_norm": 3.53125, "learning_rate": 9.66976858722144e-06, "loss": 0.95036345, "memory(GiB)": 740.28, "step": 12655, "train_speed(iter/s)": 1.145594 }, { "acc": 0.76232061, "epoch": 0.3211572824030987, "grad_norm": 3.625, "learning_rate": 9.669393702726766e-06, "loss": 0.92810192, "memory(GiB)": 740.28, "step": 12660, "train_speed(iter/s)": 1.139273 }, { "acc": 0.75606899, "epoch": 0.3212841217721362, "grad_norm": 3.171875, "learning_rate": 9.669018612840343e-06, "loss": 0.93427191, "memory(GiB)": 740.28, "step": 12665, "train_speed(iter/s)": 1.133311 }, { "acc": 0.76280813, "epoch": 0.3214109611411738, "grad_norm": 3.9375, "learning_rate": 9.668643317578664e-06, "loss": 0.89160776, "memory(GiB)": 740.28, "step": 12670, "train_speed(iter/s)": 1.126986 }, { "acc": 0.776894, "epoch": 0.32153780051021136, "grad_norm": 4.28125, "learning_rate": 9.668267816958243e-06, "loss": 0.90523453, "memory(GiB)": 740.28, "step": 12675, "train_speed(iter/s)": 1.120532 }, { "acc": 0.74784589, "epoch": 0.3216646398792489, "grad_norm": 3.484375, "learning_rate": 9.667892110995593e-06, "loss": 0.97676907, "memory(GiB)": 740.28, "step": 12680, "train_speed(iter/s)": 1.11412 }, { "acc": 0.77922034, "epoch": 0.3217914792482865, "grad_norm": 4.21875, "learning_rate": 9.667516199707244e-06, "loss": 0.85627298, "memory(GiB)": 740.28, "step": 12685, "train_speed(iter/s)": 1.108037 }, { "acc": 0.75690761, "epoch": 0.32191831861732406, "grad_norm": 4.0625, "learning_rate": 9.667140083109728e-06, "loss": 0.94420795, "memory(GiB)": 740.28, "step": 12690, "train_speed(iter/s)": 1.102248 }, { "acc": 0.76134014, "epoch": 0.3220451579863616, "grad_norm": 3.96875, "learning_rate": 9.666763761219591e-06, "loss": 0.95165567, "memory(GiB)": 740.28, "step": 12695, "train_speed(iter/s)": 1.096184 }, { "acc": 0.76272092, "epoch": 0.32217199735539914, "grad_norm": 3.40625, "learning_rate": 9.666387234053385e-06, "loss": 0.9271843, "memory(GiB)": 740.28, "step": 12700, "train_speed(iter/s)": 1.090167 }, { "acc": 0.76044245, "epoch": 0.3222988367244367, "grad_norm": 3.921875, "learning_rate": 9.666010501627671e-06, "loss": 0.91485987, "memory(GiB)": 740.28, "step": 12705, "train_speed(iter/s)": 1.084135 }, { "acc": 0.76631179, "epoch": 0.3224256760934743, "grad_norm": 3.609375, "learning_rate": 9.665633563959022e-06, "loss": 0.95088787, "memory(GiB)": 740.28, "step": 12710, "train_speed(iter/s)": 1.078676 }, { "acc": 0.76366892, "epoch": 0.32255251546251185, "grad_norm": 3.265625, "learning_rate": 9.66525642106402e-06, "loss": 0.97255583, "memory(GiB)": 740.28, "step": 12715, "train_speed(iter/s)": 1.073271 }, { "acc": 0.75607533, "epoch": 0.3226793548315494, "grad_norm": 3.296875, "learning_rate": 9.664879072959253e-06, "loss": 0.95195713, "memory(GiB)": 740.28, "step": 12720, "train_speed(iter/s)": 1.067558 }, { "acc": 0.75572042, "epoch": 0.3228061942005869, "grad_norm": 3.328125, "learning_rate": 9.664501519661316e-06, "loss": 0.9569643, "memory(GiB)": 740.28, "step": 12725, "train_speed(iter/s)": 1.06197 }, { "acc": 0.76304202, "epoch": 0.3229330335696245, "grad_norm": 3.234375, "learning_rate": 9.664123761186821e-06, "loss": 0.89024496, "memory(GiB)": 740.28, "step": 12730, "train_speed(iter/s)": 1.055776 }, { "acc": 0.76485386, "epoch": 0.32305987293866206, "grad_norm": 3.21875, "learning_rate": 9.663745797552381e-06, "loss": 0.95200396, "memory(GiB)": 740.28, "step": 12735, "train_speed(iter/s)": 1.050193 }, { "acc": 0.75688324, "epoch": 0.32318671230769963, "grad_norm": 3.5, "learning_rate": 9.663367628774625e-06, "loss": 0.93074808, "memory(GiB)": 740.28, "step": 12740, "train_speed(iter/s)": 1.04465 }, { "acc": 0.77071447, "epoch": 0.3233135516767372, "grad_norm": 4.1875, "learning_rate": 9.662989254870185e-06, "loss": 0.98351879, "memory(GiB)": 740.28, "step": 12745, "train_speed(iter/s)": 1.039581 }, { "acc": 0.76719708, "epoch": 0.32344039104577477, "grad_norm": 3.234375, "learning_rate": 9.662610675855703e-06, "loss": 0.8621294, "memory(GiB)": 740.28, "step": 12750, "train_speed(iter/s)": 1.0343 }, { "acc": 0.75492806, "epoch": 0.3235672304148123, "grad_norm": 3.921875, "learning_rate": 9.662231891747835e-06, "loss": 0.92779531, "memory(GiB)": 740.28, "step": 12755, "train_speed(iter/s)": 1.029331 }, { "acc": 0.7592639, "epoch": 0.32369406978384985, "grad_norm": 3.59375, "learning_rate": 9.661852902563242e-06, "loss": 0.93588905, "memory(GiB)": 740.28, "step": 12760, "train_speed(iter/s)": 1.024218 }, { "acc": 0.75560708, "epoch": 0.3238209091528874, "grad_norm": 3.375, "learning_rate": 9.661473708318592e-06, "loss": 0.89484434, "memory(GiB)": 740.28, "step": 12765, "train_speed(iter/s)": 1.018813 }, { "acc": 0.74731083, "epoch": 0.323947748521925, "grad_norm": 4.78125, "learning_rate": 9.661094309030565e-06, "loss": 0.93065815, "memory(GiB)": 740.28, "step": 12770, "train_speed(iter/s)": 1.013792 }, { "acc": 0.74987822, "epoch": 0.32407458789096255, "grad_norm": 3.953125, "learning_rate": 9.660714704715851e-06, "loss": 0.97086582, "memory(GiB)": 740.28, "step": 12775, "train_speed(iter/s)": 1.008502 }, { "acc": 0.76953545, "epoch": 0.3242014272600001, "grad_norm": 3.578125, "learning_rate": 9.660334895391149e-06, "loss": 0.90312243, "memory(GiB)": 740.28, "step": 12780, "train_speed(iter/s)": 1.003675 }, { "acc": 0.73846517, "epoch": 0.32432826662903763, "grad_norm": 3.765625, "learning_rate": 9.659954881073162e-06, "loss": 0.96551666, "memory(GiB)": 740.28, "step": 12785, "train_speed(iter/s)": 0.998901 }, { "acc": 0.76298947, "epoch": 0.3244551059980752, "grad_norm": 3.140625, "learning_rate": 9.659574661778606e-06, "loss": 0.90173368, "memory(GiB)": 740.28, "step": 12790, "train_speed(iter/s)": 0.994352 }, { "acc": 0.75615325, "epoch": 0.32458194536711277, "grad_norm": 4.0625, "learning_rate": 9.659194237524209e-06, "loss": 0.9779254, "memory(GiB)": 740.28, "step": 12795, "train_speed(iter/s)": 0.989563 }, { "acc": 0.75970297, "epoch": 0.32470878473615034, "grad_norm": 3.796875, "learning_rate": 9.658813608326703e-06, "loss": 0.95575027, "memory(GiB)": 740.28, "step": 12800, "train_speed(iter/s)": 0.984695 }, { "acc": 0.75258212, "epoch": 0.3248356241051879, "grad_norm": 3.84375, "learning_rate": 9.658432774202828e-06, "loss": 0.95537405, "memory(GiB)": 740.28, "step": 12805, "train_speed(iter/s)": 0.980533 }, { "acc": 0.75903578, "epoch": 0.32496246347422547, "grad_norm": 3.265625, "learning_rate": 9.658051735169339e-06, "loss": 0.89311533, "memory(GiB)": 740.28, "step": 12810, "train_speed(iter/s)": 0.97545 }, { "acc": 0.77548227, "epoch": 0.325089302843263, "grad_norm": 3.140625, "learning_rate": 9.657670491242996e-06, "loss": 0.84895344, "memory(GiB)": 740.28, "step": 12815, "train_speed(iter/s)": 0.970638 }, { "acc": 0.77153015, "epoch": 0.32521614221230055, "grad_norm": 3.8125, "learning_rate": 9.657289042440567e-06, "loss": 0.88854847, "memory(GiB)": 740.28, "step": 12820, "train_speed(iter/s)": 0.966329 }, { "acc": 0.75712276, "epoch": 0.3253429815813381, "grad_norm": 3.375, "learning_rate": 9.656907388778834e-06, "loss": 0.93066349, "memory(GiB)": 740.28, "step": 12825, "train_speed(iter/s)": 0.962294 }, { "acc": 0.77364273, "epoch": 0.3254698209503757, "grad_norm": 3.0625, "learning_rate": 9.656525530274583e-06, "loss": 0.88946714, "memory(GiB)": 740.28, "step": 12830, "train_speed(iter/s)": 0.957953 }, { "acc": 0.76148634, "epoch": 0.32559666031941326, "grad_norm": 3.5625, "learning_rate": 9.656143466944608e-06, "loss": 0.89125233, "memory(GiB)": 740.28, "step": 12835, "train_speed(iter/s)": 0.953463 }, { "acc": 0.74540153, "epoch": 0.3257234996884508, "grad_norm": 4.53125, "learning_rate": 9.65576119880572e-06, "loss": 0.93839378, "memory(GiB)": 740.28, "step": 12840, "train_speed(iter/s)": 0.948575 }, { "acc": 0.74553919, "epoch": 0.32585033905748834, "grad_norm": 4.4375, "learning_rate": 9.655378725874729e-06, "loss": 0.97563801, "memory(GiB)": 740.28, "step": 12845, "train_speed(iter/s)": 0.944232 }, { "acc": 0.75156503, "epoch": 0.3259771784265259, "grad_norm": 3.25, "learning_rate": 9.654996048168462e-06, "loss": 0.97555399, "memory(GiB)": 740.28, "step": 12850, "train_speed(iter/s)": 0.939848 }, { "acc": 0.75793633, "epoch": 0.32610401779556347, "grad_norm": 4.25, "learning_rate": 9.65461316570375e-06, "loss": 0.9380641, "memory(GiB)": 740.28, "step": 12855, "train_speed(iter/s)": 0.935621 }, { "acc": 0.75971117, "epoch": 0.32623085716460104, "grad_norm": 3.625, "learning_rate": 9.654230078497435e-06, "loss": 0.92038488, "memory(GiB)": 740.28, "step": 12860, "train_speed(iter/s)": 0.931816 }, { "acc": 0.75432544, "epoch": 0.3263576965336386, "grad_norm": 3.234375, "learning_rate": 9.653846786566368e-06, "loss": 0.97478895, "memory(GiB)": 740.28, "step": 12865, "train_speed(iter/s)": 0.927624 }, { "acc": 0.75152769, "epoch": 0.3264845359026762, "grad_norm": 3.65625, "learning_rate": 9.65346328992741e-06, "loss": 0.95365171, "memory(GiB)": 740.28, "step": 12870, "train_speed(iter/s)": 0.923189 }, { "acc": 0.75188208, "epoch": 0.3266113752717137, "grad_norm": 4.15625, "learning_rate": 9.65307958859743e-06, "loss": 0.97389927, "memory(GiB)": 740.28, "step": 12875, "train_speed(iter/s)": 0.919014 }, { "acc": 0.75438395, "epoch": 0.32673821464075126, "grad_norm": 4.65625, "learning_rate": 9.652695682593301e-06, "loss": 0.95647984, "memory(GiB)": 740.28, "step": 12880, "train_speed(iter/s)": 0.914931 }, { "acc": 0.75365095, "epoch": 0.3268650540097888, "grad_norm": 4.1875, "learning_rate": 9.652311571931915e-06, "loss": 0.98616476, "memory(GiB)": 740.28, "step": 12885, "train_speed(iter/s)": 0.91093 }, { "acc": 0.75778513, "epoch": 0.3269918933788264, "grad_norm": 3.015625, "learning_rate": 9.651927256630167e-06, "loss": 0.96510105, "memory(GiB)": 740.28, "step": 12890, "train_speed(iter/s)": 0.907328 }, { "acc": 0.7617631, "epoch": 0.32711873274786396, "grad_norm": 3.8125, "learning_rate": 9.65154273670496e-06, "loss": 0.91339664, "memory(GiB)": 740.28, "step": 12895, "train_speed(iter/s)": 0.903364 }, { "acc": 0.75548997, "epoch": 0.32724557211690153, "grad_norm": 4.34375, "learning_rate": 9.651158012173209e-06, "loss": 0.98928041, "memory(GiB)": 740.28, "step": 12900, "train_speed(iter/s)": 0.899718 }, { "acc": 0.76786637, "epoch": 0.32737241148593904, "grad_norm": 3.03125, "learning_rate": 9.650773083051837e-06, "loss": 0.88376942, "memory(GiB)": 740.28, "step": 12905, "train_speed(iter/s)": 0.895634 }, { "acc": 0.75132518, "epoch": 0.3274992508549766, "grad_norm": 3.125, "learning_rate": 9.650387949357774e-06, "loss": 0.93446894, "memory(GiB)": 740.28, "step": 12910, "train_speed(iter/s)": 0.891731 }, { "acc": 0.76227536, "epoch": 0.3276260902240142, "grad_norm": 4.21875, "learning_rate": 9.650002611107962e-06, "loss": 0.94607763, "memory(GiB)": 740.28, "step": 12915, "train_speed(iter/s)": 0.887981 }, { "acc": 0.75728526, "epoch": 0.32775292959305175, "grad_norm": 3.6875, "learning_rate": 9.649617068319353e-06, "loss": 0.9315443, "memory(GiB)": 740.28, "step": 12920, "train_speed(iter/s)": 0.884344 }, { "acc": 0.75386429, "epoch": 0.3278797689620893, "grad_norm": 3.125, "learning_rate": 9.649231321008901e-06, "loss": 0.94402657, "memory(GiB)": 740.28, "step": 12925, "train_speed(iter/s)": 0.880431 }, { "acc": 0.75442858, "epoch": 0.3280066083311269, "grad_norm": 3.171875, "learning_rate": 9.648845369193579e-06, "loss": 0.95757065, "memory(GiB)": 740.28, "step": 12930, "train_speed(iter/s)": 0.876677 }, { "acc": 0.77291107, "epoch": 0.3281334477001644, "grad_norm": 3.5625, "learning_rate": 9.64845921289036e-06, "loss": 0.9127182, "memory(GiB)": 740.28, "step": 12935, "train_speed(iter/s)": 0.873323 }, { "acc": 0.7664331, "epoch": 0.32826028706920196, "grad_norm": 3.84375, "learning_rate": 9.648072852116233e-06, "loss": 0.92141428, "memory(GiB)": 740.28, "step": 12940, "train_speed(iter/s)": 0.870011 }, { "acc": 0.76130142, "epoch": 0.32838712643823953, "grad_norm": 3.828125, "learning_rate": 9.647686286888188e-06, "loss": 0.91987724, "memory(GiB)": 740.28, "step": 12945, "train_speed(iter/s)": 0.86636 }, { "acc": 0.74401007, "epoch": 0.3285139658072771, "grad_norm": 3.8125, "learning_rate": 9.647299517223234e-06, "loss": 0.94926853, "memory(GiB)": 740.28, "step": 12950, "train_speed(iter/s)": 0.863044 }, { "acc": 0.77390666, "epoch": 0.32864080517631467, "grad_norm": 3.296875, "learning_rate": 9.64691254313838e-06, "loss": 0.89033413, "memory(GiB)": 740.28, "step": 12955, "train_speed(iter/s)": 0.859009 }, { "acc": 0.76349998, "epoch": 0.32876764454535223, "grad_norm": 3.75, "learning_rate": 9.64652536465065e-06, "loss": 0.9126358, "memory(GiB)": 740.28, "step": 12960, "train_speed(iter/s)": 0.855258 }, { "acc": 0.76894002, "epoch": 0.32889448391438975, "grad_norm": 4.5625, "learning_rate": 9.646137981777073e-06, "loss": 0.90682821, "memory(GiB)": 740.28, "step": 12965, "train_speed(iter/s)": 0.851798 }, { "acc": 0.76844668, "epoch": 0.3290213232834273, "grad_norm": 3.390625, "learning_rate": 9.64575039453469e-06, "loss": 0.9351243, "memory(GiB)": 740.28, "step": 12970, "train_speed(iter/s)": 0.848251 }, { "acc": 0.75940065, "epoch": 0.3291481626524649, "grad_norm": 2.921875, "learning_rate": 9.645362602940551e-06, "loss": 0.95268145, "memory(GiB)": 740.28, "step": 12975, "train_speed(iter/s)": 0.844805 }, { "acc": 0.75531445, "epoch": 0.32927500202150245, "grad_norm": 3.15625, "learning_rate": 9.64497460701171e-06, "loss": 0.96753159, "memory(GiB)": 740.28, "step": 12980, "train_speed(iter/s)": 0.841573 }, { "acc": 0.77588258, "epoch": 0.32940184139054, "grad_norm": 3.25, "learning_rate": 9.644586406765237e-06, "loss": 0.86422901, "memory(GiB)": 740.28, "step": 12985, "train_speed(iter/s)": 0.838539 }, { "acc": 0.75043793, "epoch": 0.3295286807595776, "grad_norm": 3.484375, "learning_rate": 9.644198002218206e-06, "loss": 0.93905191, "memory(GiB)": 740.28, "step": 12990, "train_speed(iter/s)": 0.835253 }, { "acc": 0.76864748, "epoch": 0.3296555201286151, "grad_norm": 2.875, "learning_rate": 9.643809393387703e-06, "loss": 0.89466267, "memory(GiB)": 740.28, "step": 12995, "train_speed(iter/s)": 0.831921 }, { "acc": 0.76869254, "epoch": 0.32978235949765267, "grad_norm": 8.0625, "learning_rate": 9.643420580290822e-06, "loss": 0.89402151, "memory(GiB)": 740.28, "step": 13000, "train_speed(iter/s)": 0.828437 }, { "epoch": 0.32978235949765267, "eval_acc": 0.7487772995270456, "eval_loss": 0.9010844230651855, "eval_runtime": 1148.1751, "eval_samples_per_second": 5.548, "eval_steps_per_second": 5.548, "step": 13000 }, { "acc": 0.76991014, "epoch": 0.32990919886669023, "grad_norm": 3.4375, "learning_rate": 9.64303156294466e-06, "loss": 0.91947889, "memory(GiB)": 740.28, "step": 13005, "train_speed(iter/s)": 0.740582 }, { "acc": 0.7711215, "epoch": 0.3300360382357278, "grad_norm": 3.25, "learning_rate": 9.642642341366337e-06, "loss": 0.88599968, "memory(GiB)": 740.28, "step": 13010, "train_speed(iter/s)": 0.738024 }, { "acc": 0.75438843, "epoch": 0.33016287760476537, "grad_norm": 4.28125, "learning_rate": 9.642252915572969e-06, "loss": 0.91788387, "memory(GiB)": 740.28, "step": 13015, "train_speed(iter/s)": 0.735362 }, { "acc": 0.76956253, "epoch": 0.33028971697380294, "grad_norm": 5.15625, "learning_rate": 9.641863285581685e-06, "loss": 0.86536484, "memory(GiB)": 740.28, "step": 13020, "train_speed(iter/s)": 0.732916 }, { "acc": 0.76332922, "epoch": 0.33041655634284045, "grad_norm": 3.109375, "learning_rate": 9.641473451409626e-06, "loss": 0.90412989, "memory(GiB)": 740.28, "step": 13025, "train_speed(iter/s)": 0.730333 }, { "acc": 0.75705843, "epoch": 0.330543395711878, "grad_norm": 3.078125, "learning_rate": 9.64108341307394e-06, "loss": 0.9156971, "memory(GiB)": 740.28, "step": 13030, "train_speed(iter/s)": 0.72767 }, { "acc": 0.75849733, "epoch": 0.3306702350809156, "grad_norm": 3.421875, "learning_rate": 9.64069317059178e-06, "loss": 0.92296762, "memory(GiB)": 740.28, "step": 13035, "train_speed(iter/s)": 0.725181 }, { "acc": 0.76173182, "epoch": 0.33079707444995315, "grad_norm": 3.125, "learning_rate": 9.640302723980314e-06, "loss": 0.90205173, "memory(GiB)": 740.28, "step": 13040, "train_speed(iter/s)": 0.722736 }, { "acc": 0.76353183, "epoch": 0.3309239138189907, "grad_norm": 3.46875, "learning_rate": 9.639912073256717e-06, "loss": 0.91121264, "memory(GiB)": 740.28, "step": 13045, "train_speed(iter/s)": 0.720566 }, { "acc": 0.75438514, "epoch": 0.3310507531880283, "grad_norm": 3.0625, "learning_rate": 9.63952121843817e-06, "loss": 0.94259844, "memory(GiB)": 740.28, "step": 13050, "train_speed(iter/s)": 0.718041 }, { "acc": 0.7714572, "epoch": 0.3311775925570658, "grad_norm": 3.453125, "learning_rate": 9.639130159541869e-06, "loss": 0.88939333, "memory(GiB)": 740.28, "step": 13055, "train_speed(iter/s)": 0.715793 }, { "acc": 0.75887275, "epoch": 0.33130443192610337, "grad_norm": 3.59375, "learning_rate": 9.638738896585012e-06, "loss": 0.91497517, "memory(GiB)": 740.28, "step": 13060, "train_speed(iter/s)": 0.713447 }, { "acc": 0.76740141, "epoch": 0.33143127129514094, "grad_norm": 4.125, "learning_rate": 9.638347429584812e-06, "loss": 0.90036564, "memory(GiB)": 740.28, "step": 13065, "train_speed(iter/s)": 0.710998 }, { "acc": 0.77369175, "epoch": 0.3315581106641785, "grad_norm": 3.65625, "learning_rate": 9.637955758558484e-06, "loss": 0.91142445, "memory(GiB)": 740.28, "step": 13070, "train_speed(iter/s)": 0.70876 }, { "acc": 0.76969666, "epoch": 0.3316849500332161, "grad_norm": 3.75, "learning_rate": 9.637563883523263e-06, "loss": 0.86975069, "memory(GiB)": 740.28, "step": 13075, "train_speed(iter/s)": 0.706224 }, { "acc": 0.76716633, "epoch": 0.33181178940225364, "grad_norm": 3.4375, "learning_rate": 9.637171804496381e-06, "loss": 0.91268816, "memory(GiB)": 740.28, "step": 13080, "train_speed(iter/s)": 0.704039 }, { "acc": 0.7567121, "epoch": 0.33193862877129116, "grad_norm": 3.734375, "learning_rate": 9.636779521495087e-06, "loss": 0.91545372, "memory(GiB)": 740.28, "step": 13085, "train_speed(iter/s)": 0.70162 }, { "acc": 0.77471423, "epoch": 0.3320654681403287, "grad_norm": 3.796875, "learning_rate": 9.636387034536638e-06, "loss": 0.93212967, "memory(GiB)": 740.28, "step": 13090, "train_speed(iter/s)": 0.699543 }, { "acc": 0.76348867, "epoch": 0.3321923075093663, "grad_norm": 3.125, "learning_rate": 9.635994343638293e-06, "loss": 0.89192772, "memory(GiB)": 740.28, "step": 13095, "train_speed(iter/s)": 0.696981 }, { "acc": 0.76292372, "epoch": 0.33231914687840386, "grad_norm": 3.421875, "learning_rate": 9.635601448817328e-06, "loss": 0.91303701, "memory(GiB)": 740.28, "step": 13100, "train_speed(iter/s)": 0.694813 }, { "acc": 0.76123095, "epoch": 0.3324459862474414, "grad_norm": 3.59375, "learning_rate": 9.635208350091025e-06, "loss": 0.95881443, "memory(GiB)": 740.28, "step": 13105, "train_speed(iter/s)": 0.692681 }, { "acc": 0.76916075, "epoch": 0.332572825616479, "grad_norm": 3.9375, "learning_rate": 9.634815047476678e-06, "loss": 0.9075655, "memory(GiB)": 740.28, "step": 13110, "train_speed(iter/s)": 0.690379 }, { "acc": 0.76020608, "epoch": 0.3326996649855165, "grad_norm": 3.28125, "learning_rate": 9.634421540991581e-06, "loss": 0.96642561, "memory(GiB)": 740.28, "step": 13115, "train_speed(iter/s)": 0.688368 }, { "acc": 0.77002482, "epoch": 0.3328265043545541, "grad_norm": 3.53125, "learning_rate": 9.63402783065305e-06, "loss": 0.88981876, "memory(GiB)": 740.28, "step": 13120, "train_speed(iter/s)": 0.686363 }, { "acc": 0.77148709, "epoch": 0.33295334372359164, "grad_norm": 3.1875, "learning_rate": 9.633633916478396e-06, "loss": 0.87809191, "memory(GiB)": 740.28, "step": 13125, "train_speed(iter/s)": 0.684251 }, { "acc": 0.76808391, "epoch": 0.3330801830926292, "grad_norm": 3.46875, "learning_rate": 9.633239798484952e-06, "loss": 0.87384176, "memory(GiB)": 740.28, "step": 13130, "train_speed(iter/s)": 0.682038 }, { "acc": 0.75054016, "epoch": 0.3332070224616668, "grad_norm": 3.546875, "learning_rate": 9.632845476690051e-06, "loss": 0.95611067, "memory(GiB)": 740.28, "step": 13135, "train_speed(iter/s)": 0.679685 }, { "acc": 0.75388165, "epoch": 0.33333386183070435, "grad_norm": 4.5, "learning_rate": 9.63245095111104e-06, "loss": 0.96449051, "memory(GiB)": 740.28, "step": 13140, "train_speed(iter/s)": 0.677859 }, { "acc": 0.76938081, "epoch": 0.33346070119974186, "grad_norm": 3.921875, "learning_rate": 9.632056221765269e-06, "loss": 0.87259226, "memory(GiB)": 740.28, "step": 13145, "train_speed(iter/s)": 0.675891 }, { "acc": 0.75595932, "epoch": 0.33358754056877943, "grad_norm": 3.515625, "learning_rate": 9.631661288670104e-06, "loss": 0.92212887, "memory(GiB)": 740.28, "step": 13150, "train_speed(iter/s)": 0.673837 }, { "acc": 0.76378503, "epoch": 0.333714379937817, "grad_norm": 4.15625, "learning_rate": 9.631266151842917e-06, "loss": 0.89390478, "memory(GiB)": 740.28, "step": 13155, "train_speed(iter/s)": 0.671767 }, { "acc": 0.75747428, "epoch": 0.33384121930685456, "grad_norm": 3.921875, "learning_rate": 9.63087081130109e-06, "loss": 0.89940119, "memory(GiB)": 740.28, "step": 13160, "train_speed(iter/s)": 0.669861 }, { "acc": 0.7366313, "epoch": 0.33396805867589213, "grad_norm": 3.234375, "learning_rate": 9.630475267062008e-06, "loss": 0.98779669, "memory(GiB)": 740.28, "step": 13165, "train_speed(iter/s)": 0.667692 }, { "acc": 0.75567408, "epoch": 0.3340948980449297, "grad_norm": 3.625, "learning_rate": 9.630079519143073e-06, "loss": 0.90728941, "memory(GiB)": 740.28, "step": 13170, "train_speed(iter/s)": 0.665558 }, { "acc": 0.77322016, "epoch": 0.3342217374139672, "grad_norm": 3.390625, "learning_rate": 9.629683567561694e-06, "loss": 0.92109995, "memory(GiB)": 740.28, "step": 13175, "train_speed(iter/s)": 0.663787 }, { "acc": 0.76028819, "epoch": 0.3343485767830048, "grad_norm": 3.46875, "learning_rate": 9.629287412335284e-06, "loss": 0.92883339, "memory(GiB)": 740.28, "step": 13180, "train_speed(iter/s)": 0.661582 }, { "acc": 0.74315143, "epoch": 0.33447541615204235, "grad_norm": 3.734375, "learning_rate": 9.628891053481271e-06, "loss": 0.93550138, "memory(GiB)": 740.28, "step": 13185, "train_speed(iter/s)": 0.659645 }, { "acc": 0.76005011, "epoch": 0.3346022555210799, "grad_norm": 3.796875, "learning_rate": 9.62849449101709e-06, "loss": 0.90404263, "memory(GiB)": 740.28, "step": 13190, "train_speed(iter/s)": 0.657508 }, { "acc": 0.73337369, "epoch": 0.3347290948901175, "grad_norm": 4.15625, "learning_rate": 9.628097724960184e-06, "loss": 1.02126999, "memory(GiB)": 740.28, "step": 13195, "train_speed(iter/s)": 0.655628 }, { "acc": 0.75159979, "epoch": 0.33485593425915505, "grad_norm": 3.1875, "learning_rate": 9.627700755328003e-06, "loss": 0.9022913, "memory(GiB)": 740.28, "step": 13200, "train_speed(iter/s)": 0.653515 }, { "acc": 0.75749888, "epoch": 0.33498277362819256, "grad_norm": 4.0625, "learning_rate": 9.627303582138013e-06, "loss": 0.95779057, "memory(GiB)": 740.28, "step": 13205, "train_speed(iter/s)": 0.651827 }, { "acc": 0.75894613, "epoch": 0.33510961299723013, "grad_norm": 3.328125, "learning_rate": 9.62690620540768e-06, "loss": 0.89146118, "memory(GiB)": 740.28, "step": 13210, "train_speed(iter/s)": 0.649829 }, { "acc": 0.75401602, "epoch": 0.3352364523662677, "grad_norm": 3.453125, "learning_rate": 9.626508625154485e-06, "loss": 0.94435415, "memory(GiB)": 740.28, "step": 13215, "train_speed(iter/s)": 0.6478 }, { "acc": 0.7723309, "epoch": 0.33536329173530527, "grad_norm": 3.640625, "learning_rate": 9.626110841395917e-06, "loss": 0.94428139, "memory(GiB)": 740.28, "step": 13220, "train_speed(iter/s)": 0.645894 }, { "acc": 0.75603995, "epoch": 0.33549013110434284, "grad_norm": 3.703125, "learning_rate": 9.625712854149474e-06, "loss": 0.966043, "memory(GiB)": 740.28, "step": 13225, "train_speed(iter/s)": 0.644145 }, { "acc": 0.74874663, "epoch": 0.3356169704733804, "grad_norm": 3.796875, "learning_rate": 9.62531466343266e-06, "loss": 0.96442099, "memory(GiB)": 740.28, "step": 13230, "train_speed(iter/s)": 0.642422 }, { "acc": 0.75956202, "epoch": 0.3357438098424179, "grad_norm": 3.578125, "learning_rate": 9.624916269262992e-06, "loss": 0.90694399, "memory(GiB)": 740.28, "step": 13235, "train_speed(iter/s)": 0.640489 }, { "acc": 0.76167727, "epoch": 0.3358706492114555, "grad_norm": 3.421875, "learning_rate": 9.624517671657991e-06, "loss": 0.92460928, "memory(GiB)": 740.28, "step": 13240, "train_speed(iter/s)": 0.638639 }, { "acc": 0.7644969, "epoch": 0.33599748858049305, "grad_norm": 3.875, "learning_rate": 9.624118870635194e-06, "loss": 0.85618105, "memory(GiB)": 740.28, "step": 13245, "train_speed(iter/s)": 0.636836 }, { "acc": 0.75790539, "epoch": 0.3361243279495306, "grad_norm": 3.65625, "learning_rate": 9.62371986621214e-06, "loss": 0.88142233, "memory(GiB)": 740.28, "step": 13250, "train_speed(iter/s)": 0.635236 }, { "acc": 0.7642417, "epoch": 0.3362511673185682, "grad_norm": 3.765625, "learning_rate": 9.623320658406384e-06, "loss": 0.91292553, "memory(GiB)": 740.28, "step": 13255, "train_speed(iter/s)": 0.633265 }, { "acc": 0.75016704, "epoch": 0.33637800668760576, "grad_norm": 4.9375, "learning_rate": 9.62292124723548e-06, "loss": 1.02072926, "memory(GiB)": 740.28, "step": 13260, "train_speed(iter/s)": 0.631605 }, { "acc": 0.76063032, "epoch": 0.33650484605664327, "grad_norm": 3.765625, "learning_rate": 9.622521632717002e-06, "loss": 0.96379137, "memory(GiB)": 740.28, "step": 13265, "train_speed(iter/s)": 0.630143 }, { "acc": 0.75195832, "epoch": 0.33663168542568084, "grad_norm": 3.890625, "learning_rate": 9.622121814868524e-06, "loss": 0.95046263, "memory(GiB)": 740.28, "step": 13270, "train_speed(iter/s)": 0.628386 }, { "acc": 0.75229697, "epoch": 0.3367585247947184, "grad_norm": 3.40625, "learning_rate": 9.621721793707635e-06, "loss": 0.90907106, "memory(GiB)": 740.28, "step": 13275, "train_speed(iter/s)": 0.626606 }, { "acc": 0.7655798, "epoch": 0.336885364163756, "grad_norm": 3.609375, "learning_rate": 9.621321569251929e-06, "loss": 0.91015949, "memory(GiB)": 740.28, "step": 13280, "train_speed(iter/s)": 0.624879 }, { "acc": 0.75752258, "epoch": 0.33701220353279354, "grad_norm": 3.5625, "learning_rate": 9.620921141519013e-06, "loss": 0.89859991, "memory(GiB)": 740.28, "step": 13285, "train_speed(iter/s)": 0.623105 }, { "acc": 0.77208486, "epoch": 0.3371390429018311, "grad_norm": 3.203125, "learning_rate": 9.620520510526498e-06, "loss": 0.83696451, "memory(GiB)": 740.28, "step": 13290, "train_speed(iter/s)": 0.621295 }, { "acc": 0.75451126, "epoch": 0.3372658822708686, "grad_norm": 3.40625, "learning_rate": 9.62011967629201e-06, "loss": 0.93535309, "memory(GiB)": 740.28, "step": 13295, "train_speed(iter/s)": 0.619681 }, { "acc": 0.74228559, "epoch": 0.3373927216399062, "grad_norm": 3.984375, "learning_rate": 9.619718638833175e-06, "loss": 1.00077553, "memory(GiB)": 740.28, "step": 13300, "train_speed(iter/s)": 0.618127 }, { "acc": 0.76039915, "epoch": 0.33751956100894376, "grad_norm": 4.09375, "learning_rate": 9.619317398167638e-06, "loss": 0.9167264, "memory(GiB)": 740.28, "step": 13305, "train_speed(iter/s)": 0.616537 }, { "acc": 0.764816, "epoch": 0.3376464003779813, "grad_norm": 4.25, "learning_rate": 9.618915954313046e-06, "loss": 0.88498945, "memory(GiB)": 740.28, "step": 13310, "train_speed(iter/s)": 0.614936 }, { "acc": 0.75197954, "epoch": 0.3377732397470189, "grad_norm": 3.53125, "learning_rate": 9.61851430728706e-06, "loss": 0.93727598, "memory(GiB)": 740.28, "step": 13315, "train_speed(iter/s)": 0.613423 }, { "acc": 0.760743, "epoch": 0.33790007911605646, "grad_norm": 4.25, "learning_rate": 9.618112457107342e-06, "loss": 0.92014637, "memory(GiB)": 740.28, "step": 13320, "train_speed(iter/s)": 0.611848 }, { "acc": 0.76085587, "epoch": 0.338026918485094, "grad_norm": 3.546875, "learning_rate": 9.617710403791572e-06, "loss": 0.91342745, "memory(GiB)": 740.28, "step": 13325, "train_speed(iter/s)": 0.610131 }, { "acc": 0.75400376, "epoch": 0.33815375785413154, "grad_norm": 3.375, "learning_rate": 9.617308147357436e-06, "loss": 0.90872679, "memory(GiB)": 740.28, "step": 13330, "train_speed(iter/s)": 0.60853 }, { "acc": 0.77756677, "epoch": 0.3382805972231691, "grad_norm": 3.5625, "learning_rate": 9.616905687822623e-06, "loss": 0.84129887, "memory(GiB)": 740.28, "step": 13335, "train_speed(iter/s)": 0.606688 }, { "acc": 0.76483784, "epoch": 0.3384074365922067, "grad_norm": 3.53125, "learning_rate": 9.616503025204843e-06, "loss": 0.90113764, "memory(GiB)": 740.28, "step": 13340, "train_speed(iter/s)": 0.604837 }, { "acc": 0.74719367, "epoch": 0.33853427596124425, "grad_norm": 3.53125, "learning_rate": 9.616100159521802e-06, "loss": 0.96300936, "memory(GiB)": 740.28, "step": 13345, "train_speed(iter/s)": 0.603109 }, { "acc": 0.75526032, "epoch": 0.3386611153302818, "grad_norm": 3.453125, "learning_rate": 9.615697090791222e-06, "loss": 0.930091, "memory(GiB)": 740.28, "step": 13350, "train_speed(iter/s)": 0.601407 }, { "acc": 0.75159059, "epoch": 0.3387879546993193, "grad_norm": 4.125, "learning_rate": 9.615293819030836e-06, "loss": 0.90801716, "memory(GiB)": 740.28, "step": 13355, "train_speed(iter/s)": 0.599835 }, { "acc": 0.75479994, "epoch": 0.3389147940683569, "grad_norm": 4.0625, "learning_rate": 9.614890344258379e-06, "loss": 0.97912512, "memory(GiB)": 740.28, "step": 13360, "train_speed(iter/s)": 0.598212 }, { "acc": 0.75033088, "epoch": 0.33904163343739446, "grad_norm": 3.828125, "learning_rate": 9.614486666491597e-06, "loss": 0.99447117, "memory(GiB)": 740.28, "step": 13365, "train_speed(iter/s)": 0.596407 }, { "acc": 0.75411239, "epoch": 0.33916847280643203, "grad_norm": 3.890625, "learning_rate": 9.614082785748253e-06, "loss": 0.92737112, "memory(GiB)": 740.28, "step": 13370, "train_speed(iter/s)": 0.594728 }, { "acc": 0.75972629, "epoch": 0.3392953121754696, "grad_norm": 4.03125, "learning_rate": 9.613678702046108e-06, "loss": 0.94713373, "memory(GiB)": 740.28, "step": 13375, "train_speed(iter/s)": 0.592951 }, { "acc": 0.75866561, "epoch": 0.33942215154450717, "grad_norm": 3.734375, "learning_rate": 9.613274415402935e-06, "loss": 0.95006037, "memory(GiB)": 740.28, "step": 13380, "train_speed(iter/s)": 0.591482 }, { "acc": 0.76597447, "epoch": 0.3395489909135447, "grad_norm": 3.640625, "learning_rate": 9.612869925836521e-06, "loss": 0.89618998, "memory(GiB)": 740.28, "step": 13385, "train_speed(iter/s)": 0.59001 }, { "acc": 0.75874891, "epoch": 0.33967583028258225, "grad_norm": 3.609375, "learning_rate": 9.612465233364656e-06, "loss": 0.88963337, "memory(GiB)": 740.28, "step": 13390, "train_speed(iter/s)": 0.588467 }, { "acc": 0.76634889, "epoch": 0.3398026696516198, "grad_norm": 3.46875, "learning_rate": 9.612060338005143e-06, "loss": 0.89570885, "memory(GiB)": 740.28, "step": 13395, "train_speed(iter/s)": 0.586826 }, { "acc": 0.76473327, "epoch": 0.3399295090206574, "grad_norm": 3.359375, "learning_rate": 9.611655239775788e-06, "loss": 0.91038609, "memory(GiB)": 740.28, "step": 13400, "train_speed(iter/s)": 0.58525 }, { "acc": 0.75249505, "epoch": 0.34005634838969495, "grad_norm": 4.65625, "learning_rate": 9.611249938694414e-06, "loss": 0.98250504, "memory(GiB)": 740.28, "step": 13405, "train_speed(iter/s)": 0.583778 }, { "acc": 0.74830346, "epoch": 0.3401831877587325, "grad_norm": 3.71875, "learning_rate": 9.610844434778847e-06, "loss": 0.93843193, "memory(GiB)": 740.28, "step": 13410, "train_speed(iter/s)": 0.582392 }, { "acc": 0.7599524, "epoch": 0.34031002712777003, "grad_norm": 4.03125, "learning_rate": 9.610438728046925e-06, "loss": 0.9061141, "memory(GiB)": 740.28, "step": 13415, "train_speed(iter/s)": 0.58101 }, { "acc": 0.76046371, "epoch": 0.3404368664968076, "grad_norm": 3.265625, "learning_rate": 9.610032818516492e-06, "loss": 0.93122501, "memory(GiB)": 740.28, "step": 13420, "train_speed(iter/s)": 0.579386 }, { "acc": 0.7594821, "epoch": 0.34056370586584517, "grad_norm": 3.765625, "learning_rate": 9.609626706205403e-06, "loss": 1.00650425, "memory(GiB)": 740.28, "step": 13425, "train_speed(iter/s)": 0.577945 }, { "acc": 0.76021581, "epoch": 0.34069054523488274, "grad_norm": 2.84375, "learning_rate": 9.609220391131527e-06, "loss": 0.9070487, "memory(GiB)": 740.28, "step": 13430, "train_speed(iter/s)": 0.576471 }, { "acc": 0.76142578, "epoch": 0.3408173846039203, "grad_norm": 3.25, "learning_rate": 9.608813873312728e-06, "loss": 0.93599463, "memory(GiB)": 740.28, "step": 13435, "train_speed(iter/s)": 0.574864 }, { "acc": 0.75229378, "epoch": 0.34094422397295787, "grad_norm": 5.125, "learning_rate": 9.608407152766892e-06, "loss": 0.95476236, "memory(GiB)": 740.28, "step": 13440, "train_speed(iter/s)": 0.573529 }, { "acc": 0.76504598, "epoch": 0.3410710633419954, "grad_norm": 3.296875, "learning_rate": 9.608000229511909e-06, "loss": 0.90604696, "memory(GiB)": 740.28, "step": 13445, "train_speed(iter/s)": 0.572044 }, { "acc": 0.74829116, "epoch": 0.34119790271103295, "grad_norm": 3.09375, "learning_rate": 9.607593103565677e-06, "loss": 0.9139677, "memory(GiB)": 740.28, "step": 13450, "train_speed(iter/s)": 0.570536 }, { "acc": 0.7729208, "epoch": 0.3413247420800705, "grad_norm": 3.53125, "learning_rate": 9.607185774946106e-06, "loss": 0.91315575, "memory(GiB)": 740.28, "step": 13455, "train_speed(iter/s)": 0.569036 }, { "acc": 0.76265597, "epoch": 0.3414515814491081, "grad_norm": 3.75, "learning_rate": 9.60677824367111e-06, "loss": 0.97836323, "memory(GiB)": 740.28, "step": 13460, "train_speed(iter/s)": 0.567669 }, { "acc": 0.76952486, "epoch": 0.34157842081814566, "grad_norm": 3.75, "learning_rate": 9.60637050975862e-06, "loss": 0.87398586, "memory(GiB)": 740.28, "step": 13465, "train_speed(iter/s)": 0.56637 }, { "acc": 0.76386423, "epoch": 0.3417052601871832, "grad_norm": 3.53125, "learning_rate": 9.605962573226567e-06, "loss": 0.91989555, "memory(GiB)": 740.28, "step": 13470, "train_speed(iter/s)": 0.565095 }, { "acc": 0.74663754, "epoch": 0.34183209955622074, "grad_norm": 3.28125, "learning_rate": 9.605554434092897e-06, "loss": 0.93763046, "memory(GiB)": 740.28, "step": 13475, "train_speed(iter/s)": 0.563734 }, { "acc": 0.75188074, "epoch": 0.3419589389252583, "grad_norm": 3.5625, "learning_rate": 9.60514609237556e-06, "loss": 0.99881201, "memory(GiB)": 740.28, "step": 13480, "train_speed(iter/s)": 0.562218 }, { "acc": 0.76259894, "epoch": 0.3420857782942959, "grad_norm": 3.609375, "learning_rate": 9.60473754809252e-06, "loss": 0.96864882, "memory(GiB)": 740.28, "step": 13485, "train_speed(iter/s)": 0.560867 }, { "acc": 0.75675931, "epoch": 0.34221261766333344, "grad_norm": 3.5, "learning_rate": 9.604328801261747e-06, "loss": 0.97463236, "memory(GiB)": 740.28, "step": 13490, "train_speed(iter/s)": 0.559418 }, { "acc": 0.76375666, "epoch": 0.342339457032371, "grad_norm": 4.03125, "learning_rate": 9.603919851901218e-06, "loss": 0.93990593, "memory(GiB)": 740.28, "step": 13495, "train_speed(iter/s)": 0.558068 }, { "acc": 0.757828, "epoch": 0.3424662964014086, "grad_norm": 3.421875, "learning_rate": 9.603510700028927e-06, "loss": 0.92031107, "memory(GiB)": 740.28, "step": 13500, "train_speed(iter/s)": 0.556788 }, { "epoch": 0.3424662964014086, "eval_acc": 0.7490835229648779, "eval_loss": 0.8998910188674927, "eval_runtime": 1152.1764, "eval_samples_per_second": 5.529, "eval_steps_per_second": 5.529, "step": 13500 }, { "acc": 0.75371785, "epoch": 0.3425931357704461, "grad_norm": 2.984375, "learning_rate": 9.603101345662866e-06, "loss": 0.94398212, "memory(GiB)": 740.28, "step": 13505, "train_speed(iter/s)": 0.515313 }, { "acc": 0.774966, "epoch": 0.34271997513948366, "grad_norm": 3.1875, "learning_rate": 9.602691788821044e-06, "loss": 0.8863122, "memory(GiB)": 740.28, "step": 13510, "train_speed(iter/s)": 0.514159 }, { "acc": 0.75874596, "epoch": 0.3428468145085212, "grad_norm": 4.0, "learning_rate": 9.602282029521475e-06, "loss": 0.93718348, "memory(GiB)": 740.28, "step": 13515, "train_speed(iter/s)": 0.51316 }, { "acc": 0.76370807, "epoch": 0.3429736538775588, "grad_norm": 3.90625, "learning_rate": 9.601872067782182e-06, "loss": 0.90787315, "memory(GiB)": 740.28, "step": 13520, "train_speed(iter/s)": 0.512003 }, { "acc": 0.75325155, "epoch": 0.34310049324659636, "grad_norm": 3.53125, "learning_rate": 9.601461903621202e-06, "loss": 0.90580893, "memory(GiB)": 740.28, "step": 13525, "train_speed(iter/s)": 0.510906 }, { "acc": 0.75843415, "epoch": 0.34322733261563393, "grad_norm": 3.59375, "learning_rate": 9.601051537056573e-06, "loss": 0.92236862, "memory(GiB)": 740.28, "step": 13530, "train_speed(iter/s)": 0.509841 }, { "acc": 0.751372, "epoch": 0.34335417198467144, "grad_norm": 3.375, "learning_rate": 9.600640968106346e-06, "loss": 1.01798, "memory(GiB)": 740.28, "step": 13535, "train_speed(iter/s)": 0.508891 }, { "acc": 0.75778604, "epoch": 0.343481011353709, "grad_norm": 4.3125, "learning_rate": 9.600230196788583e-06, "loss": 0.90552654, "memory(GiB)": 740.28, "step": 13540, "train_speed(iter/s)": 0.507974 }, { "acc": 0.74716077, "epoch": 0.3436078507227466, "grad_norm": 3.296875, "learning_rate": 9.599819223121348e-06, "loss": 0.96320477, "memory(GiB)": 740.28, "step": 13545, "train_speed(iter/s)": 0.506801 }, { "acc": 0.76704092, "epoch": 0.34373469009178415, "grad_norm": 5.3125, "learning_rate": 9.599408047122723e-06, "loss": 0.90864763, "memory(GiB)": 740.28, "step": 13550, "train_speed(iter/s)": 0.505738 }, { "acc": 0.74713349, "epoch": 0.3438615294608217, "grad_norm": 3.34375, "learning_rate": 9.598996668810793e-06, "loss": 0.94470892, "memory(GiB)": 740.28, "step": 13555, "train_speed(iter/s)": 0.504723 }, { "acc": 0.74262648, "epoch": 0.3439883688298593, "grad_norm": 3.46875, "learning_rate": 9.598585088203652e-06, "loss": 1.03347244, "memory(GiB)": 740.28, "step": 13560, "train_speed(iter/s)": 0.503645 }, { "acc": 0.77308016, "epoch": 0.3441152081988968, "grad_norm": 3.5, "learning_rate": 9.598173305319404e-06, "loss": 0.90175323, "memory(GiB)": 740.28, "step": 13565, "train_speed(iter/s)": 0.502611 }, { "acc": 0.75389171, "epoch": 0.34424204756793436, "grad_norm": 4.1875, "learning_rate": 9.597761320176165e-06, "loss": 0.94192839, "memory(GiB)": 740.28, "step": 13570, "train_speed(iter/s)": 0.501615 }, { "acc": 0.7578187, "epoch": 0.34436888693697193, "grad_norm": 3.328125, "learning_rate": 9.597349132792055e-06, "loss": 0.92321568, "memory(GiB)": 740.28, "step": 13575, "train_speed(iter/s)": 0.500618 }, { "acc": 0.77201791, "epoch": 0.3444957263060095, "grad_norm": 3.546875, "learning_rate": 9.596936743185203e-06, "loss": 0.88092766, "memory(GiB)": 740.28, "step": 13580, "train_speed(iter/s)": 0.499379 }, { "acc": 0.75527287, "epoch": 0.34462256567504707, "grad_norm": 2.734375, "learning_rate": 9.596524151373752e-06, "loss": 0.95864925, "memory(GiB)": 740.28, "step": 13585, "train_speed(iter/s)": 0.498316 }, { "acc": 0.75201793, "epoch": 0.34474940504408463, "grad_norm": 3.171875, "learning_rate": 9.596111357375848e-06, "loss": 0.90677433, "memory(GiB)": 740.28, "step": 13590, "train_speed(iter/s)": 0.497292 }, { "acc": 0.77822437, "epoch": 0.34487624441312215, "grad_norm": 3.28125, "learning_rate": 9.59569836120965e-06, "loss": 0.88263874, "memory(GiB)": 740.28, "step": 13595, "train_speed(iter/s)": 0.496317 }, { "acc": 0.76832743, "epoch": 0.3450030837821597, "grad_norm": 3.734375, "learning_rate": 9.595285162893325e-06, "loss": 0.90748005, "memory(GiB)": 740.28, "step": 13600, "train_speed(iter/s)": 0.495253 }, { "acc": 0.76195345, "epoch": 0.3451299231511973, "grad_norm": 4.21875, "learning_rate": 9.594871762445043e-06, "loss": 0.94759445, "memory(GiB)": 740.28, "step": 13605, "train_speed(iter/s)": 0.494233 }, { "acc": 0.75439343, "epoch": 0.34525676252023485, "grad_norm": 3.5625, "learning_rate": 9.594458159882996e-06, "loss": 0.90935841, "memory(GiB)": 740.28, "step": 13610, "train_speed(iter/s)": 0.49313 }, { "acc": 0.74553223, "epoch": 0.3453836018892724, "grad_norm": 4.09375, "learning_rate": 9.594044355225373e-06, "loss": 0.91462746, "memory(GiB)": 740.28, "step": 13615, "train_speed(iter/s)": 0.492213 }, { "acc": 0.76330924, "epoch": 0.34551044125831, "grad_norm": 3.859375, "learning_rate": 9.593630348490374e-06, "loss": 0.88364239, "memory(GiB)": 740.28, "step": 13620, "train_speed(iter/s)": 0.491125 }, { "acc": 0.77366414, "epoch": 0.3456372806273475, "grad_norm": 3.359375, "learning_rate": 9.593216139696215e-06, "loss": 0.84406338, "memory(GiB)": 740.28, "step": 13625, "train_speed(iter/s)": 0.490187 }, { "acc": 0.76806836, "epoch": 0.34576411999638507, "grad_norm": 2.828125, "learning_rate": 9.592801728861113e-06, "loss": 0.89270897, "memory(GiB)": 740.28, "step": 13630, "train_speed(iter/s)": 0.489288 }, { "acc": 0.76496181, "epoch": 0.34589095936542263, "grad_norm": 3.1875, "learning_rate": 9.592387116003294e-06, "loss": 0.88901234, "memory(GiB)": 740.28, "step": 13635, "train_speed(iter/s)": 0.488162 }, { "acc": 0.75421305, "epoch": 0.3460177987344602, "grad_norm": 3.828125, "learning_rate": 9.591972301141e-06, "loss": 0.89572973, "memory(GiB)": 740.28, "step": 13640, "train_speed(iter/s)": 0.487267 }, { "acc": 0.76970201, "epoch": 0.34614463810349777, "grad_norm": 3.421875, "learning_rate": 9.591557284292474e-06, "loss": 0.93286877, "memory(GiB)": 740.28, "step": 13645, "train_speed(iter/s)": 0.486275 }, { "acc": 0.77122569, "epoch": 0.34627147747253534, "grad_norm": 4.25, "learning_rate": 9.591142065475974e-06, "loss": 0.92897358, "memory(GiB)": 740.28, "step": 13650, "train_speed(iter/s)": 0.485312 }, { "acc": 0.75971532, "epoch": 0.34639831684157285, "grad_norm": 3.15625, "learning_rate": 9.590726644709762e-06, "loss": 0.95877581, "memory(GiB)": 740.28, "step": 13655, "train_speed(iter/s)": 0.484409 }, { "acc": 0.76132517, "epoch": 0.3465251562106104, "grad_norm": 3.65625, "learning_rate": 9.590311022012113e-06, "loss": 0.91397943, "memory(GiB)": 740.28, "step": 13660, "train_speed(iter/s)": 0.483437 }, { "acc": 0.76760044, "epoch": 0.346651995579648, "grad_norm": 3.625, "learning_rate": 9.589895197401307e-06, "loss": 0.92350826, "memory(GiB)": 740.28, "step": 13665, "train_speed(iter/s)": 0.482534 }, { "acc": 0.7608696, "epoch": 0.34677883494868555, "grad_norm": 3.25, "learning_rate": 9.589479170895636e-06, "loss": 1.01187115, "memory(GiB)": 740.28, "step": 13670, "train_speed(iter/s)": 0.481558 }, { "acc": 0.74378977, "epoch": 0.3469056743177231, "grad_norm": 3.34375, "learning_rate": 9.589062942513397e-06, "loss": 1.01557932, "memory(GiB)": 740.28, "step": 13675, "train_speed(iter/s)": 0.480536 }, { "acc": 0.76621523, "epoch": 0.3470325136867607, "grad_norm": 4.125, "learning_rate": 9.588646512272903e-06, "loss": 0.93125296, "memory(GiB)": 740.28, "step": 13680, "train_speed(iter/s)": 0.479637 }, { "acc": 0.7542563, "epoch": 0.3471593530557982, "grad_norm": 3.484375, "learning_rate": 9.588229880192468e-06, "loss": 0.98433065, "memory(GiB)": 740.28, "step": 13685, "train_speed(iter/s)": 0.47871 }, { "acc": 0.75202246, "epoch": 0.34728619242483577, "grad_norm": 3.375, "learning_rate": 9.58781304629042e-06, "loss": 0.97634811, "memory(GiB)": 740.28, "step": 13690, "train_speed(iter/s)": 0.477685 }, { "acc": 0.75723472, "epoch": 0.34741303179387334, "grad_norm": 4.125, "learning_rate": 9.587396010585094e-06, "loss": 0.94847364, "memory(GiB)": 740.28, "step": 13695, "train_speed(iter/s)": 0.476777 }, { "acc": 0.75583887, "epoch": 0.3475398711629109, "grad_norm": 3.953125, "learning_rate": 9.586978773094834e-06, "loss": 0.93055944, "memory(GiB)": 740.28, "step": 13700, "train_speed(iter/s)": 0.475733 }, { "acc": 0.76232386, "epoch": 0.3476667105319485, "grad_norm": 3.828125, "learning_rate": 9.586561333837994e-06, "loss": 0.94215651, "memory(GiB)": 740.28, "step": 13705, "train_speed(iter/s)": 0.474795 }, { "acc": 0.76069455, "epoch": 0.34779354990098604, "grad_norm": 3.515625, "learning_rate": 9.586143692832932e-06, "loss": 0.90891762, "memory(GiB)": 740.28, "step": 13710, "train_speed(iter/s)": 0.473908 }, { "acc": 0.75817561, "epoch": 0.34792038927002356, "grad_norm": 5.25, "learning_rate": 9.585725850098023e-06, "loss": 0.97613668, "memory(GiB)": 740.28, "step": 13715, "train_speed(iter/s)": 0.472972 }, { "acc": 0.76204772, "epoch": 0.3480472286390611, "grad_norm": 3.8125, "learning_rate": 9.585307805651644e-06, "loss": 0.91564837, "memory(GiB)": 740.28, "step": 13720, "train_speed(iter/s)": 0.472075 }, { "acc": 0.75432816, "epoch": 0.3481740680080987, "grad_norm": 3.890625, "learning_rate": 9.584889559512184e-06, "loss": 0.96085033, "memory(GiB)": 740.28, "step": 13725, "train_speed(iter/s)": 0.4712 }, { "acc": 0.76136265, "epoch": 0.34830090737713626, "grad_norm": 3.671875, "learning_rate": 9.584471111698042e-06, "loss": 0.93745384, "memory(GiB)": 740.28, "step": 13730, "train_speed(iter/s)": 0.470234 }, { "acc": 0.76555138, "epoch": 0.3484277467461738, "grad_norm": 3.625, "learning_rate": 9.584052462227621e-06, "loss": 0.85508299, "memory(GiB)": 740.28, "step": 13735, "train_speed(iter/s)": 0.469239 }, { "acc": 0.75677428, "epoch": 0.3485545861152114, "grad_norm": 3.3125, "learning_rate": 9.583633611119341e-06, "loss": 0.93172388, "memory(GiB)": 740.28, "step": 13740, "train_speed(iter/s)": 0.468291 }, { "acc": 0.75997548, "epoch": 0.3486814254842489, "grad_norm": 2.859375, "learning_rate": 9.58321455839162e-06, "loss": 0.93250208, "memory(GiB)": 740.28, "step": 13745, "train_speed(iter/s)": 0.467324 }, { "acc": 0.76019044, "epoch": 0.3488082648532865, "grad_norm": 3.421875, "learning_rate": 9.582795304062895e-06, "loss": 0.95974016, "memory(GiB)": 740.28, "step": 13750, "train_speed(iter/s)": 0.46641 }, { "acc": 0.76656952, "epoch": 0.34893510422232404, "grad_norm": 3.265625, "learning_rate": 9.582375848151607e-06, "loss": 0.8963356, "memory(GiB)": 740.28, "step": 13755, "train_speed(iter/s)": 0.465464 }, { "acc": 0.75952706, "epoch": 0.3490619435913616, "grad_norm": 3.4375, "learning_rate": 9.581956190676202e-06, "loss": 0.92879257, "memory(GiB)": 740.28, "step": 13760, "train_speed(iter/s)": 0.464553 }, { "acc": 0.75466514, "epoch": 0.3491887829603992, "grad_norm": 2.984375, "learning_rate": 9.581536331655146e-06, "loss": 0.92600231, "memory(GiB)": 740.28, "step": 13765, "train_speed(iter/s)": 0.463664 }, { "acc": 0.75969043, "epoch": 0.34931562232943675, "grad_norm": 3.15625, "learning_rate": 9.581116271106906e-06, "loss": 0.94996014, "memory(GiB)": 740.28, "step": 13770, "train_speed(iter/s)": 0.462817 }, { "acc": 0.76971512, "epoch": 0.34944246169847426, "grad_norm": 3.515625, "learning_rate": 9.580696009049954e-06, "loss": 0.951476, "memory(GiB)": 740.28, "step": 13775, "train_speed(iter/s)": 0.461923 }, { "acc": 0.7791976, "epoch": 0.34956930106751183, "grad_norm": 3.859375, "learning_rate": 9.580275545502781e-06, "loss": 0.8706439, "memory(GiB)": 740.28, "step": 13780, "train_speed(iter/s)": 0.460958 }, { "acc": 0.76478453, "epoch": 0.3496961404365494, "grad_norm": 3.125, "learning_rate": 9.579854880483881e-06, "loss": 0.9548192, "memory(GiB)": 740.28, "step": 13785, "train_speed(iter/s)": 0.460114 }, { "acc": 0.76413951, "epoch": 0.34982297980558696, "grad_norm": 3.796875, "learning_rate": 9.579434014011758e-06, "loss": 0.92815981, "memory(GiB)": 740.28, "step": 13790, "train_speed(iter/s)": 0.459321 }, { "acc": 0.75438137, "epoch": 0.34994981917462453, "grad_norm": 3.609375, "learning_rate": 9.579012946104921e-06, "loss": 0.94075584, "memory(GiB)": 740.28, "step": 13795, "train_speed(iter/s)": 0.458455 }, { "acc": 0.76243801, "epoch": 0.3500766585436621, "grad_norm": 4.65625, "learning_rate": 9.578591676781894e-06, "loss": 0.90785408, "memory(GiB)": 740.28, "step": 13800, "train_speed(iter/s)": 0.457593 }, { "acc": 0.76860008, "epoch": 0.3502034979126996, "grad_norm": 3.96875, "learning_rate": 9.57817020606121e-06, "loss": 0.85051107, "memory(GiB)": 740.28, "step": 13805, "train_speed(iter/s)": 0.456708 }, { "acc": 0.74665451, "epoch": 0.3503303372817372, "grad_norm": 3.1875, "learning_rate": 9.577748533961404e-06, "loss": 1.00974503, "memory(GiB)": 740.28, "step": 13810, "train_speed(iter/s)": 0.455783 }, { "acc": 0.76493888, "epoch": 0.35045717665077475, "grad_norm": 3.953125, "learning_rate": 9.577326660501027e-06, "loss": 0.93181458, "memory(GiB)": 740.28, "step": 13815, "train_speed(iter/s)": 0.454927 }, { "acc": 0.76966848, "epoch": 0.3505840160198123, "grad_norm": 3.421875, "learning_rate": 9.576904585698633e-06, "loss": 0.84465637, "memory(GiB)": 740.28, "step": 13820, "train_speed(iter/s)": 0.454114 }, { "acc": 0.76855907, "epoch": 0.3507108553888499, "grad_norm": 3.5, "learning_rate": 9.57648230957279e-06, "loss": 0.91694403, "memory(GiB)": 740.28, "step": 13825, "train_speed(iter/s)": 0.453344 }, { "acc": 0.77082043, "epoch": 0.35083769475788745, "grad_norm": 3.390625, "learning_rate": 9.57605983214207e-06, "loss": 0.87917471, "memory(GiB)": 740.28, "step": 13830, "train_speed(iter/s)": 0.452495 }, { "acc": 0.75944409, "epoch": 0.35096453412692497, "grad_norm": 3.71875, "learning_rate": 9.575637153425061e-06, "loss": 0.89132471, "memory(GiB)": 740.28, "step": 13835, "train_speed(iter/s)": 0.45169 }, { "acc": 0.7586544, "epoch": 0.35109137349596253, "grad_norm": 3.46875, "learning_rate": 9.575214273440353e-06, "loss": 0.92179623, "memory(GiB)": 740.28, "step": 13840, "train_speed(iter/s)": 0.450894 }, { "acc": 0.76095548, "epoch": 0.3512182128650001, "grad_norm": 3.96875, "learning_rate": 9.574791192206544e-06, "loss": 0.9512537, "memory(GiB)": 740.28, "step": 13845, "train_speed(iter/s)": 0.450083 }, { "acc": 0.76558537, "epoch": 0.35134505223403767, "grad_norm": 3.453125, "learning_rate": 9.574367909742246e-06, "loss": 0.92824621, "memory(GiB)": 740.28, "step": 13850, "train_speed(iter/s)": 0.449259 }, { "acc": 0.76042356, "epoch": 0.35147189160307524, "grad_norm": 3.3125, "learning_rate": 9.57394442606608e-06, "loss": 0.9411211, "memory(GiB)": 740.28, "step": 13855, "train_speed(iter/s)": 0.448333 }, { "acc": 0.75272183, "epoch": 0.3515987309721128, "grad_norm": 3.703125, "learning_rate": 9.573520741196672e-06, "loss": 0.92918406, "memory(GiB)": 740.28, "step": 13860, "train_speed(iter/s)": 0.447508 }, { "acc": 0.7636539, "epoch": 0.3517255703411503, "grad_norm": 3.234375, "learning_rate": 9.57309685515266e-06, "loss": 0.90775537, "memory(GiB)": 740.28, "step": 13865, "train_speed(iter/s)": 0.446663 }, { "acc": 0.76586685, "epoch": 0.3518524097101879, "grad_norm": 3.15625, "learning_rate": 9.572672767952687e-06, "loss": 0.91503849, "memory(GiB)": 740.28, "step": 13870, "train_speed(iter/s)": 0.445851 }, { "acc": 0.75930362, "epoch": 0.35197924907922545, "grad_norm": 3.0625, "learning_rate": 9.572248479615407e-06, "loss": 0.91867189, "memory(GiB)": 740.28, "step": 13875, "train_speed(iter/s)": 0.445086 }, { "acc": 0.7615767, "epoch": 0.352106088448263, "grad_norm": 3.171875, "learning_rate": 9.571823990159486e-06, "loss": 0.93352051, "memory(GiB)": 740.28, "step": 13880, "train_speed(iter/s)": 0.444307 }, { "acc": 0.76961703, "epoch": 0.3522329278173006, "grad_norm": 3.109375, "learning_rate": 9.571399299603594e-06, "loss": 0.93787537, "memory(GiB)": 740.28, "step": 13885, "train_speed(iter/s)": 0.443566 }, { "acc": 0.7582551, "epoch": 0.35235976718633816, "grad_norm": 3.65625, "learning_rate": 9.570974407966412e-06, "loss": 0.92513008, "memory(GiB)": 740.28, "step": 13890, "train_speed(iter/s)": 0.44273 }, { "acc": 0.75999022, "epoch": 0.35248660655537567, "grad_norm": 4.25, "learning_rate": 9.57054931526663e-06, "loss": 0.93301477, "memory(GiB)": 740.28, "step": 13895, "train_speed(iter/s)": 0.441823 }, { "acc": 0.76864109, "epoch": 0.35261344592441324, "grad_norm": 3.40625, "learning_rate": 9.570124021522947e-06, "loss": 0.92161255, "memory(GiB)": 740.28, "step": 13900, "train_speed(iter/s)": 0.441032 }, { "acc": 0.76233354, "epoch": 0.3527402852934508, "grad_norm": 3.890625, "learning_rate": 9.56969852675407e-06, "loss": 0.94049854, "memory(GiB)": 740.28, "step": 13905, "train_speed(iter/s)": 0.440209 }, { "acc": 0.75175257, "epoch": 0.3528671246624884, "grad_norm": 3.1875, "learning_rate": 9.569272830978713e-06, "loss": 0.97348385, "memory(GiB)": 740.28, "step": 13910, "train_speed(iter/s)": 0.439505 }, { "acc": 0.77415161, "epoch": 0.35299396403152594, "grad_norm": 3.8125, "learning_rate": 9.568846934215604e-06, "loss": 0.89404888, "memory(GiB)": 740.28, "step": 13915, "train_speed(iter/s)": 0.438773 }, { "acc": 0.75096378, "epoch": 0.3531208034005635, "grad_norm": 3.359375, "learning_rate": 9.568420836483475e-06, "loss": 1.00663242, "memory(GiB)": 740.28, "step": 13920, "train_speed(iter/s)": 0.438041 }, { "acc": 0.75851445, "epoch": 0.353247642769601, "grad_norm": 3.625, "learning_rate": 9.567994537801068e-06, "loss": 0.94441528, "memory(GiB)": 740.28, "step": 13925, "train_speed(iter/s)": 0.43719 }, { "acc": 0.76126289, "epoch": 0.3533744821386386, "grad_norm": 4.0, "learning_rate": 9.567568038187138e-06, "loss": 0.88945885, "memory(GiB)": 740.28, "step": 13930, "train_speed(iter/s)": 0.436374 }, { "acc": 0.76026096, "epoch": 0.35350132150767616, "grad_norm": 3.390625, "learning_rate": 9.567141337660442e-06, "loss": 0.95572271, "memory(GiB)": 740.28, "step": 13935, "train_speed(iter/s)": 0.435591 }, { "acc": 0.76490955, "epoch": 0.3536281608767137, "grad_norm": 3.234375, "learning_rate": 9.56671443623975e-06, "loss": 0.92859621, "memory(GiB)": 740.28, "step": 13940, "train_speed(iter/s)": 0.434886 }, { "acc": 0.75454063, "epoch": 0.3537550002457513, "grad_norm": 5.34375, "learning_rate": 9.566287333943842e-06, "loss": 0.95686216, "memory(GiB)": 740.28, "step": 13945, "train_speed(iter/s)": 0.434193 }, { "acc": 0.74852719, "epoch": 0.35388183961478886, "grad_norm": 2.765625, "learning_rate": 9.565860030791503e-06, "loss": 0.88073559, "memory(GiB)": 740.28, "step": 13950, "train_speed(iter/s)": 0.433405 }, { "acc": 0.76976862, "epoch": 0.3540086789838264, "grad_norm": 3.640625, "learning_rate": 9.565432526801527e-06, "loss": 0.89027081, "memory(GiB)": 740.28, "step": 13955, "train_speed(iter/s)": 0.432617 }, { "acc": 0.75902233, "epoch": 0.35413551835286394, "grad_norm": 3.171875, "learning_rate": 9.565004821992722e-06, "loss": 0.94119501, "memory(GiB)": 740.28, "step": 13960, "train_speed(iter/s)": 0.431885 }, { "acc": 0.77267323, "epoch": 0.3542623577219015, "grad_norm": 3.546875, "learning_rate": 9.5645769163839e-06, "loss": 0.84819756, "memory(GiB)": 740.28, "step": 13965, "train_speed(iter/s)": 0.431203 }, { "acc": 0.75078759, "epoch": 0.3543891970909391, "grad_norm": 3.34375, "learning_rate": 9.564148809993882e-06, "loss": 0.94597464, "memory(GiB)": 740.28, "step": 13970, "train_speed(iter/s)": 0.43044 }, { "acc": 0.75528526, "epoch": 0.35451603645997665, "grad_norm": 3.625, "learning_rate": 9.563720502841501e-06, "loss": 0.94929409, "memory(GiB)": 740.28, "step": 13975, "train_speed(iter/s)": 0.429622 }, { "acc": 0.74982772, "epoch": 0.3546428758290142, "grad_norm": 3.28125, "learning_rate": 9.563291994945595e-06, "loss": 0.95349674, "memory(GiB)": 740.28, "step": 13980, "train_speed(iter/s)": 0.428785 }, { "acc": 0.75999355, "epoch": 0.3547697151980517, "grad_norm": 3.5, "learning_rate": 9.562863286325015e-06, "loss": 0.91819048, "memory(GiB)": 740.28, "step": 13985, "train_speed(iter/s)": 0.428081 }, { "acc": 0.7547008, "epoch": 0.3548965545670893, "grad_norm": 3.6875, "learning_rate": 9.562434376998617e-06, "loss": 0.94649038, "memory(GiB)": 740.28, "step": 13990, "train_speed(iter/s)": 0.427385 }, { "acc": 0.75853529, "epoch": 0.35502339393612686, "grad_norm": 3.28125, "learning_rate": 9.562005266985267e-06, "loss": 0.91649437, "memory(GiB)": 740.28, "step": 13995, "train_speed(iter/s)": 0.426623 }, { "acc": 0.75559292, "epoch": 0.35515023330516443, "grad_norm": 3.78125, "learning_rate": 9.561575956303841e-06, "loss": 0.96865053, "memory(GiB)": 740.28, "step": 14000, "train_speed(iter/s)": 0.425958 }, { "epoch": 0.35515023330516443, "eval_acc": 0.7493788844526642, "eval_loss": 0.8985739350318909, "eval_runtime": 1151.0662, "eval_samples_per_second": 5.534, "eval_steps_per_second": 5.534, "step": 14000 }, { "acc": 0.75586915, "epoch": 0.355277072674202, "grad_norm": 3.71875, "learning_rate": 9.561146444973224e-06, "loss": 0.93441038, "memory(GiB)": 740.28, "step": 14005, "train_speed(iter/s)": 0.402192 }, { "acc": 0.76173873, "epoch": 0.35540391204323957, "grad_norm": 4.0625, "learning_rate": 9.560716733012306e-06, "loss": 0.9749074, "memory(GiB)": 740.28, "step": 14010, "train_speed(iter/s)": 0.401565 }, { "acc": 0.7537077, "epoch": 0.3555307514122771, "grad_norm": 3.625, "learning_rate": 9.56028682043999e-06, "loss": 0.95165586, "memory(GiB)": 740.28, "step": 14015, "train_speed(iter/s)": 0.400928 }, { "acc": 0.74676213, "epoch": 0.35565759078131465, "grad_norm": 3.234375, "learning_rate": 9.559856707275189e-06, "loss": 0.94882479, "memory(GiB)": 740.28, "step": 14020, "train_speed(iter/s)": 0.400283 }, { "acc": 0.75329638, "epoch": 0.3557844301503522, "grad_norm": 3.984375, "learning_rate": 9.559426393536816e-06, "loss": 0.99512005, "memory(GiB)": 740.28, "step": 14025, "train_speed(iter/s)": 0.399696 }, { "acc": 0.76298165, "epoch": 0.3559112695193898, "grad_norm": 3.84375, "learning_rate": 9.558995879243807e-06, "loss": 0.94250851, "memory(GiB)": 740.28, "step": 14030, "train_speed(iter/s)": 0.399166 }, { "acc": 0.75303249, "epoch": 0.35603810888842735, "grad_norm": 4.21875, "learning_rate": 9.558565164415092e-06, "loss": 1.00744991, "memory(GiB)": 740.28, "step": 14035, "train_speed(iter/s)": 0.398557 }, { "acc": 0.75491772, "epoch": 0.3561649482574649, "grad_norm": 4.28125, "learning_rate": 9.558134249069622e-06, "loss": 0.89736433, "memory(GiB)": 740.28, "step": 14040, "train_speed(iter/s)": 0.397878 }, { "acc": 0.75399947, "epoch": 0.35629178762650243, "grad_norm": 3.46875, "learning_rate": 9.557703133226351e-06, "loss": 0.93515682, "memory(GiB)": 740.28, "step": 14045, "train_speed(iter/s)": 0.397284 }, { "acc": 0.76116781, "epoch": 0.35641862699554, "grad_norm": 3.765625, "learning_rate": 9.557271816904238e-06, "loss": 0.93940258, "memory(GiB)": 740.28, "step": 14050, "train_speed(iter/s)": 0.396691 }, { "acc": 0.75422444, "epoch": 0.35654546636457757, "grad_norm": 3.625, "learning_rate": 9.556840300122258e-06, "loss": 0.95365496, "memory(GiB)": 740.28, "step": 14055, "train_speed(iter/s)": 0.39601 }, { "acc": 0.76651001, "epoch": 0.35667230573361514, "grad_norm": 3.6875, "learning_rate": 9.556408582899395e-06, "loss": 0.92416191, "memory(GiB)": 740.28, "step": 14060, "train_speed(iter/s)": 0.395389 }, { "acc": 0.75042152, "epoch": 0.3567991451026527, "grad_norm": 3.984375, "learning_rate": 9.555976665254634e-06, "loss": 0.93304996, "memory(GiB)": 740.28, "step": 14065, "train_speed(iter/s)": 0.394745 }, { "acc": 0.7682456, "epoch": 0.35692598447169027, "grad_norm": 3.953125, "learning_rate": 9.555544547206976e-06, "loss": 0.89100313, "memory(GiB)": 740.28, "step": 14070, "train_speed(iter/s)": 0.394151 }, { "acc": 0.75017896, "epoch": 0.3570528238407278, "grad_norm": 3.859375, "learning_rate": 9.555112228775427e-06, "loss": 0.98597164, "memory(GiB)": 740.28, "step": 14075, "train_speed(iter/s)": 0.393512 }, { "acc": 0.74739103, "epoch": 0.35717966320976535, "grad_norm": 4.03125, "learning_rate": 9.554679709979006e-06, "loss": 1.02070379, "memory(GiB)": 740.28, "step": 14080, "train_speed(iter/s)": 0.392854 }, { "acc": 0.77556701, "epoch": 0.3573065025788029, "grad_norm": 2.625, "learning_rate": 9.554246990836736e-06, "loss": 0.83487864, "memory(GiB)": 740.28, "step": 14085, "train_speed(iter/s)": 0.392199 }, { "acc": 0.75909281, "epoch": 0.3574333419478405, "grad_norm": 3.765625, "learning_rate": 9.553814071367654e-06, "loss": 0.93193636, "memory(GiB)": 740.28, "step": 14090, "train_speed(iter/s)": 0.391625 }, { "acc": 0.75233569, "epoch": 0.35756018131687806, "grad_norm": 3.453125, "learning_rate": 9.553380951590795e-06, "loss": 0.96281052, "memory(GiB)": 740.28, "step": 14095, "train_speed(iter/s)": 0.39098 }, { "acc": 0.76490746, "epoch": 0.3576870206859156, "grad_norm": 4.5, "learning_rate": 9.55294763152522e-06, "loss": 0.89493723, "memory(GiB)": 740.28, "step": 14100, "train_speed(iter/s)": 0.390348 }, { "acc": 0.76392226, "epoch": 0.35781386005495314, "grad_norm": 3.578125, "learning_rate": 9.552514111189985e-06, "loss": 0.93444386, "memory(GiB)": 740.28, "step": 14105, "train_speed(iter/s)": 0.389774 }, { "acc": 0.77519021, "epoch": 0.3579406994239907, "grad_norm": 3.71875, "learning_rate": 9.55208039060416e-06, "loss": 0.9176321, "memory(GiB)": 740.28, "step": 14110, "train_speed(iter/s)": 0.389251 }, { "acc": 0.77185826, "epoch": 0.3580675387930283, "grad_norm": 4.375, "learning_rate": 9.55164646978682e-06, "loss": 0.92874556, "memory(GiB)": 740.28, "step": 14115, "train_speed(iter/s)": 0.388724 }, { "acc": 0.74704618, "epoch": 0.35819437816206584, "grad_norm": 3.4375, "learning_rate": 9.551212348757056e-06, "loss": 0.91493149, "memory(GiB)": 740.28, "step": 14120, "train_speed(iter/s)": 0.388138 }, { "acc": 0.74670196, "epoch": 0.3583212175311034, "grad_norm": 3.859375, "learning_rate": 9.550778027533963e-06, "loss": 1.00150414, "memory(GiB)": 740.28, "step": 14125, "train_speed(iter/s)": 0.387468 }, { "acc": 0.76870675, "epoch": 0.358448056900141, "grad_norm": 3.515625, "learning_rate": 9.550343506136642e-06, "loss": 0.91314859, "memory(GiB)": 740.28, "step": 14130, "train_speed(iter/s)": 0.386912 }, { "acc": 0.7569634, "epoch": 0.3585748962691785, "grad_norm": 3.21875, "learning_rate": 9.549908784584208e-06, "loss": 0.91499405, "memory(GiB)": 740.28, "step": 14135, "train_speed(iter/s)": 0.386343 }, { "acc": 0.77299881, "epoch": 0.35870173563821606, "grad_norm": 7.40625, "learning_rate": 9.549473862895786e-06, "loss": 0.8672823, "memory(GiB)": 740.28, "step": 14140, "train_speed(iter/s)": 0.385853 }, { "acc": 0.75911884, "epoch": 0.3588285750072536, "grad_norm": 4.34375, "learning_rate": 9.549038741090501e-06, "loss": 0.92522068, "memory(GiB)": 740.28, "step": 14145, "train_speed(iter/s)": 0.38523 }, { "acc": 0.7589962, "epoch": 0.3589554143762912, "grad_norm": 3.40625, "learning_rate": 9.548603419187499e-06, "loss": 0.91243496, "memory(GiB)": 740.28, "step": 14150, "train_speed(iter/s)": 0.384645 }, { "acc": 0.76238284, "epoch": 0.35908225374532876, "grad_norm": 3.140625, "learning_rate": 9.548167897205923e-06, "loss": 0.93299141, "memory(GiB)": 740.28, "step": 14155, "train_speed(iter/s)": 0.384125 }, { "acc": 0.76634054, "epoch": 0.35920909311436633, "grad_norm": 3.21875, "learning_rate": 9.547732175164934e-06, "loss": 0.90474329, "memory(GiB)": 740.28, "step": 14160, "train_speed(iter/s)": 0.383531 }, { "acc": 0.765975, "epoch": 0.35933593248340384, "grad_norm": 3.15625, "learning_rate": 9.547296253083695e-06, "loss": 0.89602737, "memory(GiB)": 740.28, "step": 14165, "train_speed(iter/s)": 0.382975 }, { "acc": 0.76154065, "epoch": 0.3594627718524414, "grad_norm": 3.6875, "learning_rate": 9.546860130981384e-06, "loss": 0.95668249, "memory(GiB)": 740.28, "step": 14170, "train_speed(iter/s)": 0.382508 }, { "acc": 0.74384165, "epoch": 0.359589611221479, "grad_norm": 3.375, "learning_rate": 9.546423808877183e-06, "loss": 0.97416019, "memory(GiB)": 740.28, "step": 14175, "train_speed(iter/s)": 0.381928 }, { "acc": 0.76790686, "epoch": 0.35971645059051655, "grad_norm": 3.421875, "learning_rate": 9.545987286790283e-06, "loss": 0.90236034, "memory(GiB)": 740.28, "step": 14180, "train_speed(iter/s)": 0.381392 }, { "acc": 0.75725832, "epoch": 0.3598432899595541, "grad_norm": 3.609375, "learning_rate": 9.545550564739889e-06, "loss": 0.96356773, "memory(GiB)": 740.28, "step": 14185, "train_speed(iter/s)": 0.380809 }, { "acc": 0.7619328, "epoch": 0.3599701293285917, "grad_norm": 4.21875, "learning_rate": 9.545113642745207e-06, "loss": 0.88158035, "memory(GiB)": 740.28, "step": 14190, "train_speed(iter/s)": 0.380278 }, { "acc": 0.75776453, "epoch": 0.3600969686976292, "grad_norm": 3.671875, "learning_rate": 9.544676520825458e-06, "loss": 0.92368431, "memory(GiB)": 740.28, "step": 14195, "train_speed(iter/s)": 0.379812 }, { "acc": 0.77513299, "epoch": 0.36022380806666676, "grad_norm": 3.125, "learning_rate": 9.54423919899987e-06, "loss": 0.89985237, "memory(GiB)": 740.28, "step": 14200, "train_speed(iter/s)": 0.379273 }, { "acc": 0.75384393, "epoch": 0.36035064743570433, "grad_norm": 3.6875, "learning_rate": 9.543801677287676e-06, "loss": 0.96202135, "memory(GiB)": 740.28, "step": 14205, "train_speed(iter/s)": 0.378733 }, { "acc": 0.76073976, "epoch": 0.3604774868047419, "grad_norm": 3.328125, "learning_rate": 9.543363955708124e-06, "loss": 0.90059509, "memory(GiB)": 740.28, "step": 14210, "train_speed(iter/s)": 0.3782 }, { "acc": 0.74477844, "epoch": 0.36060432617377947, "grad_norm": 3.78125, "learning_rate": 9.542926034280471e-06, "loss": 0.93826704, "memory(GiB)": 740.28, "step": 14215, "train_speed(iter/s)": 0.377685 }, { "acc": 0.74798465, "epoch": 0.36073116554281703, "grad_norm": 3.3125, "learning_rate": 9.542487913023973e-06, "loss": 0.99291048, "memory(GiB)": 740.28, "step": 14220, "train_speed(iter/s)": 0.377146 }, { "acc": 0.76822782, "epoch": 0.36085800491185455, "grad_norm": 3.609375, "learning_rate": 9.542049591957906e-06, "loss": 0.85819292, "memory(GiB)": 740.28, "step": 14225, "train_speed(iter/s)": 0.3766 }, { "acc": 0.76790605, "epoch": 0.3609848442808921, "grad_norm": 3.015625, "learning_rate": 9.541611071101549e-06, "loss": 0.89169216, "memory(GiB)": 740.28, "step": 14230, "train_speed(iter/s)": 0.376066 }, { "acc": 0.75703321, "epoch": 0.3611116836499297, "grad_norm": 3.296875, "learning_rate": 9.541172350474193e-06, "loss": 0.90543051, "memory(GiB)": 740.28, "step": 14235, "train_speed(iter/s)": 0.375456 }, { "acc": 0.75438089, "epoch": 0.36123852301896725, "grad_norm": 4.71875, "learning_rate": 9.540733430095133e-06, "loss": 0.94620171, "memory(GiB)": 740.28, "step": 14240, "train_speed(iter/s)": 0.374918 }, { "acc": 0.75956168, "epoch": 0.3613653623880048, "grad_norm": 3.3125, "learning_rate": 9.540294309983675e-06, "loss": 0.95208845, "memory(GiB)": 740.28, "step": 14245, "train_speed(iter/s)": 0.374449 }, { "acc": 0.7756855, "epoch": 0.3614922017570424, "grad_norm": 2.984375, "learning_rate": 9.53985499015914e-06, "loss": 0.8930829, "memory(GiB)": 740.28, "step": 14250, "train_speed(iter/s)": 0.373916 }, { "acc": 0.76068387, "epoch": 0.3616190411260799, "grad_norm": 3.15625, "learning_rate": 9.539415470640847e-06, "loss": 0.93487997, "memory(GiB)": 740.28, "step": 14255, "train_speed(iter/s)": 0.373393 }, { "acc": 0.755969, "epoch": 0.36174588049511747, "grad_norm": 3.734375, "learning_rate": 9.538975751448132e-06, "loss": 1.0349617, "memory(GiB)": 740.28, "step": 14260, "train_speed(iter/s)": 0.37295 }, { "acc": 0.75584188, "epoch": 0.36187271986415503, "grad_norm": 3.234375, "learning_rate": 9.538535832600335e-06, "loss": 0.9425601, "memory(GiB)": 740.28, "step": 14265, "train_speed(iter/s)": 0.372405 }, { "acc": 0.76570153, "epoch": 0.3619995592331926, "grad_norm": 3.734375, "learning_rate": 9.538095714116809e-06, "loss": 0.91068735, "memory(GiB)": 740.28, "step": 14270, "train_speed(iter/s)": 0.371885 }, { "acc": 0.76655493, "epoch": 0.36212639860223017, "grad_norm": 3.015625, "learning_rate": 9.537655396016909e-06, "loss": 0.93455887, "memory(GiB)": 740.28, "step": 14275, "train_speed(iter/s)": 0.371319 }, { "acc": 0.77346129, "epoch": 0.36225323797126774, "grad_norm": 4.09375, "learning_rate": 9.537214878320007e-06, "loss": 0.86082058, "memory(GiB)": 740.28, "step": 14280, "train_speed(iter/s)": 0.370827 }, { "acc": 0.74845576, "epoch": 0.36238007734030525, "grad_norm": 3.234375, "learning_rate": 9.53677416104548e-06, "loss": 0.92534943, "memory(GiB)": 740.28, "step": 14285, "train_speed(iter/s)": 0.370348 }, { "acc": 0.77098479, "epoch": 0.3625069167093428, "grad_norm": 4.6875, "learning_rate": 9.536333244212712e-06, "loss": 0.92582026, "memory(GiB)": 740.28, "step": 14290, "train_speed(iter/s)": 0.369912 }, { "acc": 0.76798077, "epoch": 0.3626337560783804, "grad_norm": 3.390625, "learning_rate": 9.535892127841098e-06, "loss": 0.89736338, "memory(GiB)": 740.28, "step": 14295, "train_speed(iter/s)": 0.369404 }, { "acc": 0.74263897, "epoch": 0.36276059544741796, "grad_norm": 4.15625, "learning_rate": 9.535450811950042e-06, "loss": 0.97923317, "memory(GiB)": 740.28, "step": 14300, "train_speed(iter/s)": 0.368874 }, { "acc": 0.75648446, "epoch": 0.3628874348164555, "grad_norm": 3.328125, "learning_rate": 9.535009296558955e-06, "loss": 0.89240675, "memory(GiB)": 740.28, "step": 14305, "train_speed(iter/s)": 0.368338 }, { "acc": 0.7597805, "epoch": 0.3630142741854931, "grad_norm": 3.5625, "learning_rate": 9.534567581687259e-06, "loss": 0.88909807, "memory(GiB)": 740.28, "step": 14310, "train_speed(iter/s)": 0.367811 }, { "acc": 0.748246, "epoch": 0.3631411135545306, "grad_norm": 3.640625, "learning_rate": 9.534125667354383e-06, "loss": 1.01964064, "memory(GiB)": 740.28, "step": 14315, "train_speed(iter/s)": 0.367228 }, { "acc": 0.75549145, "epoch": 0.36326795292356817, "grad_norm": 3.5, "learning_rate": 9.533683553579765e-06, "loss": 0.92811785, "memory(GiB)": 740.28, "step": 14320, "train_speed(iter/s)": 0.366671 }, { "acc": 0.75225549, "epoch": 0.36339479229260574, "grad_norm": 3.390625, "learning_rate": 9.533241240382854e-06, "loss": 0.9887517, "memory(GiB)": 740.28, "step": 14325, "train_speed(iter/s)": 0.366178 }, { "acc": 0.76865187, "epoch": 0.3635216316616433, "grad_norm": 3.96875, "learning_rate": 9.532798727783103e-06, "loss": 0.88602629, "memory(GiB)": 740.28, "step": 14330, "train_speed(iter/s)": 0.365721 }, { "acc": 0.7519743, "epoch": 0.3636484710306809, "grad_norm": 3.78125, "learning_rate": 9.53235601579998e-06, "loss": 0.93856983, "memory(GiB)": 740.28, "step": 14335, "train_speed(iter/s)": 0.365219 }, { "acc": 0.75673037, "epoch": 0.36377531039971844, "grad_norm": 4.1875, "learning_rate": 9.531913104452954e-06, "loss": 0.95507174, "memory(GiB)": 740.28, "step": 14340, "train_speed(iter/s)": 0.364771 }, { "acc": 0.7530993, "epoch": 0.36390214976875596, "grad_norm": 3.453125, "learning_rate": 9.531469993761512e-06, "loss": 0.93815651, "memory(GiB)": 740.28, "step": 14345, "train_speed(iter/s)": 0.364272 }, { "acc": 0.76325302, "epoch": 0.3640289891377935, "grad_norm": 5.34375, "learning_rate": 9.531026683745144e-06, "loss": 0.92372246, "memory(GiB)": 740.28, "step": 14350, "train_speed(iter/s)": 0.363809 }, { "acc": 0.7485486, "epoch": 0.3641558285068311, "grad_norm": 3.375, "learning_rate": 9.530583174423349e-06, "loss": 0.93670292, "memory(GiB)": 740.28, "step": 14355, "train_speed(iter/s)": 0.363316 }, { "acc": 0.75914078, "epoch": 0.36428266787586866, "grad_norm": 3.3125, "learning_rate": 9.530139465815633e-06, "loss": 0.88937798, "memory(GiB)": 740.28, "step": 14360, "train_speed(iter/s)": 0.36282 }, { "acc": 0.77144465, "epoch": 0.36440950724490623, "grad_norm": 3.671875, "learning_rate": 9.529695557941518e-06, "loss": 0.89295187, "memory(GiB)": 740.28, "step": 14365, "train_speed(iter/s)": 0.362342 }, { "acc": 0.75602698, "epoch": 0.3645363466139438, "grad_norm": 3.46875, "learning_rate": 9.529251450820526e-06, "loss": 0.96805801, "memory(GiB)": 740.28, "step": 14370, "train_speed(iter/s)": 0.361864 }, { "acc": 0.76504717, "epoch": 0.3646631859829813, "grad_norm": 3.375, "learning_rate": 9.528807144472196e-06, "loss": 0.92118845, "memory(GiB)": 740.28, "step": 14375, "train_speed(iter/s)": 0.361302 }, { "acc": 0.76012197, "epoch": 0.3647900253520189, "grad_norm": 3.34375, "learning_rate": 9.528362638916069e-06, "loss": 0.95836191, "memory(GiB)": 740.28, "step": 14380, "train_speed(iter/s)": 0.360817 }, { "acc": 0.75597057, "epoch": 0.36491686472105644, "grad_norm": 3.140625, "learning_rate": 9.527917934171696e-06, "loss": 0.94461527, "memory(GiB)": 740.28, "step": 14385, "train_speed(iter/s)": 0.360373 }, { "acc": 0.74470782, "epoch": 0.365043704090094, "grad_norm": 4.25, "learning_rate": 9.527473030258642e-06, "loss": 1.00864038, "memory(GiB)": 740.28, "step": 14390, "train_speed(iter/s)": 0.359847 }, { "acc": 0.77177172, "epoch": 0.3651705434591316, "grad_norm": 3.65625, "learning_rate": 9.527027927196473e-06, "loss": 0.8346242, "memory(GiB)": 740.28, "step": 14395, "train_speed(iter/s)": 0.359413 }, { "acc": 0.75645685, "epoch": 0.36529738282816915, "grad_norm": 3.4375, "learning_rate": 9.526582625004772e-06, "loss": 0.92977295, "memory(GiB)": 740.28, "step": 14400, "train_speed(iter/s)": 0.358874 }, { "acc": 0.75568223, "epoch": 0.36542422219720666, "grad_norm": 3.484375, "learning_rate": 9.52613712370312e-06, "loss": 0.89822292, "memory(GiB)": 740.28, "step": 14405, "train_speed(iter/s)": 0.358403 }, { "acc": 0.75113516, "epoch": 0.36555106156624423, "grad_norm": 3.109375, "learning_rate": 9.52569142331112e-06, "loss": 0.92631826, "memory(GiB)": 740.28, "step": 14410, "train_speed(iter/s)": 0.357907 }, { "acc": 0.7445364, "epoch": 0.3656779009352818, "grad_norm": 3.21875, "learning_rate": 9.525245523848375e-06, "loss": 0.97668295, "memory(GiB)": 740.28, "step": 14415, "train_speed(iter/s)": 0.357451 }, { "acc": 0.74816661, "epoch": 0.36580474030431936, "grad_norm": 3.265625, "learning_rate": 9.524799425334494e-06, "loss": 0.9624136, "memory(GiB)": 740.28, "step": 14420, "train_speed(iter/s)": 0.356936 }, { "acc": 0.75982566, "epoch": 0.36593157967335693, "grad_norm": 3.484375, "learning_rate": 9.524353127789106e-06, "loss": 0.90873528, "memory(GiB)": 740.28, "step": 14425, "train_speed(iter/s)": 0.356495 }, { "acc": 0.76257954, "epoch": 0.3660584190423945, "grad_norm": 4.15625, "learning_rate": 9.52390663123184e-06, "loss": 0.93506002, "memory(GiB)": 740.28, "step": 14430, "train_speed(iter/s)": 0.356063 }, { "acc": 0.76129799, "epoch": 0.366185258411432, "grad_norm": 3.4375, "learning_rate": 9.523459935682332e-06, "loss": 0.94036608, "memory(GiB)": 740.28, "step": 14435, "train_speed(iter/s)": 0.355606 }, { "acc": 0.77530479, "epoch": 0.3663120977804696, "grad_norm": 4.09375, "learning_rate": 9.523013041160237e-06, "loss": 0.88150291, "memory(GiB)": 740.28, "step": 14440, "train_speed(iter/s)": 0.355187 }, { "acc": 0.74699502, "epoch": 0.36643893714950715, "grad_norm": 3.765625, "learning_rate": 9.522565947685207e-06, "loss": 0.98796692, "memory(GiB)": 740.28, "step": 14445, "train_speed(iter/s)": 0.354733 }, { "acc": 0.76896725, "epoch": 0.3665657765185447, "grad_norm": 3.65625, "learning_rate": 9.522118655276913e-06, "loss": 0.88096752, "memory(GiB)": 740.28, "step": 14450, "train_speed(iter/s)": 0.354266 }, { "acc": 0.77691927, "epoch": 0.3666926158875823, "grad_norm": 3.359375, "learning_rate": 9.521671163955026e-06, "loss": 0.89121151, "memory(GiB)": 740.28, "step": 14455, "train_speed(iter/s)": 0.353811 }, { "acc": 0.75617843, "epoch": 0.36681945525661985, "grad_norm": 3.15625, "learning_rate": 9.521223473739233e-06, "loss": 0.97315025, "memory(GiB)": 740.28, "step": 14460, "train_speed(iter/s)": 0.35335 }, { "acc": 0.75702391, "epoch": 0.36694629462565737, "grad_norm": 3.390625, "learning_rate": 9.520775584649225e-06, "loss": 0.94854698, "memory(GiB)": 740.28, "step": 14465, "train_speed(iter/s)": 0.352911 }, { "acc": 0.75745773, "epoch": 0.36707313399469493, "grad_norm": 3.46875, "learning_rate": 9.520327496704703e-06, "loss": 0.88823662, "memory(GiB)": 740.28, "step": 14470, "train_speed(iter/s)": 0.352404 }, { "acc": 0.7632566, "epoch": 0.3671999733637325, "grad_norm": 3.46875, "learning_rate": 9.519879209925374e-06, "loss": 0.86935635, "memory(GiB)": 740.28, "step": 14475, "train_speed(iter/s)": 0.351994 }, { "acc": 0.77261319, "epoch": 0.36732681273277007, "grad_norm": 3.859375, "learning_rate": 9.519430724330963e-06, "loss": 0.84093485, "memory(GiB)": 740.28, "step": 14480, "train_speed(iter/s)": 0.351571 }, { "acc": 0.770153, "epoch": 0.36745365210180764, "grad_norm": 3.5, "learning_rate": 9.518982039941194e-06, "loss": 0.90498486, "memory(GiB)": 740.28, "step": 14485, "train_speed(iter/s)": 0.351117 }, { "acc": 0.75018702, "epoch": 0.3675804914708452, "grad_norm": 2.96875, "learning_rate": 9.518533156775803e-06, "loss": 0.90786762, "memory(GiB)": 740.28, "step": 14490, "train_speed(iter/s)": 0.350646 }, { "acc": 0.7633718, "epoch": 0.3677073308398827, "grad_norm": 4.09375, "learning_rate": 9.518084074854535e-06, "loss": 0.91114273, "memory(GiB)": 740.28, "step": 14495, "train_speed(iter/s)": 0.350194 }, { "acc": 0.75243535, "epoch": 0.3678341702089203, "grad_norm": 4.46875, "learning_rate": 9.517634794197144e-06, "loss": 0.97741899, "memory(GiB)": 740.28, "step": 14500, "train_speed(iter/s)": 0.349745 }, { "epoch": 0.3678341702089203, "eval_acc": 0.7496587885500007, "eval_loss": 0.8958644866943359, "eval_runtime": 1151.7993, "eval_samples_per_second": 5.53, "eval_steps_per_second": 5.53, "step": 14500 }, { "acc": 0.77698359, "epoch": 0.36796100957795785, "grad_norm": 3.25, "learning_rate": 9.517185314823391e-06, "loss": 0.88857536, "memory(GiB)": 740.28, "step": 14505, "train_speed(iter/s)": 0.334084 }, { "acc": 0.7546771, "epoch": 0.3680878489469954, "grad_norm": 2.84375, "learning_rate": 9.516735636753051e-06, "loss": 0.89690161, "memory(GiB)": 740.28, "step": 14510, "train_speed(iter/s)": 0.333636 }, { "acc": 0.7609436, "epoch": 0.368214688316033, "grad_norm": 3.078125, "learning_rate": 9.5162857600059e-06, "loss": 0.92861118, "memory(GiB)": 740.28, "step": 14515, "train_speed(iter/s)": 0.333243 }, { "acc": 0.74994893, "epoch": 0.36834152768507056, "grad_norm": 3.671875, "learning_rate": 9.51583568460173e-06, "loss": 1.00284796, "memory(GiB)": 740.28, "step": 14520, "train_speed(iter/s)": 0.332861 }, { "acc": 0.75637622, "epoch": 0.36846836705410807, "grad_norm": 5.21875, "learning_rate": 9.515385410560337e-06, "loss": 0.9538415, "memory(GiB)": 740.28, "step": 14525, "train_speed(iter/s)": 0.332455 }, { "acc": 0.77549825, "epoch": 0.36859520642314564, "grad_norm": 3.953125, "learning_rate": 9.514934937901523e-06, "loss": 0.86055355, "memory(GiB)": 740.28, "step": 14530, "train_speed(iter/s)": 0.332068 }, { "acc": 0.7562685, "epoch": 0.3687220457921832, "grad_norm": 4.28125, "learning_rate": 9.51448426664511e-06, "loss": 0.97626219, "memory(GiB)": 740.28, "step": 14535, "train_speed(iter/s)": 0.33171 }, { "acc": 0.76264925, "epoch": 0.3688488851612208, "grad_norm": 4.75, "learning_rate": 9.514033396810917e-06, "loss": 0.9165597, "memory(GiB)": 740.28, "step": 14540, "train_speed(iter/s)": 0.331346 }, { "acc": 0.76299381, "epoch": 0.36897572453025834, "grad_norm": 3.578125, "learning_rate": 9.513582328418777e-06, "loss": 0.97683277, "memory(GiB)": 740.28, "step": 14545, "train_speed(iter/s)": 0.330973 }, { "acc": 0.77144899, "epoch": 0.3691025638992959, "grad_norm": 4.5, "learning_rate": 9.513131061488533e-06, "loss": 0.88258562, "memory(GiB)": 740.28, "step": 14550, "train_speed(iter/s)": 0.330616 }, { "acc": 0.75165, "epoch": 0.3692294032683334, "grad_norm": 3.28125, "learning_rate": 9.512679596040031e-06, "loss": 0.94000959, "memory(GiB)": 740.28, "step": 14555, "train_speed(iter/s)": 0.33018 }, { "acc": 0.77041607, "epoch": 0.369356242637371, "grad_norm": 3.203125, "learning_rate": 9.512227932093133e-06, "loss": 0.8642231, "memory(GiB)": 740.28, "step": 14560, "train_speed(iter/s)": 0.329766 }, { "acc": 0.75890865, "epoch": 0.36948308200640856, "grad_norm": 3.546875, "learning_rate": 9.511776069667705e-06, "loss": 1.0185606, "memory(GiB)": 740.28, "step": 14565, "train_speed(iter/s)": 0.329374 }, { "acc": 0.76771188, "epoch": 0.3696099213754461, "grad_norm": 3.453125, "learning_rate": 9.511324008783624e-06, "loss": 0.87577963, "memory(GiB)": 740.28, "step": 14570, "train_speed(iter/s)": 0.328978 }, { "acc": 0.75840397, "epoch": 0.3697367607444837, "grad_norm": 3.4375, "learning_rate": 9.510871749460772e-06, "loss": 0.9219759, "memory(GiB)": 740.28, "step": 14575, "train_speed(iter/s)": 0.328561 }, { "acc": 0.76130762, "epoch": 0.36986360011352126, "grad_norm": 3.765625, "learning_rate": 9.510419291719047e-06, "loss": 0.95289669, "memory(GiB)": 740.28, "step": 14580, "train_speed(iter/s)": 0.328184 }, { "acc": 0.76945534, "epoch": 0.3699904394825588, "grad_norm": 3.671875, "learning_rate": 9.509966635578345e-06, "loss": 0.90800314, "memory(GiB)": 740.28, "step": 14585, "train_speed(iter/s)": 0.327812 }, { "acc": 0.76877112, "epoch": 0.37011727885159634, "grad_norm": 3.53125, "learning_rate": 9.509513781058583e-06, "loss": 0.96281996, "memory(GiB)": 740.28, "step": 14590, "train_speed(iter/s)": 0.327414 }, { "acc": 0.75724082, "epoch": 0.3702441182206339, "grad_norm": 4.1875, "learning_rate": 9.509060728179677e-06, "loss": 0.92004623, "memory(GiB)": 740.28, "step": 14595, "train_speed(iter/s)": 0.327022 }, { "acc": 0.75090127, "epoch": 0.3703709575896715, "grad_norm": 3.15625, "learning_rate": 9.508607476961556e-06, "loss": 0.93088827, "memory(GiB)": 740.28, "step": 14600, "train_speed(iter/s)": 0.326614 }, { "acc": 0.76726661, "epoch": 0.37049779695870905, "grad_norm": 3.265625, "learning_rate": 9.508154027424158e-06, "loss": 0.91775379, "memory(GiB)": 740.28, "step": 14605, "train_speed(iter/s)": 0.326226 }, { "acc": 0.75945973, "epoch": 0.3706246363277466, "grad_norm": 3.53125, "learning_rate": 9.507700379587428e-06, "loss": 0.95072422, "memory(GiB)": 740.28, "step": 14610, "train_speed(iter/s)": 0.325855 }, { "acc": 0.75686498, "epoch": 0.3707514756967841, "grad_norm": 3.78125, "learning_rate": 9.50724653347132e-06, "loss": 0.91178236, "memory(GiB)": 740.28, "step": 14615, "train_speed(iter/s)": 0.325423 }, { "acc": 0.7404995, "epoch": 0.3708783150658217, "grad_norm": 3.1875, "learning_rate": 9.506792489095799e-06, "loss": 0.97323093, "memory(GiB)": 740.28, "step": 14620, "train_speed(iter/s)": 0.324985 }, { "acc": 0.75562673, "epoch": 0.37100515443485926, "grad_norm": 3.515625, "learning_rate": 9.506338246480834e-06, "loss": 0.9657239, "memory(GiB)": 740.28, "step": 14625, "train_speed(iter/s)": 0.324574 }, { "acc": 0.76660271, "epoch": 0.37113199380389683, "grad_norm": 4.09375, "learning_rate": 9.50588380564641e-06, "loss": 0.90902147, "memory(GiB)": 740.28, "step": 14630, "train_speed(iter/s)": 0.324237 }, { "acc": 0.75417652, "epoch": 0.3712588331729344, "grad_norm": 3.4375, "learning_rate": 9.505429166612514e-06, "loss": 0.88540487, "memory(GiB)": 740.28, "step": 14635, "train_speed(iter/s)": 0.323859 }, { "acc": 0.7607533, "epoch": 0.37138567254197197, "grad_norm": 3.765625, "learning_rate": 9.504974329399143e-06, "loss": 0.89209003, "memory(GiB)": 740.28, "step": 14640, "train_speed(iter/s)": 0.323524 }, { "acc": 0.76030531, "epoch": 0.3715125119110095, "grad_norm": 4.03125, "learning_rate": 9.504519294026306e-06, "loss": 0.93127117, "memory(GiB)": 740.28, "step": 14645, "train_speed(iter/s)": 0.323142 }, { "acc": 0.74492249, "epoch": 0.37163935128004705, "grad_norm": 3.3125, "learning_rate": 9.504064060514015e-06, "loss": 0.95165815, "memory(GiB)": 740.28, "step": 14650, "train_speed(iter/s)": 0.322807 }, { "acc": 0.78275924, "epoch": 0.3717661906490846, "grad_norm": 3.640625, "learning_rate": 9.503608628882299e-06, "loss": 0.84304857, "memory(GiB)": 740.28, "step": 14655, "train_speed(iter/s)": 0.322386 }, { "acc": 0.74090381, "epoch": 0.3718930300181222, "grad_norm": 3.296875, "learning_rate": 9.503152999151189e-06, "loss": 0.97300053, "memory(GiB)": 740.28, "step": 14660, "train_speed(iter/s)": 0.32206 }, { "acc": 0.75157895, "epoch": 0.37201986938715975, "grad_norm": 3.390625, "learning_rate": 9.502697171340726e-06, "loss": 0.90698156, "memory(GiB)": 740.28, "step": 14665, "train_speed(iter/s)": 0.32163 }, { "acc": 0.7650702, "epoch": 0.3721467087561973, "grad_norm": 3.53125, "learning_rate": 9.50224114547096e-06, "loss": 0.93270416, "memory(GiB)": 740.28, "step": 14670, "train_speed(iter/s)": 0.321285 }, { "acc": 0.75243864, "epoch": 0.37227354812523483, "grad_norm": 3.8125, "learning_rate": 9.501784921561952e-06, "loss": 0.94023333, "memory(GiB)": 740.28, "step": 14675, "train_speed(iter/s)": 0.320924 }, { "acc": 0.76271653, "epoch": 0.3724003874942724, "grad_norm": 3.90625, "learning_rate": 9.501328499633768e-06, "loss": 0.90502148, "memory(GiB)": 740.28, "step": 14680, "train_speed(iter/s)": 0.320565 }, { "acc": 0.7540318, "epoch": 0.37252722686330997, "grad_norm": 3.6875, "learning_rate": 9.500871879706487e-06, "loss": 0.96559258, "memory(GiB)": 740.28, "step": 14685, "train_speed(iter/s)": 0.320205 }, { "acc": 0.75125713, "epoch": 0.37265406623234754, "grad_norm": 3.421875, "learning_rate": 9.500415061800192e-06, "loss": 0.9022049, "memory(GiB)": 740.28, "step": 14690, "train_speed(iter/s)": 0.319826 }, { "acc": 0.75556464, "epoch": 0.3727809056013851, "grad_norm": 3.96875, "learning_rate": 9.499958045934977e-06, "loss": 0.98777075, "memory(GiB)": 740.28, "step": 14695, "train_speed(iter/s)": 0.319483 }, { "acc": 0.74282975, "epoch": 0.37290774497042267, "grad_norm": 4.0, "learning_rate": 9.499500832130945e-06, "loss": 1.0044817, "memory(GiB)": 740.28, "step": 14700, "train_speed(iter/s)": 0.319135 }, { "acc": 0.76786242, "epoch": 0.3730345843394602, "grad_norm": 3.546875, "learning_rate": 9.49904342040821e-06, "loss": 0.87925949, "memory(GiB)": 740.28, "step": 14705, "train_speed(iter/s)": 0.318726 }, { "acc": 0.76419792, "epoch": 0.37316142370849775, "grad_norm": 3.6875, "learning_rate": 9.498585810786886e-06, "loss": 0.87853355, "memory(GiB)": 740.28, "step": 14710, "train_speed(iter/s)": 0.318371 }, { "acc": 0.75786343, "epoch": 0.3732882630775353, "grad_norm": 2.84375, "learning_rate": 9.498128003287108e-06, "loss": 0.92463484, "memory(GiB)": 740.28, "step": 14715, "train_speed(iter/s)": 0.317977 }, { "acc": 0.77133121, "epoch": 0.3734151024465729, "grad_norm": 3.546875, "learning_rate": 9.497669997929011e-06, "loss": 0.89466591, "memory(GiB)": 740.28, "step": 14720, "train_speed(iter/s)": 0.31764 }, { "acc": 0.75494108, "epoch": 0.37354194181561046, "grad_norm": 2.96875, "learning_rate": 9.497211794732742e-06, "loss": 0.9647109, "memory(GiB)": 740.28, "step": 14725, "train_speed(iter/s)": 0.317262 }, { "acc": 0.75474472, "epoch": 0.373668781184648, "grad_norm": 3.328125, "learning_rate": 9.496753393718453e-06, "loss": 0.9697422, "memory(GiB)": 740.28, "step": 14730, "train_speed(iter/s)": 0.316927 }, { "acc": 0.75891066, "epoch": 0.37379562055368554, "grad_norm": 3.0625, "learning_rate": 9.49629479490631e-06, "loss": 0.90438347, "memory(GiB)": 740.28, "step": 14735, "train_speed(iter/s)": 0.316574 }, { "acc": 0.763168, "epoch": 0.3739224599227231, "grad_norm": 2.96875, "learning_rate": 9.495835998316486e-06, "loss": 0.92120647, "memory(GiB)": 740.28, "step": 14740, "train_speed(iter/s)": 0.316178 }, { "acc": 0.75415587, "epoch": 0.3740492992917607, "grad_norm": 3.234375, "learning_rate": 9.495377003969162e-06, "loss": 0.94731951, "memory(GiB)": 740.28, "step": 14745, "train_speed(iter/s)": 0.315794 }, { "acc": 0.77193799, "epoch": 0.37417613866079824, "grad_norm": 3.65625, "learning_rate": 9.494917811884525e-06, "loss": 0.90475807, "memory(GiB)": 740.28, "step": 14750, "train_speed(iter/s)": 0.315438 }, { "acc": 0.75968642, "epoch": 0.3743029780298358, "grad_norm": 3.1875, "learning_rate": 9.494458422082776e-06, "loss": 0.89460878, "memory(GiB)": 740.28, "step": 14755, "train_speed(iter/s)": 0.315096 }, { "acc": 0.76006942, "epoch": 0.3744298173988734, "grad_norm": 4.46875, "learning_rate": 9.493998834584121e-06, "loss": 0.97824326, "memory(GiB)": 740.28, "step": 14760, "train_speed(iter/s)": 0.314761 }, { "acc": 0.76588945, "epoch": 0.3745566567679109, "grad_norm": 3.359375, "learning_rate": 9.493539049408775e-06, "loss": 0.90622911, "memory(GiB)": 740.28, "step": 14765, "train_speed(iter/s)": 0.314413 }, { "acc": 0.76568389, "epoch": 0.37468349613694846, "grad_norm": 2.953125, "learning_rate": 9.493079066576966e-06, "loss": 0.90760651, "memory(GiB)": 740.28, "step": 14770, "train_speed(iter/s)": 0.314108 }, { "acc": 0.76541057, "epoch": 0.374810335505986, "grad_norm": 3.921875, "learning_rate": 9.492618886108924e-06, "loss": 0.91490946, "memory(GiB)": 740.28, "step": 14775, "train_speed(iter/s)": 0.313801 }, { "acc": 0.75901556, "epoch": 0.3749371748750236, "grad_norm": 4.15625, "learning_rate": 9.492158508024889e-06, "loss": 0.90110016, "memory(GiB)": 740.28, "step": 14780, "train_speed(iter/s)": 0.313442 }, { "acc": 0.75197148, "epoch": 0.37506401424406116, "grad_norm": 4.375, "learning_rate": 9.491697932345116e-06, "loss": 0.95477028, "memory(GiB)": 740.28, "step": 14785, "train_speed(iter/s)": 0.313068 }, { "acc": 0.7463829, "epoch": 0.37519085361309873, "grad_norm": 3.671875, "learning_rate": 9.491237159089863e-06, "loss": 0.9407711, "memory(GiB)": 740.28, "step": 14790, "train_speed(iter/s)": 0.312685 }, { "acc": 0.76663795, "epoch": 0.37531769298213624, "grad_norm": 2.921875, "learning_rate": 9.490776188279397e-06, "loss": 0.93349628, "memory(GiB)": 740.28, "step": 14795, "train_speed(iter/s)": 0.312348 }, { "acc": 0.75715661, "epoch": 0.3754445323511738, "grad_norm": 2.953125, "learning_rate": 9.490315019933993e-06, "loss": 0.99480085, "memory(GiB)": 740.28, "step": 14800, "train_speed(iter/s)": 0.311992 }, { "acc": 0.77055063, "epoch": 0.3755713717202114, "grad_norm": 3.59375, "learning_rate": 9.489853654073941e-06, "loss": 0.89253597, "memory(GiB)": 740.28, "step": 14805, "train_speed(iter/s)": 0.31164 }, { "acc": 0.75666318, "epoch": 0.37569821108924895, "grad_norm": 3.125, "learning_rate": 9.489392090719532e-06, "loss": 0.91267586, "memory(GiB)": 740.28, "step": 14810, "train_speed(iter/s)": 0.311297 }, { "acc": 0.74968476, "epoch": 0.3758250504582865, "grad_norm": 3.5, "learning_rate": 9.488930329891066e-06, "loss": 0.96703262, "memory(GiB)": 740.28, "step": 14815, "train_speed(iter/s)": 0.310954 }, { "acc": 0.75094447, "epoch": 0.3759518898273241, "grad_norm": 3.453125, "learning_rate": 9.488468371608861e-06, "loss": 0.91115532, "memory(GiB)": 740.28, "step": 14820, "train_speed(iter/s)": 0.310617 }, { "acc": 0.76724234, "epoch": 0.3760787291963616, "grad_norm": 3.546875, "learning_rate": 9.48800621589323e-06, "loss": 0.91889334, "memory(GiB)": 740.28, "step": 14825, "train_speed(iter/s)": 0.310273 }, { "acc": 0.76555367, "epoch": 0.37620556856539916, "grad_norm": 3.84375, "learning_rate": 9.487543862764509e-06, "loss": 0.9285882, "memory(GiB)": 740.28, "step": 14830, "train_speed(iter/s)": 0.309965 }, { "acc": 0.75449314, "epoch": 0.37633240793443673, "grad_norm": 4.03125, "learning_rate": 9.48708131224303e-06, "loss": 0.9745512, "memory(GiB)": 740.28, "step": 14835, "train_speed(iter/s)": 0.309664 }, { "acc": 0.76138763, "epoch": 0.3764592473034743, "grad_norm": 3.171875, "learning_rate": 9.486618564349141e-06, "loss": 0.87543106, "memory(GiB)": 740.28, "step": 14840, "train_speed(iter/s)": 0.309318 }, { "acc": 0.76220155, "epoch": 0.37658608667251187, "grad_norm": 3.171875, "learning_rate": 9.486155619103196e-06, "loss": 0.89237022, "memory(GiB)": 740.28, "step": 14845, "train_speed(iter/s)": 0.308976 }, { "acc": 0.76248822, "epoch": 0.37671292604154943, "grad_norm": 3.546875, "learning_rate": 9.485692476525558e-06, "loss": 0.8762188, "memory(GiB)": 740.28, "step": 14850, "train_speed(iter/s)": 0.308663 }, { "acc": 0.76876216, "epoch": 0.37683976541058695, "grad_norm": 3.765625, "learning_rate": 9.485229136636602e-06, "loss": 0.92925959, "memory(GiB)": 740.28, "step": 14855, "train_speed(iter/s)": 0.308381 }, { "acc": 0.76939168, "epoch": 0.3769666047796245, "grad_norm": 4.21875, "learning_rate": 9.484765599456706e-06, "loss": 0.9293045, "memory(GiB)": 740.28, "step": 14860, "train_speed(iter/s)": 0.308061 }, { "acc": 0.76629591, "epoch": 0.3770934441486621, "grad_norm": 3.453125, "learning_rate": 9.484301865006262e-06, "loss": 0.86264238, "memory(GiB)": 740.28, "step": 14865, "train_speed(iter/s)": 0.307773 }, { "acc": 0.75776978, "epoch": 0.37722028351769965, "grad_norm": 3.296875, "learning_rate": 9.483837933305667e-06, "loss": 0.91716967, "memory(GiB)": 740.28, "step": 14870, "train_speed(iter/s)": 0.307418 }, { "acc": 0.76374245, "epoch": 0.3773471228867372, "grad_norm": 3.875, "learning_rate": 9.483373804375327e-06, "loss": 0.93577538, "memory(GiB)": 740.28, "step": 14875, "train_speed(iter/s)": 0.307092 }, { "acc": 0.75421753, "epoch": 0.37747396225577473, "grad_norm": 3.203125, "learning_rate": 9.482909478235657e-06, "loss": 0.93672838, "memory(GiB)": 740.28, "step": 14880, "train_speed(iter/s)": 0.306778 }, { "acc": 0.75403605, "epoch": 0.3776008016248123, "grad_norm": 3.828125, "learning_rate": 9.482444954907084e-06, "loss": 0.95232964, "memory(GiB)": 740.28, "step": 14885, "train_speed(iter/s)": 0.306491 }, { "acc": 0.75608854, "epoch": 0.37772764099384987, "grad_norm": 3.84375, "learning_rate": 9.48198023441004e-06, "loss": 0.93066339, "memory(GiB)": 740.28, "step": 14890, "train_speed(iter/s)": 0.306148 }, { "acc": 0.77334833, "epoch": 0.37785448036288743, "grad_norm": 3.78125, "learning_rate": 9.481515316764964e-06, "loss": 0.86722364, "memory(GiB)": 740.28, "step": 14895, "train_speed(iter/s)": 0.305822 }, { "acc": 0.76170626, "epoch": 0.377981319731925, "grad_norm": 3.359375, "learning_rate": 9.481050201992307e-06, "loss": 0.92196856, "memory(GiB)": 740.29, "step": 14900, "train_speed(iter/s)": 0.305473 }, { "acc": 0.74529619, "epoch": 0.37810815910096257, "grad_norm": 3.515625, "learning_rate": 9.480584890112531e-06, "loss": 0.97052555, "memory(GiB)": 740.29, "step": 14905, "train_speed(iter/s)": 0.305135 }, { "acc": 0.74934058, "epoch": 0.3782349984700001, "grad_norm": 3.84375, "learning_rate": 9.480119381146103e-06, "loss": 0.97769051, "memory(GiB)": 740.29, "step": 14910, "train_speed(iter/s)": 0.304804 }, { "acc": 0.7740624, "epoch": 0.37836183783903765, "grad_norm": 3.390625, "learning_rate": 9.479653675113497e-06, "loss": 0.9043045, "memory(GiB)": 740.29, "step": 14915, "train_speed(iter/s)": 0.304523 }, { "acc": 0.7684978, "epoch": 0.3784886772080752, "grad_norm": 4.09375, "learning_rate": 9.479187772035198e-06, "loss": 0.86419897, "memory(GiB)": 740.29, "step": 14920, "train_speed(iter/s)": 0.304222 }, { "acc": 0.76326513, "epoch": 0.3786155165771128, "grad_norm": 3.625, "learning_rate": 9.478721671931701e-06, "loss": 0.96118307, "memory(GiB)": 740.29, "step": 14925, "train_speed(iter/s)": 0.30389 }, { "acc": 0.76245427, "epoch": 0.37874235594615036, "grad_norm": 3.140625, "learning_rate": 9.478255374823508e-06, "loss": 0.89523134, "memory(GiB)": 740.29, "step": 14930, "train_speed(iter/s)": 0.303522 }, { "acc": 0.76030297, "epoch": 0.3788691953151879, "grad_norm": 3.859375, "learning_rate": 9.477788880731131e-06, "loss": 0.95013742, "memory(GiB)": 740.29, "step": 14935, "train_speed(iter/s)": 0.303191 }, { "acc": 0.7504571, "epoch": 0.37899603468422544, "grad_norm": 3.515625, "learning_rate": 9.477322189675087e-06, "loss": 0.959622, "memory(GiB)": 740.29, "step": 14940, "train_speed(iter/s)": 0.302816 }, { "acc": 0.76919074, "epoch": 0.379122874053263, "grad_norm": 3.046875, "learning_rate": 9.476855301675905e-06, "loss": 0.89652462, "memory(GiB)": 740.29, "step": 14945, "train_speed(iter/s)": 0.302533 }, { "acc": 0.76546745, "epoch": 0.37924971342230057, "grad_norm": 3.359375, "learning_rate": 9.476388216754122e-06, "loss": 0.8842042, "memory(GiB)": 740.29, "step": 14950, "train_speed(iter/s)": 0.302189 }, { "acc": 0.76048379, "epoch": 0.37937655279133814, "grad_norm": 3.828125, "learning_rate": 9.475920934930285e-06, "loss": 0.92363787, "memory(GiB)": 740.29, "step": 14955, "train_speed(iter/s)": 0.301893 }, { "acc": 0.76859875, "epoch": 0.3795033921603757, "grad_norm": 3.5, "learning_rate": 9.475453456224946e-06, "loss": 0.89737701, "memory(GiB)": 740.29, "step": 14960, "train_speed(iter/s)": 0.301548 }, { "acc": 0.76339688, "epoch": 0.3796302315294133, "grad_norm": 3.125, "learning_rate": 9.47498578065867e-06, "loss": 0.90439234, "memory(GiB)": 740.29, "step": 14965, "train_speed(iter/s)": 0.301236 }, { "acc": 0.77070513, "epoch": 0.3797570708984508, "grad_norm": 3.640625, "learning_rate": 9.474517908252029e-06, "loss": 0.86041975, "memory(GiB)": 740.29, "step": 14970, "train_speed(iter/s)": 0.300936 }, { "acc": 0.76354003, "epoch": 0.37988391026748836, "grad_norm": 3.28125, "learning_rate": 9.474049839025601e-06, "loss": 0.88361053, "memory(GiB)": 740.29, "step": 14975, "train_speed(iter/s)": 0.300614 }, { "acc": 0.74788756, "epoch": 0.3800107496365259, "grad_norm": 3.84375, "learning_rate": 9.473581572999974e-06, "loss": 0.94566097, "memory(GiB)": 740.29, "step": 14980, "train_speed(iter/s)": 0.300283 }, { "acc": 0.75469346, "epoch": 0.3801375890055635, "grad_norm": 3.921875, "learning_rate": 9.47311311019575e-06, "loss": 0.97503185, "memory(GiB)": 740.29, "step": 14985, "train_speed(iter/s)": 0.300026 }, { "acc": 0.7808826, "epoch": 0.38026442837460106, "grad_norm": 3.171875, "learning_rate": 9.47264445063353e-06, "loss": 0.89196472, "memory(GiB)": 740.29, "step": 14990, "train_speed(iter/s)": 0.299725 }, { "acc": 0.76640863, "epoch": 0.38039126774363863, "grad_norm": 3.578125, "learning_rate": 9.472175594333932e-06, "loss": 0.90357733, "memory(GiB)": 740.29, "step": 14995, "train_speed(iter/s)": 0.299429 }, { "acc": 0.7644496, "epoch": 0.38051810711267614, "grad_norm": 3.328125, "learning_rate": 9.47170654131758e-06, "loss": 0.92444201, "memory(GiB)": 740.29, "step": 15000, "train_speed(iter/s)": 0.299127 }, { "epoch": 0.38051810711267614, "eval_acc": 0.7499549855724059, "eval_loss": 0.8950862288475037, "eval_runtime": 1150.836, "eval_samples_per_second": 5.535, "eval_steps_per_second": 5.535, "step": 15000 }, { "acc": 0.76349607, "epoch": 0.3806449464817137, "grad_norm": 3.984375, "learning_rate": 9.471237291605105e-06, "loss": 0.92128, "memory(GiB)": 740.29, "step": 15005, "train_speed(iter/s)": 0.287383 }, { "acc": 0.77532601, "epoch": 0.3807717858507513, "grad_norm": 3.578125, "learning_rate": 9.470767845217146e-06, "loss": 0.83099899, "memory(GiB)": 740.29, "step": 15010, "train_speed(iter/s)": 0.287121 }, { "acc": 0.75021348, "epoch": 0.38089862521978884, "grad_norm": 3.875, "learning_rate": 9.470298202174356e-06, "loss": 0.92859888, "memory(GiB)": 740.29, "step": 15015, "train_speed(iter/s)": 0.286875 }, { "acc": 0.75921421, "epoch": 0.3810254645888264, "grad_norm": 4.78125, "learning_rate": 9.469828362497391e-06, "loss": 0.94702358, "memory(GiB)": 740.29, "step": 15020, "train_speed(iter/s)": 0.286631 }, { "acc": 0.76020217, "epoch": 0.381152303957864, "grad_norm": 3.8125, "learning_rate": 9.469358326206916e-06, "loss": 0.91925411, "memory(GiB)": 740.29, "step": 15025, "train_speed(iter/s)": 0.286338 }, { "acc": 0.77130151, "epoch": 0.3812791433269015, "grad_norm": 3.59375, "learning_rate": 9.468888093323612e-06, "loss": 0.86575956, "memory(GiB)": 740.29, "step": 15030, "train_speed(iter/s)": 0.286078 }, { "acc": 0.76552582, "epoch": 0.38140598269593906, "grad_norm": 3.078125, "learning_rate": 9.468417663868156e-06, "loss": 0.92984114, "memory(GiB)": 740.29, "step": 15035, "train_speed(iter/s)": 0.285789 }, { "acc": 0.75107551, "epoch": 0.38153282206497663, "grad_norm": 3.484375, "learning_rate": 9.467947037861248e-06, "loss": 0.96585236, "memory(GiB)": 740.29, "step": 15040, "train_speed(iter/s)": 0.28553 }, { "acc": 0.76228724, "epoch": 0.3816596614340142, "grad_norm": 3.140625, "learning_rate": 9.467476215323583e-06, "loss": 0.92229691, "memory(GiB)": 740.29, "step": 15045, "train_speed(iter/s)": 0.285255 }, { "acc": 0.76099796, "epoch": 0.38178650080305176, "grad_norm": 4.0, "learning_rate": 9.467005196275874e-06, "loss": 0.96085396, "memory(GiB)": 740.29, "step": 15050, "train_speed(iter/s)": 0.284947 }, { "acc": 0.76578507, "epoch": 0.38191334017208933, "grad_norm": 3.828125, "learning_rate": 9.46653398073884e-06, "loss": 0.9189558, "memory(GiB)": 740.29, "step": 15055, "train_speed(iter/s)": 0.28465 }, { "acc": 0.75451179, "epoch": 0.38204017954112685, "grad_norm": 3.8125, "learning_rate": 9.466062568733205e-06, "loss": 0.91439905, "memory(GiB)": 740.29, "step": 15060, "train_speed(iter/s)": 0.284405 }, { "acc": 0.75781932, "epoch": 0.3821670189101644, "grad_norm": 3.171875, "learning_rate": 9.46559096027971e-06, "loss": 0.93728895, "memory(GiB)": 740.29, "step": 15065, "train_speed(iter/s)": 0.284149 }, { "acc": 0.76930194, "epoch": 0.382293858279202, "grad_norm": 3.296875, "learning_rate": 9.465119155399094e-06, "loss": 0.91609459, "memory(GiB)": 740.29, "step": 15070, "train_speed(iter/s)": 0.283862 }, { "acc": 0.7687696, "epoch": 0.38242069764823955, "grad_norm": 3.859375, "learning_rate": 9.464647154112116e-06, "loss": 0.88979464, "memory(GiB)": 740.29, "step": 15075, "train_speed(iter/s)": 0.283623 }, { "acc": 0.77427058, "epoch": 0.3825475370172771, "grad_norm": 3.359375, "learning_rate": 9.464174956439533e-06, "loss": 0.88687057, "memory(GiB)": 740.29, "step": 15080, "train_speed(iter/s)": 0.283326 }, { "acc": 0.7795825, "epoch": 0.3826743763863147, "grad_norm": 3.71875, "learning_rate": 9.463702562402119e-06, "loss": 0.87177038, "memory(GiB)": 740.29, "step": 15085, "train_speed(iter/s)": 0.283071 }, { "acc": 0.76049881, "epoch": 0.3828012157553522, "grad_norm": 3.484375, "learning_rate": 9.463229972020652e-06, "loss": 0.96699486, "memory(GiB)": 740.29, "step": 15090, "train_speed(iter/s)": 0.282791 }, { "acc": 0.76319127, "epoch": 0.38292805512438977, "grad_norm": 3.53125, "learning_rate": 9.462757185315917e-06, "loss": 0.88303194, "memory(GiB)": 740.29, "step": 15095, "train_speed(iter/s)": 0.282518 }, { "acc": 0.76082215, "epoch": 0.38305489449342733, "grad_norm": 3.078125, "learning_rate": 9.462284202308714e-06, "loss": 0.91054077, "memory(GiB)": 740.29, "step": 15100, "train_speed(iter/s)": 0.282227 }, { "acc": 0.78180108, "epoch": 0.3831817338624649, "grad_norm": 3.828125, "learning_rate": 9.461811023019846e-06, "loss": 0.86884584, "memory(GiB)": 740.29, "step": 15105, "train_speed(iter/s)": 0.281931 }, { "acc": 0.76092076, "epoch": 0.38330857323150247, "grad_norm": 3.5, "learning_rate": 9.461337647470128e-06, "loss": 0.93096619, "memory(GiB)": 740.29, "step": 15110, "train_speed(iter/s)": 0.281635 }, { "acc": 0.76649861, "epoch": 0.38343541260054004, "grad_norm": 3.59375, "learning_rate": 9.46086407568038e-06, "loss": 0.95396986, "memory(GiB)": 740.29, "step": 15115, "train_speed(iter/s)": 0.281377 }, { "acc": 0.76420236, "epoch": 0.38356225196957755, "grad_norm": 5.875, "learning_rate": 9.460390307671436e-06, "loss": 0.90134411, "memory(GiB)": 740.29, "step": 15120, "train_speed(iter/s)": 0.281143 }, { "acc": 0.75992293, "epoch": 0.3836890913386151, "grad_norm": 3.125, "learning_rate": 9.459916343464135e-06, "loss": 0.93194666, "memory(GiB)": 740.29, "step": 15125, "train_speed(iter/s)": 0.28088 }, { "acc": 0.77670403, "epoch": 0.3838159307076527, "grad_norm": 3.5, "learning_rate": 9.459442183079322e-06, "loss": 0.86811275, "memory(GiB)": 740.29, "step": 15130, "train_speed(iter/s)": 0.280649 }, { "acc": 0.76385818, "epoch": 0.38394277007669025, "grad_norm": 4.15625, "learning_rate": 9.458967826537857e-06, "loss": 0.88687963, "memory(GiB)": 740.29, "step": 15135, "train_speed(iter/s)": 0.28041 }, { "acc": 0.7536231, "epoch": 0.3840696094457278, "grad_norm": 4.0, "learning_rate": 9.458493273860604e-06, "loss": 0.96539097, "memory(GiB)": 740.29, "step": 15140, "train_speed(iter/s)": 0.280183 }, { "acc": 0.74467068, "epoch": 0.3841964488147654, "grad_norm": 3.421875, "learning_rate": 9.458018525068437e-06, "loss": 0.96340141, "memory(GiB)": 740.29, "step": 15145, "train_speed(iter/s)": 0.279927 }, { "acc": 0.76416588, "epoch": 0.3843232881838029, "grad_norm": 3.453125, "learning_rate": 9.45754358018224e-06, "loss": 0.94586515, "memory(GiB)": 740.29, "step": 15150, "train_speed(iter/s)": 0.279691 }, { "acc": 0.75257421, "epoch": 0.38445012755284047, "grad_norm": 3.65625, "learning_rate": 9.457068439222904e-06, "loss": 0.91421642, "memory(GiB)": 740.29, "step": 15155, "train_speed(iter/s)": 0.279426 }, { "acc": 0.76768551, "epoch": 0.38457696692187804, "grad_norm": 4.3125, "learning_rate": 9.45659310221133e-06, "loss": 0.93027639, "memory(GiB)": 740.29, "step": 15160, "train_speed(iter/s)": 0.27919 }, { "acc": 0.7649539, "epoch": 0.3847038062909156, "grad_norm": 4.59375, "learning_rate": 9.456117569168421e-06, "loss": 0.90196953, "memory(GiB)": 740.29, "step": 15165, "train_speed(iter/s)": 0.278942 }, { "acc": 0.76462207, "epoch": 0.3848306456599532, "grad_norm": 3.28125, "learning_rate": 9.4556418401151e-06, "loss": 0.96542997, "memory(GiB)": 740.29, "step": 15170, "train_speed(iter/s)": 0.278689 }, { "acc": 0.761133, "epoch": 0.38495748502899074, "grad_norm": 3.59375, "learning_rate": 9.455165915072292e-06, "loss": 0.92849398, "memory(GiB)": 740.29, "step": 15175, "train_speed(iter/s)": 0.278439 }, { "acc": 0.76058626, "epoch": 0.38508432439802825, "grad_norm": 3.46875, "learning_rate": 9.454689794060929e-06, "loss": 0.99372549, "memory(GiB)": 740.29, "step": 15180, "train_speed(iter/s)": 0.278194 }, { "acc": 0.77351317, "epoch": 0.3852111637670658, "grad_norm": 4.4375, "learning_rate": 9.454213477101957e-06, "loss": 0.88091288, "memory(GiB)": 740.29, "step": 15185, "train_speed(iter/s)": 0.27798 }, { "acc": 0.76040854, "epoch": 0.3853380031361034, "grad_norm": 5.5, "learning_rate": 9.453736964216324e-06, "loss": 0.94869146, "memory(GiB)": 740.29, "step": 15190, "train_speed(iter/s)": 0.277733 }, { "acc": 0.76982093, "epoch": 0.38546484250514096, "grad_norm": 3.78125, "learning_rate": 9.453260255424995e-06, "loss": 0.91560812, "memory(GiB)": 740.29, "step": 15195, "train_speed(iter/s)": 0.277521 }, { "acc": 0.75609059, "epoch": 0.3855916818741785, "grad_norm": 3.953125, "learning_rate": 9.452783350748934e-06, "loss": 0.93485241, "memory(GiB)": 740.29, "step": 15200, "train_speed(iter/s)": 0.27728 }, { "acc": 0.76234078, "epoch": 0.3857185212432161, "grad_norm": 3.140625, "learning_rate": 9.452306250209122e-06, "loss": 0.90152121, "memory(GiB)": 740.29, "step": 15205, "train_speed(iter/s)": 0.277033 }, { "acc": 0.76077356, "epoch": 0.3858453606122536, "grad_norm": 3.78125, "learning_rate": 9.451828953826544e-06, "loss": 0.94145126, "memory(GiB)": 740.29, "step": 15210, "train_speed(iter/s)": 0.276789 }, { "acc": 0.76761246, "epoch": 0.3859721999812912, "grad_norm": 4.15625, "learning_rate": 9.451351461622194e-06, "loss": 0.9084053, "memory(GiB)": 740.29, "step": 15215, "train_speed(iter/s)": 0.276559 }, { "acc": 0.76605496, "epoch": 0.38609903935032874, "grad_norm": 3.140625, "learning_rate": 9.450873773617077e-06, "loss": 0.90729818, "memory(GiB)": 740.29, "step": 15220, "train_speed(iter/s)": 0.276302 }, { "acc": 0.751509, "epoch": 0.3862258787193663, "grad_norm": 3.390625, "learning_rate": 9.450395889832204e-06, "loss": 0.97689047, "memory(GiB)": 740.29, "step": 15225, "train_speed(iter/s)": 0.276058 }, { "acc": 0.76352816, "epoch": 0.3863527180884039, "grad_norm": 3.484375, "learning_rate": 9.449917810288594e-06, "loss": 0.90819588, "memory(GiB)": 740.29, "step": 15230, "train_speed(iter/s)": 0.275791 }, { "acc": 0.77336793, "epoch": 0.38647955745744145, "grad_norm": 3.96875, "learning_rate": 9.44943953500728e-06, "loss": 0.86776791, "memory(GiB)": 740.29, "step": 15235, "train_speed(iter/s)": 0.275572 }, { "acc": 0.77004218, "epoch": 0.38660639682647896, "grad_norm": 3.65625, "learning_rate": 9.448961064009296e-06, "loss": 0.90250492, "memory(GiB)": 740.29, "step": 15240, "train_speed(iter/s)": 0.275345 }, { "acc": 0.74923587, "epoch": 0.3867332361955165, "grad_norm": 3.15625, "learning_rate": 9.44848239731569e-06, "loss": 0.97885666, "memory(GiB)": 740.29, "step": 15245, "train_speed(iter/s)": 0.275083 }, { "acc": 0.7642487, "epoch": 0.3868600755645541, "grad_norm": 3.84375, "learning_rate": 9.448003534947518e-06, "loss": 0.92690506, "memory(GiB)": 740.29, "step": 15250, "train_speed(iter/s)": 0.274841 }, { "acc": 0.74519382, "epoch": 0.38698691493359166, "grad_norm": 3.515625, "learning_rate": 9.447524476925843e-06, "loss": 0.99059372, "memory(GiB)": 740.29, "step": 15255, "train_speed(iter/s)": 0.274594 }, { "acc": 0.76336837, "epoch": 0.38711375430262923, "grad_norm": 3.234375, "learning_rate": 9.447045223271736e-06, "loss": 0.9138649, "memory(GiB)": 740.29, "step": 15260, "train_speed(iter/s)": 0.274374 }, { "acc": 0.75399966, "epoch": 0.3872405936716668, "grad_norm": 3.75, "learning_rate": 9.446565774006279e-06, "loss": 0.94991999, "memory(GiB)": 740.29, "step": 15265, "train_speed(iter/s)": 0.274154 }, { "acc": 0.76057858, "epoch": 0.3873674330407043, "grad_norm": 3.78125, "learning_rate": 9.446086129150562e-06, "loss": 0.91257811, "memory(GiB)": 740.29, "step": 15270, "train_speed(iter/s)": 0.273905 }, { "acc": 0.77947206, "epoch": 0.3874942724097419, "grad_norm": 3.90625, "learning_rate": 9.44560628872568e-06, "loss": 0.86730824, "memory(GiB)": 740.29, "step": 15275, "train_speed(iter/s)": 0.273658 }, { "acc": 0.75716066, "epoch": 0.38762111177877945, "grad_norm": 2.75, "learning_rate": 9.445126252752746e-06, "loss": 0.97682981, "memory(GiB)": 740.29, "step": 15280, "train_speed(iter/s)": 0.273418 }, { "acc": 0.76238933, "epoch": 0.387747951147817, "grad_norm": 3.484375, "learning_rate": 9.444646021252868e-06, "loss": 0.91375551, "memory(GiB)": 740.29, "step": 15285, "train_speed(iter/s)": 0.273153 }, { "acc": 0.75895181, "epoch": 0.3878747905168546, "grad_norm": 3.34375, "learning_rate": 9.444165594247175e-06, "loss": 0.93302069, "memory(GiB)": 740.29, "step": 15290, "train_speed(iter/s)": 0.272895 }, { "acc": 0.7633399, "epoch": 0.38800162988589215, "grad_norm": 3.421875, "learning_rate": 9.443684971756796e-06, "loss": 0.84700632, "memory(GiB)": 740.29, "step": 15295, "train_speed(iter/s)": 0.272673 }, { "acc": 0.75060086, "epoch": 0.38812846925492966, "grad_norm": 3.734375, "learning_rate": 9.443204153802874e-06, "loss": 0.9697258, "memory(GiB)": 740.29, "step": 15300, "train_speed(iter/s)": 0.272424 }, { "acc": 0.75205145, "epoch": 0.38825530862396723, "grad_norm": 3.359375, "learning_rate": 9.442723140406558e-06, "loss": 0.9348258, "memory(GiB)": 740.29, "step": 15305, "train_speed(iter/s)": 0.272181 }, { "acc": 0.77880921, "epoch": 0.3883821479930048, "grad_norm": 2.765625, "learning_rate": 9.442241931589007e-06, "loss": 0.85790977, "memory(GiB)": 740.29, "step": 15310, "train_speed(iter/s)": 0.271881 }, { "acc": 0.7569654, "epoch": 0.38850898736204237, "grad_norm": 3.296875, "learning_rate": 9.441760527371387e-06, "loss": 0.92638931, "memory(GiB)": 740.29, "step": 15315, "train_speed(iter/s)": 0.27166 }, { "acc": 0.75276761, "epoch": 0.38863582673107994, "grad_norm": 3.40625, "learning_rate": 9.441278927774873e-06, "loss": 0.95965919, "memory(GiB)": 740.29, "step": 15320, "train_speed(iter/s)": 0.27142 }, { "acc": 0.76127567, "epoch": 0.3887626661001175, "grad_norm": 3.84375, "learning_rate": 9.44079713282065e-06, "loss": 0.97048006, "memory(GiB)": 740.29, "step": 15325, "train_speed(iter/s)": 0.271157 }, { "acc": 0.76435714, "epoch": 0.388889505469155, "grad_norm": 4.25, "learning_rate": 9.44031514252991e-06, "loss": 0.90413027, "memory(GiB)": 740.29, "step": 15330, "train_speed(iter/s)": 0.270937 }, { "acc": 0.75707216, "epoch": 0.3890163448381926, "grad_norm": 3.421875, "learning_rate": 9.439832956923856e-06, "loss": 0.91093407, "memory(GiB)": 740.29, "step": 15335, "train_speed(iter/s)": 0.270728 }, { "acc": 0.75948467, "epoch": 0.38914318420723015, "grad_norm": 3.90625, "learning_rate": 9.439350576023697e-06, "loss": 0.96144247, "memory(GiB)": 740.29, "step": 15340, "train_speed(iter/s)": 0.270526 }, { "acc": 0.77424827, "epoch": 0.3892700235762677, "grad_norm": 3.78125, "learning_rate": 9.438867999850647e-06, "loss": 0.90573425, "memory(GiB)": 740.29, "step": 15345, "train_speed(iter/s)": 0.270318 }, { "acc": 0.76290841, "epoch": 0.3893968629453053, "grad_norm": 3.3125, "learning_rate": 9.43838522842594e-06, "loss": 0.92579737, "memory(GiB)": 740.29, "step": 15350, "train_speed(iter/s)": 0.270083 }, { "acc": 0.76857266, "epoch": 0.38952370231434286, "grad_norm": 3.328125, "learning_rate": 9.437902261770808e-06, "loss": 0.90785427, "memory(GiB)": 740.29, "step": 15355, "train_speed(iter/s)": 0.269858 }, { "acc": 0.7717988, "epoch": 0.38965054168338037, "grad_norm": 3.234375, "learning_rate": 9.437419099906493e-06, "loss": 0.87443886, "memory(GiB)": 740.29, "step": 15360, "train_speed(iter/s)": 0.26962 }, { "acc": 0.76255207, "epoch": 0.38977738105241794, "grad_norm": 3.234375, "learning_rate": 9.436935742854254e-06, "loss": 0.88644218, "memory(GiB)": 740.29, "step": 15365, "train_speed(iter/s)": 0.269397 }, { "acc": 0.7603085, "epoch": 0.3899042204214555, "grad_norm": 3.046875, "learning_rate": 9.436452190635346e-06, "loss": 0.9232728, "memory(GiB)": 740.29, "step": 15370, "train_speed(iter/s)": 0.269165 }, { "acc": 0.75473514, "epoch": 0.3900310597904931, "grad_norm": 3.5, "learning_rate": 9.435968443271044e-06, "loss": 0.98060331, "memory(GiB)": 740.29, "step": 15375, "train_speed(iter/s)": 0.268958 }, { "acc": 0.75010166, "epoch": 0.39015789915953064, "grad_norm": 3.859375, "learning_rate": 9.435484500782622e-06, "loss": 1.03108444, "memory(GiB)": 740.29, "step": 15380, "train_speed(iter/s)": 0.268737 }, { "acc": 0.76133533, "epoch": 0.3902847385285682, "grad_norm": 3.625, "learning_rate": 9.435000363191368e-06, "loss": 0.91705799, "memory(GiB)": 740.29, "step": 15385, "train_speed(iter/s)": 0.268492 }, { "acc": 0.76008434, "epoch": 0.3904115778976057, "grad_norm": 4.40625, "learning_rate": 9.434516030518581e-06, "loss": 0.95762882, "memory(GiB)": 740.29, "step": 15390, "train_speed(iter/s)": 0.268297 }, { "acc": 0.77292428, "epoch": 0.3905384172666433, "grad_norm": 3.8125, "learning_rate": 9.434031502785564e-06, "loss": 0.91592875, "memory(GiB)": 740.29, "step": 15395, "train_speed(iter/s)": 0.26804 }, { "acc": 0.77118897, "epoch": 0.39066525663568086, "grad_norm": 3.625, "learning_rate": 9.433546780013626e-06, "loss": 0.89698372, "memory(GiB)": 740.29, "step": 15400, "train_speed(iter/s)": 0.267809 }, { "acc": 0.76405158, "epoch": 0.3907920960047184, "grad_norm": 3.34375, "learning_rate": 9.433061862224093e-06, "loss": 0.94332485, "memory(GiB)": 740.29, "step": 15405, "train_speed(iter/s)": 0.267612 }, { "acc": 0.76027989, "epoch": 0.390918935373756, "grad_norm": 3.703125, "learning_rate": 9.432576749438293e-06, "loss": 0.95922041, "memory(GiB)": 740.29, "step": 15410, "train_speed(iter/s)": 0.267395 }, { "acc": 0.75933523, "epoch": 0.39104577474279356, "grad_norm": 3.53125, "learning_rate": 9.432091441677564e-06, "loss": 0.91987276, "memory(GiB)": 740.29, "step": 15415, "train_speed(iter/s)": 0.267188 }, { "acc": 0.75888529, "epoch": 0.3911726141118311, "grad_norm": 3.359375, "learning_rate": 9.431605938963256e-06, "loss": 0.9115037, "memory(GiB)": 740.29, "step": 15420, "train_speed(iter/s)": 0.266964 }, { "acc": 0.7575181, "epoch": 0.39129945348086864, "grad_norm": 3.859375, "learning_rate": 9.43112024131672e-06, "loss": 0.93293486, "memory(GiB)": 740.29, "step": 15425, "train_speed(iter/s)": 0.266756 }, { "acc": 0.76387672, "epoch": 0.3914262928499062, "grad_norm": 3.578125, "learning_rate": 9.430634348759325e-06, "loss": 0.89884329, "memory(GiB)": 740.29, "step": 15430, "train_speed(iter/s)": 0.266527 }, { "acc": 0.76143141, "epoch": 0.3915531322189438, "grad_norm": 3.25, "learning_rate": 9.430148261312442e-06, "loss": 0.93758841, "memory(GiB)": 740.29, "step": 15435, "train_speed(iter/s)": 0.266303 }, { "acc": 0.76879368, "epoch": 0.39167997158798135, "grad_norm": 3.5, "learning_rate": 9.42966197899745e-06, "loss": 0.86700764, "memory(GiB)": 740.29, "step": 15440, "train_speed(iter/s)": 0.26605 }, { "acc": 0.76748333, "epoch": 0.3918068109570189, "grad_norm": 3.125, "learning_rate": 9.429175501835743e-06, "loss": 0.87630367, "memory(GiB)": 740.29, "step": 15445, "train_speed(iter/s)": 0.265837 }, { "acc": 0.75731058, "epoch": 0.3919336503260564, "grad_norm": 4.3125, "learning_rate": 9.42868882984872e-06, "loss": 0.96357584, "memory(GiB)": 740.29, "step": 15450, "train_speed(iter/s)": 0.265619 }, { "acc": 0.76660595, "epoch": 0.392060489695094, "grad_norm": 4.15625, "learning_rate": 9.428201963057782e-06, "loss": 0.98383465, "memory(GiB)": 740.29, "step": 15455, "train_speed(iter/s)": 0.265367 }, { "acc": 0.74529362, "epoch": 0.39218732906413156, "grad_norm": 3.78125, "learning_rate": 9.427714901484351e-06, "loss": 0.99591684, "memory(GiB)": 740.29, "step": 15460, "train_speed(iter/s)": 0.265157 }, { "acc": 0.74990072, "epoch": 0.39231416843316913, "grad_norm": 3.96875, "learning_rate": 9.427227645149848e-06, "loss": 0.93507977, "memory(GiB)": 740.29, "step": 15465, "train_speed(iter/s)": 0.264947 }, { "acc": 0.77089314, "epoch": 0.3924410078022067, "grad_norm": 3.484375, "learning_rate": 9.426740194075708e-06, "loss": 0.88285131, "memory(GiB)": 740.29, "step": 15470, "train_speed(iter/s)": 0.26473 }, { "acc": 0.75733833, "epoch": 0.39256784717124427, "grad_norm": 3.359375, "learning_rate": 9.426252548283372e-06, "loss": 0.92404461, "memory(GiB)": 740.29, "step": 15475, "train_speed(iter/s)": 0.264516 }, { "acc": 0.76501093, "epoch": 0.3926946865402818, "grad_norm": 3.515625, "learning_rate": 9.425764707794287e-06, "loss": 0.90149422, "memory(GiB)": 740.29, "step": 15480, "train_speed(iter/s)": 0.264281 }, { "acc": 0.75862484, "epoch": 0.39282152590931935, "grad_norm": 3.546875, "learning_rate": 9.425276672629914e-06, "loss": 0.93114557, "memory(GiB)": 740.29, "step": 15485, "train_speed(iter/s)": 0.264082 }, { "acc": 0.74122624, "epoch": 0.3929483652783569, "grad_norm": 3.78125, "learning_rate": 9.424788442811722e-06, "loss": 0.97495222, "memory(GiB)": 740.29, "step": 15490, "train_speed(iter/s)": 0.263872 }, { "acc": 0.74212952, "epoch": 0.3930752046473945, "grad_norm": 4.09375, "learning_rate": 9.424300018361183e-06, "loss": 0.93967285, "memory(GiB)": 740.29, "step": 15495, "train_speed(iter/s)": 0.263679 }, { "acc": 0.78183289, "epoch": 0.39320204401643205, "grad_norm": 2.96875, "learning_rate": 9.423811399299784e-06, "loss": 0.84629488, "memory(GiB)": 740.29, "step": 15500, "train_speed(iter/s)": 0.26346 }, { "epoch": 0.39320204401643205, "eval_acc": 0.750419960587832, "eval_loss": 0.8936215043067932, "eval_runtime": 1149.2224, "eval_samples_per_second": 5.543, "eval_steps_per_second": 5.543, "step": 15500 }, { "acc": 0.76283684, "epoch": 0.3933288833854696, "grad_norm": 3.53125, "learning_rate": 9.423322585649015e-06, "loss": 0.94464102, "memory(GiB)": 688.27, "step": 15505, "train_speed(iter/s)": 33.635612 }, { "acc": 0.76979184, "epoch": 0.39345572275450713, "grad_norm": 3.984375, "learning_rate": 9.42283357743038e-06, "loss": 0.93201456, "memory(GiB)": 689.79, "step": 15510, "train_speed(iter/s)": 29.693453 }, { "acc": 0.75825505, "epoch": 0.3935825621235447, "grad_norm": 2.984375, "learning_rate": 9.42234437466539e-06, "loss": 0.91768713, "memory(GiB)": 689.79, "step": 15515, "train_speed(iter/s)": 26.541218 }, { "acc": 0.75993743, "epoch": 0.39370940149258227, "grad_norm": 3.5, "learning_rate": 9.42185497737556e-06, "loss": 0.91727638, "memory(GiB)": 689.79, "step": 15520, "train_speed(iter/s)": 23.751774 }, { "acc": 0.76255655, "epoch": 0.39383624086161984, "grad_norm": 3.625, "learning_rate": 9.421365385582417e-06, "loss": 0.90654345, "memory(GiB)": 689.79, "step": 15525, "train_speed(iter/s)": 21.564893 }, { "acc": 0.76779723, "epoch": 0.3939630802306574, "grad_norm": 3.6875, "learning_rate": 9.4208755993075e-06, "loss": 0.89580727, "memory(GiB)": 689.79, "step": 15530, "train_speed(iter/s)": 19.777784 }, { "acc": 0.75145674, "epoch": 0.39408991959969497, "grad_norm": 4.09375, "learning_rate": 9.42038561857235e-06, "loss": 0.90411978, "memory(GiB)": 689.79, "step": 15535, "train_speed(iter/s)": 18.156693 }, { "acc": 0.76513891, "epoch": 0.3942167589687325, "grad_norm": 3.765625, "learning_rate": 9.41989544339852e-06, "loss": 0.92897491, "memory(GiB)": 689.79, "step": 15540, "train_speed(iter/s)": 16.818291 }, { "acc": 0.75859275, "epoch": 0.39434359833777005, "grad_norm": 4.5625, "learning_rate": 9.419405073807573e-06, "loss": 0.94706421, "memory(GiB)": 689.79, "step": 15545, "train_speed(iter/s)": 15.822772 }, { "acc": 0.76180029, "epoch": 0.3944704377068076, "grad_norm": 3.53125, "learning_rate": 9.418914509821078e-06, "loss": 0.94568567, "memory(GiB)": 689.79, "step": 15550, "train_speed(iter/s)": 14.871213 }, { "acc": 0.7694726, "epoch": 0.3945972770758452, "grad_norm": 3.609375, "learning_rate": 9.418423751460613e-06, "loss": 0.86998625, "memory(GiB)": 689.79, "step": 15555, "train_speed(iter/s)": 14.046098 }, { "acc": 0.76335497, "epoch": 0.39472411644488276, "grad_norm": 3.625, "learning_rate": 9.417932798747766e-06, "loss": 0.95571384, "memory(GiB)": 689.81, "step": 15560, "train_speed(iter/s)": 13.216528 }, { "acc": 0.75942101, "epoch": 0.3948509558139203, "grad_norm": 3.109375, "learning_rate": 9.41744165170413e-06, "loss": 0.91150694, "memory(GiB)": 689.81, "step": 15565, "train_speed(iter/s)": 12.558642 }, { "acc": 0.77584348, "epoch": 0.39497779518295784, "grad_norm": 3.296875, "learning_rate": 9.416950310351311e-06, "loss": 0.84936295, "memory(GiB)": 689.81, "step": 15570, "train_speed(iter/s)": 11.980559 }, { "acc": 0.75075636, "epoch": 0.3951046345519954, "grad_norm": 3.484375, "learning_rate": 9.416458774710922e-06, "loss": 0.94508734, "memory(GiB)": 689.81, "step": 15575, "train_speed(iter/s)": 11.45495 }, { "acc": 0.75978394, "epoch": 0.39523147392103297, "grad_norm": 3.390625, "learning_rate": 9.415967044804584e-06, "loss": 0.90012589, "memory(GiB)": 689.81, "step": 15580, "train_speed(iter/s)": 10.925585 }, { "acc": 0.76674447, "epoch": 0.39535831329007054, "grad_norm": 3.828125, "learning_rate": 9.415475120653922e-06, "loss": 0.8893095, "memory(GiB)": 691.99, "step": 15585, "train_speed(iter/s)": 10.449036 }, { "acc": 0.75381117, "epoch": 0.3954851526591081, "grad_norm": 3.046875, "learning_rate": 9.414983002280583e-06, "loss": 0.92250376, "memory(GiB)": 701.75, "step": 15590, "train_speed(iter/s)": 9.966716 }, { "acc": 0.76510835, "epoch": 0.3956119920281457, "grad_norm": 3.453125, "learning_rate": 9.414490689706207e-06, "loss": 0.8773675, "memory(GiB)": 701.75, "step": 15595, "train_speed(iter/s)": 9.558099 }, { "acc": 0.77064915, "epoch": 0.3957388313971832, "grad_norm": 3.09375, "learning_rate": 9.413998182952448e-06, "loss": 0.90165472, "memory(GiB)": 701.75, "step": 15600, "train_speed(iter/s)": 9.168005 }, { "acc": 0.7692595, "epoch": 0.39586567076622076, "grad_norm": 4.5, "learning_rate": 9.413505482040976e-06, "loss": 0.89622631, "memory(GiB)": 701.75, "step": 15605, "train_speed(iter/s)": 8.840666 }, { "acc": 0.75369563, "epoch": 0.3959925101352583, "grad_norm": 3.546875, "learning_rate": 9.41301258699346e-06, "loss": 0.92776241, "memory(GiB)": 701.75, "step": 15610, "train_speed(iter/s)": 8.538964 }, { "acc": 0.76602468, "epoch": 0.3961193495042959, "grad_norm": 4.34375, "learning_rate": 9.412519497831579e-06, "loss": 0.87977762, "memory(GiB)": 701.75, "step": 15615, "train_speed(iter/s)": 8.242992 }, { "acc": 0.77283387, "epoch": 0.39624618887333346, "grad_norm": 4.25, "learning_rate": 9.412026214577025e-06, "loss": 0.9258625, "memory(GiB)": 701.75, "step": 15620, "train_speed(iter/s)": 7.935271 }, { "acc": 0.76594939, "epoch": 0.39637302824237103, "grad_norm": 4.15625, "learning_rate": 9.411532737251496e-06, "loss": 0.93203106, "memory(GiB)": 701.75, "step": 15625, "train_speed(iter/s)": 7.684249 }, { "acc": 0.75014992, "epoch": 0.39649986761140854, "grad_norm": 3.9375, "learning_rate": 9.411039065876698e-06, "loss": 0.96445894, "memory(GiB)": 701.75, "step": 15630, "train_speed(iter/s)": 7.432402 }, { "acc": 0.75503311, "epoch": 0.3966267069804461, "grad_norm": 4.5625, "learning_rate": 9.410545200474347e-06, "loss": 0.95832748, "memory(GiB)": 701.75, "step": 15635, "train_speed(iter/s)": 7.216106 }, { "acc": 0.76722922, "epoch": 0.3967535463494837, "grad_norm": 3.328125, "learning_rate": 9.410051141066164e-06, "loss": 0.89630547, "memory(GiB)": 701.75, "step": 15640, "train_speed(iter/s)": 7.008784 }, { "acc": 0.76178608, "epoch": 0.39688038571852124, "grad_norm": 3.109375, "learning_rate": 9.409556887673883e-06, "loss": 0.94644089, "memory(GiB)": 701.75, "step": 15645, "train_speed(iter/s)": 6.794764 }, { "acc": 0.74539876, "epoch": 0.3970072250875588, "grad_norm": 3.40625, "learning_rate": 9.409062440319245e-06, "loss": 0.97532272, "memory(GiB)": 701.75, "step": 15650, "train_speed(iter/s)": 6.605621 }, { "acc": 0.76731505, "epoch": 0.3971340644565964, "grad_norm": 4.03125, "learning_rate": 9.408567799023997e-06, "loss": 0.8822916, "memory(GiB)": 701.75, "step": 15655, "train_speed(iter/s)": 6.436146 }, { "acc": 0.77228189, "epoch": 0.3972609038256339, "grad_norm": 3.921875, "learning_rate": 9.408072963809897e-06, "loss": 0.92366552, "memory(GiB)": 701.75, "step": 15660, "train_speed(iter/s)": 6.263171 }, { "acc": 0.75773387, "epoch": 0.39738774319467146, "grad_norm": 8.9375, "learning_rate": 9.407577934698714e-06, "loss": 0.93649378, "memory(GiB)": 701.75, "step": 15665, "train_speed(iter/s)": 6.090018 }, { "acc": 0.7674675, "epoch": 0.39751458256370903, "grad_norm": 3.34375, "learning_rate": 9.407082711712221e-06, "loss": 0.90911856, "memory(GiB)": 701.75, "step": 15670, "train_speed(iter/s)": 5.929794 }, { "acc": 0.77356133, "epoch": 0.3976414219327466, "grad_norm": 3.75, "learning_rate": 9.406587294872203e-06, "loss": 0.88286877, "memory(GiB)": 701.75, "step": 15675, "train_speed(iter/s)": 5.797642 }, { "acc": 0.75700569, "epoch": 0.39776826130178417, "grad_norm": 3.34375, "learning_rate": 9.406091684200448e-06, "loss": 0.94122305, "memory(GiB)": 701.75, "step": 15680, "train_speed(iter/s)": 5.659927 }, { "acc": 0.76025181, "epoch": 0.39789510067082173, "grad_norm": 3.28125, "learning_rate": 9.40559587971876e-06, "loss": 0.95114193, "memory(GiB)": 701.75, "step": 15685, "train_speed(iter/s)": 5.522005 }, { "acc": 0.76230569, "epoch": 0.39802194003985925, "grad_norm": 3.421875, "learning_rate": 9.405099881448944e-06, "loss": 0.92564116, "memory(GiB)": 701.75, "step": 15690, "train_speed(iter/s)": 5.384617 }, { "acc": 0.77218037, "epoch": 0.3981487794088968, "grad_norm": 3.71875, "learning_rate": 9.40460368941282e-06, "loss": 0.90835133, "memory(GiB)": 701.75, "step": 15695, "train_speed(iter/s)": 5.255703 }, { "acc": 0.75961661, "epoch": 0.3982756187779344, "grad_norm": 3.453125, "learning_rate": 9.404107303632215e-06, "loss": 0.91281767, "memory(GiB)": 701.75, "step": 15700, "train_speed(iter/s)": 5.140352 }, { "acc": 0.7652216, "epoch": 0.39840245814697195, "grad_norm": 3.46875, "learning_rate": 9.403610724128963e-06, "loss": 0.94585924, "memory(GiB)": 701.75, "step": 15705, "train_speed(iter/s)": 5.029636 }, { "acc": 0.75758877, "epoch": 0.3985292975160095, "grad_norm": 2.828125, "learning_rate": 9.403113950924903e-06, "loss": 0.95825281, "memory(GiB)": 701.75, "step": 15710, "train_speed(iter/s)": 4.917714 }, { "acc": 0.75503092, "epoch": 0.3986561368850471, "grad_norm": 4.03125, "learning_rate": 9.402616984041893e-06, "loss": 0.92621012, "memory(GiB)": 701.75, "step": 15715, "train_speed(iter/s)": 4.823262 }, { "acc": 0.76366086, "epoch": 0.3987829762540846, "grad_norm": 3.140625, "learning_rate": 9.402119823501787e-06, "loss": 0.92741766, "memory(GiB)": 701.75, "step": 15720, "train_speed(iter/s)": 4.720936 }, { "acc": 0.74892764, "epoch": 0.39890981562312217, "grad_norm": 3.40625, "learning_rate": 9.401622469326457e-06, "loss": 0.94824114, "memory(GiB)": 701.75, "step": 15725, "train_speed(iter/s)": 4.6314 }, { "acc": 0.75577536, "epoch": 0.39903665499215973, "grad_norm": 3.96875, "learning_rate": 9.401124921537777e-06, "loss": 0.94918804, "memory(GiB)": 701.75, "step": 15730, "train_speed(iter/s)": 4.544827 }, { "acc": 0.76696849, "epoch": 0.3991634943611973, "grad_norm": 3.390625, "learning_rate": 9.400627180157636e-06, "loss": 0.87158337, "memory(GiB)": 701.75, "step": 15735, "train_speed(iter/s)": 4.45583 }, { "acc": 0.75365434, "epoch": 0.39929033373023487, "grad_norm": 3.71875, "learning_rate": 9.400129245207928e-06, "loss": 0.94563942, "memory(GiB)": 701.75, "step": 15740, "train_speed(iter/s)": 4.373062 }, { "acc": 0.75547428, "epoch": 0.39941717309927244, "grad_norm": 3.625, "learning_rate": 9.399631116710553e-06, "loss": 0.92961321, "memory(GiB)": 701.75, "step": 15745, "train_speed(iter/s)": 4.290282 }, { "acc": 0.76505332, "epoch": 0.39954401246830995, "grad_norm": 3.65625, "learning_rate": 9.399132794687423e-06, "loss": 0.94599361, "memory(GiB)": 701.75, "step": 15750, "train_speed(iter/s)": 4.210462 }, { "acc": 0.75900249, "epoch": 0.3996708518373475, "grad_norm": 3.625, "learning_rate": 9.398634279160456e-06, "loss": 0.94890099, "memory(GiB)": 701.75, "step": 15755, "train_speed(iter/s)": 4.143633 }, { "acc": 0.75627155, "epoch": 0.3997976912063851, "grad_norm": 2.734375, "learning_rate": 9.398135570151585e-06, "loss": 0.94092588, "memory(GiB)": 701.75, "step": 15760, "train_speed(iter/s)": 4.072273 }, { "acc": 0.76143298, "epoch": 0.39992453057542265, "grad_norm": 4.125, "learning_rate": 9.397636667682743e-06, "loss": 0.91804991, "memory(GiB)": 701.75, "step": 15765, "train_speed(iter/s)": 4.000103 }, { "acc": 0.75508785, "epoch": 0.4000513699444602, "grad_norm": 3.59375, "learning_rate": 9.397137571775876e-06, "loss": 0.93980532, "memory(GiB)": 701.75, "step": 15770, "train_speed(iter/s)": 3.936488 }, { "acc": 0.75342398, "epoch": 0.4001782093134978, "grad_norm": 3.09375, "learning_rate": 9.396638282452936e-06, "loss": 0.96258087, "memory(GiB)": 701.75, "step": 15775, "train_speed(iter/s)": 3.871648 }, { "acc": 0.76549773, "epoch": 0.4003050486825353, "grad_norm": 3.515625, "learning_rate": 9.396138799735887e-06, "loss": 0.90960703, "memory(GiB)": 712.22, "step": 15780, "train_speed(iter/s)": 3.805342 }, { "acc": 0.77089286, "epoch": 0.40043188805157287, "grad_norm": 3.109375, "learning_rate": 9.395639123646699e-06, "loss": 0.89888887, "memory(GiB)": 712.22, "step": 15785, "train_speed(iter/s)": 3.749332 }, { "acc": 0.75138078, "epoch": 0.40055872742061044, "grad_norm": 3.84375, "learning_rate": 9.395139254207351e-06, "loss": 0.9628027, "memory(GiB)": 712.22, "step": 15790, "train_speed(iter/s)": 3.698111 }, { "acc": 0.76455259, "epoch": 0.400685566789648, "grad_norm": 3.453125, "learning_rate": 9.394639191439832e-06, "loss": 0.91995354, "memory(GiB)": 725.08, "step": 15795, "train_speed(iter/s)": 3.631859 }, { "acc": 0.76280608, "epoch": 0.4008124061586856, "grad_norm": 3.125, "learning_rate": 9.394138935366137e-06, "loss": 0.91450682, "memory(GiB)": 725.08, "step": 15800, "train_speed(iter/s)": 3.581664 }, { "acc": 0.762323, "epoch": 0.40093924552772314, "grad_norm": 3.609375, "learning_rate": 9.393638486008269e-06, "loss": 0.91169538, "memory(GiB)": 725.08, "step": 15805, "train_speed(iter/s)": 3.526753 }, { "acc": 0.76446452, "epoch": 0.40106608489676066, "grad_norm": 3.59375, "learning_rate": 9.393137843388245e-06, "loss": 0.92298069, "memory(GiB)": 725.08, "step": 15810, "train_speed(iter/s)": 3.476073 }, { "acc": 0.75068974, "epoch": 0.4011929242657982, "grad_norm": 3.171875, "learning_rate": 9.392637007528085e-06, "loss": 0.97484255, "memory(GiB)": 725.08, "step": 15815, "train_speed(iter/s)": 3.426103 }, { "acc": 0.76800461, "epoch": 0.4013197636348358, "grad_norm": 3.265625, "learning_rate": 9.392135978449818e-06, "loss": 0.91515913, "memory(GiB)": 725.08, "step": 15820, "train_speed(iter/s)": 3.38015 }, { "acc": 0.7530766, "epoch": 0.40144660300387336, "grad_norm": 3.109375, "learning_rate": 9.391634756175483e-06, "loss": 0.93624001, "memory(GiB)": 725.08, "step": 15825, "train_speed(iter/s)": 3.330079 }, { "acc": 0.75591278, "epoch": 0.4015734423729109, "grad_norm": 3.25, "learning_rate": 9.391133340727127e-06, "loss": 0.88320312, "memory(GiB)": 725.08, "step": 15830, "train_speed(iter/s)": 3.282804 }, { "acc": 0.7576335, "epoch": 0.4017002817419485, "grad_norm": 3.890625, "learning_rate": 9.39063173212681e-06, "loss": 0.93955498, "memory(GiB)": 725.08, "step": 15835, "train_speed(iter/s)": 3.238835 }, { "acc": 0.76936798, "epoch": 0.401827121110986, "grad_norm": 3.640625, "learning_rate": 9.39012993039659e-06, "loss": 0.86063042, "memory(GiB)": 725.08, "step": 15840, "train_speed(iter/s)": 3.199779 }, { "acc": 0.75701537, "epoch": 0.4019539604800236, "grad_norm": 3.59375, "learning_rate": 9.38962793555854e-06, "loss": 0.92512569, "memory(GiB)": 725.08, "step": 15845, "train_speed(iter/s)": 3.158046 }, { "acc": 0.7764678, "epoch": 0.40208079984906114, "grad_norm": 4.09375, "learning_rate": 9.389125747634747e-06, "loss": 0.83948126, "memory(GiB)": 725.08, "step": 15850, "train_speed(iter/s)": 3.117945 }, { "acc": 0.76356072, "epoch": 0.4022076392180987, "grad_norm": 3.234375, "learning_rate": 9.388623366647295e-06, "loss": 0.8801405, "memory(GiB)": 725.08, "step": 15855, "train_speed(iter/s)": 3.079625 }, { "acc": 0.77902546, "epoch": 0.4023344785871363, "grad_norm": 3.203125, "learning_rate": 9.388120792618284e-06, "loss": 0.84392757, "memory(GiB)": 725.08, "step": 15860, "train_speed(iter/s)": 3.039318 }, { "acc": 0.74616971, "epoch": 0.40246131795617385, "grad_norm": 3.71875, "learning_rate": 9.387618025569823e-06, "loss": 1.0223402, "memory(GiB)": 725.08, "step": 15865, "train_speed(iter/s)": 2.999289 }, { "acc": 0.75797062, "epoch": 0.40258815732521136, "grad_norm": 3.59375, "learning_rate": 9.387115065524022e-06, "loss": 0.91711178, "memory(GiB)": 725.08, "step": 15870, "train_speed(iter/s)": 2.965419 }, { "acc": 0.76134119, "epoch": 0.40271499669424893, "grad_norm": 5.90625, "learning_rate": 9.386611912503008e-06, "loss": 0.92775431, "memory(GiB)": 725.08, "step": 15875, "train_speed(iter/s)": 2.930519 }, { "acc": 0.75609231, "epoch": 0.4028418360632865, "grad_norm": 4.21875, "learning_rate": 9.386108566528913e-06, "loss": 1.0085619, "memory(GiB)": 725.08, "step": 15880, "train_speed(iter/s)": 2.896424 }, { "acc": 0.75663791, "epoch": 0.40296867543232406, "grad_norm": 3.40625, "learning_rate": 9.385605027623878e-06, "loss": 0.93563051, "memory(GiB)": 725.08, "step": 15885, "train_speed(iter/s)": 2.85947 }, { "acc": 0.75678849, "epoch": 0.40309551480136163, "grad_norm": 3.859375, "learning_rate": 9.38510129581005e-06, "loss": 0.96065617, "memory(GiB)": 725.08, "step": 15890, "train_speed(iter/s)": 2.828408 }, { "acc": 0.77631197, "epoch": 0.4032223541703992, "grad_norm": 4.90625, "learning_rate": 9.38459737110959e-06, "loss": 0.86628456, "memory(GiB)": 725.08, "step": 15895, "train_speed(iter/s)": 2.7939 }, { "acc": 0.75304041, "epoch": 0.4033491935394367, "grad_norm": 3.1875, "learning_rate": 9.384093253544659e-06, "loss": 0.94093103, "memory(GiB)": 725.08, "step": 15900, "train_speed(iter/s)": 2.760439 }, { "acc": 0.75139585, "epoch": 0.4034760329084743, "grad_norm": 4.71875, "learning_rate": 9.383588943137436e-06, "loss": 0.90265236, "memory(GiB)": 725.08, "step": 15905, "train_speed(iter/s)": 2.733142 }, { "acc": 0.76708412, "epoch": 0.40360287227751185, "grad_norm": 4.28125, "learning_rate": 9.383084439910101e-06, "loss": 0.92349091, "memory(GiB)": 725.08, "step": 15910, "train_speed(iter/s)": 2.700826 }, { "acc": 0.76506672, "epoch": 0.4037297116465494, "grad_norm": 4.4375, "learning_rate": 9.38257974388485e-06, "loss": 0.89883966, "memory(GiB)": 725.08, "step": 15915, "train_speed(iter/s)": 2.671626 }, { "acc": 0.76575747, "epoch": 0.403856551015587, "grad_norm": 4.15625, "learning_rate": 9.382074855083875e-06, "loss": 0.90464745, "memory(GiB)": 725.08, "step": 15920, "train_speed(iter/s)": 2.641273 }, { "acc": 0.75874639, "epoch": 0.40398339038462455, "grad_norm": 3.859375, "learning_rate": 9.381569773529394e-06, "loss": 0.89224396, "memory(GiB)": 725.08, "step": 15925, "train_speed(iter/s)": 2.61184 }, { "acc": 0.76274896, "epoch": 0.40411022975366206, "grad_norm": 3.828125, "learning_rate": 9.381064499243617e-06, "loss": 0.90175095, "memory(GiB)": 725.08, "step": 15930, "train_speed(iter/s)": 2.583012 }, { "acc": 0.76757259, "epoch": 0.40423706912269963, "grad_norm": 3.40625, "learning_rate": 9.380559032248771e-06, "loss": 0.93390503, "memory(GiB)": 725.08, "step": 15935, "train_speed(iter/s)": 2.555614 }, { "acc": 0.76410947, "epoch": 0.4043639084917372, "grad_norm": 3.0, "learning_rate": 9.380053372567091e-06, "loss": 0.90084763, "memory(GiB)": 725.08, "step": 15940, "train_speed(iter/s)": 2.527667 }, { "acc": 0.7538548, "epoch": 0.40449074786077477, "grad_norm": 3.5625, "learning_rate": 9.379547520220818e-06, "loss": 0.94042082, "memory(GiB)": 725.08, "step": 15945, "train_speed(iter/s)": 2.501609 }, { "acc": 0.7621429, "epoch": 0.40461758722981234, "grad_norm": 4.0625, "learning_rate": 9.379041475232204e-06, "loss": 0.91561184, "memory(GiB)": 725.08, "step": 15950, "train_speed(iter/s)": 2.478546 }, { "acc": 0.75830722, "epoch": 0.4047444265988499, "grad_norm": 3.390625, "learning_rate": 9.37853523762351e-06, "loss": 0.92459621, "memory(GiB)": 725.08, "step": 15955, "train_speed(iter/s)": 2.454286 }, { "acc": 0.76086397, "epoch": 0.4048712659678874, "grad_norm": 3.21875, "learning_rate": 9.378028807417002e-06, "loss": 0.8928091, "memory(GiB)": 725.08, "step": 15960, "train_speed(iter/s)": 2.430202 }, { "acc": 0.77759151, "epoch": 0.404998105336925, "grad_norm": 3.859375, "learning_rate": 9.377522184634952e-06, "loss": 0.89825287, "memory(GiB)": 725.08, "step": 15965, "train_speed(iter/s)": 2.406896 }, { "acc": 0.75674558, "epoch": 0.40512494470596255, "grad_norm": 3.53125, "learning_rate": 9.377015369299651e-06, "loss": 0.95030394, "memory(GiB)": 725.08, "step": 15970, "train_speed(iter/s)": 2.38364 }, { "acc": 0.77248969, "epoch": 0.4052517840750001, "grad_norm": 4.21875, "learning_rate": 9.376508361433391e-06, "loss": 0.88603067, "memory(GiB)": 725.08, "step": 15975, "train_speed(iter/s)": 2.363573 }, { "acc": 0.76284537, "epoch": 0.4053786234440377, "grad_norm": 3.390625, "learning_rate": 9.37600116105847e-06, "loss": 0.87357559, "memory(GiB)": 725.08, "step": 15980, "train_speed(iter/s)": 2.343058 }, { "acc": 0.77604485, "epoch": 0.40550546281307526, "grad_norm": 3.734375, "learning_rate": 9.375493768197203e-06, "loss": 0.90491667, "memory(GiB)": 725.08, "step": 15985, "train_speed(iter/s)": 2.321806 }, { "acc": 0.7570365, "epoch": 0.40563230218211277, "grad_norm": 4.5, "learning_rate": 9.374986182871905e-06, "loss": 0.94456654, "memory(GiB)": 725.08, "step": 15990, "train_speed(iter/s)": 2.300191 }, { "acc": 0.76588211, "epoch": 0.40575914155115034, "grad_norm": 2.8125, "learning_rate": 9.374478405104906e-06, "loss": 0.91270885, "memory(GiB)": 725.08, "step": 15995, "train_speed(iter/s)": 2.277191 }, { "acc": 0.764993, "epoch": 0.4058859809201879, "grad_norm": 3.953125, "learning_rate": 9.37397043491854e-06, "loss": 0.89336138, "memory(GiB)": 725.08, "step": 16000, "train_speed(iter/s)": 2.256893 }, { "epoch": 0.4058859809201879, "eval_acc": 0.7507997110721287, "eval_loss": 0.8918020725250244, "eval_runtime": 1149.3129, "eval_samples_per_second": 5.542, "eval_steps_per_second": 5.542, "step": 16000 }, { "acc": 0.76054859, "epoch": 0.4060128202892255, "grad_norm": 3.40625, "learning_rate": 9.37346227233515e-06, "loss": 0.90746355, "memory(GiB)": 725.09, "step": 16005, "train_speed(iter/s)": 1.787701 }, { "acc": 0.74984288, "epoch": 0.40613965965826304, "grad_norm": 3.265625, "learning_rate": 9.372953917377088e-06, "loss": 0.95427265, "memory(GiB)": 725.09, "step": 16010, "train_speed(iter/s)": 1.775299 }, { "acc": 0.76820145, "epoch": 0.4062664990273006, "grad_norm": 3.875, "learning_rate": 9.372445370066718e-06, "loss": 0.88353176, "memory(GiB)": 725.09, "step": 16015, "train_speed(iter/s)": 1.763436 }, { "acc": 0.76852331, "epoch": 0.4063933383963381, "grad_norm": 3.34375, "learning_rate": 9.371936630426407e-06, "loss": 0.93488598, "memory(GiB)": 725.09, "step": 16020, "train_speed(iter/s)": 1.750226 }, { "acc": 0.77281451, "epoch": 0.4065201777653757, "grad_norm": 3.265625, "learning_rate": 9.371427698478534e-06, "loss": 0.90638037, "memory(GiB)": 725.09, "step": 16025, "train_speed(iter/s)": 1.738798 }, { "acc": 0.7616293, "epoch": 0.40664701713441326, "grad_norm": 3.734375, "learning_rate": 9.370918574245483e-06, "loss": 0.92091799, "memory(GiB)": 725.09, "step": 16030, "train_speed(iter/s)": 1.725908 }, { "acc": 0.74417439, "epoch": 0.4067738565034508, "grad_norm": 3.5, "learning_rate": 9.370409257749653e-06, "loss": 0.94567766, "memory(GiB)": 725.09, "step": 16035, "train_speed(iter/s)": 1.713701 }, { "acc": 0.75466509, "epoch": 0.4069006958724884, "grad_norm": 3.203125, "learning_rate": 9.369899749013443e-06, "loss": 0.93269691, "memory(GiB)": 725.09, "step": 16040, "train_speed(iter/s)": 1.702316 }, { "acc": 0.77590003, "epoch": 0.40702753524152596, "grad_norm": 3.265625, "learning_rate": 9.369390048059267e-06, "loss": 0.89073315, "memory(GiB)": 725.09, "step": 16045, "train_speed(iter/s)": 1.690665 }, { "acc": 0.77148967, "epoch": 0.4071543746105635, "grad_norm": 3.78125, "learning_rate": 9.368880154909544e-06, "loss": 0.92292175, "memory(GiB)": 725.09, "step": 16050, "train_speed(iter/s)": 1.679852 }, { "acc": 0.76629591, "epoch": 0.40728121397960104, "grad_norm": 3.984375, "learning_rate": 9.368370069586704e-06, "loss": 0.90781212, "memory(GiB)": 725.09, "step": 16055, "train_speed(iter/s)": 1.66993 }, { "acc": 0.75922894, "epoch": 0.4074080533486386, "grad_norm": 3.8125, "learning_rate": 9.367859792113183e-06, "loss": 0.91650391, "memory(GiB)": 725.09, "step": 16060, "train_speed(iter/s)": 1.658153 }, { "acc": 0.76090679, "epoch": 0.4075348927176762, "grad_norm": 3.609375, "learning_rate": 9.367349322511425e-06, "loss": 0.92961311, "memory(GiB)": 725.09, "step": 16065, "train_speed(iter/s)": 1.647542 }, { "acc": 0.76460681, "epoch": 0.40766173208671375, "grad_norm": 3.96875, "learning_rate": 9.366838660803888e-06, "loss": 0.91562195, "memory(GiB)": 725.09, "step": 16070, "train_speed(iter/s)": 1.637002 }, { "acc": 0.75321493, "epoch": 0.4077885714557513, "grad_norm": 3.609375, "learning_rate": 9.366327807013028e-06, "loss": 0.92633686, "memory(GiB)": 725.09, "step": 16075, "train_speed(iter/s)": 1.625704 }, { "acc": 0.76552186, "epoch": 0.4079154108247888, "grad_norm": 3.8125, "learning_rate": 9.365816761161325e-06, "loss": 0.88125458, "memory(GiB)": 725.09, "step": 16080, "train_speed(iter/s)": 1.615138 }, { "acc": 0.76542916, "epoch": 0.4080422501938264, "grad_norm": 4.21875, "learning_rate": 9.36530552327125e-06, "loss": 0.89019909, "memory(GiB)": 725.09, "step": 16085, "train_speed(iter/s)": 1.604652 }, { "acc": 0.76260891, "epoch": 0.40816908956286396, "grad_norm": 3.515625, "learning_rate": 9.364794093365293e-06, "loss": 0.95072021, "memory(GiB)": 725.09, "step": 16090, "train_speed(iter/s)": 1.593996 }, { "acc": 0.74947896, "epoch": 0.40829592893190153, "grad_norm": 3.21875, "learning_rate": 9.364282471465954e-06, "loss": 0.97246628, "memory(GiB)": 725.09, "step": 16095, "train_speed(iter/s)": 1.58421 }, { "acc": 0.76238713, "epoch": 0.4084227683009391, "grad_norm": 3.484375, "learning_rate": 9.363770657595732e-06, "loss": 0.88632622, "memory(GiB)": 725.09, "step": 16100, "train_speed(iter/s)": 1.574506 }, { "acc": 0.76538067, "epoch": 0.40854960766997667, "grad_norm": 3.828125, "learning_rate": 9.363258651777142e-06, "loss": 0.90459394, "memory(GiB)": 725.09, "step": 16105, "train_speed(iter/s)": 1.565022 }, { "acc": 0.76296129, "epoch": 0.4086764470390142, "grad_norm": 3.125, "learning_rate": 9.362746454032708e-06, "loss": 0.91831245, "memory(GiB)": 725.09, "step": 16110, "train_speed(iter/s)": 1.554946 }, { "acc": 0.76351938, "epoch": 0.40880328640805175, "grad_norm": 3.796875, "learning_rate": 9.362234064384958e-06, "loss": 0.93681746, "memory(GiB)": 725.09, "step": 16115, "train_speed(iter/s)": 1.545846 }, { "acc": 0.75800672, "epoch": 0.4089301257770893, "grad_norm": 3.25, "learning_rate": 9.36172148285643e-06, "loss": 0.91033993, "memory(GiB)": 725.09, "step": 16120, "train_speed(iter/s)": 1.535973 }, { "acc": 0.75347729, "epoch": 0.4090569651461269, "grad_norm": 3.8125, "learning_rate": 9.36120870946967e-06, "loss": 0.92657042, "memory(GiB)": 725.09, "step": 16125, "train_speed(iter/s)": 1.52667 }, { "acc": 0.75524535, "epoch": 0.40918380451516445, "grad_norm": 3.640625, "learning_rate": 9.360695744247235e-06, "loss": 0.92225199, "memory(GiB)": 725.09, "step": 16130, "train_speed(iter/s)": 1.518254 }, { "acc": 0.76434789, "epoch": 0.409310643884202, "grad_norm": 3.421875, "learning_rate": 9.360182587211689e-06, "loss": 0.94715509, "memory(GiB)": 725.09, "step": 16135, "train_speed(iter/s)": 1.509819 }, { "acc": 0.7548367, "epoch": 0.40943748325323953, "grad_norm": 3.46875, "learning_rate": 9.359669238385603e-06, "loss": 0.95061064, "memory(GiB)": 725.09, "step": 16140, "train_speed(iter/s)": 1.501129 }, { "acc": 0.76369843, "epoch": 0.4095643226222771, "grad_norm": 3.40625, "learning_rate": 9.359155697791558e-06, "loss": 0.93595524, "memory(GiB)": 725.09, "step": 16145, "train_speed(iter/s)": 1.492267 }, { "acc": 0.75212364, "epoch": 0.40969116199131467, "grad_norm": 3.375, "learning_rate": 9.358641965452144e-06, "loss": 0.92680979, "memory(GiB)": 725.09, "step": 16150, "train_speed(iter/s)": 1.484136 }, { "acc": 0.7599648, "epoch": 0.40981800136035224, "grad_norm": 3.765625, "learning_rate": 9.358128041389956e-06, "loss": 0.93552418, "memory(GiB)": 725.09, "step": 16155, "train_speed(iter/s)": 1.475713 }, { "acc": 0.75078974, "epoch": 0.4099448407293898, "grad_norm": 4.25, "learning_rate": 9.357613925627602e-06, "loss": 0.92087727, "memory(GiB)": 725.09, "step": 16160, "train_speed(iter/s)": 1.467512 }, { "acc": 0.75293236, "epoch": 0.41007168009842737, "grad_norm": 5.9375, "learning_rate": 9.357099618187696e-06, "loss": 0.96797867, "memory(GiB)": 725.09, "step": 16165, "train_speed(iter/s)": 1.460128 }, { "acc": 0.76308169, "epoch": 0.4101985194674649, "grad_norm": 4.09375, "learning_rate": 9.35658511909286e-06, "loss": 0.92649279, "memory(GiB)": 725.09, "step": 16170, "train_speed(iter/s)": 1.451642 }, { "acc": 0.76412678, "epoch": 0.41032535883650245, "grad_norm": 3.65625, "learning_rate": 9.356070428365725e-06, "loss": 0.86352119, "memory(GiB)": 725.09, "step": 16175, "train_speed(iter/s)": 1.443568 }, { "acc": 0.76174011, "epoch": 0.41045219820554, "grad_norm": 3.234375, "learning_rate": 9.355555546028931e-06, "loss": 0.92960472, "memory(GiB)": 725.09, "step": 16180, "train_speed(iter/s)": 1.435363 }, { "acc": 0.77808828, "epoch": 0.4105790375745776, "grad_norm": 4.03125, "learning_rate": 9.355040472105127e-06, "loss": 0.89492435, "memory(GiB)": 725.09, "step": 16185, "train_speed(iter/s)": 1.427859 }, { "acc": 0.75294833, "epoch": 0.41070587694361516, "grad_norm": 3.28125, "learning_rate": 9.354525206616967e-06, "loss": 0.93531742, "memory(GiB)": 725.09, "step": 16190, "train_speed(iter/s)": 1.420149 }, { "acc": 0.76672678, "epoch": 0.4108327163126527, "grad_norm": 3.296875, "learning_rate": 9.35400974958712e-06, "loss": 0.86689339, "memory(GiB)": 725.09, "step": 16195, "train_speed(iter/s)": 1.411703 }, { "acc": 0.747159, "epoch": 0.41095955568169024, "grad_norm": 3.3125, "learning_rate": 9.353494101038255e-06, "loss": 0.99104223, "memory(GiB)": 725.09, "step": 16200, "train_speed(iter/s)": 1.40452 }, { "acc": 0.7742516, "epoch": 0.4110863950507278, "grad_norm": 3.15625, "learning_rate": 9.352978260993057e-06, "loss": 0.88364429, "memory(GiB)": 725.09, "step": 16205, "train_speed(iter/s)": 1.396426 }, { "acc": 0.76590896, "epoch": 0.41121323441976537, "grad_norm": 3.234375, "learning_rate": 9.352462229474212e-06, "loss": 0.89324646, "memory(GiB)": 725.09, "step": 16210, "train_speed(iter/s)": 1.388704 }, { "acc": 0.77461133, "epoch": 0.41134007378880294, "grad_norm": 4.5, "learning_rate": 9.351946006504424e-06, "loss": 0.88684664, "memory(GiB)": 725.09, "step": 16215, "train_speed(iter/s)": 1.381179 }, { "acc": 0.77628741, "epoch": 0.4114669131578405, "grad_norm": 3.234375, "learning_rate": 9.351429592106397e-06, "loss": 0.85645647, "memory(GiB)": 725.09, "step": 16220, "train_speed(iter/s)": 1.373781 }, { "acc": 0.76714854, "epoch": 0.4115937525268781, "grad_norm": 4.09375, "learning_rate": 9.350912986302846e-06, "loss": 0.9148777, "memory(GiB)": 725.09, "step": 16225, "train_speed(iter/s)": 1.366793 }, { "acc": 0.7647213, "epoch": 0.4117205918959156, "grad_norm": 3.203125, "learning_rate": 9.350396189116497e-06, "loss": 0.90999479, "memory(GiB)": 725.09, "step": 16230, "train_speed(iter/s)": 1.359478 }, { "acc": 0.7629137, "epoch": 0.41184743126495316, "grad_norm": 4.21875, "learning_rate": 9.34987920057008e-06, "loss": 0.94169922, "memory(GiB)": 725.09, "step": 16235, "train_speed(iter/s)": 1.352823 }, { "acc": 0.76497416, "epoch": 0.4119742706339907, "grad_norm": 3.03125, "learning_rate": 9.349362020686335e-06, "loss": 0.88113146, "memory(GiB)": 725.09, "step": 16240, "train_speed(iter/s)": 1.346055 }, { "acc": 0.75922246, "epoch": 0.4121011100030283, "grad_norm": 3.359375, "learning_rate": 9.348844649488014e-06, "loss": 0.95648975, "memory(GiB)": 725.09, "step": 16245, "train_speed(iter/s)": 1.339593 }, { "acc": 0.75074892, "epoch": 0.41222794937206586, "grad_norm": 3.953125, "learning_rate": 9.348327086997874e-06, "loss": 0.94072638, "memory(GiB)": 738.51, "step": 16250, "train_speed(iter/s)": 1.332094 }, { "acc": 0.7569674, "epoch": 0.41235478874110343, "grad_norm": 3.671875, "learning_rate": 9.347809333238679e-06, "loss": 0.9538744, "memory(GiB)": 738.51, "step": 16255, "train_speed(iter/s)": 1.324852 }, { "acc": 0.75012541, "epoch": 0.41248162811014094, "grad_norm": 3.578125, "learning_rate": 9.347291388233204e-06, "loss": 0.91447954, "memory(GiB)": 738.51, "step": 16260, "train_speed(iter/s)": 1.31867 }, { "acc": 0.77616558, "epoch": 0.4126084674791785, "grad_norm": 3.421875, "learning_rate": 9.346773252004233e-06, "loss": 0.91511641, "memory(GiB)": 738.51, "step": 16265, "train_speed(iter/s)": 1.311855 }, { "acc": 0.75494485, "epoch": 0.4127353068482161, "grad_norm": 3.234375, "learning_rate": 9.346254924574556e-06, "loss": 0.93094168, "memory(GiB)": 738.51, "step": 16270, "train_speed(iter/s)": 1.304798 }, { "acc": 0.76655097, "epoch": 0.41286214621725364, "grad_norm": 2.984375, "learning_rate": 9.345736405966972e-06, "loss": 0.87581263, "memory(GiB)": 738.51, "step": 16275, "train_speed(iter/s)": 1.297503 }, { "acc": 0.75194817, "epoch": 0.4129889855862912, "grad_norm": 3.234375, "learning_rate": 9.345217696204292e-06, "loss": 0.90046282, "memory(GiB)": 738.51, "step": 16280, "train_speed(iter/s)": 1.290631 }, { "acc": 0.76416764, "epoch": 0.4131158249553288, "grad_norm": 3.84375, "learning_rate": 9.344698795309328e-06, "loss": 0.95129776, "memory(GiB)": 738.51, "step": 16285, "train_speed(iter/s)": 1.284097 }, { "acc": 0.76106977, "epoch": 0.4132426643243663, "grad_norm": 3.953125, "learning_rate": 9.344179703304909e-06, "loss": 0.93353357, "memory(GiB)": 738.51, "step": 16290, "train_speed(iter/s)": 1.277459 }, { "acc": 0.75770369, "epoch": 0.41336950369340386, "grad_norm": 3.8125, "learning_rate": 9.343660420213864e-06, "loss": 0.93180046, "memory(GiB)": 738.51, "step": 16295, "train_speed(iter/s)": 1.270413 }, { "acc": 0.7647521, "epoch": 0.41349634306244143, "grad_norm": 3.375, "learning_rate": 9.343140946059039e-06, "loss": 0.91200352, "memory(GiB)": 738.51, "step": 16300, "train_speed(iter/s)": 1.264436 }, { "acc": 0.76513534, "epoch": 0.413623182431479, "grad_norm": 2.859375, "learning_rate": 9.342621280863281e-06, "loss": 0.898174, "memory(GiB)": 738.51, "step": 16305, "train_speed(iter/s)": 1.258286 }, { "acc": 0.76177025, "epoch": 0.41375002180051657, "grad_norm": 2.921875, "learning_rate": 9.342101424649449e-06, "loss": 0.89108696, "memory(GiB)": 738.51, "step": 16310, "train_speed(iter/s)": 1.252158 }, { "acc": 0.76936035, "epoch": 0.41387686116955413, "grad_norm": 3.625, "learning_rate": 9.34158137744041e-06, "loss": 0.88949594, "memory(GiB)": 738.51, "step": 16315, "train_speed(iter/s)": 1.246608 }, { "acc": 0.75667191, "epoch": 0.41400370053859165, "grad_norm": 3.140625, "learning_rate": 9.34106113925904e-06, "loss": 0.93877096, "memory(GiB)": 738.51, "step": 16320, "train_speed(iter/s)": 1.240742 }, { "acc": 0.76579723, "epoch": 0.4141305399076292, "grad_norm": 3.96875, "learning_rate": 9.340540710128222e-06, "loss": 0.90065613, "memory(GiB)": 738.51, "step": 16325, "train_speed(iter/s)": 1.234176 }, { "acc": 0.76222224, "epoch": 0.4142573792766668, "grad_norm": 3.4375, "learning_rate": 9.340020090070848e-06, "loss": 0.88136749, "memory(GiB)": 738.51, "step": 16330, "train_speed(iter/s)": 1.22785 }, { "acc": 0.75053067, "epoch": 0.41438421864570435, "grad_norm": 4.25, "learning_rate": 9.339499279109818e-06, "loss": 0.98137035, "memory(GiB)": 738.51, "step": 16335, "train_speed(iter/s)": 1.222626 }, { "acc": 0.75941443, "epoch": 0.4145110580147419, "grad_norm": 3.40625, "learning_rate": 9.33897827726804e-06, "loss": 0.93568201, "memory(GiB)": 738.51, "step": 16340, "train_speed(iter/s)": 1.216601 }, { "acc": 0.76523786, "epoch": 0.4146378973837795, "grad_norm": 3.3125, "learning_rate": 9.338457084568435e-06, "loss": 0.88667669, "memory(GiB)": 738.51, "step": 16345, "train_speed(iter/s)": 1.210505 }, { "acc": 0.77550211, "epoch": 0.414764736752817, "grad_norm": 3.578125, "learning_rate": 9.337935701033925e-06, "loss": 0.92931414, "memory(GiB)": 738.51, "step": 16350, "train_speed(iter/s)": 1.204615 }, { "acc": 0.75978251, "epoch": 0.41489157612185457, "grad_norm": 4.1875, "learning_rate": 9.337414126687443e-06, "loss": 0.98204956, "memory(GiB)": 738.51, "step": 16355, "train_speed(iter/s)": 1.199463 }, { "acc": 0.76569037, "epoch": 0.41501841549089213, "grad_norm": 3.90625, "learning_rate": 9.336892361551936e-06, "loss": 0.9218071, "memory(GiB)": 738.51, "step": 16360, "train_speed(iter/s)": 1.194308 }, { "acc": 0.75278363, "epoch": 0.4151452548599297, "grad_norm": 3.625, "learning_rate": 9.33637040565035e-06, "loss": 0.94456539, "memory(GiB)": 738.51, "step": 16365, "train_speed(iter/s)": 1.188946 }, { "acc": 0.75666599, "epoch": 0.41527209422896727, "grad_norm": 3.515625, "learning_rate": 9.33584825900565e-06, "loss": 0.9007966, "memory(GiB)": 738.51, "step": 16370, "train_speed(iter/s)": 1.183777 }, { "acc": 0.77144675, "epoch": 0.41539893359800484, "grad_norm": 3.109375, "learning_rate": 9.335325921640797e-06, "loss": 0.92689342, "memory(GiB)": 738.51, "step": 16375, "train_speed(iter/s)": 1.179068 }, { "acc": 0.75795894, "epoch": 0.41552577296704235, "grad_norm": 3.421875, "learning_rate": 9.33480339357877e-06, "loss": 0.91783504, "memory(GiB)": 738.51, "step": 16380, "train_speed(iter/s)": 1.174085 }, { "acc": 0.73220921, "epoch": 0.4156526123360799, "grad_norm": 3.671875, "learning_rate": 9.334280674842554e-06, "loss": 1.02803402, "memory(GiB)": 738.51, "step": 16385, "train_speed(iter/s)": 1.168468 }, { "acc": 0.75359273, "epoch": 0.4157794517051175, "grad_norm": 3.25, "learning_rate": 9.33375776545514e-06, "loss": 0.95289955, "memory(GiB)": 738.51, "step": 16390, "train_speed(iter/s)": 1.163486 }, { "acc": 0.76277127, "epoch": 0.41590629107415505, "grad_norm": 3.3125, "learning_rate": 9.333234665439531e-06, "loss": 0.92442465, "memory(GiB)": 738.51, "step": 16395, "train_speed(iter/s)": 1.158531 }, { "acc": 0.76319838, "epoch": 0.4160331304431926, "grad_norm": 3.921875, "learning_rate": 9.332711374818737e-06, "loss": 0.86429443, "memory(GiB)": 738.51, "step": 16400, "train_speed(iter/s)": 1.153655 }, { "acc": 0.76266847, "epoch": 0.4161599698122302, "grad_norm": 3.5, "learning_rate": 9.332187893615771e-06, "loss": 0.90871544, "memory(GiB)": 738.51, "step": 16405, "train_speed(iter/s)": 1.14896 }, { "acc": 0.76117125, "epoch": 0.4162868091812677, "grad_norm": 3.15625, "learning_rate": 9.331664221853664e-06, "loss": 0.91311922, "memory(GiB)": 738.51, "step": 16410, "train_speed(iter/s)": 1.144061 }, { "acc": 0.76452918, "epoch": 0.41641364855030527, "grad_norm": 5.09375, "learning_rate": 9.331140359555451e-06, "loss": 0.95699377, "memory(GiB)": 738.51, "step": 16415, "train_speed(iter/s)": 1.1392 }, { "acc": 0.75632553, "epoch": 0.41654048791934284, "grad_norm": 3.546875, "learning_rate": 9.330616306744173e-06, "loss": 0.9366086, "memory(GiB)": 738.51, "step": 16420, "train_speed(iter/s)": 1.134378 }, { "acc": 0.75363197, "epoch": 0.4166673272883804, "grad_norm": 3.40625, "learning_rate": 9.330092063442882e-06, "loss": 0.97414608, "memory(GiB)": 738.51, "step": 16425, "train_speed(iter/s)": 1.129476 }, { "acc": 0.77278028, "epoch": 0.416794166657418, "grad_norm": 3.28125, "learning_rate": 9.329567629674636e-06, "loss": 0.87355957, "memory(GiB)": 738.51, "step": 16430, "train_speed(iter/s)": 1.12467 }, { "acc": 0.75700459, "epoch": 0.41692100602645554, "grad_norm": 3.328125, "learning_rate": 9.329043005462506e-06, "loss": 0.89693155, "memory(GiB)": 738.51, "step": 16435, "train_speed(iter/s)": 1.11985 }, { "acc": 0.7767766, "epoch": 0.41704784539549306, "grad_norm": 3.234375, "learning_rate": 9.328518190829567e-06, "loss": 0.84964294, "memory(GiB)": 738.51, "step": 16440, "train_speed(iter/s)": 1.114682 }, { "acc": 0.76156702, "epoch": 0.4171746847645306, "grad_norm": 3.625, "learning_rate": 9.327993185798905e-06, "loss": 0.88761415, "memory(GiB)": 738.51, "step": 16445, "train_speed(iter/s)": 1.109679 }, { "acc": 0.75219245, "epoch": 0.4173015241335682, "grad_norm": 3.09375, "learning_rate": 9.32746799039361e-06, "loss": 0.96091356, "memory(GiB)": 738.51, "step": 16450, "train_speed(iter/s)": 1.105145 }, { "acc": 0.769103, "epoch": 0.41742836350260576, "grad_norm": 3.8125, "learning_rate": 9.326942604636789e-06, "loss": 0.89685335, "memory(GiB)": 738.51, "step": 16455, "train_speed(iter/s)": 1.100473 }, { "acc": 0.76854177, "epoch": 0.4175552028716433, "grad_norm": 3.875, "learning_rate": 9.326417028551549e-06, "loss": 0.89348936, "memory(GiB)": 738.51, "step": 16460, "train_speed(iter/s)": 1.095895 }, { "acc": 0.75669484, "epoch": 0.4176820422406809, "grad_norm": 3.640625, "learning_rate": 9.325891262161009e-06, "loss": 0.92092361, "memory(GiB)": 738.51, "step": 16465, "train_speed(iter/s)": 1.090791 }, { "acc": 0.7624351, "epoch": 0.4178088816097184, "grad_norm": 3.359375, "learning_rate": 9.325365305488294e-06, "loss": 0.88901014, "memory(GiB)": 738.51, "step": 16470, "train_speed(iter/s)": 1.086421 }, { "acc": 0.75899282, "epoch": 0.417935720978756, "grad_norm": 3.984375, "learning_rate": 9.324839158556542e-06, "loss": 0.96036434, "memory(GiB)": 738.51, "step": 16475, "train_speed(iter/s)": 1.081953 }, { "acc": 0.76265669, "epoch": 0.41806256034779354, "grad_norm": 6.03125, "learning_rate": 9.324312821388895e-06, "loss": 0.906178, "memory(GiB)": 738.51, "step": 16480, "train_speed(iter/s)": 1.077914 }, { "acc": 0.74784203, "epoch": 0.4181893997168311, "grad_norm": 3.59375, "learning_rate": 9.323786294008504e-06, "loss": 0.94842329, "memory(GiB)": 738.51, "step": 16485, "train_speed(iter/s)": 1.073395 }, { "acc": 0.75624647, "epoch": 0.4183162390858687, "grad_norm": 3.609375, "learning_rate": 9.32325957643853e-06, "loss": 0.9185092, "memory(GiB)": 738.51, "step": 16490, "train_speed(iter/s)": 1.068771 }, { "acc": 0.7676785, "epoch": 0.41844307845490625, "grad_norm": 3.515625, "learning_rate": 9.322732668702143e-06, "loss": 0.90572233, "memory(GiB)": 738.51, "step": 16495, "train_speed(iter/s)": 1.063788 }, { "acc": 0.77183318, "epoch": 0.41856991782394376, "grad_norm": 4.1875, "learning_rate": 9.32220557082252e-06, "loss": 0.90734463, "memory(GiB)": 738.51, "step": 16500, "train_speed(iter/s)": 1.059044 }, { "epoch": 0.41856991782394376, "eval_acc": 0.7507863425182262, "eval_loss": 0.8906607031822205, "eval_runtime": 1153.9051, "eval_samples_per_second": 5.52, "eval_steps_per_second": 5.52, "step": 16500 }, { "acc": 0.7642879, "epoch": 0.41869675719298133, "grad_norm": 4.09375, "learning_rate": 9.321678282822845e-06, "loss": 0.91134825, "memory(GiB)": 738.51, "step": 16505, "train_speed(iter/s)": 0.945831 }, { "acc": 0.7595077, "epoch": 0.4188235965620189, "grad_norm": 3.53125, "learning_rate": 9.321150804726311e-06, "loss": 0.95756006, "memory(GiB)": 738.51, "step": 16510, "train_speed(iter/s)": 0.942519 }, { "acc": 0.7647943, "epoch": 0.41895043593105646, "grad_norm": 2.921875, "learning_rate": 9.320623136556122e-06, "loss": 0.93557987, "memory(GiB)": 738.51, "step": 16515, "train_speed(iter/s)": 0.939424 }, { "acc": 0.76082182, "epoch": 0.41907727530009403, "grad_norm": 3.484375, "learning_rate": 9.320095278335488e-06, "loss": 0.91936798, "memory(GiB)": 738.51, "step": 16520, "train_speed(iter/s)": 0.935987 }, { "acc": 0.76604376, "epoch": 0.4192041146691316, "grad_norm": 3.75, "learning_rate": 9.319567230087624e-06, "loss": 0.90411644, "memory(GiB)": 738.51, "step": 16525, "train_speed(iter/s)": 0.932736 }, { "acc": 0.77899518, "epoch": 0.4193309540381691, "grad_norm": 3.203125, "learning_rate": 9.319038991835765e-06, "loss": 0.869028, "memory(GiB)": 738.51, "step": 16530, "train_speed(iter/s)": 0.929454 }, { "acc": 0.76092677, "epoch": 0.4194577934072067, "grad_norm": 3.28125, "learning_rate": 9.31851056360314e-06, "loss": 0.8811574, "memory(GiB)": 738.51, "step": 16535, "train_speed(iter/s)": 0.926333 }, { "acc": 0.73496189, "epoch": 0.41958463277624425, "grad_norm": 3.421875, "learning_rate": 9.317981945412994e-06, "loss": 0.98745356, "memory(GiB)": 738.51, "step": 16540, "train_speed(iter/s)": 0.923194 }, { "acc": 0.74791627, "epoch": 0.4197114721452818, "grad_norm": 3.359375, "learning_rate": 9.31745313728858e-06, "loss": 0.94898357, "memory(GiB)": 738.51, "step": 16545, "train_speed(iter/s)": 0.919901 }, { "acc": 0.75619168, "epoch": 0.4198383115143194, "grad_norm": 4.0, "learning_rate": 9.316924139253161e-06, "loss": 0.92508879, "memory(GiB)": 738.51, "step": 16550, "train_speed(iter/s)": 0.91697 }, { "acc": 0.75445323, "epoch": 0.41996515088335695, "grad_norm": 3.875, "learning_rate": 9.316394951330003e-06, "loss": 0.88687143, "memory(GiB)": 738.51, "step": 16555, "train_speed(iter/s)": 0.913676 }, { "acc": 0.75428629, "epoch": 0.42009199025239446, "grad_norm": 3.578125, "learning_rate": 9.315865573542383e-06, "loss": 0.91777077, "memory(GiB)": 738.51, "step": 16560, "train_speed(iter/s)": 0.910472 }, { "acc": 0.75471382, "epoch": 0.42021882962143203, "grad_norm": 3.625, "learning_rate": 9.315336005913588e-06, "loss": 1.02253799, "memory(GiB)": 738.51, "step": 16565, "train_speed(iter/s)": 0.907358 }, { "acc": 0.75992641, "epoch": 0.4203456689904696, "grad_norm": 3.6875, "learning_rate": 9.314806248466912e-06, "loss": 0.91666441, "memory(GiB)": 752.04, "step": 16570, "train_speed(iter/s)": 0.904007 }, { "acc": 0.75105362, "epoch": 0.42047250835950717, "grad_norm": 3.640625, "learning_rate": 9.314276301225656e-06, "loss": 0.96577473, "memory(GiB)": 752.04, "step": 16575, "train_speed(iter/s)": 0.901125 }, { "acc": 0.75566568, "epoch": 0.42059934772854474, "grad_norm": 2.96875, "learning_rate": 9.313746164213133e-06, "loss": 0.95137491, "memory(GiB)": 752.04, "step": 16580, "train_speed(iter/s)": 0.89802 }, { "acc": 0.75309672, "epoch": 0.4207261870975823, "grad_norm": 3.28125, "learning_rate": 9.313215837452661e-06, "loss": 1.00488424, "memory(GiB)": 752.04, "step": 16585, "train_speed(iter/s)": 0.895177 }, { "acc": 0.76150217, "epoch": 0.4208530264666198, "grad_norm": 3.21875, "learning_rate": 9.312685320967566e-06, "loss": 0.92919245, "memory(GiB)": 752.04, "step": 16590, "train_speed(iter/s)": 0.891948 }, { "acc": 0.78006153, "epoch": 0.4209798658356574, "grad_norm": 3.421875, "learning_rate": 9.312154614781183e-06, "loss": 0.87409019, "memory(GiB)": 752.04, "step": 16595, "train_speed(iter/s)": 0.88934 }, { "acc": 0.76757083, "epoch": 0.42110670520469495, "grad_norm": 3.390625, "learning_rate": 9.31162371891686e-06, "loss": 0.93020077, "memory(GiB)": 752.04, "step": 16600, "train_speed(iter/s)": 0.886347 }, { "acc": 0.75810337, "epoch": 0.4212335445737325, "grad_norm": 3.890625, "learning_rate": 9.311092633397947e-06, "loss": 0.96029949, "memory(GiB)": 752.04, "step": 16605, "train_speed(iter/s)": 0.88377 }, { "acc": 0.76518798, "epoch": 0.4213603839427701, "grad_norm": 3.140625, "learning_rate": 9.310561358247805e-06, "loss": 0.87269697, "memory(GiB)": 752.04, "step": 16610, "train_speed(iter/s)": 0.880883 }, { "acc": 0.7738122, "epoch": 0.42148722331180766, "grad_norm": 3.75, "learning_rate": 9.310029893489804e-06, "loss": 0.89423752, "memory(GiB)": 752.04, "step": 16615, "train_speed(iter/s)": 0.878265 }, { "acc": 0.75596924, "epoch": 0.42161406268084517, "grad_norm": 2.90625, "learning_rate": 9.309498239147319e-06, "loss": 0.92177315, "memory(GiB)": 752.04, "step": 16620, "train_speed(iter/s)": 0.875541 }, { "acc": 0.76471148, "epoch": 0.42174090204988274, "grad_norm": 3.171875, "learning_rate": 9.308966395243737e-06, "loss": 0.87997732, "memory(GiB)": 752.04, "step": 16625, "train_speed(iter/s)": 0.872817 }, { "acc": 0.76569023, "epoch": 0.4218677414189203, "grad_norm": 3.59375, "learning_rate": 9.308434361802455e-06, "loss": 0.8419095, "memory(GiB)": 752.04, "step": 16630, "train_speed(iter/s)": 0.86974 }, { "acc": 0.75855484, "epoch": 0.4219945807879579, "grad_norm": 3.546875, "learning_rate": 9.307902138846872e-06, "loss": 0.87605181, "memory(GiB)": 752.04, "step": 16635, "train_speed(iter/s)": 0.86716 }, { "acc": 0.74962716, "epoch": 0.42212142015699544, "grad_norm": 3.09375, "learning_rate": 9.307369726400399e-06, "loss": 0.97733212, "memory(GiB)": 752.04, "step": 16640, "train_speed(iter/s)": 0.864273 }, { "acc": 0.75166054, "epoch": 0.422248259526033, "grad_norm": 2.5625, "learning_rate": 9.306837124486455e-06, "loss": 0.91459723, "memory(GiB)": 752.04, "step": 16645, "train_speed(iter/s)": 0.861083 }, { "acc": 0.76113348, "epoch": 0.4223750988950705, "grad_norm": 3.8125, "learning_rate": 9.30630433312847e-06, "loss": 0.93016272, "memory(GiB)": 752.04, "step": 16650, "train_speed(iter/s)": 0.858486 }, { "acc": 0.7607779, "epoch": 0.4225019382641081, "grad_norm": 3.9375, "learning_rate": 9.305771352349877e-06, "loss": 0.90249929, "memory(GiB)": 752.04, "step": 16655, "train_speed(iter/s)": 0.855559 }, { "acc": 0.76335125, "epoch": 0.42262877763314566, "grad_norm": 3.015625, "learning_rate": 9.305238182174121e-06, "loss": 0.94009342, "memory(GiB)": 752.04, "step": 16660, "train_speed(iter/s)": 0.852924 }, { "acc": 0.75765276, "epoch": 0.4227556170021832, "grad_norm": 3.703125, "learning_rate": 9.304704822624656e-06, "loss": 0.89940224, "memory(GiB)": 752.04, "step": 16665, "train_speed(iter/s)": 0.849809 }, { "acc": 0.77308612, "epoch": 0.4228824563712208, "grad_norm": 3.453125, "learning_rate": 9.304171273724942e-06, "loss": 0.8768033, "memory(GiB)": 752.04, "step": 16670, "train_speed(iter/s)": 0.847156 }, { "acc": 0.77375054, "epoch": 0.42300929574025836, "grad_norm": 3.921875, "learning_rate": 9.303637535498445e-06, "loss": 0.88723688, "memory(GiB)": 752.04, "step": 16675, "train_speed(iter/s)": 0.84461 }, { "acc": 0.75877581, "epoch": 0.4231361351092959, "grad_norm": 3.96875, "learning_rate": 9.303103607968647e-06, "loss": 0.92537403, "memory(GiB)": 752.04, "step": 16680, "train_speed(iter/s)": 0.84221 }, { "acc": 0.75334749, "epoch": 0.42326297447833344, "grad_norm": 3.140625, "learning_rate": 9.302569491159032e-06, "loss": 0.93089409, "memory(GiB)": 752.04, "step": 16685, "train_speed(iter/s)": 0.839693 }, { "acc": 0.75605149, "epoch": 0.423389813847371, "grad_norm": 4.59375, "learning_rate": 9.302035185093092e-06, "loss": 0.92447357, "memory(GiB)": 752.04, "step": 16690, "train_speed(iter/s)": 0.837148 }, { "acc": 0.76670775, "epoch": 0.4235166532164086, "grad_norm": 3.640625, "learning_rate": 9.301500689794334e-06, "loss": 0.87230682, "memory(GiB)": 752.04, "step": 16695, "train_speed(iter/s)": 0.834503 }, { "acc": 0.76726856, "epoch": 0.42364349258544615, "grad_norm": 3.3125, "learning_rate": 9.300966005286263e-06, "loss": 0.90847616, "memory(GiB)": 752.04, "step": 16700, "train_speed(iter/s)": 0.831805 }, { "acc": 0.756673, "epoch": 0.4237703319544837, "grad_norm": 3.46875, "learning_rate": 9.300431131592404e-06, "loss": 0.9441535, "memory(GiB)": 752.04, "step": 16705, "train_speed(iter/s)": 0.829252 }, { "acc": 0.75977774, "epoch": 0.4238971713235212, "grad_norm": 4.0625, "learning_rate": 9.299896068736281e-06, "loss": 0.91797304, "memory(GiB)": 752.04, "step": 16710, "train_speed(iter/s)": 0.826872 }, { "acc": 0.76273084, "epoch": 0.4240240106925588, "grad_norm": 3.546875, "learning_rate": 9.299360816741428e-06, "loss": 0.92499304, "memory(GiB)": 752.04, "step": 16715, "train_speed(iter/s)": 0.824273 }, { "acc": 0.7620595, "epoch": 0.42415085006159636, "grad_norm": 3.390625, "learning_rate": 9.298825375631394e-06, "loss": 0.9034337, "memory(GiB)": 752.04, "step": 16720, "train_speed(iter/s)": 0.821546 }, { "acc": 0.75370593, "epoch": 0.42427768943063393, "grad_norm": 3.578125, "learning_rate": 9.298289745429728e-06, "loss": 0.94746723, "memory(GiB)": 752.04, "step": 16725, "train_speed(iter/s)": 0.818854 }, { "acc": 0.7540307, "epoch": 0.4244045287996715, "grad_norm": 3.65625, "learning_rate": 9.29775392615999e-06, "loss": 0.89816217, "memory(GiB)": 752.04, "step": 16730, "train_speed(iter/s)": 0.816543 }, { "acc": 0.76118679, "epoch": 0.42453136816870907, "grad_norm": 3.953125, "learning_rate": 9.29721791784575e-06, "loss": 0.9329566, "memory(GiB)": 752.04, "step": 16735, "train_speed(iter/s)": 0.81422 }, { "acc": 0.76504612, "epoch": 0.4246582075377466, "grad_norm": 3.453125, "learning_rate": 9.296681720510586e-06, "loss": 0.87678909, "memory(GiB)": 752.04, "step": 16740, "train_speed(iter/s)": 0.811779 }, { "acc": 0.76255937, "epoch": 0.42478504690678415, "grad_norm": 3.40625, "learning_rate": 9.296145334178085e-06, "loss": 0.9154376, "memory(GiB)": 752.04, "step": 16745, "train_speed(iter/s)": 0.809568 }, { "acc": 0.76390824, "epoch": 0.4249118862758217, "grad_norm": 4.75, "learning_rate": 9.295608758871837e-06, "loss": 0.91762161, "memory(GiB)": 752.04, "step": 16750, "train_speed(iter/s)": 0.807053 }, { "acc": 0.75976591, "epoch": 0.4250387256448593, "grad_norm": 3.28125, "learning_rate": 9.295071994615447e-06, "loss": 0.9051137, "memory(GiB)": 752.04, "step": 16755, "train_speed(iter/s)": 0.804661 }, { "acc": 0.7662364, "epoch": 0.42516556501389685, "grad_norm": 4.125, "learning_rate": 9.294535041432525e-06, "loss": 0.94627104, "memory(GiB)": 752.04, "step": 16760, "train_speed(iter/s)": 0.802441 }, { "acc": 0.75861454, "epoch": 0.4252924043829344, "grad_norm": 3.9375, "learning_rate": 9.29399789934669e-06, "loss": 0.96450453, "memory(GiB)": 752.04, "step": 16765, "train_speed(iter/s)": 0.800007 }, { "acc": 0.75411601, "epoch": 0.42541924375197193, "grad_norm": 3.703125, "learning_rate": 9.29346056838157e-06, "loss": 0.92933464, "memory(GiB)": 752.04, "step": 16770, "train_speed(iter/s)": 0.797464 }, { "acc": 0.76811004, "epoch": 0.4255460831210095, "grad_norm": 3.453125, "learning_rate": 9.292923048560797e-06, "loss": 0.8883256, "memory(GiB)": 752.04, "step": 16775, "train_speed(iter/s)": 0.795415 }, { "acc": 0.76170321, "epoch": 0.42567292249004707, "grad_norm": 3.375, "learning_rate": 9.292385339908018e-06, "loss": 0.94448252, "memory(GiB)": 752.04, "step": 16780, "train_speed(iter/s)": 0.793182 }, { "acc": 0.76803608, "epoch": 0.42579976185908464, "grad_norm": 3.421875, "learning_rate": 9.291847442446884e-06, "loss": 0.91381645, "memory(GiB)": 752.04, "step": 16785, "train_speed(iter/s)": 0.790772 }, { "acc": 0.75273385, "epoch": 0.4259266012281222, "grad_norm": 3.421875, "learning_rate": 9.291309356201058e-06, "loss": 0.90931187, "memory(GiB)": 752.04, "step": 16790, "train_speed(iter/s)": 0.78865 }, { "acc": 0.760502, "epoch": 0.42605344059715977, "grad_norm": 3.671875, "learning_rate": 9.290771081194203e-06, "loss": 0.9447835, "memory(GiB)": 752.04, "step": 16795, "train_speed(iter/s)": 0.786285 }, { "acc": 0.76021204, "epoch": 0.4261802799661973, "grad_norm": 4.0, "learning_rate": 9.290232617450002e-06, "loss": 0.93934307, "memory(GiB)": 752.04, "step": 16800, "train_speed(iter/s)": 0.783864 }, { "acc": 0.76434655, "epoch": 0.42630711933523485, "grad_norm": 3.0625, "learning_rate": 9.289693964992137e-06, "loss": 0.90341444, "memory(GiB)": 752.04, "step": 16805, "train_speed(iter/s)": 0.781747 }, { "acc": 0.77956305, "epoch": 0.4264339587042724, "grad_norm": 3.625, "learning_rate": 9.289155123844302e-06, "loss": 0.90035381, "memory(GiB)": 752.04, "step": 16810, "train_speed(iter/s)": 0.77954 }, { "acc": 0.76763144, "epoch": 0.42656079807331, "grad_norm": 5.125, "learning_rate": 9.288616094030198e-06, "loss": 0.88087997, "memory(GiB)": 752.04, "step": 16815, "train_speed(iter/s)": 0.777373 }, { "acc": 0.76224618, "epoch": 0.42668763744234756, "grad_norm": 3.421875, "learning_rate": 9.288076875573537e-06, "loss": 0.91785631, "memory(GiB)": 752.04, "step": 16820, "train_speed(iter/s)": 0.775131 }, { "acc": 0.76773858, "epoch": 0.4268144768113851, "grad_norm": 3.515625, "learning_rate": 9.287537468498038e-06, "loss": 0.87247887, "memory(GiB)": 752.04, "step": 16825, "train_speed(iter/s)": 0.77296 }, { "acc": 0.75557237, "epoch": 0.42694131618042264, "grad_norm": 3.625, "learning_rate": 9.286997872827426e-06, "loss": 0.95427389, "memory(GiB)": 752.04, "step": 16830, "train_speed(iter/s)": 0.770796 }, { "acc": 0.76462746, "epoch": 0.4270681555494602, "grad_norm": 3.609375, "learning_rate": 9.286458088585437e-06, "loss": 0.87454824, "memory(GiB)": 752.04, "step": 16835, "train_speed(iter/s)": 0.768754 }, { "acc": 0.76404762, "epoch": 0.4271949949184978, "grad_norm": 4.625, "learning_rate": 9.285918115795814e-06, "loss": 0.97550812, "memory(GiB)": 752.04, "step": 16840, "train_speed(iter/s)": 0.766624 }, { "acc": 0.76044559, "epoch": 0.42732183428753534, "grad_norm": 4.8125, "learning_rate": 9.285377954482309e-06, "loss": 0.9426631, "memory(GiB)": 752.04, "step": 16845, "train_speed(iter/s)": 0.764517 }, { "acc": 0.75284977, "epoch": 0.4274486736565729, "grad_norm": 3.9375, "learning_rate": 9.284837604668682e-06, "loss": 0.94207935, "memory(GiB)": 752.04, "step": 16850, "train_speed(iter/s)": 0.762204 }, { "acc": 0.76622486, "epoch": 0.4275755130256105, "grad_norm": 3.75, "learning_rate": 9.2842970663787e-06, "loss": 0.95510674, "memory(GiB)": 752.04, "step": 16855, "train_speed(iter/s)": 0.760153 }, { "acc": 0.77089543, "epoch": 0.427702352394648, "grad_norm": 3.4375, "learning_rate": 9.283756339636141e-06, "loss": 0.87655487, "memory(GiB)": 752.04, "step": 16860, "train_speed(iter/s)": 0.758151 }, { "acc": 0.77055888, "epoch": 0.42782919176368556, "grad_norm": 3.4375, "learning_rate": 9.28321542446479e-06, "loss": 0.88990135, "memory(GiB)": 752.04, "step": 16865, "train_speed(iter/s)": 0.756146 }, { "acc": 0.75618305, "epoch": 0.4279560311327231, "grad_norm": 3.46875, "learning_rate": 9.282674320888441e-06, "loss": 0.96294298, "memory(GiB)": 752.04, "step": 16870, "train_speed(iter/s)": 0.753885 }, { "acc": 0.76751275, "epoch": 0.4280828705017607, "grad_norm": 3.9375, "learning_rate": 9.282133028930892e-06, "loss": 0.91186876, "memory(GiB)": 752.04, "step": 16875, "train_speed(iter/s)": 0.751872 }, { "acc": 0.74640231, "epoch": 0.42820970987079826, "grad_norm": 4.84375, "learning_rate": 9.281591548615955e-06, "loss": 1.0243516, "memory(GiB)": 752.04, "step": 16880, "train_speed(iter/s)": 0.749971 }, { "acc": 0.74708362, "epoch": 0.42833654923983583, "grad_norm": 3.765625, "learning_rate": 9.281049879967449e-06, "loss": 0.98390131, "memory(GiB)": 752.04, "step": 16885, "train_speed(iter/s)": 0.748012 }, { "acc": 0.76734414, "epoch": 0.42846338860887334, "grad_norm": 11.875, "learning_rate": 9.280508023009198e-06, "loss": 0.86379175, "memory(GiB)": 752.04, "step": 16890, "train_speed(iter/s)": 0.746161 }, { "acc": 0.75650539, "epoch": 0.4285902279779109, "grad_norm": 3.4375, "learning_rate": 9.279965977765037e-06, "loss": 0.95064707, "memory(GiB)": 752.04, "step": 16895, "train_speed(iter/s)": 0.744246 }, { "acc": 0.74352264, "epoch": 0.4287170673469485, "grad_norm": 3.59375, "learning_rate": 9.27942374425881e-06, "loss": 0.95996628, "memory(GiB)": 752.04, "step": 16900, "train_speed(iter/s)": 0.742372 }, { "acc": 0.75343275, "epoch": 0.42884390671598605, "grad_norm": 3.515625, "learning_rate": 9.278881322514368e-06, "loss": 0.91236382, "memory(GiB)": 752.04, "step": 16905, "train_speed(iter/s)": 0.740238 }, { "acc": 0.76255116, "epoch": 0.4289707460850236, "grad_norm": 3.8125, "learning_rate": 9.27833871255557e-06, "loss": 0.95483522, "memory(GiB)": 752.04, "step": 16910, "train_speed(iter/s)": 0.738441 }, { "acc": 0.76499972, "epoch": 0.4290975854540612, "grad_norm": 3.53125, "learning_rate": 9.277795914406284e-06, "loss": 0.92518911, "memory(GiB)": 752.04, "step": 16915, "train_speed(iter/s)": 0.736776 }, { "acc": 0.76947045, "epoch": 0.4292244248230987, "grad_norm": 3.5, "learning_rate": 9.277252928090386e-06, "loss": 0.93368673, "memory(GiB)": 752.04, "step": 16920, "train_speed(iter/s)": 0.734964 }, { "acc": 0.74321585, "epoch": 0.42935126419213626, "grad_norm": 3.578125, "learning_rate": 9.276709753631758e-06, "loss": 0.98927002, "memory(GiB)": 752.04, "step": 16925, "train_speed(iter/s)": 0.73319 }, { "acc": 0.74919853, "epoch": 0.42947810356117383, "grad_norm": 3.40625, "learning_rate": 9.276166391054295e-06, "loss": 0.89783354, "memory(GiB)": 752.04, "step": 16930, "train_speed(iter/s)": 0.731112 }, { "acc": 0.75746307, "epoch": 0.4296049429302114, "grad_norm": 3.046875, "learning_rate": 9.275622840381897e-06, "loss": 0.93282213, "memory(GiB)": 752.04, "step": 16935, "train_speed(iter/s)": 0.729153 }, { "acc": 0.7662921, "epoch": 0.42973178229924897, "grad_norm": 3.640625, "learning_rate": 9.275079101638473e-06, "loss": 0.9256628, "memory(GiB)": 752.04, "step": 16940, "train_speed(iter/s)": 0.727415 }, { "acc": 0.76170845, "epoch": 0.42985862166828653, "grad_norm": 3.9375, "learning_rate": 9.274535174847942e-06, "loss": 0.90086508, "memory(GiB)": 752.04, "step": 16945, "train_speed(iter/s)": 0.725432 }, { "acc": 0.7751976, "epoch": 0.42998546103732405, "grad_norm": 4.34375, "learning_rate": 9.273991060034228e-06, "loss": 0.85407124, "memory(GiB)": 752.04, "step": 16950, "train_speed(iter/s)": 0.723524 }, { "acc": 0.75338368, "epoch": 0.4301123004063616, "grad_norm": 3.84375, "learning_rate": 9.273446757221264e-06, "loss": 0.94367647, "memory(GiB)": 752.04, "step": 16955, "train_speed(iter/s)": 0.721798 }, { "acc": 0.75984764, "epoch": 0.4302391397753992, "grad_norm": 3.515625, "learning_rate": 9.272902266432993e-06, "loss": 0.89492788, "memory(GiB)": 752.04, "step": 16960, "train_speed(iter/s)": 0.719938 }, { "acc": 0.76830554, "epoch": 0.43036597914443675, "grad_norm": 4.4375, "learning_rate": 9.272357587693367e-06, "loss": 0.90827866, "memory(GiB)": 752.04, "step": 16965, "train_speed(iter/s)": 0.718313 }, { "acc": 0.75107775, "epoch": 0.4304928185134743, "grad_norm": 3.03125, "learning_rate": 9.271812721026342e-06, "loss": 0.9771718, "memory(GiB)": 752.04, "step": 16970, "train_speed(iter/s)": 0.716414 }, { "acc": 0.77086883, "epoch": 0.4306196578825119, "grad_norm": 3.390625, "learning_rate": 9.271267666455887e-06, "loss": 0.88362446, "memory(GiB)": 752.04, "step": 16975, "train_speed(iter/s)": 0.714454 }, { "acc": 0.75164852, "epoch": 0.4307464972515494, "grad_norm": 2.984375, "learning_rate": 9.270722424005975e-06, "loss": 0.94090652, "memory(GiB)": 752.04, "step": 16980, "train_speed(iter/s)": 0.71256 }, { "acc": 0.76599975, "epoch": 0.43087333662058697, "grad_norm": 3.28125, "learning_rate": 9.270176993700592e-06, "loss": 0.91677628, "memory(GiB)": 752.04, "step": 16985, "train_speed(iter/s)": 0.710645 }, { "acc": 0.75504212, "epoch": 0.43100017598962453, "grad_norm": 3.390625, "learning_rate": 9.26963137556373e-06, "loss": 0.93113918, "memory(GiB)": 752.04, "step": 16990, "train_speed(iter/s)": 0.708725 }, { "acc": 0.75724673, "epoch": 0.4311270153586621, "grad_norm": 3.46875, "learning_rate": 9.269085569619386e-06, "loss": 0.91140079, "memory(GiB)": 752.04, "step": 16995, "train_speed(iter/s)": 0.706841 }, { "acc": 0.7792625, "epoch": 0.43125385472769967, "grad_norm": 3.15625, "learning_rate": 9.268539575891571e-06, "loss": 0.82465162, "memory(GiB)": 752.04, "step": 17000, "train_speed(iter/s)": 0.705099 }, { "epoch": 0.43125385472769967, "eval_acc": 0.7510679176848004, "eval_loss": 0.889408528804779, "eval_runtime": 1148.3064, "eval_samples_per_second": 5.547, "eval_steps_per_second": 5.547, "step": 17000 }, { "acc": 0.76515369, "epoch": 0.43138069409673724, "grad_norm": 3.75, "learning_rate": 9.2679933944043e-06, "loss": 0.90755625, "memory(GiB)": 752.04, "step": 17005, "train_speed(iter/s)": 0.652462 }, { "acc": 0.75337224, "epoch": 0.43150753346577475, "grad_norm": 3.71875, "learning_rate": 9.267447025181597e-06, "loss": 0.9509922, "memory(GiB)": 752.04, "step": 17010, "train_speed(iter/s)": 0.650869 }, { "acc": 0.75005889, "epoch": 0.4316343728348123, "grad_norm": 4.125, "learning_rate": 9.266900468247497e-06, "loss": 0.9828248, "memory(GiB)": 752.04, "step": 17015, "train_speed(iter/s)": 0.649617 }, { "acc": 0.77477288, "epoch": 0.4317612122038499, "grad_norm": 4.125, "learning_rate": 9.26635372362604e-06, "loss": 0.88132124, "memory(GiB)": 752.04, "step": 17020, "train_speed(iter/s)": 0.648198 }, { "acc": 0.75046391, "epoch": 0.43188805157288745, "grad_norm": 3.5, "learning_rate": 9.265806791341278e-06, "loss": 0.9348135, "memory(GiB)": 752.04, "step": 17025, "train_speed(iter/s)": 0.646795 }, { "acc": 0.7646389, "epoch": 0.432014890941925, "grad_norm": 4.0625, "learning_rate": 9.265259671417266e-06, "loss": 0.9219388, "memory(GiB)": 752.04, "step": 17030, "train_speed(iter/s)": 0.645397 }, { "acc": 0.75886302, "epoch": 0.4321417303109626, "grad_norm": 3.84375, "learning_rate": 9.26471236387807e-06, "loss": 0.94367657, "memory(GiB)": 752.04, "step": 17035, "train_speed(iter/s)": 0.644031 }, { "acc": 0.76289482, "epoch": 0.4322685696800001, "grad_norm": 2.78125, "learning_rate": 9.264164868747767e-06, "loss": 0.92502069, "memory(GiB)": 752.04, "step": 17040, "train_speed(iter/s)": 0.642495 }, { "acc": 0.76361012, "epoch": 0.43239540904903767, "grad_norm": 3.53125, "learning_rate": 9.263617186050438e-06, "loss": 0.9424098, "memory(GiB)": 752.04, "step": 17045, "train_speed(iter/s)": 0.640964 }, { "acc": 0.75822692, "epoch": 0.43252224841807524, "grad_norm": 3.28125, "learning_rate": 9.263069315810171e-06, "loss": 0.98524389, "memory(GiB)": 752.04, "step": 17050, "train_speed(iter/s)": 0.63939 }, { "acc": 0.77268314, "epoch": 0.4326490877871128, "grad_norm": 4.21875, "learning_rate": 9.26252125805107e-06, "loss": 0.90477886, "memory(GiB)": 752.04, "step": 17055, "train_speed(iter/s)": 0.637988 }, { "acc": 0.762849, "epoch": 0.4327759271561504, "grad_norm": 3.953125, "learning_rate": 9.261973012797241e-06, "loss": 0.92850151, "memory(GiB)": 752.04, "step": 17060, "train_speed(iter/s)": 0.63643 }, { "acc": 0.7672657, "epoch": 0.43290276652518794, "grad_norm": 3.40625, "learning_rate": 9.261424580072797e-06, "loss": 0.84145012, "memory(GiB)": 752.04, "step": 17065, "train_speed(iter/s)": 0.635148 }, { "acc": 0.760184, "epoch": 0.43302960589422546, "grad_norm": 3.84375, "learning_rate": 9.260875959901863e-06, "loss": 0.91256018, "memory(GiB)": 752.04, "step": 17070, "train_speed(iter/s)": 0.633587 }, { "acc": 0.75056105, "epoch": 0.433156445263263, "grad_norm": 3.453125, "learning_rate": 9.260327152308573e-06, "loss": 0.9881917, "memory(GiB)": 752.04, "step": 17075, "train_speed(iter/s)": 0.632245 }, { "acc": 0.75741949, "epoch": 0.4332832846323006, "grad_norm": 3.953125, "learning_rate": 9.259778157317065e-06, "loss": 0.93030348, "memory(GiB)": 752.04, "step": 17080, "train_speed(iter/s)": 0.630845 }, { "acc": 0.7766603, "epoch": 0.43341012400133816, "grad_norm": 3.5625, "learning_rate": 9.259228974951488e-06, "loss": 0.85100384, "memory(GiB)": 752.04, "step": 17085, "train_speed(iter/s)": 0.629524 }, { "acc": 0.76292639, "epoch": 0.4335369633703757, "grad_norm": 3.171875, "learning_rate": 9.258679605235999e-06, "loss": 0.91935759, "memory(GiB)": 752.04, "step": 17090, "train_speed(iter/s)": 0.628078 }, { "acc": 0.76269054, "epoch": 0.4336638027394133, "grad_norm": 3.859375, "learning_rate": 9.258130048194764e-06, "loss": 0.91690292, "memory(GiB)": 752.04, "step": 17095, "train_speed(iter/s)": 0.626789 }, { "acc": 0.76336188, "epoch": 0.4337906421084508, "grad_norm": 3.4375, "learning_rate": 9.257580303851955e-06, "loss": 0.93913813, "memory(GiB)": 752.04, "step": 17100, "train_speed(iter/s)": 0.62557 }, { "acc": 0.75587959, "epoch": 0.4339174814774884, "grad_norm": 3.046875, "learning_rate": 9.257030372231754e-06, "loss": 0.93519793, "memory(GiB)": 752.04, "step": 17105, "train_speed(iter/s)": 0.624275 }, { "acc": 0.7673563, "epoch": 0.43404432084652594, "grad_norm": 3.515625, "learning_rate": 9.256480253358351e-06, "loss": 0.87076893, "memory(GiB)": 752.04, "step": 17110, "train_speed(iter/s)": 0.62281 }, { "acc": 0.76370339, "epoch": 0.4341711602155635, "grad_norm": 4.4375, "learning_rate": 9.255929947255942e-06, "loss": 0.92100649, "memory(GiB)": 752.04, "step": 17115, "train_speed(iter/s)": 0.621573 }, { "acc": 0.76429071, "epoch": 0.4342979995846011, "grad_norm": 3.609375, "learning_rate": 9.255379453948736e-06, "loss": 0.90534315, "memory(GiB)": 752.04, "step": 17120, "train_speed(iter/s)": 0.620177 }, { "acc": 0.75706472, "epoch": 0.43442483895363865, "grad_norm": 3.390625, "learning_rate": 9.254828773460946e-06, "loss": 0.90868883, "memory(GiB)": 752.04, "step": 17125, "train_speed(iter/s)": 0.618932 }, { "acc": 0.76396375, "epoch": 0.43455167832267616, "grad_norm": 3.53125, "learning_rate": 9.254277905816796e-06, "loss": 0.86945791, "memory(GiB)": 752.04, "step": 17130, "train_speed(iter/s)": 0.617683 }, { "acc": 0.77274747, "epoch": 0.43467851769171373, "grad_norm": 3.703125, "learning_rate": 9.253726851040516e-06, "loss": 0.87686348, "memory(GiB)": 752.04, "step": 17135, "train_speed(iter/s)": 0.616473 }, { "acc": 0.78024602, "epoch": 0.4348053570607513, "grad_norm": 3.84375, "learning_rate": 9.253175609156344e-06, "loss": 0.88984623, "memory(GiB)": 752.04, "step": 17140, "train_speed(iter/s)": 0.61528 }, { "acc": 0.76908555, "epoch": 0.43493219642978886, "grad_norm": 3.640625, "learning_rate": 9.252624180188529e-06, "loss": 0.90899973, "memory(GiB)": 752.04, "step": 17145, "train_speed(iter/s)": 0.613941 }, { "acc": 0.75080304, "epoch": 0.43505903579882643, "grad_norm": 3.078125, "learning_rate": 9.252072564161325e-06, "loss": 0.98085413, "memory(GiB)": 752.04, "step": 17150, "train_speed(iter/s)": 0.612607 }, { "acc": 0.76476345, "epoch": 0.435185875167864, "grad_norm": 3.453125, "learning_rate": 9.251520761098997e-06, "loss": 0.92950745, "memory(GiB)": 752.04, "step": 17155, "train_speed(iter/s)": 0.611268 }, { "acc": 0.74280372, "epoch": 0.4353127145369015, "grad_norm": 6.375, "learning_rate": 9.250968771025817e-06, "loss": 0.97175407, "memory(GiB)": 752.04, "step": 17160, "train_speed(iter/s)": 0.610057 }, { "acc": 0.768081, "epoch": 0.4354395539059391, "grad_norm": 3.609375, "learning_rate": 9.250416593966066e-06, "loss": 0.91688213, "memory(GiB)": 752.04, "step": 17165, "train_speed(iter/s)": 0.608901 }, { "acc": 0.76439004, "epoch": 0.43556639327497665, "grad_norm": 3.484375, "learning_rate": 9.249864229944032e-06, "loss": 0.96590624, "memory(GiB)": 752.04, "step": 17170, "train_speed(iter/s)": 0.607694 }, { "acc": 0.77416134, "epoch": 0.4356932326440142, "grad_norm": 3.546875, "learning_rate": 9.249311678984009e-06, "loss": 0.8546422, "memory(GiB)": 752.04, "step": 17175, "train_speed(iter/s)": 0.606465 }, { "acc": 0.74875579, "epoch": 0.4358200720130518, "grad_norm": 3.234375, "learning_rate": 9.248758941110307e-06, "loss": 0.90619917, "memory(GiB)": 752.04, "step": 17180, "train_speed(iter/s)": 0.60522 }, { "acc": 0.76832027, "epoch": 0.43594691138208935, "grad_norm": 4.0, "learning_rate": 9.248206016347237e-06, "loss": 0.87701044, "memory(GiB)": 752.04, "step": 17185, "train_speed(iter/s)": 0.604003 }, { "acc": 0.7567812, "epoch": 0.43607375075112687, "grad_norm": 3.359375, "learning_rate": 9.247652904719118e-06, "loss": 0.90384398, "memory(GiB)": 752.04, "step": 17190, "train_speed(iter/s)": 0.602755 }, { "acc": 0.75724292, "epoch": 0.43620059012016443, "grad_norm": 3.359375, "learning_rate": 9.247099606250282e-06, "loss": 0.89462719, "memory(GiB)": 752.04, "step": 17195, "train_speed(iter/s)": 0.601442 }, { "acc": 0.76763029, "epoch": 0.436327429489202, "grad_norm": 3.515625, "learning_rate": 9.246546120965066e-06, "loss": 0.93092909, "memory(GiB)": 752.04, "step": 17200, "train_speed(iter/s)": 0.600175 }, { "acc": 0.75007458, "epoch": 0.43645426885823957, "grad_norm": 3.3125, "learning_rate": 9.245992448887817e-06, "loss": 0.98475361, "memory(GiB)": 752.04, "step": 17205, "train_speed(iter/s)": 0.598867 }, { "acc": 0.76088395, "epoch": 0.43658110822727714, "grad_norm": 3.71875, "learning_rate": 9.245438590042887e-06, "loss": 0.91290674, "memory(GiB)": 752.04, "step": 17210, "train_speed(iter/s)": 0.597622 }, { "acc": 0.76507697, "epoch": 0.4367079475963147, "grad_norm": 3.734375, "learning_rate": 9.244884544454645e-06, "loss": 0.92375975, "memory(GiB)": 752.04, "step": 17215, "train_speed(iter/s)": 0.596503 }, { "acc": 0.75328612, "epoch": 0.4368347869653522, "grad_norm": 3.5, "learning_rate": 9.244330312147451e-06, "loss": 0.93155127, "memory(GiB)": 752.04, "step": 17220, "train_speed(iter/s)": 0.595295 }, { "acc": 0.76219182, "epoch": 0.4369616263343898, "grad_norm": 3.390625, "learning_rate": 9.243775893145695e-06, "loss": 0.88771505, "memory(GiB)": 752.04, "step": 17225, "train_speed(iter/s)": 0.594028 }, { "acc": 0.76123714, "epoch": 0.43708846570342735, "grad_norm": 4.21875, "learning_rate": 9.243221287473755e-06, "loss": 0.95718164, "memory(GiB)": 752.04, "step": 17230, "train_speed(iter/s)": 0.592903 }, { "acc": 0.75061526, "epoch": 0.4372153050724649, "grad_norm": 3.46875, "learning_rate": 9.242666495156033e-06, "loss": 0.93142223, "memory(GiB)": 752.04, "step": 17235, "train_speed(iter/s)": 0.591796 }, { "acc": 0.75328798, "epoch": 0.4373421444415025, "grad_norm": 4.125, "learning_rate": 9.242111516216928e-06, "loss": 0.97715549, "memory(GiB)": 752.04, "step": 17240, "train_speed(iter/s)": 0.590513 }, { "acc": 0.76242857, "epoch": 0.43746898381054006, "grad_norm": 3.3125, "learning_rate": 9.241556350680854e-06, "loss": 0.87242918, "memory(GiB)": 752.04, "step": 17245, "train_speed(iter/s)": 0.58935 }, { "acc": 0.75787868, "epoch": 0.43759582317957757, "grad_norm": 3.265625, "learning_rate": 9.241000998572232e-06, "loss": 0.94600792, "memory(GiB)": 752.04, "step": 17250, "train_speed(iter/s)": 0.588149 }, { "acc": 0.75710173, "epoch": 0.43772266254861514, "grad_norm": 3.21875, "learning_rate": 9.240445459915487e-06, "loss": 0.93969336, "memory(GiB)": 752.04, "step": 17255, "train_speed(iter/s)": 0.586909 }, { "acc": 0.76795993, "epoch": 0.4378495019176527, "grad_norm": 4.84375, "learning_rate": 9.239889734735058e-06, "loss": 0.91453438, "memory(GiB)": 752.04, "step": 17260, "train_speed(iter/s)": 0.585693 }, { "acc": 0.77325039, "epoch": 0.4379763412866903, "grad_norm": 3.8125, "learning_rate": 9.239333823055388e-06, "loss": 0.97185583, "memory(GiB)": 752.04, "step": 17265, "train_speed(iter/s)": 0.584624 }, { "acc": 0.75762429, "epoch": 0.43810318065572784, "grad_norm": 4.625, "learning_rate": 9.238777724900931e-06, "loss": 0.89505911, "memory(GiB)": 752.04, "step": 17270, "train_speed(iter/s)": 0.583409 }, { "acc": 0.76575289, "epoch": 0.4382300200247654, "grad_norm": 3.1875, "learning_rate": 9.238221440296147e-06, "loss": 0.91542511, "memory(GiB)": 752.04, "step": 17275, "train_speed(iter/s)": 0.582173 }, { "acc": 0.76158328, "epoch": 0.4383568593938029, "grad_norm": 3.40625, "learning_rate": 9.237664969265507e-06, "loss": 0.92246256, "memory(GiB)": 752.04, "step": 17280, "train_speed(iter/s)": 0.581113 }, { "acc": 0.75891047, "epoch": 0.4384836987628405, "grad_norm": 3.09375, "learning_rate": 9.237108311833484e-06, "loss": 0.93455648, "memory(GiB)": 752.04, "step": 17285, "train_speed(iter/s)": 0.579917 }, { "acc": 0.7632978, "epoch": 0.43861053813187806, "grad_norm": 3.203125, "learning_rate": 9.23655146802457e-06, "loss": 0.92778168, "memory(GiB)": 752.04, "step": 17290, "train_speed(iter/s)": 0.578741 }, { "acc": 0.76389809, "epoch": 0.4387373775009156, "grad_norm": 3.953125, "learning_rate": 9.235994437863253e-06, "loss": 0.96175823, "memory(GiB)": 752.04, "step": 17295, "train_speed(iter/s)": 0.577629 }, { "acc": 0.76675615, "epoch": 0.4388642168699532, "grad_norm": 4.375, "learning_rate": 9.23543722137404e-06, "loss": 0.89781456, "memory(GiB)": 752.04, "step": 17300, "train_speed(iter/s)": 0.57653 }, { "acc": 0.76186085, "epoch": 0.43899105623899076, "grad_norm": 3.796875, "learning_rate": 9.234879818581435e-06, "loss": 0.93564463, "memory(GiB)": 752.04, "step": 17305, "train_speed(iter/s)": 0.575465 }, { "acc": 0.77263341, "epoch": 0.4391178956080283, "grad_norm": 3.65625, "learning_rate": 9.234322229509963e-06, "loss": 0.90093317, "memory(GiB)": 752.04, "step": 17310, "train_speed(iter/s)": 0.574286 }, { "acc": 0.74716744, "epoch": 0.43924473497706584, "grad_norm": 4.375, "learning_rate": 9.233764454184146e-06, "loss": 0.93919716, "memory(GiB)": 752.04, "step": 17315, "train_speed(iter/s)": 0.573192 }, { "acc": 0.75492043, "epoch": 0.4393715743461034, "grad_norm": 3.453125, "learning_rate": 9.23320649262852e-06, "loss": 0.95168142, "memory(GiB)": 752.04, "step": 17320, "train_speed(iter/s)": 0.57217 }, { "acc": 0.7613008, "epoch": 0.439498413715141, "grad_norm": 3.546875, "learning_rate": 9.23264834486763e-06, "loss": 0.96028795, "memory(GiB)": 752.04, "step": 17325, "train_speed(iter/s)": 0.571083 }, { "acc": 0.74685087, "epoch": 0.43962525308417855, "grad_norm": 3.671875, "learning_rate": 9.232090010926024e-06, "loss": 0.98335762, "memory(GiB)": 752.04, "step": 17330, "train_speed(iter/s)": 0.569985 }, { "acc": 0.76077204, "epoch": 0.4397520924532161, "grad_norm": 3.015625, "learning_rate": 9.231531490828262e-06, "loss": 0.9507431, "memory(GiB)": 752.04, "step": 17335, "train_speed(iter/s)": 0.568803 }, { "acc": 0.75751109, "epoch": 0.4398789318222536, "grad_norm": 3.15625, "learning_rate": 9.230972784598915e-06, "loss": 0.94134064, "memory(GiB)": 752.04, "step": 17340, "train_speed(iter/s)": 0.567648 }, { "acc": 0.75473709, "epoch": 0.4400057711912912, "grad_norm": 3.328125, "learning_rate": 9.230413892262553e-06, "loss": 0.95476313, "memory(GiB)": 752.04, "step": 17345, "train_speed(iter/s)": 0.566495 }, { "acc": 0.77486362, "epoch": 0.44013261056032876, "grad_norm": 4.375, "learning_rate": 9.229854813843764e-06, "loss": 0.85357094, "memory(GiB)": 752.04, "step": 17350, "train_speed(iter/s)": 0.565475 }, { "acc": 0.76219711, "epoch": 0.44025944992936633, "grad_norm": 3.15625, "learning_rate": 9.22929554936714e-06, "loss": 0.88108149, "memory(GiB)": 752.04, "step": 17355, "train_speed(iter/s)": 0.564379 }, { "acc": 0.77172422, "epoch": 0.4403862892984039, "grad_norm": 3.09375, "learning_rate": 9.228736098857277e-06, "loss": 0.85505123, "memory(GiB)": 752.04, "step": 17360, "train_speed(iter/s)": 0.563312 }, { "acc": 0.76711431, "epoch": 0.44051312866744147, "grad_norm": 3.15625, "learning_rate": 9.228176462338787e-06, "loss": 0.90796909, "memory(GiB)": 752.04, "step": 17365, "train_speed(iter/s)": 0.562297 }, { "acc": 0.77698679, "epoch": 0.440639968036479, "grad_norm": 3.234375, "learning_rate": 9.227616639836288e-06, "loss": 0.83449478, "memory(GiB)": 752.04, "step": 17370, "train_speed(iter/s)": 0.561244 }, { "acc": 0.75703015, "epoch": 0.44076680740551655, "grad_norm": 3.921875, "learning_rate": 9.227056631374403e-06, "loss": 0.95664892, "memory(GiB)": 752.04, "step": 17375, "train_speed(iter/s)": 0.560164 }, { "acc": 0.75430093, "epoch": 0.4408936467745541, "grad_norm": 3.53125, "learning_rate": 9.226496436977765e-06, "loss": 0.90963001, "memory(GiB)": 752.04, "step": 17380, "train_speed(iter/s)": 0.559262 }, { "acc": 0.76130323, "epoch": 0.4410204861435917, "grad_norm": 3.078125, "learning_rate": 9.225936056671015e-06, "loss": 0.94779367, "memory(GiB)": 752.04, "step": 17385, "train_speed(iter/s)": 0.558189 }, { "acc": 0.75927315, "epoch": 0.44114732551262925, "grad_norm": 3.25, "learning_rate": 9.225375490478802e-06, "loss": 0.90282841, "memory(GiB)": 752.04, "step": 17390, "train_speed(iter/s)": 0.557135 }, { "acc": 0.75421748, "epoch": 0.4412741648816668, "grad_norm": 3.875, "learning_rate": 9.224814738425785e-06, "loss": 0.95386801, "memory(GiB)": 752.04, "step": 17395, "train_speed(iter/s)": 0.556151 }, { "acc": 0.76378808, "epoch": 0.44140100425070433, "grad_norm": 9.125, "learning_rate": 9.224253800536626e-06, "loss": 0.92578735, "memory(GiB)": 752.04, "step": 17400, "train_speed(iter/s)": 0.554772 }, { "acc": 0.77074757, "epoch": 0.4415278436197419, "grad_norm": 3.03125, "learning_rate": 9.223692676836004e-06, "loss": 0.87828417, "memory(GiB)": 752.04, "step": 17405, "train_speed(iter/s)": 0.553711 }, { "acc": 0.76196308, "epoch": 0.44165468298877947, "grad_norm": 3.921875, "learning_rate": 9.223131367348599e-06, "loss": 0.89292526, "memory(GiB)": 752.04, "step": 17410, "train_speed(iter/s)": 0.552791 }, { "acc": 0.76476245, "epoch": 0.44178152235781704, "grad_norm": 3.28125, "learning_rate": 9.2225698720991e-06, "loss": 0.94495611, "memory(GiB)": 752.04, "step": 17415, "train_speed(iter/s)": 0.551751 }, { "acc": 0.751441, "epoch": 0.4419083617268546, "grad_norm": 3.609375, "learning_rate": 9.222008191112206e-06, "loss": 0.95230522, "memory(GiB)": 752.04, "step": 17420, "train_speed(iter/s)": 0.550825 }, { "acc": 0.76194105, "epoch": 0.44203520109589217, "grad_norm": 3.71875, "learning_rate": 9.221446324412626e-06, "loss": 0.9375536, "memory(GiB)": 752.04, "step": 17425, "train_speed(iter/s)": 0.549796 }, { "acc": 0.76212487, "epoch": 0.4421620404649297, "grad_norm": 4.46875, "learning_rate": 9.22088427202507e-06, "loss": 0.90662994, "memory(GiB)": 752.04, "step": 17430, "train_speed(iter/s)": 0.548719 }, { "acc": 0.75604687, "epoch": 0.44228887983396725, "grad_norm": 3.0625, "learning_rate": 9.220322033974265e-06, "loss": 0.92824097, "memory(GiB)": 752.04, "step": 17435, "train_speed(iter/s)": 0.547686 }, { "acc": 0.76190267, "epoch": 0.4424157192030048, "grad_norm": 4.875, "learning_rate": 9.21975961028494e-06, "loss": 0.86878977, "memory(GiB)": 752.04, "step": 17440, "train_speed(iter/s)": 0.546714 }, { "acc": 0.75372524, "epoch": 0.4425425585720424, "grad_norm": 3.8125, "learning_rate": 9.219197000981833e-06, "loss": 0.93527908, "memory(GiB)": 752.04, "step": 17445, "train_speed(iter/s)": 0.545649 }, { "acc": 0.76829009, "epoch": 0.44266939794107996, "grad_norm": 3.453125, "learning_rate": 9.218634206089696e-06, "loss": 0.90702305, "memory(GiB)": 752.04, "step": 17450, "train_speed(iter/s)": 0.544618 }, { "acc": 0.76762052, "epoch": 0.4427962373101175, "grad_norm": 3.0, "learning_rate": 9.218071225633278e-06, "loss": 0.89882879, "memory(GiB)": 752.04, "step": 17455, "train_speed(iter/s)": 0.543531 }, { "acc": 0.76529312, "epoch": 0.44292307667915504, "grad_norm": 4.125, "learning_rate": 9.21750805963735e-06, "loss": 0.8996233, "memory(GiB)": 752.04, "step": 17460, "train_speed(iter/s)": 0.542529 }, { "acc": 0.77449884, "epoch": 0.4430499160481926, "grad_norm": 4.09375, "learning_rate": 9.216944708126678e-06, "loss": 0.86779318, "memory(GiB)": 752.04, "step": 17465, "train_speed(iter/s)": 0.541545 }, { "acc": 0.76082859, "epoch": 0.4431767554172302, "grad_norm": 3.6875, "learning_rate": 9.216381171126046e-06, "loss": 0.90079861, "memory(GiB)": 752.04, "step": 17470, "train_speed(iter/s)": 0.540414 }, { "acc": 0.78159513, "epoch": 0.44330359478626774, "grad_norm": 3.546875, "learning_rate": 9.21581744866024e-06, "loss": 0.8769146, "memory(GiB)": 752.04, "step": 17475, "train_speed(iter/s)": 0.539386 }, { "acc": 0.7688271, "epoch": 0.4434304341553053, "grad_norm": 3.640625, "learning_rate": 9.215253540754056e-06, "loss": 0.93135071, "memory(GiB)": 752.04, "step": 17480, "train_speed(iter/s)": 0.538469 }, { "acc": 0.76235785, "epoch": 0.4435572735243429, "grad_norm": 3.59375, "learning_rate": 9.2146894474323e-06, "loss": 0.92538319, "memory(GiB)": 752.04, "step": 17485, "train_speed(iter/s)": 0.53754 }, { "acc": 0.75718751, "epoch": 0.4436841128933804, "grad_norm": 3.375, "learning_rate": 9.214125168719783e-06, "loss": 0.94602633, "memory(GiB)": 752.04, "step": 17490, "train_speed(iter/s)": 0.536629 }, { "acc": 0.76703162, "epoch": 0.44381095226241796, "grad_norm": 3.765625, "learning_rate": 9.213560704641328e-06, "loss": 0.84320555, "memory(GiB)": 752.04, "step": 17495, "train_speed(iter/s)": 0.535522 }, { "acc": 0.74984469, "epoch": 0.4439377916314555, "grad_norm": 3.828125, "learning_rate": 9.212996055221763e-06, "loss": 0.96863089, "memory(GiB)": 752.04, "step": 17500, "train_speed(iter/s)": 0.53456 }, { "epoch": 0.4439377916314555, "eval_acc": 0.7515713272926966, "eval_loss": 0.8878415822982788, "eval_runtime": 1148.5724, "eval_samples_per_second": 5.546, "eval_steps_per_second": 5.546, "step": 17500 }, { "acc": 0.74772496, "epoch": 0.4440646310004931, "grad_norm": 3.625, "learning_rate": 9.212431220485925e-06, "loss": 0.91516685, "memory(GiB)": 752.04, "step": 17505, "train_speed(iter/s)": 0.504649 }, { "acc": 0.76623216, "epoch": 0.44419147036953066, "grad_norm": 3.90625, "learning_rate": 9.211866200458657e-06, "loss": 0.92075024, "memory(GiB)": 752.04, "step": 17510, "train_speed(iter/s)": 0.503906 }, { "acc": 0.75598779, "epoch": 0.44431830973856823, "grad_norm": 3.140625, "learning_rate": 9.211300995164818e-06, "loss": 0.93408527, "memory(GiB)": 752.04, "step": 17515, "train_speed(iter/s)": 0.50304 }, { "acc": 0.76824121, "epoch": 0.44444514910760574, "grad_norm": 3.296875, "learning_rate": 9.210735604629264e-06, "loss": 0.90594034, "memory(GiB)": 752.04, "step": 17520, "train_speed(iter/s)": 0.502223 }, { "acc": 0.763166, "epoch": 0.4445719884766433, "grad_norm": 3.578125, "learning_rate": 9.210170028876868e-06, "loss": 0.95275393, "memory(GiB)": 752.04, "step": 17525, "train_speed(iter/s)": 0.501205 }, { "acc": 0.74441042, "epoch": 0.4446988278456809, "grad_norm": 3.203125, "learning_rate": 9.209604267932506e-06, "loss": 0.96105356, "memory(GiB)": 752.04, "step": 17530, "train_speed(iter/s)": 0.500391 }, { "acc": 0.75991721, "epoch": 0.44482566721471845, "grad_norm": 3.21875, "learning_rate": 9.209038321821066e-06, "loss": 0.92141285, "memory(GiB)": 752.04, "step": 17535, "train_speed(iter/s)": 0.499522 }, { "acc": 0.76351867, "epoch": 0.444952506583756, "grad_norm": 3.515625, "learning_rate": 9.20847219056744e-06, "loss": 0.9070529, "memory(GiB)": 752.04, "step": 17540, "train_speed(iter/s)": 0.498709 }, { "acc": 0.77473431, "epoch": 0.4450793459527936, "grad_norm": 3.9375, "learning_rate": 9.207905874196532e-06, "loss": 0.88762779, "memory(GiB)": 752.04, "step": 17545, "train_speed(iter/s)": 0.497971 }, { "acc": 0.7636663, "epoch": 0.4452061853218311, "grad_norm": 3.734375, "learning_rate": 9.207339372733251e-06, "loss": 0.89415646, "memory(GiB)": 752.04, "step": 17550, "train_speed(iter/s)": 0.497132 }, { "acc": 0.74918108, "epoch": 0.44533302469086866, "grad_norm": 4.0625, "learning_rate": 9.206772686202517e-06, "loss": 0.95721169, "memory(GiB)": 752.04, "step": 17555, "train_speed(iter/s)": 0.496182 }, { "acc": 0.75677757, "epoch": 0.44545986405990623, "grad_norm": 3.34375, "learning_rate": 9.206205814629257e-06, "loss": 0.94767895, "memory(GiB)": 752.05, "step": 17560, "train_speed(iter/s)": 0.49535 }, { "acc": 0.76904621, "epoch": 0.4455867034289438, "grad_norm": 3.28125, "learning_rate": 9.205638758038403e-06, "loss": 0.91630182, "memory(GiB)": 752.05, "step": 17565, "train_speed(iter/s)": 0.49461 }, { "acc": 0.75825996, "epoch": 0.44571354279798137, "grad_norm": 3.03125, "learning_rate": 9.205071516454902e-06, "loss": 0.90606422, "memory(GiB)": 752.05, "step": 17570, "train_speed(iter/s)": 0.493857 }, { "acc": 0.76994724, "epoch": 0.44584038216701893, "grad_norm": 2.828125, "learning_rate": 9.204504089903702e-06, "loss": 0.857967, "memory(GiB)": 752.05, "step": 17575, "train_speed(iter/s)": 0.49314 }, { "acc": 0.74788499, "epoch": 0.44596722153605645, "grad_norm": 3.359375, "learning_rate": 9.203936478409764e-06, "loss": 0.98743954, "memory(GiB)": 752.05, "step": 17580, "train_speed(iter/s)": 0.492362 }, { "acc": 0.7736342, "epoch": 0.446094060905094, "grad_norm": 3.546875, "learning_rate": 9.203368681998054e-06, "loss": 0.89589434, "memory(GiB)": 752.05, "step": 17585, "train_speed(iter/s)": 0.491493 }, { "acc": 0.76729403, "epoch": 0.4462209002741316, "grad_norm": 3.5, "learning_rate": 9.202800700693548e-06, "loss": 0.91544886, "memory(GiB)": 752.05, "step": 17590, "train_speed(iter/s)": 0.490659 }, { "acc": 0.75378785, "epoch": 0.44634773964316915, "grad_norm": 3.59375, "learning_rate": 9.202232534521231e-06, "loss": 0.94626083, "memory(GiB)": 752.05, "step": 17595, "train_speed(iter/s)": 0.489908 }, { "acc": 0.76850247, "epoch": 0.4464745790122067, "grad_norm": 4.3125, "learning_rate": 9.201664183506095e-06, "loss": 0.92377682, "memory(GiB)": 752.05, "step": 17600, "train_speed(iter/s)": 0.489182 }, { "acc": 0.75833902, "epoch": 0.4466014183812443, "grad_norm": 3.046875, "learning_rate": 9.201095647673137e-06, "loss": 0.89572945, "memory(GiB)": 752.05, "step": 17605, "train_speed(iter/s)": 0.488455 }, { "acc": 0.77032013, "epoch": 0.4467282577502818, "grad_norm": 3.28125, "learning_rate": 9.200526927047368e-06, "loss": 0.9003623, "memory(GiB)": 752.05, "step": 17610, "train_speed(iter/s)": 0.487694 }, { "acc": 0.76348238, "epoch": 0.44685509711931937, "grad_norm": 3.125, "learning_rate": 9.199958021653803e-06, "loss": 0.87161541, "memory(GiB)": 752.05, "step": 17615, "train_speed(iter/s)": 0.486903 }, { "acc": 0.73721399, "epoch": 0.44698193648835693, "grad_norm": 2.890625, "learning_rate": 9.199388931517468e-06, "loss": 1.0186492, "memory(GiB)": 752.05, "step": 17620, "train_speed(iter/s)": 0.486134 }, { "acc": 0.7620213, "epoch": 0.4471087758573945, "grad_norm": 3.875, "learning_rate": 9.198819656663391e-06, "loss": 0.94242983, "memory(GiB)": 752.05, "step": 17625, "train_speed(iter/s)": 0.485307 }, { "acc": 0.75464468, "epoch": 0.44723561522643207, "grad_norm": 3.53125, "learning_rate": 9.198250197116618e-06, "loss": 0.94012728, "memory(GiB)": 752.05, "step": 17630, "train_speed(iter/s)": 0.484413 }, { "acc": 0.76744757, "epoch": 0.44736245459546964, "grad_norm": 4.0, "learning_rate": 9.197680552902195e-06, "loss": 0.92249222, "memory(GiB)": 752.05, "step": 17635, "train_speed(iter/s)": 0.483675 }, { "acc": 0.76808534, "epoch": 0.44748929396450715, "grad_norm": 2.671875, "learning_rate": 9.197110724045179e-06, "loss": 0.84109831, "memory(GiB)": 752.05, "step": 17640, "train_speed(iter/s)": 0.482882 }, { "acc": 0.75723119, "epoch": 0.4476161333335447, "grad_norm": 3.359375, "learning_rate": 9.196540710570634e-06, "loss": 0.97490978, "memory(GiB)": 752.05, "step": 17645, "train_speed(iter/s)": 0.482117 }, { "acc": 0.76128736, "epoch": 0.4477429727025823, "grad_norm": 3.78125, "learning_rate": 9.195970512503635e-06, "loss": 0.86586494, "memory(GiB)": 752.05, "step": 17650, "train_speed(iter/s)": 0.481356 }, { "acc": 0.75486913, "epoch": 0.44786981207161985, "grad_norm": 3.484375, "learning_rate": 9.19540012986926e-06, "loss": 0.97245874, "memory(GiB)": 752.05, "step": 17655, "train_speed(iter/s)": 0.480675 }, { "acc": 0.76162405, "epoch": 0.4479966514406574, "grad_norm": 3.625, "learning_rate": 9.194829562692604e-06, "loss": 0.93075523, "memory(GiB)": 752.05, "step": 17660, "train_speed(iter/s)": 0.480005 }, { "acc": 0.76610026, "epoch": 0.448123490809695, "grad_norm": 3.53125, "learning_rate": 9.194258810998759e-06, "loss": 0.90382338, "memory(GiB)": 752.05, "step": 17665, "train_speed(iter/s)": 0.479291 }, { "acc": 0.74831719, "epoch": 0.4482503301787325, "grad_norm": 3.328125, "learning_rate": 9.193687874812834e-06, "loss": 0.9286273, "memory(GiB)": 752.05, "step": 17670, "train_speed(iter/s)": 0.478562 }, { "acc": 0.76086097, "epoch": 0.44837716954777007, "grad_norm": 3.28125, "learning_rate": 9.19311675415994e-06, "loss": 0.92471762, "memory(GiB)": 752.05, "step": 17675, "train_speed(iter/s)": 0.477788 }, { "acc": 0.75138702, "epoch": 0.44850400891680764, "grad_norm": 3.625, "learning_rate": 9.192545449065202e-06, "loss": 0.955163, "memory(GiB)": 752.05, "step": 17680, "train_speed(iter/s)": 0.477076 }, { "acc": 0.74764628, "epoch": 0.4486308482858452, "grad_norm": 3.796875, "learning_rate": 9.191973959553746e-06, "loss": 0.99582691, "memory(GiB)": 752.05, "step": 17685, "train_speed(iter/s)": 0.4764 }, { "acc": 0.76516557, "epoch": 0.4487576876548828, "grad_norm": 3.859375, "learning_rate": 9.191402285650713e-06, "loss": 0.93513002, "memory(GiB)": 752.05, "step": 17690, "train_speed(iter/s)": 0.475681 }, { "acc": 0.76304507, "epoch": 0.44888452702392034, "grad_norm": 3.140625, "learning_rate": 9.190830427381248e-06, "loss": 0.92916193, "memory(GiB)": 752.05, "step": 17695, "train_speed(iter/s)": 0.474926 }, { "acc": 0.75405588, "epoch": 0.44901136639295786, "grad_norm": 3.53125, "learning_rate": 9.190258384770506e-06, "loss": 0.92045088, "memory(GiB)": 752.05, "step": 17700, "train_speed(iter/s)": 0.474248 }, { "acc": 0.75326319, "epoch": 0.4491382057619954, "grad_norm": 3.578125, "learning_rate": 9.189686157843647e-06, "loss": 0.90920963, "memory(GiB)": 752.05, "step": 17705, "train_speed(iter/s)": 0.473593 }, { "acc": 0.76230345, "epoch": 0.449265045131033, "grad_norm": 3.703125, "learning_rate": 9.189113746625843e-06, "loss": 0.87268143, "memory(GiB)": 752.05, "step": 17710, "train_speed(iter/s)": 0.472838 }, { "acc": 0.76623182, "epoch": 0.44939188450007056, "grad_norm": 3.703125, "learning_rate": 9.188541151142275e-06, "loss": 0.92856703, "memory(GiB)": 752.05, "step": 17715, "train_speed(iter/s)": 0.472226 }, { "acc": 0.75099616, "epoch": 0.4495187238691081, "grad_norm": 3.15625, "learning_rate": 9.187968371418125e-06, "loss": 0.94267321, "memory(GiB)": 752.05, "step": 17720, "train_speed(iter/s)": 0.471492 }, { "acc": 0.78027558, "epoch": 0.4496455632381457, "grad_norm": 3.90625, "learning_rate": 9.18739540747859e-06, "loss": 0.8328021, "memory(GiB)": 752.05, "step": 17725, "train_speed(iter/s)": 0.470762 }, { "acc": 0.76008425, "epoch": 0.4497724026071832, "grad_norm": 3.4375, "learning_rate": 9.186822259348873e-06, "loss": 0.90738621, "memory(GiB)": 752.05, "step": 17730, "train_speed(iter/s)": 0.470061 }, { "acc": 0.77442565, "epoch": 0.4498992419762208, "grad_norm": 3.0625, "learning_rate": 9.186248927054186e-06, "loss": 0.87908287, "memory(GiB)": 752.05, "step": 17735, "train_speed(iter/s)": 0.469391 }, { "acc": 0.76356134, "epoch": 0.45002608134525834, "grad_norm": 3.671875, "learning_rate": 9.185675410619746e-06, "loss": 0.90621347, "memory(GiB)": 752.05, "step": 17740, "train_speed(iter/s)": 0.468704 }, { "acc": 0.76465898, "epoch": 0.4501529207142959, "grad_norm": 3.078125, "learning_rate": 9.185101710070781e-06, "loss": 0.90296526, "memory(GiB)": 752.05, "step": 17745, "train_speed(iter/s)": 0.468009 }, { "acc": 0.7469522, "epoch": 0.4502797600833335, "grad_norm": 3.09375, "learning_rate": 9.184527825432527e-06, "loss": 0.93572006, "memory(GiB)": 752.05, "step": 17750, "train_speed(iter/s)": 0.467311 }, { "acc": 0.76399894, "epoch": 0.45040659945237105, "grad_norm": 4.3125, "learning_rate": 9.183953756730226e-06, "loss": 0.91985426, "memory(GiB)": 752.05, "step": 17755, "train_speed(iter/s)": 0.466588 }, { "acc": 0.76453347, "epoch": 0.45053343882140856, "grad_norm": 3.6875, "learning_rate": 9.18337950398913e-06, "loss": 0.96615839, "memory(GiB)": 752.05, "step": 17760, "train_speed(iter/s)": 0.465899 }, { "acc": 0.75736551, "epoch": 0.45066027819044613, "grad_norm": 3.484375, "learning_rate": 9.182805067234499e-06, "loss": 0.90560484, "memory(GiB)": 752.05, "step": 17765, "train_speed(iter/s)": 0.465138 }, { "acc": 0.76400871, "epoch": 0.4507871175594837, "grad_norm": 3.34375, "learning_rate": 9.1822304464916e-06, "loss": 0.88072662, "memory(GiB)": 752.05, "step": 17770, "train_speed(iter/s)": 0.464406 }, { "acc": 0.75959358, "epoch": 0.45091395692852126, "grad_norm": 4.0, "learning_rate": 9.181655641785711e-06, "loss": 0.91480713, "memory(GiB)": 752.05, "step": 17775, "train_speed(iter/s)": 0.463741 }, { "acc": 0.74769421, "epoch": 0.45104079629755883, "grad_norm": 3.296875, "learning_rate": 9.181080653142111e-06, "loss": 0.93703756, "memory(GiB)": 752.05, "step": 17780, "train_speed(iter/s)": 0.463051 }, { "acc": 0.75766015, "epoch": 0.4511676356665964, "grad_norm": 3.140625, "learning_rate": 9.180505480586098e-06, "loss": 0.92320509, "memory(GiB)": 752.05, "step": 17785, "train_speed(iter/s)": 0.462329 }, { "acc": 0.76152086, "epoch": 0.4512944750356339, "grad_norm": 3.3125, "learning_rate": 9.179930124142967e-06, "loss": 0.88554182, "memory(GiB)": 752.05, "step": 17790, "train_speed(iter/s)": 0.461643 }, { "acc": 0.75892663, "epoch": 0.4514213144046715, "grad_norm": 3.65625, "learning_rate": 9.179354583838028e-06, "loss": 0.95951128, "memory(GiB)": 752.05, "step": 17795, "train_speed(iter/s)": 0.461014 }, { "acc": 0.7544601, "epoch": 0.45154815377370905, "grad_norm": 4.15625, "learning_rate": 9.178778859696595e-06, "loss": 0.92202272, "memory(GiB)": 752.05, "step": 17800, "train_speed(iter/s)": 0.460307 }, { "acc": 0.7679266, "epoch": 0.4516749931427466, "grad_norm": 3.328125, "learning_rate": 9.178202951743997e-06, "loss": 0.89037275, "memory(GiB)": 752.05, "step": 17805, "train_speed(iter/s)": 0.459618 }, { "acc": 0.76115627, "epoch": 0.4518018325117842, "grad_norm": 3.703125, "learning_rate": 9.177626860005565e-06, "loss": 0.87891464, "memory(GiB)": 752.05, "step": 17810, "train_speed(iter/s)": 0.458976 }, { "acc": 0.764396, "epoch": 0.45192867188082175, "grad_norm": 3.6875, "learning_rate": 9.177050584506634e-06, "loss": 0.86646671, "memory(GiB)": 752.05, "step": 17815, "train_speed(iter/s)": 0.458289 }, { "acc": 0.75900812, "epoch": 0.45205551124985927, "grad_norm": 3.46875, "learning_rate": 9.17647412527256e-06, "loss": 0.92109394, "memory(GiB)": 752.05, "step": 17820, "train_speed(iter/s)": 0.457637 }, { "acc": 0.76919336, "epoch": 0.45218235061889683, "grad_norm": 3.421875, "learning_rate": 9.175897482328695e-06, "loss": 0.88860111, "memory(GiB)": 752.05, "step": 17825, "train_speed(iter/s)": 0.457023 }, { "acc": 0.75633879, "epoch": 0.4523091899879344, "grad_norm": 3.203125, "learning_rate": 9.175320655700407e-06, "loss": 0.94151754, "memory(GiB)": 752.05, "step": 17830, "train_speed(iter/s)": 0.45639 }, { "acc": 0.77038803, "epoch": 0.45243602935697197, "grad_norm": 3.5, "learning_rate": 9.174743645413063e-06, "loss": 0.91605644, "memory(GiB)": 752.05, "step": 17835, "train_speed(iter/s)": 0.455729 }, { "acc": 0.75202618, "epoch": 0.45256286872600954, "grad_norm": 3.78125, "learning_rate": 9.17416645149205e-06, "loss": 0.94211197, "memory(GiB)": 752.05, "step": 17840, "train_speed(iter/s)": 0.455118 }, { "acc": 0.77020712, "epoch": 0.4526897080950471, "grad_norm": 3.75, "learning_rate": 9.173589073962754e-06, "loss": 0.87147446, "memory(GiB)": 752.05, "step": 17845, "train_speed(iter/s)": 0.454434 }, { "acc": 0.76864471, "epoch": 0.4528165474640846, "grad_norm": 3.421875, "learning_rate": 9.173011512850572e-06, "loss": 0.94708052, "memory(GiB)": 752.05, "step": 17850, "train_speed(iter/s)": 0.453843 }, { "acc": 0.76942167, "epoch": 0.4529433868331222, "grad_norm": 3.859375, "learning_rate": 9.172433768180911e-06, "loss": 0.90773163, "memory(GiB)": 752.05, "step": 17855, "train_speed(iter/s)": 0.453182 }, { "acc": 0.76647153, "epoch": 0.45307022620215975, "grad_norm": 3.59375, "learning_rate": 9.171855839979182e-06, "loss": 0.88923206, "memory(GiB)": 752.05, "step": 17860, "train_speed(iter/s)": 0.452497 }, { "acc": 0.77706451, "epoch": 0.4531970655711973, "grad_norm": 2.984375, "learning_rate": 9.171277728270807e-06, "loss": 0.86972694, "memory(GiB)": 752.05, "step": 17865, "train_speed(iter/s)": 0.451851 }, { "acc": 0.7713171, "epoch": 0.4533239049402349, "grad_norm": 4.21875, "learning_rate": 9.170699433081214e-06, "loss": 0.88672094, "memory(GiB)": 752.05, "step": 17870, "train_speed(iter/s)": 0.451205 }, { "acc": 0.77020392, "epoch": 0.45345074430927246, "grad_norm": 3.96875, "learning_rate": 9.170120954435843e-06, "loss": 0.88425007, "memory(GiB)": 752.05, "step": 17875, "train_speed(iter/s)": 0.450527 }, { "acc": 0.77491245, "epoch": 0.45357758367830997, "grad_norm": 3.421875, "learning_rate": 9.169542292360138e-06, "loss": 0.84515724, "memory(GiB)": 752.05, "step": 17880, "train_speed(iter/s)": 0.449851 }, { "acc": 0.76407948, "epoch": 0.45370442304734754, "grad_norm": 3.453125, "learning_rate": 9.16896344687955e-06, "loss": 0.89116859, "memory(GiB)": 752.05, "step": 17885, "train_speed(iter/s)": 0.449213 }, { "acc": 0.75807009, "epoch": 0.4538312624163851, "grad_norm": 3.5, "learning_rate": 9.168384418019546e-06, "loss": 0.9563777, "memory(GiB)": 752.05, "step": 17890, "train_speed(iter/s)": 0.448537 }, { "acc": 0.76791968, "epoch": 0.4539581017854227, "grad_norm": 3.609375, "learning_rate": 9.167805205805593e-06, "loss": 0.90103598, "memory(GiB)": 752.05, "step": 17895, "train_speed(iter/s)": 0.447949 }, { "acc": 0.757724, "epoch": 0.45408494115446024, "grad_norm": 3.25, "learning_rate": 9.167225810263167e-06, "loss": 0.94388762, "memory(GiB)": 752.05, "step": 17900, "train_speed(iter/s)": 0.447234 }, { "acc": 0.76110048, "epoch": 0.4542117805234978, "grad_norm": 2.75, "learning_rate": 9.166646231417755e-06, "loss": 0.91624908, "memory(GiB)": 752.05, "step": 17905, "train_speed(iter/s)": 0.446622 }, { "acc": 0.74186664, "epoch": 0.4543386198925353, "grad_norm": 4.59375, "learning_rate": 9.166066469294853e-06, "loss": 0.95346651, "memory(GiB)": 752.05, "step": 17910, "train_speed(iter/s)": 0.446035 }, { "acc": 0.76841712, "epoch": 0.4544654592615729, "grad_norm": 3.96875, "learning_rate": 9.16548652391996e-06, "loss": 0.90444326, "memory(GiB)": 752.05, "step": 17915, "train_speed(iter/s)": 0.445482 }, { "acc": 0.76246028, "epoch": 0.45459229863061046, "grad_norm": 3.1875, "learning_rate": 9.164906395318585e-06, "loss": 0.91253576, "memory(GiB)": 752.05, "step": 17920, "train_speed(iter/s)": 0.444842 }, { "acc": 0.75996685, "epoch": 0.454719137999648, "grad_norm": 6.125, "learning_rate": 9.16432608351625e-06, "loss": 0.88786459, "memory(GiB)": 752.05, "step": 17925, "train_speed(iter/s)": 0.444173 }, { "acc": 0.75853567, "epoch": 0.4548459773686856, "grad_norm": 3.5, "learning_rate": 9.163745588538478e-06, "loss": 0.95684481, "memory(GiB)": 752.05, "step": 17930, "train_speed(iter/s)": 0.44354 }, { "acc": 0.7611258, "epoch": 0.45497281673772316, "grad_norm": 3.671875, "learning_rate": 9.163164910410804e-06, "loss": 0.90909529, "memory(GiB)": 752.05, "step": 17935, "train_speed(iter/s)": 0.442939 }, { "acc": 0.75644522, "epoch": 0.4550996561067607, "grad_norm": 3.21875, "learning_rate": 9.16258404915877e-06, "loss": 0.89527388, "memory(GiB)": 752.05, "step": 17940, "train_speed(iter/s)": 0.442327 }, { "acc": 0.77017498, "epoch": 0.45522649547579824, "grad_norm": 4.09375, "learning_rate": 9.162003004807928e-06, "loss": 0.89206657, "memory(GiB)": 752.05, "step": 17945, "train_speed(iter/s)": 0.441705 }, { "acc": 0.76351972, "epoch": 0.4553533348448358, "grad_norm": 3.21875, "learning_rate": 9.161421777383834e-06, "loss": 0.86524563, "memory(GiB)": 752.05, "step": 17950, "train_speed(iter/s)": 0.441028 }, { "acc": 0.773631, "epoch": 0.4554801742138734, "grad_norm": 4.0625, "learning_rate": 9.160840366912053e-06, "loss": 0.86945639, "memory(GiB)": 752.05, "step": 17955, "train_speed(iter/s)": 0.440451 }, { "acc": 0.76070328, "epoch": 0.45560701358291095, "grad_norm": 3.03125, "learning_rate": 9.160258773418165e-06, "loss": 0.92674208, "memory(GiB)": 752.05, "step": 17960, "train_speed(iter/s)": 0.439888 }, { "acc": 0.74915261, "epoch": 0.4557338529519485, "grad_norm": 2.9375, "learning_rate": 9.159676996927747e-06, "loss": 0.90578184, "memory(GiB)": 752.05, "step": 17965, "train_speed(iter/s)": 0.439243 }, { "acc": 0.76383038, "epoch": 0.455860692320986, "grad_norm": 3.765625, "learning_rate": 9.159095037466393e-06, "loss": 0.91477652, "memory(GiB)": 752.05, "step": 17970, "train_speed(iter/s)": 0.438659 }, { "acc": 0.76559434, "epoch": 0.4559875316900236, "grad_norm": 3.734375, "learning_rate": 9.158512895059698e-06, "loss": 0.93637257, "memory(GiB)": 752.05, "step": 17975, "train_speed(iter/s)": 0.43807 }, { "acc": 0.76887212, "epoch": 0.45611437105906116, "grad_norm": 4.75, "learning_rate": 9.15793056973327e-06, "loss": 0.92379246, "memory(GiB)": 752.05, "step": 17980, "train_speed(iter/s)": 0.437466 }, { "acc": 0.75861473, "epoch": 0.45624121042809873, "grad_norm": 3.90625, "learning_rate": 9.157348061512728e-06, "loss": 0.91952133, "memory(GiB)": 752.05, "step": 17985, "train_speed(iter/s)": 0.436848 }, { "acc": 0.76228404, "epoch": 0.4563680497971363, "grad_norm": 3.78125, "learning_rate": 9.156765370423687e-06, "loss": 0.9282958, "memory(GiB)": 752.05, "step": 17990, "train_speed(iter/s)": 0.436268 }, { "acc": 0.75817537, "epoch": 0.45649488916617387, "grad_norm": 3.828125, "learning_rate": 9.156182496491784e-06, "loss": 0.9310812, "memory(GiB)": 752.05, "step": 17995, "train_speed(iter/s)": 0.435675 }, { "acc": 0.76638298, "epoch": 0.4566217285352114, "grad_norm": 3.359375, "learning_rate": 9.155599439742653e-06, "loss": 0.94180403, "memory(GiB)": 752.05, "step": 18000, "train_speed(iter/s)": 0.43504 }, { "epoch": 0.4566217285352114, "eval_acc": 0.7516089263505478, "eval_loss": 0.886885941028595, "eval_runtime": 1150.1947, "eval_samples_per_second": 5.538, "eval_steps_per_second": 5.538, "step": 18000 }, { "acc": 0.76210403, "epoch": 0.45674856790424895, "grad_norm": 3.453125, "learning_rate": 9.155016200201944e-06, "loss": 0.9094635, "memory(GiB)": 752.05, "step": 18005, "train_speed(iter/s)": 0.415535 }, { "acc": 0.76895008, "epoch": 0.4568754072732865, "grad_norm": 3.671875, "learning_rate": 9.154432777895314e-06, "loss": 0.903477, "memory(GiB)": 752.05, "step": 18010, "train_speed(iter/s)": 0.415019 }, { "acc": 0.76079497, "epoch": 0.4570022466423241, "grad_norm": 4.15625, "learning_rate": 9.15384917284842e-06, "loss": 0.86246653, "memory(GiB)": 752.05, "step": 18015, "train_speed(iter/s)": 0.414491 }, { "acc": 0.77218485, "epoch": 0.45712908601136165, "grad_norm": 3.78125, "learning_rate": 9.153265385086936e-06, "loss": 0.88667202, "memory(GiB)": 752.05, "step": 18020, "train_speed(iter/s)": 0.41398 }, { "acc": 0.76120353, "epoch": 0.4572559253803992, "grad_norm": 3.328125, "learning_rate": 9.152681414636542e-06, "loss": 0.9709795, "memory(GiB)": 752.05, "step": 18025, "train_speed(iter/s)": 0.413469 }, { "acc": 0.74972239, "epoch": 0.45738276474943673, "grad_norm": 3.984375, "learning_rate": 9.152097261522924e-06, "loss": 0.96842251, "memory(GiB)": 752.05, "step": 18030, "train_speed(iter/s)": 0.412924 }, { "acc": 0.75177631, "epoch": 0.4575096041184743, "grad_norm": 3.4375, "learning_rate": 9.151512925771775e-06, "loss": 0.97969809, "memory(GiB)": 752.05, "step": 18035, "train_speed(iter/s)": 0.412328 }, { "acc": 0.75716267, "epoch": 0.45763644348751187, "grad_norm": 4.09375, "learning_rate": 9.150928407408801e-06, "loss": 0.94408045, "memory(GiB)": 752.05, "step": 18040, "train_speed(iter/s)": 0.411805 }, { "acc": 0.76385841, "epoch": 0.45776328285654944, "grad_norm": 4.1875, "learning_rate": 9.150343706459712e-06, "loss": 0.95693274, "memory(GiB)": 752.05, "step": 18045, "train_speed(iter/s)": 0.411254 }, { "acc": 0.74878263, "epoch": 0.457890122225587, "grad_norm": 2.796875, "learning_rate": 9.149758822950227e-06, "loss": 0.93684826, "memory(GiB)": 752.05, "step": 18050, "train_speed(iter/s)": 0.41068 }, { "acc": 0.75323792, "epoch": 0.45801696159462457, "grad_norm": 3.484375, "learning_rate": 9.149173756906073e-06, "loss": 0.91829948, "memory(GiB)": 752.05, "step": 18055, "train_speed(iter/s)": 0.410169 }, { "acc": 0.76991711, "epoch": 0.4581438009636621, "grad_norm": 3.828125, "learning_rate": 9.148588508352988e-06, "loss": 0.89777708, "memory(GiB)": 752.05, "step": 18060, "train_speed(iter/s)": 0.409661 }, { "acc": 0.7557261, "epoch": 0.45827064033269965, "grad_norm": 3.90625, "learning_rate": 9.14800307731671e-06, "loss": 0.9797349, "memory(GiB)": 752.05, "step": 18065, "train_speed(iter/s)": 0.409196 }, { "acc": 0.75750618, "epoch": 0.4583974797017372, "grad_norm": 4.09375, "learning_rate": 9.147417463822994e-06, "loss": 0.96779566, "memory(GiB)": 752.05, "step": 18070, "train_speed(iter/s)": 0.408693 }, { "acc": 0.76635036, "epoch": 0.4585243190707748, "grad_norm": 3.78125, "learning_rate": 9.146831667897597e-06, "loss": 0.95336533, "memory(GiB)": 752.05, "step": 18075, "train_speed(iter/s)": 0.408227 }, { "acc": 0.77681537, "epoch": 0.45865115843981236, "grad_norm": 3.1875, "learning_rate": 9.146245689566287e-06, "loss": 0.94463034, "memory(GiB)": 752.05, "step": 18080, "train_speed(iter/s)": 0.407673 }, { "acc": 0.76605854, "epoch": 0.4587779978088499, "grad_norm": 3.234375, "learning_rate": 9.145659528854842e-06, "loss": 0.95343399, "memory(GiB)": 752.05, "step": 18085, "train_speed(iter/s)": 0.407178 }, { "acc": 0.75464568, "epoch": 0.45890483717788744, "grad_norm": 3.625, "learning_rate": 9.145073185789043e-06, "loss": 0.93466511, "memory(GiB)": 752.05, "step": 18090, "train_speed(iter/s)": 0.40675 }, { "acc": 0.76294012, "epoch": 0.459031676546925, "grad_norm": 3.546875, "learning_rate": 9.14448666039468e-06, "loss": 0.98034716, "memory(GiB)": 752.05, "step": 18095, "train_speed(iter/s)": 0.406269 }, { "acc": 0.76461997, "epoch": 0.4591585159159626, "grad_norm": 4.0625, "learning_rate": 9.143899952697556e-06, "loss": 0.92781372, "memory(GiB)": 752.05, "step": 18100, "train_speed(iter/s)": 0.405802 }, { "acc": 0.75559587, "epoch": 0.45928535528500014, "grad_norm": 3.046875, "learning_rate": 9.143313062723476e-06, "loss": 0.94253712, "memory(GiB)": 752.05, "step": 18105, "train_speed(iter/s)": 0.405212 }, { "acc": 0.75191512, "epoch": 0.4594121946540377, "grad_norm": 3.75, "learning_rate": 9.142725990498253e-06, "loss": 1.00342894, "memory(GiB)": 752.05, "step": 18110, "train_speed(iter/s)": 0.40472 }, { "acc": 0.76020818, "epoch": 0.4595390340230753, "grad_norm": 3.890625, "learning_rate": 9.142138736047717e-06, "loss": 0.89935923, "memory(GiB)": 752.05, "step": 18115, "train_speed(iter/s)": 0.404218 }, { "acc": 0.77150097, "epoch": 0.4596658733921128, "grad_norm": 3.40625, "learning_rate": 9.141551299397693e-06, "loss": 0.87664547, "memory(GiB)": 752.05, "step": 18120, "train_speed(iter/s)": 0.403727 }, { "acc": 0.76710744, "epoch": 0.45979271276115036, "grad_norm": 2.90625, "learning_rate": 9.140963680574024e-06, "loss": 0.91743479, "memory(GiB)": 752.05, "step": 18125, "train_speed(iter/s)": 0.403253 }, { "acc": 0.76426749, "epoch": 0.4599195521301879, "grad_norm": 3.59375, "learning_rate": 9.140375879602556e-06, "loss": 0.87359991, "memory(GiB)": 752.05, "step": 18130, "train_speed(iter/s)": 0.4028 }, { "acc": 0.7663228, "epoch": 0.4600463914992255, "grad_norm": 4.09375, "learning_rate": 9.139787896509148e-06, "loss": 0.88034315, "memory(GiB)": 752.05, "step": 18135, "train_speed(iter/s)": 0.402286 }, { "acc": 0.76845422, "epoch": 0.46017323086826306, "grad_norm": 3.109375, "learning_rate": 9.139199731319657e-06, "loss": 0.92149887, "memory(GiB)": 752.05, "step": 18140, "train_speed(iter/s)": 0.401732 }, { "acc": 0.7588284, "epoch": 0.46030007023730063, "grad_norm": 3.25, "learning_rate": 9.13861138405996e-06, "loss": 0.92865353, "memory(GiB)": 752.05, "step": 18145, "train_speed(iter/s)": 0.401229 }, { "acc": 0.76688609, "epoch": 0.46042690960633814, "grad_norm": 3.109375, "learning_rate": 9.138022854755934e-06, "loss": 0.88728609, "memory(GiB)": 752.05, "step": 18150, "train_speed(iter/s)": 0.400665 }, { "acc": 0.76929765, "epoch": 0.4605537489753757, "grad_norm": 3.078125, "learning_rate": 9.137434143433467e-06, "loss": 0.88900137, "memory(GiB)": 752.05, "step": 18155, "train_speed(iter/s)": 0.400212 }, { "acc": 0.76921878, "epoch": 0.4606805883444133, "grad_norm": 3.6875, "learning_rate": 9.136845250118455e-06, "loss": 0.9082799, "memory(GiB)": 752.05, "step": 18160, "train_speed(iter/s)": 0.399763 }, { "acc": 0.79168901, "epoch": 0.46080742771345085, "grad_norm": 3.796875, "learning_rate": 9.136256174836801e-06, "loss": 0.85599222, "memory(GiB)": 752.05, "step": 18165, "train_speed(iter/s)": 0.399281 }, { "acc": 0.75251727, "epoch": 0.4609342670824884, "grad_norm": 3.28125, "learning_rate": 9.135666917614419e-06, "loss": 1.00041838, "memory(GiB)": 752.05, "step": 18170, "train_speed(iter/s)": 0.398837 }, { "acc": 0.76299949, "epoch": 0.461061106451526, "grad_norm": 3.109375, "learning_rate": 9.135077478477223e-06, "loss": 0.907652, "memory(GiB)": 752.05, "step": 18175, "train_speed(iter/s)": 0.398386 }, { "acc": 0.76435556, "epoch": 0.4611879458205635, "grad_norm": 3.59375, "learning_rate": 9.134487857451146e-06, "loss": 0.86856747, "memory(GiB)": 752.05, "step": 18180, "train_speed(iter/s)": 0.397937 }, { "acc": 0.76222887, "epoch": 0.46131478518960106, "grad_norm": 4.53125, "learning_rate": 9.13389805456212e-06, "loss": 0.94232426, "memory(GiB)": 752.05, "step": 18185, "train_speed(iter/s)": 0.397488 }, { "acc": 0.77734709, "epoch": 0.46144162455863863, "grad_norm": 4.0625, "learning_rate": 9.13330806983609e-06, "loss": 0.87781162, "memory(GiB)": 752.05, "step": 18190, "train_speed(iter/s)": 0.397011 }, { "acc": 0.7663115, "epoch": 0.4615684639276762, "grad_norm": 3.296875, "learning_rate": 9.13271790329901e-06, "loss": 0.87264404, "memory(GiB)": 752.05, "step": 18195, "train_speed(iter/s)": 0.396533 }, { "acc": 0.75786676, "epoch": 0.46169530329671377, "grad_norm": 3.359375, "learning_rate": 9.132127554976835e-06, "loss": 0.936553, "memory(GiB)": 752.05, "step": 18200, "train_speed(iter/s)": 0.396047 }, { "acc": 0.76031408, "epoch": 0.46182214266575133, "grad_norm": 4.15625, "learning_rate": 9.131537024895533e-06, "loss": 0.90576668, "memory(GiB)": 752.05, "step": 18205, "train_speed(iter/s)": 0.395579 }, { "acc": 0.76475158, "epoch": 0.46194898203478885, "grad_norm": 3.984375, "learning_rate": 9.130946313081084e-06, "loss": 0.92013474, "memory(GiB)": 752.05, "step": 18210, "train_speed(iter/s)": 0.395073 }, { "acc": 0.77814746, "epoch": 0.4620758214038264, "grad_norm": 4.8125, "learning_rate": 9.130355419559468e-06, "loss": 0.84650955, "memory(GiB)": 752.05, "step": 18215, "train_speed(iter/s)": 0.394635 }, { "acc": 0.75427136, "epoch": 0.462202660772864, "grad_norm": 4.03125, "learning_rate": 9.129764344356678e-06, "loss": 0.94773235, "memory(GiB)": 752.05, "step": 18220, "train_speed(iter/s)": 0.39421 }, { "acc": 0.7643012, "epoch": 0.46232950014190155, "grad_norm": 3.640625, "learning_rate": 9.12917308749871e-06, "loss": 0.94371567, "memory(GiB)": 752.05, "step": 18225, "train_speed(iter/s)": 0.393759 }, { "acc": 0.75827107, "epoch": 0.4624563395109391, "grad_norm": 3.234375, "learning_rate": 9.128581649011576e-06, "loss": 0.94043427, "memory(GiB)": 752.05, "step": 18230, "train_speed(iter/s)": 0.393314 }, { "acc": 0.75846548, "epoch": 0.4625831788799767, "grad_norm": 4.0625, "learning_rate": 9.12799002892129e-06, "loss": 0.93788624, "memory(GiB)": 752.05, "step": 18235, "train_speed(iter/s)": 0.39286 }, { "acc": 0.7439229, "epoch": 0.4627100182490142, "grad_norm": 4.0625, "learning_rate": 9.127398227253876e-06, "loss": 1.01978102, "memory(GiB)": 752.05, "step": 18240, "train_speed(iter/s)": 0.39237 }, { "acc": 0.76755414, "epoch": 0.46283685761805177, "grad_norm": 3.609375, "learning_rate": 9.126806244035363e-06, "loss": 0.88763638, "memory(GiB)": 752.05, "step": 18245, "train_speed(iter/s)": 0.391941 }, { "acc": 0.76709352, "epoch": 0.46296369698708933, "grad_norm": 3.75, "learning_rate": 9.126214079291792e-06, "loss": 0.8984107, "memory(GiB)": 752.05, "step": 18250, "train_speed(iter/s)": 0.391522 }, { "acc": 0.75322313, "epoch": 0.4630905363561269, "grad_norm": 4.21875, "learning_rate": 9.125621733049211e-06, "loss": 0.95958595, "memory(GiB)": 752.05, "step": 18255, "train_speed(iter/s)": 0.391103 }, { "acc": 0.76225138, "epoch": 0.46321737572516447, "grad_norm": 3.96875, "learning_rate": 9.125029205333676e-06, "loss": 0.91678982, "memory(GiB)": 752.05, "step": 18260, "train_speed(iter/s)": 0.390612 }, { "acc": 0.75613551, "epoch": 0.46334421509420204, "grad_norm": 3.421875, "learning_rate": 9.124436496171249e-06, "loss": 0.90989695, "memory(GiB)": 752.05, "step": 18265, "train_speed(iter/s)": 0.390154 }, { "acc": 0.75791764, "epoch": 0.46347105446323955, "grad_norm": 3.34375, "learning_rate": 9.123843605588001e-06, "loss": 0.90727482, "memory(GiB)": 752.05, "step": 18270, "train_speed(iter/s)": 0.389592 }, { "acc": 0.77633734, "epoch": 0.4635978938322771, "grad_norm": 3.265625, "learning_rate": 9.123250533610012e-06, "loss": 0.87259874, "memory(GiB)": 752.05, "step": 18275, "train_speed(iter/s)": 0.389137 }, { "acc": 0.77885718, "epoch": 0.4637247332013147, "grad_norm": 3.9375, "learning_rate": 9.122657280263369e-06, "loss": 0.8914382, "memory(GiB)": 752.05, "step": 18280, "train_speed(iter/s)": 0.388662 }, { "acc": 0.75863976, "epoch": 0.46385157257035226, "grad_norm": 3.484375, "learning_rate": 9.12206384557417e-06, "loss": 0.93139791, "memory(GiB)": 752.05, "step": 18285, "train_speed(iter/s)": 0.388235 }, { "acc": 0.78384347, "epoch": 0.4639784119393898, "grad_norm": 4.125, "learning_rate": 9.121470229568516e-06, "loss": 0.86918068, "memory(GiB)": 752.05, "step": 18290, "train_speed(iter/s)": 0.387828 }, { "acc": 0.7658628, "epoch": 0.4641052513084274, "grad_norm": 3.65625, "learning_rate": 9.120876432272516e-06, "loss": 0.88764372, "memory(GiB)": 752.05, "step": 18295, "train_speed(iter/s)": 0.387427 }, { "acc": 0.75976629, "epoch": 0.4642320906774649, "grad_norm": 3.375, "learning_rate": 9.120282453712294e-06, "loss": 0.90153055, "memory(GiB)": 752.05, "step": 18300, "train_speed(iter/s)": 0.38695 }, { "acc": 0.76304417, "epoch": 0.46435893004650247, "grad_norm": 3.71875, "learning_rate": 9.119688293913974e-06, "loss": 0.89133644, "memory(GiB)": 752.05, "step": 18305, "train_speed(iter/s)": 0.386516 }, { "acc": 0.76091161, "epoch": 0.46448576941554004, "grad_norm": 3.484375, "learning_rate": 9.119093952903693e-06, "loss": 0.87089787, "memory(GiB)": 752.05, "step": 18310, "train_speed(iter/s)": 0.386135 }, { "acc": 0.76008196, "epoch": 0.4646126087845776, "grad_norm": 3.625, "learning_rate": 9.11849943070759e-06, "loss": 0.92562056, "memory(GiB)": 752.05, "step": 18315, "train_speed(iter/s)": 0.385736 }, { "acc": 0.77184606, "epoch": 0.4647394481536152, "grad_norm": 3.890625, "learning_rate": 9.117904727351822e-06, "loss": 0.86342478, "memory(GiB)": 752.05, "step": 18320, "train_speed(iter/s)": 0.385318 }, { "acc": 0.77242532, "epoch": 0.46486628752265274, "grad_norm": 4.0, "learning_rate": 9.117309842862545e-06, "loss": 0.89669075, "memory(GiB)": 752.05, "step": 18325, "train_speed(iter/s)": 0.384871 }, { "acc": 0.7543664, "epoch": 0.46499312689169026, "grad_norm": 3.46875, "learning_rate": 9.116714777265927e-06, "loss": 0.87850609, "memory(GiB)": 752.05, "step": 18330, "train_speed(iter/s)": 0.384438 }, { "acc": 0.77043228, "epoch": 0.4651199662607278, "grad_norm": 3.453125, "learning_rate": 9.116119530588143e-06, "loss": 0.91440382, "memory(GiB)": 752.05, "step": 18335, "train_speed(iter/s)": 0.383979 }, { "acc": 0.76699524, "epoch": 0.4652468056297654, "grad_norm": 3.125, "learning_rate": 9.115524102855375e-06, "loss": 0.89501047, "memory(GiB)": 752.05, "step": 18340, "train_speed(iter/s)": 0.383536 }, { "acc": 0.74904065, "epoch": 0.46537364499880296, "grad_norm": 3.953125, "learning_rate": 9.114928494093815e-06, "loss": 0.96859045, "memory(GiB)": 752.05, "step": 18345, "train_speed(iter/s)": 0.383088 }, { "acc": 0.76631103, "epoch": 0.46550048436784053, "grad_norm": 3.71875, "learning_rate": 9.11433270432966e-06, "loss": 0.9325654, "memory(GiB)": 752.05, "step": 18350, "train_speed(iter/s)": 0.382625 }, { "acc": 0.76677165, "epoch": 0.4656273237368781, "grad_norm": 3.828125, "learning_rate": 9.11373673358912e-06, "loss": 0.90832233, "memory(GiB)": 752.05, "step": 18355, "train_speed(iter/s)": 0.382174 }, { "acc": 0.77107878, "epoch": 0.4657541631059156, "grad_norm": 3.4375, "learning_rate": 9.113140581898408e-06, "loss": 0.88276272, "memory(GiB)": 752.05, "step": 18360, "train_speed(iter/s)": 0.381754 }, { "acc": 0.77705617, "epoch": 0.4658810024749532, "grad_norm": 3.640625, "learning_rate": 9.112544249283746e-06, "loss": 0.88200588, "memory(GiB)": 752.05, "step": 18365, "train_speed(iter/s)": 0.381375 }, { "acc": 0.7566431, "epoch": 0.46600784184399074, "grad_norm": 2.984375, "learning_rate": 9.111947735771367e-06, "loss": 0.92948313, "memory(GiB)": 752.05, "step": 18370, "train_speed(iter/s)": 0.380944 }, { "acc": 0.76796522, "epoch": 0.4661346812130283, "grad_norm": 3.5, "learning_rate": 9.111351041387509e-06, "loss": 0.91123734, "memory(GiB)": 752.05, "step": 18375, "train_speed(iter/s)": 0.380535 }, { "acc": 0.75986252, "epoch": 0.4662615205820659, "grad_norm": 3.546875, "learning_rate": 9.110754166158416e-06, "loss": 0.87484665, "memory(GiB)": 752.05, "step": 18380, "train_speed(iter/s)": 0.380146 }, { "acc": 0.77321358, "epoch": 0.46638835995110345, "grad_norm": 3.8125, "learning_rate": 9.110157110110346e-06, "loss": 0.91160898, "memory(GiB)": 752.05, "step": 18385, "train_speed(iter/s)": 0.379685 }, { "acc": 0.74203081, "epoch": 0.46651519932014096, "grad_norm": 3.671875, "learning_rate": 9.109559873269562e-06, "loss": 0.98337641, "memory(GiB)": 752.05, "step": 18390, "train_speed(iter/s)": 0.379199 }, { "acc": 0.76531701, "epoch": 0.46664203868917853, "grad_norm": 3.5625, "learning_rate": 9.108962455662331e-06, "loss": 0.9109314, "memory(GiB)": 752.05, "step": 18395, "train_speed(iter/s)": 0.378812 }, { "acc": 0.75112562, "epoch": 0.4667688780582161, "grad_norm": 3.671875, "learning_rate": 9.108364857314935e-06, "loss": 0.94711857, "memory(GiB)": 752.05, "step": 18400, "train_speed(iter/s)": 0.378397 }, { "acc": 0.76817579, "epoch": 0.46689571742725366, "grad_norm": 4.03125, "learning_rate": 9.107767078253658e-06, "loss": 0.93205395, "memory(GiB)": 752.05, "step": 18405, "train_speed(iter/s)": 0.377961 }, { "acc": 0.75629506, "epoch": 0.46702255679629123, "grad_norm": 3.59375, "learning_rate": 9.107169118504795e-06, "loss": 0.93488865, "memory(GiB)": 752.05, "step": 18410, "train_speed(iter/s)": 0.377506 }, { "acc": 0.76373625, "epoch": 0.4671493961653288, "grad_norm": 3.75, "learning_rate": 9.10657097809465e-06, "loss": 0.90864096, "memory(GiB)": 752.05, "step": 18415, "train_speed(iter/s)": 0.377122 }, { "acc": 0.74944501, "epoch": 0.4672762355343663, "grad_norm": 3.421875, "learning_rate": 9.105972657049531e-06, "loss": 0.9478302, "memory(GiB)": 752.05, "step": 18420, "train_speed(iter/s)": 0.376735 }, { "acc": 0.75659013, "epoch": 0.4674030749034039, "grad_norm": 3.421875, "learning_rate": 9.105374155395758e-06, "loss": 0.96077785, "memory(GiB)": 752.05, "step": 18425, "train_speed(iter/s)": 0.376252 }, { "acc": 0.76091781, "epoch": 0.46752991427244145, "grad_norm": 3.515625, "learning_rate": 9.104775473159656e-06, "loss": 0.94753313, "memory(GiB)": 752.05, "step": 18430, "train_speed(iter/s)": 0.375848 }, { "acc": 0.76809669, "epoch": 0.467656753641479, "grad_norm": 3.125, "learning_rate": 9.10417661036756e-06, "loss": 0.86609097, "memory(GiB)": 752.05, "step": 18435, "train_speed(iter/s)": 0.375435 }, { "acc": 0.76447716, "epoch": 0.4677835930105166, "grad_norm": 3.765625, "learning_rate": 9.103577567045811e-06, "loss": 0.91873322, "memory(GiB)": 752.05, "step": 18440, "train_speed(iter/s)": 0.375029 }, { "acc": 0.74686012, "epoch": 0.46791043237955415, "grad_norm": 3.703125, "learning_rate": 9.10297834322076e-06, "loss": 0.98644085, "memory(GiB)": 752.05, "step": 18445, "train_speed(iter/s)": 0.37457 }, { "acc": 0.76920199, "epoch": 0.46803727174859167, "grad_norm": 3.0625, "learning_rate": 9.102378938918764e-06, "loss": 0.85529518, "memory(GiB)": 752.05, "step": 18450, "train_speed(iter/s)": 0.374117 }, { "acc": 0.76376853, "epoch": 0.46816411111762923, "grad_norm": 3.421875, "learning_rate": 9.10177935416619e-06, "loss": 0.90436602, "memory(GiB)": 752.05, "step": 18455, "train_speed(iter/s)": 0.373683 }, { "acc": 0.76439137, "epoch": 0.4682909504866668, "grad_norm": 3.375, "learning_rate": 9.101179588989411e-06, "loss": 0.86825256, "memory(GiB)": 752.05, "step": 18460, "train_speed(iter/s)": 0.373299 }, { "acc": 0.77398577, "epoch": 0.46841778985570437, "grad_norm": 4.15625, "learning_rate": 9.100579643414809e-06, "loss": 0.91067753, "memory(GiB)": 752.05, "step": 18465, "train_speed(iter/s)": 0.372864 }, { "acc": 0.76327491, "epoch": 0.46854462922474194, "grad_norm": 3.421875, "learning_rate": 9.099979517468774e-06, "loss": 0.90984011, "memory(GiB)": 752.05, "step": 18470, "train_speed(iter/s)": 0.372495 }, { "acc": 0.7803102, "epoch": 0.4686714685937795, "grad_norm": 3.140625, "learning_rate": 9.099379211177703e-06, "loss": 0.85796394, "memory(GiB)": 752.05, "step": 18475, "train_speed(iter/s)": 0.372074 }, { "acc": 0.77621021, "epoch": 0.468798307962817, "grad_norm": 3.546875, "learning_rate": 9.098778724568002e-06, "loss": 0.85149746, "memory(GiB)": 752.05, "step": 18480, "train_speed(iter/s)": 0.371651 }, { "acc": 0.76596746, "epoch": 0.4689251473318546, "grad_norm": 3.359375, "learning_rate": 9.098178057666085e-06, "loss": 0.92046986, "memory(GiB)": 752.05, "step": 18485, "train_speed(iter/s)": 0.371213 }, { "acc": 0.7692893, "epoch": 0.46905198670089215, "grad_norm": 3.6875, "learning_rate": 9.097577210498373e-06, "loss": 0.92027874, "memory(GiB)": 752.05, "step": 18490, "train_speed(iter/s)": 0.370826 }, { "acc": 0.76070681, "epoch": 0.4691788260699297, "grad_norm": 3.796875, "learning_rate": 9.096976183091295e-06, "loss": 0.89807005, "memory(GiB)": 752.05, "step": 18495, "train_speed(iter/s)": 0.370456 }, { "acc": 0.75551648, "epoch": 0.4693056654389673, "grad_norm": 3.171875, "learning_rate": 9.096374975471288e-06, "loss": 0.91731339, "memory(GiB)": 752.05, "step": 18500, "train_speed(iter/s)": 0.370059 }, { "epoch": 0.4693056654389673, "eval_acc": 0.7518161389360386, "eval_loss": 0.885528028011322, "eval_runtime": 1148.8961, "eval_samples_per_second": 5.544, "eval_steps_per_second": 5.544, "step": 18500 }, { "acc": 0.75943079, "epoch": 0.46943250480800486, "grad_norm": 3.28125, "learning_rate": 9.095773587664797e-06, "loss": 0.93321238, "memory(GiB)": 752.05, "step": 18505, "train_speed(iter/s)": 0.356328 }, { "acc": 0.75648789, "epoch": 0.46955934417704237, "grad_norm": 3.09375, "learning_rate": 9.095172019698277e-06, "loss": 0.94815493, "memory(GiB)": 752.05, "step": 18510, "train_speed(iter/s)": 0.355994 }, { "acc": 0.74730415, "epoch": 0.46968618354607994, "grad_norm": 3.765625, "learning_rate": 9.094570271598188e-06, "loss": 0.96888685, "memory(GiB)": 752.05, "step": 18515, "train_speed(iter/s)": 0.355611 }, { "acc": 0.75863914, "epoch": 0.4698130229151175, "grad_norm": 3.0, "learning_rate": 9.093968343390998e-06, "loss": 0.87175932, "memory(GiB)": 752.05, "step": 18520, "train_speed(iter/s)": 0.355284 }, { "acc": 0.77699184, "epoch": 0.4699398622841551, "grad_norm": 3.296875, "learning_rate": 9.093366235103185e-06, "loss": 0.86427221, "memory(GiB)": 752.05, "step": 18525, "train_speed(iter/s)": 0.354955 }, { "acc": 0.76581459, "epoch": 0.47006670165319264, "grad_norm": 3.453125, "learning_rate": 9.092763946761233e-06, "loss": 0.93524504, "memory(GiB)": 752.05, "step": 18530, "train_speed(iter/s)": 0.35456 }, { "acc": 0.77228541, "epoch": 0.4701935410222302, "grad_norm": 3.328125, "learning_rate": 9.092161478391636e-06, "loss": 0.88279657, "memory(GiB)": 752.05, "step": 18535, "train_speed(iter/s)": 0.354209 }, { "acc": 0.76520028, "epoch": 0.4703203803912677, "grad_norm": 3.1875, "learning_rate": 9.091558830020895e-06, "loss": 0.89703951, "memory(GiB)": 752.05, "step": 18540, "train_speed(iter/s)": 0.353878 }, { "acc": 0.74238658, "epoch": 0.4704472197603053, "grad_norm": 3.046875, "learning_rate": 9.090956001675515e-06, "loss": 0.9969244, "memory(GiB)": 752.05, "step": 18545, "train_speed(iter/s)": 0.353558 }, { "acc": 0.75861487, "epoch": 0.47057405912934286, "grad_norm": 3.328125, "learning_rate": 9.090352993382016e-06, "loss": 0.9286828, "memory(GiB)": 752.05, "step": 18550, "train_speed(iter/s)": 0.353174 }, { "acc": 0.75719237, "epoch": 0.4707008984983804, "grad_norm": 3.296875, "learning_rate": 9.089749805166924e-06, "loss": 0.89000244, "memory(GiB)": 752.05, "step": 18555, "train_speed(iter/s)": 0.35281 }, { "acc": 0.76047273, "epoch": 0.470827737867418, "grad_norm": 3.03125, "learning_rate": 9.089146437056766e-06, "loss": 0.88145943, "memory(GiB)": 752.05, "step": 18560, "train_speed(iter/s)": 0.352431 }, { "acc": 0.76164508, "epoch": 0.47095457723645556, "grad_norm": 3.515625, "learning_rate": 9.088542889078087e-06, "loss": 0.90016794, "memory(GiB)": 752.05, "step": 18565, "train_speed(iter/s)": 0.352075 }, { "acc": 0.73813891, "epoch": 0.4710814166054931, "grad_norm": 3.65625, "learning_rate": 9.087939161257431e-06, "loss": 0.99316797, "memory(GiB)": 752.05, "step": 18570, "train_speed(iter/s)": 0.351763 }, { "acc": 0.7619267, "epoch": 0.47120825597453064, "grad_norm": 3.09375, "learning_rate": 9.087335253621357e-06, "loss": 0.96362839, "memory(GiB)": 752.05, "step": 18575, "train_speed(iter/s)": 0.351419 }, { "acc": 0.75133634, "epoch": 0.4713350953435682, "grad_norm": 3.65625, "learning_rate": 9.08673116619643e-06, "loss": 0.9521081, "memory(GiB)": 752.05, "step": 18580, "train_speed(iter/s)": 0.351041 }, { "acc": 0.75080075, "epoch": 0.4714619347126058, "grad_norm": 3.375, "learning_rate": 9.086126899009217e-06, "loss": 0.9239749, "memory(GiB)": 752.05, "step": 18585, "train_speed(iter/s)": 0.350729 }, { "acc": 0.75589252, "epoch": 0.47158877408164335, "grad_norm": 3.140625, "learning_rate": 9.085522452086304e-06, "loss": 0.9401247, "memory(GiB)": 752.05, "step": 18590, "train_speed(iter/s)": 0.350361 }, { "acc": 0.76315026, "epoch": 0.4717156134506809, "grad_norm": 3.40625, "learning_rate": 9.084917825454273e-06, "loss": 0.88839712, "memory(GiB)": 752.05, "step": 18595, "train_speed(iter/s)": 0.350023 }, { "acc": 0.75649185, "epoch": 0.4718424528197184, "grad_norm": 3.375, "learning_rate": 9.084313019139725e-06, "loss": 0.92411423, "memory(GiB)": 752.05, "step": 18600, "train_speed(iter/s)": 0.349585 }, { "acc": 0.75802293, "epoch": 0.471969292188756, "grad_norm": 3.140625, "learning_rate": 9.083708033169259e-06, "loss": 0.91923666, "memory(GiB)": 752.05, "step": 18605, "train_speed(iter/s)": 0.349216 }, { "acc": 0.76485567, "epoch": 0.47209613155779356, "grad_norm": 3.5625, "learning_rate": 9.083102867569488e-06, "loss": 0.91684628, "memory(GiB)": 752.05, "step": 18610, "train_speed(iter/s)": 0.348849 }, { "acc": 0.76109505, "epoch": 0.47222297092683113, "grad_norm": 3.265625, "learning_rate": 9.082497522367031e-06, "loss": 0.91719484, "memory(GiB)": 752.05, "step": 18615, "train_speed(iter/s)": 0.348477 }, { "acc": 0.76764627, "epoch": 0.4723498102958687, "grad_norm": 3.265625, "learning_rate": 9.081891997588516e-06, "loss": 0.90411491, "memory(GiB)": 752.05, "step": 18620, "train_speed(iter/s)": 0.348129 }, { "acc": 0.74910469, "epoch": 0.47247664966490627, "grad_norm": 4.15625, "learning_rate": 9.081286293260578e-06, "loss": 0.92590694, "memory(GiB)": 752.05, "step": 18625, "train_speed(iter/s)": 0.347839 }, { "acc": 0.76923299, "epoch": 0.4726034890339438, "grad_norm": 3.765625, "learning_rate": 9.080680409409861e-06, "loss": 0.86761646, "memory(GiB)": 752.05, "step": 18630, "train_speed(iter/s)": 0.347521 }, { "acc": 0.75855751, "epoch": 0.47273032840298135, "grad_norm": 3.515625, "learning_rate": 9.080074346063012e-06, "loss": 0.93562899, "memory(GiB)": 752.05, "step": 18635, "train_speed(iter/s)": 0.347173 }, { "acc": 0.76587563, "epoch": 0.4728571677720189, "grad_norm": 3.4375, "learning_rate": 9.079468103246695e-06, "loss": 0.88370314, "memory(GiB)": 752.05, "step": 18640, "train_speed(iter/s)": 0.346857 }, { "acc": 0.75859256, "epoch": 0.4729840071410565, "grad_norm": 3.484375, "learning_rate": 9.078861680987573e-06, "loss": 0.94071398, "memory(GiB)": 752.05, "step": 18645, "train_speed(iter/s)": 0.346517 }, { "acc": 0.77373996, "epoch": 0.47311084651009405, "grad_norm": 3.40625, "learning_rate": 9.07825507931232e-06, "loss": 0.84203711, "memory(GiB)": 752.05, "step": 18650, "train_speed(iter/s)": 0.346205 }, { "acc": 0.76778622, "epoch": 0.4732376858791316, "grad_norm": 3.40625, "learning_rate": 9.077648298247623e-06, "loss": 0.87688951, "memory(GiB)": 752.05, "step": 18655, "train_speed(iter/s)": 0.34587 }, { "acc": 0.75635972, "epoch": 0.47336452524816913, "grad_norm": 3.421875, "learning_rate": 9.077041337820168e-06, "loss": 0.92884274, "memory(GiB)": 752.05, "step": 18660, "train_speed(iter/s)": 0.345564 }, { "acc": 0.76804914, "epoch": 0.4734913646172067, "grad_norm": 3.34375, "learning_rate": 9.076434198056657e-06, "loss": 0.88231583, "memory(GiB)": 752.05, "step": 18665, "train_speed(iter/s)": 0.345215 }, { "acc": 0.76319695, "epoch": 0.47361820398624427, "grad_norm": 4.28125, "learning_rate": 9.07582687898379e-06, "loss": 0.90979109, "memory(GiB)": 752.05, "step": 18670, "train_speed(iter/s)": 0.344892 }, { "acc": 0.75672503, "epoch": 0.47374504335528184, "grad_norm": 2.90625, "learning_rate": 9.075219380628288e-06, "loss": 0.92188501, "memory(GiB)": 752.05, "step": 18675, "train_speed(iter/s)": 0.34457 }, { "acc": 0.75891595, "epoch": 0.4738718827243194, "grad_norm": 3.625, "learning_rate": 9.07461170301687e-06, "loss": 0.89790821, "memory(GiB)": 752.05, "step": 18680, "train_speed(iter/s)": 0.344231 }, { "acc": 0.74977427, "epoch": 0.473998722093357, "grad_norm": 4.1875, "learning_rate": 9.074003846176263e-06, "loss": 0.97854576, "memory(GiB)": 752.05, "step": 18685, "train_speed(iter/s)": 0.343926 }, { "acc": 0.76821527, "epoch": 0.4741255614623945, "grad_norm": 3.5625, "learning_rate": 9.07339581013321e-06, "loss": 0.92281885, "memory(GiB)": 752.05, "step": 18690, "train_speed(iter/s)": 0.343603 }, { "acc": 0.75057049, "epoch": 0.47425240083143205, "grad_norm": 3.921875, "learning_rate": 9.072787594914451e-06, "loss": 0.92959766, "memory(GiB)": 752.05, "step": 18695, "train_speed(iter/s)": 0.343289 }, { "acc": 0.76378431, "epoch": 0.4743792402004696, "grad_norm": 3.84375, "learning_rate": 9.072179200546746e-06, "loss": 0.90562897, "memory(GiB)": 752.05, "step": 18700, "train_speed(iter/s)": 0.342949 }, { "acc": 0.75493779, "epoch": 0.4745060795695072, "grad_norm": 4.9375, "learning_rate": 9.07157062705685e-06, "loss": 0.92069988, "memory(GiB)": 752.05, "step": 18705, "train_speed(iter/s)": 0.342588 }, { "acc": 0.75936537, "epoch": 0.47463291893854476, "grad_norm": 3.78125, "learning_rate": 9.070961874471536e-06, "loss": 0.96015282, "memory(GiB)": 752.05, "step": 18710, "train_speed(iter/s)": 0.34228 }, { "acc": 0.7580101, "epoch": 0.4747597583075823, "grad_norm": 3.6875, "learning_rate": 9.07035294281758e-06, "loss": 0.94893417, "memory(GiB)": 752.05, "step": 18715, "train_speed(iter/s)": 0.34195 }, { "acc": 0.77811594, "epoch": 0.47488659767661984, "grad_norm": 4.40625, "learning_rate": 9.069743832121767e-06, "loss": 0.90573177, "memory(GiB)": 752.05, "step": 18720, "train_speed(iter/s)": 0.341639 }, { "acc": 0.7753767, "epoch": 0.4750134370456574, "grad_norm": 3.703125, "learning_rate": 9.069134542410891e-06, "loss": 0.90831413, "memory(GiB)": 752.05, "step": 18725, "train_speed(iter/s)": 0.341336 }, { "acc": 0.75423546, "epoch": 0.475140276414695, "grad_norm": 3.734375, "learning_rate": 9.06852507371175e-06, "loss": 1.01450462, "memory(GiB)": 752.05, "step": 18730, "train_speed(iter/s)": 0.341061 }, { "acc": 0.75868502, "epoch": 0.47526711578373254, "grad_norm": 3.09375, "learning_rate": 9.067915426051155e-06, "loss": 0.94321003, "memory(GiB)": 752.05, "step": 18735, "train_speed(iter/s)": 0.34071 }, { "acc": 0.76866117, "epoch": 0.4753939551527701, "grad_norm": 3.3125, "learning_rate": 9.06730559945592e-06, "loss": 0.91162472, "memory(GiB)": 752.05, "step": 18740, "train_speed(iter/s)": 0.340398 }, { "acc": 0.76196709, "epoch": 0.4755207945218077, "grad_norm": 3.765625, "learning_rate": 9.066695593952873e-06, "loss": 0.9131588, "memory(GiB)": 752.05, "step": 18745, "train_speed(iter/s)": 0.340097 }, { "acc": 0.77217407, "epoch": 0.4756476338908452, "grad_norm": 3.953125, "learning_rate": 9.066085409568843e-06, "loss": 0.8684063, "memory(GiB)": 752.05, "step": 18750, "train_speed(iter/s)": 0.339774 }, { "acc": 0.75793118, "epoch": 0.47577447325988276, "grad_norm": 3.515625, "learning_rate": 9.065475046330669e-06, "loss": 0.90136223, "memory(GiB)": 752.05, "step": 18755, "train_speed(iter/s)": 0.339435 }, { "acc": 0.76047454, "epoch": 0.4759013126289203, "grad_norm": 3.328125, "learning_rate": 9.064864504265205e-06, "loss": 0.9417758, "memory(GiB)": 752.05, "step": 18760, "train_speed(iter/s)": 0.339121 }, { "acc": 0.76916394, "epoch": 0.4760281519979579, "grad_norm": 3.0625, "learning_rate": 9.064253783399301e-06, "loss": 0.88051748, "memory(GiB)": 752.05, "step": 18765, "train_speed(iter/s)": 0.338784 }, { "acc": 0.75297146, "epoch": 0.47615499136699546, "grad_norm": 3.4375, "learning_rate": 9.063642883759822e-06, "loss": 0.97100687, "memory(GiB)": 752.05, "step": 18770, "train_speed(iter/s)": 0.338461 }, { "acc": 0.76540685, "epoch": 0.47628183073603303, "grad_norm": 3.484375, "learning_rate": 9.063031805373642e-06, "loss": 0.91832705, "memory(GiB)": 752.05, "step": 18775, "train_speed(iter/s)": 0.338158 }, { "acc": 0.76361575, "epoch": 0.47640867010507054, "grad_norm": 3.53125, "learning_rate": 9.062420548267636e-06, "loss": 0.91392508, "memory(GiB)": 752.05, "step": 18780, "train_speed(iter/s)": 0.337853 }, { "acc": 0.78321738, "epoch": 0.4765355094741081, "grad_norm": 3.421875, "learning_rate": 9.061809112468696e-06, "loss": 0.82005978, "memory(GiB)": 752.05, "step": 18785, "train_speed(iter/s)": 0.337557 }, { "acc": 0.76162467, "epoch": 0.4766623488431457, "grad_norm": 3.546875, "learning_rate": 9.061197498003715e-06, "loss": 0.97440453, "memory(GiB)": 752.05, "step": 18790, "train_speed(iter/s)": 0.337202 }, { "acc": 0.75697799, "epoch": 0.47678918821218325, "grad_norm": 3.890625, "learning_rate": 9.060585704899594e-06, "loss": 0.94905987, "memory(GiB)": 752.05, "step": 18795, "train_speed(iter/s)": 0.336803 }, { "acc": 0.76219277, "epoch": 0.4769160275812208, "grad_norm": 3.65625, "learning_rate": 9.059973733183248e-06, "loss": 0.91143188, "memory(GiB)": 752.05, "step": 18800, "train_speed(iter/s)": 0.336512 }, { "acc": 0.77095184, "epoch": 0.4770428669502584, "grad_norm": 3.9375, "learning_rate": 9.05936158288159e-06, "loss": 0.89743404, "memory(GiB)": 752.05, "step": 18805, "train_speed(iter/s)": 0.336151 }, { "acc": 0.76986561, "epoch": 0.4771697063192959, "grad_norm": 3.109375, "learning_rate": 9.058749254021553e-06, "loss": 0.87820578, "memory(GiB)": 752.05, "step": 18810, "train_speed(iter/s)": 0.335791 }, { "acc": 0.77993484, "epoch": 0.47729654568833346, "grad_norm": 3.84375, "learning_rate": 9.058136746630067e-06, "loss": 0.84426422, "memory(GiB)": 752.05, "step": 18815, "train_speed(iter/s)": 0.335487 }, { "acc": 0.75907431, "epoch": 0.47742338505737103, "grad_norm": 3.4375, "learning_rate": 9.057524060734075e-06, "loss": 0.94171324, "memory(GiB)": 752.05, "step": 18820, "train_speed(iter/s)": 0.335145 }, { "acc": 0.76221552, "epoch": 0.4775502244264086, "grad_norm": 3.546875, "learning_rate": 9.056911196360528e-06, "loss": 0.87566919, "memory(GiB)": 752.05, "step": 18825, "train_speed(iter/s)": 0.334859 }, { "acc": 0.76527829, "epoch": 0.47767706379544617, "grad_norm": 3.828125, "learning_rate": 9.056298153536383e-06, "loss": 0.92707701, "memory(GiB)": 752.05, "step": 18830, "train_speed(iter/s)": 0.33454 }, { "acc": 0.7537652, "epoch": 0.47780390316448373, "grad_norm": 2.828125, "learning_rate": 9.055684932288606e-06, "loss": 0.92656498, "memory(GiB)": 752.05, "step": 18835, "train_speed(iter/s)": 0.334175 }, { "acc": 0.74439683, "epoch": 0.47793074253352125, "grad_norm": 3.875, "learning_rate": 9.055071532644173e-06, "loss": 0.92696419, "memory(GiB)": 752.05, "step": 18840, "train_speed(iter/s)": 0.333861 }, { "acc": 0.76154857, "epoch": 0.4780575819025588, "grad_norm": 3.5625, "learning_rate": 9.054457954630062e-06, "loss": 0.91818457, "memory(GiB)": 752.05, "step": 18845, "train_speed(iter/s)": 0.333577 }, { "acc": 0.75529151, "epoch": 0.4781844212715964, "grad_norm": 3.640625, "learning_rate": 9.053844198273266e-06, "loss": 0.96650562, "memory(GiB)": 752.05, "step": 18850, "train_speed(iter/s)": 0.333268 }, { "acc": 0.77015986, "epoch": 0.47831126064063395, "grad_norm": 3.671875, "learning_rate": 9.053230263600779e-06, "loss": 0.9399683, "memory(GiB)": 752.05, "step": 18855, "train_speed(iter/s)": 0.332982 }, { "acc": 0.77733016, "epoch": 0.4784381000096715, "grad_norm": 3.625, "learning_rate": 9.052616150639606e-06, "loss": 0.90161972, "memory(GiB)": 752.05, "step": 18860, "train_speed(iter/s)": 0.33268 }, { "acc": 0.76067619, "epoch": 0.4785649393787091, "grad_norm": 2.828125, "learning_rate": 9.05200185941676e-06, "loss": 0.92838211, "memory(GiB)": 752.05, "step": 18865, "train_speed(iter/s)": 0.332396 }, { "acc": 0.76802731, "epoch": 0.4786917787477466, "grad_norm": 2.96875, "learning_rate": 9.051387389959265e-06, "loss": 0.91074438, "memory(GiB)": 752.05, "step": 18870, "train_speed(iter/s)": 0.332066 }, { "acc": 0.78215084, "epoch": 0.47881861811678417, "grad_norm": 3.3125, "learning_rate": 9.050772742294147e-06, "loss": 0.84282007, "memory(GiB)": 752.05, "step": 18875, "train_speed(iter/s)": 0.331776 }, { "acc": 0.76386509, "epoch": 0.47894545748582174, "grad_norm": 2.78125, "learning_rate": 9.050157916448443e-06, "loss": 0.90172815, "memory(GiB)": 752.05, "step": 18880, "train_speed(iter/s)": 0.331487 }, { "acc": 0.76856966, "epoch": 0.4790722968548593, "grad_norm": 3.90625, "learning_rate": 9.049542912449195e-06, "loss": 0.92278528, "memory(GiB)": 752.05, "step": 18885, "train_speed(iter/s)": 0.331185 }, { "acc": 0.75812006, "epoch": 0.47919913622389687, "grad_norm": 3.765625, "learning_rate": 9.048927730323458e-06, "loss": 0.91811571, "memory(GiB)": 752.05, "step": 18890, "train_speed(iter/s)": 0.330899 }, { "acc": 0.75866537, "epoch": 0.47932597559293444, "grad_norm": 3.25, "learning_rate": 9.04831237009829e-06, "loss": 0.94053097, "memory(GiB)": 752.05, "step": 18895, "train_speed(iter/s)": 0.330584 }, { "acc": 0.7687655, "epoch": 0.47945281496197195, "grad_norm": 3.03125, "learning_rate": 9.047696831800761e-06, "loss": 0.89861822, "memory(GiB)": 752.05, "step": 18900, "train_speed(iter/s)": 0.3303 }, { "acc": 0.763833, "epoch": 0.4795796543310095, "grad_norm": 3.96875, "learning_rate": 9.047081115457945e-06, "loss": 0.89826336, "memory(GiB)": 752.05, "step": 18905, "train_speed(iter/s)": 0.329987 }, { "acc": 0.76297598, "epoch": 0.4797064937000471, "grad_norm": 4.0, "learning_rate": 9.046465221096926e-06, "loss": 0.90061083, "memory(GiB)": 752.05, "step": 18910, "train_speed(iter/s)": 0.329653 }, { "acc": 0.75514503, "epoch": 0.47983333306908466, "grad_norm": 3.1875, "learning_rate": 9.045849148744793e-06, "loss": 0.95099945, "memory(GiB)": 752.05, "step": 18915, "train_speed(iter/s)": 0.329375 }, { "acc": 0.77332048, "epoch": 0.4799601724381222, "grad_norm": 2.859375, "learning_rate": 9.045232898428647e-06, "loss": 0.86488695, "memory(GiB)": 752.05, "step": 18920, "train_speed(iter/s)": 0.329081 }, { "acc": 0.74907169, "epoch": 0.4800870118071598, "grad_norm": 3.765625, "learning_rate": 9.044616470175594e-06, "loss": 0.98441687, "memory(GiB)": 752.05, "step": 18925, "train_speed(iter/s)": 0.328801 }, { "acc": 0.76330814, "epoch": 0.4802138511761973, "grad_norm": 4.78125, "learning_rate": 9.043999864012752e-06, "loss": 0.90698185, "memory(GiB)": 752.05, "step": 18930, "train_speed(iter/s)": 0.328505 }, { "acc": 0.77300735, "epoch": 0.48034069054523487, "grad_norm": 3.078125, "learning_rate": 9.043383079967238e-06, "loss": 0.9151022, "memory(GiB)": 752.05, "step": 18935, "train_speed(iter/s)": 0.328213 }, { "acc": 0.76375561, "epoch": 0.48046752991427244, "grad_norm": 4.0, "learning_rate": 9.042766118066187e-06, "loss": 0.94293537, "memory(GiB)": 752.05, "step": 18940, "train_speed(iter/s)": 0.32789 }, { "acc": 0.75589995, "epoch": 0.48059436928331, "grad_norm": 3.4375, "learning_rate": 9.042148978336736e-06, "loss": 0.91212549, "memory(GiB)": 752.05, "step": 18945, "train_speed(iter/s)": 0.327598 }, { "acc": 0.7620153, "epoch": 0.4807212086523476, "grad_norm": 4.125, "learning_rate": 9.04153166080603e-06, "loss": 0.93116465, "memory(GiB)": 752.05, "step": 18950, "train_speed(iter/s)": 0.32725 }, { "acc": 0.75926404, "epoch": 0.48084804802138514, "grad_norm": 4.03125, "learning_rate": 9.040914165501222e-06, "loss": 0.88928413, "memory(GiB)": 752.05, "step": 18955, "train_speed(iter/s)": 0.327002 }, { "acc": 0.76445217, "epoch": 0.48097488739042266, "grad_norm": 3.421875, "learning_rate": 9.040296492449476e-06, "loss": 0.92109528, "memory(GiB)": 752.05, "step": 18960, "train_speed(iter/s)": 0.32669 }, { "acc": 0.76975346, "epoch": 0.4811017267594602, "grad_norm": 3.375, "learning_rate": 9.039678641677962e-06, "loss": 0.91812391, "memory(GiB)": 752.05, "step": 18965, "train_speed(iter/s)": 0.326371 }, { "acc": 0.76707649, "epoch": 0.4812285661284978, "grad_norm": 3.125, "learning_rate": 9.039060613213854e-06, "loss": 0.86414995, "memory(GiB)": 752.05, "step": 18970, "train_speed(iter/s)": 0.32609 }, { "acc": 0.76294661, "epoch": 0.48135540549753536, "grad_norm": 3.375, "learning_rate": 9.038442407084339e-06, "loss": 0.92148809, "memory(GiB)": 752.05, "step": 18975, "train_speed(iter/s)": 0.325815 }, { "acc": 0.76650429, "epoch": 0.48148224486657293, "grad_norm": 3.8125, "learning_rate": 9.03782402331661e-06, "loss": 0.91796942, "memory(GiB)": 752.05, "step": 18980, "train_speed(iter/s)": 0.325511 }, { "acc": 0.77383585, "epoch": 0.4816090842356105, "grad_norm": 3.15625, "learning_rate": 9.037205461937869e-06, "loss": 0.88870916, "memory(GiB)": 752.05, "step": 18985, "train_speed(iter/s)": 0.325242 }, { "acc": 0.76184292, "epoch": 0.481735923604648, "grad_norm": 3.4375, "learning_rate": 9.03658672297532e-06, "loss": 0.92938366, "memory(GiB)": 752.05, "step": 18990, "train_speed(iter/s)": 0.324926 }, { "acc": 0.75839996, "epoch": 0.4818627629736856, "grad_norm": 3.484375, "learning_rate": 9.035967806456184e-06, "loss": 0.92088842, "memory(GiB)": 752.05, "step": 18995, "train_speed(iter/s)": 0.324641 }, { "acc": 0.76202054, "epoch": 0.48198960234272314, "grad_norm": 2.90625, "learning_rate": 9.035348712407683e-06, "loss": 0.944806, "memory(GiB)": 752.05, "step": 19000, "train_speed(iter/s)": 0.324368 }, { "epoch": 0.48198960234272314, "eval_acc": 0.751959015355873, "eval_loss": 0.8843561410903931, "eval_runtime": 1148.9978, "eval_samples_per_second": 5.544, "eval_steps_per_second": 5.544, "step": 19000 }, { "acc": 0.75506458, "epoch": 0.4821164417117607, "grad_norm": 3.96875, "learning_rate": 9.03472944085705e-06, "loss": 0.9404026, "memory(GiB)": 752.05, "step": 19005, "train_speed(iter/s)": 0.314046 }, { "acc": 0.75747151, "epoch": 0.4822432810807983, "grad_norm": 2.984375, "learning_rate": 9.034109991831525e-06, "loss": 0.92013235, "memory(GiB)": 752.05, "step": 19010, "train_speed(iter/s)": 0.313775 }, { "acc": 0.7712956, "epoch": 0.48237012044983585, "grad_norm": 3.359375, "learning_rate": 9.033490365358355e-06, "loss": 0.84578209, "memory(GiB)": 752.05, "step": 19015, "train_speed(iter/s)": 0.313544 }, { "acc": 0.76728578, "epoch": 0.48249695981887336, "grad_norm": 3.234375, "learning_rate": 9.032870561464796e-06, "loss": 0.88442945, "memory(GiB)": 752.05, "step": 19020, "train_speed(iter/s)": 0.313273 }, { "acc": 0.76733918, "epoch": 0.48262379918791093, "grad_norm": 3.21875, "learning_rate": 9.032250580178109e-06, "loss": 0.94169264, "memory(GiB)": 752.05, "step": 19025, "train_speed(iter/s)": 0.313023 }, { "acc": 0.75739555, "epoch": 0.4827506385569485, "grad_norm": 3.734375, "learning_rate": 9.031630421525566e-06, "loss": 0.93473883, "memory(GiB)": 752.05, "step": 19030, "train_speed(iter/s)": 0.312759 }, { "acc": 0.7464643, "epoch": 0.48287747792598606, "grad_norm": 3.296875, "learning_rate": 9.031010085534447e-06, "loss": 1.00240412, "memory(GiB)": 752.05, "step": 19035, "train_speed(iter/s)": 0.31249 }, { "acc": 0.75599985, "epoch": 0.48300431729502363, "grad_norm": 4.3125, "learning_rate": 9.030389572232036e-06, "loss": 0.89820318, "memory(GiB)": 752.05, "step": 19040, "train_speed(iter/s)": 0.31226 }, { "acc": 0.76645989, "epoch": 0.4831311566640612, "grad_norm": 3.265625, "learning_rate": 9.029768881645633e-06, "loss": 0.89623537, "memory(GiB)": 752.05, "step": 19045, "train_speed(iter/s)": 0.311997 }, { "acc": 0.77887568, "epoch": 0.4832579960330987, "grad_norm": 3.328125, "learning_rate": 9.029148013802535e-06, "loss": 0.86544361, "memory(GiB)": 752.05, "step": 19050, "train_speed(iter/s)": 0.311731 }, { "acc": 0.75888205, "epoch": 0.4833848354021363, "grad_norm": 3.375, "learning_rate": 9.028526968730053e-06, "loss": 0.89077797, "memory(GiB)": 752.05, "step": 19055, "train_speed(iter/s)": 0.311442 }, { "acc": 0.76568279, "epoch": 0.48351167477117385, "grad_norm": 3.796875, "learning_rate": 9.027905746455505e-06, "loss": 0.8971653, "memory(GiB)": 752.05, "step": 19060, "train_speed(iter/s)": 0.311201 }, { "acc": 0.78193688, "epoch": 0.4836385141402114, "grad_norm": 3.078125, "learning_rate": 9.027284347006217e-06, "loss": 0.82662449, "memory(GiB)": 752.05, "step": 19065, "train_speed(iter/s)": 0.310901 }, { "acc": 0.76799178, "epoch": 0.483765353509249, "grad_norm": 4.1875, "learning_rate": 9.026662770409524e-06, "loss": 0.87995186, "memory(GiB)": 752.05, "step": 19070, "train_speed(iter/s)": 0.310663 }, { "acc": 0.74782681, "epoch": 0.48389219287828655, "grad_norm": 3.6875, "learning_rate": 9.026041016692763e-06, "loss": 0.96862907, "memory(GiB)": 752.05, "step": 19075, "train_speed(iter/s)": 0.310386 }, { "acc": 0.76247487, "epoch": 0.48401903224732407, "grad_norm": 3.03125, "learning_rate": 9.025419085883286e-06, "loss": 0.91615458, "memory(GiB)": 752.05, "step": 19080, "train_speed(iter/s)": 0.310147 }, { "acc": 0.76037488, "epoch": 0.48414587161636163, "grad_norm": 3.546875, "learning_rate": 9.02479697800845e-06, "loss": 0.92346725, "memory(GiB)": 752.05, "step": 19085, "train_speed(iter/s)": 0.309901 }, { "acc": 0.75252767, "epoch": 0.4842727109853992, "grad_norm": 3.109375, "learning_rate": 9.02417469309562e-06, "loss": 0.95851431, "memory(GiB)": 752.05, "step": 19090, "train_speed(iter/s)": 0.309659 }, { "acc": 0.7666399, "epoch": 0.48439955035443677, "grad_norm": 2.890625, "learning_rate": 9.023552231172166e-06, "loss": 0.88905544, "memory(GiB)": 752.05, "step": 19095, "train_speed(iter/s)": 0.30942 }, { "acc": 0.75520215, "epoch": 0.48452638972347434, "grad_norm": 3.34375, "learning_rate": 9.022929592265468e-06, "loss": 0.90609207, "memory(GiB)": 752.05, "step": 19100, "train_speed(iter/s)": 0.309154 }, { "acc": 0.75714583, "epoch": 0.4846532290925119, "grad_norm": 3.578125, "learning_rate": 9.022306776402917e-06, "loss": 0.92354565, "memory(GiB)": 752.05, "step": 19105, "train_speed(iter/s)": 0.308892 }, { "acc": 0.75965924, "epoch": 0.4847800684615494, "grad_norm": 3.484375, "learning_rate": 9.021683783611905e-06, "loss": 0.95792732, "memory(GiB)": 752.05, "step": 19110, "train_speed(iter/s)": 0.308649 }, { "acc": 0.78270235, "epoch": 0.484906907830587, "grad_norm": 3.421875, "learning_rate": 9.021060613919837e-06, "loss": 0.87471943, "memory(GiB)": 752.05, "step": 19115, "train_speed(iter/s)": 0.308402 }, { "acc": 0.7659615, "epoch": 0.48503374719962455, "grad_norm": 3.046875, "learning_rate": 9.020437267354126e-06, "loss": 0.86610289, "memory(GiB)": 752.05, "step": 19120, "train_speed(iter/s)": 0.308167 }, { "acc": 0.76019654, "epoch": 0.4851605865686621, "grad_norm": 3.40625, "learning_rate": 9.019813743942189e-06, "loss": 0.91786242, "memory(GiB)": 752.05, "step": 19125, "train_speed(iter/s)": 0.307916 }, { "acc": 0.77372308, "epoch": 0.4852874259376997, "grad_norm": 2.859375, "learning_rate": 9.019190043711451e-06, "loss": 0.88124514, "memory(GiB)": 752.05, "step": 19130, "train_speed(iter/s)": 0.307641 }, { "acc": 0.76663756, "epoch": 0.48541426530673726, "grad_norm": 3.46875, "learning_rate": 9.018566166689352e-06, "loss": 0.89273739, "memory(GiB)": 752.05, "step": 19135, "train_speed(iter/s)": 0.30742 }, { "acc": 0.75598516, "epoch": 0.48554110467577477, "grad_norm": 3.15625, "learning_rate": 9.017942112903331e-06, "loss": 0.93708315, "memory(GiB)": 752.05, "step": 19140, "train_speed(iter/s)": 0.307169 }, { "acc": 0.75986929, "epoch": 0.48566794404481234, "grad_norm": 4.125, "learning_rate": 9.017317882380839e-06, "loss": 0.91198092, "memory(GiB)": 752.05, "step": 19145, "train_speed(iter/s)": 0.306931 }, { "acc": 0.75827265, "epoch": 0.4857947834138499, "grad_norm": 4.3125, "learning_rate": 9.016693475149332e-06, "loss": 0.90601072, "memory(GiB)": 752.05, "step": 19150, "train_speed(iter/s)": 0.30668 }, { "acc": 0.77724938, "epoch": 0.4859216227828875, "grad_norm": 3.828125, "learning_rate": 9.016068891236275e-06, "loss": 0.91883383, "memory(GiB)": 752.05, "step": 19155, "train_speed(iter/s)": 0.306405 }, { "acc": 0.76075697, "epoch": 0.48604846215192504, "grad_norm": 3.765625, "learning_rate": 9.015444130669146e-06, "loss": 0.92746563, "memory(GiB)": 752.05, "step": 19160, "train_speed(iter/s)": 0.306194 }, { "acc": 0.76405382, "epoch": 0.4861753015209626, "grad_norm": 3.203125, "learning_rate": 9.014819193475424e-06, "loss": 0.89346151, "memory(GiB)": 752.05, "step": 19165, "train_speed(iter/s)": 0.305983 }, { "acc": 0.75516634, "epoch": 0.4863021408900001, "grad_norm": 3.015625, "learning_rate": 9.014194079682596e-06, "loss": 0.87270651, "memory(GiB)": 752.05, "step": 19170, "train_speed(iter/s)": 0.305726 }, { "acc": 0.75735688, "epoch": 0.4864289802590377, "grad_norm": 3.46875, "learning_rate": 9.01356878931816e-06, "loss": 0.91549377, "memory(GiB)": 752.05, "step": 19175, "train_speed(iter/s)": 0.305452 }, { "acc": 0.7532227, "epoch": 0.48655581962807526, "grad_norm": 3.421875, "learning_rate": 9.012943322409624e-06, "loss": 0.96334171, "memory(GiB)": 752.05, "step": 19180, "train_speed(iter/s)": 0.305201 }, { "acc": 0.76023149, "epoch": 0.4866826589971128, "grad_norm": 3.359375, "learning_rate": 9.012317678984495e-06, "loss": 0.9221139, "memory(GiB)": 752.05, "step": 19185, "train_speed(iter/s)": 0.30497 }, { "acc": 0.74636598, "epoch": 0.4868094983661504, "grad_norm": 3.1875, "learning_rate": 9.011691859070296e-06, "loss": 0.94206486, "memory(GiB)": 752.05, "step": 19190, "train_speed(iter/s)": 0.304744 }, { "acc": 0.77847419, "epoch": 0.48693633773518796, "grad_norm": 4.0, "learning_rate": 9.011065862694554e-06, "loss": 0.81773014, "memory(GiB)": 752.05, "step": 19195, "train_speed(iter/s)": 0.304471 }, { "acc": 0.75867476, "epoch": 0.4870631771042255, "grad_norm": 3.90625, "learning_rate": 9.010439689884804e-06, "loss": 0.93989954, "memory(GiB)": 752.05, "step": 19200, "train_speed(iter/s)": 0.304244 }, { "acc": 0.76335092, "epoch": 0.48719001647326304, "grad_norm": 3.78125, "learning_rate": 9.009813340668591e-06, "loss": 0.89097443, "memory(GiB)": 752.05, "step": 19205, "train_speed(iter/s)": 0.303998 }, { "acc": 0.76065392, "epoch": 0.4873168558423006, "grad_norm": 3.015625, "learning_rate": 9.009186815073466e-06, "loss": 0.88178587, "memory(GiB)": 752.05, "step": 19210, "train_speed(iter/s)": 0.303771 }, { "acc": 0.75835819, "epoch": 0.4874436952113382, "grad_norm": 4.125, "learning_rate": 9.008560113126986e-06, "loss": 0.96638336, "memory(GiB)": 752.05, "step": 19215, "train_speed(iter/s)": 0.303526 }, { "acc": 0.75929728, "epoch": 0.48757053458037575, "grad_norm": 3.578125, "learning_rate": 9.007933234856718e-06, "loss": 0.89721622, "memory(GiB)": 752.05, "step": 19220, "train_speed(iter/s)": 0.303278 }, { "acc": 0.76125393, "epoch": 0.4876973739494133, "grad_norm": 4.75, "learning_rate": 9.007306180290238e-06, "loss": 0.94854002, "memory(GiB)": 752.05, "step": 19225, "train_speed(iter/s)": 0.303057 }, { "acc": 0.76487985, "epoch": 0.48782421331845083, "grad_norm": 3.375, "learning_rate": 9.006678949455127e-06, "loss": 0.95382185, "memory(GiB)": 752.05, "step": 19230, "train_speed(iter/s)": 0.30281 }, { "acc": 0.76108289, "epoch": 0.4879510526874884, "grad_norm": 5.78125, "learning_rate": 9.006051542378975e-06, "loss": 0.9371603, "memory(GiB)": 752.05, "step": 19235, "train_speed(iter/s)": 0.302581 }, { "acc": 0.76241441, "epoch": 0.48807789205652596, "grad_norm": 3.90625, "learning_rate": 9.005423959089378e-06, "loss": 0.92414608, "memory(GiB)": 752.05, "step": 19240, "train_speed(iter/s)": 0.302367 }, { "acc": 0.75060606, "epoch": 0.48820473142556353, "grad_norm": 4.0625, "learning_rate": 9.004796199613947e-06, "loss": 0.92163248, "memory(GiB)": 752.05, "step": 19245, "train_speed(iter/s)": 0.302113 }, { "acc": 0.75062675, "epoch": 0.4883315707946011, "grad_norm": 3.421875, "learning_rate": 9.004168263980288e-06, "loss": 0.92309647, "memory(GiB)": 752.05, "step": 19250, "train_speed(iter/s)": 0.301883 }, { "acc": 0.75485811, "epoch": 0.48845841016363867, "grad_norm": 3.46875, "learning_rate": 9.003540152216026e-06, "loss": 0.91767893, "memory(GiB)": 752.05, "step": 19255, "train_speed(iter/s)": 0.301643 }, { "acc": 0.75810308, "epoch": 0.4885852495326762, "grad_norm": 3.875, "learning_rate": 9.00291186434879e-06, "loss": 0.92209587, "memory(GiB)": 752.05, "step": 19260, "train_speed(iter/s)": 0.301401 }, { "acc": 0.7743824, "epoch": 0.48871208890171375, "grad_norm": 3.53125, "learning_rate": 9.002283400406213e-06, "loss": 0.8712347, "memory(GiB)": 752.05, "step": 19265, "train_speed(iter/s)": 0.30116 }, { "acc": 0.76175327, "epoch": 0.4888389282707513, "grad_norm": 3.109375, "learning_rate": 9.001654760415944e-06, "loss": 0.90271177, "memory(GiB)": 752.05, "step": 19270, "train_speed(iter/s)": 0.300919 }, { "acc": 0.7554194, "epoch": 0.4889657676397889, "grad_norm": 3.421875, "learning_rate": 9.00102594440563e-06, "loss": 0.92059002, "memory(GiB)": 752.05, "step": 19275, "train_speed(iter/s)": 0.30069 }, { "acc": 0.7548852, "epoch": 0.48909260700882645, "grad_norm": 3.3125, "learning_rate": 9.000396952402933e-06, "loss": 0.92524529, "memory(GiB)": 752.05, "step": 19280, "train_speed(iter/s)": 0.300445 }, { "acc": 0.78148475, "epoch": 0.489219446377864, "grad_norm": 3.171875, "learning_rate": 8.99976778443552e-06, "loss": 0.87789545, "memory(GiB)": 752.05, "step": 19285, "train_speed(iter/s)": 0.300215 }, { "acc": 0.75503049, "epoch": 0.48934628574690153, "grad_norm": 3.375, "learning_rate": 8.999138440531066e-06, "loss": 0.93487301, "memory(GiB)": 752.05, "step": 19290, "train_speed(iter/s)": 0.299978 }, { "acc": 0.7751225, "epoch": 0.4894731251159391, "grad_norm": 3.421875, "learning_rate": 8.998508920717255e-06, "loss": 0.86096048, "memory(GiB)": 752.05, "step": 19295, "train_speed(iter/s)": 0.299723 }, { "acc": 0.76355262, "epoch": 0.48959996448497667, "grad_norm": 3.296875, "learning_rate": 8.997879225021777e-06, "loss": 0.88044062, "memory(GiB)": 752.05, "step": 19300, "train_speed(iter/s)": 0.299457 }, { "acc": 0.75536308, "epoch": 0.48972680385401424, "grad_norm": 3.3125, "learning_rate": 8.997249353472327e-06, "loss": 0.93902731, "memory(GiB)": 752.05, "step": 19305, "train_speed(iter/s)": 0.299232 }, { "acc": 0.75999904, "epoch": 0.4898536432230518, "grad_norm": 3.625, "learning_rate": 8.996619306096616e-06, "loss": 0.91889219, "memory(GiB)": 752.05, "step": 19310, "train_speed(iter/s)": 0.298987 }, { "acc": 0.76906872, "epoch": 0.4899804825920894, "grad_norm": 3.328125, "learning_rate": 8.995989082922357e-06, "loss": 0.91379471, "memory(GiB)": 752.05, "step": 19315, "train_speed(iter/s)": 0.29875 }, { "acc": 0.74025822, "epoch": 0.4901073219611269, "grad_norm": 3.140625, "learning_rate": 8.995358683977269e-06, "loss": 0.91037216, "memory(GiB)": 752.05, "step": 19320, "train_speed(iter/s)": 0.298536 }, { "acc": 0.76264839, "epoch": 0.49023416133016445, "grad_norm": 3.046875, "learning_rate": 8.994728109289082e-06, "loss": 0.89095106, "memory(GiB)": 752.05, "step": 19325, "train_speed(iter/s)": 0.298299 }, { "acc": 0.7666893, "epoch": 0.490361000699202, "grad_norm": 4.375, "learning_rate": 8.994097358885535e-06, "loss": 0.88644209, "memory(GiB)": 752.05, "step": 19330, "train_speed(iter/s)": 0.298099 }, { "acc": 0.75884991, "epoch": 0.4904878400682396, "grad_norm": 3.796875, "learning_rate": 8.99346643279437e-06, "loss": 0.88479452, "memory(GiB)": 752.05, "step": 19335, "train_speed(iter/s)": 0.297857 }, { "acc": 0.73495641, "epoch": 0.49061467943727716, "grad_norm": 3.765625, "learning_rate": 8.992835331043341e-06, "loss": 1.00645523, "memory(GiB)": 752.05, "step": 19340, "train_speed(iter/s)": 0.297614 }, { "acc": 0.75419254, "epoch": 0.4907415188063147, "grad_norm": 3.34375, "learning_rate": 8.992204053660207e-06, "loss": 0.95018358, "memory(GiB)": 752.05, "step": 19345, "train_speed(iter/s)": 0.297403 }, { "acc": 0.76545286, "epoch": 0.49086835817535224, "grad_norm": 3.015625, "learning_rate": 8.991572600672737e-06, "loss": 0.92498951, "memory(GiB)": 752.05, "step": 19350, "train_speed(iter/s)": 0.297138 }, { "acc": 0.76078835, "epoch": 0.4909951975443898, "grad_norm": 3.84375, "learning_rate": 8.990940972108705e-06, "loss": 0.97011204, "memory(GiB)": 752.05, "step": 19355, "train_speed(iter/s)": 0.296908 }, { "acc": 0.77542181, "epoch": 0.4911220369134274, "grad_norm": 4.4375, "learning_rate": 8.990309167995897e-06, "loss": 0.90091953, "memory(GiB)": 752.05, "step": 19360, "train_speed(iter/s)": 0.29666 }, { "acc": 0.75054283, "epoch": 0.49124887628246494, "grad_norm": 3.03125, "learning_rate": 8.989677188362103e-06, "loss": 0.97669716, "memory(GiB)": 752.05, "step": 19365, "train_speed(iter/s)": 0.296429 }, { "acc": 0.76088648, "epoch": 0.4913757156515025, "grad_norm": 3.578125, "learning_rate": 8.989045033235121e-06, "loss": 0.90867071, "memory(GiB)": 752.05, "step": 19370, "train_speed(iter/s)": 0.296204 }, { "acc": 0.75993524, "epoch": 0.4915025550205401, "grad_norm": 3.28125, "learning_rate": 8.988412702642757e-06, "loss": 0.94835796, "memory(GiB)": 752.05, "step": 19375, "train_speed(iter/s)": 0.295977 }, { "acc": 0.76710372, "epoch": 0.4916293943895776, "grad_norm": 3.828125, "learning_rate": 8.987780196612828e-06, "loss": 0.92896681, "memory(GiB)": 752.05, "step": 19380, "train_speed(iter/s)": 0.295735 }, { "acc": 0.75867696, "epoch": 0.49175623375861516, "grad_norm": 3.515625, "learning_rate": 8.987147515173151e-06, "loss": 0.97379808, "memory(GiB)": 752.05, "step": 19385, "train_speed(iter/s)": 0.295504 }, { "acc": 0.75225883, "epoch": 0.4918830731276527, "grad_norm": 3.4375, "learning_rate": 8.986514658351562e-06, "loss": 0.95474052, "memory(GiB)": 752.05, "step": 19390, "train_speed(iter/s)": 0.295308 }, { "acc": 0.76010728, "epoch": 0.4920099124966903, "grad_norm": 3.203125, "learning_rate": 8.985881626175892e-06, "loss": 0.91047974, "memory(GiB)": 752.05, "step": 19395, "train_speed(iter/s)": 0.295094 }, { "acc": 0.78394041, "epoch": 0.49213675186572786, "grad_norm": 3.265625, "learning_rate": 8.985248418673991e-06, "loss": 0.86991034, "memory(GiB)": 752.05, "step": 19400, "train_speed(iter/s)": 0.294861 }, { "acc": 0.77251415, "epoch": 0.49226359123476543, "grad_norm": 3.421875, "learning_rate": 8.98461503587371e-06, "loss": 0.93430605, "memory(GiB)": 752.05, "step": 19405, "train_speed(iter/s)": 0.294634 }, { "acc": 0.77130208, "epoch": 0.49239043060380294, "grad_norm": 3.109375, "learning_rate": 8.983981477802907e-06, "loss": 0.87863283, "memory(GiB)": 752.05, "step": 19410, "train_speed(iter/s)": 0.2944 }, { "acc": 0.78089957, "epoch": 0.4925172699728405, "grad_norm": 3.328125, "learning_rate": 8.983347744489454e-06, "loss": 0.84414892, "memory(GiB)": 752.05, "step": 19415, "train_speed(iter/s)": 0.294197 }, { "acc": 0.76958261, "epoch": 0.4926441093418781, "grad_norm": 3.625, "learning_rate": 8.982713835961227e-06, "loss": 0.88563719, "memory(GiB)": 752.05, "step": 19420, "train_speed(iter/s)": 0.29396 }, { "acc": 0.77645974, "epoch": 0.49277094871091565, "grad_norm": 4.03125, "learning_rate": 8.982079752246106e-06, "loss": 0.92230988, "memory(GiB)": 752.05, "step": 19425, "train_speed(iter/s)": 0.293752 }, { "acc": 0.77883773, "epoch": 0.4928977880799532, "grad_norm": 4.0, "learning_rate": 8.981445493371985e-06, "loss": 0.88505573, "memory(GiB)": 752.05, "step": 19430, "train_speed(iter/s)": 0.293494 }, { "acc": 0.76995769, "epoch": 0.4930246274489908, "grad_norm": 3.84375, "learning_rate": 8.980811059366763e-06, "loss": 0.89954567, "memory(GiB)": 752.05, "step": 19435, "train_speed(iter/s)": 0.293268 }, { "acc": 0.75662546, "epoch": 0.4931514668180283, "grad_norm": 3.5625, "learning_rate": 8.980176450258344e-06, "loss": 0.92271214, "memory(GiB)": 752.05, "step": 19440, "train_speed(iter/s)": 0.292973 }, { "acc": 0.77513957, "epoch": 0.49327830618706586, "grad_norm": 3.203125, "learning_rate": 8.979541666074648e-06, "loss": 0.91275835, "memory(GiB)": 752.05, "step": 19445, "train_speed(iter/s)": 0.292765 }, { "acc": 0.76374302, "epoch": 0.49340514555610343, "grad_norm": 3.265625, "learning_rate": 8.978906706843592e-06, "loss": 0.91872177, "memory(GiB)": 752.05, "step": 19450, "train_speed(iter/s)": 0.292575 }, { "acc": 0.76182194, "epoch": 0.493531984925141, "grad_norm": 2.90625, "learning_rate": 8.978271572593106e-06, "loss": 0.90789375, "memory(GiB)": 752.05, "step": 19455, "train_speed(iter/s)": 0.29236 }, { "acc": 0.77718129, "epoch": 0.49365882429417857, "grad_norm": 3.921875, "learning_rate": 8.977636263351131e-06, "loss": 0.8697794, "memory(GiB)": 752.05, "step": 19460, "train_speed(iter/s)": 0.292153 }, { "acc": 0.75947175, "epoch": 0.49378566366321613, "grad_norm": 3.1875, "learning_rate": 8.977000779145609e-06, "loss": 0.88754892, "memory(GiB)": 752.05, "step": 19465, "train_speed(iter/s)": 0.291964 }, { "acc": 0.76016202, "epoch": 0.49391250303225365, "grad_norm": 3.75, "learning_rate": 8.976365120004496e-06, "loss": 0.9409771, "memory(GiB)": 752.05, "step": 19470, "train_speed(iter/s)": 0.291764 }, { "acc": 0.768503, "epoch": 0.4940393424012912, "grad_norm": 3.734375, "learning_rate": 8.975729285955748e-06, "loss": 0.92475739, "memory(GiB)": 752.05, "step": 19475, "train_speed(iter/s)": 0.291538 }, { "acc": 0.76060224, "epoch": 0.4941661817703288, "grad_norm": 3.1875, "learning_rate": 8.975093277027339e-06, "loss": 0.90182562, "memory(GiB)": 752.05, "step": 19480, "train_speed(iter/s)": 0.291325 }, { "acc": 0.75179777, "epoch": 0.49429302113936635, "grad_norm": 3.390625, "learning_rate": 8.97445709324724e-06, "loss": 0.96641293, "memory(GiB)": 752.05, "step": 19485, "train_speed(iter/s)": 0.29111 }, { "acc": 0.77310734, "epoch": 0.4944198605084039, "grad_norm": 4.3125, "learning_rate": 8.973820734643439e-06, "loss": 0.93415442, "memory(GiB)": 752.05, "step": 19490, "train_speed(iter/s)": 0.290905 }, { "acc": 0.77513061, "epoch": 0.4945466998774415, "grad_norm": 3.5, "learning_rate": 8.973184201243922e-06, "loss": 0.93268509, "memory(GiB)": 752.05, "step": 19495, "train_speed(iter/s)": 0.290671 }, { "acc": 0.76312075, "epoch": 0.494673539246479, "grad_norm": 4.3125, "learning_rate": 8.972547493076696e-06, "loss": 0.92889252, "memory(GiB)": 752.05, "step": 19500, "train_speed(iter/s)": 0.29045 }, { "epoch": 0.494673539246479, "eval_acc": 0.7524043553077545, "eval_loss": 0.8827304840087891, "eval_runtime": 1148.8507, "eval_samples_per_second": 5.545, "eval_steps_per_second": 5.545, "step": 19500 }, { "acc": 0.77396383, "epoch": 0.49480037861551657, "grad_norm": 3.625, "learning_rate": 8.971910610169759e-06, "loss": 0.87682257, "memory(GiB)": 752.05, "step": 19505, "train_speed(iter/s)": 0.282373 }, { "acc": 0.7657032, "epoch": 0.49492721798455414, "grad_norm": 3.640625, "learning_rate": 8.971273552551131e-06, "loss": 0.87752275, "memory(GiB)": 752.05, "step": 19510, "train_speed(iter/s)": 0.28217 }, { "acc": 0.76083541, "epoch": 0.4950540573535917, "grad_norm": 3.609375, "learning_rate": 8.970636320248833e-06, "loss": 0.91053333, "memory(GiB)": 752.05, "step": 19515, "train_speed(iter/s)": 0.281982 }, { "acc": 0.75999746, "epoch": 0.49518089672262927, "grad_norm": 4.78125, "learning_rate": 8.969998913290895e-06, "loss": 0.90952978, "memory(GiB)": 752.05, "step": 19520, "train_speed(iter/s)": 0.2818 }, { "acc": 0.75781937, "epoch": 0.49530773609166684, "grad_norm": 3.609375, "learning_rate": 8.969361331705354e-06, "loss": 0.92831154, "memory(GiB)": 752.05, "step": 19525, "train_speed(iter/s)": 0.281601 }, { "acc": 0.77271295, "epoch": 0.49543457546070435, "grad_norm": 3.046875, "learning_rate": 8.968723575520253e-06, "loss": 0.91730661, "memory(GiB)": 752.05, "step": 19530, "train_speed(iter/s)": 0.281409 }, { "acc": 0.75097833, "epoch": 0.4955614148297419, "grad_norm": 5.0, "learning_rate": 8.968085644763648e-06, "loss": 0.95583487, "memory(GiB)": 752.05, "step": 19535, "train_speed(iter/s)": 0.281246 }, { "acc": 0.76820974, "epoch": 0.4956882541987795, "grad_norm": 3.390625, "learning_rate": 8.9674475394636e-06, "loss": 0.92130299, "memory(GiB)": 752.05, "step": 19540, "train_speed(iter/s)": 0.281062 }, { "acc": 0.75596299, "epoch": 0.49581509356781706, "grad_norm": 3.703125, "learning_rate": 8.966809259648175e-06, "loss": 0.91743069, "memory(GiB)": 752.05, "step": 19545, "train_speed(iter/s)": 0.280899 }, { "acc": 0.76253738, "epoch": 0.4959419329368546, "grad_norm": 3.03125, "learning_rate": 8.96617080534545e-06, "loss": 0.93870611, "memory(GiB)": 752.05, "step": 19550, "train_speed(iter/s)": 0.280663 }, { "acc": 0.76236, "epoch": 0.4960687723058922, "grad_norm": 3.546875, "learning_rate": 8.965532176583506e-06, "loss": 0.89438038, "memory(GiB)": 752.05, "step": 19555, "train_speed(iter/s)": 0.280485 }, { "acc": 0.76687627, "epoch": 0.4961956116749297, "grad_norm": 3.78125, "learning_rate": 8.96489337339044e-06, "loss": 0.88100824, "memory(GiB)": 752.05, "step": 19560, "train_speed(iter/s)": 0.280307 }, { "acc": 0.77398181, "epoch": 0.49632245104396727, "grad_norm": 3.296875, "learning_rate": 8.964254395794345e-06, "loss": 0.86703215, "memory(GiB)": 752.05, "step": 19565, "train_speed(iter/s)": 0.280105 }, { "acc": 0.7620399, "epoch": 0.49644929041300484, "grad_norm": 4.15625, "learning_rate": 8.96361524382333e-06, "loss": 0.94108801, "memory(GiB)": 752.05, "step": 19570, "train_speed(iter/s)": 0.279884 }, { "acc": 0.76048231, "epoch": 0.4965761297820424, "grad_norm": 3.03125, "learning_rate": 8.962975917505511e-06, "loss": 0.92262192, "memory(GiB)": 752.05, "step": 19575, "train_speed(iter/s)": 0.279693 }, { "acc": 0.77009149, "epoch": 0.49670296915108, "grad_norm": 3.265625, "learning_rate": 8.962336416869007e-06, "loss": 0.92757664, "memory(GiB)": 752.05, "step": 19580, "train_speed(iter/s)": 0.27949 }, { "acc": 0.75667539, "epoch": 0.49682980852011754, "grad_norm": 4.25, "learning_rate": 8.96169674194195e-06, "loss": 0.93803568, "memory(GiB)": 752.05, "step": 19585, "train_speed(iter/s)": 0.279323 }, { "acc": 0.75278945, "epoch": 0.49695664788915506, "grad_norm": 3.875, "learning_rate": 8.961056892752475e-06, "loss": 0.91590376, "memory(GiB)": 752.05, "step": 19590, "train_speed(iter/s)": 0.279094 }, { "acc": 0.7687077, "epoch": 0.4970834872581926, "grad_norm": 2.875, "learning_rate": 8.960416869328728e-06, "loss": 0.87324514, "memory(GiB)": 752.05, "step": 19595, "train_speed(iter/s)": 0.278904 }, { "acc": 0.76317019, "epoch": 0.4972103266272302, "grad_norm": 3.90625, "learning_rate": 8.959776671698861e-06, "loss": 0.92160788, "memory(GiB)": 752.05, "step": 19600, "train_speed(iter/s)": 0.278693 }, { "acc": 0.76372552, "epoch": 0.49733716599626776, "grad_norm": 4.53125, "learning_rate": 8.959136299891036e-06, "loss": 0.96745377, "memory(GiB)": 752.05, "step": 19605, "train_speed(iter/s)": 0.278493 }, { "acc": 0.75155377, "epoch": 0.49746400536530533, "grad_norm": 3.34375, "learning_rate": 8.95849575393342e-06, "loss": 0.9638278, "memory(GiB)": 752.05, "step": 19610, "train_speed(iter/s)": 0.278303 }, { "acc": 0.76288619, "epoch": 0.4975908447343429, "grad_norm": 4.65625, "learning_rate": 8.957855033854187e-06, "loss": 0.96107817, "memory(GiB)": 752.05, "step": 19615, "train_speed(iter/s)": 0.278111 }, { "acc": 0.75342216, "epoch": 0.4977176841033804, "grad_norm": 3.828125, "learning_rate": 8.957214139681523e-06, "loss": 0.93014116, "memory(GiB)": 752.05, "step": 19620, "train_speed(iter/s)": 0.277889 }, { "acc": 0.76089478, "epoch": 0.497844523472418, "grad_norm": 3.265625, "learning_rate": 8.956573071443615e-06, "loss": 0.9301939, "memory(GiB)": 752.05, "step": 19625, "train_speed(iter/s)": 0.277697 }, { "acc": 0.7722012, "epoch": 0.49797136284145554, "grad_norm": 3.6875, "learning_rate": 8.955931829168667e-06, "loss": 0.89252977, "memory(GiB)": 752.05, "step": 19630, "train_speed(iter/s)": 0.277491 }, { "acc": 0.76907182, "epoch": 0.4980982022104931, "grad_norm": 4.0625, "learning_rate": 8.95529041288488e-06, "loss": 0.92847595, "memory(GiB)": 752.06, "step": 19635, "train_speed(iter/s)": 0.277286 }, { "acc": 0.77463427, "epoch": 0.4982250415795307, "grad_norm": 4.28125, "learning_rate": 8.95464882262047e-06, "loss": 0.90075884, "memory(GiB)": 752.06, "step": 19640, "train_speed(iter/s)": 0.277094 }, { "acc": 0.76275058, "epoch": 0.49835188094856825, "grad_norm": 4.375, "learning_rate": 8.95400705840366e-06, "loss": 0.90491686, "memory(GiB)": 752.06, "step": 19645, "train_speed(iter/s)": 0.276896 }, { "acc": 0.75312996, "epoch": 0.49847872031760576, "grad_norm": 4.0625, "learning_rate": 8.953365120262677e-06, "loss": 0.98193693, "memory(GiB)": 752.06, "step": 19650, "train_speed(iter/s)": 0.27669 }, { "acc": 0.76751428, "epoch": 0.49860555968664333, "grad_norm": 3.46875, "learning_rate": 8.952723008225758e-06, "loss": 0.89151192, "memory(GiB)": 752.06, "step": 19655, "train_speed(iter/s)": 0.276504 }, { "acc": 0.75872564, "epoch": 0.4987323990556809, "grad_norm": 4.09375, "learning_rate": 8.95208072232115e-06, "loss": 0.9338151, "memory(GiB)": 752.06, "step": 19660, "train_speed(iter/s)": 0.276278 }, { "acc": 0.76489744, "epoch": 0.49885923842471847, "grad_norm": 3.65625, "learning_rate": 8.951438262577101e-06, "loss": 0.92028589, "memory(GiB)": 752.06, "step": 19665, "train_speed(iter/s)": 0.276098 }, { "acc": 0.748136, "epoch": 0.49898607779375603, "grad_norm": 3.25, "learning_rate": 8.950795629021873e-06, "loss": 0.93817701, "memory(GiB)": 752.06, "step": 19670, "train_speed(iter/s)": 0.275896 }, { "acc": 0.75582819, "epoch": 0.4991129171627936, "grad_norm": 3.5625, "learning_rate": 8.950152821683734e-06, "loss": 0.95809851, "memory(GiB)": 752.06, "step": 19675, "train_speed(iter/s)": 0.275713 }, { "acc": 0.75783672, "epoch": 0.4992397565318311, "grad_norm": 2.9375, "learning_rate": 8.94950984059096e-06, "loss": 0.92471714, "memory(GiB)": 752.06, "step": 19680, "train_speed(iter/s)": 0.275516 }, { "acc": 0.75718756, "epoch": 0.4993665959008687, "grad_norm": 3.328125, "learning_rate": 8.94886668577183e-06, "loss": 0.92761459, "memory(GiB)": 752.06, "step": 19685, "train_speed(iter/s)": 0.27535 }, { "acc": 0.76595392, "epoch": 0.49949343526990625, "grad_norm": 3.84375, "learning_rate": 8.948223357254636e-06, "loss": 0.93357973, "memory(GiB)": 752.06, "step": 19690, "train_speed(iter/s)": 0.275138 }, { "acc": 0.75065494, "epoch": 0.4996202746389438, "grad_norm": 3.546875, "learning_rate": 8.947579855067678e-06, "loss": 0.95956802, "memory(GiB)": 752.06, "step": 19695, "train_speed(iter/s)": 0.274928 }, { "acc": 0.76629038, "epoch": 0.4997471140079814, "grad_norm": 3.046875, "learning_rate": 8.946936179239259e-06, "loss": 0.87133961, "memory(GiB)": 752.06, "step": 19700, "train_speed(iter/s)": 0.27476 }, { "acc": 0.77249851, "epoch": 0.49987395337701895, "grad_norm": 3.375, "learning_rate": 8.946292329797691e-06, "loss": 0.88963385, "memory(GiB)": 752.06, "step": 19705, "train_speed(iter/s)": 0.27458 }, { "acc": 0.75316625, "epoch": 0.5000007927460565, "grad_norm": 3.5, "learning_rate": 8.945648306771299e-06, "loss": 0.93942547, "memory(GiB)": 752.06, "step": 19710, "train_speed(iter/s)": 0.274379 }, { "acc": 0.75726495, "epoch": 0.500127632115094, "grad_norm": 3.28125, "learning_rate": 8.945004110188411e-06, "loss": 0.93253727, "memory(GiB)": 752.06, "step": 19715, "train_speed(iter/s)": 0.274195 }, { "acc": 0.75605712, "epoch": 0.5002544714841316, "grad_norm": 3.59375, "learning_rate": 8.94435974007736e-06, "loss": 0.95260878, "memory(GiB)": 752.06, "step": 19720, "train_speed(iter/s)": 0.274017 }, { "acc": 0.7629096, "epoch": 0.5003813108531692, "grad_norm": 2.890625, "learning_rate": 8.943715196466493e-06, "loss": 0.88975353, "memory(GiB)": 752.06, "step": 19725, "train_speed(iter/s)": 0.273801 }, { "acc": 0.75948172, "epoch": 0.5005081502222067, "grad_norm": 3.53125, "learning_rate": 8.94307047938416e-06, "loss": 0.93345184, "memory(GiB)": 752.06, "step": 19730, "train_speed(iter/s)": 0.273635 }, { "acc": 0.76061754, "epoch": 0.5006349895912443, "grad_norm": 3.6875, "learning_rate": 8.942425588858717e-06, "loss": 0.93578234, "memory(GiB)": 752.06, "step": 19735, "train_speed(iter/s)": 0.273452 }, { "acc": 0.76671467, "epoch": 0.5007618289602819, "grad_norm": 2.953125, "learning_rate": 8.941780524918537e-06, "loss": 0.91029072, "memory(GiB)": 752.06, "step": 19740, "train_speed(iter/s)": 0.273256 }, { "acc": 0.76892009, "epoch": 0.5008886683293194, "grad_norm": 3.515625, "learning_rate": 8.94113528759199e-06, "loss": 0.88066854, "memory(GiB)": 752.06, "step": 19745, "train_speed(iter/s)": 0.273063 }, { "acc": 0.76377683, "epoch": 0.501015507698357, "grad_norm": 3.09375, "learning_rate": 8.940489876907457e-06, "loss": 0.90248833, "memory(GiB)": 752.06, "step": 19750, "train_speed(iter/s)": 0.272888 }, { "acc": 0.76860895, "epoch": 0.5011423470673945, "grad_norm": 3.265625, "learning_rate": 8.939844292893331e-06, "loss": 0.84598312, "memory(GiB)": 752.06, "step": 19755, "train_speed(iter/s)": 0.2727 }, { "acc": 0.75882602, "epoch": 0.501269186436432, "grad_norm": 3.09375, "learning_rate": 8.939198535578008e-06, "loss": 0.90723028, "memory(GiB)": 752.06, "step": 19760, "train_speed(iter/s)": 0.272522 }, { "acc": 0.75576115, "epoch": 0.5013960258054696, "grad_norm": 3.359375, "learning_rate": 8.938552604989893e-06, "loss": 0.94443111, "memory(GiB)": 752.06, "step": 19765, "train_speed(iter/s)": 0.27231 }, { "acc": 0.75782886, "epoch": 0.5015228651745072, "grad_norm": 3.3125, "learning_rate": 8.937906501157397e-06, "loss": 0.95667143, "memory(GiB)": 752.06, "step": 19770, "train_speed(iter/s)": 0.272134 }, { "acc": 0.76380854, "epoch": 0.5016497045435447, "grad_norm": 3.921875, "learning_rate": 8.937260224108942e-06, "loss": 0.91437712, "memory(GiB)": 752.06, "step": 19775, "train_speed(iter/s)": 0.271949 }, { "acc": 0.75837212, "epoch": 0.5017765439125823, "grad_norm": 4.875, "learning_rate": 8.936613773872952e-06, "loss": 0.94381361, "memory(GiB)": 752.06, "step": 19780, "train_speed(iter/s)": 0.271775 }, { "acc": 0.76030416, "epoch": 0.5019033832816199, "grad_norm": 4.15625, "learning_rate": 8.935967150477866e-06, "loss": 0.97328844, "memory(GiB)": 752.06, "step": 19785, "train_speed(iter/s)": 0.271553 }, { "acc": 0.75757527, "epoch": 0.5020302226506574, "grad_norm": 3.640625, "learning_rate": 8.935320353952128e-06, "loss": 0.95390978, "memory(GiB)": 752.06, "step": 19790, "train_speed(iter/s)": 0.271381 }, { "acc": 0.76746607, "epoch": 0.502157062019695, "grad_norm": 3.28125, "learning_rate": 8.934673384324183e-06, "loss": 0.89956226, "memory(GiB)": 752.06, "step": 19795, "train_speed(iter/s)": 0.271199 }, { "acc": 0.76648793, "epoch": 0.5022839013887326, "grad_norm": 3.46875, "learning_rate": 8.934026241622495e-06, "loss": 0.91113167, "memory(GiB)": 752.06, "step": 19800, "train_speed(iter/s)": 0.271021 }, { "acc": 0.76433144, "epoch": 0.5024107407577701, "grad_norm": 3.375, "learning_rate": 8.933378925875526e-06, "loss": 0.90168285, "memory(GiB)": 752.06, "step": 19805, "train_speed(iter/s)": 0.270812 }, { "acc": 0.76561527, "epoch": 0.5025375801268076, "grad_norm": 3.234375, "learning_rate": 8.932731437111752e-06, "loss": 0.9131402, "memory(GiB)": 752.06, "step": 19810, "train_speed(iter/s)": 0.270617 }, { "acc": 0.76852326, "epoch": 0.5026644194958452, "grad_norm": 4.15625, "learning_rate": 8.93208377535965e-06, "loss": 0.90893087, "memory(GiB)": 752.06, "step": 19815, "train_speed(iter/s)": 0.270444 }, { "acc": 0.77224693, "epoch": 0.5027912588648827, "grad_norm": 3.953125, "learning_rate": 8.931435940647714e-06, "loss": 0.90944195, "memory(GiB)": 752.06, "step": 19820, "train_speed(iter/s)": 0.270254 }, { "acc": 0.75777249, "epoch": 0.5029180982339203, "grad_norm": 3.796875, "learning_rate": 8.930787933004434e-06, "loss": 0.95234118, "memory(GiB)": 752.06, "step": 19825, "train_speed(iter/s)": 0.270068 }, { "acc": 0.75967445, "epoch": 0.5030449376029579, "grad_norm": 3.515625, "learning_rate": 8.93013975245832e-06, "loss": 0.91620207, "memory(GiB)": 752.06, "step": 19830, "train_speed(iter/s)": 0.269882 }, { "acc": 0.75188246, "epoch": 0.5031717769719954, "grad_norm": 3.421875, "learning_rate": 8.929491399037879e-06, "loss": 0.94810944, "memory(GiB)": 752.06, "step": 19835, "train_speed(iter/s)": 0.2697 }, { "acc": 0.76184688, "epoch": 0.503298616341033, "grad_norm": 3.46875, "learning_rate": 8.928842872771632e-06, "loss": 0.89573946, "memory(GiB)": 752.06, "step": 19840, "train_speed(iter/s)": 0.269512 }, { "acc": 0.75736251, "epoch": 0.5034254557100706, "grad_norm": 4.40625, "learning_rate": 8.928194173688104e-06, "loss": 0.88814344, "memory(GiB)": 752.06, "step": 19845, "train_speed(iter/s)": 0.269348 }, { "acc": 0.76348763, "epoch": 0.5035522950791081, "grad_norm": 3.609375, "learning_rate": 8.927545301815832e-06, "loss": 0.89621601, "memory(GiB)": 752.06, "step": 19850, "train_speed(iter/s)": 0.269168 }, { "acc": 0.76988792, "epoch": 0.5036791344481457, "grad_norm": 3.265625, "learning_rate": 8.926896257183357e-06, "loss": 0.88582516, "memory(GiB)": 752.06, "step": 19855, "train_speed(iter/s)": 0.269001 }, { "acc": 0.76993361, "epoch": 0.5038059738171833, "grad_norm": 4.4375, "learning_rate": 8.926247039819225e-06, "loss": 0.90648537, "memory(GiB)": 752.06, "step": 19860, "train_speed(iter/s)": 0.268819 }, { "acc": 0.75659442, "epoch": 0.5039328131862209, "grad_norm": 3.796875, "learning_rate": 8.925597649751996e-06, "loss": 0.90039034, "memory(GiB)": 752.06, "step": 19865, "train_speed(iter/s)": 0.268636 }, { "acc": 0.77322359, "epoch": 0.5040596525552583, "grad_norm": 3.75, "learning_rate": 8.924948087010234e-06, "loss": 0.93855734, "memory(GiB)": 752.06, "step": 19870, "train_speed(iter/s)": 0.268477 }, { "acc": 0.75745473, "epoch": 0.5041864919242959, "grad_norm": 7.78125, "learning_rate": 8.924298351622512e-06, "loss": 0.87716398, "memory(GiB)": 752.06, "step": 19875, "train_speed(iter/s)": 0.268299 }, { "acc": 0.75684538, "epoch": 0.5043133312933334, "grad_norm": 3.625, "learning_rate": 8.92364844361741e-06, "loss": 0.91223326, "memory(GiB)": 752.06, "step": 19880, "train_speed(iter/s)": 0.268149 }, { "acc": 0.76055961, "epoch": 0.504440170662371, "grad_norm": 3.75, "learning_rate": 8.922998363023512e-06, "loss": 0.93570671, "memory(GiB)": 752.06, "step": 19885, "train_speed(iter/s)": 0.267979 }, { "acc": 0.77011313, "epoch": 0.5045670100314086, "grad_norm": 4.375, "learning_rate": 8.922348109869417e-06, "loss": 0.86928234, "memory(GiB)": 752.06, "step": 19890, "train_speed(iter/s)": 0.267791 }, { "acc": 0.76353216, "epoch": 0.5046938494004461, "grad_norm": 4.03125, "learning_rate": 8.921697684183726e-06, "loss": 0.90184536, "memory(GiB)": 752.06, "step": 19895, "train_speed(iter/s)": 0.267626 }, { "acc": 0.73886328, "epoch": 0.5048206887694837, "grad_norm": 3.828125, "learning_rate": 8.921047085995049e-06, "loss": 1.02089891, "memory(GiB)": 752.06, "step": 19900, "train_speed(iter/s)": 0.267451 }, { "acc": 0.74870095, "epoch": 0.5049475281385213, "grad_norm": 2.84375, "learning_rate": 8.920396315332004e-06, "loss": 0.96351795, "memory(GiB)": 752.06, "step": 19905, "train_speed(iter/s)": 0.267281 }, { "acc": 0.7587707, "epoch": 0.5050743675075589, "grad_norm": 4.65625, "learning_rate": 8.919745372223216e-06, "loss": 0.8882184, "memory(GiB)": 752.06, "step": 19910, "train_speed(iter/s)": 0.267081 }, { "acc": 0.74881001, "epoch": 0.5052012068765964, "grad_norm": 3.421875, "learning_rate": 8.919094256697318e-06, "loss": 0.98615656, "memory(GiB)": 752.06, "step": 19915, "train_speed(iter/s)": 0.266896 }, { "acc": 0.75906801, "epoch": 0.505328046245634, "grad_norm": 4.375, "learning_rate": 8.918442968782952e-06, "loss": 0.9405653, "memory(GiB)": 752.06, "step": 19920, "train_speed(iter/s)": 0.266722 }, { "acc": 0.76289358, "epoch": 0.5054548856146716, "grad_norm": 3.4375, "learning_rate": 8.917791508508762e-06, "loss": 0.89583721, "memory(GiB)": 752.06, "step": 19925, "train_speed(iter/s)": 0.266537 }, { "acc": 0.76742501, "epoch": 0.505581724983709, "grad_norm": 4.21875, "learning_rate": 8.91713987590341e-06, "loss": 0.88735237, "memory(GiB)": 752.06, "step": 19930, "train_speed(iter/s)": 0.266369 }, { "acc": 0.77003384, "epoch": 0.5057085643527466, "grad_norm": 3.578125, "learning_rate": 8.916488070995553e-06, "loss": 0.8976552, "memory(GiB)": 752.06, "step": 19935, "train_speed(iter/s)": 0.266196 }, { "acc": 0.75869937, "epoch": 0.5058354037217841, "grad_norm": 3.875, "learning_rate": 8.915836093813865e-06, "loss": 0.97655525, "memory(GiB)": 752.06, "step": 19940, "train_speed(iter/s)": 0.266046 }, { "acc": 0.76195149, "epoch": 0.5059622430908217, "grad_norm": 3.5625, "learning_rate": 8.915183944387022e-06, "loss": 0.90425138, "memory(GiB)": 752.06, "step": 19945, "train_speed(iter/s)": 0.265848 }, { "acc": 0.76003485, "epoch": 0.5060890824598593, "grad_norm": 3.125, "learning_rate": 8.914531622743716e-06, "loss": 0.89273777, "memory(GiB)": 752.06, "step": 19950, "train_speed(iter/s)": 0.265659 }, { "acc": 0.75991149, "epoch": 0.5062159218288969, "grad_norm": 3.53125, "learning_rate": 8.913879128912633e-06, "loss": 0.94376698, "memory(GiB)": 752.06, "step": 19955, "train_speed(iter/s)": 0.265496 }, { "acc": 0.74703956, "epoch": 0.5063427611979344, "grad_norm": 3.5, "learning_rate": 8.913226462922478e-06, "loss": 0.94121408, "memory(GiB)": 752.06, "step": 19960, "train_speed(iter/s)": 0.265295 }, { "acc": 0.76864409, "epoch": 0.506469600566972, "grad_norm": 3.59375, "learning_rate": 8.91257362480196e-06, "loss": 0.85791836, "memory(GiB)": 752.06, "step": 19965, "train_speed(iter/s)": 0.265135 }, { "acc": 0.76712146, "epoch": 0.5065964399360096, "grad_norm": 4.78125, "learning_rate": 8.911920614579795e-06, "loss": 0.88001089, "memory(GiB)": 752.07, "step": 19970, "train_speed(iter/s)": 0.264939 }, { "acc": 0.77276382, "epoch": 0.5067232793050471, "grad_norm": 3.609375, "learning_rate": 8.911267432284705e-06, "loss": 0.88723001, "memory(GiB)": 752.07, "step": 19975, "train_speed(iter/s)": 0.26478 }, { "acc": 0.76032696, "epoch": 0.5068501186740847, "grad_norm": 2.90625, "learning_rate": 8.910614077945423e-06, "loss": 0.90591564, "memory(GiB)": 752.07, "step": 19980, "train_speed(iter/s)": 0.264605 }, { "acc": 0.76541557, "epoch": 0.5069769580431223, "grad_norm": 3.640625, "learning_rate": 8.909960551590688e-06, "loss": 0.9179904, "memory(GiB)": 752.07, "step": 19985, "train_speed(iter/s)": 0.264423 }, { "acc": 0.75371137, "epoch": 0.5071037974121597, "grad_norm": 4.21875, "learning_rate": 8.909306853249247e-06, "loss": 0.92035513, "memory(GiB)": 752.07, "step": 19990, "train_speed(iter/s)": 0.264252 }, { "acc": 0.75506172, "epoch": 0.5072306367811973, "grad_norm": 3.609375, "learning_rate": 8.908652982949853e-06, "loss": 0.95576563, "memory(GiB)": 752.07, "step": 19995, "train_speed(iter/s)": 0.26408 }, { "acc": 0.76877427, "epoch": 0.5073574761502349, "grad_norm": 4.03125, "learning_rate": 8.907998940721266e-06, "loss": 0.88146734, "memory(GiB)": 752.07, "step": 20000, "train_speed(iter/s)": 0.263925 }, { "epoch": 0.5073574761502349, "eval_acc": 0.7525969460374144, "eval_loss": 0.8819580674171448, "eval_runtime": 1151.5041, "eval_samples_per_second": 5.532, "eval_steps_per_second": 5.532, "step": 20000 }, { "acc": 0.76967292, "epoch": 0.5074843155192724, "grad_norm": 3.796875, "learning_rate": 8.907344726592259e-06, "loss": 0.93195934, "memory(GiB)": 752.07, "step": 20005, "train_speed(iter/s)": 0.257411 }, { "acc": 0.73408318, "epoch": 0.50761115488831, "grad_norm": 3.125, "learning_rate": 8.906690340591608e-06, "loss": 0.9842761, "memory(GiB)": 752.07, "step": 20010, "train_speed(iter/s)": 0.257241 }, { "acc": 0.77144737, "epoch": 0.5077379942573476, "grad_norm": 4.375, "learning_rate": 8.906035782748095e-06, "loss": 0.87378674, "memory(GiB)": 752.07, "step": 20015, "train_speed(iter/s)": 0.257084 }, { "acc": 0.75415316, "epoch": 0.5078648336263851, "grad_norm": 2.96875, "learning_rate": 8.905381053090513e-06, "loss": 0.90272503, "memory(GiB)": 752.07, "step": 20020, "train_speed(iter/s)": 0.25692 }, { "acc": 0.76675019, "epoch": 0.5079916729954227, "grad_norm": 3.921875, "learning_rate": 8.904726151647662e-06, "loss": 0.88963547, "memory(GiB)": 752.07, "step": 20025, "train_speed(iter/s)": 0.256774 }, { "acc": 0.76174254, "epoch": 0.5081185123644603, "grad_norm": 3.1875, "learning_rate": 8.904071078448348e-06, "loss": 0.89328308, "memory(GiB)": 752.07, "step": 20030, "train_speed(iter/s)": 0.256604 }, { "acc": 0.75831904, "epoch": 0.5082453517334978, "grad_norm": 4.3125, "learning_rate": 8.903415833521389e-06, "loss": 0.92675238, "memory(GiB)": 752.07, "step": 20035, "train_speed(iter/s)": 0.25647 }, { "acc": 0.7667449, "epoch": 0.5083721911025354, "grad_norm": 3.28125, "learning_rate": 8.902760416895603e-06, "loss": 0.87572403, "memory(GiB)": 752.07, "step": 20040, "train_speed(iter/s)": 0.256326 }, { "acc": 0.7648972, "epoch": 0.508499030471573, "grad_norm": 3.203125, "learning_rate": 8.902104828599821e-06, "loss": 0.88003454, "memory(GiB)": 752.07, "step": 20045, "train_speed(iter/s)": 0.256187 }, { "acc": 0.7660409, "epoch": 0.5086258698406104, "grad_norm": 3.46875, "learning_rate": 8.901449068662881e-06, "loss": 0.91682005, "memory(GiB)": 752.07, "step": 20050, "train_speed(iter/s)": 0.256037 }, { "acc": 0.73880973, "epoch": 0.508752709209648, "grad_norm": 3.484375, "learning_rate": 8.900793137113626e-06, "loss": 0.9161582, "memory(GiB)": 752.07, "step": 20055, "train_speed(iter/s)": 0.255891 }, { "acc": 0.74344664, "epoch": 0.5088795485786856, "grad_norm": 3.71875, "learning_rate": 8.900137033980911e-06, "loss": 0.97389345, "memory(GiB)": 752.07, "step": 20060, "train_speed(iter/s)": 0.255742 }, { "acc": 0.7734911, "epoch": 0.5090063879477231, "grad_norm": 3.828125, "learning_rate": 8.899480759293595e-06, "loss": 0.88524933, "memory(GiB)": 752.07, "step": 20065, "train_speed(iter/s)": 0.255592 }, { "acc": 0.75893292, "epoch": 0.5091332273167607, "grad_norm": 4.5, "learning_rate": 8.898824313080542e-06, "loss": 0.90453978, "memory(GiB)": 752.07, "step": 20070, "train_speed(iter/s)": 0.255435 }, { "acc": 0.75130124, "epoch": 0.5092600666857983, "grad_norm": 3.25, "learning_rate": 8.898167695370631e-06, "loss": 0.94056349, "memory(GiB)": 752.07, "step": 20075, "train_speed(iter/s)": 0.255294 }, { "acc": 0.76808224, "epoch": 0.5093869060548358, "grad_norm": 3.109375, "learning_rate": 8.897510906192745e-06, "loss": 0.91140051, "memory(GiB)": 752.07, "step": 20080, "train_speed(iter/s)": 0.255126 }, { "acc": 0.76482491, "epoch": 0.5095137454238734, "grad_norm": 3.75, "learning_rate": 8.896853945575772e-06, "loss": 0.88054934, "memory(GiB)": 752.07, "step": 20085, "train_speed(iter/s)": 0.254979 }, { "acc": 0.75095015, "epoch": 0.509640584792911, "grad_norm": 3.796875, "learning_rate": 8.89619681354861e-06, "loss": 0.95574427, "memory(GiB)": 752.07, "step": 20090, "train_speed(iter/s)": 0.254841 }, { "acc": 0.75203037, "epoch": 0.5097674241619485, "grad_norm": 3.921875, "learning_rate": 8.895539510140165e-06, "loss": 0.91447716, "memory(GiB)": 752.07, "step": 20095, "train_speed(iter/s)": 0.254681 }, { "acc": 0.75826321, "epoch": 0.5098942635309861, "grad_norm": 3.6875, "learning_rate": 8.894882035379346e-06, "loss": 0.9424571, "memory(GiB)": 752.07, "step": 20100, "train_speed(iter/s)": 0.254525 }, { "acc": 0.75817647, "epoch": 0.5100211029000237, "grad_norm": 3.375, "learning_rate": 8.894224389295078e-06, "loss": 0.91175261, "memory(GiB)": 752.07, "step": 20105, "train_speed(iter/s)": 0.254365 }, { "acc": 0.74051085, "epoch": 0.5101479422690611, "grad_norm": 3.203125, "learning_rate": 8.893566571916288e-06, "loss": 0.90553541, "memory(GiB)": 752.07, "step": 20110, "train_speed(iter/s)": 0.254194 }, { "acc": 0.76819906, "epoch": 0.5102747816380987, "grad_norm": 3.296875, "learning_rate": 8.892908583271908e-06, "loss": 0.94459743, "memory(GiB)": 752.07, "step": 20115, "train_speed(iter/s)": 0.254053 }, { "acc": 0.76721835, "epoch": 0.5104016210071363, "grad_norm": 4.03125, "learning_rate": 8.892250423390885e-06, "loss": 0.91487741, "memory(GiB)": 752.07, "step": 20120, "train_speed(iter/s)": 0.25391 }, { "acc": 0.76129937, "epoch": 0.5105284603761738, "grad_norm": 3.546875, "learning_rate": 8.891592092302166e-06, "loss": 0.9097909, "memory(GiB)": 752.07, "step": 20125, "train_speed(iter/s)": 0.25377 }, { "acc": 0.77599044, "epoch": 0.5106552997452114, "grad_norm": 3.5625, "learning_rate": 8.89093359003471e-06, "loss": 0.86088085, "memory(GiB)": 752.07, "step": 20130, "train_speed(iter/s)": 0.253609 }, { "acc": 0.75779524, "epoch": 0.510782139114249, "grad_norm": 3.125, "learning_rate": 8.890274916617482e-06, "loss": 0.89804096, "memory(GiB)": 752.07, "step": 20135, "train_speed(iter/s)": 0.253464 }, { "acc": 0.76279125, "epoch": 0.5109089784832865, "grad_norm": 3.265625, "learning_rate": 8.889616072079459e-06, "loss": 0.91072121, "memory(GiB)": 752.07, "step": 20140, "train_speed(iter/s)": 0.253307 }, { "acc": 0.76460414, "epoch": 0.5110358178523241, "grad_norm": 3.8125, "learning_rate": 8.888957056449614e-06, "loss": 0.96028652, "memory(GiB)": 752.07, "step": 20145, "train_speed(iter/s)": 0.253134 }, { "acc": 0.76079078, "epoch": 0.5111626572213617, "grad_norm": 3.40625, "learning_rate": 8.888297869756942e-06, "loss": 0.8769249, "memory(GiB)": 752.07, "step": 20150, "train_speed(iter/s)": 0.252933 }, { "acc": 0.76507936, "epoch": 0.5112894965903992, "grad_norm": 2.828125, "learning_rate": 8.887638512030434e-06, "loss": 0.87441149, "memory(GiB)": 752.07, "step": 20155, "train_speed(iter/s)": 0.252772 }, { "acc": 0.77154803, "epoch": 0.5114163359594368, "grad_norm": 3.40625, "learning_rate": 8.886978983299094e-06, "loss": 0.93062305, "memory(GiB)": 752.07, "step": 20160, "train_speed(iter/s)": 0.252624 }, { "acc": 0.77094917, "epoch": 0.5115431753284744, "grad_norm": 3.4375, "learning_rate": 8.886319283591935e-06, "loss": 0.9138937, "memory(GiB)": 752.07, "step": 20165, "train_speed(iter/s)": 0.252484 }, { "acc": 0.76133981, "epoch": 0.5116700146975118, "grad_norm": 3.875, "learning_rate": 8.885659412937972e-06, "loss": 0.91164188, "memory(GiB)": 752.07, "step": 20170, "train_speed(iter/s)": 0.25234 }, { "acc": 0.77716894, "epoch": 0.5117968540665494, "grad_norm": 2.859375, "learning_rate": 8.884999371366233e-06, "loss": 0.87678099, "memory(GiB)": 752.07, "step": 20175, "train_speed(iter/s)": 0.252186 }, { "acc": 0.7606319, "epoch": 0.511923693435587, "grad_norm": 3.546875, "learning_rate": 8.88433915890575e-06, "loss": 0.91960678, "memory(GiB)": 752.07, "step": 20180, "train_speed(iter/s)": 0.25203 }, { "acc": 0.75819635, "epoch": 0.5120505328046245, "grad_norm": 3.53125, "learning_rate": 8.883678775585563e-06, "loss": 0.9005435, "memory(GiB)": 752.07, "step": 20185, "train_speed(iter/s)": 0.251894 }, { "acc": 0.76982002, "epoch": 0.5121773721736621, "grad_norm": 3.421875, "learning_rate": 8.883018221434721e-06, "loss": 0.91998005, "memory(GiB)": 752.07, "step": 20190, "train_speed(iter/s)": 0.251738 }, { "acc": 0.77152472, "epoch": 0.5123042115426997, "grad_norm": 3.53125, "learning_rate": 8.88235749648228e-06, "loss": 0.89224072, "memory(GiB)": 752.07, "step": 20195, "train_speed(iter/s)": 0.251595 }, { "acc": 0.76553683, "epoch": 0.5124310509117372, "grad_norm": 3.34375, "learning_rate": 8.881696600757302e-06, "loss": 0.95360575, "memory(GiB)": 752.07, "step": 20200, "train_speed(iter/s)": 0.251453 }, { "acc": 0.76712637, "epoch": 0.5125578902807748, "grad_norm": 3.5, "learning_rate": 8.881035534288858e-06, "loss": 0.89792852, "memory(GiB)": 752.07, "step": 20205, "train_speed(iter/s)": 0.251301 }, { "acc": 0.76445279, "epoch": 0.5126847296498124, "grad_norm": 3.8125, "learning_rate": 8.880374297106026e-06, "loss": 0.86003637, "memory(GiB)": 752.07, "step": 20210, "train_speed(iter/s)": 0.251119 }, { "acc": 0.77291017, "epoch": 0.5128115690188499, "grad_norm": 3.734375, "learning_rate": 8.879712889237895e-06, "loss": 0.85537195, "memory(GiB)": 752.07, "step": 20215, "train_speed(iter/s)": 0.250975 }, { "acc": 0.76224365, "epoch": 0.5129384083878875, "grad_norm": 3.90625, "learning_rate": 8.879051310713554e-06, "loss": 0.92747498, "memory(GiB)": 752.07, "step": 20220, "train_speed(iter/s)": 0.250837 }, { "acc": 0.75754051, "epoch": 0.5130652477569251, "grad_norm": 3.953125, "learning_rate": 8.878389561562105e-06, "loss": 0.92669468, "memory(GiB)": 752.07, "step": 20225, "train_speed(iter/s)": 0.250693 }, { "acc": 0.77718606, "epoch": 0.5131920871259625, "grad_norm": 4.1875, "learning_rate": 8.877727641812656e-06, "loss": 0.84399452, "memory(GiB)": 752.07, "step": 20230, "train_speed(iter/s)": 0.250563 }, { "acc": 0.77665753, "epoch": 0.5133189264950001, "grad_norm": 3.421875, "learning_rate": 8.877065551494324e-06, "loss": 0.90267715, "memory(GiB)": 752.07, "step": 20235, "train_speed(iter/s)": 0.250428 }, { "acc": 0.75859346, "epoch": 0.5134457658640377, "grad_norm": 3.09375, "learning_rate": 8.87640329063623e-06, "loss": 0.93905306, "memory(GiB)": 752.07, "step": 20240, "train_speed(iter/s)": 0.250281 }, { "acc": 0.75106955, "epoch": 0.5135726052330752, "grad_norm": 3.796875, "learning_rate": 8.875740859267508e-06, "loss": 0.95025082, "memory(GiB)": 752.07, "step": 20245, "train_speed(iter/s)": 0.250142 }, { "acc": 0.756989, "epoch": 0.5136994446021128, "grad_norm": 3.265625, "learning_rate": 8.875078257417294e-06, "loss": 0.91526566, "memory(GiB)": 752.07, "step": 20250, "train_speed(iter/s)": 0.249987 }, { "acc": 0.7567389, "epoch": 0.5138262839711504, "grad_norm": 3.6875, "learning_rate": 8.874415485114734e-06, "loss": 0.92381458, "memory(GiB)": 752.07, "step": 20255, "train_speed(iter/s)": 0.249843 }, { "acc": 0.77606821, "epoch": 0.513953123340188, "grad_norm": 3.4375, "learning_rate": 8.873752542388983e-06, "loss": 0.88697166, "memory(GiB)": 752.07, "step": 20260, "train_speed(iter/s)": 0.249679 }, { "acc": 0.76597476, "epoch": 0.5140799627092255, "grad_norm": 3.359375, "learning_rate": 8.873089429269198e-06, "loss": 0.88429174, "memory(GiB)": 752.07, "step": 20265, "train_speed(iter/s)": 0.249547 }, { "acc": 0.75677366, "epoch": 0.5142068020782631, "grad_norm": 3.015625, "learning_rate": 8.872426145784551e-06, "loss": 0.94510469, "memory(GiB)": 752.07, "step": 20270, "train_speed(iter/s)": 0.249407 }, { "acc": 0.76055098, "epoch": 0.5143336414473006, "grad_norm": 2.984375, "learning_rate": 8.871762691964215e-06, "loss": 0.93111887, "memory(GiB)": 752.07, "step": 20275, "train_speed(iter/s)": 0.249259 }, { "acc": 0.76282692, "epoch": 0.5144604808163382, "grad_norm": 4.34375, "learning_rate": 8.871099067837374e-06, "loss": 0.88619537, "memory(GiB)": 752.07, "step": 20280, "train_speed(iter/s)": 0.249127 }, { "acc": 0.7682426, "epoch": 0.5145873201853758, "grad_norm": 3.25, "learning_rate": 8.870435273433221e-06, "loss": 0.85263557, "memory(GiB)": 752.07, "step": 20285, "train_speed(iter/s)": 0.248963 }, { "acc": 0.76664023, "epoch": 0.5147141595544132, "grad_norm": 3.1875, "learning_rate": 8.86977130878095e-06, "loss": 0.86388454, "memory(GiB)": 752.07, "step": 20290, "train_speed(iter/s)": 0.248819 }, { "acc": 0.76922441, "epoch": 0.5148409989234508, "grad_norm": 3.78125, "learning_rate": 8.86910717390977e-06, "loss": 0.91115227, "memory(GiB)": 752.07, "step": 20295, "train_speed(iter/s)": 0.248686 }, { "acc": 0.76209564, "epoch": 0.5149678382924884, "grad_norm": 3.703125, "learning_rate": 8.868442868848894e-06, "loss": 0.9349494, "memory(GiB)": 752.07, "step": 20300, "train_speed(iter/s)": 0.248542 }, { "acc": 0.76561122, "epoch": 0.515094677661526, "grad_norm": 3.015625, "learning_rate": 8.867778393627542e-06, "loss": 0.86582441, "memory(GiB)": 752.07, "step": 20305, "train_speed(iter/s)": 0.248408 }, { "acc": 0.76082067, "epoch": 0.5152215170305635, "grad_norm": 3.015625, "learning_rate": 8.86711374827494e-06, "loss": 0.92049341, "memory(GiB)": 752.07, "step": 20310, "train_speed(iter/s)": 0.248268 }, { "acc": 0.75938807, "epoch": 0.5153483563996011, "grad_norm": 4.125, "learning_rate": 8.866448932820326e-06, "loss": 0.89041328, "memory(GiB)": 752.07, "step": 20315, "train_speed(iter/s)": 0.248126 }, { "acc": 0.76093993, "epoch": 0.5154751957686386, "grad_norm": 3.453125, "learning_rate": 8.865783947292945e-06, "loss": 0.90565176, "memory(GiB)": 752.07, "step": 20320, "train_speed(iter/s)": 0.247993 }, { "acc": 0.75004339, "epoch": 0.5156020351376762, "grad_norm": 4.0, "learning_rate": 8.865118791722043e-06, "loss": 0.97346859, "memory(GiB)": 752.07, "step": 20325, "train_speed(iter/s)": 0.247861 }, { "acc": 0.75839992, "epoch": 0.5157288745067138, "grad_norm": 3.015625, "learning_rate": 8.86445346613688e-06, "loss": 0.95876637, "memory(GiB)": 752.07, "step": 20330, "train_speed(iter/s)": 0.247713 }, { "acc": 0.75429192, "epoch": 0.5158557138757514, "grad_norm": 3.59375, "learning_rate": 8.863787970566723e-06, "loss": 0.93497362, "memory(GiB)": 752.07, "step": 20335, "train_speed(iter/s)": 0.247574 }, { "acc": 0.76841221, "epoch": 0.5159825532447889, "grad_norm": 3.40625, "learning_rate": 8.863122305040845e-06, "loss": 0.85670176, "memory(GiB)": 752.07, "step": 20340, "train_speed(iter/s)": 0.24741 }, { "acc": 0.77020369, "epoch": 0.5161093926138265, "grad_norm": 4.09375, "learning_rate": 8.862456469588523e-06, "loss": 0.88744011, "memory(GiB)": 752.07, "step": 20345, "train_speed(iter/s)": 0.247271 }, { "acc": 0.756321, "epoch": 0.516236231982864, "grad_norm": 3.53125, "learning_rate": 8.86179046423905e-06, "loss": 0.94513159, "memory(GiB)": 752.07, "step": 20350, "train_speed(iter/s)": 0.247149 }, { "acc": 0.76117115, "epoch": 0.5163630713519015, "grad_norm": 3.625, "learning_rate": 8.861124289021717e-06, "loss": 0.9080533, "memory(GiB)": 752.07, "step": 20355, "train_speed(iter/s)": 0.246985 }, { "acc": 0.77221842, "epoch": 0.5164899107209391, "grad_norm": 3.125, "learning_rate": 8.86045794396583e-06, "loss": 0.86292715, "memory(GiB)": 752.07, "step": 20360, "train_speed(iter/s)": 0.246853 }, { "acc": 0.75903602, "epoch": 0.5166167500899767, "grad_norm": 3.34375, "learning_rate": 8.859791429100697e-06, "loss": 0.94116545, "memory(GiB)": 752.07, "step": 20365, "train_speed(iter/s)": 0.246719 }, { "acc": 0.75886388, "epoch": 0.5167435894590142, "grad_norm": 3.390625, "learning_rate": 8.859124744455638e-06, "loss": 0.94130011, "memory(GiB)": 752.07, "step": 20370, "train_speed(iter/s)": 0.246575 }, { "acc": 0.75071669, "epoch": 0.5168704288280518, "grad_norm": 3.734375, "learning_rate": 8.858457890059977e-06, "loss": 0.95258303, "memory(GiB)": 752.07, "step": 20375, "train_speed(iter/s)": 0.246443 }, { "acc": 0.77226181, "epoch": 0.5169972681970894, "grad_norm": 3.25, "learning_rate": 8.857790865943047e-06, "loss": 0.91441965, "memory(GiB)": 752.07, "step": 20380, "train_speed(iter/s)": 0.246298 }, { "acc": 0.76697826, "epoch": 0.5171241075661269, "grad_norm": 3.515625, "learning_rate": 8.85712367213419e-06, "loss": 0.91398287, "memory(GiB)": 752.07, "step": 20385, "train_speed(iter/s)": 0.246155 }, { "acc": 0.7651238, "epoch": 0.5172509469351645, "grad_norm": 3.171875, "learning_rate": 8.85645630866275e-06, "loss": 0.88353968, "memory(GiB)": 752.07, "step": 20390, "train_speed(iter/s)": 0.246002 }, { "acc": 0.75885162, "epoch": 0.5173777863042021, "grad_norm": 3.703125, "learning_rate": 8.855788775558088e-06, "loss": 0.95234537, "memory(GiB)": 752.07, "step": 20395, "train_speed(iter/s)": 0.245876 }, { "acc": 0.76608372, "epoch": 0.5175046256732396, "grad_norm": 3.84375, "learning_rate": 8.85512107284956e-06, "loss": 0.92599592, "memory(GiB)": 752.07, "step": 20400, "train_speed(iter/s)": 0.245741 }, { "acc": 0.76305308, "epoch": 0.5176314650422772, "grad_norm": 3.53125, "learning_rate": 8.854453200566539e-06, "loss": 0.8988452, "memory(GiB)": 752.07, "step": 20405, "train_speed(iter/s)": 0.245612 }, { "acc": 0.76155491, "epoch": 0.5177583044113147, "grad_norm": 3.921875, "learning_rate": 8.853785158738403e-06, "loss": 0.83211241, "memory(GiB)": 752.07, "step": 20410, "train_speed(iter/s)": 0.245466 }, { "acc": 0.7557826, "epoch": 0.5178851437803522, "grad_norm": 3.921875, "learning_rate": 8.853116947394538e-06, "loss": 0.98599577, "memory(GiB)": 752.07, "step": 20415, "train_speed(iter/s)": 0.245344 }, { "acc": 0.76541929, "epoch": 0.5180119831493898, "grad_norm": 3.59375, "learning_rate": 8.852448566564333e-06, "loss": 0.90652637, "memory(GiB)": 752.07, "step": 20420, "train_speed(iter/s)": 0.245208 }, { "acc": 0.76326609, "epoch": 0.5181388225184274, "grad_norm": 4.34375, "learning_rate": 8.851780016277192e-06, "loss": 0.91215849, "memory(GiB)": 752.07, "step": 20425, "train_speed(iter/s)": 0.245075 }, { "acc": 0.76171341, "epoch": 0.5182656618874649, "grad_norm": 4.0, "learning_rate": 8.85111129656252e-06, "loss": 0.88188944, "memory(GiB)": 752.07, "step": 20430, "train_speed(iter/s)": 0.244929 }, { "acc": 0.76543412, "epoch": 0.5183925012565025, "grad_norm": 3.546875, "learning_rate": 8.850442407449733e-06, "loss": 0.89297447, "memory(GiB)": 752.07, "step": 20435, "train_speed(iter/s)": 0.244789 }, { "acc": 0.76631818, "epoch": 0.5185193406255401, "grad_norm": 3.390625, "learning_rate": 8.849773348968254e-06, "loss": 0.94489403, "memory(GiB)": 752.07, "step": 20440, "train_speed(iter/s)": 0.244653 }, { "acc": 0.75712595, "epoch": 0.5186461799945776, "grad_norm": 3.09375, "learning_rate": 8.84910412114751e-06, "loss": 0.91533356, "memory(GiB)": 752.07, "step": 20445, "train_speed(iter/s)": 0.244514 }, { "acc": 0.75240941, "epoch": 0.5187730193636152, "grad_norm": 3.296875, "learning_rate": 8.848434724016939e-06, "loss": 0.93589659, "memory(GiB)": 752.07, "step": 20450, "train_speed(iter/s)": 0.244351 }, { "acc": 0.77458568, "epoch": 0.5188998587326528, "grad_norm": 2.921875, "learning_rate": 8.847765157605987e-06, "loss": 0.84270658, "memory(GiB)": 752.07, "step": 20455, "train_speed(iter/s)": 0.24423 }, { "acc": 0.76923442, "epoch": 0.5190266981016903, "grad_norm": 3.390625, "learning_rate": 8.847095421944104e-06, "loss": 0.81815805, "memory(GiB)": 752.07, "step": 20460, "train_speed(iter/s)": 0.244079 }, { "acc": 0.77108073, "epoch": 0.5191535374707279, "grad_norm": 3.75, "learning_rate": 8.846425517060753e-06, "loss": 0.90521088, "memory(GiB)": 752.07, "step": 20465, "train_speed(iter/s)": 0.243936 }, { "acc": 0.75714908, "epoch": 0.5192803768397654, "grad_norm": 4.15625, "learning_rate": 8.8457554429854e-06, "loss": 0.93376904, "memory(GiB)": 752.07, "step": 20470, "train_speed(iter/s)": 0.243807 }, { "acc": 0.7643856, "epoch": 0.5194072162088029, "grad_norm": 4.1875, "learning_rate": 8.845085199747516e-06, "loss": 0.93060455, "memory(GiB)": 752.07, "step": 20475, "train_speed(iter/s)": 0.243654 }, { "acc": 0.7690536, "epoch": 0.5195340555778405, "grad_norm": 3.3125, "learning_rate": 8.844414787376587e-06, "loss": 0.920786, "memory(GiB)": 752.07, "step": 20480, "train_speed(iter/s)": 0.243526 }, { "acc": 0.77378483, "epoch": 0.5196608949468781, "grad_norm": 3.921875, "learning_rate": 8.8437442059021e-06, "loss": 0.85308323, "memory(GiB)": 752.07, "step": 20485, "train_speed(iter/s)": 0.243388 }, { "acc": 0.76291003, "epoch": 0.5197877343159156, "grad_norm": 3.484375, "learning_rate": 8.843073455353552e-06, "loss": 0.92562418, "memory(GiB)": 752.07, "step": 20490, "train_speed(iter/s)": 0.243268 }, { "acc": 0.77429285, "epoch": 0.5199145736849532, "grad_norm": 4.0, "learning_rate": 8.842402535760446e-06, "loss": 0.86349316, "memory(GiB)": 752.07, "step": 20495, "train_speed(iter/s)": 0.243146 }, { "acc": 0.7685297, "epoch": 0.5200414130539908, "grad_norm": 3.4375, "learning_rate": 8.841731447152298e-06, "loss": 0.88805857, "memory(GiB)": 752.07, "step": 20500, "train_speed(iter/s)": 0.243015 }, { "epoch": 0.5200414130539908, "eval_acc": 0.7525489027968268, "eval_loss": 0.8809731006622314, "eval_runtime": 1149.6362, "eval_samples_per_second": 5.541, "eval_steps_per_second": 5.541, "step": 20500 }, { "acc": 0.77783031, "epoch": 0.5201682524230283, "grad_norm": 4.34375, "learning_rate": 8.841060189558621e-06, "loss": 0.84022007, "memory(GiB)": 752.07, "step": 20505, "train_speed(iter/s)": 0.237626 }, { "acc": 0.74722147, "epoch": 0.5202950917920659, "grad_norm": 3.625, "learning_rate": 8.840388763008947e-06, "loss": 1.00984926, "memory(GiB)": 752.07, "step": 20510, "train_speed(iter/s)": 0.237492 }, { "acc": 0.75276852, "epoch": 0.5204219311611035, "grad_norm": 3.703125, "learning_rate": 8.839717167532806e-06, "loss": 0.91654425, "memory(GiB)": 752.07, "step": 20515, "train_speed(iter/s)": 0.237342 }, { "acc": 0.76454873, "epoch": 0.520548770530141, "grad_norm": 3.203125, "learning_rate": 8.839045403159742e-06, "loss": 0.90181723, "memory(GiB)": 752.07, "step": 20520, "train_speed(iter/s)": 0.237232 }, { "acc": 0.75320354, "epoch": 0.5206756098991786, "grad_norm": 4.0625, "learning_rate": 8.838373469919303e-06, "loss": 0.94869976, "memory(GiB)": 752.07, "step": 20525, "train_speed(iter/s)": 0.237112 }, { "acc": 0.75768909, "epoch": 0.5208024492682161, "grad_norm": 3.8125, "learning_rate": 8.837701367841045e-06, "loss": 0.93554001, "memory(GiB)": 752.07, "step": 20530, "train_speed(iter/s)": 0.236987 }, { "acc": 0.74978142, "epoch": 0.5209292886372536, "grad_norm": 3.546875, "learning_rate": 8.83702909695453e-06, "loss": 0.98683805, "memory(GiB)": 752.07, "step": 20535, "train_speed(iter/s)": 0.236862 }, { "acc": 0.75921078, "epoch": 0.5210561280062912, "grad_norm": 3.390625, "learning_rate": 8.83635665728933e-06, "loss": 0.9470768, "memory(GiB)": 752.07, "step": 20540, "train_speed(iter/s)": 0.236722 }, { "acc": 0.77122965, "epoch": 0.5211829673753288, "grad_norm": 2.984375, "learning_rate": 8.835684048875026e-06, "loss": 0.8679018, "memory(GiB)": 752.07, "step": 20545, "train_speed(iter/s)": 0.236603 }, { "acc": 0.75866151, "epoch": 0.5213098067443663, "grad_norm": 4.0, "learning_rate": 8.835011271741201e-06, "loss": 0.91209345, "memory(GiB)": 752.07, "step": 20550, "train_speed(iter/s)": 0.236487 }, { "acc": 0.76182489, "epoch": 0.5214366461134039, "grad_norm": 3.34375, "learning_rate": 8.834338325917451e-06, "loss": 0.90813217, "memory(GiB)": 752.07, "step": 20555, "train_speed(iter/s)": 0.236358 }, { "acc": 0.79365535, "epoch": 0.5215634854824415, "grad_norm": 4.0625, "learning_rate": 8.833665211433373e-06, "loss": 0.79964452, "memory(GiB)": 752.07, "step": 20560, "train_speed(iter/s)": 0.236236 }, { "acc": 0.76153641, "epoch": 0.521690324851479, "grad_norm": 4.0625, "learning_rate": 8.832991928318578e-06, "loss": 0.94412508, "memory(GiB)": 752.07, "step": 20565, "train_speed(iter/s)": 0.236131 }, { "acc": 0.76881151, "epoch": 0.5218171642205166, "grad_norm": 3.625, "learning_rate": 8.832318476602679e-06, "loss": 0.93152628, "memory(GiB)": 752.07, "step": 20570, "train_speed(iter/s)": 0.236025 }, { "acc": 0.75768094, "epoch": 0.5219440035895542, "grad_norm": 3.515625, "learning_rate": 8.831644856315302e-06, "loss": 0.93267879, "memory(GiB)": 752.07, "step": 20575, "train_speed(iter/s)": 0.235893 }, { "acc": 0.76809607, "epoch": 0.5220708429585917, "grad_norm": 3.859375, "learning_rate": 8.830971067486077e-06, "loss": 0.91181211, "memory(GiB)": 752.07, "step": 20580, "train_speed(iter/s)": 0.235763 }, { "acc": 0.77774215, "epoch": 0.5221976823276293, "grad_norm": 3.15625, "learning_rate": 8.83029711014464e-06, "loss": 0.84379787, "memory(GiB)": 752.07, "step": 20585, "train_speed(iter/s)": 0.235621 }, { "acc": 0.77517681, "epoch": 0.5223245216966668, "grad_norm": 3.484375, "learning_rate": 8.829622984320636e-06, "loss": 0.89149714, "memory(GiB)": 752.07, "step": 20590, "train_speed(iter/s)": 0.235502 }, { "acc": 0.76560373, "epoch": 0.5224513610657043, "grad_norm": 3.90625, "learning_rate": 8.828948690043721e-06, "loss": 0.92096186, "memory(GiB)": 752.07, "step": 20595, "train_speed(iter/s)": 0.235388 }, { "acc": 0.76574087, "epoch": 0.5225782004347419, "grad_norm": 3.296875, "learning_rate": 8.828274227343554e-06, "loss": 0.94102297, "memory(GiB)": 752.07, "step": 20600, "train_speed(iter/s)": 0.235251 }, { "acc": 0.75646844, "epoch": 0.5227050398037795, "grad_norm": 3.5, "learning_rate": 8.827599596249799e-06, "loss": 0.93639154, "memory(GiB)": 752.07, "step": 20605, "train_speed(iter/s)": 0.235125 }, { "acc": 0.76846633, "epoch": 0.522831879172817, "grad_norm": 3.375, "learning_rate": 8.826924796792134e-06, "loss": 0.88943872, "memory(GiB)": 752.07, "step": 20610, "train_speed(iter/s)": 0.235006 }, { "acc": 0.76320944, "epoch": 0.5229587185418546, "grad_norm": 3.34375, "learning_rate": 8.82624982900024e-06, "loss": 0.89367695, "memory(GiB)": 752.07, "step": 20615, "train_speed(iter/s)": 0.234877 }, { "acc": 0.75224242, "epoch": 0.5230855579108922, "grad_norm": 3.453125, "learning_rate": 8.825574692903806e-06, "loss": 0.96552839, "memory(GiB)": 752.07, "step": 20620, "train_speed(iter/s)": 0.234761 }, { "acc": 0.76344566, "epoch": 0.5232123972799297, "grad_norm": 4.0, "learning_rate": 8.824899388532534e-06, "loss": 0.90956545, "memory(GiB)": 752.07, "step": 20625, "train_speed(iter/s)": 0.234646 }, { "acc": 0.76592174, "epoch": 0.5233392366489673, "grad_norm": 3.265625, "learning_rate": 8.824223915916122e-06, "loss": 0.86362047, "memory(GiB)": 752.07, "step": 20630, "train_speed(iter/s)": 0.234537 }, { "acc": 0.7636888, "epoch": 0.5234660760180049, "grad_norm": 3.5625, "learning_rate": 8.823548275084284e-06, "loss": 0.931215, "memory(GiB)": 752.07, "step": 20635, "train_speed(iter/s)": 0.234421 }, { "acc": 0.74967847, "epoch": 0.5235929153870424, "grad_norm": 3.09375, "learning_rate": 8.822872466066742e-06, "loss": 0.95828104, "memory(GiB)": 752.07, "step": 20640, "train_speed(iter/s)": 0.2343 }, { "acc": 0.75569282, "epoch": 0.52371975475608, "grad_norm": 3.453125, "learning_rate": 8.822196488893218e-06, "loss": 0.92404318, "memory(GiB)": 752.07, "step": 20645, "train_speed(iter/s)": 0.234169 }, { "acc": 0.76552382, "epoch": 0.5238465941251175, "grad_norm": 3.6875, "learning_rate": 8.821520343593452e-06, "loss": 0.90670223, "memory(GiB)": 752.07, "step": 20650, "train_speed(iter/s)": 0.234054 }, { "acc": 0.76276903, "epoch": 0.523973433494155, "grad_norm": 4.25, "learning_rate": 8.82084403019718e-06, "loss": 0.87572947, "memory(GiB)": 752.07, "step": 20655, "train_speed(iter/s)": 0.23394 }, { "acc": 0.76602631, "epoch": 0.5241002728631926, "grad_norm": 3.34375, "learning_rate": 8.820167548734153e-06, "loss": 0.86595945, "memory(GiB)": 752.07, "step": 20660, "train_speed(iter/s)": 0.2338 }, { "acc": 0.74805675, "epoch": 0.5242271122322302, "grad_norm": 3.40625, "learning_rate": 8.81949089923413e-06, "loss": 0.97042217, "memory(GiB)": 752.07, "step": 20665, "train_speed(iter/s)": 0.233666 }, { "acc": 0.75839248, "epoch": 0.5243539516012677, "grad_norm": 3.078125, "learning_rate": 8.81881408172687e-06, "loss": 0.92332325, "memory(GiB)": 752.07, "step": 20670, "train_speed(iter/s)": 0.23353 }, { "acc": 0.76959801, "epoch": 0.5244807909703053, "grad_norm": 3.40625, "learning_rate": 8.818137096242146e-06, "loss": 0.87875156, "memory(GiB)": 752.07, "step": 20675, "train_speed(iter/s)": 0.233412 }, { "acc": 0.75891585, "epoch": 0.5246076303393429, "grad_norm": 4.09375, "learning_rate": 8.817459942809735e-06, "loss": 0.88003578, "memory(GiB)": 752.07, "step": 20680, "train_speed(iter/s)": 0.233301 }, { "acc": 0.74923806, "epoch": 0.5247344697083804, "grad_norm": 3.78125, "learning_rate": 8.816782621459426e-06, "loss": 0.97063131, "memory(GiB)": 752.07, "step": 20685, "train_speed(iter/s)": 0.233187 }, { "acc": 0.7655138, "epoch": 0.524861309077418, "grad_norm": 3.65625, "learning_rate": 8.816105132221012e-06, "loss": 0.89972754, "memory(GiB)": 752.07, "step": 20690, "train_speed(iter/s)": 0.23306 }, { "acc": 0.76839485, "epoch": 0.5249881484464556, "grad_norm": 4.03125, "learning_rate": 8.815427475124288e-06, "loss": 0.89162226, "memory(GiB)": 752.07, "step": 20695, "train_speed(iter/s)": 0.232966 }, { "acc": 0.77298293, "epoch": 0.5251149878154932, "grad_norm": 3.5625, "learning_rate": 8.81474965019907e-06, "loss": 0.86743269, "memory(GiB)": 752.07, "step": 20700, "train_speed(iter/s)": 0.232843 }, { "acc": 0.76284828, "epoch": 0.5252418271845307, "grad_norm": 3.578125, "learning_rate": 8.814071657475168e-06, "loss": 0.89641914, "memory(GiB)": 752.07, "step": 20705, "train_speed(iter/s)": 0.232719 }, { "acc": 0.75323281, "epoch": 0.5253686665535682, "grad_norm": 4.21875, "learning_rate": 8.813393496982406e-06, "loss": 0.94695873, "memory(GiB)": 752.07, "step": 20710, "train_speed(iter/s)": 0.232599 }, { "acc": 0.76228757, "epoch": 0.5254955059226057, "grad_norm": 3.21875, "learning_rate": 8.812715168750612e-06, "loss": 0.87239847, "memory(GiB)": 752.07, "step": 20715, "train_speed(iter/s)": 0.232483 }, { "acc": 0.77163234, "epoch": 0.5256223452916433, "grad_norm": 3.75, "learning_rate": 8.81203667280963e-06, "loss": 0.86666956, "memory(GiB)": 752.07, "step": 20720, "train_speed(iter/s)": 0.232372 }, { "acc": 0.75310316, "epoch": 0.5257491846606809, "grad_norm": 3.546875, "learning_rate": 8.811358009189298e-06, "loss": 0.93905134, "memory(GiB)": 752.07, "step": 20725, "train_speed(iter/s)": 0.232266 }, { "acc": 0.76827421, "epoch": 0.5258760240297184, "grad_norm": 3.734375, "learning_rate": 8.810679177919472e-06, "loss": 0.92072144, "memory(GiB)": 752.07, "step": 20730, "train_speed(iter/s)": 0.232134 }, { "acc": 0.76787472, "epoch": 0.526002863398756, "grad_norm": 3.4375, "learning_rate": 8.810000179030009e-06, "loss": 0.8994194, "memory(GiB)": 752.07, "step": 20735, "train_speed(iter/s)": 0.232017 }, { "acc": 0.76861582, "epoch": 0.5261297027677936, "grad_norm": 3.0625, "learning_rate": 8.80932101255078e-06, "loss": 0.90146856, "memory(GiB)": 752.07, "step": 20740, "train_speed(iter/s)": 0.231897 }, { "acc": 0.76679082, "epoch": 0.5262565421368312, "grad_norm": 3.28125, "learning_rate": 8.808641678511654e-06, "loss": 0.91972351, "memory(GiB)": 752.07, "step": 20745, "train_speed(iter/s)": 0.231785 }, { "acc": 0.76101003, "epoch": 0.5263833815058687, "grad_norm": 3.390625, "learning_rate": 8.807962176942517e-06, "loss": 0.90666485, "memory(GiB)": 752.07, "step": 20750, "train_speed(iter/s)": 0.231659 }, { "acc": 0.75840869, "epoch": 0.5265102208749063, "grad_norm": 3.171875, "learning_rate": 8.807282507873258e-06, "loss": 0.94904728, "memory(GiB)": 752.07, "step": 20755, "train_speed(iter/s)": 0.231548 }, { "acc": 0.77367177, "epoch": 0.5266370602439439, "grad_norm": 3.75, "learning_rate": 8.80660267133377e-06, "loss": 0.88704996, "memory(GiB)": 752.07, "step": 20760, "train_speed(iter/s)": 0.231442 }, { "acc": 0.7550108, "epoch": 0.5267638996129814, "grad_norm": 3.78125, "learning_rate": 8.80592266735396e-06, "loss": 0.92475376, "memory(GiB)": 752.07, "step": 20765, "train_speed(iter/s)": 0.231303 }, { "acc": 0.77028985, "epoch": 0.5268907389820189, "grad_norm": 3.96875, "learning_rate": 8.80524249596374e-06, "loss": 0.86847277, "memory(GiB)": 752.07, "step": 20770, "train_speed(iter/s)": 0.231183 }, { "acc": 0.75564113, "epoch": 0.5270175783510564, "grad_norm": 3.21875, "learning_rate": 8.804562157193026e-06, "loss": 0.97232428, "memory(GiB)": 752.07, "step": 20775, "train_speed(iter/s)": 0.231068 }, { "acc": 0.7540432, "epoch": 0.527144417720094, "grad_norm": 3.328125, "learning_rate": 8.803881651071745e-06, "loss": 0.93224277, "memory(GiB)": 752.07, "step": 20780, "train_speed(iter/s)": 0.230951 }, { "acc": 0.76123753, "epoch": 0.5272712570891316, "grad_norm": 3.875, "learning_rate": 8.803200977629828e-06, "loss": 0.97864656, "memory(GiB)": 752.07, "step": 20785, "train_speed(iter/s)": 0.230828 }, { "acc": 0.76492414, "epoch": 0.5273980964581692, "grad_norm": 3.421875, "learning_rate": 8.80252013689722e-06, "loss": 0.90525103, "memory(GiB)": 752.07, "step": 20790, "train_speed(iter/s)": 0.230723 }, { "acc": 0.74588485, "epoch": 0.5275249358272067, "grad_norm": 3.703125, "learning_rate": 8.801839128903864e-06, "loss": 0.94358425, "memory(GiB)": 752.07, "step": 20795, "train_speed(iter/s)": 0.230609 }, { "acc": 0.75786405, "epoch": 0.5276517751962443, "grad_norm": 4.03125, "learning_rate": 8.80115795367972e-06, "loss": 0.92179918, "memory(GiB)": 752.07, "step": 20800, "train_speed(iter/s)": 0.230488 }, { "acc": 0.76101007, "epoch": 0.5277786145652819, "grad_norm": 3.875, "learning_rate": 8.80047661125475e-06, "loss": 0.89133949, "memory(GiB)": 752.07, "step": 20805, "train_speed(iter/s)": 0.230384 }, { "acc": 0.77411175, "epoch": 0.5279054539343194, "grad_norm": 3.703125, "learning_rate": 8.79979510165892e-06, "loss": 0.90825663, "memory(GiB)": 752.07, "step": 20810, "train_speed(iter/s)": 0.230271 }, { "acc": 0.75322504, "epoch": 0.528032293303357, "grad_norm": 3.125, "learning_rate": 8.799113424922212e-06, "loss": 0.92644587, "memory(GiB)": 752.07, "step": 20815, "train_speed(iter/s)": 0.230143 }, { "acc": 0.7622355, "epoch": 0.5281591326723946, "grad_norm": 3.21875, "learning_rate": 8.798431581074609e-06, "loss": 0.90628529, "memory(GiB)": 752.07, "step": 20820, "train_speed(iter/s)": 0.230034 }, { "acc": 0.76249709, "epoch": 0.5282859720414321, "grad_norm": 3.171875, "learning_rate": 8.797749570146103e-06, "loss": 0.89759083, "memory(GiB)": 752.07, "step": 20825, "train_speed(iter/s)": 0.22991 }, { "acc": 0.75571308, "epoch": 0.5284128114104696, "grad_norm": 3.390625, "learning_rate": 8.797067392166696e-06, "loss": 0.95415545, "memory(GiB)": 752.07, "step": 20830, "train_speed(iter/s)": 0.229795 }, { "acc": 0.76009645, "epoch": 0.5285396507795072, "grad_norm": 4.03125, "learning_rate": 8.79638504716639e-06, "loss": 0.87891102, "memory(GiB)": 752.07, "step": 20835, "train_speed(iter/s)": 0.22969 }, { "acc": 0.75646276, "epoch": 0.5286664901485447, "grad_norm": 3.78125, "learning_rate": 8.795702535175201e-06, "loss": 0.91452703, "memory(GiB)": 752.07, "step": 20840, "train_speed(iter/s)": 0.229577 }, { "acc": 0.77144656, "epoch": 0.5287933295175823, "grad_norm": 3.0625, "learning_rate": 8.795019856223153e-06, "loss": 0.89537687, "memory(GiB)": 752.07, "step": 20845, "train_speed(iter/s)": 0.22946 }, { "acc": 0.76270804, "epoch": 0.5289201688866199, "grad_norm": 3.46875, "learning_rate": 8.794337010340274e-06, "loss": 0.90934505, "memory(GiB)": 752.07, "step": 20850, "train_speed(iter/s)": 0.229354 }, { "acc": 0.77969503, "epoch": 0.5290470082556574, "grad_norm": 3.46875, "learning_rate": 8.793653997556599e-06, "loss": 0.88007126, "memory(GiB)": 752.07, "step": 20855, "train_speed(iter/s)": 0.229235 }, { "acc": 0.76062322, "epoch": 0.529173847624695, "grad_norm": 4.09375, "learning_rate": 8.792970817902171e-06, "loss": 0.90760717, "memory(GiB)": 752.07, "step": 20860, "train_speed(iter/s)": 0.229111 }, { "acc": 0.7530623, "epoch": 0.5293006869937326, "grad_norm": 3.296875, "learning_rate": 8.792287471407042e-06, "loss": 0.95552025, "memory(GiB)": 752.07, "step": 20865, "train_speed(iter/s)": 0.22899 }, { "acc": 0.74447412, "epoch": 0.5294275263627701, "grad_norm": 4.0625, "learning_rate": 8.791603958101271e-06, "loss": 0.99443083, "memory(GiB)": 752.07, "step": 20870, "train_speed(iter/s)": 0.228866 }, { "acc": 0.77175665, "epoch": 0.5295543657318077, "grad_norm": 3.296875, "learning_rate": 8.790920278014921e-06, "loss": 0.90896072, "memory(GiB)": 752.07, "step": 20875, "train_speed(iter/s)": 0.228745 }, { "acc": 0.75785718, "epoch": 0.5296812051008453, "grad_norm": 3.4375, "learning_rate": 8.790236431178069e-06, "loss": 0.91569271, "memory(GiB)": 752.07, "step": 20880, "train_speed(iter/s)": 0.228623 }, { "acc": 0.75428376, "epoch": 0.5298080444698828, "grad_norm": 2.96875, "learning_rate": 8.78955241762079e-06, "loss": 0.96259308, "memory(GiB)": 752.07, "step": 20885, "train_speed(iter/s)": 0.228501 }, { "acc": 0.76371779, "epoch": 0.5299348838389203, "grad_norm": 3.265625, "learning_rate": 8.788868237373176e-06, "loss": 0.92630806, "memory(GiB)": 752.07, "step": 20890, "train_speed(iter/s)": 0.228373 }, { "acc": 0.76505184, "epoch": 0.5300617232079579, "grad_norm": 3.59375, "learning_rate": 8.78818389046532e-06, "loss": 0.89431696, "memory(GiB)": 752.07, "step": 20895, "train_speed(iter/s)": 0.228273 }, { "acc": 0.75059047, "epoch": 0.5301885625769954, "grad_norm": 3.4375, "learning_rate": 8.787499376927326e-06, "loss": 0.96456099, "memory(GiB)": 752.07, "step": 20900, "train_speed(iter/s)": 0.228152 }, { "acc": 0.76611605, "epoch": 0.530315401946033, "grad_norm": 3.53125, "learning_rate": 8.786814696789301e-06, "loss": 0.91151657, "memory(GiB)": 752.07, "step": 20905, "train_speed(iter/s)": 0.228038 }, { "acc": 0.75881906, "epoch": 0.5304422413150706, "grad_norm": 4.34375, "learning_rate": 8.786129850081363e-06, "loss": 0.89751387, "memory(GiB)": 752.07, "step": 20910, "train_speed(iter/s)": 0.227923 }, { "acc": 0.76797457, "epoch": 0.5305690806841081, "grad_norm": 3.828125, "learning_rate": 8.785444836833636e-06, "loss": 0.86484203, "memory(GiB)": 752.07, "step": 20915, "train_speed(iter/s)": 0.227805 }, { "acc": 0.76418152, "epoch": 0.5306959200531457, "grad_norm": 3.28125, "learning_rate": 8.784759657076251e-06, "loss": 0.90322371, "memory(GiB)": 752.07, "step": 20920, "train_speed(iter/s)": 0.227683 }, { "acc": 0.76484318, "epoch": 0.5308227594221833, "grad_norm": 3.4375, "learning_rate": 8.784074310839348e-06, "loss": 0.92608566, "memory(GiB)": 752.07, "step": 20925, "train_speed(iter/s)": 0.227587 }, { "acc": 0.7740262, "epoch": 0.5309495987912208, "grad_norm": 3.5625, "learning_rate": 8.783388798153075e-06, "loss": 0.87050505, "memory(GiB)": 752.07, "step": 20930, "train_speed(iter/s)": 0.227462 }, { "acc": 0.76038327, "epoch": 0.5310764381602584, "grad_norm": 3.515625, "learning_rate": 8.782703119047582e-06, "loss": 0.92051926, "memory(GiB)": 752.07, "step": 20935, "train_speed(iter/s)": 0.227338 }, { "acc": 0.74321451, "epoch": 0.531203277529296, "grad_norm": 3.9375, "learning_rate": 8.78201727355303e-06, "loss": 0.99988174, "memory(GiB)": 752.07, "step": 20940, "train_speed(iter/s)": 0.227224 }, { "acc": 0.76844053, "epoch": 0.5313301168983335, "grad_norm": 3.234375, "learning_rate": 8.78133126169959e-06, "loss": 0.89269161, "memory(GiB)": 752.07, "step": 20945, "train_speed(iter/s)": 0.227114 }, { "acc": 0.74921017, "epoch": 0.531456956267371, "grad_norm": 3.140625, "learning_rate": 8.780645083517435e-06, "loss": 0.94385071, "memory(GiB)": 752.07, "step": 20950, "train_speed(iter/s)": 0.22701 }, { "acc": 0.77177992, "epoch": 0.5315837956364086, "grad_norm": 3.328125, "learning_rate": 8.779958739036749e-06, "loss": 0.82983036, "memory(GiB)": 752.07, "step": 20955, "train_speed(iter/s)": 0.226889 }, { "acc": 0.7567472, "epoch": 0.5317106350054461, "grad_norm": 3.515625, "learning_rate": 8.779272228287721e-06, "loss": 0.95694656, "memory(GiB)": 752.07, "step": 20960, "train_speed(iter/s)": 0.226773 }, { "acc": 0.76499515, "epoch": 0.5318374743744837, "grad_norm": 4.59375, "learning_rate": 8.77858555130055e-06, "loss": 0.91983862, "memory(GiB)": 752.07, "step": 20965, "train_speed(iter/s)": 0.226658 }, { "acc": 0.75763898, "epoch": 0.5319643137435213, "grad_norm": 3.359375, "learning_rate": 8.77789870810544e-06, "loss": 0.95894852, "memory(GiB)": 752.07, "step": 20970, "train_speed(iter/s)": 0.22654 }, { "acc": 0.76878805, "epoch": 0.5320911531125588, "grad_norm": 4.65625, "learning_rate": 8.777211698732602e-06, "loss": 0.92641497, "memory(GiB)": 752.07, "step": 20975, "train_speed(iter/s)": 0.226442 }, { "acc": 0.76849384, "epoch": 0.5322179924815964, "grad_norm": 3.671875, "learning_rate": 8.776524523212257e-06, "loss": 0.90643988, "memory(GiB)": 752.07, "step": 20980, "train_speed(iter/s)": 0.226335 }, { "acc": 0.76800518, "epoch": 0.532344831850634, "grad_norm": 3.21875, "learning_rate": 8.775837181574632e-06, "loss": 0.8890913, "memory(GiB)": 752.07, "step": 20985, "train_speed(iter/s)": 0.226211 }, { "acc": 0.78089852, "epoch": 0.5324716712196715, "grad_norm": 3.4375, "learning_rate": 8.775149673849959e-06, "loss": 0.87493601, "memory(GiB)": 752.07, "step": 20990, "train_speed(iter/s)": 0.226096 }, { "acc": 0.74856524, "epoch": 0.5325985105887091, "grad_norm": 4.125, "learning_rate": 8.77446200006848e-06, "loss": 0.94270229, "memory(GiB)": 752.07, "step": 20995, "train_speed(iter/s)": 0.225982 }, { "acc": 0.76369886, "epoch": 0.5327253499577467, "grad_norm": 3.40625, "learning_rate": 8.773774160260444e-06, "loss": 0.90557337, "memory(GiB)": 752.07, "step": 21000, "train_speed(iter/s)": 0.225873 }, { "epoch": 0.5327253499577467, "eval_acc": 0.7528864587828684, "eval_loss": 0.8800991177558899, "eval_runtime": 1148.6501, "eval_samples_per_second": 5.546, "eval_steps_per_second": 5.546, "step": 21000 }, { "acc": 0.75946255, "epoch": 0.5328521893267842, "grad_norm": 3.484375, "learning_rate": 8.773086154456106e-06, "loss": 0.90586538, "memory(GiB)": 752.07, "step": 21005, "train_speed(iter/s)": 0.221325 }, { "acc": 0.76745238, "epoch": 0.5329790286958217, "grad_norm": 3.4375, "learning_rate": 8.77239798268573e-06, "loss": 0.88890638, "memory(GiB)": 752.07, "step": 21010, "train_speed(iter/s)": 0.22121 }, { "acc": 0.77129078, "epoch": 0.5331058680648593, "grad_norm": 4.03125, "learning_rate": 8.771709644979586e-06, "loss": 0.87452421, "memory(GiB)": 752.07, "step": 21015, "train_speed(iter/s)": 0.22112 }, { "acc": 0.75337276, "epoch": 0.5332327074338968, "grad_norm": 3.84375, "learning_rate": 8.771021141367954e-06, "loss": 0.94806404, "memory(GiB)": 752.07, "step": 21020, "train_speed(iter/s)": 0.221028 }, { "acc": 0.75104976, "epoch": 0.5333595468029344, "grad_norm": 3.203125, "learning_rate": 8.770332471881117e-06, "loss": 0.95562496, "memory(GiB)": 752.07, "step": 21025, "train_speed(iter/s)": 0.22091 }, { "acc": 0.77334561, "epoch": 0.533486386171972, "grad_norm": 3.796875, "learning_rate": 8.769643636549367e-06, "loss": 0.90418653, "memory(GiB)": 752.07, "step": 21030, "train_speed(iter/s)": 0.220822 }, { "acc": 0.77099357, "epoch": 0.5336132255410095, "grad_norm": 2.703125, "learning_rate": 8.768954635403003e-06, "loss": 0.85923691, "memory(GiB)": 752.07, "step": 21035, "train_speed(iter/s)": 0.220724 }, { "acc": 0.76601315, "epoch": 0.5337400649100471, "grad_norm": 4.09375, "learning_rate": 8.768265468472335e-06, "loss": 0.93799267, "memory(GiB)": 752.07, "step": 21040, "train_speed(iter/s)": 0.220625 }, { "acc": 0.75250201, "epoch": 0.5338669042790847, "grad_norm": 3.65625, "learning_rate": 8.767576135787674e-06, "loss": 0.9689229, "memory(GiB)": 752.07, "step": 21045, "train_speed(iter/s)": 0.220522 }, { "acc": 0.76353998, "epoch": 0.5339937436481222, "grad_norm": 3.296875, "learning_rate": 8.766886637379344e-06, "loss": 0.90928917, "memory(GiB)": 752.07, "step": 21050, "train_speed(iter/s)": 0.220423 }, { "acc": 0.76962576, "epoch": 0.5341205830171598, "grad_norm": 4.0, "learning_rate": 8.766196973277672e-06, "loss": 0.92173338, "memory(GiB)": 752.07, "step": 21055, "train_speed(iter/s)": 0.220306 }, { "acc": 0.77173886, "epoch": 0.5342474223861974, "grad_norm": 3.75, "learning_rate": 8.765507143512994e-06, "loss": 0.8358099, "memory(GiB)": 752.07, "step": 21060, "train_speed(iter/s)": 0.220206 }, { "acc": 0.76171303, "epoch": 0.534374261755235, "grad_norm": 3.40625, "learning_rate": 8.764817148115657e-06, "loss": 0.93451166, "memory(GiB)": 752.07, "step": 21065, "train_speed(iter/s)": 0.220101 }, { "acc": 0.76724048, "epoch": 0.5345011011242724, "grad_norm": 2.765625, "learning_rate": 8.764126987116009e-06, "loss": 0.85818548, "memory(GiB)": 752.07, "step": 21070, "train_speed(iter/s)": 0.22 }, { "acc": 0.77311039, "epoch": 0.53462794049331, "grad_norm": 4.28125, "learning_rate": 8.763436660544407e-06, "loss": 0.89360266, "memory(GiB)": 752.07, "step": 21075, "train_speed(iter/s)": 0.219916 }, { "acc": 0.77090917, "epoch": 0.5347547798623475, "grad_norm": 3.140625, "learning_rate": 8.762746168431217e-06, "loss": 0.8735755, "memory(GiB)": 752.07, "step": 21080, "train_speed(iter/s)": 0.219794 }, { "acc": 0.75462055, "epoch": 0.5348816192313851, "grad_norm": 3.546875, "learning_rate": 8.762055510806813e-06, "loss": 0.94744816, "memory(GiB)": 752.07, "step": 21085, "train_speed(iter/s)": 0.219702 }, { "acc": 0.78720493, "epoch": 0.5350084586004227, "grad_norm": 3.9375, "learning_rate": 8.761364687701573e-06, "loss": 0.88854952, "memory(GiB)": 752.07, "step": 21090, "train_speed(iter/s)": 0.219594 }, { "acc": 0.7582799, "epoch": 0.5351352979694602, "grad_norm": 3.3125, "learning_rate": 8.760673699145882e-06, "loss": 0.87916679, "memory(GiB)": 752.07, "step": 21095, "train_speed(iter/s)": 0.219487 }, { "acc": 0.78531046, "epoch": 0.5352621373384978, "grad_norm": 3.015625, "learning_rate": 8.75998254517014e-06, "loss": 0.88709154, "memory(GiB)": 752.07, "step": 21100, "train_speed(iter/s)": 0.219394 }, { "acc": 0.76797886, "epoch": 0.5353889767075354, "grad_norm": 3.859375, "learning_rate": 8.759291225804746e-06, "loss": 0.94103003, "memory(GiB)": 752.07, "step": 21105, "train_speed(iter/s)": 0.219295 }, { "acc": 0.76755538, "epoch": 0.535515816076573, "grad_norm": 3.296875, "learning_rate": 8.75859974108011e-06, "loss": 0.85682116, "memory(GiB)": 752.07, "step": 21110, "train_speed(iter/s)": 0.2192 }, { "acc": 0.76261778, "epoch": 0.5356426554456105, "grad_norm": 3.4375, "learning_rate": 8.757908091026646e-06, "loss": 0.93727312, "memory(GiB)": 752.07, "step": 21115, "train_speed(iter/s)": 0.219095 }, { "acc": 0.77575498, "epoch": 0.5357694948146481, "grad_norm": 3.203125, "learning_rate": 8.757216275674777e-06, "loss": 0.86431065, "memory(GiB)": 752.07, "step": 21120, "train_speed(iter/s)": 0.218985 }, { "acc": 0.77605486, "epoch": 0.5358963341836857, "grad_norm": 3.0625, "learning_rate": 8.756524295054937e-06, "loss": 0.86195774, "memory(GiB)": 752.07, "step": 21125, "train_speed(iter/s)": 0.218891 }, { "acc": 0.76510434, "epoch": 0.5360231735527231, "grad_norm": 3.359375, "learning_rate": 8.755832149197562e-06, "loss": 0.88696737, "memory(GiB)": 752.07, "step": 21130, "train_speed(iter/s)": 0.21878 }, { "acc": 0.76850314, "epoch": 0.5361500129217607, "grad_norm": 3.171875, "learning_rate": 8.755139838133098e-06, "loss": 0.92617397, "memory(GiB)": 752.07, "step": 21135, "train_speed(iter/s)": 0.218674 }, { "acc": 0.76975236, "epoch": 0.5362768522907982, "grad_norm": 3.1875, "learning_rate": 8.754447361891995e-06, "loss": 0.84012775, "memory(GiB)": 752.07, "step": 21140, "train_speed(iter/s)": 0.218574 }, { "acc": 0.76484642, "epoch": 0.5364036916598358, "grad_norm": 3.15625, "learning_rate": 8.753754720504717e-06, "loss": 0.93876152, "memory(GiB)": 752.07, "step": 21145, "train_speed(iter/s)": 0.218463 }, { "acc": 0.75015039, "epoch": 0.5365305310288734, "grad_norm": 3.15625, "learning_rate": 8.753061914001728e-06, "loss": 0.92971869, "memory(GiB)": 752.07, "step": 21150, "train_speed(iter/s)": 0.21836 }, { "acc": 0.7593194, "epoch": 0.536657370397911, "grad_norm": 3.25, "learning_rate": 8.752368942413504e-06, "loss": 0.88620005, "memory(GiB)": 752.07, "step": 21155, "train_speed(iter/s)": 0.218256 }, { "acc": 0.76428647, "epoch": 0.5367842097669485, "grad_norm": 4.28125, "learning_rate": 8.751675805770526e-06, "loss": 0.92454453, "memory(GiB)": 752.07, "step": 21160, "train_speed(iter/s)": 0.21816 }, { "acc": 0.76176782, "epoch": 0.5369110491359861, "grad_norm": 4.8125, "learning_rate": 8.750982504103282e-06, "loss": 0.87381706, "memory(GiB)": 752.07, "step": 21165, "train_speed(iter/s)": 0.218056 }, { "acc": 0.77071838, "epoch": 0.5370378885050237, "grad_norm": 3.84375, "learning_rate": 8.75028903744227e-06, "loss": 0.91084204, "memory(GiB)": 752.07, "step": 21170, "train_speed(iter/s)": 0.217958 }, { "acc": 0.76002793, "epoch": 0.5371647278740612, "grad_norm": 3.75, "learning_rate": 8.749595405817993e-06, "loss": 0.98053036, "memory(GiB)": 752.07, "step": 21175, "train_speed(iter/s)": 0.217872 }, { "acc": 0.76856561, "epoch": 0.5372915672430988, "grad_norm": 4.125, "learning_rate": 8.748901609260958e-06, "loss": 0.90505266, "memory(GiB)": 752.07, "step": 21180, "train_speed(iter/s)": 0.217782 }, { "acc": 0.77279782, "epoch": 0.5374184066121364, "grad_norm": 4.0625, "learning_rate": 8.748207647801689e-06, "loss": 0.85594177, "memory(GiB)": 752.07, "step": 21185, "train_speed(iter/s)": 0.217688 }, { "acc": 0.76646023, "epoch": 0.5375452459811738, "grad_norm": 3.265625, "learning_rate": 8.747513521470707e-06, "loss": 0.91565924, "memory(GiB)": 752.07, "step": 21190, "train_speed(iter/s)": 0.217602 }, { "acc": 0.76255836, "epoch": 0.5376720853502114, "grad_norm": 5.84375, "learning_rate": 8.746819230298547e-06, "loss": 0.87223473, "memory(GiB)": 752.07, "step": 21195, "train_speed(iter/s)": 0.217506 }, { "acc": 0.77866383, "epoch": 0.537798924719249, "grad_norm": 3.15625, "learning_rate": 8.746124774315745e-06, "loss": 0.865452, "memory(GiB)": 752.07, "step": 21200, "train_speed(iter/s)": 0.217407 }, { "acc": 0.75632839, "epoch": 0.5379257640882865, "grad_norm": 3.171875, "learning_rate": 8.74543015355285e-06, "loss": 0.92298498, "memory(GiB)": 752.07, "step": 21205, "train_speed(iter/s)": 0.21731 }, { "acc": 0.75768008, "epoch": 0.5380526034573241, "grad_norm": 4.53125, "learning_rate": 8.744735368040419e-06, "loss": 0.9415781, "memory(GiB)": 752.07, "step": 21210, "train_speed(iter/s)": 0.217214 }, { "acc": 0.75766945, "epoch": 0.5381794428263617, "grad_norm": 3.40625, "learning_rate": 8.744040417809007e-06, "loss": 0.92924404, "memory(GiB)": 752.07, "step": 21215, "train_speed(iter/s)": 0.217112 }, { "acc": 0.75298424, "epoch": 0.5383062821953992, "grad_norm": 3.53125, "learning_rate": 8.743345302889188e-06, "loss": 0.96047621, "memory(GiB)": 752.07, "step": 21220, "train_speed(iter/s)": 0.217018 }, { "acc": 0.76264148, "epoch": 0.5384331215644368, "grad_norm": 4.5625, "learning_rate": 8.742650023311536e-06, "loss": 0.87137423, "memory(GiB)": 752.07, "step": 21225, "train_speed(iter/s)": 0.216915 }, { "acc": 0.77479887, "epoch": 0.5385599609334744, "grad_norm": 3.0625, "learning_rate": 8.741954579106636e-06, "loss": 0.84665956, "memory(GiB)": 752.07, "step": 21230, "train_speed(iter/s)": 0.216815 }, { "acc": 0.76040583, "epoch": 0.5386868003025119, "grad_norm": 3.28125, "learning_rate": 8.741258970305073e-06, "loss": 0.89746151, "memory(GiB)": 752.07, "step": 21235, "train_speed(iter/s)": 0.216704 }, { "acc": 0.76824455, "epoch": 0.5388136396715495, "grad_norm": 3.296875, "learning_rate": 8.74056319693745e-06, "loss": 0.88593245, "memory(GiB)": 752.07, "step": 21240, "train_speed(iter/s)": 0.216601 }, { "acc": 0.75644851, "epoch": 0.5389404790405871, "grad_norm": 3.578125, "learning_rate": 8.739867259034372e-06, "loss": 0.92552328, "memory(GiB)": 752.07, "step": 21245, "train_speed(iter/s)": 0.216498 }, { "acc": 0.76436801, "epoch": 0.5390673184096245, "grad_norm": 3.453125, "learning_rate": 8.739171156626446e-06, "loss": 0.88207846, "memory(GiB)": 752.07, "step": 21250, "train_speed(iter/s)": 0.216407 }, { "acc": 0.7670805, "epoch": 0.5391941577786621, "grad_norm": 3.578125, "learning_rate": 8.738474889744297e-06, "loss": 0.93737974, "memory(GiB)": 752.07, "step": 21255, "train_speed(iter/s)": 0.216299 }, { "acc": 0.76784453, "epoch": 0.5393209971476997, "grad_norm": 4.1875, "learning_rate": 8.737778458418549e-06, "loss": 0.90443001, "memory(GiB)": 752.07, "step": 21260, "train_speed(iter/s)": 0.216212 }, { "acc": 0.7378684, "epoch": 0.5394478365167372, "grad_norm": 3.34375, "learning_rate": 8.737081862679836e-06, "loss": 1.00058336, "memory(GiB)": 752.07, "step": 21265, "train_speed(iter/s)": 0.216112 }, { "acc": 0.75929708, "epoch": 0.5395746758857748, "grad_norm": 3.265625, "learning_rate": 8.736385102558797e-06, "loss": 0.92259369, "memory(GiB)": 752.07, "step": 21270, "train_speed(iter/s)": 0.216025 }, { "acc": 0.76554265, "epoch": 0.5397015152548124, "grad_norm": 3.015625, "learning_rate": 8.735688178086084e-06, "loss": 0.87378979, "memory(GiB)": 752.07, "step": 21275, "train_speed(iter/s)": 0.215932 }, { "acc": 0.77788696, "epoch": 0.5398283546238499, "grad_norm": 3.03125, "learning_rate": 8.734991089292349e-06, "loss": 0.91011887, "memory(GiB)": 752.07, "step": 21280, "train_speed(iter/s)": 0.215847 }, { "acc": 0.75237699, "epoch": 0.5399551939928875, "grad_norm": 3.265625, "learning_rate": 8.734293836208257e-06, "loss": 0.90247288, "memory(GiB)": 752.07, "step": 21285, "train_speed(iter/s)": 0.21574 }, { "acc": 0.77783794, "epoch": 0.5400820333619251, "grad_norm": 3.265625, "learning_rate": 8.733596418864478e-06, "loss": 0.8593297, "memory(GiB)": 752.07, "step": 21290, "train_speed(iter/s)": 0.215637 }, { "acc": 0.75404892, "epoch": 0.5402088727309626, "grad_norm": 3.53125, "learning_rate": 8.732898837291687e-06, "loss": 0.91705179, "memory(GiB)": 752.07, "step": 21295, "train_speed(iter/s)": 0.21554 }, { "acc": 0.76183186, "epoch": 0.5403357121000002, "grad_norm": 3.484375, "learning_rate": 8.732201091520572e-06, "loss": 0.92591267, "memory(GiB)": 752.07, "step": 21300, "train_speed(iter/s)": 0.215437 }, { "acc": 0.74962935, "epoch": 0.5404625514690378, "grad_norm": 3.71875, "learning_rate": 8.731503181581822e-06, "loss": 0.93911333, "memory(GiB)": 752.07, "step": 21305, "train_speed(iter/s)": 0.215327 }, { "acc": 0.77270284, "epoch": 0.5405893908380752, "grad_norm": 3.34375, "learning_rate": 8.730805107506135e-06, "loss": 0.84898462, "memory(GiB)": 752.07, "step": 21310, "train_speed(iter/s)": 0.215236 }, { "acc": 0.76143475, "epoch": 0.5407162302071128, "grad_norm": 3.84375, "learning_rate": 8.73010686932422e-06, "loss": 0.94406939, "memory(GiB)": 752.07, "step": 21315, "train_speed(iter/s)": 0.215133 }, { "acc": 0.7646143, "epoch": 0.5408430695761504, "grad_norm": 2.921875, "learning_rate": 8.729408467066788e-06, "loss": 0.89918756, "memory(GiB)": 752.07, "step": 21320, "train_speed(iter/s)": 0.215045 }, { "acc": 0.75927124, "epoch": 0.5409699089451879, "grad_norm": 3.609375, "learning_rate": 8.72870990076456e-06, "loss": 0.90941038, "memory(GiB)": 752.07, "step": 21325, "train_speed(iter/s)": 0.214958 }, { "acc": 0.7601665, "epoch": 0.5410967483142255, "grad_norm": 3.375, "learning_rate": 8.728011170448264e-06, "loss": 0.90967188, "memory(GiB)": 752.07, "step": 21330, "train_speed(iter/s)": 0.214865 }, { "acc": 0.74570241, "epoch": 0.5412235876832631, "grad_norm": 3.65625, "learning_rate": 8.727312276148634e-06, "loss": 0.96237659, "memory(GiB)": 752.07, "step": 21335, "train_speed(iter/s)": 0.214785 }, { "acc": 0.76644735, "epoch": 0.5413504270523006, "grad_norm": 3.859375, "learning_rate": 8.726613217896413e-06, "loss": 0.94777632, "memory(GiB)": 752.07, "step": 21340, "train_speed(iter/s)": 0.214688 }, { "acc": 0.76055961, "epoch": 0.5414772664213382, "grad_norm": 3.8125, "learning_rate": 8.725913995722352e-06, "loss": 0.90556078, "memory(GiB)": 752.07, "step": 21345, "train_speed(iter/s)": 0.214602 }, { "acc": 0.76880727, "epoch": 0.5416041057903758, "grad_norm": 3.03125, "learning_rate": 8.725214609657204e-06, "loss": 0.89909353, "memory(GiB)": 752.07, "step": 21350, "train_speed(iter/s)": 0.214506 }, { "acc": 0.75757961, "epoch": 0.5417309451594133, "grad_norm": 3.21875, "learning_rate": 8.724515059731737e-06, "loss": 0.94925842, "memory(GiB)": 752.07, "step": 21355, "train_speed(iter/s)": 0.214417 }, { "acc": 0.76529746, "epoch": 0.5418577845284509, "grad_norm": 3.296875, "learning_rate": 8.723815345976718e-06, "loss": 0.90162649, "memory(GiB)": 752.07, "step": 21360, "train_speed(iter/s)": 0.214327 }, { "acc": 0.7787159, "epoch": 0.5419846238974885, "grad_norm": 3.78125, "learning_rate": 8.723115468422927e-06, "loss": 0.8281167, "memory(GiB)": 752.07, "step": 21365, "train_speed(iter/s)": 0.214223 }, { "acc": 0.75652509, "epoch": 0.5421114632665259, "grad_norm": 2.734375, "learning_rate": 8.722415427101147e-06, "loss": 0.96740761, "memory(GiB)": 752.07, "step": 21370, "train_speed(iter/s)": 0.214124 }, { "acc": 0.78109145, "epoch": 0.5422383026355635, "grad_norm": 4.65625, "learning_rate": 8.721715222042175e-06, "loss": 0.84372482, "memory(GiB)": 752.07, "step": 21375, "train_speed(iter/s)": 0.214038 }, { "acc": 0.76371398, "epoch": 0.5423651420046011, "grad_norm": 3.265625, "learning_rate": 8.721014853276808e-06, "loss": 0.94085903, "memory(GiB)": 752.07, "step": 21380, "train_speed(iter/s)": 0.213945 }, { "acc": 0.76316104, "epoch": 0.5424919813736386, "grad_norm": 3.875, "learning_rate": 8.720314320835855e-06, "loss": 0.97094641, "memory(GiB)": 752.07, "step": 21385, "train_speed(iter/s)": 0.213861 }, { "acc": 0.75270476, "epoch": 0.5426188207426762, "grad_norm": 3.40625, "learning_rate": 8.719613624750125e-06, "loss": 0.9387187, "memory(GiB)": 752.07, "step": 21390, "train_speed(iter/s)": 0.213766 }, { "acc": 0.77049775, "epoch": 0.5427456601117138, "grad_norm": 4.28125, "learning_rate": 8.718912765050446e-06, "loss": 0.90892944, "memory(GiB)": 752.07, "step": 21395, "train_speed(iter/s)": 0.21366 }, { "acc": 0.76007185, "epoch": 0.5428724994807513, "grad_norm": 4.375, "learning_rate": 8.718211741767642e-06, "loss": 0.93127584, "memory(GiB)": 752.07, "step": 21400, "train_speed(iter/s)": 0.21357 }, { "acc": 0.75829577, "epoch": 0.5429993388497889, "grad_norm": 3.828125, "learning_rate": 8.717510554932551e-06, "loss": 0.96441755, "memory(GiB)": 752.07, "step": 21405, "train_speed(iter/s)": 0.213474 }, { "acc": 0.76944766, "epoch": 0.5431261782188265, "grad_norm": 3.203125, "learning_rate": 8.716809204576016e-06, "loss": 0.87774725, "memory(GiB)": 752.07, "step": 21410, "train_speed(iter/s)": 0.213374 }, { "acc": 0.7605041, "epoch": 0.543253017587864, "grad_norm": 3.265625, "learning_rate": 8.716107690728887e-06, "loss": 0.94538679, "memory(GiB)": 752.07, "step": 21415, "train_speed(iter/s)": 0.21327 }, { "acc": 0.74239297, "epoch": 0.5433798569569016, "grad_norm": 4.8125, "learning_rate": 8.715406013422019e-06, "loss": 0.97240677, "memory(GiB)": 752.07, "step": 21420, "train_speed(iter/s)": 0.213169 }, { "acc": 0.76241693, "epoch": 0.5435066963259392, "grad_norm": 3.546875, "learning_rate": 8.71470417268628e-06, "loss": 0.93398809, "memory(GiB)": 752.07, "step": 21425, "train_speed(iter/s)": 0.213081 }, { "acc": 0.76669035, "epoch": 0.5436335356949766, "grad_norm": 3.90625, "learning_rate": 8.71400216855254e-06, "loss": 0.9096736, "memory(GiB)": 752.07, "step": 21430, "train_speed(iter/s)": 0.212996 }, { "acc": 0.77109761, "epoch": 0.5437603750640142, "grad_norm": 3.703125, "learning_rate": 8.713300001051678e-06, "loss": 0.92127724, "memory(GiB)": 752.07, "step": 21435, "train_speed(iter/s)": 0.212905 }, { "acc": 0.77311969, "epoch": 0.5438872144330518, "grad_norm": 5.09375, "learning_rate": 8.71259767021458e-06, "loss": 0.82964649, "memory(GiB)": 752.07, "step": 21440, "train_speed(iter/s)": 0.212801 }, { "acc": 0.77373729, "epoch": 0.5440140538020893, "grad_norm": 2.984375, "learning_rate": 8.71189517607214e-06, "loss": 0.87847452, "memory(GiB)": 752.07, "step": 21445, "train_speed(iter/s)": 0.212702 }, { "acc": 0.75724702, "epoch": 0.5441408931711269, "grad_norm": 4.28125, "learning_rate": 8.711192518655259e-06, "loss": 0.92769756, "memory(GiB)": 752.07, "step": 21450, "train_speed(iter/s)": 0.212605 }, { "acc": 0.7628294, "epoch": 0.5442677325401645, "grad_norm": 3.625, "learning_rate": 8.710489697994841e-06, "loss": 0.94889088, "memory(GiB)": 752.07, "step": 21455, "train_speed(iter/s)": 0.212524 }, { "acc": 0.75271487, "epoch": 0.544394571909202, "grad_norm": 4.5, "learning_rate": 8.709786714121803e-06, "loss": 0.88580294, "memory(GiB)": 752.07, "step": 21460, "train_speed(iter/s)": 0.212442 }, { "acc": 0.75850096, "epoch": 0.5445214112782396, "grad_norm": 3.890625, "learning_rate": 8.709083567067072e-06, "loss": 0.9072197, "memory(GiB)": 752.07, "step": 21465, "train_speed(iter/s)": 0.212355 }, { "acc": 0.76196713, "epoch": 0.5446482506472772, "grad_norm": 3.8125, "learning_rate": 8.708380256861569e-06, "loss": 0.90781641, "memory(GiB)": 752.07, "step": 21470, "train_speed(iter/s)": 0.212254 }, { "acc": 0.78655391, "epoch": 0.5447750900163147, "grad_norm": 4.1875, "learning_rate": 8.707676783536236e-06, "loss": 0.83434896, "memory(GiB)": 752.07, "step": 21475, "train_speed(iter/s)": 0.212177 }, { "acc": 0.74992404, "epoch": 0.5449019293853523, "grad_norm": 4.09375, "learning_rate": 8.706973147122014e-06, "loss": 0.9969018, "memory(GiB)": 752.07, "step": 21480, "train_speed(iter/s)": 0.212076 }, { "acc": 0.76739297, "epoch": 0.5450287687543899, "grad_norm": 4.0625, "learning_rate": 8.706269347649854e-06, "loss": 0.89392433, "memory(GiB)": 752.07, "step": 21485, "train_speed(iter/s)": 0.211978 }, { "acc": 0.75854506, "epoch": 0.5451556081234273, "grad_norm": 3.078125, "learning_rate": 8.705565385150715e-06, "loss": 0.94348907, "memory(GiB)": 752.07, "step": 21490, "train_speed(iter/s)": 0.211891 }, { "acc": 0.76279979, "epoch": 0.5452824474924649, "grad_norm": 3.8125, "learning_rate": 8.704861259655562e-06, "loss": 0.90335131, "memory(GiB)": 752.07, "step": 21495, "train_speed(iter/s)": 0.211801 }, { "acc": 0.75979457, "epoch": 0.5454092868615025, "grad_norm": 3.1875, "learning_rate": 8.704156971195365e-06, "loss": 0.9179369, "memory(GiB)": 752.07, "step": 21500, "train_speed(iter/s)": 0.21171 }, { "epoch": 0.5454092868615025, "eval_acc": 0.7530042691641353, "eval_loss": 0.87980055809021, "eval_runtime": 1147.4361, "eval_samples_per_second": 5.552, "eval_steps_per_second": 5.552, "step": 21500 }, { "acc": 0.7758657, "epoch": 0.54553612623054, "grad_norm": 3.78125, "learning_rate": 8.703452519801105e-06, "loss": 0.8645421, "memory(GiB)": 752.07, "step": 21505, "train_speed(iter/s)": 0.207796 }, { "acc": 0.76895733, "epoch": 0.5456629655995776, "grad_norm": 3.734375, "learning_rate": 8.70274790550377e-06, "loss": 0.89575567, "memory(GiB)": 752.07, "step": 21510, "train_speed(iter/s)": 0.207697 }, { "acc": 0.76148334, "epoch": 0.5457898049686152, "grad_norm": 3.53125, "learning_rate": 8.702043128334351e-06, "loss": 0.93775473, "memory(GiB)": 752.07, "step": 21515, "train_speed(iter/s)": 0.207619 }, { "acc": 0.76228776, "epoch": 0.5459166443376527, "grad_norm": 2.859375, "learning_rate": 8.70133818832385e-06, "loss": 0.93749104, "memory(GiB)": 752.07, "step": 21520, "train_speed(iter/s)": 0.207528 }, { "acc": 0.75443163, "epoch": 0.5460434837066903, "grad_norm": 3.4375, "learning_rate": 8.700633085503278e-06, "loss": 0.97878647, "memory(GiB)": 752.07, "step": 21525, "train_speed(iter/s)": 0.207441 }, { "acc": 0.75651827, "epoch": 0.5461703230757279, "grad_norm": 3.53125, "learning_rate": 8.699927819903646e-06, "loss": 0.94369888, "memory(GiB)": 752.07, "step": 21530, "train_speed(iter/s)": 0.207339 }, { "acc": 0.77432103, "epoch": 0.5462971624447654, "grad_norm": 3.859375, "learning_rate": 8.699222391555976e-06, "loss": 0.87197504, "memory(GiB)": 752.07, "step": 21535, "train_speed(iter/s)": 0.207256 }, { "acc": 0.75296745, "epoch": 0.546424001813803, "grad_norm": 4.125, "learning_rate": 8.698516800491301e-06, "loss": 0.93925467, "memory(GiB)": 752.07, "step": 21540, "train_speed(iter/s)": 0.20718 }, { "acc": 0.75668912, "epoch": 0.5465508411828406, "grad_norm": 3.453125, "learning_rate": 8.697811046740657e-06, "loss": 0.92356157, "memory(GiB)": 752.07, "step": 21545, "train_speed(iter/s)": 0.207102 }, { "acc": 0.78323655, "epoch": 0.546677680551878, "grad_norm": 3.1875, "learning_rate": 8.697105130335084e-06, "loss": 0.81284676, "memory(GiB)": 752.07, "step": 21550, "train_speed(iter/s)": 0.207015 }, { "acc": 0.76521964, "epoch": 0.5468045199209156, "grad_norm": 3.03125, "learning_rate": 8.69639905130564e-06, "loss": 0.91229792, "memory(GiB)": 752.07, "step": 21555, "train_speed(iter/s)": 0.206929 }, { "acc": 0.76398573, "epoch": 0.5469313592899532, "grad_norm": 3.65625, "learning_rate": 8.695692809683375e-06, "loss": 0.92590456, "memory(GiB)": 752.07, "step": 21560, "train_speed(iter/s)": 0.206845 }, { "acc": 0.76324339, "epoch": 0.5470581986589907, "grad_norm": 2.859375, "learning_rate": 8.694986405499362e-06, "loss": 0.90177469, "memory(GiB)": 752.07, "step": 21565, "train_speed(iter/s)": 0.206762 }, { "acc": 0.76919346, "epoch": 0.5471850380280283, "grad_norm": 3.703125, "learning_rate": 8.694279838784669e-06, "loss": 0.88483973, "memory(GiB)": 752.07, "step": 21570, "train_speed(iter/s)": 0.206683 }, { "acc": 0.77017245, "epoch": 0.5473118773970659, "grad_norm": 3.15625, "learning_rate": 8.693573109570377e-06, "loss": 0.89965973, "memory(GiB)": 752.07, "step": 21575, "train_speed(iter/s)": 0.206594 }, { "acc": 0.76653414, "epoch": 0.5474387167661035, "grad_norm": 3.90625, "learning_rate": 8.692866217887572e-06, "loss": 0.85838747, "memory(GiB)": 752.07, "step": 21580, "train_speed(iter/s)": 0.20651 }, { "acc": 0.75803623, "epoch": 0.547565556135141, "grad_norm": 3.5625, "learning_rate": 8.692159163767346e-06, "loss": 0.90683889, "memory(GiB)": 752.07, "step": 21585, "train_speed(iter/s)": 0.206427 }, { "acc": 0.76482291, "epoch": 0.5476923955041786, "grad_norm": 3.75, "learning_rate": 8.691451947240805e-06, "loss": 0.93101616, "memory(GiB)": 752.07, "step": 21590, "train_speed(iter/s)": 0.206343 }, { "acc": 0.76104436, "epoch": 0.5478192348732162, "grad_norm": 3.140625, "learning_rate": 8.690744568339053e-06, "loss": 0.88921432, "memory(GiB)": 752.07, "step": 21595, "train_speed(iter/s)": 0.206254 }, { "acc": 0.76580153, "epoch": 0.5479460742422537, "grad_norm": 3.5625, "learning_rate": 8.690037027093207e-06, "loss": 0.9182003, "memory(GiB)": 752.07, "step": 21600, "train_speed(iter/s)": 0.206176 }, { "acc": 0.74479733, "epoch": 0.5480729136112913, "grad_norm": 3.671875, "learning_rate": 8.689329323534391e-06, "loss": 0.90957069, "memory(GiB)": 752.07, "step": 21605, "train_speed(iter/s)": 0.206092 }, { "acc": 0.7583168, "epoch": 0.5481997529803287, "grad_norm": 4.15625, "learning_rate": 8.688621457693732e-06, "loss": 0.93057528, "memory(GiB)": 752.07, "step": 21610, "train_speed(iter/s)": 0.206012 }, { "acc": 0.77636356, "epoch": 0.5483265923493663, "grad_norm": 3.828125, "learning_rate": 8.687913429602369e-06, "loss": 0.85090294, "memory(GiB)": 752.07, "step": 21615, "train_speed(iter/s)": 0.205933 }, { "acc": 0.75290198, "epoch": 0.5484534317184039, "grad_norm": 3.203125, "learning_rate": 8.687205239291442e-06, "loss": 0.95750322, "memory(GiB)": 752.07, "step": 21620, "train_speed(iter/s)": 0.205845 }, { "acc": 0.74890609, "epoch": 0.5485802710874415, "grad_norm": 2.921875, "learning_rate": 8.686496886792106e-06, "loss": 0.91784563, "memory(GiB)": 752.07, "step": 21625, "train_speed(iter/s)": 0.205769 }, { "acc": 0.76665311, "epoch": 0.548707110456479, "grad_norm": 3.375, "learning_rate": 8.685788372135517e-06, "loss": 0.87579241, "memory(GiB)": 752.07, "step": 21630, "train_speed(iter/s)": 0.205681 }, { "acc": 0.77260647, "epoch": 0.5488339498255166, "grad_norm": 3.53125, "learning_rate": 8.685079695352843e-06, "loss": 0.8991313, "memory(GiB)": 752.07, "step": 21635, "train_speed(iter/s)": 0.20561 }, { "acc": 0.7625227, "epoch": 0.5489607891945542, "grad_norm": 3.9375, "learning_rate": 8.684370856475253e-06, "loss": 0.8964962, "memory(GiB)": 752.07, "step": 21640, "train_speed(iter/s)": 0.205537 }, { "acc": 0.76894426, "epoch": 0.5490876285635917, "grad_norm": 3.21875, "learning_rate": 8.683661855533929e-06, "loss": 0.84589615, "memory(GiB)": 752.07, "step": 21645, "train_speed(iter/s)": 0.205466 }, { "acc": 0.76138859, "epoch": 0.5492144679326293, "grad_norm": 3.546875, "learning_rate": 8.682952692560057e-06, "loss": 0.92271957, "memory(GiB)": 752.07, "step": 21650, "train_speed(iter/s)": 0.205386 }, { "acc": 0.76298189, "epoch": 0.5493413073016669, "grad_norm": 3.46875, "learning_rate": 8.68224336758483e-06, "loss": 0.89286423, "memory(GiB)": 752.07, "step": 21655, "train_speed(iter/s)": 0.2053 }, { "acc": 0.75870261, "epoch": 0.5494681466707044, "grad_norm": 3.71875, "learning_rate": 8.68153388063945e-06, "loss": 0.92668762, "memory(GiB)": 752.07, "step": 21660, "train_speed(iter/s)": 0.205219 }, { "acc": 0.76940494, "epoch": 0.549594986039742, "grad_norm": 3.234375, "learning_rate": 8.680824231755125e-06, "loss": 0.89175949, "memory(GiB)": 752.07, "step": 21665, "train_speed(iter/s)": 0.20514 }, { "acc": 0.76148458, "epoch": 0.5497218254087795, "grad_norm": 3.671875, "learning_rate": 8.68011442096307e-06, "loss": 0.9836338, "memory(GiB)": 752.07, "step": 21670, "train_speed(iter/s)": 0.205056 }, { "acc": 0.77015519, "epoch": 0.549848664777817, "grad_norm": 3.28125, "learning_rate": 8.679404448294504e-06, "loss": 0.91169758, "memory(GiB)": 752.07, "step": 21675, "train_speed(iter/s)": 0.204974 }, { "acc": 0.76690545, "epoch": 0.5499755041468546, "grad_norm": 3.046875, "learning_rate": 8.678694313780662e-06, "loss": 0.90256081, "memory(GiB)": 752.07, "step": 21680, "train_speed(iter/s)": 0.204889 }, { "acc": 0.75871086, "epoch": 0.5501023435158922, "grad_norm": 4.6875, "learning_rate": 8.677984017452779e-06, "loss": 0.89517622, "memory(GiB)": 752.07, "step": 21685, "train_speed(iter/s)": 0.204777 }, { "acc": 0.75372643, "epoch": 0.5502291828849297, "grad_norm": 3.375, "learning_rate": 8.677273559342098e-06, "loss": 0.92948732, "memory(GiB)": 752.07, "step": 21690, "train_speed(iter/s)": 0.204699 }, { "acc": 0.77306094, "epoch": 0.5503560222539673, "grad_norm": 3.359375, "learning_rate": 8.676562939479868e-06, "loss": 0.86197128, "memory(GiB)": 752.07, "step": 21695, "train_speed(iter/s)": 0.204613 }, { "acc": 0.7652246, "epoch": 0.5504828616230049, "grad_norm": 3.390625, "learning_rate": 8.67585215789735e-06, "loss": 0.92067509, "memory(GiB)": 752.07, "step": 21700, "train_speed(iter/s)": 0.204525 }, { "acc": 0.76567698, "epoch": 0.5506097009920424, "grad_norm": 2.984375, "learning_rate": 8.675141214625806e-06, "loss": 0.90602388, "memory(GiB)": 752.07, "step": 21705, "train_speed(iter/s)": 0.20443 }, { "acc": 0.76418743, "epoch": 0.55073654036108, "grad_norm": 3.859375, "learning_rate": 8.674430109696511e-06, "loss": 0.93574114, "memory(GiB)": 752.07, "step": 21710, "train_speed(iter/s)": 0.20435 }, { "acc": 0.75749359, "epoch": 0.5508633797301176, "grad_norm": 3.984375, "learning_rate": 8.673718843140743e-06, "loss": 0.94202671, "memory(GiB)": 752.07, "step": 21715, "train_speed(iter/s)": 0.204275 }, { "acc": 0.76206112, "epoch": 0.5509902190991551, "grad_norm": 3.0, "learning_rate": 8.673007414989787e-06, "loss": 0.86738806, "memory(GiB)": 752.07, "step": 21720, "train_speed(iter/s)": 0.204188 }, { "acc": 0.75020084, "epoch": 0.5511170584681927, "grad_norm": 3.96875, "learning_rate": 8.672295825274939e-06, "loss": 0.95139227, "memory(GiB)": 752.07, "step": 21725, "train_speed(iter/s)": 0.204102 }, { "acc": 0.7544405, "epoch": 0.5512438978372302, "grad_norm": 2.78125, "learning_rate": 8.671584074027498e-06, "loss": 0.91032734, "memory(GiB)": 752.07, "step": 21730, "train_speed(iter/s)": 0.204018 }, { "acc": 0.75673265, "epoch": 0.5513707372062677, "grad_norm": 4.1875, "learning_rate": 8.670872161278772e-06, "loss": 0.97121925, "memory(GiB)": 752.07, "step": 21735, "train_speed(iter/s)": 0.203947 }, { "acc": 0.75683804, "epoch": 0.5514975765753053, "grad_norm": 2.84375, "learning_rate": 8.670160087060075e-06, "loss": 0.91375952, "memory(GiB)": 752.07, "step": 21740, "train_speed(iter/s)": 0.203863 }, { "acc": 0.76698933, "epoch": 0.5516244159443429, "grad_norm": 3.625, "learning_rate": 8.66944785140273e-06, "loss": 0.88380642, "memory(GiB)": 752.07, "step": 21745, "train_speed(iter/s)": 0.203779 }, { "acc": 0.75287623, "epoch": 0.5517512553133804, "grad_norm": 3.71875, "learning_rate": 8.668735454338066e-06, "loss": 0.93423557, "memory(GiB)": 752.07, "step": 21750, "train_speed(iter/s)": 0.203696 }, { "acc": 0.77709928, "epoch": 0.551878094682418, "grad_norm": 4.46875, "learning_rate": 8.668022895897418e-06, "loss": 0.89083357, "memory(GiB)": 752.07, "step": 21755, "train_speed(iter/s)": 0.203619 }, { "acc": 0.76016893, "epoch": 0.5520049340514556, "grad_norm": 3.625, "learning_rate": 8.66731017611213e-06, "loss": 0.86708832, "memory(GiB)": 752.07, "step": 21760, "train_speed(iter/s)": 0.203545 }, { "acc": 0.76868591, "epoch": 0.5521317734204931, "grad_norm": 3.484375, "learning_rate": 8.666597295013552e-06, "loss": 0.86926966, "memory(GiB)": 752.07, "step": 21765, "train_speed(iter/s)": 0.203473 }, { "acc": 0.76171122, "epoch": 0.5522586127895307, "grad_norm": 3.359375, "learning_rate": 8.665884252633042e-06, "loss": 0.90715036, "memory(GiB)": 752.07, "step": 21770, "train_speed(iter/s)": 0.203394 }, { "acc": 0.77277188, "epoch": 0.5523854521585683, "grad_norm": 3.375, "learning_rate": 8.665171049001964e-06, "loss": 0.85367203, "memory(GiB)": 752.07, "step": 21775, "train_speed(iter/s)": 0.203319 }, { "acc": 0.75593281, "epoch": 0.5525122915276058, "grad_norm": 3.703125, "learning_rate": 8.66445768415169e-06, "loss": 0.92330208, "memory(GiB)": 752.07, "step": 21780, "train_speed(iter/s)": 0.203229 }, { "acc": 0.76162176, "epoch": 0.5526391308966434, "grad_norm": 3.484375, "learning_rate": 8.663744158113596e-06, "loss": 0.91579027, "memory(GiB)": 752.07, "step": 21785, "train_speed(iter/s)": 0.203141 }, { "acc": 0.77537341, "epoch": 0.5527659702656809, "grad_norm": 4.375, "learning_rate": 8.663030470919072e-06, "loss": 0.8713851, "memory(GiB)": 752.07, "step": 21790, "train_speed(iter/s)": 0.203071 }, { "acc": 0.75637293, "epoch": 0.5528928096347184, "grad_norm": 3.171875, "learning_rate": 8.662316622599508e-06, "loss": 0.89329119, "memory(GiB)": 752.07, "step": 21795, "train_speed(iter/s)": 0.202995 }, { "acc": 0.76732993, "epoch": 0.553019649003756, "grad_norm": 3.1875, "learning_rate": 8.661602613186304e-06, "loss": 0.89016085, "memory(GiB)": 752.07, "step": 21800, "train_speed(iter/s)": 0.202914 }, { "acc": 0.7575191, "epoch": 0.5531464883727936, "grad_norm": 3.53125, "learning_rate": 8.660888442710868e-06, "loss": 0.97737141, "memory(GiB)": 752.07, "step": 21805, "train_speed(iter/s)": 0.202842 }, { "acc": 0.7611135, "epoch": 0.5532733277418311, "grad_norm": 3.203125, "learning_rate": 8.660174111204616e-06, "loss": 0.89131889, "memory(GiB)": 752.07, "step": 21810, "train_speed(iter/s)": 0.20276 }, { "acc": 0.75904937, "epoch": 0.5534001671108687, "grad_norm": 3.46875, "learning_rate": 8.659459618698964e-06, "loss": 0.97290316, "memory(GiB)": 752.07, "step": 21815, "train_speed(iter/s)": 0.20268 }, { "acc": 0.75371799, "epoch": 0.5535270064799063, "grad_norm": 3.984375, "learning_rate": 8.658744965225344e-06, "loss": 0.89491005, "memory(GiB)": 752.07, "step": 21820, "train_speed(iter/s)": 0.202602 }, { "acc": 0.76118979, "epoch": 0.5536538458489438, "grad_norm": 3.375, "learning_rate": 8.65803015081519e-06, "loss": 0.95693636, "memory(GiB)": 752.07, "step": 21825, "train_speed(iter/s)": 0.202526 }, { "acc": 0.75970659, "epoch": 0.5537806852179814, "grad_norm": 3.453125, "learning_rate": 8.657315175499945e-06, "loss": 0.95033417, "memory(GiB)": 752.07, "step": 21830, "train_speed(iter/s)": 0.202444 }, { "acc": 0.7720644, "epoch": 0.553907524587019, "grad_norm": 3.4375, "learning_rate": 8.656600039311058e-06, "loss": 0.92102871, "memory(GiB)": 752.07, "step": 21835, "train_speed(iter/s)": 0.202367 }, { "acc": 0.77768941, "epoch": 0.5540343639560565, "grad_norm": 4.40625, "learning_rate": 8.655884742279987e-06, "loss": 0.90818262, "memory(GiB)": 752.07, "step": 21840, "train_speed(iter/s)": 0.202287 }, { "acc": 0.75445895, "epoch": 0.5541612033250941, "grad_norm": 3.234375, "learning_rate": 8.655169284438194e-06, "loss": 0.92255621, "memory(GiB)": 752.07, "step": 21845, "train_speed(iter/s)": 0.2022 }, { "acc": 0.77119031, "epoch": 0.5542880426941316, "grad_norm": 3.421875, "learning_rate": 8.654453665817151e-06, "loss": 0.8776763, "memory(GiB)": 752.07, "step": 21850, "train_speed(iter/s)": 0.202121 }, { "acc": 0.77140222, "epoch": 0.5544148820631691, "grad_norm": 3.453125, "learning_rate": 8.653737886448334e-06, "loss": 0.88340244, "memory(GiB)": 752.07, "step": 21855, "train_speed(iter/s)": 0.202049 }, { "acc": 0.75865426, "epoch": 0.5545417214322067, "grad_norm": 3.6875, "learning_rate": 8.65302194636323e-06, "loss": 0.88512011, "memory(GiB)": 752.07, "step": 21860, "train_speed(iter/s)": 0.201961 }, { "acc": 0.7619956, "epoch": 0.5546685608012443, "grad_norm": 3.4375, "learning_rate": 8.652305845593329e-06, "loss": 0.88567905, "memory(GiB)": 752.07, "step": 21865, "train_speed(iter/s)": 0.201868 }, { "acc": 0.77274971, "epoch": 0.5547954001702818, "grad_norm": 3.15625, "learning_rate": 8.65158958417013e-06, "loss": 0.85764627, "memory(GiB)": 752.07, "step": 21870, "train_speed(iter/s)": 0.201791 }, { "acc": 0.77463059, "epoch": 0.5549222395393194, "grad_norm": 3.421875, "learning_rate": 8.650873162125142e-06, "loss": 0.85806732, "memory(GiB)": 752.07, "step": 21875, "train_speed(iter/s)": 0.201713 }, { "acc": 0.7832942, "epoch": 0.555049078908357, "grad_norm": 2.75, "learning_rate": 8.650156579489875e-06, "loss": 0.85675507, "memory(GiB)": 752.07, "step": 21880, "train_speed(iter/s)": 0.201636 }, { "acc": 0.76210861, "epoch": 0.5551759182773945, "grad_norm": 3.34375, "learning_rate": 8.649439836295848e-06, "loss": 0.88074799, "memory(GiB)": 752.07, "step": 21885, "train_speed(iter/s)": 0.201559 }, { "acc": 0.7554131, "epoch": 0.5553027576464321, "grad_norm": 3.5625, "learning_rate": 8.648722932574594e-06, "loss": 0.91090784, "memory(GiB)": 752.07, "step": 21890, "train_speed(iter/s)": 0.201484 }, { "acc": 0.77238216, "epoch": 0.5554295970154697, "grad_norm": 2.9375, "learning_rate": 8.64800586835764e-06, "loss": 0.85823946, "memory(GiB)": 752.07, "step": 21895, "train_speed(iter/s)": 0.201399 }, { "acc": 0.77366614, "epoch": 0.5555564363845072, "grad_norm": 4.59375, "learning_rate": 8.647288643676535e-06, "loss": 0.91543713, "memory(GiB)": 752.07, "step": 21900, "train_speed(iter/s)": 0.201311 }, { "acc": 0.77386942, "epoch": 0.5556832757535448, "grad_norm": 3.46875, "learning_rate": 8.646571258562821e-06, "loss": 0.88629532, "memory(GiB)": 752.07, "step": 21905, "train_speed(iter/s)": 0.20124 }, { "acc": 0.76562276, "epoch": 0.5558101151225823, "grad_norm": 3.78125, "learning_rate": 8.645853713048057e-06, "loss": 0.92782545, "memory(GiB)": 752.07, "step": 21910, "train_speed(iter/s)": 0.201167 }, { "acc": 0.76236305, "epoch": 0.5559369544916198, "grad_norm": 3.328125, "learning_rate": 8.645136007163803e-06, "loss": 0.91316414, "memory(GiB)": 752.07, "step": 21915, "train_speed(iter/s)": 0.201081 }, { "acc": 0.77644305, "epoch": 0.5560637938606574, "grad_norm": 3.96875, "learning_rate": 8.644418140941633e-06, "loss": 0.85233974, "memory(GiB)": 752.07, "step": 21920, "train_speed(iter/s)": 0.201013 }, { "acc": 0.7704206, "epoch": 0.556190633229695, "grad_norm": 3.09375, "learning_rate": 8.643700114413118e-06, "loss": 0.86053801, "memory(GiB)": 752.07, "step": 21925, "train_speed(iter/s)": 0.200941 }, { "acc": 0.78393211, "epoch": 0.5563174725987325, "grad_norm": 3.65625, "learning_rate": 8.642981927609846e-06, "loss": 0.82634487, "memory(GiB)": 752.07, "step": 21930, "train_speed(iter/s)": 0.200872 }, { "acc": 0.77313266, "epoch": 0.5564443119677701, "grad_norm": 3.96875, "learning_rate": 8.642263580563406e-06, "loss": 0.90368176, "memory(GiB)": 752.07, "step": 21935, "train_speed(iter/s)": 0.200785 }, { "acc": 0.78215256, "epoch": 0.5565711513368077, "grad_norm": 3.875, "learning_rate": 8.641545073305397e-06, "loss": 0.84609537, "memory(GiB)": 752.07, "step": 21940, "train_speed(iter/s)": 0.200705 }, { "acc": 0.7778626, "epoch": 0.5566979907058452, "grad_norm": 3.3125, "learning_rate": 8.640826405867422e-06, "loss": 0.85124149, "memory(GiB)": 752.07, "step": 21945, "train_speed(iter/s)": 0.20062 }, { "acc": 0.75614867, "epoch": 0.5568248300748828, "grad_norm": 3.1875, "learning_rate": 8.640107578281094e-06, "loss": 0.92908421, "memory(GiB)": 752.07, "step": 21950, "train_speed(iter/s)": 0.200533 }, { "acc": 0.76525083, "epoch": 0.5569516694439204, "grad_norm": 3.671875, "learning_rate": 8.63938859057803e-06, "loss": 0.92551985, "memory(GiB)": 752.07, "step": 21955, "train_speed(iter/s)": 0.200452 }, { "acc": 0.75089445, "epoch": 0.557078508812958, "grad_norm": 3.3125, "learning_rate": 8.63866944278986e-06, "loss": 0.93044367, "memory(GiB)": 752.07, "step": 21960, "train_speed(iter/s)": 0.200375 }, { "acc": 0.750879, "epoch": 0.5572053481819955, "grad_norm": 3.4375, "learning_rate": 8.637950134948214e-06, "loss": 0.91663523, "memory(GiB)": 752.07, "step": 21965, "train_speed(iter/s)": 0.200285 }, { "acc": 0.76952591, "epoch": 0.557332187551033, "grad_norm": 4.21875, "learning_rate": 8.637230667084733e-06, "loss": 0.92652788, "memory(GiB)": 752.07, "step": 21970, "train_speed(iter/s)": 0.200206 }, { "acc": 0.7590395, "epoch": 0.5574590269200705, "grad_norm": 4.0625, "learning_rate": 8.636511039231064e-06, "loss": 0.91647482, "memory(GiB)": 752.07, "step": 21975, "train_speed(iter/s)": 0.200125 }, { "acc": 0.7580555, "epoch": 0.5575858662891081, "grad_norm": 4.0625, "learning_rate": 8.635791251418859e-06, "loss": 0.93990965, "memory(GiB)": 752.07, "step": 21980, "train_speed(iter/s)": 0.200044 }, { "acc": 0.75983543, "epoch": 0.5577127056581457, "grad_norm": 2.921875, "learning_rate": 8.635071303679782e-06, "loss": 0.92770624, "memory(GiB)": 752.07, "step": 21985, "train_speed(iter/s)": 0.199968 }, { "acc": 0.75906787, "epoch": 0.5578395450271832, "grad_norm": 3.03125, "learning_rate": 8.634351196045502e-06, "loss": 0.91451206, "memory(GiB)": 752.07, "step": 21990, "train_speed(iter/s)": 0.199891 }, { "acc": 0.75716524, "epoch": 0.5579663843962208, "grad_norm": 3.328125, "learning_rate": 8.633630928547693e-06, "loss": 0.9304287, "memory(GiB)": 752.07, "step": 21995, "train_speed(iter/s)": 0.199808 }, { "acc": 0.76315293, "epoch": 0.5580932237652584, "grad_norm": 4.0, "learning_rate": 8.632910501218035e-06, "loss": 0.94789543, "memory(GiB)": 752.07, "step": 22000, "train_speed(iter/s)": 0.199733 }, { "epoch": 0.5580932237652584, "eval_acc": 0.753247827505549, "eval_loss": 0.8782313466072083, "eval_runtime": 1148.9702, "eval_samples_per_second": 5.544, "eval_steps_per_second": 5.544, "step": 22000 }, { "acc": 0.76344991, "epoch": 0.558220063134296, "grad_norm": 3.28125, "learning_rate": 8.63218991408822e-06, "loss": 0.91323309, "memory(GiB)": 752.07, "step": 22005, "train_speed(iter/s)": 0.196323 }, { "acc": 0.7744689, "epoch": 0.5583469025033335, "grad_norm": 3.34375, "learning_rate": 8.631469167189944e-06, "loss": 0.9214983, "memory(GiB)": 752.07, "step": 22010, "train_speed(iter/s)": 0.196261 }, { "acc": 0.76033516, "epoch": 0.5584737418723711, "grad_norm": 3.140625, "learning_rate": 8.630748260554909e-06, "loss": 0.96568851, "memory(GiB)": 752.07, "step": 22015, "train_speed(iter/s)": 0.196189 }, { "acc": 0.77227402, "epoch": 0.5586005812414087, "grad_norm": 3.25, "learning_rate": 8.630027194214827e-06, "loss": 0.87737494, "memory(GiB)": 752.07, "step": 22020, "train_speed(iter/s)": 0.196116 }, { "acc": 0.75857825, "epoch": 0.5587274206104462, "grad_norm": 3.421875, "learning_rate": 8.629305968201413e-06, "loss": 0.89486895, "memory(GiB)": 752.07, "step": 22025, "train_speed(iter/s)": 0.196042 }, { "acc": 0.75924997, "epoch": 0.5588542599794837, "grad_norm": 3.078125, "learning_rate": 8.628584582546395e-06, "loss": 0.9299861, "memory(GiB)": 752.07, "step": 22030, "train_speed(iter/s)": 0.195966 }, { "acc": 0.76261554, "epoch": 0.5589810993485212, "grad_norm": 3.171875, "learning_rate": 8.627863037281503e-06, "loss": 0.88229723, "memory(GiB)": 752.07, "step": 22035, "train_speed(iter/s)": 0.195888 }, { "acc": 0.76885695, "epoch": 0.5591079387175588, "grad_norm": 3.34375, "learning_rate": 8.627141332438473e-06, "loss": 0.86156464, "memory(GiB)": 752.07, "step": 22040, "train_speed(iter/s)": 0.195817 }, { "acc": 0.76153803, "epoch": 0.5592347780865964, "grad_norm": 3.390625, "learning_rate": 8.626419468049054e-06, "loss": 0.96543379, "memory(GiB)": 752.07, "step": 22045, "train_speed(iter/s)": 0.195748 }, { "acc": 0.75604897, "epoch": 0.559361617455634, "grad_norm": 3.265625, "learning_rate": 8.625697444144997e-06, "loss": 0.9613678, "memory(GiB)": 752.07, "step": 22050, "train_speed(iter/s)": 0.19567 }, { "acc": 0.76053982, "epoch": 0.5594884568246715, "grad_norm": 3.875, "learning_rate": 8.624975260758062e-06, "loss": 0.96256218, "memory(GiB)": 752.07, "step": 22055, "train_speed(iter/s)": 0.195597 }, { "acc": 0.77368741, "epoch": 0.5596152961937091, "grad_norm": 4.8125, "learning_rate": 8.624252917920013e-06, "loss": 0.92917786, "memory(GiB)": 752.07, "step": 22060, "train_speed(iter/s)": 0.195534 }, { "acc": 0.77564526, "epoch": 0.5597421355627467, "grad_norm": 3.96875, "learning_rate": 8.623530415662628e-06, "loss": 0.86022263, "memory(GiB)": 752.07, "step": 22065, "train_speed(iter/s)": 0.195469 }, { "acc": 0.77426372, "epoch": 0.5598689749317842, "grad_norm": 3.171875, "learning_rate": 8.622807754017685e-06, "loss": 0.89700756, "memory(GiB)": 752.07, "step": 22070, "train_speed(iter/s)": 0.195392 }, { "acc": 0.76357279, "epoch": 0.5599958143008218, "grad_norm": 3.140625, "learning_rate": 8.62208493301697e-06, "loss": 0.89599476, "memory(GiB)": 752.07, "step": 22075, "train_speed(iter/s)": 0.195322 }, { "acc": 0.74785547, "epoch": 0.5601226536698594, "grad_norm": 3.453125, "learning_rate": 8.621361952692283e-06, "loss": 0.93274584, "memory(GiB)": 752.07, "step": 22080, "train_speed(iter/s)": 0.195247 }, { "acc": 0.7640605, "epoch": 0.5602494930388969, "grad_norm": 5.0, "learning_rate": 8.620638813075419e-06, "loss": 0.93004055, "memory(GiB)": 752.07, "step": 22085, "train_speed(iter/s)": 0.195173 }, { "acc": 0.76986742, "epoch": 0.5603763324079344, "grad_norm": 3.21875, "learning_rate": 8.619915514198191e-06, "loss": 0.85749454, "memory(GiB)": 752.07, "step": 22090, "train_speed(iter/s)": 0.195095 }, { "acc": 0.77293854, "epoch": 0.560503171776972, "grad_norm": 3.15625, "learning_rate": 8.619192056092413e-06, "loss": 0.85669184, "memory(GiB)": 752.07, "step": 22095, "train_speed(iter/s)": 0.195016 }, { "acc": 0.77034469, "epoch": 0.5606300111460095, "grad_norm": 3.0625, "learning_rate": 8.618468438789907e-06, "loss": 0.88660498, "memory(GiB)": 752.07, "step": 22100, "train_speed(iter/s)": 0.194933 }, { "acc": 0.76690016, "epoch": 0.5607568505150471, "grad_norm": 3.953125, "learning_rate": 8.617744662322505e-06, "loss": 0.89471416, "memory(GiB)": 752.07, "step": 22105, "train_speed(iter/s)": 0.194859 }, { "acc": 0.76426473, "epoch": 0.5608836898840847, "grad_norm": 7.09375, "learning_rate": 8.617020726722041e-06, "loss": 0.91680889, "memory(GiB)": 752.07, "step": 22110, "train_speed(iter/s)": 0.194781 }, { "acc": 0.76301851, "epoch": 0.5610105292531222, "grad_norm": 3.328125, "learning_rate": 8.61629663202036e-06, "loss": 0.90326967, "memory(GiB)": 752.07, "step": 22115, "train_speed(iter/s)": 0.19471 }, { "acc": 0.76744609, "epoch": 0.5611373686221598, "grad_norm": 3.078125, "learning_rate": 8.615572378249313e-06, "loss": 0.89917374, "memory(GiB)": 752.07, "step": 22120, "train_speed(iter/s)": 0.194649 }, { "acc": 0.76161938, "epoch": 0.5612642079911974, "grad_norm": 3.46875, "learning_rate": 8.614847965440756e-06, "loss": 0.93457298, "memory(GiB)": 752.07, "step": 22125, "train_speed(iter/s)": 0.194577 }, { "acc": 0.77050586, "epoch": 0.5613910473602349, "grad_norm": 3.515625, "learning_rate": 8.614123393626555e-06, "loss": 0.89926453, "memory(GiB)": 752.07, "step": 22130, "train_speed(iter/s)": 0.194516 }, { "acc": 0.75802708, "epoch": 0.5615178867292725, "grad_norm": 3.296875, "learning_rate": 8.613398662838581e-06, "loss": 0.90105953, "memory(GiB)": 752.07, "step": 22135, "train_speed(iter/s)": 0.194455 }, { "acc": 0.76308994, "epoch": 0.5616447260983101, "grad_norm": 3.71875, "learning_rate": 8.612673773108713e-06, "loss": 0.92686634, "memory(GiB)": 752.07, "step": 22140, "train_speed(iter/s)": 0.194372 }, { "acc": 0.77055011, "epoch": 0.5617715654673476, "grad_norm": 3.4375, "learning_rate": 8.611948724468838e-06, "loss": 0.87143288, "memory(GiB)": 752.07, "step": 22145, "train_speed(iter/s)": 0.194306 }, { "acc": 0.7528048, "epoch": 0.5618984048363851, "grad_norm": 3.4375, "learning_rate": 8.611223516950844e-06, "loss": 0.90587835, "memory(GiB)": 752.07, "step": 22150, "train_speed(iter/s)": 0.194225 }, { "acc": 0.75899434, "epoch": 0.5620252442054227, "grad_norm": 3.390625, "learning_rate": 8.610498150586637e-06, "loss": 0.90097418, "memory(GiB)": 752.07, "step": 22155, "train_speed(iter/s)": 0.194141 }, { "acc": 0.76900263, "epoch": 0.5621520835744602, "grad_norm": 3.28125, "learning_rate": 8.609772625408117e-06, "loss": 0.91270514, "memory(GiB)": 752.07, "step": 22160, "train_speed(iter/s)": 0.19408 }, { "acc": 0.77260723, "epoch": 0.5622789229434978, "grad_norm": 3.328125, "learning_rate": 8.609046941447203e-06, "loss": 0.80163136, "memory(GiB)": 752.07, "step": 22165, "train_speed(iter/s)": 0.19402 }, { "acc": 0.76348643, "epoch": 0.5624057623125354, "grad_norm": 3.421875, "learning_rate": 8.608321098735813e-06, "loss": 0.90909348, "memory(GiB)": 752.07, "step": 22170, "train_speed(iter/s)": 0.193961 }, { "acc": 0.77246828, "epoch": 0.5625326016815729, "grad_norm": 3.046875, "learning_rate": 8.607595097305873e-06, "loss": 0.8878643, "memory(GiB)": 752.07, "step": 22175, "train_speed(iter/s)": 0.193892 }, { "acc": 0.76683793, "epoch": 0.5626594410506105, "grad_norm": 4.15625, "learning_rate": 8.60686893718932e-06, "loss": 0.88952999, "memory(GiB)": 752.07, "step": 22180, "train_speed(iter/s)": 0.193826 }, { "acc": 0.77012768, "epoch": 0.5627862804196481, "grad_norm": 3.71875, "learning_rate": 8.606142618418094e-06, "loss": 0.90456886, "memory(GiB)": 752.07, "step": 22185, "train_speed(iter/s)": 0.193753 }, { "acc": 0.76302342, "epoch": 0.5629131197886856, "grad_norm": 3.890625, "learning_rate": 8.605416141024144e-06, "loss": 0.94723625, "memory(GiB)": 752.07, "step": 22190, "train_speed(iter/s)": 0.193685 }, { "acc": 0.75729527, "epoch": 0.5630399591577232, "grad_norm": 3.140625, "learning_rate": 8.604689505039428e-06, "loss": 0.9361474, "memory(GiB)": 752.07, "step": 22195, "train_speed(iter/s)": 0.193607 }, { "acc": 0.76788445, "epoch": 0.5631667985267608, "grad_norm": 3.421875, "learning_rate": 8.603962710495904e-06, "loss": 0.8944314, "memory(GiB)": 752.07, "step": 22200, "train_speed(iter/s)": 0.193544 }, { "acc": 0.7583468, "epoch": 0.5632936378957983, "grad_norm": 3.578125, "learning_rate": 8.603235757425542e-06, "loss": 0.92601995, "memory(GiB)": 752.07, "step": 22205, "train_speed(iter/s)": 0.193475 }, { "acc": 0.76521072, "epoch": 0.5634204772648358, "grad_norm": 3.515625, "learning_rate": 8.602508645860321e-06, "loss": 0.90269318, "memory(GiB)": 752.07, "step": 22210, "train_speed(iter/s)": 0.193403 }, { "acc": 0.73991404, "epoch": 0.5635473166338734, "grad_norm": 3.71875, "learning_rate": 8.601781375832223e-06, "loss": 0.98826504, "memory(GiB)": 752.07, "step": 22215, "train_speed(iter/s)": 0.19333 }, { "acc": 0.76216812, "epoch": 0.5636741560029109, "grad_norm": 3.453125, "learning_rate": 8.601053947373239e-06, "loss": 0.91006622, "memory(GiB)": 752.07, "step": 22220, "train_speed(iter/s)": 0.193266 }, { "acc": 0.75485859, "epoch": 0.5638009953719485, "grad_norm": 3.296875, "learning_rate": 8.600326360515365e-06, "loss": 0.9436594, "memory(GiB)": 752.07, "step": 22225, "train_speed(iter/s)": 0.193201 }, { "acc": 0.76097541, "epoch": 0.5639278347409861, "grad_norm": 3.484375, "learning_rate": 8.599598615290606e-06, "loss": 0.91093493, "memory(GiB)": 752.07, "step": 22230, "train_speed(iter/s)": 0.193127 }, { "acc": 0.77082987, "epoch": 0.5640546741100236, "grad_norm": 3.609375, "learning_rate": 8.598870711730972e-06, "loss": 0.92341204, "memory(GiB)": 752.07, "step": 22235, "train_speed(iter/s)": 0.193058 }, { "acc": 0.7551259, "epoch": 0.5641815134790612, "grad_norm": 3.140625, "learning_rate": 8.598142649868481e-06, "loss": 0.92706871, "memory(GiB)": 752.07, "step": 22240, "train_speed(iter/s)": 0.192998 }, { "acc": 0.75534706, "epoch": 0.5643083528480988, "grad_norm": 3.40625, "learning_rate": 8.59741442973516e-06, "loss": 0.93684731, "memory(GiB)": 752.07, "step": 22245, "train_speed(iter/s)": 0.192927 }, { "acc": 0.75848079, "epoch": 0.5644351922171363, "grad_norm": 3.4375, "learning_rate": 8.596686051363043e-06, "loss": 0.90208035, "memory(GiB)": 752.07, "step": 22250, "train_speed(iter/s)": 0.192853 }, { "acc": 0.77460876, "epoch": 0.5645620315861739, "grad_norm": 4.0, "learning_rate": 8.595957514784163e-06, "loss": 0.94855185, "memory(GiB)": 752.07, "step": 22255, "train_speed(iter/s)": 0.192795 }, { "acc": 0.77374487, "epoch": 0.5646888709552115, "grad_norm": 3.296875, "learning_rate": 8.59522882003057e-06, "loss": 0.87974997, "memory(GiB)": 752.07, "step": 22260, "train_speed(iter/s)": 0.192735 }, { "acc": 0.76059532, "epoch": 0.564815710324249, "grad_norm": 3.4375, "learning_rate": 8.594499967134317e-06, "loss": 0.9665616, "memory(GiB)": 752.07, "step": 22265, "train_speed(iter/s)": 0.192669 }, { "acc": 0.75718575, "epoch": 0.5649425496932865, "grad_norm": 3.140625, "learning_rate": 8.593770956127464e-06, "loss": 0.97192822, "memory(GiB)": 752.07, "step": 22270, "train_speed(iter/s)": 0.192598 }, { "acc": 0.7707818, "epoch": 0.5650693890623241, "grad_norm": 3.21875, "learning_rate": 8.593041787042077e-06, "loss": 0.94640493, "memory(GiB)": 752.07, "step": 22275, "train_speed(iter/s)": 0.192532 }, { "acc": 0.75116773, "epoch": 0.5651962284313616, "grad_norm": 3.671875, "learning_rate": 8.592312459910229e-06, "loss": 0.93545723, "memory(GiB)": 752.07, "step": 22280, "train_speed(iter/s)": 0.192473 }, { "acc": 0.76678543, "epoch": 0.5653230678003992, "grad_norm": 2.984375, "learning_rate": 8.591582974764001e-06, "loss": 0.88973589, "memory(GiB)": 752.07, "step": 22285, "train_speed(iter/s)": 0.192397 }, { "acc": 0.76875281, "epoch": 0.5654499071694368, "grad_norm": 3.65625, "learning_rate": 8.590853331635482e-06, "loss": 0.89477959, "memory(GiB)": 752.07, "step": 22290, "train_speed(iter/s)": 0.192333 }, { "acc": 0.77216887, "epoch": 0.5655767465384743, "grad_norm": 3.71875, "learning_rate": 8.590123530556768e-06, "loss": 0.89096384, "memory(GiB)": 752.07, "step": 22295, "train_speed(iter/s)": 0.192266 }, { "acc": 0.7706532, "epoch": 0.5657035859075119, "grad_norm": 3.03125, "learning_rate": 8.589393571559956e-06, "loss": 0.90161724, "memory(GiB)": 752.07, "step": 22300, "train_speed(iter/s)": 0.192198 }, { "acc": 0.75628686, "epoch": 0.5658304252765495, "grad_norm": 3.625, "learning_rate": 8.588663454677159e-06, "loss": 0.93147058, "memory(GiB)": 752.07, "step": 22305, "train_speed(iter/s)": 0.192135 }, { "acc": 0.76317348, "epoch": 0.565957264645587, "grad_norm": 3.109375, "learning_rate": 8.587933179940491e-06, "loss": 0.94400892, "memory(GiB)": 752.07, "step": 22310, "train_speed(iter/s)": 0.192072 }, { "acc": 0.75754714, "epoch": 0.5660841040146246, "grad_norm": 3.671875, "learning_rate": 8.587202747382072e-06, "loss": 0.95505323, "memory(GiB)": 752.07, "step": 22315, "train_speed(iter/s)": 0.191992 }, { "acc": 0.75725846, "epoch": 0.5662109433836622, "grad_norm": 3.6875, "learning_rate": 8.586472157034037e-06, "loss": 0.93007135, "memory(GiB)": 752.07, "step": 22320, "train_speed(iter/s)": 0.191923 }, { "acc": 0.77643127, "epoch": 0.5663377827526997, "grad_norm": 3.375, "learning_rate": 8.585741408928517e-06, "loss": 0.8757618, "memory(GiB)": 752.07, "step": 22325, "train_speed(iter/s)": 0.191861 }, { "acc": 0.75363536, "epoch": 0.5664646221217372, "grad_norm": 3.53125, "learning_rate": 8.585010503097657e-06, "loss": 0.90857534, "memory(GiB)": 752.07, "step": 22330, "train_speed(iter/s)": 0.191794 }, { "acc": 0.76241665, "epoch": 0.5665914614907748, "grad_norm": 3.859375, "learning_rate": 8.584279439573609e-06, "loss": 0.8951849, "memory(GiB)": 752.07, "step": 22335, "train_speed(iter/s)": 0.191723 }, { "acc": 0.78242121, "epoch": 0.5667183008598123, "grad_norm": 3.453125, "learning_rate": 8.583548218388526e-06, "loss": 0.79860449, "memory(GiB)": 752.07, "step": 22340, "train_speed(iter/s)": 0.191657 }, { "acc": 0.77529788, "epoch": 0.5668451402288499, "grad_norm": 3.484375, "learning_rate": 8.582816839574577e-06, "loss": 0.89276037, "memory(GiB)": 752.07, "step": 22345, "train_speed(iter/s)": 0.191587 }, { "acc": 0.75940175, "epoch": 0.5669719795978875, "grad_norm": 3.75, "learning_rate": 8.58208530316393e-06, "loss": 0.91093102, "memory(GiB)": 752.07, "step": 22350, "train_speed(iter/s)": 0.191513 }, { "acc": 0.76142001, "epoch": 0.567098818966925, "grad_norm": 3.796875, "learning_rate": 8.581353609188764e-06, "loss": 0.90289087, "memory(GiB)": 752.07, "step": 22355, "train_speed(iter/s)": 0.191445 }, { "acc": 0.76037521, "epoch": 0.5672256583359626, "grad_norm": 3.796875, "learning_rate": 8.580621757681264e-06, "loss": 0.94395447, "memory(GiB)": 752.07, "step": 22360, "train_speed(iter/s)": 0.191393 }, { "acc": 0.76151247, "epoch": 0.5673524977050002, "grad_norm": 3.375, "learning_rate": 8.57988974867362e-06, "loss": 0.9001276, "memory(GiB)": 752.07, "step": 22365, "train_speed(iter/s)": 0.191329 }, { "acc": 0.75876536, "epoch": 0.5674793370740377, "grad_norm": 4.5625, "learning_rate": 8.579157582198035e-06, "loss": 0.91095085, "memory(GiB)": 752.07, "step": 22370, "train_speed(iter/s)": 0.19127 }, { "acc": 0.77907705, "epoch": 0.5676061764430753, "grad_norm": 5.25, "learning_rate": 8.57842525828671e-06, "loss": 0.86347065, "memory(GiB)": 752.07, "step": 22375, "train_speed(iter/s)": 0.191215 }, { "acc": 0.77417941, "epoch": 0.5677330158121129, "grad_norm": 2.953125, "learning_rate": 8.57769277697186e-06, "loss": 0.82793941, "memory(GiB)": 752.07, "step": 22380, "train_speed(iter/s)": 0.191139 }, { "acc": 0.76602993, "epoch": 0.5678598551811505, "grad_norm": 3.40625, "learning_rate": 8.576960138285701e-06, "loss": 0.89990168, "memory(GiB)": 752.07, "step": 22385, "train_speed(iter/s)": 0.191073 }, { "acc": 0.7487092, "epoch": 0.5679866945501879, "grad_norm": 3.890625, "learning_rate": 8.576227342260466e-06, "loss": 0.96764507, "memory(GiB)": 752.07, "step": 22390, "train_speed(iter/s)": 0.19102 }, { "acc": 0.76493196, "epoch": 0.5681135339192255, "grad_norm": 3.5625, "learning_rate": 8.575494388928384e-06, "loss": 0.91293144, "memory(GiB)": 752.07, "step": 22395, "train_speed(iter/s)": 0.190955 }, { "acc": 0.75025687, "epoch": 0.568240373288263, "grad_norm": 3.375, "learning_rate": 8.574761278321695e-06, "loss": 0.94543123, "memory(GiB)": 752.07, "step": 22400, "train_speed(iter/s)": 0.190888 }, { "acc": 0.76026797, "epoch": 0.5683672126573006, "grad_norm": 3.03125, "learning_rate": 8.574028010472649e-06, "loss": 0.94357605, "memory(GiB)": 752.07, "step": 22405, "train_speed(iter/s)": 0.190817 }, { "acc": 0.75866351, "epoch": 0.5684940520263382, "grad_norm": 3.5625, "learning_rate": 8.573294585413496e-06, "loss": 0.9460803, "memory(GiB)": 752.07, "step": 22410, "train_speed(iter/s)": 0.190749 }, { "acc": 0.74830751, "epoch": 0.5686208913953757, "grad_norm": 3.96875, "learning_rate": 8.5725610031765e-06, "loss": 0.95264416, "memory(GiB)": 752.07, "step": 22415, "train_speed(iter/s)": 0.190686 }, { "acc": 0.75457644, "epoch": 0.5687477307644133, "grad_norm": 4.625, "learning_rate": 8.571827263793931e-06, "loss": 0.94154797, "memory(GiB)": 752.07, "step": 22420, "train_speed(iter/s)": 0.190624 }, { "acc": 0.76212187, "epoch": 0.5688745701334509, "grad_norm": 3.015625, "learning_rate": 8.571093367298058e-06, "loss": 0.92149382, "memory(GiB)": 752.07, "step": 22425, "train_speed(iter/s)": 0.190558 }, { "acc": 0.76716852, "epoch": 0.5690014095024885, "grad_norm": 3.0, "learning_rate": 8.570359313721167e-06, "loss": 0.87609997, "memory(GiB)": 752.07, "step": 22430, "train_speed(iter/s)": 0.190493 }, { "acc": 0.76934319, "epoch": 0.569128248871526, "grad_norm": 4.09375, "learning_rate": 8.569625103095546e-06, "loss": 0.88435774, "memory(GiB)": 752.07, "step": 22435, "train_speed(iter/s)": 0.19042 }, { "acc": 0.75716829, "epoch": 0.5692550882405636, "grad_norm": 3.25, "learning_rate": 8.56889073545349e-06, "loss": 0.94482059, "memory(GiB)": 752.07, "step": 22440, "train_speed(iter/s)": 0.190349 }, { "acc": 0.77612886, "epoch": 0.5693819276096012, "grad_norm": 3.515625, "learning_rate": 8.568156210827304e-06, "loss": 0.8173564, "memory(GiB)": 752.07, "step": 22445, "train_speed(iter/s)": 0.190286 }, { "acc": 0.76740637, "epoch": 0.5695087669786386, "grad_norm": 3.390625, "learning_rate": 8.567421529249292e-06, "loss": 0.88819456, "memory(GiB)": 752.07, "step": 22450, "train_speed(iter/s)": 0.190216 }, { "acc": 0.76077838, "epoch": 0.5696356063476762, "grad_norm": 3.1875, "learning_rate": 8.566686690751773e-06, "loss": 0.91664314, "memory(GiB)": 752.07, "step": 22455, "train_speed(iter/s)": 0.190154 }, { "acc": 0.75690875, "epoch": 0.5697624457167138, "grad_norm": 3.515625, "learning_rate": 8.565951695367073e-06, "loss": 0.95720949, "memory(GiB)": 752.07, "step": 22460, "train_speed(iter/s)": 0.190092 }, { "acc": 0.77084246, "epoch": 0.5698892850857513, "grad_norm": 3.484375, "learning_rate": 8.565216543127518e-06, "loss": 0.88572807, "memory(GiB)": 752.07, "step": 22465, "train_speed(iter/s)": 0.190033 }, { "acc": 0.76780839, "epoch": 0.5700161244547889, "grad_norm": 3.265625, "learning_rate": 8.564481234065446e-06, "loss": 0.89255428, "memory(GiB)": 752.07, "step": 22470, "train_speed(iter/s)": 0.18997 }, { "acc": 0.76312518, "epoch": 0.5701429638238265, "grad_norm": 3.640625, "learning_rate": 8.563745768213205e-06, "loss": 0.8694788, "memory(GiB)": 752.07, "step": 22475, "train_speed(iter/s)": 0.18991 }, { "acc": 0.75864506, "epoch": 0.570269803192864, "grad_norm": 3.171875, "learning_rate": 8.56301014560314e-06, "loss": 0.90519924, "memory(GiB)": 752.07, "step": 22480, "train_speed(iter/s)": 0.189835 }, { "acc": 0.77369661, "epoch": 0.5703966425619016, "grad_norm": 3.59375, "learning_rate": 8.562274366267612e-06, "loss": 0.92242823, "memory(GiB)": 752.07, "step": 22485, "train_speed(iter/s)": 0.189764 }, { "acc": 0.74804769, "epoch": 0.5705234819309392, "grad_norm": 3.734375, "learning_rate": 8.561538430238984e-06, "loss": 0.95044947, "memory(GiB)": 752.07, "step": 22490, "train_speed(iter/s)": 0.189691 }, { "acc": 0.77243404, "epoch": 0.5706503212999767, "grad_norm": 3.578125, "learning_rate": 8.56080233754963e-06, "loss": 0.8751235, "memory(GiB)": 752.07, "step": 22495, "train_speed(iter/s)": 0.189617 }, { "acc": 0.75571184, "epoch": 0.5707771606690143, "grad_norm": 3.421875, "learning_rate": 8.560066088231924e-06, "loss": 0.94839859, "memory(GiB)": 752.07, "step": 22500, "train_speed(iter/s)": 0.189543 }, { "epoch": 0.5707771606690143, "eval_acc": 0.7535632418241893, "eval_loss": 0.8770894408226013, "eval_runtime": 1153.8348, "eval_samples_per_second": 5.521, "eval_steps_per_second": 5.521, "step": 22500 }, { "acc": 0.77744513, "epoch": 0.5709040000380519, "grad_norm": 3.90625, "learning_rate": 8.559329682318255e-06, "loss": 0.88437357, "memory(GiB)": 687.59, "step": 22505, "train_speed(iter/s)": 47.976446 }, { "acc": 0.7802145, "epoch": 0.5710308394070893, "grad_norm": 3.109375, "learning_rate": 8.558593119841014e-06, "loss": 0.82725191, "memory(GiB)": 687.65, "step": 22510, "train_speed(iter/s)": 42.135354 }, { "acc": 0.75513997, "epoch": 0.5711576787761269, "grad_norm": 3.265625, "learning_rate": 8.5578564008326e-06, "loss": 1.00471621, "memory(GiB)": 687.65, "step": 22515, "train_speed(iter/s)": 37.487272 }, { "acc": 0.76937666, "epoch": 0.5712845181451645, "grad_norm": 3.703125, "learning_rate": 8.55711952532542e-06, "loss": 0.94397116, "memory(GiB)": 687.65, "step": 22520, "train_speed(iter/s)": 33.441441 }, { "acc": 0.77358737, "epoch": 0.571411357514202, "grad_norm": 2.6875, "learning_rate": 8.556382493351884e-06, "loss": 0.8930213, "memory(GiB)": 687.65, "step": 22525, "train_speed(iter/s)": 30.27932 }, { "acc": 0.77530255, "epoch": 0.5715381968832396, "grad_norm": 3.53125, "learning_rate": 8.555645304944415e-06, "loss": 0.91857462, "memory(GiB)": 698.12, "step": 22530, "train_speed(iter/s)": 27.585552 }, { "acc": 0.76229119, "epoch": 0.5716650362522772, "grad_norm": 3.421875, "learning_rate": 8.554907960135436e-06, "loss": 0.93242197, "memory(GiB)": 698.12, "step": 22535, "train_speed(iter/s)": 25.455403 }, { "acc": 0.78097095, "epoch": 0.5717918756213147, "grad_norm": 4.96875, "learning_rate": 8.554170458957386e-06, "loss": 0.84696541, "memory(GiB)": 698.12, "step": 22540, "train_speed(iter/s)": 23.761686 }, { "acc": 0.7633678, "epoch": 0.5719187149903523, "grad_norm": 3.34375, "learning_rate": 8.553432801442698e-06, "loss": 0.94208126, "memory(GiB)": 698.12, "step": 22545, "train_speed(iter/s)": 22.193548 }, { "acc": 0.77877936, "epoch": 0.5720455543593899, "grad_norm": 3.421875, "learning_rate": 8.552694987623824e-06, "loss": 0.87053204, "memory(GiB)": 698.12, "step": 22550, "train_speed(iter/s)": 20.797918 }, { "acc": 0.76458602, "epoch": 0.5721723937284274, "grad_norm": 3.296875, "learning_rate": 8.551957017533219e-06, "loss": 0.91081629, "memory(GiB)": 698.12, "step": 22555, "train_speed(iter/s)": 19.572183 }, { "acc": 0.77122598, "epoch": 0.572299233097465, "grad_norm": 3.125, "learning_rate": 8.551218891203338e-06, "loss": 0.92163687, "memory(GiB)": 698.12, "step": 22560, "train_speed(iter/s)": 18.522213 }, { "acc": 0.76777196, "epoch": 0.5724260724665026, "grad_norm": 4.65625, "learning_rate": 8.550480608666655e-06, "loss": 0.90815105, "memory(GiB)": 698.12, "step": 22565, "train_speed(iter/s)": 17.5406 }, { "acc": 0.76766152, "epoch": 0.57255291183554, "grad_norm": 3.46875, "learning_rate": 8.549742169955645e-06, "loss": 0.87168894, "memory(GiB)": 698.12, "step": 22570, "train_speed(iter/s)": 16.702822 }, { "acc": 0.7678978, "epoch": 0.5726797512045776, "grad_norm": 3.796875, "learning_rate": 8.549003575102786e-06, "loss": 0.89884043, "memory(GiB)": 698.12, "step": 22575, "train_speed(iter/s)": 15.957126 }, { "acc": 0.7504406, "epoch": 0.5728065905736152, "grad_norm": 3.890625, "learning_rate": 8.548264824140566e-06, "loss": 0.96268177, "memory(GiB)": 698.12, "step": 22580, "train_speed(iter/s)": 15.1803 }, { "acc": 0.7617713, "epoch": 0.5729334299426527, "grad_norm": 3.6875, "learning_rate": 8.547525917101482e-06, "loss": 0.91639051, "memory(GiB)": 698.12, "step": 22585, "train_speed(iter/s)": 14.542309 }, { "acc": 0.76079159, "epoch": 0.5730602693116903, "grad_norm": 2.859375, "learning_rate": 8.546786854018036e-06, "loss": 0.87559986, "memory(GiB)": 698.12, "step": 22590, "train_speed(iter/s)": 13.9013 }, { "acc": 0.77355461, "epoch": 0.5731871086807279, "grad_norm": 3.25, "learning_rate": 8.546047634922739e-06, "loss": 0.87547626, "memory(GiB)": 698.12, "step": 22595, "train_speed(iter/s)": 13.351309 }, { "acc": 0.77455983, "epoch": 0.5733139480497654, "grad_norm": 3.609375, "learning_rate": 8.545308259848102e-06, "loss": 0.8762166, "memory(GiB)": 698.12, "step": 22600, "train_speed(iter/s)": 12.762791 }, { "acc": 0.76221743, "epoch": 0.573440787418803, "grad_norm": 3.6875, "learning_rate": 8.544568728826652e-06, "loss": 0.90943861, "memory(GiB)": 698.12, "step": 22605, "train_speed(iter/s)": 12.34334 }, { "acc": 0.74672093, "epoch": 0.5735676267878406, "grad_norm": 3.75, "learning_rate": 8.54382904189092e-06, "loss": 0.97907104, "memory(GiB)": 698.12, "step": 22610, "train_speed(iter/s)": 11.873477 }, { "acc": 0.75394516, "epoch": 0.5736944661568781, "grad_norm": 3.5, "learning_rate": 8.543089199073435e-06, "loss": 0.95997972, "memory(GiB)": 698.12, "step": 22615, "train_speed(iter/s)": 11.472645 }, { "acc": 0.77497487, "epoch": 0.5738213055259157, "grad_norm": 3.703125, "learning_rate": 8.542349200406747e-06, "loss": 0.8766367, "memory(GiB)": 698.12, "step": 22620, "train_speed(iter/s)": 11.096663 }, { "acc": 0.76735907, "epoch": 0.5739481448949533, "grad_norm": 3.59375, "learning_rate": 8.541609045923405e-06, "loss": 0.88620558, "memory(GiB)": 698.12, "step": 22625, "train_speed(iter/s)": 10.731006 }, { "acc": 0.76078873, "epoch": 0.5740749842639907, "grad_norm": 3.453125, "learning_rate": 8.540868735655966e-06, "loss": 0.9217885, "memory(GiB)": 698.12, "step": 22630, "train_speed(iter/s)": 10.419503 }, { "acc": 0.74791875, "epoch": 0.5742018236330283, "grad_norm": 3.734375, "learning_rate": 8.540128269636992e-06, "loss": 0.91443958, "memory(GiB)": 698.12, "step": 22635, "train_speed(iter/s)": 10.136557 }, { "acc": 0.76188989, "epoch": 0.5743286630020659, "grad_norm": 3.375, "learning_rate": 8.539387647899055e-06, "loss": 0.89057446, "memory(GiB)": 698.12, "step": 22640, "train_speed(iter/s)": 9.822855 }, { "acc": 0.75898204, "epoch": 0.5744555023711034, "grad_norm": 3.671875, "learning_rate": 8.538646870474732e-06, "loss": 0.90999403, "memory(GiB)": 698.12, "step": 22645, "train_speed(iter/s)": 9.545516 }, { "acc": 0.76247568, "epoch": 0.574582341740141, "grad_norm": 3.359375, "learning_rate": 8.53790593739661e-06, "loss": 0.86100368, "memory(GiB)": 698.12, "step": 22650, "train_speed(iter/s)": 9.289064 }, { "acc": 0.77430987, "epoch": 0.5747091811091786, "grad_norm": 4.4375, "learning_rate": 8.537164848697276e-06, "loss": 0.88615694, "memory(GiB)": 698.12, "step": 22655, "train_speed(iter/s)": 9.067181 }, { "acc": 0.77382627, "epoch": 0.5748360204782161, "grad_norm": 3.28125, "learning_rate": 8.536423604409331e-06, "loss": 0.85374813, "memory(GiB)": 698.12, "step": 22660, "train_speed(iter/s)": 8.841022 }, { "acc": 0.76031661, "epoch": 0.5749628598472537, "grad_norm": 3.15625, "learning_rate": 8.53568220456538e-06, "loss": 0.89460192, "memory(GiB)": 698.12, "step": 22665, "train_speed(iter/s)": 8.623794 }, { "acc": 0.76915245, "epoch": 0.5750896992162913, "grad_norm": 3.90625, "learning_rate": 8.534940649198034e-06, "loss": 0.87719917, "memory(GiB)": 698.12, "step": 22670, "train_speed(iter/s)": 8.382465 }, { "acc": 0.76570897, "epoch": 0.5752165385853288, "grad_norm": 3.34375, "learning_rate": 8.534198938339912e-06, "loss": 0.91848364, "memory(GiB)": 698.12, "step": 22675, "train_speed(iter/s)": 8.146963 }, { "acc": 0.75303788, "epoch": 0.5753433779543664, "grad_norm": 5.125, "learning_rate": 8.533457072023638e-06, "loss": 0.90131817, "memory(GiB)": 698.12, "step": 22680, "train_speed(iter/s)": 7.960185 }, { "acc": 0.7447547, "epoch": 0.575470217323404, "grad_norm": 3.953125, "learning_rate": 8.532715050281847e-06, "loss": 0.96416178, "memory(GiB)": 698.12, "step": 22685, "train_speed(iter/s)": 7.781504 }, { "acc": 0.75383215, "epoch": 0.5755970566924414, "grad_norm": 3.734375, "learning_rate": 8.531972873147177e-06, "loss": 0.90653009, "memory(GiB)": 698.12, "step": 22690, "train_speed(iter/s)": 7.594198 }, { "acc": 0.75917382, "epoch": 0.575723896061479, "grad_norm": 2.8125, "learning_rate": 8.531230540652273e-06, "loss": 0.91438169, "memory(GiB)": 698.12, "step": 22695, "train_speed(iter/s)": 7.425888 }, { "acc": 0.77649016, "epoch": 0.5758507354305166, "grad_norm": 3.203125, "learning_rate": 8.53048805282979e-06, "loss": 0.89303293, "memory(GiB)": 698.12, "step": 22700, "train_speed(iter/s)": 7.275301 }, { "acc": 0.77230849, "epoch": 0.5759775747995541, "grad_norm": 3.296875, "learning_rate": 8.529745409712385e-06, "loss": 0.89871426, "memory(GiB)": 698.12, "step": 22705, "train_speed(iter/s)": 7.121762 }, { "acc": 0.76213021, "epoch": 0.5761044141685917, "grad_norm": 3.78125, "learning_rate": 8.529002611332726e-06, "loss": 0.93952971, "memory(GiB)": 698.12, "step": 22710, "train_speed(iter/s)": 6.989205 }, { "acc": 0.75488658, "epoch": 0.5762312535376293, "grad_norm": 3.703125, "learning_rate": 8.528259657723488e-06, "loss": 0.99691296, "memory(GiB)": 698.12, "step": 22715, "train_speed(iter/s)": 6.848489 }, { "acc": 0.77195678, "epoch": 0.5763580929066668, "grad_norm": 3.734375, "learning_rate": 8.527516548917348e-06, "loss": 0.88091516, "memory(GiB)": 698.12, "step": 22720, "train_speed(iter/s)": 6.714022 }, { "acc": 0.76845627, "epoch": 0.5764849322757044, "grad_norm": 3.5, "learning_rate": 8.526773284946995e-06, "loss": 0.87412977, "memory(GiB)": 698.12, "step": 22725, "train_speed(iter/s)": 6.59198 }, { "acc": 0.75951371, "epoch": 0.576611771644742, "grad_norm": 3.3125, "learning_rate": 8.52602986584512e-06, "loss": 0.93961706, "memory(GiB)": 698.12, "step": 22730, "train_speed(iter/s)": 6.464809 }, { "acc": 0.77102728, "epoch": 0.5767386110137795, "grad_norm": 3.15625, "learning_rate": 8.525286291644428e-06, "loss": 0.87183533, "memory(GiB)": 698.12, "step": 22735, "train_speed(iter/s)": 6.33557 }, { "acc": 0.77074132, "epoch": 0.5768654503828171, "grad_norm": 3.46875, "learning_rate": 8.524542562377623e-06, "loss": 0.91526451, "memory(GiB)": 698.12, "step": 22740, "train_speed(iter/s)": 6.216699 }, { "acc": 0.76005707, "epoch": 0.5769922897518547, "grad_norm": 3.515625, "learning_rate": 8.523798678077422e-06, "loss": 0.84958715, "memory(GiB)": 712.82, "step": 22745, "train_speed(iter/s)": 6.092631 }, { "acc": 0.75758042, "epoch": 0.5771191291208921, "grad_norm": 3.625, "learning_rate": 8.523054638776543e-06, "loss": 0.96048994, "memory(GiB)": 712.82, "step": 22750, "train_speed(iter/s)": 5.992953 }, { "acc": 0.77966781, "epoch": 0.5772459684899297, "grad_norm": 3.765625, "learning_rate": 8.522310444507716e-06, "loss": 0.85432434, "memory(GiB)": 712.82, "step": 22755, "train_speed(iter/s)": 5.892925 }, { "acc": 0.76933169, "epoch": 0.5773728078589673, "grad_norm": 3.5, "learning_rate": 8.521566095303677e-06, "loss": 0.9015255, "memory(GiB)": 712.82, "step": 22760, "train_speed(iter/s)": 5.795126 }, { "acc": 0.76300821, "epoch": 0.5774996472280048, "grad_norm": 3.296875, "learning_rate": 8.520821591197163e-06, "loss": 0.91061497, "memory(GiB)": 712.82, "step": 22765, "train_speed(iter/s)": 5.689814 }, { "acc": 0.77659836, "epoch": 0.5776264865970424, "grad_norm": 3.9375, "learning_rate": 8.520076932220927e-06, "loss": 0.8806922, "memory(GiB)": 712.82, "step": 22770, "train_speed(iter/s)": 5.604957 }, { "acc": 0.74779243, "epoch": 0.57775332596608, "grad_norm": 3.890625, "learning_rate": 8.519332118407724e-06, "loss": 0.97127151, "memory(GiB)": 712.82, "step": 22775, "train_speed(iter/s)": 5.513919 }, { "acc": 0.76274118, "epoch": 0.5778801653351175, "grad_norm": 3.234375, "learning_rate": 8.518587149790312e-06, "loss": 0.92948904, "memory(GiB)": 712.84, "step": 22780, "train_speed(iter/s)": 5.432059 }, { "acc": 0.76341982, "epoch": 0.5780070047041551, "grad_norm": 3.84375, "learning_rate": 8.517842026401463e-06, "loss": 0.92713127, "memory(GiB)": 712.84, "step": 22785, "train_speed(iter/s)": 5.351443 }, { "acc": 0.75562339, "epoch": 0.5781338440731927, "grad_norm": 3.3125, "learning_rate": 8.517096748273951e-06, "loss": 0.97194452, "memory(GiB)": 712.84, "step": 22790, "train_speed(iter/s)": 5.260852 }, { "acc": 0.76171894, "epoch": 0.5782606834422302, "grad_norm": 3.15625, "learning_rate": 8.516351315440561e-06, "loss": 0.90319767, "memory(GiB)": 712.84, "step": 22795, "train_speed(iter/s)": 5.181072 }, { "acc": 0.76519179, "epoch": 0.5783875228112678, "grad_norm": 3.0625, "learning_rate": 8.51560572793408e-06, "loss": 0.90314484, "memory(GiB)": 712.84, "step": 22800, "train_speed(iter/s)": 5.107789 }, { "acc": 0.76128521, "epoch": 0.5785143621803054, "grad_norm": 3.375, "learning_rate": 8.514859985787303e-06, "loss": 0.96204815, "memory(GiB)": 712.84, "step": 22805, "train_speed(iter/s)": 5.032848 }, { "acc": 0.77009845, "epoch": 0.5786412015493428, "grad_norm": 3.1875, "learning_rate": 8.514114089033034e-06, "loss": 0.87943335, "memory(GiB)": 712.84, "step": 22810, "train_speed(iter/s)": 4.953655 }, { "acc": 0.77140651, "epoch": 0.5787680409183804, "grad_norm": 3.53125, "learning_rate": 8.513368037704084e-06, "loss": 0.85639668, "memory(GiB)": 712.84, "step": 22815, "train_speed(iter/s)": 4.878362 }, { "acc": 0.76714892, "epoch": 0.578894880287418, "grad_norm": 3.609375, "learning_rate": 8.512621831833268e-06, "loss": 0.88431692, "memory(GiB)": 712.84, "step": 22820, "train_speed(iter/s)": 4.804349 }, { "acc": 0.76308074, "epoch": 0.5790217196564555, "grad_norm": 4.0, "learning_rate": 8.51187547145341e-06, "loss": 0.83318892, "memory(GiB)": 712.84, "step": 22825, "train_speed(iter/s)": 4.738031 }, { "acc": 0.74762478, "epoch": 0.5791485590254931, "grad_norm": 3.828125, "learning_rate": 8.511128956597339e-06, "loss": 0.94948549, "memory(GiB)": 712.84, "step": 22830, "train_speed(iter/s)": 4.669996 }, { "acc": 0.74416118, "epoch": 0.5792753983945307, "grad_norm": 3.375, "learning_rate": 8.510382287297892e-06, "loss": 0.96853571, "memory(GiB)": 712.84, "step": 22835, "train_speed(iter/s)": 4.606167 }, { "acc": 0.75601373, "epoch": 0.5794022377635683, "grad_norm": 3.375, "learning_rate": 8.509635463587913e-06, "loss": 0.91536217, "memory(GiB)": 712.84, "step": 22840, "train_speed(iter/s)": 4.540611 }, { "acc": 0.75652623, "epoch": 0.5795290771326058, "grad_norm": 3.8125, "learning_rate": 8.508888485500252e-06, "loss": 0.92429409, "memory(GiB)": 712.84, "step": 22845, "train_speed(iter/s)": 4.48063 }, { "acc": 0.76463771, "epoch": 0.5796559165016434, "grad_norm": 3.5625, "learning_rate": 8.508141353067769e-06, "loss": 0.86680641, "memory(GiB)": 712.84, "step": 22850, "train_speed(iter/s)": 4.422181 }, { "acc": 0.76217508, "epoch": 0.579782755870681, "grad_norm": 4.40625, "learning_rate": 8.507394066323323e-06, "loss": 0.90065737, "memory(GiB)": 712.84, "step": 22855, "train_speed(iter/s)": 4.358898 }, { "acc": 0.77325253, "epoch": 0.5799095952397185, "grad_norm": 3.6875, "learning_rate": 8.506646625299788e-06, "loss": 0.90258303, "memory(GiB)": 712.84, "step": 22860, "train_speed(iter/s)": 4.307456 }, { "acc": 0.77295275, "epoch": 0.5800364346087561, "grad_norm": 3.15625, "learning_rate": 8.505899030030042e-06, "loss": 0.86054745, "memory(GiB)": 712.84, "step": 22865, "train_speed(iter/s)": 4.253055 }, { "acc": 0.76937041, "epoch": 0.5801632739777935, "grad_norm": 2.984375, "learning_rate": 8.505151280546965e-06, "loss": 0.93468895, "memory(GiB)": 712.84, "step": 22870, "train_speed(iter/s)": 4.202037 }, { "acc": 0.77677341, "epoch": 0.5802901133468311, "grad_norm": 3.859375, "learning_rate": 8.504403376883455e-06, "loss": 0.87890749, "memory(GiB)": 712.84, "step": 22875, "train_speed(iter/s)": 4.154654 }, { "acc": 0.76988873, "epoch": 0.5804169527158687, "grad_norm": 4.21875, "learning_rate": 8.503655319072403e-06, "loss": 0.85225649, "memory(GiB)": 712.84, "step": 22880, "train_speed(iter/s)": 4.09888 }, { "acc": 0.78132639, "epoch": 0.5805437920849063, "grad_norm": 3.203125, "learning_rate": 8.502907107146719e-06, "loss": 0.86958265, "memory(GiB)": 712.84, "step": 22885, "train_speed(iter/s)": 4.046444 }, { "acc": 0.75562739, "epoch": 0.5806706314539438, "grad_norm": 3.15625, "learning_rate": 8.502158741139312e-06, "loss": 0.98679495, "memory(GiB)": 712.84, "step": 22890, "train_speed(iter/s)": 3.9995 }, { "acc": 0.75780849, "epoch": 0.5807974708229814, "grad_norm": 3.625, "learning_rate": 8.5014102210831e-06, "loss": 0.94336739, "memory(GiB)": 712.84, "step": 22895, "train_speed(iter/s)": 3.954175 }, { "acc": 0.77171721, "epoch": 0.580924310192019, "grad_norm": 3.515625, "learning_rate": 8.500661547011009e-06, "loss": 0.9196744, "memory(GiB)": 712.84, "step": 22900, "train_speed(iter/s)": 3.910709 }, { "acc": 0.77570195, "epoch": 0.5810511495610565, "grad_norm": 5.25, "learning_rate": 8.49991271895597e-06, "loss": 0.88324833, "memory(GiB)": 712.84, "step": 22905, "train_speed(iter/s)": 3.86625 }, { "acc": 0.77336397, "epoch": 0.5811779889300941, "grad_norm": 3.796875, "learning_rate": 8.499163736950921e-06, "loss": 0.94862642, "memory(GiB)": 712.84, "step": 22910, "train_speed(iter/s)": 3.824637 }, { "acc": 0.76963511, "epoch": 0.5813048282991317, "grad_norm": 3.765625, "learning_rate": 8.498414601028809e-06, "loss": 0.89249268, "memory(GiB)": 712.84, "step": 22915, "train_speed(iter/s)": 3.786609 }, { "acc": 0.75543008, "epoch": 0.5814316676681692, "grad_norm": 3.140625, "learning_rate": 8.497665311222586e-06, "loss": 0.92918386, "memory(GiB)": 712.84, "step": 22920, "train_speed(iter/s)": 3.742184 }, { "acc": 0.7643342, "epoch": 0.5815585070372068, "grad_norm": 3.25, "learning_rate": 8.49691586756521e-06, "loss": 0.86533642, "memory(GiB)": 712.84, "step": 22925, "train_speed(iter/s)": 3.704818 }, { "acc": 0.75520277, "epoch": 0.5816853464062443, "grad_norm": 3.171875, "learning_rate": 8.496166270089646e-06, "loss": 0.92382307, "memory(GiB)": 712.84, "step": 22930, "train_speed(iter/s)": 3.66574 }, { "acc": 0.76947017, "epoch": 0.5818121857752818, "grad_norm": 3.890625, "learning_rate": 8.495416518828867e-06, "loss": 0.88827791, "memory(GiB)": 712.84, "step": 22935, "train_speed(iter/s)": 3.628015 }, { "acc": 0.76853738, "epoch": 0.5819390251443194, "grad_norm": 3.453125, "learning_rate": 8.494666613815855e-06, "loss": 0.91463547, "memory(GiB)": 712.84, "step": 22940, "train_speed(iter/s)": 3.590664 }, { "acc": 0.75575142, "epoch": 0.582065864513357, "grad_norm": 4.0, "learning_rate": 8.49391655508359e-06, "loss": 0.95961275, "memory(GiB)": 712.84, "step": 22945, "train_speed(iter/s)": 3.55133 }, { "acc": 0.76448669, "epoch": 0.5821927038823945, "grad_norm": 3.71875, "learning_rate": 8.49316634266507e-06, "loss": 0.88703384, "memory(GiB)": 712.84, "step": 22950, "train_speed(iter/s)": 3.517012 }, { "acc": 0.76110344, "epoch": 0.5823195432514321, "grad_norm": 4.5625, "learning_rate": 8.49241597659329e-06, "loss": 0.96609049, "memory(GiB)": 712.84, "step": 22955, "train_speed(iter/s)": 3.485528 }, { "acc": 0.77165098, "epoch": 0.5824463826204697, "grad_norm": 3.15625, "learning_rate": 8.491665456901262e-06, "loss": 0.88388166, "memory(GiB)": 712.84, "step": 22960, "train_speed(iter/s)": 3.451073 }, { "acc": 0.74602985, "epoch": 0.5825732219895072, "grad_norm": 4.1875, "learning_rate": 8.490914783621994e-06, "loss": 0.97473669, "memory(GiB)": 712.84, "step": 22965, "train_speed(iter/s)": 3.417316 }, { "acc": 0.75944166, "epoch": 0.5827000613585448, "grad_norm": 3.625, "learning_rate": 8.490163956788508e-06, "loss": 0.9448535, "memory(GiB)": 712.84, "step": 22970, "train_speed(iter/s)": 3.381505 }, { "acc": 0.75690846, "epoch": 0.5828269007275824, "grad_norm": 4.4375, "learning_rate": 8.48941297643383e-06, "loss": 0.94084768, "memory(GiB)": 712.84, "step": 22975, "train_speed(iter/s)": 3.352375 }, { "acc": 0.75635886, "epoch": 0.5829537400966199, "grad_norm": 4.09375, "learning_rate": 8.488661842590994e-06, "loss": 0.96907549, "memory(GiB)": 712.84, "step": 22980, "train_speed(iter/s)": 3.321585 }, { "acc": 0.77098737, "epoch": 0.5830805794656575, "grad_norm": 3.21875, "learning_rate": 8.487910555293038e-06, "loss": 0.88454466, "memory(GiB)": 712.84, "step": 22985, "train_speed(iter/s)": 3.288489 }, { "acc": 0.77475901, "epoch": 0.583207418834695, "grad_norm": 3.140625, "learning_rate": 8.487159114573009e-06, "loss": 0.87311926, "memory(GiB)": 712.84, "step": 22990, "train_speed(iter/s)": 3.258692 }, { "acc": 0.76441827, "epoch": 0.5833342582037325, "grad_norm": 3.8125, "learning_rate": 8.486407520463961e-06, "loss": 0.87744799, "memory(GiB)": 712.84, "step": 22995, "train_speed(iter/s)": 3.230505 }, { "acc": 0.77166185, "epoch": 0.5834610975727701, "grad_norm": 3.296875, "learning_rate": 8.485655772998957e-06, "loss": 0.872579, "memory(GiB)": 712.84, "step": 23000, "train_speed(iter/s)": 3.199742 }, { "epoch": 0.5834610975727701, "eval_acc": 0.7536635059784591, "eval_loss": 0.8765835165977478, "eval_runtime": 1148.1075, "eval_samples_per_second": 5.548, "eval_steps_per_second": 5.548, "step": 23000 }, { "acc": 0.75922174, "epoch": 0.5835879369418077, "grad_norm": 4.125, "learning_rate": 8.484903872211058e-06, "loss": 0.93739204, "memory(GiB)": 712.84, "step": 23005, "train_speed(iter/s)": 2.542645 }, { "acc": 0.75436969, "epoch": 0.5837147763108452, "grad_norm": 3.203125, "learning_rate": 8.484151818133345e-06, "loss": 0.91495495, "memory(GiB)": 712.84, "step": 23010, "train_speed(iter/s)": 2.522054 }, { "acc": 0.76085982, "epoch": 0.5838416156798828, "grad_norm": 3.453125, "learning_rate": 8.483399610798892e-06, "loss": 0.92161093, "memory(GiB)": 712.84, "step": 23015, "train_speed(iter/s)": 2.503853 }, { "acc": 0.75768118, "epoch": 0.5839684550489204, "grad_norm": 3.046875, "learning_rate": 8.482647250240791e-06, "loss": 0.91332626, "memory(GiB)": 712.84, "step": 23020, "train_speed(iter/s)": 2.486853 }, { "acc": 0.77178841, "epoch": 0.5840952944179579, "grad_norm": 3.46875, "learning_rate": 8.481894736492133e-06, "loss": 0.88777685, "memory(GiB)": 712.84, "step": 23025, "train_speed(iter/s)": 2.469141 }, { "acc": 0.76611428, "epoch": 0.5842221337869955, "grad_norm": 3.34375, "learning_rate": 8.481142069586019e-06, "loss": 0.85847826, "memory(GiB)": 712.84, "step": 23030, "train_speed(iter/s)": 2.452005 }, { "acc": 0.75655828, "epoch": 0.5843489731560331, "grad_norm": 3.171875, "learning_rate": 8.480389249555557e-06, "loss": 0.94067974, "memory(GiB)": 712.84, "step": 23035, "train_speed(iter/s)": 2.434783 }, { "acc": 0.76120915, "epoch": 0.5844758125250706, "grad_norm": 3.109375, "learning_rate": 8.479636276433862e-06, "loss": 0.92662544, "memory(GiB)": 712.84, "step": 23040, "train_speed(iter/s)": 2.417652 }, { "acc": 0.77682443, "epoch": 0.5846026518941082, "grad_norm": 3.25, "learning_rate": 8.478883150254053e-06, "loss": 0.85813007, "memory(GiB)": 712.84, "step": 23045, "train_speed(iter/s)": 2.40023 }, { "acc": 0.75820031, "epoch": 0.5847294912631457, "grad_norm": 5.03125, "learning_rate": 8.478129871049259e-06, "loss": 0.95379066, "memory(GiB)": 712.84, "step": 23050, "train_speed(iter/s)": 2.384463 }, { "acc": 0.76675773, "epoch": 0.5848563306321832, "grad_norm": 2.953125, "learning_rate": 8.477376438852614e-06, "loss": 0.886765, "memory(GiB)": 712.84, "step": 23055, "train_speed(iter/s)": 2.36864 }, { "acc": 0.77033916, "epoch": 0.5849831700012208, "grad_norm": 3.046875, "learning_rate": 8.476622853697258e-06, "loss": 0.89784451, "memory(GiB)": 712.84, "step": 23060, "train_speed(iter/s)": 2.353379 }, { "acc": 0.7656805, "epoch": 0.5851100093702584, "grad_norm": 3.34375, "learning_rate": 8.475869115616341e-06, "loss": 0.89509106, "memory(GiB)": 712.84, "step": 23065, "train_speed(iter/s)": 2.337717 }, { "acc": 0.76058292, "epoch": 0.5852368487392959, "grad_norm": 3.625, "learning_rate": 8.475115224643018e-06, "loss": 0.89557896, "memory(GiB)": 712.84, "step": 23070, "train_speed(iter/s)": 2.321762 }, { "acc": 0.7692853, "epoch": 0.5853636881083335, "grad_norm": 3.515625, "learning_rate": 8.474361180810446e-06, "loss": 0.85778503, "memory(GiB)": 712.84, "step": 23075, "train_speed(iter/s)": 2.305348 }, { "acc": 0.76860952, "epoch": 0.5854905274773711, "grad_norm": 3.15625, "learning_rate": 8.473606984151797e-06, "loss": 0.89893913, "memory(GiB)": 712.84, "step": 23080, "train_speed(iter/s)": 2.289259 }, { "acc": 0.76227036, "epoch": 0.5856173668464086, "grad_norm": 4.125, "learning_rate": 8.472852634700244e-06, "loss": 0.91050615, "memory(GiB)": 712.84, "step": 23085, "train_speed(iter/s)": 2.274885 }, { "acc": 0.77162347, "epoch": 0.5857442062154462, "grad_norm": 3.53125, "learning_rate": 8.472098132488968e-06, "loss": 0.89432106, "memory(GiB)": 712.84, "step": 23090, "train_speed(iter/s)": 2.260486 }, { "acc": 0.76675777, "epoch": 0.5858710455844838, "grad_norm": 3.375, "learning_rate": 8.471343477551159e-06, "loss": 0.90344849, "memory(GiB)": 712.84, "step": 23095, "train_speed(iter/s)": 2.246632 }, { "acc": 0.77216492, "epoch": 0.5859978849535213, "grad_norm": 3.5, "learning_rate": 8.47058866992001e-06, "loss": 0.85524788, "memory(GiB)": 712.84, "step": 23100, "train_speed(iter/s)": 2.231952 }, { "acc": 0.76352172, "epoch": 0.5861247243225589, "grad_norm": 3.25, "learning_rate": 8.469833709628725e-06, "loss": 0.92755899, "memory(GiB)": 712.84, "step": 23105, "train_speed(iter/s)": 2.218778 }, { "acc": 0.77645555, "epoch": 0.5862515636915964, "grad_norm": 3.5625, "learning_rate": 8.46907859671051e-06, "loss": 0.81915693, "memory(GiB)": 712.84, "step": 23110, "train_speed(iter/s)": 2.204889 }, { "acc": 0.75878329, "epoch": 0.5863784030606339, "grad_norm": 3.859375, "learning_rate": 8.46832333119858e-06, "loss": 0.91814222, "memory(GiB)": 712.84, "step": 23115, "train_speed(iter/s)": 2.190431 }, { "acc": 0.75173821, "epoch": 0.5865052424296715, "grad_norm": 4.0, "learning_rate": 8.467567913126157e-06, "loss": 0.96843662, "memory(GiB)": 712.84, "step": 23120, "train_speed(iter/s)": 2.177362 }, { "acc": 0.76673155, "epoch": 0.5866320817987091, "grad_norm": 3.296875, "learning_rate": 8.466812342526471e-06, "loss": 0.86878376, "memory(GiB)": 712.84, "step": 23125, "train_speed(iter/s)": 2.165619 }, { "acc": 0.76730528, "epoch": 0.5867589211677466, "grad_norm": 3.078125, "learning_rate": 8.466056619432755e-06, "loss": 0.88275938, "memory(GiB)": 712.84, "step": 23130, "train_speed(iter/s)": 2.151792 }, { "acc": 0.76447945, "epoch": 0.5868857605367842, "grad_norm": 3.203125, "learning_rate": 8.465300743878253e-06, "loss": 0.92855034, "memory(GiB)": 712.84, "step": 23135, "train_speed(iter/s)": 2.139064 }, { "acc": 0.74972167, "epoch": 0.5870125999058218, "grad_norm": 3.453125, "learning_rate": 8.46454471589621e-06, "loss": 0.94842253, "memory(GiB)": 712.84, "step": 23140, "train_speed(iter/s)": 2.125645 }, { "acc": 0.77104349, "epoch": 0.5871394392748593, "grad_norm": 3.53125, "learning_rate": 8.463788535519885e-06, "loss": 0.86623354, "memory(GiB)": 712.84, "step": 23145, "train_speed(iter/s)": 2.112934 }, { "acc": 0.76982145, "epoch": 0.5872662786438969, "grad_norm": 3.0625, "learning_rate": 8.46303220278254e-06, "loss": 0.92038269, "memory(GiB)": 712.84, "step": 23150, "train_speed(iter/s)": 2.099533 }, { "acc": 0.76397419, "epoch": 0.5873931180129345, "grad_norm": 3.71875, "learning_rate": 8.46227571771744e-06, "loss": 0.88560963, "memory(GiB)": 712.84, "step": 23155, "train_speed(iter/s)": 2.087631 }, { "acc": 0.76451087, "epoch": 0.587519957381972, "grad_norm": 3.859375, "learning_rate": 8.461519080357865e-06, "loss": 0.89516602, "memory(GiB)": 712.84, "step": 23160, "train_speed(iter/s)": 2.076075 }, { "acc": 0.74402323, "epoch": 0.5876467967510096, "grad_norm": 3.421875, "learning_rate": 8.460762290737095e-06, "loss": 0.96163235, "memory(GiB)": 712.84, "step": 23165, "train_speed(iter/s)": 2.064546 }, { "acc": 0.78289332, "epoch": 0.5877736361200471, "grad_norm": 4.09375, "learning_rate": 8.460005348888416e-06, "loss": 0.84249535, "memory(GiB)": 712.84, "step": 23170, "train_speed(iter/s)": 2.053275 }, { "acc": 0.76511016, "epoch": 0.5879004754890846, "grad_norm": 3.484375, "learning_rate": 8.459248254845127e-06, "loss": 0.89688511, "memory(GiB)": 712.84, "step": 23175, "train_speed(iter/s)": 2.041671 }, { "acc": 0.76414433, "epoch": 0.5880273148581222, "grad_norm": 3.71875, "learning_rate": 8.45849100864053e-06, "loss": 0.87754707, "memory(GiB)": 712.84, "step": 23180, "train_speed(iter/s)": 2.031124 }, { "acc": 0.76443429, "epoch": 0.5881541542271598, "grad_norm": 3.65625, "learning_rate": 8.457733610307932e-06, "loss": 0.85014477, "memory(GiB)": 712.84, "step": 23185, "train_speed(iter/s)": 2.01851 }, { "acc": 0.77754226, "epoch": 0.5882809935961973, "grad_norm": 3.546875, "learning_rate": 8.456976059880648e-06, "loss": 0.90656147, "memory(GiB)": 712.84, "step": 23190, "train_speed(iter/s)": 2.007236 }, { "acc": 0.7755342, "epoch": 0.5884078329652349, "grad_norm": 2.953125, "learning_rate": 8.456218357392006e-06, "loss": 0.86891527, "memory(GiB)": 712.84, "step": 23195, "train_speed(iter/s)": 1.995704 }, { "acc": 0.77025628, "epoch": 0.5885346723342725, "grad_norm": 3.515625, "learning_rate": 8.455460502875327e-06, "loss": 0.89241838, "memory(GiB)": 712.84, "step": 23200, "train_speed(iter/s)": 1.984774 }, { "acc": 0.76049261, "epoch": 0.58866151170331, "grad_norm": 3.75, "learning_rate": 8.454702496363951e-06, "loss": 0.9207612, "memory(GiB)": 712.84, "step": 23205, "train_speed(iter/s)": 1.974639 }, { "acc": 0.77009764, "epoch": 0.5887883510723476, "grad_norm": 4.28125, "learning_rate": 8.45394433789122e-06, "loss": 0.86149359, "memory(GiB)": 712.84, "step": 23210, "train_speed(iter/s)": 1.964385 }, { "acc": 0.76980267, "epoch": 0.5889151904413852, "grad_norm": 3.28125, "learning_rate": 8.453186027490482e-06, "loss": 0.88802767, "memory(GiB)": 712.84, "step": 23215, "train_speed(iter/s)": 1.953787 }, { "acc": 0.76195383, "epoch": 0.5890420298104228, "grad_norm": 3.984375, "learning_rate": 8.452427565195093e-06, "loss": 0.87890501, "memory(GiB)": 712.84, "step": 23220, "train_speed(iter/s)": 1.942778 }, { "acc": 0.76787281, "epoch": 0.5891688691794603, "grad_norm": 4.28125, "learning_rate": 8.451668951038417e-06, "loss": 0.90182419, "memory(GiB)": 712.84, "step": 23225, "train_speed(iter/s)": 1.933389 }, { "acc": 0.77519727, "epoch": 0.5892957085484978, "grad_norm": 4.3125, "learning_rate": 8.45091018505382e-06, "loss": 0.85455503, "memory(GiB)": 712.84, "step": 23230, "train_speed(iter/s)": 1.924167 }, { "acc": 0.76322389, "epoch": 0.5894225479175353, "grad_norm": 3.359375, "learning_rate": 8.45015126727468e-06, "loss": 0.90241203, "memory(GiB)": 712.84, "step": 23235, "train_speed(iter/s)": 1.913307 }, { "acc": 0.76702857, "epoch": 0.5895493872865729, "grad_norm": 4.0, "learning_rate": 8.449392197734379e-06, "loss": 0.87391891, "memory(GiB)": 712.84, "step": 23240, "train_speed(iter/s)": 1.903199 }, { "acc": 0.77785678, "epoch": 0.5896762266556105, "grad_norm": 3.515625, "learning_rate": 8.448632976466305e-06, "loss": 0.83907881, "memory(GiB)": 712.84, "step": 23245, "train_speed(iter/s)": 1.894014 }, { "acc": 0.76823769, "epoch": 0.589803066024648, "grad_norm": 3.84375, "learning_rate": 8.447873603503855e-06, "loss": 0.88631506, "memory(GiB)": 712.84, "step": 23250, "train_speed(iter/s)": 1.884244 }, { "acc": 0.77749901, "epoch": 0.5899299053936856, "grad_norm": 3.34375, "learning_rate": 8.44711407888043e-06, "loss": 0.86796408, "memory(GiB)": 712.84, "step": 23255, "train_speed(iter/s)": 1.873891 }, { "acc": 0.77440166, "epoch": 0.5900567447627232, "grad_norm": 4.09375, "learning_rate": 8.44635440262944e-06, "loss": 0.85713911, "memory(GiB)": 712.84, "step": 23260, "train_speed(iter/s)": 1.864007 }, { "acc": 0.74543357, "epoch": 0.5901835841317608, "grad_norm": 3.234375, "learning_rate": 8.4455945747843e-06, "loss": 0.99415417, "memory(GiB)": 712.84, "step": 23265, "train_speed(iter/s)": 1.854611 }, { "acc": 0.75866723, "epoch": 0.5903104235007983, "grad_norm": 3.453125, "learning_rate": 8.444834595378434e-06, "loss": 0.94315434, "memory(GiB)": 712.84, "step": 23270, "train_speed(iter/s)": 1.844239 }, { "acc": 0.76974988, "epoch": 0.5904372628698359, "grad_norm": 3.671875, "learning_rate": 8.444074464445268e-06, "loss": 0.89288282, "memory(GiB)": 712.84, "step": 23275, "train_speed(iter/s)": 1.83478 }, { "acc": 0.76561108, "epoch": 0.5905641022388735, "grad_norm": 3.546875, "learning_rate": 8.443314182018244e-06, "loss": 0.90499659, "memory(GiB)": 712.84, "step": 23280, "train_speed(iter/s)": 1.824848 }, { "acc": 0.7704464, "epoch": 0.590690941607911, "grad_norm": 4.125, "learning_rate": 8.442553748130796e-06, "loss": 0.8870573, "memory(GiB)": 712.84, "step": 23285, "train_speed(iter/s)": 1.816041 }, { "acc": 0.76947842, "epoch": 0.5908177809769485, "grad_norm": 3.53125, "learning_rate": 8.44179316281638e-06, "loss": 0.88944263, "memory(GiB)": 712.84, "step": 23290, "train_speed(iter/s)": 1.806655 }, { "acc": 0.75556846, "epoch": 0.590944620345986, "grad_norm": 4.34375, "learning_rate": 8.441032426108447e-06, "loss": 0.93908234, "memory(GiB)": 712.84, "step": 23295, "train_speed(iter/s)": 1.797594 }, { "acc": 0.75468268, "epoch": 0.5910714597150236, "grad_norm": 3.734375, "learning_rate": 8.440271538040463e-06, "loss": 0.9479229, "memory(GiB)": 712.84, "step": 23300, "train_speed(iter/s)": 1.788915 }, { "acc": 0.76867404, "epoch": 0.5911982990840612, "grad_norm": 4.09375, "learning_rate": 8.439510498645896e-06, "loss": 0.89627495, "memory(GiB)": 712.84, "step": 23305, "train_speed(iter/s)": 1.780146 }, { "acc": 0.76966791, "epoch": 0.5913251384530988, "grad_norm": 4.125, "learning_rate": 8.438749307958218e-06, "loss": 0.86883659, "memory(GiB)": 712.84, "step": 23310, "train_speed(iter/s)": 1.770892 }, { "acc": 0.7616425, "epoch": 0.5914519778221363, "grad_norm": 3.59375, "learning_rate": 8.437987966010916e-06, "loss": 0.94283152, "memory(GiB)": 712.84, "step": 23315, "train_speed(iter/s)": 1.762877 }, { "acc": 0.77187371, "epoch": 0.5915788171911739, "grad_norm": 3.734375, "learning_rate": 8.437226472837478e-06, "loss": 0.88790112, "memory(GiB)": 712.84, "step": 23320, "train_speed(iter/s)": 1.754832 }, { "acc": 0.77178473, "epoch": 0.5917056565602115, "grad_norm": 3.0625, "learning_rate": 8.436464828471399e-06, "loss": 0.89244919, "memory(GiB)": 712.84, "step": 23325, "train_speed(iter/s)": 1.745791 }, { "acc": 0.77422161, "epoch": 0.591832495929249, "grad_norm": 3.875, "learning_rate": 8.435703032946181e-06, "loss": 0.89414291, "memory(GiB)": 712.84, "step": 23330, "train_speed(iter/s)": 1.737492 }, { "acc": 0.78056884, "epoch": 0.5919593352982866, "grad_norm": 8.75, "learning_rate": 8.434941086295334e-06, "loss": 0.86485214, "memory(GiB)": 712.84, "step": 23335, "train_speed(iter/s)": 1.728452 }, { "acc": 0.76381211, "epoch": 0.5920861746673242, "grad_norm": 3.640625, "learning_rate": 8.434178988552372e-06, "loss": 0.86996889, "memory(GiB)": 712.84, "step": 23340, "train_speed(iter/s)": 1.720655 }, { "acc": 0.75469918, "epoch": 0.5922130140363617, "grad_norm": 3.359375, "learning_rate": 8.433416739750819e-06, "loss": 0.93826237, "memory(GiB)": 712.84, "step": 23345, "train_speed(iter/s)": 1.711976 }, { "acc": 0.76080122, "epoch": 0.5923398534053992, "grad_norm": 3.015625, "learning_rate": 8.432654339924204e-06, "loss": 0.93688583, "memory(GiB)": 712.84, "step": 23350, "train_speed(iter/s)": 1.704236 }, { "acc": 0.76727266, "epoch": 0.5924666927744368, "grad_norm": 3.546875, "learning_rate": 8.431891789106062e-06, "loss": 0.91537323, "memory(GiB)": 712.84, "step": 23355, "train_speed(iter/s)": 1.696245 }, { "acc": 0.7750185, "epoch": 0.5925935321434743, "grad_norm": 2.8125, "learning_rate": 8.431129087329933e-06, "loss": 0.87621593, "memory(GiB)": 712.84, "step": 23360, "train_speed(iter/s)": 1.688803 }, { "acc": 0.7688097, "epoch": 0.5927203715125119, "grad_norm": 3.734375, "learning_rate": 8.430366234629368e-06, "loss": 0.9272047, "memory(GiB)": 712.84, "step": 23365, "train_speed(iter/s)": 1.681931 }, { "acc": 0.76430421, "epoch": 0.5928472108815495, "grad_norm": 3.203125, "learning_rate": 8.429603231037923e-06, "loss": 0.91631756, "memory(GiB)": 712.84, "step": 23370, "train_speed(iter/s)": 1.67455 }, { "acc": 0.78051682, "epoch": 0.592974050250587, "grad_norm": 3.765625, "learning_rate": 8.428840076589159e-06, "loss": 0.86412554, "memory(GiB)": 712.84, "step": 23375, "train_speed(iter/s)": 1.667053 }, { "acc": 0.76999817, "epoch": 0.5931008896196246, "grad_norm": 3.3125, "learning_rate": 8.428076771316644e-06, "loss": 0.8647295, "memory(GiB)": 712.84, "step": 23380, "train_speed(iter/s)": 1.658919 }, { "acc": 0.76712561, "epoch": 0.5932277289886622, "grad_norm": 3.625, "learning_rate": 8.427313315253956e-06, "loss": 0.89133463, "memory(GiB)": 712.84, "step": 23385, "train_speed(iter/s)": 1.652166 }, { "acc": 0.76481533, "epoch": 0.5933545683576997, "grad_norm": 3.359375, "learning_rate": 8.426549708434674e-06, "loss": 0.90995455, "memory(GiB)": 712.84, "step": 23390, "train_speed(iter/s)": 1.644598 }, { "acc": 0.76258225, "epoch": 0.5934814077267373, "grad_norm": 3.5625, "learning_rate": 8.425785950892389e-06, "loss": 0.93049946, "memory(GiB)": 712.84, "step": 23395, "train_speed(iter/s)": 1.637224 }, { "acc": 0.7674921, "epoch": 0.5936082470957749, "grad_norm": 3.34375, "learning_rate": 8.425022042660694e-06, "loss": 0.89088211, "memory(GiB)": 712.84, "step": 23400, "train_speed(iter/s)": 1.630406 }, { "acc": 0.75015554, "epoch": 0.5937350864648124, "grad_norm": 3.390625, "learning_rate": 8.424257983773193e-06, "loss": 0.94637041, "memory(GiB)": 712.84, "step": 23405, "train_speed(iter/s)": 1.62346 }, { "acc": 0.75628114, "epoch": 0.5938619258338499, "grad_norm": 3.8125, "learning_rate": 8.423493774263494e-06, "loss": 0.93891306, "memory(GiB)": 712.84, "step": 23410, "train_speed(iter/s)": 1.616625 }, { "acc": 0.76506014, "epoch": 0.5939887652028875, "grad_norm": 3.359375, "learning_rate": 8.422729414165212e-06, "loss": 0.90693865, "memory(GiB)": 712.84, "step": 23415, "train_speed(iter/s)": 1.609978 }, { "acc": 0.77019215, "epoch": 0.594115604571925, "grad_norm": 3.015625, "learning_rate": 8.421964903511967e-06, "loss": 0.88411169, "memory(GiB)": 712.84, "step": 23420, "train_speed(iter/s)": 1.60166 }, { "acc": 0.77567492, "epoch": 0.5942424439409626, "grad_norm": 4.5, "learning_rate": 8.42120024233739e-06, "loss": 0.88623619, "memory(GiB)": 712.84, "step": 23425, "train_speed(iter/s)": 1.595228 }, { "acc": 0.76541023, "epoch": 0.5943692833100002, "grad_norm": 3.390625, "learning_rate": 8.420435430675114e-06, "loss": 0.89580278, "memory(GiB)": 712.84, "step": 23430, "train_speed(iter/s)": 1.588242 }, { "acc": 0.76087861, "epoch": 0.5944961226790377, "grad_norm": 3.515625, "learning_rate": 8.419670468558783e-06, "loss": 0.88943844, "memory(GiB)": 712.84, "step": 23435, "train_speed(iter/s)": 1.581446 }, { "acc": 0.76879625, "epoch": 0.5946229620480753, "grad_norm": 3.4375, "learning_rate": 8.418905356022043e-06, "loss": 0.91794777, "memory(GiB)": 712.84, "step": 23440, "train_speed(iter/s)": 1.57474 }, { "acc": 0.76762843, "epoch": 0.5947498014171129, "grad_norm": 3.484375, "learning_rate": 8.418140093098551e-06, "loss": 0.84953432, "memory(GiB)": 712.84, "step": 23445, "train_speed(iter/s)": 1.568003 }, { "acc": 0.76345301, "epoch": 0.5948766407861504, "grad_norm": 3.5625, "learning_rate": 8.417374679821964e-06, "loss": 0.90283928, "memory(GiB)": 712.84, "step": 23450, "train_speed(iter/s)": 1.561244 }, { "acc": 0.76153808, "epoch": 0.595003480155188, "grad_norm": 3.40625, "learning_rate": 8.416609116225956e-06, "loss": 0.91692696, "memory(GiB)": 712.84, "step": 23455, "train_speed(iter/s)": 1.555078 }, { "acc": 0.74728427, "epoch": 0.5951303195242256, "grad_norm": 6.90625, "learning_rate": 8.415843402344198e-06, "loss": 0.95913734, "memory(GiB)": 712.84, "step": 23460, "train_speed(iter/s)": 1.548256 }, { "acc": 0.76052036, "epoch": 0.5952571588932631, "grad_norm": 3.875, "learning_rate": 8.415077538210371e-06, "loss": 0.88032312, "memory(GiB)": 712.84, "step": 23465, "train_speed(iter/s)": 1.541041 }, { "acc": 0.76282034, "epoch": 0.5953839982623006, "grad_norm": 3.5, "learning_rate": 8.414311523858167e-06, "loss": 0.8853281, "memory(GiB)": 712.84, "step": 23470, "train_speed(iter/s)": 1.534497 }, { "acc": 0.76197567, "epoch": 0.5955108376313382, "grad_norm": 3.984375, "learning_rate": 8.413545359321276e-06, "loss": 0.91281681, "memory(GiB)": 712.84, "step": 23475, "train_speed(iter/s)": 1.528529 }, { "acc": 0.77759748, "epoch": 0.5956376770003757, "grad_norm": 3.234375, "learning_rate": 8.412779044633401e-06, "loss": 0.85711031, "memory(GiB)": 712.84, "step": 23480, "train_speed(iter/s)": 1.521987 }, { "acc": 0.77375073, "epoch": 0.5957645163694133, "grad_norm": 3.34375, "learning_rate": 8.41201257982825e-06, "loss": 0.87860785, "memory(GiB)": 712.84, "step": 23485, "train_speed(iter/s)": 1.515549 }, { "acc": 0.75993147, "epoch": 0.5958913557384509, "grad_norm": 3.46875, "learning_rate": 8.411245964939537e-06, "loss": 0.91445408, "memory(GiB)": 712.84, "step": 23490, "train_speed(iter/s)": 1.509416 }, { "acc": 0.75218554, "epoch": 0.5960181951074884, "grad_norm": 3.765625, "learning_rate": 8.410479200000984e-06, "loss": 0.94134188, "memory(GiB)": 712.84, "step": 23495, "train_speed(iter/s)": 1.503344 }, { "acc": 0.75037751, "epoch": 0.596145034476526, "grad_norm": 3.015625, "learning_rate": 8.409712285046314e-06, "loss": 0.92169323, "memory(GiB)": 712.84, "step": 23500, "train_speed(iter/s)": 1.496887 }, { "epoch": 0.596145034476526, "eval_acc": 0.7535461133645016, "eval_loss": 0.8760190010070801, "eval_runtime": 1149.4245, "eval_samples_per_second": 5.542, "eval_steps_per_second": 5.542, "step": 23500 }, { "acc": 0.759131, "epoch": 0.5962718738455636, "grad_norm": 3.9375, "learning_rate": 8.408945220109268e-06, "loss": 0.90986977, "memory(GiB)": 712.84, "step": 23505, "train_speed(iter/s)": 1.338109 }, { "acc": 0.76927114, "epoch": 0.5963987132146011, "grad_norm": 3.734375, "learning_rate": 8.408178005223581e-06, "loss": 0.8813508, "memory(GiB)": 712.84, "step": 23510, "train_speed(iter/s)": 1.333032 }, { "acc": 0.76826539, "epoch": 0.5965255525836387, "grad_norm": 3.515625, "learning_rate": 8.407410640423003e-06, "loss": 0.92434607, "memory(GiB)": 712.84, "step": 23515, "train_speed(iter/s)": 1.328076 }, { "acc": 0.75499282, "epoch": 0.5966523919526763, "grad_norm": 4.125, "learning_rate": 8.40664312574129e-06, "loss": 0.95820227, "memory(GiB)": 712.84, "step": 23520, "train_speed(iter/s)": 1.323324 }, { "acc": 0.76707034, "epoch": 0.5967792313217138, "grad_norm": 3.328125, "learning_rate": 8.405875461212198e-06, "loss": 0.94129019, "memory(GiB)": 712.84, "step": 23525, "train_speed(iter/s)": 1.318267 }, { "acc": 0.76019869, "epoch": 0.5969060706907513, "grad_norm": 3.4375, "learning_rate": 8.405107646869496e-06, "loss": 0.93546295, "memory(GiB)": 712.84, "step": 23530, "train_speed(iter/s)": 1.313161 }, { "acc": 0.78094668, "epoch": 0.5970329100597889, "grad_norm": 3.375, "learning_rate": 8.40433968274696e-06, "loss": 0.81817207, "memory(GiB)": 712.84, "step": 23535, "train_speed(iter/s)": 1.307996 }, { "acc": 0.76067638, "epoch": 0.5971597494288264, "grad_norm": 3.125, "learning_rate": 8.403571568878365e-06, "loss": 0.95612888, "memory(GiB)": 712.84, "step": 23540, "train_speed(iter/s)": 1.303342 }, { "acc": 0.76216564, "epoch": 0.597286588797864, "grad_norm": 3.734375, "learning_rate": 8.402803305297505e-06, "loss": 0.87242775, "memory(GiB)": 712.84, "step": 23545, "train_speed(iter/s)": 1.298914 }, { "acc": 0.76542549, "epoch": 0.5974134281669016, "grad_norm": 3.5, "learning_rate": 8.402034892038167e-06, "loss": 0.93285341, "memory(GiB)": 712.84, "step": 23550, "train_speed(iter/s)": 1.294535 }, { "acc": 0.77526779, "epoch": 0.5975402675359391, "grad_norm": 3.5, "learning_rate": 8.401266329134154e-06, "loss": 0.87895527, "memory(GiB)": 712.84, "step": 23555, "train_speed(iter/s)": 1.289932 }, { "acc": 0.76138854, "epoch": 0.5976671069049767, "grad_norm": 3.828125, "learning_rate": 8.400497616619273e-06, "loss": 0.91224165, "memory(GiB)": 712.84, "step": 23560, "train_speed(iter/s)": 1.28561 }, { "acc": 0.75676484, "epoch": 0.5977939462740143, "grad_norm": 4.0625, "learning_rate": 8.399728754527336e-06, "loss": 0.96251822, "memory(GiB)": 712.84, "step": 23565, "train_speed(iter/s)": 1.281252 }, { "acc": 0.75641112, "epoch": 0.5979207856430518, "grad_norm": 3.25, "learning_rate": 8.398959742892164e-06, "loss": 0.90463009, "memory(GiB)": 712.84, "step": 23570, "train_speed(iter/s)": 1.276493 }, { "acc": 0.77191577, "epoch": 0.5980476250120894, "grad_norm": 3.40625, "learning_rate": 8.398190581747581e-06, "loss": 0.88824844, "memory(GiB)": 712.84, "step": 23575, "train_speed(iter/s)": 1.272283 }, { "acc": 0.75422664, "epoch": 0.598174464381127, "grad_norm": 4.03125, "learning_rate": 8.397421271127423e-06, "loss": 0.9215188, "memory(GiB)": 712.84, "step": 23580, "train_speed(iter/s)": 1.268171 }, { "acc": 0.75341659, "epoch": 0.5983013037501645, "grad_norm": 4.03125, "learning_rate": 8.396651811065528e-06, "loss": 0.91987181, "memory(GiB)": 712.84, "step": 23585, "train_speed(iter/s)": 1.263271 }, { "acc": 0.76896157, "epoch": 0.598428143119202, "grad_norm": 3.1875, "learning_rate": 8.395882201595742e-06, "loss": 0.93440256, "memory(GiB)": 712.84, "step": 23590, "train_speed(iter/s)": 1.258687 }, { "acc": 0.78492861, "epoch": 0.5985549824882396, "grad_norm": 3.21875, "learning_rate": 8.395112442751917e-06, "loss": 0.87697268, "memory(GiB)": 712.84, "step": 23595, "train_speed(iter/s)": 1.254456 }, { "acc": 0.76730661, "epoch": 0.5986818218572771, "grad_norm": 3.265625, "learning_rate": 8.394342534567914e-06, "loss": 0.8243351, "memory(GiB)": 712.84, "step": 23600, "train_speed(iter/s)": 1.250294 }, { "acc": 0.76998572, "epoch": 0.5988086612263147, "grad_norm": 3.53125, "learning_rate": 8.393572477077598e-06, "loss": 0.87350521, "memory(GiB)": 712.84, "step": 23605, "train_speed(iter/s)": 1.245718 }, { "acc": 0.75306625, "epoch": 0.5989355005953523, "grad_norm": 4.375, "learning_rate": 8.392802270314842e-06, "loss": 0.96040068, "memory(GiB)": 712.84, "step": 23610, "train_speed(iter/s)": 1.24142 }, { "acc": 0.76336927, "epoch": 0.5990623399643898, "grad_norm": 3.625, "learning_rate": 8.392031914313523e-06, "loss": 0.88044281, "memory(GiB)": 712.84, "step": 23615, "train_speed(iter/s)": 1.237437 }, { "acc": 0.76418128, "epoch": 0.5991891793334274, "grad_norm": 3.359375, "learning_rate": 8.391261409107527e-06, "loss": 0.92206097, "memory(GiB)": 712.84, "step": 23620, "train_speed(iter/s)": 1.233413 }, { "acc": 0.75938859, "epoch": 0.599316018702465, "grad_norm": 3.1875, "learning_rate": 8.390490754730749e-06, "loss": 0.92685652, "memory(GiB)": 712.84, "step": 23625, "train_speed(iter/s)": 1.229479 }, { "acc": 0.77458019, "epoch": 0.5994428580715025, "grad_norm": 3.65625, "learning_rate": 8.389719951217084e-06, "loss": 0.89437494, "memory(GiB)": 712.84, "step": 23630, "train_speed(iter/s)": 1.22542 }, { "acc": 0.75902324, "epoch": 0.5995696974405401, "grad_norm": 3.484375, "learning_rate": 8.388948998600438e-06, "loss": 0.92393179, "memory(GiB)": 712.84, "step": 23635, "train_speed(iter/s)": 1.221686 }, { "acc": 0.75543203, "epoch": 0.5996965368095777, "grad_norm": 3.640625, "learning_rate": 8.388177896914725e-06, "loss": 0.93485231, "memory(GiB)": 712.84, "step": 23640, "train_speed(iter/s)": 1.217183 }, { "acc": 0.76512227, "epoch": 0.5998233761786153, "grad_norm": 3.109375, "learning_rate": 8.387406646193861e-06, "loss": 0.88559647, "memory(GiB)": 712.84, "step": 23645, "train_speed(iter/s)": 1.213258 }, { "acc": 0.74537253, "epoch": 0.5999502155476527, "grad_norm": 3.46875, "learning_rate": 8.386635246471772e-06, "loss": 0.97047272, "memory(GiB)": 712.84, "step": 23650, "train_speed(iter/s)": 1.209277 }, { "acc": 0.76480455, "epoch": 0.6000770549166903, "grad_norm": 3.578125, "learning_rate": 8.385863697782389e-06, "loss": 0.90800285, "memory(GiB)": 712.84, "step": 23655, "train_speed(iter/s)": 1.205122 }, { "acc": 0.77505264, "epoch": 0.6002038942857278, "grad_norm": 3.796875, "learning_rate": 8.38509200015965e-06, "loss": 0.88896227, "memory(GiB)": 712.84, "step": 23660, "train_speed(iter/s)": 1.201218 }, { "acc": 0.77832465, "epoch": 0.6003307336547654, "grad_norm": 3.109375, "learning_rate": 8.384320153637499e-06, "loss": 0.90318737, "memory(GiB)": 712.84, "step": 23665, "train_speed(iter/s)": 1.197519 }, { "acc": 0.75851269, "epoch": 0.600457573023803, "grad_norm": 4.21875, "learning_rate": 8.383548158249888e-06, "loss": 0.89808807, "memory(GiB)": 712.84, "step": 23670, "train_speed(iter/s)": 1.193657 }, { "acc": 0.76689534, "epoch": 0.6005844123928405, "grad_norm": 3.8125, "learning_rate": 8.382776014030774e-06, "loss": 0.88611107, "memory(GiB)": 712.87, "step": 23675, "train_speed(iter/s)": 1.190102 }, { "acc": 0.76311717, "epoch": 0.6007112517618781, "grad_norm": 3.34375, "learning_rate": 8.382003721014122e-06, "loss": 0.92355194, "memory(GiB)": 712.87, "step": 23680, "train_speed(iter/s)": 1.186092 }, { "acc": 0.76784458, "epoch": 0.6008380911309157, "grad_norm": 3.375, "learning_rate": 8.381231279233901e-06, "loss": 0.91849232, "memory(GiB)": 712.87, "step": 23685, "train_speed(iter/s)": 1.182498 }, { "acc": 0.7730566, "epoch": 0.6009649304999533, "grad_norm": 3.703125, "learning_rate": 8.38045868872409e-06, "loss": 0.8687624, "memory(GiB)": 712.87, "step": 23690, "train_speed(iter/s)": 1.178876 }, { "acc": 0.75599685, "epoch": 0.6010917698689908, "grad_norm": 3.328125, "learning_rate": 8.379685949518671e-06, "loss": 0.95054979, "memory(GiB)": 712.87, "step": 23695, "train_speed(iter/s)": 1.175241 }, { "acc": 0.77521343, "epoch": 0.6012186092380284, "grad_norm": 4.0625, "learning_rate": 8.378913061651636e-06, "loss": 0.85939083, "memory(GiB)": 712.87, "step": 23700, "train_speed(iter/s)": 1.171992 }, { "acc": 0.75670557, "epoch": 0.601345448607066, "grad_norm": 3.59375, "learning_rate": 8.378140025156982e-06, "loss": 0.91708546, "memory(GiB)": 712.87, "step": 23705, "train_speed(iter/s)": 1.16868 }, { "acc": 0.76300035, "epoch": 0.6014722879761034, "grad_norm": 3.15625, "learning_rate": 8.377366840068711e-06, "loss": 0.86050377, "memory(GiB)": 712.87, "step": 23710, "train_speed(iter/s)": 1.165177 }, { "acc": 0.77189646, "epoch": 0.601599127345141, "grad_norm": 3.015625, "learning_rate": 8.376593506420834e-06, "loss": 0.86932955, "memory(GiB)": 712.87, "step": 23715, "train_speed(iter/s)": 1.161839 }, { "acc": 0.77244449, "epoch": 0.6017259667141786, "grad_norm": 3.34375, "learning_rate": 8.375820024247367e-06, "loss": 0.87973948, "memory(GiB)": 712.87, "step": 23720, "train_speed(iter/s)": 1.158387 }, { "acc": 0.77274437, "epoch": 0.6018528060832161, "grad_norm": 4.4375, "learning_rate": 8.375046393582332e-06, "loss": 0.8974165, "memory(GiB)": 712.87, "step": 23725, "train_speed(iter/s)": 1.154522 }, { "acc": 0.76174827, "epoch": 0.6019796454522537, "grad_norm": 4.6875, "learning_rate": 8.37427261445976e-06, "loss": 0.96124105, "memory(GiB)": 712.87, "step": 23730, "train_speed(iter/s)": 1.151094 }, { "acc": 0.7690774, "epoch": 0.6021064848212913, "grad_norm": 3.484375, "learning_rate": 8.373498686913688e-06, "loss": 0.87109747, "memory(GiB)": 712.87, "step": 23735, "train_speed(iter/s)": 1.147487 }, { "acc": 0.76779609, "epoch": 0.6022333241903288, "grad_norm": 4.375, "learning_rate": 8.372724610978157e-06, "loss": 0.93654814, "memory(GiB)": 712.87, "step": 23740, "train_speed(iter/s)": 1.143764 }, { "acc": 0.77754664, "epoch": 0.6023601635593664, "grad_norm": 3.59375, "learning_rate": 8.371950386687217e-06, "loss": 0.82540731, "memory(GiB)": 712.87, "step": 23745, "train_speed(iter/s)": 1.14041 }, { "acc": 0.76977782, "epoch": 0.602487002928404, "grad_norm": 3.921875, "learning_rate": 8.371176014074923e-06, "loss": 0.91697721, "memory(GiB)": 712.87, "step": 23750, "train_speed(iter/s)": 1.136933 }, { "acc": 0.77596045, "epoch": 0.6026138422974415, "grad_norm": 3.671875, "learning_rate": 8.370401493175336e-06, "loss": 0.88178272, "memory(GiB)": 712.87, "step": 23755, "train_speed(iter/s)": 1.133148 }, { "acc": 0.76990743, "epoch": 0.6027406816664791, "grad_norm": 3.34375, "learning_rate": 8.369626824022528e-06, "loss": 0.90877733, "memory(GiB)": 712.87, "step": 23760, "train_speed(iter/s)": 1.129643 }, { "acc": 0.7811419, "epoch": 0.6028675210355167, "grad_norm": 4.21875, "learning_rate": 8.36885200665057e-06, "loss": 0.84985256, "memory(GiB)": 712.87, "step": 23765, "train_speed(iter/s)": 1.126183 }, { "acc": 0.75820136, "epoch": 0.6029943604045541, "grad_norm": 4.0625, "learning_rate": 8.368077041093547e-06, "loss": 0.97022657, "memory(GiB)": 712.87, "step": 23770, "train_speed(iter/s)": 1.122501 }, { "acc": 0.77473526, "epoch": 0.6031211997735917, "grad_norm": 3.234375, "learning_rate": 8.367301927385547e-06, "loss": 0.88025846, "memory(GiB)": 712.87, "step": 23775, "train_speed(iter/s)": 1.119527 }, { "acc": 0.75659885, "epoch": 0.6032480391426293, "grad_norm": 3.515625, "learning_rate": 8.366526665560663e-06, "loss": 0.92561617, "memory(GiB)": 712.87, "step": 23780, "train_speed(iter/s)": 1.116216 }, { "acc": 0.76040483, "epoch": 0.6033748785116668, "grad_norm": 3.984375, "learning_rate": 8.365751255652998e-06, "loss": 0.92057686, "memory(GiB)": 712.87, "step": 23785, "train_speed(iter/s)": 1.113189 }, { "acc": 0.76936288, "epoch": 0.6035017178807044, "grad_norm": 3.671875, "learning_rate": 8.36497569769666e-06, "loss": 0.95790586, "memory(GiB)": 712.87, "step": 23790, "train_speed(iter/s)": 1.109841 }, { "acc": 0.75448408, "epoch": 0.603628557249742, "grad_norm": 3.953125, "learning_rate": 8.364199991725764e-06, "loss": 0.93954124, "memory(GiB)": 712.87, "step": 23795, "train_speed(iter/s)": 1.106815 }, { "acc": 0.76412129, "epoch": 0.6037553966187795, "grad_norm": 3.078125, "learning_rate": 8.363424137774424e-06, "loss": 0.87816477, "memory(GiB)": 712.87, "step": 23800, "train_speed(iter/s)": 1.10353 }, { "acc": 0.76642013, "epoch": 0.6038822359878171, "grad_norm": 3.109375, "learning_rate": 8.362648135876778e-06, "loss": 0.88498726, "memory(GiB)": 712.87, "step": 23805, "train_speed(iter/s)": 1.100289 }, { "acc": 0.75371752, "epoch": 0.6040090753568547, "grad_norm": 3.46875, "learning_rate": 8.361871986066952e-06, "loss": 0.92344398, "memory(GiB)": 712.87, "step": 23810, "train_speed(iter/s)": 1.097126 }, { "acc": 0.7779696, "epoch": 0.6041359147258922, "grad_norm": 2.921875, "learning_rate": 8.361095688379089e-06, "loss": 0.84107876, "memory(GiB)": 712.87, "step": 23815, "train_speed(iter/s)": 1.093682 }, { "acc": 0.76976051, "epoch": 0.6042627540949298, "grad_norm": 3.484375, "learning_rate": 8.360319242847337e-06, "loss": 0.91004677, "memory(GiB)": 712.87, "step": 23820, "train_speed(iter/s)": 1.090674 }, { "acc": 0.77117357, "epoch": 0.6043895934639674, "grad_norm": 3.65625, "learning_rate": 8.359542649505847e-06, "loss": 0.92977734, "memory(GiB)": 712.87, "step": 23825, "train_speed(iter/s)": 1.087884 }, { "acc": 0.76803579, "epoch": 0.6045164328330048, "grad_norm": 3.765625, "learning_rate": 8.358765908388781e-06, "loss": 0.87864981, "memory(GiB)": 712.87, "step": 23830, "train_speed(iter/s)": 1.084871 }, { "acc": 0.75835967, "epoch": 0.6046432722020424, "grad_norm": 2.984375, "learning_rate": 8.357989019530303e-06, "loss": 0.89505491, "memory(GiB)": 712.87, "step": 23835, "train_speed(iter/s)": 1.081837 }, { "acc": 0.76619053, "epoch": 0.60477011157108, "grad_norm": 3.453125, "learning_rate": 8.357211982964588e-06, "loss": 0.90073786, "memory(GiB)": 712.87, "step": 23840, "train_speed(iter/s)": 1.079115 }, { "acc": 0.77643871, "epoch": 0.6048969509401175, "grad_norm": 3.59375, "learning_rate": 8.356434798725814e-06, "loss": 0.83640165, "memory(GiB)": 712.87, "step": 23845, "train_speed(iter/s)": 1.075798 }, { "acc": 0.74504628, "epoch": 0.6050237903091551, "grad_norm": 3.375, "learning_rate": 8.355657466848168e-06, "loss": 0.97848282, "memory(GiB)": 712.87, "step": 23850, "train_speed(iter/s)": 1.072391 }, { "acc": 0.75772262, "epoch": 0.6051506296781927, "grad_norm": 3.65625, "learning_rate": 8.354879987365842e-06, "loss": 0.93875418, "memory(GiB)": 712.87, "step": 23855, "train_speed(iter/s)": 1.069613 }, { "acc": 0.75910482, "epoch": 0.6052774690472302, "grad_norm": 3.53125, "learning_rate": 8.354102360313034e-06, "loss": 0.89543762, "memory(GiB)": 712.87, "step": 23860, "train_speed(iter/s)": 1.066852 }, { "acc": 0.76773219, "epoch": 0.6054043084162678, "grad_norm": 3.21875, "learning_rate": 8.353324585723951e-06, "loss": 0.89095316, "memory(GiB)": 712.87, "step": 23865, "train_speed(iter/s)": 1.063673 }, { "acc": 0.76949997, "epoch": 0.6055311477853054, "grad_norm": 3.390625, "learning_rate": 8.352546663632804e-06, "loss": 0.91677485, "memory(GiB)": 712.87, "step": 23870, "train_speed(iter/s)": 1.060939 }, { "acc": 0.76164584, "epoch": 0.6056579871543429, "grad_norm": 3.625, "learning_rate": 8.351768594073811e-06, "loss": 0.90298319, "memory(GiB)": 712.87, "step": 23875, "train_speed(iter/s)": 1.058119 }, { "acc": 0.75607457, "epoch": 0.6057848265233805, "grad_norm": 3.4375, "learning_rate": 8.350990377081197e-06, "loss": 0.89550915, "memory(GiB)": 712.87, "step": 23880, "train_speed(iter/s)": 1.055428 }, { "acc": 0.76291347, "epoch": 0.6059116658924181, "grad_norm": 4.21875, "learning_rate": 8.350212012689192e-06, "loss": 0.94914322, "memory(GiB)": 712.87, "step": 23885, "train_speed(iter/s)": 1.052561 }, { "acc": 0.75296717, "epoch": 0.6060385052614555, "grad_norm": 3.484375, "learning_rate": 8.349433500932034e-06, "loss": 0.95538139, "memory(GiB)": 712.87, "step": 23890, "train_speed(iter/s)": 1.049441 }, { "acc": 0.74561539, "epoch": 0.6061653446304931, "grad_norm": 3.71875, "learning_rate": 8.34865484184397e-06, "loss": 0.9522068, "memory(GiB)": 712.87, "step": 23895, "train_speed(iter/s)": 1.04648 }, { "acc": 0.75906482, "epoch": 0.6062921839995307, "grad_norm": 3.671875, "learning_rate": 8.347876035459248e-06, "loss": 0.94439287, "memory(GiB)": 712.87, "step": 23900, "train_speed(iter/s)": 1.04351 }, { "acc": 0.75876069, "epoch": 0.6064190233685682, "grad_norm": 3.984375, "learning_rate": 8.347097081812128e-06, "loss": 0.92514706, "memory(GiB)": 712.87, "step": 23905, "train_speed(iter/s)": 1.040903 }, { "acc": 0.76373796, "epoch": 0.6065458627376058, "grad_norm": 3.390625, "learning_rate": 8.34631798093687e-06, "loss": 0.89805593, "memory(GiB)": 712.87, "step": 23910, "train_speed(iter/s)": 1.038022 }, { "acc": 0.74789286, "epoch": 0.6066727021066434, "grad_norm": 3.6875, "learning_rate": 8.345538732867747e-06, "loss": 0.94720154, "memory(GiB)": 712.87, "step": 23915, "train_speed(iter/s)": 1.035285 }, { "acc": 0.76970582, "epoch": 0.6067995414756809, "grad_norm": 3.546875, "learning_rate": 8.344759337639035e-06, "loss": 0.85910416, "memory(GiB)": 712.87, "step": 23920, "train_speed(iter/s)": 1.032563 }, { "acc": 0.75852799, "epoch": 0.6069263808447185, "grad_norm": 3.9375, "learning_rate": 8.343979795285015e-06, "loss": 0.93097162, "memory(GiB)": 712.87, "step": 23925, "train_speed(iter/s)": 1.029877 }, { "acc": 0.75922832, "epoch": 0.6070532202137561, "grad_norm": 3.296875, "learning_rate": 8.34320010583998e-06, "loss": 0.90028515, "memory(GiB)": 712.87, "step": 23930, "train_speed(iter/s)": 1.026794 }, { "acc": 0.75685043, "epoch": 0.6071800595827936, "grad_norm": 3.71875, "learning_rate": 8.342420269338225e-06, "loss": 0.91278362, "memory(GiB)": 712.87, "step": 23935, "train_speed(iter/s)": 1.023608 }, { "acc": 0.75759254, "epoch": 0.6073068989518312, "grad_norm": 3.734375, "learning_rate": 8.34164028581405e-06, "loss": 0.99827042, "memory(GiB)": 712.87, "step": 23940, "train_speed(iter/s)": 1.020992 }, { "acc": 0.74830246, "epoch": 0.6074337383208688, "grad_norm": 3.25, "learning_rate": 8.340860155301767e-06, "loss": 0.94049692, "memory(GiB)": 712.87, "step": 23945, "train_speed(iter/s)": 1.018265 }, { "acc": 0.76274581, "epoch": 0.6075605776899062, "grad_norm": 3.828125, "learning_rate": 8.340079877835691e-06, "loss": 0.92451668, "memory(GiB)": 712.87, "step": 23950, "train_speed(iter/s)": 1.015752 }, { "acc": 0.7522861, "epoch": 0.6076874170589438, "grad_norm": 3.609375, "learning_rate": 8.339299453450142e-06, "loss": 0.9563179, "memory(GiB)": 712.87, "step": 23955, "train_speed(iter/s)": 1.013092 }, { "acc": 0.7642941, "epoch": 0.6078142564279814, "grad_norm": 3.328125, "learning_rate": 8.338518882179452e-06, "loss": 0.94226618, "memory(GiB)": 726.27, "step": 23960, "train_speed(iter/s)": 1.010161 }, { "acc": 0.7606843, "epoch": 0.6079410957970189, "grad_norm": 3.28125, "learning_rate": 8.337738164057951e-06, "loss": 0.90758991, "memory(GiB)": 726.27, "step": 23965, "train_speed(iter/s)": 1.007381 }, { "acc": 0.77399869, "epoch": 0.6080679351660565, "grad_norm": 3.359375, "learning_rate": 8.336957299119982e-06, "loss": 0.9167325, "memory(GiB)": 726.27, "step": 23970, "train_speed(iter/s)": 1.005017 }, { "acc": 0.75383873, "epoch": 0.6081947745350941, "grad_norm": 3.21875, "learning_rate": 8.336176287399898e-06, "loss": 0.89816227, "memory(GiB)": 726.27, "step": 23975, "train_speed(iter/s)": 1.002483 }, { "acc": 0.75064311, "epoch": 0.6083216139041316, "grad_norm": 3.421875, "learning_rate": 8.335395128932046e-06, "loss": 0.90979881, "memory(GiB)": 726.27, "step": 23980, "train_speed(iter/s)": 0.999695 }, { "acc": 0.75189371, "epoch": 0.6084484532731692, "grad_norm": 3.828125, "learning_rate": 8.33461382375079e-06, "loss": 0.94228086, "memory(GiB)": 726.27, "step": 23985, "train_speed(iter/s)": 0.996925 }, { "acc": 0.76171513, "epoch": 0.6085752926422068, "grad_norm": 3.34375, "learning_rate": 8.333832371890495e-06, "loss": 0.91306887, "memory(GiB)": 726.27, "step": 23990, "train_speed(iter/s)": 0.994458 }, { "acc": 0.77581692, "epoch": 0.6087021320112443, "grad_norm": 4.1875, "learning_rate": 8.333050773385538e-06, "loss": 0.90286484, "memory(GiB)": 726.27, "step": 23995, "train_speed(iter/s)": 0.991984 }, { "acc": 0.7542841, "epoch": 0.6088289713802819, "grad_norm": 3.65625, "learning_rate": 8.332269028270297e-06, "loss": 0.9382576, "memory(GiB)": 726.27, "step": 24000, "train_speed(iter/s)": 0.989537 }, { "epoch": 0.6088289713802819, "eval_acc": 0.7540933885398907, "eval_loss": 0.8749490976333618, "eval_runtime": 1148.1056, "eval_samples_per_second": 5.548, "eval_steps_per_second": 5.548, "step": 24000 }, { "acc": 0.77358365, "epoch": 0.6089558107493195, "grad_norm": 4.46875, "learning_rate": 8.33148713657916e-06, "loss": 0.92750416, "memory(GiB)": 726.27, "step": 24005, "train_speed(iter/s)": 0.915634 }, { "acc": 0.75963087, "epoch": 0.6090826501183569, "grad_norm": 2.1875, "learning_rate": 8.330705098346517e-06, "loss": 0.91565685, "memory(GiB)": 726.27, "step": 24010, "train_speed(iter/s)": 0.91328 }, { "acc": 0.76382575, "epoch": 0.6092094894873945, "grad_norm": 4.0625, "learning_rate": 8.329922913606767e-06, "loss": 0.92128716, "memory(GiB)": 726.27, "step": 24015, "train_speed(iter/s)": 0.911226 }, { "acc": 0.76636372, "epoch": 0.6093363288564321, "grad_norm": 3.328125, "learning_rate": 8.32914058239432e-06, "loss": 0.91948071, "memory(GiB)": 726.27, "step": 24020, "train_speed(iter/s)": 0.909336 }, { "acc": 0.75904946, "epoch": 0.6094631682254696, "grad_norm": 3.359375, "learning_rate": 8.328358104743588e-06, "loss": 0.95164518, "memory(GiB)": 726.27, "step": 24025, "train_speed(iter/s)": 0.907327 }, { "acc": 0.76720271, "epoch": 0.6095900075945072, "grad_norm": 3.59375, "learning_rate": 8.327575480688985e-06, "loss": 0.84857254, "memory(GiB)": 726.27, "step": 24030, "train_speed(iter/s)": 0.90537 }, { "acc": 0.78338127, "epoch": 0.6097168469635448, "grad_norm": 3.734375, "learning_rate": 8.326792710264939e-06, "loss": 0.86073666, "memory(GiB)": 726.27, "step": 24035, "train_speed(iter/s)": 0.903497 }, { "acc": 0.76303506, "epoch": 0.6098436863325823, "grad_norm": 3.671875, "learning_rate": 8.326009793505882e-06, "loss": 0.92441845, "memory(GiB)": 726.27, "step": 24040, "train_speed(iter/s)": 0.901585 }, { "acc": 0.7655817, "epoch": 0.6099705257016199, "grad_norm": 4.21875, "learning_rate": 8.325226730446252e-06, "loss": 0.88859892, "memory(GiB)": 726.27, "step": 24045, "train_speed(iter/s)": 0.899307 }, { "acc": 0.76980433, "epoch": 0.6100973650706575, "grad_norm": 3.734375, "learning_rate": 8.324443521120494e-06, "loss": 0.84063244, "memory(GiB)": 726.27, "step": 24050, "train_speed(iter/s)": 0.897018 }, { "acc": 0.75692558, "epoch": 0.610224204439695, "grad_norm": 3.65625, "learning_rate": 8.323660165563054e-06, "loss": 0.86615524, "memory(GiB)": 726.27, "step": 24055, "train_speed(iter/s)": 0.894989 }, { "acc": 0.75586424, "epoch": 0.6103510438087326, "grad_norm": 3.21875, "learning_rate": 8.322876663808397e-06, "loss": 0.92093182, "memory(GiB)": 726.27, "step": 24060, "train_speed(iter/s)": 0.892707 }, { "acc": 0.75430398, "epoch": 0.6104778831777702, "grad_norm": 3.796875, "learning_rate": 8.322093015890979e-06, "loss": 0.91844759, "memory(GiB)": 726.27, "step": 24065, "train_speed(iter/s)": 0.890935 }, { "acc": 0.77120376, "epoch": 0.6106047225468076, "grad_norm": 3.90625, "learning_rate": 8.321309221845278e-06, "loss": 0.90491419, "memory(GiB)": 726.27, "step": 24070, "train_speed(iter/s)": 0.888754 }, { "acc": 0.77036967, "epoch": 0.6107315619158452, "grad_norm": 3.265625, "learning_rate": 8.320525281705765e-06, "loss": 0.90060358, "memory(GiB)": 726.27, "step": 24075, "train_speed(iter/s)": 0.88687 }, { "acc": 0.75576053, "epoch": 0.6108584012848828, "grad_norm": 3.109375, "learning_rate": 8.319741195506925e-06, "loss": 0.98737001, "memory(GiB)": 726.27, "step": 24080, "train_speed(iter/s)": 0.884681 }, { "acc": 0.7585113, "epoch": 0.6109852406539203, "grad_norm": 3.578125, "learning_rate": 8.318956963283246e-06, "loss": 0.93369732, "memory(GiB)": 726.27, "step": 24085, "train_speed(iter/s)": 0.88266 }, { "acc": 0.76510291, "epoch": 0.6111120800229579, "grad_norm": 3.640625, "learning_rate": 8.318172585069227e-06, "loss": 0.93919973, "memory(GiB)": 726.27, "step": 24090, "train_speed(iter/s)": 0.880832 }, { "acc": 0.74736662, "epoch": 0.6112389193919955, "grad_norm": 3.4375, "learning_rate": 8.317388060899367e-06, "loss": 0.98371363, "memory(GiB)": 726.27, "step": 24095, "train_speed(iter/s)": 0.879009 }, { "acc": 0.77685385, "epoch": 0.611365758761033, "grad_norm": 3.328125, "learning_rate": 8.316603390808179e-06, "loss": 0.83945541, "memory(GiB)": 726.27, "step": 24100, "train_speed(iter/s)": 0.877068 }, { "acc": 0.7757515, "epoch": 0.6114925981300706, "grad_norm": 3.046875, "learning_rate": 8.31581857483017e-06, "loss": 0.9003891, "memory(GiB)": 726.27, "step": 24105, "train_speed(iter/s)": 0.875208 }, { "acc": 0.77499261, "epoch": 0.6116194374991082, "grad_norm": 3.34375, "learning_rate": 8.31503361299987e-06, "loss": 0.88947477, "memory(GiB)": 726.27, "step": 24110, "train_speed(iter/s)": 0.873435 }, { "acc": 0.76837759, "epoch": 0.6117462768681458, "grad_norm": 3.984375, "learning_rate": 8.314248505351803e-06, "loss": 0.86806488, "memory(GiB)": 726.27, "step": 24115, "train_speed(iter/s)": 0.871516 }, { "acc": 0.77369938, "epoch": 0.6118731162371833, "grad_norm": 3.609375, "learning_rate": 8.313463251920504e-06, "loss": 0.91531687, "memory(GiB)": 726.27, "step": 24120, "train_speed(iter/s)": 0.869731 }, { "acc": 0.75824561, "epoch": 0.6119999556062209, "grad_norm": 4.53125, "learning_rate": 8.312677852740514e-06, "loss": 0.93205681, "memory(GiB)": 726.27, "step": 24125, "train_speed(iter/s)": 0.867827 }, { "acc": 0.75998549, "epoch": 0.6121267949752583, "grad_norm": 4.15625, "learning_rate": 8.311892307846382e-06, "loss": 0.94196596, "memory(GiB)": 726.27, "step": 24130, "train_speed(iter/s)": 0.866142 }, { "acc": 0.76570439, "epoch": 0.6122536343442959, "grad_norm": 3.4375, "learning_rate": 8.311106617272657e-06, "loss": 0.92452269, "memory(GiB)": 726.27, "step": 24135, "train_speed(iter/s)": 0.864333 }, { "acc": 0.77364206, "epoch": 0.6123804737133335, "grad_norm": 2.90625, "learning_rate": 8.310320781053902e-06, "loss": 0.92300911, "memory(GiB)": 726.27, "step": 24140, "train_speed(iter/s)": 0.862542 }, { "acc": 0.75792804, "epoch": 0.612507313082371, "grad_norm": 2.984375, "learning_rate": 8.309534799224683e-06, "loss": 0.94657555, "memory(GiB)": 726.27, "step": 24145, "train_speed(iter/s)": 0.860468 }, { "acc": 0.77898426, "epoch": 0.6126341524514086, "grad_norm": 3.453125, "learning_rate": 8.308748671819574e-06, "loss": 0.90933456, "memory(GiB)": 726.27, "step": 24150, "train_speed(iter/s)": 0.858567 }, { "acc": 0.75543985, "epoch": 0.6127609918204462, "grad_norm": 4.75, "learning_rate": 8.307962398873153e-06, "loss": 0.92867069, "memory(GiB)": 726.27, "step": 24155, "train_speed(iter/s)": 0.856723 }, { "acc": 0.77366037, "epoch": 0.6128878311894838, "grad_norm": 4.25, "learning_rate": 8.307175980420005e-06, "loss": 0.91715755, "memory(GiB)": 726.27, "step": 24160, "train_speed(iter/s)": 0.854797 }, { "acc": 0.771771, "epoch": 0.6130146705585213, "grad_norm": 3.109375, "learning_rate": 8.306389416494725e-06, "loss": 0.84921684, "memory(GiB)": 726.27, "step": 24165, "train_speed(iter/s)": 0.853006 }, { "acc": 0.77834291, "epoch": 0.6131415099275589, "grad_norm": 3.6875, "learning_rate": 8.305602707131908e-06, "loss": 0.87028875, "memory(GiB)": 726.27, "step": 24170, "train_speed(iter/s)": 0.851235 }, { "acc": 0.76772113, "epoch": 0.6132683492965965, "grad_norm": 3.328125, "learning_rate": 8.304815852366161e-06, "loss": 0.89818735, "memory(GiB)": 726.27, "step": 24175, "train_speed(iter/s)": 0.849442 }, { "acc": 0.76001468, "epoch": 0.613395188665634, "grad_norm": 3.25, "learning_rate": 8.304028852232093e-06, "loss": 0.92085457, "memory(GiB)": 726.27, "step": 24180, "train_speed(iter/s)": 0.847667 }, { "acc": 0.76426358, "epoch": 0.6135220280346716, "grad_norm": 3.25, "learning_rate": 8.303241706764324e-06, "loss": 0.89938936, "memory(GiB)": 726.27, "step": 24185, "train_speed(iter/s)": 0.845931 }, { "acc": 0.77219305, "epoch": 0.613648867403709, "grad_norm": 3.203125, "learning_rate": 8.302454415997477e-06, "loss": 0.89728889, "memory(GiB)": 726.27, "step": 24190, "train_speed(iter/s)": 0.844256 }, { "acc": 0.74926019, "epoch": 0.6137757067727466, "grad_norm": 3.25, "learning_rate": 8.301666979966184e-06, "loss": 0.92090816, "memory(GiB)": 726.27, "step": 24195, "train_speed(iter/s)": 0.842345 }, { "acc": 0.76399226, "epoch": 0.6139025461417842, "grad_norm": 3.625, "learning_rate": 8.300879398705077e-06, "loss": 0.92097616, "memory(GiB)": 726.27, "step": 24200, "train_speed(iter/s)": 0.84066 }, { "acc": 0.75403295, "epoch": 0.6140293855108218, "grad_norm": 4.03125, "learning_rate": 8.300091672248804e-06, "loss": 0.89456139, "memory(GiB)": 726.27, "step": 24205, "train_speed(iter/s)": 0.838732 }, { "acc": 0.76439109, "epoch": 0.6141562248798593, "grad_norm": 3.484375, "learning_rate": 8.299303800632013e-06, "loss": 0.91780167, "memory(GiB)": 726.27, "step": 24210, "train_speed(iter/s)": 0.837054 }, { "acc": 0.76461568, "epoch": 0.6142830642488969, "grad_norm": 2.96875, "learning_rate": 8.29851578388936e-06, "loss": 0.90914373, "memory(GiB)": 726.27, "step": 24215, "train_speed(iter/s)": 0.835245 }, { "acc": 0.75962143, "epoch": 0.6144099036179345, "grad_norm": 2.84375, "learning_rate": 8.297727622055508e-06, "loss": 0.93415918, "memory(GiB)": 726.27, "step": 24220, "train_speed(iter/s)": 0.833365 }, { "acc": 0.76467538, "epoch": 0.614536742986972, "grad_norm": 3.640625, "learning_rate": 8.296939315165125e-06, "loss": 0.90845823, "memory(GiB)": 726.27, "step": 24225, "train_speed(iter/s)": 0.831699 }, { "acc": 0.77383809, "epoch": 0.6146635823560096, "grad_norm": 4.21875, "learning_rate": 8.296150863252886e-06, "loss": 0.91881456, "memory(GiB)": 726.27, "step": 24230, "train_speed(iter/s)": 0.829871 }, { "acc": 0.76584249, "epoch": 0.6147904217250472, "grad_norm": 3.421875, "learning_rate": 8.295362266353472e-06, "loss": 0.87425928, "memory(GiB)": 726.27, "step": 24235, "train_speed(iter/s)": 0.828187 }, { "acc": 0.75652113, "epoch": 0.6149172610940847, "grad_norm": 3.03125, "learning_rate": 8.294573524501574e-06, "loss": 0.96153297, "memory(GiB)": 726.27, "step": 24240, "train_speed(iter/s)": 0.826378 }, { "acc": 0.74727087, "epoch": 0.6150441004631223, "grad_norm": 3.765625, "learning_rate": 8.293784637731882e-06, "loss": 0.94087639, "memory(GiB)": 726.27, "step": 24245, "train_speed(iter/s)": 0.82453 }, { "acc": 0.76635184, "epoch": 0.6151709398321598, "grad_norm": 3.796875, "learning_rate": 8.292995606079099e-06, "loss": 0.86356716, "memory(GiB)": 726.27, "step": 24250, "train_speed(iter/s)": 0.822862 }, { "acc": 0.77387266, "epoch": 0.6152977792011973, "grad_norm": 3.546875, "learning_rate": 8.292206429577931e-06, "loss": 0.896033, "memory(GiB)": 726.27, "step": 24255, "train_speed(iter/s)": 0.821076 }, { "acc": 0.75062165, "epoch": 0.6154246185702349, "grad_norm": 3.609375, "learning_rate": 8.29141710826309e-06, "loss": 0.94569092, "memory(GiB)": 726.27, "step": 24260, "train_speed(iter/s)": 0.819411 }, { "acc": 0.7709404, "epoch": 0.6155514579392725, "grad_norm": 3.359375, "learning_rate": 8.2906276421693e-06, "loss": 0.9092248, "memory(GiB)": 726.27, "step": 24265, "train_speed(iter/s)": 0.817728 }, { "acc": 0.75641398, "epoch": 0.61567829730831, "grad_norm": 3.171875, "learning_rate": 8.289838031331285e-06, "loss": 0.91280937, "memory(GiB)": 726.27, "step": 24270, "train_speed(iter/s)": 0.815874 }, { "acc": 0.75539031, "epoch": 0.6158051366773476, "grad_norm": 3.609375, "learning_rate": 8.289048275783776e-06, "loss": 0.97773838, "memory(GiB)": 726.27, "step": 24275, "train_speed(iter/s)": 0.8141 }, { "acc": 0.77260985, "epoch": 0.6159319760463852, "grad_norm": 4.15625, "learning_rate": 8.288258375561512e-06, "loss": 0.87568932, "memory(GiB)": 726.27, "step": 24280, "train_speed(iter/s)": 0.812389 }, { "acc": 0.76615114, "epoch": 0.6160588154154227, "grad_norm": 3.328125, "learning_rate": 8.28746833069924e-06, "loss": 0.90299654, "memory(GiB)": 726.27, "step": 24285, "train_speed(iter/s)": 0.810678 }, { "acc": 0.75135522, "epoch": 0.6161856547844603, "grad_norm": 3.40625, "learning_rate": 8.286678141231712e-06, "loss": 0.9413908, "memory(GiB)": 726.27, "step": 24290, "train_speed(iter/s)": 0.80907 }, { "acc": 0.74641647, "epoch": 0.6163124941534979, "grad_norm": 3.875, "learning_rate": 8.285887807193682e-06, "loss": 0.94338427, "memory(GiB)": 726.27, "step": 24295, "train_speed(iter/s)": 0.807412 }, { "acc": 0.76181464, "epoch": 0.6164393335225354, "grad_norm": 4.75, "learning_rate": 8.285097328619918e-06, "loss": 0.91087685, "memory(GiB)": 726.27, "step": 24300, "train_speed(iter/s)": 0.805675 }, { "acc": 0.75561128, "epoch": 0.616566172891573, "grad_norm": 4.0625, "learning_rate": 8.284306705545188e-06, "loss": 0.87784834, "memory(GiB)": 726.27, "step": 24305, "train_speed(iter/s)": 0.803894 }, { "acc": 0.75815973, "epoch": 0.6166930122606105, "grad_norm": 3.71875, "learning_rate": 8.283515938004273e-06, "loss": 0.95340567, "memory(GiB)": 726.27, "step": 24310, "train_speed(iter/s)": 0.802316 }, { "acc": 0.76120229, "epoch": 0.616819851629648, "grad_norm": 3.671875, "learning_rate": 8.282725026031953e-06, "loss": 0.89820137, "memory(GiB)": 726.27, "step": 24315, "train_speed(iter/s)": 0.800781 }, { "acc": 0.7645452, "epoch": 0.6169466909986856, "grad_norm": 3.828125, "learning_rate": 8.281933969663018e-06, "loss": 0.95597544, "memory(GiB)": 726.27, "step": 24320, "train_speed(iter/s)": 0.799258 }, { "acc": 0.77363009, "epoch": 0.6170735303677232, "grad_norm": 3.28125, "learning_rate": 8.281142768932265e-06, "loss": 0.86997271, "memory(GiB)": 726.27, "step": 24325, "train_speed(iter/s)": 0.797723 }, { "acc": 0.77139955, "epoch": 0.6172003697367607, "grad_norm": 4.59375, "learning_rate": 8.280351423874495e-06, "loss": 0.86339331, "memory(GiB)": 726.27, "step": 24330, "train_speed(iter/s)": 0.796137 }, { "acc": 0.76675286, "epoch": 0.6173272091057983, "grad_norm": 3.484375, "learning_rate": 8.27955993452452e-06, "loss": 0.84538755, "memory(GiB)": 726.27, "step": 24335, "train_speed(iter/s)": 0.794602 }, { "acc": 0.77411242, "epoch": 0.6174540484748359, "grad_norm": 3.515625, "learning_rate": 8.27876830091715e-06, "loss": 0.87950487, "memory(GiB)": 726.27, "step": 24340, "train_speed(iter/s)": 0.793026 }, { "acc": 0.76753149, "epoch": 0.6175808878438734, "grad_norm": 3.40625, "learning_rate": 8.27797652308721e-06, "loss": 0.85233812, "memory(GiB)": 726.27, "step": 24345, "train_speed(iter/s)": 0.791514 }, { "acc": 0.75454087, "epoch": 0.617707727212911, "grad_norm": 3.546875, "learning_rate": 8.277184601069528e-06, "loss": 0.92972269, "memory(GiB)": 726.27, "step": 24350, "train_speed(iter/s)": 0.789906 }, { "acc": 0.78398795, "epoch": 0.6178345665819486, "grad_norm": 3.28125, "learning_rate": 8.276392534898935e-06, "loss": 0.89626513, "memory(GiB)": 726.27, "step": 24355, "train_speed(iter/s)": 0.788402 }, { "acc": 0.76568789, "epoch": 0.6179614059509861, "grad_norm": 5.65625, "learning_rate": 8.275600324610275e-06, "loss": 0.89461803, "memory(GiB)": 726.27, "step": 24360, "train_speed(iter/s)": 0.786734 }, { "acc": 0.76915641, "epoch": 0.6180882453200237, "grad_norm": 2.984375, "learning_rate": 8.274807970238394e-06, "loss": 0.8790123, "memory(GiB)": 726.27, "step": 24365, "train_speed(iter/s)": 0.785152 }, { "acc": 0.7656714, "epoch": 0.6182150846890612, "grad_norm": 3.53125, "learning_rate": 8.274015471818145e-06, "loss": 0.93666925, "memory(GiB)": 726.27, "step": 24370, "train_speed(iter/s)": 0.783602 }, { "acc": 0.77596607, "epoch": 0.6183419240580987, "grad_norm": 3.890625, "learning_rate": 8.273222829384386e-06, "loss": 0.87190876, "memory(GiB)": 726.27, "step": 24375, "train_speed(iter/s)": 0.782053 }, { "acc": 0.76575165, "epoch": 0.6184687634271363, "grad_norm": 2.921875, "learning_rate": 8.272430042971983e-06, "loss": 0.93895454, "memory(GiB)": 726.27, "step": 24380, "train_speed(iter/s)": 0.780569 }, { "acc": 0.77512302, "epoch": 0.6185956027961739, "grad_norm": 3.609375, "learning_rate": 8.271637112615812e-06, "loss": 0.89032869, "memory(GiB)": 726.27, "step": 24385, "train_speed(iter/s)": 0.779204 }, { "acc": 0.74819717, "epoch": 0.6187224421652114, "grad_norm": 3.515625, "learning_rate": 8.270844038350746e-06, "loss": 0.93500862, "memory(GiB)": 726.27, "step": 24390, "train_speed(iter/s)": 0.777722 }, { "acc": 0.75884304, "epoch": 0.618849281534249, "grad_norm": 3.234375, "learning_rate": 8.270050820211673e-06, "loss": 0.91498089, "memory(GiB)": 726.27, "step": 24395, "train_speed(iter/s)": 0.776298 }, { "acc": 0.76709042, "epoch": 0.6189761209032866, "grad_norm": 2.953125, "learning_rate": 8.269257458233484e-06, "loss": 0.90587635, "memory(GiB)": 726.27, "step": 24400, "train_speed(iter/s)": 0.774723 }, { "acc": 0.7640913, "epoch": 0.6191029602723241, "grad_norm": 3.515625, "learning_rate": 8.268463952451074e-06, "loss": 0.94235411, "memory(GiB)": 726.27, "step": 24405, "train_speed(iter/s)": 0.773201 }, { "acc": 0.77202005, "epoch": 0.6192297996413617, "grad_norm": 3.25, "learning_rate": 8.267670302899351e-06, "loss": 0.8839962, "memory(GiB)": 726.27, "step": 24410, "train_speed(iter/s)": 0.771744 }, { "acc": 0.76369901, "epoch": 0.6193566390103993, "grad_norm": 3.9375, "learning_rate": 8.266876509613222e-06, "loss": 0.9046484, "memory(GiB)": 726.27, "step": 24415, "train_speed(iter/s)": 0.77027 }, { "acc": 0.78228412, "epoch": 0.6194834783794368, "grad_norm": 3.765625, "learning_rate": 8.266082572627603e-06, "loss": 0.88844967, "memory(GiB)": 726.27, "step": 24420, "train_speed(iter/s)": 0.768751 }, { "acc": 0.77094007, "epoch": 0.6196103177484744, "grad_norm": 3.515625, "learning_rate": 8.26528849197742e-06, "loss": 0.9102355, "memory(GiB)": 726.27, "step": 24425, "train_speed(iter/s)": 0.767463 }, { "acc": 0.773171, "epoch": 0.6197371571175119, "grad_norm": 3.5, "learning_rate": 8.264494267697598e-06, "loss": 0.90910349, "memory(GiB)": 726.27, "step": 24430, "train_speed(iter/s)": 0.76599 }, { "acc": 0.7773694, "epoch": 0.6198639964865494, "grad_norm": 3.515625, "learning_rate": 8.263699899823077e-06, "loss": 0.85529499, "memory(GiB)": 726.27, "step": 24435, "train_speed(iter/s)": 0.764606 }, { "acc": 0.77027178, "epoch": 0.619990835855587, "grad_norm": 4.09375, "learning_rate": 8.262905388388793e-06, "loss": 0.88875122, "memory(GiB)": 726.27, "step": 24440, "train_speed(iter/s)": 0.763164 }, { "acc": 0.75099206, "epoch": 0.6201176752246246, "grad_norm": 3.4375, "learning_rate": 8.2621107334297e-06, "loss": 0.97332067, "memory(GiB)": 726.27, "step": 24445, "train_speed(iter/s)": 0.761868 }, { "acc": 0.76075802, "epoch": 0.6202445145936621, "grad_norm": 3.8125, "learning_rate": 8.261315934980749e-06, "loss": 0.92089291, "memory(GiB)": 726.27, "step": 24450, "train_speed(iter/s)": 0.760415 }, { "acc": 0.7635098, "epoch": 0.6203713539626997, "grad_norm": 4.53125, "learning_rate": 8.260520993076899e-06, "loss": 0.93364096, "memory(GiB)": 726.27, "step": 24455, "train_speed(iter/s)": 0.759013 }, { "acc": 0.76665969, "epoch": 0.6204981933317373, "grad_norm": 3.421875, "learning_rate": 8.259725907753122e-06, "loss": 0.92628498, "memory(GiB)": 726.27, "step": 24460, "train_speed(iter/s)": 0.757557 }, { "acc": 0.76616483, "epoch": 0.6206250327007748, "grad_norm": 3.296875, "learning_rate": 8.258930679044388e-06, "loss": 0.89718027, "memory(GiB)": 726.27, "step": 24465, "train_speed(iter/s)": 0.755974 }, { "acc": 0.75683208, "epoch": 0.6207518720698124, "grad_norm": 3.78125, "learning_rate": 8.258135306985676e-06, "loss": 0.94253473, "memory(GiB)": 726.27, "step": 24470, "train_speed(iter/s)": 0.754699 }, { "acc": 0.77503023, "epoch": 0.62087871143885, "grad_norm": 3.078125, "learning_rate": 8.257339791611974e-06, "loss": 0.83887129, "memory(GiB)": 726.27, "step": 24475, "train_speed(iter/s)": 0.753381 }, { "acc": 0.75816159, "epoch": 0.6210055508078876, "grad_norm": 3.34375, "learning_rate": 8.25654413295827e-06, "loss": 0.9153554, "memory(GiB)": 726.27, "step": 24480, "train_speed(iter/s)": 0.751924 }, { "acc": 0.76843214, "epoch": 0.6211323901769251, "grad_norm": 3.6875, "learning_rate": 8.25574833105957e-06, "loss": 0.83628092, "memory(GiB)": 726.27, "step": 24485, "train_speed(iter/s)": 0.7505 }, { "acc": 0.77165761, "epoch": 0.6212592295459626, "grad_norm": 3.5, "learning_rate": 8.254952385950873e-06, "loss": 0.90170889, "memory(GiB)": 726.27, "step": 24490, "train_speed(iter/s)": 0.749127 }, { "acc": 0.75757904, "epoch": 0.6213860689150001, "grad_norm": 3.328125, "learning_rate": 8.254156297667189e-06, "loss": 0.89137764, "memory(GiB)": 726.27, "step": 24495, "train_speed(iter/s)": 0.747691 }, { "acc": 0.7632185, "epoch": 0.6215129082840377, "grad_norm": 6.53125, "learning_rate": 8.253360066243538e-06, "loss": 0.92420321, "memory(GiB)": 726.27, "step": 24500, "train_speed(iter/s)": 0.746264 }, { "epoch": 0.6215129082840377, "eval_acc": 0.7541656622844268, "eval_loss": 0.8741711378097534, "eval_runtime": 1146.9697, "eval_samples_per_second": 5.554, "eval_steps_per_second": 5.554, "step": 24500 }, { "acc": 0.75931902, "epoch": 0.6216397476530753, "grad_norm": 4.5, "learning_rate": 8.252563691714945e-06, "loss": 0.90183039, "memory(GiB)": 726.27, "step": 24505, "train_speed(iter/s)": 0.704887 }, { "acc": 0.74757028, "epoch": 0.6217665870221128, "grad_norm": 3.375, "learning_rate": 8.251767174116439e-06, "loss": 0.98290253, "memory(GiB)": 726.27, "step": 24510, "train_speed(iter/s)": 0.703632 }, { "acc": 0.75740204, "epoch": 0.6218934263911504, "grad_norm": 3.703125, "learning_rate": 8.250970513483056e-06, "loss": 0.91841259, "memory(GiB)": 726.27, "step": 24515, "train_speed(iter/s)": 0.702413 }, { "acc": 0.77094078, "epoch": 0.622020265760188, "grad_norm": 3.28125, "learning_rate": 8.250173709849838e-06, "loss": 0.88726702, "memory(GiB)": 726.27, "step": 24520, "train_speed(iter/s)": 0.701251 }, { "acc": 0.766117, "epoch": 0.6221471051292256, "grad_norm": 3.453125, "learning_rate": 8.249376763251833e-06, "loss": 0.92840862, "memory(GiB)": 726.27, "step": 24525, "train_speed(iter/s)": 0.70006 }, { "acc": 0.76057434, "epoch": 0.6222739444982631, "grad_norm": 3.609375, "learning_rate": 8.2485796737241e-06, "loss": 0.935427, "memory(GiB)": 726.27, "step": 24530, "train_speed(iter/s)": 0.698861 }, { "acc": 0.76690068, "epoch": 0.6224007838673007, "grad_norm": 4.0, "learning_rate": 8.247782441301696e-06, "loss": 0.90024576, "memory(GiB)": 726.27, "step": 24535, "train_speed(iter/s)": 0.697631 }, { "acc": 0.76026936, "epoch": 0.6225276232363383, "grad_norm": 3.875, "learning_rate": 8.246985066019689e-06, "loss": 0.90031662, "memory(GiB)": 726.27, "step": 24540, "train_speed(iter/s)": 0.696478 }, { "acc": 0.75270171, "epoch": 0.6226544626053758, "grad_norm": 3.203125, "learning_rate": 8.246187547913155e-06, "loss": 0.9214777, "memory(GiB)": 726.27, "step": 24545, "train_speed(iter/s)": 0.695369 }, { "acc": 0.77481089, "epoch": 0.6227813019744133, "grad_norm": 3.578125, "learning_rate": 8.245389887017177e-06, "loss": 0.84947653, "memory(GiB)": 726.27, "step": 24550, "train_speed(iter/s)": 0.694198 }, { "acc": 0.7721714, "epoch": 0.6229081413434508, "grad_norm": 3.421875, "learning_rate": 8.244592083366836e-06, "loss": 0.89551945, "memory(GiB)": 726.27, "step": 24555, "train_speed(iter/s)": 0.692979 }, { "acc": 0.75282583, "epoch": 0.6230349807124884, "grad_norm": 3.734375, "learning_rate": 8.243794136997226e-06, "loss": 0.96538696, "memory(GiB)": 726.27, "step": 24560, "train_speed(iter/s)": 0.691867 }, { "acc": 0.77188334, "epoch": 0.623161820081526, "grad_norm": 3.625, "learning_rate": 8.242996047943448e-06, "loss": 0.89905281, "memory(GiB)": 726.27, "step": 24565, "train_speed(iter/s)": 0.690727 }, { "acc": 0.76520419, "epoch": 0.6232886594505636, "grad_norm": 3.640625, "learning_rate": 8.242197816240607e-06, "loss": 0.92771521, "memory(GiB)": 726.27, "step": 24570, "train_speed(iter/s)": 0.689671 }, { "acc": 0.75874805, "epoch": 0.6234154988196011, "grad_norm": 4.0, "learning_rate": 8.241399441923813e-06, "loss": 0.88135824, "memory(GiB)": 726.27, "step": 24575, "train_speed(iter/s)": 0.688479 }, { "acc": 0.77769661, "epoch": 0.6235423381886387, "grad_norm": 3.25, "learning_rate": 8.240600925028189e-06, "loss": 0.89186964, "memory(GiB)": 726.27, "step": 24580, "train_speed(iter/s)": 0.687395 }, { "acc": 0.76214676, "epoch": 0.6236691775576763, "grad_norm": 4.03125, "learning_rate": 8.239802265588853e-06, "loss": 0.91329708, "memory(GiB)": 726.27, "step": 24585, "train_speed(iter/s)": 0.686157 }, { "acc": 0.75752268, "epoch": 0.6237960169267138, "grad_norm": 3.671875, "learning_rate": 8.239003463640938e-06, "loss": 0.93475685, "memory(GiB)": 726.27, "step": 24590, "train_speed(iter/s)": 0.685103 }, { "acc": 0.75641575, "epoch": 0.6239228562957514, "grad_norm": 3.546875, "learning_rate": 8.238204519219582e-06, "loss": 0.91853323, "memory(GiB)": 726.27, "step": 24595, "train_speed(iter/s)": 0.684023 }, { "acc": 0.77499108, "epoch": 0.624049695664789, "grad_norm": 3.28125, "learning_rate": 8.237405432359923e-06, "loss": 0.89348526, "memory(GiB)": 726.27, "step": 24600, "train_speed(iter/s)": 0.682925 }, { "acc": 0.74449329, "epoch": 0.6241765350338265, "grad_norm": 3.671875, "learning_rate": 8.236606203097118e-06, "loss": 1.00193853, "memory(GiB)": 726.27, "step": 24605, "train_speed(iter/s)": 0.681797 }, { "acc": 0.77707357, "epoch": 0.624303374402864, "grad_norm": 3.515625, "learning_rate": 8.235806831466318e-06, "loss": 0.86078091, "memory(GiB)": 726.27, "step": 24610, "train_speed(iter/s)": 0.680444 }, { "acc": 0.76299133, "epoch": 0.6244302137719016, "grad_norm": 3.609375, "learning_rate": 8.235007317502683e-06, "loss": 0.90621529, "memory(GiB)": 726.27, "step": 24615, "train_speed(iter/s)": 0.679265 }, { "acc": 0.7751986, "epoch": 0.6245570531409391, "grad_norm": 3.765625, "learning_rate": 8.234207661241387e-06, "loss": 0.89199905, "memory(GiB)": 726.27, "step": 24620, "train_speed(iter/s)": 0.678167 }, { "acc": 0.74046888, "epoch": 0.6246838925099767, "grad_norm": 3.8125, "learning_rate": 8.233407862717598e-06, "loss": 0.96517172, "memory(GiB)": 726.27, "step": 24625, "train_speed(iter/s)": 0.677089 }, { "acc": 0.76910009, "epoch": 0.6248107318790143, "grad_norm": 3.453125, "learning_rate": 8.2326079219665e-06, "loss": 0.8577219, "memory(GiB)": 726.27, "step": 24630, "train_speed(iter/s)": 0.675906 }, { "acc": 0.76082788, "epoch": 0.6249375712480518, "grad_norm": 3.421875, "learning_rate": 8.23180783902328e-06, "loss": 0.92981806, "memory(GiB)": 726.27, "step": 24635, "train_speed(iter/s)": 0.674752 }, { "acc": 0.76322718, "epoch": 0.6250644106170894, "grad_norm": 3.265625, "learning_rate": 8.23100761392313e-06, "loss": 0.93171082, "memory(GiB)": 726.27, "step": 24640, "train_speed(iter/s)": 0.673643 }, { "acc": 0.78372498, "epoch": 0.625191249986127, "grad_norm": 3.421875, "learning_rate": 8.230207246701248e-06, "loss": 0.86918526, "memory(GiB)": 726.27, "step": 24645, "train_speed(iter/s)": 0.672608 }, { "acc": 0.76660538, "epoch": 0.6253180893551645, "grad_norm": 3.375, "learning_rate": 8.229406737392843e-06, "loss": 0.84768019, "memory(GiB)": 726.27, "step": 24650, "train_speed(iter/s)": 0.671619 }, { "acc": 0.7650619, "epoch": 0.6254449287242021, "grad_norm": 4.90625, "learning_rate": 8.228606086033125e-06, "loss": 0.94867182, "memory(GiB)": 726.27, "step": 24655, "train_speed(iter/s)": 0.67058 }, { "acc": 0.776194, "epoch": 0.6255717680932397, "grad_norm": 3.453125, "learning_rate": 8.227805292657313e-06, "loss": 0.85319967, "memory(GiB)": 726.27, "step": 24660, "train_speed(iter/s)": 0.669499 }, { "acc": 0.76429691, "epoch": 0.6256986074622772, "grad_norm": 2.859375, "learning_rate": 8.22700435730063e-06, "loss": 0.90169458, "memory(GiB)": 726.27, "step": 24665, "train_speed(iter/s)": 0.66847 }, { "acc": 0.76330791, "epoch": 0.6258254468313147, "grad_norm": 3.65625, "learning_rate": 8.226203279998305e-06, "loss": 0.93614988, "memory(GiB)": 726.27, "step": 24670, "train_speed(iter/s)": 0.667363 }, { "acc": 0.78145146, "epoch": 0.6259522862003523, "grad_norm": 3.546875, "learning_rate": 8.225402060785581e-06, "loss": 0.86237764, "memory(GiB)": 726.27, "step": 24675, "train_speed(iter/s)": 0.666312 }, { "acc": 0.76499386, "epoch": 0.6260791255693898, "grad_norm": 3.671875, "learning_rate": 8.224600699697694e-06, "loss": 0.91361246, "memory(GiB)": 726.27, "step": 24680, "train_speed(iter/s)": 0.665258 }, { "acc": 0.78514972, "epoch": 0.6262059649384274, "grad_norm": 3.265625, "learning_rate": 8.223799196769899e-06, "loss": 0.84212809, "memory(GiB)": 726.27, "step": 24685, "train_speed(iter/s)": 0.664206 }, { "acc": 0.77667308, "epoch": 0.626332804307465, "grad_norm": 3.203125, "learning_rate": 8.222997552037446e-06, "loss": 0.85469751, "memory(GiB)": 726.27, "step": 24690, "train_speed(iter/s)": 0.66319 }, { "acc": 0.78059154, "epoch": 0.6264596436765025, "grad_norm": 3.34375, "learning_rate": 8.222195765535601e-06, "loss": 0.83119469, "memory(GiB)": 726.27, "step": 24695, "train_speed(iter/s)": 0.662192 }, { "acc": 0.76695518, "epoch": 0.6265864830455401, "grad_norm": 3.078125, "learning_rate": 8.221393837299632e-06, "loss": 0.87413864, "memory(GiB)": 726.27, "step": 24700, "train_speed(iter/s)": 0.661215 }, { "acc": 0.76460061, "epoch": 0.6267133224145777, "grad_norm": 3.4375, "learning_rate": 8.220591767364812e-06, "loss": 0.89880819, "memory(GiB)": 726.27, "step": 24705, "train_speed(iter/s)": 0.660097 }, { "acc": 0.75817213, "epoch": 0.6268401617836152, "grad_norm": 3.3125, "learning_rate": 8.21978955576642e-06, "loss": 0.89218874, "memory(GiB)": 726.27, "step": 24710, "train_speed(iter/s)": 0.658913 }, { "acc": 0.75303698, "epoch": 0.6269670011526528, "grad_norm": 3.234375, "learning_rate": 8.218987202539745e-06, "loss": 0.92448997, "memory(GiB)": 726.27, "step": 24715, "train_speed(iter/s)": 0.657801 }, { "acc": 0.74947305, "epoch": 0.6270938405216904, "grad_norm": 3.265625, "learning_rate": 8.21818470772008e-06, "loss": 0.91398296, "memory(GiB)": 726.27, "step": 24720, "train_speed(iter/s)": 0.656664 }, { "acc": 0.77185044, "epoch": 0.6272206798907279, "grad_norm": 3.28125, "learning_rate": 8.217382071342722e-06, "loss": 0.86791248, "memory(GiB)": 726.27, "step": 24725, "train_speed(iter/s)": 0.655709 }, { "acc": 0.76427131, "epoch": 0.6273475192597654, "grad_norm": 3.15625, "learning_rate": 8.216579293442978e-06, "loss": 0.86070013, "memory(GiB)": 726.27, "step": 24730, "train_speed(iter/s)": 0.654704 }, { "acc": 0.76657486, "epoch": 0.627474358628803, "grad_norm": 3.9375, "learning_rate": 8.215776374056161e-06, "loss": 0.95524693, "memory(GiB)": 726.27, "step": 24735, "train_speed(iter/s)": 0.653716 }, { "acc": 0.75341163, "epoch": 0.6276011979978405, "grad_norm": 3.5625, "learning_rate": 8.214973313217586e-06, "loss": 0.93070517, "memory(GiB)": 726.27, "step": 24740, "train_speed(iter/s)": 0.652659 }, { "acc": 0.76719441, "epoch": 0.6277280373668781, "grad_norm": 3.359375, "learning_rate": 8.214170110962581e-06, "loss": 0.92376108, "memory(GiB)": 726.27, "step": 24745, "train_speed(iter/s)": 0.651642 }, { "acc": 0.75872192, "epoch": 0.6278548767359157, "grad_norm": 4.0, "learning_rate": 8.213366767326471e-06, "loss": 0.89699516, "memory(GiB)": 726.27, "step": 24750, "train_speed(iter/s)": 0.650637 }, { "acc": 0.77095876, "epoch": 0.6279817161049532, "grad_norm": 2.8125, "learning_rate": 8.212563282344596e-06, "loss": 0.87408791, "memory(GiB)": 726.27, "step": 24755, "train_speed(iter/s)": 0.649604 }, { "acc": 0.76439486, "epoch": 0.6281085554739908, "grad_norm": 4.3125, "learning_rate": 8.211759656052298e-06, "loss": 0.88773251, "memory(GiB)": 726.27, "step": 24760, "train_speed(iter/s)": 0.648702 }, { "acc": 0.75725851, "epoch": 0.6282353948430284, "grad_norm": 3.34375, "learning_rate": 8.210955888484927e-06, "loss": 0.88101444, "memory(GiB)": 726.27, "step": 24765, "train_speed(iter/s)": 0.647694 }, { "acc": 0.75986538, "epoch": 0.6283622342120659, "grad_norm": 3.59375, "learning_rate": 8.210151979677837e-06, "loss": 0.88921728, "memory(GiB)": 726.27, "step": 24770, "train_speed(iter/s)": 0.64662 }, { "acc": 0.77062893, "epoch": 0.6284890735811035, "grad_norm": 5.34375, "learning_rate": 8.20934792966639e-06, "loss": 0.93024712, "memory(GiB)": 726.27, "step": 24775, "train_speed(iter/s)": 0.645656 }, { "acc": 0.7652844, "epoch": 0.6286159129501411, "grad_norm": 3.609375, "learning_rate": 8.20854373848595e-06, "loss": 0.89110498, "memory(GiB)": 726.27, "step": 24780, "train_speed(iter/s)": 0.644623 }, { "acc": 0.76459837, "epoch": 0.6287427523191785, "grad_norm": 3.5625, "learning_rate": 8.207739406171898e-06, "loss": 0.91496315, "memory(GiB)": 726.27, "step": 24785, "train_speed(iter/s)": 0.643586 }, { "acc": 0.77439551, "epoch": 0.6288695916882161, "grad_norm": 2.796875, "learning_rate": 8.206934932759608e-06, "loss": 0.88971415, "memory(GiB)": 726.27, "step": 24790, "train_speed(iter/s)": 0.642635 }, { "acc": 0.76328583, "epoch": 0.6289964310572537, "grad_norm": 3.90625, "learning_rate": 8.206130318284466e-06, "loss": 0.91135693, "memory(GiB)": 726.27, "step": 24795, "train_speed(iter/s)": 0.641685 }, { "acc": 0.76791515, "epoch": 0.6291232704262912, "grad_norm": 3.453125, "learning_rate": 8.20532556278187e-06, "loss": 1.0289834, "memory(GiB)": 726.27, "step": 24800, "train_speed(iter/s)": 0.640634 }, { "acc": 0.75742421, "epoch": 0.6292501097953288, "grad_norm": 3.265625, "learning_rate": 8.204520666287215e-06, "loss": 0.91219883, "memory(GiB)": 726.27, "step": 24805, "train_speed(iter/s)": 0.639642 }, { "acc": 0.75579262, "epoch": 0.6293769491643664, "grad_norm": 3.375, "learning_rate": 8.203715628835905e-06, "loss": 0.91117954, "memory(GiB)": 726.27, "step": 24810, "train_speed(iter/s)": 0.638657 }, { "acc": 0.76448455, "epoch": 0.6295037885334039, "grad_norm": 3.28125, "learning_rate": 8.202910450463353e-06, "loss": 0.93192425, "memory(GiB)": 726.27, "step": 24815, "train_speed(iter/s)": 0.637742 }, { "acc": 0.77107873, "epoch": 0.6296306279024415, "grad_norm": 3.328125, "learning_rate": 8.202105131204974e-06, "loss": 0.90260391, "memory(GiB)": 726.27, "step": 24820, "train_speed(iter/s)": 0.636741 }, { "acc": 0.75652699, "epoch": 0.6297574672714791, "grad_norm": 3.21875, "learning_rate": 8.201299671096194e-06, "loss": 0.90760012, "memory(GiB)": 726.27, "step": 24825, "train_speed(iter/s)": 0.635615 }, { "acc": 0.74377818, "epoch": 0.6298843066405166, "grad_norm": 3.921875, "learning_rate": 8.200494070172442e-06, "loss": 0.9297349, "memory(GiB)": 726.27, "step": 24830, "train_speed(iter/s)": 0.634647 }, { "acc": 0.76047568, "epoch": 0.6300111460095542, "grad_norm": 3.765625, "learning_rate": 8.199688328469153e-06, "loss": 0.87095747, "memory(GiB)": 726.27, "step": 24835, "train_speed(iter/s)": 0.633648 }, { "acc": 0.75451913, "epoch": 0.6301379853785918, "grad_norm": 3.859375, "learning_rate": 8.19888244602177e-06, "loss": 0.93462372, "memory(GiB)": 726.27, "step": 24840, "train_speed(iter/s)": 0.632787 }, { "acc": 0.75503607, "epoch": 0.6302648247476292, "grad_norm": 3.3125, "learning_rate": 8.198076422865737e-06, "loss": 0.94445868, "memory(GiB)": 726.27, "step": 24845, "train_speed(iter/s)": 0.631832 }, { "acc": 0.76986203, "epoch": 0.6303916641166668, "grad_norm": 3.859375, "learning_rate": 8.197270259036516e-06, "loss": 0.90629988, "memory(GiB)": 726.27, "step": 24850, "train_speed(iter/s)": 0.630983 }, { "acc": 0.76134596, "epoch": 0.6305185034857044, "grad_norm": 3.078125, "learning_rate": 8.19646395456956e-06, "loss": 0.90841904, "memory(GiB)": 726.27, "step": 24855, "train_speed(iter/s)": 0.63002 }, { "acc": 0.76476784, "epoch": 0.6306453428547419, "grad_norm": 3.53125, "learning_rate": 8.19565750950034e-06, "loss": 0.90208149, "memory(GiB)": 726.27, "step": 24860, "train_speed(iter/s)": 0.629163 }, { "acc": 0.76185513, "epoch": 0.6307721822237795, "grad_norm": 3.328125, "learning_rate": 8.194850923864331e-06, "loss": 0.89824867, "memory(GiB)": 726.27, "step": 24865, "train_speed(iter/s)": 0.628228 }, { "acc": 0.75572915, "epoch": 0.6308990215928171, "grad_norm": 3.4375, "learning_rate": 8.194044197697007e-06, "loss": 0.89991379, "memory(GiB)": 726.27, "step": 24870, "train_speed(iter/s)": 0.627346 }, { "acc": 0.7711441, "epoch": 0.6310258609618546, "grad_norm": 4.03125, "learning_rate": 8.193237331033855e-06, "loss": 0.91900063, "memory(GiB)": 726.27, "step": 24875, "train_speed(iter/s)": 0.626411 }, { "acc": 0.76908946, "epoch": 0.6311527003308922, "grad_norm": 3.703125, "learning_rate": 8.192430323910368e-06, "loss": 0.93839684, "memory(GiB)": 726.27, "step": 24880, "train_speed(iter/s)": 0.625484 }, { "acc": 0.76548042, "epoch": 0.6312795396999298, "grad_norm": 3.796875, "learning_rate": 8.191623176362042e-06, "loss": 0.90808392, "memory(GiB)": 726.27, "step": 24885, "train_speed(iter/s)": 0.624643 }, { "acc": 0.76306467, "epoch": 0.6314063790689673, "grad_norm": 3.046875, "learning_rate": 8.190815888424383e-06, "loss": 0.90305901, "memory(GiB)": 726.27, "step": 24890, "train_speed(iter/s)": 0.623699 }, { "acc": 0.77663851, "epoch": 0.6315332184380049, "grad_norm": 19.625, "learning_rate": 8.1900084601329e-06, "loss": 0.85027857, "memory(GiB)": 726.27, "step": 24895, "train_speed(iter/s)": 0.622835 }, { "acc": 0.76044335, "epoch": 0.6316600578070425, "grad_norm": 2.796875, "learning_rate": 8.189200891523107e-06, "loss": 0.88728828, "memory(GiB)": 726.27, "step": 24900, "train_speed(iter/s)": 0.621948 }, { "acc": 0.75354648, "epoch": 0.6317868971760799, "grad_norm": 2.828125, "learning_rate": 8.188393182630529e-06, "loss": 0.93771935, "memory(GiB)": 726.27, "step": 24905, "train_speed(iter/s)": 0.620997 }, { "acc": 0.76941957, "epoch": 0.6319137365451175, "grad_norm": 3.484375, "learning_rate": 8.187585333490693e-06, "loss": 0.91706667, "memory(GiB)": 726.27, "step": 24910, "train_speed(iter/s)": 0.62006 }, { "acc": 0.76309791, "epoch": 0.6320405759141551, "grad_norm": 3.390625, "learning_rate": 8.186777344139134e-06, "loss": 0.88052111, "memory(GiB)": 726.27, "step": 24915, "train_speed(iter/s)": 0.61918 }, { "acc": 0.77021112, "epoch": 0.6321674152831926, "grad_norm": 4.1875, "learning_rate": 8.185969214611396e-06, "loss": 0.89382172, "memory(GiB)": 726.27, "step": 24920, "train_speed(iter/s)": 0.618212 }, { "acc": 0.77169251, "epoch": 0.6322942546522302, "grad_norm": 3.3125, "learning_rate": 8.185160944943022e-06, "loss": 0.90382004, "memory(GiB)": 726.27, "step": 24925, "train_speed(iter/s)": 0.617293 }, { "acc": 0.76453228, "epoch": 0.6324210940212678, "grad_norm": 3.75, "learning_rate": 8.184352535169566e-06, "loss": 0.88450079, "memory(GiB)": 726.27, "step": 24930, "train_speed(iter/s)": 0.616441 }, { "acc": 0.77673364, "epoch": 0.6325479333903054, "grad_norm": 3.21875, "learning_rate": 8.183543985326588e-06, "loss": 0.8877737, "memory(GiB)": 726.27, "step": 24935, "train_speed(iter/s)": 0.615536 }, { "acc": 0.76860576, "epoch": 0.6326747727593429, "grad_norm": 3.703125, "learning_rate": 8.182735295449654e-06, "loss": 0.89581823, "memory(GiB)": 726.27, "step": 24940, "train_speed(iter/s)": 0.614656 }, { "acc": 0.76185713, "epoch": 0.6328016121283805, "grad_norm": 4.25, "learning_rate": 8.181926465574334e-06, "loss": 0.94148111, "memory(GiB)": 726.27, "step": 24945, "train_speed(iter/s)": 0.613782 }, { "acc": 0.76379457, "epoch": 0.632928451497418, "grad_norm": 3.421875, "learning_rate": 8.181117495736209e-06, "loss": 0.91180334, "memory(GiB)": 726.27, "step": 24950, "train_speed(iter/s)": 0.612797 }, { "acc": 0.75998702, "epoch": 0.6330552908664556, "grad_norm": 3.703125, "learning_rate": 8.18030838597086e-06, "loss": 0.93537102, "memory(GiB)": 726.27, "step": 24955, "train_speed(iter/s)": 0.611918 }, { "acc": 0.76393347, "epoch": 0.6331821302354932, "grad_norm": 4.0625, "learning_rate": 8.179499136313877e-06, "loss": 0.90407553, "memory(GiB)": 726.27, "step": 24960, "train_speed(iter/s)": 0.61107 }, { "acc": 0.77310448, "epoch": 0.6333089696045306, "grad_norm": 3.609375, "learning_rate": 8.178689746800859e-06, "loss": 0.86047878, "memory(GiB)": 726.27, "step": 24965, "train_speed(iter/s)": 0.610257 }, { "acc": 0.76481452, "epoch": 0.6334358089735682, "grad_norm": 3.390625, "learning_rate": 8.177880217467405e-06, "loss": 0.9525526, "memory(GiB)": 726.27, "step": 24970, "train_speed(iter/s)": 0.609224 }, { "acc": 0.76261392, "epoch": 0.6335626483426058, "grad_norm": 3.203125, "learning_rate": 8.177070548349128e-06, "loss": 0.8977376, "memory(GiB)": 726.27, "step": 24975, "train_speed(iter/s)": 0.608416 }, { "acc": 0.75827985, "epoch": 0.6336894877116434, "grad_norm": 3.21875, "learning_rate": 8.176260739481639e-06, "loss": 0.94064226, "memory(GiB)": 726.27, "step": 24980, "train_speed(iter/s)": 0.607565 }, { "acc": 0.76957369, "epoch": 0.6338163270806809, "grad_norm": 3.46875, "learning_rate": 8.175450790900562e-06, "loss": 0.92800283, "memory(GiB)": 726.27, "step": 24985, "train_speed(iter/s)": 0.606655 }, { "acc": 0.78635902, "epoch": 0.6339431664497185, "grad_norm": 3.703125, "learning_rate": 8.17464070264152e-06, "loss": 0.86491299, "memory(GiB)": 726.27, "step": 24990, "train_speed(iter/s)": 0.605767 }, { "acc": 0.76146502, "epoch": 0.634070005818756, "grad_norm": 3.140625, "learning_rate": 8.173830474740149e-06, "loss": 0.91123295, "memory(GiB)": 726.27, "step": 24995, "train_speed(iter/s)": 0.604918 }, { "acc": 0.75238857, "epoch": 0.6341968451877936, "grad_norm": 4.125, "learning_rate": 8.173020107232088e-06, "loss": 0.97770214, "memory(GiB)": 726.27, "step": 25000, "train_speed(iter/s)": 0.604074 }, { "epoch": 0.6341968451877936, "eval_acc": 0.7544058784873648, "eval_loss": 0.8732805252075195, "eval_runtime": 1153.5043, "eval_samples_per_second": 5.522, "eval_steps_per_second": 5.522, "step": 25000 }, { "acc": 0.775702, "epoch": 0.6343236845568312, "grad_norm": 3.5, "learning_rate": 8.172209600152983e-06, "loss": 0.82436075, "memory(GiB)": 728.98, "step": 25005, "train_speed(iter/s)": 0.577029 }, { "acc": 0.76440468, "epoch": 0.6344505239258688, "grad_norm": 3.5, "learning_rate": 8.171398953538484e-06, "loss": 0.86734304, "memory(GiB)": 728.98, "step": 25010, "train_speed(iter/s)": 0.576296 }, { "acc": 0.7751195, "epoch": 0.6345773632949063, "grad_norm": 3.75, "learning_rate": 8.170588167424251e-06, "loss": 0.86520939, "memory(GiB)": 728.98, "step": 25015, "train_speed(iter/s)": 0.575495 }, { "acc": 0.76336517, "epoch": 0.6347042026639439, "grad_norm": 3.578125, "learning_rate": 8.169777241845946e-06, "loss": 0.87321682, "memory(GiB)": 728.98, "step": 25020, "train_speed(iter/s)": 0.574737 }, { "acc": 0.75722022, "epoch": 0.6348310420329814, "grad_norm": 3.078125, "learning_rate": 8.168966176839242e-06, "loss": 0.9230792, "memory(GiB)": 728.98, "step": 25025, "train_speed(iter/s)": 0.573985 }, { "acc": 0.76951771, "epoch": 0.6349578814020189, "grad_norm": 4.59375, "learning_rate": 8.168154972439809e-06, "loss": 0.88826551, "memory(GiB)": 728.98, "step": 25030, "train_speed(iter/s)": 0.57314 }, { "acc": 0.75559072, "epoch": 0.6350847207710565, "grad_norm": 3.234375, "learning_rate": 8.167343628683335e-06, "loss": 0.91158924, "memory(GiB)": 728.98, "step": 25035, "train_speed(iter/s)": 0.572428 }, { "acc": 0.76622763, "epoch": 0.6352115601400941, "grad_norm": 3.421875, "learning_rate": 8.166532145605505e-06, "loss": 0.85881071, "memory(GiB)": 728.98, "step": 25040, "train_speed(iter/s)": 0.571624 }, { "acc": 0.77300696, "epoch": 0.6353383995091316, "grad_norm": 3.484375, "learning_rate": 8.165720523242018e-06, "loss": 0.88973827, "memory(GiB)": 728.98, "step": 25045, "train_speed(iter/s)": 0.570943 }, { "acc": 0.75401196, "epoch": 0.6354652388781692, "grad_norm": 3.40625, "learning_rate": 8.164908761628569e-06, "loss": 0.91945858, "memory(GiB)": 728.98, "step": 25050, "train_speed(iter/s)": 0.570147 }, { "acc": 0.76446276, "epoch": 0.6355920782472068, "grad_norm": 3.8125, "learning_rate": 8.16409686080087e-06, "loss": 0.89678946, "memory(GiB)": 728.98, "step": 25055, "train_speed(iter/s)": 0.569382 }, { "acc": 0.75361443, "epoch": 0.6357189176162443, "grad_norm": 3.0625, "learning_rate": 8.163284820794631e-06, "loss": 0.9173192, "memory(GiB)": 728.98, "step": 25060, "train_speed(iter/s)": 0.568684 }, { "acc": 0.77259002, "epoch": 0.6358457569852819, "grad_norm": 3.078125, "learning_rate": 8.162472641645569e-06, "loss": 0.85945406, "memory(GiB)": 728.98, "step": 25065, "train_speed(iter/s)": 0.567829 }, { "acc": 0.76545873, "epoch": 0.6359725963543195, "grad_norm": 3.9375, "learning_rate": 8.161660323389413e-06, "loss": 0.90295191, "memory(GiB)": 728.98, "step": 25070, "train_speed(iter/s)": 0.567042 }, { "acc": 0.76646361, "epoch": 0.636099435723357, "grad_norm": 3.71875, "learning_rate": 8.160847866061893e-06, "loss": 0.88031464, "memory(GiB)": 728.98, "step": 25075, "train_speed(iter/s)": 0.566299 }, { "acc": 0.76393576, "epoch": 0.6362262750923946, "grad_norm": 3.265625, "learning_rate": 8.160035269698747e-06, "loss": 0.91143408, "memory(GiB)": 728.98, "step": 25080, "train_speed(iter/s)": 0.565549 }, { "acc": 0.75407863, "epoch": 0.6363531144614321, "grad_norm": 4.25, "learning_rate": 8.159222534335718e-06, "loss": 0.97643976, "memory(GiB)": 728.98, "step": 25085, "train_speed(iter/s)": 0.564796 }, { "acc": 0.75765309, "epoch": 0.6364799538304696, "grad_norm": 3.5, "learning_rate": 8.158409660008558e-06, "loss": 0.95394211, "memory(GiB)": 728.98, "step": 25090, "train_speed(iter/s)": 0.564006 }, { "acc": 0.77050958, "epoch": 0.6366067931995072, "grad_norm": 3.09375, "learning_rate": 8.157596646753017e-06, "loss": 0.82072678, "memory(GiB)": 728.98, "step": 25095, "train_speed(iter/s)": 0.563157 }, { "acc": 0.77213135, "epoch": 0.6367336325685448, "grad_norm": 3.140625, "learning_rate": 8.15678349460486e-06, "loss": 0.89880953, "memory(GiB)": 728.98, "step": 25100, "train_speed(iter/s)": 0.56241 }, { "acc": 0.75322652, "epoch": 0.6368604719375823, "grad_norm": 3.625, "learning_rate": 8.155970203599857e-06, "loss": 0.9362937, "memory(GiB)": 728.98, "step": 25105, "train_speed(iter/s)": 0.561737 }, { "acc": 0.76604114, "epoch": 0.6369873113066199, "grad_norm": 3.671875, "learning_rate": 8.15515677377378e-06, "loss": 0.91824512, "memory(GiB)": 728.98, "step": 25110, "train_speed(iter/s)": 0.561034 }, { "acc": 0.77009444, "epoch": 0.6371141506756575, "grad_norm": 3.34375, "learning_rate": 8.15434320516241e-06, "loss": 0.82177954, "memory(GiB)": 728.98, "step": 25115, "train_speed(iter/s)": 0.560315 }, { "acc": 0.7755868, "epoch": 0.637240990044695, "grad_norm": 5.0, "learning_rate": 8.153529497801531e-06, "loss": 0.87035141, "memory(GiB)": 728.98, "step": 25120, "train_speed(iter/s)": 0.55962 }, { "acc": 0.76142707, "epoch": 0.6373678294137326, "grad_norm": 3.546875, "learning_rate": 8.152715651726938e-06, "loss": 0.8959918, "memory(GiB)": 728.98, "step": 25125, "train_speed(iter/s)": 0.558876 }, { "acc": 0.78202596, "epoch": 0.6374946687827702, "grad_norm": 3.984375, "learning_rate": 8.151901666974429e-06, "loss": 0.8638093, "memory(GiB)": 728.98, "step": 25130, "train_speed(iter/s)": 0.558132 }, { "acc": 0.74687028, "epoch": 0.6376215081518077, "grad_norm": 4.46875, "learning_rate": 8.151087543579808e-06, "loss": 0.95500565, "memory(GiB)": 728.98, "step": 25135, "train_speed(iter/s)": 0.557479 }, { "acc": 0.77107754, "epoch": 0.6377483475208453, "grad_norm": 3.5625, "learning_rate": 8.150273281578885e-06, "loss": 0.87389193, "memory(GiB)": 728.98, "step": 25140, "train_speed(iter/s)": 0.556763 }, { "acc": 0.75525665, "epoch": 0.6378751868898828, "grad_norm": 3.34375, "learning_rate": 8.14945888100748e-06, "loss": 0.94381304, "memory(GiB)": 728.98, "step": 25145, "train_speed(iter/s)": 0.556068 }, { "acc": 0.75602403, "epoch": 0.6380020262589203, "grad_norm": 3.25, "learning_rate": 8.14864434190141e-06, "loss": 0.92777624, "memory(GiB)": 728.98, "step": 25150, "train_speed(iter/s)": 0.555346 }, { "acc": 0.75402722, "epoch": 0.6381288656279579, "grad_norm": 3.796875, "learning_rate": 8.147829664296512e-06, "loss": 0.91780043, "memory(GiB)": 728.98, "step": 25155, "train_speed(iter/s)": 0.554665 }, { "acc": 0.77260094, "epoch": 0.6382557049969955, "grad_norm": 2.859375, "learning_rate": 8.147014848228614e-06, "loss": 0.88357038, "memory(GiB)": 728.98, "step": 25160, "train_speed(iter/s)": 0.553873 }, { "acc": 0.77444115, "epoch": 0.638382544366033, "grad_norm": 3.03125, "learning_rate": 8.146199893733559e-06, "loss": 0.80570898, "memory(GiB)": 728.98, "step": 25165, "train_speed(iter/s)": 0.553004 }, { "acc": 0.77343493, "epoch": 0.6385093837350706, "grad_norm": 3.765625, "learning_rate": 8.145384800847198e-06, "loss": 0.88882523, "memory(GiB)": 728.98, "step": 25170, "train_speed(iter/s)": 0.552265 }, { "acc": 0.76616497, "epoch": 0.6386362231041082, "grad_norm": 3.90625, "learning_rate": 8.144569569605378e-06, "loss": 0.88810167, "memory(GiB)": 728.98, "step": 25175, "train_speed(iter/s)": 0.551617 }, { "acc": 0.76283698, "epoch": 0.6387630624731457, "grad_norm": 3.328125, "learning_rate": 8.143754200043965e-06, "loss": 0.94551353, "memory(GiB)": 728.98, "step": 25180, "train_speed(iter/s)": 0.55099 }, { "acc": 0.75995812, "epoch": 0.6388899018421833, "grad_norm": 3.125, "learning_rate": 8.14293869219882e-06, "loss": 0.93004751, "memory(GiB)": 728.98, "step": 25185, "train_speed(iter/s)": 0.550287 }, { "acc": 0.74656472, "epoch": 0.6390167412112209, "grad_norm": 3.90625, "learning_rate": 8.142123046105815e-06, "loss": 0.95770063, "memory(GiB)": 728.98, "step": 25190, "train_speed(iter/s)": 0.549602 }, { "acc": 0.77948661, "epoch": 0.6391435805802584, "grad_norm": 4.125, "learning_rate": 8.14130726180083e-06, "loss": 0.88035345, "memory(GiB)": 728.98, "step": 25195, "train_speed(iter/s)": 0.548854 }, { "acc": 0.75720978, "epoch": 0.639270419949296, "grad_norm": 3.375, "learning_rate": 8.140491339319747e-06, "loss": 0.88272047, "memory(GiB)": 728.98, "step": 25200, "train_speed(iter/s)": 0.548122 }, { "acc": 0.77958078, "epoch": 0.6393972593183335, "grad_norm": 3.078125, "learning_rate": 8.139675278698455e-06, "loss": 0.79627542, "memory(GiB)": 728.98, "step": 25205, "train_speed(iter/s)": 0.547424 }, { "acc": 0.7756494, "epoch": 0.639524098687371, "grad_norm": 3.15625, "learning_rate": 8.138859079972853e-06, "loss": 0.87087173, "memory(GiB)": 728.98, "step": 25210, "train_speed(iter/s)": 0.546661 }, { "acc": 0.77586646, "epoch": 0.6396509380564086, "grad_norm": 3.03125, "learning_rate": 8.13804274317884e-06, "loss": 0.87289906, "memory(GiB)": 728.98, "step": 25215, "train_speed(iter/s)": 0.545987 }, { "acc": 0.77286406, "epoch": 0.6397777774254462, "grad_norm": 3.71875, "learning_rate": 8.137226268352327e-06, "loss": 0.86527834, "memory(GiB)": 728.98, "step": 25220, "train_speed(iter/s)": 0.545272 }, { "acc": 0.77117057, "epoch": 0.6399046167944837, "grad_norm": 3.15625, "learning_rate": 8.136409655529223e-06, "loss": 0.89138794, "memory(GiB)": 728.98, "step": 25225, "train_speed(iter/s)": 0.544596 }, { "acc": 0.76770229, "epoch": 0.6400314561635213, "grad_norm": 4.28125, "learning_rate": 8.135592904745453e-06, "loss": 0.89742661, "memory(GiB)": 728.98, "step": 25230, "train_speed(iter/s)": 0.543879 }, { "acc": 0.76073432, "epoch": 0.6401582955325589, "grad_norm": 3.140625, "learning_rate": 8.134776016036942e-06, "loss": 0.90145378, "memory(GiB)": 728.98, "step": 25235, "train_speed(iter/s)": 0.543111 }, { "acc": 0.75502586, "epoch": 0.6402851349015964, "grad_norm": 3.34375, "learning_rate": 8.133958989439622e-06, "loss": 0.92790203, "memory(GiB)": 728.98, "step": 25240, "train_speed(iter/s)": 0.542369 }, { "acc": 0.7614924, "epoch": 0.640411974270634, "grad_norm": 3.390625, "learning_rate": 8.133141824989432e-06, "loss": 0.93639088, "memory(GiB)": 728.98, "step": 25245, "train_speed(iter/s)": 0.54172 }, { "acc": 0.76786995, "epoch": 0.6405388136396716, "grad_norm": 3.53125, "learning_rate": 8.132324522722315e-06, "loss": 0.87607517, "memory(GiB)": 728.98, "step": 25250, "train_speed(iter/s)": 0.541009 }, { "acc": 0.74799104, "epoch": 0.6406656530087091, "grad_norm": 3.484375, "learning_rate": 8.131507082674223e-06, "loss": 0.95200729, "memory(GiB)": 728.98, "step": 25255, "train_speed(iter/s)": 0.540341 }, { "acc": 0.75834222, "epoch": 0.6407924923777467, "grad_norm": 2.84375, "learning_rate": 8.130689504881112e-06, "loss": 0.88395338, "memory(GiB)": 728.98, "step": 25260, "train_speed(iter/s)": 0.539597 }, { "acc": 0.75357985, "epoch": 0.6409193317467842, "grad_norm": 3.90625, "learning_rate": 8.129871789378946e-06, "loss": 0.8931735, "memory(GiB)": 728.98, "step": 25265, "train_speed(iter/s)": 0.538941 }, { "acc": 0.77245383, "epoch": 0.6410461711158217, "grad_norm": 2.875, "learning_rate": 8.129053936203688e-06, "loss": 0.83703547, "memory(GiB)": 728.98, "step": 25270, "train_speed(iter/s)": 0.538242 }, { "acc": 0.76345048, "epoch": 0.6411730104848593, "grad_norm": 3.046875, "learning_rate": 8.128235945391321e-06, "loss": 0.90310593, "memory(GiB)": 728.98, "step": 25275, "train_speed(iter/s)": 0.537479 }, { "acc": 0.76847715, "epoch": 0.6412998498538969, "grad_norm": 3.0625, "learning_rate": 8.12741781697782e-06, "loss": 0.87958393, "memory(GiB)": 728.98, "step": 25280, "train_speed(iter/s)": 0.536868 }, { "acc": 0.78378482, "epoch": 0.6414266892229344, "grad_norm": 4.09375, "learning_rate": 8.126599550999174e-06, "loss": 0.86059456, "memory(GiB)": 728.98, "step": 25285, "train_speed(iter/s)": 0.536215 }, { "acc": 0.76414466, "epoch": 0.641553528591972, "grad_norm": 3.421875, "learning_rate": 8.125781147491378e-06, "loss": 0.9001441, "memory(GiB)": 728.98, "step": 25290, "train_speed(iter/s)": 0.535577 }, { "acc": 0.74524393, "epoch": 0.6416803679610096, "grad_norm": 3.375, "learning_rate": 8.124962606490427e-06, "loss": 0.94309483, "memory(GiB)": 728.98, "step": 25295, "train_speed(iter/s)": 0.534912 }, { "acc": 0.75836196, "epoch": 0.6418072073300471, "grad_norm": 6.59375, "learning_rate": 8.124143928032328e-06, "loss": 0.96608963, "memory(GiB)": 728.98, "step": 25300, "train_speed(iter/s)": 0.534266 }, { "acc": 0.75566959, "epoch": 0.6419340466990847, "grad_norm": 3.09375, "learning_rate": 8.12332511215309e-06, "loss": 0.87834349, "memory(GiB)": 728.98, "step": 25305, "train_speed(iter/s)": 0.533701 }, { "acc": 0.77260041, "epoch": 0.6420608860681223, "grad_norm": 3.6875, "learning_rate": 8.122506158888734e-06, "loss": 0.86569166, "memory(GiB)": 728.98, "step": 25310, "train_speed(iter/s)": 0.533032 }, { "acc": 0.75448027, "epoch": 0.6421877254371599, "grad_norm": 2.859375, "learning_rate": 8.121687068275282e-06, "loss": 0.88285723, "memory(GiB)": 728.98, "step": 25315, "train_speed(iter/s)": 0.53226 }, { "acc": 0.7738318, "epoch": 0.6423145648061974, "grad_norm": 3.5, "learning_rate": 8.120867840348758e-06, "loss": 0.90454035, "memory(GiB)": 728.98, "step": 25320, "train_speed(iter/s)": 0.531611 }, { "acc": 0.76273427, "epoch": 0.6424414041752349, "grad_norm": 3.421875, "learning_rate": 8.120048475145205e-06, "loss": 0.9305934, "memory(GiB)": 728.98, "step": 25325, "train_speed(iter/s)": 0.530988 }, { "acc": 0.74597569, "epoch": 0.6425682435442724, "grad_norm": 4.78125, "learning_rate": 8.11922897270066e-06, "loss": 0.96668749, "memory(GiB)": 728.98, "step": 25330, "train_speed(iter/s)": 0.530314 }, { "acc": 0.73715944, "epoch": 0.64269508291331, "grad_norm": 4.4375, "learning_rate": 8.11840933305117e-06, "loss": 1.00044212, "memory(GiB)": 728.98, "step": 25335, "train_speed(iter/s)": 0.529595 }, { "acc": 0.76480646, "epoch": 0.6428219222823476, "grad_norm": 3.40625, "learning_rate": 8.11758955623279e-06, "loss": 0.89162502, "memory(GiB)": 728.98, "step": 25340, "train_speed(iter/s)": 0.528963 }, { "acc": 0.76407313, "epoch": 0.6429487616513851, "grad_norm": 3.546875, "learning_rate": 8.116769642281579e-06, "loss": 0.92790518, "memory(GiB)": 728.98, "step": 25345, "train_speed(iter/s)": 0.528332 }, { "acc": 0.76458397, "epoch": 0.6430756010204227, "grad_norm": 3.4375, "learning_rate": 8.1159495912336e-06, "loss": 0.94345503, "memory(GiB)": 728.98, "step": 25350, "train_speed(iter/s)": 0.527744 }, { "acc": 0.76000781, "epoch": 0.6432024403894603, "grad_norm": 2.859375, "learning_rate": 8.115129403124929e-06, "loss": 0.91107979, "memory(GiB)": 728.98, "step": 25355, "train_speed(iter/s)": 0.5271 }, { "acc": 0.76813121, "epoch": 0.6433292797584979, "grad_norm": 4.0, "learning_rate": 8.11430907799164e-06, "loss": 0.90678225, "memory(GiB)": 728.98, "step": 25360, "train_speed(iter/s)": 0.52647 }, { "acc": 0.76800766, "epoch": 0.6434561191275354, "grad_norm": 3.9375, "learning_rate": 8.113488615869818e-06, "loss": 0.86228142, "memory(GiB)": 728.98, "step": 25365, "train_speed(iter/s)": 0.525792 }, { "acc": 0.77553196, "epoch": 0.643582958496573, "grad_norm": 3.671875, "learning_rate": 8.11266801679555e-06, "loss": 0.91649046, "memory(GiB)": 728.98, "step": 25370, "train_speed(iter/s)": 0.525185 }, { "acc": 0.76699772, "epoch": 0.6437097978656106, "grad_norm": 3.328125, "learning_rate": 8.111847280804935e-06, "loss": 0.93267632, "memory(GiB)": 728.98, "step": 25375, "train_speed(iter/s)": 0.524578 }, { "acc": 0.77915769, "epoch": 0.6438366372346481, "grad_norm": 3.609375, "learning_rate": 8.111026407934073e-06, "loss": 0.87452669, "memory(GiB)": 728.98, "step": 25380, "train_speed(iter/s)": 0.523948 }, { "acc": 0.76085787, "epoch": 0.6439634766036856, "grad_norm": 3.21875, "learning_rate": 8.110205398219072e-06, "loss": 0.90816908, "memory(GiB)": 728.98, "step": 25385, "train_speed(iter/s)": 0.523322 }, { "acc": 0.7528954, "epoch": 0.6440903159727231, "grad_norm": 3.140625, "learning_rate": 8.109384251696044e-06, "loss": 0.96726274, "memory(GiB)": 728.98, "step": 25390, "train_speed(iter/s)": 0.522679 }, { "acc": 0.76239948, "epoch": 0.6442171553417607, "grad_norm": 2.75, "learning_rate": 8.10856296840111e-06, "loss": 0.90465422, "memory(GiB)": 728.98, "step": 25395, "train_speed(iter/s)": 0.522095 }, { "acc": 0.75748887, "epoch": 0.6443439947107983, "grad_norm": 3.625, "learning_rate": 8.107741548370396e-06, "loss": 0.91582203, "memory(GiB)": 728.98, "step": 25400, "train_speed(iter/s)": 0.521491 }, { "acc": 0.77216516, "epoch": 0.6444708340798359, "grad_norm": 3.09375, "learning_rate": 8.106919991640034e-06, "loss": 0.90159969, "memory(GiB)": 728.98, "step": 25405, "train_speed(iter/s)": 0.520834 }, { "acc": 0.7711884, "epoch": 0.6445976734488734, "grad_norm": 3.65625, "learning_rate": 8.106098298246162e-06, "loss": 0.8788991, "memory(GiB)": 728.98, "step": 25410, "train_speed(iter/s)": 0.520239 }, { "acc": 0.75818291, "epoch": 0.644724512817911, "grad_norm": 3.375, "learning_rate": 8.10527646822492e-06, "loss": 0.98049545, "memory(GiB)": 728.98, "step": 25415, "train_speed(iter/s)": 0.519544 }, { "acc": 0.76267848, "epoch": 0.6448513521869486, "grad_norm": 3.5, "learning_rate": 8.10445450161246e-06, "loss": 0.88616619, "memory(GiB)": 728.98, "step": 25420, "train_speed(iter/s)": 0.518859 }, { "acc": 0.76757016, "epoch": 0.6449781915559861, "grad_norm": 3.765625, "learning_rate": 8.103632398444938e-06, "loss": 0.8943573, "memory(GiB)": 728.98, "step": 25425, "train_speed(iter/s)": 0.5183 }, { "acc": 0.76117439, "epoch": 0.6451050309250237, "grad_norm": 2.90625, "learning_rate": 8.102810158758517e-06, "loss": 0.90722427, "memory(GiB)": 728.98, "step": 25430, "train_speed(iter/s)": 0.517717 }, { "acc": 0.75593367, "epoch": 0.6452318702940613, "grad_norm": 3.203125, "learning_rate": 8.101987782589362e-06, "loss": 0.89903107, "memory(GiB)": 728.98, "step": 25435, "train_speed(iter/s)": 0.517127 }, { "acc": 0.76786056, "epoch": 0.6453587096630988, "grad_norm": 3.28125, "learning_rate": 8.10116526997365e-06, "loss": 0.88227482, "memory(GiB)": 728.98, "step": 25440, "train_speed(iter/s)": 0.516569 }, { "acc": 0.76931195, "epoch": 0.6454855490321363, "grad_norm": 3.53125, "learning_rate": 8.100342620947557e-06, "loss": 0.88139439, "memory(GiB)": 728.98, "step": 25445, "train_speed(iter/s)": 0.516018 }, { "acc": 0.7692677, "epoch": 0.6456123884011739, "grad_norm": 3.15625, "learning_rate": 8.09951983554727e-06, "loss": 0.8600008, "memory(GiB)": 728.98, "step": 25450, "train_speed(iter/s)": 0.515449 }, { "acc": 0.76905417, "epoch": 0.6457392277702114, "grad_norm": 3.515625, "learning_rate": 8.098696913808983e-06, "loss": 0.88369074, "memory(GiB)": 728.98, "step": 25455, "train_speed(iter/s)": 0.51485 }, { "acc": 0.76825328, "epoch": 0.645866067139249, "grad_norm": 4.34375, "learning_rate": 8.09787385576889e-06, "loss": 0.89802561, "memory(GiB)": 728.98, "step": 25460, "train_speed(iter/s)": 0.514302 }, { "acc": 0.77815881, "epoch": 0.6459929065082866, "grad_norm": 3.765625, "learning_rate": 8.097050661463198e-06, "loss": 0.87545242, "memory(GiB)": 728.98, "step": 25465, "train_speed(iter/s)": 0.513662 }, { "acc": 0.76280222, "epoch": 0.6461197458773241, "grad_norm": 3.46875, "learning_rate": 8.096227330928113e-06, "loss": 0.90945282, "memory(GiB)": 728.98, "step": 25470, "train_speed(iter/s)": 0.513054 }, { "acc": 0.76855464, "epoch": 0.6462465852463617, "grad_norm": 4.375, "learning_rate": 8.095403864199854e-06, "loss": 0.85595036, "memory(GiB)": 728.98, "step": 25475, "train_speed(iter/s)": 0.512491 }, { "acc": 0.75311213, "epoch": 0.6463734246153993, "grad_norm": 3.1875, "learning_rate": 8.094580261314642e-06, "loss": 0.91645126, "memory(GiB)": 728.98, "step": 25480, "train_speed(iter/s)": 0.511933 }, { "acc": 0.77982721, "epoch": 0.6465002639844368, "grad_norm": 3.984375, "learning_rate": 8.093756522308704e-06, "loss": 0.8587615, "memory(GiB)": 728.98, "step": 25485, "train_speed(iter/s)": 0.511308 }, { "acc": 0.76731257, "epoch": 0.6466271033534744, "grad_norm": 5.25, "learning_rate": 8.092932647218274e-06, "loss": 0.86989145, "memory(GiB)": 728.98, "step": 25490, "train_speed(iter/s)": 0.510767 }, { "acc": 0.76901827, "epoch": 0.646753942722512, "grad_norm": 3.140625, "learning_rate": 8.092108636079592e-06, "loss": 0.87765884, "memory(GiB)": 728.98, "step": 25495, "train_speed(iter/s)": 0.510162 }, { "acc": 0.76296449, "epoch": 0.6468807820915495, "grad_norm": 3.34375, "learning_rate": 8.091284488928902e-06, "loss": 0.90695591, "memory(GiB)": 728.98, "step": 25500, "train_speed(iter/s)": 0.509565 }, { "epoch": 0.6468807820915495, "eval_acc": 0.7542751173195047, "eval_loss": 0.8730300068855286, "eval_runtime": 1153.6701, "eval_samples_per_second": 5.522, "eval_steps_per_second": 5.522, "step": 25500 }, { "acc": 0.76530156, "epoch": 0.647007621460587, "grad_norm": 3.75, "learning_rate": 8.090460205802457e-06, "loss": 0.90685158, "memory(GiB)": 728.98, "step": 25505, "train_speed(iter/s)": 0.490588 }, { "acc": 0.76566024, "epoch": 0.6471344608296246, "grad_norm": 3.546875, "learning_rate": 8.089635786736516e-06, "loss": 0.90318708, "memory(GiB)": 728.98, "step": 25510, "train_speed(iter/s)": 0.490036 }, { "acc": 0.76162057, "epoch": 0.6472613001986621, "grad_norm": 3.359375, "learning_rate": 8.08881123176734e-06, "loss": 0.90504551, "memory(GiB)": 728.98, "step": 25515, "train_speed(iter/s)": 0.489405 }, { "acc": 0.75744796, "epoch": 0.6473881395676997, "grad_norm": 3.0625, "learning_rate": 8.087986540931196e-06, "loss": 0.91263952, "memory(GiB)": 728.98, "step": 25520, "train_speed(iter/s)": 0.488864 }, { "acc": 0.76455803, "epoch": 0.6475149789367373, "grad_norm": 4.09375, "learning_rate": 8.087161714264366e-06, "loss": 0.86632662, "memory(GiB)": 728.98, "step": 25525, "train_speed(iter/s)": 0.488376 }, { "acc": 0.7675602, "epoch": 0.6476418183057748, "grad_norm": 3.390625, "learning_rate": 8.08633675180313e-06, "loss": 0.8852087, "memory(GiB)": 728.98, "step": 25530, "train_speed(iter/s)": 0.487794 }, { "acc": 0.75548248, "epoch": 0.6477686576748124, "grad_norm": 2.8125, "learning_rate": 8.085511653583772e-06, "loss": 0.9183465, "memory(GiB)": 728.98, "step": 25535, "train_speed(iter/s)": 0.487306 }, { "acc": 0.75543718, "epoch": 0.64789549704385, "grad_norm": 3.6875, "learning_rate": 8.08468641964259e-06, "loss": 0.95104752, "memory(GiB)": 728.98, "step": 25540, "train_speed(iter/s)": 0.486805 }, { "acc": 0.7663434, "epoch": 0.6480223364128875, "grad_norm": 3.296875, "learning_rate": 8.083861050015878e-06, "loss": 0.9287734, "memory(GiB)": 728.98, "step": 25545, "train_speed(iter/s)": 0.486323 }, { "acc": 0.76011691, "epoch": 0.6481491757819251, "grad_norm": 3.515625, "learning_rate": 8.083035544739944e-06, "loss": 0.88465624, "memory(GiB)": 728.98, "step": 25550, "train_speed(iter/s)": 0.485799 }, { "acc": 0.77368689, "epoch": 0.6482760151509627, "grad_norm": 3.5625, "learning_rate": 8.0822099038511e-06, "loss": 0.86356783, "memory(GiB)": 728.98, "step": 25555, "train_speed(iter/s)": 0.485296 }, { "acc": 0.75441656, "epoch": 0.6484028545200002, "grad_norm": 3.0, "learning_rate": 8.081384127385663e-06, "loss": 0.94324589, "memory(GiB)": 728.98, "step": 25560, "train_speed(iter/s)": 0.484745 }, { "acc": 0.76219225, "epoch": 0.6485296938890377, "grad_norm": 2.734375, "learning_rate": 8.080558215379957e-06, "loss": 0.87475872, "memory(GiB)": 728.98, "step": 25565, "train_speed(iter/s)": 0.484237 }, { "acc": 0.78121285, "epoch": 0.6486565332580753, "grad_norm": 3.6875, "learning_rate": 8.07973216787031e-06, "loss": 0.87274733, "memory(GiB)": 728.98, "step": 25570, "train_speed(iter/s)": 0.483684 }, { "acc": 0.76385598, "epoch": 0.6487833726271128, "grad_norm": 3.359375, "learning_rate": 8.078905984893057e-06, "loss": 0.87442379, "memory(GiB)": 728.98, "step": 25575, "train_speed(iter/s)": 0.483193 }, { "acc": 0.76024418, "epoch": 0.6489102119961504, "grad_norm": 3.96875, "learning_rate": 8.07807966648454e-06, "loss": 0.90298033, "memory(GiB)": 728.98, "step": 25580, "train_speed(iter/s)": 0.482671 }, { "acc": 0.75534658, "epoch": 0.649037051365188, "grad_norm": 2.875, "learning_rate": 8.077253212681106e-06, "loss": 0.89645109, "memory(GiB)": 728.98, "step": 25585, "train_speed(iter/s)": 0.482157 }, { "acc": 0.77158175, "epoch": 0.6491638907342255, "grad_norm": 3.109375, "learning_rate": 8.076426623519108e-06, "loss": 0.86081352, "memory(GiB)": 728.98, "step": 25590, "train_speed(iter/s)": 0.48162 }, { "acc": 0.77273846, "epoch": 0.6492907301032631, "grad_norm": 3.0625, "learning_rate": 8.075599899034905e-06, "loss": 0.88831968, "memory(GiB)": 728.98, "step": 25595, "train_speed(iter/s)": 0.480994 }, { "acc": 0.76244974, "epoch": 0.6494175694723007, "grad_norm": 4.0625, "learning_rate": 8.074773039264862e-06, "loss": 0.95115461, "memory(GiB)": 728.98, "step": 25600, "train_speed(iter/s)": 0.480472 }, { "acc": 0.77665734, "epoch": 0.6495444088413382, "grad_norm": 3.765625, "learning_rate": 8.073946044245349e-06, "loss": 0.86268454, "memory(GiB)": 728.98, "step": 25605, "train_speed(iter/s)": 0.479966 }, { "acc": 0.7628334, "epoch": 0.6496712482103758, "grad_norm": 3.65625, "learning_rate": 8.073118914012745e-06, "loss": 0.88926983, "memory(GiB)": 728.98, "step": 25610, "train_speed(iter/s)": 0.479452 }, { "acc": 0.76138172, "epoch": 0.6497980875794134, "grad_norm": 4.3125, "learning_rate": 8.07229164860343e-06, "loss": 0.95404205, "memory(GiB)": 728.98, "step": 25615, "train_speed(iter/s)": 0.478953 }, { "acc": 0.76312981, "epoch": 0.6499249269484509, "grad_norm": 2.953125, "learning_rate": 8.071464248053796e-06, "loss": 0.89433832, "memory(GiB)": 728.98, "step": 25620, "train_speed(iter/s)": 0.47844 }, { "acc": 0.76490927, "epoch": 0.6500517663174884, "grad_norm": 4.21875, "learning_rate": 8.070636712400234e-06, "loss": 0.91365805, "memory(GiB)": 728.98, "step": 25625, "train_speed(iter/s)": 0.477945 }, { "acc": 0.76680102, "epoch": 0.650178605686526, "grad_norm": 3.390625, "learning_rate": 8.069809041679147e-06, "loss": 0.90277071, "memory(GiB)": 728.98, "step": 25630, "train_speed(iter/s)": 0.477475 }, { "acc": 0.76154165, "epoch": 0.6503054450555635, "grad_norm": 2.78125, "learning_rate": 8.068981235926943e-06, "loss": 0.86946526, "memory(GiB)": 728.98, "step": 25635, "train_speed(iter/s)": 0.476936 }, { "acc": 0.77769623, "epoch": 0.6504322844246011, "grad_norm": 3.875, "learning_rate": 8.068153295180032e-06, "loss": 0.88675985, "memory(GiB)": 728.98, "step": 25640, "train_speed(iter/s)": 0.476449 }, { "acc": 0.75080948, "epoch": 0.6505591237936387, "grad_norm": 3.671875, "learning_rate": 8.067325219474832e-06, "loss": 0.96165743, "memory(GiB)": 728.98, "step": 25645, "train_speed(iter/s)": 0.475961 }, { "acc": 0.7625988, "epoch": 0.6506859631626762, "grad_norm": 3.71875, "learning_rate": 8.06649700884777e-06, "loss": 0.87938614, "memory(GiB)": 728.98, "step": 25650, "train_speed(iter/s)": 0.475471 }, { "acc": 0.76445289, "epoch": 0.6508128025317138, "grad_norm": 3.453125, "learning_rate": 8.065668663335276e-06, "loss": 0.93283844, "memory(GiB)": 728.98, "step": 25655, "train_speed(iter/s)": 0.474987 }, { "acc": 0.76182113, "epoch": 0.6509396419007514, "grad_norm": 3.734375, "learning_rate": 8.064840182973783e-06, "loss": 0.9050086, "memory(GiB)": 728.98, "step": 25660, "train_speed(iter/s)": 0.474464 }, { "acc": 0.75925612, "epoch": 0.6510664812697889, "grad_norm": 3.765625, "learning_rate": 8.064011567799737e-06, "loss": 0.89252071, "memory(GiB)": 728.98, "step": 25665, "train_speed(iter/s)": 0.47398 }, { "acc": 0.77015696, "epoch": 0.6511933206388265, "grad_norm": 4.0, "learning_rate": 8.063182817849582e-06, "loss": 0.8695714, "memory(GiB)": 728.98, "step": 25670, "train_speed(iter/s)": 0.473507 }, { "acc": 0.76375194, "epoch": 0.6513201600078641, "grad_norm": 4.21875, "learning_rate": 8.062353933159778e-06, "loss": 0.87364483, "memory(GiB)": 728.98, "step": 25675, "train_speed(iter/s)": 0.472982 }, { "acc": 0.76915069, "epoch": 0.6514469993769016, "grad_norm": 4.25, "learning_rate": 8.06152491376678e-06, "loss": 0.84140587, "memory(GiB)": 728.98, "step": 25680, "train_speed(iter/s)": 0.472475 }, { "acc": 0.75895576, "epoch": 0.6515738387459391, "grad_norm": 3.453125, "learning_rate": 8.060695759707058e-06, "loss": 0.87798491, "memory(GiB)": 728.98, "step": 25685, "train_speed(iter/s)": 0.471883 }, { "acc": 0.76969681, "epoch": 0.6517006781149767, "grad_norm": 3.09375, "learning_rate": 8.05986647101708e-06, "loss": 0.87676096, "memory(GiB)": 728.98, "step": 25690, "train_speed(iter/s)": 0.471407 }, { "acc": 0.77098985, "epoch": 0.6518275174840142, "grad_norm": 2.921875, "learning_rate": 8.059037047733325e-06, "loss": 0.86163902, "memory(GiB)": 728.98, "step": 25695, "train_speed(iter/s)": 0.470924 }, { "acc": 0.76500297, "epoch": 0.6519543568530518, "grad_norm": 3.546875, "learning_rate": 8.058207489892275e-06, "loss": 0.88591795, "memory(GiB)": 728.98, "step": 25700, "train_speed(iter/s)": 0.470333 }, { "acc": 0.7613502, "epoch": 0.6520811962220894, "grad_norm": 3.484375, "learning_rate": 8.057377797530426e-06, "loss": 0.88134708, "memory(GiB)": 728.98, "step": 25705, "train_speed(iter/s)": 0.469907 }, { "acc": 0.76073184, "epoch": 0.6522080355911269, "grad_norm": 3.609375, "learning_rate": 8.056547970684265e-06, "loss": 0.96238203, "memory(GiB)": 728.98, "step": 25710, "train_speed(iter/s)": 0.469448 }, { "acc": 0.76320591, "epoch": 0.6523348749601645, "grad_norm": 3.28125, "learning_rate": 8.055718009390301e-06, "loss": 0.92616501, "memory(GiB)": 728.98, "step": 25715, "train_speed(iter/s)": 0.468956 }, { "acc": 0.75526066, "epoch": 0.6524617143292021, "grad_norm": 3.640625, "learning_rate": 8.054887913685036e-06, "loss": 0.96851463, "memory(GiB)": 728.98, "step": 25720, "train_speed(iter/s)": 0.46851 }, { "acc": 0.78536301, "epoch": 0.6525885536982396, "grad_norm": 3.359375, "learning_rate": 8.054057683604984e-06, "loss": 0.78938246, "memory(GiB)": 728.98, "step": 25725, "train_speed(iter/s)": 0.468059 }, { "acc": 0.77548847, "epoch": 0.6527153930672772, "grad_norm": 3.21875, "learning_rate": 8.053227319186667e-06, "loss": 0.88754358, "memory(GiB)": 728.98, "step": 25730, "train_speed(iter/s)": 0.467599 }, { "acc": 0.76231351, "epoch": 0.6528422324363148, "grad_norm": 3.71875, "learning_rate": 8.052396820466608e-06, "loss": 0.89550734, "memory(GiB)": 728.98, "step": 25735, "train_speed(iter/s)": 0.467088 }, { "acc": 0.75286975, "epoch": 0.6529690718053524, "grad_norm": 3.1875, "learning_rate": 8.051566187481339e-06, "loss": 0.91546898, "memory(GiB)": 728.98, "step": 25740, "train_speed(iter/s)": 0.46658 }, { "acc": 0.76345363, "epoch": 0.6530959111743898, "grad_norm": 3.015625, "learning_rate": 8.050735420267396e-06, "loss": 0.9079814, "memory(GiB)": 728.98, "step": 25745, "train_speed(iter/s)": 0.466044 }, { "acc": 0.76893611, "epoch": 0.6532227505434274, "grad_norm": 3.15625, "learning_rate": 8.049904518861321e-06, "loss": 0.95988445, "memory(GiB)": 728.98, "step": 25750, "train_speed(iter/s)": 0.465484 }, { "acc": 0.76655602, "epoch": 0.653349589912465, "grad_norm": 3.828125, "learning_rate": 8.049073483299666e-06, "loss": 0.91711855, "memory(GiB)": 728.98, "step": 25755, "train_speed(iter/s)": 0.465012 }, { "acc": 0.77432137, "epoch": 0.6534764292815025, "grad_norm": 3.875, "learning_rate": 8.048242313618982e-06, "loss": 0.91328087, "memory(GiB)": 728.98, "step": 25760, "train_speed(iter/s)": 0.46458 }, { "acc": 0.77693095, "epoch": 0.6536032686505401, "grad_norm": 3.078125, "learning_rate": 8.04741100985583e-06, "loss": 0.83725157, "memory(GiB)": 728.98, "step": 25765, "train_speed(iter/s)": 0.464111 }, { "acc": 0.77659478, "epoch": 0.6537301080195776, "grad_norm": 3.296875, "learning_rate": 8.046579572046778e-06, "loss": 0.87727528, "memory(GiB)": 728.98, "step": 25770, "train_speed(iter/s)": 0.463647 }, { "acc": 0.76504273, "epoch": 0.6538569473886152, "grad_norm": 3.546875, "learning_rate": 8.045748000228399e-06, "loss": 0.89459467, "memory(GiB)": 728.98, "step": 25775, "train_speed(iter/s)": 0.463179 }, { "acc": 0.75263405, "epoch": 0.6539837867576528, "grad_norm": 3.4375, "learning_rate": 8.044916294437269e-06, "loss": 0.91989565, "memory(GiB)": 728.98, "step": 25780, "train_speed(iter/s)": 0.462708 }, { "acc": 0.76871738, "epoch": 0.6541106261266904, "grad_norm": 2.984375, "learning_rate": 8.04408445470997e-06, "loss": 0.92995844, "memory(GiB)": 728.98, "step": 25785, "train_speed(iter/s)": 0.462276 }, { "acc": 0.76737113, "epoch": 0.6542374654957279, "grad_norm": 4.21875, "learning_rate": 8.0432524810831e-06, "loss": 0.92585878, "memory(GiB)": 728.98, "step": 25790, "train_speed(iter/s)": 0.461733 }, { "acc": 0.77191706, "epoch": 0.6543643048647655, "grad_norm": 3.65625, "learning_rate": 8.042420373593245e-06, "loss": 0.92554779, "memory(GiB)": 728.98, "step": 25795, "train_speed(iter/s)": 0.461306 }, { "acc": 0.76243105, "epoch": 0.6544911442338031, "grad_norm": 2.984375, "learning_rate": 8.041588132277014e-06, "loss": 0.85453806, "memory(GiB)": 728.98, "step": 25800, "train_speed(iter/s)": 0.460807 }, { "acc": 0.7615736, "epoch": 0.6546179836028405, "grad_norm": 3.359375, "learning_rate": 8.040755757171013e-06, "loss": 0.87715378, "memory(GiB)": 728.98, "step": 25805, "train_speed(iter/s)": 0.460387 }, { "acc": 0.76539135, "epoch": 0.6547448229718781, "grad_norm": 2.9375, "learning_rate": 8.039923248311854e-06, "loss": 0.891537, "memory(GiB)": 728.98, "step": 25810, "train_speed(iter/s)": 0.459961 }, { "acc": 0.79023952, "epoch": 0.6548716623409157, "grad_norm": 3.140625, "learning_rate": 8.039090605736156e-06, "loss": 0.83581133, "memory(GiB)": 728.98, "step": 25815, "train_speed(iter/s)": 0.459505 }, { "acc": 0.77019963, "epoch": 0.6549985017099532, "grad_norm": 4.125, "learning_rate": 8.038257829480546e-06, "loss": 0.87861843, "memory(GiB)": 728.98, "step": 25820, "train_speed(iter/s)": 0.459045 }, { "acc": 0.77539797, "epoch": 0.6551253410789908, "grad_norm": 3.59375, "learning_rate": 8.037424919581654e-06, "loss": 0.8728653, "memory(GiB)": 728.98, "step": 25825, "train_speed(iter/s)": 0.458536 }, { "acc": 0.74952111, "epoch": 0.6552521804480284, "grad_norm": 3.796875, "learning_rate": 8.036591876076119e-06, "loss": 0.89683466, "memory(GiB)": 728.98, "step": 25830, "train_speed(iter/s)": 0.458098 }, { "acc": 0.76725283, "epoch": 0.6553790198170659, "grad_norm": 4.5, "learning_rate": 8.035758699000582e-06, "loss": 0.93489218, "memory(GiB)": 728.98, "step": 25835, "train_speed(iter/s)": 0.457686 }, { "acc": 0.7712564, "epoch": 0.6555058591861035, "grad_norm": 3.21875, "learning_rate": 8.034925388391693e-06, "loss": 0.91872654, "memory(GiB)": 728.98, "step": 25840, "train_speed(iter/s)": 0.457211 }, { "acc": 0.77089953, "epoch": 0.6556326985551411, "grad_norm": 5.71875, "learning_rate": 8.034091944286104e-06, "loss": 0.90700979, "memory(GiB)": 728.98, "step": 25845, "train_speed(iter/s)": 0.456742 }, { "acc": 0.76516294, "epoch": 0.6557595379241786, "grad_norm": 3.234375, "learning_rate": 8.033258366720478e-06, "loss": 0.86356201, "memory(GiB)": 728.98, "step": 25850, "train_speed(iter/s)": 0.456328 }, { "acc": 0.77029309, "epoch": 0.6558863772932162, "grad_norm": 3.625, "learning_rate": 8.032424655731483e-06, "loss": 0.91239719, "memory(GiB)": 728.98, "step": 25855, "train_speed(iter/s)": 0.45589 }, { "acc": 0.75199752, "epoch": 0.6560132166622538, "grad_norm": 16.0, "learning_rate": 8.031590811355788e-06, "loss": 0.90933924, "memory(GiB)": 728.98, "step": 25860, "train_speed(iter/s)": 0.455404 }, { "acc": 0.76841764, "epoch": 0.6561400560312912, "grad_norm": 3.53125, "learning_rate": 8.030756833630071e-06, "loss": 0.9124259, "memory(GiB)": 728.98, "step": 25865, "train_speed(iter/s)": 0.454968 }, { "acc": 0.76549282, "epoch": 0.6562668954003288, "grad_norm": 2.953125, "learning_rate": 8.02992272259102e-06, "loss": 0.88776579, "memory(GiB)": 728.98, "step": 25870, "train_speed(iter/s)": 0.454531 }, { "acc": 0.76956449, "epoch": 0.6563937347693664, "grad_norm": 4.21875, "learning_rate": 8.02908847827532e-06, "loss": 0.87297716, "memory(GiB)": 728.98, "step": 25875, "train_speed(iter/s)": 0.45403 }, { "acc": 0.76894264, "epoch": 0.6565205741384039, "grad_norm": 3.78125, "learning_rate": 8.028254100719671e-06, "loss": 0.89911528, "memory(GiB)": 728.98, "step": 25880, "train_speed(iter/s)": 0.453567 }, { "acc": 0.77724185, "epoch": 0.6566474135074415, "grad_norm": 3.171875, "learning_rate": 8.027419589960773e-06, "loss": 0.88542929, "memory(GiB)": 728.98, "step": 25885, "train_speed(iter/s)": 0.453128 }, { "acc": 0.76543546, "epoch": 0.6567742528764791, "grad_norm": 3.921875, "learning_rate": 8.026584946035332e-06, "loss": 0.93374195, "memory(GiB)": 728.98, "step": 25890, "train_speed(iter/s)": 0.452682 }, { "acc": 0.7530869, "epoch": 0.6569010922455166, "grad_norm": 3.03125, "learning_rate": 8.025750168980063e-06, "loss": 0.94743214, "memory(GiB)": 728.98, "step": 25895, "train_speed(iter/s)": 0.452223 }, { "acc": 0.76445122, "epoch": 0.6570279316145542, "grad_norm": 3.609375, "learning_rate": 8.024915258831684e-06, "loss": 0.90322351, "memory(GiB)": 728.98, "step": 25900, "train_speed(iter/s)": 0.451812 }, { "acc": 0.76800179, "epoch": 0.6571547709835918, "grad_norm": 4.125, "learning_rate": 8.024080215626921e-06, "loss": 0.92394772, "memory(GiB)": 728.98, "step": 25905, "train_speed(iter/s)": 0.451403 }, { "acc": 0.76961241, "epoch": 0.6572816103526293, "grad_norm": 3.5625, "learning_rate": 8.023245039402505e-06, "loss": 0.90205145, "memory(GiB)": 728.98, "step": 25910, "train_speed(iter/s)": 0.450921 }, { "acc": 0.75762525, "epoch": 0.6574084497216669, "grad_norm": 3.265625, "learning_rate": 8.022409730195172e-06, "loss": 0.91298256, "memory(GiB)": 728.98, "step": 25915, "train_speed(iter/s)": 0.450522 }, { "acc": 0.75705829, "epoch": 0.6575352890907045, "grad_norm": 3.9375, "learning_rate": 8.021574288041665e-06, "loss": 0.96446962, "memory(GiB)": 728.98, "step": 25920, "train_speed(iter/s)": 0.450086 }, { "acc": 0.75844975, "epoch": 0.6576621284597419, "grad_norm": 3.453125, "learning_rate": 8.020738712978733e-06, "loss": 0.87863636, "memory(GiB)": 728.98, "step": 25925, "train_speed(iter/s)": 0.44966 }, { "acc": 0.77006826, "epoch": 0.6577889678287795, "grad_norm": 3.828125, "learning_rate": 8.01990300504313e-06, "loss": 0.87493391, "memory(GiB)": 728.98, "step": 25930, "train_speed(iter/s)": 0.449244 }, { "acc": 0.77676563, "epoch": 0.6579158071978171, "grad_norm": 3.5625, "learning_rate": 8.019067164271614e-06, "loss": 0.84932089, "memory(GiB)": 728.98, "step": 25935, "train_speed(iter/s)": 0.448813 }, { "acc": 0.76685419, "epoch": 0.6580426465668546, "grad_norm": 4.4375, "learning_rate": 8.018231190700954e-06, "loss": 0.88672237, "memory(GiB)": 728.98, "step": 25940, "train_speed(iter/s)": 0.448399 }, { "acc": 0.7747817, "epoch": 0.6581694859358922, "grad_norm": 3.171875, "learning_rate": 8.01739508436792e-06, "loss": 0.83851385, "memory(GiB)": 728.98, "step": 25945, "train_speed(iter/s)": 0.44794 }, { "acc": 0.7639071, "epoch": 0.6582963253049298, "grad_norm": 3.46875, "learning_rate": 8.016558845309292e-06, "loss": 0.8788146, "memory(GiB)": 728.98, "step": 25950, "train_speed(iter/s)": 0.447513 }, { "acc": 0.76523104, "epoch": 0.6584231646739673, "grad_norm": 3.75, "learning_rate": 8.015722473561849e-06, "loss": 0.91268425, "memory(GiB)": 728.98, "step": 25955, "train_speed(iter/s)": 0.447108 }, { "acc": 0.75317483, "epoch": 0.6585500040430049, "grad_norm": 3.59375, "learning_rate": 8.014885969162385e-06, "loss": 0.87626305, "memory(GiB)": 728.98, "step": 25960, "train_speed(iter/s)": 0.446677 }, { "acc": 0.76247797, "epoch": 0.6586768434120425, "grad_norm": 3.1875, "learning_rate": 8.014049332147693e-06, "loss": 0.92568941, "memory(GiB)": 728.98, "step": 25965, "train_speed(iter/s)": 0.446226 }, { "acc": 0.77615008, "epoch": 0.65880368278108, "grad_norm": 4.0, "learning_rate": 8.013212562554573e-06, "loss": 0.87022877, "memory(GiB)": 728.98, "step": 25970, "train_speed(iter/s)": 0.445848 }, { "acc": 0.7644134, "epoch": 0.6589305221501176, "grad_norm": 3.71875, "learning_rate": 8.012375660419835e-06, "loss": 0.9683382, "memory(GiB)": 728.98, "step": 25975, "train_speed(iter/s)": 0.445419 }, { "acc": 0.76547046, "epoch": 0.6590573615191552, "grad_norm": 3.546875, "learning_rate": 8.011538625780288e-06, "loss": 0.93157082, "memory(GiB)": 728.98, "step": 25980, "train_speed(iter/s)": 0.44497 }, { "acc": 0.76663432, "epoch": 0.6591842008881926, "grad_norm": 4.15625, "learning_rate": 8.010701458672753e-06, "loss": 0.92802563, "memory(GiB)": 728.98, "step": 25985, "train_speed(iter/s)": 0.444582 }, { "acc": 0.75770173, "epoch": 0.6593110402572302, "grad_norm": 3.375, "learning_rate": 8.009864159134053e-06, "loss": 0.90774183, "memory(GiB)": 728.98, "step": 25990, "train_speed(iter/s)": 0.444056 }, { "acc": 0.77032447, "epoch": 0.6594378796262678, "grad_norm": 3.359375, "learning_rate": 8.00902672720102e-06, "loss": 0.88868895, "memory(GiB)": 728.98, "step": 25995, "train_speed(iter/s)": 0.44364 }, { "acc": 0.76387715, "epoch": 0.6595647189953053, "grad_norm": 4.03125, "learning_rate": 8.008189162910488e-06, "loss": 0.88726416, "memory(GiB)": 728.98, "step": 26000, "train_speed(iter/s)": 0.443185 }, { "epoch": 0.6595647189953053, "eval_acc": 0.7545023827358495, "eval_loss": 0.8720152974128723, "eval_runtime": 1149.602, "eval_samples_per_second": 5.541, "eval_steps_per_second": 5.541, "step": 26000 }, { "acc": 0.77528954, "epoch": 0.6596915583643429, "grad_norm": 3.796875, "learning_rate": 8.007351466299298e-06, "loss": 0.88758297, "memory(GiB)": 728.98, "step": 26005, "train_speed(iter/s)": 0.429097 }, { "acc": 0.76589622, "epoch": 0.6598183977333805, "grad_norm": 3.453125, "learning_rate": 8.006513637404301e-06, "loss": 0.8904561, "memory(GiB)": 728.98, "step": 26010, "train_speed(iter/s)": 0.428734 }, { "acc": 0.76742558, "epoch": 0.659945237102418, "grad_norm": 3.921875, "learning_rate": 8.005675676262347e-06, "loss": 0.86902647, "memory(GiB)": 728.98, "step": 26015, "train_speed(iter/s)": 0.428371 }, { "acc": 0.75687971, "epoch": 0.6600720764714556, "grad_norm": 3.203125, "learning_rate": 8.004837582910296e-06, "loss": 0.90948038, "memory(GiB)": 728.98, "step": 26020, "train_speed(iter/s)": 0.427963 }, { "acc": 0.76274719, "epoch": 0.6601989158404932, "grad_norm": 3.3125, "learning_rate": 8.003999357385015e-06, "loss": 0.89549437, "memory(GiB)": 728.98, "step": 26025, "train_speed(iter/s)": 0.427543 }, { "acc": 0.77181067, "epoch": 0.6603257552095307, "grad_norm": 3.296875, "learning_rate": 8.003160999723374e-06, "loss": 0.88246517, "memory(GiB)": 728.98, "step": 26030, "train_speed(iter/s)": 0.42714 }, { "acc": 0.76050177, "epoch": 0.6604525945785683, "grad_norm": 3.578125, "learning_rate": 8.002322509962249e-06, "loss": 0.89968376, "memory(GiB)": 728.98, "step": 26035, "train_speed(iter/s)": 0.426712 }, { "acc": 0.76886439, "epoch": 0.6605794339476059, "grad_norm": 4.15625, "learning_rate": 8.00148388813852e-06, "loss": 0.85677614, "memory(GiB)": 728.98, "step": 26040, "train_speed(iter/s)": 0.426353 }, { "acc": 0.76810274, "epoch": 0.6607062733166433, "grad_norm": 3.609375, "learning_rate": 8.000645134289082e-06, "loss": 0.91169415, "memory(GiB)": 728.98, "step": 26045, "train_speed(iter/s)": 0.425953 }, { "acc": 0.76381035, "epoch": 0.6608331126856809, "grad_norm": 3.359375, "learning_rate": 7.999806248450826e-06, "loss": 0.90249062, "memory(GiB)": 728.98, "step": 26050, "train_speed(iter/s)": 0.425553 }, { "acc": 0.76780014, "epoch": 0.6609599520547185, "grad_norm": 3.5625, "learning_rate": 7.998967230660648e-06, "loss": 0.89332895, "memory(GiB)": 728.98, "step": 26055, "train_speed(iter/s)": 0.425203 }, { "acc": 0.76606836, "epoch": 0.661086791423756, "grad_norm": 4.96875, "learning_rate": 7.998128080955457e-06, "loss": 0.95010109, "memory(GiB)": 728.98, "step": 26060, "train_speed(iter/s)": 0.424773 }, { "acc": 0.77670608, "epoch": 0.6612136307927936, "grad_norm": 3.671875, "learning_rate": 7.997288799372164e-06, "loss": 0.86942892, "memory(GiB)": 728.98, "step": 26065, "train_speed(iter/s)": 0.424408 }, { "acc": 0.76125798, "epoch": 0.6613404701618312, "grad_norm": 4.15625, "learning_rate": 7.996449385947689e-06, "loss": 0.91864634, "memory(GiB)": 728.98, "step": 26070, "train_speed(iter/s)": 0.424039 }, { "acc": 0.76626348, "epoch": 0.6614673095308687, "grad_norm": 4.0, "learning_rate": 7.995609840718952e-06, "loss": 0.85267506, "memory(GiB)": 728.98, "step": 26075, "train_speed(iter/s)": 0.423679 }, { "acc": 0.76186991, "epoch": 0.6615941488999063, "grad_norm": 3.25, "learning_rate": 7.994770163722883e-06, "loss": 0.90201349, "memory(GiB)": 728.98, "step": 26080, "train_speed(iter/s)": 0.423301 }, { "acc": 0.76925449, "epoch": 0.6617209882689439, "grad_norm": 3.25, "learning_rate": 7.993930354996414e-06, "loss": 0.86723919, "memory(GiB)": 728.98, "step": 26085, "train_speed(iter/s)": 0.422871 }, { "acc": 0.76513181, "epoch": 0.6618478276379814, "grad_norm": 2.9375, "learning_rate": 7.99309041457649e-06, "loss": 0.86515331, "memory(GiB)": 728.98, "step": 26090, "train_speed(iter/s)": 0.422521 }, { "acc": 0.75893226, "epoch": 0.661974667007019, "grad_norm": 2.8125, "learning_rate": 7.992250342500054e-06, "loss": 0.91431684, "memory(GiB)": 728.98, "step": 26095, "train_speed(iter/s)": 0.422133 }, { "acc": 0.77723165, "epoch": 0.6621015063760566, "grad_norm": 3.671875, "learning_rate": 7.991410138804062e-06, "loss": 0.87365713, "memory(GiB)": 728.98, "step": 26100, "train_speed(iter/s)": 0.421696 }, { "acc": 0.76263447, "epoch": 0.662228345745094, "grad_norm": 3.296875, "learning_rate": 7.990569803525465e-06, "loss": 0.90247469, "memory(GiB)": 728.98, "step": 26105, "train_speed(iter/s)": 0.421309 }, { "acc": 0.75708694, "epoch": 0.6623551851141316, "grad_norm": 3.515625, "learning_rate": 7.989729336701233e-06, "loss": 0.91989555, "memory(GiB)": 728.98, "step": 26110, "train_speed(iter/s)": 0.420933 }, { "acc": 0.7659193, "epoch": 0.6624820244831692, "grad_norm": 4.3125, "learning_rate": 7.988888738368335e-06, "loss": 0.87390633, "memory(GiB)": 728.98, "step": 26115, "train_speed(iter/s)": 0.420581 }, { "acc": 0.7767828, "epoch": 0.6626088638522067, "grad_norm": 3.765625, "learning_rate": 7.988048008563742e-06, "loss": 0.89211369, "memory(GiB)": 728.98, "step": 26120, "train_speed(iter/s)": 0.420199 }, { "acc": 0.76893969, "epoch": 0.6627357032212443, "grad_norm": 3.25, "learning_rate": 7.987207147324438e-06, "loss": 0.87056446, "memory(GiB)": 728.98, "step": 26125, "train_speed(iter/s)": 0.419821 }, { "acc": 0.7674017, "epoch": 0.6628625425902819, "grad_norm": 3.484375, "learning_rate": 7.98636615468741e-06, "loss": 0.92258968, "memory(GiB)": 728.98, "step": 26130, "train_speed(iter/s)": 0.419435 }, { "acc": 0.76830187, "epoch": 0.6629893819593194, "grad_norm": 2.96875, "learning_rate": 7.985525030689652e-06, "loss": 0.88088884, "memory(GiB)": 728.98, "step": 26135, "train_speed(iter/s)": 0.41909 }, { "acc": 0.75284262, "epoch": 0.663116221328357, "grad_norm": 3.90625, "learning_rate": 7.984683775368158e-06, "loss": 0.91730757, "memory(GiB)": 728.98, "step": 26140, "train_speed(iter/s)": 0.418732 }, { "acc": 0.75449486, "epoch": 0.6632430606973946, "grad_norm": 4.09375, "learning_rate": 7.983842388759935e-06, "loss": 0.96987028, "memory(GiB)": 728.98, "step": 26145, "train_speed(iter/s)": 0.418341 }, { "acc": 0.76515632, "epoch": 0.6633699000664321, "grad_norm": 3.625, "learning_rate": 7.983000870901991e-06, "loss": 0.91545, "memory(GiB)": 728.98, "step": 26150, "train_speed(iter/s)": 0.417941 }, { "acc": 0.77294102, "epoch": 0.6634967394354697, "grad_norm": 2.953125, "learning_rate": 7.982159221831345e-06, "loss": 0.83677635, "memory(GiB)": 728.98, "step": 26155, "train_speed(iter/s)": 0.417623 }, { "acc": 0.76842093, "epoch": 0.6636235788045073, "grad_norm": 3.15625, "learning_rate": 7.981317441585017e-06, "loss": 0.853895, "memory(GiB)": 728.98, "step": 26160, "train_speed(iter/s)": 0.417247 }, { "acc": 0.76324563, "epoch": 0.6637504181735447, "grad_norm": 4.25, "learning_rate": 7.980475530200032e-06, "loss": 0.92792625, "memory(GiB)": 728.98, "step": 26165, "train_speed(iter/s)": 0.416867 }, { "acc": 0.7687542, "epoch": 0.6638772575425823, "grad_norm": 3.453125, "learning_rate": 7.979633487713427e-06, "loss": 0.88090916, "memory(GiB)": 728.98, "step": 26170, "train_speed(iter/s)": 0.416522 }, { "acc": 0.75599732, "epoch": 0.6640040969116199, "grad_norm": 3.296875, "learning_rate": 7.978791314162236e-06, "loss": 0.88373947, "memory(GiB)": 728.98, "step": 26175, "train_speed(iter/s)": 0.416126 }, { "acc": 0.75948133, "epoch": 0.6641309362806574, "grad_norm": 3.578125, "learning_rate": 7.977949009583508e-06, "loss": 0.92331371, "memory(GiB)": 728.98, "step": 26180, "train_speed(iter/s)": 0.415747 }, { "acc": 0.76929393, "epoch": 0.664257775649695, "grad_norm": 3.546875, "learning_rate": 7.977106574014291e-06, "loss": 0.88999357, "memory(GiB)": 728.98, "step": 26185, "train_speed(iter/s)": 0.415432 }, { "acc": 0.75259066, "epoch": 0.6643846150187326, "grad_norm": 3.21875, "learning_rate": 7.97626400749164e-06, "loss": 0.94078426, "memory(GiB)": 728.98, "step": 26190, "train_speed(iter/s)": 0.415096 }, { "acc": 0.74415212, "epoch": 0.6645114543877702, "grad_norm": 3.828125, "learning_rate": 7.97542131005262e-06, "loss": 0.97121277, "memory(GiB)": 728.98, "step": 26195, "train_speed(iter/s)": 0.414746 }, { "acc": 0.76167154, "epoch": 0.6646382937568077, "grad_norm": 4.03125, "learning_rate": 7.974578481734296e-06, "loss": 0.89758701, "memory(GiB)": 728.98, "step": 26200, "train_speed(iter/s)": 0.41434 }, { "acc": 0.77324843, "epoch": 0.6647651331258453, "grad_norm": 3.015625, "learning_rate": 7.973735522573741e-06, "loss": 0.86762896, "memory(GiB)": 728.98, "step": 26205, "train_speed(iter/s)": 0.413989 }, { "acc": 0.74688001, "epoch": 0.6648919724948829, "grad_norm": 3.328125, "learning_rate": 7.97289243260804e-06, "loss": 0.91243439, "memory(GiB)": 728.98, "step": 26210, "train_speed(iter/s)": 0.41358 }, { "acc": 0.76060328, "epoch": 0.6650188118639204, "grad_norm": 4.03125, "learning_rate": 7.972049211874268e-06, "loss": 0.91896973, "memory(GiB)": 728.98, "step": 26215, "train_speed(iter/s)": 0.413224 }, { "acc": 0.75756898, "epoch": 0.665145651232958, "grad_norm": 3.390625, "learning_rate": 7.971205860409523e-06, "loss": 0.88998308, "memory(GiB)": 728.98, "step": 26220, "train_speed(iter/s)": 0.412894 }, { "acc": 0.76677437, "epoch": 0.6652724906019954, "grad_norm": 3.296875, "learning_rate": 7.9703623782509e-06, "loss": 0.90587358, "memory(GiB)": 728.98, "step": 26225, "train_speed(iter/s)": 0.412552 }, { "acc": 0.76198649, "epoch": 0.665399329971033, "grad_norm": 3.703125, "learning_rate": 7.969518765435498e-06, "loss": 0.93587332, "memory(GiB)": 728.98, "step": 26230, "train_speed(iter/s)": 0.41219 }, { "acc": 0.76633883, "epoch": 0.6655261693400706, "grad_norm": 4.03125, "learning_rate": 7.968675022000428e-06, "loss": 0.90150242, "memory(GiB)": 728.98, "step": 26235, "train_speed(iter/s)": 0.411863 }, { "acc": 0.77850461, "epoch": 0.6656530087091082, "grad_norm": 3.546875, "learning_rate": 7.9678311479828e-06, "loss": 0.8521018, "memory(GiB)": 728.98, "step": 26240, "train_speed(iter/s)": 0.411516 }, { "acc": 0.75134664, "epoch": 0.6657798480781457, "grad_norm": 4.0625, "learning_rate": 7.96698714341974e-06, "loss": 0.91410818, "memory(GiB)": 728.98, "step": 26245, "train_speed(iter/s)": 0.411141 }, { "acc": 0.7707243, "epoch": 0.6659066874471833, "grad_norm": 3.453125, "learning_rate": 7.966143008348366e-06, "loss": 0.89746399, "memory(GiB)": 728.98, "step": 26250, "train_speed(iter/s)": 0.410776 }, { "acc": 0.75744314, "epoch": 0.6660335268162209, "grad_norm": 3.203125, "learning_rate": 7.965298742805813e-06, "loss": 0.95211258, "memory(GiB)": 728.98, "step": 26255, "train_speed(iter/s)": 0.410425 }, { "acc": 0.76371903, "epoch": 0.6661603661852584, "grad_norm": 3.53125, "learning_rate": 7.964454346829216e-06, "loss": 0.88759155, "memory(GiB)": 728.98, "step": 26260, "train_speed(iter/s)": 0.410088 }, { "acc": 0.77379193, "epoch": 0.666287205554296, "grad_norm": 3.640625, "learning_rate": 7.963609820455717e-06, "loss": 0.86151495, "memory(GiB)": 728.98, "step": 26265, "train_speed(iter/s)": 0.409742 }, { "acc": 0.76601024, "epoch": 0.6664140449233336, "grad_norm": 3.34375, "learning_rate": 7.962765163722465e-06, "loss": 0.86542101, "memory(GiB)": 728.98, "step": 26270, "train_speed(iter/s)": 0.409324 }, { "acc": 0.75031214, "epoch": 0.6665408842923711, "grad_norm": 3.984375, "learning_rate": 7.961920376666614e-06, "loss": 0.99374056, "memory(GiB)": 728.98, "step": 26275, "train_speed(iter/s)": 0.408982 }, { "acc": 0.76658216, "epoch": 0.6666677236614087, "grad_norm": 3.265625, "learning_rate": 7.961075459325322e-06, "loss": 0.88826199, "memory(GiB)": 728.98, "step": 26280, "train_speed(iter/s)": 0.408633 }, { "acc": 0.76319575, "epoch": 0.6667945630304462, "grad_norm": 3.296875, "learning_rate": 7.960230411735757e-06, "loss": 0.89639854, "memory(GiB)": 728.98, "step": 26285, "train_speed(iter/s)": 0.408215 }, { "acc": 0.75852232, "epoch": 0.6669214023994837, "grad_norm": 3.015625, "learning_rate": 7.959385233935087e-06, "loss": 0.89199162, "memory(GiB)": 728.98, "step": 26290, "train_speed(iter/s)": 0.407869 }, { "acc": 0.76354842, "epoch": 0.6670482417685213, "grad_norm": 3.59375, "learning_rate": 7.958539925960489e-06, "loss": 0.90995836, "memory(GiB)": 728.98, "step": 26295, "train_speed(iter/s)": 0.40753 }, { "acc": 0.75515604, "epoch": 0.6671750811375589, "grad_norm": 3.21875, "learning_rate": 7.957694487849148e-06, "loss": 0.9358367, "memory(GiB)": 728.98, "step": 26300, "train_speed(iter/s)": 0.407183 }, { "acc": 0.76781006, "epoch": 0.6673019205065964, "grad_norm": 3.734375, "learning_rate": 7.956848919638248e-06, "loss": 0.86097736, "memory(GiB)": 728.98, "step": 26305, "train_speed(iter/s)": 0.406878 }, { "acc": 0.76625032, "epoch": 0.667428759875634, "grad_norm": 3.234375, "learning_rate": 7.956003221364986e-06, "loss": 0.86205378, "memory(GiB)": 728.98, "step": 26310, "train_speed(iter/s)": 0.406542 }, { "acc": 0.770296, "epoch": 0.6675555992446716, "grad_norm": 3.25, "learning_rate": 7.955157393066561e-06, "loss": 0.88732224, "memory(GiB)": 728.98, "step": 26315, "train_speed(iter/s)": 0.406204 }, { "acc": 0.76556478, "epoch": 0.6676824386137091, "grad_norm": 2.796875, "learning_rate": 7.954311434780178e-06, "loss": 0.85207014, "memory(GiB)": 728.98, "step": 26320, "train_speed(iter/s)": 0.405886 }, { "acc": 0.76656532, "epoch": 0.6678092779827467, "grad_norm": 4.0625, "learning_rate": 7.953465346543048e-06, "loss": 0.93410244, "memory(GiB)": 728.98, "step": 26325, "train_speed(iter/s)": 0.405559 }, { "acc": 0.75400362, "epoch": 0.6679361173517843, "grad_norm": 4.53125, "learning_rate": 7.952619128392388e-06, "loss": 0.93071899, "memory(GiB)": 728.98, "step": 26330, "train_speed(iter/s)": 0.405244 }, { "acc": 0.76041017, "epoch": 0.6680629567208218, "grad_norm": 3.359375, "learning_rate": 7.951772780365419e-06, "loss": 0.92038631, "memory(GiB)": 728.98, "step": 26335, "train_speed(iter/s)": 0.404901 }, { "acc": 0.76849914, "epoch": 0.6681897960898594, "grad_norm": 3.0625, "learning_rate": 7.95092630249937e-06, "loss": 0.82213831, "memory(GiB)": 728.98, "step": 26340, "train_speed(iter/s)": 0.40455 }, { "acc": 0.76045251, "epoch": 0.6683166354588969, "grad_norm": 4.5, "learning_rate": 7.950079694831477e-06, "loss": 0.9322547, "memory(GiB)": 728.98, "step": 26345, "train_speed(iter/s)": 0.40425 }, { "acc": 0.77382212, "epoch": 0.6684434748279344, "grad_norm": 3.5, "learning_rate": 7.949232957398978e-06, "loss": 0.84593773, "memory(GiB)": 728.98, "step": 26350, "train_speed(iter/s)": 0.403917 }, { "acc": 0.75498099, "epoch": 0.668570314196972, "grad_norm": 3.71875, "learning_rate": 7.948386090239116e-06, "loss": 0.92634611, "memory(GiB)": 728.98, "step": 26355, "train_speed(iter/s)": 0.403612 }, { "acc": 0.76176934, "epoch": 0.6686971535660096, "grad_norm": 4.15625, "learning_rate": 7.947539093389145e-06, "loss": 0.91619253, "memory(GiB)": 728.98, "step": 26360, "train_speed(iter/s)": 0.403293 }, { "acc": 0.74675236, "epoch": 0.6688239929350471, "grad_norm": 3.78125, "learning_rate": 7.94669196688632e-06, "loss": 0.90863914, "memory(GiB)": 728.98, "step": 26365, "train_speed(iter/s)": 0.402929 }, { "acc": 0.74893665, "epoch": 0.6689508323040847, "grad_norm": 3.53125, "learning_rate": 7.945844710767906e-06, "loss": 0.93797779, "memory(GiB)": 728.98, "step": 26370, "train_speed(iter/s)": 0.402567 }, { "acc": 0.7644331, "epoch": 0.6690776716731223, "grad_norm": 3.46875, "learning_rate": 7.944997325071168e-06, "loss": 0.89851675, "memory(GiB)": 728.98, "step": 26375, "train_speed(iter/s)": 0.402242 }, { "acc": 0.772925, "epoch": 0.6692045110421598, "grad_norm": 2.859375, "learning_rate": 7.944149809833381e-06, "loss": 0.87032804, "memory(GiB)": 728.98, "step": 26380, "train_speed(iter/s)": 0.401895 }, { "acc": 0.76089892, "epoch": 0.6693313504111974, "grad_norm": 3.53125, "learning_rate": 7.943302165091823e-06, "loss": 0.94240456, "memory(GiB)": 728.98, "step": 26385, "train_speed(iter/s)": 0.401545 }, { "acc": 0.76629214, "epoch": 0.669458189780235, "grad_norm": 3.78125, "learning_rate": 7.942454390883782e-06, "loss": 0.93551083, "memory(GiB)": 728.98, "step": 26390, "train_speed(iter/s)": 0.40124 }, { "acc": 0.77147264, "epoch": 0.6695850291492725, "grad_norm": 3.21875, "learning_rate": 7.94160648724655e-06, "loss": 0.88545618, "memory(GiB)": 728.98, "step": 26395, "train_speed(iter/s)": 0.400862 }, { "acc": 0.77324257, "epoch": 0.6697118685183101, "grad_norm": 3.34375, "learning_rate": 7.940758454217417e-06, "loss": 0.88855076, "memory(GiB)": 728.98, "step": 26400, "train_speed(iter/s)": 0.400531 }, { "acc": 0.75995693, "epoch": 0.6698387078873476, "grad_norm": 3.578125, "learning_rate": 7.939910291833691e-06, "loss": 0.90470114, "memory(GiB)": 728.98, "step": 26405, "train_speed(iter/s)": 0.400213 }, { "acc": 0.77373347, "epoch": 0.6699655472563851, "grad_norm": 3.234375, "learning_rate": 7.939062000132679e-06, "loss": 0.88745079, "memory(GiB)": 728.98, "step": 26410, "train_speed(iter/s)": 0.399894 }, { "acc": 0.75489807, "epoch": 0.6700923866254227, "grad_norm": 3.71875, "learning_rate": 7.938213579151695e-06, "loss": 0.94405069, "memory(GiB)": 728.98, "step": 26415, "train_speed(iter/s)": 0.399533 }, { "acc": 0.77042699, "epoch": 0.6702192259944603, "grad_norm": 3.265625, "learning_rate": 7.937365028928054e-06, "loss": 0.86872549, "memory(GiB)": 728.98, "step": 26420, "train_speed(iter/s)": 0.399197 }, { "acc": 0.75438805, "epoch": 0.6703460653634978, "grad_norm": 4.0625, "learning_rate": 7.936516349499089e-06, "loss": 0.91330805, "memory(GiB)": 728.98, "step": 26425, "train_speed(iter/s)": 0.398897 }, { "acc": 0.77175946, "epoch": 0.6704729047325354, "grad_norm": 3.640625, "learning_rate": 7.935667540902122e-06, "loss": 0.86777496, "memory(GiB)": 728.98, "step": 26430, "train_speed(iter/s)": 0.398615 }, { "acc": 0.77035508, "epoch": 0.670599744101573, "grad_norm": 3.578125, "learning_rate": 7.934818603174496e-06, "loss": 0.82535191, "memory(GiB)": 728.98, "step": 26435, "train_speed(iter/s)": 0.39824 }, { "acc": 0.75948367, "epoch": 0.6707265834706105, "grad_norm": 3.25, "learning_rate": 7.93396953635355e-06, "loss": 0.93921289, "memory(GiB)": 728.98, "step": 26440, "train_speed(iter/s)": 0.397934 }, { "acc": 0.7756484, "epoch": 0.6708534228396481, "grad_norm": 3.34375, "learning_rate": 7.933120340476631e-06, "loss": 0.86584492, "memory(GiB)": 728.98, "step": 26445, "train_speed(iter/s)": 0.397551 }, { "acc": 0.75937524, "epoch": 0.6709802622086857, "grad_norm": 3.140625, "learning_rate": 7.932271015581094e-06, "loss": 0.92853985, "memory(GiB)": 728.98, "step": 26450, "train_speed(iter/s)": 0.397224 }, { "acc": 0.76670904, "epoch": 0.6711071015777232, "grad_norm": 3.46875, "learning_rate": 7.9314215617043e-06, "loss": 0.83850145, "memory(GiB)": 728.98, "step": 26455, "train_speed(iter/s)": 0.396915 }, { "acc": 0.77199125, "epoch": 0.6712339409467608, "grad_norm": 3.234375, "learning_rate": 7.93057197888361e-06, "loss": 0.88840294, "memory(GiB)": 728.98, "step": 26460, "train_speed(iter/s)": 0.396581 }, { "acc": 0.7581759, "epoch": 0.6713607803157983, "grad_norm": 4.375, "learning_rate": 7.929722267156395e-06, "loss": 0.91882086, "memory(GiB)": 728.98, "step": 26465, "train_speed(iter/s)": 0.396205 }, { "acc": 0.7601079, "epoch": 0.6714876196848358, "grad_norm": 3.859375, "learning_rate": 7.928872426560034e-06, "loss": 0.95587101, "memory(GiB)": 728.98, "step": 26470, "train_speed(iter/s)": 0.395869 }, { "acc": 0.76077881, "epoch": 0.6716144590538734, "grad_norm": 3.28125, "learning_rate": 7.928022457131906e-06, "loss": 0.9127141, "memory(GiB)": 728.98, "step": 26475, "train_speed(iter/s)": 0.395542 }, { "acc": 0.77697668, "epoch": 0.671741298422911, "grad_norm": 4.0625, "learning_rate": 7.927172358909399e-06, "loss": 0.92853622, "memory(GiB)": 728.98, "step": 26480, "train_speed(iter/s)": 0.395232 }, { "acc": 0.76303535, "epoch": 0.6718681377919485, "grad_norm": 3.796875, "learning_rate": 7.926322131929907e-06, "loss": 0.91652765, "memory(GiB)": 728.98, "step": 26485, "train_speed(iter/s)": 0.394884 }, { "acc": 0.77329054, "epoch": 0.6719949771609861, "grad_norm": 3.90625, "learning_rate": 7.925471776230828e-06, "loss": 0.88397169, "memory(GiB)": 728.98, "step": 26490, "train_speed(iter/s)": 0.394525 }, { "acc": 0.76930356, "epoch": 0.6721218165300237, "grad_norm": 3.3125, "learning_rate": 7.924621291849569e-06, "loss": 0.89765768, "memory(GiB)": 728.98, "step": 26495, "train_speed(iter/s)": 0.394228 }, { "acc": 0.75006814, "epoch": 0.6722486558990612, "grad_norm": 3.09375, "learning_rate": 7.923770678823535e-06, "loss": 0.9813941, "memory(GiB)": 728.98, "step": 26500, "train_speed(iter/s)": 0.393881 }, { "epoch": 0.6722486558990612, "eval_acc": 0.7547354968945267, "eval_loss": 0.8710131645202637, "eval_runtime": 1151.4562, "eval_samples_per_second": 5.532, "eval_steps_per_second": 5.532, "step": 26500 }, { "acc": 0.77729998, "epoch": 0.6723754952680988, "grad_norm": 3.484375, "learning_rate": 7.922919937190145e-06, "loss": 0.86770124, "memory(GiB)": 728.98, "step": 26505, "train_speed(iter/s)": 0.38287 }, { "acc": 0.76998687, "epoch": 0.6725023346371364, "grad_norm": 3.578125, "learning_rate": 7.92206906698682e-06, "loss": 0.9103673, "memory(GiB)": 728.98, "step": 26510, "train_speed(iter/s)": 0.382585 }, { "acc": 0.76700945, "epoch": 0.672629174006174, "grad_norm": 3.015625, "learning_rate": 7.921218068250986e-06, "loss": 0.90473909, "memory(GiB)": 728.98, "step": 26515, "train_speed(iter/s)": 0.382275 }, { "acc": 0.77044272, "epoch": 0.6727560133752115, "grad_norm": 3.890625, "learning_rate": 7.920366941020079e-06, "loss": 0.85933371, "memory(GiB)": 728.98, "step": 26520, "train_speed(iter/s)": 0.381996 }, { "acc": 0.77375703, "epoch": 0.672882852744249, "grad_norm": 3.96875, "learning_rate": 7.919515685331534e-06, "loss": 0.8875576, "memory(GiB)": 728.98, "step": 26525, "train_speed(iter/s)": 0.381731 }, { "acc": 0.77159491, "epoch": 0.6730096921132865, "grad_norm": 3.8125, "learning_rate": 7.918664301222796e-06, "loss": 0.94098997, "memory(GiB)": 728.98, "step": 26530, "train_speed(iter/s)": 0.381442 }, { "acc": 0.75620985, "epoch": 0.6731365314823241, "grad_norm": 3.6875, "learning_rate": 7.917812788731316e-06, "loss": 0.92277765, "memory(GiB)": 728.98, "step": 26535, "train_speed(iter/s)": 0.381168 }, { "acc": 0.74703641, "epoch": 0.6732633708513617, "grad_norm": 3.9375, "learning_rate": 7.916961147894547e-06, "loss": 0.97651901, "memory(GiB)": 728.98, "step": 26540, "train_speed(iter/s)": 0.380828 }, { "acc": 0.76348128, "epoch": 0.6733902102203992, "grad_norm": 4.03125, "learning_rate": 7.91610937874995e-06, "loss": 0.92187214, "memory(GiB)": 728.98, "step": 26545, "train_speed(iter/s)": 0.38054 }, { "acc": 0.7672493, "epoch": 0.6735170495894368, "grad_norm": 3.453125, "learning_rate": 7.915257481334994e-06, "loss": 0.89142838, "memory(GiB)": 728.98, "step": 26550, "train_speed(iter/s)": 0.380258 }, { "acc": 0.76123104, "epoch": 0.6736438889584744, "grad_norm": 5.9375, "learning_rate": 7.91440545568715e-06, "loss": 0.91264591, "memory(GiB)": 728.98, "step": 26555, "train_speed(iter/s)": 0.379986 }, { "acc": 0.77969756, "epoch": 0.673770728327512, "grad_norm": 3.421875, "learning_rate": 7.913553301843895e-06, "loss": 0.8496541, "memory(GiB)": 728.98, "step": 26560, "train_speed(iter/s)": 0.379733 }, { "acc": 0.7606987, "epoch": 0.6738975676965495, "grad_norm": 3.46875, "learning_rate": 7.912701019842715e-06, "loss": 0.92493172, "memory(GiB)": 728.98, "step": 26565, "train_speed(iter/s)": 0.379433 }, { "acc": 0.75498352, "epoch": 0.6740244070655871, "grad_norm": 3.265625, "learning_rate": 7.911848609721096e-06, "loss": 0.92867317, "memory(GiB)": 728.98, "step": 26570, "train_speed(iter/s)": 0.379131 }, { "acc": 0.75591283, "epoch": 0.6741512464346247, "grad_norm": 3.59375, "learning_rate": 7.910996071516535e-06, "loss": 0.92541189, "memory(GiB)": 728.98, "step": 26575, "train_speed(iter/s)": 0.378856 }, { "acc": 0.76197882, "epoch": 0.6742780858036622, "grad_norm": 3.40625, "learning_rate": 7.910143405266532e-06, "loss": 0.89375343, "memory(GiB)": 728.98, "step": 26580, "train_speed(iter/s)": 0.378515 }, { "acc": 0.76188531, "epoch": 0.6744049251726997, "grad_norm": 3.359375, "learning_rate": 7.909290611008594e-06, "loss": 0.91084194, "memory(GiB)": 728.98, "step": 26585, "train_speed(iter/s)": 0.378204 }, { "acc": 0.76472406, "epoch": 0.6745317645417372, "grad_norm": 3.234375, "learning_rate": 7.90843768878023e-06, "loss": 0.86322708, "memory(GiB)": 728.98, "step": 26590, "train_speed(iter/s)": 0.377924 }, { "acc": 0.76806464, "epoch": 0.6746586039107748, "grad_norm": 3.3125, "learning_rate": 7.907584638618959e-06, "loss": 0.87556095, "memory(GiB)": 728.98, "step": 26595, "train_speed(iter/s)": 0.377657 }, { "acc": 0.76648078, "epoch": 0.6747854432798124, "grad_norm": 3.796875, "learning_rate": 7.906731460562305e-06, "loss": 0.89372292, "memory(GiB)": 728.98, "step": 26600, "train_speed(iter/s)": 0.377404 }, { "acc": 0.77239914, "epoch": 0.67491228264885, "grad_norm": 3.546875, "learning_rate": 7.905878154647794e-06, "loss": 0.8841609, "memory(GiB)": 728.98, "step": 26605, "train_speed(iter/s)": 0.377077 }, { "acc": 0.76347475, "epoch": 0.6750391220178875, "grad_norm": 3.71875, "learning_rate": 7.905024720912962e-06, "loss": 0.91705542, "memory(GiB)": 728.98, "step": 26610, "train_speed(iter/s)": 0.376785 }, { "acc": 0.76133227, "epoch": 0.6751659613869251, "grad_norm": 2.921875, "learning_rate": 7.90417115939535e-06, "loss": 0.93761387, "memory(GiB)": 728.98, "step": 26615, "train_speed(iter/s)": 0.376495 }, { "acc": 0.7653224, "epoch": 0.6752928007559627, "grad_norm": 3.296875, "learning_rate": 7.9033174701325e-06, "loss": 0.90890322, "memory(GiB)": 728.98, "step": 26620, "train_speed(iter/s)": 0.376236 }, { "acc": 0.77271786, "epoch": 0.6754196401250002, "grad_norm": 3.828125, "learning_rate": 7.902463653161967e-06, "loss": 0.87372923, "memory(GiB)": 728.98, "step": 26625, "train_speed(iter/s)": 0.37593 }, { "acc": 0.77084379, "epoch": 0.6755464794940378, "grad_norm": 3.765625, "learning_rate": 7.901609708521304e-06, "loss": 0.88720236, "memory(GiB)": 728.98, "step": 26630, "train_speed(iter/s)": 0.375628 }, { "acc": 0.76485066, "epoch": 0.6756733188630754, "grad_norm": 3.65625, "learning_rate": 7.900755636248076e-06, "loss": 0.86000795, "memory(GiB)": 728.98, "step": 26635, "train_speed(iter/s)": 0.375341 }, { "acc": 0.75879335, "epoch": 0.6758001582321129, "grad_norm": 4.375, "learning_rate": 7.89990143637985e-06, "loss": 0.95829487, "memory(GiB)": 728.98, "step": 26640, "train_speed(iter/s)": 0.375087 }, { "acc": 0.74939361, "epoch": 0.6759269976011504, "grad_norm": 3.765625, "learning_rate": 7.899047108954199e-06, "loss": 0.94880705, "memory(GiB)": 728.98, "step": 26645, "train_speed(iter/s)": 0.374812 }, { "acc": 0.75980039, "epoch": 0.676053836970188, "grad_norm": 3.5, "learning_rate": 7.898192654008702e-06, "loss": 0.88873978, "memory(GiB)": 728.98, "step": 26650, "train_speed(iter/s)": 0.37451 }, { "acc": 0.76153054, "epoch": 0.6761806763392255, "grad_norm": 3.796875, "learning_rate": 7.897338071580944e-06, "loss": 0.94341717, "memory(GiB)": 728.98, "step": 26655, "train_speed(iter/s)": 0.374225 }, { "acc": 0.75094275, "epoch": 0.6763075157082631, "grad_norm": 3.109375, "learning_rate": 7.896483361708518e-06, "loss": 0.9079174, "memory(GiB)": 728.98, "step": 26660, "train_speed(iter/s)": 0.37395 }, { "acc": 0.76122456, "epoch": 0.6764343550773007, "grad_norm": 3.171875, "learning_rate": 7.895628524429015e-06, "loss": 0.90365782, "memory(GiB)": 728.98, "step": 26665, "train_speed(iter/s)": 0.37368 }, { "acc": 0.76528883, "epoch": 0.6765611944463382, "grad_norm": 4.59375, "learning_rate": 7.894773559780042e-06, "loss": 0.90400133, "memory(GiB)": 728.98, "step": 26670, "train_speed(iter/s)": 0.373407 }, { "acc": 0.77297812, "epoch": 0.6766880338153758, "grad_norm": 3.203125, "learning_rate": 7.893918467799202e-06, "loss": 0.90209484, "memory(GiB)": 728.98, "step": 26675, "train_speed(iter/s)": 0.373086 }, { "acc": 0.75158443, "epoch": 0.6768148731844134, "grad_norm": 2.875, "learning_rate": 7.893063248524108e-06, "loss": 0.9439271, "memory(GiB)": 728.98, "step": 26680, "train_speed(iter/s)": 0.372805 }, { "acc": 0.77749839, "epoch": 0.6769417125534509, "grad_norm": 3.375, "learning_rate": 7.892207901992381e-06, "loss": 0.84146557, "memory(GiB)": 728.98, "step": 26685, "train_speed(iter/s)": 0.372555 }, { "acc": 0.76366348, "epoch": 0.6770685519224885, "grad_norm": 3.546875, "learning_rate": 7.891352428241643e-06, "loss": 0.91698246, "memory(GiB)": 728.98, "step": 26690, "train_speed(iter/s)": 0.372269 }, { "acc": 0.76852431, "epoch": 0.6771953912915261, "grad_norm": 3.703125, "learning_rate": 7.890496827309523e-06, "loss": 0.91475201, "memory(GiB)": 728.98, "step": 26695, "train_speed(iter/s)": 0.372018 }, { "acc": 0.7622745, "epoch": 0.6773222306605636, "grad_norm": 3.71875, "learning_rate": 7.88964109923366e-06, "loss": 0.88314142, "memory(GiB)": 728.98, "step": 26700, "train_speed(iter/s)": 0.371765 }, { "acc": 0.76384854, "epoch": 0.6774490700296011, "grad_norm": 3.375, "learning_rate": 7.88878524405169e-06, "loss": 0.90856256, "memory(GiB)": 728.98, "step": 26705, "train_speed(iter/s)": 0.37148 }, { "acc": 0.76738653, "epoch": 0.6775759093986387, "grad_norm": 3.53125, "learning_rate": 7.887929261801262e-06, "loss": 0.92767649, "memory(GiB)": 728.98, "step": 26710, "train_speed(iter/s)": 0.371206 }, { "acc": 0.7540875, "epoch": 0.6777027487676762, "grad_norm": 3.1875, "learning_rate": 7.887073152520028e-06, "loss": 0.9718791, "memory(GiB)": 728.98, "step": 26715, "train_speed(iter/s)": 0.370912 }, { "acc": 0.77152414, "epoch": 0.6778295881367138, "grad_norm": 3.71875, "learning_rate": 7.886216916245644e-06, "loss": 0.84602842, "memory(GiB)": 728.98, "step": 26720, "train_speed(iter/s)": 0.370647 }, { "acc": 0.76981115, "epoch": 0.6779564275057514, "grad_norm": 3.453125, "learning_rate": 7.885360553015775e-06, "loss": 0.86321125, "memory(GiB)": 728.98, "step": 26725, "train_speed(iter/s)": 0.37031 }, { "acc": 0.776614, "epoch": 0.6780832668747889, "grad_norm": 2.71875, "learning_rate": 7.884504062868086e-06, "loss": 0.84947023, "memory(GiB)": 728.98, "step": 26730, "train_speed(iter/s)": 0.370025 }, { "acc": 0.75862465, "epoch": 0.6782101062438265, "grad_norm": 3.1875, "learning_rate": 7.883647445840256e-06, "loss": 0.94112377, "memory(GiB)": 728.98, "step": 26735, "train_speed(iter/s)": 0.369763 }, { "acc": 0.76328692, "epoch": 0.6783369456128641, "grad_norm": 3.34375, "learning_rate": 7.882790701969962e-06, "loss": 0.92871342, "memory(GiB)": 728.98, "step": 26740, "train_speed(iter/s)": 0.369504 }, { "acc": 0.76966076, "epoch": 0.6784637849819016, "grad_norm": 4.1875, "learning_rate": 7.88193383129489e-06, "loss": 0.88221684, "memory(GiB)": 728.98, "step": 26745, "train_speed(iter/s)": 0.369232 }, { "acc": 0.77101231, "epoch": 0.6785906243509392, "grad_norm": 3.328125, "learning_rate": 7.881076833852733e-06, "loss": 0.88482809, "memory(GiB)": 728.98, "step": 26750, "train_speed(iter/s)": 0.368945 }, { "acc": 0.76018543, "epoch": 0.6787174637199768, "grad_norm": 4.6875, "learning_rate": 7.880219709681183e-06, "loss": 0.93675842, "memory(GiB)": 728.98, "step": 26755, "train_speed(iter/s)": 0.36865 }, { "acc": 0.77965975, "epoch": 0.6788443030890143, "grad_norm": 3.453125, "learning_rate": 7.879362458817946e-06, "loss": 0.90963383, "memory(GiB)": 728.98, "step": 26760, "train_speed(iter/s)": 0.368377 }, { "acc": 0.74989743, "epoch": 0.6789711424580518, "grad_norm": 4.15625, "learning_rate": 7.878505081300729e-06, "loss": 0.97244473, "memory(GiB)": 728.98, "step": 26765, "train_speed(iter/s)": 0.368119 }, { "acc": 0.76923223, "epoch": 0.6790979818270894, "grad_norm": 3.484375, "learning_rate": 7.877647577167245e-06, "loss": 0.90863647, "memory(GiB)": 728.98, "step": 26770, "train_speed(iter/s)": 0.367849 }, { "acc": 0.78140569, "epoch": 0.6792248211961269, "grad_norm": 3.125, "learning_rate": 7.876789946455212e-06, "loss": 0.83707542, "memory(GiB)": 728.98, "step": 26775, "train_speed(iter/s)": 0.367543 }, { "acc": 0.7690155, "epoch": 0.6793516605651645, "grad_norm": 3.46875, "learning_rate": 7.875932189202355e-06, "loss": 0.86044531, "memory(GiB)": 728.98, "step": 26780, "train_speed(iter/s)": 0.367273 }, { "acc": 0.7573411, "epoch": 0.6794784999342021, "grad_norm": 3.5625, "learning_rate": 7.875074305446405e-06, "loss": 0.97341909, "memory(GiB)": 728.98, "step": 26785, "train_speed(iter/s)": 0.366995 }, { "acc": 0.75919075, "epoch": 0.6796053393032396, "grad_norm": 2.984375, "learning_rate": 7.874216295225098e-06, "loss": 0.92468529, "memory(GiB)": 728.98, "step": 26790, "train_speed(iter/s)": 0.366717 }, { "acc": 0.76783166, "epoch": 0.6797321786722772, "grad_norm": 3.5625, "learning_rate": 7.873358158576172e-06, "loss": 0.92145281, "memory(GiB)": 728.98, "step": 26795, "train_speed(iter/s)": 0.366421 }, { "acc": 0.7741406, "epoch": 0.6798590180413148, "grad_norm": 3.5625, "learning_rate": 7.872499895537378e-06, "loss": 0.87287159, "memory(GiB)": 728.98, "step": 26800, "train_speed(iter/s)": 0.366149 }, { "acc": 0.77040558, "epoch": 0.6799858574103523, "grad_norm": 3.671875, "learning_rate": 7.871641506146464e-06, "loss": 0.8682765, "memory(GiB)": 728.98, "step": 26805, "train_speed(iter/s)": 0.365895 }, { "acc": 0.76824918, "epoch": 0.6801126967793899, "grad_norm": 3.890625, "learning_rate": 7.87078299044119e-06, "loss": 0.87525978, "memory(GiB)": 728.98, "step": 26810, "train_speed(iter/s)": 0.365634 }, { "acc": 0.74978952, "epoch": 0.6802395361484275, "grad_norm": 3.265625, "learning_rate": 7.869924348459318e-06, "loss": 0.9628108, "memory(GiB)": 728.98, "step": 26815, "train_speed(iter/s)": 0.365373 }, { "acc": 0.75962958, "epoch": 0.680366375517465, "grad_norm": 3.671875, "learning_rate": 7.869065580238621e-06, "loss": 0.9128767, "memory(GiB)": 728.98, "step": 26820, "train_speed(iter/s)": 0.365094 }, { "acc": 0.74570546, "epoch": 0.6804932148865025, "grad_norm": 3.78125, "learning_rate": 7.868206685816868e-06, "loss": 0.90977278, "memory(GiB)": 728.98, "step": 26825, "train_speed(iter/s)": 0.364853 }, { "acc": 0.76012964, "epoch": 0.6806200542555401, "grad_norm": 3.8125, "learning_rate": 7.867347665231844e-06, "loss": 0.91079807, "memory(GiB)": 728.98, "step": 26830, "train_speed(iter/s)": 0.364605 }, { "acc": 0.78775678, "epoch": 0.6807468936245776, "grad_norm": 3.78125, "learning_rate": 7.86648851852133e-06, "loss": 0.84602585, "memory(GiB)": 728.98, "step": 26835, "train_speed(iter/s)": 0.364358 }, { "acc": 0.75679755, "epoch": 0.6808737329936152, "grad_norm": 3.8125, "learning_rate": 7.865629245723119e-06, "loss": 0.9476408, "memory(GiB)": 728.98, "step": 26840, "train_speed(iter/s)": 0.3641 }, { "acc": 0.77311521, "epoch": 0.6810005723626528, "grad_norm": 3.96875, "learning_rate": 7.86476984687501e-06, "loss": 0.84485188, "memory(GiB)": 728.98, "step": 26845, "train_speed(iter/s)": 0.363824 }, { "acc": 0.76143537, "epoch": 0.6811274117316903, "grad_norm": 3.40625, "learning_rate": 7.863910322014801e-06, "loss": 0.89554567, "memory(GiB)": 728.98, "step": 26850, "train_speed(iter/s)": 0.363538 }, { "acc": 0.77069626, "epoch": 0.6812542511007279, "grad_norm": 3.390625, "learning_rate": 7.863050671180304e-06, "loss": 0.84296579, "memory(GiB)": 728.98, "step": 26855, "train_speed(iter/s)": 0.363272 }, { "acc": 0.76337686, "epoch": 0.6813810904697655, "grad_norm": 3.953125, "learning_rate": 7.862190894409328e-06, "loss": 0.85378265, "memory(GiB)": 728.98, "step": 26860, "train_speed(iter/s)": 0.363019 }, { "acc": 0.77151051, "epoch": 0.681507929838803, "grad_norm": 3.609375, "learning_rate": 7.861330991739696e-06, "loss": 0.87362947, "memory(GiB)": 728.98, "step": 26865, "train_speed(iter/s)": 0.362766 }, { "acc": 0.76762938, "epoch": 0.6816347692078406, "grad_norm": 3.28125, "learning_rate": 7.860470963209228e-06, "loss": 0.86731119, "memory(GiB)": 728.98, "step": 26870, "train_speed(iter/s)": 0.36252 }, { "acc": 0.76422644, "epoch": 0.6817616085768782, "grad_norm": 3.546875, "learning_rate": 7.859610808855759e-06, "loss": 0.88131313, "memory(GiB)": 728.98, "step": 26875, "train_speed(iter/s)": 0.362243 }, { "acc": 0.76262012, "epoch": 0.6818884479459157, "grad_norm": 3.1875, "learning_rate": 7.85875052871712e-06, "loss": 0.88109274, "memory(GiB)": 728.98, "step": 26880, "train_speed(iter/s)": 0.361961 }, { "acc": 0.7612864, "epoch": 0.6820152873149532, "grad_norm": 3.625, "learning_rate": 7.857890122831155e-06, "loss": 0.93210001, "memory(GiB)": 728.98, "step": 26885, "train_speed(iter/s)": 0.361683 }, { "acc": 0.7547276, "epoch": 0.6821421266839908, "grad_norm": 3.484375, "learning_rate": 7.85702959123571e-06, "loss": 0.92523012, "memory(GiB)": 728.98, "step": 26890, "train_speed(iter/s)": 0.361416 }, { "acc": 0.76699138, "epoch": 0.6822689660530283, "grad_norm": 3.140625, "learning_rate": 7.856168933968635e-06, "loss": 0.93762522, "memory(GiB)": 728.98, "step": 26895, "train_speed(iter/s)": 0.361168 }, { "acc": 0.76426344, "epoch": 0.6823958054220659, "grad_norm": 3.203125, "learning_rate": 7.855308151067788e-06, "loss": 0.90451279, "memory(GiB)": 728.98, "step": 26900, "train_speed(iter/s)": 0.360949 }, { "acc": 0.75687189, "epoch": 0.6825226447911035, "grad_norm": 3.46875, "learning_rate": 7.854447242571035e-06, "loss": 0.9190258, "memory(GiB)": 728.98, "step": 26905, "train_speed(iter/s)": 0.360688 }, { "acc": 0.7723485, "epoch": 0.682649484160141, "grad_norm": 4.5, "learning_rate": 7.85358620851624e-06, "loss": 0.89750757, "memory(GiB)": 728.98, "step": 26910, "train_speed(iter/s)": 0.36043 }, { "acc": 0.77598596, "epoch": 0.6827763235291786, "grad_norm": 3.09375, "learning_rate": 7.852725048941283e-06, "loss": 0.84816427, "memory(GiB)": 728.98, "step": 26915, "train_speed(iter/s)": 0.360166 }, { "acc": 0.76476622, "epoch": 0.6829031628982162, "grad_norm": 3.640625, "learning_rate": 7.851863763884038e-06, "loss": 0.92065372, "memory(GiB)": 728.98, "step": 26920, "train_speed(iter/s)": 0.359913 }, { "acc": 0.7668066, "epoch": 0.6830300022672537, "grad_norm": 4.1875, "learning_rate": 7.851002353382393e-06, "loss": 0.90730639, "memory(GiB)": 728.98, "step": 26925, "train_speed(iter/s)": 0.359646 }, { "acc": 0.77063847, "epoch": 0.6831568416362913, "grad_norm": 3.359375, "learning_rate": 7.850140817474239e-06, "loss": 0.88495884, "memory(GiB)": 728.98, "step": 26930, "train_speed(iter/s)": 0.35941 }, { "acc": 0.75379744, "epoch": 0.6832836810053289, "grad_norm": 4.03125, "learning_rate": 7.849279156197472e-06, "loss": 0.9565218, "memory(GiB)": 728.98, "step": 26935, "train_speed(iter/s)": 0.359147 }, { "acc": 0.7720799, "epoch": 0.6834105203743664, "grad_norm": 3.359375, "learning_rate": 7.848417369589993e-06, "loss": 0.89302626, "memory(GiB)": 728.98, "step": 26940, "train_speed(iter/s)": 0.358882 }, { "acc": 0.75411043, "epoch": 0.6835373597434039, "grad_norm": 4.0625, "learning_rate": 7.847555457689709e-06, "loss": 0.95271406, "memory(GiB)": 728.98, "step": 26945, "train_speed(iter/s)": 0.35864 }, { "acc": 0.75776772, "epoch": 0.6836641991124415, "grad_norm": 3.140625, "learning_rate": 7.846693420534533e-06, "loss": 0.93066053, "memory(GiB)": 728.98, "step": 26950, "train_speed(iter/s)": 0.358346 }, { "acc": 0.77030802, "epoch": 0.683791038481479, "grad_norm": 3.328125, "learning_rate": 7.845831258162385e-06, "loss": 0.89235716, "memory(GiB)": 728.98, "step": 26955, "train_speed(iter/s)": 0.358075 }, { "acc": 0.75755048, "epoch": 0.6839178778505166, "grad_norm": 3.265625, "learning_rate": 7.844968970611184e-06, "loss": 0.93814764, "memory(GiB)": 728.98, "step": 26960, "train_speed(iter/s)": 0.357819 }, { "acc": 0.76169801, "epoch": 0.6840447172195542, "grad_norm": 3.375, "learning_rate": 7.844106557918866e-06, "loss": 0.91884518, "memory(GiB)": 728.98, "step": 26965, "train_speed(iter/s)": 0.357568 }, { "acc": 0.76655831, "epoch": 0.6841715565885917, "grad_norm": 3.859375, "learning_rate": 7.843244020123362e-06, "loss": 0.8778966, "memory(GiB)": 728.98, "step": 26970, "train_speed(iter/s)": 0.357319 }, { "acc": 0.77802005, "epoch": 0.6842983959576293, "grad_norm": 3.640625, "learning_rate": 7.842381357262611e-06, "loss": 0.81082926, "memory(GiB)": 728.98, "step": 26975, "train_speed(iter/s)": 0.357092 }, { "acc": 0.75848846, "epoch": 0.6844252353266669, "grad_norm": 3.796875, "learning_rate": 7.841518569374561e-06, "loss": 0.9850893, "memory(GiB)": 728.98, "step": 26980, "train_speed(iter/s)": 0.356841 }, { "acc": 0.77369938, "epoch": 0.6845520746957044, "grad_norm": 3.546875, "learning_rate": 7.840655656497163e-06, "loss": 0.81613874, "memory(GiB)": 728.98, "step": 26985, "train_speed(iter/s)": 0.356587 }, { "acc": 0.75912662, "epoch": 0.684678914064742, "grad_norm": 3.0, "learning_rate": 7.839792618668374e-06, "loss": 0.97768431, "memory(GiB)": 728.98, "step": 26990, "train_speed(iter/s)": 0.356343 }, { "acc": 0.75985503, "epoch": 0.6848057534337796, "grad_norm": 3.640625, "learning_rate": 7.838929455926155e-06, "loss": 0.90028706, "memory(GiB)": 728.98, "step": 26995, "train_speed(iter/s)": 0.356098 }, { "acc": 0.76391349, "epoch": 0.6849325928028172, "grad_norm": 3.53125, "learning_rate": 7.838066168308476e-06, "loss": 0.9242012, "memory(GiB)": 728.98, "step": 27000, "train_speed(iter/s)": 0.35586 }, { "epoch": 0.6849325928028172, "eval_acc": 0.7548779555470516, "eval_loss": 0.8706737160682678, "eval_runtime": 1153.7836, "eval_samples_per_second": 5.521, "eval_steps_per_second": 5.521, "step": 27000 }, { "acc": 0.77579956, "epoch": 0.6850594321718546, "grad_norm": 3.125, "learning_rate": 7.837202755853309e-06, "loss": 0.88512506, "memory(GiB)": 728.98, "step": 27005, "train_speed(iter/s)": 0.347067 }, { "acc": 0.76579051, "epoch": 0.6851862715408922, "grad_norm": 3.5, "learning_rate": 7.83633921859863e-06, "loss": 0.87027426, "memory(GiB)": 728.98, "step": 27010, "train_speed(iter/s)": 0.346841 }, { "acc": 0.76193619, "epoch": 0.6853131109099297, "grad_norm": 3.8125, "learning_rate": 7.835475556582429e-06, "loss": 0.91624718, "memory(GiB)": 728.98, "step": 27015, "train_speed(iter/s)": 0.346625 }, { "acc": 0.76242299, "epoch": 0.6854399502789673, "grad_norm": 3.34375, "learning_rate": 7.83461176984269e-06, "loss": 0.89379673, "memory(GiB)": 728.98, "step": 27020, "train_speed(iter/s)": 0.346385 }, { "acc": 0.74771543, "epoch": 0.6855667896480049, "grad_norm": 3.046875, "learning_rate": 7.833747858417413e-06, "loss": 0.90632124, "memory(GiB)": 728.98, "step": 27025, "train_speed(iter/s)": 0.34617 }, { "acc": 0.75327048, "epoch": 0.6856936290170424, "grad_norm": 2.984375, "learning_rate": 7.832883822344596e-06, "loss": 0.91701593, "memory(GiB)": 728.98, "step": 27030, "train_speed(iter/s)": 0.345933 }, { "acc": 0.76580634, "epoch": 0.68582046838608, "grad_norm": 4.53125, "learning_rate": 7.832019661662244e-06, "loss": 0.88314056, "memory(GiB)": 728.98, "step": 27035, "train_speed(iter/s)": 0.345655 }, { "acc": 0.76419153, "epoch": 0.6859473077551176, "grad_norm": 3.9375, "learning_rate": 7.831155376408373e-06, "loss": 0.87930346, "memory(GiB)": 728.98, "step": 27040, "train_speed(iter/s)": 0.345431 }, { "acc": 0.76211348, "epoch": 0.6860741471241552, "grad_norm": 3.421875, "learning_rate": 7.830290966620997e-06, "loss": 0.93657703, "memory(GiB)": 728.98, "step": 27045, "train_speed(iter/s)": 0.345208 }, { "acc": 0.76073837, "epoch": 0.6862009864931927, "grad_norm": 3.46875, "learning_rate": 7.829426432338138e-06, "loss": 0.896315, "memory(GiB)": 728.98, "step": 27050, "train_speed(iter/s)": 0.344988 }, { "acc": 0.76596737, "epoch": 0.6863278258622303, "grad_norm": 3.671875, "learning_rate": 7.828561773597827e-06, "loss": 0.86509867, "memory(GiB)": 728.98, "step": 27055, "train_speed(iter/s)": 0.34473 }, { "acc": 0.75294347, "epoch": 0.6864546652312679, "grad_norm": 3.234375, "learning_rate": 7.827696990438096e-06, "loss": 0.89732695, "memory(GiB)": 728.98, "step": 27060, "train_speed(iter/s)": 0.344477 }, { "acc": 0.7550221, "epoch": 0.6865815046003053, "grad_norm": 4.0, "learning_rate": 7.826832082896982e-06, "loss": 0.93752346, "memory(GiB)": 728.98, "step": 27065, "train_speed(iter/s)": 0.344197 }, { "acc": 0.76724401, "epoch": 0.6867083439693429, "grad_norm": 3.265625, "learning_rate": 7.825967051012533e-06, "loss": 0.87737007, "memory(GiB)": 728.98, "step": 27070, "train_speed(iter/s)": 0.343952 }, { "acc": 0.76832762, "epoch": 0.6868351833383805, "grad_norm": 3.1875, "learning_rate": 7.825101894822797e-06, "loss": 0.91502314, "memory(GiB)": 728.98, "step": 27075, "train_speed(iter/s)": 0.343681 }, { "acc": 0.75693898, "epoch": 0.686962022707418, "grad_norm": 3.9375, "learning_rate": 7.82423661436583e-06, "loss": 0.93426552, "memory(GiB)": 728.98, "step": 27080, "train_speed(iter/s)": 0.343485 }, { "acc": 0.75565214, "epoch": 0.6870888620764556, "grad_norm": 3.078125, "learning_rate": 7.823371209679694e-06, "loss": 0.91332321, "memory(GiB)": 728.98, "step": 27085, "train_speed(iter/s)": 0.343231 }, { "acc": 0.77477427, "epoch": 0.6872157014454932, "grad_norm": 3.265625, "learning_rate": 7.822505680802452e-06, "loss": 0.86544333, "memory(GiB)": 728.98, "step": 27090, "train_speed(iter/s)": 0.343019 }, { "acc": 0.77003698, "epoch": 0.6873425408145307, "grad_norm": 3.828125, "learning_rate": 7.82164002777218e-06, "loss": 0.83440838, "memory(GiB)": 728.98, "step": 27095, "train_speed(iter/s)": 0.342787 }, { "acc": 0.7681179, "epoch": 0.6874693801835683, "grad_norm": 3.1875, "learning_rate": 7.820774250626953e-06, "loss": 0.87998343, "memory(GiB)": 728.98, "step": 27100, "train_speed(iter/s)": 0.34255 }, { "acc": 0.77854729, "epoch": 0.6875962195526059, "grad_norm": 3.203125, "learning_rate": 7.819908349404853e-06, "loss": 0.82857933, "memory(GiB)": 728.98, "step": 27105, "train_speed(iter/s)": 0.342324 }, { "acc": 0.77946873, "epoch": 0.6877230589216434, "grad_norm": 3.953125, "learning_rate": 7.81904232414397e-06, "loss": 0.85186605, "memory(GiB)": 728.98, "step": 27110, "train_speed(iter/s)": 0.342109 }, { "acc": 0.78018136, "epoch": 0.687849898290681, "grad_norm": 3.0, "learning_rate": 7.818176174882398e-06, "loss": 0.86329746, "memory(GiB)": 728.98, "step": 27115, "train_speed(iter/s)": 0.341858 }, { "acc": 0.76625733, "epoch": 0.6879767376597186, "grad_norm": 3.734375, "learning_rate": 7.817309901658236e-06, "loss": 0.9370717, "memory(GiB)": 728.98, "step": 27120, "train_speed(iter/s)": 0.341647 }, { "acc": 0.75785966, "epoch": 0.688103577028756, "grad_norm": 3.859375, "learning_rate": 7.816443504509587e-06, "loss": 0.99061165, "memory(GiB)": 728.98, "step": 27125, "train_speed(iter/s)": 0.34145 }, { "acc": 0.75256262, "epoch": 0.6882304163977936, "grad_norm": 3.671875, "learning_rate": 7.815576983474562e-06, "loss": 0.93406115, "memory(GiB)": 728.98, "step": 27130, "train_speed(iter/s)": 0.34123 }, { "acc": 0.76389232, "epoch": 0.6883572557668312, "grad_norm": 3.8125, "learning_rate": 7.814710338591276e-06, "loss": 0.89005222, "memory(GiB)": 728.98, "step": 27135, "train_speed(iter/s)": 0.340993 }, { "acc": 0.76911979, "epoch": 0.6884840951358687, "grad_norm": 3.359375, "learning_rate": 7.81384356989785e-06, "loss": 0.89299641, "memory(GiB)": 728.98, "step": 27140, "train_speed(iter/s)": 0.340754 }, { "acc": 0.75957961, "epoch": 0.6886109345049063, "grad_norm": 2.75, "learning_rate": 7.812976677432412e-06, "loss": 0.88612576, "memory(GiB)": 728.98, "step": 27145, "train_speed(iter/s)": 0.340494 }, { "acc": 0.77412763, "epoch": 0.6887377738739439, "grad_norm": 3.1875, "learning_rate": 7.812109661233093e-06, "loss": 0.81084423, "memory(GiB)": 728.98, "step": 27150, "train_speed(iter/s)": 0.340235 }, { "acc": 0.77785511, "epoch": 0.6888646132429814, "grad_norm": 3.03125, "learning_rate": 7.811242521338029e-06, "loss": 0.85989466, "memory(GiB)": 728.98, "step": 27155, "train_speed(iter/s)": 0.340023 }, { "acc": 0.76369939, "epoch": 0.688991452612019, "grad_norm": 3.578125, "learning_rate": 7.810375257785362e-06, "loss": 0.88688841, "memory(GiB)": 728.98, "step": 27160, "train_speed(iter/s)": 0.339774 }, { "acc": 0.7633461, "epoch": 0.6891182919810566, "grad_norm": 3.6875, "learning_rate": 7.809507870613246e-06, "loss": 0.94524956, "memory(GiB)": 728.98, "step": 27165, "train_speed(iter/s)": 0.33953 }, { "acc": 0.76346617, "epoch": 0.6892451313500941, "grad_norm": 3.140625, "learning_rate": 7.808640359859828e-06, "loss": 0.89368734, "memory(GiB)": 728.98, "step": 27170, "train_speed(iter/s)": 0.339323 }, { "acc": 0.76168461, "epoch": 0.6893719707191317, "grad_norm": 4.15625, "learning_rate": 7.80777272556327e-06, "loss": 0.91975784, "memory(GiB)": 728.98, "step": 27175, "train_speed(iter/s)": 0.339135 }, { "acc": 0.74619865, "epoch": 0.6894988100881693, "grad_norm": 3.9375, "learning_rate": 7.806904967761735e-06, "loss": 0.96272602, "memory(GiB)": 728.98, "step": 27180, "train_speed(iter/s)": 0.338895 }, { "acc": 0.76422758, "epoch": 0.6896256494572067, "grad_norm": 4.1875, "learning_rate": 7.806037086493395e-06, "loss": 0.92446728, "memory(GiB)": 728.98, "step": 27185, "train_speed(iter/s)": 0.338663 }, { "acc": 0.76139832, "epoch": 0.6897524888262443, "grad_norm": 3.46875, "learning_rate": 7.805169081796421e-06, "loss": 0.91352243, "memory(GiB)": 728.98, "step": 27190, "train_speed(iter/s)": 0.338457 }, { "acc": 0.76511378, "epoch": 0.6898793281952819, "grad_norm": 3.328125, "learning_rate": 7.804300953709e-06, "loss": 0.88285007, "memory(GiB)": 728.98, "step": 27195, "train_speed(iter/s)": 0.338244 }, { "acc": 0.75291762, "epoch": 0.6900061675643194, "grad_norm": 3.296875, "learning_rate": 7.803432702269316e-06, "loss": 0.96736259, "memory(GiB)": 728.98, "step": 27200, "train_speed(iter/s)": 0.338005 }, { "acc": 0.76065831, "epoch": 0.690133006933357, "grad_norm": 3.59375, "learning_rate": 7.802564327515559e-06, "loss": 0.88046675, "memory(GiB)": 728.98, "step": 27205, "train_speed(iter/s)": 0.337765 }, { "acc": 0.77223663, "epoch": 0.6902598463023946, "grad_norm": 4.59375, "learning_rate": 7.801695829485924e-06, "loss": 0.88113766, "memory(GiB)": 728.98, "step": 27210, "train_speed(iter/s)": 0.337555 }, { "acc": 0.76612024, "epoch": 0.6903866856714321, "grad_norm": 4.125, "learning_rate": 7.800827208218619e-06, "loss": 0.85573997, "memory(GiB)": 728.98, "step": 27215, "train_speed(iter/s)": 0.337345 }, { "acc": 0.77321448, "epoch": 0.6905135250404697, "grad_norm": 3.5625, "learning_rate": 7.799958463751848e-06, "loss": 0.89181328, "memory(GiB)": 728.98, "step": 27220, "train_speed(iter/s)": 0.337092 }, { "acc": 0.77075744, "epoch": 0.6906403644095073, "grad_norm": 3.328125, "learning_rate": 7.799089596123826e-06, "loss": 0.89816971, "memory(GiB)": 728.98, "step": 27225, "train_speed(iter/s)": 0.336877 }, { "acc": 0.75945034, "epoch": 0.6907672037785448, "grad_norm": 3.1875, "learning_rate": 7.798220605372771e-06, "loss": 0.92102118, "memory(GiB)": 728.98, "step": 27230, "train_speed(iter/s)": 0.336658 }, { "acc": 0.77133431, "epoch": 0.6908940431475824, "grad_norm": 3.75, "learning_rate": 7.797351491536906e-06, "loss": 0.89827747, "memory(GiB)": 728.98, "step": 27235, "train_speed(iter/s)": 0.336447 }, { "acc": 0.76300192, "epoch": 0.69102088251662, "grad_norm": 3.453125, "learning_rate": 7.796482254654463e-06, "loss": 0.90319576, "memory(GiB)": 728.98, "step": 27240, "train_speed(iter/s)": 0.33623 }, { "acc": 0.758781, "epoch": 0.6911477218856574, "grad_norm": 3.90625, "learning_rate": 7.795612894763676e-06, "loss": 0.95697441, "memory(GiB)": 728.98, "step": 27245, "train_speed(iter/s)": 0.336 }, { "acc": 0.76047926, "epoch": 0.691274561254695, "grad_norm": 3.84375, "learning_rate": 7.794743411902784e-06, "loss": 0.9565753, "memory(GiB)": 728.98, "step": 27250, "train_speed(iter/s)": 0.335817 }, { "acc": 0.77638497, "epoch": 0.6914014006237326, "grad_norm": 3.171875, "learning_rate": 7.793873806110036e-06, "loss": 0.83487797, "memory(GiB)": 728.98, "step": 27255, "train_speed(iter/s)": 0.335596 }, { "acc": 0.77762079, "epoch": 0.6915282399927701, "grad_norm": 3.671875, "learning_rate": 7.793004077423677e-06, "loss": 0.87064114, "memory(GiB)": 728.98, "step": 27260, "train_speed(iter/s)": 0.335381 }, { "acc": 0.76554904, "epoch": 0.6916550793618077, "grad_norm": 3.53125, "learning_rate": 7.792134225881972e-06, "loss": 0.87712431, "memory(GiB)": 728.98, "step": 27265, "train_speed(iter/s)": 0.335174 }, { "acc": 0.77054343, "epoch": 0.6917819187308453, "grad_norm": 4.40625, "learning_rate": 7.791264251523177e-06, "loss": 0.86384134, "memory(GiB)": 728.98, "step": 27270, "train_speed(iter/s)": 0.334976 }, { "acc": 0.76573696, "epoch": 0.6919087580998828, "grad_norm": 3.671875, "learning_rate": 7.79039415438556e-06, "loss": 0.91351252, "memory(GiB)": 728.98, "step": 27275, "train_speed(iter/s)": 0.334776 }, { "acc": 0.77388244, "epoch": 0.6920355974689204, "grad_norm": 3.5, "learning_rate": 7.789523934507397e-06, "loss": 0.87132215, "memory(GiB)": 728.98, "step": 27280, "train_speed(iter/s)": 0.334575 }, { "acc": 0.76371865, "epoch": 0.692162436837958, "grad_norm": 4.15625, "learning_rate": 7.788653591926963e-06, "loss": 0.88989096, "memory(GiB)": 728.98, "step": 27285, "train_speed(iter/s)": 0.334378 }, { "acc": 0.77834263, "epoch": 0.6922892762069955, "grad_norm": 3.3125, "learning_rate": 7.787783126682543e-06, "loss": 0.87801991, "memory(GiB)": 728.98, "step": 27290, "train_speed(iter/s)": 0.334145 }, { "acc": 0.76286287, "epoch": 0.6924161155760331, "grad_norm": 3.609375, "learning_rate": 7.786912538812426e-06, "loss": 0.90886145, "memory(GiB)": 728.98, "step": 27295, "train_speed(iter/s)": 0.333917 }, { "acc": 0.76668797, "epoch": 0.6925429549450707, "grad_norm": 3.359375, "learning_rate": 7.786041828354904e-06, "loss": 0.86935272, "memory(GiB)": 728.98, "step": 27300, "train_speed(iter/s)": 0.333695 }, { "acc": 0.77690668, "epoch": 0.6926697943141081, "grad_norm": 3.5, "learning_rate": 7.785170995348282e-06, "loss": 0.84446497, "memory(GiB)": 728.98, "step": 27305, "train_speed(iter/s)": 0.333512 }, { "acc": 0.76852212, "epoch": 0.6927966336831457, "grad_norm": 3.203125, "learning_rate": 7.784300039830858e-06, "loss": 0.89415131, "memory(GiB)": 728.98, "step": 27310, "train_speed(iter/s)": 0.333301 }, { "acc": 0.77242022, "epoch": 0.6929234730521833, "grad_norm": 3.578125, "learning_rate": 7.783428961840949e-06, "loss": 0.89902182, "memory(GiB)": 728.98, "step": 27315, "train_speed(iter/s)": 0.333113 }, { "acc": 0.7656455, "epoch": 0.6930503124212208, "grad_norm": 3.578125, "learning_rate": 7.78255776141687e-06, "loss": 0.94021521, "memory(GiB)": 728.98, "step": 27320, "train_speed(iter/s)": 0.33292 }, { "acc": 0.77740798, "epoch": 0.6931771517902584, "grad_norm": 4.0, "learning_rate": 7.781686438596939e-06, "loss": 0.84773321, "memory(GiB)": 728.98, "step": 27325, "train_speed(iter/s)": 0.332721 }, { "acc": 0.77000561, "epoch": 0.693303991159296, "grad_norm": 3.421875, "learning_rate": 7.780814993419484e-06, "loss": 0.85598326, "memory(GiB)": 728.98, "step": 27330, "train_speed(iter/s)": 0.332529 }, { "acc": 0.74523716, "epoch": 0.6934308305283335, "grad_norm": 3.296875, "learning_rate": 7.779943425922839e-06, "loss": 0.9210413, "memory(GiB)": 728.98, "step": 27335, "train_speed(iter/s)": 0.332309 }, { "acc": 0.75844593, "epoch": 0.6935576698973711, "grad_norm": 3.078125, "learning_rate": 7.779071736145339e-06, "loss": 0.95650835, "memory(GiB)": 728.98, "step": 27340, "train_speed(iter/s)": 0.332085 }, { "acc": 0.77458291, "epoch": 0.6936845092664087, "grad_norm": 3.34375, "learning_rate": 7.77819992412533e-06, "loss": 0.87018461, "memory(GiB)": 728.98, "step": 27345, "train_speed(iter/s)": 0.331865 }, { "acc": 0.77671771, "epoch": 0.6938113486354462, "grad_norm": 3.53125, "learning_rate": 7.777327989901156e-06, "loss": 0.84759884, "memory(GiB)": 728.98, "step": 27350, "train_speed(iter/s)": 0.331676 }, { "acc": 0.76644526, "epoch": 0.6939381880044838, "grad_norm": 3.71875, "learning_rate": 7.776455933511174e-06, "loss": 0.90332642, "memory(GiB)": 728.98, "step": 27355, "train_speed(iter/s)": 0.331459 }, { "acc": 0.76527405, "epoch": 0.6940650273735214, "grad_norm": 4.125, "learning_rate": 7.775583754993741e-06, "loss": 0.9756834, "memory(GiB)": 728.98, "step": 27360, "train_speed(iter/s)": 0.331269 }, { "acc": 0.75547943, "epoch": 0.6941918667425588, "grad_norm": 2.703125, "learning_rate": 7.774711454387223e-06, "loss": 0.93098764, "memory(GiB)": 728.98, "step": 27365, "train_speed(iter/s)": 0.331073 }, { "acc": 0.78072476, "epoch": 0.6943187061115964, "grad_norm": 3.40625, "learning_rate": 7.773839031729988e-06, "loss": 0.86112652, "memory(GiB)": 728.98, "step": 27370, "train_speed(iter/s)": 0.330865 }, { "acc": 0.76293731, "epoch": 0.694445545480634, "grad_norm": 3.640625, "learning_rate": 7.772966487060413e-06, "loss": 0.94312401, "memory(GiB)": 728.98, "step": 27375, "train_speed(iter/s)": 0.330679 }, { "acc": 0.77155781, "epoch": 0.6945723848496715, "grad_norm": 4.25, "learning_rate": 7.772093820416877e-06, "loss": 0.86836357, "memory(GiB)": 728.98, "step": 27380, "train_speed(iter/s)": 0.330467 }, { "acc": 0.76149063, "epoch": 0.6946992242187091, "grad_norm": 3.390625, "learning_rate": 7.771221031837766e-06, "loss": 0.98633099, "memory(GiB)": 728.98, "step": 27385, "train_speed(iter/s)": 0.330264 }, { "acc": 0.75977669, "epoch": 0.6948260635877467, "grad_norm": 3.125, "learning_rate": 7.770348121361473e-06, "loss": 0.93736839, "memory(GiB)": 728.98, "step": 27390, "train_speed(iter/s)": 0.330046 }, { "acc": 0.75424104, "epoch": 0.6949529029567842, "grad_norm": 3.09375, "learning_rate": 7.76947508902639e-06, "loss": 0.9591713, "memory(GiB)": 728.98, "step": 27395, "train_speed(iter/s)": 0.329837 }, { "acc": 0.76848011, "epoch": 0.6950797423258218, "grad_norm": 4.4375, "learning_rate": 7.768601934870924e-06, "loss": 0.8765007, "memory(GiB)": 728.98, "step": 27400, "train_speed(iter/s)": 0.329636 }, { "acc": 0.76401701, "epoch": 0.6952065816948594, "grad_norm": 3.484375, "learning_rate": 7.76772865893348e-06, "loss": 0.9266736, "memory(GiB)": 728.98, "step": 27405, "train_speed(iter/s)": 0.32941 }, { "acc": 0.77637734, "epoch": 0.695333421063897, "grad_norm": 2.265625, "learning_rate": 7.766855261252471e-06, "loss": 0.84425449, "memory(GiB)": 728.98, "step": 27410, "train_speed(iter/s)": 0.329178 }, { "acc": 0.76044073, "epoch": 0.6954602604329345, "grad_norm": 3.15625, "learning_rate": 7.765981741866314e-06, "loss": 0.8975935, "memory(GiB)": 728.98, "step": 27415, "train_speed(iter/s)": 0.328977 }, { "acc": 0.76870365, "epoch": 0.6955870998019721, "grad_norm": 5.375, "learning_rate": 7.765108100813434e-06, "loss": 0.87920637, "memory(GiB)": 728.98, "step": 27420, "train_speed(iter/s)": 0.328754 }, { "acc": 0.76579571, "epoch": 0.6957139391710095, "grad_norm": 3.21875, "learning_rate": 7.764234338132257e-06, "loss": 0.87753963, "memory(GiB)": 728.98, "step": 27425, "train_speed(iter/s)": 0.328557 }, { "acc": 0.76031337, "epoch": 0.6958407785400471, "grad_norm": 3.421875, "learning_rate": 7.763360453861222e-06, "loss": 0.94168711, "memory(GiB)": 728.98, "step": 27430, "train_speed(iter/s)": 0.328379 }, { "acc": 0.7673099, "epoch": 0.6959676179090847, "grad_norm": 3.546875, "learning_rate": 7.762486448038763e-06, "loss": 0.88625803, "memory(GiB)": 728.98, "step": 27435, "train_speed(iter/s)": 0.328178 }, { "acc": 0.76855764, "epoch": 0.6960944572781222, "grad_norm": 6.46875, "learning_rate": 7.761612320703326e-06, "loss": 0.95329714, "memory(GiB)": 728.98, "step": 27440, "train_speed(iter/s)": 0.328012 }, { "acc": 0.76661315, "epoch": 0.6962212966471598, "grad_norm": 3.359375, "learning_rate": 7.760738071893364e-06, "loss": 0.90388985, "memory(GiB)": 728.98, "step": 27445, "train_speed(iter/s)": 0.327806 }, { "acc": 0.75114546, "epoch": 0.6963481360161974, "grad_norm": 4.34375, "learning_rate": 7.75986370164733e-06, "loss": 0.89794655, "memory(GiB)": 728.98, "step": 27450, "train_speed(iter/s)": 0.327605 }, { "acc": 0.76858101, "epoch": 0.696474975385235, "grad_norm": 2.921875, "learning_rate": 7.758989210003684e-06, "loss": 0.93354473, "memory(GiB)": 728.98, "step": 27455, "train_speed(iter/s)": 0.327435 }, { "acc": 0.75788994, "epoch": 0.6966018147542725, "grad_norm": 3.359375, "learning_rate": 7.758114597000896e-06, "loss": 0.92691774, "memory(GiB)": 728.98, "step": 27460, "train_speed(iter/s)": 0.327231 }, { "acc": 0.75524435, "epoch": 0.6967286541233101, "grad_norm": 3.703125, "learning_rate": 7.757239862677433e-06, "loss": 0.95780783, "memory(GiB)": 728.98, "step": 27465, "train_speed(iter/s)": 0.326989 }, { "acc": 0.7585556, "epoch": 0.6968554934923477, "grad_norm": 3.921875, "learning_rate": 7.756365007071775e-06, "loss": 0.91310339, "memory(GiB)": 728.98, "step": 27470, "train_speed(iter/s)": 0.326796 }, { "acc": 0.77515588, "epoch": 0.6969823328613852, "grad_norm": 4.6875, "learning_rate": 7.755490030222401e-06, "loss": 0.88141766, "memory(GiB)": 728.98, "step": 27475, "train_speed(iter/s)": 0.326583 }, { "acc": 0.76976824, "epoch": 0.6971091722304228, "grad_norm": 3.578125, "learning_rate": 7.7546149321678e-06, "loss": 0.88207216, "memory(GiB)": 728.98, "step": 27480, "train_speed(iter/s)": 0.326379 }, { "acc": 0.77433653, "epoch": 0.6972360115994602, "grad_norm": 3.546875, "learning_rate": 7.753739712946464e-06, "loss": 0.90715904, "memory(GiB)": 728.98, "step": 27485, "train_speed(iter/s)": 0.326193 }, { "acc": 0.76716166, "epoch": 0.6973628509684978, "grad_norm": 3.59375, "learning_rate": 7.752864372596896e-06, "loss": 0.91970081, "memory(GiB)": 728.98, "step": 27490, "train_speed(iter/s)": 0.325998 }, { "acc": 0.7635911, "epoch": 0.6974896903375354, "grad_norm": 3.359375, "learning_rate": 7.751988911157592e-06, "loss": 0.91029081, "memory(GiB)": 728.98, "step": 27495, "train_speed(iter/s)": 0.325792 }, { "acc": 0.76627817, "epoch": 0.697616529706573, "grad_norm": 3.34375, "learning_rate": 7.751113328667064e-06, "loss": 0.89973936, "memory(GiB)": 728.98, "step": 27500, "train_speed(iter/s)": 0.325629 }, { "epoch": 0.697616529706573, "eval_acc": 0.7551265270961788, "eval_loss": 0.869121789932251, "eval_runtime": 1149.221, "eval_samples_per_second": 5.543, "eval_steps_per_second": 5.543, "step": 27500 }, { "acc": 0.7662807, "epoch": 0.6977433690756105, "grad_norm": 3.4375, "learning_rate": 7.750237625163828e-06, "loss": 0.89726505, "memory(GiB)": 728.98, "step": 27505, "train_speed(iter/s)": 0.318387 }, { "acc": 0.769736, "epoch": 0.6978702084446481, "grad_norm": 3.09375, "learning_rate": 7.749361800686402e-06, "loss": 0.87646847, "memory(GiB)": 728.98, "step": 27510, "train_speed(iter/s)": 0.318186 }, { "acc": 0.75717468, "epoch": 0.6979970478136857, "grad_norm": 3.65625, "learning_rate": 7.748485855273308e-06, "loss": 0.96393538, "memory(GiB)": 728.98, "step": 27515, "train_speed(iter/s)": 0.317977 }, { "acc": 0.73861737, "epoch": 0.6981238871827232, "grad_norm": 5.46875, "learning_rate": 7.74760978896308e-06, "loss": 0.96209421, "memory(GiB)": 728.98, "step": 27520, "train_speed(iter/s)": 0.317784 }, { "acc": 0.76044326, "epoch": 0.6982507265517608, "grad_norm": 3.34375, "learning_rate": 7.746733601794251e-06, "loss": 0.93972349, "memory(GiB)": 728.98, "step": 27525, "train_speed(iter/s)": 0.317595 }, { "acc": 0.77203641, "epoch": 0.6983775659207984, "grad_norm": 3.390625, "learning_rate": 7.745857293805364e-06, "loss": 0.87694244, "memory(GiB)": 728.98, "step": 27530, "train_speed(iter/s)": 0.317402 }, { "acc": 0.76409249, "epoch": 0.6985044052898359, "grad_norm": 3.859375, "learning_rate": 7.744980865034962e-06, "loss": 0.90629702, "memory(GiB)": 728.98, "step": 27535, "train_speed(iter/s)": 0.317209 }, { "acc": 0.77698793, "epoch": 0.6986312446588735, "grad_norm": 3.546875, "learning_rate": 7.7441043155216e-06, "loss": 0.91049271, "memory(GiB)": 728.98, "step": 27540, "train_speed(iter/s)": 0.317036 }, { "acc": 0.73937068, "epoch": 0.698758084027911, "grad_norm": 3.578125, "learning_rate": 7.743227645303829e-06, "loss": 0.94487677, "memory(GiB)": 728.98, "step": 27545, "train_speed(iter/s)": 0.316855 }, { "acc": 0.76677675, "epoch": 0.6988849233969485, "grad_norm": 3.4375, "learning_rate": 7.742350854420215e-06, "loss": 0.89339581, "memory(GiB)": 728.98, "step": 27550, "train_speed(iter/s)": 0.316676 }, { "acc": 0.76991677, "epoch": 0.6990117627659861, "grad_norm": 2.984375, "learning_rate": 7.741473942909326e-06, "loss": 0.90978498, "memory(GiB)": 728.98, "step": 27555, "train_speed(iter/s)": 0.316492 }, { "acc": 0.78200011, "epoch": 0.6991386021350237, "grad_norm": 3.171875, "learning_rate": 7.740596910809733e-06, "loss": 0.87652254, "memory(GiB)": 728.98, "step": 27560, "train_speed(iter/s)": 0.316314 }, { "acc": 0.76410775, "epoch": 0.6992654415040612, "grad_norm": 3.59375, "learning_rate": 7.739719758160014e-06, "loss": 0.944034, "memory(GiB)": 728.98, "step": 27565, "train_speed(iter/s)": 0.316128 }, { "acc": 0.76650934, "epoch": 0.6993922808730988, "grad_norm": 3.875, "learning_rate": 7.738842484998753e-06, "loss": 0.92963409, "memory(GiB)": 728.98, "step": 27570, "train_speed(iter/s)": 0.31595 }, { "acc": 0.758744, "epoch": 0.6995191202421364, "grad_norm": 3.609375, "learning_rate": 7.737965091364535e-06, "loss": 0.92459154, "memory(GiB)": 728.98, "step": 27575, "train_speed(iter/s)": 0.315777 }, { "acc": 0.77161164, "epoch": 0.6996459596111739, "grad_norm": 3.78125, "learning_rate": 7.737087577295958e-06, "loss": 0.86267099, "memory(GiB)": 728.98, "step": 27580, "train_speed(iter/s)": 0.315598 }, { "acc": 0.76783071, "epoch": 0.6997727989802115, "grad_norm": 3.78125, "learning_rate": 7.736209942831618e-06, "loss": 0.93327894, "memory(GiB)": 728.98, "step": 27585, "train_speed(iter/s)": 0.315403 }, { "acc": 0.75791149, "epoch": 0.6998996383492491, "grad_norm": 3.390625, "learning_rate": 7.735332188010123e-06, "loss": 0.87337246, "memory(GiB)": 728.98, "step": 27590, "train_speed(iter/s)": 0.315207 }, { "acc": 0.76999645, "epoch": 0.7000264777182866, "grad_norm": 3.109375, "learning_rate": 7.734454312870079e-06, "loss": 0.8636281, "memory(GiB)": 728.98, "step": 27595, "train_speed(iter/s)": 0.315029 }, { "acc": 0.76611128, "epoch": 0.7001533170873242, "grad_norm": 3.671875, "learning_rate": 7.733576317450103e-06, "loss": 0.8972209, "memory(GiB)": 728.98, "step": 27600, "train_speed(iter/s)": 0.314854 }, { "acc": 0.76933684, "epoch": 0.7002801564563617, "grad_norm": 3.0, "learning_rate": 7.732698201788812e-06, "loss": 0.89811735, "memory(GiB)": 728.98, "step": 27605, "train_speed(iter/s)": 0.314653 }, { "acc": 0.78048911, "epoch": 0.7004069958253992, "grad_norm": 3.40625, "learning_rate": 7.731819965924836e-06, "loss": 0.86918907, "memory(GiB)": 728.98, "step": 27610, "train_speed(iter/s)": 0.314471 }, { "acc": 0.75691247, "epoch": 0.7005338351944368, "grad_norm": 4.25, "learning_rate": 7.730941609896805e-06, "loss": 0.93216734, "memory(GiB)": 728.98, "step": 27615, "train_speed(iter/s)": 0.314315 }, { "acc": 0.78367381, "epoch": 0.7006606745634744, "grad_norm": 3.828125, "learning_rate": 7.730063133743353e-06, "loss": 0.84290895, "memory(GiB)": 728.98, "step": 27620, "train_speed(iter/s)": 0.314128 }, { "acc": 0.77160568, "epoch": 0.7007875139325119, "grad_norm": 4.0625, "learning_rate": 7.72918453750312e-06, "loss": 0.90122862, "memory(GiB)": 728.98, "step": 27625, "train_speed(iter/s)": 0.313941 }, { "acc": 0.76793895, "epoch": 0.7009143533015495, "grad_norm": 4.1875, "learning_rate": 7.728305821214758e-06, "loss": 0.9063282, "memory(GiB)": 728.98, "step": 27630, "train_speed(iter/s)": 0.313759 }, { "acc": 0.77347784, "epoch": 0.7010411926705871, "grad_norm": 4.03125, "learning_rate": 7.727426984916915e-06, "loss": 0.92148762, "memory(GiB)": 728.98, "step": 27635, "train_speed(iter/s)": 0.31359 }, { "acc": 0.76998796, "epoch": 0.7011680320396246, "grad_norm": 3.53125, "learning_rate": 7.72654802864825e-06, "loss": 0.89058409, "memory(GiB)": 728.98, "step": 27640, "train_speed(iter/s)": 0.313406 }, { "acc": 0.76708713, "epoch": 0.7012948714086622, "grad_norm": 4.90625, "learning_rate": 7.725668952447421e-06, "loss": 0.91395435, "memory(GiB)": 728.98, "step": 27645, "train_speed(iter/s)": 0.313222 }, { "acc": 0.76967468, "epoch": 0.7014217107776998, "grad_norm": 3.203125, "learning_rate": 7.7247897563531e-06, "loss": 0.8758913, "memory(GiB)": 728.98, "step": 27650, "train_speed(iter/s)": 0.313048 }, { "acc": 0.76290088, "epoch": 0.7015485501467373, "grad_norm": 3.75, "learning_rate": 7.723910440403964e-06, "loss": 0.8961894, "memory(GiB)": 728.98, "step": 27655, "train_speed(iter/s)": 0.312855 }, { "acc": 0.77035007, "epoch": 0.7016753895157749, "grad_norm": 3.21875, "learning_rate": 7.723031004638683e-06, "loss": 0.90042686, "memory(GiB)": 728.98, "step": 27660, "train_speed(iter/s)": 0.312666 }, { "acc": 0.775107, "epoch": 0.7018022288848124, "grad_norm": 3.234375, "learning_rate": 7.722151449095945e-06, "loss": 0.9210681, "memory(GiB)": 728.98, "step": 27665, "train_speed(iter/s)": 0.312501 }, { "acc": 0.76152396, "epoch": 0.7019290682538499, "grad_norm": 3.609375, "learning_rate": 7.721271773814436e-06, "loss": 0.91077147, "memory(GiB)": 728.98, "step": 27670, "train_speed(iter/s)": 0.312326 }, { "acc": 0.77932782, "epoch": 0.7020559076228875, "grad_norm": 3.25, "learning_rate": 7.720391978832855e-06, "loss": 0.83473549, "memory(GiB)": 728.98, "step": 27675, "train_speed(iter/s)": 0.312138 }, { "acc": 0.77896914, "epoch": 0.7021827469919251, "grad_norm": 2.96875, "learning_rate": 7.719512064189897e-06, "loss": 0.87854557, "memory(GiB)": 728.98, "step": 27680, "train_speed(iter/s)": 0.31195 }, { "acc": 0.76317258, "epoch": 0.7023095863609626, "grad_norm": 3.65625, "learning_rate": 7.71863202992427e-06, "loss": 0.90285664, "memory(GiB)": 728.98, "step": 27685, "train_speed(iter/s)": 0.311781 }, { "acc": 0.77497587, "epoch": 0.7024364257300002, "grad_norm": 3.328125, "learning_rate": 7.71775187607468e-06, "loss": 0.84851065, "memory(GiB)": 728.98, "step": 27690, "train_speed(iter/s)": 0.31159 }, { "acc": 0.76523294, "epoch": 0.7025632650990378, "grad_norm": 3.390625, "learning_rate": 7.716871602679845e-06, "loss": 0.86878262, "memory(GiB)": 728.98, "step": 27695, "train_speed(iter/s)": 0.311397 }, { "acc": 0.76380982, "epoch": 0.7026901044680753, "grad_norm": 3.71875, "learning_rate": 7.715991209778484e-06, "loss": 0.91190538, "memory(GiB)": 728.98, "step": 27700, "train_speed(iter/s)": 0.311201 }, { "acc": 0.75321131, "epoch": 0.7028169438371129, "grad_norm": 3.46875, "learning_rate": 7.715110697409325e-06, "loss": 0.94945965, "memory(GiB)": 728.98, "step": 27705, "train_speed(iter/s)": 0.311021 }, { "acc": 0.76541896, "epoch": 0.7029437832061505, "grad_norm": 4.15625, "learning_rate": 7.714230065611096e-06, "loss": 0.90376158, "memory(GiB)": 728.98, "step": 27710, "train_speed(iter/s)": 0.310842 }, { "acc": 0.75576835, "epoch": 0.703070622575188, "grad_norm": 3.078125, "learning_rate": 7.713349314422533e-06, "loss": 0.89351988, "memory(GiB)": 728.98, "step": 27715, "train_speed(iter/s)": 0.310672 }, { "acc": 0.76924424, "epoch": 0.7031974619442256, "grad_norm": 3.328125, "learning_rate": 7.712468443882382e-06, "loss": 0.90993652, "memory(GiB)": 728.98, "step": 27720, "train_speed(iter/s)": 0.310487 }, { "acc": 0.75755773, "epoch": 0.7033243013132631, "grad_norm": 3.296875, "learning_rate": 7.711587454029386e-06, "loss": 0.87341194, "memory(GiB)": 728.98, "step": 27725, "train_speed(iter/s)": 0.3103 }, { "acc": 0.77834587, "epoch": 0.7034511406823006, "grad_norm": 4.34375, "learning_rate": 7.710706344902296e-06, "loss": 0.86635342, "memory(GiB)": 728.98, "step": 27730, "train_speed(iter/s)": 0.310123 }, { "acc": 0.77687664, "epoch": 0.7035779800513382, "grad_norm": 3.453125, "learning_rate": 7.70982511653987e-06, "loss": 0.88216105, "memory(GiB)": 728.98, "step": 27735, "train_speed(iter/s)": 0.309956 }, { "acc": 0.7555831, "epoch": 0.7037048194203758, "grad_norm": 3.765625, "learning_rate": 7.708943768980875e-06, "loss": 0.91309261, "memory(GiB)": 728.98, "step": 27740, "train_speed(iter/s)": 0.309796 }, { "acc": 0.77455235, "epoch": 0.7038316587894133, "grad_norm": 3.40625, "learning_rate": 7.708062302264072e-06, "loss": 0.87826071, "memory(GiB)": 728.98, "step": 27745, "train_speed(iter/s)": 0.309601 }, { "acc": 0.75846381, "epoch": 0.7039584981584509, "grad_norm": 3.390625, "learning_rate": 7.707180716428237e-06, "loss": 0.94423075, "memory(GiB)": 728.98, "step": 27750, "train_speed(iter/s)": 0.309441 }, { "acc": 0.75158663, "epoch": 0.7040853375274885, "grad_norm": 3.3125, "learning_rate": 7.706299011512148e-06, "loss": 0.96905746, "memory(GiB)": 728.98, "step": 27755, "train_speed(iter/s)": 0.309269 }, { "acc": 0.76960974, "epoch": 0.704212176896526, "grad_norm": 3.34375, "learning_rate": 7.70541718755459e-06, "loss": 0.93153877, "memory(GiB)": 728.98, "step": 27760, "train_speed(iter/s)": 0.309101 }, { "acc": 0.76158257, "epoch": 0.7043390162655636, "grad_norm": 3.625, "learning_rate": 7.704535244594346e-06, "loss": 0.89768515, "memory(GiB)": 728.98, "step": 27765, "train_speed(iter/s)": 0.308945 }, { "acc": 0.77311196, "epoch": 0.7044658556346012, "grad_norm": 3.609375, "learning_rate": 7.703653182670218e-06, "loss": 0.88153954, "memory(GiB)": 728.98, "step": 27770, "train_speed(iter/s)": 0.308795 }, { "acc": 0.78052201, "epoch": 0.7045926950036387, "grad_norm": 3.765625, "learning_rate": 7.702771001821e-06, "loss": 0.85291929, "memory(GiB)": 728.98, "step": 27775, "train_speed(iter/s)": 0.308604 }, { "acc": 0.76893754, "epoch": 0.7047195343726763, "grad_norm": 3.25, "learning_rate": 7.701888702085493e-06, "loss": 0.84971552, "memory(GiB)": 728.98, "step": 27780, "train_speed(iter/s)": 0.308425 }, { "acc": 0.75930958, "epoch": 0.7048463737417138, "grad_norm": 2.875, "learning_rate": 7.701006283502516e-06, "loss": 0.90820656, "memory(GiB)": 728.98, "step": 27785, "train_speed(iter/s)": 0.308258 }, { "acc": 0.74840865, "epoch": 0.7049732131107513, "grad_norm": 3.9375, "learning_rate": 7.700123746110876e-06, "loss": 0.90666084, "memory(GiB)": 728.98, "step": 27790, "train_speed(iter/s)": 0.308095 }, { "acc": 0.75313435, "epoch": 0.7051000524797889, "grad_norm": 2.9375, "learning_rate": 7.699241089949396e-06, "loss": 0.92183933, "memory(GiB)": 728.98, "step": 27795, "train_speed(iter/s)": 0.307915 }, { "acc": 0.76104121, "epoch": 0.7052268918488265, "grad_norm": 3.328125, "learning_rate": 7.6983583150569e-06, "loss": 0.92944231, "memory(GiB)": 728.98, "step": 27800, "train_speed(iter/s)": 0.307737 }, { "acc": 0.76501889, "epoch": 0.705353731217864, "grad_norm": 3.71875, "learning_rate": 7.697475421472221e-06, "loss": 0.90873985, "memory(GiB)": 728.98, "step": 27805, "train_speed(iter/s)": 0.307568 }, { "acc": 0.75759134, "epoch": 0.7054805705869016, "grad_norm": 3.125, "learning_rate": 7.696592409234192e-06, "loss": 0.87396669, "memory(GiB)": 728.98, "step": 27810, "train_speed(iter/s)": 0.30738 }, { "acc": 0.76356173, "epoch": 0.7056074099559392, "grad_norm": 3.203125, "learning_rate": 7.695709278381653e-06, "loss": 0.91508274, "memory(GiB)": 728.98, "step": 27815, "train_speed(iter/s)": 0.307216 }, { "acc": 0.76830101, "epoch": 0.7057342493249767, "grad_norm": 4.125, "learning_rate": 7.694826028953455e-06, "loss": 0.89226341, "memory(GiB)": 728.98, "step": 27820, "train_speed(iter/s)": 0.307039 }, { "acc": 0.76733475, "epoch": 0.7058610886940143, "grad_norm": 3.640625, "learning_rate": 7.693942660988445e-06, "loss": 0.88581982, "memory(GiB)": 728.98, "step": 27825, "train_speed(iter/s)": 0.306843 }, { "acc": 0.76503077, "epoch": 0.7059879280630519, "grad_norm": 4.375, "learning_rate": 7.69305917452548e-06, "loss": 0.92422647, "memory(GiB)": 728.98, "step": 27830, "train_speed(iter/s)": 0.306689 }, { "acc": 0.76861296, "epoch": 0.7061147674320895, "grad_norm": 3.3125, "learning_rate": 7.692175569603423e-06, "loss": 0.93242807, "memory(GiB)": 728.98, "step": 27835, "train_speed(iter/s)": 0.306497 }, { "acc": 0.76290011, "epoch": 0.706241606801127, "grad_norm": 4.09375, "learning_rate": 7.69129184626114e-06, "loss": 0.87622576, "memory(GiB)": 728.98, "step": 27840, "train_speed(iter/s)": 0.306312 }, { "acc": 0.77226853, "epoch": 0.7063684461701645, "grad_norm": 3.671875, "learning_rate": 7.690408004537505e-06, "loss": 0.87136545, "memory(GiB)": 728.98, "step": 27845, "train_speed(iter/s)": 0.306127 }, { "acc": 0.76599321, "epoch": 0.706495285539202, "grad_norm": 3.65625, "learning_rate": 7.689524044471392e-06, "loss": 0.90375032, "memory(GiB)": 728.98, "step": 27850, "train_speed(iter/s)": 0.305967 }, { "acc": 0.76699958, "epoch": 0.7066221249082396, "grad_norm": 3.46875, "learning_rate": 7.688639966101688e-06, "loss": 0.88032141, "memory(GiB)": 728.98, "step": 27855, "train_speed(iter/s)": 0.305786 }, { "acc": 0.76330166, "epoch": 0.7067489642772772, "grad_norm": 3.203125, "learning_rate": 7.687755769467277e-06, "loss": 0.8734087, "memory(GiB)": 728.98, "step": 27860, "train_speed(iter/s)": 0.305632 }, { "acc": 0.75571322, "epoch": 0.7068758036463147, "grad_norm": 3.96875, "learning_rate": 7.686871454607052e-06, "loss": 0.92453308, "memory(GiB)": 728.98, "step": 27865, "train_speed(iter/s)": 0.305466 }, { "acc": 0.76233644, "epoch": 0.7070026430153523, "grad_norm": 3.65625, "learning_rate": 7.685987021559916e-06, "loss": 0.90247831, "memory(GiB)": 728.98, "step": 27870, "train_speed(iter/s)": 0.305286 }, { "acc": 0.76897879, "epoch": 0.7071294823843899, "grad_norm": 3.78125, "learning_rate": 7.685102470364767e-06, "loss": 0.953967, "memory(GiB)": 728.98, "step": 27875, "train_speed(iter/s)": 0.305122 }, { "acc": 0.7681622, "epoch": 0.7072563217534275, "grad_norm": 3.734375, "learning_rate": 7.684217801060517e-06, "loss": 0.96128168, "memory(GiB)": 728.98, "step": 27880, "train_speed(iter/s)": 0.304966 }, { "acc": 0.7678905, "epoch": 0.707383161122465, "grad_norm": 3.328125, "learning_rate": 7.683333013686076e-06, "loss": 0.88085289, "memory(GiB)": 728.98, "step": 27885, "train_speed(iter/s)": 0.304769 }, { "acc": 0.75237498, "epoch": 0.7075100004915026, "grad_norm": 3.421875, "learning_rate": 7.682448108280368e-06, "loss": 0.93278284, "memory(GiB)": 728.98, "step": 27890, "train_speed(iter/s)": 0.304577 }, { "acc": 0.76475635, "epoch": 0.7076368398605402, "grad_norm": 3.171875, "learning_rate": 7.681563084882312e-06, "loss": 0.88258877, "memory(GiB)": 728.98, "step": 27895, "train_speed(iter/s)": 0.304403 }, { "acc": 0.76647062, "epoch": 0.7077636792295777, "grad_norm": 3.609375, "learning_rate": 7.680677943530842e-06, "loss": 0.87758656, "memory(GiB)": 728.98, "step": 27900, "train_speed(iter/s)": 0.304243 }, { "acc": 0.75544438, "epoch": 0.7078905185986152, "grad_norm": 3.515625, "learning_rate": 7.679792684264889e-06, "loss": 0.9265872, "memory(GiB)": 728.98, "step": 27905, "train_speed(iter/s)": 0.304077 }, { "acc": 0.77523088, "epoch": 0.7080173579676527, "grad_norm": 3.28125, "learning_rate": 7.678907307123394e-06, "loss": 0.88944082, "memory(GiB)": 728.98, "step": 27910, "train_speed(iter/s)": 0.303907 }, { "acc": 0.75710988, "epoch": 0.7081441973366903, "grad_norm": 3.4375, "learning_rate": 7.678021812145303e-06, "loss": 0.90976563, "memory(GiB)": 728.98, "step": 27915, "train_speed(iter/s)": 0.30373 }, { "acc": 0.76995769, "epoch": 0.7082710367057279, "grad_norm": 3.5, "learning_rate": 7.677136199369563e-06, "loss": 0.86877813, "memory(GiB)": 728.98, "step": 27920, "train_speed(iter/s)": 0.303562 }, { "acc": 0.77769027, "epoch": 0.7083978760747655, "grad_norm": 3.0, "learning_rate": 7.676250468835135e-06, "loss": 0.86596603, "memory(GiB)": 728.98, "step": 27925, "train_speed(iter/s)": 0.303366 }, { "acc": 0.75310225, "epoch": 0.708524715443803, "grad_norm": 3.671875, "learning_rate": 7.675364620580973e-06, "loss": 0.95722189, "memory(GiB)": 728.98, "step": 27930, "train_speed(iter/s)": 0.303183 }, { "acc": 0.76667433, "epoch": 0.7086515548128406, "grad_norm": 3.65625, "learning_rate": 7.674478654646046e-06, "loss": 0.86168032, "memory(GiB)": 728.98, "step": 27935, "train_speed(iter/s)": 0.30302 }, { "acc": 0.76688094, "epoch": 0.7087783941818782, "grad_norm": 3.03125, "learning_rate": 7.673592571069325e-06, "loss": 0.84719572, "memory(GiB)": 728.98, "step": 27940, "train_speed(iter/s)": 0.302831 }, { "acc": 0.76087317, "epoch": 0.7089052335509157, "grad_norm": 3.390625, "learning_rate": 7.672706369889788e-06, "loss": 0.94952192, "memory(GiB)": 728.98, "step": 27945, "train_speed(iter/s)": 0.302675 }, { "acc": 0.76512527, "epoch": 0.7090320729199533, "grad_norm": 3.828125, "learning_rate": 7.67182005114641e-06, "loss": 0.92969837, "memory(GiB)": 728.98, "step": 27950, "train_speed(iter/s)": 0.302515 }, { "acc": 0.77109346, "epoch": 0.7091589122889909, "grad_norm": 3.796875, "learning_rate": 7.670933614878182e-06, "loss": 0.87738094, "memory(GiB)": 728.98, "step": 27955, "train_speed(iter/s)": 0.30234 }, { "acc": 0.77003908, "epoch": 0.7092857516580284, "grad_norm": 3.515625, "learning_rate": 7.670047061124094e-06, "loss": 0.92699957, "memory(GiB)": 728.98, "step": 27960, "train_speed(iter/s)": 0.30217 }, { "acc": 0.76858492, "epoch": 0.7094125910270659, "grad_norm": 3.578125, "learning_rate": 7.669160389923145e-06, "loss": 0.88428507, "memory(GiB)": 728.98, "step": 27965, "train_speed(iter/s)": 0.302021 }, { "acc": 0.75698156, "epoch": 0.7095394303961035, "grad_norm": 3.296875, "learning_rate": 7.668273601314334e-06, "loss": 0.93676939, "memory(GiB)": 728.98, "step": 27970, "train_speed(iter/s)": 0.301859 }, { "acc": 0.75745664, "epoch": 0.709666269765141, "grad_norm": 3.65625, "learning_rate": 7.667386695336667e-06, "loss": 0.93353262, "memory(GiB)": 728.98, "step": 27975, "train_speed(iter/s)": 0.301716 }, { "acc": 0.75804543, "epoch": 0.7097931091341786, "grad_norm": 3.1875, "learning_rate": 7.66649967202916e-06, "loss": 0.90822105, "memory(GiB)": 728.98, "step": 27980, "train_speed(iter/s)": 0.301537 }, { "acc": 0.77411203, "epoch": 0.7099199485032162, "grad_norm": 3.578125, "learning_rate": 7.665612531430827e-06, "loss": 0.87263584, "memory(GiB)": 728.98, "step": 27985, "train_speed(iter/s)": 0.301374 }, { "acc": 0.76739841, "epoch": 0.7100467878722537, "grad_norm": 2.90625, "learning_rate": 7.664725273580693e-06, "loss": 0.87960243, "memory(GiB)": 728.98, "step": 27990, "train_speed(iter/s)": 0.30118 }, { "acc": 0.76745167, "epoch": 0.7101736272412913, "grad_norm": 3.59375, "learning_rate": 7.663837898517784e-06, "loss": 0.89948645, "memory(GiB)": 728.98, "step": 27995, "train_speed(iter/s)": 0.301032 }, { "acc": 0.75680389, "epoch": 0.7103004666103289, "grad_norm": 4.03125, "learning_rate": 7.662950406281133e-06, "loss": 0.9036273, "memory(GiB)": 728.98, "step": 28000, "train_speed(iter/s)": 0.300877 }, { "epoch": 0.7103004666103289, "eval_acc": 0.7553805296203289, "eval_loss": 0.868760347366333, "eval_runtime": 1153.1221, "eval_samples_per_second": 5.524, "eval_steps_per_second": 5.524, "step": 28000 }, { "acc": 0.75376711, "epoch": 0.7104273059793664, "grad_norm": 3.609375, "learning_rate": 7.662062796909781e-06, "loss": 0.90994663, "memory(GiB)": 728.98, "step": 28005, "train_speed(iter/s)": 0.294777 }, { "acc": 0.77091885, "epoch": 0.710554145348404, "grad_norm": 3.265625, "learning_rate": 7.661175070442765e-06, "loss": 0.8313282, "memory(GiB)": 728.98, "step": 28010, "train_speed(iter/s)": 0.294606 }, { "acc": 0.76150136, "epoch": 0.7106809847174416, "grad_norm": 4.15625, "learning_rate": 7.660287226919137e-06, "loss": 0.92723761, "memory(GiB)": 728.98, "step": 28015, "train_speed(iter/s)": 0.294443 }, { "acc": 0.73945341, "epoch": 0.7108078240864791, "grad_norm": 3.0625, "learning_rate": 7.65939926637795e-06, "loss": 0.93256483, "memory(GiB)": 728.98, "step": 28020, "train_speed(iter/s)": 0.294282 }, { "acc": 0.77570367, "epoch": 0.7109346634555166, "grad_norm": 3.359375, "learning_rate": 7.658511188858265e-06, "loss": 0.88770199, "memory(GiB)": 728.98, "step": 28025, "train_speed(iter/s)": 0.294126 }, { "acc": 0.77041464, "epoch": 0.7110615028245542, "grad_norm": 2.921875, "learning_rate": 7.657622994399143e-06, "loss": 0.865483, "memory(GiB)": 728.98, "step": 28030, "train_speed(iter/s)": 0.293959 }, { "acc": 0.77080679, "epoch": 0.7111883421935917, "grad_norm": 3.296875, "learning_rate": 7.656734683039651e-06, "loss": 0.90397968, "memory(GiB)": 728.98, "step": 28035, "train_speed(iter/s)": 0.293807 }, { "acc": 0.77450728, "epoch": 0.7113151815626293, "grad_norm": 3.53125, "learning_rate": 7.655846254818867e-06, "loss": 0.92836256, "memory(GiB)": 728.98, "step": 28040, "train_speed(iter/s)": 0.293617 }, { "acc": 0.76253257, "epoch": 0.7114420209316669, "grad_norm": 4.96875, "learning_rate": 7.654957709775867e-06, "loss": 0.94682093, "memory(GiB)": 728.98, "step": 28045, "train_speed(iter/s)": 0.293474 }, { "acc": 0.76299987, "epoch": 0.7115688603007044, "grad_norm": 4.09375, "learning_rate": 7.654069047949738e-06, "loss": 0.90233774, "memory(GiB)": 728.98, "step": 28050, "train_speed(iter/s)": 0.29331 }, { "acc": 0.77608566, "epoch": 0.711695699669742, "grad_norm": 3.453125, "learning_rate": 7.653180269379567e-06, "loss": 0.88790741, "memory(GiB)": 728.98, "step": 28055, "train_speed(iter/s)": 0.293166 }, { "acc": 0.77553668, "epoch": 0.7118225390387796, "grad_norm": 3.703125, "learning_rate": 7.65229137410445e-06, "loss": 0.80682402, "memory(GiB)": 728.98, "step": 28060, "train_speed(iter/s)": 0.293018 }, { "acc": 0.76760421, "epoch": 0.7119493784078171, "grad_norm": 3.4375, "learning_rate": 7.651402362163485e-06, "loss": 0.93133364, "memory(GiB)": 728.98, "step": 28065, "train_speed(iter/s)": 0.292877 }, { "acc": 0.76858778, "epoch": 0.7120762177768547, "grad_norm": 3.453125, "learning_rate": 7.650513233595779e-06, "loss": 0.86938286, "memory(GiB)": 728.98, "step": 28070, "train_speed(iter/s)": 0.292722 }, { "acc": 0.75271659, "epoch": 0.7122030571458923, "grad_norm": 3.5, "learning_rate": 7.64962398844044e-06, "loss": 0.93246164, "memory(GiB)": 728.98, "step": 28075, "train_speed(iter/s)": 0.292578 }, { "acc": 0.76721644, "epoch": 0.7123298965149298, "grad_norm": 3.171875, "learning_rate": 7.648734626736588e-06, "loss": 0.81867132, "memory(GiB)": 728.98, "step": 28080, "train_speed(iter/s)": 0.292408 }, { "acc": 0.76403637, "epoch": 0.7124567358839673, "grad_norm": 3.21875, "learning_rate": 7.647845148523334e-06, "loss": 0.93667326, "memory(GiB)": 728.98, "step": 28085, "train_speed(iter/s)": 0.292258 }, { "acc": 0.75199137, "epoch": 0.7125835752530049, "grad_norm": 3.421875, "learning_rate": 7.64695555383981e-06, "loss": 0.92014561, "memory(GiB)": 728.98, "step": 28090, "train_speed(iter/s)": 0.292085 }, { "acc": 0.75533376, "epoch": 0.7127104146220424, "grad_norm": 3.5, "learning_rate": 7.646065842725145e-06, "loss": 0.90165758, "memory(GiB)": 728.98, "step": 28095, "train_speed(iter/s)": 0.291941 }, { "acc": 0.77054744, "epoch": 0.71283725399108, "grad_norm": 4.4375, "learning_rate": 7.645176015218475e-06, "loss": 0.93982058, "memory(GiB)": 728.98, "step": 28100, "train_speed(iter/s)": 0.291804 }, { "acc": 0.76400843, "epoch": 0.7129640933601176, "grad_norm": 3.625, "learning_rate": 7.64428607135894e-06, "loss": 0.88948145, "memory(GiB)": 728.98, "step": 28105, "train_speed(iter/s)": 0.291647 }, { "acc": 0.76418433, "epoch": 0.7130909327291551, "grad_norm": 3.0625, "learning_rate": 7.643396011185686e-06, "loss": 0.9283308, "memory(GiB)": 728.98, "step": 28110, "train_speed(iter/s)": 0.291476 }, { "acc": 0.78514285, "epoch": 0.7132177720981927, "grad_norm": 3.453125, "learning_rate": 7.642505834737863e-06, "loss": 0.81253204, "memory(GiB)": 728.98, "step": 28115, "train_speed(iter/s)": 0.291305 }, { "acc": 0.76571584, "epoch": 0.7133446114672303, "grad_norm": 3.21875, "learning_rate": 7.641615542054629e-06, "loss": 0.91909666, "memory(GiB)": 728.98, "step": 28120, "train_speed(iter/s)": 0.291147 }, { "acc": 0.75619078, "epoch": 0.7134714508362678, "grad_norm": 3.0625, "learning_rate": 7.640725133175144e-06, "loss": 0.93380585, "memory(GiB)": 728.98, "step": 28125, "train_speed(iter/s)": 0.290987 }, { "acc": 0.76587954, "epoch": 0.7135982902053054, "grad_norm": 3.125, "learning_rate": 7.639834608138574e-06, "loss": 0.85637236, "memory(GiB)": 728.98, "step": 28130, "train_speed(iter/s)": 0.290834 }, { "acc": 0.75712037, "epoch": 0.713725129574343, "grad_norm": 3.40625, "learning_rate": 7.63894396698409e-06, "loss": 0.94099979, "memory(GiB)": 728.98, "step": 28135, "train_speed(iter/s)": 0.290688 }, { "acc": 0.77006807, "epoch": 0.7138519689433805, "grad_norm": 3.203125, "learning_rate": 7.638053209750869e-06, "loss": 0.83714218, "memory(GiB)": 728.98, "step": 28140, "train_speed(iter/s)": 0.290543 }, { "acc": 0.77496905, "epoch": 0.713978808312418, "grad_norm": 3.96875, "learning_rate": 7.637162336478093e-06, "loss": 0.85699844, "memory(GiB)": 728.98, "step": 28145, "train_speed(iter/s)": 0.290393 }, { "acc": 0.76942921, "epoch": 0.7141056476814556, "grad_norm": 3.140625, "learning_rate": 7.636271347204948e-06, "loss": 0.84933319, "memory(GiB)": 728.98, "step": 28150, "train_speed(iter/s)": 0.290241 }, { "acc": 0.75308604, "epoch": 0.7142324870504931, "grad_norm": 3.34375, "learning_rate": 7.635380241970626e-06, "loss": 0.91220922, "memory(GiB)": 728.98, "step": 28155, "train_speed(iter/s)": 0.29008 }, { "acc": 0.76218648, "epoch": 0.7143593264195307, "grad_norm": 4.21875, "learning_rate": 7.634489020814325e-06, "loss": 0.9624855, "memory(GiB)": 728.98, "step": 28160, "train_speed(iter/s)": 0.289942 }, { "acc": 0.77223687, "epoch": 0.7144861657885683, "grad_norm": 3.234375, "learning_rate": 7.633597683775245e-06, "loss": 0.83024702, "memory(GiB)": 728.98, "step": 28165, "train_speed(iter/s)": 0.28979 }, { "acc": 0.75866261, "epoch": 0.7146130051576058, "grad_norm": 3.859375, "learning_rate": 7.632706230892595e-06, "loss": 0.89248495, "memory(GiB)": 728.98, "step": 28170, "train_speed(iter/s)": 0.28964 }, { "acc": 0.77035728, "epoch": 0.7147398445266434, "grad_norm": 3.25, "learning_rate": 7.631814662205586e-06, "loss": 0.90420465, "memory(GiB)": 728.98, "step": 28175, "train_speed(iter/s)": 0.289502 }, { "acc": 0.76584339, "epoch": 0.714866683895681, "grad_norm": 3.3125, "learning_rate": 7.630922977753435e-06, "loss": 0.90530205, "memory(GiB)": 728.98, "step": 28180, "train_speed(iter/s)": 0.289369 }, { "acc": 0.75944405, "epoch": 0.7149935232647185, "grad_norm": 3.5, "learning_rate": 7.630031177575366e-06, "loss": 0.85997295, "memory(GiB)": 728.98, "step": 28185, "train_speed(iter/s)": 0.289203 }, { "acc": 0.76480069, "epoch": 0.7151203626337561, "grad_norm": 2.96875, "learning_rate": 7.629139261710603e-06, "loss": 0.94036245, "memory(GiB)": 728.98, "step": 28190, "train_speed(iter/s)": 0.289066 }, { "acc": 0.75265446, "epoch": 0.7152472020027937, "grad_norm": 3.265625, "learning_rate": 7.628247230198382e-06, "loss": 0.94309855, "memory(GiB)": 728.98, "step": 28195, "train_speed(iter/s)": 0.288918 }, { "acc": 0.76431999, "epoch": 0.7153740413718312, "grad_norm": 3.765625, "learning_rate": 7.62735508307794e-06, "loss": 0.90408707, "memory(GiB)": 728.98, "step": 28200, "train_speed(iter/s)": 0.288783 }, { "acc": 0.76568871, "epoch": 0.7155008807408687, "grad_norm": 3.515625, "learning_rate": 7.62646282038852e-06, "loss": 0.86304531, "memory(GiB)": 728.98, "step": 28205, "train_speed(iter/s)": 0.288638 }, { "acc": 0.76779056, "epoch": 0.7156277201099063, "grad_norm": 3.375, "learning_rate": 7.625570442169368e-06, "loss": 0.92311916, "memory(GiB)": 728.98, "step": 28210, "train_speed(iter/s)": 0.288505 }, { "acc": 0.76917372, "epoch": 0.7157545594789438, "grad_norm": 4.46875, "learning_rate": 7.6246779484597365e-06, "loss": 0.90355082, "memory(GiB)": 728.98, "step": 28215, "train_speed(iter/s)": 0.28837 }, { "acc": 0.76324425, "epoch": 0.7158813988479814, "grad_norm": 3.5625, "learning_rate": 7.623785339298886e-06, "loss": 0.91937218, "memory(GiB)": 728.98, "step": 28220, "train_speed(iter/s)": 0.288223 }, { "acc": 0.75817342, "epoch": 0.716008238217019, "grad_norm": 3.578125, "learning_rate": 7.622892614726078e-06, "loss": 0.90295858, "memory(GiB)": 728.98, "step": 28225, "train_speed(iter/s)": 0.288069 }, { "acc": 0.77975788, "epoch": 0.7161350775860565, "grad_norm": 3.25, "learning_rate": 7.621999774780582e-06, "loss": 0.8296854, "memory(GiB)": 728.98, "step": 28230, "train_speed(iter/s)": 0.287913 }, { "acc": 0.76756077, "epoch": 0.7162619169550941, "grad_norm": 2.8125, "learning_rate": 7.621106819501668e-06, "loss": 0.87861824, "memory(GiB)": 728.98, "step": 28235, "train_speed(iter/s)": 0.287768 }, { "acc": 0.77491665, "epoch": 0.7163887563241317, "grad_norm": 3.125, "learning_rate": 7.620213748928617e-06, "loss": 0.85295963, "memory(GiB)": 728.98, "step": 28240, "train_speed(iter/s)": 0.287603 }, { "acc": 0.74852962, "epoch": 0.7165155956931692, "grad_norm": 3.6875, "learning_rate": 7.6193205631007124e-06, "loss": 0.99664602, "memory(GiB)": 728.98, "step": 28245, "train_speed(iter/s)": 0.287477 }, { "acc": 0.75617976, "epoch": 0.7166424350622068, "grad_norm": 3.40625, "learning_rate": 7.618427262057242e-06, "loss": 0.94049292, "memory(GiB)": 728.98, "step": 28250, "train_speed(iter/s)": 0.287333 }, { "acc": 0.76531343, "epoch": 0.7167692744312444, "grad_norm": 3.421875, "learning_rate": 7.617533845837499e-06, "loss": 0.90692034, "memory(GiB)": 728.98, "step": 28255, "train_speed(iter/s)": 0.287193 }, { "acc": 0.76678181, "epoch": 0.716896113800282, "grad_norm": 3.703125, "learning_rate": 7.616640314480783e-06, "loss": 0.90245724, "memory(GiB)": 728.98, "step": 28260, "train_speed(iter/s)": 0.287037 }, { "acc": 0.75979595, "epoch": 0.7170229531693194, "grad_norm": 3.296875, "learning_rate": 7.615746668026396e-06, "loss": 0.94741993, "memory(GiB)": 728.98, "step": 28265, "train_speed(iter/s)": 0.286884 }, { "acc": 0.77651472, "epoch": 0.717149792538357, "grad_norm": 3.203125, "learning_rate": 7.614852906513646e-06, "loss": 0.86601868, "memory(GiB)": 728.98, "step": 28270, "train_speed(iter/s)": 0.286735 }, { "acc": 0.7752305, "epoch": 0.7172766319073945, "grad_norm": 3.25, "learning_rate": 7.61395902998185e-06, "loss": 0.81595287, "memory(GiB)": 728.98, "step": 28275, "train_speed(iter/s)": 0.286588 }, { "acc": 0.76428447, "epoch": 0.7174034712764321, "grad_norm": 4.625, "learning_rate": 7.613065038470323e-06, "loss": 0.9039897, "memory(GiB)": 728.98, "step": 28280, "train_speed(iter/s)": 0.286442 }, { "acc": 0.76545506, "epoch": 0.7175303106454697, "grad_norm": 3.21875, "learning_rate": 7.612170932018392e-06, "loss": 0.91536407, "memory(GiB)": 728.98, "step": 28285, "train_speed(iter/s)": 0.286288 }, { "acc": 0.77281685, "epoch": 0.7176571500145073, "grad_norm": 3.09375, "learning_rate": 7.611276710665384e-06, "loss": 0.87849293, "memory(GiB)": 728.98, "step": 28290, "train_speed(iter/s)": 0.28615 }, { "acc": 0.76544809, "epoch": 0.7177839893835448, "grad_norm": 4.75, "learning_rate": 7.610382374450633e-06, "loss": 0.88024092, "memory(GiB)": 728.98, "step": 28295, "train_speed(iter/s)": 0.285977 }, { "acc": 0.76655564, "epoch": 0.7179108287525824, "grad_norm": 3.296875, "learning_rate": 7.60948792341348e-06, "loss": 0.90526581, "memory(GiB)": 728.98, "step": 28300, "train_speed(iter/s)": 0.285836 }, { "acc": 0.77905107, "epoch": 0.71803766812162, "grad_norm": 3.828125, "learning_rate": 7.608593357593266e-06, "loss": 0.86352835, "memory(GiB)": 728.98, "step": 28305, "train_speed(iter/s)": 0.285685 }, { "acc": 0.75302444, "epoch": 0.7181645074906575, "grad_norm": 3.78125, "learning_rate": 7.607698677029342e-06, "loss": 0.87360191, "memory(GiB)": 728.98, "step": 28310, "train_speed(iter/s)": 0.285539 }, { "acc": 0.77156968, "epoch": 0.7182913468596951, "grad_norm": 3.015625, "learning_rate": 7.606803881761063e-06, "loss": 0.86097326, "memory(GiB)": 728.98, "step": 28315, "train_speed(iter/s)": 0.285403 }, { "acc": 0.76097164, "epoch": 0.7184181862287327, "grad_norm": 3.296875, "learning_rate": 7.605908971827787e-06, "loss": 0.93294163, "memory(GiB)": 728.98, "step": 28320, "train_speed(iter/s)": 0.285252 }, { "acc": 0.75526967, "epoch": 0.7185450255977701, "grad_norm": 3.4375, "learning_rate": 7.605013947268878e-06, "loss": 0.90887127, "memory(GiB)": 728.98, "step": 28325, "train_speed(iter/s)": 0.285118 }, { "acc": 0.78000445, "epoch": 0.7186718649668077, "grad_norm": 3.546875, "learning_rate": 7.604118808123705e-06, "loss": 0.82461443, "memory(GiB)": 728.98, "step": 28330, "train_speed(iter/s)": 0.284963 }, { "acc": 0.7590066, "epoch": 0.7187987043358453, "grad_norm": 3.25, "learning_rate": 7.603223554431644e-06, "loss": 0.91423893, "memory(GiB)": 728.98, "step": 28335, "train_speed(iter/s)": 0.284834 }, { "acc": 0.75893683, "epoch": 0.7189255437048828, "grad_norm": 3.53125, "learning_rate": 7.6023281862320705e-06, "loss": 0.91752977, "memory(GiB)": 728.98, "step": 28340, "train_speed(iter/s)": 0.284688 }, { "acc": 0.76738462, "epoch": 0.7190523830739204, "grad_norm": 3.484375, "learning_rate": 7.601432703564375e-06, "loss": 0.89531956, "memory(GiB)": 728.98, "step": 28345, "train_speed(iter/s)": 0.284544 }, { "acc": 0.76949301, "epoch": 0.719179222442958, "grad_norm": 4.1875, "learning_rate": 7.600537106467941e-06, "loss": 0.89262381, "memory(GiB)": 728.98, "step": 28350, "train_speed(iter/s)": 0.284373 }, { "acc": 0.75871005, "epoch": 0.7193060618119955, "grad_norm": 3.484375, "learning_rate": 7.599641394982167e-06, "loss": 0.90004826, "memory(GiB)": 728.98, "step": 28355, "train_speed(iter/s)": 0.284223 }, { "acc": 0.76194897, "epoch": 0.7194329011810331, "grad_norm": 3.8125, "learning_rate": 7.5987455691464505e-06, "loss": 0.90713911, "memory(GiB)": 728.98, "step": 28360, "train_speed(iter/s)": 0.284087 }, { "acc": 0.79210458, "epoch": 0.7195597405500707, "grad_norm": 3.03125, "learning_rate": 7.5978496290001955e-06, "loss": 0.81041784, "memory(GiB)": 728.98, "step": 28365, "train_speed(iter/s)": 0.283953 }, { "acc": 0.77582111, "epoch": 0.7196865799191082, "grad_norm": 4.40625, "learning_rate": 7.596953574582815e-06, "loss": 0.82591543, "memory(GiB)": 728.98, "step": 28370, "train_speed(iter/s)": 0.283803 }, { "acc": 0.7670012, "epoch": 0.7198134192881458, "grad_norm": 3.09375, "learning_rate": 7.596057405933719e-06, "loss": 0.85812979, "memory(GiB)": 728.98, "step": 28375, "train_speed(iter/s)": 0.283655 }, { "acc": 0.75399413, "epoch": 0.7199402586571834, "grad_norm": 3.28125, "learning_rate": 7.59516112309233e-06, "loss": 0.90102501, "memory(GiB)": 728.98, "step": 28380, "train_speed(iter/s)": 0.283507 }, { "acc": 0.77636962, "epoch": 0.7200670980262208, "grad_norm": 3.421875, "learning_rate": 7.594264726098072e-06, "loss": 0.85758934, "memory(GiB)": 728.98, "step": 28385, "train_speed(iter/s)": 0.283354 }, { "acc": 0.76504006, "epoch": 0.7201939373952584, "grad_norm": 3.09375, "learning_rate": 7.593368214990376e-06, "loss": 0.86601572, "memory(GiB)": 728.98, "step": 28390, "train_speed(iter/s)": 0.283229 }, { "acc": 0.7792604, "epoch": 0.720320776764296, "grad_norm": 4.34375, "learning_rate": 7.592471589808672e-06, "loss": 0.89606485, "memory(GiB)": 728.98, "step": 28395, "train_speed(iter/s)": 0.283105 }, { "acc": 0.75813389, "epoch": 0.7204476161333335, "grad_norm": 3.578125, "learning_rate": 7.591574850592405e-06, "loss": 0.92495947, "memory(GiB)": 728.98, "step": 28400, "train_speed(iter/s)": 0.282983 }, { "acc": 0.77421355, "epoch": 0.7205744555023711, "grad_norm": 3.703125, "learning_rate": 7.590677997381016e-06, "loss": 0.867278, "memory(GiB)": 728.98, "step": 28405, "train_speed(iter/s)": 0.282821 }, { "acc": 0.76467075, "epoch": 0.7207012948714087, "grad_norm": 3.765625, "learning_rate": 7.5897810302139586e-06, "loss": 0.85899105, "memory(GiB)": 728.98, "step": 28410, "train_speed(iter/s)": 0.282676 }, { "acc": 0.75885181, "epoch": 0.7208281342404462, "grad_norm": 3.90625, "learning_rate": 7.588883949130683e-06, "loss": 0.91525269, "memory(GiB)": 728.98, "step": 28415, "train_speed(iter/s)": 0.282519 }, { "acc": 0.74525762, "epoch": 0.7209549736094838, "grad_norm": 3.328125, "learning_rate": 7.5879867541706535e-06, "loss": 0.90717249, "memory(GiB)": 728.98, "step": 28420, "train_speed(iter/s)": 0.282386 }, { "acc": 0.77017169, "epoch": 0.7210818129785214, "grad_norm": 3.5, "learning_rate": 7.58708944537333e-06, "loss": 0.88832808, "memory(GiB)": 728.98, "step": 28425, "train_speed(iter/s)": 0.28225 }, { "acc": 0.77874522, "epoch": 0.7212086523475589, "grad_norm": 4.4375, "learning_rate": 7.586192022778186e-06, "loss": 0.85599365, "memory(GiB)": 728.98, "step": 28430, "train_speed(iter/s)": 0.282126 }, { "acc": 0.77686682, "epoch": 0.7213354917165965, "grad_norm": 4.15625, "learning_rate": 7.585294486424694e-06, "loss": 0.81722631, "memory(GiB)": 728.98, "step": 28435, "train_speed(iter/s)": 0.281983 }, { "acc": 0.7710156, "epoch": 0.7214623310856341, "grad_norm": 4.28125, "learning_rate": 7.584396836352337e-06, "loss": 0.85209694, "memory(GiB)": 728.98, "step": 28440, "train_speed(iter/s)": 0.281847 }, { "acc": 0.77267947, "epoch": 0.7215891704546715, "grad_norm": 3.640625, "learning_rate": 7.583499072600595e-06, "loss": 0.87512827, "memory(GiB)": 728.98, "step": 28445, "train_speed(iter/s)": 0.281709 }, { "acc": 0.76060662, "epoch": 0.7217160098237091, "grad_norm": 3.078125, "learning_rate": 7.582601195208963e-06, "loss": 0.90775375, "memory(GiB)": 728.98, "step": 28450, "train_speed(iter/s)": 0.281565 }, { "acc": 0.76169114, "epoch": 0.7218428491927467, "grad_norm": 3.296875, "learning_rate": 7.58170320421693e-06, "loss": 0.91383219, "memory(GiB)": 728.98, "step": 28455, "train_speed(iter/s)": 0.281433 }, { "acc": 0.77593703, "epoch": 0.7219696885617842, "grad_norm": 3.96875, "learning_rate": 7.580805099664001e-06, "loss": 0.85971575, "memory(GiB)": 728.98, "step": 28460, "train_speed(iter/s)": 0.281275 }, { "acc": 0.75336103, "epoch": 0.7220965279308218, "grad_norm": 3.546875, "learning_rate": 7.579906881589678e-06, "loss": 0.93251381, "memory(GiB)": 728.98, "step": 28465, "train_speed(iter/s)": 0.281151 }, { "acc": 0.75255923, "epoch": 0.7222233672998594, "grad_norm": 4.53125, "learning_rate": 7.5790085500334736e-06, "loss": 0.91313543, "memory(GiB)": 728.98, "step": 28470, "train_speed(iter/s)": 0.281008 }, { "acc": 0.77317624, "epoch": 0.7223502066688969, "grad_norm": 4.09375, "learning_rate": 7.578110105034897e-06, "loss": 0.9423934, "memory(GiB)": 728.98, "step": 28475, "train_speed(iter/s)": 0.280874 }, { "acc": 0.77852631, "epoch": 0.7224770460379345, "grad_norm": 3.265625, "learning_rate": 7.577211546633472e-06, "loss": 0.8877532, "memory(GiB)": 728.98, "step": 28480, "train_speed(iter/s)": 0.280721 }, { "acc": 0.77356968, "epoch": 0.7226038854069721, "grad_norm": 3.515625, "learning_rate": 7.5763128748687235e-06, "loss": 0.86454639, "memory(GiB)": 728.98, "step": 28485, "train_speed(iter/s)": 0.280591 }, { "acc": 0.76043544, "epoch": 0.7227307247760096, "grad_norm": 3.359375, "learning_rate": 7.57541408978018e-06, "loss": 0.91930885, "memory(GiB)": 728.98, "step": 28490, "train_speed(iter/s)": 0.280459 }, { "acc": 0.77103596, "epoch": 0.7228575641450472, "grad_norm": 3.90625, "learning_rate": 7.574515191407377e-06, "loss": 0.92126493, "memory(GiB)": 728.98, "step": 28495, "train_speed(iter/s)": 0.2803 }, { "acc": 0.76867495, "epoch": 0.7229844035140848, "grad_norm": 4.1875, "learning_rate": 7.573616179789851e-06, "loss": 0.89506063, "memory(GiB)": 728.98, "step": 28500, "train_speed(iter/s)": 0.280161 }, { "epoch": 0.7229844035140848, "eval_acc": 0.7554139510050855, "eval_loss": 0.8673712611198425, "eval_runtime": 1152.1296, "eval_samples_per_second": 5.529, "eval_steps_per_second": 5.529, "step": 28500 }, { "acc": 0.77902136, "epoch": 0.7231112428831222, "grad_norm": 3.3125, "learning_rate": 7.572717054967152e-06, "loss": 0.8134656, "memory(GiB)": 728.98, "step": 28505, "train_speed(iter/s)": 0.274955 }, { "acc": 0.76797609, "epoch": 0.7232380822521598, "grad_norm": 3.953125, "learning_rate": 7.571817816978826e-06, "loss": 0.90474939, "memory(GiB)": 728.98, "step": 28510, "train_speed(iter/s)": 0.274836 }, { "acc": 0.76936102, "epoch": 0.7233649216211974, "grad_norm": 3.015625, "learning_rate": 7.5709184658644295e-06, "loss": 0.87377186, "memory(GiB)": 728.98, "step": 28515, "train_speed(iter/s)": 0.274708 }, { "acc": 0.75891786, "epoch": 0.7234917609902349, "grad_norm": 3.765625, "learning_rate": 7.57001900166352e-06, "loss": 0.9045929, "memory(GiB)": 728.98, "step": 28520, "train_speed(iter/s)": 0.274591 }, { "acc": 0.75044522, "epoch": 0.7236186003592725, "grad_norm": 3.71875, "learning_rate": 7.569119424415663e-06, "loss": 0.91916046, "memory(GiB)": 728.98, "step": 28525, "train_speed(iter/s)": 0.274466 }, { "acc": 0.74962363, "epoch": 0.7237454397283101, "grad_norm": 3.71875, "learning_rate": 7.568219734160429e-06, "loss": 0.9470787, "memory(GiB)": 728.98, "step": 28530, "train_speed(iter/s)": 0.274319 }, { "acc": 0.76279774, "epoch": 0.7238722790973476, "grad_norm": 3.703125, "learning_rate": 7.5673199309373915e-06, "loss": 0.91915894, "memory(GiB)": 728.98, "step": 28535, "train_speed(iter/s)": 0.274185 }, { "acc": 0.76495938, "epoch": 0.7239991184663852, "grad_norm": 4.40625, "learning_rate": 7.566420014786131e-06, "loss": 0.84658823, "memory(GiB)": 728.98, "step": 28540, "train_speed(iter/s)": 0.274058 }, { "acc": 0.76712055, "epoch": 0.7241259578354228, "grad_norm": 3.234375, "learning_rate": 7.56551998574623e-06, "loss": 0.88780031, "memory(GiB)": 728.98, "step": 28545, "train_speed(iter/s)": 0.273931 }, { "acc": 0.77363572, "epoch": 0.7242527972044603, "grad_norm": 3.421875, "learning_rate": 7.564619843857279e-06, "loss": 0.84716187, "memory(GiB)": 728.98, "step": 28550, "train_speed(iter/s)": 0.273811 }, { "acc": 0.77492895, "epoch": 0.7243796365734979, "grad_norm": 4.375, "learning_rate": 7.563719589158874e-06, "loss": 0.8877593, "memory(GiB)": 728.98, "step": 28555, "train_speed(iter/s)": 0.27368 }, { "acc": 0.7691628, "epoch": 0.7245064759425355, "grad_norm": 3.421875, "learning_rate": 7.562819221690611e-06, "loss": 0.8785285, "memory(GiB)": 728.98, "step": 28560, "train_speed(iter/s)": 0.273537 }, { "acc": 0.77015457, "epoch": 0.7246333153115729, "grad_norm": 3.46875, "learning_rate": 7.561918741492096e-06, "loss": 0.88401012, "memory(GiB)": 728.98, "step": 28565, "train_speed(iter/s)": 0.273398 }, { "acc": 0.75380683, "epoch": 0.7247601546806105, "grad_norm": 3.640625, "learning_rate": 7.5610181486029395e-06, "loss": 0.868402, "memory(GiB)": 728.98, "step": 28570, "train_speed(iter/s)": 0.273262 }, { "acc": 0.74398589, "epoch": 0.7248869940496481, "grad_norm": 3.421875, "learning_rate": 7.560117443062753e-06, "loss": 0.98509331, "memory(GiB)": 728.98, "step": 28575, "train_speed(iter/s)": 0.273124 }, { "acc": 0.75509472, "epoch": 0.7250138334186856, "grad_norm": 3.578125, "learning_rate": 7.55921662491116e-06, "loss": 0.94276686, "memory(GiB)": 728.98, "step": 28580, "train_speed(iter/s)": 0.273011 }, { "acc": 0.77481289, "epoch": 0.7251406727877232, "grad_norm": 4.6875, "learning_rate": 7.558315694187779e-06, "loss": 0.86626234, "memory(GiB)": 728.98, "step": 28585, "train_speed(iter/s)": 0.272873 }, { "acc": 0.76786938, "epoch": 0.7252675121567608, "grad_norm": 3.515625, "learning_rate": 7.557414650932245e-06, "loss": 0.87657146, "memory(GiB)": 728.98, "step": 28590, "train_speed(iter/s)": 0.272745 }, { "acc": 0.77297058, "epoch": 0.7253943515257983, "grad_norm": 3.234375, "learning_rate": 7.5565134951841855e-06, "loss": 0.85350447, "memory(GiB)": 728.98, "step": 28595, "train_speed(iter/s)": 0.272624 }, { "acc": 0.76788082, "epoch": 0.7255211908948359, "grad_norm": 6.15625, "learning_rate": 7.5556122269832445e-06, "loss": 0.84254246, "memory(GiB)": 728.98, "step": 28600, "train_speed(iter/s)": 0.272488 }, { "acc": 0.74215574, "epoch": 0.7256480302638735, "grad_norm": 4.21875, "learning_rate": 7.5547108463690646e-06, "loss": 0.95675974, "memory(GiB)": 728.98, "step": 28605, "train_speed(iter/s)": 0.272366 }, { "acc": 0.76552019, "epoch": 0.725774869632911, "grad_norm": 4.25, "learning_rate": 7.553809353381293e-06, "loss": 0.93503838, "memory(GiB)": 728.98, "step": 28610, "train_speed(iter/s)": 0.27224 }, { "acc": 0.77393942, "epoch": 0.7259017090019486, "grad_norm": 3.03125, "learning_rate": 7.552907748059586e-06, "loss": 0.85323353, "memory(GiB)": 728.98, "step": 28615, "train_speed(iter/s)": 0.272114 }, { "acc": 0.7533916, "epoch": 0.7260285483709862, "grad_norm": 3.265625, "learning_rate": 7.552006030443601e-06, "loss": 0.92066927, "memory(GiB)": 728.98, "step": 28620, "train_speed(iter/s)": 0.271978 }, { "acc": 0.76602745, "epoch": 0.7261553877400236, "grad_norm": 3.6875, "learning_rate": 7.551104200573003e-06, "loss": 0.928965, "memory(GiB)": 728.98, "step": 28625, "train_speed(iter/s)": 0.271836 }, { "acc": 0.77939582, "epoch": 0.7262822271090612, "grad_norm": 3.140625, "learning_rate": 7.550202258487458e-06, "loss": 0.83706074, "memory(GiB)": 728.98, "step": 28630, "train_speed(iter/s)": 0.271695 }, { "acc": 0.75868926, "epoch": 0.7264090664780988, "grad_norm": 2.921875, "learning_rate": 7.549300204226642e-06, "loss": 0.91900864, "memory(GiB)": 728.98, "step": 28635, "train_speed(iter/s)": 0.27157 }, { "acc": 0.75866332, "epoch": 0.7265359058471363, "grad_norm": 3.53125, "learning_rate": 7.548398037830231e-06, "loss": 0.90996141, "memory(GiB)": 728.98, "step": 28640, "train_speed(iter/s)": 0.271466 }, { "acc": 0.76996837, "epoch": 0.7266627452161739, "grad_norm": 3.09375, "learning_rate": 7.547495759337912e-06, "loss": 0.86968184, "memory(GiB)": 728.98, "step": 28645, "train_speed(iter/s)": 0.271347 }, { "acc": 0.78109746, "epoch": 0.7267895845852115, "grad_norm": 3.25, "learning_rate": 7.54659336878937e-06, "loss": 0.9024847, "memory(GiB)": 728.98, "step": 28650, "train_speed(iter/s)": 0.271212 }, { "acc": 0.77415357, "epoch": 0.726916423954249, "grad_norm": 2.734375, "learning_rate": 7.5456908662243e-06, "loss": 0.82153053, "memory(GiB)": 728.98, "step": 28655, "train_speed(iter/s)": 0.271072 }, { "acc": 0.76751771, "epoch": 0.7270432633232866, "grad_norm": 4.03125, "learning_rate": 7.5447882516824e-06, "loss": 0.91013889, "memory(GiB)": 728.98, "step": 28660, "train_speed(iter/s)": 0.270948 }, { "acc": 0.76615567, "epoch": 0.7271701026923242, "grad_norm": 3.890625, "learning_rate": 7.543885525203374e-06, "loss": 0.94316387, "memory(GiB)": 728.98, "step": 28665, "train_speed(iter/s)": 0.270841 }, { "acc": 0.75284972, "epoch": 0.7272969420613618, "grad_norm": 3.3125, "learning_rate": 7.5429826868269275e-06, "loss": 0.90038233, "memory(GiB)": 728.98, "step": 28670, "train_speed(iter/s)": 0.270708 }, { "acc": 0.76017227, "epoch": 0.7274237814303993, "grad_norm": 3.75, "learning_rate": 7.542079736592775e-06, "loss": 0.94057922, "memory(GiB)": 728.98, "step": 28675, "train_speed(iter/s)": 0.270592 }, { "acc": 0.76265841, "epoch": 0.7275506207994369, "grad_norm": 3.421875, "learning_rate": 7.5411766745406355e-06, "loss": 0.94126463, "memory(GiB)": 728.98, "step": 28680, "train_speed(iter/s)": 0.270472 }, { "acc": 0.77140474, "epoch": 0.7276774601684743, "grad_norm": 3.765625, "learning_rate": 7.540273500710231e-06, "loss": 0.85922461, "memory(GiB)": 728.98, "step": 28685, "train_speed(iter/s)": 0.270338 }, { "acc": 0.772716, "epoch": 0.7278042995375119, "grad_norm": 4.4375, "learning_rate": 7.539370215141289e-06, "loss": 0.90478115, "memory(GiB)": 728.98, "step": 28690, "train_speed(iter/s)": 0.270224 }, { "acc": 0.75999732, "epoch": 0.7279311389065495, "grad_norm": 4.09375, "learning_rate": 7.538466817873541e-06, "loss": 0.95559587, "memory(GiB)": 728.98, "step": 28695, "train_speed(iter/s)": 0.270099 }, { "acc": 0.7616611, "epoch": 0.728057978275587, "grad_norm": 4.15625, "learning_rate": 7.537563308946726e-06, "loss": 0.92129622, "memory(GiB)": 728.98, "step": 28700, "train_speed(iter/s)": 0.269956 }, { "acc": 0.7557013, "epoch": 0.7281848176446246, "grad_norm": 3.546875, "learning_rate": 7.536659688400587e-06, "loss": 0.92757311, "memory(GiB)": 728.98, "step": 28705, "train_speed(iter/s)": 0.269835 }, { "acc": 0.77857089, "epoch": 0.7283116570136622, "grad_norm": 3.1875, "learning_rate": 7.5357559562748705e-06, "loss": 0.88009548, "memory(GiB)": 728.98, "step": 28710, "train_speed(iter/s)": 0.269721 }, { "acc": 0.76586986, "epoch": 0.7284384963826998, "grad_norm": 3.359375, "learning_rate": 7.534852112609328e-06, "loss": 0.90592775, "memory(GiB)": 728.98, "step": 28715, "train_speed(iter/s)": 0.269592 }, { "acc": 0.76432409, "epoch": 0.7285653357517373, "grad_norm": 3.40625, "learning_rate": 7.533948157443718e-06, "loss": 0.88539238, "memory(GiB)": 728.98, "step": 28720, "train_speed(iter/s)": 0.269466 }, { "acc": 0.74632473, "epoch": 0.7286921751207749, "grad_norm": 3.296875, "learning_rate": 7.533044090817803e-06, "loss": 0.90807772, "memory(GiB)": 728.98, "step": 28725, "train_speed(iter/s)": 0.269323 }, { "acc": 0.76024442, "epoch": 0.7288190144898125, "grad_norm": 3.296875, "learning_rate": 7.532139912771348e-06, "loss": 0.9312748, "memory(GiB)": 728.98, "step": 28730, "train_speed(iter/s)": 0.269201 }, { "acc": 0.7745419, "epoch": 0.72894585385885, "grad_norm": 3.515625, "learning_rate": 7.531235623344127e-06, "loss": 0.86408339, "memory(GiB)": 728.98, "step": 28735, "train_speed(iter/s)": 0.269072 }, { "acc": 0.78243599, "epoch": 0.7290726932278876, "grad_norm": 3.1875, "learning_rate": 7.530331222575915e-06, "loss": 0.82654505, "memory(GiB)": 728.98, "step": 28740, "train_speed(iter/s)": 0.26895 }, { "acc": 0.76476412, "epoch": 0.729199532596925, "grad_norm": 3.109375, "learning_rate": 7.529426710506495e-06, "loss": 0.8691555, "memory(GiB)": 728.98, "step": 28745, "train_speed(iter/s)": 0.268822 }, { "acc": 0.77375326, "epoch": 0.7293263719659626, "grad_norm": 3.484375, "learning_rate": 7.528522087175654e-06, "loss": 0.86316738, "memory(GiB)": 728.98, "step": 28750, "train_speed(iter/s)": 0.26872 }, { "acc": 0.77104545, "epoch": 0.7294532113350002, "grad_norm": 4.40625, "learning_rate": 7.5276173526231825e-06, "loss": 0.92126551, "memory(GiB)": 728.98, "step": 28755, "train_speed(iter/s)": 0.268615 }, { "acc": 0.74867182, "epoch": 0.7295800507040378, "grad_norm": 4.03125, "learning_rate": 7.526712506888876e-06, "loss": 0.94501743, "memory(GiB)": 728.98, "step": 28760, "train_speed(iter/s)": 0.268494 }, { "acc": 0.7766171, "epoch": 0.7297068900730753, "grad_norm": 3.5625, "learning_rate": 7.525807550012538e-06, "loss": 0.9084178, "memory(GiB)": 728.98, "step": 28765, "train_speed(iter/s)": 0.26838 }, { "acc": 0.75971313, "epoch": 0.7298337294421129, "grad_norm": 3.09375, "learning_rate": 7.524902482033974e-06, "loss": 0.92711182, "memory(GiB)": 728.98, "step": 28770, "train_speed(iter/s)": 0.268248 }, { "acc": 0.77429447, "epoch": 0.7299605688111505, "grad_norm": 3.203125, "learning_rate": 7.523997302992994e-06, "loss": 0.85102329, "memory(GiB)": 728.98, "step": 28775, "train_speed(iter/s)": 0.268126 }, { "acc": 0.76394725, "epoch": 0.730087408180188, "grad_norm": 3.5625, "learning_rate": 7.523092012929414e-06, "loss": 0.94024315, "memory(GiB)": 728.98, "step": 28780, "train_speed(iter/s)": 0.268007 }, { "acc": 0.76410551, "epoch": 0.7302142475492256, "grad_norm": 3.90625, "learning_rate": 7.522186611883055e-06, "loss": 0.87546997, "memory(GiB)": 728.98, "step": 28785, "train_speed(iter/s)": 0.267902 }, { "acc": 0.77190413, "epoch": 0.7303410869182632, "grad_norm": 3.359375, "learning_rate": 7.521281099893745e-06, "loss": 0.91146765, "memory(GiB)": 728.98, "step": 28790, "train_speed(iter/s)": 0.26777 }, { "acc": 0.75482922, "epoch": 0.7304679262873007, "grad_norm": 3.671875, "learning_rate": 7.520375477001312e-06, "loss": 0.88878727, "memory(GiB)": 728.98, "step": 28795, "train_speed(iter/s)": 0.267652 }, { "acc": 0.77691112, "epoch": 0.7305947656563383, "grad_norm": 3.625, "learning_rate": 7.519469743245592e-06, "loss": 0.85431767, "memory(GiB)": 728.98, "step": 28800, "train_speed(iter/s)": 0.26754 }, { "acc": 0.76401868, "epoch": 0.7307216050253758, "grad_norm": 4.0625, "learning_rate": 7.518563898666425e-06, "loss": 0.86033974, "memory(GiB)": 728.98, "step": 28805, "train_speed(iter/s)": 0.2674 }, { "acc": 0.77811627, "epoch": 0.7308484443944133, "grad_norm": 2.859375, "learning_rate": 7.5176579433036575e-06, "loss": 0.868153, "memory(GiB)": 728.98, "step": 28810, "train_speed(iter/s)": 0.267283 }, { "acc": 0.76527376, "epoch": 0.7309752837634509, "grad_norm": 3.671875, "learning_rate": 7.516751877197138e-06, "loss": 0.8598218, "memory(GiB)": 728.98, "step": 28815, "train_speed(iter/s)": 0.267144 }, { "acc": 0.76866074, "epoch": 0.7311021231324885, "grad_norm": 3.53125, "learning_rate": 7.515845700386723e-06, "loss": 0.91837788, "memory(GiB)": 728.98, "step": 28820, "train_speed(iter/s)": 0.267017 }, { "acc": 0.77055073, "epoch": 0.731228962501526, "grad_norm": 4.625, "learning_rate": 7.514939412912272e-06, "loss": 0.87704077, "memory(GiB)": 728.98, "step": 28825, "train_speed(iter/s)": 0.266891 }, { "acc": 0.76776433, "epoch": 0.7313558018705636, "grad_norm": 4.03125, "learning_rate": 7.514033014813649e-06, "loss": 0.86267366, "memory(GiB)": 728.98, "step": 28830, "train_speed(iter/s)": 0.266758 }, { "acc": 0.77479901, "epoch": 0.7314826412396012, "grad_norm": 3.515625, "learning_rate": 7.513126506130724e-06, "loss": 0.82617941, "memory(GiB)": 728.98, "step": 28835, "train_speed(iter/s)": 0.266635 }, { "acc": 0.77232528, "epoch": 0.7316094806086387, "grad_norm": 3.734375, "learning_rate": 7.51221988690337e-06, "loss": 0.86617785, "memory(GiB)": 728.98, "step": 28840, "train_speed(iter/s)": 0.266518 }, { "acc": 0.76486969, "epoch": 0.7317363199776763, "grad_norm": 2.8125, "learning_rate": 7.511313157171469e-06, "loss": 0.88844566, "memory(GiB)": 728.98, "step": 28845, "train_speed(iter/s)": 0.266394 }, { "acc": 0.78942876, "epoch": 0.7318631593467139, "grad_norm": 3.953125, "learning_rate": 7.510406316974903e-06, "loss": 0.80760489, "memory(GiB)": 728.98, "step": 28850, "train_speed(iter/s)": 0.266279 }, { "acc": 0.76275229, "epoch": 0.7319899987157514, "grad_norm": 3.796875, "learning_rate": 7.509499366353563e-06, "loss": 0.91450891, "memory(GiB)": 728.98, "step": 28855, "train_speed(iter/s)": 0.266152 }, { "acc": 0.76016712, "epoch": 0.732116838084789, "grad_norm": 3.828125, "learning_rate": 7.508592305347339e-06, "loss": 0.92085028, "memory(GiB)": 728.98, "step": 28860, "train_speed(iter/s)": 0.266017 }, { "acc": 0.75934906, "epoch": 0.7322436774538265, "grad_norm": 3.515625, "learning_rate": 7.507685133996136e-06, "loss": 0.92085409, "memory(GiB)": 728.98, "step": 28865, "train_speed(iter/s)": 0.265885 }, { "acc": 0.764183, "epoch": 0.732370516822864, "grad_norm": 3.28125, "learning_rate": 7.50677785233985e-06, "loss": 0.91214647, "memory(GiB)": 728.98, "step": 28870, "train_speed(iter/s)": 0.26578 }, { "acc": 0.78128757, "epoch": 0.7324973561919016, "grad_norm": 3.4375, "learning_rate": 7.5058704604183966e-06, "loss": 0.83202419, "memory(GiB)": 728.98, "step": 28875, "train_speed(iter/s)": 0.26566 }, { "acc": 0.75885496, "epoch": 0.7326241955609392, "grad_norm": 2.75, "learning_rate": 7.504962958271684e-06, "loss": 0.9305438, "memory(GiB)": 728.98, "step": 28880, "train_speed(iter/s)": 0.265554 }, { "acc": 0.76046658, "epoch": 0.7327510349299767, "grad_norm": 2.96875, "learning_rate": 7.504055345939631e-06, "loss": 0.95939922, "memory(GiB)": 728.98, "step": 28885, "train_speed(iter/s)": 0.265423 }, { "acc": 0.76357789, "epoch": 0.7328778742990143, "grad_norm": 10.9375, "learning_rate": 7.503147623462166e-06, "loss": 0.93995018, "memory(GiB)": 728.98, "step": 28890, "train_speed(iter/s)": 0.265292 }, { "acc": 0.76711736, "epoch": 0.7330047136680519, "grad_norm": 3.671875, "learning_rate": 7.502239790879209e-06, "loss": 0.88845358, "memory(GiB)": 728.98, "step": 28895, "train_speed(iter/s)": 0.265175 }, { "acc": 0.76044221, "epoch": 0.7331315530370894, "grad_norm": 3.375, "learning_rate": 7.501331848230697e-06, "loss": 0.90311174, "memory(GiB)": 728.98, "step": 28900, "train_speed(iter/s)": 0.265048 }, { "acc": 0.75955372, "epoch": 0.733258392406127, "grad_norm": 3.6875, "learning_rate": 7.500423795556566e-06, "loss": 0.86730394, "memory(GiB)": 728.98, "step": 28905, "train_speed(iter/s)": 0.264937 }, { "acc": 0.77257657, "epoch": 0.7333852317751646, "grad_norm": 3.0, "learning_rate": 7.499515632896761e-06, "loss": 0.86469784, "memory(GiB)": 728.98, "step": 28910, "train_speed(iter/s)": 0.264816 }, { "acc": 0.7849987, "epoch": 0.7335120711442021, "grad_norm": 3.8125, "learning_rate": 7.498607360291227e-06, "loss": 0.86190567, "memory(GiB)": 728.98, "step": 28915, "train_speed(iter/s)": 0.264696 }, { "acc": 0.76890225, "epoch": 0.7336389105132397, "grad_norm": 3.765625, "learning_rate": 7.497698977779915e-06, "loss": 0.89154863, "memory(GiB)": 728.98, "step": 28920, "train_speed(iter/s)": 0.264576 }, { "acc": 0.76001019, "epoch": 0.7337657498822772, "grad_norm": 3.53125, "learning_rate": 7.496790485402784e-06, "loss": 0.90139666, "memory(GiB)": 728.98, "step": 28925, "train_speed(iter/s)": 0.264458 }, { "acc": 0.7761035, "epoch": 0.7338925892513147, "grad_norm": 3.734375, "learning_rate": 7.495881883199794e-06, "loss": 0.84836721, "memory(GiB)": 728.98, "step": 28930, "train_speed(iter/s)": 0.264347 }, { "acc": 0.7545959, "epoch": 0.7340194286203523, "grad_norm": 3.375, "learning_rate": 7.494973171210913e-06, "loss": 0.9545188, "memory(GiB)": 728.98, "step": 28935, "train_speed(iter/s)": 0.264219 }, { "acc": 0.74751453, "epoch": 0.7341462679893899, "grad_norm": 4.34375, "learning_rate": 7.494064349476111e-06, "loss": 0.98866205, "memory(GiB)": 728.98, "step": 28940, "train_speed(iter/s)": 0.264102 }, { "acc": 0.7632916, "epoch": 0.7342731073584274, "grad_norm": 3.28125, "learning_rate": 7.493155418035366e-06, "loss": 0.91266546, "memory(GiB)": 728.98, "step": 28945, "train_speed(iter/s)": 0.263989 }, { "acc": 0.77499075, "epoch": 0.734399946727465, "grad_norm": 3.578125, "learning_rate": 7.492246376928657e-06, "loss": 0.8511178, "memory(GiB)": 728.98, "step": 28950, "train_speed(iter/s)": 0.263868 }, { "acc": 0.76831665, "epoch": 0.7345267860965026, "grad_norm": 3.34375, "learning_rate": 7.491337226195972e-06, "loss": 0.8974637, "memory(GiB)": 728.98, "step": 28955, "train_speed(iter/s)": 0.26376 }, { "acc": 0.76056528, "epoch": 0.7346536254655401, "grad_norm": 3.46875, "learning_rate": 7.490427965877298e-06, "loss": 0.91552515, "memory(GiB)": 728.98, "step": 28960, "train_speed(iter/s)": 0.263631 }, { "acc": 0.76686373, "epoch": 0.7347804648345777, "grad_norm": 2.671875, "learning_rate": 7.489518596012634e-06, "loss": 0.87912903, "memory(GiB)": 728.98, "step": 28965, "train_speed(iter/s)": 0.263526 }, { "acc": 0.76182065, "epoch": 0.7349073042036153, "grad_norm": 3.203125, "learning_rate": 7.488609116641979e-06, "loss": 0.87355757, "memory(GiB)": 728.98, "step": 28970, "train_speed(iter/s)": 0.263407 }, { "acc": 0.76334176, "epoch": 0.7350341435726528, "grad_norm": 3.90625, "learning_rate": 7.487699527805338e-06, "loss": 0.91609898, "memory(GiB)": 728.98, "step": 28975, "train_speed(iter/s)": 0.263292 }, { "acc": 0.78750186, "epoch": 0.7351609829416904, "grad_norm": 3.5, "learning_rate": 7.486789829542722e-06, "loss": 0.85513802, "memory(GiB)": 728.98, "step": 28980, "train_speed(iter/s)": 0.263169 }, { "acc": 0.77416797, "epoch": 0.7352878223107279, "grad_norm": 3.546875, "learning_rate": 7.485880021894143e-06, "loss": 0.87183132, "memory(GiB)": 728.98, "step": 28985, "train_speed(iter/s)": 0.263069 }, { "acc": 0.76467381, "epoch": 0.7354146616797654, "grad_norm": 4.0625, "learning_rate": 7.484970104899624e-06, "loss": 0.898419, "memory(GiB)": 728.98, "step": 28990, "train_speed(iter/s)": 0.262955 }, { "acc": 0.76523657, "epoch": 0.735541501048803, "grad_norm": 3.28125, "learning_rate": 7.484060078599187e-06, "loss": 0.9340106, "memory(GiB)": 728.98, "step": 28995, "train_speed(iter/s)": 0.262855 }, { "acc": 0.77584515, "epoch": 0.7356683404178406, "grad_norm": 3.453125, "learning_rate": 7.4831499430328615e-06, "loss": 0.83659716, "memory(GiB)": 728.98, "step": 29000, "train_speed(iter/s)": 0.262743 }, { "epoch": 0.7356683404178406, "eval_acc": 0.7555768802557739, "eval_loss": 0.8668288588523865, "eval_runtime": 1153.0629, "eval_samples_per_second": 5.524, "eval_steps_per_second": 5.524, "step": 29000 }, { "acc": 0.76465778, "epoch": 0.7357951797868781, "grad_norm": 3.828125, "learning_rate": 7.4822396982406835e-06, "loss": 0.89348459, "memory(GiB)": 728.98, "step": 29005, "train_speed(iter/s)": 0.258253 }, { "acc": 0.77269964, "epoch": 0.7359220191559157, "grad_norm": 2.6875, "learning_rate": 7.4813293442626865e-06, "loss": 0.80629787, "memory(GiB)": 728.98, "step": 29010, "train_speed(iter/s)": 0.258136 }, { "acc": 0.77466044, "epoch": 0.7360488585249533, "grad_norm": 3.78125, "learning_rate": 7.480418881138921e-06, "loss": 0.85839796, "memory(GiB)": 728.98, "step": 29015, "train_speed(iter/s)": 0.25804 }, { "acc": 0.76813812, "epoch": 0.7361756978939908, "grad_norm": 3.1875, "learning_rate": 7.4795083089094315e-06, "loss": 0.86998892, "memory(GiB)": 728.98, "step": 29020, "train_speed(iter/s)": 0.257938 }, { "acc": 0.75677619, "epoch": 0.7363025372630284, "grad_norm": 8.25, "learning_rate": 7.47859762761427e-06, "loss": 0.96251402, "memory(GiB)": 728.98, "step": 29025, "train_speed(iter/s)": 0.257822 }, { "acc": 0.75833435, "epoch": 0.736429376632066, "grad_norm": 4.0, "learning_rate": 7.477686837293498e-06, "loss": 0.90548658, "memory(GiB)": 728.98, "step": 29030, "train_speed(iter/s)": 0.257695 }, { "acc": 0.76908889, "epoch": 0.7365562160011035, "grad_norm": 3.484375, "learning_rate": 7.476775937987174e-06, "loss": 0.92701206, "memory(GiB)": 728.98, "step": 29035, "train_speed(iter/s)": 0.257586 }, { "acc": 0.77412515, "epoch": 0.7366830553701411, "grad_norm": 3.3125, "learning_rate": 7.47586492973537e-06, "loss": 0.85962553, "memory(GiB)": 728.98, "step": 29040, "train_speed(iter/s)": 0.257486 }, { "acc": 0.76426091, "epoch": 0.7368098947391786, "grad_norm": 4.0, "learning_rate": 7.474953812578156e-06, "loss": 0.87210093, "memory(GiB)": 728.98, "step": 29045, "train_speed(iter/s)": 0.257378 }, { "acc": 0.76063485, "epoch": 0.7369367341082161, "grad_norm": 3.171875, "learning_rate": 7.47404258655561e-06, "loss": 0.90110035, "memory(GiB)": 728.98, "step": 29050, "train_speed(iter/s)": 0.257266 }, { "acc": 0.75707474, "epoch": 0.7370635734772537, "grad_norm": 3.59375, "learning_rate": 7.473131251707812e-06, "loss": 0.93697834, "memory(GiB)": 728.98, "step": 29055, "train_speed(iter/s)": 0.257156 }, { "acc": 0.77765083, "epoch": 0.7371904128462913, "grad_norm": 3.90625, "learning_rate": 7.472219808074851e-06, "loss": 0.82786875, "memory(GiB)": 728.98, "step": 29060, "train_speed(iter/s)": 0.257061 }, { "acc": 0.77113843, "epoch": 0.7373172522153288, "grad_norm": 3.265625, "learning_rate": 7.4713082556968175e-06, "loss": 0.87062721, "memory(GiB)": 728.98, "step": 29065, "train_speed(iter/s)": 0.256954 }, { "acc": 0.77984161, "epoch": 0.7374440915843664, "grad_norm": 3.5, "learning_rate": 7.470396594613808e-06, "loss": 0.87707663, "memory(GiB)": 728.98, "step": 29070, "train_speed(iter/s)": 0.25685 }, { "acc": 0.7606111, "epoch": 0.737570930953404, "grad_norm": 3.484375, "learning_rate": 7.469484824865922e-06, "loss": 0.93666058, "memory(GiB)": 728.98, "step": 29075, "train_speed(iter/s)": 0.256743 }, { "acc": 0.76624632, "epoch": 0.7376977703224415, "grad_norm": 3.8125, "learning_rate": 7.468572946493269e-06, "loss": 0.89266338, "memory(GiB)": 728.98, "step": 29080, "train_speed(iter/s)": 0.25664 }, { "acc": 0.77261186, "epoch": 0.7378246096914791, "grad_norm": 3.125, "learning_rate": 7.467660959535958e-06, "loss": 0.91315479, "memory(GiB)": 728.98, "step": 29085, "train_speed(iter/s)": 0.256535 }, { "acc": 0.78449483, "epoch": 0.7379514490605167, "grad_norm": 6.125, "learning_rate": 7.466748864034101e-06, "loss": 0.82642822, "memory(GiB)": 728.98, "step": 29090, "train_speed(iter/s)": 0.256422 }, { "acc": 0.75396299, "epoch": 0.7380782884295543, "grad_norm": 3.734375, "learning_rate": 7.4658366600278235e-06, "loss": 0.94868603, "memory(GiB)": 728.98, "step": 29095, "train_speed(iter/s)": 0.256305 }, { "acc": 0.76314888, "epoch": 0.7382051277985918, "grad_norm": 3.546875, "learning_rate": 7.464924347557247e-06, "loss": 0.9156621, "memory(GiB)": 728.98, "step": 29100, "train_speed(iter/s)": 0.2562 }, { "acc": 0.77068362, "epoch": 0.7383319671676293, "grad_norm": 3.53125, "learning_rate": 7.464011926662502e-06, "loss": 0.8656168, "memory(GiB)": 728.98, "step": 29105, "train_speed(iter/s)": 0.256088 }, { "acc": 0.76848001, "epoch": 0.7384588065366668, "grad_norm": 3.15625, "learning_rate": 7.463099397383723e-06, "loss": 0.9013299, "memory(GiB)": 728.98, "step": 29110, "train_speed(iter/s)": 0.255982 }, { "acc": 0.76429324, "epoch": 0.7385856459057044, "grad_norm": 3.140625, "learning_rate": 7.46218675976105e-06, "loss": 0.92263832, "memory(GiB)": 728.98, "step": 29115, "train_speed(iter/s)": 0.255876 }, { "acc": 0.76506901, "epoch": 0.738712485274742, "grad_norm": 4.09375, "learning_rate": 7.461274013834625e-06, "loss": 0.92779913, "memory(GiB)": 728.98, "step": 29120, "train_speed(iter/s)": 0.255775 }, { "acc": 0.76718006, "epoch": 0.7388393246437795, "grad_norm": 2.921875, "learning_rate": 7.4603611596446005e-06, "loss": 0.87790794, "memory(GiB)": 728.98, "step": 29125, "train_speed(iter/s)": 0.255669 }, { "acc": 0.76127405, "epoch": 0.7389661640128171, "grad_norm": 3.3125, "learning_rate": 7.459448197231127e-06, "loss": 0.86100988, "memory(GiB)": 728.98, "step": 29130, "train_speed(iter/s)": 0.255569 }, { "acc": 0.75733504, "epoch": 0.7390930033818547, "grad_norm": 3.109375, "learning_rate": 7.458535126634363e-06, "loss": 0.88213606, "memory(GiB)": 728.98, "step": 29135, "train_speed(iter/s)": 0.255471 }, { "acc": 0.76356282, "epoch": 0.7392198427508923, "grad_norm": 4.21875, "learning_rate": 7.457621947894472e-06, "loss": 0.90501804, "memory(GiB)": 728.98, "step": 29140, "train_speed(iter/s)": 0.255358 }, { "acc": 0.75292692, "epoch": 0.7393466821199298, "grad_norm": 3.703125, "learning_rate": 7.4567086610516225e-06, "loss": 0.93929596, "memory(GiB)": 728.98, "step": 29145, "train_speed(iter/s)": 0.255247 }, { "acc": 0.76482887, "epoch": 0.7394735214889674, "grad_norm": 3.96875, "learning_rate": 7.455795266145986e-06, "loss": 0.8708744, "memory(GiB)": 728.98, "step": 29150, "train_speed(iter/s)": 0.255139 }, { "acc": 0.75998397, "epoch": 0.739600360858005, "grad_norm": 3.53125, "learning_rate": 7.45488176321774e-06, "loss": 0.92076731, "memory(GiB)": 728.98, "step": 29155, "train_speed(iter/s)": 0.255017 }, { "acc": 0.76240773, "epoch": 0.7397272002270425, "grad_norm": 4.25, "learning_rate": 7.4539681523070696e-06, "loss": 0.87392521, "memory(GiB)": 728.98, "step": 29160, "train_speed(iter/s)": 0.254898 }, { "acc": 0.75501752, "epoch": 0.73985403959608, "grad_norm": 3.484375, "learning_rate": 7.453054433454156e-06, "loss": 0.90912981, "memory(GiB)": 728.98, "step": 29165, "train_speed(iter/s)": 0.254792 }, { "acc": 0.75346303, "epoch": 0.7399808789651176, "grad_norm": 4.375, "learning_rate": 7.452140606699196e-06, "loss": 0.92094059, "memory(GiB)": 728.98, "step": 29170, "train_speed(iter/s)": 0.254681 }, { "acc": 0.76615658, "epoch": 0.7401077183341551, "grad_norm": 3.171875, "learning_rate": 7.451226672082383e-06, "loss": 0.93180847, "memory(GiB)": 728.98, "step": 29175, "train_speed(iter/s)": 0.254578 }, { "acc": 0.75956793, "epoch": 0.7402345577031927, "grad_norm": 3.40625, "learning_rate": 7.450312629643917e-06, "loss": 0.88643112, "memory(GiB)": 728.98, "step": 29180, "train_speed(iter/s)": 0.254474 }, { "acc": 0.77048054, "epoch": 0.7403613970722303, "grad_norm": 3.5, "learning_rate": 7.449398479424007e-06, "loss": 0.87352505, "memory(GiB)": 728.98, "step": 29185, "train_speed(iter/s)": 0.254362 }, { "acc": 0.77564673, "epoch": 0.7404882364412678, "grad_norm": 3.859375, "learning_rate": 7.448484221462861e-06, "loss": 0.85561504, "memory(GiB)": 728.98, "step": 29190, "train_speed(iter/s)": 0.254258 }, { "acc": 0.77147989, "epoch": 0.7406150758103054, "grad_norm": 3.46875, "learning_rate": 7.447569855800698e-06, "loss": 0.88789558, "memory(GiB)": 728.98, "step": 29195, "train_speed(iter/s)": 0.254146 }, { "acc": 0.76614122, "epoch": 0.740741915179343, "grad_norm": 4.28125, "learning_rate": 7.446655382477733e-06, "loss": 0.89960833, "memory(GiB)": 728.98, "step": 29200, "train_speed(iter/s)": 0.254037 }, { "acc": 0.77307878, "epoch": 0.7408687545483805, "grad_norm": 3.25, "learning_rate": 7.4457408015341955e-06, "loss": 0.86212063, "memory(GiB)": 728.98, "step": 29205, "train_speed(iter/s)": 0.253942 }, { "acc": 0.75689073, "epoch": 0.7409955939174181, "grad_norm": 4.0, "learning_rate": 7.444826113010311e-06, "loss": 0.87400255, "memory(GiB)": 728.98, "step": 29210, "train_speed(iter/s)": 0.253836 }, { "acc": 0.76474667, "epoch": 0.7411224332864557, "grad_norm": 3.640625, "learning_rate": 7.443911316946316e-06, "loss": 0.8771739, "memory(GiB)": 728.98, "step": 29215, "train_speed(iter/s)": 0.253723 }, { "acc": 0.7747725, "epoch": 0.7412492726554932, "grad_norm": 4.03125, "learning_rate": 7.442996413382451e-06, "loss": 0.88592548, "memory(GiB)": 728.98, "step": 29220, "train_speed(iter/s)": 0.253605 }, { "acc": 0.77197232, "epoch": 0.7413761120245307, "grad_norm": 3.515625, "learning_rate": 7.4420814023589535e-06, "loss": 0.90024195, "memory(GiB)": 728.98, "step": 29225, "train_speed(iter/s)": 0.253497 }, { "acc": 0.76034555, "epoch": 0.7415029513935683, "grad_norm": 3.4375, "learning_rate": 7.4411662839160795e-06, "loss": 0.92312136, "memory(GiB)": 728.98, "step": 29230, "train_speed(iter/s)": 0.253395 }, { "acc": 0.77806115, "epoch": 0.7416297907626058, "grad_norm": 3.609375, "learning_rate": 7.440251058094076e-06, "loss": 0.85248899, "memory(GiB)": 728.98, "step": 29235, "train_speed(iter/s)": 0.253297 }, { "acc": 0.77084384, "epoch": 0.7417566301316434, "grad_norm": 3.5625, "learning_rate": 7.4393357249332055e-06, "loss": 0.83380365, "memory(GiB)": 728.98, "step": 29240, "train_speed(iter/s)": 0.253198 }, { "acc": 0.7692163, "epoch": 0.741883469500681, "grad_norm": 5.03125, "learning_rate": 7.438420284473729e-06, "loss": 0.82245817, "memory(GiB)": 728.98, "step": 29245, "train_speed(iter/s)": 0.253087 }, { "acc": 0.75562367, "epoch": 0.7420103088697185, "grad_norm": 45.5, "learning_rate": 7.4375047367559114e-06, "loss": 0.8805337, "memory(GiB)": 728.98, "step": 29250, "train_speed(iter/s)": 0.252989 }, { "acc": 0.76153831, "epoch": 0.7421371482387561, "grad_norm": 3.140625, "learning_rate": 7.436589081820029e-06, "loss": 0.87150221, "memory(GiB)": 728.98, "step": 29255, "train_speed(iter/s)": 0.252879 }, { "acc": 0.76040072, "epoch": 0.7422639876077937, "grad_norm": 3.65625, "learning_rate": 7.4356733197063544e-06, "loss": 0.90253735, "memory(GiB)": 728.98, "step": 29260, "train_speed(iter/s)": 0.252788 }, { "acc": 0.76688175, "epoch": 0.7423908269768312, "grad_norm": 3.546875, "learning_rate": 7.4347574504551735e-06, "loss": 0.90435934, "memory(GiB)": 728.98, "step": 29265, "train_speed(iter/s)": 0.252684 }, { "acc": 0.76692944, "epoch": 0.7425176663458688, "grad_norm": 3.4375, "learning_rate": 7.433841474106768e-06, "loss": 0.88342257, "memory(GiB)": 728.98, "step": 29270, "train_speed(iter/s)": 0.252586 }, { "acc": 0.76347604, "epoch": 0.7426445057149064, "grad_norm": 4.21875, "learning_rate": 7.432925390701431e-06, "loss": 0.88153191, "memory(GiB)": 728.98, "step": 29275, "train_speed(iter/s)": 0.252479 }, { "acc": 0.75311303, "epoch": 0.7427713450839439, "grad_norm": 3.75, "learning_rate": 7.432009200279458e-06, "loss": 0.87375431, "memory(GiB)": 728.98, "step": 29280, "train_speed(iter/s)": 0.25237 }, { "acc": 0.76833196, "epoch": 0.7428981844529814, "grad_norm": 3.5, "learning_rate": 7.4310929028811495e-06, "loss": 0.88400497, "memory(GiB)": 728.98, "step": 29285, "train_speed(iter/s)": 0.252265 }, { "acc": 0.76784887, "epoch": 0.743025023822019, "grad_norm": 3.59375, "learning_rate": 7.430176498546809e-06, "loss": 0.87166491, "memory(GiB)": 728.98, "step": 29290, "train_speed(iter/s)": 0.252154 }, { "acc": 0.77170777, "epoch": 0.7431518631910565, "grad_norm": 3.015625, "learning_rate": 7.429259987316748e-06, "loss": 0.87372284, "memory(GiB)": 728.98, "step": 29295, "train_speed(iter/s)": 0.252049 }, { "acc": 0.76859589, "epoch": 0.7432787025600941, "grad_norm": 3.625, "learning_rate": 7.428343369231281e-06, "loss": 0.85699453, "memory(GiB)": 728.98, "step": 29300, "train_speed(iter/s)": 0.251946 }, { "acc": 0.77853222, "epoch": 0.7434055419291317, "grad_norm": 3.46875, "learning_rate": 7.427426644330726e-06, "loss": 0.85490198, "memory(GiB)": 728.98, "step": 29305, "train_speed(iter/s)": 0.251849 }, { "acc": 0.76927309, "epoch": 0.7435323812981692, "grad_norm": 4.21875, "learning_rate": 7.4265098126554065e-06, "loss": 0.91698418, "memory(GiB)": 728.98, "step": 29310, "train_speed(iter/s)": 0.251738 }, { "acc": 0.7731276, "epoch": 0.7436592206672068, "grad_norm": 3.0625, "learning_rate": 7.425592874245652e-06, "loss": 0.84619808, "memory(GiB)": 728.98, "step": 29315, "train_speed(iter/s)": 0.251627 }, { "acc": 0.75334177, "epoch": 0.7437860600362444, "grad_norm": 3.625, "learning_rate": 7.4246758291417956e-06, "loss": 0.97192497, "memory(GiB)": 728.98, "step": 29320, "train_speed(iter/s)": 0.251535 }, { "acc": 0.77266912, "epoch": 0.7439128994052819, "grad_norm": 3.5, "learning_rate": 7.423758677384174e-06, "loss": 0.80904608, "memory(GiB)": 728.98, "step": 29325, "train_speed(iter/s)": 0.251429 }, { "acc": 0.77409673, "epoch": 0.7440397387743195, "grad_norm": 3.53125, "learning_rate": 7.4228414190131315e-06, "loss": 0.8813035, "memory(GiB)": 728.98, "step": 29330, "train_speed(iter/s)": 0.251326 }, { "acc": 0.76541519, "epoch": 0.7441665781433571, "grad_norm": 3.828125, "learning_rate": 7.421924054069014e-06, "loss": 0.89125633, "memory(GiB)": 728.98, "step": 29335, "train_speed(iter/s)": 0.251216 }, { "acc": 0.77394681, "epoch": 0.7442934175123946, "grad_norm": 3.015625, "learning_rate": 7.421006582592175e-06, "loss": 0.8240695, "memory(GiB)": 728.98, "step": 29340, "train_speed(iter/s)": 0.25111 }, { "acc": 0.76872249, "epoch": 0.7444202568814321, "grad_norm": 3.40625, "learning_rate": 7.42008900462297e-06, "loss": 0.88353195, "memory(GiB)": 728.98, "step": 29345, "train_speed(iter/s)": 0.251009 }, { "acc": 0.78429871, "epoch": 0.7445470962504697, "grad_norm": 2.921875, "learning_rate": 7.41917132020176e-06, "loss": 0.79689541, "memory(GiB)": 728.98, "step": 29350, "train_speed(iter/s)": 0.250909 }, { "acc": 0.77276611, "epoch": 0.7446739356195072, "grad_norm": 4.03125, "learning_rate": 7.4182535293689126e-06, "loss": 0.89810858, "memory(GiB)": 728.98, "step": 29355, "train_speed(iter/s)": 0.25081 }, { "acc": 0.75481153, "epoch": 0.7448007749885448, "grad_norm": 3.34375, "learning_rate": 7.417335632164797e-06, "loss": 1.00260143, "memory(GiB)": 728.98, "step": 29360, "train_speed(iter/s)": 0.25069 }, { "acc": 0.76445737, "epoch": 0.7449276143575824, "grad_norm": 3.765625, "learning_rate": 7.416417628629789e-06, "loss": 0.89557638, "memory(GiB)": 728.98, "step": 29365, "train_speed(iter/s)": 0.250588 }, { "acc": 0.77718744, "epoch": 0.7450544537266199, "grad_norm": 3.390625, "learning_rate": 7.415499518804268e-06, "loss": 0.85820017, "memory(GiB)": 728.98, "step": 29370, "train_speed(iter/s)": 0.250476 }, { "acc": 0.76220584, "epoch": 0.7451812930956575, "grad_norm": 3.1875, "learning_rate": 7.4145813027286206e-06, "loss": 0.91503124, "memory(GiB)": 728.98, "step": 29375, "train_speed(iter/s)": 0.250368 }, { "acc": 0.76752472, "epoch": 0.7453081324646951, "grad_norm": 3.75, "learning_rate": 7.413662980443234e-06, "loss": 0.91223278, "memory(GiB)": 728.98, "step": 29380, "train_speed(iter/s)": 0.250269 }, { "acc": 0.76619239, "epoch": 0.7454349718337326, "grad_norm": 3.109375, "learning_rate": 7.412744551988504e-06, "loss": 0.85383244, "memory(GiB)": 728.98, "step": 29385, "train_speed(iter/s)": 0.250164 }, { "acc": 0.76426148, "epoch": 0.7455618112027702, "grad_norm": 3.5625, "learning_rate": 7.41182601740483e-06, "loss": 0.89637852, "memory(GiB)": 728.98, "step": 29390, "train_speed(iter/s)": 0.250057 }, { "acc": 0.77220445, "epoch": 0.7456886505718078, "grad_norm": 3.8125, "learning_rate": 7.410907376732611e-06, "loss": 0.91594095, "memory(GiB)": 728.98, "step": 29395, "train_speed(iter/s)": 0.249961 }, { "acc": 0.76901965, "epoch": 0.7458154899408453, "grad_norm": 3.546875, "learning_rate": 7.409988630012261e-06, "loss": 0.89153891, "memory(GiB)": 728.98, "step": 29400, "train_speed(iter/s)": 0.249859 }, { "acc": 0.76563759, "epoch": 0.7459423293098828, "grad_norm": 3.03125, "learning_rate": 7.409069777284188e-06, "loss": 0.90007095, "memory(GiB)": 728.98, "step": 29405, "train_speed(iter/s)": 0.249767 }, { "acc": 0.77554288, "epoch": 0.7460691686789204, "grad_norm": 3.34375, "learning_rate": 7.408150818588812e-06, "loss": 0.87230263, "memory(GiB)": 728.98, "step": 29410, "train_speed(iter/s)": 0.249667 }, { "acc": 0.76794295, "epoch": 0.7461960080479579, "grad_norm": 3.3125, "learning_rate": 7.407231753966553e-06, "loss": 0.8821907, "memory(GiB)": 728.98, "step": 29415, "train_speed(iter/s)": 0.249571 }, { "acc": 0.7667058, "epoch": 0.7463228474169955, "grad_norm": 3.03125, "learning_rate": 7.406312583457841e-06, "loss": 0.89644375, "memory(GiB)": 728.98, "step": 29420, "train_speed(iter/s)": 0.249455 }, { "acc": 0.76453609, "epoch": 0.7464496867860331, "grad_norm": 3.46875, "learning_rate": 7.405393307103105e-06, "loss": 0.93301353, "memory(GiB)": 728.98, "step": 29425, "train_speed(iter/s)": 0.249341 }, { "acc": 0.77486296, "epoch": 0.7465765261550706, "grad_norm": 3.296875, "learning_rate": 7.404473924942781e-06, "loss": 0.85161343, "memory(GiB)": 728.98, "step": 29430, "train_speed(iter/s)": 0.249255 }, { "acc": 0.7583324, "epoch": 0.7467033655241082, "grad_norm": 3.015625, "learning_rate": 7.40355443701731e-06, "loss": 0.93834972, "memory(GiB)": 728.98, "step": 29435, "train_speed(iter/s)": 0.249148 }, { "acc": 0.75072713, "epoch": 0.7468302048931458, "grad_norm": 3.71875, "learning_rate": 7.402634843367137e-06, "loss": 0.95895243, "memory(GiB)": 728.98, "step": 29440, "train_speed(iter/s)": 0.249046 }, { "acc": 0.78483152, "epoch": 0.7469570442621833, "grad_norm": 3.4375, "learning_rate": 7.401715144032714e-06, "loss": 0.88396168, "memory(GiB)": 728.98, "step": 29445, "train_speed(iter/s)": 0.248948 }, { "acc": 0.77337623, "epoch": 0.7470838836312209, "grad_norm": 3.75, "learning_rate": 7.400795339054493e-06, "loss": 0.90558929, "memory(GiB)": 728.98, "step": 29450, "train_speed(iter/s)": 0.248852 }, { "acc": 0.77142196, "epoch": 0.7472107230002585, "grad_norm": 3.5, "learning_rate": 7.399875428472934e-06, "loss": 0.85536175, "memory(GiB)": 728.98, "step": 29455, "train_speed(iter/s)": 0.248758 }, { "acc": 0.75549946, "epoch": 0.747337562369296, "grad_norm": 4.21875, "learning_rate": 7.3989554123285025e-06, "loss": 0.90149355, "memory(GiB)": 728.98, "step": 29460, "train_speed(iter/s)": 0.248654 }, { "acc": 0.75948014, "epoch": 0.7474644017383335, "grad_norm": 5.15625, "learning_rate": 7.398035290661666e-06, "loss": 0.90906992, "memory(GiB)": 728.98, "step": 29465, "train_speed(iter/s)": 0.248538 }, { "acc": 0.77590513, "epoch": 0.7475912411073711, "grad_norm": 3.609375, "learning_rate": 7.397115063512896e-06, "loss": 0.87318153, "memory(GiB)": 728.98, "step": 29470, "train_speed(iter/s)": 0.24843 }, { "acc": 0.77358298, "epoch": 0.7477180804764086, "grad_norm": 3.453125, "learning_rate": 7.396194730922674e-06, "loss": 0.88678627, "memory(GiB)": 728.98, "step": 29475, "train_speed(iter/s)": 0.248337 }, { "acc": 0.75669637, "epoch": 0.7478449198454462, "grad_norm": 3.59375, "learning_rate": 7.39527429293148e-06, "loss": 0.90968409, "memory(GiB)": 728.98, "step": 29480, "train_speed(iter/s)": 0.248215 }, { "acc": 0.76777592, "epoch": 0.7479717592144838, "grad_norm": 3.953125, "learning_rate": 7.394353749579801e-06, "loss": 0.88823586, "memory(GiB)": 728.98, "step": 29485, "train_speed(iter/s)": 0.248111 }, { "acc": 0.75320048, "epoch": 0.7480985985835213, "grad_norm": 3.296875, "learning_rate": 7.39343310090813e-06, "loss": 0.92756433, "memory(GiB)": 728.98, "step": 29490, "train_speed(iter/s)": 0.248004 }, { "acc": 0.75471697, "epoch": 0.7482254379525589, "grad_norm": 3.5, "learning_rate": 7.392512346956964e-06, "loss": 0.91068392, "memory(GiB)": 728.98, "step": 29495, "train_speed(iter/s)": 0.247911 }, { "acc": 0.78409462, "epoch": 0.7483522773215965, "grad_norm": 3.375, "learning_rate": 7.3915914877668e-06, "loss": 0.86711006, "memory(GiB)": 728.98, "step": 29500, "train_speed(iter/s)": 0.247814 }, { "epoch": 0.7483522773215965, "eval_acc": 0.7556412164214303, "eval_loss": 0.8666667342185974, "eval_runtime": 1151.0587, "eval_samples_per_second": 5.534, "eval_steps_per_second": 5.534, "step": 29500 }, { "acc": 0.77175517, "epoch": 0.748479116690634, "grad_norm": 3.5, "learning_rate": 7.3906705233781495e-06, "loss": 0.87173557, "memory(GiB)": 728.98, "step": 29505, "train_speed(iter/s)": 0.243885 }, { "acc": 0.75955367, "epoch": 0.7486059560596716, "grad_norm": 3.671875, "learning_rate": 7.389749453831517e-06, "loss": 0.93293171, "memory(GiB)": 728.98, "step": 29510, "train_speed(iter/s)": 0.243791 }, { "acc": 0.75921917, "epoch": 0.7487327954287092, "grad_norm": 3.125, "learning_rate": 7.388828279167423e-06, "loss": 0.90412226, "memory(GiB)": 728.98, "step": 29515, "train_speed(iter/s)": 0.243692 }, { "acc": 0.77178478, "epoch": 0.7488596347977468, "grad_norm": 3.53125, "learning_rate": 7.387906999426382e-06, "loss": 0.87102814, "memory(GiB)": 728.98, "step": 29520, "train_speed(iter/s)": 0.243603 }, { "acc": 0.75446863, "epoch": 0.7489864741667842, "grad_norm": 3.71875, "learning_rate": 7.3869856146489215e-06, "loss": 0.96316271, "memory(GiB)": 728.98, "step": 29525, "train_speed(iter/s)": 0.243515 }, { "acc": 0.78589816, "epoch": 0.7491133135358218, "grad_norm": 3.625, "learning_rate": 7.38606412487557e-06, "loss": 0.82290535, "memory(GiB)": 728.98, "step": 29530, "train_speed(iter/s)": 0.243426 }, { "acc": 0.76275344, "epoch": 0.7492401529048593, "grad_norm": 3.171875, "learning_rate": 7.385142530146858e-06, "loss": 0.86186895, "memory(GiB)": 728.98, "step": 29535, "train_speed(iter/s)": 0.243328 }, { "acc": 0.77015915, "epoch": 0.7493669922738969, "grad_norm": 3.484375, "learning_rate": 7.384220830503327e-06, "loss": 0.89862528, "memory(GiB)": 728.98, "step": 29540, "train_speed(iter/s)": 0.243224 }, { "acc": 0.76449943, "epoch": 0.7494938316429345, "grad_norm": 3.84375, "learning_rate": 7.383299025985519e-06, "loss": 0.86406441, "memory(GiB)": 728.98, "step": 29545, "train_speed(iter/s)": 0.243128 }, { "acc": 0.77650747, "epoch": 0.749620671011972, "grad_norm": 3.015625, "learning_rate": 7.38237711663398e-06, "loss": 0.83899937, "memory(GiB)": 728.98, "step": 29550, "train_speed(iter/s)": 0.243028 }, { "acc": 0.76200304, "epoch": 0.7497475103810096, "grad_norm": 3.421875, "learning_rate": 7.381455102489261e-06, "loss": 0.88603973, "memory(GiB)": 728.98, "step": 29555, "train_speed(iter/s)": 0.24295 }, { "acc": 0.76722937, "epoch": 0.7498743497500472, "grad_norm": 4.0, "learning_rate": 7.380532983591922e-06, "loss": 0.90001421, "memory(GiB)": 728.98, "step": 29560, "train_speed(iter/s)": 0.242845 }, { "acc": 0.76545768, "epoch": 0.7500011891190848, "grad_norm": 3.171875, "learning_rate": 7.3796107599825195e-06, "loss": 0.88763046, "memory(GiB)": 728.98, "step": 29565, "train_speed(iter/s)": 0.242749 }, { "acc": 0.7471405, "epoch": 0.7501280284881223, "grad_norm": 2.84375, "learning_rate": 7.378688431701624e-06, "loss": 0.9277729, "memory(GiB)": 728.98, "step": 29570, "train_speed(iter/s)": 0.242646 }, { "acc": 0.7675374, "epoch": 0.7502548678571599, "grad_norm": 3.828125, "learning_rate": 7.377765998789801e-06, "loss": 0.89718618, "memory(GiB)": 728.98, "step": 29575, "train_speed(iter/s)": 0.242544 }, { "acc": 0.76668749, "epoch": 0.7503817072261975, "grad_norm": 2.9375, "learning_rate": 7.37684346128763e-06, "loss": 0.88738966, "memory(GiB)": 728.98, "step": 29580, "train_speed(iter/s)": 0.24245 }, { "acc": 0.77318501, "epoch": 0.7505085465952349, "grad_norm": 3.484375, "learning_rate": 7.3759208192356854e-06, "loss": 0.86477213, "memory(GiB)": 728.98, "step": 29585, "train_speed(iter/s)": 0.242363 }, { "acc": 0.77091413, "epoch": 0.7506353859642725, "grad_norm": 3.6875, "learning_rate": 7.374998072674556e-06, "loss": 0.91380625, "memory(GiB)": 728.98, "step": 29590, "train_speed(iter/s)": 0.24228 }, { "acc": 0.76572595, "epoch": 0.75076222533331, "grad_norm": 3.15625, "learning_rate": 7.374075221644829e-06, "loss": 0.91745577, "memory(GiB)": 728.98, "step": 29595, "train_speed(iter/s)": 0.242196 }, { "acc": 0.75923557, "epoch": 0.7508890647023476, "grad_norm": 3.8125, "learning_rate": 7.3731522661870954e-06, "loss": 0.92258015, "memory(GiB)": 728.98, "step": 29600, "train_speed(iter/s)": 0.24211 }, { "acc": 0.76081734, "epoch": 0.7510159040713852, "grad_norm": 4.09375, "learning_rate": 7.372229206341958e-06, "loss": 0.93430586, "memory(GiB)": 728.98, "step": 29605, "train_speed(iter/s)": 0.242015 }, { "acc": 0.76550546, "epoch": 0.7511427434404228, "grad_norm": 3.828125, "learning_rate": 7.371306042150013e-06, "loss": 0.90250368, "memory(GiB)": 728.98, "step": 29610, "train_speed(iter/s)": 0.241919 }, { "acc": 0.77651281, "epoch": 0.7512695828094603, "grad_norm": 3.515625, "learning_rate": 7.370382773651872e-06, "loss": 0.89121466, "memory(GiB)": 728.98, "step": 29615, "train_speed(iter/s)": 0.241838 }, { "acc": 0.76066513, "epoch": 0.7513964221784979, "grad_norm": 3.640625, "learning_rate": 7.369459400888146e-06, "loss": 0.94125967, "memory(GiB)": 728.98, "step": 29620, "train_speed(iter/s)": 0.241759 }, { "acc": 0.75686598, "epoch": 0.7515232615475355, "grad_norm": 3.828125, "learning_rate": 7.368535923899449e-06, "loss": 0.96982012, "memory(GiB)": 728.98, "step": 29625, "train_speed(iter/s)": 0.241668 }, { "acc": 0.75760088, "epoch": 0.751650100916573, "grad_norm": 3.78125, "learning_rate": 7.367612342726405e-06, "loss": 0.92205181, "memory(GiB)": 728.98, "step": 29630, "train_speed(iter/s)": 0.241587 }, { "acc": 0.7641592, "epoch": 0.7517769402856106, "grad_norm": 2.65625, "learning_rate": 7.366688657409635e-06, "loss": 0.87043495, "memory(GiB)": 728.98, "step": 29635, "train_speed(iter/s)": 0.24149 }, { "acc": 0.76711674, "epoch": 0.7519037796546482, "grad_norm": 3.328125, "learning_rate": 7.365764867989775e-06, "loss": 0.8693779, "memory(GiB)": 728.98, "step": 29640, "train_speed(iter/s)": 0.241399 }, { "acc": 0.75115404, "epoch": 0.7520306190236856, "grad_norm": 3.90625, "learning_rate": 7.364840974507455e-06, "loss": 0.94335327, "memory(GiB)": 728.98, "step": 29645, "train_speed(iter/s)": 0.241297 }, { "acc": 0.76182113, "epoch": 0.7521574583927232, "grad_norm": 3.546875, "learning_rate": 7.3639169770033144e-06, "loss": 0.9260006, "memory(GiB)": 728.98, "step": 29650, "train_speed(iter/s)": 0.241205 }, { "acc": 0.75428796, "epoch": 0.7522842977617608, "grad_norm": 3.25, "learning_rate": 7.362992875517998e-06, "loss": 0.90145559, "memory(GiB)": 728.98, "step": 29655, "train_speed(iter/s)": 0.241107 }, { "acc": 0.75758505, "epoch": 0.7524111371307983, "grad_norm": 3.203125, "learning_rate": 7.362068670092155e-06, "loss": 0.92135124, "memory(GiB)": 728.98, "step": 29660, "train_speed(iter/s)": 0.24101 }, { "acc": 0.78763232, "epoch": 0.7525379764998359, "grad_norm": 4.1875, "learning_rate": 7.361144360766438e-06, "loss": 0.83371601, "memory(GiB)": 728.98, "step": 29665, "train_speed(iter/s)": 0.240929 }, { "acc": 0.7529048, "epoch": 0.7526648158688735, "grad_norm": 3.140625, "learning_rate": 7.3602199475815016e-06, "loss": 0.92608938, "memory(GiB)": 728.98, "step": 29670, "train_speed(iter/s)": 0.240821 }, { "acc": 0.76466155, "epoch": 0.752791655237911, "grad_norm": 4.4375, "learning_rate": 7.359295430578011e-06, "loss": 0.8836503, "memory(GiB)": 728.98, "step": 29675, "train_speed(iter/s)": 0.240732 }, { "acc": 0.76727424, "epoch": 0.7529184946069486, "grad_norm": 3.515625, "learning_rate": 7.358370809796631e-06, "loss": 0.89489374, "memory(GiB)": 728.98, "step": 29680, "train_speed(iter/s)": 0.240647 }, { "acc": 0.77007251, "epoch": 0.7530453339759862, "grad_norm": 3.203125, "learning_rate": 7.357446085278034e-06, "loss": 0.86573477, "memory(GiB)": 728.98, "step": 29685, "train_speed(iter/s)": 0.240554 }, { "acc": 0.76295829, "epoch": 0.7531721733450237, "grad_norm": 3.9375, "learning_rate": 7.356521257062894e-06, "loss": 0.89899693, "memory(GiB)": 728.98, "step": 29690, "train_speed(iter/s)": 0.240465 }, { "acc": 0.75332866, "epoch": 0.7532990127140613, "grad_norm": 4.125, "learning_rate": 7.355596325191892e-06, "loss": 0.92635889, "memory(GiB)": 728.98, "step": 29695, "train_speed(iter/s)": 0.240366 }, { "acc": 0.7835959, "epoch": 0.7534258520830989, "grad_norm": 3.265625, "learning_rate": 7.354671289705715e-06, "loss": 0.84832659, "memory(GiB)": 728.98, "step": 29700, "train_speed(iter/s)": 0.240283 }, { "acc": 0.76890106, "epoch": 0.7535526914521363, "grad_norm": 3.625, "learning_rate": 7.353746150645049e-06, "loss": 0.87418346, "memory(GiB)": 728.98, "step": 29705, "train_speed(iter/s)": 0.240197 }, { "acc": 0.766888, "epoch": 0.7536795308211739, "grad_norm": 3.71875, "learning_rate": 7.352820908050588e-06, "loss": 0.95821772, "memory(GiB)": 728.98, "step": 29710, "train_speed(iter/s)": 0.240094 }, { "acc": 0.76172123, "epoch": 0.7538063701902115, "grad_norm": 3.859375, "learning_rate": 7.351895561963035e-06, "loss": 0.91066828, "memory(GiB)": 728.98, "step": 29715, "train_speed(iter/s)": 0.240011 }, { "acc": 0.77880001, "epoch": 0.753933209559249, "grad_norm": 2.90625, "learning_rate": 7.350970112423088e-06, "loss": 0.82776775, "memory(GiB)": 728.98, "step": 29720, "train_speed(iter/s)": 0.239909 }, { "acc": 0.77035012, "epoch": 0.7540600489282866, "grad_norm": 3.078125, "learning_rate": 7.350044559471456e-06, "loss": 0.84175949, "memory(GiB)": 728.98, "step": 29725, "train_speed(iter/s)": 0.239814 }, { "acc": 0.76393795, "epoch": 0.7541868882973242, "grad_norm": 3.375, "learning_rate": 7.349118903148853e-06, "loss": 0.89865341, "memory(GiB)": 728.98, "step": 29730, "train_speed(iter/s)": 0.239727 }, { "acc": 0.77003284, "epoch": 0.7543137276663617, "grad_norm": 3.90625, "learning_rate": 7.348193143495993e-06, "loss": 0.89619036, "memory(GiB)": 728.98, "step": 29735, "train_speed(iter/s)": 0.239626 }, { "acc": 0.7714498, "epoch": 0.7544405670353993, "grad_norm": 3.25, "learning_rate": 7.3472672805535985e-06, "loss": 0.8844738, "memory(GiB)": 728.98, "step": 29740, "train_speed(iter/s)": 0.239537 }, { "acc": 0.76954079, "epoch": 0.7545674064044369, "grad_norm": 3.53125, "learning_rate": 7.3463413143623954e-06, "loss": 0.85154219, "memory(GiB)": 728.98, "step": 29745, "train_speed(iter/s)": 0.239455 }, { "acc": 0.75337658, "epoch": 0.7546942457734744, "grad_norm": 3.53125, "learning_rate": 7.345415244963114e-06, "loss": 0.8995388, "memory(GiB)": 728.98, "step": 29750, "train_speed(iter/s)": 0.239364 }, { "acc": 0.75796943, "epoch": 0.754821085142512, "grad_norm": 3.3125, "learning_rate": 7.3444890723964876e-06, "loss": 0.89418154, "memory(GiB)": 728.98, "step": 29755, "train_speed(iter/s)": 0.239276 }, { "acc": 0.77522049, "epoch": 0.7549479245115495, "grad_norm": 3.421875, "learning_rate": 7.343562796703259e-06, "loss": 0.88792133, "memory(GiB)": 728.98, "step": 29760, "train_speed(iter/s)": 0.239197 }, { "acc": 0.76850157, "epoch": 0.755074763880587, "grad_norm": 3.328125, "learning_rate": 7.342636417924168e-06, "loss": 0.87223616, "memory(GiB)": 728.98, "step": 29765, "train_speed(iter/s)": 0.239112 }, { "acc": 0.76903572, "epoch": 0.7552016032496246, "grad_norm": 3.1875, "learning_rate": 7.341709936099967e-06, "loss": 0.88431549, "memory(GiB)": 728.98, "step": 29770, "train_speed(iter/s)": 0.239021 }, { "acc": 0.76674333, "epoch": 0.7553284426186622, "grad_norm": 5.25, "learning_rate": 7.340783351271406e-06, "loss": 0.90881262, "memory(GiB)": 728.98, "step": 29775, "train_speed(iter/s)": 0.238934 }, { "acc": 0.77341056, "epoch": 0.7554552819876997, "grad_norm": 3.9375, "learning_rate": 7.3398566634792435e-06, "loss": 0.88074055, "memory(GiB)": 728.98, "step": 29780, "train_speed(iter/s)": 0.238855 }, { "acc": 0.77636852, "epoch": 0.7555821213567373, "grad_norm": 3.65625, "learning_rate": 7.338929872764242e-06, "loss": 0.85854464, "memory(GiB)": 728.98, "step": 29785, "train_speed(iter/s)": 0.238768 }, { "acc": 0.76659021, "epoch": 0.7557089607257749, "grad_norm": 3.296875, "learning_rate": 7.3380029791671685e-06, "loss": 0.88439455, "memory(GiB)": 728.98, "step": 29790, "train_speed(iter/s)": 0.238674 }, { "acc": 0.78244081, "epoch": 0.7558358000948124, "grad_norm": 3.09375, "learning_rate": 7.3370759827287905e-06, "loss": 0.8666048, "memory(GiB)": 728.98, "step": 29795, "train_speed(iter/s)": 0.238587 }, { "acc": 0.76910405, "epoch": 0.75596263946385, "grad_norm": 3.46875, "learning_rate": 7.336148883489889e-06, "loss": 0.91238756, "memory(GiB)": 728.98, "step": 29800, "train_speed(iter/s)": 0.238504 }, { "acc": 0.78831525, "epoch": 0.7560894788328876, "grad_norm": 5.0625, "learning_rate": 7.33522168149124e-06, "loss": 0.84811869, "memory(GiB)": 728.98, "step": 29805, "train_speed(iter/s)": 0.238402 }, { "acc": 0.75981526, "epoch": 0.7562163182019251, "grad_norm": 2.953125, "learning_rate": 7.334294376773632e-06, "loss": 0.92858276, "memory(GiB)": 728.98, "step": 29810, "train_speed(iter/s)": 0.23832 }, { "acc": 0.76052642, "epoch": 0.7563431575709627, "grad_norm": 2.96875, "learning_rate": 7.333366969377849e-06, "loss": 0.90001307, "memory(GiB)": 728.98, "step": 29815, "train_speed(iter/s)": 0.238234 }, { "acc": 0.76196923, "epoch": 0.7564699969400002, "grad_norm": 3.578125, "learning_rate": 7.3324394593446894e-06, "loss": 0.9136694, "memory(GiB)": 728.98, "step": 29820, "train_speed(iter/s)": 0.238152 }, { "acc": 0.76934681, "epoch": 0.7565968363090377, "grad_norm": 3.546875, "learning_rate": 7.331511846714948e-06, "loss": 0.85375242, "memory(GiB)": 728.98, "step": 29825, "train_speed(iter/s)": 0.238062 }, { "acc": 0.77374997, "epoch": 0.7567236756780753, "grad_norm": 3.796875, "learning_rate": 7.330584131529431e-06, "loss": 0.89601212, "memory(GiB)": 728.98, "step": 29830, "train_speed(iter/s)": 0.237976 }, { "acc": 0.76836538, "epoch": 0.7568505150471129, "grad_norm": 3.421875, "learning_rate": 7.329656313828943e-06, "loss": 0.87956247, "memory(GiB)": 728.98, "step": 29835, "train_speed(iter/s)": 0.237893 }, { "acc": 0.76475158, "epoch": 0.7569773544161504, "grad_norm": 3.515625, "learning_rate": 7.328728393654296e-06, "loss": 0.91961136, "memory(GiB)": 728.98, "step": 29840, "train_speed(iter/s)": 0.237799 }, { "acc": 0.76318493, "epoch": 0.757104193785188, "grad_norm": 3.609375, "learning_rate": 7.3278003710463074e-06, "loss": 0.87822027, "memory(GiB)": 728.98, "step": 29845, "train_speed(iter/s)": 0.2377 }, { "acc": 0.77500725, "epoch": 0.7572310331542256, "grad_norm": 2.875, "learning_rate": 7.326872246045798e-06, "loss": 0.81377144, "memory(GiB)": 728.98, "step": 29850, "train_speed(iter/s)": 0.237613 }, { "acc": 0.76480384, "epoch": 0.7573578725232631, "grad_norm": 3.40625, "learning_rate": 7.325944018693592e-06, "loss": 0.91796207, "memory(GiB)": 728.98, "step": 29855, "train_speed(iter/s)": 0.237525 }, { "acc": 0.76333609, "epoch": 0.7574847118923007, "grad_norm": 3.84375, "learning_rate": 7.3250156890305176e-06, "loss": 0.9102479, "memory(GiB)": 728.98, "step": 29860, "train_speed(iter/s)": 0.237438 }, { "acc": 0.75518513, "epoch": 0.7576115512613383, "grad_norm": 3.71875, "learning_rate": 7.324087257097413e-06, "loss": 0.92841349, "memory(GiB)": 728.98, "step": 29865, "train_speed(iter/s)": 0.237335 }, { "acc": 0.77569137, "epoch": 0.7577383906303758, "grad_norm": 3.75, "learning_rate": 7.323158722935113e-06, "loss": 0.87258682, "memory(GiB)": 728.98, "step": 29870, "train_speed(iter/s)": 0.237244 }, { "acc": 0.76451216, "epoch": 0.7578652299994134, "grad_norm": 3.703125, "learning_rate": 7.3222300865844655e-06, "loss": 0.90601311, "memory(GiB)": 728.98, "step": 29875, "train_speed(iter/s)": 0.237156 }, { "acc": 0.77392273, "epoch": 0.7579920693684509, "grad_norm": 3.0625, "learning_rate": 7.321301348086313e-06, "loss": 0.87271023, "memory(GiB)": 728.98, "step": 29880, "train_speed(iter/s)": 0.237057 }, { "acc": 0.75117769, "epoch": 0.7581189087374884, "grad_norm": 4.09375, "learning_rate": 7.3203725074815125e-06, "loss": 0.93836184, "memory(GiB)": 728.98, "step": 29885, "train_speed(iter/s)": 0.23696 }, { "acc": 0.76069098, "epoch": 0.758245748106526, "grad_norm": 4.4375, "learning_rate": 7.319443564810918e-06, "loss": 0.89573975, "memory(GiB)": 728.98, "step": 29890, "train_speed(iter/s)": 0.236868 }, { "acc": 0.76007166, "epoch": 0.7583725874755636, "grad_norm": 3.0625, "learning_rate": 7.318514520115389e-06, "loss": 0.90021534, "memory(GiB)": 728.98, "step": 29895, "train_speed(iter/s)": 0.23678 }, { "acc": 0.76832094, "epoch": 0.7584994268446011, "grad_norm": 3.34375, "learning_rate": 7.317585373435797e-06, "loss": 0.8672657, "memory(GiB)": 728.98, "step": 29900, "train_speed(iter/s)": 0.236691 }, { "acc": 0.75985436, "epoch": 0.7586262662136387, "grad_norm": 3.171875, "learning_rate": 7.316656124813006e-06, "loss": 0.90357256, "memory(GiB)": 728.98, "step": 29905, "train_speed(iter/s)": 0.236608 }, { "acc": 0.77208939, "epoch": 0.7587531055826763, "grad_norm": 3.703125, "learning_rate": 7.315726774287895e-06, "loss": 0.79601016, "memory(GiB)": 728.98, "step": 29910, "train_speed(iter/s)": 0.236515 }, { "acc": 0.7722116, "epoch": 0.7588799449517138, "grad_norm": 3.734375, "learning_rate": 7.314797321901341e-06, "loss": 0.83571711, "memory(GiB)": 728.98, "step": 29915, "train_speed(iter/s)": 0.236428 }, { "acc": 0.77198644, "epoch": 0.7590067843207514, "grad_norm": 3.46875, "learning_rate": 7.313867767694227e-06, "loss": 0.88852406, "memory(GiB)": 728.98, "step": 29920, "train_speed(iter/s)": 0.236341 }, { "acc": 0.77095213, "epoch": 0.759133623689789, "grad_norm": 4.03125, "learning_rate": 7.312938111707444e-06, "loss": 0.91740761, "memory(GiB)": 728.98, "step": 29925, "train_speed(iter/s)": 0.236268 }, { "acc": 0.77568927, "epoch": 0.7592604630588266, "grad_norm": 3.296875, "learning_rate": 7.312008353981882e-06, "loss": 0.89302435, "memory(GiB)": 728.98, "step": 29930, "train_speed(iter/s)": 0.23619 }, { "acc": 0.7730978, "epoch": 0.7593873024278641, "grad_norm": 3.046875, "learning_rate": 7.31107849455844e-06, "loss": 0.87731352, "memory(GiB)": 728.98, "step": 29935, "train_speed(iter/s)": 0.236111 }, { "acc": 0.75549421, "epoch": 0.7595141417969016, "grad_norm": 4.28125, "learning_rate": 7.310148533478017e-06, "loss": 0.95729551, "memory(GiB)": 728.98, "step": 29940, "train_speed(iter/s)": 0.236032 }, { "acc": 0.75633197, "epoch": 0.7596409811659391, "grad_norm": 3.171875, "learning_rate": 7.309218470781523e-06, "loss": 0.92083406, "memory(GiB)": 728.98, "step": 29945, "train_speed(iter/s)": 0.235946 }, { "acc": 0.74301281, "epoch": 0.7597678205349767, "grad_norm": 2.84375, "learning_rate": 7.308288306509864e-06, "loss": 0.94736996, "memory(GiB)": 728.98, "step": 29950, "train_speed(iter/s)": 0.235859 }, { "acc": 0.75730596, "epoch": 0.7598946599040143, "grad_norm": 3.265625, "learning_rate": 7.307358040703958e-06, "loss": 0.9294158, "memory(GiB)": 728.98, "step": 29955, "train_speed(iter/s)": 0.235777 }, { "acc": 0.78010054, "epoch": 0.7600214992730518, "grad_norm": 3.171875, "learning_rate": 7.306427673404723e-06, "loss": 0.82461843, "memory(GiB)": 728.98, "step": 29960, "train_speed(iter/s)": 0.235695 }, { "acc": 0.77143941, "epoch": 0.7601483386420894, "grad_norm": 3.375, "learning_rate": 7.305497204653085e-06, "loss": 0.866049, "memory(GiB)": 728.98, "step": 29965, "train_speed(iter/s)": 0.235616 }, { "acc": 0.76351905, "epoch": 0.760275178011127, "grad_norm": 3.265625, "learning_rate": 7.304566634489969e-06, "loss": 0.94736452, "memory(GiB)": 728.98, "step": 29970, "train_speed(iter/s)": 0.235534 }, { "acc": 0.76533761, "epoch": 0.7604020173801646, "grad_norm": 2.84375, "learning_rate": 7.303635962956309e-06, "loss": 0.90993576, "memory(GiB)": 728.98, "step": 29975, "train_speed(iter/s)": 0.235437 }, { "acc": 0.76259093, "epoch": 0.7605288567492021, "grad_norm": 2.96875, "learning_rate": 7.302705190093045e-06, "loss": 0.92313175, "memory(GiB)": 728.98, "step": 29980, "train_speed(iter/s)": 0.235334 }, { "acc": 0.76559052, "epoch": 0.7606556961182397, "grad_norm": 3.125, "learning_rate": 7.301774315941115e-06, "loss": 0.90625668, "memory(GiB)": 728.98, "step": 29985, "train_speed(iter/s)": 0.23525 }, { "acc": 0.78022943, "epoch": 0.7607825354872773, "grad_norm": 3.109375, "learning_rate": 7.300843340541467e-06, "loss": 0.82773952, "memory(GiB)": 728.98, "step": 29990, "train_speed(iter/s)": 0.23517 }, { "acc": 0.76337795, "epoch": 0.7609093748563148, "grad_norm": 3.578125, "learning_rate": 7.299912263935051e-06, "loss": 0.88643999, "memory(GiB)": 728.98, "step": 29995, "train_speed(iter/s)": 0.235099 }, { "acc": 0.78533688, "epoch": 0.7610362142253523, "grad_norm": 3.046875, "learning_rate": 7.298981086162823e-06, "loss": 0.82400999, "memory(GiB)": 728.98, "step": 30000, "train_speed(iter/s)": 0.235015 }, { "epoch": 0.7610362142253523, "eval_acc": 0.755893965643652, "eval_loss": 0.8659295439720154, "eval_runtime": 1151.5701, "eval_samples_per_second": 5.532, "eval_steps_per_second": 5.532, "step": 30000 }, { "acc": 0.75909472, "epoch": 0.7611630535943898, "grad_norm": 3.59375, "learning_rate": 7.298049807265742e-06, "loss": 0.95584602, "memory(GiB)": 728.98, "step": 30005, "train_speed(iter/s)": 0.231534 }, { "acc": 0.77503972, "epoch": 0.7612898929634274, "grad_norm": 3.03125, "learning_rate": 7.297118427284772e-06, "loss": 0.83554335, "memory(GiB)": 728.98, "step": 30010, "train_speed(iter/s)": 0.231448 }, { "acc": 0.7608077, "epoch": 0.761416732332465, "grad_norm": 3.046875, "learning_rate": 7.2961869462608815e-06, "loss": 0.91333323, "memory(GiB)": 728.98, "step": 30015, "train_speed(iter/s)": 0.231363 }, { "acc": 0.77873635, "epoch": 0.7615435717015026, "grad_norm": 2.984375, "learning_rate": 7.295255364235043e-06, "loss": 0.88604679, "memory(GiB)": 728.98, "step": 30020, "train_speed(iter/s)": 0.231281 }, { "acc": 0.7705708, "epoch": 0.7616704110705401, "grad_norm": 3.171875, "learning_rate": 7.2943236812482345e-06, "loss": 0.86535444, "memory(GiB)": 728.98, "step": 30025, "train_speed(iter/s)": 0.231198 }, { "acc": 0.78035445, "epoch": 0.7617972504395777, "grad_norm": 3.28125, "learning_rate": 7.293391897341437e-06, "loss": 0.86627569, "memory(GiB)": 728.98, "step": 30030, "train_speed(iter/s)": 0.231126 }, { "acc": 0.7589798, "epoch": 0.7619240898086153, "grad_norm": 3.765625, "learning_rate": 7.292460012555638e-06, "loss": 0.88787842, "memory(GiB)": 728.98, "step": 30035, "train_speed(iter/s)": 0.231022 }, { "acc": 0.77210894, "epoch": 0.7620509291776528, "grad_norm": 3.40625, "learning_rate": 7.291528026931825e-06, "loss": 0.8776082, "memory(GiB)": 728.98, "step": 30040, "train_speed(iter/s)": 0.230949 }, { "acc": 0.77186933, "epoch": 0.7621777685466904, "grad_norm": 3.3125, "learning_rate": 7.290595940510998e-06, "loss": 0.92811022, "memory(GiB)": 728.98, "step": 30045, "train_speed(iter/s)": 0.230869 }, { "acc": 0.76185546, "epoch": 0.762304607915728, "grad_norm": 3.390625, "learning_rate": 7.289663753334153e-06, "loss": 0.88294802, "memory(GiB)": 728.98, "step": 30050, "train_speed(iter/s)": 0.230776 }, { "acc": 0.76842413, "epoch": 0.7624314472847655, "grad_norm": 3.78125, "learning_rate": 7.288731465442294e-06, "loss": 0.89030962, "memory(GiB)": 728.98, "step": 30055, "train_speed(iter/s)": 0.230691 }, { "acc": 0.76813116, "epoch": 0.762558286653803, "grad_norm": 3.765625, "learning_rate": 7.287799076876432e-06, "loss": 0.9315773, "memory(GiB)": 728.98, "step": 30060, "train_speed(iter/s)": 0.230599 }, { "acc": 0.76983061, "epoch": 0.7626851260228406, "grad_norm": 5.0, "learning_rate": 7.286866587677576e-06, "loss": 0.91796122, "memory(GiB)": 728.98, "step": 30065, "train_speed(iter/s)": 0.23052 }, { "acc": 0.76338344, "epoch": 0.7628119653918781, "grad_norm": 3.203125, "learning_rate": 7.285933997886746e-06, "loss": 0.87427311, "memory(GiB)": 728.98, "step": 30070, "train_speed(iter/s)": 0.23043 }, { "acc": 0.75963898, "epoch": 0.7629388047609157, "grad_norm": 3.265625, "learning_rate": 7.285001307544962e-06, "loss": 0.91822004, "memory(GiB)": 728.98, "step": 30075, "train_speed(iter/s)": 0.230349 }, { "acc": 0.76048231, "epoch": 0.7630656441299533, "grad_norm": 3.625, "learning_rate": 7.284068516693252e-06, "loss": 0.9556592, "memory(GiB)": 728.98, "step": 30080, "train_speed(iter/s)": 0.230267 }, { "acc": 0.76589622, "epoch": 0.7631924834989908, "grad_norm": 3.78125, "learning_rate": 7.283135625372646e-06, "loss": 0.899265, "memory(GiB)": 728.98, "step": 30085, "train_speed(iter/s)": 0.230183 }, { "acc": 0.75770912, "epoch": 0.7633193228680284, "grad_norm": 3.453125, "learning_rate": 7.282202633624178e-06, "loss": 0.88788204, "memory(GiB)": 728.98, "step": 30090, "train_speed(iter/s)": 0.230102 }, { "acc": 0.76450529, "epoch": 0.763446162237066, "grad_norm": 3.765625, "learning_rate": 7.281269541488887e-06, "loss": 0.88160963, "memory(GiB)": 728.98, "step": 30095, "train_speed(iter/s)": 0.230014 }, { "acc": 0.76206555, "epoch": 0.7635730016061035, "grad_norm": 3.8125, "learning_rate": 7.280336349007817e-06, "loss": 0.95214682, "memory(GiB)": 728.98, "step": 30100, "train_speed(iter/s)": 0.229941 }, { "acc": 0.75901704, "epoch": 0.7636998409751411, "grad_norm": 3.109375, "learning_rate": 7.2794030562220186e-06, "loss": 0.89010544, "memory(GiB)": 728.98, "step": 30105, "train_speed(iter/s)": 0.229861 }, { "acc": 0.7614017, "epoch": 0.7638266803441787, "grad_norm": 3.75, "learning_rate": 7.27846966317254e-06, "loss": 0.90880251, "memory(GiB)": 728.98, "step": 30110, "train_speed(iter/s)": 0.229791 }, { "acc": 0.77229681, "epoch": 0.7639535197132162, "grad_norm": 3.78125, "learning_rate": 7.277536169900443e-06, "loss": 0.87242594, "memory(GiB)": 728.98, "step": 30115, "train_speed(iter/s)": 0.229714 }, { "acc": 0.77530961, "epoch": 0.7640803590822537, "grad_norm": 3.03125, "learning_rate": 7.276602576446785e-06, "loss": 0.85962601, "memory(GiB)": 728.98, "step": 30120, "train_speed(iter/s)": 0.229615 }, { "acc": 0.76795902, "epoch": 0.7642071984512913, "grad_norm": 3.21875, "learning_rate": 7.275668882852634e-06, "loss": 0.89677525, "memory(GiB)": 728.98, "step": 30125, "train_speed(iter/s)": 0.229526 }, { "acc": 0.76964984, "epoch": 0.7643340378203288, "grad_norm": 3.59375, "learning_rate": 7.27473508915906e-06, "loss": 0.91137075, "memory(GiB)": 728.98, "step": 30130, "train_speed(iter/s)": 0.229443 }, { "acc": 0.76660895, "epoch": 0.7644608771893664, "grad_norm": 3.5, "learning_rate": 7.273801195407136e-06, "loss": 0.87709389, "memory(GiB)": 728.98, "step": 30135, "train_speed(iter/s)": 0.229373 }, { "acc": 0.77511787, "epoch": 0.764587716558404, "grad_norm": 3.09375, "learning_rate": 7.272867201637942e-06, "loss": 0.87936344, "memory(GiB)": 728.98, "step": 30140, "train_speed(iter/s)": 0.229301 }, { "acc": 0.75202069, "epoch": 0.7647145559274415, "grad_norm": 2.953125, "learning_rate": 7.2719331078925615e-06, "loss": 0.94938745, "memory(GiB)": 728.98, "step": 30145, "train_speed(iter/s)": 0.229216 }, { "acc": 0.77150621, "epoch": 0.7648413952964791, "grad_norm": 3.34375, "learning_rate": 7.270998914212084e-06, "loss": 0.87641897, "memory(GiB)": 728.98, "step": 30150, "train_speed(iter/s)": 0.229149 }, { "acc": 0.78576546, "epoch": 0.7649682346655167, "grad_norm": 3.59375, "learning_rate": 7.270064620637598e-06, "loss": 0.80569162, "memory(GiB)": 728.98, "step": 30155, "train_speed(iter/s)": 0.229074 }, { "acc": 0.76909428, "epoch": 0.7650950740345542, "grad_norm": 3.328125, "learning_rate": 7.269130227210204e-06, "loss": 0.87555485, "memory(GiB)": 728.98, "step": 30160, "train_speed(iter/s)": 0.228994 }, { "acc": 0.75373173, "epoch": 0.7652219134035918, "grad_norm": 4.03125, "learning_rate": 7.268195733971001e-06, "loss": 0.89421568, "memory(GiB)": 728.98, "step": 30165, "train_speed(iter/s)": 0.22893 }, { "acc": 0.76621275, "epoch": 0.7653487527726294, "grad_norm": 3.53125, "learning_rate": 7.2672611409610935e-06, "loss": 0.87620506, "memory(GiB)": 728.98, "step": 30170, "train_speed(iter/s)": 0.228863 }, { "acc": 0.77563214, "epoch": 0.7654755921416669, "grad_norm": 3.546875, "learning_rate": 7.266326448221594e-06, "loss": 0.8692771, "memory(GiB)": 728.98, "step": 30175, "train_speed(iter/s)": 0.228788 }, { "acc": 0.7662344, "epoch": 0.7656024315107044, "grad_norm": 3.109375, "learning_rate": 7.265391655793613e-06, "loss": 0.91746473, "memory(GiB)": 728.98, "step": 30180, "train_speed(iter/s)": 0.228707 }, { "acc": 0.75845714, "epoch": 0.765729270879742, "grad_norm": 3.453125, "learning_rate": 7.2644567637182725e-06, "loss": 0.91289473, "memory(GiB)": 728.98, "step": 30185, "train_speed(iter/s)": 0.228633 }, { "acc": 0.76963029, "epoch": 0.7658561102487795, "grad_norm": 3.28125, "learning_rate": 7.263521772036693e-06, "loss": 0.86757059, "memory(GiB)": 728.98, "step": 30190, "train_speed(iter/s)": 0.228558 }, { "acc": 0.76139193, "epoch": 0.7659829496178171, "grad_norm": 3.578125, "learning_rate": 7.2625866807900044e-06, "loss": 0.90311298, "memory(GiB)": 728.98, "step": 30195, "train_speed(iter/s)": 0.228485 }, { "acc": 0.7661128, "epoch": 0.7661097889868547, "grad_norm": 3.609375, "learning_rate": 7.261651490019334e-06, "loss": 0.87948875, "memory(GiB)": 728.98, "step": 30200, "train_speed(iter/s)": 0.22841 }, { "acc": 0.74951077, "epoch": 0.7662366283558922, "grad_norm": 3.359375, "learning_rate": 7.260716199765823e-06, "loss": 0.94704542, "memory(GiB)": 728.98, "step": 30205, "train_speed(iter/s)": 0.22832 }, { "acc": 0.77064772, "epoch": 0.7663634677249298, "grad_norm": 3.59375, "learning_rate": 7.2597808100706095e-06, "loss": 0.90411453, "memory(GiB)": 728.98, "step": 30210, "train_speed(iter/s)": 0.22825 }, { "acc": 0.78305583, "epoch": 0.7664903070939674, "grad_norm": 3.5, "learning_rate": 7.258845320974837e-06, "loss": 0.85365744, "memory(GiB)": 728.98, "step": 30215, "train_speed(iter/s)": 0.228176 }, { "acc": 0.77638774, "epoch": 0.7666171464630049, "grad_norm": 4.0625, "learning_rate": 7.257909732519655e-06, "loss": 0.84456415, "memory(GiB)": 728.98, "step": 30220, "train_speed(iter/s)": 0.228096 }, { "acc": 0.74569654, "epoch": 0.7667439858320425, "grad_norm": 3.921875, "learning_rate": 7.25697404474622e-06, "loss": 0.94437647, "memory(GiB)": 728.98, "step": 30225, "train_speed(iter/s)": 0.228015 }, { "acc": 0.76342087, "epoch": 0.7668708252010801, "grad_norm": 3.296875, "learning_rate": 7.2560382576956875e-06, "loss": 0.86395502, "memory(GiB)": 728.98, "step": 30230, "train_speed(iter/s)": 0.227931 }, { "acc": 0.75698204, "epoch": 0.7669976645701176, "grad_norm": 3.53125, "learning_rate": 7.25510237140922e-06, "loss": 0.9131074, "memory(GiB)": 728.98, "step": 30235, "train_speed(iter/s)": 0.227859 }, { "acc": 0.78475714, "epoch": 0.7671245039391551, "grad_norm": 3.625, "learning_rate": 7.254166385927984e-06, "loss": 0.80972509, "memory(GiB)": 728.98, "step": 30240, "train_speed(iter/s)": 0.227785 }, { "acc": 0.76364484, "epoch": 0.7672513433081927, "grad_norm": 4.84375, "learning_rate": 7.253230301293151e-06, "loss": 0.91840525, "memory(GiB)": 728.98, "step": 30245, "train_speed(iter/s)": 0.227706 }, { "acc": 0.77534595, "epoch": 0.7673781826772302, "grad_norm": 3.046875, "learning_rate": 7.252294117545897e-06, "loss": 0.92670059, "memory(GiB)": 728.98, "step": 30250, "train_speed(iter/s)": 0.227604 }, { "acc": 0.75923371, "epoch": 0.7675050220462678, "grad_norm": 3.40625, "learning_rate": 7.251357834727401e-06, "loss": 0.89421864, "memory(GiB)": 728.98, "step": 30255, "train_speed(iter/s)": 0.227524 }, { "acc": 0.76766052, "epoch": 0.7676318614153054, "grad_norm": 3.609375, "learning_rate": 7.250421452878846e-06, "loss": 0.87033348, "memory(GiB)": 728.98, "step": 30260, "train_speed(iter/s)": 0.227452 }, { "acc": 0.7641366, "epoch": 0.7677587007843429, "grad_norm": 3.8125, "learning_rate": 7.249484972041423e-06, "loss": 0.87201424, "memory(GiB)": 728.98, "step": 30265, "train_speed(iter/s)": 0.227378 }, { "acc": 0.77296801, "epoch": 0.7678855401533805, "grad_norm": 3.34375, "learning_rate": 7.248548392256323e-06, "loss": 0.90901918, "memory(GiB)": 728.98, "step": 30270, "train_speed(iter/s)": 0.227307 }, { "acc": 0.76662073, "epoch": 0.7680123795224181, "grad_norm": 3.609375, "learning_rate": 7.247611713564744e-06, "loss": 0.89689236, "memory(GiB)": 728.98, "step": 30275, "train_speed(iter/s)": 0.227242 }, { "acc": 0.75944409, "epoch": 0.7681392188914556, "grad_norm": 3.375, "learning_rate": 7.246674936007884e-06, "loss": 0.93872824, "memory(GiB)": 728.98, "step": 30280, "train_speed(iter/s)": 0.22716 }, { "acc": 0.76089449, "epoch": 0.7682660582604932, "grad_norm": 3.0625, "learning_rate": 7.245738059626955e-06, "loss": 0.94192734, "memory(GiB)": 728.98, "step": 30285, "train_speed(iter/s)": 0.227071 }, { "acc": 0.76452112, "epoch": 0.7683928976295308, "grad_norm": 3.5625, "learning_rate": 7.244801084463163e-06, "loss": 0.9159256, "memory(GiB)": 728.98, "step": 30290, "train_speed(iter/s)": 0.226991 }, { "acc": 0.76242809, "epoch": 0.7685197369985683, "grad_norm": 4.3125, "learning_rate": 7.243864010557725e-06, "loss": 0.93952007, "memory(GiB)": 728.98, "step": 30295, "train_speed(iter/s)": 0.226913 }, { "acc": 0.76572042, "epoch": 0.7686465763676058, "grad_norm": 3.65625, "learning_rate": 7.242926837951858e-06, "loss": 0.89279919, "memory(GiB)": 728.98, "step": 30300, "train_speed(iter/s)": 0.226842 }, { "acc": 0.77390356, "epoch": 0.7687734157366434, "grad_norm": 4.1875, "learning_rate": 7.241989566686784e-06, "loss": 0.85778198, "memory(GiB)": 728.98, "step": 30305, "train_speed(iter/s)": 0.226768 }, { "acc": 0.77899818, "epoch": 0.7689002551056809, "grad_norm": 3.5625, "learning_rate": 7.241052196803735e-06, "loss": 0.81013184, "memory(GiB)": 728.98, "step": 30310, "train_speed(iter/s)": 0.226691 }, { "acc": 0.76253805, "epoch": 0.7690270944747185, "grad_norm": 3.71875, "learning_rate": 7.240114728343939e-06, "loss": 0.89869165, "memory(GiB)": 728.98, "step": 30315, "train_speed(iter/s)": 0.226623 }, { "acc": 0.76576486, "epoch": 0.7691539338437561, "grad_norm": 3.609375, "learning_rate": 7.239177161348634e-06, "loss": 0.90999641, "memory(GiB)": 728.98, "step": 30320, "train_speed(iter/s)": 0.226542 }, { "acc": 0.75124316, "epoch": 0.7692807732127936, "grad_norm": 3.578125, "learning_rate": 7.2382394958590585e-06, "loss": 0.91025076, "memory(GiB)": 728.98, "step": 30325, "train_speed(iter/s)": 0.226461 }, { "acc": 0.75974703, "epoch": 0.7694076125818312, "grad_norm": 3.53125, "learning_rate": 7.237301731916461e-06, "loss": 0.93326168, "memory(GiB)": 728.98, "step": 30330, "train_speed(iter/s)": 0.226377 }, { "acc": 0.76965013, "epoch": 0.7695344519508688, "grad_norm": 3.9375, "learning_rate": 7.236363869562086e-06, "loss": 0.91873932, "memory(GiB)": 728.98, "step": 30335, "train_speed(iter/s)": 0.226311 }, { "acc": 0.76142554, "epoch": 0.7696612913199063, "grad_norm": 3.296875, "learning_rate": 7.2354259088371936e-06, "loss": 0.88764009, "memory(GiB)": 728.98, "step": 30340, "train_speed(iter/s)": 0.226244 }, { "acc": 0.77484069, "epoch": 0.7697881306889439, "grad_norm": 3.515625, "learning_rate": 7.234487849783035e-06, "loss": 0.89010839, "memory(GiB)": 728.98, "step": 30345, "train_speed(iter/s)": 0.226168 }, { "acc": 0.75765071, "epoch": 0.7699149700579815, "grad_norm": 3.421875, "learning_rate": 7.233549692440877e-06, "loss": 0.93814459, "memory(GiB)": 728.98, "step": 30350, "train_speed(iter/s)": 0.226102 }, { "acc": 0.77534933, "epoch": 0.770041809427019, "grad_norm": 3.015625, "learning_rate": 7.232611436851984e-06, "loss": 0.89184628, "memory(GiB)": 728.98, "step": 30355, "train_speed(iter/s)": 0.226025 }, { "acc": 0.7767735, "epoch": 0.7701686487960565, "grad_norm": 3.6875, "learning_rate": 7.2316730830576275e-06, "loss": 0.83067417, "memory(GiB)": 728.98, "step": 30360, "train_speed(iter/s)": 0.225942 }, { "acc": 0.77766757, "epoch": 0.7702954881650941, "grad_norm": 3.15625, "learning_rate": 7.230734631099083e-06, "loss": 0.88830538, "memory(GiB)": 728.98, "step": 30365, "train_speed(iter/s)": 0.225871 }, { "acc": 0.75676889, "epoch": 0.7704223275341316, "grad_norm": 3.09375, "learning_rate": 7.229796081017628e-06, "loss": 0.92233229, "memory(GiB)": 728.98, "step": 30370, "train_speed(iter/s)": 0.225796 }, { "acc": 0.76603899, "epoch": 0.7705491669031692, "grad_norm": 3.84375, "learning_rate": 7.2288574328545505e-06, "loss": 0.90904417, "memory(GiB)": 728.98, "step": 30375, "train_speed(iter/s)": 0.225714 }, { "acc": 0.77107563, "epoch": 0.7706760062722068, "grad_norm": 3.03125, "learning_rate": 7.2279186866511355e-06, "loss": 0.86897259, "memory(GiB)": 728.98, "step": 30380, "train_speed(iter/s)": 0.225643 }, { "acc": 0.7678102, "epoch": 0.7708028456412443, "grad_norm": 3.578125, "learning_rate": 7.226979842448675e-06, "loss": 0.89795761, "memory(GiB)": 728.98, "step": 30385, "train_speed(iter/s)": 0.225568 }, { "acc": 0.763732, "epoch": 0.7709296850102819, "grad_norm": 3.96875, "learning_rate": 7.226040900288468e-06, "loss": 0.9328783, "memory(GiB)": 728.98, "step": 30390, "train_speed(iter/s)": 0.225484 }, { "acc": 0.76611757, "epoch": 0.7710565243793195, "grad_norm": 3.734375, "learning_rate": 7.2251018602118125e-06, "loss": 0.90264759, "memory(GiB)": 728.98, "step": 30395, "train_speed(iter/s)": 0.225414 }, { "acc": 0.7508584, "epoch": 0.771183363748357, "grad_norm": 3.453125, "learning_rate": 7.224162722260017e-06, "loss": 0.91552935, "memory(GiB)": 728.98, "step": 30400, "train_speed(iter/s)": 0.225343 }, { "acc": 0.7723196, "epoch": 0.7713102031173946, "grad_norm": 3.34375, "learning_rate": 7.22322348647439e-06, "loss": 0.91513958, "memory(GiB)": 728.98, "step": 30405, "train_speed(iter/s)": 0.225271 }, { "acc": 0.76358805, "epoch": 0.7714370424864322, "grad_norm": 3.09375, "learning_rate": 7.222284152896246e-06, "loss": 0.92795115, "memory(GiB)": 728.98, "step": 30410, "train_speed(iter/s)": 0.225199 }, { "acc": 0.78854656, "epoch": 0.7715638818554698, "grad_norm": 3.859375, "learning_rate": 7.221344721566902e-06, "loss": 0.79778452, "memory(GiB)": 728.98, "step": 30415, "train_speed(iter/s)": 0.225134 }, { "acc": 0.76185079, "epoch": 0.7716907212245072, "grad_norm": 3.46875, "learning_rate": 7.220405192527682e-06, "loss": 0.92221985, "memory(GiB)": 728.98, "step": 30420, "train_speed(iter/s)": 0.225046 }, { "acc": 0.76301861, "epoch": 0.7718175605935448, "grad_norm": 3.8125, "learning_rate": 7.219465565819912e-06, "loss": 0.89310865, "memory(GiB)": 728.98, "step": 30425, "train_speed(iter/s)": 0.224966 }, { "acc": 0.77638812, "epoch": 0.7719443999625824, "grad_norm": 3.703125, "learning_rate": 7.218525841484923e-06, "loss": 0.86396914, "memory(GiB)": 728.98, "step": 30430, "train_speed(iter/s)": 0.224895 }, { "acc": 0.78067865, "epoch": 0.7720712393316199, "grad_norm": 3.5, "learning_rate": 7.2175860195640515e-06, "loss": 0.81338577, "memory(GiB)": 728.98, "step": 30435, "train_speed(iter/s)": 0.224824 }, { "acc": 0.76809125, "epoch": 0.7721980787006575, "grad_norm": 3.734375, "learning_rate": 7.216646100098634e-06, "loss": 0.91762304, "memory(GiB)": 728.98, "step": 30440, "train_speed(iter/s)": 0.224744 }, { "acc": 0.76431422, "epoch": 0.772324918069695, "grad_norm": 3.734375, "learning_rate": 7.215706083130021e-06, "loss": 0.87089977, "memory(GiB)": 728.98, "step": 30445, "train_speed(iter/s)": 0.224666 }, { "acc": 0.76805024, "epoch": 0.7724517574387326, "grad_norm": 3.359375, "learning_rate": 7.2147659686995545e-06, "loss": 0.9380703, "memory(GiB)": 728.98, "step": 30450, "train_speed(iter/s)": 0.224597 }, { "acc": 0.77459593, "epoch": 0.7725785968077702, "grad_norm": 3.890625, "learning_rate": 7.213825756848592e-06, "loss": 0.86875629, "memory(GiB)": 728.98, "step": 30455, "train_speed(iter/s)": 0.224517 }, { "acc": 0.76198001, "epoch": 0.7727054361768078, "grad_norm": 3.1875, "learning_rate": 7.212885447618485e-06, "loss": 0.85705719, "memory(GiB)": 728.98, "step": 30460, "train_speed(iter/s)": 0.224444 }, { "acc": 0.77287602, "epoch": 0.7728322755458453, "grad_norm": 3.46875, "learning_rate": 7.211945041050601e-06, "loss": 0.86704044, "memory(GiB)": 728.98, "step": 30465, "train_speed(iter/s)": 0.224375 }, { "acc": 0.75450945, "epoch": 0.7729591149148829, "grad_norm": 3.421875, "learning_rate": 7.211004537186301e-06, "loss": 0.91895027, "memory(GiB)": 728.98, "step": 30470, "train_speed(iter/s)": 0.2243 }, { "acc": 0.77969065, "epoch": 0.7730859542839205, "grad_norm": 3.5, "learning_rate": 7.210063936066956e-06, "loss": 0.81616602, "memory(GiB)": 728.98, "step": 30475, "train_speed(iter/s)": 0.224224 }, { "acc": 0.7815527, "epoch": 0.7732127936529579, "grad_norm": 3.203125, "learning_rate": 7.20912323773394e-06, "loss": 0.82041702, "memory(GiB)": 728.98, "step": 30480, "train_speed(iter/s)": 0.224147 }, { "acc": 0.7805687, "epoch": 0.7733396330219955, "grad_norm": 3.78125, "learning_rate": 7.208182442228631e-06, "loss": 0.83919201, "memory(GiB)": 728.98, "step": 30485, "train_speed(iter/s)": 0.224075 }, { "acc": 0.76502814, "epoch": 0.773466472391033, "grad_norm": 3.765625, "learning_rate": 7.207241549592413e-06, "loss": 0.89078741, "memory(GiB)": 728.98, "step": 30490, "train_speed(iter/s)": 0.22401 }, { "acc": 0.76662512, "epoch": 0.7735933117600706, "grad_norm": 4.09375, "learning_rate": 7.2063005598666705e-06, "loss": 0.88860378, "memory(GiB)": 728.98, "step": 30495, "train_speed(iter/s)": 0.223932 }, { "acc": 0.74789147, "epoch": 0.7737201511291082, "grad_norm": 3.6875, "learning_rate": 7.205359473092798e-06, "loss": 0.91580229, "memory(GiB)": 728.98, "step": 30500, "train_speed(iter/s)": 0.22387 }, { "epoch": 0.7737201511291082, "eval_acc": 0.7560021673768015, "eval_loss": 0.8651418685913086, "eval_runtime": 1149.8103, "eval_samples_per_second": 5.54, "eval_steps_per_second": 5.54, "step": 30500 }, { "acc": 0.76371083, "epoch": 0.7738469904981458, "grad_norm": 3.1875, "learning_rate": 7.204418289312188e-06, "loss": 0.97383928, "memory(GiB)": 728.98, "step": 30505, "train_speed(iter/s)": 0.220762 }, { "acc": 0.75747709, "epoch": 0.7739738298671833, "grad_norm": 3.734375, "learning_rate": 7.203477008566241e-06, "loss": 0.92471113, "memory(GiB)": 728.98, "step": 30510, "train_speed(iter/s)": 0.220689 }, { "acc": 0.77401314, "epoch": 0.7741006692362209, "grad_norm": 3.71875, "learning_rate": 7.202535630896362e-06, "loss": 0.91815586, "memory(GiB)": 728.98, "step": 30515, "train_speed(iter/s)": 0.220622 }, { "acc": 0.76074443, "epoch": 0.7742275086052585, "grad_norm": 3.578125, "learning_rate": 7.201594156343957e-06, "loss": 0.93450985, "memory(GiB)": 728.98, "step": 30520, "train_speed(iter/s)": 0.220539 }, { "acc": 0.76755805, "epoch": 0.774354347974296, "grad_norm": 3.265625, "learning_rate": 7.200652584950442e-06, "loss": 0.94011049, "memory(GiB)": 728.98, "step": 30525, "train_speed(iter/s)": 0.220469 }, { "acc": 0.75685287, "epoch": 0.7744811873433336, "grad_norm": 2.859375, "learning_rate": 7.199710916757229e-06, "loss": 0.88435907, "memory(GiB)": 728.98, "step": 30530, "train_speed(iter/s)": 0.220395 }, { "acc": 0.76416583, "epoch": 0.7746080267123712, "grad_norm": 2.703125, "learning_rate": 7.198769151805743e-06, "loss": 0.92832279, "memory(GiB)": 728.98, "step": 30535, "train_speed(iter/s)": 0.220317 }, { "acc": 0.75572968, "epoch": 0.7747348660814086, "grad_norm": 3.359375, "learning_rate": 7.197827290137407e-06, "loss": 0.91841393, "memory(GiB)": 728.98, "step": 30540, "train_speed(iter/s)": 0.220244 }, { "acc": 0.76639624, "epoch": 0.7748617054504462, "grad_norm": 3.84375, "learning_rate": 7.196885331793651e-06, "loss": 0.90485792, "memory(GiB)": 728.98, "step": 30545, "train_speed(iter/s)": 0.220182 }, { "acc": 0.76845703, "epoch": 0.7749885448194838, "grad_norm": 4.3125, "learning_rate": 7.195943276815911e-06, "loss": 0.88853235, "memory(GiB)": 728.98, "step": 30550, "train_speed(iter/s)": 0.220098 }, { "acc": 0.77560143, "epoch": 0.7751153841885213, "grad_norm": 3.515625, "learning_rate": 7.1950011252456195e-06, "loss": 0.89340124, "memory(GiB)": 728.98, "step": 30555, "train_speed(iter/s)": 0.220027 }, { "acc": 0.78069768, "epoch": 0.7752422235575589, "grad_norm": 3.453125, "learning_rate": 7.194058877124226e-06, "loss": 0.88812847, "memory(GiB)": 728.98, "step": 30560, "train_speed(iter/s)": 0.219945 }, { "acc": 0.76757598, "epoch": 0.7753690629265965, "grad_norm": 4.21875, "learning_rate": 7.19311653249317e-06, "loss": 0.87022772, "memory(GiB)": 728.98, "step": 30565, "train_speed(iter/s)": 0.219881 }, { "acc": 0.76670747, "epoch": 0.775495902295634, "grad_norm": 3.390625, "learning_rate": 7.192174091393907e-06, "loss": 0.90280514, "memory(GiB)": 728.98, "step": 30570, "train_speed(iter/s)": 0.219813 }, { "acc": 0.77626758, "epoch": 0.7756227416646716, "grad_norm": 3.28125, "learning_rate": 7.19123155386789e-06, "loss": 0.88232203, "memory(GiB)": 728.98, "step": 30575, "train_speed(iter/s)": 0.219735 }, { "acc": 0.75807328, "epoch": 0.7757495810337092, "grad_norm": 3.75, "learning_rate": 7.190288919956579e-06, "loss": 0.9117734, "memory(GiB)": 728.98, "step": 30580, "train_speed(iter/s)": 0.219655 }, { "acc": 0.76628127, "epoch": 0.7758764204027467, "grad_norm": 4.15625, "learning_rate": 7.189346189701436e-06, "loss": 0.90069971, "memory(GiB)": 728.98, "step": 30585, "train_speed(iter/s)": 0.219591 }, { "acc": 0.75949702, "epoch": 0.7760032597717843, "grad_norm": 3.6875, "learning_rate": 7.188403363143932e-06, "loss": 0.888412, "memory(GiB)": 728.98, "step": 30590, "train_speed(iter/s)": 0.219516 }, { "acc": 0.76106544, "epoch": 0.7761300991408219, "grad_norm": 3.4375, "learning_rate": 7.187460440325536e-06, "loss": 0.89628315, "memory(GiB)": 728.98, "step": 30595, "train_speed(iter/s)": 0.219436 }, { "acc": 0.76537476, "epoch": 0.7762569385098593, "grad_norm": 3.0, "learning_rate": 7.186517421287723e-06, "loss": 0.93204784, "memory(GiB)": 728.98, "step": 30600, "train_speed(iter/s)": 0.219364 }, { "acc": 0.7603622, "epoch": 0.7763837778788969, "grad_norm": 3.84375, "learning_rate": 7.1855743060719775e-06, "loss": 0.90780287, "memory(GiB)": 728.98, "step": 30605, "train_speed(iter/s)": 0.219298 }, { "acc": 0.76888099, "epoch": 0.7765106172479345, "grad_norm": 3.125, "learning_rate": 7.184631094719781e-06, "loss": 0.89048414, "memory(GiB)": 728.98, "step": 30610, "train_speed(iter/s)": 0.219227 }, { "acc": 0.77195997, "epoch": 0.776637456616972, "grad_norm": 3.609375, "learning_rate": 7.183687787272623e-06, "loss": 0.91955996, "memory(GiB)": 728.98, "step": 30615, "train_speed(iter/s)": 0.219155 }, { "acc": 0.76623464, "epoch": 0.7767642959860096, "grad_norm": 3.6875, "learning_rate": 7.1827443837719964e-06, "loss": 0.88990908, "memory(GiB)": 728.98, "step": 30620, "train_speed(iter/s)": 0.21909 }, { "acc": 0.78759251, "epoch": 0.7768911353550472, "grad_norm": 4.09375, "learning_rate": 7.1818008842594e-06, "loss": 0.82603712, "memory(GiB)": 728.98, "step": 30625, "train_speed(iter/s)": 0.219025 }, { "acc": 0.76933079, "epoch": 0.7770179747240847, "grad_norm": 3.234375, "learning_rate": 7.180857288776334e-06, "loss": 0.88504467, "memory(GiB)": 728.98, "step": 30630, "train_speed(iter/s)": 0.218952 }, { "acc": 0.76320291, "epoch": 0.7771448140931223, "grad_norm": 3.1875, "learning_rate": 7.179913597364305e-06, "loss": 0.89382505, "memory(GiB)": 728.98, "step": 30635, "train_speed(iter/s)": 0.218881 }, { "acc": 0.76161857, "epoch": 0.7772716534621599, "grad_norm": 4.625, "learning_rate": 7.178969810064822e-06, "loss": 0.86534786, "memory(GiB)": 728.98, "step": 30640, "train_speed(iter/s)": 0.218802 }, { "acc": 0.76383705, "epoch": 0.7773984928311974, "grad_norm": 3.78125, "learning_rate": 7.178025926919398e-06, "loss": 0.89286404, "memory(GiB)": 728.98, "step": 30645, "train_speed(iter/s)": 0.218744 }, { "acc": 0.77372427, "epoch": 0.777525332200235, "grad_norm": 4.40625, "learning_rate": 7.177081947969556e-06, "loss": 0.85989447, "memory(GiB)": 728.98, "step": 30650, "train_speed(iter/s)": 0.218664 }, { "acc": 0.77294779, "epoch": 0.7776521715692726, "grad_norm": 3.078125, "learning_rate": 7.176137873256815e-06, "loss": 0.90301943, "memory(GiB)": 728.98, "step": 30655, "train_speed(iter/s)": 0.218589 }, { "acc": 0.76366048, "epoch": 0.77777901093831, "grad_norm": 3.5, "learning_rate": 7.1751937028227025e-06, "loss": 0.8976573, "memory(GiB)": 728.98, "step": 30660, "train_speed(iter/s)": 0.218522 }, { "acc": 0.76006775, "epoch": 0.7779058503073476, "grad_norm": 3.421875, "learning_rate": 7.174249436708749e-06, "loss": 0.91174374, "memory(GiB)": 728.98, "step": 30665, "train_speed(iter/s)": 0.218445 }, { "acc": 0.76778932, "epoch": 0.7780326896763852, "grad_norm": 3.484375, "learning_rate": 7.17330507495649e-06, "loss": 0.8743371, "memory(GiB)": 728.98, "step": 30670, "train_speed(iter/s)": 0.218369 }, { "acc": 0.76206388, "epoch": 0.7781595290454227, "grad_norm": 3.734375, "learning_rate": 7.1723606176074665e-06, "loss": 0.89611387, "memory(GiB)": 728.98, "step": 30675, "train_speed(iter/s)": 0.218301 }, { "acc": 0.75910921, "epoch": 0.7782863684144603, "grad_norm": 3.5, "learning_rate": 7.171416064703221e-06, "loss": 0.92672157, "memory(GiB)": 728.98, "step": 30680, "train_speed(iter/s)": 0.218232 }, { "acc": 0.76833172, "epoch": 0.7784132077834979, "grad_norm": 3.453125, "learning_rate": 7.170471416285301e-06, "loss": 0.8873745, "memory(GiB)": 728.98, "step": 30685, "train_speed(iter/s)": 0.218168 }, { "acc": 0.76153932, "epoch": 0.7785400471525354, "grad_norm": 3.734375, "learning_rate": 7.1695266723952585e-06, "loss": 0.94101563, "memory(GiB)": 728.98, "step": 30690, "train_speed(iter/s)": 0.218094 }, { "acc": 0.77842331, "epoch": 0.778666886521573, "grad_norm": 3.578125, "learning_rate": 7.168581833074651e-06, "loss": 0.8806181, "memory(GiB)": 728.98, "step": 30695, "train_speed(iter/s)": 0.218033 }, { "acc": 0.76864133, "epoch": 0.7787937258906106, "grad_norm": 3.25, "learning_rate": 7.167636898365037e-06, "loss": 0.92489977, "memory(GiB)": 728.98, "step": 30700, "train_speed(iter/s)": 0.217974 }, { "acc": 0.76796498, "epoch": 0.7789205652596481, "grad_norm": 4.25, "learning_rate": 7.166691868307983e-06, "loss": 0.96020088, "memory(GiB)": 728.98, "step": 30705, "train_speed(iter/s)": 0.217913 }, { "acc": 0.76576953, "epoch": 0.7790474046286857, "grad_norm": 3.09375, "learning_rate": 7.165746742945057e-06, "loss": 0.92731314, "memory(GiB)": 728.98, "step": 30710, "train_speed(iter/s)": 0.217845 }, { "acc": 0.77431531, "epoch": 0.7791742439977233, "grad_norm": 3.78125, "learning_rate": 7.164801522317833e-06, "loss": 0.86592951, "memory(GiB)": 728.98, "step": 30715, "train_speed(iter/s)": 0.217776 }, { "acc": 0.76462798, "epoch": 0.7793010833667607, "grad_norm": 3.734375, "learning_rate": 7.163856206467887e-06, "loss": 0.94111671, "memory(GiB)": 728.98, "step": 30720, "train_speed(iter/s)": 0.217708 }, { "acc": 0.76282101, "epoch": 0.7794279227357983, "grad_norm": 3.015625, "learning_rate": 7.1629107954368004e-06, "loss": 0.9022131, "memory(GiB)": 728.98, "step": 30725, "train_speed(iter/s)": 0.217645 }, { "acc": 0.77590179, "epoch": 0.7795547621048359, "grad_norm": 3.3125, "learning_rate": 7.161965289266161e-06, "loss": 0.84574747, "memory(GiB)": 728.98, "step": 30730, "train_speed(iter/s)": 0.217578 }, { "acc": 0.75589561, "epoch": 0.7796816014738734, "grad_norm": 3.828125, "learning_rate": 7.161019687997555e-06, "loss": 0.97531404, "memory(GiB)": 728.98, "step": 30735, "train_speed(iter/s)": 0.217517 }, { "acc": 0.75065794, "epoch": 0.779808440842911, "grad_norm": 3.40625, "learning_rate": 7.16007399167258e-06, "loss": 0.93108616, "memory(GiB)": 728.98, "step": 30740, "train_speed(iter/s)": 0.217445 }, { "acc": 0.75663724, "epoch": 0.7799352802119486, "grad_norm": 3.71875, "learning_rate": 7.159128200332831e-06, "loss": 0.96139574, "memory(GiB)": 728.98, "step": 30745, "train_speed(iter/s)": 0.217383 }, { "acc": 0.75500774, "epoch": 0.7800621195809861, "grad_norm": 3.4375, "learning_rate": 7.158182314019912e-06, "loss": 0.94866524, "memory(GiB)": 728.98, "step": 30750, "train_speed(iter/s)": 0.217305 }, { "acc": 0.75634007, "epoch": 0.7801889589500237, "grad_norm": 3.828125, "learning_rate": 7.15723633277543e-06, "loss": 0.95852623, "memory(GiB)": 728.98, "step": 30755, "train_speed(iter/s)": 0.217237 }, { "acc": 0.75602937, "epoch": 0.7803157983190613, "grad_norm": 4.3125, "learning_rate": 7.156290256640996e-06, "loss": 0.94258404, "memory(GiB)": 728.98, "step": 30760, "train_speed(iter/s)": 0.217172 }, { "acc": 0.77516003, "epoch": 0.7804426376880989, "grad_norm": 3.640625, "learning_rate": 7.155344085658224e-06, "loss": 0.87656488, "memory(GiB)": 728.98, "step": 30765, "train_speed(iter/s)": 0.217107 }, { "acc": 0.74704666, "epoch": 0.7805694770571364, "grad_norm": 3.15625, "learning_rate": 7.154397819868732e-06, "loss": 0.93318415, "memory(GiB)": 728.98, "step": 30770, "train_speed(iter/s)": 0.217032 }, { "acc": 0.76977844, "epoch": 0.780696316426174, "grad_norm": 3.5, "learning_rate": 7.153451459314145e-06, "loss": 0.82511854, "memory(GiB)": 728.98, "step": 30775, "train_speed(iter/s)": 0.216958 }, { "acc": 0.7766655, "epoch": 0.7808231557952114, "grad_norm": 3.328125, "learning_rate": 7.1525050040360875e-06, "loss": 0.84093523, "memory(GiB)": 728.98, "step": 30780, "train_speed(iter/s)": 0.216899 }, { "acc": 0.77345681, "epoch": 0.780949995164249, "grad_norm": 3.328125, "learning_rate": 7.151558454076194e-06, "loss": 0.87091665, "memory(GiB)": 728.98, "step": 30785, "train_speed(iter/s)": 0.216826 }, { "acc": 0.7696043, "epoch": 0.7810768345332866, "grad_norm": 3.140625, "learning_rate": 7.1506118094761e-06, "loss": 0.91736851, "memory(GiB)": 728.98, "step": 30790, "train_speed(iter/s)": 0.216758 }, { "acc": 0.7745378, "epoch": 0.7812036739023241, "grad_norm": 3.765625, "learning_rate": 7.149665070277445e-06, "loss": 0.88938112, "memory(GiB)": 728.98, "step": 30795, "train_speed(iter/s)": 0.216696 }, { "acc": 0.75610814, "epoch": 0.7813305132713617, "grad_norm": 3.671875, "learning_rate": 7.1487182365218734e-06, "loss": 0.93748531, "memory(GiB)": 728.98, "step": 30800, "train_speed(iter/s)": 0.216631 }, { "acc": 0.76015515, "epoch": 0.7814573526403993, "grad_norm": 3.46875, "learning_rate": 7.147771308251031e-06, "loss": 0.92434349, "memory(GiB)": 728.98, "step": 30805, "train_speed(iter/s)": 0.216549 }, { "acc": 0.76969514, "epoch": 0.7815841920094369, "grad_norm": 3.03125, "learning_rate": 7.146824285506574e-06, "loss": 0.85479927, "memory(GiB)": 728.98, "step": 30810, "train_speed(iter/s)": 0.216479 }, { "acc": 0.7604506, "epoch": 0.7817110313784744, "grad_norm": 4.34375, "learning_rate": 7.1458771683301555e-06, "loss": 0.87013769, "memory(GiB)": 728.98, "step": 30815, "train_speed(iter/s)": 0.216418 }, { "acc": 0.7722692, "epoch": 0.781837870747512, "grad_norm": 3.296875, "learning_rate": 7.144929956763438e-06, "loss": 0.85230331, "memory(GiB)": 728.98, "step": 30820, "train_speed(iter/s)": 0.216351 }, { "acc": 0.76777239, "epoch": 0.7819647101165496, "grad_norm": 3.28125, "learning_rate": 7.143982650848085e-06, "loss": 0.90700874, "memory(GiB)": 728.98, "step": 30825, "train_speed(iter/s)": 0.216282 }, { "acc": 0.76003399, "epoch": 0.7820915494855871, "grad_norm": 3.765625, "learning_rate": 7.143035250625767e-06, "loss": 0.90562868, "memory(GiB)": 728.98, "step": 30830, "train_speed(iter/s)": 0.216225 }, { "acc": 0.75539103, "epoch": 0.7822183888546247, "grad_norm": 3.9375, "learning_rate": 7.142087756138156e-06, "loss": 0.97947111, "memory(GiB)": 728.98, "step": 30835, "train_speed(iter/s)": 0.216161 }, { "acc": 0.75448699, "epoch": 0.7823452282236621, "grad_norm": 4.03125, "learning_rate": 7.141140167426932e-06, "loss": 0.92594004, "memory(GiB)": 728.98, "step": 30840, "train_speed(iter/s)": 0.216098 }, { "acc": 0.76347408, "epoch": 0.7824720675926997, "grad_norm": 2.875, "learning_rate": 7.140192484533772e-06, "loss": 0.90207806, "memory(GiB)": 728.98, "step": 30845, "train_speed(iter/s)": 0.216031 }, { "acc": 0.76337519, "epoch": 0.7825989069617373, "grad_norm": 3.25, "learning_rate": 7.139244707500363e-06, "loss": 0.90059605, "memory(GiB)": 728.98, "step": 30850, "train_speed(iter/s)": 0.215968 }, { "acc": 0.76260381, "epoch": 0.7827257463307749, "grad_norm": 3.25, "learning_rate": 7.138296836368398e-06, "loss": 0.93317413, "memory(GiB)": 728.98, "step": 30855, "train_speed(iter/s)": 0.215897 }, { "acc": 0.77446365, "epoch": 0.7828525856998124, "grad_norm": 3.09375, "learning_rate": 7.137348871179566e-06, "loss": 0.85997524, "memory(GiB)": 728.98, "step": 30860, "train_speed(iter/s)": 0.215833 }, { "acc": 0.77216487, "epoch": 0.78297942506885, "grad_norm": 3.078125, "learning_rate": 7.136400811975568e-06, "loss": 0.88591032, "memory(GiB)": 728.98, "step": 30865, "train_speed(iter/s)": 0.215767 }, { "acc": 0.77898278, "epoch": 0.7831062644378876, "grad_norm": 3.84375, "learning_rate": 7.135452658798105e-06, "loss": 0.88481541, "memory(GiB)": 728.98, "step": 30870, "train_speed(iter/s)": 0.215698 }, { "acc": 0.77318516, "epoch": 0.7832331038069251, "grad_norm": 3.84375, "learning_rate": 7.134504411688884e-06, "loss": 0.85916462, "memory(GiB)": 728.98, "step": 30875, "train_speed(iter/s)": 0.215639 }, { "acc": 0.75973263, "epoch": 0.7833599431759627, "grad_norm": 3.78125, "learning_rate": 7.1335560706896134e-06, "loss": 0.90090399, "memory(GiB)": 728.98, "step": 30880, "train_speed(iter/s)": 0.215566 }, { "acc": 0.76535244, "epoch": 0.7834867825450003, "grad_norm": 3.421875, "learning_rate": 7.132607635842011e-06, "loss": 0.95304174, "memory(GiB)": 728.98, "step": 30885, "train_speed(iter/s)": 0.215505 }, { "acc": 0.78526373, "epoch": 0.7836136219140378, "grad_norm": 3.21875, "learning_rate": 7.131659107187794e-06, "loss": 0.8461339, "memory(GiB)": 728.98, "step": 30890, "train_speed(iter/s)": 0.215436 }, { "acc": 0.77466359, "epoch": 0.7837404612830754, "grad_norm": 3.9375, "learning_rate": 7.130710484768683e-06, "loss": 0.88000507, "memory(GiB)": 728.98, "step": 30895, "train_speed(iter/s)": 0.215363 }, { "acc": 0.77467799, "epoch": 0.7838673006521129, "grad_norm": 3.390625, "learning_rate": 7.129761768626406e-06, "loss": 0.85412941, "memory(GiB)": 728.98, "step": 30900, "train_speed(iter/s)": 0.215294 }, { "acc": 0.75045552, "epoch": 0.7839941400211504, "grad_norm": 4.1875, "learning_rate": 7.128812958802697e-06, "loss": 0.95817099, "memory(GiB)": 728.98, "step": 30905, "train_speed(iter/s)": 0.215236 }, { "acc": 0.7724577, "epoch": 0.784120979390188, "grad_norm": 3.828125, "learning_rate": 7.127864055339287e-06, "loss": 0.92579832, "memory(GiB)": 728.98, "step": 30910, "train_speed(iter/s)": 0.215165 }, { "acc": 0.75754685, "epoch": 0.7842478187592256, "grad_norm": 3.5625, "learning_rate": 7.126915058277918e-06, "loss": 0.90390873, "memory(GiB)": 728.98, "step": 30915, "train_speed(iter/s)": 0.215097 }, { "acc": 0.78965631, "epoch": 0.7843746581282631, "grad_norm": 3.84375, "learning_rate": 7.1259659676603324e-06, "loss": 0.85496044, "memory(GiB)": 728.98, "step": 30920, "train_speed(iter/s)": 0.215032 }, { "acc": 0.76246772, "epoch": 0.7845014974973007, "grad_norm": 3.21875, "learning_rate": 7.125016783528276e-06, "loss": 0.91469927, "memory(GiB)": 728.98, "step": 30925, "train_speed(iter/s)": 0.21497 }, { "acc": 0.75952811, "epoch": 0.7846283368663383, "grad_norm": 4.125, "learning_rate": 7.124067505923504e-06, "loss": 0.92247486, "memory(GiB)": 728.98, "step": 30930, "train_speed(iter/s)": 0.2149 }, { "acc": 0.76666622, "epoch": 0.7847551762353758, "grad_norm": 3.53125, "learning_rate": 7.123118134887769e-06, "loss": 0.89467945, "memory(GiB)": 728.98, "step": 30935, "train_speed(iter/s)": 0.214832 }, { "acc": 0.76941905, "epoch": 0.7848820156044134, "grad_norm": 3.96875, "learning_rate": 7.122168670462831e-06, "loss": 0.87583122, "memory(GiB)": 728.98, "step": 30940, "train_speed(iter/s)": 0.214773 }, { "acc": 0.76343613, "epoch": 0.785008854973451, "grad_norm": 3.4375, "learning_rate": 7.121219112690456e-06, "loss": 0.92087374, "memory(GiB)": 728.98, "step": 30945, "train_speed(iter/s)": 0.214706 }, { "acc": 0.7769475, "epoch": 0.7851356943424885, "grad_norm": 2.890625, "learning_rate": 7.120269461612411e-06, "loss": 0.86271067, "memory(GiB)": 728.98, "step": 30950, "train_speed(iter/s)": 0.214638 }, { "acc": 0.77899551, "epoch": 0.7852625337115261, "grad_norm": 3.734375, "learning_rate": 7.119319717270469e-06, "loss": 0.86038008, "memory(GiB)": 728.98, "step": 30955, "train_speed(iter/s)": 0.214584 }, { "acc": 0.76887507, "epoch": 0.7853893730805636, "grad_norm": 3.3125, "learning_rate": 7.1183698797064035e-06, "loss": 0.81985455, "memory(GiB)": 728.98, "step": 30960, "train_speed(iter/s)": 0.214511 }, { "acc": 0.76422091, "epoch": 0.7855162124496011, "grad_norm": 3.40625, "learning_rate": 7.117419948961996e-06, "loss": 0.89658985, "memory(GiB)": 728.98, "step": 30965, "train_speed(iter/s)": 0.21445 }, { "acc": 0.76542292, "epoch": 0.7856430518186387, "grad_norm": 3.734375, "learning_rate": 7.116469925079034e-06, "loss": 0.89312582, "memory(GiB)": 728.98, "step": 30970, "train_speed(iter/s)": 0.214381 }, { "acc": 0.76663551, "epoch": 0.7857698911876763, "grad_norm": 3.90625, "learning_rate": 7.115519808099302e-06, "loss": 0.84981365, "memory(GiB)": 728.98, "step": 30975, "train_speed(iter/s)": 0.214311 }, { "acc": 0.76005707, "epoch": 0.7858967305567138, "grad_norm": 3.5625, "learning_rate": 7.1145695980645935e-06, "loss": 0.95433569, "memory(GiB)": 728.98, "step": 30980, "train_speed(iter/s)": 0.214247 }, { "acc": 0.76776943, "epoch": 0.7860235699257514, "grad_norm": 3.15625, "learning_rate": 7.113619295016707e-06, "loss": 0.89695692, "memory(GiB)": 728.98, "step": 30985, "train_speed(iter/s)": 0.214192 }, { "acc": 0.77125411, "epoch": 0.786150409294789, "grad_norm": 3.375, "learning_rate": 7.112668898997442e-06, "loss": 0.87865677, "memory(GiB)": 728.98, "step": 30990, "train_speed(iter/s)": 0.214122 }, { "acc": 0.75909863, "epoch": 0.7862772486638265, "grad_norm": 3.125, "learning_rate": 7.1117184100486024e-06, "loss": 0.90586243, "memory(GiB)": 728.98, "step": 30995, "train_speed(iter/s)": 0.214053 }, { "acc": 0.76362257, "epoch": 0.7864040880328641, "grad_norm": 4.5625, "learning_rate": 7.110767828212e-06, "loss": 0.89931974, "memory(GiB)": 728.98, "step": 31000, "train_speed(iter/s)": 0.21399 }, { "epoch": 0.7864040880328641, "eval_acc": 0.7559833678478759, "eval_loss": 0.8650665879249573, "eval_runtime": 1153.2515, "eval_samples_per_second": 5.524, "eval_steps_per_second": 5.524, "step": 31000 }, { "acc": 0.7618268, "epoch": 0.7865309274019017, "grad_norm": 3.203125, "learning_rate": 7.109817153529444e-06, "loss": 0.91934147, "memory(GiB)": 728.98, "step": 31005, "train_speed(iter/s)": 0.211202 }, { "acc": 0.77501717, "epoch": 0.7866577667709392, "grad_norm": 3.6875, "learning_rate": 7.1088663860427545e-06, "loss": 0.86323004, "memory(GiB)": 728.98, "step": 31010, "train_speed(iter/s)": 0.211143 }, { "acc": 0.77349443, "epoch": 0.7867846061399768, "grad_norm": 4.4375, "learning_rate": 7.107915525793753e-06, "loss": 0.86411591, "memory(GiB)": 728.98, "step": 31015, "train_speed(iter/s)": 0.211074 }, { "acc": 0.76641755, "epoch": 0.7869114455090143, "grad_norm": 3.296875, "learning_rate": 7.106964572824262e-06, "loss": 0.87486095, "memory(GiB)": 728.98, "step": 31020, "train_speed(iter/s)": 0.211006 }, { "acc": 0.76922755, "epoch": 0.7870382848780518, "grad_norm": 3.796875, "learning_rate": 7.106013527176114e-06, "loss": 0.88117876, "memory(GiB)": 728.98, "step": 31025, "train_speed(iter/s)": 0.210937 }, { "acc": 0.7674613, "epoch": 0.7871651242470894, "grad_norm": 3.328125, "learning_rate": 7.105062388891141e-06, "loss": 0.89830503, "memory(GiB)": 728.98, "step": 31030, "train_speed(iter/s)": 0.210879 }, { "acc": 0.75886879, "epoch": 0.787291963616127, "grad_norm": 3.203125, "learning_rate": 7.104111158011181e-06, "loss": 0.916574, "memory(GiB)": 728.98, "step": 31035, "train_speed(iter/s)": 0.210819 }, { "acc": 0.7694684, "epoch": 0.7874188029851645, "grad_norm": 3.546875, "learning_rate": 7.1031598345780735e-06, "loss": 0.86055307, "memory(GiB)": 728.98, "step": 31040, "train_speed(iter/s)": 0.210759 }, { "acc": 0.75949354, "epoch": 0.7875456423542021, "grad_norm": 4.25, "learning_rate": 7.102208418633668e-06, "loss": 0.92725115, "memory(GiB)": 728.98, "step": 31045, "train_speed(iter/s)": 0.210694 }, { "acc": 0.77381845, "epoch": 0.7876724817232397, "grad_norm": 3.28125, "learning_rate": 7.10125691021981e-06, "loss": 0.8839653, "memory(GiB)": 728.98, "step": 31050, "train_speed(iter/s)": 0.210636 }, { "acc": 0.77186642, "epoch": 0.7877993210922772, "grad_norm": 3.6875, "learning_rate": 7.100305309378358e-06, "loss": 0.8775301, "memory(GiB)": 728.98, "step": 31055, "train_speed(iter/s)": 0.210572 }, { "acc": 0.78171434, "epoch": 0.7879261604613148, "grad_norm": 5.0625, "learning_rate": 7.099353616151167e-06, "loss": 0.89325428, "memory(GiB)": 728.98, "step": 31060, "train_speed(iter/s)": 0.210519 }, { "acc": 0.76512775, "epoch": 0.7880529998303524, "grad_norm": 4.75, "learning_rate": 7.098401830580097e-06, "loss": 0.88978786, "memory(GiB)": 728.98, "step": 31065, "train_speed(iter/s)": 0.210462 }, { "acc": 0.76019177, "epoch": 0.7881798391993899, "grad_norm": 4.15625, "learning_rate": 7.097449952707019e-06, "loss": 0.89475393, "memory(GiB)": 728.98, "step": 31070, "train_speed(iter/s)": 0.210404 }, { "acc": 0.77085576, "epoch": 0.7883066785684275, "grad_norm": 4.1875, "learning_rate": 7.096497982573799e-06, "loss": 0.84747219, "memory(GiB)": 728.98, "step": 31075, "train_speed(iter/s)": 0.210342 }, { "acc": 0.7701961, "epoch": 0.788433517937465, "grad_norm": 3.375, "learning_rate": 7.095545920222313e-06, "loss": 0.89659319, "memory(GiB)": 728.98, "step": 31080, "train_speed(iter/s)": 0.210284 }, { "acc": 0.7621109, "epoch": 0.7885603573065025, "grad_norm": 3.671875, "learning_rate": 7.09459376569444e-06, "loss": 0.96181211, "memory(GiB)": 728.98, "step": 31085, "train_speed(iter/s)": 0.210231 }, { "acc": 0.76775947, "epoch": 0.7886871966755401, "grad_norm": 3.546875, "learning_rate": 7.093641519032058e-06, "loss": 0.86132812, "memory(GiB)": 728.98, "step": 31090, "train_speed(iter/s)": 0.210174 }, { "acc": 0.77961073, "epoch": 0.7888140360445777, "grad_norm": 3.234375, "learning_rate": 7.092689180277059e-06, "loss": 0.83995667, "memory(GiB)": 728.98, "step": 31095, "train_speed(iter/s)": 0.210109 }, { "acc": 0.7699079, "epoch": 0.7889408754136152, "grad_norm": 3.59375, "learning_rate": 7.091736749471329e-06, "loss": 0.89802475, "memory(GiB)": 728.98, "step": 31100, "train_speed(iter/s)": 0.210051 }, { "acc": 0.76635938, "epoch": 0.7890677147826528, "grad_norm": 3.78125, "learning_rate": 7.090784226656766e-06, "loss": 0.91483507, "memory(GiB)": 728.98, "step": 31105, "train_speed(iter/s)": 0.209988 }, { "acc": 0.77260566, "epoch": 0.7891945541516904, "grad_norm": 3.109375, "learning_rate": 7.089831611875265e-06, "loss": 0.885184, "memory(GiB)": 728.98, "step": 31110, "train_speed(iter/s)": 0.209928 }, { "acc": 0.75479765, "epoch": 0.7893213935207279, "grad_norm": 3.765625, "learning_rate": 7.0888789051687315e-06, "loss": 0.90076494, "memory(GiB)": 728.98, "step": 31115, "train_speed(iter/s)": 0.209873 }, { "acc": 0.75871062, "epoch": 0.7894482328897655, "grad_norm": 3.234375, "learning_rate": 7.087926106579068e-06, "loss": 0.94244394, "memory(GiB)": 728.98, "step": 31120, "train_speed(iter/s)": 0.209815 }, { "acc": 0.77331367, "epoch": 0.7895750722588031, "grad_norm": 3.5625, "learning_rate": 7.08697321614819e-06, "loss": 0.8796154, "memory(GiB)": 728.98, "step": 31125, "train_speed(iter/s)": 0.209756 }, { "acc": 0.76264911, "epoch": 0.7897019116278406, "grad_norm": 3.90625, "learning_rate": 7.08602023391801e-06, "loss": 0.940695, "memory(GiB)": 728.98, "step": 31130, "train_speed(iter/s)": 0.209699 }, { "acc": 0.75872636, "epoch": 0.7898287509968782, "grad_norm": 3.1875, "learning_rate": 7.085067159930445e-06, "loss": 0.91033068, "memory(GiB)": 728.98, "step": 31135, "train_speed(iter/s)": 0.209636 }, { "acc": 0.76993518, "epoch": 0.7899555903659157, "grad_norm": 2.8125, "learning_rate": 7.084113994227419e-06, "loss": 0.89829626, "memory(GiB)": 728.98, "step": 31140, "train_speed(iter/s)": 0.209561 }, { "acc": 0.75885706, "epoch": 0.7900824297349532, "grad_norm": 3.390625, "learning_rate": 7.083160736850858e-06, "loss": 0.92646542, "memory(GiB)": 728.98, "step": 31145, "train_speed(iter/s)": 0.209497 }, { "acc": 0.76717525, "epoch": 0.7902092691039908, "grad_norm": 3.578125, "learning_rate": 7.082207387842696e-06, "loss": 0.86118908, "memory(GiB)": 728.98, "step": 31150, "train_speed(iter/s)": 0.209434 }, { "acc": 0.76433115, "epoch": 0.7903361084730284, "grad_norm": 3.875, "learning_rate": 7.081253947244863e-06, "loss": 0.90655336, "memory(GiB)": 728.98, "step": 31155, "train_speed(iter/s)": 0.209371 }, { "acc": 0.74532547, "epoch": 0.7904629478420659, "grad_norm": 3.09375, "learning_rate": 7.080300415099301e-06, "loss": 0.9309761, "memory(GiB)": 728.98, "step": 31160, "train_speed(iter/s)": 0.209307 }, { "acc": 0.78651919, "epoch": 0.7905897872111035, "grad_norm": 2.96875, "learning_rate": 7.079346791447952e-06, "loss": 0.82272682, "memory(GiB)": 728.98, "step": 31165, "train_speed(iter/s)": 0.209242 }, { "acc": 0.75199556, "epoch": 0.7907166265801411, "grad_norm": 3.34375, "learning_rate": 7.078393076332763e-06, "loss": 0.90028725, "memory(GiB)": 728.98, "step": 31170, "train_speed(iter/s)": 0.20918 }, { "acc": 0.75290895, "epoch": 0.7908434659491786, "grad_norm": 3.53125, "learning_rate": 7.077439269795685e-06, "loss": 0.91514797, "memory(GiB)": 728.98, "step": 31175, "train_speed(iter/s)": 0.209112 }, { "acc": 0.74933124, "epoch": 0.7909703053182162, "grad_norm": 3.3125, "learning_rate": 7.076485371878672e-06, "loss": 0.98899946, "memory(GiB)": 728.98, "step": 31180, "train_speed(iter/s)": 0.209052 }, { "acc": 0.77005963, "epoch": 0.7910971446872538, "grad_norm": 3.265625, "learning_rate": 7.075531382623685e-06, "loss": 0.88492622, "memory(GiB)": 728.98, "step": 31185, "train_speed(iter/s)": 0.208996 }, { "acc": 0.76043897, "epoch": 0.7912239840562914, "grad_norm": 3.3125, "learning_rate": 7.074577302072684e-06, "loss": 0.91004486, "memory(GiB)": 728.98, "step": 31190, "train_speed(iter/s)": 0.208935 }, { "acc": 0.75625958, "epoch": 0.7913508234253289, "grad_norm": 3.90625, "learning_rate": 7.073623130267638e-06, "loss": 0.98311558, "memory(GiB)": 728.98, "step": 31195, "train_speed(iter/s)": 0.208883 }, { "acc": 0.75904484, "epoch": 0.7914776627943664, "grad_norm": 3.28125, "learning_rate": 7.072668867250516e-06, "loss": 0.88803844, "memory(GiB)": 728.98, "step": 31200, "train_speed(iter/s)": 0.20883 }, { "acc": 0.76697245, "epoch": 0.791604502163404, "grad_norm": 3.703125, "learning_rate": 7.071714513063297e-06, "loss": 0.89324522, "memory(GiB)": 728.98, "step": 31205, "train_speed(iter/s)": 0.208763 }, { "acc": 0.77554049, "epoch": 0.7917313415324415, "grad_norm": 3.15625, "learning_rate": 7.070760067747956e-06, "loss": 0.87612305, "memory(GiB)": 728.98, "step": 31210, "train_speed(iter/s)": 0.208702 }, { "acc": 0.77448997, "epoch": 0.7918581809014791, "grad_norm": 3.296875, "learning_rate": 7.069805531346478e-06, "loss": 0.8316721, "memory(GiB)": 728.98, "step": 31215, "train_speed(iter/s)": 0.208647 }, { "acc": 0.75293036, "epoch": 0.7919850202705166, "grad_norm": 4.0625, "learning_rate": 7.068850903900849e-06, "loss": 0.92992239, "memory(GiB)": 728.98, "step": 31220, "train_speed(iter/s)": 0.208576 }, { "acc": 0.77627506, "epoch": 0.7921118596395542, "grad_norm": 4.28125, "learning_rate": 7.067896185453061e-06, "loss": 0.888342, "memory(GiB)": 728.98, "step": 31225, "train_speed(iter/s)": 0.208526 }, { "acc": 0.75798979, "epoch": 0.7922386990085918, "grad_norm": 3.046875, "learning_rate": 7.066941376045108e-06, "loss": 0.93166666, "memory(GiB)": 728.98, "step": 31230, "train_speed(iter/s)": 0.208462 }, { "acc": 0.75792527, "epoch": 0.7923655383776294, "grad_norm": 3.65625, "learning_rate": 7.065986475718989e-06, "loss": 0.92659578, "memory(GiB)": 728.98, "step": 31235, "train_speed(iter/s)": 0.208406 }, { "acc": 0.76500978, "epoch": 0.7924923777466669, "grad_norm": 3.453125, "learning_rate": 7.065031484516709e-06, "loss": 0.91479158, "memory(GiB)": 728.98, "step": 31240, "train_speed(iter/s)": 0.20835 }, { "acc": 0.75500827, "epoch": 0.7926192171157045, "grad_norm": 4.0, "learning_rate": 7.064076402480272e-06, "loss": 0.9300539, "memory(GiB)": 728.98, "step": 31245, "train_speed(iter/s)": 0.208286 }, { "acc": 0.77691164, "epoch": 0.7927460564847421, "grad_norm": 3.328125, "learning_rate": 7.063121229651693e-06, "loss": 0.85553913, "memory(GiB)": 728.98, "step": 31250, "train_speed(iter/s)": 0.208222 }, { "acc": 0.77840719, "epoch": 0.7928728958537796, "grad_norm": 3.6875, "learning_rate": 7.062165966072982e-06, "loss": 0.87938442, "memory(GiB)": 728.98, "step": 31255, "train_speed(iter/s)": 0.208162 }, { "acc": 0.76532831, "epoch": 0.7929997352228171, "grad_norm": 3.4375, "learning_rate": 7.0612106117861625e-06, "loss": 0.92955399, "memory(GiB)": 728.98, "step": 31260, "train_speed(iter/s)": 0.208107 }, { "acc": 0.76673422, "epoch": 0.7931265745918546, "grad_norm": 3.359375, "learning_rate": 7.0602551668332544e-06, "loss": 0.91907511, "memory(GiB)": 728.98, "step": 31265, "train_speed(iter/s)": 0.208046 }, { "acc": 0.7717937, "epoch": 0.7932534139608922, "grad_norm": 3.3125, "learning_rate": 7.059299631256287e-06, "loss": 0.91038151, "memory(GiB)": 728.98, "step": 31270, "train_speed(iter/s)": 0.207992 }, { "acc": 0.76534076, "epoch": 0.7933802533299298, "grad_norm": 3.640625, "learning_rate": 7.0583440050972895e-06, "loss": 0.89739733, "memory(GiB)": 728.98, "step": 31275, "train_speed(iter/s)": 0.207928 }, { "acc": 0.7707921, "epoch": 0.7935070926989674, "grad_norm": 2.9375, "learning_rate": 7.057388288398297e-06, "loss": 0.85940247, "memory(GiB)": 728.98, "step": 31280, "train_speed(iter/s)": 0.207876 }, { "acc": 0.76018505, "epoch": 0.7936339320680049, "grad_norm": 4.34375, "learning_rate": 7.056432481201349e-06, "loss": 0.92913704, "memory(GiB)": 728.98, "step": 31285, "train_speed(iter/s)": 0.207817 }, { "acc": 0.75450258, "epoch": 0.7937607714370425, "grad_norm": 2.984375, "learning_rate": 7.0554765835484885e-06, "loss": 0.92076073, "memory(GiB)": 728.98, "step": 31290, "train_speed(iter/s)": 0.207758 }, { "acc": 0.76423731, "epoch": 0.7938876108060801, "grad_norm": 4.1875, "learning_rate": 7.054520595481763e-06, "loss": 0.9293973, "memory(GiB)": 728.98, "step": 31295, "train_speed(iter/s)": 0.207699 }, { "acc": 0.76574426, "epoch": 0.7940144501751176, "grad_norm": 3.40625, "learning_rate": 7.053564517043223e-06, "loss": 0.94556036, "memory(GiB)": 728.98, "step": 31300, "train_speed(iter/s)": 0.207645 }, { "acc": 0.75786409, "epoch": 0.7941412895441552, "grad_norm": 3.5, "learning_rate": 7.052608348274921e-06, "loss": 0.92433739, "memory(GiB)": 728.98, "step": 31305, "train_speed(iter/s)": 0.207579 }, { "acc": 0.76660523, "epoch": 0.7942681289131928, "grad_norm": 3.453125, "learning_rate": 7.05165208921892e-06, "loss": 0.91299372, "memory(GiB)": 728.98, "step": 31310, "train_speed(iter/s)": 0.207515 }, { "acc": 0.76267776, "epoch": 0.7943949682822303, "grad_norm": 3.96875, "learning_rate": 7.050695739917279e-06, "loss": 0.90367889, "memory(GiB)": 728.98, "step": 31315, "train_speed(iter/s)": 0.207452 }, { "acc": 0.77059689, "epoch": 0.7945218076512678, "grad_norm": 4.0, "learning_rate": 7.0497393004120666e-06, "loss": 0.89062233, "memory(GiB)": 728.98, "step": 31320, "train_speed(iter/s)": 0.207405 }, { "acc": 0.76961112, "epoch": 0.7946486470203054, "grad_norm": 3.390625, "learning_rate": 7.048782770745353e-06, "loss": 0.85786619, "memory(GiB)": 728.98, "step": 31325, "train_speed(iter/s)": 0.207351 }, { "acc": 0.7730176, "epoch": 0.7947754863893429, "grad_norm": 3.296875, "learning_rate": 7.047826150959214e-06, "loss": 0.84721498, "memory(GiB)": 728.98, "step": 31330, "train_speed(iter/s)": 0.207289 }, { "acc": 0.77668772, "epoch": 0.7949023257583805, "grad_norm": 3.6875, "learning_rate": 7.046869441095726e-06, "loss": 0.8483779, "memory(GiB)": 728.98, "step": 31335, "train_speed(iter/s)": 0.207232 }, { "acc": 0.75801644, "epoch": 0.7950291651274181, "grad_norm": 3.4375, "learning_rate": 7.045912641196974e-06, "loss": 0.91715593, "memory(GiB)": 728.98, "step": 31340, "train_speed(iter/s)": 0.207171 }, { "acc": 0.75644574, "epoch": 0.7951560044964556, "grad_norm": 3.6875, "learning_rate": 7.044955751305044e-06, "loss": 0.92440825, "memory(GiB)": 728.98, "step": 31345, "train_speed(iter/s)": 0.207103 }, { "acc": 0.75577536, "epoch": 0.7952828438654932, "grad_norm": 3.53125, "learning_rate": 7.0439987714620265e-06, "loss": 1.00227757, "memory(GiB)": 728.98, "step": 31350, "train_speed(iter/s)": 0.20705 }, { "acc": 0.76932888, "epoch": 0.7954096832345308, "grad_norm": 3.5, "learning_rate": 7.043041701710014e-06, "loss": 0.87589998, "memory(GiB)": 728.98, "step": 31355, "train_speed(iter/s)": 0.206987 }, { "acc": 0.78289971, "epoch": 0.7955365226035683, "grad_norm": 4.21875, "learning_rate": 7.042084542091108e-06, "loss": 0.88716946, "memory(GiB)": 728.98, "step": 31360, "train_speed(iter/s)": 0.206937 }, { "acc": 0.76759806, "epoch": 0.7956633619726059, "grad_norm": 4.59375, "learning_rate": 7.041127292647408e-06, "loss": 0.94179859, "memory(GiB)": 728.98, "step": 31365, "train_speed(iter/s)": 0.206884 }, { "acc": 0.78365722, "epoch": 0.7957902013416435, "grad_norm": 3.890625, "learning_rate": 7.040169953421021e-06, "loss": 0.80511436, "memory(GiB)": 728.98, "step": 31370, "train_speed(iter/s)": 0.206828 }, { "acc": 0.76947012, "epoch": 0.795917040710681, "grad_norm": 3.59375, "learning_rate": 7.039212524454061e-06, "loss": 0.8912919, "memory(GiB)": 728.98, "step": 31375, "train_speed(iter/s)": 0.206774 }, { "acc": 0.78336706, "epoch": 0.7960438800797185, "grad_norm": 3.640625, "learning_rate": 7.038255005788639e-06, "loss": 0.81498079, "memory(GiB)": 728.98, "step": 31380, "train_speed(iter/s)": 0.206727 }, { "acc": 0.77275386, "epoch": 0.7961707194487561, "grad_norm": 3.984375, "learning_rate": 7.037297397466871e-06, "loss": 0.89191265, "memory(GiB)": 728.98, "step": 31385, "train_speed(iter/s)": 0.206666 }, { "acc": 0.77792201, "epoch": 0.7962975588177936, "grad_norm": 3.53125, "learning_rate": 7.036339699530884e-06, "loss": 0.86406317, "memory(GiB)": 728.98, "step": 31390, "train_speed(iter/s)": 0.206614 }, { "acc": 0.76543899, "epoch": 0.7964243981868312, "grad_norm": 3.609375, "learning_rate": 7.0353819120228005e-06, "loss": 0.91724825, "memory(GiB)": 728.98, "step": 31395, "train_speed(iter/s)": 0.206555 }, { "acc": 0.7600565, "epoch": 0.7965512375558688, "grad_norm": 3.546875, "learning_rate": 7.034424034984753e-06, "loss": 0.85774078, "memory(GiB)": 728.98, "step": 31400, "train_speed(iter/s)": 0.206495 }, { "acc": 0.75735669, "epoch": 0.7966780769249063, "grad_norm": 3.59375, "learning_rate": 7.033466068458874e-06, "loss": 0.90172548, "memory(GiB)": 728.98, "step": 31405, "train_speed(iter/s)": 0.206437 }, { "acc": 0.776508, "epoch": 0.7968049162939439, "grad_norm": 3.09375, "learning_rate": 7.032508012487301e-06, "loss": 0.86639681, "memory(GiB)": 728.98, "step": 31410, "train_speed(iter/s)": 0.206383 }, { "acc": 0.76747713, "epoch": 0.7969317556629815, "grad_norm": 3.6875, "learning_rate": 7.031549867112178e-06, "loss": 0.87203302, "memory(GiB)": 728.98, "step": 31415, "train_speed(iter/s)": 0.206316 }, { "acc": 0.76638756, "epoch": 0.797058595032019, "grad_norm": 4.25, "learning_rate": 7.0305916323756475e-06, "loss": 0.88867111, "memory(GiB)": 728.98, "step": 31420, "train_speed(iter/s)": 0.206257 }, { "acc": 0.76548223, "epoch": 0.7971854344010566, "grad_norm": 3.21875, "learning_rate": 7.029633308319862e-06, "loss": 0.89758987, "memory(GiB)": 728.98, "step": 31425, "train_speed(iter/s)": 0.206204 }, { "acc": 0.77261357, "epoch": 0.7973122737700942, "grad_norm": 2.9375, "learning_rate": 7.028674894986973e-06, "loss": 0.84294777, "memory(GiB)": 728.98, "step": 31430, "train_speed(iter/s)": 0.206141 }, { "acc": 0.75147591, "epoch": 0.7974391131391317, "grad_norm": 3.21875, "learning_rate": 7.0277163924191405e-06, "loss": 0.94515982, "memory(GiB)": 728.98, "step": 31435, "train_speed(iter/s)": 0.206086 }, { "acc": 0.78013468, "epoch": 0.7975659525081692, "grad_norm": 3.890625, "learning_rate": 7.026757800658524e-06, "loss": 0.87252855, "memory(GiB)": 728.98, "step": 31440, "train_speed(iter/s)": 0.206034 }, { "acc": 0.77724376, "epoch": 0.7976927918772068, "grad_norm": 3.125, "learning_rate": 7.025799119747289e-06, "loss": 0.85198259, "memory(GiB)": 728.98, "step": 31445, "train_speed(iter/s)": 0.205978 }, { "acc": 0.74775529, "epoch": 0.7978196312462443, "grad_norm": 3.640625, "learning_rate": 7.024840349727606e-06, "loss": 0.93472624, "memory(GiB)": 728.98, "step": 31450, "train_speed(iter/s)": 0.205918 }, { "acc": 0.75626659, "epoch": 0.7979464706152819, "grad_norm": 3.5, "learning_rate": 7.023881490641647e-06, "loss": 0.91679087, "memory(GiB)": 728.98, "step": 31455, "train_speed(iter/s)": 0.205865 }, { "acc": 0.76202054, "epoch": 0.7980733099843195, "grad_norm": 3.9375, "learning_rate": 7.022922542531589e-06, "loss": 0.95061016, "memory(GiB)": 728.98, "step": 31460, "train_speed(iter/s)": 0.205807 }, { "acc": 0.77329426, "epoch": 0.798200149353357, "grad_norm": 3.234375, "learning_rate": 7.021963505439616e-06, "loss": 0.86806746, "memory(GiB)": 728.98, "step": 31465, "train_speed(iter/s)": 0.205752 }, { "acc": 0.76584778, "epoch": 0.7983269887223946, "grad_norm": 3.40625, "learning_rate": 7.02100437940791e-06, "loss": 0.84468584, "memory(GiB)": 728.98, "step": 31470, "train_speed(iter/s)": 0.205699 }, { "acc": 0.76510897, "epoch": 0.7984538280914322, "grad_norm": 3.25, "learning_rate": 7.02004516447866e-06, "loss": 0.90554075, "memory(GiB)": 728.98, "step": 31475, "train_speed(iter/s)": 0.20564 }, { "acc": 0.76722584, "epoch": 0.7985806674604697, "grad_norm": 3.484375, "learning_rate": 7.019085860694061e-06, "loss": 0.89853334, "memory(GiB)": 728.98, "step": 31480, "train_speed(iter/s)": 0.205589 }, { "acc": 0.75117493, "epoch": 0.7987075068295073, "grad_norm": 3.8125, "learning_rate": 7.018126468096306e-06, "loss": 0.92164717, "memory(GiB)": 728.98, "step": 31485, "train_speed(iter/s)": 0.205543 }, { "acc": 0.78233371, "epoch": 0.7988343461985449, "grad_norm": 3.40625, "learning_rate": 7.0171669867275986e-06, "loss": 0.86728458, "memory(GiB)": 728.98, "step": 31490, "train_speed(iter/s)": 0.205492 }, { "acc": 0.76635389, "epoch": 0.7989611855675824, "grad_norm": 3.140625, "learning_rate": 7.016207416630142e-06, "loss": 0.89897919, "memory(GiB)": 728.98, "step": 31495, "train_speed(iter/s)": 0.205433 }, { "acc": 0.75993958, "epoch": 0.7990880249366199, "grad_norm": 3.359375, "learning_rate": 7.015247757846147e-06, "loss": 0.91593466, "memory(GiB)": 728.98, "step": 31500, "train_speed(iter/s)": 0.205369 }, { "epoch": 0.7990880249366199, "eval_acc": 0.7561947581064613, "eval_loss": 0.8644923567771912, "eval_runtime": 1151.3727, "eval_samples_per_second": 5.533, "eval_steps_per_second": 5.533, "step": 31500 }, { "acc": 0.78299732, "epoch": 0.7992148643056575, "grad_norm": 3.421875, "learning_rate": 7.014288010417821e-06, "loss": 0.85000019, "memory(GiB)": 728.98, "step": 31505, "train_speed(iter/s)": 0.202838 }, { "acc": 0.76689239, "epoch": 0.799341703674695, "grad_norm": 4.03125, "learning_rate": 7.013328174387386e-06, "loss": 0.91339178, "memory(GiB)": 728.98, "step": 31510, "train_speed(iter/s)": 0.202773 }, { "acc": 0.77646914, "epoch": 0.7994685430437326, "grad_norm": 3.875, "learning_rate": 7.012368249797058e-06, "loss": 0.84917841, "memory(GiB)": 728.98, "step": 31515, "train_speed(iter/s)": 0.202717 }, { "acc": 0.7601696, "epoch": 0.7995953824127702, "grad_norm": 3.421875, "learning_rate": 7.011408236689063e-06, "loss": 0.90197964, "memory(GiB)": 728.98, "step": 31520, "train_speed(iter/s)": 0.202655 }, { "acc": 0.75787635, "epoch": 0.7997222217818077, "grad_norm": 3.6875, "learning_rate": 7.01044813510563e-06, "loss": 0.87749443, "memory(GiB)": 728.98, "step": 31525, "train_speed(iter/s)": 0.202602 }, { "acc": 0.75291309, "epoch": 0.7998490611508453, "grad_norm": 3.484375, "learning_rate": 7.009487945088986e-06, "loss": 0.92209015, "memory(GiB)": 728.98, "step": 31530, "train_speed(iter/s)": 0.202544 }, { "acc": 0.76431613, "epoch": 0.7999759005198829, "grad_norm": 3.5, "learning_rate": 7.008527666681373e-06, "loss": 0.92516317, "memory(GiB)": 728.98, "step": 31535, "train_speed(iter/s)": 0.20249 }, { "acc": 0.75888109, "epoch": 0.8001027398889204, "grad_norm": 3.15625, "learning_rate": 7.007567299925026e-06, "loss": 0.9040247, "memory(GiB)": 728.98, "step": 31540, "train_speed(iter/s)": 0.202426 }, { "acc": 0.75229993, "epoch": 0.800229579257958, "grad_norm": 3.484375, "learning_rate": 7.0066068448621896e-06, "loss": 0.99473066, "memory(GiB)": 728.98, "step": 31545, "train_speed(iter/s)": 0.202371 }, { "acc": 0.76296458, "epoch": 0.8003564186269956, "grad_norm": 3.125, "learning_rate": 7.005646301535113e-06, "loss": 0.88818798, "memory(GiB)": 728.98, "step": 31550, "train_speed(iter/s)": 0.202314 }, { "acc": 0.75611796, "epoch": 0.8004832579960331, "grad_norm": 3.0, "learning_rate": 7.004685669986044e-06, "loss": 0.94314032, "memory(GiB)": 728.98, "step": 31555, "train_speed(iter/s)": 0.202254 }, { "acc": 0.76638913, "epoch": 0.8006100973650706, "grad_norm": 3.703125, "learning_rate": 7.003724950257241e-06, "loss": 0.89422779, "memory(GiB)": 728.98, "step": 31560, "train_speed(iter/s)": 0.202192 }, { "acc": 0.7719171, "epoch": 0.8007369367341082, "grad_norm": 3.34375, "learning_rate": 7.002764142390961e-06, "loss": 0.93567343, "memory(GiB)": 728.98, "step": 31565, "train_speed(iter/s)": 0.202141 }, { "acc": 0.75934596, "epoch": 0.8008637761031457, "grad_norm": 3.4375, "learning_rate": 7.001803246429469e-06, "loss": 0.90093613, "memory(GiB)": 728.98, "step": 31570, "train_speed(iter/s)": 0.202092 }, { "acc": 0.76877995, "epoch": 0.8009906154721833, "grad_norm": 3.515625, "learning_rate": 7.0008422624150285e-06, "loss": 0.90548487, "memory(GiB)": 728.98, "step": 31575, "train_speed(iter/s)": 0.202042 }, { "acc": 0.75880394, "epoch": 0.8011174548412209, "grad_norm": 3.75, "learning_rate": 6.999881190389914e-06, "loss": 0.93029623, "memory(GiB)": 728.98, "step": 31580, "train_speed(iter/s)": 0.20198 }, { "acc": 0.7592885, "epoch": 0.8012442942102584, "grad_norm": 4.0, "learning_rate": 6.998920030396395e-06, "loss": 0.94126978, "memory(GiB)": 728.98, "step": 31585, "train_speed(iter/s)": 0.201915 }, { "acc": 0.77137065, "epoch": 0.801371133579296, "grad_norm": 3.71875, "learning_rate": 6.997958782476756e-06, "loss": 0.83733015, "memory(GiB)": 728.98, "step": 31590, "train_speed(iter/s)": 0.201853 }, { "acc": 0.75421619, "epoch": 0.8014979729483336, "grad_norm": 3.0, "learning_rate": 6.996997446673275e-06, "loss": 0.92498827, "memory(GiB)": 728.98, "step": 31595, "train_speed(iter/s)": 0.201797 }, { "acc": 0.7849617, "epoch": 0.8016248123173711, "grad_norm": 4.03125, "learning_rate": 6.996036023028239e-06, "loss": 0.82512722, "memory(GiB)": 728.98, "step": 31600, "train_speed(iter/s)": 0.201742 }, { "acc": 0.76823773, "epoch": 0.8017516516864087, "grad_norm": 3.84375, "learning_rate": 6.995074511583938e-06, "loss": 0.8720438, "memory(GiB)": 728.98, "step": 31605, "train_speed(iter/s)": 0.201682 }, { "acc": 0.75317769, "epoch": 0.8018784910554463, "grad_norm": 3.609375, "learning_rate": 6.994112912382665e-06, "loss": 0.89494696, "memory(GiB)": 728.98, "step": 31610, "train_speed(iter/s)": 0.201622 }, { "acc": 0.77297444, "epoch": 0.8020053304244839, "grad_norm": 2.96875, "learning_rate": 6.99315122546672e-06, "loss": 0.88353338, "memory(GiB)": 728.98, "step": 31615, "train_speed(iter/s)": 0.201565 }, { "acc": 0.74865417, "epoch": 0.8021321697935213, "grad_norm": 3.328125, "learning_rate": 6.992189450878401e-06, "loss": 0.92411423, "memory(GiB)": 728.98, "step": 31620, "train_speed(iter/s)": 0.201509 }, { "acc": 0.76844735, "epoch": 0.8022590091625589, "grad_norm": 3.15625, "learning_rate": 6.991227588660019e-06, "loss": 0.90165834, "memory(GiB)": 728.98, "step": 31625, "train_speed(iter/s)": 0.201454 }, { "acc": 0.76484404, "epoch": 0.8023858485315964, "grad_norm": 3.234375, "learning_rate": 6.990265638853877e-06, "loss": 0.86887341, "memory(GiB)": 728.98, "step": 31630, "train_speed(iter/s)": 0.201402 }, { "acc": 0.76053028, "epoch": 0.802512687900634, "grad_norm": 3.0, "learning_rate": 6.989303601502292e-06, "loss": 0.89996815, "memory(GiB)": 728.98, "step": 31635, "train_speed(iter/s)": 0.201351 }, { "acc": 0.76414876, "epoch": 0.8026395272696716, "grad_norm": 3.953125, "learning_rate": 6.98834147664758e-06, "loss": 0.86525097, "memory(GiB)": 728.98, "step": 31640, "train_speed(iter/s)": 0.20129 }, { "acc": 0.7506371, "epoch": 0.8027663666387092, "grad_norm": 3.140625, "learning_rate": 6.987379264332062e-06, "loss": 0.92172823, "memory(GiB)": 728.98, "step": 31645, "train_speed(iter/s)": 0.201239 }, { "acc": 0.75767894, "epoch": 0.8028932060077467, "grad_norm": 3.5625, "learning_rate": 6.986416964598062e-06, "loss": 0.92317982, "memory(GiB)": 728.98, "step": 31650, "train_speed(iter/s)": 0.201178 }, { "acc": 0.76306992, "epoch": 0.8030200453767843, "grad_norm": 3.5625, "learning_rate": 6.985454577487908e-06, "loss": 0.96205769, "memory(GiB)": 728.98, "step": 31655, "train_speed(iter/s)": 0.201127 }, { "acc": 0.76866283, "epoch": 0.8031468847458219, "grad_norm": 4.53125, "learning_rate": 6.9844921030439335e-06, "loss": 0.90919666, "memory(GiB)": 728.98, "step": 31660, "train_speed(iter/s)": 0.201079 }, { "acc": 0.76407251, "epoch": 0.8032737241148594, "grad_norm": 3.21875, "learning_rate": 6.983529541308474e-06, "loss": 0.87321968, "memory(GiB)": 728.98, "step": 31665, "train_speed(iter/s)": 0.201032 }, { "acc": 0.76520152, "epoch": 0.803400563483897, "grad_norm": 3.78125, "learning_rate": 6.982566892323871e-06, "loss": 0.87444944, "memory(GiB)": 728.98, "step": 31670, "train_speed(iter/s)": 0.200978 }, { "acc": 0.75799675, "epoch": 0.8035274028529346, "grad_norm": 3.5625, "learning_rate": 6.981604156132466e-06, "loss": 0.95611362, "memory(GiB)": 728.98, "step": 31675, "train_speed(iter/s)": 0.200923 }, { "acc": 0.76870518, "epoch": 0.803654242221972, "grad_norm": 3.71875, "learning_rate": 6.980641332776609e-06, "loss": 0.85808687, "memory(GiB)": 728.98, "step": 31680, "train_speed(iter/s)": 0.200871 }, { "acc": 0.74997354, "epoch": 0.8037810815910096, "grad_norm": 3.90625, "learning_rate": 6.97967842229865e-06, "loss": 0.96857805, "memory(GiB)": 728.98, "step": 31685, "train_speed(iter/s)": 0.200811 }, { "acc": 0.76002998, "epoch": 0.8039079209600472, "grad_norm": 3.109375, "learning_rate": 6.978715424740945e-06, "loss": 0.86216221, "memory(GiB)": 728.98, "step": 31690, "train_speed(iter/s)": 0.200755 }, { "acc": 0.77056389, "epoch": 0.8040347603290847, "grad_norm": 3.796875, "learning_rate": 6.977752340145854e-06, "loss": 0.92768431, "memory(GiB)": 728.98, "step": 31695, "train_speed(iter/s)": 0.200708 }, { "acc": 0.77842064, "epoch": 0.8041615996981223, "grad_norm": 3.734375, "learning_rate": 6.976789168555738e-06, "loss": 0.85469465, "memory(GiB)": 728.98, "step": 31700, "train_speed(iter/s)": 0.200656 }, { "acc": 0.76080418, "epoch": 0.8042884390671599, "grad_norm": 3.203125, "learning_rate": 6.975825910012966e-06, "loss": 0.94916706, "memory(GiB)": 728.98, "step": 31705, "train_speed(iter/s)": 0.200608 }, { "acc": 0.7559628, "epoch": 0.8044152784361974, "grad_norm": 3.65625, "learning_rate": 6.974862564559906e-06, "loss": 0.96989374, "memory(GiB)": 728.98, "step": 31710, "train_speed(iter/s)": 0.200553 }, { "acc": 0.77315655, "epoch": 0.804542117805235, "grad_norm": 3.359375, "learning_rate": 6.9738991322389345e-06, "loss": 0.88600359, "memory(GiB)": 728.98, "step": 31715, "train_speed(iter/s)": 0.200501 }, { "acc": 0.74654145, "epoch": 0.8046689571742726, "grad_norm": 3.515625, "learning_rate": 6.97293561309243e-06, "loss": 1.02253609, "memory(GiB)": 728.98, "step": 31720, "train_speed(iter/s)": 0.200448 }, { "acc": 0.78161025, "epoch": 0.8047957965433101, "grad_norm": 3.421875, "learning_rate": 6.971972007162773e-06, "loss": 0.83390846, "memory(GiB)": 728.98, "step": 31725, "train_speed(iter/s)": 0.200397 }, { "acc": 0.75709782, "epoch": 0.8049226359123477, "grad_norm": 4.15625, "learning_rate": 6.971008314492352e-06, "loss": 0.87396402, "memory(GiB)": 728.98, "step": 31730, "train_speed(iter/s)": 0.200351 }, { "acc": 0.77184291, "epoch": 0.8050494752813853, "grad_norm": 3.328125, "learning_rate": 6.970044535123554e-06, "loss": 0.86499929, "memory(GiB)": 728.98, "step": 31735, "train_speed(iter/s)": 0.200295 }, { "acc": 0.76389589, "epoch": 0.8051763146504227, "grad_norm": 3.484375, "learning_rate": 6.969080669098774e-06, "loss": 0.930091, "memory(GiB)": 728.98, "step": 31740, "train_speed(iter/s)": 0.200246 }, { "acc": 0.7647573, "epoch": 0.8053031540194603, "grad_norm": 3.546875, "learning_rate": 6.968116716460409e-06, "loss": 0.88241434, "memory(GiB)": 728.98, "step": 31745, "train_speed(iter/s)": 0.20019 }, { "acc": 0.76765943, "epoch": 0.8054299933884979, "grad_norm": 3.296875, "learning_rate": 6.967152677250862e-06, "loss": 0.88331079, "memory(GiB)": 728.98, "step": 31750, "train_speed(iter/s)": 0.200133 }, { "acc": 0.76686187, "epoch": 0.8055568327575354, "grad_norm": 3.796875, "learning_rate": 6.9661885515125345e-06, "loss": 0.90544653, "memory(GiB)": 728.98, "step": 31755, "train_speed(iter/s)": 0.200087 }, { "acc": 0.77111034, "epoch": 0.805683672126573, "grad_norm": 3.234375, "learning_rate": 6.96522433928784e-06, "loss": 0.89168482, "memory(GiB)": 728.98, "step": 31760, "train_speed(iter/s)": 0.200034 }, { "acc": 0.76986575, "epoch": 0.8058105114956106, "grad_norm": 3.453125, "learning_rate": 6.9642600406191864e-06, "loss": 0.9418539, "memory(GiB)": 728.98, "step": 31765, "train_speed(iter/s)": 0.199984 }, { "acc": 0.77418528, "epoch": 0.8059373508646481, "grad_norm": 3.734375, "learning_rate": 6.963295655548992e-06, "loss": 0.87722139, "memory(GiB)": 728.98, "step": 31770, "train_speed(iter/s)": 0.199932 }, { "acc": 0.78250875, "epoch": 0.8060641902336857, "grad_norm": 4.03125, "learning_rate": 6.962331184119677e-06, "loss": 0.8727457, "memory(GiB)": 728.98, "step": 31775, "train_speed(iter/s)": 0.199878 }, { "acc": 0.7559907, "epoch": 0.8061910296027233, "grad_norm": 3.34375, "learning_rate": 6.961366626373666e-06, "loss": 0.94103632, "memory(GiB)": 728.98, "step": 31780, "train_speed(iter/s)": 0.199831 }, { "acc": 0.75947042, "epoch": 0.8063178689717608, "grad_norm": 3.09375, "learning_rate": 6.9604019823533865e-06, "loss": 0.95527, "memory(GiB)": 728.98, "step": 31785, "train_speed(iter/s)": 0.199777 }, { "acc": 0.77354126, "epoch": 0.8064447083407984, "grad_norm": 3.6875, "learning_rate": 6.95943725210127e-06, "loss": 0.87459497, "memory(GiB)": 728.98, "step": 31790, "train_speed(iter/s)": 0.199725 }, { "acc": 0.77567911, "epoch": 0.806571547709836, "grad_norm": 3.3125, "learning_rate": 6.958472435659752e-06, "loss": 0.87282963, "memory(GiB)": 728.98, "step": 31795, "train_speed(iter/s)": 0.199674 }, { "acc": 0.78607717, "epoch": 0.8066983870788734, "grad_norm": 3.0625, "learning_rate": 6.95750753307127e-06, "loss": 0.82191229, "memory(GiB)": 728.98, "step": 31800, "train_speed(iter/s)": 0.199616 }, { "acc": 0.77369471, "epoch": 0.806825226447911, "grad_norm": 3.859375, "learning_rate": 6.956542544378271e-06, "loss": 0.87010317, "memory(GiB)": 728.98, "step": 31805, "train_speed(iter/s)": 0.199558 }, { "acc": 0.76399322, "epoch": 0.8069520658169486, "grad_norm": 3.3125, "learning_rate": 6.955577469623198e-06, "loss": 0.88236361, "memory(GiB)": 728.98, "step": 31810, "train_speed(iter/s)": 0.199504 }, { "acc": 0.78020668, "epoch": 0.8070789051859861, "grad_norm": 3.40625, "learning_rate": 6.954612308848503e-06, "loss": 0.83853188, "memory(GiB)": 728.98, "step": 31815, "train_speed(iter/s)": 0.199441 }, { "acc": 0.76329026, "epoch": 0.8072057445550237, "grad_norm": 3.765625, "learning_rate": 6.953647062096643e-06, "loss": 0.9457922, "memory(GiB)": 728.98, "step": 31820, "train_speed(iter/s)": 0.199389 }, { "acc": 0.76736851, "epoch": 0.8073325839240613, "grad_norm": 3.875, "learning_rate": 6.952681729410069e-06, "loss": 0.88676062, "memory(GiB)": 728.98, "step": 31825, "train_speed(iter/s)": 0.199333 }, { "acc": 0.77551689, "epoch": 0.8074594232930988, "grad_norm": 3.34375, "learning_rate": 6.95171631083125e-06, "loss": 0.85622168, "memory(GiB)": 728.98, "step": 31830, "train_speed(iter/s)": 0.199284 }, { "acc": 0.75946193, "epoch": 0.8075862626621364, "grad_norm": 3.515625, "learning_rate": 6.950750806402648e-06, "loss": 0.93466682, "memory(GiB)": 728.98, "step": 31835, "train_speed(iter/s)": 0.19923 }, { "acc": 0.76379414, "epoch": 0.807713102031174, "grad_norm": 3.84375, "learning_rate": 6.9497852161667356e-06, "loss": 0.89322195, "memory(GiB)": 728.98, "step": 31840, "train_speed(iter/s)": 0.199178 }, { "acc": 0.76053424, "epoch": 0.8078399414002115, "grad_norm": 3.171875, "learning_rate": 6.948819540165983e-06, "loss": 0.92331991, "memory(GiB)": 728.98, "step": 31845, "train_speed(iter/s)": 0.199125 }, { "acc": 0.76226101, "epoch": 0.8079667807692491, "grad_norm": 4.0625, "learning_rate": 6.947853778442867e-06, "loss": 0.93350191, "memory(GiB)": 728.98, "step": 31850, "train_speed(iter/s)": 0.199072 }, { "acc": 0.76275187, "epoch": 0.8080936201382867, "grad_norm": 4.28125, "learning_rate": 6.94688793103987e-06, "loss": 0.91966705, "memory(GiB)": 728.98, "step": 31855, "train_speed(iter/s)": 0.199017 }, { "acc": 0.75928268, "epoch": 0.8082204595073241, "grad_norm": 3.640625, "learning_rate": 6.945921997999476e-06, "loss": 0.91804428, "memory(GiB)": 728.98, "step": 31860, "train_speed(iter/s)": 0.198971 }, { "acc": 0.75725408, "epoch": 0.8083472988763617, "grad_norm": 3.671875, "learning_rate": 6.944955979364174e-06, "loss": 0.94679136, "memory(GiB)": 728.98, "step": 31865, "train_speed(iter/s)": 0.198904 }, { "acc": 0.76556191, "epoch": 0.8084741382453993, "grad_norm": 3.59375, "learning_rate": 6.9439898751764545e-06, "loss": 0.84813108, "memory(GiB)": 728.98, "step": 31870, "train_speed(iter/s)": 0.198858 }, { "acc": 0.77709737, "epoch": 0.8086009776144368, "grad_norm": 3.453125, "learning_rate": 6.9430236854788156e-06, "loss": 0.81259136, "memory(GiB)": 728.98, "step": 31875, "train_speed(iter/s)": 0.198805 }, { "acc": 0.77744617, "epoch": 0.8087278169834744, "grad_norm": 3.203125, "learning_rate": 6.942057410313754e-06, "loss": 0.88454065, "memory(GiB)": 728.98, "step": 31880, "train_speed(iter/s)": 0.19875 }, { "acc": 0.77096753, "epoch": 0.808854656352512, "grad_norm": 3.0625, "learning_rate": 6.941091049723776e-06, "loss": 0.88469667, "memory(GiB)": 728.98, "step": 31885, "train_speed(iter/s)": 0.198688 }, { "acc": 0.75464988, "epoch": 0.8089814957215495, "grad_norm": 2.953125, "learning_rate": 6.940124603751388e-06, "loss": 0.91703558, "memory(GiB)": 728.98, "step": 31890, "train_speed(iter/s)": 0.198625 }, { "acc": 0.76896372, "epoch": 0.8091083350905871, "grad_norm": 3.5625, "learning_rate": 6.9391580724390984e-06, "loss": 0.89082127, "memory(GiB)": 728.98, "step": 31895, "train_speed(iter/s)": 0.198569 }, { "acc": 0.74938087, "epoch": 0.8092351744596247, "grad_norm": 3.96875, "learning_rate": 6.938191455829425e-06, "loss": 0.9876195, "memory(GiB)": 728.98, "step": 31900, "train_speed(iter/s)": 0.198526 }, { "acc": 0.75789027, "epoch": 0.8093620138286622, "grad_norm": 3.96875, "learning_rate": 6.937224753964884e-06, "loss": 0.90533476, "memory(GiB)": 728.98, "step": 31905, "train_speed(iter/s)": 0.198477 }, { "acc": 0.7632185, "epoch": 0.8094888531976998, "grad_norm": 3.625, "learning_rate": 6.936257966888001e-06, "loss": 0.91612463, "memory(GiB)": 728.98, "step": 31910, "train_speed(iter/s)": 0.198427 }, { "acc": 0.76724901, "epoch": 0.8096156925667374, "grad_norm": 3.546875, "learning_rate": 6.935291094641296e-06, "loss": 0.8444313, "memory(GiB)": 728.98, "step": 31915, "train_speed(iter/s)": 0.198374 }, { "acc": 0.7638608, "epoch": 0.8097425319357748, "grad_norm": 3.65625, "learning_rate": 6.934324137267303e-06, "loss": 0.89239931, "memory(GiB)": 728.98, "step": 31920, "train_speed(iter/s)": 0.198315 }, { "acc": 0.76865783, "epoch": 0.8098693713048124, "grad_norm": 3.453125, "learning_rate": 6.933357094808555e-06, "loss": 0.86002989, "memory(GiB)": 728.98, "step": 31925, "train_speed(iter/s)": 0.198266 }, { "acc": 0.7747211, "epoch": 0.80999621067385, "grad_norm": 3.296875, "learning_rate": 6.932389967307588e-06, "loss": 0.8498579, "memory(GiB)": 728.98, "step": 31930, "train_speed(iter/s)": 0.198202 }, { "acc": 0.76224527, "epoch": 0.8101230500428875, "grad_norm": 3.328125, "learning_rate": 6.931422754806945e-06, "loss": 0.89040918, "memory(GiB)": 728.98, "step": 31935, "train_speed(iter/s)": 0.198155 }, { "acc": 0.75724869, "epoch": 0.8102498894119251, "grad_norm": 3.203125, "learning_rate": 6.930455457349165e-06, "loss": 0.87260895, "memory(GiB)": 728.98, "step": 31940, "train_speed(iter/s)": 0.198108 }, { "acc": 0.76317, "epoch": 0.8103767287809627, "grad_norm": 3.09375, "learning_rate": 6.929488074976804e-06, "loss": 0.89199448, "memory(GiB)": 728.98, "step": 31945, "train_speed(iter/s)": 0.198052 }, { "acc": 0.77739911, "epoch": 0.8105035681500002, "grad_norm": 4.46875, "learning_rate": 6.928520607732409e-06, "loss": 0.89748268, "memory(GiB)": 728.98, "step": 31950, "train_speed(iter/s)": 0.198006 }, { "acc": 0.75960722, "epoch": 0.8106304075190378, "grad_norm": 3.890625, "learning_rate": 6.927553055658536e-06, "loss": 0.9343154, "memory(GiB)": 728.98, "step": 31955, "train_speed(iter/s)": 0.197955 }, { "acc": 0.76354294, "epoch": 0.8107572468880754, "grad_norm": 3.84375, "learning_rate": 6.926585418797745e-06, "loss": 0.91561384, "memory(GiB)": 728.98, "step": 31960, "train_speed(iter/s)": 0.197904 }, { "acc": 0.77166805, "epoch": 0.810884086257113, "grad_norm": 3.703125, "learning_rate": 6.925617697192603e-06, "loss": 0.87464018, "memory(GiB)": 728.98, "step": 31965, "train_speed(iter/s)": 0.197857 }, { "acc": 0.75675521, "epoch": 0.8110109256261505, "grad_norm": 5.59375, "learning_rate": 6.924649890885673e-06, "loss": 0.9284214, "memory(GiB)": 728.98, "step": 31970, "train_speed(iter/s)": 0.197806 }, { "acc": 0.7571826, "epoch": 0.8111377649951881, "grad_norm": 3.796875, "learning_rate": 6.9236819999195245e-06, "loss": 0.91345119, "memory(GiB)": 728.98, "step": 31975, "train_speed(iter/s)": 0.197754 }, { "acc": 0.77322311, "epoch": 0.8112646043642255, "grad_norm": 2.78125, "learning_rate": 6.922714024336736e-06, "loss": 0.87503481, "memory(GiB)": 728.98, "step": 31980, "train_speed(iter/s)": 0.197705 }, { "acc": 0.78260689, "epoch": 0.8113914437332631, "grad_norm": 3.78125, "learning_rate": 6.921745964179883e-06, "loss": 0.84800787, "memory(GiB)": 728.98, "step": 31985, "train_speed(iter/s)": 0.19765 }, { "acc": 0.77002301, "epoch": 0.8115182831023007, "grad_norm": 3.71875, "learning_rate": 6.9207778194915485e-06, "loss": 0.9349721, "memory(GiB)": 728.98, "step": 31990, "train_speed(iter/s)": 0.197608 }, { "acc": 0.74790859, "epoch": 0.8116451224713382, "grad_norm": 3.640625, "learning_rate": 6.919809590314317e-06, "loss": 0.89155979, "memory(GiB)": 728.98, "step": 31995, "train_speed(iter/s)": 0.197558 }, { "acc": 0.77605743, "epoch": 0.8117719618403758, "grad_norm": 3.453125, "learning_rate": 6.91884127669078e-06, "loss": 0.86501427, "memory(GiB)": 728.98, "step": 32000, "train_speed(iter/s)": 0.19749 }, { "epoch": 0.8117719618403758, "eval_acc": 0.7562294327931463, "eval_loss": 0.8635233044624329, "eval_runtime": 1152.4129, "eval_samples_per_second": 5.528, "eval_steps_per_second": 5.528, "step": 32000 }, { "acc": 0.77332001, "epoch": 0.8118988012094134, "grad_norm": 4.1875, "learning_rate": 6.917872878663527e-06, "loss": 0.8948596, "memory(GiB)": 728.98, "step": 32005, "train_speed(iter/s)": 0.195178 }, { "acc": 0.77368388, "epoch": 0.812025640578451, "grad_norm": 3.578125, "learning_rate": 6.9169043962751596e-06, "loss": 0.88109112, "memory(GiB)": 728.98, "step": 32010, "train_speed(iter/s)": 0.195122 }, { "acc": 0.77345319, "epoch": 0.8121524799474885, "grad_norm": 3.0, "learning_rate": 6.915935829568273e-06, "loss": 0.85292273, "memory(GiB)": 728.98, "step": 32015, "train_speed(iter/s)": 0.195076 }, { "acc": 0.75731692, "epoch": 0.8122793193165261, "grad_norm": 3.21875, "learning_rate": 6.914967178585474e-06, "loss": 0.90902557, "memory(GiB)": 728.98, "step": 32020, "train_speed(iter/s)": 0.195023 }, { "acc": 0.75976982, "epoch": 0.8124061586855637, "grad_norm": 3.4375, "learning_rate": 6.913998443369372e-06, "loss": 0.92846966, "memory(GiB)": 728.98, "step": 32025, "train_speed(iter/s)": 0.194976 }, { "acc": 0.78809566, "epoch": 0.8125329980546012, "grad_norm": 3.375, "learning_rate": 6.913029623962575e-06, "loss": 0.83364811, "memory(GiB)": 728.98, "step": 32030, "train_speed(iter/s)": 0.194921 }, { "acc": 0.76842542, "epoch": 0.8126598374236388, "grad_norm": 3.078125, "learning_rate": 6.912060720407701e-06, "loss": 0.93363829, "memory(GiB)": 728.98, "step": 32035, "train_speed(iter/s)": 0.194874 }, { "acc": 0.77928581, "epoch": 0.8127866767926762, "grad_norm": 5.03125, "learning_rate": 6.911091732747368e-06, "loss": 0.88756104, "memory(GiB)": 728.98, "step": 32040, "train_speed(iter/s)": 0.194825 }, { "acc": 0.76074891, "epoch": 0.8129135161617138, "grad_norm": 3.3125, "learning_rate": 6.910122661024199e-06, "loss": 0.86559134, "memory(GiB)": 728.98, "step": 32045, "train_speed(iter/s)": 0.194774 }, { "acc": 0.74887233, "epoch": 0.8130403555307514, "grad_norm": 3.765625, "learning_rate": 6.909153505280819e-06, "loss": 1.02889299, "memory(GiB)": 728.98, "step": 32050, "train_speed(iter/s)": 0.194721 }, { "acc": 0.76071243, "epoch": 0.813167194899789, "grad_norm": 3.03125, "learning_rate": 6.908184265559861e-06, "loss": 0.90520897, "memory(GiB)": 728.98, "step": 32055, "train_speed(iter/s)": 0.194668 }, { "acc": 0.7726944, "epoch": 0.8132940342688265, "grad_norm": 3.171875, "learning_rate": 6.907214941903956e-06, "loss": 0.90199728, "memory(GiB)": 728.98, "step": 32060, "train_speed(iter/s)": 0.194619 }, { "acc": 0.75241709, "epoch": 0.8134208736378641, "grad_norm": 3.359375, "learning_rate": 6.906245534355742e-06, "loss": 0.92101936, "memory(GiB)": 728.98, "step": 32065, "train_speed(iter/s)": 0.194571 }, { "acc": 0.7659606, "epoch": 0.8135477130069017, "grad_norm": 3.34375, "learning_rate": 6.905276042957861e-06, "loss": 0.91176386, "memory(GiB)": 728.98, "step": 32070, "train_speed(iter/s)": 0.194524 }, { "acc": 0.75815482, "epoch": 0.8136745523759392, "grad_norm": 3.734375, "learning_rate": 6.904306467752958e-06, "loss": 0.90207787, "memory(GiB)": 728.98, "step": 32075, "train_speed(iter/s)": 0.194473 }, { "acc": 0.76834083, "epoch": 0.8138013917449768, "grad_norm": 2.96875, "learning_rate": 6.90333680878368e-06, "loss": 0.86428785, "memory(GiB)": 728.98, "step": 32080, "train_speed(iter/s)": 0.194421 }, { "acc": 0.76208301, "epoch": 0.8139282311140144, "grad_norm": 4.375, "learning_rate": 6.9023670660926814e-06, "loss": 0.9558959, "memory(GiB)": 728.98, "step": 32085, "train_speed(iter/s)": 0.194379 }, { "acc": 0.76318016, "epoch": 0.8140550704830519, "grad_norm": 3.453125, "learning_rate": 6.901397239722616e-06, "loss": 0.90098591, "memory(GiB)": 728.98, "step": 32090, "train_speed(iter/s)": 0.194334 }, { "acc": 0.75868702, "epoch": 0.8141819098520895, "grad_norm": 4.40625, "learning_rate": 6.900427329716144e-06, "loss": 0.97778826, "memory(GiB)": 728.98, "step": 32095, "train_speed(iter/s)": 0.194286 }, { "acc": 0.77553244, "epoch": 0.814308749221127, "grad_norm": 3.59375, "learning_rate": 6.89945733611593e-06, "loss": 0.89446821, "memory(GiB)": 728.98, "step": 32100, "train_speed(iter/s)": 0.194237 }, { "acc": 0.7661922, "epoch": 0.8144355885901645, "grad_norm": 2.375, "learning_rate": 6.898487258964639e-06, "loss": 0.90220699, "memory(GiB)": 728.98, "step": 32105, "train_speed(iter/s)": 0.194183 }, { "acc": 0.76347127, "epoch": 0.8145624279592021, "grad_norm": 3.734375, "learning_rate": 6.8975170983049425e-06, "loss": 0.91552544, "memory(GiB)": 728.98, "step": 32110, "train_speed(iter/s)": 0.194132 }, { "acc": 0.76489882, "epoch": 0.8146892673282397, "grad_norm": 3.421875, "learning_rate": 6.896546854179514e-06, "loss": 0.91895781, "memory(GiB)": 728.98, "step": 32115, "train_speed(iter/s)": 0.194085 }, { "acc": 0.76262393, "epoch": 0.8148161066972772, "grad_norm": 3.71875, "learning_rate": 6.895576526631032e-06, "loss": 0.89982452, "memory(GiB)": 728.98, "step": 32120, "train_speed(iter/s)": 0.194038 }, { "acc": 0.76325383, "epoch": 0.8149429460663148, "grad_norm": 3.765625, "learning_rate": 6.8946061157021804e-06, "loss": 0.9207139, "memory(GiB)": 728.98, "step": 32125, "train_speed(iter/s)": 0.193993 }, { "acc": 0.75395398, "epoch": 0.8150697854353524, "grad_norm": 4.53125, "learning_rate": 6.893635621435642e-06, "loss": 1.00944691, "memory(GiB)": 728.98, "step": 32130, "train_speed(iter/s)": 0.193946 }, { "acc": 0.76756516, "epoch": 0.8151966248043899, "grad_norm": 3.828125, "learning_rate": 6.8926650438741046e-06, "loss": 0.90813265, "memory(GiB)": 728.98, "step": 32135, "train_speed(iter/s)": 0.1939 }, { "acc": 0.76927724, "epoch": 0.8153234641734275, "grad_norm": 3.265625, "learning_rate": 6.891694383060263e-06, "loss": 0.90702019, "memory(GiB)": 728.98, "step": 32140, "train_speed(iter/s)": 0.193851 }, { "acc": 0.76672211, "epoch": 0.8154503035424651, "grad_norm": 2.84375, "learning_rate": 6.890723639036813e-06, "loss": 0.88190413, "memory(GiB)": 728.98, "step": 32145, "train_speed(iter/s)": 0.1938 }, { "acc": 0.76004014, "epoch": 0.8155771429115026, "grad_norm": 3.40625, "learning_rate": 6.889752811846454e-06, "loss": 0.89192419, "memory(GiB)": 728.98, "step": 32150, "train_speed(iter/s)": 0.193748 }, { "acc": 0.76526022, "epoch": 0.8157039822805402, "grad_norm": 4.34375, "learning_rate": 6.8887819015318904e-06, "loss": 0.89080486, "memory(GiB)": 728.98, "step": 32155, "train_speed(iter/s)": 0.193702 }, { "acc": 0.77893696, "epoch": 0.8158308216495777, "grad_norm": 4.21875, "learning_rate": 6.887810908135828e-06, "loss": 0.87268343, "memory(GiB)": 728.98, "step": 32160, "train_speed(iter/s)": 0.193656 }, { "acc": 0.76631322, "epoch": 0.8159576610186152, "grad_norm": 3.40625, "learning_rate": 6.88683983170098e-06, "loss": 0.89892969, "memory(GiB)": 728.98, "step": 32165, "train_speed(iter/s)": 0.193607 }, { "acc": 0.77052803, "epoch": 0.8160845003876528, "grad_norm": 3.734375, "learning_rate": 6.885868672270058e-06, "loss": 0.90179739, "memory(GiB)": 728.98, "step": 32170, "train_speed(iter/s)": 0.193552 }, { "acc": 0.76272926, "epoch": 0.8162113397566904, "grad_norm": 3.328125, "learning_rate": 6.884897429885783e-06, "loss": 0.8799922, "memory(GiB)": 728.98, "step": 32175, "train_speed(iter/s)": 0.19351 }, { "acc": 0.77967134, "epoch": 0.8163381791257279, "grad_norm": 3.984375, "learning_rate": 6.883926104590877e-06, "loss": 0.82658319, "memory(GiB)": 728.98, "step": 32180, "train_speed(iter/s)": 0.193466 }, { "acc": 0.77010365, "epoch": 0.8164650184947655, "grad_norm": 3.359375, "learning_rate": 6.8829546964280625e-06, "loss": 0.8933672, "memory(GiB)": 728.98, "step": 32185, "train_speed(iter/s)": 0.193419 }, { "acc": 0.76248174, "epoch": 0.8165918578638031, "grad_norm": 3.109375, "learning_rate": 6.88198320544007e-06, "loss": 0.87037439, "memory(GiB)": 728.98, "step": 32190, "train_speed(iter/s)": 0.193364 }, { "acc": 0.77444038, "epoch": 0.8167186972328406, "grad_norm": 3.40625, "learning_rate": 6.881011631669634e-06, "loss": 0.82811108, "memory(GiB)": 728.98, "step": 32195, "train_speed(iter/s)": 0.193299 }, { "acc": 0.77279544, "epoch": 0.8168455366018782, "grad_norm": 3.21875, "learning_rate": 6.880039975159488e-06, "loss": 0.873985, "memory(GiB)": 728.98, "step": 32200, "train_speed(iter/s)": 0.193257 }, { "acc": 0.76914392, "epoch": 0.8169723759709158, "grad_norm": 3.375, "learning_rate": 6.879068235952375e-06, "loss": 0.94575911, "memory(GiB)": 728.98, "step": 32205, "train_speed(iter/s)": 0.193198 }, { "acc": 0.76303096, "epoch": 0.8170992153399533, "grad_norm": 3.765625, "learning_rate": 6.878096414091037e-06, "loss": 0.88062525, "memory(GiB)": 728.98, "step": 32210, "train_speed(iter/s)": 0.193152 }, { "acc": 0.77577291, "epoch": 0.8172260547089909, "grad_norm": 4.25, "learning_rate": 6.877124509618222e-06, "loss": 0.87753897, "memory(GiB)": 728.98, "step": 32215, "train_speed(iter/s)": 0.193108 }, { "acc": 0.75121822, "epoch": 0.8173528940780284, "grad_norm": 3.078125, "learning_rate": 6.876152522576679e-06, "loss": 0.94928637, "memory(GiB)": 728.98, "step": 32220, "train_speed(iter/s)": 0.19305 }, { "acc": 0.76248765, "epoch": 0.8174797334470659, "grad_norm": 3.359375, "learning_rate": 6.875180453009167e-06, "loss": 0.90880547, "memory(GiB)": 728.98, "step": 32225, "train_speed(iter/s)": 0.193004 }, { "acc": 0.76322227, "epoch": 0.8176065728161035, "grad_norm": 3.5625, "learning_rate": 6.87420830095844e-06, "loss": 0.90347595, "memory(GiB)": 728.98, "step": 32230, "train_speed(iter/s)": 0.192961 }, { "acc": 0.76288424, "epoch": 0.8177334121851411, "grad_norm": 3.46875, "learning_rate": 6.873236066467261e-06, "loss": 0.92158689, "memory(GiB)": 728.98, "step": 32235, "train_speed(iter/s)": 0.192916 }, { "acc": 0.78001823, "epoch": 0.8178602515541786, "grad_norm": 3.578125, "learning_rate": 6.872263749578397e-06, "loss": 0.78657413, "memory(GiB)": 728.98, "step": 32240, "train_speed(iter/s)": 0.192864 }, { "acc": 0.76038356, "epoch": 0.8179870909232162, "grad_norm": 3.40625, "learning_rate": 6.871291350334614e-06, "loss": 0.93861732, "memory(GiB)": 728.98, "step": 32245, "train_speed(iter/s)": 0.192816 }, { "acc": 0.77405415, "epoch": 0.8181139302922538, "grad_norm": 2.859375, "learning_rate": 6.87031886877869e-06, "loss": 0.85996647, "memory(GiB)": 728.98, "step": 32250, "train_speed(iter/s)": 0.192767 }, { "acc": 0.77833719, "epoch": 0.8182407696612913, "grad_norm": 3.140625, "learning_rate": 6.869346304953396e-06, "loss": 0.87278585, "memory(GiB)": 728.98, "step": 32255, "train_speed(iter/s)": 0.192717 }, { "acc": 0.77669916, "epoch": 0.8183676090303289, "grad_norm": 3.796875, "learning_rate": 6.868373658901515e-06, "loss": 0.8970789, "memory(GiB)": 728.98, "step": 32260, "train_speed(iter/s)": 0.192675 }, { "acc": 0.75962448, "epoch": 0.8184944483993665, "grad_norm": 3.34375, "learning_rate": 6.867400930665829e-06, "loss": 0.93841619, "memory(GiB)": 728.98, "step": 32265, "train_speed(iter/s)": 0.19263 }, { "acc": 0.75110259, "epoch": 0.818621287768404, "grad_norm": 3.359375, "learning_rate": 6.866428120289126e-06, "loss": 0.89750252, "memory(GiB)": 728.98, "step": 32270, "train_speed(iter/s)": 0.192584 }, { "acc": 0.77596951, "epoch": 0.8187481271374416, "grad_norm": 4.1875, "learning_rate": 6.865455227814197e-06, "loss": 0.87993412, "memory(GiB)": 728.98, "step": 32275, "train_speed(iter/s)": 0.192544 }, { "acc": 0.76550703, "epoch": 0.8188749665064791, "grad_norm": 4.0625, "learning_rate": 6.864482253283837e-06, "loss": 0.88363981, "memory(GiB)": 728.98, "step": 32280, "train_speed(iter/s)": 0.192496 }, { "acc": 0.77416806, "epoch": 0.8190018058755166, "grad_norm": 2.578125, "learning_rate": 6.863509196740843e-06, "loss": 0.83763494, "memory(GiB)": 728.98, "step": 32285, "train_speed(iter/s)": 0.192436 }, { "acc": 0.75412345, "epoch": 0.8191286452445542, "grad_norm": 3.890625, "learning_rate": 6.862536058228016e-06, "loss": 0.90389977, "memory(GiB)": 728.98, "step": 32290, "train_speed(iter/s)": 0.192388 }, { "acc": 0.78414545, "epoch": 0.8192554846135918, "grad_norm": 8.5, "learning_rate": 6.861562837788164e-06, "loss": 0.81284752, "memory(GiB)": 728.98, "step": 32295, "train_speed(iter/s)": 0.192336 }, { "acc": 0.76542516, "epoch": 0.8193823239826293, "grad_norm": 3.0, "learning_rate": 6.860589535464092e-06, "loss": 0.91553926, "memory(GiB)": 728.98, "step": 32300, "train_speed(iter/s)": 0.192283 }, { "acc": 0.75313377, "epoch": 0.8195091633516669, "grad_norm": 3.4375, "learning_rate": 6.859616151298616e-06, "loss": 0.97337837, "memory(GiB)": 728.98, "step": 32305, "train_speed(iter/s)": 0.192231 }, { "acc": 0.76386352, "epoch": 0.8196360027207045, "grad_norm": 3.078125, "learning_rate": 6.858642685334551e-06, "loss": 0.91579752, "memory(GiB)": 728.98, "step": 32310, "train_speed(iter/s)": 0.19218 }, { "acc": 0.76919155, "epoch": 0.819762842089742, "grad_norm": 4.21875, "learning_rate": 6.8576691376147144e-06, "loss": 0.89705343, "memory(GiB)": 728.98, "step": 32315, "train_speed(iter/s)": 0.19214 }, { "acc": 0.78359103, "epoch": 0.8198896814587796, "grad_norm": 3.703125, "learning_rate": 6.856695508181932e-06, "loss": 0.81414356, "memory(GiB)": 728.98, "step": 32320, "train_speed(iter/s)": 0.192091 }, { "acc": 0.76656113, "epoch": 0.8200165208278172, "grad_norm": 3.234375, "learning_rate": 6.85572179707903e-06, "loss": 0.94793186, "memory(GiB)": 728.98, "step": 32325, "train_speed(iter/s)": 0.192038 }, { "acc": 0.76233454, "epoch": 0.8201433601968547, "grad_norm": 4.6875, "learning_rate": 6.85474800434884e-06, "loss": 0.90246696, "memory(GiB)": 728.98, "step": 32330, "train_speed(iter/s)": 0.191991 }, { "acc": 0.77447143, "epoch": 0.8202701995658923, "grad_norm": 3.5625, "learning_rate": 6.853774130034193e-06, "loss": 0.86610842, "memory(GiB)": 728.98, "step": 32335, "train_speed(iter/s)": 0.191942 }, { "acc": 0.75690775, "epoch": 0.8203970389349298, "grad_norm": 3.5625, "learning_rate": 6.85280017417793e-06, "loss": 0.87627268, "memory(GiB)": 728.98, "step": 32340, "train_speed(iter/s)": 0.191899 }, { "acc": 0.77782474, "epoch": 0.8205238783039673, "grad_norm": 4.59375, "learning_rate": 6.85182613682289e-06, "loss": 0.81278181, "memory(GiB)": 728.98, "step": 32345, "train_speed(iter/s)": 0.191849 }, { "acc": 0.76468463, "epoch": 0.8206507176730049, "grad_norm": 4.5, "learning_rate": 6.85085201801192e-06, "loss": 0.88097105, "memory(GiB)": 728.98, "step": 32350, "train_speed(iter/s)": 0.191801 }, { "acc": 0.76116467, "epoch": 0.8207775570420425, "grad_norm": 3.78125, "learning_rate": 6.849877817787866e-06, "loss": 0.8870369, "memory(GiB)": 728.98, "step": 32355, "train_speed(iter/s)": 0.191743 }, { "acc": 0.75153499, "epoch": 0.82090439641108, "grad_norm": 3.15625, "learning_rate": 6.84890353619358e-06, "loss": 0.91122856, "memory(GiB)": 728.98, "step": 32360, "train_speed(iter/s)": 0.191689 }, { "acc": 0.7605341, "epoch": 0.8210312357801176, "grad_norm": 3.96875, "learning_rate": 6.847929173271919e-06, "loss": 0.92385492, "memory(GiB)": 728.98, "step": 32365, "train_speed(iter/s)": 0.191642 }, { "acc": 0.76325536, "epoch": 0.8211580751491552, "grad_norm": 3.5625, "learning_rate": 6.84695472906574e-06, "loss": 0.9224968, "memory(GiB)": 728.98, "step": 32370, "train_speed(iter/s)": 0.191594 }, { "acc": 0.76730351, "epoch": 0.8212849145181927, "grad_norm": 3.8125, "learning_rate": 6.84598020361791e-06, "loss": 0.90765047, "memory(GiB)": 728.98, "step": 32375, "train_speed(iter/s)": 0.191548 }, { "acc": 0.7552393, "epoch": 0.8214117538872303, "grad_norm": 3.84375, "learning_rate": 6.84500559697129e-06, "loss": 0.93946686, "memory(GiB)": 728.98, "step": 32380, "train_speed(iter/s)": 0.191507 }, { "acc": 0.75925918, "epoch": 0.8215385932562679, "grad_norm": 3.328125, "learning_rate": 6.844030909168754e-06, "loss": 0.95742006, "memory(GiB)": 728.98, "step": 32385, "train_speed(iter/s)": 0.191458 }, { "acc": 0.77789412, "epoch": 0.8216654326253054, "grad_norm": 3.828125, "learning_rate": 6.8430561402531725e-06, "loss": 0.89970407, "memory(GiB)": 728.98, "step": 32390, "train_speed(iter/s)": 0.191414 }, { "acc": 0.77108469, "epoch": 0.821792271994343, "grad_norm": 3.03125, "learning_rate": 6.842081290267424e-06, "loss": 0.88633432, "memory(GiB)": 728.98, "step": 32395, "train_speed(iter/s)": 0.191363 }, { "acc": 0.7679141, "epoch": 0.8219191113633805, "grad_norm": 3.15625, "learning_rate": 6.841106359254391e-06, "loss": 0.84881926, "memory(GiB)": 728.98, "step": 32400, "train_speed(iter/s)": 0.191314 }, { "acc": 0.75635362, "epoch": 0.822045950732418, "grad_norm": 3.5625, "learning_rate": 6.840131347256952e-06, "loss": 0.96973095, "memory(GiB)": 728.98, "step": 32405, "train_speed(iter/s)": 0.191271 }, { "acc": 0.770961, "epoch": 0.8221727901014556, "grad_norm": 3.5625, "learning_rate": 6.839156254318e-06, "loss": 0.89314985, "memory(GiB)": 728.98, "step": 32410, "train_speed(iter/s)": 0.191225 }, { "acc": 0.76479168, "epoch": 0.8222996294704932, "grad_norm": 3.125, "learning_rate": 6.8381810804804235e-06, "loss": 0.90767279, "memory(GiB)": 728.98, "step": 32415, "train_speed(iter/s)": 0.191179 }, { "acc": 0.76704698, "epoch": 0.8224264688395307, "grad_norm": 3.28125, "learning_rate": 6.837205825787119e-06, "loss": 0.90285215, "memory(GiB)": 728.98, "step": 32420, "train_speed(iter/s)": 0.191121 }, { "acc": 0.76928086, "epoch": 0.8225533082085683, "grad_norm": 6.375, "learning_rate": 6.836230490280984e-06, "loss": 0.91763096, "memory(GiB)": 728.98, "step": 32425, "train_speed(iter/s)": 0.19108 }, { "acc": 0.76897955, "epoch": 0.8226801475776059, "grad_norm": 3.15625, "learning_rate": 6.8352550740049195e-06, "loss": 0.86071072, "memory(GiB)": 728.98, "step": 32430, "train_speed(iter/s)": 0.191031 }, { "acc": 0.77411838, "epoch": 0.8228069869466434, "grad_norm": 3.6875, "learning_rate": 6.834279577001832e-06, "loss": 0.84446802, "memory(GiB)": 728.98, "step": 32435, "train_speed(iter/s)": 0.190995 }, { "acc": 0.76255994, "epoch": 0.822933826315681, "grad_norm": 3.046875, "learning_rate": 6.83330399931463e-06, "loss": 0.87975254, "memory(GiB)": 728.98, "step": 32440, "train_speed(iter/s)": 0.19095 }, { "acc": 0.77314959, "epoch": 0.8230606656847186, "grad_norm": 3.609375, "learning_rate": 6.832328340986227e-06, "loss": 0.87320595, "memory(GiB)": 728.98, "step": 32445, "train_speed(iter/s)": 0.190903 }, { "acc": 0.76694932, "epoch": 0.8231875050537562, "grad_norm": 3.546875, "learning_rate": 6.831352602059538e-06, "loss": 0.92391005, "memory(GiB)": 728.98, "step": 32450, "train_speed(iter/s)": 0.190857 }, { "acc": 0.77227831, "epoch": 0.8233143444227937, "grad_norm": 11.3125, "learning_rate": 6.830376782577484e-06, "loss": 0.92334595, "memory(GiB)": 728.98, "step": 32455, "train_speed(iter/s)": 0.190811 }, { "acc": 0.76673374, "epoch": 0.8234411837918312, "grad_norm": 2.984375, "learning_rate": 6.829400882582985e-06, "loss": 0.88015718, "memory(GiB)": 728.98, "step": 32460, "train_speed(iter/s)": 0.190769 }, { "acc": 0.77402472, "epoch": 0.8235680231608687, "grad_norm": 3.40625, "learning_rate": 6.828424902118972e-06, "loss": 0.8557354, "memory(GiB)": 728.98, "step": 32465, "train_speed(iter/s)": 0.190712 }, { "acc": 0.75831866, "epoch": 0.8236948625299063, "grad_norm": 3.5, "learning_rate": 6.8274488412283725e-06, "loss": 0.91797342, "memory(GiB)": 728.98, "step": 32470, "train_speed(iter/s)": 0.190663 }, { "acc": 0.76600523, "epoch": 0.8238217018989439, "grad_norm": 3.34375, "learning_rate": 6.826472699954121e-06, "loss": 0.90832644, "memory(GiB)": 728.98, "step": 32475, "train_speed(iter/s)": 0.190621 }, { "acc": 0.76200066, "epoch": 0.8239485412679814, "grad_norm": 3.15625, "learning_rate": 6.8254964783391565e-06, "loss": 0.90714464, "memory(GiB)": 728.98, "step": 32480, "train_speed(iter/s)": 0.190576 }, { "acc": 0.76832571, "epoch": 0.824075380637019, "grad_norm": 3.625, "learning_rate": 6.824520176426416e-06, "loss": 0.8697073, "memory(GiB)": 728.98, "step": 32485, "train_speed(iter/s)": 0.190532 }, { "acc": 0.74424763, "epoch": 0.8242022200060566, "grad_norm": 3.296875, "learning_rate": 6.8235437942588476e-06, "loss": 0.94575138, "memory(GiB)": 728.98, "step": 32490, "train_speed(iter/s)": 0.190488 }, { "acc": 0.76437197, "epoch": 0.8243290593750942, "grad_norm": 3.640625, "learning_rate": 6.822567331879397e-06, "loss": 0.90027599, "memory(GiB)": 728.98, "step": 32495, "train_speed(iter/s)": 0.190437 }, { "acc": 0.7709662, "epoch": 0.8244558987441317, "grad_norm": 3.015625, "learning_rate": 6.8215907893310175e-06, "loss": 0.86846352, "memory(GiB)": 728.98, "step": 32500, "train_speed(iter/s)": 0.190395 }, { "epoch": 0.8244558987441317, "eval_acc": 0.7563535096840551, "eval_loss": 0.8632639050483704, "eval_runtime": 1153.0265, "eval_samples_per_second": 5.525, "eval_steps_per_second": 5.525, "step": 32500 }, { "acc": 0.77342896, "epoch": 0.8245827381131693, "grad_norm": 3.828125, "learning_rate": 6.820614166656663e-06, "loss": 0.89875011, "memory(GiB)": 728.98, "step": 32505, "train_speed(iter/s)": 0.188282 }, { "acc": 0.76764398, "epoch": 0.8247095774822069, "grad_norm": 3.40625, "learning_rate": 6.8196374638992914e-06, "loss": 0.88275204, "memory(GiB)": 728.98, "step": 32510, "train_speed(iter/s)": 0.188233 }, { "acc": 0.7407958, "epoch": 0.8248364168512444, "grad_norm": 3.515625, "learning_rate": 6.8186606811018655e-06, "loss": 0.96837091, "memory(GiB)": 728.98, "step": 32515, "train_speed(iter/s)": 0.188184 }, { "acc": 0.78017578, "epoch": 0.8249632562202819, "grad_norm": 3.109375, "learning_rate": 6.817683818307351e-06, "loss": 0.81581841, "memory(GiB)": 728.98, "step": 32520, "train_speed(iter/s)": 0.188137 }, { "acc": 0.76036825, "epoch": 0.8250900955893194, "grad_norm": 3.109375, "learning_rate": 6.816706875558718e-06, "loss": 0.87978868, "memory(GiB)": 728.98, "step": 32525, "train_speed(iter/s)": 0.188087 }, { "acc": 0.77536497, "epoch": 0.825216934958357, "grad_norm": 4.5625, "learning_rate": 6.815729852898936e-06, "loss": 0.84308767, "memory(GiB)": 728.98, "step": 32530, "train_speed(iter/s)": 0.188047 }, { "acc": 0.77563324, "epoch": 0.8253437743273946, "grad_norm": 3.890625, "learning_rate": 6.8147527503709855e-06, "loss": 0.81263905, "memory(GiB)": 728.98, "step": 32535, "train_speed(iter/s)": 0.188005 }, { "acc": 0.75565381, "epoch": 0.8254706136964322, "grad_norm": 3.546875, "learning_rate": 6.8137755680178396e-06, "loss": 0.92923069, "memory(GiB)": 728.98, "step": 32540, "train_speed(iter/s)": 0.187954 }, { "acc": 0.77918253, "epoch": 0.8255974530654697, "grad_norm": 4.34375, "learning_rate": 6.812798305882488e-06, "loss": 0.91453514, "memory(GiB)": 728.98, "step": 32545, "train_speed(iter/s)": 0.187906 }, { "acc": 0.77809968, "epoch": 0.8257242924345073, "grad_norm": 3.109375, "learning_rate": 6.8118209640079155e-06, "loss": 0.87891197, "memory(GiB)": 728.98, "step": 32550, "train_speed(iter/s)": 0.187861 }, { "acc": 0.77402554, "epoch": 0.8258511318035449, "grad_norm": 3.375, "learning_rate": 6.810843542437111e-06, "loss": 0.87875462, "memory(GiB)": 728.98, "step": 32555, "train_speed(iter/s)": 0.187815 }, { "acc": 0.76278896, "epoch": 0.8259779711725824, "grad_norm": 3.828125, "learning_rate": 6.809866041213069e-06, "loss": 0.89761753, "memory(GiB)": 728.98, "step": 32560, "train_speed(iter/s)": 0.187772 }, { "acc": 0.7662941, "epoch": 0.82610481054162, "grad_norm": 4.125, "learning_rate": 6.8088884603787866e-06, "loss": 0.92733116, "memory(GiB)": 728.98, "step": 32565, "train_speed(iter/s)": 0.187732 }, { "acc": 0.75652876, "epoch": 0.8262316499106576, "grad_norm": 3.9375, "learning_rate": 6.807910799977264e-06, "loss": 0.89408941, "memory(GiB)": 728.98, "step": 32570, "train_speed(iter/s)": 0.187684 }, { "acc": 0.76857057, "epoch": 0.8263584892796951, "grad_norm": 3.640625, "learning_rate": 6.806933060051505e-06, "loss": 0.91508369, "memory(GiB)": 728.98, "step": 32575, "train_speed(iter/s)": 0.187641 }, { "acc": 0.76806421, "epoch": 0.8264853286487326, "grad_norm": 3.34375, "learning_rate": 6.805955240644519e-06, "loss": 0.87620859, "memory(GiB)": 728.98, "step": 32580, "train_speed(iter/s)": 0.187595 }, { "acc": 0.76231618, "epoch": 0.8266121680177702, "grad_norm": 3.421875, "learning_rate": 6.8049773417993144e-06, "loss": 0.90162888, "memory(GiB)": 728.98, "step": 32585, "train_speed(iter/s)": 0.187539 }, { "acc": 0.76977658, "epoch": 0.8267390073868077, "grad_norm": 3.140625, "learning_rate": 6.8039993635589095e-06, "loss": 0.91005297, "memory(GiB)": 728.98, "step": 32590, "train_speed(iter/s)": 0.187496 }, { "acc": 0.76431212, "epoch": 0.8268658467558453, "grad_norm": 3.328125, "learning_rate": 6.803021305966319e-06, "loss": 0.94339151, "memory(GiB)": 728.98, "step": 32595, "train_speed(iter/s)": 0.187448 }, { "acc": 0.78010788, "epoch": 0.8269926861248829, "grad_norm": 4.96875, "learning_rate": 6.802043169064567e-06, "loss": 0.89003763, "memory(GiB)": 728.98, "step": 32600, "train_speed(iter/s)": 0.187403 }, { "acc": 0.76474276, "epoch": 0.8271195254939204, "grad_norm": 3.71875, "learning_rate": 6.801064952896679e-06, "loss": 0.84655914, "memory(GiB)": 728.98, "step": 32605, "train_speed(iter/s)": 0.187357 }, { "acc": 0.77072582, "epoch": 0.827246364862958, "grad_norm": 3.421875, "learning_rate": 6.80008665750568e-06, "loss": 0.884307, "memory(GiB)": 728.98, "step": 32610, "train_speed(iter/s)": 0.187307 }, { "acc": 0.76682782, "epoch": 0.8273732042319956, "grad_norm": 3.875, "learning_rate": 6.799108282934605e-06, "loss": 0.91617765, "memory(GiB)": 728.98, "step": 32615, "train_speed(iter/s)": 0.187266 }, { "acc": 0.76914978, "epoch": 0.8275000436010331, "grad_norm": 2.875, "learning_rate": 6.798129829226489e-06, "loss": 0.84377022, "memory(GiB)": 728.98, "step": 32620, "train_speed(iter/s)": 0.187211 }, { "acc": 0.77108288, "epoch": 0.8276268829700707, "grad_norm": 4.0625, "learning_rate": 6.797151296424371e-06, "loss": 0.85253782, "memory(GiB)": 728.98, "step": 32625, "train_speed(iter/s)": 0.187165 }, { "acc": 0.77408295, "epoch": 0.8277537223391083, "grad_norm": 4.15625, "learning_rate": 6.796172684571293e-06, "loss": 0.86779308, "memory(GiB)": 728.98, "step": 32630, "train_speed(iter/s)": 0.187129 }, { "acc": 0.7638628, "epoch": 0.8278805617081458, "grad_norm": 3.3125, "learning_rate": 6.795193993710303e-06, "loss": 0.91302824, "memory(GiB)": 728.98, "step": 32635, "train_speed(iter/s)": 0.187091 }, { "acc": 0.76946449, "epoch": 0.8280074010771833, "grad_norm": 4.6875, "learning_rate": 6.794215223884447e-06, "loss": 0.88036022, "memory(GiB)": 728.98, "step": 32640, "train_speed(iter/s)": 0.187045 }, { "acc": 0.77642422, "epoch": 0.8281342404462209, "grad_norm": 2.953125, "learning_rate": 6.793236375136781e-06, "loss": 0.90271053, "memory(GiB)": 728.98, "step": 32645, "train_speed(iter/s)": 0.186997 }, { "acc": 0.76924653, "epoch": 0.8282610798152584, "grad_norm": 3.078125, "learning_rate": 6.7922574475103616e-06, "loss": 0.87922344, "memory(GiB)": 728.98, "step": 32650, "train_speed(iter/s)": 0.186954 }, { "acc": 0.75979376, "epoch": 0.828387919184296, "grad_norm": 3.359375, "learning_rate": 6.791278441048246e-06, "loss": 0.94884148, "memory(GiB)": 728.98, "step": 32655, "train_speed(iter/s)": 0.186908 }, { "acc": 0.76325727, "epoch": 0.8285147585533336, "grad_norm": 4.125, "learning_rate": 6.790299355793501e-06, "loss": 0.93510618, "memory(GiB)": 728.98, "step": 32660, "train_speed(iter/s)": 0.186866 }, { "acc": 0.75629377, "epoch": 0.8286415979223711, "grad_norm": 3.03125, "learning_rate": 6.7893201917891895e-06, "loss": 0.95024376, "memory(GiB)": 728.98, "step": 32665, "train_speed(iter/s)": 0.186824 }, { "acc": 0.76598048, "epoch": 0.8287684372914087, "grad_norm": 3.546875, "learning_rate": 6.7883409490783845e-06, "loss": 0.87815914, "memory(GiB)": 728.98, "step": 32670, "train_speed(iter/s)": 0.186783 }, { "acc": 0.78525467, "epoch": 0.8288952766604463, "grad_norm": 3.5, "learning_rate": 6.787361627704159e-06, "loss": 0.82657957, "memory(GiB)": 728.98, "step": 32675, "train_speed(iter/s)": 0.186741 }, { "acc": 0.75955725, "epoch": 0.8290221160294838, "grad_norm": 5.96875, "learning_rate": 6.786382227709592e-06, "loss": 0.94149513, "memory(GiB)": 728.98, "step": 32680, "train_speed(iter/s)": 0.186693 }, { "acc": 0.76786761, "epoch": 0.8291489553985214, "grad_norm": 3.375, "learning_rate": 6.78540274913776e-06, "loss": 0.87201195, "memory(GiB)": 728.98, "step": 32685, "train_speed(iter/s)": 0.186647 }, { "acc": 0.76351733, "epoch": 0.829275794767559, "grad_norm": 3.515625, "learning_rate": 6.78442319203175e-06, "loss": 0.88161154, "memory(GiB)": 728.98, "step": 32690, "train_speed(iter/s)": 0.186607 }, { "acc": 0.74891286, "epoch": 0.8294026341365965, "grad_norm": 4.21875, "learning_rate": 6.783443556434651e-06, "loss": 0.93326588, "memory(GiB)": 728.98, "step": 32695, "train_speed(iter/s)": 0.186566 }, { "acc": 0.76187377, "epoch": 0.829529473505634, "grad_norm": 3.390625, "learning_rate": 6.782463842389552e-06, "loss": 0.90306025, "memory(GiB)": 728.98, "step": 32700, "train_speed(iter/s)": 0.186524 }, { "acc": 0.77053208, "epoch": 0.8296563128746716, "grad_norm": 3.171875, "learning_rate": 6.781484049939547e-06, "loss": 0.89313183, "memory(GiB)": 728.98, "step": 32705, "train_speed(iter/s)": 0.186477 }, { "acc": 0.76969008, "epoch": 0.8297831522437091, "grad_norm": 3.640625, "learning_rate": 6.780504179127735e-06, "loss": 0.91735039, "memory(GiB)": 728.98, "step": 32710, "train_speed(iter/s)": 0.186435 }, { "acc": 0.7688426, "epoch": 0.8299099916127467, "grad_norm": 4.5, "learning_rate": 6.779524229997218e-06, "loss": 0.88809557, "memory(GiB)": 728.98, "step": 32715, "train_speed(iter/s)": 0.186391 }, { "acc": 0.75509095, "epoch": 0.8300368309817843, "grad_norm": 3.734375, "learning_rate": 6.7785442025911e-06, "loss": 0.93371391, "memory(GiB)": 728.98, "step": 32720, "train_speed(iter/s)": 0.186352 }, { "acc": 0.75284986, "epoch": 0.8301636703508218, "grad_norm": 4.15625, "learning_rate": 6.777564096952488e-06, "loss": 0.88858624, "memory(GiB)": 728.98, "step": 32725, "train_speed(iter/s)": 0.186311 }, { "acc": 0.77922196, "epoch": 0.8302905097198594, "grad_norm": 3.28125, "learning_rate": 6.776583913124497e-06, "loss": 0.83570232, "memory(GiB)": 728.98, "step": 32730, "train_speed(iter/s)": 0.186268 }, { "acc": 0.77946749, "epoch": 0.830417349088897, "grad_norm": 3.25, "learning_rate": 6.775603651150238e-06, "loss": 0.83592176, "memory(GiB)": 728.98, "step": 32735, "train_speed(iter/s)": 0.186221 }, { "acc": 0.76870465, "epoch": 0.8305441884579345, "grad_norm": 3.640625, "learning_rate": 6.774623311072833e-06, "loss": 0.86517591, "memory(GiB)": 728.98, "step": 32740, "train_speed(iter/s)": 0.186175 }, { "acc": 0.77616382, "epoch": 0.8306710278269721, "grad_norm": 3.34375, "learning_rate": 6.773642892935402e-06, "loss": 0.90147858, "memory(GiB)": 728.98, "step": 32745, "train_speed(iter/s)": 0.186135 }, { "acc": 0.75384922, "epoch": 0.8307978671960097, "grad_norm": 3.953125, "learning_rate": 6.772662396781074e-06, "loss": 0.92706537, "memory(GiB)": 728.98, "step": 32750, "train_speed(iter/s)": 0.186092 }, { "acc": 0.77077618, "epoch": 0.8309247065650472, "grad_norm": 3.859375, "learning_rate": 6.771681822652971e-06, "loss": 0.90483751, "memory(GiB)": 728.98, "step": 32755, "train_speed(iter/s)": 0.186049 }, { "acc": 0.76602569, "epoch": 0.8310515459340847, "grad_norm": 3.625, "learning_rate": 6.770701170594233e-06, "loss": 0.89879904, "memory(GiB)": 728.98, "step": 32760, "train_speed(iter/s)": 0.186007 }, { "acc": 0.75616488, "epoch": 0.8311783853031223, "grad_norm": 3.09375, "learning_rate": 6.769720440647991e-06, "loss": 0.88824215, "memory(GiB)": 728.98, "step": 32765, "train_speed(iter/s)": 0.185961 }, { "acc": 0.75221505, "epoch": 0.8313052246721598, "grad_norm": 3.453125, "learning_rate": 6.768739632857386e-06, "loss": 0.9196475, "memory(GiB)": 728.98, "step": 32770, "train_speed(iter/s)": 0.185921 }, { "acc": 0.76333694, "epoch": 0.8314320640411974, "grad_norm": 3.0625, "learning_rate": 6.76775874726556e-06, "loss": 0.89509773, "memory(GiB)": 728.98, "step": 32775, "train_speed(iter/s)": 0.185875 }, { "acc": 0.77008495, "epoch": 0.831558903410235, "grad_norm": 4.65625, "learning_rate": 6.766777783915658e-06, "loss": 0.89320011, "memory(GiB)": 728.98, "step": 32780, "train_speed(iter/s)": 0.185822 }, { "acc": 0.76142149, "epoch": 0.8316857427792725, "grad_norm": 3.359375, "learning_rate": 6.765796742850832e-06, "loss": 0.84503431, "memory(GiB)": 728.98, "step": 32785, "train_speed(iter/s)": 0.185767 }, { "acc": 0.76626325, "epoch": 0.8318125821483101, "grad_norm": 3.609375, "learning_rate": 6.764815624114232e-06, "loss": 0.9125247, "memory(GiB)": 728.98, "step": 32790, "train_speed(iter/s)": 0.185726 }, { "acc": 0.77165952, "epoch": 0.8319394215173477, "grad_norm": 3.171875, "learning_rate": 6.763834427749018e-06, "loss": 0.87355986, "memory(GiB)": 728.98, "step": 32795, "train_speed(iter/s)": 0.185685 }, { "acc": 0.75657206, "epoch": 0.8320662608863852, "grad_norm": 3.3125, "learning_rate": 6.7628531537983445e-06, "loss": 0.89071741, "memory(GiB)": 728.98, "step": 32800, "train_speed(iter/s)": 0.185647 }, { "acc": 0.76009221, "epoch": 0.8321931002554228, "grad_norm": 4.0625, "learning_rate": 6.7618718023053795e-06, "loss": 0.91441736, "memory(GiB)": 728.98, "step": 32805, "train_speed(iter/s)": 0.185607 }, { "acc": 0.76626496, "epoch": 0.8323199396244604, "grad_norm": 3.125, "learning_rate": 6.760890373313287e-06, "loss": 0.8710165, "memory(GiB)": 728.98, "step": 32810, "train_speed(iter/s)": 0.185563 }, { "acc": 0.76872773, "epoch": 0.832446778993498, "grad_norm": 10.0625, "learning_rate": 6.759908866865238e-06, "loss": 0.91554575, "memory(GiB)": 728.98, "step": 32815, "train_speed(iter/s)": 0.185516 }, { "acc": 0.77080793, "epoch": 0.8325736183625354, "grad_norm": 3.453125, "learning_rate": 6.758927283004406e-06, "loss": 0.91969147, "memory(GiB)": 728.98, "step": 32820, "train_speed(iter/s)": 0.185472 }, { "acc": 0.76179595, "epoch": 0.832700457731573, "grad_norm": 3.703125, "learning_rate": 6.757945621773965e-06, "loss": 0.92560453, "memory(GiB)": 728.98, "step": 32825, "train_speed(iter/s)": 0.185436 }, { "acc": 0.76959248, "epoch": 0.8328272971006105, "grad_norm": 3.640625, "learning_rate": 6.756963883217099e-06, "loss": 0.91585064, "memory(GiB)": 728.98, "step": 32830, "train_speed(iter/s)": 0.185398 }, { "acc": 0.75995531, "epoch": 0.8329541364696481, "grad_norm": 3.0625, "learning_rate": 6.755982067376987e-06, "loss": 0.92973385, "memory(GiB)": 728.98, "step": 32835, "train_speed(iter/s)": 0.185353 }, { "acc": 0.77509737, "epoch": 0.8330809758386857, "grad_norm": 3.90625, "learning_rate": 6.755000174296822e-06, "loss": 0.81660032, "memory(GiB)": 728.98, "step": 32840, "train_speed(iter/s)": 0.185316 }, { "acc": 0.76370959, "epoch": 0.8332078152077232, "grad_norm": 2.53125, "learning_rate": 6.75401820401979e-06, "loss": 0.89791021, "memory(GiB)": 728.98, "step": 32845, "train_speed(iter/s)": 0.185269 }, { "acc": 0.77168379, "epoch": 0.8333346545767608, "grad_norm": 36.5, "learning_rate": 6.753036156589086e-06, "loss": 0.8793273, "memory(GiB)": 728.98, "step": 32850, "train_speed(iter/s)": 0.185228 }, { "acc": 0.78141832, "epoch": 0.8334614939457984, "grad_norm": 3.984375, "learning_rate": 6.7520540320479076e-06, "loss": 0.84566469, "memory(GiB)": 728.98, "step": 32855, "train_speed(iter/s)": 0.185189 }, { "acc": 0.76417813, "epoch": 0.833588333314836, "grad_norm": 3.203125, "learning_rate": 6.751071830439453e-06, "loss": 0.86643028, "memory(GiB)": 728.98, "step": 32860, "train_speed(iter/s)": 0.185148 }, { "acc": 0.75803533, "epoch": 0.8337151726838735, "grad_norm": 3.21875, "learning_rate": 6.75008955180693e-06, "loss": 0.93584099, "memory(GiB)": 728.98, "step": 32865, "train_speed(iter/s)": 0.185108 }, { "acc": 0.76312423, "epoch": 0.8338420120529111, "grad_norm": 3.046875, "learning_rate": 6.749107196193542e-06, "loss": 0.88739491, "memory(GiB)": 728.98, "step": 32870, "train_speed(iter/s)": 0.18507 }, { "acc": 0.76242824, "epoch": 0.8339688514219487, "grad_norm": 3.4375, "learning_rate": 6.7481247636425016e-06, "loss": 0.91331596, "memory(GiB)": 728.98, "step": 32875, "train_speed(iter/s)": 0.185025 }, { "acc": 0.77236071, "epoch": 0.8340956907909861, "grad_norm": 3.03125, "learning_rate": 6.747142254197024e-06, "loss": 0.88456049, "memory(GiB)": 728.98, "step": 32880, "train_speed(iter/s)": 0.18498 }, { "acc": 0.77308822, "epoch": 0.8342225301600237, "grad_norm": 3.4375, "learning_rate": 6.746159667900324e-06, "loss": 0.85864401, "memory(GiB)": 728.98, "step": 32885, "train_speed(iter/s)": 0.184938 }, { "acc": 0.77984738, "epoch": 0.8343493695290612, "grad_norm": 3.75, "learning_rate": 6.745177004795624e-06, "loss": 0.85319462, "memory(GiB)": 728.98, "step": 32890, "train_speed(iter/s)": 0.184898 }, { "acc": 0.76138401, "epoch": 0.8344762088980988, "grad_norm": 3.3125, "learning_rate": 6.7441942649261485e-06, "loss": 0.86273985, "memory(GiB)": 728.98, "step": 32895, "train_speed(iter/s)": 0.184859 }, { "acc": 0.76121125, "epoch": 0.8346030482671364, "grad_norm": 3.921875, "learning_rate": 6.743211448335125e-06, "loss": 0.88412561, "memory(GiB)": 728.98, "step": 32900, "train_speed(iter/s)": 0.184823 }, { "acc": 0.76562972, "epoch": 0.834729887636174, "grad_norm": 3.59375, "learning_rate": 6.742228555065783e-06, "loss": 0.89419889, "memory(GiB)": 728.98, "step": 32905, "train_speed(iter/s)": 0.184773 }, { "acc": 0.75493588, "epoch": 0.8348567270052115, "grad_norm": 3.6875, "learning_rate": 6.741245585161359e-06, "loss": 0.95336046, "memory(GiB)": 728.98, "step": 32910, "train_speed(iter/s)": 0.184732 }, { "acc": 0.7638236, "epoch": 0.8349835663742491, "grad_norm": 3.59375, "learning_rate": 6.7402625386650885e-06, "loss": 0.83073139, "memory(GiB)": 728.98, "step": 32915, "train_speed(iter/s)": 0.184679 }, { "acc": 0.77785192, "epoch": 0.8351104057432867, "grad_norm": 3.234375, "learning_rate": 6.739279415620215e-06, "loss": 0.8153842, "memory(GiB)": 728.98, "step": 32920, "train_speed(iter/s)": 0.184637 }, { "acc": 0.74788132, "epoch": 0.8352372451123242, "grad_norm": 3.125, "learning_rate": 6.73829621606998e-06, "loss": 0.95654211, "memory(GiB)": 728.98, "step": 32925, "train_speed(iter/s)": 0.184599 }, { "acc": 0.75615973, "epoch": 0.8353640844813618, "grad_norm": 3.53125, "learning_rate": 6.737312940057635e-06, "loss": 0.89223757, "memory(GiB)": 728.98, "step": 32930, "train_speed(iter/s)": 0.184555 }, { "acc": 0.76361055, "epoch": 0.8354909238503994, "grad_norm": 3.703125, "learning_rate": 6.7363295876264275e-06, "loss": 0.91746931, "memory(GiB)": 728.98, "step": 32935, "train_speed(iter/s)": 0.184515 }, { "acc": 0.76539006, "epoch": 0.8356177632194368, "grad_norm": 3.90625, "learning_rate": 6.735346158819616e-06, "loss": 0.90841694, "memory(GiB)": 728.98, "step": 32940, "train_speed(iter/s)": 0.184472 }, { "acc": 0.77173462, "epoch": 0.8357446025884744, "grad_norm": 4.03125, "learning_rate": 6.734362653680454e-06, "loss": 0.87890596, "memory(GiB)": 728.98, "step": 32945, "train_speed(iter/s)": 0.184431 }, { "acc": 0.77297177, "epoch": 0.835871441957512, "grad_norm": 3.640625, "learning_rate": 6.733379072252205e-06, "loss": 0.90001307, "memory(GiB)": 728.98, "step": 32950, "train_speed(iter/s)": 0.184387 }, { "acc": 0.76801696, "epoch": 0.8359982813265495, "grad_norm": 4.28125, "learning_rate": 6.7323954145781345e-06, "loss": 0.85038033, "memory(GiB)": 728.98, "step": 32955, "train_speed(iter/s)": 0.18435 }, { "acc": 0.77114296, "epoch": 0.8361251206955871, "grad_norm": 3.703125, "learning_rate": 6.731411680701508e-06, "loss": 0.88845911, "memory(GiB)": 728.98, "step": 32960, "train_speed(iter/s)": 0.184312 }, { "acc": 0.77643929, "epoch": 0.8362519600646247, "grad_norm": 3.140625, "learning_rate": 6.7304278706656015e-06, "loss": 0.87826109, "memory(GiB)": 728.98, "step": 32965, "train_speed(iter/s)": 0.184274 }, { "acc": 0.78336415, "epoch": 0.8363787994336622, "grad_norm": 3.3125, "learning_rate": 6.729443984513683e-06, "loss": 0.84450779, "memory(GiB)": 728.98, "step": 32970, "train_speed(iter/s)": 0.184222 }, { "acc": 0.77126403, "epoch": 0.8365056388026998, "grad_norm": 4.34375, "learning_rate": 6.728460022289035e-06, "loss": 0.87572899, "memory(GiB)": 728.98, "step": 32975, "train_speed(iter/s)": 0.18418 }, { "acc": 0.76088142, "epoch": 0.8366324781717374, "grad_norm": 2.796875, "learning_rate": 6.727475984034938e-06, "loss": 0.90803537, "memory(GiB)": 728.98, "step": 32980, "train_speed(iter/s)": 0.184137 }, { "acc": 0.77042742, "epoch": 0.8367593175407749, "grad_norm": 5.03125, "learning_rate": 6.726491869794676e-06, "loss": 0.89778748, "memory(GiB)": 728.98, "step": 32985, "train_speed(iter/s)": 0.184099 }, { "acc": 0.74803081, "epoch": 0.8368861569098125, "grad_norm": 3.875, "learning_rate": 6.725507679611538e-06, "loss": 0.95105381, "memory(GiB)": 728.98, "step": 32990, "train_speed(iter/s)": 0.184053 }, { "acc": 0.74626436, "epoch": 0.8370129962788501, "grad_norm": 3.65625, "learning_rate": 6.724523413528815e-06, "loss": 0.92290325, "memory(GiB)": 728.98, "step": 32995, "train_speed(iter/s)": 0.18401 }, { "acc": 0.76844287, "epoch": 0.8371398356478875, "grad_norm": 4.28125, "learning_rate": 6.723539071589802e-06, "loss": 0.8842452, "memory(GiB)": 728.98, "step": 33000, "train_speed(iter/s)": 0.183972 }, { "epoch": 0.8371398356478875, "eval_acc": 0.7563781579553132, "eval_loss": 0.8627378344535828, "eval_runtime": 1151.1007, "eval_samples_per_second": 5.534, "eval_steps_per_second": 5.534, "step": 33000 }, { "acc": 0.76356654, "epoch": 0.8372666750169251, "grad_norm": 4.21875, "learning_rate": 6.7225546538377964e-06, "loss": 0.90795956, "memory(GiB)": 728.98, "step": 33005, "train_speed(iter/s)": 0.18203 }, { "acc": 0.77634211, "epoch": 0.8373935143859627, "grad_norm": 3.53125, "learning_rate": 6.7215701603160995e-06, "loss": 0.91928968, "memory(GiB)": 728.98, "step": 33010, "train_speed(iter/s)": 0.181984 }, { "acc": 0.76994748, "epoch": 0.8375203537550002, "grad_norm": 3.40625, "learning_rate": 6.720585591068018e-06, "loss": 0.85794678, "memory(GiB)": 728.98, "step": 33015, "train_speed(iter/s)": 0.18194 }, { "acc": 0.76760511, "epoch": 0.8376471931240378, "grad_norm": 3.171875, "learning_rate": 6.7196009461368575e-06, "loss": 0.8968133, "memory(GiB)": 728.98, "step": 33020, "train_speed(iter/s)": 0.181897 }, { "acc": 0.75043116, "epoch": 0.8377740324930754, "grad_norm": 3.796875, "learning_rate": 6.71861622556593e-06, "loss": 0.96773319, "memory(GiB)": 728.98, "step": 33025, "train_speed(iter/s)": 0.181861 }, { "acc": 0.78326817, "epoch": 0.8379008718621129, "grad_norm": 2.890625, "learning_rate": 6.717631429398551e-06, "loss": 0.82026396, "memory(GiB)": 728.98, "step": 33030, "train_speed(iter/s)": 0.181825 }, { "acc": 0.7745893, "epoch": 0.8380277112311505, "grad_norm": 3.625, "learning_rate": 6.716646557678039e-06, "loss": 0.85211973, "memory(GiB)": 728.98, "step": 33035, "train_speed(iter/s)": 0.181792 }, { "acc": 0.75403156, "epoch": 0.8381545506001881, "grad_norm": 3.453125, "learning_rate": 6.715661610447714e-06, "loss": 0.93465433, "memory(GiB)": 728.98, "step": 33040, "train_speed(iter/s)": 0.181754 }, { "acc": 0.75623145, "epoch": 0.8382813899692256, "grad_norm": 3.453125, "learning_rate": 6.714676587750902e-06, "loss": 0.91984634, "memory(GiB)": 728.98, "step": 33045, "train_speed(iter/s)": 0.181718 }, { "acc": 0.76279707, "epoch": 0.8384082293382632, "grad_norm": 5.03125, "learning_rate": 6.713691489630927e-06, "loss": 0.8974287, "memory(GiB)": 728.98, "step": 33050, "train_speed(iter/s)": 0.181676 }, { "acc": 0.76566725, "epoch": 0.8385350687073008, "grad_norm": 3.234375, "learning_rate": 6.7127063161311255e-06, "loss": 0.87180166, "memory(GiB)": 728.98, "step": 33055, "train_speed(iter/s)": 0.18164 }, { "acc": 0.77123308, "epoch": 0.8386619080763382, "grad_norm": 3.609375, "learning_rate": 6.711721067294832e-06, "loss": 0.85595264, "memory(GiB)": 728.98, "step": 33060, "train_speed(iter/s)": 0.181604 }, { "acc": 0.77324433, "epoch": 0.8387887474453758, "grad_norm": 3.234375, "learning_rate": 6.7107357431653795e-06, "loss": 0.89470739, "memory(GiB)": 728.98, "step": 33065, "train_speed(iter/s)": 0.181563 }, { "acc": 0.77453871, "epoch": 0.8389155868144134, "grad_norm": 3.484375, "learning_rate": 6.709750343786115e-06, "loss": 0.87503195, "memory(GiB)": 728.98, "step": 33070, "train_speed(iter/s)": 0.181514 }, { "acc": 0.76857257, "epoch": 0.8390424261834509, "grad_norm": 3.15625, "learning_rate": 6.708764869200378e-06, "loss": 0.86620312, "memory(GiB)": 728.98, "step": 33075, "train_speed(iter/s)": 0.181471 }, { "acc": 0.77238431, "epoch": 0.8391692655524885, "grad_norm": 3.328125, "learning_rate": 6.707779319451521e-06, "loss": 0.89070988, "memory(GiB)": 728.98, "step": 33080, "train_speed(iter/s)": 0.18143 }, { "acc": 0.76236157, "epoch": 0.8392961049215261, "grad_norm": 3.75, "learning_rate": 6.706793694582892e-06, "loss": 0.89724941, "memory(GiB)": 728.98, "step": 33085, "train_speed(iter/s)": 0.181388 }, { "acc": 0.77186708, "epoch": 0.8394229442905636, "grad_norm": 3.8125, "learning_rate": 6.705807994637847e-06, "loss": 0.9188261, "memory(GiB)": 728.98, "step": 33090, "train_speed(iter/s)": 0.181351 }, { "acc": 0.7687326, "epoch": 0.8395497836596012, "grad_norm": 3.34375, "learning_rate": 6.704822219659742e-06, "loss": 0.8897728, "memory(GiB)": 728.98, "step": 33095, "train_speed(iter/s)": 0.18131 }, { "acc": 0.75344739, "epoch": 0.8396766230286388, "grad_norm": 3.515625, "learning_rate": 6.703836369691941e-06, "loss": 0.95724573, "memory(GiB)": 728.98, "step": 33100, "train_speed(iter/s)": 0.181267 }, { "acc": 0.77627263, "epoch": 0.8398034623976763, "grad_norm": 4.0625, "learning_rate": 6.702850444777807e-06, "loss": 0.83681393, "memory(GiB)": 728.98, "step": 33105, "train_speed(iter/s)": 0.18123 }, { "acc": 0.75398631, "epoch": 0.8399303017667139, "grad_norm": 3.953125, "learning_rate": 6.701864444960706e-06, "loss": 0.92609396, "memory(GiB)": 728.98, "step": 33110, "train_speed(iter/s)": 0.181196 }, { "acc": 0.75744915, "epoch": 0.8400571411357515, "grad_norm": 7.21875, "learning_rate": 6.700878370284012e-06, "loss": 0.89799862, "memory(GiB)": 728.98, "step": 33115, "train_speed(iter/s)": 0.18115 }, { "acc": 0.77088494, "epoch": 0.8401839805047889, "grad_norm": 3.734375, "learning_rate": 6.699892220791098e-06, "loss": 0.86898355, "memory(GiB)": 728.98, "step": 33120, "train_speed(iter/s)": 0.181111 }, { "acc": 0.77627268, "epoch": 0.8403108198738265, "grad_norm": 5.40625, "learning_rate": 6.69890599652534e-06, "loss": 0.89484854, "memory(GiB)": 728.98, "step": 33125, "train_speed(iter/s)": 0.181074 }, { "acc": 0.78097715, "epoch": 0.8404376592428641, "grad_norm": 3.359375, "learning_rate": 6.6979196975301194e-06, "loss": 0.88863373, "memory(GiB)": 728.98, "step": 33130, "train_speed(iter/s)": 0.181034 }, { "acc": 0.77999554, "epoch": 0.8405644986119016, "grad_norm": 3.421875, "learning_rate": 6.696933323848824e-06, "loss": 0.83502913, "memory(GiB)": 728.98, "step": 33135, "train_speed(iter/s)": 0.180996 }, { "acc": 0.7605895, "epoch": 0.8406913379809392, "grad_norm": 3.734375, "learning_rate": 6.695946875524837e-06, "loss": 0.93405123, "memory(GiB)": 728.98, "step": 33140, "train_speed(iter/s)": 0.180956 }, { "acc": 0.76785097, "epoch": 0.8408181773499768, "grad_norm": 4.0, "learning_rate": 6.6949603526015505e-06, "loss": 0.88710279, "memory(GiB)": 728.98, "step": 33145, "train_speed(iter/s)": 0.180919 }, { "acc": 0.76248369, "epoch": 0.8409450167190143, "grad_norm": 3.65625, "learning_rate": 6.693973755122359e-06, "loss": 0.90336704, "memory(GiB)": 728.98, "step": 33150, "train_speed(iter/s)": 0.180885 }, { "acc": 0.75988536, "epoch": 0.8410718560880519, "grad_norm": 3.265625, "learning_rate": 6.692987083130659e-06, "loss": 0.89472141, "memory(GiB)": 728.98, "step": 33155, "train_speed(iter/s)": 0.180846 }, { "acc": 0.76552567, "epoch": 0.8411986954570895, "grad_norm": 3.21875, "learning_rate": 6.6920003366698514e-06, "loss": 0.89541636, "memory(GiB)": 728.98, "step": 33160, "train_speed(iter/s)": 0.180804 }, { "acc": 0.77622857, "epoch": 0.841325534826127, "grad_norm": 3.09375, "learning_rate": 6.69101351578334e-06, "loss": 0.86651764, "memory(GiB)": 728.98, "step": 33165, "train_speed(iter/s)": 0.180756 }, { "acc": 0.77094507, "epoch": 0.8414523741951646, "grad_norm": 3.78125, "learning_rate": 6.690026620514533e-06, "loss": 0.95174046, "memory(GiB)": 728.98, "step": 33170, "train_speed(iter/s)": 0.180722 }, { "acc": 0.75384154, "epoch": 0.8415792135642022, "grad_norm": 3.578125, "learning_rate": 6.689039650906838e-06, "loss": 0.9096899, "memory(GiB)": 728.98, "step": 33175, "train_speed(iter/s)": 0.180687 }, { "acc": 0.76077962, "epoch": 0.8417060529332396, "grad_norm": 3.796875, "learning_rate": 6.688052607003672e-06, "loss": 0.87898092, "memory(GiB)": 728.98, "step": 33180, "train_speed(iter/s)": 0.180655 }, { "acc": 0.75484505, "epoch": 0.8418328923022772, "grad_norm": 3.65625, "learning_rate": 6.68706548884845e-06, "loss": 0.88379402, "memory(GiB)": 728.98, "step": 33185, "train_speed(iter/s)": 0.180613 }, { "acc": 0.77288599, "epoch": 0.8419597316713148, "grad_norm": 3.609375, "learning_rate": 6.68607829648459e-06, "loss": 0.88247347, "memory(GiB)": 728.98, "step": 33190, "train_speed(iter/s)": 0.180579 }, { "acc": 0.76939244, "epoch": 0.8420865710403523, "grad_norm": 3.40625, "learning_rate": 6.685091029955522e-06, "loss": 0.88785992, "memory(GiB)": 728.98, "step": 33195, "train_speed(iter/s)": 0.180542 }, { "acc": 0.75452509, "epoch": 0.8422134104093899, "grad_norm": 3.546875, "learning_rate": 6.684103689304665e-06, "loss": 0.97193909, "memory(GiB)": 728.98, "step": 33200, "train_speed(iter/s)": 0.180504 }, { "acc": 0.77810531, "epoch": 0.8423402497784275, "grad_norm": 3.09375, "learning_rate": 6.6831162745754555e-06, "loss": 0.84152622, "memory(GiB)": 728.98, "step": 33205, "train_speed(iter/s)": 0.180461 }, { "acc": 0.76736455, "epoch": 0.842467089147465, "grad_norm": 3.921875, "learning_rate": 6.682128785811322e-06, "loss": 0.87049561, "memory(GiB)": 728.98, "step": 33210, "train_speed(iter/s)": 0.180425 }, { "acc": 0.76654797, "epoch": 0.8425939285165026, "grad_norm": 4.84375, "learning_rate": 6.681141223055703e-06, "loss": 0.91852913, "memory(GiB)": 728.98, "step": 33215, "train_speed(iter/s)": 0.180393 }, { "acc": 0.75900955, "epoch": 0.8427207678855402, "grad_norm": 3.28125, "learning_rate": 6.680153586352037e-06, "loss": 0.89843254, "memory(GiB)": 728.98, "step": 33220, "train_speed(iter/s)": 0.180352 }, { "acc": 0.7674314, "epoch": 0.8428476072545777, "grad_norm": 4.1875, "learning_rate": 6.679165875743771e-06, "loss": 0.90417471, "memory(GiB)": 728.98, "step": 33225, "train_speed(iter/s)": 0.180314 }, { "acc": 0.7684835, "epoch": 0.8429744466236153, "grad_norm": 4.4375, "learning_rate": 6.6781780912743465e-06, "loss": 0.91669254, "memory(GiB)": 728.98, "step": 33230, "train_speed(iter/s)": 0.180281 }, { "acc": 0.78642335, "epoch": 0.8431012859926529, "grad_norm": 3.4375, "learning_rate": 6.677190232987214e-06, "loss": 0.83224449, "memory(GiB)": 728.98, "step": 33235, "train_speed(iter/s)": 0.18025 }, { "acc": 0.7818346, "epoch": 0.8432281253616903, "grad_norm": 3.296875, "learning_rate": 6.676202300925828e-06, "loss": 0.84547768, "memory(GiB)": 728.98, "step": 33240, "train_speed(iter/s)": 0.180212 }, { "acc": 0.76434965, "epoch": 0.8433549647307279, "grad_norm": 3.71875, "learning_rate": 6.675214295133643e-06, "loss": 0.92803679, "memory(GiB)": 728.98, "step": 33245, "train_speed(iter/s)": 0.180179 }, { "acc": 0.78100939, "epoch": 0.8434818040997655, "grad_norm": 3.328125, "learning_rate": 6.674226215654119e-06, "loss": 0.81597385, "memory(GiB)": 728.98, "step": 33250, "train_speed(iter/s)": 0.180141 }, { "acc": 0.76942759, "epoch": 0.843608643468803, "grad_norm": 3.90625, "learning_rate": 6.673238062530718e-06, "loss": 0.86620455, "memory(GiB)": 728.98, "step": 33255, "train_speed(iter/s)": 0.180103 }, { "acc": 0.77764678, "epoch": 0.8437354828378406, "grad_norm": 3.140625, "learning_rate": 6.672249835806906e-06, "loss": 0.84499588, "memory(GiB)": 728.98, "step": 33260, "train_speed(iter/s)": 0.180061 }, { "acc": 0.76824651, "epoch": 0.8438623222068782, "grad_norm": 3.28125, "learning_rate": 6.671261535526152e-06, "loss": 0.84208193, "memory(GiB)": 728.98, "step": 33265, "train_speed(iter/s)": 0.180012 }, { "acc": 0.75033302, "epoch": 0.8439891615759157, "grad_norm": 4.0, "learning_rate": 6.670273161731929e-06, "loss": 0.94582748, "memory(GiB)": 728.98, "step": 33270, "train_speed(iter/s)": 0.179975 }, { "acc": 0.76813984, "epoch": 0.8441160009449533, "grad_norm": 4.0625, "learning_rate": 6.66928471446771e-06, "loss": 0.92413368, "memory(GiB)": 728.98, "step": 33275, "train_speed(iter/s)": 0.179943 }, { "acc": 0.76756091, "epoch": 0.8442428403139909, "grad_norm": 3.65625, "learning_rate": 6.668296193776974e-06, "loss": 0.90020189, "memory(GiB)": 728.98, "step": 33280, "train_speed(iter/s)": 0.179905 }, { "acc": 0.75719366, "epoch": 0.8443696796830285, "grad_norm": 3.265625, "learning_rate": 6.667307599703206e-06, "loss": 0.89920454, "memory(GiB)": 728.98, "step": 33285, "train_speed(iter/s)": 0.179864 }, { "acc": 0.78023419, "epoch": 0.844496519052066, "grad_norm": 3.03125, "learning_rate": 6.666318932289888e-06, "loss": 0.8423934, "memory(GiB)": 728.98, "step": 33290, "train_speed(iter/s)": 0.179825 }, { "acc": 0.76658478, "epoch": 0.8446233584211036, "grad_norm": 3.5, "learning_rate": 6.66533019158051e-06, "loss": 0.8910346, "memory(GiB)": 728.98, "step": 33295, "train_speed(iter/s)": 0.17979 }, { "acc": 0.75898714, "epoch": 0.844750197790141, "grad_norm": 3.84375, "learning_rate": 6.664341377618563e-06, "loss": 0.94379396, "memory(GiB)": 728.98, "step": 33300, "train_speed(iter/s)": 0.179756 }, { "acc": 0.77056923, "epoch": 0.8448770371591786, "grad_norm": 3.125, "learning_rate": 6.663352490447541e-06, "loss": 0.86450214, "memory(GiB)": 728.98, "step": 33305, "train_speed(iter/s)": 0.179717 }, { "acc": 0.76889143, "epoch": 0.8450038765282162, "grad_norm": 4.0, "learning_rate": 6.662363530110945e-06, "loss": 0.89750576, "memory(GiB)": 728.98, "step": 33310, "train_speed(iter/s)": 0.179681 }, { "acc": 0.76077147, "epoch": 0.8451307158972537, "grad_norm": 3.109375, "learning_rate": 6.6613744966522705e-06, "loss": 0.87374411, "memory(GiB)": 728.98, "step": 33315, "train_speed(iter/s)": 0.179642 }, { "acc": 0.75897322, "epoch": 0.8452575552662913, "grad_norm": 3.25, "learning_rate": 6.6603853901150274e-06, "loss": 0.91100988, "memory(GiB)": 728.98, "step": 33320, "train_speed(iter/s)": 0.179601 }, { "acc": 0.76370993, "epoch": 0.8453843946353289, "grad_norm": 3.359375, "learning_rate": 6.659396210542721e-06, "loss": 0.89774675, "memory(GiB)": 728.98, "step": 33325, "train_speed(iter/s)": 0.179564 }, { "acc": 0.77694101, "epoch": 0.8455112340043665, "grad_norm": 4.6875, "learning_rate": 6.658406957978862e-06, "loss": 0.85317421, "memory(GiB)": 728.98, "step": 33330, "train_speed(iter/s)": 0.179525 }, { "acc": 0.76576843, "epoch": 0.845638073373404, "grad_norm": 3.34375, "learning_rate": 6.657417632466965e-06, "loss": 0.88124781, "memory(GiB)": 728.98, "step": 33335, "train_speed(iter/s)": 0.179479 }, { "acc": 0.77016463, "epoch": 0.8457649127424416, "grad_norm": 3.0625, "learning_rate": 6.6564282340505484e-06, "loss": 0.8447567, "memory(GiB)": 728.98, "step": 33340, "train_speed(iter/s)": 0.179439 }, { "acc": 0.7503396, "epoch": 0.8458917521114792, "grad_norm": 3.125, "learning_rate": 6.65543876277313e-06, "loss": 0.9314085, "memory(GiB)": 728.98, "step": 33345, "train_speed(iter/s)": 0.179405 }, { "acc": 0.76696544, "epoch": 0.8460185914805167, "grad_norm": 3.5, "learning_rate": 6.654449218678236e-06, "loss": 0.87903814, "memory(GiB)": 728.98, "step": 33350, "train_speed(iter/s)": 0.179367 }, { "acc": 0.75942707, "epoch": 0.8461454308495543, "grad_norm": 3.671875, "learning_rate": 6.653459601809392e-06, "loss": 0.93913622, "memory(GiB)": 728.98, "step": 33355, "train_speed(iter/s)": 0.179331 }, { "acc": 0.78059845, "epoch": 0.8462722702185917, "grad_norm": 3.296875, "learning_rate": 6.652469912210128e-06, "loss": 0.8755455, "memory(GiB)": 728.98, "step": 33360, "train_speed(iter/s)": 0.179287 }, { "acc": 0.76547809, "epoch": 0.8463991095876293, "grad_norm": 3.53125, "learning_rate": 6.651480149923978e-06, "loss": 0.89446163, "memory(GiB)": 728.98, "step": 33365, "train_speed(iter/s)": 0.17925 }, { "acc": 0.76809754, "epoch": 0.8465259489566669, "grad_norm": 3.453125, "learning_rate": 6.650490314994478e-06, "loss": 0.87380409, "memory(GiB)": 728.98, "step": 33370, "train_speed(iter/s)": 0.17921 }, { "acc": 0.76422424, "epoch": 0.8466527883257045, "grad_norm": 4.21875, "learning_rate": 6.649500407465168e-06, "loss": 0.94789419, "memory(GiB)": 728.98, "step": 33375, "train_speed(iter/s)": 0.179176 }, { "acc": 0.75565305, "epoch": 0.846779627694742, "grad_norm": 2.9375, "learning_rate": 6.64851042737959e-06, "loss": 0.87248383, "memory(GiB)": 728.98, "step": 33380, "train_speed(iter/s)": 0.179133 }, { "acc": 0.77343764, "epoch": 0.8469064670637796, "grad_norm": 3.15625, "learning_rate": 6.647520374781291e-06, "loss": 0.90038681, "memory(GiB)": 728.98, "step": 33385, "train_speed(iter/s)": 0.179097 }, { "acc": 0.75922785, "epoch": 0.8470333064328172, "grad_norm": 4.0625, "learning_rate": 6.64653024971382e-06, "loss": 0.93814898, "memory(GiB)": 728.98, "step": 33390, "train_speed(iter/s)": 0.17906 }, { "acc": 0.77655025, "epoch": 0.8471601458018547, "grad_norm": 4.0, "learning_rate": 6.64554005222073e-06, "loss": 0.89398251, "memory(GiB)": 728.98, "step": 33395, "train_speed(iter/s)": 0.179026 }, { "acc": 0.76803069, "epoch": 0.8472869851708923, "grad_norm": 3.171875, "learning_rate": 6.644549782345576e-06, "loss": 0.87681074, "memory(GiB)": 728.98, "step": 33400, "train_speed(iter/s)": 0.178989 }, { "acc": 0.76480503, "epoch": 0.8474138245399299, "grad_norm": 3.78125, "learning_rate": 6.643559440131917e-06, "loss": 0.88841352, "memory(GiB)": 728.98, "step": 33405, "train_speed(iter/s)": 0.178954 }, { "acc": 0.77273645, "epoch": 0.8475406639089674, "grad_norm": 3.84375, "learning_rate": 6.642569025623316e-06, "loss": 0.90237694, "memory(GiB)": 728.98, "step": 33410, "train_speed(iter/s)": 0.178918 }, { "acc": 0.77389336, "epoch": 0.847667503278005, "grad_norm": 3.375, "learning_rate": 6.641578538863335e-06, "loss": 0.88341761, "memory(GiB)": 728.98, "step": 33415, "train_speed(iter/s)": 0.178878 }, { "acc": 0.74453363, "epoch": 0.8477943426470425, "grad_norm": 2.859375, "learning_rate": 6.640587979895544e-06, "loss": 0.92507534, "memory(GiB)": 728.98, "step": 33420, "train_speed(iter/s)": 0.178838 }, { "acc": 0.77205944, "epoch": 0.84792118201608, "grad_norm": 4.75, "learning_rate": 6.6395973487635155e-06, "loss": 0.91384296, "memory(GiB)": 728.98, "step": 33425, "train_speed(iter/s)": 0.178804 }, { "acc": 0.76037493, "epoch": 0.8480480213851176, "grad_norm": 3.546875, "learning_rate": 6.638606645510826e-06, "loss": 0.90640297, "memory(GiB)": 728.98, "step": 33430, "train_speed(iter/s)": 0.178763 }, { "acc": 0.78345242, "epoch": 0.8481748607541552, "grad_norm": 3.625, "learning_rate": 6.637615870181049e-06, "loss": 0.84911661, "memory(GiB)": 728.98, "step": 33435, "train_speed(iter/s)": 0.178722 }, { "acc": 0.78077774, "epoch": 0.8483017001231927, "grad_norm": 3.328125, "learning_rate": 6.6366250228177684e-06, "loss": 0.88018112, "memory(GiB)": 728.98, "step": 33440, "train_speed(iter/s)": 0.178683 }, { "acc": 0.78652315, "epoch": 0.8484285394922303, "grad_norm": 3.15625, "learning_rate": 6.635634103464568e-06, "loss": 0.77900681, "memory(GiB)": 728.98, "step": 33445, "train_speed(iter/s)": 0.178646 }, { "acc": 0.76451902, "epoch": 0.8485553788612679, "grad_norm": 3.640625, "learning_rate": 6.634643112165033e-06, "loss": 0.87570248, "memory(GiB)": 728.98, "step": 33450, "train_speed(iter/s)": 0.178599 }, { "acc": 0.77847052, "epoch": 0.8486822182303054, "grad_norm": 3.8125, "learning_rate": 6.633652048962757e-06, "loss": 0.81456165, "memory(GiB)": 728.98, "step": 33455, "train_speed(iter/s)": 0.17856 }, { "acc": 0.76526289, "epoch": 0.848809057599343, "grad_norm": 4.375, "learning_rate": 6.632660913901333e-06, "loss": 0.94645424, "memory(GiB)": 728.98, "step": 33460, "train_speed(iter/s)": 0.178522 }, { "acc": 0.77821689, "epoch": 0.8489358969683806, "grad_norm": 3.703125, "learning_rate": 6.631669707024357e-06, "loss": 0.86017876, "memory(GiB)": 728.98, "step": 33465, "train_speed(iter/s)": 0.178488 }, { "acc": 0.77196789, "epoch": 0.8490627363374181, "grad_norm": 3.296875, "learning_rate": 6.630678428375429e-06, "loss": 0.88953981, "memory(GiB)": 728.98, "step": 33470, "train_speed(iter/s)": 0.178452 }, { "acc": 0.76630316, "epoch": 0.8491895757064557, "grad_norm": 3.9375, "learning_rate": 6.629687077998154e-06, "loss": 0.89265852, "memory(GiB)": 728.98, "step": 33475, "train_speed(iter/s)": 0.178417 }, { "acc": 0.7701262, "epoch": 0.8493164150754932, "grad_norm": 3.65625, "learning_rate": 6.628695655936136e-06, "loss": 0.90287485, "memory(GiB)": 728.98, "step": 33480, "train_speed(iter/s)": 0.178379 }, { "acc": 0.74670382, "epoch": 0.8494432544445307, "grad_norm": 4.25, "learning_rate": 6.6277041622329855e-06, "loss": 1.00445328, "memory(GiB)": 728.98, "step": 33485, "train_speed(iter/s)": 0.178345 }, { "acc": 0.76402636, "epoch": 0.8495700938135683, "grad_norm": 3.046875, "learning_rate": 6.626712596932314e-06, "loss": 0.92532587, "memory(GiB)": 728.98, "step": 33490, "train_speed(iter/s)": 0.178312 }, { "acc": 0.75327997, "epoch": 0.8496969331826059, "grad_norm": 3.859375, "learning_rate": 6.6257209600777396e-06, "loss": 0.95852022, "memory(GiB)": 728.98, "step": 33495, "train_speed(iter/s)": 0.178277 }, { "acc": 0.77821655, "epoch": 0.8498237725516434, "grad_norm": 3.609375, "learning_rate": 6.6247292517128805e-06, "loss": 0.82288408, "memory(GiB)": 728.98, "step": 33500, "train_speed(iter/s)": 0.178242 }, { "epoch": 0.8498237725516434, "eval_acc": 0.756569913150354, "eval_loss": 0.861929714679718, "eval_runtime": 1155.3845, "eval_samples_per_second": 5.513, "eval_steps_per_second": 5.513, "step": 33500 }, { "acc": 0.7695364, "epoch": 0.849950611920681, "grad_norm": 3.34375, "learning_rate": 6.623737471881356e-06, "loss": 0.84571066, "memory(GiB)": 728.98, "step": 33505, "train_speed(iter/s)": 0.176438 }, { "acc": 0.79168272, "epoch": 0.8500774512897186, "grad_norm": 2.921875, "learning_rate": 6.622745620626796e-06, "loss": 0.83320761, "memory(GiB)": 728.98, "step": 33510, "train_speed(iter/s)": 0.176398 }, { "acc": 0.7669095, "epoch": 0.8502042906587561, "grad_norm": 3.0625, "learning_rate": 6.621753697992825e-06, "loss": 0.9162714, "memory(GiB)": 728.98, "step": 33515, "train_speed(iter/s)": 0.176362 }, { "acc": 0.7694963, "epoch": 0.8503311300277937, "grad_norm": 3.609375, "learning_rate": 6.620761704023078e-06, "loss": 0.94209499, "memory(GiB)": 728.98, "step": 33520, "train_speed(iter/s)": 0.176328 }, { "acc": 0.75468416, "epoch": 0.8504579693968313, "grad_norm": 3.25, "learning_rate": 6.619769638761187e-06, "loss": 0.9512105, "memory(GiB)": 728.98, "step": 33525, "train_speed(iter/s)": 0.176286 }, { "acc": 0.76894312, "epoch": 0.8505848087658688, "grad_norm": 3.5625, "learning_rate": 6.6187775022507895e-06, "loss": 0.89902744, "memory(GiB)": 728.98, "step": 33530, "train_speed(iter/s)": 0.17625 }, { "acc": 0.76707587, "epoch": 0.8507116481349064, "grad_norm": 3.28125, "learning_rate": 6.617785294535528e-06, "loss": 0.83433943, "memory(GiB)": 728.98, "step": 33535, "train_speed(iter/s)": 0.176214 }, { "acc": 0.77867479, "epoch": 0.8508384875039439, "grad_norm": 3.46875, "learning_rate": 6.6167930156590455e-06, "loss": 0.82723808, "memory(GiB)": 728.98, "step": 33540, "train_speed(iter/s)": 0.176171 }, { "acc": 0.7634799, "epoch": 0.8509653268729814, "grad_norm": 4.78125, "learning_rate": 6.615800665664991e-06, "loss": 0.90652819, "memory(GiB)": 728.98, "step": 33545, "train_speed(iter/s)": 0.176132 }, { "acc": 0.76930099, "epoch": 0.851092166242019, "grad_norm": 3.46875, "learning_rate": 6.6148082445970104e-06, "loss": 0.88690987, "memory(GiB)": 728.98, "step": 33550, "train_speed(iter/s)": 0.176094 }, { "acc": 0.7689219, "epoch": 0.8512190056110566, "grad_norm": 3.859375, "learning_rate": 6.613815752498763e-06, "loss": 0.89097557, "memory(GiB)": 728.98, "step": 33555, "train_speed(iter/s)": 0.176063 }, { "acc": 0.76755471, "epoch": 0.8513458449800941, "grad_norm": 4.25, "learning_rate": 6.6128231894139016e-06, "loss": 0.84286785, "memory(GiB)": 728.98, "step": 33560, "train_speed(iter/s)": 0.176031 }, { "acc": 0.77310004, "epoch": 0.8514726843491317, "grad_norm": 3.109375, "learning_rate": 6.611830555386088e-06, "loss": 0.84542294, "memory(GiB)": 728.98, "step": 33565, "train_speed(iter/s)": 0.175993 }, { "acc": 0.77722363, "epoch": 0.8515995237181693, "grad_norm": 3.5625, "learning_rate": 6.610837850458982e-06, "loss": 0.82187243, "memory(GiB)": 728.98, "step": 33570, "train_speed(iter/s)": 0.175958 }, { "acc": 0.7581965, "epoch": 0.8517263630872068, "grad_norm": 3.953125, "learning_rate": 6.609845074676252e-06, "loss": 0.9046979, "memory(GiB)": 728.98, "step": 33575, "train_speed(iter/s)": 0.175922 }, { "acc": 0.77389607, "epoch": 0.8518532024562444, "grad_norm": 3.59375, "learning_rate": 6.608852228081568e-06, "loss": 0.86045713, "memory(GiB)": 728.98, "step": 33580, "train_speed(iter/s)": 0.175883 }, { "acc": 0.77418327, "epoch": 0.851980041825282, "grad_norm": 3.328125, "learning_rate": 6.607859310718598e-06, "loss": 0.88594198, "memory(GiB)": 728.98, "step": 33585, "train_speed(iter/s)": 0.175844 }, { "acc": 0.76493602, "epoch": 0.8521068811943195, "grad_norm": 3.8125, "learning_rate": 6.606866322631022e-06, "loss": 0.91310101, "memory(GiB)": 728.98, "step": 33590, "train_speed(iter/s)": 0.1758 }, { "acc": 0.77250113, "epoch": 0.8522337205633571, "grad_norm": 3.40625, "learning_rate": 6.605873263862516e-06, "loss": 0.89446115, "memory(GiB)": 728.98, "step": 33595, "train_speed(iter/s)": 0.175765 }, { "acc": 0.75678415, "epoch": 0.8523605599323946, "grad_norm": 3.234375, "learning_rate": 6.604880134456761e-06, "loss": 0.94894228, "memory(GiB)": 728.98, "step": 33600, "train_speed(iter/s)": 0.175727 }, { "acc": 0.74497137, "epoch": 0.8524873993014321, "grad_norm": 4.0625, "learning_rate": 6.603886934457444e-06, "loss": 0.92791271, "memory(GiB)": 728.98, "step": 33605, "train_speed(iter/s)": 0.175696 }, { "acc": 0.7526782, "epoch": 0.8526142386704697, "grad_norm": 4.4375, "learning_rate": 6.602893663908248e-06, "loss": 0.94225674, "memory(GiB)": 728.98, "step": 33610, "train_speed(iter/s)": 0.175663 }, { "acc": 0.7556673, "epoch": 0.8527410780395073, "grad_norm": 3.265625, "learning_rate": 6.60190032285287e-06, "loss": 0.93614759, "memory(GiB)": 728.98, "step": 33615, "train_speed(iter/s)": 0.175629 }, { "acc": 0.78075662, "epoch": 0.8528679174085448, "grad_norm": 4.8125, "learning_rate": 6.600906911334999e-06, "loss": 0.82332373, "memory(GiB)": 728.98, "step": 33620, "train_speed(iter/s)": 0.175594 }, { "acc": 0.75904741, "epoch": 0.8529947567775824, "grad_norm": 3.203125, "learning_rate": 6.599913429398335e-06, "loss": 0.90723963, "memory(GiB)": 728.98, "step": 33625, "train_speed(iter/s)": 0.175555 }, { "acc": 0.76657228, "epoch": 0.85312159614662, "grad_norm": 3.53125, "learning_rate": 6.598919877086575e-06, "loss": 0.90102129, "memory(GiB)": 728.98, "step": 33630, "train_speed(iter/s)": 0.175522 }, { "acc": 0.77936797, "epoch": 0.8532484355156575, "grad_norm": 3.296875, "learning_rate": 6.597926254443426e-06, "loss": 0.85857334, "memory(GiB)": 728.98, "step": 33635, "train_speed(iter/s)": 0.175489 }, { "acc": 0.77219567, "epoch": 0.8533752748846951, "grad_norm": 3.3125, "learning_rate": 6.59693256151259e-06, "loss": 0.88146677, "memory(GiB)": 728.98, "step": 33640, "train_speed(iter/s)": 0.175455 }, { "acc": 0.76243114, "epoch": 0.8535021142537327, "grad_norm": 4.09375, "learning_rate": 6.595938798337781e-06, "loss": 0.87348356, "memory(GiB)": 728.98, "step": 33645, "train_speed(iter/s)": 0.175419 }, { "acc": 0.76781955, "epoch": 0.8536289536227702, "grad_norm": 3.0625, "learning_rate": 6.594944964962708e-06, "loss": 0.88235884, "memory(GiB)": 728.98, "step": 33650, "train_speed(iter/s)": 0.175386 }, { "acc": 0.7719749, "epoch": 0.8537557929918078, "grad_norm": 3.578125, "learning_rate": 6.593951061431088e-06, "loss": 0.87837229, "memory(GiB)": 728.98, "step": 33655, "train_speed(iter/s)": 0.175355 }, { "acc": 0.7809566, "epoch": 0.8538826323608453, "grad_norm": 3.421875, "learning_rate": 6.592957087786641e-06, "loss": 0.86630859, "memory(GiB)": 728.98, "step": 33660, "train_speed(iter/s)": 0.175315 }, { "acc": 0.76680593, "epoch": 0.8540094717298828, "grad_norm": 4.53125, "learning_rate": 6.591963044073085e-06, "loss": 0.89844379, "memory(GiB)": 728.98, "step": 33665, "train_speed(iter/s)": 0.175282 }, { "acc": 0.76482592, "epoch": 0.8541363110989204, "grad_norm": 3.21875, "learning_rate": 6.590968930334148e-06, "loss": 0.91853476, "memory(GiB)": 728.98, "step": 33670, "train_speed(iter/s)": 0.175245 }, { "acc": 0.7656497, "epoch": 0.854263150467958, "grad_norm": 3.265625, "learning_rate": 6.589974746613557e-06, "loss": 0.87973375, "memory(GiB)": 728.98, "step": 33675, "train_speed(iter/s)": 0.175204 }, { "acc": 0.77929654, "epoch": 0.8543899898369955, "grad_norm": 3.3125, "learning_rate": 6.588980492955043e-06, "loss": 0.87092104, "memory(GiB)": 728.98, "step": 33680, "train_speed(iter/s)": 0.175167 }, { "acc": 0.75647392, "epoch": 0.8545168292060331, "grad_norm": 3.53125, "learning_rate": 6.5879861694023385e-06, "loss": 0.93369961, "memory(GiB)": 728.98, "step": 33685, "train_speed(iter/s)": 0.175132 }, { "acc": 0.77522521, "epoch": 0.8546436685750707, "grad_norm": 3.5, "learning_rate": 6.586991775999184e-06, "loss": 0.8784852, "memory(GiB)": 728.98, "step": 33690, "train_speed(iter/s)": 0.175099 }, { "acc": 0.7564847, "epoch": 0.8547705079441082, "grad_norm": 3.296875, "learning_rate": 6.5859973127893175e-06, "loss": 0.9171833, "memory(GiB)": 728.98, "step": 33695, "train_speed(iter/s)": 0.175067 }, { "acc": 0.77590537, "epoch": 0.8548973473131458, "grad_norm": 3.09375, "learning_rate": 6.585002779816481e-06, "loss": 0.83196678, "memory(GiB)": 728.98, "step": 33700, "train_speed(iter/s)": 0.175032 }, { "acc": 0.77384057, "epoch": 0.8550241866821834, "grad_norm": 4.40625, "learning_rate": 6.5840081771244235e-06, "loss": 0.92141352, "memory(GiB)": 728.98, "step": 33705, "train_speed(iter/s)": 0.174994 }, { "acc": 0.7610971, "epoch": 0.855151026051221, "grad_norm": 2.96875, "learning_rate": 6.583013504756892e-06, "loss": 0.91363964, "memory(GiB)": 728.98, "step": 33710, "train_speed(iter/s)": 0.174958 }, { "acc": 0.75989609, "epoch": 0.8552778654202585, "grad_norm": 4.125, "learning_rate": 6.582018762757641e-06, "loss": 0.9025465, "memory(GiB)": 728.98, "step": 33715, "train_speed(iter/s)": 0.174921 }, { "acc": 0.76386452, "epoch": 0.855404704789296, "grad_norm": 3.90625, "learning_rate": 6.581023951170424e-06, "loss": 0.88963938, "memory(GiB)": 728.98, "step": 33720, "train_speed(iter/s)": 0.17488 }, { "acc": 0.75955067, "epoch": 0.8555315441583335, "grad_norm": 3.25, "learning_rate": 6.580029070039004e-06, "loss": 0.94256363, "memory(GiB)": 728.98, "step": 33725, "train_speed(iter/s)": 0.174848 }, { "acc": 0.75519214, "epoch": 0.8556583835273711, "grad_norm": 3.265625, "learning_rate": 6.579034119407137e-06, "loss": 0.89940653, "memory(GiB)": 728.98, "step": 33730, "train_speed(iter/s)": 0.174804 }, { "acc": 0.76132441, "epoch": 0.8557852228964087, "grad_norm": 3.9375, "learning_rate": 6.578039099318592e-06, "loss": 0.94443846, "memory(GiB)": 728.98, "step": 33735, "train_speed(iter/s)": 0.174773 }, { "acc": 0.76266561, "epoch": 0.8559120622654462, "grad_norm": 3.5, "learning_rate": 6.577044009817133e-06, "loss": 0.87593031, "memory(GiB)": 728.98, "step": 33740, "train_speed(iter/s)": 0.174736 }, { "acc": 0.77124958, "epoch": 0.8560389016344838, "grad_norm": 4.15625, "learning_rate": 6.576048850946533e-06, "loss": 0.86352472, "memory(GiB)": 728.98, "step": 33745, "train_speed(iter/s)": 0.174699 }, { "acc": 0.78060899, "epoch": 0.8561657410035214, "grad_norm": 3.203125, "learning_rate": 6.5750536227505665e-06, "loss": 0.87271833, "memory(GiB)": 728.98, "step": 33750, "train_speed(iter/s)": 0.174663 }, { "acc": 0.76430025, "epoch": 0.856292580372559, "grad_norm": 3.484375, "learning_rate": 6.5740583252730095e-06, "loss": 0.91313763, "memory(GiB)": 728.98, "step": 33755, "train_speed(iter/s)": 0.174627 }, { "acc": 0.77067027, "epoch": 0.8564194197415965, "grad_norm": 4.15625, "learning_rate": 6.573062958557642e-06, "loss": 0.85073671, "memory(GiB)": 728.98, "step": 33760, "train_speed(iter/s)": 0.174586 }, { "acc": 0.76735964, "epoch": 0.8565462591106341, "grad_norm": 2.96875, "learning_rate": 6.572067522648247e-06, "loss": 0.8887311, "memory(GiB)": 728.98, "step": 33765, "train_speed(iter/s)": 0.174548 }, { "acc": 0.7772419, "epoch": 0.8566730984796717, "grad_norm": 3.953125, "learning_rate": 6.571072017588611e-06, "loss": 0.84596043, "memory(GiB)": 728.98, "step": 33770, "train_speed(iter/s)": 0.174514 }, { "acc": 0.78137479, "epoch": 0.8567999378487092, "grad_norm": 3.515625, "learning_rate": 6.570076443422523e-06, "loss": 0.85333424, "memory(GiB)": 728.98, "step": 33775, "train_speed(iter/s)": 0.174476 }, { "acc": 0.7746139, "epoch": 0.8569267772177467, "grad_norm": 3.40625, "learning_rate": 6.5690808001937725e-06, "loss": 0.88989506, "memory(GiB)": 728.98, "step": 33780, "train_speed(iter/s)": 0.174441 }, { "acc": 0.75491848, "epoch": 0.8570536165867843, "grad_norm": 3.171875, "learning_rate": 6.568085087946159e-06, "loss": 0.89854851, "memory(GiB)": 728.98, "step": 33785, "train_speed(iter/s)": 0.174407 }, { "acc": 0.76658459, "epoch": 0.8571804559558218, "grad_norm": 3.140625, "learning_rate": 6.567089306723477e-06, "loss": 0.86056871, "memory(GiB)": 728.98, "step": 33790, "train_speed(iter/s)": 0.174371 }, { "acc": 0.76858044, "epoch": 0.8573072953248594, "grad_norm": 3.078125, "learning_rate": 6.566093456569532e-06, "loss": 0.89653244, "memory(GiB)": 728.98, "step": 33795, "train_speed(iter/s)": 0.17434 }, { "acc": 0.77210035, "epoch": 0.857434134693897, "grad_norm": 3.015625, "learning_rate": 6.565097537528124e-06, "loss": 0.876443, "memory(GiB)": 728.98, "step": 33800, "train_speed(iter/s)": 0.174301 }, { "acc": 0.77331142, "epoch": 0.8575609740629345, "grad_norm": 3.328125, "learning_rate": 6.564101549643062e-06, "loss": 0.85199347, "memory(GiB)": 728.98, "step": 33805, "train_speed(iter/s)": 0.17427 }, { "acc": 0.75820589, "epoch": 0.8576878134319721, "grad_norm": 3.328125, "learning_rate": 6.563105492958158e-06, "loss": 0.96487131, "memory(GiB)": 728.98, "step": 33810, "train_speed(iter/s)": 0.174232 }, { "acc": 0.76218758, "epoch": 0.8578146528010097, "grad_norm": 4.90625, "learning_rate": 6.562109367517222e-06, "loss": 0.94063187, "memory(GiB)": 728.98, "step": 33815, "train_speed(iter/s)": 0.174204 }, { "acc": 0.77220879, "epoch": 0.8579414921700472, "grad_norm": 3.734375, "learning_rate": 6.561113173364073e-06, "loss": 0.8864419, "memory(GiB)": 728.98, "step": 33820, "train_speed(iter/s)": 0.174172 }, { "acc": 0.76005177, "epoch": 0.8580683315390848, "grad_norm": 3.640625, "learning_rate": 6.560116910542529e-06, "loss": 0.86516104, "memory(GiB)": 728.98, "step": 33825, "train_speed(iter/s)": 0.174134 }, { "acc": 0.76986041, "epoch": 0.8581951709081224, "grad_norm": 3.5625, "learning_rate": 6.559120579096413e-06, "loss": 0.89750309, "memory(GiB)": 728.98, "step": 33830, "train_speed(iter/s)": 0.174103 }, { "acc": 0.75158033, "epoch": 0.8583220102771599, "grad_norm": 3.34375, "learning_rate": 6.55812417906955e-06, "loss": 0.88020277, "memory(GiB)": 728.98, "step": 33835, "train_speed(iter/s)": 0.17407 }, { "acc": 0.7702507, "epoch": 0.8584488496461974, "grad_norm": 2.953125, "learning_rate": 6.55712771050577e-06, "loss": 0.89218006, "memory(GiB)": 728.98, "step": 33840, "train_speed(iter/s)": 0.174035 }, { "acc": 0.76979499, "epoch": 0.858575689015235, "grad_norm": 3.484375, "learning_rate": 6.556131173448902e-06, "loss": 0.91552515, "memory(GiB)": 728.98, "step": 33845, "train_speed(iter/s)": 0.174002 }, { "acc": 0.7803957, "epoch": 0.8587025283842725, "grad_norm": 3.265625, "learning_rate": 6.555134567942782e-06, "loss": 0.86648407, "memory(GiB)": 728.98, "step": 33850, "train_speed(iter/s)": 0.173961 }, { "acc": 0.77951055, "epoch": 0.8588293677533101, "grad_norm": 3.15625, "learning_rate": 6.554137894031248e-06, "loss": 0.82156286, "memory(GiB)": 728.98, "step": 33855, "train_speed(iter/s)": 0.173929 }, { "acc": 0.76058989, "epoch": 0.8589562071223477, "grad_norm": 2.984375, "learning_rate": 6.553141151758139e-06, "loss": 0.90738993, "memory(GiB)": 728.98, "step": 33860, "train_speed(iter/s)": 0.173895 }, { "acc": 0.75896878, "epoch": 0.8590830464913852, "grad_norm": 3.09375, "learning_rate": 6.5521443411673e-06, "loss": 0.90810938, "memory(GiB)": 728.98, "step": 33865, "train_speed(iter/s)": 0.173861 }, { "acc": 0.77141194, "epoch": 0.8592098858604228, "grad_norm": 3.234375, "learning_rate": 6.5511474623025765e-06, "loss": 0.90173035, "memory(GiB)": 728.98, "step": 33870, "train_speed(iter/s)": 0.173826 }, { "acc": 0.77049518, "epoch": 0.8593367252294604, "grad_norm": 3.21875, "learning_rate": 6.550150515207819e-06, "loss": 0.82508221, "memory(GiB)": 728.98, "step": 33875, "train_speed(iter/s)": 0.173795 }, { "acc": 0.77503052, "epoch": 0.8594635645984979, "grad_norm": 3.3125, "learning_rate": 6.549153499926879e-06, "loss": 0.84760818, "memory(GiB)": 728.98, "step": 33880, "train_speed(iter/s)": 0.173763 }, { "acc": 0.75875425, "epoch": 0.8595904039675355, "grad_norm": 4.28125, "learning_rate": 6.5481564165036125e-06, "loss": 0.91669607, "memory(GiB)": 728.98, "step": 33885, "train_speed(iter/s)": 0.173736 }, { "acc": 0.76957316, "epoch": 0.8597172433365731, "grad_norm": 3.21875, "learning_rate": 6.547159264981878e-06, "loss": 0.88473263, "memory(GiB)": 728.98, "step": 33890, "train_speed(iter/s)": 0.173701 }, { "acc": 0.77769661, "epoch": 0.8598440827056106, "grad_norm": 2.875, "learning_rate": 6.546162045405538e-06, "loss": 0.88303261, "memory(GiB)": 728.98, "step": 33895, "train_speed(iter/s)": 0.173669 }, { "acc": 0.76764684, "epoch": 0.8599709220746481, "grad_norm": 3.171875, "learning_rate": 6.545164757818455e-06, "loss": 0.92993679, "memory(GiB)": 728.98, "step": 33900, "train_speed(iter/s)": 0.173637 }, { "acc": 0.77433372, "epoch": 0.8600977614436857, "grad_norm": 3.734375, "learning_rate": 6.544167402264496e-06, "loss": 0.85451813, "memory(GiB)": 728.98, "step": 33905, "train_speed(iter/s)": 0.173606 }, { "acc": 0.77241874, "epoch": 0.8602246008127232, "grad_norm": 3.5, "learning_rate": 6.543169978787536e-06, "loss": 0.90136051, "memory(GiB)": 728.98, "step": 33910, "train_speed(iter/s)": 0.173576 }, { "acc": 0.77431817, "epoch": 0.8603514401817608, "grad_norm": 3.625, "learning_rate": 6.542172487431443e-06, "loss": 0.92479553, "memory(GiB)": 728.98, "step": 33915, "train_speed(iter/s)": 0.173543 }, { "acc": 0.7625998, "epoch": 0.8604782795507984, "grad_norm": 3.703125, "learning_rate": 6.541174928240097e-06, "loss": 0.93700886, "memory(GiB)": 728.98, "step": 33920, "train_speed(iter/s)": 0.173515 }, { "acc": 0.7650991, "epoch": 0.8606051189198359, "grad_norm": 2.921875, "learning_rate": 6.540177301257376e-06, "loss": 0.84217596, "memory(GiB)": 728.98, "step": 33925, "train_speed(iter/s)": 0.173484 }, { "acc": 0.7627367, "epoch": 0.8607319582888735, "grad_norm": 3.75, "learning_rate": 6.539179606527164e-06, "loss": 0.90285778, "memory(GiB)": 728.98, "step": 33930, "train_speed(iter/s)": 0.173449 }, { "acc": 0.76492763, "epoch": 0.8608587976579111, "grad_norm": 3.953125, "learning_rate": 6.538181844093343e-06, "loss": 0.90251617, "memory(GiB)": 728.98, "step": 33935, "train_speed(iter/s)": 0.173411 }, { "acc": 0.77891421, "epoch": 0.8609856370269486, "grad_norm": 4.21875, "learning_rate": 6.5371840139998066e-06, "loss": 0.87855129, "memory(GiB)": 728.98, "step": 33940, "train_speed(iter/s)": 0.173375 }, { "acc": 0.7634222, "epoch": 0.8611124763959862, "grad_norm": 4.0625, "learning_rate": 6.536186116290442e-06, "loss": 0.91336174, "memory(GiB)": 728.98, "step": 33945, "train_speed(iter/s)": 0.173335 }, { "acc": 0.75831351, "epoch": 0.8612393157650238, "grad_norm": 3.328125, "learning_rate": 6.535188151009143e-06, "loss": 0.93066492, "memory(GiB)": 728.98, "step": 33950, "train_speed(iter/s)": 0.173305 }, { "acc": 0.76525569, "epoch": 0.8613661551340613, "grad_norm": 3.28125, "learning_rate": 6.53419011819981e-06, "loss": 0.92450628, "memory(GiB)": 728.98, "step": 33955, "train_speed(iter/s)": 0.173268 }, { "acc": 0.76070337, "epoch": 0.8614929945030988, "grad_norm": 3.671875, "learning_rate": 6.533192017906343e-06, "loss": 0.92537546, "memory(GiB)": 728.98, "step": 33960, "train_speed(iter/s)": 0.173232 }, { "acc": 0.76190047, "epoch": 0.8616198338721364, "grad_norm": 3.953125, "learning_rate": 6.5321938501726425e-06, "loss": 0.95488939, "memory(GiB)": 728.98, "step": 33965, "train_speed(iter/s)": 0.173203 }, { "acc": 0.76182051, "epoch": 0.8617466732411739, "grad_norm": 3.40625, "learning_rate": 6.531195615042616e-06, "loss": 0.91177263, "memory(GiB)": 728.98, "step": 33970, "train_speed(iter/s)": 0.173174 }, { "acc": 0.7671639, "epoch": 0.8618735126102115, "grad_norm": 3.421875, "learning_rate": 6.530197312560174e-06, "loss": 0.88229399, "memory(GiB)": 728.98, "step": 33975, "train_speed(iter/s)": 0.173141 }, { "acc": 0.77136636, "epoch": 0.8620003519792491, "grad_norm": 23.0, "learning_rate": 6.529198942769228e-06, "loss": 0.8809907, "memory(GiB)": 728.98, "step": 33980, "train_speed(iter/s)": 0.173111 }, { "acc": 0.76499033, "epoch": 0.8621271913482866, "grad_norm": 3.0625, "learning_rate": 6.528200505713693e-06, "loss": 0.87365351, "memory(GiB)": 728.98, "step": 33985, "train_speed(iter/s)": 0.173069 }, { "acc": 0.78018899, "epoch": 0.8622540307173242, "grad_norm": 4.25, "learning_rate": 6.527202001437487e-06, "loss": 0.89630976, "memory(GiB)": 728.98, "step": 33990, "train_speed(iter/s)": 0.173041 }, { "acc": 0.76753502, "epoch": 0.8623808700863618, "grad_norm": 3.078125, "learning_rate": 6.52620342998453e-06, "loss": 0.91594782, "memory(GiB)": 728.98, "step": 33995, "train_speed(iter/s)": 0.173006 }, { "acc": 0.78488693, "epoch": 0.8625077094553993, "grad_norm": 3.265625, "learning_rate": 6.5252047913987475e-06, "loss": 0.81154108, "memory(GiB)": 728.98, "step": 34000, "train_speed(iter/s)": 0.172975 }, { "epoch": 0.8625077094553993, "eval_acc": 0.7567574906723004, "eval_loss": 0.8613465428352356, "eval_runtime": 1154.9465, "eval_samples_per_second": 5.515, "eval_steps_per_second": 5.515, "step": 34000 }, { "acc": 0.78154283, "epoch": 0.8626345488244369, "grad_norm": 3.34375, "learning_rate": 6.524206085724064e-06, "loss": 0.83003187, "memory(GiB)": 728.98, "step": 34005, "train_speed(iter/s)": 0.171301 }, { "acc": 0.76855879, "epoch": 0.8627613881934745, "grad_norm": 3.53125, "learning_rate": 6.523207313004414e-06, "loss": 0.90397549, "memory(GiB)": 728.98, "step": 34010, "train_speed(iter/s)": 0.171263 }, { "acc": 0.75792937, "epoch": 0.862888227562512, "grad_norm": 3.15625, "learning_rate": 6.522208473283727e-06, "loss": 0.90517654, "memory(GiB)": 728.98, "step": 34015, "train_speed(iter/s)": 0.171229 }, { "acc": 0.76199107, "epoch": 0.8630150669315495, "grad_norm": 3.859375, "learning_rate": 6.521209566605937e-06, "loss": 0.9407403, "memory(GiB)": 728.98, "step": 34020, "train_speed(iter/s)": 0.171194 }, { "acc": 0.76182151, "epoch": 0.8631419063005871, "grad_norm": 3.421875, "learning_rate": 6.520210593014987e-06, "loss": 0.94975061, "memory(GiB)": 728.98, "step": 34025, "train_speed(iter/s)": 0.17116 }, { "acc": 0.78471646, "epoch": 0.8632687456696246, "grad_norm": 4.1875, "learning_rate": 6.5192115525548165e-06, "loss": 0.8502861, "memory(GiB)": 728.98, "step": 34030, "train_speed(iter/s)": 0.171125 }, { "acc": 0.77005253, "epoch": 0.8633955850386622, "grad_norm": 3.421875, "learning_rate": 6.518212445269371e-06, "loss": 0.86895533, "memory(GiB)": 728.98, "step": 34035, "train_speed(iter/s)": 0.171093 }, { "acc": 0.76753287, "epoch": 0.8635224244076998, "grad_norm": 4.46875, "learning_rate": 6.517213271202596e-06, "loss": 0.90903931, "memory(GiB)": 728.98, "step": 34040, "train_speed(iter/s)": 0.171058 }, { "acc": 0.76264892, "epoch": 0.8636492637767373, "grad_norm": 4.0625, "learning_rate": 6.516214030398444e-06, "loss": 0.92650356, "memory(GiB)": 728.98, "step": 34045, "train_speed(iter/s)": 0.171029 }, { "acc": 0.76131835, "epoch": 0.8637761031457749, "grad_norm": 3.015625, "learning_rate": 6.515214722900867e-06, "loss": 0.90048084, "memory(GiB)": 728.98, "step": 34050, "train_speed(iter/s)": 0.170995 }, { "acc": 0.76345339, "epoch": 0.8639029425148125, "grad_norm": 2.8125, "learning_rate": 6.5142153487538225e-06, "loss": 0.89436913, "memory(GiB)": 728.98, "step": 34055, "train_speed(iter/s)": 0.170957 }, { "acc": 0.77891278, "epoch": 0.86402978188385, "grad_norm": 4.09375, "learning_rate": 6.51321590800127e-06, "loss": 0.86135387, "memory(GiB)": 728.98, "step": 34060, "train_speed(iter/s)": 0.17092 }, { "acc": 0.76749468, "epoch": 0.8641566212528876, "grad_norm": 4.59375, "learning_rate": 6.512216400687171e-06, "loss": 0.9064332, "memory(GiB)": 728.98, "step": 34065, "train_speed(iter/s)": 0.170892 }, { "acc": 0.77504516, "epoch": 0.8642834606219252, "grad_norm": 3.875, "learning_rate": 6.511216826855491e-06, "loss": 0.91775627, "memory(GiB)": 728.98, "step": 34070, "train_speed(iter/s)": 0.170857 }, { "acc": 0.77100763, "epoch": 0.8644102999909627, "grad_norm": 3.28125, "learning_rate": 6.510217186550196e-06, "loss": 0.91118479, "memory(GiB)": 728.98, "step": 34075, "train_speed(iter/s)": 0.170823 }, { "acc": 0.7653029, "epoch": 0.8645371393600002, "grad_norm": 4.125, "learning_rate": 6.509217479815262e-06, "loss": 0.91223507, "memory(GiB)": 728.98, "step": 34080, "train_speed(iter/s)": 0.170789 }, { "acc": 0.76660709, "epoch": 0.8646639787290378, "grad_norm": 3.03125, "learning_rate": 6.508217706694657e-06, "loss": 0.91713533, "memory(GiB)": 728.98, "step": 34085, "train_speed(iter/s)": 0.170758 }, { "acc": 0.77510867, "epoch": 0.8647908180980753, "grad_norm": 3.234375, "learning_rate": 6.507217867232361e-06, "loss": 0.84499226, "memory(GiB)": 728.98, "step": 34090, "train_speed(iter/s)": 0.17072 }, { "acc": 0.75509214, "epoch": 0.8649176574671129, "grad_norm": 3.296875, "learning_rate": 6.506217961472355e-06, "loss": 0.86576071, "memory(GiB)": 728.98, "step": 34095, "train_speed(iter/s)": 0.170685 }, { "acc": 0.7705719, "epoch": 0.8650444968361505, "grad_norm": 3.53125, "learning_rate": 6.5052179894586174e-06, "loss": 0.89993553, "memory(GiB)": 728.98, "step": 34100, "train_speed(iter/s)": 0.170657 }, { "acc": 0.77461848, "epoch": 0.865171336205188, "grad_norm": 3.4375, "learning_rate": 6.504217951235138e-06, "loss": 0.86373034, "memory(GiB)": 728.98, "step": 34105, "train_speed(iter/s)": 0.170624 }, { "acc": 0.76091013, "epoch": 0.8652981755742256, "grad_norm": 3.65625, "learning_rate": 6.503217846845904e-06, "loss": 0.94378986, "memory(GiB)": 728.98, "step": 34110, "train_speed(iter/s)": 0.170592 }, { "acc": 0.77303009, "epoch": 0.8654250149432632, "grad_norm": 4.0, "learning_rate": 6.502217676334905e-06, "loss": 0.91375818, "memory(GiB)": 728.98, "step": 34115, "train_speed(iter/s)": 0.170559 }, { "acc": 0.76928525, "epoch": 0.8655518543123008, "grad_norm": 3.234375, "learning_rate": 6.501217439746138e-06, "loss": 0.85846519, "memory(GiB)": 728.98, "step": 34120, "train_speed(iter/s)": 0.170529 }, { "acc": 0.76620417, "epoch": 0.8656786936813383, "grad_norm": 3.421875, "learning_rate": 6.500217137123599e-06, "loss": 0.8949028, "memory(GiB)": 728.98, "step": 34125, "train_speed(iter/s)": 0.170501 }, { "acc": 0.76759248, "epoch": 0.8658055330503759, "grad_norm": 4.84375, "learning_rate": 6.499216768511287e-06, "loss": 0.88423462, "memory(GiB)": 728.98, "step": 34130, "train_speed(iter/s)": 0.170469 }, { "acc": 0.76715765, "epoch": 0.8659323724194135, "grad_norm": 3.765625, "learning_rate": 6.4982163339532066e-06, "loss": 0.87685843, "memory(GiB)": 728.98, "step": 34135, "train_speed(iter/s)": 0.170432 }, { "acc": 0.78347979, "epoch": 0.8660592117884509, "grad_norm": 3.515625, "learning_rate": 6.497215833493363e-06, "loss": 0.86070614, "memory(GiB)": 728.98, "step": 34140, "train_speed(iter/s)": 0.170402 }, { "acc": 0.76940103, "epoch": 0.8661860511574885, "grad_norm": 4.4375, "learning_rate": 6.496215267175767e-06, "loss": 0.88611984, "memory(GiB)": 728.98, "step": 34145, "train_speed(iter/s)": 0.170371 }, { "acc": 0.76010251, "epoch": 0.866312890526526, "grad_norm": 3.0, "learning_rate": 6.495214635044427e-06, "loss": 0.91179266, "memory(GiB)": 728.98, "step": 34150, "train_speed(iter/s)": 0.170339 }, { "acc": 0.76786408, "epoch": 0.8664397298955636, "grad_norm": 3.078125, "learning_rate": 6.494213937143359e-06, "loss": 0.87667589, "memory(GiB)": 728.98, "step": 34155, "train_speed(iter/s)": 0.170306 }, { "acc": 0.76960654, "epoch": 0.8665665692646012, "grad_norm": 3.25, "learning_rate": 6.493213173516582e-06, "loss": 0.87374363, "memory(GiB)": 728.98, "step": 34160, "train_speed(iter/s)": 0.170266 }, { "acc": 0.77955408, "epoch": 0.8666934086336388, "grad_norm": 3.71875, "learning_rate": 6.4922123442081145e-06, "loss": 0.85785503, "memory(GiB)": 728.98, "step": 34165, "train_speed(iter/s)": 0.170239 }, { "acc": 0.76953855, "epoch": 0.8668202480026763, "grad_norm": 4.28125, "learning_rate": 6.491211449261981e-06, "loss": 0.88870201, "memory(GiB)": 728.98, "step": 34170, "train_speed(iter/s)": 0.17021 }, { "acc": 0.76776996, "epoch": 0.8669470873717139, "grad_norm": 2.796875, "learning_rate": 6.490210488722205e-06, "loss": 0.90223703, "memory(GiB)": 728.98, "step": 34175, "train_speed(iter/s)": 0.170177 }, { "acc": 0.7762711, "epoch": 0.8670739267407515, "grad_norm": 3.265625, "learning_rate": 6.48920946263282e-06, "loss": 0.8495183, "memory(GiB)": 728.98, "step": 34180, "train_speed(iter/s)": 0.170146 }, { "acc": 0.77049975, "epoch": 0.867200766109789, "grad_norm": 3.75, "learning_rate": 6.488208371037856e-06, "loss": 0.83964558, "memory(GiB)": 728.98, "step": 34185, "train_speed(iter/s)": 0.170114 }, { "acc": 0.76876101, "epoch": 0.8673276054788266, "grad_norm": 3.359375, "learning_rate": 6.487207213981345e-06, "loss": 0.89183836, "memory(GiB)": 728.98, "step": 34190, "train_speed(iter/s)": 0.170082 }, { "acc": 0.76110005, "epoch": 0.8674544448478642, "grad_norm": 3.953125, "learning_rate": 6.4862059915073305e-06, "loss": 0.92304249, "memory(GiB)": 728.98, "step": 34195, "train_speed(iter/s)": 0.170047 }, { "acc": 0.76498294, "epoch": 0.8675812842169016, "grad_norm": 3.359375, "learning_rate": 6.485204703659846e-06, "loss": 0.92514229, "memory(GiB)": 728.98, "step": 34200, "train_speed(iter/s)": 0.170009 }, { "acc": 0.76570072, "epoch": 0.8677081235859392, "grad_norm": 3.1875, "learning_rate": 6.484203350482943e-06, "loss": 0.84921064, "memory(GiB)": 728.98, "step": 34205, "train_speed(iter/s)": 0.169975 }, { "acc": 0.7701252, "epoch": 0.8678349629549768, "grad_norm": 3.03125, "learning_rate": 6.48320193202066e-06, "loss": 0.85715399, "memory(GiB)": 728.98, "step": 34210, "train_speed(iter/s)": 0.169939 }, { "acc": 0.76999593, "epoch": 0.8679618023240143, "grad_norm": 3.671875, "learning_rate": 6.482200448317051e-06, "loss": 0.9215867, "memory(GiB)": 728.98, "step": 34215, "train_speed(iter/s)": 0.1699 }, { "acc": 0.7828918, "epoch": 0.8680886416930519, "grad_norm": 3.640625, "learning_rate": 6.481198899416167e-06, "loss": 0.8888566, "memory(GiB)": 728.98, "step": 34220, "train_speed(iter/s)": 0.169871 }, { "acc": 0.76302471, "epoch": 0.8682154810620895, "grad_norm": 3.640625, "learning_rate": 6.480197285362063e-06, "loss": 0.90796528, "memory(GiB)": 728.98, "step": 34225, "train_speed(iter/s)": 0.169843 }, { "acc": 0.7725563, "epoch": 0.868342320431127, "grad_norm": 3.78125, "learning_rate": 6.479195606198796e-06, "loss": 0.85149593, "memory(GiB)": 728.98, "step": 34230, "train_speed(iter/s)": 0.169807 }, { "acc": 0.77343197, "epoch": 0.8684691598001646, "grad_norm": 3.40625, "learning_rate": 6.478193861970428e-06, "loss": 0.88095751, "memory(GiB)": 728.98, "step": 34235, "train_speed(iter/s)": 0.169776 }, { "acc": 0.77672143, "epoch": 0.8685959991692022, "grad_norm": 3.28125, "learning_rate": 6.4771920527210215e-06, "loss": 0.86820841, "memory(GiB)": 728.98, "step": 34240, "train_speed(iter/s)": 0.16975 }, { "acc": 0.75752964, "epoch": 0.8687228385382397, "grad_norm": 3.203125, "learning_rate": 6.476190178494643e-06, "loss": 0.87004347, "memory(GiB)": 728.98, "step": 34245, "train_speed(iter/s)": 0.169719 }, { "acc": 0.76614385, "epoch": 0.8688496779072773, "grad_norm": 3.78125, "learning_rate": 6.475188239335363e-06, "loss": 0.93023291, "memory(GiB)": 728.98, "step": 34250, "train_speed(iter/s)": 0.169691 }, { "acc": 0.74311533, "epoch": 0.8689765172763149, "grad_norm": 3.203125, "learning_rate": 6.474186235287251e-06, "loss": 0.99044285, "memory(GiB)": 728.98, "step": 34255, "train_speed(iter/s)": 0.16966 }, { "acc": 0.77492175, "epoch": 0.8691033566453523, "grad_norm": 4.03125, "learning_rate": 6.473184166394385e-06, "loss": 0.87629566, "memory(GiB)": 728.98, "step": 34260, "train_speed(iter/s)": 0.169625 }, { "acc": 0.75768671, "epoch": 0.8692301960143899, "grad_norm": 3.6875, "learning_rate": 6.47218203270084e-06, "loss": 0.93391113, "memory(GiB)": 728.98, "step": 34265, "train_speed(iter/s)": 0.169595 }, { "acc": 0.77232442, "epoch": 0.8693570353834275, "grad_norm": 3.53125, "learning_rate": 6.471179834250699e-06, "loss": 0.93960295, "memory(GiB)": 728.98, "step": 34270, "train_speed(iter/s)": 0.169567 }, { "acc": 0.75703478, "epoch": 0.869483874752465, "grad_norm": 3.484375, "learning_rate": 6.4701775710880435e-06, "loss": 0.86507425, "memory(GiB)": 728.98, "step": 34275, "train_speed(iter/s)": 0.16954 }, { "acc": 0.78486352, "epoch": 0.8696107141215026, "grad_norm": 3.34375, "learning_rate": 6.469175243256962e-06, "loss": 0.86153336, "memory(GiB)": 728.98, "step": 34280, "train_speed(iter/s)": 0.169512 }, { "acc": 0.76367311, "epoch": 0.8697375534905402, "grad_norm": 4.3125, "learning_rate": 6.468172850801543e-06, "loss": 0.89597826, "memory(GiB)": 728.98, "step": 34285, "train_speed(iter/s)": 0.169483 }, { "acc": 0.77162561, "epoch": 0.8698643928595777, "grad_norm": 3.4375, "learning_rate": 6.467170393765877e-06, "loss": 0.84144602, "memory(GiB)": 728.98, "step": 34290, "train_speed(iter/s)": 0.169451 }, { "acc": 0.77606001, "epoch": 0.8699912322286153, "grad_norm": 3.515625, "learning_rate": 6.4661678721940615e-06, "loss": 0.89762316, "memory(GiB)": 728.98, "step": 34295, "train_speed(iter/s)": 0.169421 }, { "acc": 0.76510944, "epoch": 0.8701180715976529, "grad_norm": 3.8125, "learning_rate": 6.465165286130191e-06, "loss": 0.92830439, "memory(GiB)": 728.98, "step": 34300, "train_speed(iter/s)": 0.169392 }, { "acc": 0.76964993, "epoch": 0.8702449109666904, "grad_norm": 3.625, "learning_rate": 6.464162635618369e-06, "loss": 0.86615982, "memory(GiB)": 728.98, "step": 34305, "train_speed(iter/s)": 0.169358 }, { "acc": 0.75904264, "epoch": 0.870371750335728, "grad_norm": 3.828125, "learning_rate": 6.463159920702697e-06, "loss": 0.95582304, "memory(GiB)": 728.98, "step": 34310, "train_speed(iter/s)": 0.169325 }, { "acc": 0.76760683, "epoch": 0.8704985897047656, "grad_norm": 2.9375, "learning_rate": 6.462157141427282e-06, "loss": 0.87510214, "memory(GiB)": 728.98, "step": 34315, "train_speed(iter/s)": 0.169289 }, { "acc": 0.76842637, "epoch": 0.870625429073803, "grad_norm": 3.515625, "learning_rate": 6.461154297836234e-06, "loss": 0.86305189, "memory(GiB)": 728.98, "step": 34320, "train_speed(iter/s)": 0.169261 }, { "acc": 0.76643381, "epoch": 0.8707522684428406, "grad_norm": 3.84375, "learning_rate": 6.460151389973664e-06, "loss": 0.89197807, "memory(GiB)": 728.98, "step": 34325, "train_speed(iter/s)": 0.169228 }, { "acc": 0.76428304, "epoch": 0.8708791078118782, "grad_norm": 3.28125, "learning_rate": 6.459148417883686e-06, "loss": 0.91320915, "memory(GiB)": 728.98, "step": 34330, "train_speed(iter/s)": 0.169191 }, { "acc": 0.75640402, "epoch": 0.8710059471809157, "grad_norm": 3.25, "learning_rate": 6.458145381610418e-06, "loss": 0.89206448, "memory(GiB)": 728.98, "step": 34335, "train_speed(iter/s)": 0.169159 }, { "acc": 0.76203995, "epoch": 0.8711327865499533, "grad_norm": 4.40625, "learning_rate": 6.457142281197981e-06, "loss": 0.97084923, "memory(GiB)": 728.98, "step": 34340, "train_speed(iter/s)": 0.16913 }, { "acc": 0.76228352, "epoch": 0.8712596259189909, "grad_norm": 3.609375, "learning_rate": 6.456139116690498e-06, "loss": 0.9253027, "memory(GiB)": 728.98, "step": 34345, "train_speed(iter/s)": 0.169096 }, { "acc": 0.75415297, "epoch": 0.8713864652880284, "grad_norm": 4.09375, "learning_rate": 6.455135888132095e-06, "loss": 0.9323103, "memory(GiB)": 728.98, "step": 34350, "train_speed(iter/s)": 0.169064 }, { "acc": 0.7764802, "epoch": 0.871513304657066, "grad_norm": 3.90625, "learning_rate": 6.4541325955669e-06, "loss": 0.86642771, "memory(GiB)": 728.98, "step": 34355, "train_speed(iter/s)": 0.169031 }, { "acc": 0.77049642, "epoch": 0.8716401440261036, "grad_norm": 3.109375, "learning_rate": 6.453129239039046e-06, "loss": 0.84789858, "memory(GiB)": 728.98, "step": 34360, "train_speed(iter/s)": 0.169002 }, { "acc": 0.77264385, "epoch": 0.8717669833951411, "grad_norm": 3.15625, "learning_rate": 6.452125818592666e-06, "loss": 0.86731739, "memory(GiB)": 728.98, "step": 34365, "train_speed(iter/s)": 0.168972 }, { "acc": 0.77512679, "epoch": 0.8718938227641787, "grad_norm": 3.28125, "learning_rate": 6.4511223342718974e-06, "loss": 0.85927992, "memory(GiB)": 728.98, "step": 34370, "train_speed(iter/s)": 0.168934 }, { "acc": 0.75824528, "epoch": 0.8720206621332163, "grad_norm": 3.21875, "learning_rate": 6.450118786120882e-06, "loss": 0.90115175, "memory(GiB)": 728.98, "step": 34375, "train_speed(iter/s)": 0.168902 }, { "acc": 0.78179059, "epoch": 0.8721475015022537, "grad_norm": 3.09375, "learning_rate": 6.449115174183761e-06, "loss": 0.8496829, "memory(GiB)": 728.98, "step": 34380, "train_speed(iter/s)": 0.168867 }, { "acc": 0.77123775, "epoch": 0.8722743408712913, "grad_norm": 3.71875, "learning_rate": 6.448111498504681e-06, "loss": 0.87458267, "memory(GiB)": 728.98, "step": 34385, "train_speed(iter/s)": 0.168836 }, { "acc": 0.78458261, "epoch": 0.8724011802403289, "grad_norm": 3.640625, "learning_rate": 6.44710775912779e-06, "loss": 0.82170639, "memory(GiB)": 728.98, "step": 34390, "train_speed(iter/s)": 0.168807 }, { "acc": 0.75913687, "epoch": 0.8725280196093664, "grad_norm": 8.125, "learning_rate": 6.446103956097241e-06, "loss": 0.9302825, "memory(GiB)": 728.98, "step": 34395, "train_speed(iter/s)": 0.168777 }, { "acc": 0.76925192, "epoch": 0.872654858978404, "grad_norm": 3.1875, "learning_rate": 6.445100089457184e-06, "loss": 0.88598928, "memory(GiB)": 728.98, "step": 34400, "train_speed(iter/s)": 0.168747 }, { "acc": 0.76922193, "epoch": 0.8727816983474416, "grad_norm": 3.25, "learning_rate": 6.444096159251779e-06, "loss": 0.88949537, "memory(GiB)": 728.98, "step": 34405, "train_speed(iter/s)": 0.168716 }, { "acc": 0.76769991, "epoch": 0.8729085377164791, "grad_norm": 3.78125, "learning_rate": 6.443092165525186e-06, "loss": 0.87050066, "memory(GiB)": 728.98, "step": 34410, "train_speed(iter/s)": 0.168687 }, { "acc": 0.77587371, "epoch": 0.8730353770855167, "grad_norm": 3.546875, "learning_rate": 6.442088108321566e-06, "loss": 0.87757626, "memory(GiB)": 728.98, "step": 34415, "train_speed(iter/s)": 0.168659 }, { "acc": 0.77560172, "epoch": 0.8731622164545543, "grad_norm": 3.0, "learning_rate": 6.441083987685086e-06, "loss": 0.84689112, "memory(GiB)": 728.98, "step": 34420, "train_speed(iter/s)": 0.168627 }, { "acc": 0.76272106, "epoch": 0.8732890558235918, "grad_norm": 3.625, "learning_rate": 6.440079803659911e-06, "loss": 0.93002853, "memory(GiB)": 728.98, "step": 34425, "train_speed(iter/s)": 0.168601 }, { "acc": 0.76183586, "epoch": 0.8734158951926294, "grad_norm": 3.375, "learning_rate": 6.439075556290215e-06, "loss": 0.92151461, "memory(GiB)": 728.98, "step": 34430, "train_speed(iter/s)": 0.168572 }, { "acc": 0.76986399, "epoch": 0.873542734561667, "grad_norm": 3.6875, "learning_rate": 6.43807124562017e-06, "loss": 0.89704847, "memory(GiB)": 728.98, "step": 34435, "train_speed(iter/s)": 0.168541 }, { "acc": 0.77378554, "epoch": 0.8736695739307044, "grad_norm": 3.890625, "learning_rate": 6.437066871693954e-06, "loss": 0.89045219, "memory(GiB)": 728.98, "step": 34440, "train_speed(iter/s)": 0.168503 }, { "acc": 0.7664434, "epoch": 0.873796413299742, "grad_norm": 3.203125, "learning_rate": 6.436062434555743e-06, "loss": 0.88606653, "memory(GiB)": 728.98, "step": 34445, "train_speed(iter/s)": 0.168478 }, { "acc": 0.76235046, "epoch": 0.8739232526687796, "grad_norm": 3.703125, "learning_rate": 6.435057934249722e-06, "loss": 0.85663157, "memory(GiB)": 728.98, "step": 34450, "train_speed(iter/s)": 0.168438 }, { "acc": 0.76027975, "epoch": 0.8740500920378171, "grad_norm": 3.75, "learning_rate": 6.434053370820077e-06, "loss": 0.93128042, "memory(GiB)": 728.98, "step": 34455, "train_speed(iter/s)": 0.168411 }, { "acc": 0.76747184, "epoch": 0.8741769314068547, "grad_norm": 3.5, "learning_rate": 6.4330487443109905e-06, "loss": 0.89399309, "memory(GiB)": 728.98, "step": 34460, "train_speed(iter/s)": 0.168386 }, { "acc": 0.76510825, "epoch": 0.8743037707758923, "grad_norm": 2.96875, "learning_rate": 6.4320440547666575e-06, "loss": 0.91002903, "memory(GiB)": 728.98, "step": 34465, "train_speed(iter/s)": 0.168355 }, { "acc": 0.77412066, "epoch": 0.8744306101449298, "grad_norm": 3.109375, "learning_rate": 6.4310393022312675e-06, "loss": 0.88415174, "memory(GiB)": 728.98, "step": 34470, "train_speed(iter/s)": 0.168319 }, { "acc": 0.76498938, "epoch": 0.8745574495139674, "grad_norm": 3.703125, "learning_rate": 6.43003448674902e-06, "loss": 0.8874712, "memory(GiB)": 728.98, "step": 34475, "train_speed(iter/s)": 0.168292 }, { "acc": 0.77885013, "epoch": 0.874684288883005, "grad_norm": 3.578125, "learning_rate": 6.429029608364111e-06, "loss": 0.86571655, "memory(GiB)": 728.98, "step": 34480, "train_speed(iter/s)": 0.16826 }, { "acc": 0.77569299, "epoch": 0.8748111282520425, "grad_norm": 4.25, "learning_rate": 6.428024667120743e-06, "loss": 0.87489681, "memory(GiB)": 728.98, "step": 34485, "train_speed(iter/s)": 0.168231 }, { "acc": 0.77284245, "epoch": 0.8749379676210801, "grad_norm": 3.625, "learning_rate": 6.4270196630631195e-06, "loss": 0.90158844, "memory(GiB)": 728.98, "step": 34490, "train_speed(iter/s)": 0.168198 }, { "acc": 0.76334157, "epoch": 0.8750648069901177, "grad_norm": 3.171875, "learning_rate": 6.426014596235448e-06, "loss": 0.9368454, "memory(GiB)": 728.98, "step": 34495, "train_speed(iter/s)": 0.168165 }, { "acc": 0.76600671, "epoch": 0.8751916463591551, "grad_norm": 3.078125, "learning_rate": 6.425009466681938e-06, "loss": 0.94913912, "memory(GiB)": 728.98, "step": 34500, "train_speed(iter/s)": 0.168136 }, { "epoch": 0.8751916463591551, "eval_acc": 0.756781721176249, "eval_loss": 0.8613130450248718, "eval_runtime": 1151.4788, "eval_samples_per_second": 5.532, "eval_steps_per_second": 5.532, "step": 34500 }, { "acc": 0.75608988, "epoch": 0.8753184857281927, "grad_norm": 3.421875, "learning_rate": 6.424004274446801e-06, "loss": 0.90444298, "memory(GiB)": 728.98, "step": 34505, "train_speed(iter/s)": 0.166579 }, { "acc": 0.7684988, "epoch": 0.8754453250972303, "grad_norm": 4.28125, "learning_rate": 6.4229990195742546e-06, "loss": 0.85480709, "memory(GiB)": 728.98, "step": 34510, "train_speed(iter/s)": 0.166548 }, { "acc": 0.77291656, "epoch": 0.8755721644662678, "grad_norm": 3.484375, "learning_rate": 6.421993702108514e-06, "loss": 0.94029074, "memory(GiB)": 728.98, "step": 34515, "train_speed(iter/s)": 0.166521 }, { "acc": 0.77446942, "epoch": 0.8756990038353054, "grad_norm": 3.34375, "learning_rate": 6.420988322093802e-06, "loss": 0.83480673, "memory(GiB)": 728.98, "step": 34520, "train_speed(iter/s)": 0.166485 }, { "acc": 0.77492967, "epoch": 0.875825843204343, "grad_norm": 4.03125, "learning_rate": 6.419982879574342e-06, "loss": 0.88110819, "memory(GiB)": 728.98, "step": 34525, "train_speed(iter/s)": 0.16646 }, { "acc": 0.76757879, "epoch": 0.8759526825733805, "grad_norm": 3.203125, "learning_rate": 6.418977374594357e-06, "loss": 0.88743734, "memory(GiB)": 728.98, "step": 34530, "train_speed(iter/s)": 0.166432 }, { "acc": 0.76680775, "epoch": 0.8760795219424181, "grad_norm": 3.609375, "learning_rate": 6.417971807198081e-06, "loss": 0.89929743, "memory(GiB)": 728.98, "step": 34535, "train_speed(iter/s)": 0.166401 }, { "acc": 0.75996866, "epoch": 0.8762063613114557, "grad_norm": 3.296875, "learning_rate": 6.416966177429742e-06, "loss": 0.89244213, "memory(GiB)": 728.98, "step": 34540, "train_speed(iter/s)": 0.166372 }, { "acc": 0.75806627, "epoch": 0.8763332006804933, "grad_norm": 3.109375, "learning_rate": 6.415960485333577e-06, "loss": 0.90155582, "memory(GiB)": 728.98, "step": 34545, "train_speed(iter/s)": 0.166341 }, { "acc": 0.7710475, "epoch": 0.8764600400495308, "grad_norm": 3.640625, "learning_rate": 6.4149547309538195e-06, "loss": 0.9771594, "memory(GiB)": 728.98, "step": 34550, "train_speed(iter/s)": 0.166318 }, { "acc": 0.77015409, "epoch": 0.8765868794185684, "grad_norm": 3.3125, "learning_rate": 6.413948914334713e-06, "loss": 0.86791344, "memory(GiB)": 728.98, "step": 34555, "train_speed(iter/s)": 0.166291 }, { "acc": 0.77557817, "epoch": 0.8767137187876058, "grad_norm": 3.28125, "learning_rate": 6.412943035520498e-06, "loss": 0.82094002, "memory(GiB)": 728.98, "step": 34560, "train_speed(iter/s)": 0.16626 }, { "acc": 0.78122239, "epoch": 0.8768405581566434, "grad_norm": 3.0625, "learning_rate": 6.411937094555421e-06, "loss": 0.806359, "memory(GiB)": 728.98, "step": 34565, "train_speed(iter/s)": 0.16623 }, { "acc": 0.76317849, "epoch": 0.876967397525681, "grad_norm": 3.0, "learning_rate": 6.41093109148373e-06, "loss": 0.90471373, "memory(GiB)": 728.98, "step": 34570, "train_speed(iter/s)": 0.166201 }, { "acc": 0.75957289, "epoch": 0.8770942368947185, "grad_norm": 3.671875, "learning_rate": 6.409925026349677e-06, "loss": 0.91425533, "memory(GiB)": 728.98, "step": 34575, "train_speed(iter/s)": 0.166175 }, { "acc": 0.77198553, "epoch": 0.8772210762637561, "grad_norm": 3.1875, "learning_rate": 6.4089188991975135e-06, "loss": 0.81087713, "memory(GiB)": 728.98, "step": 34580, "train_speed(iter/s)": 0.166143 }, { "acc": 0.78128529, "epoch": 0.8773479156327937, "grad_norm": 3.359375, "learning_rate": 6.407912710071495e-06, "loss": 0.87842073, "memory(GiB)": 728.98, "step": 34585, "train_speed(iter/s)": 0.166112 }, { "acc": 0.76317339, "epoch": 0.8774747550018313, "grad_norm": 4.84375, "learning_rate": 6.406906459015885e-06, "loss": 0.90765953, "memory(GiB)": 728.98, "step": 34590, "train_speed(iter/s)": 0.16608 }, { "acc": 0.78060231, "epoch": 0.8776015943708688, "grad_norm": 3.078125, "learning_rate": 6.405900146074941e-06, "loss": 0.86366682, "memory(GiB)": 728.98, "step": 34595, "train_speed(iter/s)": 0.166049 }, { "acc": 0.75460038, "epoch": 0.8777284337399064, "grad_norm": 2.984375, "learning_rate": 6.404893771292929e-06, "loss": 0.91438475, "memory(GiB)": 728.98, "step": 34600, "train_speed(iter/s)": 0.166022 }, { "acc": 0.77837567, "epoch": 0.877855273108944, "grad_norm": 3.078125, "learning_rate": 6.403887334714117e-06, "loss": 0.83604164, "memory(GiB)": 728.98, "step": 34605, "train_speed(iter/s)": 0.165993 }, { "acc": 0.75582438, "epoch": 0.8779821124779815, "grad_norm": 3.984375, "learning_rate": 6.402880836382773e-06, "loss": 0.92606678, "memory(GiB)": 728.98, "step": 34610, "train_speed(iter/s)": 0.165963 }, { "acc": 0.76737056, "epoch": 0.8781089518470191, "grad_norm": 3.5, "learning_rate": 6.401874276343173e-06, "loss": 0.89431725, "memory(GiB)": 728.98, "step": 34615, "train_speed(iter/s)": 0.165937 }, { "acc": 0.75575676, "epoch": 0.8782357912160565, "grad_norm": 3.671875, "learning_rate": 6.400867654639588e-06, "loss": 0.91692448, "memory(GiB)": 728.98, "step": 34620, "train_speed(iter/s)": 0.165906 }, { "acc": 0.76517572, "epoch": 0.8783626305850941, "grad_norm": 4.625, "learning_rate": 6.399860971316301e-06, "loss": 0.89272518, "memory(GiB)": 728.98, "step": 34625, "train_speed(iter/s)": 0.165875 }, { "acc": 0.75394611, "epoch": 0.8784894699541317, "grad_norm": 3.609375, "learning_rate": 6.398854226417587e-06, "loss": 0.92284241, "memory(GiB)": 728.98, "step": 34630, "train_speed(iter/s)": 0.165849 }, { "acc": 0.77469049, "epoch": 0.8786163093231693, "grad_norm": 4.78125, "learning_rate": 6.3978474199877346e-06, "loss": 0.87026558, "memory(GiB)": 728.98, "step": 34635, "train_speed(iter/s)": 0.165821 }, { "acc": 0.76271248, "epoch": 0.8787431486922068, "grad_norm": 3.234375, "learning_rate": 6.396840552071026e-06, "loss": 0.8697813, "memory(GiB)": 728.98, "step": 34640, "train_speed(iter/s)": 0.165788 }, { "acc": 0.76260405, "epoch": 0.8788699880612444, "grad_norm": 3.4375, "learning_rate": 6.395833622711754e-06, "loss": 0.91927214, "memory(GiB)": 728.98, "step": 34645, "train_speed(iter/s)": 0.165762 }, { "acc": 0.75555367, "epoch": 0.878996827430282, "grad_norm": 3.15625, "learning_rate": 6.39482663195421e-06, "loss": 0.86895266, "memory(GiB)": 728.98, "step": 34650, "train_speed(iter/s)": 0.165731 }, { "acc": 0.75530243, "epoch": 0.8791236667993195, "grad_norm": 3.578125, "learning_rate": 6.393819579842683e-06, "loss": 0.92910767, "memory(GiB)": 728.98, "step": 34655, "train_speed(iter/s)": 0.165697 }, { "acc": 0.75479479, "epoch": 0.8792505061683571, "grad_norm": 3.296875, "learning_rate": 6.392812466421476e-06, "loss": 1.00247784, "memory(GiB)": 728.98, "step": 34660, "train_speed(iter/s)": 0.165672 }, { "acc": 0.77240109, "epoch": 0.8793773455373947, "grad_norm": 4.125, "learning_rate": 6.391805291734885e-06, "loss": 0.87333879, "memory(GiB)": 728.98, "step": 34665, "train_speed(iter/s)": 0.16564 }, { "acc": 0.78180337, "epoch": 0.8795041849064322, "grad_norm": 3.328125, "learning_rate": 6.390798055827214e-06, "loss": 0.84539061, "memory(GiB)": 728.98, "step": 34670, "train_speed(iter/s)": 0.165608 }, { "acc": 0.75411177, "epoch": 0.8796310242754697, "grad_norm": 3.171875, "learning_rate": 6.389790758742769e-06, "loss": 0.94321814, "memory(GiB)": 728.98, "step": 34675, "train_speed(iter/s)": 0.165579 }, { "acc": 0.78046918, "epoch": 0.8797578636445073, "grad_norm": 3.203125, "learning_rate": 6.3887834005258555e-06, "loss": 0.86290684, "memory(GiB)": 728.98, "step": 34680, "train_speed(iter/s)": 0.165553 }, { "acc": 0.7735991, "epoch": 0.8798847030135448, "grad_norm": 3.078125, "learning_rate": 6.387775981220785e-06, "loss": 0.87353239, "memory(GiB)": 728.98, "step": 34685, "train_speed(iter/s)": 0.165521 }, { "acc": 0.77842813, "epoch": 0.8800115423825824, "grad_norm": 3.40625, "learning_rate": 6.386768500871871e-06, "loss": 0.89237032, "memory(GiB)": 728.98, "step": 34690, "train_speed(iter/s)": 0.165491 }, { "acc": 0.77111325, "epoch": 0.88013838175162, "grad_norm": 3.65625, "learning_rate": 6.38576095952343e-06, "loss": 0.87592144, "memory(GiB)": 728.98, "step": 34695, "train_speed(iter/s)": 0.165466 }, { "acc": 0.7619278, "epoch": 0.8802652211206575, "grad_norm": 3.84375, "learning_rate": 6.384753357219779e-06, "loss": 0.92930365, "memory(GiB)": 728.98, "step": 34700, "train_speed(iter/s)": 0.165435 }, { "acc": 0.76349759, "epoch": 0.8803920604896951, "grad_norm": 4.1875, "learning_rate": 6.383745694005241e-06, "loss": 0.91845875, "memory(GiB)": 728.98, "step": 34705, "train_speed(iter/s)": 0.165401 }, { "acc": 0.76398063, "epoch": 0.8805188998587327, "grad_norm": 3.3125, "learning_rate": 6.382737969924137e-06, "loss": 0.91263647, "memory(GiB)": 728.98, "step": 34710, "train_speed(iter/s)": 0.165369 }, { "acc": 0.75892048, "epoch": 0.8806457392277702, "grad_norm": 3.8125, "learning_rate": 6.381730185020798e-06, "loss": 0.96005297, "memory(GiB)": 728.98, "step": 34715, "train_speed(iter/s)": 0.165332 }, { "acc": 0.77314143, "epoch": 0.8807725785968078, "grad_norm": 3.3125, "learning_rate": 6.380722339339548e-06, "loss": 0.87262087, "memory(GiB)": 728.98, "step": 34720, "train_speed(iter/s)": 0.165306 }, { "acc": 0.77273912, "epoch": 0.8808994179658454, "grad_norm": 3.40625, "learning_rate": 6.379714432924723e-06, "loss": 0.91980295, "memory(GiB)": 728.98, "step": 34725, "train_speed(iter/s)": 0.165279 }, { "acc": 0.76592054, "epoch": 0.8810262573348829, "grad_norm": 3.46875, "learning_rate": 6.378706465820654e-06, "loss": 0.86079931, "memory(GiB)": 728.98, "step": 34730, "train_speed(iter/s)": 0.165253 }, { "acc": 0.76606655, "epoch": 0.8811530967039204, "grad_norm": 3.484375, "learning_rate": 6.377698438071682e-06, "loss": 0.86610851, "memory(GiB)": 728.98, "step": 34735, "train_speed(iter/s)": 0.165219 }, { "acc": 0.7754221, "epoch": 0.881279936072958, "grad_norm": 3.328125, "learning_rate": 6.3766903497221455e-06, "loss": 0.90611725, "memory(GiB)": 728.98, "step": 34740, "train_speed(iter/s)": 0.165189 }, { "acc": 0.76847887, "epoch": 0.8814067754419955, "grad_norm": 4.40625, "learning_rate": 6.375682200816387e-06, "loss": 0.88903513, "memory(GiB)": 728.98, "step": 34745, "train_speed(iter/s)": 0.165157 }, { "acc": 0.77092404, "epoch": 0.8815336148110331, "grad_norm": 3.21875, "learning_rate": 6.374673991398752e-06, "loss": 0.86916971, "memory(GiB)": 728.98, "step": 34750, "train_speed(iter/s)": 0.165121 }, { "acc": 0.75798097, "epoch": 0.8816604541800707, "grad_norm": 3.515625, "learning_rate": 6.373665721513587e-06, "loss": 0.87257137, "memory(GiB)": 728.98, "step": 34755, "train_speed(iter/s)": 0.165093 }, { "acc": 0.76349831, "epoch": 0.8817872935491082, "grad_norm": 3.296875, "learning_rate": 6.372657391205244e-06, "loss": 0.91591654, "memory(GiB)": 728.98, "step": 34760, "train_speed(iter/s)": 0.165066 }, { "acc": 0.76495633, "epoch": 0.8819141329181458, "grad_norm": 3.203125, "learning_rate": 6.371649000518075e-06, "loss": 0.87209311, "memory(GiB)": 728.98, "step": 34765, "train_speed(iter/s)": 0.165035 }, { "acc": 0.76433878, "epoch": 0.8820409722871834, "grad_norm": 3.296875, "learning_rate": 6.3706405494964385e-06, "loss": 0.94008417, "memory(GiB)": 728.98, "step": 34770, "train_speed(iter/s)": 0.165009 }, { "acc": 0.76440749, "epoch": 0.8821678116562209, "grad_norm": 2.890625, "learning_rate": 6.3696320381846925e-06, "loss": 0.89566193, "memory(GiB)": 728.98, "step": 34775, "train_speed(iter/s)": 0.16498 }, { "acc": 0.77545505, "epoch": 0.8822946510252585, "grad_norm": 3.375, "learning_rate": 6.368623466627194e-06, "loss": 0.84601765, "memory(GiB)": 728.98, "step": 34780, "train_speed(iter/s)": 0.164951 }, { "acc": 0.76076627, "epoch": 0.8824214903942961, "grad_norm": 3.515625, "learning_rate": 6.367614834868312e-06, "loss": 0.86257124, "memory(GiB)": 728.98, "step": 34785, "train_speed(iter/s)": 0.164918 }, { "acc": 0.77151918, "epoch": 0.8825483297633336, "grad_norm": 3.96875, "learning_rate": 6.3666061429524096e-06, "loss": 0.91517267, "memory(GiB)": 728.98, "step": 34790, "train_speed(iter/s)": 0.164888 }, { "acc": 0.77550311, "epoch": 0.8826751691323711, "grad_norm": 3.5, "learning_rate": 6.365597390923858e-06, "loss": 0.87328863, "memory(GiB)": 728.98, "step": 34795, "train_speed(iter/s)": 0.164861 }, { "acc": 0.77474866, "epoch": 0.8828020085014087, "grad_norm": 3.90625, "learning_rate": 6.364588578827028e-06, "loss": 0.88466291, "memory(GiB)": 728.98, "step": 34800, "train_speed(iter/s)": 0.164829 }, { "acc": 0.77946568, "epoch": 0.8829288478704462, "grad_norm": 3.59375, "learning_rate": 6.363579706706294e-06, "loss": 0.88140421, "memory(GiB)": 728.98, "step": 34805, "train_speed(iter/s)": 0.164796 }, { "acc": 0.76209698, "epoch": 0.8830556872394838, "grad_norm": 3.75, "learning_rate": 6.362570774606034e-06, "loss": 0.94326391, "memory(GiB)": 728.98, "step": 34810, "train_speed(iter/s)": 0.164773 }, { "acc": 0.77098389, "epoch": 0.8831825266085214, "grad_norm": 3.34375, "learning_rate": 6.3615617825706265e-06, "loss": 0.88862791, "memory(GiB)": 728.98, "step": 34815, "train_speed(iter/s)": 0.164741 }, { "acc": 0.77335768, "epoch": 0.8833093659775589, "grad_norm": 3.09375, "learning_rate": 6.360552730644455e-06, "loss": 0.90565195, "memory(GiB)": 728.98, "step": 34820, "train_speed(iter/s)": 0.164712 }, { "acc": 0.77677665, "epoch": 0.8834362053465965, "grad_norm": 3.984375, "learning_rate": 6.359543618871903e-06, "loss": 0.88871937, "memory(GiB)": 728.98, "step": 34825, "train_speed(iter/s)": 0.164679 }, { "acc": 0.77162595, "epoch": 0.8835630447156341, "grad_norm": 3.4375, "learning_rate": 6.35853444729736e-06, "loss": 0.91252308, "memory(GiB)": 728.98, "step": 34830, "train_speed(iter/s)": 0.164647 }, { "acc": 0.76875525, "epoch": 0.8836898840846716, "grad_norm": 3.671875, "learning_rate": 6.357525215965214e-06, "loss": 0.89639473, "memory(GiB)": 728.98, "step": 34835, "train_speed(iter/s)": 0.164619 }, { "acc": 0.76025367, "epoch": 0.8838167234537092, "grad_norm": 3.1875, "learning_rate": 6.35651592491986e-06, "loss": 0.93736906, "memory(GiB)": 728.98, "step": 34840, "train_speed(iter/s)": 0.164591 }, { "acc": 0.76299734, "epoch": 0.8839435628227468, "grad_norm": 3.984375, "learning_rate": 6.355506574205691e-06, "loss": 0.89334459, "memory(GiB)": 728.98, "step": 34845, "train_speed(iter/s)": 0.164565 }, { "acc": 0.77042289, "epoch": 0.8840704021917843, "grad_norm": 4.0625, "learning_rate": 6.354497163867108e-06, "loss": 0.88450069, "memory(GiB)": 728.98, "step": 34850, "train_speed(iter/s)": 0.164535 }, { "acc": 0.76159673, "epoch": 0.8841972415608218, "grad_norm": 3.953125, "learning_rate": 6.35348769394851e-06, "loss": 0.97136822, "memory(GiB)": 728.98, "step": 34855, "train_speed(iter/s)": 0.164511 }, { "acc": 0.77413092, "epoch": 0.8843240809298594, "grad_norm": 3.390625, "learning_rate": 6.352478164494303e-06, "loss": 0.88419456, "memory(GiB)": 728.98, "step": 34860, "train_speed(iter/s)": 0.164482 }, { "acc": 0.77863479, "epoch": 0.8844509202988969, "grad_norm": 3.234375, "learning_rate": 6.3514685755488885e-06, "loss": 0.85345373, "memory(GiB)": 728.98, "step": 34865, "train_speed(iter/s)": 0.164455 }, { "acc": 0.77508416, "epoch": 0.8845777596679345, "grad_norm": 4.25, "learning_rate": 6.350458927156678e-06, "loss": 0.87045279, "memory(GiB)": 728.98, "step": 34870, "train_speed(iter/s)": 0.164431 }, { "acc": 0.78605394, "epoch": 0.8847045990369721, "grad_norm": 3.21875, "learning_rate": 6.3494492193620825e-06, "loss": 0.79522824, "memory(GiB)": 728.98, "step": 34875, "train_speed(iter/s)": 0.164402 }, { "acc": 0.76219521, "epoch": 0.8848314384060096, "grad_norm": 3.28125, "learning_rate": 6.348439452209514e-06, "loss": 0.89691038, "memory(GiB)": 728.98, "step": 34880, "train_speed(iter/s)": 0.164375 }, { "acc": 0.75615273, "epoch": 0.8849582777750472, "grad_norm": 4.90625, "learning_rate": 6.347429625743391e-06, "loss": 0.90150471, "memory(GiB)": 728.98, "step": 34885, "train_speed(iter/s)": 0.164346 }, { "acc": 0.77611179, "epoch": 0.8850851171440848, "grad_norm": 3.265625, "learning_rate": 6.3464197400081315e-06, "loss": 0.8228055, "memory(GiB)": 728.98, "step": 34890, "train_speed(iter/s)": 0.164315 }, { "acc": 0.77988105, "epoch": 0.8852119565131223, "grad_norm": 3.328125, "learning_rate": 6.345409795048158e-06, "loss": 0.85871687, "memory(GiB)": 728.98, "step": 34895, "train_speed(iter/s)": 0.164284 }, { "acc": 0.75109205, "epoch": 0.8853387958821599, "grad_norm": 4.15625, "learning_rate": 6.344399790907896e-06, "loss": 0.99089594, "memory(GiB)": 728.98, "step": 34900, "train_speed(iter/s)": 0.164258 }, { "acc": 0.7563899, "epoch": 0.8854656352511975, "grad_norm": 3.1875, "learning_rate": 6.3433897276317695e-06, "loss": 0.93480797, "memory(GiB)": 728.98, "step": 34905, "train_speed(iter/s)": 0.164226 }, { "acc": 0.76334338, "epoch": 0.885592474620235, "grad_norm": 2.921875, "learning_rate": 6.342379605264209e-06, "loss": 0.89516029, "memory(GiB)": 728.98, "step": 34910, "train_speed(iter/s)": 0.164198 }, { "acc": 0.75920553, "epoch": 0.8857193139892725, "grad_norm": 3.234375, "learning_rate": 6.3413694238496474e-06, "loss": 0.92299881, "memory(GiB)": 728.98, "step": 34915, "train_speed(iter/s)": 0.164161 }, { "acc": 0.76335392, "epoch": 0.8858461533583101, "grad_norm": 3.3125, "learning_rate": 6.340359183432518e-06, "loss": 0.86147394, "memory(GiB)": 728.98, "step": 34920, "train_speed(iter/s)": 0.164133 }, { "acc": 0.77214928, "epoch": 0.8859729927273476, "grad_norm": 3.84375, "learning_rate": 6.339348884057259e-06, "loss": 0.90176744, "memory(GiB)": 728.98, "step": 34925, "train_speed(iter/s)": 0.164104 }, { "acc": 0.75577946, "epoch": 0.8860998320963852, "grad_norm": 3.8125, "learning_rate": 6.33833852576831e-06, "loss": 0.89648304, "memory(GiB)": 728.98, "step": 34930, "train_speed(iter/s)": 0.164072 }, { "acc": 0.75463643, "epoch": 0.8862266714654228, "grad_norm": 4.5, "learning_rate": 6.337328108610112e-06, "loss": 0.91996231, "memory(GiB)": 728.98, "step": 34935, "train_speed(iter/s)": 0.16404 }, { "acc": 0.76623745, "epoch": 0.8863535108344603, "grad_norm": 3.453125, "learning_rate": 6.336317632627115e-06, "loss": 0.88608704, "memory(GiB)": 728.98, "step": 34940, "train_speed(iter/s)": 0.164013 }, { "acc": 0.77556682, "epoch": 0.8864803502034979, "grad_norm": 3.09375, "learning_rate": 6.335307097863761e-06, "loss": 0.89570589, "memory(GiB)": 728.98, "step": 34945, "train_speed(iter/s)": 0.163986 }, { "acc": 0.75392337, "epoch": 0.8866071895725355, "grad_norm": 3.421875, "learning_rate": 6.334296504364501e-06, "loss": 0.93131676, "memory(GiB)": 728.98, "step": 34950, "train_speed(iter/s)": 0.163958 }, { "acc": 0.76141863, "epoch": 0.886734028941573, "grad_norm": 3.109375, "learning_rate": 6.333285852173792e-06, "loss": 0.85248537, "memory(GiB)": 728.98, "step": 34955, "train_speed(iter/s)": 0.163928 }, { "acc": 0.75827136, "epoch": 0.8868608683106106, "grad_norm": 3.5, "learning_rate": 6.332275141336084e-06, "loss": 0.99519606, "memory(GiB)": 728.98, "step": 34960, "train_speed(iter/s)": 0.163906 }, { "acc": 0.75958524, "epoch": 0.8869877076796482, "grad_norm": 3.09375, "learning_rate": 6.331264371895839e-06, "loss": 0.90076265, "memory(GiB)": 728.98, "step": 34965, "train_speed(iter/s)": 0.16388 }, { "acc": 0.76864319, "epoch": 0.8871145470486858, "grad_norm": 3.9375, "learning_rate": 6.330253543897514e-06, "loss": 0.89121542, "memory(GiB)": 728.98, "step": 34970, "train_speed(iter/s)": 0.163857 }, { "acc": 0.77249236, "epoch": 0.8872413864177232, "grad_norm": 3.96875, "learning_rate": 6.329242657385577e-06, "loss": 0.86966505, "memory(GiB)": 728.98, "step": 34975, "train_speed(iter/s)": 0.163829 }, { "acc": 0.76172166, "epoch": 0.8873682257867608, "grad_norm": 3.515625, "learning_rate": 6.3282317124044875e-06, "loss": 0.90325785, "memory(GiB)": 728.98, "step": 34980, "train_speed(iter/s)": 0.163803 }, { "acc": 0.76150289, "epoch": 0.8874950651557983, "grad_norm": 3.734375, "learning_rate": 6.327220708998719e-06, "loss": 0.91850958, "memory(GiB)": 728.98, "step": 34985, "train_speed(iter/s)": 0.163777 }, { "acc": 0.75383453, "epoch": 0.8876219045248359, "grad_norm": 3.09375, "learning_rate": 6.326209647212741e-06, "loss": 0.92731743, "memory(GiB)": 728.98, "step": 34990, "train_speed(iter/s)": 0.163747 }, { "acc": 0.76991539, "epoch": 0.8877487438938735, "grad_norm": 3.53125, "learning_rate": 6.325198527091024e-06, "loss": 0.89024172, "memory(GiB)": 728.98, "step": 34995, "train_speed(iter/s)": 0.163722 }, { "acc": 0.77254577, "epoch": 0.887875583262911, "grad_norm": 4.125, "learning_rate": 6.324187348678048e-06, "loss": 0.93082581, "memory(GiB)": 728.98, "step": 35000, "train_speed(iter/s)": 0.163695 }, { "epoch": 0.887875583262911, "eval_acc": 0.7569567656789116, "eval_loss": 0.8604587316513062, "eval_runtime": 1150.0652, "eval_samples_per_second": 5.539, "eval_steps_per_second": 5.539, "step": 35000 }, { "acc": 0.76474524, "epoch": 0.8880024226319486, "grad_norm": 3.234375, "learning_rate": 6.323176112018288e-06, "loss": 0.90565796, "memory(GiB)": 728.98, "step": 35005, "train_speed(iter/s)": 0.16224 }, { "acc": 0.77355609, "epoch": 0.8881292620009862, "grad_norm": 3.828125, "learning_rate": 6.322164817156229e-06, "loss": 0.86903715, "memory(GiB)": 728.98, "step": 35010, "train_speed(iter/s)": 0.162216 }, { "acc": 0.76955767, "epoch": 0.8882561013700238, "grad_norm": 3.265625, "learning_rate": 6.321153464136349e-06, "loss": 0.8615222, "memory(GiB)": 728.98, "step": 35015, "train_speed(iter/s)": 0.162192 }, { "acc": 0.78721185, "epoch": 0.8883829407390613, "grad_norm": 3.828125, "learning_rate": 6.320142053003141e-06, "loss": 0.8484436, "memory(GiB)": 728.98, "step": 35020, "train_speed(iter/s)": 0.162163 }, { "acc": 0.75014973, "epoch": 0.8885097801080989, "grad_norm": 3.515625, "learning_rate": 6.319130583801087e-06, "loss": 0.95675755, "memory(GiB)": 728.98, "step": 35025, "train_speed(iter/s)": 0.16214 }, { "acc": 0.76111641, "epoch": 0.8886366194771365, "grad_norm": 3.8125, "learning_rate": 6.318119056574684e-06, "loss": 0.90066299, "memory(GiB)": 728.98, "step": 35030, "train_speed(iter/s)": 0.162114 }, { "acc": 0.75190077, "epoch": 0.8887634588461739, "grad_norm": 3.390625, "learning_rate": 6.3171074713684235e-06, "loss": 0.91442785, "memory(GiB)": 728.98, "step": 35035, "train_speed(iter/s)": 0.162091 }, { "acc": 0.76471834, "epoch": 0.8888902982152115, "grad_norm": 3.796875, "learning_rate": 6.3160958282268e-06, "loss": 0.92154207, "memory(GiB)": 728.98, "step": 35040, "train_speed(iter/s)": 0.162066 }, { "acc": 0.76733441, "epoch": 0.889017137584249, "grad_norm": 3.625, "learning_rate": 6.315084127194317e-06, "loss": 0.8909956, "memory(GiB)": 728.98, "step": 35045, "train_speed(iter/s)": 0.162038 }, { "acc": 0.76144118, "epoch": 0.8891439769532866, "grad_norm": 3.09375, "learning_rate": 6.3140723683154715e-06, "loss": 0.9797308, "memory(GiB)": 728.98, "step": 35050, "train_speed(iter/s)": 0.162015 }, { "acc": 0.75553327, "epoch": 0.8892708163223242, "grad_norm": 3.359375, "learning_rate": 6.31306055163477e-06, "loss": 0.90129948, "memory(GiB)": 728.98, "step": 35055, "train_speed(iter/s)": 0.161987 }, { "acc": 0.77141213, "epoch": 0.8893976556913618, "grad_norm": 3.375, "learning_rate": 6.3120486771967185e-06, "loss": 0.87540693, "memory(GiB)": 728.98, "step": 35060, "train_speed(iter/s)": 0.161963 }, { "acc": 0.7358614, "epoch": 0.8895244950603993, "grad_norm": 3.390625, "learning_rate": 6.311036745045826e-06, "loss": 0.99455566, "memory(GiB)": 728.98, "step": 35065, "train_speed(iter/s)": 0.161933 }, { "acc": 0.7656455, "epoch": 0.8896513344294369, "grad_norm": 4.1875, "learning_rate": 6.310024755226605e-06, "loss": 0.94002991, "memory(GiB)": 728.98, "step": 35070, "train_speed(iter/s)": 0.161907 }, { "acc": 0.75958152, "epoch": 0.8897781737984745, "grad_norm": 3.5, "learning_rate": 6.309012707783567e-06, "loss": 0.90406322, "memory(GiB)": 728.98, "step": 35075, "train_speed(iter/s)": 0.161876 }, { "acc": 0.75482244, "epoch": 0.889905013167512, "grad_norm": 3.453125, "learning_rate": 6.308000602761233e-06, "loss": 0.93527813, "memory(GiB)": 728.98, "step": 35080, "train_speed(iter/s)": 0.161849 }, { "acc": 0.75511508, "epoch": 0.8900318525365496, "grad_norm": 4.0, "learning_rate": 6.306988440204118e-06, "loss": 0.94965258, "memory(GiB)": 728.98, "step": 35085, "train_speed(iter/s)": 0.161825 }, { "acc": 0.7684021, "epoch": 0.8901586919055872, "grad_norm": 3.671875, "learning_rate": 6.3059762201567485e-06, "loss": 0.89926901, "memory(GiB)": 728.98, "step": 35090, "train_speed(iter/s)": 0.161801 }, { "acc": 0.75213022, "epoch": 0.8902855312746246, "grad_norm": 3.53125, "learning_rate": 6.304963942663644e-06, "loss": 0.88402443, "memory(GiB)": 728.98, "step": 35095, "train_speed(iter/s)": 0.161773 }, { "acc": 0.76389928, "epoch": 0.8904123706436622, "grad_norm": 3.703125, "learning_rate": 6.303951607769334e-06, "loss": 0.87656527, "memory(GiB)": 728.98, "step": 35100, "train_speed(iter/s)": 0.161742 }, { "acc": 0.76936803, "epoch": 0.8905392100126998, "grad_norm": 2.8125, "learning_rate": 6.302939215518348e-06, "loss": 0.89607868, "memory(GiB)": 728.98, "step": 35105, "train_speed(iter/s)": 0.161716 }, { "acc": 0.75270014, "epoch": 0.8906660493817373, "grad_norm": 3.265625, "learning_rate": 6.301926765955218e-06, "loss": 0.89074478, "memory(GiB)": 728.98, "step": 35110, "train_speed(iter/s)": 0.161686 }, { "acc": 0.76193666, "epoch": 0.8907928887507749, "grad_norm": 3.171875, "learning_rate": 6.300914259124475e-06, "loss": 0.89397001, "memory(GiB)": 728.98, "step": 35115, "train_speed(iter/s)": 0.161659 }, { "acc": 0.77612724, "epoch": 0.8909197281198125, "grad_norm": 4.0, "learning_rate": 6.299901695070661e-06, "loss": 0.86241102, "memory(GiB)": 728.98, "step": 35120, "train_speed(iter/s)": 0.161632 }, { "acc": 0.77745972, "epoch": 0.89104656748885, "grad_norm": 4.53125, "learning_rate": 6.298889073838312e-06, "loss": 0.88450031, "memory(GiB)": 728.98, "step": 35125, "train_speed(iter/s)": 0.161611 }, { "acc": 0.76986918, "epoch": 0.8911734068578876, "grad_norm": 3.4375, "learning_rate": 6.297876395471972e-06, "loss": 0.8658392, "memory(GiB)": 728.98, "step": 35130, "train_speed(iter/s)": 0.161585 }, { "acc": 0.76872654, "epoch": 0.8913002462269252, "grad_norm": 3.84375, "learning_rate": 6.296863660016185e-06, "loss": 0.86988144, "memory(GiB)": 728.98, "step": 35135, "train_speed(iter/s)": 0.161554 }, { "acc": 0.76483564, "epoch": 0.8914270855959627, "grad_norm": 3.265625, "learning_rate": 6.2958508675154965e-06, "loss": 0.91925631, "memory(GiB)": 728.98, "step": 35140, "train_speed(iter/s)": 0.161528 }, { "acc": 0.77552948, "epoch": 0.8915539249650003, "grad_norm": 3.21875, "learning_rate": 6.294838018014457e-06, "loss": 0.9099411, "memory(GiB)": 728.98, "step": 35145, "train_speed(iter/s)": 0.161505 }, { "acc": 0.77126579, "epoch": 0.8916807643340379, "grad_norm": 3.40625, "learning_rate": 6.2938251115576195e-06, "loss": 0.84492388, "memory(GiB)": 728.98, "step": 35150, "train_speed(iter/s)": 0.161481 }, { "acc": 0.77681603, "epoch": 0.8918076037030753, "grad_norm": 3.203125, "learning_rate": 6.292812148189539e-06, "loss": 0.87469711, "memory(GiB)": 728.98, "step": 35155, "train_speed(iter/s)": 0.161455 }, { "acc": 0.76404209, "epoch": 0.8919344430721129, "grad_norm": 4.0625, "learning_rate": 6.29179912795477e-06, "loss": 0.8922121, "memory(GiB)": 728.98, "step": 35160, "train_speed(iter/s)": 0.161426 }, { "acc": 0.76993833, "epoch": 0.8920612824411505, "grad_norm": 3.375, "learning_rate": 6.290786050897873e-06, "loss": 0.88589907, "memory(GiB)": 728.98, "step": 35165, "train_speed(iter/s)": 0.161401 }, { "acc": 0.7679213, "epoch": 0.892188121810188, "grad_norm": 3.875, "learning_rate": 6.28977291706341e-06, "loss": 0.92644176, "memory(GiB)": 728.98, "step": 35170, "train_speed(iter/s)": 0.161378 }, { "acc": 0.77629142, "epoch": 0.8923149611792256, "grad_norm": 3.5625, "learning_rate": 6.288759726495945e-06, "loss": 0.88000975, "memory(GiB)": 728.98, "step": 35175, "train_speed(iter/s)": 0.161352 }, { "acc": 0.78142066, "epoch": 0.8924418005482632, "grad_norm": 3.609375, "learning_rate": 6.287746479240048e-06, "loss": 0.81696901, "memory(GiB)": 728.98, "step": 35180, "train_speed(iter/s)": 0.161327 }, { "acc": 0.76461496, "epoch": 0.8925686399173007, "grad_norm": 3.546875, "learning_rate": 6.2867331753402845e-06, "loss": 0.84248781, "memory(GiB)": 728.98, "step": 35185, "train_speed(iter/s)": 0.161304 }, { "acc": 0.77604494, "epoch": 0.8926954792863383, "grad_norm": 3.140625, "learning_rate": 6.28571981484123e-06, "loss": 0.87817087, "memory(GiB)": 728.98, "step": 35190, "train_speed(iter/s)": 0.161279 }, { "acc": 0.76833992, "epoch": 0.8928223186553759, "grad_norm": 3.453125, "learning_rate": 6.284706397787458e-06, "loss": 0.85952177, "memory(GiB)": 728.98, "step": 35195, "train_speed(iter/s)": 0.161255 }, { "acc": 0.76377573, "epoch": 0.8929491580244134, "grad_norm": 3.4375, "learning_rate": 6.283692924223544e-06, "loss": 0.88764381, "memory(GiB)": 728.98, "step": 35200, "train_speed(iter/s)": 0.161228 }, { "acc": 0.7681994, "epoch": 0.893075997393451, "grad_norm": 2.765625, "learning_rate": 6.2826793941940675e-06, "loss": 0.8800293, "memory(GiB)": 728.98, "step": 35205, "train_speed(iter/s)": 0.161206 }, { "acc": 0.76414356, "epoch": 0.8932028367624886, "grad_norm": 3.109375, "learning_rate": 6.281665807743611e-06, "loss": 0.8644393, "memory(GiB)": 728.98, "step": 35210, "train_speed(iter/s)": 0.161183 }, { "acc": 0.76385674, "epoch": 0.893329676131526, "grad_norm": 4.125, "learning_rate": 6.28065216491676e-06, "loss": 0.90129433, "memory(GiB)": 728.98, "step": 35215, "train_speed(iter/s)": 0.161156 }, { "acc": 0.77314291, "epoch": 0.8934565155005636, "grad_norm": 3.296875, "learning_rate": 6.279638465758101e-06, "loss": 0.88989735, "memory(GiB)": 728.98, "step": 35220, "train_speed(iter/s)": 0.16113 }, { "acc": 0.78438187, "epoch": 0.8935833548696012, "grad_norm": 3.28125, "learning_rate": 6.2786247103122235e-06, "loss": 0.86737947, "memory(GiB)": 728.98, "step": 35225, "train_speed(iter/s)": 0.161101 }, { "acc": 0.77868242, "epoch": 0.8937101942386387, "grad_norm": 3.65625, "learning_rate": 6.277610898623718e-06, "loss": 0.92064562, "memory(GiB)": 728.98, "step": 35230, "train_speed(iter/s)": 0.161073 }, { "acc": 0.77420182, "epoch": 0.8938370336076763, "grad_norm": 3.09375, "learning_rate": 6.27659703073718e-06, "loss": 0.84302044, "memory(GiB)": 728.98, "step": 35235, "train_speed(iter/s)": 0.161046 }, { "acc": 0.75449891, "epoch": 0.8939638729767139, "grad_norm": 4.25, "learning_rate": 6.275583106697207e-06, "loss": 0.9473875, "memory(GiB)": 728.98, "step": 35240, "train_speed(iter/s)": 0.16102 }, { "acc": 0.76619244, "epoch": 0.8940907123457514, "grad_norm": 3.78125, "learning_rate": 6.274569126548396e-06, "loss": 0.92053757, "memory(GiB)": 728.98, "step": 35245, "train_speed(iter/s)": 0.160993 }, { "acc": 0.75796976, "epoch": 0.894217551714789, "grad_norm": 3.625, "learning_rate": 6.273555090335351e-06, "loss": 0.92134647, "memory(GiB)": 728.98, "step": 35250, "train_speed(iter/s)": 0.160968 }, { "acc": 0.77609444, "epoch": 0.8943443910838266, "grad_norm": 3.390625, "learning_rate": 6.272540998102675e-06, "loss": 0.85152044, "memory(GiB)": 728.98, "step": 35255, "train_speed(iter/s)": 0.160936 }, { "acc": 0.75843058, "epoch": 0.8944712304528641, "grad_norm": 3.5625, "learning_rate": 6.271526849894975e-06, "loss": 0.9265934, "memory(GiB)": 728.98, "step": 35260, "train_speed(iter/s)": 0.160908 }, { "acc": 0.7749177, "epoch": 0.8945980698219017, "grad_norm": 3.359375, "learning_rate": 6.270512645756861e-06, "loss": 0.84986115, "memory(GiB)": 728.98, "step": 35265, "train_speed(iter/s)": 0.160877 }, { "acc": 0.76833067, "epoch": 0.8947249091909393, "grad_norm": 3.578125, "learning_rate": 6.269498385732944e-06, "loss": 0.88627491, "memory(GiB)": 728.98, "step": 35270, "train_speed(iter/s)": 0.160851 }, { "acc": 0.7602839, "epoch": 0.8948517485599767, "grad_norm": 3.921875, "learning_rate": 6.268484069867836e-06, "loss": 0.92331133, "memory(GiB)": 728.98, "step": 35275, "train_speed(iter/s)": 0.160828 }, { "acc": 0.77228575, "epoch": 0.8949785879290143, "grad_norm": 3.375, "learning_rate": 6.2674696982061565e-06, "loss": 0.87201691, "memory(GiB)": 728.98, "step": 35280, "train_speed(iter/s)": 0.1608 }, { "acc": 0.76194472, "epoch": 0.8951054272980519, "grad_norm": 3.65625, "learning_rate": 6.2664552707925245e-06, "loss": 0.96419935, "memory(GiB)": 728.98, "step": 35285, "train_speed(iter/s)": 0.160777 }, { "acc": 0.75339947, "epoch": 0.8952322666670894, "grad_norm": 3.375, "learning_rate": 6.265440787671558e-06, "loss": 0.9092452, "memory(GiB)": 728.98, "step": 35290, "train_speed(iter/s)": 0.160747 }, { "acc": 0.76095443, "epoch": 0.895359106036127, "grad_norm": 4.125, "learning_rate": 6.264426248887886e-06, "loss": 0.88846636, "memory(GiB)": 728.98, "step": 35295, "train_speed(iter/s)": 0.160714 }, { "acc": 0.77538996, "epoch": 0.8954859454051646, "grad_norm": 3.75, "learning_rate": 6.263411654486129e-06, "loss": 0.83487387, "memory(GiB)": 728.98, "step": 35300, "train_speed(iter/s)": 0.160688 }, { "acc": 0.76073313, "epoch": 0.8956127847742021, "grad_norm": 4.03125, "learning_rate": 6.26239700451092e-06, "loss": 0.91434736, "memory(GiB)": 728.98, "step": 35305, "train_speed(iter/s)": 0.160661 }, { "acc": 0.76688194, "epoch": 0.8957396241432397, "grad_norm": 3.9375, "learning_rate": 6.26138229900689e-06, "loss": 0.86086082, "memory(GiB)": 728.98, "step": 35310, "train_speed(iter/s)": 0.160636 }, { "acc": 0.77772222, "epoch": 0.8958664635122773, "grad_norm": 3.140625, "learning_rate": 6.260367538018671e-06, "loss": 0.83017883, "memory(GiB)": 728.98, "step": 35315, "train_speed(iter/s)": 0.160607 }, { "acc": 0.77208486, "epoch": 0.8959933028813148, "grad_norm": 3.28125, "learning_rate": 6.259352721590899e-06, "loss": 0.87234745, "memory(GiB)": 728.98, "step": 35320, "train_speed(iter/s)": 0.160582 }, { "acc": 0.76011753, "epoch": 0.8961201422503524, "grad_norm": 4.09375, "learning_rate": 6.258337849768213e-06, "loss": 0.91250448, "memory(GiB)": 728.98, "step": 35325, "train_speed(iter/s)": 0.160562 }, { "acc": 0.77316861, "epoch": 0.89624698161939, "grad_norm": 3.75, "learning_rate": 6.257322922595257e-06, "loss": 0.85967417, "memory(GiB)": 728.98, "step": 35330, "train_speed(iter/s)": 0.16054 }, { "acc": 0.76449904, "epoch": 0.8963738209884274, "grad_norm": 2.859375, "learning_rate": 6.25630794011667e-06, "loss": 0.89976988, "memory(GiB)": 728.98, "step": 35335, "train_speed(iter/s)": 0.160512 }, { "acc": 0.77356834, "epoch": 0.896500660357465, "grad_norm": 3.15625, "learning_rate": 6.255292902377099e-06, "loss": 0.86299067, "memory(GiB)": 728.98, "step": 35340, "train_speed(iter/s)": 0.160487 }, { "acc": 0.77150702, "epoch": 0.8966274997265026, "grad_norm": 3.421875, "learning_rate": 6.254277809421193e-06, "loss": 0.89579067, "memory(GiB)": 728.98, "step": 35345, "train_speed(iter/s)": 0.160461 }, { "acc": 0.76149502, "epoch": 0.8967543390955401, "grad_norm": 4.0, "learning_rate": 6.2532626612936035e-06, "loss": 0.95121965, "memory(GiB)": 728.98, "step": 35350, "train_speed(iter/s)": 0.160436 }, { "acc": 0.76218743, "epoch": 0.8968811784645777, "grad_norm": 5.6875, "learning_rate": 6.252247458038981e-06, "loss": 0.88613529, "memory(GiB)": 728.98, "step": 35355, "train_speed(iter/s)": 0.16041 }, { "acc": 0.77347298, "epoch": 0.8970080178336153, "grad_norm": 3.640625, "learning_rate": 6.2512321997019845e-06, "loss": 0.86858892, "memory(GiB)": 728.98, "step": 35360, "train_speed(iter/s)": 0.160381 }, { "acc": 0.75014205, "epoch": 0.8971348572026528, "grad_norm": 4.71875, "learning_rate": 6.25021688632727e-06, "loss": 0.91599417, "memory(GiB)": 728.98, "step": 35365, "train_speed(iter/s)": 0.160358 }, { "acc": 0.77800455, "epoch": 0.8972616965716904, "grad_norm": 3.515625, "learning_rate": 6.2492015179594975e-06, "loss": 0.8715579, "memory(GiB)": 728.98, "step": 35370, "train_speed(iter/s)": 0.160332 }, { "acc": 0.77625771, "epoch": 0.897388535940728, "grad_norm": 3.453125, "learning_rate": 6.24818609464333e-06, "loss": 0.83810701, "memory(GiB)": 728.98, "step": 35375, "train_speed(iter/s)": 0.160306 }, { "acc": 0.77211876, "epoch": 0.8975153753097656, "grad_norm": 3.65625, "learning_rate": 6.247170616423434e-06, "loss": 0.91804485, "memory(GiB)": 728.98, "step": 35380, "train_speed(iter/s)": 0.160284 }, { "acc": 0.77363148, "epoch": 0.8976422146788031, "grad_norm": 3.28125, "learning_rate": 6.246155083344476e-06, "loss": 0.8702837, "memory(GiB)": 728.98, "step": 35385, "train_speed(iter/s)": 0.160257 }, { "acc": 0.76929107, "epoch": 0.8977690540478407, "grad_norm": 4.09375, "learning_rate": 6.245139495451127e-06, "loss": 0.86969795, "memory(GiB)": 728.98, "step": 35390, "train_speed(iter/s)": 0.160235 }, { "acc": 0.77366185, "epoch": 0.8978958934168781, "grad_norm": 3.3125, "learning_rate": 6.244123852788058e-06, "loss": 0.89854813, "memory(GiB)": 728.98, "step": 35395, "train_speed(iter/s)": 0.160211 }, { "acc": 0.78338203, "epoch": 0.8980227327859157, "grad_norm": 3.953125, "learning_rate": 6.243108155399944e-06, "loss": 0.85611706, "memory(GiB)": 728.98, "step": 35400, "train_speed(iter/s)": 0.160188 }, { "acc": 0.77211833, "epoch": 0.8981495721549533, "grad_norm": 3.65625, "learning_rate": 6.242092403331466e-06, "loss": 0.88166466, "memory(GiB)": 728.98, "step": 35405, "train_speed(iter/s)": 0.160166 }, { "acc": 0.76360741, "epoch": 0.8982764115239908, "grad_norm": 2.984375, "learning_rate": 6.241076596627298e-06, "loss": 0.8986661, "memory(GiB)": 728.98, "step": 35410, "train_speed(iter/s)": 0.160142 }, { "acc": 0.76296992, "epoch": 0.8984032508930284, "grad_norm": 3.1875, "learning_rate": 6.240060735332125e-06, "loss": 0.95187359, "memory(GiB)": 728.98, "step": 35415, "train_speed(iter/s)": 0.160118 }, { "acc": 0.76167479, "epoch": 0.898530090262066, "grad_norm": 3.703125, "learning_rate": 6.239044819490633e-06, "loss": 0.89806175, "memory(GiB)": 728.98, "step": 35420, "train_speed(iter/s)": 0.160096 }, { "acc": 0.77737789, "epoch": 0.8986569296311036, "grad_norm": 3.78125, "learning_rate": 6.238028849147505e-06, "loss": 0.85752573, "memory(GiB)": 728.98, "step": 35425, "train_speed(iter/s)": 0.160074 }, { "acc": 0.76694031, "epoch": 0.8987837690001411, "grad_norm": 3.796875, "learning_rate": 6.237012824347435e-06, "loss": 0.85152874, "memory(GiB)": 728.98, "step": 35430, "train_speed(iter/s)": 0.160046 }, { "acc": 0.76284819, "epoch": 0.8989106083691787, "grad_norm": 4.5, "learning_rate": 6.23599674513511e-06, "loss": 0.90854549, "memory(GiB)": 728.98, "step": 35435, "train_speed(iter/s)": 0.160015 }, { "acc": 0.76605973, "epoch": 0.8990374477382163, "grad_norm": 3.546875, "learning_rate": 6.234980611555227e-06, "loss": 0.88387079, "memory(GiB)": 728.98, "step": 35440, "train_speed(iter/s)": 0.159991 }, { "acc": 0.76556416, "epoch": 0.8991642871072538, "grad_norm": 3.453125, "learning_rate": 6.233964423652482e-06, "loss": 0.89856625, "memory(GiB)": 728.98, "step": 35445, "train_speed(iter/s)": 0.159964 }, { "acc": 0.74386215, "epoch": 0.8992911264762914, "grad_norm": 4.03125, "learning_rate": 6.232948181471574e-06, "loss": 0.94206467, "memory(GiB)": 728.98, "step": 35450, "train_speed(iter/s)": 0.159938 }, { "acc": 0.75545578, "epoch": 0.8994179658453288, "grad_norm": 3.546875, "learning_rate": 6.231931885057204e-06, "loss": 0.90631342, "memory(GiB)": 728.98, "step": 35455, "train_speed(iter/s)": 0.159914 }, { "acc": 0.75945206, "epoch": 0.8995448052143664, "grad_norm": 3.34375, "learning_rate": 6.230915534454074e-06, "loss": 0.936901, "memory(GiB)": 728.98, "step": 35460, "train_speed(iter/s)": 0.159885 }, { "acc": 0.77333255, "epoch": 0.899671644583404, "grad_norm": 4.09375, "learning_rate": 6.229899129706893e-06, "loss": 0.89911089, "memory(GiB)": 728.98, "step": 35465, "train_speed(iter/s)": 0.15986 }, { "acc": 0.77009525, "epoch": 0.8997984839524416, "grad_norm": 3.609375, "learning_rate": 6.228882670860367e-06, "loss": 0.84744644, "memory(GiB)": 728.98, "step": 35470, "train_speed(iter/s)": 0.159836 }, { "acc": 0.75627909, "epoch": 0.8999253233214791, "grad_norm": 4.25, "learning_rate": 6.227866157959208e-06, "loss": 0.91098213, "memory(GiB)": 728.98, "step": 35475, "train_speed(iter/s)": 0.15981 }, { "acc": 0.77562771, "epoch": 0.9000521626905167, "grad_norm": 3.109375, "learning_rate": 6.2268495910481295e-06, "loss": 0.86638422, "memory(GiB)": 728.98, "step": 35480, "train_speed(iter/s)": 0.159781 }, { "acc": 0.76551538, "epoch": 0.9001790020595543, "grad_norm": 3.296875, "learning_rate": 6.225832970171847e-06, "loss": 0.910884, "memory(GiB)": 728.98, "step": 35485, "train_speed(iter/s)": 0.159758 }, { "acc": 0.7674562, "epoch": 0.9003058414285918, "grad_norm": 3.21875, "learning_rate": 6.224816295375077e-06, "loss": 0.89326582, "memory(GiB)": 728.98, "step": 35490, "train_speed(iter/s)": 0.159726 }, { "acc": 0.7631186, "epoch": 0.9004326807976294, "grad_norm": 3.9375, "learning_rate": 6.223799566702541e-06, "loss": 0.88129082, "memory(GiB)": 728.98, "step": 35495, "train_speed(iter/s)": 0.159701 }, { "acc": 0.77089367, "epoch": 0.900559520166667, "grad_norm": 3.796875, "learning_rate": 6.222782784198961e-06, "loss": 0.8948391, "memory(GiB)": 728.98, "step": 35500, "train_speed(iter/s)": 0.159678 }, { "epoch": 0.900559520166667, "eval_acc": 0.7570444968138976, "eval_loss": 0.8600671291351318, "eval_runtime": 1149.2445, "eval_samples_per_second": 5.543, "eval_steps_per_second": 5.543, "step": 35500 }, { "acc": 0.7768981, "epoch": 0.9006863595357045, "grad_norm": 3.84375, "learning_rate": 6.221765947909062e-06, "loss": 0.86536303, "memory(GiB)": 728.98, "step": 35505, "train_speed(iter/s)": 0.158317 }, { "acc": 0.77040386, "epoch": 0.9008131989047421, "grad_norm": 3.296875, "learning_rate": 6.220749057877573e-06, "loss": 0.90933323, "memory(GiB)": 728.98, "step": 35510, "train_speed(iter/s)": 0.15829 }, { "acc": 0.76005569, "epoch": 0.9009400382737796, "grad_norm": 3.625, "learning_rate": 6.21973211414922e-06, "loss": 0.90143614, "memory(GiB)": 728.98, "step": 35515, "train_speed(iter/s)": 0.158266 }, { "acc": 0.78150482, "epoch": 0.9010668776428171, "grad_norm": 4.0625, "learning_rate": 6.218715116768741e-06, "loss": 0.83526716, "memory(GiB)": 728.98, "step": 35520, "train_speed(iter/s)": 0.15824 }, { "acc": 0.76315303, "epoch": 0.9011937170118547, "grad_norm": 4.4375, "learning_rate": 6.217698065780865e-06, "loss": 0.91098471, "memory(GiB)": 728.98, "step": 35525, "train_speed(iter/s)": 0.158212 }, { "acc": 0.77842278, "epoch": 0.9013205563808923, "grad_norm": 3.5, "learning_rate": 6.216680961230331e-06, "loss": 0.87728415, "memory(GiB)": 728.98, "step": 35530, "train_speed(iter/s)": 0.15819 }, { "acc": 0.77548609, "epoch": 0.9014473957499298, "grad_norm": 3.4375, "learning_rate": 6.2156638031618796e-06, "loss": 0.8624402, "memory(GiB)": 728.98, "step": 35535, "train_speed(iter/s)": 0.158165 }, { "acc": 0.76645403, "epoch": 0.9015742351189674, "grad_norm": 3.765625, "learning_rate": 6.214646591620249e-06, "loss": 0.90128345, "memory(GiB)": 728.98, "step": 35540, "train_speed(iter/s)": 0.158141 }, { "acc": 0.77421207, "epoch": 0.901701074488005, "grad_norm": 3.078125, "learning_rate": 6.213629326650186e-06, "loss": 0.83307323, "memory(GiB)": 728.98, "step": 35545, "train_speed(iter/s)": 0.158117 }, { "acc": 0.77457213, "epoch": 0.9018279138570425, "grad_norm": 3.40625, "learning_rate": 6.212612008296434e-06, "loss": 0.89032803, "memory(GiB)": 728.98, "step": 35550, "train_speed(iter/s)": 0.158091 }, { "acc": 0.76656899, "epoch": 0.9019547532260801, "grad_norm": 2.703125, "learning_rate": 6.211594636603746e-06, "loss": 0.85084305, "memory(GiB)": 728.98, "step": 35555, "train_speed(iter/s)": 0.158063 }, { "acc": 0.76680064, "epoch": 0.9020815925951177, "grad_norm": 3.78125, "learning_rate": 6.210577211616869e-06, "loss": 0.93487463, "memory(GiB)": 728.98, "step": 35560, "train_speed(iter/s)": 0.158038 }, { "acc": 0.75962234, "epoch": 0.9022084319641552, "grad_norm": 4.0625, "learning_rate": 6.209559733380558e-06, "loss": 0.90448799, "memory(GiB)": 728.98, "step": 35565, "train_speed(iter/s)": 0.158012 }, { "acc": 0.74865327, "epoch": 0.9023352713331928, "grad_norm": 3.359375, "learning_rate": 6.208542201939567e-06, "loss": 0.95547028, "memory(GiB)": 728.98, "step": 35570, "train_speed(iter/s)": 0.157983 }, { "acc": 0.76432853, "epoch": 0.9024621107022303, "grad_norm": 3.109375, "learning_rate": 6.2075246173386574e-06, "loss": 0.88894205, "memory(GiB)": 728.98, "step": 35575, "train_speed(iter/s)": 0.157961 }, { "acc": 0.76525106, "epoch": 0.9025889500712678, "grad_norm": 3.6875, "learning_rate": 6.206506979622586e-06, "loss": 0.90240116, "memory(GiB)": 728.98, "step": 35580, "train_speed(iter/s)": 0.157932 }, { "acc": 0.75335932, "epoch": 0.9027157894403054, "grad_norm": 3.484375, "learning_rate": 6.205489288836117e-06, "loss": 0.93895912, "memory(GiB)": 728.98, "step": 35585, "train_speed(iter/s)": 0.157904 }, { "acc": 0.77446256, "epoch": 0.902842628809343, "grad_norm": 3.078125, "learning_rate": 6.204471545024014e-06, "loss": 0.90887108, "memory(GiB)": 728.98, "step": 35590, "train_speed(iter/s)": 0.157884 }, { "acc": 0.76587734, "epoch": 0.9029694681783805, "grad_norm": 3.578125, "learning_rate": 6.203453748231046e-06, "loss": 0.86931143, "memory(GiB)": 728.98, "step": 35595, "train_speed(iter/s)": 0.15786 }, { "acc": 0.76680498, "epoch": 0.9030963075474181, "grad_norm": 4.09375, "learning_rate": 6.202435898501983e-06, "loss": 0.87658329, "memory(GiB)": 728.98, "step": 35600, "train_speed(iter/s)": 0.157834 }, { "acc": 0.77231283, "epoch": 0.9032231469164557, "grad_norm": 3.140625, "learning_rate": 6.201417995881594e-06, "loss": 0.92791748, "memory(GiB)": 728.98, "step": 35605, "train_speed(iter/s)": 0.157805 }, { "acc": 0.77256446, "epoch": 0.9033499862854932, "grad_norm": 3.09375, "learning_rate": 6.200400040414658e-06, "loss": 0.85640574, "memory(GiB)": 728.98, "step": 35610, "train_speed(iter/s)": 0.157779 }, { "acc": 0.76097441, "epoch": 0.9034768256545308, "grad_norm": 3.1875, "learning_rate": 6.199382032145947e-06, "loss": 0.87538548, "memory(GiB)": 728.98, "step": 35615, "train_speed(iter/s)": 0.157752 }, { "acc": 0.76470704, "epoch": 0.9036036650235684, "grad_norm": 3.90625, "learning_rate": 6.198363971120241e-06, "loss": 0.86402016, "memory(GiB)": 728.98, "step": 35620, "train_speed(iter/s)": 0.157729 }, { "acc": 0.77087722, "epoch": 0.9037305043926059, "grad_norm": 3.53125, "learning_rate": 6.197345857382324e-06, "loss": 0.8776535, "memory(GiB)": 728.98, "step": 35625, "train_speed(iter/s)": 0.157707 }, { "acc": 0.75557623, "epoch": 0.9038573437616435, "grad_norm": 3.5, "learning_rate": 6.196327690976978e-06, "loss": 0.92621632, "memory(GiB)": 728.98, "step": 35630, "train_speed(iter/s)": 0.157682 }, { "acc": 0.76341891, "epoch": 0.903984183130681, "grad_norm": 3.4375, "learning_rate": 6.195309471948987e-06, "loss": 0.9022687, "memory(GiB)": 728.98, "step": 35635, "train_speed(iter/s)": 0.157659 }, { "acc": 0.76375971, "epoch": 0.9041110224997185, "grad_norm": 3.625, "learning_rate": 6.194291200343142e-06, "loss": 0.90075235, "memory(GiB)": 728.98, "step": 35640, "train_speed(iter/s)": 0.157636 }, { "acc": 0.76342435, "epoch": 0.9042378618687561, "grad_norm": 3.46875, "learning_rate": 6.193272876204232e-06, "loss": 0.85886955, "memory(GiB)": 728.98, "step": 35645, "train_speed(iter/s)": 0.157615 }, { "acc": 0.75282879, "epoch": 0.9043647012377937, "grad_norm": 3.15625, "learning_rate": 6.192254499577051e-06, "loss": 0.90842228, "memory(GiB)": 728.98, "step": 35650, "train_speed(iter/s)": 0.157594 }, { "acc": 0.77027349, "epoch": 0.9044915406068312, "grad_norm": 3.265625, "learning_rate": 6.191236070506393e-06, "loss": 0.89139299, "memory(GiB)": 728.98, "step": 35655, "train_speed(iter/s)": 0.157568 }, { "acc": 0.76662059, "epoch": 0.9046183799758688, "grad_norm": 3.078125, "learning_rate": 6.190217589037054e-06, "loss": 0.87677212, "memory(GiB)": 728.98, "step": 35660, "train_speed(iter/s)": 0.157538 }, { "acc": 0.75752258, "epoch": 0.9047452193449064, "grad_norm": 3.953125, "learning_rate": 6.189199055213837e-06, "loss": 0.9000845, "memory(GiB)": 728.98, "step": 35665, "train_speed(iter/s)": 0.157519 }, { "acc": 0.75629721, "epoch": 0.9048720587139439, "grad_norm": 3.125, "learning_rate": 6.188180469081543e-06, "loss": 0.87667179, "memory(GiB)": 728.98, "step": 35670, "train_speed(iter/s)": 0.15749 }, { "acc": 0.77440882, "epoch": 0.9049988980829815, "grad_norm": 3.46875, "learning_rate": 6.187161830684975e-06, "loss": 0.86168423, "memory(GiB)": 728.98, "step": 35675, "train_speed(iter/s)": 0.157471 }, { "acc": 0.77692099, "epoch": 0.9051257374520191, "grad_norm": 3.765625, "learning_rate": 6.186143140068941e-06, "loss": 0.84311514, "memory(GiB)": 728.98, "step": 35680, "train_speed(iter/s)": 0.157445 }, { "acc": 0.75636888, "epoch": 0.9052525768210566, "grad_norm": 4.1875, "learning_rate": 6.185124397278248e-06, "loss": 0.9047925, "memory(GiB)": 728.98, "step": 35685, "train_speed(iter/s)": 0.157418 }, { "acc": 0.75578203, "epoch": 0.9053794161900942, "grad_norm": 3.515625, "learning_rate": 6.184105602357712e-06, "loss": 0.92663002, "memory(GiB)": 728.98, "step": 35690, "train_speed(iter/s)": 0.157399 }, { "acc": 0.76508904, "epoch": 0.9055062555591317, "grad_norm": 3.265625, "learning_rate": 6.183086755352141e-06, "loss": 0.86891909, "memory(GiB)": 728.98, "step": 35695, "train_speed(iter/s)": 0.157375 }, { "acc": 0.75400057, "epoch": 0.9056330949281692, "grad_norm": 3.078125, "learning_rate": 6.182067856306354e-06, "loss": 0.90043373, "memory(GiB)": 728.98, "step": 35700, "train_speed(iter/s)": 0.157352 }, { "acc": 0.76648612, "epoch": 0.9057599342972068, "grad_norm": 3.3125, "learning_rate": 6.181048905265169e-06, "loss": 0.90679388, "memory(GiB)": 728.98, "step": 35705, "train_speed(iter/s)": 0.157324 }, { "acc": 0.76785727, "epoch": 0.9058867736662444, "grad_norm": 3.71875, "learning_rate": 6.180029902273403e-06, "loss": 0.89455366, "memory(GiB)": 728.98, "step": 35710, "train_speed(iter/s)": 0.157299 }, { "acc": 0.77090425, "epoch": 0.9060136130352819, "grad_norm": 3.453125, "learning_rate": 6.179010847375885e-06, "loss": 0.90320845, "memory(GiB)": 728.98, "step": 35715, "train_speed(iter/s)": 0.157273 }, { "acc": 0.76239481, "epoch": 0.9061404524043195, "grad_norm": 3.453125, "learning_rate": 6.177991740617434e-06, "loss": 0.89915686, "memory(GiB)": 728.98, "step": 35720, "train_speed(iter/s)": 0.157244 }, { "acc": 0.75660267, "epoch": 0.9062672917733571, "grad_norm": 3.640625, "learning_rate": 6.17697258204288e-06, "loss": 0.92598209, "memory(GiB)": 728.98, "step": 35725, "train_speed(iter/s)": 0.157216 }, { "acc": 0.75607553, "epoch": 0.9063941311423946, "grad_norm": 3.546875, "learning_rate": 6.175953371697052e-06, "loss": 0.90326767, "memory(GiB)": 728.98, "step": 35730, "train_speed(iter/s)": 0.157194 }, { "acc": 0.76830597, "epoch": 0.9065209705114322, "grad_norm": 3.28125, "learning_rate": 6.174934109624782e-06, "loss": 0.89082136, "memory(GiB)": 728.98, "step": 35735, "train_speed(iter/s)": 0.157173 }, { "acc": 0.7633357, "epoch": 0.9066478098804698, "grad_norm": 3.5625, "learning_rate": 6.173914795870905e-06, "loss": 0.87814779, "memory(GiB)": 728.98, "step": 35740, "train_speed(iter/s)": 0.157149 }, { "acc": 0.77577434, "epoch": 0.9067746492495073, "grad_norm": 3.0625, "learning_rate": 6.172895430480257e-06, "loss": 0.85983095, "memory(GiB)": 728.98, "step": 35745, "train_speed(iter/s)": 0.15712 }, { "acc": 0.76328444, "epoch": 0.9069014886185449, "grad_norm": 3.59375, "learning_rate": 6.171876013497675e-06, "loss": 0.88902302, "memory(GiB)": 728.98, "step": 35750, "train_speed(iter/s)": 0.157098 }, { "acc": 0.76881042, "epoch": 0.9070283279875824, "grad_norm": 5.125, "learning_rate": 6.170856544968e-06, "loss": 0.9200242, "memory(GiB)": 728.98, "step": 35755, "train_speed(iter/s)": 0.157075 }, { "acc": 0.76809244, "epoch": 0.9071551673566199, "grad_norm": 3.9375, "learning_rate": 6.1698370249360775e-06, "loss": 0.90721931, "memory(GiB)": 728.98, "step": 35760, "train_speed(iter/s)": 0.157048 }, { "acc": 0.74796014, "epoch": 0.9072820067256575, "grad_norm": 3.765625, "learning_rate": 6.16881745344675e-06, "loss": 0.91053553, "memory(GiB)": 728.98, "step": 35765, "train_speed(iter/s)": 0.157021 }, { "acc": 0.76529517, "epoch": 0.9074088460946951, "grad_norm": 3.390625, "learning_rate": 6.167797830544868e-06, "loss": 0.87018776, "memory(GiB)": 728.98, "step": 35770, "train_speed(iter/s)": 0.156995 }, { "acc": 0.75117249, "epoch": 0.9075356854637326, "grad_norm": 3.171875, "learning_rate": 6.166778156275281e-06, "loss": 0.96331377, "memory(GiB)": 728.98, "step": 35775, "train_speed(iter/s)": 0.156972 }, { "acc": 0.76592975, "epoch": 0.9076625248327702, "grad_norm": 4.1875, "learning_rate": 6.16575843068284e-06, "loss": 0.87165203, "memory(GiB)": 728.98, "step": 35780, "train_speed(iter/s)": 0.15695 }, { "acc": 0.76606507, "epoch": 0.9077893642018078, "grad_norm": 4.03125, "learning_rate": 6.1647386538124e-06, "loss": 0.93550491, "memory(GiB)": 728.98, "step": 35785, "train_speed(iter/s)": 0.156926 }, { "acc": 0.77152352, "epoch": 0.9079162035708453, "grad_norm": 3.53125, "learning_rate": 6.163718825708817e-06, "loss": 0.83011246, "memory(GiB)": 728.98, "step": 35790, "train_speed(iter/s)": 0.156906 }, { "acc": 0.78200769, "epoch": 0.9080430429398829, "grad_norm": 3.640625, "learning_rate": 6.162698946416952e-06, "loss": 0.86757154, "memory(GiB)": 728.98, "step": 35795, "train_speed(iter/s)": 0.156882 }, { "acc": 0.75797925, "epoch": 0.9081698823089205, "grad_norm": 3.078125, "learning_rate": 6.161679015981663e-06, "loss": 0.92711134, "memory(GiB)": 728.98, "step": 35800, "train_speed(iter/s)": 0.156859 }, { "acc": 0.77037878, "epoch": 0.908296721677958, "grad_norm": 3.828125, "learning_rate": 6.1606590344478166e-06, "loss": 0.91743879, "memory(GiB)": 728.98, "step": 35805, "train_speed(iter/s)": 0.156833 }, { "acc": 0.76588416, "epoch": 0.9084235610469956, "grad_norm": 3.609375, "learning_rate": 6.159639001860277e-06, "loss": 0.89570732, "memory(GiB)": 728.98, "step": 35810, "train_speed(iter/s)": 0.15681 }, { "acc": 0.76058307, "epoch": 0.9085504004160331, "grad_norm": 2.96875, "learning_rate": 6.1586189182639135e-06, "loss": 0.93287287, "memory(GiB)": 728.98, "step": 35815, "train_speed(iter/s)": 0.156783 }, { "acc": 0.76185799, "epoch": 0.9086772397850706, "grad_norm": 3.921875, "learning_rate": 6.157598783703595e-06, "loss": 0.92211037, "memory(GiB)": 728.98, "step": 35820, "train_speed(iter/s)": 0.156758 }, { "acc": 0.77411847, "epoch": 0.9088040791541082, "grad_norm": 3.28125, "learning_rate": 6.1565785982241945e-06, "loss": 0.89478798, "memory(GiB)": 728.98, "step": 35825, "train_speed(iter/s)": 0.156736 }, { "acc": 0.76966915, "epoch": 0.9089309185231458, "grad_norm": 3.953125, "learning_rate": 6.155558361870586e-06, "loss": 0.89155226, "memory(GiB)": 728.98, "step": 35830, "train_speed(iter/s)": 0.156712 }, { "acc": 0.79089894, "epoch": 0.9090577578921833, "grad_norm": 3.796875, "learning_rate": 6.154538074687646e-06, "loss": 0.81567593, "memory(GiB)": 728.98, "step": 35835, "train_speed(iter/s)": 0.156689 }, { "acc": 0.7670959, "epoch": 0.9091845972612209, "grad_norm": 3.078125, "learning_rate": 6.153517736720257e-06, "loss": 0.90590277, "memory(GiB)": 728.98, "step": 35840, "train_speed(iter/s)": 0.156663 }, { "acc": 0.75238328, "epoch": 0.9093114366302585, "grad_norm": 3.671875, "learning_rate": 6.152497348013297e-06, "loss": 0.96438513, "memory(GiB)": 728.98, "step": 35845, "train_speed(iter/s)": 0.156639 }, { "acc": 0.78006544, "epoch": 0.909438275999296, "grad_norm": 4.0625, "learning_rate": 6.15147690861165e-06, "loss": 0.84587831, "memory(GiB)": 728.98, "step": 35850, "train_speed(iter/s)": 0.156609 }, { "acc": 0.75974941, "epoch": 0.9095651153683336, "grad_norm": 4.78125, "learning_rate": 6.150456418560204e-06, "loss": 0.96898909, "memory(GiB)": 728.98, "step": 35855, "train_speed(iter/s)": 0.156586 }, { "acc": 0.75360966, "epoch": 0.9096919547373712, "grad_norm": 4.28125, "learning_rate": 6.149435877903845e-06, "loss": 0.90501509, "memory(GiB)": 728.98, "step": 35860, "train_speed(iter/s)": 0.156568 }, { "acc": 0.76075053, "epoch": 0.9098187941064088, "grad_norm": 3.265625, "learning_rate": 6.148415286687462e-06, "loss": 0.96572113, "memory(GiB)": 728.98, "step": 35865, "train_speed(iter/s)": 0.156546 }, { "acc": 0.77067084, "epoch": 0.9099456334754463, "grad_norm": 3.921875, "learning_rate": 6.147394644955953e-06, "loss": 0.87163448, "memory(GiB)": 728.98, "step": 35870, "train_speed(iter/s)": 0.156526 }, { "acc": 0.77341042, "epoch": 0.9100724728444838, "grad_norm": 3.625, "learning_rate": 6.146373952754209e-06, "loss": 0.93486929, "memory(GiB)": 728.98, "step": 35875, "train_speed(iter/s)": 0.156505 }, { "acc": 0.77647176, "epoch": 0.9101993122135213, "grad_norm": 3.859375, "learning_rate": 6.145353210127125e-06, "loss": 0.8457263, "memory(GiB)": 728.98, "step": 35880, "train_speed(iter/s)": 0.156477 }, { "acc": 0.76612706, "epoch": 0.9103261515825589, "grad_norm": 3.546875, "learning_rate": 6.144332417119605e-06, "loss": 0.85743427, "memory(GiB)": 728.98, "step": 35885, "train_speed(iter/s)": 0.156455 }, { "acc": 0.7714951, "epoch": 0.9104529909515965, "grad_norm": 3.96875, "learning_rate": 6.143311573776546e-06, "loss": 0.86693687, "memory(GiB)": 728.98, "step": 35890, "train_speed(iter/s)": 0.156432 }, { "acc": 0.76405702, "epoch": 0.910579830320634, "grad_norm": 3.671875, "learning_rate": 6.142290680142855e-06, "loss": 0.88479576, "memory(GiB)": 728.98, "step": 35895, "train_speed(iter/s)": 0.156406 }, { "acc": 0.76303902, "epoch": 0.9107066696896716, "grad_norm": 4.5625, "learning_rate": 6.141269736263437e-06, "loss": 0.88408127, "memory(GiB)": 728.98, "step": 35900, "train_speed(iter/s)": 0.156387 }, { "acc": 0.76067381, "epoch": 0.9108335090587092, "grad_norm": 3.40625, "learning_rate": 6.140248742183198e-06, "loss": 0.96275415, "memory(GiB)": 728.98, "step": 35905, "train_speed(iter/s)": 0.15636 }, { "acc": 0.77236047, "epoch": 0.9109603484277468, "grad_norm": 2.9375, "learning_rate": 6.139227697947051e-06, "loss": 0.87903929, "memory(GiB)": 728.98, "step": 35910, "train_speed(iter/s)": 0.15634 }, { "acc": 0.757757, "epoch": 0.9110871877967843, "grad_norm": 3.703125, "learning_rate": 6.138206603599906e-06, "loss": 0.90150461, "memory(GiB)": 728.98, "step": 35915, "train_speed(iter/s)": 0.156316 }, { "acc": 0.77415986, "epoch": 0.9112140271658219, "grad_norm": 3.875, "learning_rate": 6.137185459186681e-06, "loss": 0.84583578, "memory(GiB)": 728.98, "step": 35920, "train_speed(iter/s)": 0.156293 }, { "acc": 0.76680751, "epoch": 0.9113408665348595, "grad_norm": 3.5, "learning_rate": 6.136164264752289e-06, "loss": 0.86170368, "memory(GiB)": 742.41, "step": 35925, "train_speed(iter/s)": 0.156265 }, { "acc": 0.7581491, "epoch": 0.911467705903897, "grad_norm": 3.578125, "learning_rate": 6.135143020341653e-06, "loss": 0.90717211, "memory(GiB)": 742.41, "step": 35930, "train_speed(iter/s)": 0.156238 }, { "acc": 0.7556931, "epoch": 0.9115945452729345, "grad_norm": 3.3125, "learning_rate": 6.134121725999691e-06, "loss": 0.93519316, "memory(GiB)": 742.41, "step": 35935, "train_speed(iter/s)": 0.156212 }, { "acc": 0.76296377, "epoch": 0.911721384641972, "grad_norm": 3.34375, "learning_rate": 6.133100381771329e-06, "loss": 0.91547785, "memory(GiB)": 742.41, "step": 35940, "train_speed(iter/s)": 0.156187 }, { "acc": 0.78693514, "epoch": 0.9118482240110096, "grad_norm": 3.828125, "learning_rate": 6.132078987701491e-06, "loss": 0.8387969, "memory(GiB)": 742.41, "step": 35945, "train_speed(iter/s)": 0.156164 }, { "acc": 0.77315812, "epoch": 0.9119750633800472, "grad_norm": 4.875, "learning_rate": 6.131057543835105e-06, "loss": 0.86933556, "memory(GiB)": 742.41, "step": 35950, "train_speed(iter/s)": 0.156145 }, { "acc": 0.76397195, "epoch": 0.9121019027490848, "grad_norm": 3.515625, "learning_rate": 6.130036050217102e-06, "loss": 0.9803937, "memory(GiB)": 742.41, "step": 35955, "train_speed(iter/s)": 0.156119 }, { "acc": 0.76344829, "epoch": 0.9122287421181223, "grad_norm": 3.859375, "learning_rate": 6.129014506892412e-06, "loss": 0.94466314, "memory(GiB)": 742.41, "step": 35960, "train_speed(iter/s)": 0.156097 }, { "acc": 0.76406827, "epoch": 0.9123555814871599, "grad_norm": 3.703125, "learning_rate": 6.127992913905973e-06, "loss": 0.93377609, "memory(GiB)": 742.41, "step": 35965, "train_speed(iter/s)": 0.156076 }, { "acc": 0.77616482, "epoch": 0.9124824208561975, "grad_norm": 3.328125, "learning_rate": 6.126971271302718e-06, "loss": 0.85519962, "memory(GiB)": 742.41, "step": 35970, "train_speed(iter/s)": 0.156048 }, { "acc": 0.75434289, "epoch": 0.912609260225235, "grad_norm": 3.75, "learning_rate": 6.12594957912759e-06, "loss": 0.96570187, "memory(GiB)": 742.41, "step": 35975, "train_speed(iter/s)": 0.156029 }, { "acc": 0.74252553, "epoch": 0.9127360995942726, "grad_norm": 3.984375, "learning_rate": 6.124927837425525e-06, "loss": 1.01131287, "memory(GiB)": 742.41, "step": 35980, "train_speed(iter/s)": 0.156002 }, { "acc": 0.77186027, "epoch": 0.9128629389633102, "grad_norm": 3.296875, "learning_rate": 6.1239060462414705e-06, "loss": 0.89069471, "memory(GiB)": 742.41, "step": 35985, "train_speed(iter/s)": 0.155976 }, { "acc": 0.77025757, "epoch": 0.9129897783323477, "grad_norm": 3.1875, "learning_rate": 6.122884205620368e-06, "loss": 0.87595587, "memory(GiB)": 742.41, "step": 35990, "train_speed(iter/s)": 0.155944 }, { "acc": 0.77725835, "epoch": 0.9131166177013852, "grad_norm": 3.203125, "learning_rate": 6.12186231560717e-06, "loss": 0.83396902, "memory(GiB)": 742.41, "step": 35995, "train_speed(iter/s)": 0.155923 }, { "acc": 0.75903473, "epoch": 0.9132434570704228, "grad_norm": 3.703125, "learning_rate": 6.120840376246822e-06, "loss": 0.87943459, "memory(GiB)": 742.41, "step": 36000, "train_speed(iter/s)": 0.155899 }, { "epoch": 0.9132434570704228, "eval_acc": 0.7571581295220701, "eval_loss": 0.859788715839386, "eval_runtime": 1148.818, "eval_samples_per_second": 5.545, "eval_steps_per_second": 5.545, "step": 36000 }, { "acc": 0.76852541, "epoch": 0.9133702964394603, "grad_norm": 3.5, "learning_rate": 6.119818387584275e-06, "loss": 0.8842247, "memory(GiB)": 742.41, "step": 36005, "train_speed(iter/s)": 0.154622 }, { "acc": 0.78044505, "epoch": 0.9134971358084979, "grad_norm": 3.4375, "learning_rate": 6.118796349664487e-06, "loss": 0.87272892, "memory(GiB)": 742.41, "step": 36010, "train_speed(iter/s)": 0.154601 }, { "acc": 0.76029429, "epoch": 0.9136239751775355, "grad_norm": 3.046875, "learning_rate": 6.117774262532412e-06, "loss": 0.91246405, "memory(GiB)": 742.41, "step": 36015, "train_speed(iter/s)": 0.154576 }, { "acc": 0.77479548, "epoch": 0.913750814546573, "grad_norm": 3.84375, "learning_rate": 6.116752126233008e-06, "loss": 0.86102867, "memory(GiB)": 742.41, "step": 36020, "train_speed(iter/s)": 0.15455 }, { "acc": 0.76829047, "epoch": 0.9138776539156106, "grad_norm": 3.671875, "learning_rate": 6.115729940811237e-06, "loss": 0.85413351, "memory(GiB)": 742.41, "step": 36025, "train_speed(iter/s)": 0.15453 }, { "acc": 0.76172552, "epoch": 0.9140044932846482, "grad_norm": 3.9375, "learning_rate": 6.11470770631206e-06, "loss": 0.93110628, "memory(GiB)": 742.41, "step": 36030, "train_speed(iter/s)": 0.154507 }, { "acc": 0.78104277, "epoch": 0.9141313326536857, "grad_norm": 3.40625, "learning_rate": 6.113685422780445e-06, "loss": 0.89168682, "memory(GiB)": 742.41, "step": 36035, "train_speed(iter/s)": 0.154487 }, { "acc": 0.76644855, "epoch": 0.9142581720227233, "grad_norm": 3.265625, "learning_rate": 6.112663090261355e-06, "loss": 0.8786643, "memory(GiB)": 742.41, "step": 36040, "train_speed(iter/s)": 0.154465 }, { "acc": 0.76674843, "epoch": 0.9143850113917609, "grad_norm": 3.71875, "learning_rate": 6.111640708799761e-06, "loss": 0.9159832, "memory(GiB)": 742.41, "step": 36045, "train_speed(iter/s)": 0.154444 }, { "acc": 0.76912317, "epoch": 0.9145118507607984, "grad_norm": 3.21875, "learning_rate": 6.110618278440633e-06, "loss": 0.88148365, "memory(GiB)": 742.41, "step": 36050, "train_speed(iter/s)": 0.15442 }, { "acc": 0.76191187, "epoch": 0.9146386901298359, "grad_norm": 4.03125, "learning_rate": 6.109595799228946e-06, "loss": 0.91981936, "memory(GiB)": 742.41, "step": 36055, "train_speed(iter/s)": 0.154393 }, { "acc": 0.77026281, "epoch": 0.9147655294988735, "grad_norm": 9.8125, "learning_rate": 6.108573271209675e-06, "loss": 0.89235058, "memory(GiB)": 742.41, "step": 36060, "train_speed(iter/s)": 0.154369 }, { "acc": 0.77674751, "epoch": 0.914892368867911, "grad_norm": 3.109375, "learning_rate": 6.107550694427798e-06, "loss": 0.89411306, "memory(GiB)": 742.41, "step": 36065, "train_speed(iter/s)": 0.154343 }, { "acc": 0.76112976, "epoch": 0.9150192082369486, "grad_norm": 3.015625, "learning_rate": 6.106528068928297e-06, "loss": 0.91943769, "memory(GiB)": 742.41, "step": 36070, "train_speed(iter/s)": 0.154311 }, { "acc": 0.76319942, "epoch": 0.9151460476059862, "grad_norm": 3.453125, "learning_rate": 6.105505394756149e-06, "loss": 0.92262897, "memory(GiB)": 742.41, "step": 36075, "train_speed(iter/s)": 0.154286 }, { "acc": 0.75720515, "epoch": 0.9152728869750237, "grad_norm": 3.0, "learning_rate": 6.1044826719563435e-06, "loss": 0.93024731, "memory(GiB)": 742.41, "step": 36080, "train_speed(iter/s)": 0.154258 }, { "acc": 0.76871834, "epoch": 0.9153997263440613, "grad_norm": 4.0, "learning_rate": 6.103459900573861e-06, "loss": 0.9100421, "memory(GiB)": 742.41, "step": 36085, "train_speed(iter/s)": 0.154237 }, { "acc": 0.76523418, "epoch": 0.9155265657130989, "grad_norm": 3.375, "learning_rate": 6.102437080653697e-06, "loss": 0.89241018, "memory(GiB)": 742.41, "step": 36090, "train_speed(iter/s)": 0.154215 }, { "acc": 0.7535706, "epoch": 0.9156534050821364, "grad_norm": 3.390625, "learning_rate": 6.101414212240836e-06, "loss": 0.89846172, "memory(GiB)": 742.41, "step": 36095, "train_speed(iter/s)": 0.154195 }, { "acc": 0.7703681, "epoch": 0.915780244451174, "grad_norm": 3.515625, "learning_rate": 6.100391295380275e-06, "loss": 0.85701246, "memory(GiB)": 742.41, "step": 36100, "train_speed(iter/s)": 0.154172 }, { "acc": 0.76470542, "epoch": 0.9159070838202116, "grad_norm": 3.140625, "learning_rate": 6.0993683301170046e-06, "loss": 0.89224272, "memory(GiB)": 742.41, "step": 36105, "train_speed(iter/s)": 0.154145 }, { "acc": 0.79428077, "epoch": 0.9160339231892491, "grad_norm": 4.5625, "learning_rate": 6.098345316496026e-06, "loss": 0.75114183, "memory(GiB)": 742.41, "step": 36110, "train_speed(iter/s)": 0.154119 }, { "acc": 0.76612287, "epoch": 0.9161607625582866, "grad_norm": 4.3125, "learning_rate": 6.097322254562336e-06, "loss": 0.96084127, "memory(GiB)": 742.41, "step": 36115, "train_speed(iter/s)": 0.154095 }, { "acc": 0.77789402, "epoch": 0.9162876019273242, "grad_norm": 4.25, "learning_rate": 6.096299144360937e-06, "loss": 0.88754578, "memory(GiB)": 742.41, "step": 36120, "train_speed(iter/s)": 0.15407 }, { "acc": 0.76594863, "epoch": 0.9164144412963617, "grad_norm": 3.078125, "learning_rate": 6.095275985936831e-06, "loss": 0.911656, "memory(GiB)": 742.41, "step": 36125, "train_speed(iter/s)": 0.154047 }, { "acc": 0.7578042, "epoch": 0.9165412806653993, "grad_norm": 3.875, "learning_rate": 6.094252779335024e-06, "loss": 0.91955481, "memory(GiB)": 742.41, "step": 36130, "train_speed(iter/s)": 0.154023 }, { "acc": 0.7717495, "epoch": 0.9166681200344369, "grad_norm": 3.921875, "learning_rate": 6.093229524600524e-06, "loss": 0.91932564, "memory(GiB)": 742.41, "step": 36135, "train_speed(iter/s)": 0.154001 }, { "acc": 0.77748537, "epoch": 0.9167949594034744, "grad_norm": 3.34375, "learning_rate": 6.09220622177834e-06, "loss": 0.8876914, "memory(GiB)": 742.41, "step": 36140, "train_speed(iter/s)": 0.15398 }, { "acc": 0.77674623, "epoch": 0.916921798772512, "grad_norm": 3.296875, "learning_rate": 6.0911828709134845e-06, "loss": 0.88554163, "memory(GiB)": 742.41, "step": 36145, "train_speed(iter/s)": 0.15396 }, { "acc": 0.77002807, "epoch": 0.9170486381415496, "grad_norm": 4.5625, "learning_rate": 6.09015947205097e-06, "loss": 0.8760663, "memory(GiB)": 742.41, "step": 36150, "train_speed(iter/s)": 0.153938 }, { "acc": 0.76365509, "epoch": 0.9171754775105871, "grad_norm": 3.296875, "learning_rate": 6.089136025235815e-06, "loss": 0.85494261, "memory(GiB)": 742.41, "step": 36155, "train_speed(iter/s)": 0.153914 }, { "acc": 0.77001657, "epoch": 0.9173023168796247, "grad_norm": 3.75, "learning_rate": 6.088112530513037e-06, "loss": 0.88683748, "memory(GiB)": 742.41, "step": 36160, "train_speed(iter/s)": 0.153893 }, { "acc": 0.77837377, "epoch": 0.9174291562486623, "grad_norm": 3.453125, "learning_rate": 6.087088987927655e-06, "loss": 0.86204348, "memory(GiB)": 742.41, "step": 36165, "train_speed(iter/s)": 0.15387 }, { "acc": 0.76722546, "epoch": 0.9175559956176998, "grad_norm": 3.359375, "learning_rate": 6.086065397524693e-06, "loss": 0.88460331, "memory(GiB)": 742.41, "step": 36170, "train_speed(iter/s)": 0.153845 }, { "acc": 0.75943618, "epoch": 0.9176828349867373, "grad_norm": 3.671875, "learning_rate": 6.085041759349174e-06, "loss": 0.9475709, "memory(GiB)": 742.41, "step": 36175, "train_speed(iter/s)": 0.153824 }, { "acc": 0.75303259, "epoch": 0.9178096743557749, "grad_norm": 3.140625, "learning_rate": 6.084018073446125e-06, "loss": 0.91510248, "memory(GiB)": 742.41, "step": 36180, "train_speed(iter/s)": 0.153794 }, { "acc": 0.77348881, "epoch": 0.9179365137248124, "grad_norm": 3.671875, "learning_rate": 6.082994339860575e-06, "loss": 0.89328718, "memory(GiB)": 742.41, "step": 36185, "train_speed(iter/s)": 0.153769 }, { "acc": 0.77615199, "epoch": 0.91806335309385, "grad_norm": 3.03125, "learning_rate": 6.081970558637555e-06, "loss": 0.87439709, "memory(GiB)": 742.41, "step": 36190, "train_speed(iter/s)": 0.153746 }, { "acc": 0.76752672, "epoch": 0.9181901924628876, "grad_norm": 3.6875, "learning_rate": 6.0809467298220955e-06, "loss": 0.88009567, "memory(GiB)": 742.41, "step": 36195, "train_speed(iter/s)": 0.153724 }, { "acc": 0.75386744, "epoch": 0.9183170318319251, "grad_norm": 3.796875, "learning_rate": 6.079922853459235e-06, "loss": 0.93647919, "memory(GiB)": 742.41, "step": 36200, "train_speed(iter/s)": 0.153704 }, { "acc": 0.76955295, "epoch": 0.9184438712009627, "grad_norm": 3.25, "learning_rate": 6.078898929594009e-06, "loss": 0.8550602, "memory(GiB)": 742.41, "step": 36205, "train_speed(iter/s)": 0.153682 }, { "acc": 0.75834446, "epoch": 0.9185707105700003, "grad_norm": 3.46875, "learning_rate": 6.077874958271457e-06, "loss": 0.90889072, "memory(GiB)": 742.41, "step": 36210, "train_speed(iter/s)": 0.153658 }, { "acc": 0.76425219, "epoch": 0.9186975499390378, "grad_norm": 3.625, "learning_rate": 6.076850939536619e-06, "loss": 0.93254118, "memory(GiB)": 742.41, "step": 36215, "train_speed(iter/s)": 0.153631 }, { "acc": 0.76230731, "epoch": 0.9188243893080754, "grad_norm": 6.90625, "learning_rate": 6.07582687343454e-06, "loss": 0.92717953, "memory(GiB)": 742.41, "step": 36220, "train_speed(iter/s)": 0.153607 }, { "acc": 0.77168369, "epoch": 0.918951228677113, "grad_norm": 3.71875, "learning_rate": 6.074802760010264e-06, "loss": 0.90414248, "memory(GiB)": 742.41, "step": 36225, "train_speed(iter/s)": 0.153587 }, { "acc": 0.76330199, "epoch": 0.9190780680461506, "grad_norm": 3.078125, "learning_rate": 6.073778599308839e-06, "loss": 0.90603647, "memory(GiB)": 742.41, "step": 36230, "train_speed(iter/s)": 0.153566 }, { "acc": 0.77658195, "epoch": 0.919204907415188, "grad_norm": 3.71875, "learning_rate": 6.072754391375316e-06, "loss": 0.84811621, "memory(GiB)": 742.41, "step": 36235, "train_speed(iter/s)": 0.153547 }, { "acc": 0.77203422, "epoch": 0.9193317467842256, "grad_norm": 3.078125, "learning_rate": 6.071730136254745e-06, "loss": 0.84977083, "memory(GiB)": 742.41, "step": 36240, "train_speed(iter/s)": 0.153524 }, { "acc": 0.76936264, "epoch": 0.9194585861532631, "grad_norm": 4.15625, "learning_rate": 6.0707058339921784e-06, "loss": 0.90419769, "memory(GiB)": 742.41, "step": 36245, "train_speed(iter/s)": 0.153498 }, { "acc": 0.77888536, "epoch": 0.9195854255223007, "grad_norm": 3.65625, "learning_rate": 6.069681484632677e-06, "loss": 0.85051966, "memory(GiB)": 742.41, "step": 36250, "train_speed(iter/s)": 0.15347 }, { "acc": 0.76823101, "epoch": 0.9197122648913383, "grad_norm": 3.453125, "learning_rate": 6.068657088221292e-06, "loss": 0.87140198, "memory(GiB)": 742.41, "step": 36255, "train_speed(iter/s)": 0.153449 }, { "acc": 0.75726242, "epoch": 0.9198391042603759, "grad_norm": 3.859375, "learning_rate": 6.067632644803088e-06, "loss": 0.94087067, "memory(GiB)": 742.41, "step": 36260, "train_speed(iter/s)": 0.153429 }, { "acc": 0.7725275, "epoch": 0.9199659436294134, "grad_norm": 3.78125, "learning_rate": 6.0666081544231245e-06, "loss": 0.88365765, "memory(GiB)": 742.41, "step": 36265, "train_speed(iter/s)": 0.153404 }, { "acc": 0.77131734, "epoch": 0.920092782998451, "grad_norm": 3.734375, "learning_rate": 6.065583617126468e-06, "loss": 0.87379742, "memory(GiB)": 742.41, "step": 36270, "train_speed(iter/s)": 0.15338 }, { "acc": 0.76547956, "epoch": 0.9202196223674886, "grad_norm": 4.15625, "learning_rate": 6.064559032958183e-06, "loss": 0.89501419, "memory(GiB)": 742.41, "step": 36275, "train_speed(iter/s)": 0.153356 }, { "acc": 0.78031631, "epoch": 0.9203464617365261, "grad_norm": 4.125, "learning_rate": 6.0635344019633376e-06, "loss": 0.85943613, "memory(GiB)": 742.41, "step": 36280, "train_speed(iter/s)": 0.153332 }, { "acc": 0.74795551, "epoch": 0.9204733011055637, "grad_norm": 3.1875, "learning_rate": 6.062509724187005e-06, "loss": 0.9871932, "memory(GiB)": 742.41, "step": 36285, "train_speed(iter/s)": 0.153308 }, { "acc": 0.77602954, "epoch": 0.9206001404746013, "grad_norm": 3.109375, "learning_rate": 6.061484999674252e-06, "loss": 0.87856865, "memory(GiB)": 742.41, "step": 36290, "train_speed(iter/s)": 0.153288 }, { "acc": 0.76819425, "epoch": 0.9207269798436387, "grad_norm": 3.1875, "learning_rate": 6.060460228470157e-06, "loss": 0.87837534, "memory(GiB)": 742.41, "step": 36295, "train_speed(iter/s)": 0.15327 }, { "acc": 0.75494943, "epoch": 0.9208538192126763, "grad_norm": 3.28125, "learning_rate": 6.059435410619794e-06, "loss": 0.92050467, "memory(GiB)": 742.41, "step": 36300, "train_speed(iter/s)": 0.153245 }, { "acc": 0.76468401, "epoch": 0.9209806585817139, "grad_norm": 3.5625, "learning_rate": 6.0584105461682455e-06, "loss": 0.92174282, "memory(GiB)": 742.41, "step": 36305, "train_speed(iter/s)": 0.153226 }, { "acc": 0.76694746, "epoch": 0.9211074979507514, "grad_norm": 3.84375, "learning_rate": 6.057385635160587e-06, "loss": 0.85829849, "memory(GiB)": 742.41, "step": 36310, "train_speed(iter/s)": 0.153202 }, { "acc": 0.76404948, "epoch": 0.921234337319789, "grad_norm": 3.28125, "learning_rate": 6.056360677641905e-06, "loss": 0.89560385, "memory(GiB)": 742.41, "step": 36315, "train_speed(iter/s)": 0.153178 }, { "acc": 0.76836753, "epoch": 0.9213611766888266, "grad_norm": 3.25, "learning_rate": 6.055335673657281e-06, "loss": 0.92046089, "memory(GiB)": 742.41, "step": 36320, "train_speed(iter/s)": 0.153156 }, { "acc": 0.75685863, "epoch": 0.9214880160578641, "grad_norm": 3.25, "learning_rate": 6.054310623251804e-06, "loss": 0.87663679, "memory(GiB)": 742.41, "step": 36325, "train_speed(iter/s)": 0.153135 }, { "acc": 0.76809564, "epoch": 0.9216148554269017, "grad_norm": 3.71875, "learning_rate": 6.053285526470563e-06, "loss": 0.87611694, "memory(GiB)": 742.41, "step": 36330, "train_speed(iter/s)": 0.153114 }, { "acc": 0.76406975, "epoch": 0.9217416947959393, "grad_norm": 3.59375, "learning_rate": 6.052260383358647e-06, "loss": 0.94878445, "memory(GiB)": 742.41, "step": 36335, "train_speed(iter/s)": 0.153092 }, { "acc": 0.76232119, "epoch": 0.9218685341649768, "grad_norm": 3.859375, "learning_rate": 6.051235193961149e-06, "loss": 0.91835022, "memory(GiB)": 742.41, "step": 36340, "train_speed(iter/s)": 0.153071 }, { "acc": 0.7653564, "epoch": 0.9219953735340144, "grad_norm": 3.015625, "learning_rate": 6.050209958323163e-06, "loss": 0.90221081, "memory(GiB)": 742.41, "step": 36345, "train_speed(iter/s)": 0.153047 }, { "acc": 0.758007, "epoch": 0.922122212903052, "grad_norm": 3.109375, "learning_rate": 6.049184676489789e-06, "loss": 0.9024621, "memory(GiB)": 742.41, "step": 36350, "train_speed(iter/s)": 0.153023 }, { "acc": 0.78484697, "epoch": 0.9222490522720894, "grad_norm": 3.75, "learning_rate": 6.048159348506123e-06, "loss": 0.85891533, "memory(GiB)": 742.41, "step": 36355, "train_speed(iter/s)": 0.153002 }, { "acc": 0.77113929, "epoch": 0.922375891641127, "grad_norm": 3.3125, "learning_rate": 6.047133974417269e-06, "loss": 0.90440178, "memory(GiB)": 742.41, "step": 36360, "train_speed(iter/s)": 0.15298 }, { "acc": 0.76271968, "epoch": 0.9225027310101646, "grad_norm": 10.3125, "learning_rate": 6.046108554268326e-06, "loss": 0.91551332, "memory(GiB)": 742.41, "step": 36365, "train_speed(iter/s)": 0.152953 }, { "acc": 0.76372299, "epoch": 0.9226295703792021, "grad_norm": 2.75, "learning_rate": 6.0450830881044e-06, "loss": 0.93256607, "memory(GiB)": 742.41, "step": 36370, "train_speed(iter/s)": 0.152929 }, { "acc": 0.77176037, "epoch": 0.9227564097482397, "grad_norm": 3.515625, "learning_rate": 6.044057575970601e-06, "loss": 0.85436945, "memory(GiB)": 742.41, "step": 36375, "train_speed(iter/s)": 0.152909 }, { "acc": 0.7534255, "epoch": 0.9228832491172773, "grad_norm": 3.59375, "learning_rate": 6.043032017912034e-06, "loss": 0.94960413, "memory(GiB)": 742.41, "step": 36380, "train_speed(iter/s)": 0.152886 }, { "acc": 0.75469618, "epoch": 0.9230100884863148, "grad_norm": 5.21875, "learning_rate": 6.042006413973814e-06, "loss": 0.93453808, "memory(GiB)": 742.41, "step": 36385, "train_speed(iter/s)": 0.152865 }, { "acc": 0.76907425, "epoch": 0.9231369278553524, "grad_norm": 4.59375, "learning_rate": 6.040980764201049e-06, "loss": 0.87891855, "memory(GiB)": 742.41, "step": 36390, "train_speed(iter/s)": 0.152845 }, { "acc": 0.77029796, "epoch": 0.92326376722439, "grad_norm": 3.34375, "learning_rate": 6.039955068638858e-06, "loss": 0.88606796, "memory(GiB)": 742.41, "step": 36395, "train_speed(iter/s)": 0.152824 }, { "acc": 0.78860765, "epoch": 0.9233906065934275, "grad_norm": 3.625, "learning_rate": 6.038929327332357e-06, "loss": 0.81239595, "memory(GiB)": 742.41, "step": 36400, "train_speed(iter/s)": 0.1528 }, { "acc": 0.75962057, "epoch": 0.9235174459624651, "grad_norm": 3.8125, "learning_rate": 6.037903540326666e-06, "loss": 0.90412149, "memory(GiB)": 742.41, "step": 36405, "train_speed(iter/s)": 0.152778 }, { "acc": 0.7774888, "epoch": 0.9236442853315027, "grad_norm": 3.3125, "learning_rate": 6.036877707666906e-06, "loss": 0.8255578, "memory(GiB)": 742.41, "step": 36410, "train_speed(iter/s)": 0.152758 }, { "acc": 0.77060962, "epoch": 0.9237711247005401, "grad_norm": 3.03125, "learning_rate": 6.0358518293981976e-06, "loss": 0.85566206, "memory(GiB)": 742.41, "step": 36415, "train_speed(iter/s)": 0.152735 }, { "acc": 0.77291985, "epoch": 0.9238979640695777, "grad_norm": 3.21875, "learning_rate": 6.034825905565668e-06, "loss": 0.92858496, "memory(GiB)": 742.41, "step": 36420, "train_speed(iter/s)": 0.152711 }, { "acc": 0.7602232, "epoch": 0.9240248034386153, "grad_norm": 3.40625, "learning_rate": 6.033799936214444e-06, "loss": 0.9083374, "memory(GiB)": 742.41, "step": 36425, "train_speed(iter/s)": 0.152689 }, { "acc": 0.77786436, "epoch": 0.9241516428076528, "grad_norm": 3.96875, "learning_rate": 6.032773921389655e-06, "loss": 0.84723091, "memory(GiB)": 742.41, "step": 36430, "train_speed(iter/s)": 0.152664 }, { "acc": 0.76706228, "epoch": 0.9242784821766904, "grad_norm": 3.25, "learning_rate": 6.031747861136431e-06, "loss": 0.89093723, "memory(GiB)": 742.41, "step": 36435, "train_speed(iter/s)": 0.152642 }, { "acc": 0.76423168, "epoch": 0.924405321545728, "grad_norm": 4.0625, "learning_rate": 6.030721755499906e-06, "loss": 0.95394974, "memory(GiB)": 742.41, "step": 36440, "train_speed(iter/s)": 0.152614 }, { "acc": 0.75990324, "epoch": 0.9245321609147655, "grad_norm": 3.515625, "learning_rate": 6.029695604525216e-06, "loss": 0.91347532, "memory(GiB)": 742.41, "step": 36445, "train_speed(iter/s)": 0.152592 }, { "acc": 0.75973601, "epoch": 0.9246590002838031, "grad_norm": 3.9375, "learning_rate": 6.028669408257497e-06, "loss": 0.93133659, "memory(GiB)": 742.41, "step": 36450, "train_speed(iter/s)": 0.15257 }, { "acc": 0.75222979, "epoch": 0.9247858396528407, "grad_norm": 3.859375, "learning_rate": 6.027643166741889e-06, "loss": 0.90563574, "memory(GiB)": 742.41, "step": 36455, "train_speed(iter/s)": 0.152546 }, { "acc": 0.76536598, "epoch": 0.9249126790218782, "grad_norm": 3.328125, "learning_rate": 6.02661688002353e-06, "loss": 0.90440006, "memory(GiB)": 742.41, "step": 36460, "train_speed(iter/s)": 0.152522 }, { "acc": 0.76743398, "epoch": 0.9250395183909158, "grad_norm": 3.125, "learning_rate": 6.025590548147569e-06, "loss": 0.91566725, "memory(GiB)": 742.41, "step": 36465, "train_speed(iter/s)": 0.152503 }, { "acc": 0.75086493, "epoch": 0.9251663577599534, "grad_norm": 3.4375, "learning_rate": 6.024564171159144e-06, "loss": 0.93452492, "memory(GiB)": 742.41, "step": 36470, "train_speed(iter/s)": 0.152485 }, { "acc": 0.76425095, "epoch": 0.9252931971289908, "grad_norm": 3.609375, "learning_rate": 6.023537749103407e-06, "loss": 0.91460381, "memory(GiB)": 742.41, "step": 36475, "train_speed(iter/s)": 0.152461 }, { "acc": 0.76621318, "epoch": 0.9254200364980284, "grad_norm": 3.359375, "learning_rate": 6.022511282025505e-06, "loss": 0.83599911, "memory(GiB)": 742.41, "step": 36480, "train_speed(iter/s)": 0.152441 }, { "acc": 0.76507807, "epoch": 0.925546875867066, "grad_norm": 3.921875, "learning_rate": 6.021484769970591e-06, "loss": 0.8834897, "memory(GiB)": 742.41, "step": 36485, "train_speed(iter/s)": 0.152422 }, { "acc": 0.76306539, "epoch": 0.9256737152361035, "grad_norm": 3.546875, "learning_rate": 6.020458212983815e-06, "loss": 0.88409863, "memory(GiB)": 742.41, "step": 36490, "train_speed(iter/s)": 0.1524 }, { "acc": 0.76606255, "epoch": 0.9258005546051411, "grad_norm": 3.515625, "learning_rate": 6.019431611110334e-06, "loss": 0.89406528, "memory(GiB)": 742.41, "step": 36495, "train_speed(iter/s)": 0.152379 }, { "acc": 0.76968718, "epoch": 0.9259273939741787, "grad_norm": 4.25, "learning_rate": 6.018404964395306e-06, "loss": 0.91707258, "memory(GiB)": 742.41, "step": 36500, "train_speed(iter/s)": 0.152355 }, { "epoch": 0.9259273939741787, "eval_acc": 0.7571618894278551, "eval_loss": 0.859330952167511, "eval_runtime": 1147.7575, "eval_samples_per_second": 5.55, "eval_steps_per_second": 5.55, "step": 36500 }, { "acc": 0.77761478, "epoch": 0.9260542333432162, "grad_norm": 3.421875, "learning_rate": 6.017378272883886e-06, "loss": 0.81947041, "memory(GiB)": 742.41, "step": 36505, "train_speed(iter/s)": 0.151154 }, { "acc": 0.7757946, "epoch": 0.9261810727122538, "grad_norm": 4.125, "learning_rate": 6.016351536621239e-06, "loss": 0.83236685, "memory(GiB)": 742.41, "step": 36510, "train_speed(iter/s)": 0.151135 }, { "acc": 0.77307553, "epoch": 0.9263079120812914, "grad_norm": 3.609375, "learning_rate": 6.015324755652525e-06, "loss": 0.91600771, "memory(GiB)": 742.41, "step": 36515, "train_speed(iter/s)": 0.151115 }, { "acc": 0.7732079, "epoch": 0.9264347514503289, "grad_norm": 3.1875, "learning_rate": 6.014297930022912e-06, "loss": 0.88755293, "memory(GiB)": 742.41, "step": 36520, "train_speed(iter/s)": 0.151094 }, { "acc": 0.74524431, "epoch": 0.9265615908193665, "grad_norm": 3.484375, "learning_rate": 6.013271059777563e-06, "loss": 0.95963573, "memory(GiB)": 742.41, "step": 36525, "train_speed(iter/s)": 0.151073 }, { "acc": 0.77661505, "epoch": 0.9266884301884041, "grad_norm": 3.921875, "learning_rate": 6.01224414496165e-06, "loss": 0.86017923, "memory(GiB)": 742.41, "step": 36530, "train_speed(iter/s)": 0.151051 }, { "acc": 0.75724611, "epoch": 0.9268152695574415, "grad_norm": 3.40625, "learning_rate": 6.011217185620342e-06, "loss": 0.93305645, "memory(GiB)": 742.41, "step": 36535, "train_speed(iter/s)": 0.151031 }, { "acc": 0.75652084, "epoch": 0.9269421089264791, "grad_norm": 23.75, "learning_rate": 6.010190181798811e-06, "loss": 0.91863823, "memory(GiB)": 742.41, "step": 36540, "train_speed(iter/s)": 0.151009 }, { "acc": 0.76506701, "epoch": 0.9270689482955167, "grad_norm": 3.40625, "learning_rate": 6.009163133542233e-06, "loss": 0.92791052, "memory(GiB)": 742.41, "step": 36545, "train_speed(iter/s)": 0.150989 }, { "acc": 0.75387564, "epoch": 0.9271957876645542, "grad_norm": 3.828125, "learning_rate": 6.008136040895784e-06, "loss": 0.98352518, "memory(GiB)": 742.41, "step": 36550, "train_speed(iter/s)": 0.15097 }, { "acc": 0.7485971, "epoch": 0.9273226270335918, "grad_norm": 3.515625, "learning_rate": 6.007108903904644e-06, "loss": 0.92155638, "memory(GiB)": 742.41, "step": 36555, "train_speed(iter/s)": 0.150948 }, { "acc": 0.7678968, "epoch": 0.9274494664026294, "grad_norm": 3.453125, "learning_rate": 6.00608172261399e-06, "loss": 0.91358023, "memory(GiB)": 742.41, "step": 36560, "train_speed(iter/s)": 0.150926 }, { "acc": 0.77470026, "epoch": 0.9275763057716669, "grad_norm": 3.359375, "learning_rate": 6.0050544970690085e-06, "loss": 0.91850147, "memory(GiB)": 742.41, "step": 36565, "train_speed(iter/s)": 0.150904 }, { "acc": 0.77545385, "epoch": 0.9277031451407045, "grad_norm": 3.953125, "learning_rate": 6.0040272273148805e-06, "loss": 0.84743357, "memory(GiB)": 742.41, "step": 36570, "train_speed(iter/s)": 0.150882 }, { "acc": 0.76607294, "epoch": 0.9278299845097421, "grad_norm": 3.375, "learning_rate": 6.002999913396794e-06, "loss": 0.88785381, "memory(GiB)": 742.41, "step": 36575, "train_speed(iter/s)": 0.150861 }, { "acc": 0.76368294, "epoch": 0.9279568238787796, "grad_norm": 3.171875, "learning_rate": 6.001972555359938e-06, "loss": 0.91317654, "memory(GiB)": 742.41, "step": 36580, "train_speed(iter/s)": 0.150841 }, { "acc": 0.76629138, "epoch": 0.9280836632478172, "grad_norm": 2.9375, "learning_rate": 6.000945153249501e-06, "loss": 0.89092073, "memory(GiB)": 742.41, "step": 36585, "train_speed(iter/s)": 0.150821 }, { "acc": 0.75505753, "epoch": 0.9282105026168548, "grad_norm": 3.21875, "learning_rate": 5.999917707110677e-06, "loss": 0.89973593, "memory(GiB)": 742.41, "step": 36590, "train_speed(iter/s)": 0.150798 }, { "acc": 0.77020555, "epoch": 0.9283373419858922, "grad_norm": 3.546875, "learning_rate": 5.998890216988657e-06, "loss": 0.92024708, "memory(GiB)": 742.41, "step": 36595, "train_speed(iter/s)": 0.150773 }, { "acc": 0.7687211, "epoch": 0.9284641813549298, "grad_norm": 3.578125, "learning_rate": 5.997862682928641e-06, "loss": 0.86061611, "memory(GiB)": 742.41, "step": 36600, "train_speed(iter/s)": 0.150752 }, { "acc": 0.74948368, "epoch": 0.9285910207239674, "grad_norm": 3.796875, "learning_rate": 5.996835104975824e-06, "loss": 0.90426607, "memory(GiB)": 742.41, "step": 36605, "train_speed(iter/s)": 0.150725 }, { "acc": 0.77376523, "epoch": 0.9287178600930049, "grad_norm": 3.296875, "learning_rate": 5.9958074831754065e-06, "loss": 0.86754503, "memory(GiB)": 742.41, "step": 36610, "train_speed(iter/s)": 0.150705 }, { "acc": 0.77448583, "epoch": 0.9288446994620425, "grad_norm": 3.140625, "learning_rate": 5.9947798175725914e-06, "loss": 0.87553577, "memory(GiB)": 742.41, "step": 36615, "train_speed(iter/s)": 0.150685 }, { "acc": 0.77243667, "epoch": 0.9289715388310801, "grad_norm": 2.828125, "learning_rate": 5.993752108212582e-06, "loss": 0.87093611, "memory(GiB)": 742.41, "step": 36620, "train_speed(iter/s)": 0.150664 }, { "acc": 0.76214609, "epoch": 0.9290983782001176, "grad_norm": 3.453125, "learning_rate": 5.9927243551405835e-06, "loss": 0.91491518, "memory(GiB)": 742.41, "step": 36625, "train_speed(iter/s)": 0.150641 }, { "acc": 0.76373258, "epoch": 0.9292252175691552, "grad_norm": 4.0, "learning_rate": 5.991696558401801e-06, "loss": 0.89949684, "memory(GiB)": 742.41, "step": 36630, "train_speed(iter/s)": 0.15062 }, { "acc": 0.77787862, "epoch": 0.9293520569381928, "grad_norm": 3.453125, "learning_rate": 5.99066871804145e-06, "loss": 0.86145201, "memory(GiB)": 742.41, "step": 36635, "train_speed(iter/s)": 0.150603 }, { "acc": 0.76070004, "epoch": 0.9294788963072304, "grad_norm": 4.03125, "learning_rate": 5.989640834104736e-06, "loss": 0.94893503, "memory(GiB)": 742.41, "step": 36640, "train_speed(iter/s)": 0.15058 }, { "acc": 0.77620349, "epoch": 0.9296057356762679, "grad_norm": 3.4375, "learning_rate": 5.988612906636877e-06, "loss": 0.80418148, "memory(GiB)": 742.41, "step": 36645, "train_speed(iter/s)": 0.150557 }, { "acc": 0.78568711, "epoch": 0.9297325750453055, "grad_norm": 3.28125, "learning_rate": 5.987584935683085e-06, "loss": 0.83314114, "memory(GiB)": 742.41, "step": 36650, "train_speed(iter/s)": 0.150538 }, { "acc": 0.75498915, "epoch": 0.9298594144143429, "grad_norm": 3.609375, "learning_rate": 5.9865569212885785e-06, "loss": 0.91464872, "memory(GiB)": 742.41, "step": 36655, "train_speed(iter/s)": 0.150516 }, { "acc": 0.76086464, "epoch": 0.9299862537833805, "grad_norm": 2.953125, "learning_rate": 5.985528863498578e-06, "loss": 0.90773458, "memory(GiB)": 742.41, "step": 36660, "train_speed(iter/s)": 0.150489 }, { "acc": 0.76918049, "epoch": 0.9301130931524181, "grad_norm": 3.4375, "learning_rate": 5.984500762358301e-06, "loss": 0.87144308, "memory(GiB)": 742.41, "step": 36665, "train_speed(iter/s)": 0.150468 }, { "acc": 0.75912561, "epoch": 0.9302399325214556, "grad_norm": 3.421875, "learning_rate": 5.983472617912973e-06, "loss": 0.91146326, "memory(GiB)": 742.41, "step": 36670, "train_speed(iter/s)": 0.150449 }, { "acc": 0.76390324, "epoch": 0.9303667718904932, "grad_norm": 3.25, "learning_rate": 5.982444430207816e-06, "loss": 0.9272851, "memory(GiB)": 742.41, "step": 36675, "train_speed(iter/s)": 0.150424 }, { "acc": 0.77189274, "epoch": 0.9304936112595308, "grad_norm": 3.515625, "learning_rate": 5.981416199288062e-06, "loss": 0.90827026, "memory(GiB)": 742.41, "step": 36680, "train_speed(iter/s)": 0.150407 }, { "acc": 0.75857158, "epoch": 0.9306204506285684, "grad_norm": 2.9375, "learning_rate": 5.980387925198935e-06, "loss": 0.9347086, "memory(GiB)": 742.41, "step": 36685, "train_speed(iter/s)": 0.150388 }, { "acc": 0.77666893, "epoch": 0.9307472899976059, "grad_norm": 3.375, "learning_rate": 5.979359607985666e-06, "loss": 0.87671738, "memory(GiB)": 742.41, "step": 36690, "train_speed(iter/s)": 0.150371 }, { "acc": 0.76595254, "epoch": 0.9308741293666435, "grad_norm": 3.953125, "learning_rate": 5.97833124769349e-06, "loss": 0.88916159, "memory(GiB)": 742.41, "step": 36695, "train_speed(iter/s)": 0.150352 }, { "acc": 0.76371536, "epoch": 0.9310009687356811, "grad_norm": 3.8125, "learning_rate": 5.977302844367639e-06, "loss": 0.90285339, "memory(GiB)": 742.41, "step": 36700, "train_speed(iter/s)": 0.150334 }, { "acc": 0.78097382, "epoch": 0.9311278081047186, "grad_norm": 3.15625, "learning_rate": 5.97627439805335e-06, "loss": 0.83653069, "memory(GiB)": 742.41, "step": 36705, "train_speed(iter/s)": 0.150314 }, { "acc": 0.77427883, "epoch": 0.9312546474737562, "grad_norm": 4.0, "learning_rate": 5.9752459087958595e-06, "loss": 0.87305679, "memory(GiB)": 742.41, "step": 36710, "train_speed(iter/s)": 0.150293 }, { "acc": 0.75767822, "epoch": 0.9313814868427936, "grad_norm": 3.734375, "learning_rate": 5.97421737664041e-06, "loss": 0.91340227, "memory(GiB)": 742.41, "step": 36715, "train_speed(iter/s)": 0.150272 }, { "acc": 0.76276026, "epoch": 0.9315083262118312, "grad_norm": 3.171875, "learning_rate": 5.973188801632242e-06, "loss": 0.93270302, "memory(GiB)": 742.41, "step": 36720, "train_speed(iter/s)": 0.15025 }, { "acc": 0.76179967, "epoch": 0.9316351655808688, "grad_norm": 4.0625, "learning_rate": 5.9721601838165985e-06, "loss": 0.94690046, "memory(GiB)": 742.41, "step": 36725, "train_speed(iter/s)": 0.150228 }, { "acc": 0.76512599, "epoch": 0.9317620049499064, "grad_norm": 2.859375, "learning_rate": 5.9711315232387265e-06, "loss": 0.8029355, "memory(GiB)": 742.41, "step": 36730, "train_speed(iter/s)": 0.150204 }, { "acc": 0.76880755, "epoch": 0.9318888443189439, "grad_norm": 3.921875, "learning_rate": 5.970102819943873e-06, "loss": 0.90844984, "memory(GiB)": 742.41, "step": 36735, "train_speed(iter/s)": 0.150173 }, { "acc": 0.77353067, "epoch": 0.9320156836879815, "grad_norm": 2.9375, "learning_rate": 5.969074073977288e-06, "loss": 0.87033768, "memory(GiB)": 742.41, "step": 36740, "train_speed(iter/s)": 0.150153 }, { "acc": 0.77210989, "epoch": 0.9321425230570191, "grad_norm": 6.3125, "learning_rate": 5.968045285384222e-06, "loss": 0.89930124, "memory(GiB)": 742.41, "step": 36745, "train_speed(iter/s)": 0.150129 }, { "acc": 0.77693853, "epoch": 0.9322693624260566, "grad_norm": 3.390625, "learning_rate": 5.967016454209928e-06, "loss": 0.86079102, "memory(GiB)": 742.41, "step": 36750, "train_speed(iter/s)": 0.150105 }, { "acc": 0.74932876, "epoch": 0.9323962017950942, "grad_norm": 4.28125, "learning_rate": 5.965987580499662e-06, "loss": 0.9621376, "memory(GiB)": 742.41, "step": 36755, "train_speed(iter/s)": 0.150086 }, { "acc": 0.78739157, "epoch": 0.9325230411641318, "grad_norm": 3.234375, "learning_rate": 5.964958664298679e-06, "loss": 0.8081172, "memory(GiB)": 742.41, "step": 36760, "train_speed(iter/s)": 0.150067 }, { "acc": 0.77002506, "epoch": 0.9326498805331693, "grad_norm": 3.4375, "learning_rate": 5.9639297056522385e-06, "loss": 0.88536386, "memory(GiB)": 742.41, "step": 36765, "train_speed(iter/s)": 0.150051 }, { "acc": 0.75746088, "epoch": 0.9327767199022069, "grad_norm": 3.25, "learning_rate": 5.962900704605603e-06, "loss": 0.93723431, "memory(GiB)": 742.41, "step": 36770, "train_speed(iter/s)": 0.150029 }, { "acc": 0.76448112, "epoch": 0.9329035592712444, "grad_norm": 3.453125, "learning_rate": 5.961871661204032e-06, "loss": 0.92950401, "memory(GiB)": 742.41, "step": 36775, "train_speed(iter/s)": 0.150003 }, { "acc": 0.77002654, "epoch": 0.9330303986402819, "grad_norm": 6.9375, "learning_rate": 5.9608425754927925e-06, "loss": 0.8948452, "memory(GiB)": 742.41, "step": 36780, "train_speed(iter/s)": 0.149981 }, { "acc": 0.76396704, "epoch": 0.9331572380093195, "grad_norm": 3.546875, "learning_rate": 5.959813447517149e-06, "loss": 0.92268238, "memory(GiB)": 742.41, "step": 36785, "train_speed(iter/s)": 0.149957 }, { "acc": 0.75774727, "epoch": 0.9332840773783571, "grad_norm": 3.171875, "learning_rate": 5.958784277322369e-06, "loss": 0.90939522, "memory(GiB)": 742.41, "step": 36790, "train_speed(iter/s)": 0.149933 }, { "acc": 0.76882687, "epoch": 0.9334109167473946, "grad_norm": 3.125, "learning_rate": 5.957755064953726e-06, "loss": 0.88698616, "memory(GiB)": 742.41, "step": 36795, "train_speed(iter/s)": 0.149915 }, { "acc": 0.76382012, "epoch": 0.9335377561164322, "grad_norm": 4.375, "learning_rate": 5.9567258104564875e-06, "loss": 0.90221376, "memory(GiB)": 742.41, "step": 36800, "train_speed(iter/s)": 0.149897 }, { "acc": 0.77106299, "epoch": 0.9336645954854698, "grad_norm": 3.046875, "learning_rate": 5.9556965138759294e-06, "loss": 0.86342316, "memory(GiB)": 742.41, "step": 36805, "train_speed(iter/s)": 0.149876 }, { "acc": 0.7663578, "epoch": 0.9337914348545073, "grad_norm": 4.15625, "learning_rate": 5.954667175257327e-06, "loss": 0.89709606, "memory(GiB)": 742.41, "step": 36810, "train_speed(iter/s)": 0.149856 }, { "acc": 0.75288987, "epoch": 0.9339182742235449, "grad_norm": 3.203125, "learning_rate": 5.953637794645956e-06, "loss": 0.93780327, "memory(GiB)": 742.41, "step": 36815, "train_speed(iter/s)": 0.149832 }, { "acc": 0.77130737, "epoch": 0.9340451135925825, "grad_norm": 3.25, "learning_rate": 5.9526083720870986e-06, "loss": 0.90350399, "memory(GiB)": 742.41, "step": 36820, "train_speed(iter/s)": 0.149812 }, { "acc": 0.78143792, "epoch": 0.93417195296162, "grad_norm": 3.0625, "learning_rate": 5.951578907626034e-06, "loss": 0.82930136, "memory(GiB)": 742.41, "step": 36825, "train_speed(iter/s)": 0.149793 }, { "acc": 0.77663474, "epoch": 0.9342987923306576, "grad_norm": 3.40625, "learning_rate": 5.9505494013080455e-06, "loss": 0.88501453, "memory(GiB)": 742.41, "step": 36830, "train_speed(iter/s)": 0.149768 }, { "acc": 0.77352948, "epoch": 0.9344256316996951, "grad_norm": 3.15625, "learning_rate": 5.949519853178416e-06, "loss": 0.89889307, "memory(GiB)": 742.41, "step": 36835, "train_speed(iter/s)": 0.149748 }, { "acc": 0.76833591, "epoch": 0.9345524710687326, "grad_norm": 3.921875, "learning_rate": 5.948490263282434e-06, "loss": 0.86461887, "memory(GiB)": 742.41, "step": 36840, "train_speed(iter/s)": 0.149729 }, { "acc": 0.7681663, "epoch": 0.9346793104377702, "grad_norm": 3.4375, "learning_rate": 5.947460631665387e-06, "loss": 0.93019361, "memory(GiB)": 742.41, "step": 36845, "train_speed(iter/s)": 0.149709 }, { "acc": 0.75967469, "epoch": 0.9348061498068078, "grad_norm": 3.328125, "learning_rate": 5.946430958372567e-06, "loss": 0.87437143, "memory(GiB)": 742.41, "step": 36850, "train_speed(iter/s)": 0.149683 }, { "acc": 0.77301702, "epoch": 0.9349329891758453, "grad_norm": 2.984375, "learning_rate": 5.945401243449262e-06, "loss": 0.87774696, "memory(GiB)": 742.41, "step": 36855, "train_speed(iter/s)": 0.149665 }, { "acc": 0.76861272, "epoch": 0.9350598285448829, "grad_norm": 3.3125, "learning_rate": 5.944371486940772e-06, "loss": 0.91275988, "memory(GiB)": 742.41, "step": 36860, "train_speed(iter/s)": 0.149645 }, { "acc": 0.78117681, "epoch": 0.9351866679139205, "grad_norm": 2.578125, "learning_rate": 5.943341688892386e-06, "loss": 0.86275635, "memory(GiB)": 742.41, "step": 36865, "train_speed(iter/s)": 0.149618 }, { "acc": 0.75426998, "epoch": 0.935313507282958, "grad_norm": 3.28125, "learning_rate": 5.942311849349408e-06, "loss": 0.95767937, "memory(GiB)": 742.41, "step": 36870, "train_speed(iter/s)": 0.149598 }, { "acc": 0.75846643, "epoch": 0.9354403466519956, "grad_norm": 3.25, "learning_rate": 5.941281968357133e-06, "loss": 0.91946354, "memory(GiB)": 742.41, "step": 36875, "train_speed(iter/s)": 0.149579 }, { "acc": 0.76981711, "epoch": 0.9355671860210332, "grad_norm": 3.90625, "learning_rate": 5.940252045960862e-06, "loss": 0.90475397, "memory(GiB)": 742.41, "step": 36880, "train_speed(iter/s)": 0.14956 }, { "acc": 0.76684871, "epoch": 0.9356940253900707, "grad_norm": 3.78125, "learning_rate": 5.9392220822059e-06, "loss": 0.90449076, "memory(GiB)": 742.41, "step": 36885, "train_speed(iter/s)": 0.149536 }, { "acc": 0.77292547, "epoch": 0.9358208647591083, "grad_norm": 3.921875, "learning_rate": 5.9381920771375515e-06, "loss": 0.90404959, "memory(GiB)": 742.41, "step": 36890, "train_speed(iter/s)": 0.149517 }, { "acc": 0.76050262, "epoch": 0.9359477041281458, "grad_norm": 3.28125, "learning_rate": 5.937162030801122e-06, "loss": 0.90910082, "memory(GiB)": 742.41, "step": 36895, "train_speed(iter/s)": 0.149498 }, { "acc": 0.76372681, "epoch": 0.9360745434971833, "grad_norm": 3.265625, "learning_rate": 5.936131943241922e-06, "loss": 0.91485062, "memory(GiB)": 742.41, "step": 36900, "train_speed(iter/s)": 0.149482 }, { "acc": 0.76269813, "epoch": 0.9362013828662209, "grad_norm": 3.109375, "learning_rate": 5.9351018145052596e-06, "loss": 0.92650204, "memory(GiB)": 742.41, "step": 36905, "train_speed(iter/s)": 0.14946 }, { "acc": 0.76064777, "epoch": 0.9363282222352585, "grad_norm": 3.046875, "learning_rate": 5.9340716446364476e-06, "loss": 0.90665951, "memory(GiB)": 742.41, "step": 36910, "train_speed(iter/s)": 0.149436 }, { "acc": 0.77497935, "epoch": 0.936455061604296, "grad_norm": 2.796875, "learning_rate": 5.933041433680801e-06, "loss": 0.83610773, "memory(GiB)": 742.41, "step": 36915, "train_speed(iter/s)": 0.149414 }, { "acc": 0.77615418, "epoch": 0.9365819009733336, "grad_norm": 3.421875, "learning_rate": 5.932011181683634e-06, "loss": 0.87913599, "memory(GiB)": 742.41, "step": 36920, "train_speed(iter/s)": 0.149389 }, { "acc": 0.76465793, "epoch": 0.9367087403423712, "grad_norm": 3.671875, "learning_rate": 5.9309808886902655e-06, "loss": 0.95117903, "memory(GiB)": 742.41, "step": 36925, "train_speed(iter/s)": 0.149366 }, { "acc": 0.76419649, "epoch": 0.9368355797114087, "grad_norm": 4.03125, "learning_rate": 5.929950554746012e-06, "loss": 0.8586977, "memory(GiB)": 742.41, "step": 36930, "train_speed(iter/s)": 0.149344 }, { "acc": 0.77944679, "epoch": 0.9369624190804463, "grad_norm": 3.171875, "learning_rate": 5.928920179896197e-06, "loss": 0.83902435, "memory(GiB)": 742.41, "step": 36935, "train_speed(iter/s)": 0.149325 }, { "acc": 0.7617424, "epoch": 0.9370892584494839, "grad_norm": 3.453125, "learning_rate": 5.927889764186144e-06, "loss": 0.88867369, "memory(GiB)": 742.41, "step": 36940, "train_speed(iter/s)": 0.149302 }, { "acc": 0.77844777, "epoch": 0.9372160978185214, "grad_norm": 3.390625, "learning_rate": 5.926859307661178e-06, "loss": 0.87018461, "memory(GiB)": 742.41, "step": 36945, "train_speed(iter/s)": 0.149283 }, { "acc": 0.77803907, "epoch": 0.937342937187559, "grad_norm": 2.765625, "learning_rate": 5.9258288103666215e-06, "loss": 0.83140612, "memory(GiB)": 742.41, "step": 36950, "train_speed(iter/s)": 0.149264 }, { "acc": 0.77256508, "epoch": 0.9374697765565965, "grad_norm": 2.8125, "learning_rate": 5.924798272347808e-06, "loss": 0.84238329, "memory(GiB)": 742.41, "step": 36955, "train_speed(iter/s)": 0.149245 }, { "acc": 0.7506536, "epoch": 0.937596615925634, "grad_norm": 3.96875, "learning_rate": 5.9237676936500634e-06, "loss": 0.96726856, "memory(GiB)": 742.41, "step": 36960, "train_speed(iter/s)": 0.149228 }, { "acc": 0.77284799, "epoch": 0.9377234552946716, "grad_norm": 3.40625, "learning_rate": 5.922737074318722e-06, "loss": 0.88581839, "memory(GiB)": 742.41, "step": 36965, "train_speed(iter/s)": 0.149207 }, { "acc": 0.75389881, "epoch": 0.9378502946637092, "grad_norm": 3.125, "learning_rate": 5.921706414399115e-06, "loss": 0.91587276, "memory(GiB)": 742.41, "step": 36970, "train_speed(iter/s)": 0.149189 }, { "acc": 0.76697407, "epoch": 0.9379771340327467, "grad_norm": 3.546875, "learning_rate": 5.920675713936582e-06, "loss": 0.87199621, "memory(GiB)": 742.41, "step": 36975, "train_speed(iter/s)": 0.149169 }, { "acc": 0.76340971, "epoch": 0.9381039734017843, "grad_norm": 3.734375, "learning_rate": 5.919644972976457e-06, "loss": 0.85391254, "memory(GiB)": 742.41, "step": 36980, "train_speed(iter/s)": 0.149151 }, { "acc": 0.75898619, "epoch": 0.9382308127708219, "grad_norm": 3.546875, "learning_rate": 5.918614191564079e-06, "loss": 0.92930193, "memory(GiB)": 742.41, "step": 36985, "train_speed(iter/s)": 0.149132 }, { "acc": 0.75563369, "epoch": 0.9383576521398594, "grad_norm": 4.28125, "learning_rate": 5.91758336974479e-06, "loss": 0.94419928, "memory(GiB)": 742.41, "step": 36990, "train_speed(iter/s)": 0.149112 }, { "acc": 0.76080317, "epoch": 0.938484491508897, "grad_norm": 3.546875, "learning_rate": 5.916552507563933e-06, "loss": 0.9269928, "memory(GiB)": 742.41, "step": 36995, "train_speed(iter/s)": 0.149093 }, { "acc": 0.77493792, "epoch": 0.9386113308779346, "grad_norm": 3.5, "learning_rate": 5.915521605066852e-06, "loss": 0.85573902, "memory(GiB)": 742.41, "step": 37000, "train_speed(iter/s)": 0.149074 }, { "epoch": 0.9386113308779346, "eval_acc": 0.7572429362858899, "eval_loss": 0.8592785000801086, "eval_runtime": 1150.3682, "eval_samples_per_second": 5.537, "eval_steps_per_second": 5.537, "step": 37000 }, { "acc": 0.78209996, "epoch": 0.9387381702469721, "grad_norm": 4.375, "learning_rate": 5.914490662298891e-06, "loss": 0.88479691, "memory(GiB)": 742.41, "step": 37005, "train_speed(iter/s)": 0.147935 }, { "acc": 0.77032824, "epoch": 0.9388650096160097, "grad_norm": 3.953125, "learning_rate": 5.9134596793054e-06, "loss": 0.87484932, "memory(GiB)": 742.41, "step": 37010, "train_speed(iter/s)": 0.147918 }, { "acc": 0.76559529, "epoch": 0.9389918489850472, "grad_norm": 3.546875, "learning_rate": 5.912428656131728e-06, "loss": 0.92775049, "memory(GiB)": 742.41, "step": 37015, "train_speed(iter/s)": 0.147899 }, { "acc": 0.74378757, "epoch": 0.9391186883540847, "grad_norm": 2.640625, "learning_rate": 5.911397592823227e-06, "loss": 0.91449585, "memory(GiB)": 742.41, "step": 37020, "train_speed(iter/s)": 0.147876 }, { "acc": 0.76432872, "epoch": 0.9392455277231223, "grad_norm": 3.734375, "learning_rate": 5.910366489425249e-06, "loss": 0.91482744, "memory(GiB)": 742.41, "step": 37025, "train_speed(iter/s)": 0.147856 }, { "acc": 0.75774145, "epoch": 0.9393723670921599, "grad_norm": 39.0, "learning_rate": 5.90933534598315e-06, "loss": 0.9130372, "memory(GiB)": 742.41, "step": 37030, "train_speed(iter/s)": 0.147838 }, { "acc": 0.74867716, "epoch": 0.9394992064611974, "grad_norm": 3.671875, "learning_rate": 5.908304162542287e-06, "loss": 0.92360992, "memory(GiB)": 742.41, "step": 37035, "train_speed(iter/s)": 0.147821 }, { "acc": 0.76732788, "epoch": 0.939626045830235, "grad_norm": 2.84375, "learning_rate": 5.907272939148018e-06, "loss": 0.89674301, "memory(GiB)": 742.41, "step": 37040, "train_speed(iter/s)": 0.147795 }, { "acc": 0.77102985, "epoch": 0.9397528851992726, "grad_norm": 3.9375, "learning_rate": 5.906241675845703e-06, "loss": 0.87424593, "memory(GiB)": 742.41, "step": 37045, "train_speed(iter/s)": 0.147779 }, { "acc": 0.75340805, "epoch": 0.9398797245683101, "grad_norm": 4.03125, "learning_rate": 5.905210372680704e-06, "loss": 0.96622705, "memory(GiB)": 742.41, "step": 37050, "train_speed(iter/s)": 0.147758 }, { "acc": 0.77970738, "epoch": 0.9400065639373477, "grad_norm": 3.40625, "learning_rate": 5.904179029698385e-06, "loss": 0.85999393, "memory(GiB)": 742.41, "step": 37055, "train_speed(iter/s)": 0.147741 }, { "acc": 0.77183037, "epoch": 0.9401334033063853, "grad_norm": 3.671875, "learning_rate": 5.903147646944111e-06, "loss": 0.91285067, "memory(GiB)": 742.41, "step": 37060, "train_speed(iter/s)": 0.147722 }, { "acc": 0.77395897, "epoch": 0.9402602426754229, "grad_norm": 3.8125, "learning_rate": 5.90211622446325e-06, "loss": 0.84128733, "memory(GiB)": 742.41, "step": 37065, "train_speed(iter/s)": 0.147702 }, { "acc": 0.77343259, "epoch": 0.9403870820444604, "grad_norm": 4.0, "learning_rate": 5.90108476230117e-06, "loss": 0.90070419, "memory(GiB)": 742.41, "step": 37070, "train_speed(iter/s)": 0.147683 }, { "acc": 0.7581615, "epoch": 0.9405139214134979, "grad_norm": 2.9375, "learning_rate": 5.9000532605032425e-06, "loss": 0.95975075, "memory(GiB)": 742.41, "step": 37075, "train_speed(iter/s)": 0.14766 }, { "acc": 0.7693531, "epoch": 0.9406407607825354, "grad_norm": 3.765625, "learning_rate": 5.89902171911484e-06, "loss": 0.8713583, "memory(GiB)": 742.41, "step": 37080, "train_speed(iter/s)": 0.147641 }, { "acc": 0.75969286, "epoch": 0.940767600151573, "grad_norm": 3.671875, "learning_rate": 5.897990138181337e-06, "loss": 0.91626177, "memory(GiB)": 742.41, "step": 37085, "train_speed(iter/s)": 0.14762 }, { "acc": 0.75760493, "epoch": 0.9408944395206106, "grad_norm": 3.65625, "learning_rate": 5.896958517748108e-06, "loss": 0.92967911, "memory(GiB)": 742.41, "step": 37090, "train_speed(iter/s)": 0.147601 }, { "acc": 0.77077417, "epoch": 0.9410212788896481, "grad_norm": 4.03125, "learning_rate": 5.895926857860532e-06, "loss": 0.87543449, "memory(GiB)": 742.41, "step": 37095, "train_speed(iter/s)": 0.147583 }, { "acc": 0.76417732, "epoch": 0.9411481182586857, "grad_norm": 3.21875, "learning_rate": 5.894895158563989e-06, "loss": 0.9039155, "memory(GiB)": 742.41, "step": 37100, "train_speed(iter/s)": 0.147566 }, { "acc": 0.7723947, "epoch": 0.9412749576277233, "grad_norm": 3.625, "learning_rate": 5.893863419903858e-06, "loss": 0.85367479, "memory(GiB)": 742.41, "step": 37105, "train_speed(iter/s)": 0.147546 }, { "acc": 0.77678847, "epoch": 0.9414017969967609, "grad_norm": 3.859375, "learning_rate": 5.8928316419255246e-06, "loss": 0.8374754, "memory(GiB)": 742.41, "step": 37110, "train_speed(iter/s)": 0.147527 }, { "acc": 0.772542, "epoch": 0.9415286363657984, "grad_norm": 3.234375, "learning_rate": 5.8917998246743715e-06, "loss": 0.92034254, "memory(GiB)": 742.41, "step": 37115, "train_speed(iter/s)": 0.147509 }, { "acc": 0.76371493, "epoch": 0.941655475734836, "grad_norm": 4.0, "learning_rate": 5.890767968195785e-06, "loss": 0.89251747, "memory(GiB)": 742.41, "step": 37120, "train_speed(iter/s)": 0.147492 }, { "acc": 0.76285725, "epoch": 0.9417823151038736, "grad_norm": 3.859375, "learning_rate": 5.889736072535155e-06, "loss": 0.91791201, "memory(GiB)": 742.41, "step": 37125, "train_speed(iter/s)": 0.147471 }, { "acc": 0.77861929, "epoch": 0.9419091544729111, "grad_norm": 3.59375, "learning_rate": 5.888704137737868e-06, "loss": 0.90927057, "memory(GiB)": 742.41, "step": 37130, "train_speed(iter/s)": 0.14745 }, { "acc": 0.75529981, "epoch": 0.9420359938419486, "grad_norm": 3.5, "learning_rate": 5.887672163849319e-06, "loss": 0.9053462, "memory(GiB)": 742.41, "step": 37135, "train_speed(iter/s)": 0.147431 }, { "acc": 0.77249641, "epoch": 0.9421628332109862, "grad_norm": 3.015625, "learning_rate": 5.8866401509149e-06, "loss": 0.85121202, "memory(GiB)": 742.41, "step": 37140, "train_speed(iter/s)": 0.147414 }, { "acc": 0.76264582, "epoch": 0.9422896725800237, "grad_norm": 3.96875, "learning_rate": 5.885608098980006e-06, "loss": 0.93701267, "memory(GiB)": 742.41, "step": 37145, "train_speed(iter/s)": 0.147396 }, { "acc": 0.76439328, "epoch": 0.9424165119490613, "grad_norm": 3.453125, "learning_rate": 5.884576008090032e-06, "loss": 0.93988428, "memory(GiB)": 742.41, "step": 37150, "train_speed(iter/s)": 0.147375 }, { "acc": 0.76707139, "epoch": 0.9425433513180989, "grad_norm": 3.578125, "learning_rate": 5.883543878290379e-06, "loss": 0.88687611, "memory(GiB)": 742.41, "step": 37155, "train_speed(iter/s)": 0.147353 }, { "acc": 0.76691766, "epoch": 0.9426701906871364, "grad_norm": 3.15625, "learning_rate": 5.8825117096264436e-06, "loss": 0.90380278, "memory(GiB)": 742.41, "step": 37160, "train_speed(iter/s)": 0.147336 }, { "acc": 0.76952834, "epoch": 0.942797030056174, "grad_norm": 3.0, "learning_rate": 5.881479502143633e-06, "loss": 0.87694492, "memory(GiB)": 742.41, "step": 37165, "train_speed(iter/s)": 0.147315 }, { "acc": 0.78374691, "epoch": 0.9429238694252116, "grad_norm": 3.90625, "learning_rate": 5.880447255887347e-06, "loss": 0.85101776, "memory(GiB)": 742.41, "step": 37170, "train_speed(iter/s)": 0.147301 }, { "acc": 0.76841931, "epoch": 0.9430507087942491, "grad_norm": 3.265625, "learning_rate": 5.87941497090299e-06, "loss": 0.86736946, "memory(GiB)": 742.41, "step": 37175, "train_speed(iter/s)": 0.14728 }, { "acc": 0.77058015, "epoch": 0.9431775481632867, "grad_norm": 3.859375, "learning_rate": 5.878382647235973e-06, "loss": 0.91935778, "memory(GiB)": 742.41, "step": 37180, "train_speed(iter/s)": 0.147261 }, { "acc": 0.77460642, "epoch": 0.9433043875323243, "grad_norm": 13.1875, "learning_rate": 5.8773502849317e-06, "loss": 0.81994648, "memory(GiB)": 742.41, "step": 37185, "train_speed(iter/s)": 0.147238 }, { "acc": 0.78320971, "epoch": 0.9434312269013618, "grad_norm": 4.0625, "learning_rate": 5.876317884035584e-06, "loss": 0.88269005, "memory(GiB)": 742.41, "step": 37190, "train_speed(iter/s)": 0.147222 }, { "acc": 0.78043895, "epoch": 0.9435580662703993, "grad_norm": 3.859375, "learning_rate": 5.875285444593035e-06, "loss": 0.85083256, "memory(GiB)": 742.41, "step": 37195, "train_speed(iter/s)": 0.147202 }, { "acc": 0.76653447, "epoch": 0.9436849056394369, "grad_norm": 3.046875, "learning_rate": 5.874252966649471e-06, "loss": 0.8518425, "memory(GiB)": 742.41, "step": 37200, "train_speed(iter/s)": 0.14718 }, { "acc": 0.76260366, "epoch": 0.9438117450084744, "grad_norm": 3.5, "learning_rate": 5.873220450250302e-06, "loss": 0.8713747, "memory(GiB)": 742.41, "step": 37205, "train_speed(iter/s)": 0.147161 }, { "acc": 0.76457682, "epoch": 0.943938584377512, "grad_norm": 3.6875, "learning_rate": 5.8721878954409485e-06, "loss": 0.90271578, "memory(GiB)": 742.41, "step": 37210, "train_speed(iter/s)": 0.147142 }, { "acc": 0.77422175, "epoch": 0.9440654237465496, "grad_norm": 3.671875, "learning_rate": 5.871155302266829e-06, "loss": 0.90592194, "memory(GiB)": 742.41, "step": 37215, "train_speed(iter/s)": 0.147122 }, { "acc": 0.76537352, "epoch": 0.9441922631155871, "grad_norm": 3.203125, "learning_rate": 5.8701226707733625e-06, "loss": 0.8618084, "memory(GiB)": 742.41, "step": 37220, "train_speed(iter/s)": 0.1471 }, { "acc": 0.77102556, "epoch": 0.9443191024846247, "grad_norm": 2.6875, "learning_rate": 5.869090001005972e-06, "loss": 0.87491665, "memory(GiB)": 742.41, "step": 37225, "train_speed(iter/s)": 0.14708 }, { "acc": 0.7693676, "epoch": 0.9444459418536623, "grad_norm": 3.4375, "learning_rate": 5.868057293010081e-06, "loss": 0.90174856, "memory(GiB)": 742.41, "step": 37230, "train_speed(iter/s)": 0.147061 }, { "acc": 0.77211585, "epoch": 0.9445727812226998, "grad_norm": 3.609375, "learning_rate": 5.867024546831117e-06, "loss": 0.88139629, "memory(GiB)": 742.41, "step": 37235, "train_speed(iter/s)": 0.147042 }, { "acc": 0.77474833, "epoch": 0.9446996205917374, "grad_norm": 4.28125, "learning_rate": 5.865991762514505e-06, "loss": 0.88792124, "memory(GiB)": 742.41, "step": 37240, "train_speed(iter/s)": 0.147017 }, { "acc": 0.76951733, "epoch": 0.944826459960775, "grad_norm": 3.328125, "learning_rate": 5.864958940105675e-06, "loss": 0.96197948, "memory(GiB)": 742.41, "step": 37245, "train_speed(iter/s)": 0.147001 }, { "acc": 0.75356154, "epoch": 0.9449532993298125, "grad_norm": 3.40625, "learning_rate": 5.863926079650057e-06, "loss": 0.95921774, "memory(GiB)": 742.41, "step": 37250, "train_speed(iter/s)": 0.146983 }, { "acc": 0.76141129, "epoch": 0.94508013869885, "grad_norm": 3.296875, "learning_rate": 5.862893181193084e-06, "loss": 0.85557823, "memory(GiB)": 742.41, "step": 37255, "train_speed(iter/s)": 0.146962 }, { "acc": 0.76875839, "epoch": 0.9452069780678876, "grad_norm": 3.046875, "learning_rate": 5.8618602447801884e-06, "loss": 0.88965893, "memory(GiB)": 742.41, "step": 37260, "train_speed(iter/s)": 0.146942 }, { "acc": 0.77704086, "epoch": 0.9453338174369251, "grad_norm": 3.515625, "learning_rate": 5.860827270456806e-06, "loss": 0.88155575, "memory(GiB)": 742.41, "step": 37265, "train_speed(iter/s)": 0.146923 }, { "acc": 0.76290793, "epoch": 0.9454606568059627, "grad_norm": 3.734375, "learning_rate": 5.859794258268377e-06, "loss": 0.90912647, "memory(GiB)": 742.41, "step": 37270, "train_speed(iter/s)": 0.146906 }, { "acc": 0.7732872, "epoch": 0.9455874961750003, "grad_norm": 3.578125, "learning_rate": 5.858761208260335e-06, "loss": 0.88089561, "memory(GiB)": 742.41, "step": 37275, "train_speed(iter/s)": 0.146883 }, { "acc": 0.7752316, "epoch": 0.9457143355440378, "grad_norm": 3.21875, "learning_rate": 5.857728120478126e-06, "loss": 0.87234974, "memory(GiB)": 742.41, "step": 37280, "train_speed(iter/s)": 0.146859 }, { "acc": 0.77805901, "epoch": 0.9458411749130754, "grad_norm": 3.15625, "learning_rate": 5.856694994967188e-06, "loss": 0.85362158, "memory(GiB)": 742.41, "step": 37285, "train_speed(iter/s)": 0.146842 }, { "acc": 0.76420403, "epoch": 0.945968014282113, "grad_norm": 3.75, "learning_rate": 5.855661831772968e-06, "loss": 0.90083561, "memory(GiB)": 742.41, "step": 37290, "train_speed(iter/s)": 0.146823 }, { "acc": 0.76709075, "epoch": 0.9460948536511505, "grad_norm": 3.203125, "learning_rate": 5.854628630940912e-06, "loss": 0.93688107, "memory(GiB)": 742.41, "step": 37295, "train_speed(iter/s)": 0.146801 }, { "acc": 0.78167877, "epoch": 0.9462216930201881, "grad_norm": 3.9375, "learning_rate": 5.853595392516463e-06, "loss": 0.84331617, "memory(GiB)": 742.41, "step": 37300, "train_speed(iter/s)": 0.146785 }, { "acc": 0.76653185, "epoch": 0.9463485323892257, "grad_norm": 3.171875, "learning_rate": 5.852562116545073e-06, "loss": 0.92680855, "memory(GiB)": 742.41, "step": 37305, "train_speed(iter/s)": 0.146768 }, { "acc": 0.76024876, "epoch": 0.9464753717582632, "grad_norm": 3.140625, "learning_rate": 5.851528803072192e-06, "loss": 0.89925652, "memory(GiB)": 742.41, "step": 37310, "train_speed(iter/s)": 0.146751 }, { "acc": 0.76174545, "epoch": 0.9466022111273007, "grad_norm": 3.453125, "learning_rate": 5.850495452143272e-06, "loss": 0.93244247, "memory(GiB)": 742.41, "step": 37315, "train_speed(iter/s)": 0.146729 }, { "acc": 0.76745872, "epoch": 0.9467290504963383, "grad_norm": 3.796875, "learning_rate": 5.849462063803767e-06, "loss": 0.91045647, "memory(GiB)": 742.41, "step": 37320, "train_speed(iter/s)": 0.146709 }, { "acc": 0.76834364, "epoch": 0.9468558898653758, "grad_norm": 4.28125, "learning_rate": 5.848428638099133e-06, "loss": 0.90730286, "memory(GiB)": 742.41, "step": 37325, "train_speed(iter/s)": 0.146686 }, { "acc": 0.76063142, "epoch": 0.9469827292344134, "grad_norm": 3.359375, "learning_rate": 5.847395175074826e-06, "loss": 0.94201746, "memory(GiB)": 742.41, "step": 37330, "train_speed(iter/s)": 0.146667 }, { "acc": 0.76763368, "epoch": 0.947109568603451, "grad_norm": 3.484375, "learning_rate": 5.846361674776305e-06, "loss": 0.85472651, "memory(GiB)": 742.41, "step": 37335, "train_speed(iter/s)": 0.146645 }, { "acc": 0.77417002, "epoch": 0.9472364079724885, "grad_norm": 3.671875, "learning_rate": 5.845328137249031e-06, "loss": 0.88330994, "memory(GiB)": 742.41, "step": 37340, "train_speed(iter/s)": 0.146623 }, { "acc": 0.7659215, "epoch": 0.9473632473415261, "grad_norm": 3.671875, "learning_rate": 5.844294562538465e-06, "loss": 0.91046963, "memory(GiB)": 742.41, "step": 37345, "train_speed(iter/s)": 0.146605 }, { "acc": 0.77226348, "epoch": 0.9474900867105637, "grad_norm": 3.015625, "learning_rate": 5.8432609506900714e-06, "loss": 0.89876442, "memory(GiB)": 742.41, "step": 37350, "train_speed(iter/s)": 0.146583 }, { "acc": 0.78484392, "epoch": 0.9476169260796012, "grad_norm": 3.515625, "learning_rate": 5.842227301749316e-06, "loss": 0.80630798, "memory(GiB)": 742.41, "step": 37355, "train_speed(iter/s)": 0.146564 }, { "acc": 0.76104107, "epoch": 0.9477437654486388, "grad_norm": 3.359375, "learning_rate": 5.841193615761664e-06, "loss": 0.93449583, "memory(GiB)": 742.41, "step": 37360, "train_speed(iter/s)": 0.146543 }, { "acc": 0.76899061, "epoch": 0.9478706048176764, "grad_norm": 3.8125, "learning_rate": 5.840159892772586e-06, "loss": 0.88523121, "memory(GiB)": 742.41, "step": 37365, "train_speed(iter/s)": 0.146523 }, { "acc": 0.76537924, "epoch": 0.947997444186714, "grad_norm": 2.953125, "learning_rate": 5.83912613282755e-06, "loss": 0.86414728, "memory(GiB)": 742.41, "step": 37370, "train_speed(iter/s)": 0.146504 }, { "acc": 0.76337523, "epoch": 0.9481242835557514, "grad_norm": 3.453125, "learning_rate": 5.838092335972031e-06, "loss": 0.86465836, "memory(GiB)": 742.41, "step": 37375, "train_speed(iter/s)": 0.146485 }, { "acc": 0.76240263, "epoch": 0.948251122924789, "grad_norm": 3.359375, "learning_rate": 5.837058502251499e-06, "loss": 0.87340879, "memory(GiB)": 742.41, "step": 37380, "train_speed(iter/s)": 0.146465 }, { "acc": 0.7600472, "epoch": 0.9483779622938265, "grad_norm": 3.75, "learning_rate": 5.836024631711431e-06, "loss": 0.92422915, "memory(GiB)": 742.41, "step": 37385, "train_speed(iter/s)": 0.146448 }, { "acc": 0.77358308, "epoch": 0.9485048016628641, "grad_norm": 3.0625, "learning_rate": 5.834990724397302e-06, "loss": 0.88088293, "memory(GiB)": 742.41, "step": 37390, "train_speed(iter/s)": 0.146431 }, { "acc": 0.77205935, "epoch": 0.9486316410319017, "grad_norm": 6.25, "learning_rate": 5.833956780354594e-06, "loss": 0.84875574, "memory(GiB)": 742.41, "step": 37395, "train_speed(iter/s)": 0.146412 }, { "acc": 0.75746775, "epoch": 0.9487584804009392, "grad_norm": 4.03125, "learning_rate": 5.832922799628783e-06, "loss": 0.83097706, "memory(GiB)": 742.41, "step": 37400, "train_speed(iter/s)": 0.146389 }, { "acc": 0.76636968, "epoch": 0.9488853197699768, "grad_norm": 4.1875, "learning_rate": 5.831888782265353e-06, "loss": 0.89633446, "memory(GiB)": 742.41, "step": 37405, "train_speed(iter/s)": 0.146373 }, { "acc": 0.76744056, "epoch": 0.9490121591390144, "grad_norm": 3.8125, "learning_rate": 5.830854728309785e-06, "loss": 0.86323032, "memory(GiB)": 742.41, "step": 37410, "train_speed(iter/s)": 0.146355 }, { "acc": 0.76221485, "epoch": 0.949138998508052, "grad_norm": 2.84375, "learning_rate": 5.829820637807564e-06, "loss": 0.89122658, "memory(GiB)": 742.41, "step": 37415, "train_speed(iter/s)": 0.146332 }, { "acc": 0.75893917, "epoch": 0.9492658378770895, "grad_norm": 3.3125, "learning_rate": 5.8287865108041786e-06, "loss": 0.92676277, "memory(GiB)": 742.41, "step": 37420, "train_speed(iter/s)": 0.146313 }, { "acc": 0.76874595, "epoch": 0.9493926772461271, "grad_norm": 3.140625, "learning_rate": 5.827752347345114e-06, "loss": 0.86889534, "memory(GiB)": 742.41, "step": 37425, "train_speed(iter/s)": 0.146292 }, { "acc": 0.78049355, "epoch": 0.9495195166151646, "grad_norm": 2.828125, "learning_rate": 5.826718147475862e-06, "loss": 0.86900406, "memory(GiB)": 742.41, "step": 37430, "train_speed(iter/s)": 0.146272 }, { "acc": 0.76820502, "epoch": 0.9496463559842021, "grad_norm": 3.78125, "learning_rate": 5.825683911241911e-06, "loss": 0.85852938, "memory(GiB)": 742.41, "step": 37435, "train_speed(iter/s)": 0.146249 }, { "acc": 0.78146939, "epoch": 0.9497731953532397, "grad_norm": 15.875, "learning_rate": 5.824649638688757e-06, "loss": 0.86434259, "memory(GiB)": 742.41, "step": 37440, "train_speed(iter/s)": 0.14623 }, { "acc": 0.7578105, "epoch": 0.9499000347222772, "grad_norm": 3.9375, "learning_rate": 5.823615329861892e-06, "loss": 0.91677189, "memory(GiB)": 742.41, "step": 37445, "train_speed(iter/s)": 0.14621 }, { "acc": 0.76111965, "epoch": 0.9500268740913148, "grad_norm": 4.125, "learning_rate": 5.8225809848068146e-06, "loss": 0.91548319, "memory(GiB)": 742.41, "step": 37450, "train_speed(iter/s)": 0.146193 }, { "acc": 0.76256938, "epoch": 0.9501537134603524, "grad_norm": 3.65625, "learning_rate": 5.82154660356902e-06, "loss": 0.91093988, "memory(GiB)": 742.41, "step": 37455, "train_speed(iter/s)": 0.146176 }, { "acc": 0.76960678, "epoch": 0.95028055282939, "grad_norm": 3.5625, "learning_rate": 5.820512186194007e-06, "loss": 0.91659002, "memory(GiB)": 742.41, "step": 37460, "train_speed(iter/s)": 0.146158 }, { "acc": 0.76939588, "epoch": 0.9504073921984275, "grad_norm": 4.0, "learning_rate": 5.819477732727278e-06, "loss": 0.90051565, "memory(GiB)": 742.41, "step": 37465, "train_speed(iter/s)": 0.146136 }, { "acc": 0.75370817, "epoch": 0.9505342315674651, "grad_norm": 3.03125, "learning_rate": 5.8184432432143335e-06, "loss": 0.85818567, "memory(GiB)": 742.41, "step": 37470, "train_speed(iter/s)": 0.146116 }, { "acc": 0.76074514, "epoch": 0.9506610709365027, "grad_norm": 3.25, "learning_rate": 5.81740871770068e-06, "loss": 0.91871986, "memory(GiB)": 742.41, "step": 37475, "train_speed(iter/s)": 0.146097 }, { "acc": 0.76325331, "epoch": 0.9507879103055402, "grad_norm": 3.171875, "learning_rate": 5.81637415623182e-06, "loss": 0.89976215, "memory(GiB)": 742.41, "step": 37480, "train_speed(iter/s)": 0.14608 }, { "acc": 0.77616506, "epoch": 0.9509147496745778, "grad_norm": 3.65625, "learning_rate": 5.8153395588532635e-06, "loss": 0.85073023, "memory(GiB)": 742.41, "step": 37485, "train_speed(iter/s)": 0.146063 }, { "acc": 0.77791533, "epoch": 0.9510415890436154, "grad_norm": 3.5625, "learning_rate": 5.8143049256105155e-06, "loss": 0.87292728, "memory(GiB)": 742.41, "step": 37490, "train_speed(iter/s)": 0.146044 }, { "acc": 0.75757642, "epoch": 0.9511684284126528, "grad_norm": 3.03125, "learning_rate": 5.81327025654909e-06, "loss": 0.94425859, "memory(GiB)": 742.41, "step": 37495, "train_speed(iter/s)": 0.146027 }, { "acc": 0.77589707, "epoch": 0.9512952677816904, "grad_norm": 3.953125, "learning_rate": 5.812235551714497e-06, "loss": 0.84890318, "memory(GiB)": 742.41, "step": 37500, "train_speed(iter/s)": 0.146008 }, { "epoch": 0.9512952677816904, "eval_acc": 0.7573945858192229, "eval_loss": 0.8586077094078064, "eval_runtime": 1152.8663, "eval_samples_per_second": 5.525, "eval_steps_per_second": 5.525, "step": 37500 }, { "acc": 0.77073298, "epoch": 0.951422107150728, "grad_norm": 3.84375, "learning_rate": 5.81120081115225e-06, "loss": 0.88070383, "memory(GiB)": 742.41, "step": 37505, "train_speed(iter/s)": 0.144929 }, { "acc": 0.77096543, "epoch": 0.9515489465197655, "grad_norm": 3.703125, "learning_rate": 5.8101660349078644e-06, "loss": 0.85291061, "memory(GiB)": 742.41, "step": 37510, "train_speed(iter/s)": 0.14491 }, { "acc": 0.76167746, "epoch": 0.9516757858888031, "grad_norm": 3.609375, "learning_rate": 5.809131223026854e-06, "loss": 0.89557838, "memory(GiB)": 742.41, "step": 37515, "train_speed(iter/s)": 0.144892 }, { "acc": 0.75686474, "epoch": 0.9518026252578407, "grad_norm": 3.5625, "learning_rate": 5.808096375554742e-06, "loss": 0.91065989, "memory(GiB)": 742.41, "step": 37520, "train_speed(iter/s)": 0.144875 }, { "acc": 0.75205188, "epoch": 0.9519294646268782, "grad_norm": 3.734375, "learning_rate": 5.807061492537043e-06, "loss": 0.9127552, "memory(GiB)": 742.41, "step": 37525, "train_speed(iter/s)": 0.14486 }, { "acc": 0.78182225, "epoch": 0.9520563039959158, "grad_norm": 3.328125, "learning_rate": 5.806026574019281e-06, "loss": 0.86611557, "memory(GiB)": 742.41, "step": 37530, "train_speed(iter/s)": 0.144843 }, { "acc": 0.75791764, "epoch": 0.9521831433649534, "grad_norm": 3.203125, "learning_rate": 5.804991620046979e-06, "loss": 0.91872511, "memory(GiB)": 742.41, "step": 37535, "train_speed(iter/s)": 0.14482 }, { "acc": 0.76695938, "epoch": 0.9523099827339909, "grad_norm": 3.890625, "learning_rate": 5.803956630665661e-06, "loss": 0.8940073, "memory(GiB)": 742.41, "step": 37540, "train_speed(iter/s)": 0.1448 }, { "acc": 0.76917334, "epoch": 0.9524368221030285, "grad_norm": 3.6875, "learning_rate": 5.8029216059208505e-06, "loss": 0.85739231, "memory(GiB)": 742.41, "step": 37545, "train_speed(iter/s)": 0.144782 }, { "acc": 0.76401463, "epoch": 0.9525636614720661, "grad_norm": 3.28125, "learning_rate": 5.8018865458580775e-06, "loss": 0.90498075, "memory(GiB)": 742.41, "step": 37550, "train_speed(iter/s)": 0.144765 }, { "acc": 0.75929346, "epoch": 0.9526905008411035, "grad_norm": 3.28125, "learning_rate": 5.8008514505228706e-06, "loss": 0.931954, "memory(GiB)": 742.41, "step": 37555, "train_speed(iter/s)": 0.144745 }, { "acc": 0.76659093, "epoch": 0.9528173402101411, "grad_norm": 4.125, "learning_rate": 5.799816319960759e-06, "loss": 0.91477823, "memory(GiB)": 742.41, "step": 37560, "train_speed(iter/s)": 0.144725 }, { "acc": 0.7842617, "epoch": 0.9529441795791787, "grad_norm": 3.375, "learning_rate": 5.798781154217278e-06, "loss": 0.85518246, "memory(GiB)": 742.41, "step": 37565, "train_speed(iter/s)": 0.144709 }, { "acc": 0.77144046, "epoch": 0.9530710189482162, "grad_norm": 4.0, "learning_rate": 5.797745953337957e-06, "loss": 0.88169985, "memory(GiB)": 742.41, "step": 37570, "train_speed(iter/s)": 0.144692 }, { "acc": 0.76585984, "epoch": 0.9531978583172538, "grad_norm": 3.46875, "learning_rate": 5.7967107173683345e-06, "loss": 0.89636793, "memory(GiB)": 742.41, "step": 37575, "train_speed(iter/s)": 0.144676 }, { "acc": 0.75629368, "epoch": 0.9533246976862914, "grad_norm": 3.484375, "learning_rate": 5.795675446353947e-06, "loss": 0.93630342, "memory(GiB)": 742.41, "step": 37580, "train_speed(iter/s)": 0.144657 }, { "acc": 0.76271753, "epoch": 0.9534515370553289, "grad_norm": 3.953125, "learning_rate": 5.794640140340329e-06, "loss": 0.91703529, "memory(GiB)": 742.41, "step": 37585, "train_speed(iter/s)": 0.144642 }, { "acc": 0.76403947, "epoch": 0.9535783764243665, "grad_norm": 4.3125, "learning_rate": 5.793604799373026e-06, "loss": 0.90620937, "memory(GiB)": 742.41, "step": 37590, "train_speed(iter/s)": 0.144622 }, { "acc": 0.7670228, "epoch": 0.9537052157934041, "grad_norm": 3.515625, "learning_rate": 5.792569423497575e-06, "loss": 0.89792347, "memory(GiB)": 742.41, "step": 37595, "train_speed(iter/s)": 0.144607 }, { "acc": 0.7737432, "epoch": 0.9538320551624416, "grad_norm": 3.328125, "learning_rate": 5.791534012759521e-06, "loss": 0.8975028, "memory(GiB)": 742.41, "step": 37600, "train_speed(iter/s)": 0.14459 }, { "acc": 0.76923366, "epoch": 0.9539588945314792, "grad_norm": 3.84375, "learning_rate": 5.7904985672044054e-06, "loss": 0.9445343, "memory(GiB)": 742.41, "step": 37605, "train_speed(iter/s)": 0.144571 }, { "acc": 0.76365957, "epoch": 0.9540857339005168, "grad_norm": 3.09375, "learning_rate": 5.78946308687778e-06, "loss": 0.93322287, "memory(GiB)": 742.41, "step": 37610, "train_speed(iter/s)": 0.144548 }, { "acc": 0.77403808, "epoch": 0.9542125732695542, "grad_norm": 3.859375, "learning_rate": 5.788427571825186e-06, "loss": 0.88737268, "memory(GiB)": 742.41, "step": 37615, "train_speed(iter/s)": 0.144532 }, { "acc": 0.77500973, "epoch": 0.9543394126385918, "grad_norm": 4.03125, "learning_rate": 5.787392022092177e-06, "loss": 0.83609715, "memory(GiB)": 742.41, "step": 37620, "train_speed(iter/s)": 0.144512 }, { "acc": 0.77797732, "epoch": 0.9544662520076294, "grad_norm": 3.203125, "learning_rate": 5.786356437724301e-06, "loss": 0.86126003, "memory(GiB)": 742.41, "step": 37625, "train_speed(iter/s)": 0.144491 }, { "acc": 0.76765122, "epoch": 0.9545930913766669, "grad_norm": 4.90625, "learning_rate": 5.7853208187671106e-06, "loss": 0.88406706, "memory(GiB)": 742.41, "step": 37630, "train_speed(iter/s)": 0.144474 }, { "acc": 0.7540019, "epoch": 0.9547199307457045, "grad_norm": 3.453125, "learning_rate": 5.78428516526616e-06, "loss": 0.87987118, "memory(GiB)": 742.41, "step": 37635, "train_speed(iter/s)": 0.144457 }, { "acc": 0.76579132, "epoch": 0.9548467701147421, "grad_norm": 4.0625, "learning_rate": 5.783249477267003e-06, "loss": 0.92519569, "memory(GiB)": 742.41, "step": 37640, "train_speed(iter/s)": 0.144438 }, { "acc": 0.76681604, "epoch": 0.9549736094837796, "grad_norm": 3.0, "learning_rate": 5.782213754815199e-06, "loss": 0.90408735, "memory(GiB)": 742.41, "step": 37645, "train_speed(iter/s)": 0.144418 }, { "acc": 0.77037053, "epoch": 0.9551004488528172, "grad_norm": 3.375, "learning_rate": 5.781177997956302e-06, "loss": 0.85119648, "memory(GiB)": 742.41, "step": 37650, "train_speed(iter/s)": 0.144395 }, { "acc": 0.77048488, "epoch": 0.9552272882218548, "grad_norm": 3.09375, "learning_rate": 5.780142206735875e-06, "loss": 0.90536737, "memory(GiB)": 742.41, "step": 37655, "train_speed(iter/s)": 0.144377 }, { "acc": 0.78287067, "epoch": 0.9553541275908923, "grad_norm": 3.390625, "learning_rate": 5.7791063811994776e-06, "loss": 0.87296791, "memory(GiB)": 742.41, "step": 37660, "train_speed(iter/s)": 0.14436 }, { "acc": 0.76798863, "epoch": 0.9554809669599299, "grad_norm": 3.3125, "learning_rate": 5.7780705213926725e-06, "loss": 0.89614191, "memory(GiB)": 742.41, "step": 37665, "train_speed(iter/s)": 0.144346 }, { "acc": 0.75519538, "epoch": 0.9556078063289675, "grad_norm": 2.859375, "learning_rate": 5.777034627361025e-06, "loss": 0.90060587, "memory(GiB)": 742.41, "step": 37670, "train_speed(iter/s)": 0.144324 }, { "acc": 0.7617197, "epoch": 0.9557346456980049, "grad_norm": 2.84375, "learning_rate": 5.775998699150099e-06, "loss": 0.88350201, "memory(GiB)": 742.41, "step": 37675, "train_speed(iter/s)": 0.144301 }, { "acc": 0.77547426, "epoch": 0.9558614850670425, "grad_norm": 4.1875, "learning_rate": 5.774962736805465e-06, "loss": 0.8100173, "memory(GiB)": 742.41, "step": 37680, "train_speed(iter/s)": 0.144281 }, { "acc": 0.76531792, "epoch": 0.9559883244360801, "grad_norm": 3.71875, "learning_rate": 5.773926740372688e-06, "loss": 0.89469957, "memory(GiB)": 742.41, "step": 37685, "train_speed(iter/s)": 0.144262 }, { "acc": 0.76222272, "epoch": 0.9561151638051176, "grad_norm": 3.296875, "learning_rate": 5.7728907098973385e-06, "loss": 0.87129202, "memory(GiB)": 742.41, "step": 37690, "train_speed(iter/s)": 0.144241 }, { "acc": 0.77622418, "epoch": 0.9562420031741552, "grad_norm": 3.984375, "learning_rate": 5.771854645424991e-06, "loss": 0.85848484, "memory(GiB)": 742.41, "step": 37695, "train_speed(iter/s)": 0.144223 }, { "acc": 0.76347132, "epoch": 0.9563688425431928, "grad_norm": 3.390625, "learning_rate": 5.770818547001216e-06, "loss": 0.88443251, "memory(GiB)": 742.41, "step": 37700, "train_speed(iter/s)": 0.144209 }, { "acc": 0.77470131, "epoch": 0.9564956819122303, "grad_norm": 3.171875, "learning_rate": 5.76978241467159e-06, "loss": 0.86120024, "memory(GiB)": 742.41, "step": 37705, "train_speed(iter/s)": 0.144191 }, { "acc": 0.76719713, "epoch": 0.9566225212812679, "grad_norm": 3.40625, "learning_rate": 5.768746248481687e-06, "loss": 0.89183445, "memory(GiB)": 742.41, "step": 37710, "train_speed(iter/s)": 0.144174 }, { "acc": 0.77949319, "epoch": 0.9567493606503055, "grad_norm": 3.84375, "learning_rate": 5.767710048477086e-06, "loss": 0.86830235, "memory(GiB)": 742.41, "step": 37715, "train_speed(iter/s)": 0.144158 }, { "acc": 0.77298861, "epoch": 0.956876200019343, "grad_norm": 3.8125, "learning_rate": 5.766673814703366e-06, "loss": 0.85176964, "memory(GiB)": 742.41, "step": 37720, "train_speed(iter/s)": 0.144139 }, { "acc": 0.75383391, "epoch": 0.9570030393883806, "grad_norm": 3.265625, "learning_rate": 5.765637547206109e-06, "loss": 0.92099123, "memory(GiB)": 742.41, "step": 37725, "train_speed(iter/s)": 0.144124 }, { "acc": 0.78408809, "epoch": 0.9571298787574182, "grad_norm": 5.1875, "learning_rate": 5.764601246030894e-06, "loss": 0.89335346, "memory(GiB)": 742.41, "step": 37730, "train_speed(iter/s)": 0.144106 }, { "acc": 0.78862028, "epoch": 0.9572567181264556, "grad_norm": 4.03125, "learning_rate": 5.763564911223308e-06, "loss": 0.82036543, "memory(GiB)": 742.41, "step": 37735, "train_speed(iter/s)": 0.144089 }, { "acc": 0.76814928, "epoch": 0.9573835574954932, "grad_norm": 3.25, "learning_rate": 5.762528542828933e-06, "loss": 0.85497971, "memory(GiB)": 742.41, "step": 37740, "train_speed(iter/s)": 0.14407 }, { "acc": 0.76648717, "epoch": 0.9575103968645308, "grad_norm": 3.5625, "learning_rate": 5.7614921408933585e-06, "loss": 0.86549406, "memory(GiB)": 742.41, "step": 37745, "train_speed(iter/s)": 0.14405 }, { "acc": 0.75091896, "epoch": 0.9576372362335683, "grad_norm": 3.46875, "learning_rate": 5.7604557054621716e-06, "loss": 0.91280365, "memory(GiB)": 742.41, "step": 37750, "train_speed(iter/s)": 0.144031 }, { "acc": 0.77807856, "epoch": 0.9577640756026059, "grad_norm": 3.75, "learning_rate": 5.759419236580959e-06, "loss": 0.84186106, "memory(GiB)": 742.41, "step": 37755, "train_speed(iter/s)": 0.144012 }, { "acc": 0.77333183, "epoch": 0.9578909149716435, "grad_norm": 3.125, "learning_rate": 5.758382734295315e-06, "loss": 0.82585325, "memory(GiB)": 742.41, "step": 37760, "train_speed(iter/s)": 0.143991 }, { "acc": 0.77670441, "epoch": 0.958017754340681, "grad_norm": 3.921875, "learning_rate": 5.75734619865083e-06, "loss": 0.85356932, "memory(GiB)": 742.41, "step": 37765, "train_speed(iter/s)": 0.143974 }, { "acc": 0.76825352, "epoch": 0.9581445937097186, "grad_norm": 3.28125, "learning_rate": 5.7563096296930995e-06, "loss": 0.8585887, "memory(GiB)": 742.41, "step": 37770, "train_speed(iter/s)": 0.143959 }, { "acc": 0.76594739, "epoch": 0.9582714330787562, "grad_norm": 3.46875, "learning_rate": 5.7552730274677175e-06, "loss": 0.87677221, "memory(GiB)": 742.41, "step": 37775, "train_speed(iter/s)": 0.143942 }, { "acc": 0.75890131, "epoch": 0.9583982724477937, "grad_norm": 3.34375, "learning_rate": 5.754236392020281e-06, "loss": 0.93760347, "memory(GiB)": 742.41, "step": 37780, "train_speed(iter/s)": 0.143922 }, { "acc": 0.76946898, "epoch": 0.9585251118168313, "grad_norm": 2.953125, "learning_rate": 5.75319972339639e-06, "loss": 0.88860722, "memory(GiB)": 742.41, "step": 37785, "train_speed(iter/s)": 0.143905 }, { "acc": 0.76716933, "epoch": 0.9586519511858689, "grad_norm": 3.765625, "learning_rate": 5.752163021641644e-06, "loss": 0.90420465, "memory(GiB)": 742.41, "step": 37790, "train_speed(iter/s)": 0.143888 }, { "acc": 0.76804099, "epoch": 0.9587787905549063, "grad_norm": 3.171875, "learning_rate": 5.75112628680164e-06, "loss": 0.85419331, "memory(GiB)": 742.41, "step": 37795, "train_speed(iter/s)": 0.143869 }, { "acc": 0.77539477, "epoch": 0.9589056299239439, "grad_norm": 3.546875, "learning_rate": 5.7500895189219866e-06, "loss": 0.86027651, "memory(GiB)": 742.41, "step": 37800, "train_speed(iter/s)": 0.143854 }, { "acc": 0.76089396, "epoch": 0.9590324692929815, "grad_norm": 3.203125, "learning_rate": 5.749052718048285e-06, "loss": 0.91171055, "memory(GiB)": 742.41, "step": 37805, "train_speed(iter/s)": 0.143835 }, { "acc": 0.77002473, "epoch": 0.959159308662019, "grad_norm": 3.84375, "learning_rate": 5.748015884226138e-06, "loss": 0.88819132, "memory(GiB)": 742.41, "step": 37810, "train_speed(iter/s)": 0.143819 }, { "acc": 0.78451791, "epoch": 0.9592861480310566, "grad_norm": 3.59375, "learning_rate": 5.746979017501159e-06, "loss": 0.8095849, "memory(GiB)": 742.41, "step": 37815, "train_speed(iter/s)": 0.143801 }, { "acc": 0.76812325, "epoch": 0.9594129874000942, "grad_norm": 3.5, "learning_rate": 5.745942117918951e-06, "loss": 0.89698477, "memory(GiB)": 742.41, "step": 37820, "train_speed(iter/s)": 0.143782 }, { "acc": 0.76509409, "epoch": 0.9595398267691317, "grad_norm": 3.6875, "learning_rate": 5.744905185525127e-06, "loss": 0.92495298, "memory(GiB)": 742.41, "step": 37825, "train_speed(iter/s)": 0.143765 }, { "acc": 0.74560933, "epoch": 0.9596666661381693, "grad_norm": 3.40625, "learning_rate": 5.743868220365297e-06, "loss": 0.91312628, "memory(GiB)": 742.41, "step": 37830, "train_speed(iter/s)": 0.143747 }, { "acc": 0.75676236, "epoch": 0.9597935055072069, "grad_norm": 3.28125, "learning_rate": 5.742831222485075e-06, "loss": 0.95717611, "memory(GiB)": 742.41, "step": 37835, "train_speed(iter/s)": 0.143731 }, { "acc": 0.76695895, "epoch": 0.9599203448762444, "grad_norm": 3.515625, "learning_rate": 5.741794191930073e-06, "loss": 0.90291033, "memory(GiB)": 742.41, "step": 37840, "train_speed(iter/s)": 0.143717 }, { "acc": 0.76229358, "epoch": 0.960047184245282, "grad_norm": 3.640625, "learning_rate": 5.740757128745909e-06, "loss": 0.85765848, "memory(GiB)": 742.41, "step": 37845, "train_speed(iter/s)": 0.143701 }, { "acc": 0.76891174, "epoch": 0.9601740236143196, "grad_norm": 3.609375, "learning_rate": 5.739720032978199e-06, "loss": 0.86262169, "memory(GiB)": 742.41, "step": 37850, "train_speed(iter/s)": 0.143678 }, { "acc": 0.76650028, "epoch": 0.960300862983357, "grad_norm": 3.375, "learning_rate": 5.73868290467256e-06, "loss": 0.9005599, "memory(GiB)": 742.41, "step": 37855, "train_speed(iter/s)": 0.14366 }, { "acc": 0.76784172, "epoch": 0.9604277023523946, "grad_norm": 3.53125, "learning_rate": 5.737645743874616e-06, "loss": 0.90510435, "memory(GiB)": 742.41, "step": 37860, "train_speed(iter/s)": 0.143645 }, { "acc": 0.79061885, "epoch": 0.9605545417214322, "grad_norm": 3.015625, "learning_rate": 5.736608550629984e-06, "loss": 0.81720028, "memory(GiB)": 742.41, "step": 37865, "train_speed(iter/s)": 0.143628 }, { "acc": 0.77399001, "epoch": 0.9606813810904697, "grad_norm": 3.203125, "learning_rate": 5.735571324984291e-06, "loss": 0.89068213, "memory(GiB)": 742.41, "step": 37870, "train_speed(iter/s)": 0.14361 }, { "acc": 0.78062172, "epoch": 0.9608082204595073, "grad_norm": 4.84375, "learning_rate": 5.7345340669831575e-06, "loss": 0.83341675, "memory(GiB)": 742.41, "step": 37875, "train_speed(iter/s)": 0.14359 }, { "acc": 0.76912212, "epoch": 0.9609350598285449, "grad_norm": 4.28125, "learning_rate": 5.733496776672211e-06, "loss": 0.88620663, "memory(GiB)": 742.41, "step": 37880, "train_speed(iter/s)": 0.143575 }, { "acc": 0.75682974, "epoch": 0.9610618991975824, "grad_norm": 3.4375, "learning_rate": 5.732459454097079e-06, "loss": 0.92205944, "memory(GiB)": 742.41, "step": 37885, "train_speed(iter/s)": 0.143557 }, { "acc": 0.7639111, "epoch": 0.96118873856662, "grad_norm": 3.796875, "learning_rate": 5.731422099303387e-06, "loss": 0.85569811, "memory(GiB)": 742.41, "step": 37890, "train_speed(iter/s)": 0.143539 }, { "acc": 0.77330823, "epoch": 0.9613155779356576, "grad_norm": 3.265625, "learning_rate": 5.730384712336771e-06, "loss": 0.83805294, "memory(GiB)": 742.41, "step": 37895, "train_speed(iter/s)": 0.143523 }, { "acc": 0.76032453, "epoch": 0.9614424173046952, "grad_norm": 3.546875, "learning_rate": 5.729347293242855e-06, "loss": 0.89867363, "memory(GiB)": 742.41, "step": 37900, "train_speed(iter/s)": 0.143501 }, { "acc": 0.74992895, "epoch": 0.9615692566737327, "grad_norm": 3.296875, "learning_rate": 5.728309842067278e-06, "loss": 0.92808781, "memory(GiB)": 742.41, "step": 37905, "train_speed(iter/s)": 0.143485 }, { "acc": 0.77646103, "epoch": 0.9616960960427703, "grad_norm": 3.8125, "learning_rate": 5.72727235885567e-06, "loss": 0.85666256, "memory(GiB)": 742.41, "step": 37910, "train_speed(iter/s)": 0.143464 }, { "acc": 0.774685, "epoch": 0.9618229354118077, "grad_norm": 3.59375, "learning_rate": 5.7262348436536695e-06, "loss": 0.87599401, "memory(GiB)": 742.41, "step": 37915, "train_speed(iter/s)": 0.143445 }, { "acc": 0.76339164, "epoch": 0.9619497747808453, "grad_norm": 2.984375, "learning_rate": 5.725197296506912e-06, "loss": 0.86041069, "memory(GiB)": 742.41, "step": 37920, "train_speed(iter/s)": 0.143429 }, { "acc": 0.76226716, "epoch": 0.9620766141498829, "grad_norm": 3.796875, "learning_rate": 5.7241597174610354e-06, "loss": 0.88735132, "memory(GiB)": 742.41, "step": 37925, "train_speed(iter/s)": 0.143413 }, { "acc": 0.77310328, "epoch": 0.9622034535189204, "grad_norm": 3.5625, "learning_rate": 5.72312210656168e-06, "loss": 0.86095877, "memory(GiB)": 742.41, "step": 37930, "train_speed(iter/s)": 0.143398 }, { "acc": 0.75481076, "epoch": 0.962330292887958, "grad_norm": 3.125, "learning_rate": 5.722084463854488e-06, "loss": 0.92725677, "memory(GiB)": 742.41, "step": 37935, "train_speed(iter/s)": 0.14338 }, { "acc": 0.77112002, "epoch": 0.9624571322569956, "grad_norm": 3.171875, "learning_rate": 5.7210467893851e-06, "loss": 0.87879524, "memory(GiB)": 742.41, "step": 37940, "train_speed(iter/s)": 0.143361 }, { "acc": 0.77147036, "epoch": 0.9625839716260332, "grad_norm": 3.234375, "learning_rate": 5.720009083199163e-06, "loss": 0.8772275, "memory(GiB)": 742.41, "step": 37945, "train_speed(iter/s)": 0.143349 }, { "acc": 0.77439494, "epoch": 0.9627108109950707, "grad_norm": 2.984375, "learning_rate": 5.718971345342319e-06, "loss": 0.79521236, "memory(GiB)": 742.41, "step": 37950, "train_speed(iter/s)": 0.143327 }, { "acc": 0.76581511, "epoch": 0.9628376503641083, "grad_norm": 3.234375, "learning_rate": 5.7179335758602164e-06, "loss": 0.90554476, "memory(GiB)": 742.41, "step": 37955, "train_speed(iter/s)": 0.14331 }, { "acc": 0.76769943, "epoch": 0.9629644897331459, "grad_norm": 3.203125, "learning_rate": 5.7168957747985034e-06, "loss": 0.90378761, "memory(GiB)": 742.41, "step": 37960, "train_speed(iter/s)": 0.143293 }, { "acc": 0.76792731, "epoch": 0.9630913291021834, "grad_norm": 3.4375, "learning_rate": 5.7158579422028295e-06, "loss": 0.90900183, "memory(GiB)": 742.41, "step": 37965, "train_speed(iter/s)": 0.143274 }, { "acc": 0.77717199, "epoch": 0.963218168471221, "grad_norm": 4.09375, "learning_rate": 5.714820078118845e-06, "loss": 0.88852587, "memory(GiB)": 742.41, "step": 37970, "train_speed(iter/s)": 0.143259 }, { "acc": 0.75798926, "epoch": 0.9633450078402584, "grad_norm": 4.25, "learning_rate": 5.713782182592203e-06, "loss": 0.92488461, "memory(GiB)": 742.41, "step": 37975, "train_speed(iter/s)": 0.143242 }, { "acc": 0.76567278, "epoch": 0.963471847209296, "grad_norm": 3.203125, "learning_rate": 5.712744255668558e-06, "loss": 0.95667191, "memory(GiB)": 742.41, "step": 37980, "train_speed(iter/s)": 0.143224 }, { "acc": 0.75336776, "epoch": 0.9635986865783336, "grad_norm": 3.453125, "learning_rate": 5.711706297393563e-06, "loss": 0.94406948, "memory(GiB)": 742.41, "step": 37985, "train_speed(iter/s)": 0.143207 }, { "acc": 0.76618724, "epoch": 0.9637255259473712, "grad_norm": 3.3125, "learning_rate": 5.710668307812877e-06, "loss": 0.92631149, "memory(GiB)": 742.41, "step": 37990, "train_speed(iter/s)": 0.143191 }, { "acc": 0.77406287, "epoch": 0.9638523653164087, "grad_norm": 3.6875, "learning_rate": 5.709630286972155e-06, "loss": 0.89643583, "memory(GiB)": 742.41, "step": 37995, "train_speed(iter/s)": 0.143171 }, { "acc": 0.77071409, "epoch": 0.9639792046854463, "grad_norm": 3.6875, "learning_rate": 5.70859223491706e-06, "loss": 0.83539934, "memory(GiB)": 742.41, "step": 38000, "train_speed(iter/s)": 0.143157 }, { "epoch": 0.9639792046854463, "eval_acc": 0.7575428932140803, "eval_loss": 0.857914388179779, "eval_runtime": 1153.9404, "eval_samples_per_second": 5.52, "eval_steps_per_second": 5.52, "step": 38000 }, { "acc": 0.77525854, "epoch": 0.9641060440544839, "grad_norm": 4.59375, "learning_rate": 5.707554151693248e-06, "loss": 0.87661114, "memory(GiB)": 742.41, "step": 38005, "train_speed(iter/s)": 0.142136 }, { "acc": 0.76994205, "epoch": 0.9642328834235214, "grad_norm": 3.796875, "learning_rate": 5.706516037346383e-06, "loss": 0.85321407, "memory(GiB)": 742.41, "step": 38010, "train_speed(iter/s)": 0.142122 }, { "acc": 0.78963413, "epoch": 0.964359722792559, "grad_norm": 3.28125, "learning_rate": 5.70547789192213e-06, "loss": 0.80949745, "memory(GiB)": 742.41, "step": 38015, "train_speed(iter/s)": 0.142103 }, { "acc": 0.76583633, "epoch": 0.9644865621615966, "grad_norm": 3.0, "learning_rate": 5.704439715466152e-06, "loss": 0.8741086, "memory(GiB)": 742.41, "step": 38020, "train_speed(iter/s)": 0.142087 }, { "acc": 0.78239412, "epoch": 0.9646134015306341, "grad_norm": 3.171875, "learning_rate": 5.7034015080241145e-06, "loss": 0.86460266, "memory(GiB)": 742.41, "step": 38025, "train_speed(iter/s)": 0.14207 }, { "acc": 0.76974907, "epoch": 0.9647402408996717, "grad_norm": 3.96875, "learning_rate": 5.702363269641688e-06, "loss": 0.86461906, "memory(GiB)": 742.41, "step": 38030, "train_speed(iter/s)": 0.142054 }, { "acc": 0.77853169, "epoch": 0.9648670802687092, "grad_norm": 3.109375, "learning_rate": 5.701325000364537e-06, "loss": 0.86580849, "memory(GiB)": 742.41, "step": 38035, "train_speed(iter/s)": 0.142037 }, { "acc": 0.76853633, "epoch": 0.9649939196377467, "grad_norm": 3.6875, "learning_rate": 5.700286700238335e-06, "loss": 0.8685154, "memory(GiB)": 742.41, "step": 38040, "train_speed(iter/s)": 0.142024 }, { "acc": 0.76855769, "epoch": 0.9651207590067843, "grad_norm": 3.21875, "learning_rate": 5.699248369308752e-06, "loss": 0.84929695, "memory(GiB)": 742.41, "step": 38045, "train_speed(iter/s)": 0.142007 }, { "acc": 0.77766099, "epoch": 0.9652475983758219, "grad_norm": 3.25, "learning_rate": 5.698210007621461e-06, "loss": 0.86246986, "memory(GiB)": 742.41, "step": 38050, "train_speed(iter/s)": 0.141989 }, { "acc": 0.77870684, "epoch": 0.9653744377448594, "grad_norm": 3.90625, "learning_rate": 5.697171615222136e-06, "loss": 0.87089205, "memory(GiB)": 742.41, "step": 38055, "train_speed(iter/s)": 0.141972 }, { "acc": 0.76431046, "epoch": 0.965501277113897, "grad_norm": 3.5, "learning_rate": 5.6961331921564534e-06, "loss": 0.9108984, "memory(GiB)": 742.41, "step": 38060, "train_speed(iter/s)": 0.141957 }, { "acc": 0.76577859, "epoch": 0.9656281164829346, "grad_norm": 4.0625, "learning_rate": 5.69509473847009e-06, "loss": 0.89088211, "memory(GiB)": 742.41, "step": 38065, "train_speed(iter/s)": 0.141938 }, { "acc": 0.76109109, "epoch": 0.9657549558519721, "grad_norm": 3.109375, "learning_rate": 5.694056254208723e-06, "loss": 0.9082305, "memory(GiB)": 742.41, "step": 38070, "train_speed(iter/s)": 0.141923 }, { "acc": 0.78540182, "epoch": 0.9658817952210097, "grad_norm": 3.984375, "learning_rate": 5.693017739418033e-06, "loss": 0.79028416, "memory(GiB)": 742.41, "step": 38075, "train_speed(iter/s)": 0.141906 }, { "acc": 0.77162042, "epoch": 0.9660086345900473, "grad_norm": 2.859375, "learning_rate": 5.6919791941437e-06, "loss": 0.85633974, "memory(GiB)": 742.41, "step": 38080, "train_speed(iter/s)": 0.14189 }, { "acc": 0.77953553, "epoch": 0.9661354739590848, "grad_norm": 3.03125, "learning_rate": 5.69094061843141e-06, "loss": 0.82518997, "memory(GiB)": 742.41, "step": 38085, "train_speed(iter/s)": 0.141874 }, { "acc": 0.76431713, "epoch": 0.9662623133281224, "grad_norm": 3.578125, "learning_rate": 5.68990201232684e-06, "loss": 0.94134912, "memory(GiB)": 742.41, "step": 38090, "train_speed(iter/s)": 0.141852 }, { "acc": 0.75960264, "epoch": 0.9663891526971599, "grad_norm": 4.96875, "learning_rate": 5.688863375875682e-06, "loss": 0.90387926, "memory(GiB)": 742.41, "step": 38095, "train_speed(iter/s)": 0.141839 }, { "acc": 0.7809073, "epoch": 0.9665159920661974, "grad_norm": 3.859375, "learning_rate": 5.687824709123616e-06, "loss": 0.83629446, "memory(GiB)": 742.41, "step": 38100, "train_speed(iter/s)": 0.141823 }, { "acc": 0.76220894, "epoch": 0.966642831435235, "grad_norm": 3.859375, "learning_rate": 5.686786012116333e-06, "loss": 0.92806406, "memory(GiB)": 742.41, "step": 38105, "train_speed(iter/s)": 0.141805 }, { "acc": 0.76825948, "epoch": 0.9667696708042726, "grad_norm": 3.765625, "learning_rate": 5.685747284899523e-06, "loss": 0.90663214, "memory(GiB)": 742.41, "step": 38110, "train_speed(iter/s)": 0.141785 }, { "acc": 0.76783161, "epoch": 0.9668965101733101, "grad_norm": 3.71875, "learning_rate": 5.684708527518874e-06, "loss": 0.87586451, "memory(GiB)": 742.41, "step": 38115, "train_speed(iter/s)": 0.141763 }, { "acc": 0.75993085, "epoch": 0.9670233495423477, "grad_norm": 3.671875, "learning_rate": 5.683669740020079e-06, "loss": 0.95267038, "memory(GiB)": 742.41, "step": 38120, "train_speed(iter/s)": 0.141749 }, { "acc": 0.76073847, "epoch": 0.9671501889113853, "grad_norm": 3.359375, "learning_rate": 5.682630922448831e-06, "loss": 0.92682018, "memory(GiB)": 742.41, "step": 38125, "train_speed(iter/s)": 0.141732 }, { "acc": 0.76190348, "epoch": 0.9672770282804228, "grad_norm": 3.359375, "learning_rate": 5.6815920748508215e-06, "loss": 0.92958174, "memory(GiB)": 742.41, "step": 38130, "train_speed(iter/s)": 0.141714 }, { "acc": 0.76530447, "epoch": 0.9674038676494604, "grad_norm": 3.671875, "learning_rate": 5.680553197271751e-06, "loss": 0.92104654, "memory(GiB)": 742.41, "step": 38135, "train_speed(iter/s)": 0.141695 }, { "acc": 0.75204782, "epoch": 0.967530707018498, "grad_norm": 3.890625, "learning_rate": 5.679514289757311e-06, "loss": 0.9384964, "memory(GiB)": 742.41, "step": 38140, "train_speed(iter/s)": 0.141677 }, { "acc": 0.77748876, "epoch": 0.9676575463875355, "grad_norm": 2.953125, "learning_rate": 5.6784753523532046e-06, "loss": 0.83838844, "memory(GiB)": 742.41, "step": 38145, "train_speed(iter/s)": 0.14166 }, { "acc": 0.76712985, "epoch": 0.9677843857565731, "grad_norm": 3.46875, "learning_rate": 5.677436385105128e-06, "loss": 0.87455692, "memory(GiB)": 742.41, "step": 38150, "train_speed(iter/s)": 0.141645 }, { "acc": 0.76199245, "epoch": 0.9679112251256106, "grad_norm": 3.765625, "learning_rate": 5.676397388058784e-06, "loss": 0.92092838, "memory(GiB)": 742.41, "step": 38155, "train_speed(iter/s)": 0.14163 }, { "acc": 0.75792451, "epoch": 0.9680380644946481, "grad_norm": 3.046875, "learning_rate": 5.675358361259873e-06, "loss": 0.87069817, "memory(GiB)": 742.41, "step": 38160, "train_speed(iter/s)": 0.141616 }, { "acc": 0.75253515, "epoch": 0.9681649038636857, "grad_norm": 3.84375, "learning_rate": 5.674319304754101e-06, "loss": 0.94476385, "memory(GiB)": 742.41, "step": 38165, "train_speed(iter/s)": 0.1416 }, { "acc": 0.76523948, "epoch": 0.9682917432327233, "grad_norm": 3.625, "learning_rate": 5.673280218587171e-06, "loss": 0.86418524, "memory(GiB)": 742.41, "step": 38170, "train_speed(iter/s)": 0.141582 }, { "acc": 0.7820107, "epoch": 0.9684185826017608, "grad_norm": 3.59375, "learning_rate": 5.672241102804789e-06, "loss": 0.83418732, "memory(GiB)": 742.41, "step": 38175, "train_speed(iter/s)": 0.141562 }, { "acc": 0.76707239, "epoch": 0.9685454219707984, "grad_norm": 5.03125, "learning_rate": 5.671201957452663e-06, "loss": 0.88011112, "memory(GiB)": 742.41, "step": 38180, "train_speed(iter/s)": 0.141545 }, { "acc": 0.76633029, "epoch": 0.968672261339836, "grad_norm": 4.0625, "learning_rate": 5.6701627825765e-06, "loss": 0.88986998, "memory(GiB)": 742.41, "step": 38185, "train_speed(iter/s)": 0.141532 }, { "acc": 0.77928538, "epoch": 0.9687991007088735, "grad_norm": 3.546875, "learning_rate": 5.6691235782220135e-06, "loss": 0.86289225, "memory(GiB)": 742.41, "step": 38190, "train_speed(iter/s)": 0.141517 }, { "acc": 0.75496206, "epoch": 0.9689259400779111, "grad_norm": 3.5625, "learning_rate": 5.668084344434912e-06, "loss": 0.89874973, "memory(GiB)": 742.41, "step": 38195, "train_speed(iter/s)": 0.141497 }, { "acc": 0.75220785, "epoch": 0.9690527794469487, "grad_norm": 3.90625, "learning_rate": 5.667045081260909e-06, "loss": 0.92006311, "memory(GiB)": 742.41, "step": 38200, "train_speed(iter/s)": 0.141477 }, { "acc": 0.76538553, "epoch": 0.9691796188159862, "grad_norm": 4.15625, "learning_rate": 5.6660057887457175e-06, "loss": 0.88328428, "memory(GiB)": 742.41, "step": 38205, "train_speed(iter/s)": 0.141461 }, { "acc": 0.7828639, "epoch": 0.9693064581850238, "grad_norm": 4.125, "learning_rate": 5.664966466935055e-06, "loss": 0.85315027, "memory(GiB)": 742.41, "step": 38210, "train_speed(iter/s)": 0.141443 }, { "acc": 0.76650062, "epoch": 0.9694332975540613, "grad_norm": 3.640625, "learning_rate": 5.663927115874635e-06, "loss": 0.92455416, "memory(GiB)": 742.41, "step": 38215, "train_speed(iter/s)": 0.141432 }, { "acc": 0.7630621, "epoch": 0.9695601369230988, "grad_norm": 3.828125, "learning_rate": 5.662887735610176e-06, "loss": 0.91994972, "memory(GiB)": 742.41, "step": 38220, "train_speed(iter/s)": 0.141414 }, { "acc": 0.76124091, "epoch": 0.9696869762921364, "grad_norm": 3.65625, "learning_rate": 5.661848326187399e-06, "loss": 0.903162, "memory(GiB)": 742.41, "step": 38225, "train_speed(iter/s)": 0.141396 }, { "acc": 0.7737031, "epoch": 0.969813815661174, "grad_norm": 3.71875, "learning_rate": 5.660808887652022e-06, "loss": 0.83639545, "memory(GiB)": 742.41, "step": 38230, "train_speed(iter/s)": 0.141379 }, { "acc": 0.77030621, "epoch": 0.9699406550302115, "grad_norm": 3.734375, "learning_rate": 5.6597694200497675e-06, "loss": 0.88180008, "memory(GiB)": 742.41, "step": 38235, "train_speed(iter/s)": 0.141364 }, { "acc": 0.76474032, "epoch": 0.9700674943992491, "grad_norm": 3.828125, "learning_rate": 5.658729923426358e-06, "loss": 0.92042608, "memory(GiB)": 742.41, "step": 38240, "train_speed(iter/s)": 0.141347 }, { "acc": 0.77720671, "epoch": 0.9701943337682867, "grad_norm": 3.125, "learning_rate": 5.657690397827519e-06, "loss": 0.81943102, "memory(GiB)": 742.41, "step": 38245, "train_speed(iter/s)": 0.141331 }, { "acc": 0.77375431, "epoch": 0.9703211731373242, "grad_norm": 3.859375, "learning_rate": 5.656650843298974e-06, "loss": 0.87936039, "memory(GiB)": 742.41, "step": 38250, "train_speed(iter/s)": 0.141312 }, { "acc": 0.76622062, "epoch": 0.9704480125063618, "grad_norm": 3.59375, "learning_rate": 5.655611259886451e-06, "loss": 0.85513697, "memory(GiB)": 742.41, "step": 38255, "train_speed(iter/s)": 0.141293 }, { "acc": 0.7693697, "epoch": 0.9705748518753994, "grad_norm": 3.5, "learning_rate": 5.654571647635678e-06, "loss": 0.89717588, "memory(GiB)": 742.41, "step": 38260, "train_speed(iter/s)": 0.141278 }, { "acc": 0.75781622, "epoch": 0.970701691244437, "grad_norm": 3.234375, "learning_rate": 5.653532006592384e-06, "loss": 0.91092606, "memory(GiB)": 742.41, "step": 38265, "train_speed(iter/s)": 0.141264 }, { "acc": 0.74835401, "epoch": 0.9708285306134745, "grad_norm": 3.609375, "learning_rate": 5.652492336802298e-06, "loss": 0.94637308, "memory(GiB)": 742.41, "step": 38270, "train_speed(iter/s)": 0.141246 }, { "acc": 0.77355523, "epoch": 0.970955369982512, "grad_norm": 4.71875, "learning_rate": 5.6514526383111545e-06, "loss": 0.90817766, "memory(GiB)": 742.41, "step": 38275, "train_speed(iter/s)": 0.141233 }, { "acc": 0.7797121, "epoch": 0.9710822093515495, "grad_norm": 3.484375, "learning_rate": 5.6504129111646845e-06, "loss": 0.85684385, "memory(GiB)": 742.41, "step": 38280, "train_speed(iter/s)": 0.141218 }, { "acc": 0.77805448, "epoch": 0.9712090487205871, "grad_norm": 3.203125, "learning_rate": 5.649373155408623e-06, "loss": 0.87994823, "memory(GiB)": 742.41, "step": 38285, "train_speed(iter/s)": 0.141201 }, { "acc": 0.75738335, "epoch": 0.9713358880896247, "grad_norm": 3.765625, "learning_rate": 5.648333371088705e-06, "loss": 0.88452101, "memory(GiB)": 742.41, "step": 38290, "train_speed(iter/s)": 0.141186 }, { "acc": 0.77397714, "epoch": 0.9714627274586622, "grad_norm": 3.5, "learning_rate": 5.647293558250669e-06, "loss": 0.86272306, "memory(GiB)": 742.41, "step": 38295, "train_speed(iter/s)": 0.14117 }, { "acc": 0.77175221, "epoch": 0.9715895668276998, "grad_norm": 4.0, "learning_rate": 5.646253716940251e-06, "loss": 0.90562372, "memory(GiB)": 742.41, "step": 38300, "train_speed(iter/s)": 0.141151 }, { "acc": 0.78798609, "epoch": 0.9717164061967374, "grad_norm": 3.234375, "learning_rate": 5.645213847203191e-06, "loss": 0.79710836, "memory(GiB)": 742.41, "step": 38305, "train_speed(iter/s)": 0.141134 }, { "acc": 0.7702095, "epoch": 0.971843245565775, "grad_norm": 3.859375, "learning_rate": 5.644173949085229e-06, "loss": 0.91137142, "memory(GiB)": 742.41, "step": 38310, "train_speed(iter/s)": 0.141119 }, { "acc": 0.7640964, "epoch": 0.9719700849348125, "grad_norm": 3.90625, "learning_rate": 5.643134022632109e-06, "loss": 0.89652376, "memory(GiB)": 742.41, "step": 38315, "train_speed(iter/s)": 0.141104 }, { "acc": 0.77043447, "epoch": 0.9720969243038501, "grad_norm": 3.828125, "learning_rate": 5.6420940678895704e-06, "loss": 0.87778311, "memory(GiB)": 742.41, "step": 38320, "train_speed(iter/s)": 0.141089 }, { "acc": 0.77941165, "epoch": 0.9722237636728877, "grad_norm": 3.421875, "learning_rate": 5.6410540849033614e-06, "loss": 0.8845685, "memory(GiB)": 742.41, "step": 38325, "train_speed(iter/s)": 0.14107 }, { "acc": 0.75884042, "epoch": 0.9723506030419252, "grad_norm": 3.6875, "learning_rate": 5.640014073719224e-06, "loss": 0.96740808, "memory(GiB)": 742.41, "step": 38330, "train_speed(iter/s)": 0.141056 }, { "acc": 0.75794892, "epoch": 0.9724774424109627, "grad_norm": 3.09375, "learning_rate": 5.638974034382909e-06, "loss": 0.91052866, "memory(GiB)": 742.41, "step": 38335, "train_speed(iter/s)": 0.14104 }, { "acc": 0.76824646, "epoch": 0.9726042817800002, "grad_norm": 3.796875, "learning_rate": 5.637933966940161e-06, "loss": 0.90645447, "memory(GiB)": 742.41, "step": 38340, "train_speed(iter/s)": 0.141025 }, { "acc": 0.77328138, "epoch": 0.9727311211490378, "grad_norm": 2.78125, "learning_rate": 5.636893871436729e-06, "loss": 0.8782814, "memory(GiB)": 742.41, "step": 38345, "train_speed(iter/s)": 0.141004 }, { "acc": 0.75887327, "epoch": 0.9728579605180754, "grad_norm": 3.78125, "learning_rate": 5.635853747918367e-06, "loss": 0.94626951, "memory(GiB)": 742.41, "step": 38350, "train_speed(iter/s)": 0.140989 }, { "acc": 0.77147255, "epoch": 0.972984799887113, "grad_norm": 3.640625, "learning_rate": 5.634813596430823e-06, "loss": 0.83670092, "memory(GiB)": 742.41, "step": 38355, "train_speed(iter/s)": 0.140973 }, { "acc": 0.75803871, "epoch": 0.9731116392561505, "grad_norm": 4.125, "learning_rate": 5.633773417019853e-06, "loss": 0.90723848, "memory(GiB)": 742.41, "step": 38360, "train_speed(iter/s)": 0.140956 }, { "acc": 0.77560601, "epoch": 0.9732384786251881, "grad_norm": 3.25, "learning_rate": 5.632733209731208e-06, "loss": 0.84104757, "memory(GiB)": 742.41, "step": 38365, "train_speed(iter/s)": 0.140938 }, { "acc": 0.77148228, "epoch": 0.9733653179942257, "grad_norm": 3.546875, "learning_rate": 5.631692974610647e-06, "loss": 0.89414253, "memory(GiB)": 742.41, "step": 38370, "train_speed(iter/s)": 0.140925 }, { "acc": 0.76786833, "epoch": 0.9734921573632632, "grad_norm": 3.078125, "learning_rate": 5.630652711703924e-06, "loss": 0.8885602, "memory(GiB)": 742.41, "step": 38375, "train_speed(iter/s)": 0.140908 }, { "acc": 0.76782804, "epoch": 0.9736189967323008, "grad_norm": 3.453125, "learning_rate": 5.629612421056797e-06, "loss": 0.85328703, "memory(GiB)": 742.41, "step": 38380, "train_speed(iter/s)": 0.14089 }, { "acc": 0.76225429, "epoch": 0.9737458361013384, "grad_norm": 4.71875, "learning_rate": 5.628572102715027e-06, "loss": 0.8947979, "memory(GiB)": 742.41, "step": 38385, "train_speed(iter/s)": 0.14087 }, { "acc": 0.75451398, "epoch": 0.9738726754703759, "grad_norm": 3.46875, "learning_rate": 5.627531756724371e-06, "loss": 0.89278412, "memory(GiB)": 742.41, "step": 38390, "train_speed(iter/s)": 0.140854 }, { "acc": 0.76736069, "epoch": 0.9739995148394134, "grad_norm": 3.28125, "learning_rate": 5.626491383130593e-06, "loss": 0.92817259, "memory(GiB)": 742.41, "step": 38395, "train_speed(iter/s)": 0.140838 }, { "acc": 0.76128416, "epoch": 0.974126354208451, "grad_norm": 3.6875, "learning_rate": 5.625450981979455e-06, "loss": 0.96816425, "memory(GiB)": 742.41, "step": 38400, "train_speed(iter/s)": 0.140823 }, { "acc": 0.77059484, "epoch": 0.9742531935774885, "grad_norm": 3.921875, "learning_rate": 5.624410553316723e-06, "loss": 0.90737762, "memory(GiB)": 742.41, "step": 38405, "train_speed(iter/s)": 0.140805 }, { "acc": 0.76705313, "epoch": 0.9743800329465261, "grad_norm": 3.671875, "learning_rate": 5.623370097188158e-06, "loss": 0.89585276, "memory(GiB)": 742.41, "step": 38410, "train_speed(iter/s)": 0.140787 }, { "acc": 0.77047272, "epoch": 0.9745068723155637, "grad_norm": 3.6875, "learning_rate": 5.62232961363953e-06, "loss": 0.88166342, "memory(GiB)": 742.41, "step": 38415, "train_speed(iter/s)": 0.140769 }, { "acc": 0.76624103, "epoch": 0.9746337116846012, "grad_norm": 3.53125, "learning_rate": 5.621289102716605e-06, "loss": 0.92650185, "memory(GiB)": 742.41, "step": 38420, "train_speed(iter/s)": 0.140754 }, { "acc": 0.76640196, "epoch": 0.9747605510536388, "grad_norm": 3.40625, "learning_rate": 5.620248564465152e-06, "loss": 0.87622395, "memory(GiB)": 742.41, "step": 38425, "train_speed(iter/s)": 0.140738 }, { "acc": 0.75524611, "epoch": 0.9748873904226764, "grad_norm": 3.109375, "learning_rate": 5.61920799893094e-06, "loss": 0.90746117, "memory(GiB)": 742.41, "step": 38430, "train_speed(iter/s)": 0.140723 }, { "acc": 0.76575294, "epoch": 0.9750142297917139, "grad_norm": 4.15625, "learning_rate": 5.6181674061597415e-06, "loss": 0.90551376, "memory(GiB)": 742.41, "step": 38435, "train_speed(iter/s)": 0.140711 }, { "acc": 0.76909542, "epoch": 0.9751410691607515, "grad_norm": 3.03125, "learning_rate": 5.6171267861973285e-06, "loss": 0.90406761, "memory(GiB)": 742.41, "step": 38440, "train_speed(iter/s)": 0.140695 }, { "acc": 0.76285744, "epoch": 0.9752679085297891, "grad_norm": 3.703125, "learning_rate": 5.616086139089475e-06, "loss": 0.88998785, "memory(GiB)": 742.41, "step": 38445, "train_speed(iter/s)": 0.140677 }, { "acc": 0.77200389, "epoch": 0.9753947478988266, "grad_norm": 3.453125, "learning_rate": 5.6150454648819555e-06, "loss": 0.8676259, "memory(GiB)": 742.41, "step": 38450, "train_speed(iter/s)": 0.140661 }, { "acc": 0.76435237, "epoch": 0.9755215872678641, "grad_norm": 3.09375, "learning_rate": 5.614004763620543e-06, "loss": 0.88997879, "memory(GiB)": 742.41, "step": 38455, "train_speed(iter/s)": 0.140645 }, { "acc": 0.78193431, "epoch": 0.9756484266369017, "grad_norm": 2.8125, "learning_rate": 5.612964035351021e-06, "loss": 0.85729198, "memory(GiB)": 742.41, "step": 38460, "train_speed(iter/s)": 0.140631 }, { "acc": 0.77834153, "epoch": 0.9757752660059392, "grad_norm": 3.328125, "learning_rate": 5.611923280119162e-06, "loss": 0.86526632, "memory(GiB)": 742.41, "step": 38465, "train_speed(iter/s)": 0.140612 }, { "acc": 0.77898793, "epoch": 0.9759021053749768, "grad_norm": 3.578125, "learning_rate": 5.610882497970747e-06, "loss": 0.87191505, "memory(GiB)": 742.41, "step": 38470, "train_speed(iter/s)": 0.140595 }, { "acc": 0.77608151, "epoch": 0.9760289447440144, "grad_norm": 3.359375, "learning_rate": 5.609841688951558e-06, "loss": 0.89566317, "memory(GiB)": 742.41, "step": 38475, "train_speed(iter/s)": 0.140583 }, { "acc": 0.77877765, "epoch": 0.9761557841130519, "grad_norm": 2.96875, "learning_rate": 5.608800853107377e-06, "loss": 0.84429379, "memory(GiB)": 742.41, "step": 38480, "train_speed(iter/s)": 0.140563 }, { "acc": 0.76898122, "epoch": 0.9762826234820895, "grad_norm": 3.515625, "learning_rate": 5.607759990483985e-06, "loss": 0.85627184, "memory(GiB)": 742.41, "step": 38485, "train_speed(iter/s)": 0.140548 }, { "acc": 0.77311544, "epoch": 0.9764094628511271, "grad_norm": 3.40625, "learning_rate": 5.606719101127168e-06, "loss": 0.87698345, "memory(GiB)": 742.41, "step": 38490, "train_speed(iter/s)": 0.140534 }, { "acc": 0.76387053, "epoch": 0.9765363022201646, "grad_norm": 4.09375, "learning_rate": 5.605678185082711e-06, "loss": 0.8887517, "memory(GiB)": 742.41, "step": 38495, "train_speed(iter/s)": 0.140518 }, { "acc": 0.76647224, "epoch": 0.9766631415892022, "grad_norm": 3.328125, "learning_rate": 5.6046372423964e-06, "loss": 0.89647808, "memory(GiB)": 742.41, "step": 38500, "train_speed(iter/s)": 0.140502 }, { "epoch": 0.9766631415892022, "eval_acc": 0.7575988740335475, "eval_loss": 0.8576856255531311, "eval_runtime": 1151.5303, "eval_samples_per_second": 5.532, "eval_steps_per_second": 5.532, "step": 38500 } ], "logging_steps": 5, "max_steps": 78838, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4379770409316614e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }