diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,136655 @@ +{ + "best_metric": 1.57305241, + "best_model_checkpoint": "/home/ubuntu/s2/output_qwen05B_2_epochs/qwen2_5-0_5b-instruct/v0-20250319-182107/checkpoint-68000", + "epoch": 1.7250126839167934, + "eval_steps": 1000, + "global_step": 68000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "acc": 0.6415714, + "epoch": 2.536783358701167e-05, + "grad_norm": 8.8125, + "learning_rate": 2.536783358701167e-09, + "loss": 1.76253867, + "memory(GiB)": 10.73, + "step": 1, + "train_speed(iter/s)": 0.0206 + }, + { + "acc": 0.60247374, + "epoch": 0.00012683916793505834, + "grad_norm": 7.8125, + "learning_rate": 1.2683916793505834e-08, + "loss": 1.97762632, + "memory(GiB)": 13.65, + "step": 5, + "train_speed(iter/s)": 0.098412 + }, + { + "acc": 0.57780371, + "epoch": 0.0002536783358701167, + "grad_norm": 7.125, + "learning_rate": 2.536783358701167e-08, + "loss": 2.08873425, + "memory(GiB)": 13.65, + "step": 10, + "train_speed(iter/s)": 0.186774 + }, + { + "acc": 0.56816006, + "epoch": 0.000380517503805175, + "grad_norm": 7.34375, + "learning_rate": 3.80517503805175e-08, + "loss": 2.17052231, + "memory(GiB)": 17.81, + "step": 15, + "train_speed(iter/s)": 0.266638 + }, + { + "acc": 0.57780371, + "epoch": 0.0005073566717402334, + "grad_norm": 9.0625, + "learning_rate": 5.073566717402334e-08, + "loss": 2.07844791, + "memory(GiB)": 17.81, + "step": 20, + "train_speed(iter/s)": 0.338851 + }, + { + "acc": 0.57653689, + "epoch": 0.0006341958396752917, + "grad_norm": 8.125, + "learning_rate": 6.341958396752917e-08, + "loss": 2.13973999, + "memory(GiB)": 17.81, + "step": 25, + "train_speed(iter/s)": 0.405014 + }, + { + "acc": 0.58652143, + "epoch": 0.00076103500761035, + "grad_norm": 13.625, + "learning_rate": 7.6103500761035e-08, + "loss": 2.11944504, + "memory(GiB)": 17.81, + "step": 30, + "train_speed(iter/s)": 0.465447 + }, + { + "acc": 0.58463202, + "epoch": 0.0008878741755454084, + "grad_norm": 8.0625, + "learning_rate": 8.878741755454084e-08, + "loss": 2.11370773, + "memory(GiB)": 17.81, + "step": 35, + "train_speed(iter/s)": 0.52101 + }, + { + "acc": 0.59612646, + "epoch": 0.0010147133434804667, + "grad_norm": 8.4375, + "learning_rate": 1.0147133434804667e-07, + "loss": 2.06100388, + "memory(GiB)": 17.81, + "step": 40, + "train_speed(iter/s)": 0.57192 + }, + { + "acc": 0.5892786, + "epoch": 0.001141552511415525, + "grad_norm": 8.25, + "learning_rate": 1.1415525114155251e-07, + "loss": 2.03739967, + "memory(GiB)": 17.81, + "step": 45, + "train_speed(iter/s)": 0.619351 + }, + { + "acc": 0.60097218, + "epoch": 0.0012683916793505834, + "grad_norm": 8.5, + "learning_rate": 1.2683916793505834e-07, + "loss": 2.05330925, + "memory(GiB)": 17.81, + "step": 50, + "train_speed(iter/s)": 0.663375 + }, + { + "acc": 0.58988309, + "epoch": 0.0013952308472856417, + "grad_norm": 8.5625, + "learning_rate": 1.3952308472856418e-07, + "loss": 2.16794739, + "memory(GiB)": 17.81, + "step": 55, + "train_speed(iter/s)": 0.704311 + }, + { + "acc": 0.56837025, + "epoch": 0.0015220700152207, + "grad_norm": 9.0, + "learning_rate": 1.5220700152207e-07, + "loss": 2.12977352, + "memory(GiB)": 17.81, + "step": 60, + "train_speed(iter/s)": 0.742433 + }, + { + "acc": 0.58301125, + "epoch": 0.0016489091831557584, + "grad_norm": 8.0625, + "learning_rate": 1.6489091831557585e-07, + "loss": 2.06642132, + "memory(GiB)": 17.81, + "step": 65, + "train_speed(iter/s)": 0.778083 + }, + { + "acc": 0.5904233, + "epoch": 0.0017757483510908167, + "grad_norm": 7.375, + "learning_rate": 1.7757483510908168e-07, + "loss": 2.0742939, + "memory(GiB)": 17.81, + "step": 70, + "train_speed(iter/s)": 0.811541 + }, + { + "acc": 0.56817775, + "epoch": 0.001902587519025875, + "grad_norm": 8.9375, + "learning_rate": 1.9025875190258752e-07, + "loss": 2.16684647, + "memory(GiB)": 17.81, + "step": 75, + "train_speed(iter/s)": 0.84281 + }, + { + "acc": 0.59556479, + "epoch": 0.0020294266869609334, + "grad_norm": 8.75, + "learning_rate": 2.0294266869609335e-07, + "loss": 2.00218239, + "memory(GiB)": 17.81, + "step": 80, + "train_speed(iter/s)": 0.872192 + }, + { + "acc": 0.56787491, + "epoch": 0.0021562658548959918, + "grad_norm": 8.0, + "learning_rate": 2.1562658548959918e-07, + "loss": 2.19605064, + "memory(GiB)": 23.68, + "step": 85, + "train_speed(iter/s)": 0.900011 + }, + { + "acc": 0.5814374, + "epoch": 0.00228310502283105, + "grad_norm": 7.84375, + "learning_rate": 2.2831050228310502e-07, + "loss": 2.06067905, + "memory(GiB)": 23.68, + "step": 90, + "train_speed(iter/s)": 0.92634 + }, + { + "acc": 0.57568212, + "epoch": 0.0024099441907661084, + "grad_norm": 7.3125, + "learning_rate": 2.409944190766109e-07, + "loss": 2.16754417, + "memory(GiB)": 23.7, + "step": 95, + "train_speed(iter/s)": 0.951167 + }, + { + "acc": 0.58423882, + "epoch": 0.0025367833587011668, + "grad_norm": 8.125, + "learning_rate": 2.536783358701167e-07, + "loss": 2.08557796, + "memory(GiB)": 23.7, + "step": 100, + "train_speed(iter/s)": 0.974621 + }, + { + "acc": 0.57016425, + "epoch": 0.002663622526636225, + "grad_norm": 8.25, + "learning_rate": 2.6636225266362255e-07, + "loss": 2.16158848, + "memory(GiB)": 23.7, + "step": 105, + "train_speed(iter/s)": 0.996991 + }, + { + "acc": 0.59947433, + "epoch": 0.0027904616945712835, + "grad_norm": 6.875, + "learning_rate": 2.7904616945712836e-07, + "loss": 2.02864761, + "memory(GiB)": 23.7, + "step": 110, + "train_speed(iter/s)": 1.018037 + }, + { + "acc": 0.59076071, + "epoch": 0.002917300862506342, + "grad_norm": 8.0, + "learning_rate": 2.917300862506342e-07, + "loss": 2.14251175, + "memory(GiB)": 23.7, + "step": 115, + "train_speed(iter/s)": 1.038235 + }, + { + "acc": 0.59897242, + "epoch": 0.0030441400304414, + "grad_norm": 7.28125, + "learning_rate": 3.0441400304414e-07, + "loss": 2.0877779, + "memory(GiB)": 23.7, + "step": 120, + "train_speed(iter/s)": 1.057373 + }, + { + "acc": 0.59436555, + "epoch": 0.0031709791983764585, + "grad_norm": 8.3125, + "learning_rate": 3.170979198376459e-07, + "loss": 2.01410065, + "memory(GiB)": 23.7, + "step": 125, + "train_speed(iter/s)": 1.075761 + }, + { + "acc": 0.58036852, + "epoch": 0.003297818366311517, + "grad_norm": 7.90625, + "learning_rate": 3.297818366311517e-07, + "loss": 2.07639065, + "memory(GiB)": 23.7, + "step": 130, + "train_speed(iter/s)": 1.093303 + }, + { + "acc": 0.59717317, + "epoch": 0.003424657534246575, + "grad_norm": 7.34375, + "learning_rate": 3.4246575342465755e-07, + "loss": 2.04616165, + "memory(GiB)": 23.7, + "step": 135, + "train_speed(iter/s)": 1.110006 + }, + { + "acc": 0.59098282, + "epoch": 0.0035514967021816335, + "grad_norm": 9.6875, + "learning_rate": 3.5514967021816336e-07, + "loss": 2.03467312, + "memory(GiB)": 23.7, + "step": 140, + "train_speed(iter/s)": 1.125831 + }, + { + "acc": 0.58623438, + "epoch": 0.003678335870116692, + "grad_norm": 9.0625, + "learning_rate": 3.678335870116692e-07, + "loss": 2.08755283, + "memory(GiB)": 23.7, + "step": 145, + "train_speed(iter/s)": 1.1411 + }, + { + "acc": 0.59764867, + "epoch": 0.00380517503805175, + "grad_norm": 10.5, + "learning_rate": 3.8051750380517503e-07, + "loss": 2.01638031, + "memory(GiB)": 23.7, + "step": 150, + "train_speed(iter/s)": 1.155741 + }, + { + "acc": 0.58929129, + "epoch": 0.0039320142059868085, + "grad_norm": 10.8125, + "learning_rate": 3.932014205986809e-07, + "loss": 2.06034012, + "memory(GiB)": 23.7, + "step": 155, + "train_speed(iter/s)": 1.169957 + }, + { + "acc": 0.59738331, + "epoch": 0.004058853373921867, + "grad_norm": 7.96875, + "learning_rate": 4.058853373921867e-07, + "loss": 2.05501957, + "memory(GiB)": 23.7, + "step": 160, + "train_speed(iter/s)": 1.183313 + }, + { + "acc": 0.58028755, + "epoch": 0.004185692541856925, + "grad_norm": 9.0, + "learning_rate": 4.1856925418569256e-07, + "loss": 2.15213394, + "memory(GiB)": 23.7, + "step": 165, + "train_speed(iter/s)": 1.195829 + }, + { + "acc": 0.57383642, + "epoch": 0.0043125317097919835, + "grad_norm": 7.9375, + "learning_rate": 4.3125317097919837e-07, + "loss": 2.14222946, + "memory(GiB)": 23.7, + "step": 170, + "train_speed(iter/s)": 1.208079 + }, + { + "acc": 0.57469215, + "epoch": 0.004439370877727042, + "grad_norm": 10.75, + "learning_rate": 4.4393708777270423e-07, + "loss": 2.18012199, + "memory(GiB)": 23.7, + "step": 175, + "train_speed(iter/s)": 1.22011 + }, + { + "acc": 0.57810321, + "epoch": 0.0045662100456621, + "grad_norm": 7.65625, + "learning_rate": 4.5662100456621004e-07, + "loss": 2.16264229, + "memory(GiB)": 23.7, + "step": 180, + "train_speed(iter/s)": 1.231613 + }, + { + "acc": 0.60413814, + "epoch": 0.0046930492135971585, + "grad_norm": 8.5, + "learning_rate": 4.693049213597159e-07, + "loss": 2.05986958, + "memory(GiB)": 23.7, + "step": 185, + "train_speed(iter/s)": 1.242767 + }, + { + "acc": 0.57760043, + "epoch": 0.004819888381532217, + "grad_norm": 6.9375, + "learning_rate": 4.819888381532218e-07, + "loss": 2.09638023, + "memory(GiB)": 23.7, + "step": 190, + "train_speed(iter/s)": 1.25346 + }, + { + "acc": 0.59249134, + "epoch": 0.004946727549467275, + "grad_norm": 8.1875, + "learning_rate": 4.946727549467275e-07, + "loss": 2.11143036, + "memory(GiB)": 23.7, + "step": 195, + "train_speed(iter/s)": 1.263822 + }, + { + "acc": 0.58348241, + "epoch": 0.0050735667174023336, + "grad_norm": 7.875, + "learning_rate": 5.073566717402334e-07, + "loss": 2.09154167, + "memory(GiB)": 23.7, + "step": 200, + "train_speed(iter/s)": 1.273627 + }, + { + "acc": 0.58108397, + "epoch": 0.005200405885337392, + "grad_norm": 8.3125, + "learning_rate": 5.200405885337392e-07, + "loss": 2.10265102, + "memory(GiB)": 23.7, + "step": 205, + "train_speed(iter/s)": 1.283285 + }, + { + "acc": 0.58467541, + "epoch": 0.00532724505327245, + "grad_norm": 8.375, + "learning_rate": 5.327245053272451e-07, + "loss": 2.09133244, + "memory(GiB)": 23.7, + "step": 210, + "train_speed(iter/s)": 1.292618 + }, + { + "acc": 0.57472029, + "epoch": 0.005454084221207509, + "grad_norm": 7.65625, + "learning_rate": 5.454084221207509e-07, + "loss": 2.10947704, + "memory(GiB)": 23.7, + "step": 215, + "train_speed(iter/s)": 1.301665 + }, + { + "acc": 0.58943138, + "epoch": 0.005580923389142567, + "grad_norm": 7.34375, + "learning_rate": 5.580923389142567e-07, + "loss": 2.02641792, + "memory(GiB)": 23.7, + "step": 220, + "train_speed(iter/s)": 1.310391 + }, + { + "acc": 0.59376535, + "epoch": 0.005707762557077625, + "grad_norm": 7.875, + "learning_rate": 5.707762557077626e-07, + "loss": 2.04560623, + "memory(GiB)": 23.7, + "step": 225, + "train_speed(iter/s)": 1.318825 + }, + { + "acc": 0.58068542, + "epoch": 0.005834601725012684, + "grad_norm": 7.71875, + "learning_rate": 5.834601725012684e-07, + "loss": 2.09772301, + "memory(GiB)": 23.7, + "step": 230, + "train_speed(iter/s)": 1.326981 + }, + { + "acc": 0.597399, + "epoch": 0.005961440892947742, + "grad_norm": 7.78125, + "learning_rate": 5.961440892947743e-07, + "loss": 2.00362549, + "memory(GiB)": 23.7, + "step": 235, + "train_speed(iter/s)": 1.334921 + }, + { + "acc": 0.57693577, + "epoch": 0.0060882800608828, + "grad_norm": 7.125, + "learning_rate": 6.0882800608828e-07, + "loss": 2.16377869, + "memory(GiB)": 23.7, + "step": 240, + "train_speed(iter/s)": 1.342669 + }, + { + "acc": 0.59407139, + "epoch": 0.006215119228817859, + "grad_norm": 9.0, + "learning_rate": 6.215119228817859e-07, + "loss": 2.07984734, + "memory(GiB)": 23.7, + "step": 245, + "train_speed(iter/s)": 1.350127 + }, + { + "acc": 0.58396721, + "epoch": 0.006341958396752917, + "grad_norm": 7.65625, + "learning_rate": 6.341958396752918e-07, + "loss": 2.2032732, + "memory(GiB)": 23.7, + "step": 250, + "train_speed(iter/s)": 1.357356 + }, + { + "acc": 0.56820168, + "epoch": 0.006468797564687975, + "grad_norm": 8.5, + "learning_rate": 6.468797564687976e-07, + "loss": 2.147155, + "memory(GiB)": 23.7, + "step": 255, + "train_speed(iter/s)": 1.364239 + }, + { + "acc": 0.58067222, + "epoch": 0.006595636732623034, + "grad_norm": 8.375, + "learning_rate": 6.595636732623034e-07, + "loss": 2.05872955, + "memory(GiB)": 23.7, + "step": 260, + "train_speed(iter/s)": 1.371152 + }, + { + "acc": 0.59502158, + "epoch": 0.006722475900558092, + "grad_norm": 7.90625, + "learning_rate": 6.722475900558092e-07, + "loss": 2.07336369, + "memory(GiB)": 23.7, + "step": 265, + "train_speed(iter/s)": 1.377754 + }, + { + "acc": 0.60438747, + "epoch": 0.00684931506849315, + "grad_norm": 8.4375, + "learning_rate": 6.849315068493151e-07, + "loss": 2.00989799, + "memory(GiB)": 23.7, + "step": 270, + "train_speed(iter/s)": 1.384326 + }, + { + "acc": 0.5757288, + "epoch": 0.006976154236428209, + "grad_norm": 7.75, + "learning_rate": 6.97615423642821e-07, + "loss": 2.07700272, + "memory(GiB)": 23.7, + "step": 275, + "train_speed(iter/s)": 1.39051 + }, + { + "acc": 0.58312683, + "epoch": 0.007102993404363267, + "grad_norm": 8.125, + "learning_rate": 7.102993404363267e-07, + "loss": 2.09542122, + "memory(GiB)": 23.7, + "step": 280, + "train_speed(iter/s)": 1.396509 + }, + { + "acc": 0.59664135, + "epoch": 0.007229832572298325, + "grad_norm": 8.375, + "learning_rate": 7.229832572298326e-07, + "loss": 1.99681587, + "memory(GiB)": 23.7, + "step": 285, + "train_speed(iter/s)": 1.402414 + }, + { + "acc": 0.61559401, + "epoch": 0.007356671740233384, + "grad_norm": 8.0625, + "learning_rate": 7.356671740233384e-07, + "loss": 2.007934, + "memory(GiB)": 23.7, + "step": 290, + "train_speed(iter/s)": 1.40824 + }, + { + "acc": 0.59848299, + "epoch": 0.007483510908168442, + "grad_norm": 8.375, + "learning_rate": 7.483510908168443e-07, + "loss": 2.0704546, + "memory(GiB)": 23.7, + "step": 295, + "train_speed(iter/s)": 1.413935 + }, + { + "acc": 0.59056811, + "epoch": 0.0076103500761035, + "grad_norm": 7.34375, + "learning_rate": 7.610350076103501e-07, + "loss": 2.03679428, + "memory(GiB)": 30.44, + "step": 300, + "train_speed(iter/s)": 1.419217 + }, + { + "acc": 0.59642339, + "epoch": 0.007737189244038559, + "grad_norm": 8.4375, + "learning_rate": 7.737189244038559e-07, + "loss": 2.08682041, + "memory(GiB)": 30.44, + "step": 305, + "train_speed(iter/s)": 1.424615 + }, + { + "acc": 0.59266996, + "epoch": 0.007864028411973617, + "grad_norm": 7.5, + "learning_rate": 7.864028411973618e-07, + "loss": 1.98927956, + "memory(GiB)": 30.44, + "step": 310, + "train_speed(iter/s)": 1.429741 + }, + { + "acc": 0.58240619, + "epoch": 0.007990867579908675, + "grad_norm": 7.40625, + "learning_rate": 7.990867579908676e-07, + "loss": 2.15537796, + "memory(GiB)": 30.44, + "step": 315, + "train_speed(iter/s)": 1.434744 + }, + { + "acc": 0.59544802, + "epoch": 0.008117706747843734, + "grad_norm": 7.9375, + "learning_rate": 8.117706747843734e-07, + "loss": 2.0775713, + "memory(GiB)": 30.44, + "step": 320, + "train_speed(iter/s)": 1.439629 + }, + { + "acc": 0.56318431, + "epoch": 0.008244545915778792, + "grad_norm": 7.34375, + "learning_rate": 8.244545915778793e-07, + "loss": 2.14091702, + "memory(GiB)": 30.44, + "step": 325, + "train_speed(iter/s)": 1.444485 + }, + { + "acc": 0.57702899, + "epoch": 0.00837138508371385, + "grad_norm": 7.1875, + "learning_rate": 8.371385083713851e-07, + "loss": 2.12820282, + "memory(GiB)": 30.44, + "step": 330, + "train_speed(iter/s)": 1.449223 + }, + { + "acc": 0.58454962, + "epoch": 0.008498224251648909, + "grad_norm": 8.5, + "learning_rate": 8.49822425164891e-07, + "loss": 2.14468822, + "memory(GiB)": 30.44, + "step": 335, + "train_speed(iter/s)": 1.453945 + }, + { + "acc": 0.58777819, + "epoch": 0.008625063419583967, + "grad_norm": 7.84375, + "learning_rate": 8.625063419583967e-07, + "loss": 2.06611099, + "memory(GiB)": 30.44, + "step": 340, + "train_speed(iter/s)": 1.458459 + }, + { + "acc": 0.59058614, + "epoch": 0.008751902587519025, + "grad_norm": 6.65625, + "learning_rate": 8.751902587519026e-07, + "loss": 2.08925419, + "memory(GiB)": 30.44, + "step": 345, + "train_speed(iter/s)": 1.462888 + }, + { + "acc": 0.57748685, + "epoch": 0.008878741755454084, + "grad_norm": 7.9375, + "learning_rate": 8.878741755454085e-07, + "loss": 2.08326321, + "memory(GiB)": 30.44, + "step": 350, + "train_speed(iter/s)": 1.467172 + }, + { + "acc": 0.59131088, + "epoch": 0.009005580923389142, + "grad_norm": 9.75, + "learning_rate": 9.005580923389143e-07, + "loss": 2.08673401, + "memory(GiB)": 30.44, + "step": 355, + "train_speed(iter/s)": 1.471447 + }, + { + "acc": 0.58826323, + "epoch": 0.0091324200913242, + "grad_norm": 7.375, + "learning_rate": 9.132420091324201e-07, + "loss": 1.99175434, + "memory(GiB)": 30.44, + "step": 360, + "train_speed(iter/s)": 1.475559 + }, + { + "acc": 0.59775381, + "epoch": 0.009259259259259259, + "grad_norm": 8.3125, + "learning_rate": 9.259259259259259e-07, + "loss": 2.05985851, + "memory(GiB)": 30.44, + "step": 365, + "train_speed(iter/s)": 1.479608 + }, + { + "acc": 0.57983561, + "epoch": 0.009386098427194317, + "grad_norm": 7.65625, + "learning_rate": 9.386098427194318e-07, + "loss": 2.09405727, + "memory(GiB)": 30.44, + "step": 370, + "train_speed(iter/s)": 1.483702 + }, + { + "acc": 0.58051119, + "epoch": 0.009512937595129375, + "grad_norm": 7.21875, + "learning_rate": 9.512937595129377e-07, + "loss": 2.1734066, + "memory(GiB)": 30.44, + "step": 375, + "train_speed(iter/s)": 1.487584 + }, + { + "acc": 0.59764881, + "epoch": 0.009639776763064434, + "grad_norm": 8.875, + "learning_rate": 9.639776763064435e-07, + "loss": 2.01658096, + "memory(GiB)": 30.44, + "step": 380, + "train_speed(iter/s)": 1.491261 + }, + { + "acc": 0.58592234, + "epoch": 0.009766615930999492, + "grad_norm": 9.375, + "learning_rate": 9.766615930999493e-07, + "loss": 2.06452904, + "memory(GiB)": 30.44, + "step": 385, + "train_speed(iter/s)": 1.494978 + }, + { + "acc": 0.58475637, + "epoch": 0.00989345509893455, + "grad_norm": 7.84375, + "learning_rate": 9.89345509893455e-07, + "loss": 2.07754631, + "memory(GiB)": 30.44, + "step": 390, + "train_speed(iter/s)": 1.498661 + }, + { + "acc": 0.59650869, + "epoch": 0.010020294266869609, + "grad_norm": 7.71875, + "learning_rate": 1.002029426686961e-06, + "loss": 2.06810417, + "memory(GiB)": 30.44, + "step": 395, + "train_speed(iter/s)": 1.502239 + }, + { + "acc": 0.57616796, + "epoch": 0.010147133434804667, + "grad_norm": 10.4375, + "learning_rate": 1.0147133434804667e-06, + "loss": 2.12413635, + "memory(GiB)": 30.44, + "step": 400, + "train_speed(iter/s)": 1.505829 + }, + { + "acc": 0.58666806, + "epoch": 0.010273972602739725, + "grad_norm": 6.9375, + "learning_rate": 1.0273972602739727e-06, + "loss": 2.087327, + "memory(GiB)": 30.44, + "step": 405, + "train_speed(iter/s)": 1.509305 + }, + { + "acc": 0.59116058, + "epoch": 0.010400811770674784, + "grad_norm": 8.5, + "learning_rate": 1.0400811770674785e-06, + "loss": 2.09962368, + "memory(GiB)": 30.44, + "step": 410, + "train_speed(iter/s)": 1.512594 + }, + { + "acc": 0.6074688, + "epoch": 0.010527650938609842, + "grad_norm": 7.78125, + "learning_rate": 1.0527650938609842e-06, + "loss": 1.99229374, + "memory(GiB)": 30.44, + "step": 415, + "train_speed(iter/s)": 1.515805 + }, + { + "acc": 0.59611673, + "epoch": 0.0106544901065449, + "grad_norm": 9.0, + "learning_rate": 1.0654490106544902e-06, + "loss": 2.07241859, + "memory(GiB)": 30.44, + "step": 420, + "train_speed(iter/s)": 1.518971 + }, + { + "acc": 0.58405657, + "epoch": 0.010781329274479959, + "grad_norm": 7.21875, + "learning_rate": 1.078132927447996e-06, + "loss": 2.06975822, + "memory(GiB)": 30.44, + "step": 425, + "train_speed(iter/s)": 1.522202 + }, + { + "acc": 0.58463049, + "epoch": 0.010908168442415017, + "grad_norm": 7.25, + "learning_rate": 1.0908168442415017e-06, + "loss": 2.05938606, + "memory(GiB)": 30.44, + "step": 430, + "train_speed(iter/s)": 1.525413 + }, + { + "acc": 0.58507853, + "epoch": 0.011035007610350075, + "grad_norm": 7.96875, + "learning_rate": 1.1035007610350077e-06, + "loss": 2.08135872, + "memory(GiB)": 30.44, + "step": 435, + "train_speed(iter/s)": 1.528495 + }, + { + "acc": 0.59827957, + "epoch": 0.011161846778285134, + "grad_norm": 8.6875, + "learning_rate": 1.1161846778285134e-06, + "loss": 2.00571327, + "memory(GiB)": 30.44, + "step": 440, + "train_speed(iter/s)": 1.531595 + }, + { + "acc": 0.60118923, + "epoch": 0.011288685946220192, + "grad_norm": 6.96875, + "learning_rate": 1.1288685946220194e-06, + "loss": 2.01260796, + "memory(GiB)": 30.44, + "step": 445, + "train_speed(iter/s)": 1.534584 + }, + { + "acc": 0.5814682, + "epoch": 0.01141552511415525, + "grad_norm": 7.0, + "learning_rate": 1.1415525114155251e-06, + "loss": 2.06783485, + "memory(GiB)": 30.44, + "step": 450, + "train_speed(iter/s)": 1.537517 + }, + { + "acc": 0.58993053, + "epoch": 0.011542364282090309, + "grad_norm": 7.46875, + "learning_rate": 1.154236428209031e-06, + "loss": 2.06729431, + "memory(GiB)": 30.44, + "step": 455, + "train_speed(iter/s)": 1.540385 + }, + { + "acc": 0.59743342, + "epoch": 0.011669203450025367, + "grad_norm": 6.65625, + "learning_rate": 1.1669203450025369e-06, + "loss": 2.02055702, + "memory(GiB)": 30.44, + "step": 460, + "train_speed(iter/s)": 1.543118 + }, + { + "acc": 0.59117823, + "epoch": 0.011796042617960426, + "grad_norm": 7.375, + "learning_rate": 1.1796042617960426e-06, + "loss": 2.09539967, + "memory(GiB)": 30.44, + "step": 465, + "train_speed(iter/s)": 1.545787 + }, + { + "acc": 0.57558403, + "epoch": 0.011922881785895484, + "grad_norm": 7.375, + "learning_rate": 1.1922881785895486e-06, + "loss": 2.16062622, + "memory(GiB)": 30.44, + "step": 470, + "train_speed(iter/s)": 1.548442 + }, + { + "acc": 0.6126267, + "epoch": 0.012049720953830542, + "grad_norm": 6.125, + "learning_rate": 1.2049720953830543e-06, + "loss": 1.98560028, + "memory(GiB)": 30.44, + "step": 475, + "train_speed(iter/s)": 1.551007 + }, + { + "acc": 0.59188223, + "epoch": 0.0121765601217656, + "grad_norm": 7.3125, + "learning_rate": 1.21765601217656e-06, + "loss": 2.07525215, + "memory(GiB)": 30.44, + "step": 480, + "train_speed(iter/s)": 1.553644 + }, + { + "acc": 0.58786926, + "epoch": 0.012303399289700659, + "grad_norm": 5.21875, + "learning_rate": 1.230339928970066e-06, + "loss": 2.05112915, + "memory(GiB)": 30.44, + "step": 485, + "train_speed(iter/s)": 1.55618 + }, + { + "acc": 0.60624104, + "epoch": 0.012430238457635717, + "grad_norm": 6.4375, + "learning_rate": 1.2430238457635718e-06, + "loss": 2.00564232, + "memory(GiB)": 30.44, + "step": 490, + "train_speed(iter/s)": 1.55861 + }, + { + "acc": 0.59575095, + "epoch": 0.012557077625570776, + "grad_norm": 8.75, + "learning_rate": 1.2557077625570776e-06, + "loss": 2.08506622, + "memory(GiB)": 30.44, + "step": 495, + "train_speed(iter/s)": 1.56104 + }, + { + "acc": 0.58726721, + "epoch": 0.012683916793505834, + "grad_norm": 7.875, + "learning_rate": 1.2683916793505835e-06, + "loss": 2.02813663, + "memory(GiB)": 30.44, + "step": 500, + "train_speed(iter/s)": 1.563386 + }, + { + "acc": 0.57700849, + "epoch": 0.012810755961440892, + "grad_norm": 7.625, + "learning_rate": 1.2810755961440893e-06, + "loss": 2.07331543, + "memory(GiB)": 30.44, + "step": 505, + "train_speed(iter/s)": 1.565667 + }, + { + "acc": 0.59316711, + "epoch": 0.01293759512937595, + "grad_norm": 7.5625, + "learning_rate": 1.2937595129375953e-06, + "loss": 2.10827484, + "memory(GiB)": 30.44, + "step": 510, + "train_speed(iter/s)": 1.567896 + }, + { + "acc": 0.58322258, + "epoch": 0.013064434297311009, + "grad_norm": 7.03125, + "learning_rate": 1.306443429731101e-06, + "loss": 2.11230087, + "memory(GiB)": 30.44, + "step": 515, + "train_speed(iter/s)": 1.57008 + }, + { + "acc": 0.60459247, + "epoch": 0.013191273465246067, + "grad_norm": 6.15625, + "learning_rate": 1.3191273465246068e-06, + "loss": 1.98551025, + "memory(GiB)": 30.44, + "step": 520, + "train_speed(iter/s)": 1.572359 + }, + { + "acc": 0.5908164, + "epoch": 0.013318112633181126, + "grad_norm": 7.3125, + "learning_rate": 1.3318112633181127e-06, + "loss": 2.03188496, + "memory(GiB)": 30.44, + "step": 525, + "train_speed(iter/s)": 1.574537 + }, + { + "acc": 0.61041069, + "epoch": 0.013444951801116184, + "grad_norm": 7.375, + "learning_rate": 1.3444951801116185e-06, + "loss": 1.97797852, + "memory(GiB)": 30.44, + "step": 530, + "train_speed(iter/s)": 1.576773 + }, + { + "acc": 0.60151291, + "epoch": 0.013571790969051242, + "grad_norm": 7.3125, + "learning_rate": 1.3571790969051243e-06, + "loss": 1.9924551, + "memory(GiB)": 30.44, + "step": 535, + "train_speed(iter/s)": 1.578905 + }, + { + "acc": 0.59988222, + "epoch": 0.0136986301369863, + "grad_norm": 6.875, + "learning_rate": 1.3698630136986302e-06, + "loss": 2.02952709, + "memory(GiB)": 30.44, + "step": 540, + "train_speed(iter/s)": 1.581017 + }, + { + "acc": 0.58190384, + "epoch": 0.013825469304921359, + "grad_norm": 7.6875, + "learning_rate": 1.382546930492136e-06, + "loss": 2.10898304, + "memory(GiB)": 30.44, + "step": 545, + "train_speed(iter/s)": 1.58318 + }, + { + "acc": 0.58921618, + "epoch": 0.013952308472856417, + "grad_norm": 7.84375, + "learning_rate": 1.395230847285642e-06, + "loss": 2.10822105, + "memory(GiB)": 30.44, + "step": 550, + "train_speed(iter/s)": 1.5853 + }, + { + "acc": 0.5891572, + "epoch": 0.014079147640791476, + "grad_norm": 7.3125, + "learning_rate": 1.4079147640791477e-06, + "loss": 2.01958981, + "memory(GiB)": 30.44, + "step": 555, + "train_speed(iter/s)": 1.587307 + }, + { + "acc": 0.60352993, + "epoch": 0.014205986808726534, + "grad_norm": 7.6875, + "learning_rate": 1.4205986808726534e-06, + "loss": 1.99034004, + "memory(GiB)": 30.44, + "step": 560, + "train_speed(iter/s)": 1.589387 + }, + { + "acc": 0.59245887, + "epoch": 0.014332825976661592, + "grad_norm": 10.125, + "learning_rate": 1.4332825976661594e-06, + "loss": 2.06775417, + "memory(GiB)": 30.44, + "step": 565, + "train_speed(iter/s)": 1.591357 + }, + { + "acc": 0.5993598, + "epoch": 0.01445966514459665, + "grad_norm": 9.4375, + "learning_rate": 1.4459665144596652e-06, + "loss": 1.99099979, + "memory(GiB)": 30.44, + "step": 570, + "train_speed(iter/s)": 1.593288 + }, + { + "acc": 0.61712952, + "epoch": 0.014586504312531709, + "grad_norm": 6.59375, + "learning_rate": 1.458650431253171e-06, + "loss": 1.91461067, + "memory(GiB)": 38.53, + "step": 575, + "train_speed(iter/s)": 1.595075 + }, + { + "acc": 0.58643556, + "epoch": 0.014713343480466767, + "grad_norm": 7.21875, + "learning_rate": 1.4713343480466769e-06, + "loss": 2.13691216, + "memory(GiB)": 38.53, + "step": 580, + "train_speed(iter/s)": 1.596917 + }, + { + "acc": 0.59246416, + "epoch": 0.014840182648401826, + "grad_norm": 7.25, + "learning_rate": 1.4840182648401826e-06, + "loss": 2.11300983, + "memory(GiB)": 38.53, + "step": 585, + "train_speed(iter/s)": 1.598892 + }, + { + "acc": 0.58048639, + "epoch": 0.014967021816336884, + "grad_norm": 6.0, + "learning_rate": 1.4967021816336886e-06, + "loss": 2.03772526, + "memory(GiB)": 38.53, + "step": 590, + "train_speed(iter/s)": 1.60065 + }, + { + "acc": 0.57663441, + "epoch": 0.015093860984271942, + "grad_norm": 6.34375, + "learning_rate": 1.5093860984271944e-06, + "loss": 2.04765453, + "memory(GiB)": 38.53, + "step": 595, + "train_speed(iter/s)": 1.602525 + }, + { + "acc": 0.60374422, + "epoch": 0.015220700152207, + "grad_norm": 6.1875, + "learning_rate": 1.5220700152207001e-06, + "loss": 1.97475071, + "memory(GiB)": 38.53, + "step": 600, + "train_speed(iter/s)": 1.604246 + }, + { + "acc": 0.60834856, + "epoch": 0.015347539320142059, + "grad_norm": 6.4375, + "learning_rate": 1.534753932014206e-06, + "loss": 1.9379118, + "memory(GiB)": 38.53, + "step": 605, + "train_speed(iter/s)": 1.605993 + }, + { + "acc": 0.59056187, + "epoch": 0.015474378488077117, + "grad_norm": 6.375, + "learning_rate": 1.5474378488077118e-06, + "loss": 2.02729664, + "memory(GiB)": 38.53, + "step": 610, + "train_speed(iter/s)": 1.607765 + }, + { + "acc": 0.60268025, + "epoch": 0.015601217656012176, + "grad_norm": 7.0625, + "learning_rate": 1.5601217656012176e-06, + "loss": 2.02083397, + "memory(GiB)": 38.53, + "step": 615, + "train_speed(iter/s)": 1.609482 + }, + { + "acc": 0.61031318, + "epoch": 0.015728056823947234, + "grad_norm": 7.03125, + "learning_rate": 1.5728056823947236e-06, + "loss": 1.95255795, + "memory(GiB)": 38.53, + "step": 620, + "train_speed(iter/s)": 1.61111 + }, + { + "acc": 0.59041319, + "epoch": 0.015854895991882292, + "grad_norm": 6.3125, + "learning_rate": 1.5854895991882293e-06, + "loss": 2.07051506, + "memory(GiB)": 38.53, + "step": 625, + "train_speed(iter/s)": 1.612804 + }, + { + "acc": 0.59723597, + "epoch": 0.01598173515981735, + "grad_norm": 6.96875, + "learning_rate": 1.5981735159817353e-06, + "loss": 2.05251141, + "memory(GiB)": 38.53, + "step": 630, + "train_speed(iter/s)": 1.614459 + }, + { + "acc": 0.60010281, + "epoch": 0.01610857432775241, + "grad_norm": 9.8125, + "learning_rate": 1.610857432775241e-06, + "loss": 2.0439394, + "memory(GiB)": 38.53, + "step": 635, + "train_speed(iter/s)": 1.616059 + }, + { + "acc": 0.58903327, + "epoch": 0.016235413495687467, + "grad_norm": 7.15625, + "learning_rate": 1.6235413495687468e-06, + "loss": 2.03954334, + "memory(GiB)": 38.53, + "step": 640, + "train_speed(iter/s)": 1.61776 + }, + { + "acc": 0.59707794, + "epoch": 0.016362252663622526, + "grad_norm": 6.5625, + "learning_rate": 1.6362252663622528e-06, + "loss": 2.03261719, + "memory(GiB)": 38.53, + "step": 645, + "train_speed(iter/s)": 1.619225 + }, + { + "acc": 0.58319554, + "epoch": 0.016489091831557584, + "grad_norm": 6.40625, + "learning_rate": 1.6489091831557585e-06, + "loss": 2.04252071, + "memory(GiB)": 38.53, + "step": 650, + "train_speed(iter/s)": 1.620694 + }, + { + "acc": 0.59543414, + "epoch": 0.016615930999492642, + "grad_norm": 6.0, + "learning_rate": 1.6615930999492643e-06, + "loss": 2.05091801, + "memory(GiB)": 38.53, + "step": 655, + "train_speed(iter/s)": 1.622209 + }, + { + "acc": 0.58364267, + "epoch": 0.0167427701674277, + "grad_norm": 6.125, + "learning_rate": 1.6742770167427702e-06, + "loss": 2.02139549, + "memory(GiB)": 38.53, + "step": 660, + "train_speed(iter/s)": 1.623756 + }, + { + "acc": 0.60236578, + "epoch": 0.01686960933536276, + "grad_norm": 6.15625, + "learning_rate": 1.686960933536276e-06, + "loss": 1.91847992, + "memory(GiB)": 38.53, + "step": 665, + "train_speed(iter/s)": 1.625136 + }, + { + "acc": 0.60077438, + "epoch": 0.016996448503297817, + "grad_norm": 6.21875, + "learning_rate": 1.699644850329782e-06, + "loss": 2.01930237, + "memory(GiB)": 38.53, + "step": 670, + "train_speed(iter/s)": 1.626499 + }, + { + "acc": 0.60120392, + "epoch": 0.017123287671232876, + "grad_norm": 7.03125, + "learning_rate": 1.7123287671232877e-06, + "loss": 2.00528069, + "memory(GiB)": 38.53, + "step": 675, + "train_speed(iter/s)": 1.62787 + }, + { + "acc": 0.5836062, + "epoch": 0.017250126839167934, + "grad_norm": 6.6875, + "learning_rate": 1.7250126839167935e-06, + "loss": 2.14715729, + "memory(GiB)": 38.53, + "step": 680, + "train_speed(iter/s)": 1.6293 + }, + { + "acc": 0.59741678, + "epoch": 0.017376966007102992, + "grad_norm": 6.15625, + "learning_rate": 1.7376966007102994e-06, + "loss": 2.00957355, + "memory(GiB)": 38.53, + "step": 685, + "train_speed(iter/s)": 1.630563 + }, + { + "acc": 0.60352869, + "epoch": 0.01750380517503805, + "grad_norm": 6.28125, + "learning_rate": 1.7503805175038052e-06, + "loss": 2.03679848, + "memory(GiB)": 38.53, + "step": 690, + "train_speed(iter/s)": 1.631989 + }, + { + "acc": 0.59241843, + "epoch": 0.01763064434297311, + "grad_norm": 8.625, + "learning_rate": 1.7630644342973112e-06, + "loss": 2.01553612, + "memory(GiB)": 38.53, + "step": 695, + "train_speed(iter/s)": 1.633397 + }, + { + "acc": 0.58871098, + "epoch": 0.017757483510908167, + "grad_norm": 5.78125, + "learning_rate": 1.775748351090817e-06, + "loss": 1.98986931, + "memory(GiB)": 38.53, + "step": 700, + "train_speed(iter/s)": 1.63469 + }, + { + "acc": 0.58268156, + "epoch": 0.017884322678843226, + "grad_norm": 6.125, + "learning_rate": 1.7884322678843227e-06, + "loss": 1.99861069, + "memory(GiB)": 38.53, + "step": 705, + "train_speed(iter/s)": 1.636068 + }, + { + "acc": 0.58590431, + "epoch": 0.018011161846778284, + "grad_norm": 7.21875, + "learning_rate": 1.8011161846778286e-06, + "loss": 2.05112705, + "memory(GiB)": 38.53, + "step": 710, + "train_speed(iter/s)": 1.637263 + }, + { + "acc": 0.58075576, + "epoch": 0.018138001014713342, + "grad_norm": 5.6875, + "learning_rate": 1.8138001014713344e-06, + "loss": 2.09609985, + "memory(GiB)": 38.53, + "step": 715, + "train_speed(iter/s)": 1.63855 + }, + { + "acc": 0.59707537, + "epoch": 0.0182648401826484, + "grad_norm": 6.78125, + "learning_rate": 1.8264840182648401e-06, + "loss": 1.99963741, + "memory(GiB)": 38.53, + "step": 720, + "train_speed(iter/s)": 1.639946 + }, + { + "acc": 0.58498487, + "epoch": 0.01839167935058346, + "grad_norm": 6.84375, + "learning_rate": 1.8391679350583461e-06, + "loss": 2.08599586, + "memory(GiB)": 38.53, + "step": 725, + "train_speed(iter/s)": 1.641132 + }, + { + "acc": 0.58994751, + "epoch": 0.018518518518518517, + "grad_norm": 7.03125, + "learning_rate": 1.8518518518518519e-06, + "loss": 1.93367462, + "memory(GiB)": 38.53, + "step": 730, + "train_speed(iter/s)": 1.64245 + }, + { + "acc": 0.58264194, + "epoch": 0.018645357686453576, + "grad_norm": 6.03125, + "learning_rate": 1.8645357686453578e-06, + "loss": 2.06114101, + "memory(GiB)": 38.53, + "step": 735, + "train_speed(iter/s)": 1.643655 + }, + { + "acc": 0.59040194, + "epoch": 0.018772196854388634, + "grad_norm": 6.84375, + "learning_rate": 1.8772196854388636e-06, + "loss": 2.05004845, + "memory(GiB)": 38.53, + "step": 740, + "train_speed(iter/s)": 1.644852 + }, + { + "acc": 0.59944916, + "epoch": 0.018899036022323693, + "grad_norm": 6.46875, + "learning_rate": 1.8899036022323693e-06, + "loss": 1.94879971, + "memory(GiB)": 38.53, + "step": 745, + "train_speed(iter/s)": 1.645978 + }, + { + "acc": 0.58941746, + "epoch": 0.01902587519025875, + "grad_norm": 6.21875, + "learning_rate": 1.9025875190258753e-06, + "loss": 2.0135458, + "memory(GiB)": 38.53, + "step": 750, + "train_speed(iter/s)": 1.647116 + }, + { + "acc": 0.60376492, + "epoch": 0.01915271435819381, + "grad_norm": 6.34375, + "learning_rate": 1.915271435819381e-06, + "loss": 1.97635822, + "memory(GiB)": 38.53, + "step": 755, + "train_speed(iter/s)": 1.648182 + }, + { + "acc": 0.58112669, + "epoch": 0.019279553526128868, + "grad_norm": 5.6875, + "learning_rate": 1.927955352612887e-06, + "loss": 2.03324509, + "memory(GiB)": 38.53, + "step": 760, + "train_speed(iter/s)": 1.649217 + }, + { + "acc": 0.60946598, + "epoch": 0.019406392694063926, + "grad_norm": 7.0625, + "learning_rate": 1.9406392694063926e-06, + "loss": 2.03556252, + "memory(GiB)": 38.53, + "step": 765, + "train_speed(iter/s)": 1.650329 + }, + { + "acc": 0.5945179, + "epoch": 0.019533231861998984, + "grad_norm": 7.65625, + "learning_rate": 1.9533231861998985e-06, + "loss": 2.01756153, + "memory(GiB)": 38.53, + "step": 770, + "train_speed(iter/s)": 1.651472 + }, + { + "acc": 0.60753369, + "epoch": 0.019660071029934043, + "grad_norm": 6.1875, + "learning_rate": 1.9660071029934045e-06, + "loss": 2.01441383, + "memory(GiB)": 38.53, + "step": 775, + "train_speed(iter/s)": 1.652488 + }, + { + "acc": 0.57172837, + "epoch": 0.0197869101978691, + "grad_norm": 6.59375, + "learning_rate": 1.97869101978691e-06, + "loss": 2.08121719, + "memory(GiB)": 38.53, + "step": 780, + "train_speed(iter/s)": 1.653552 + }, + { + "acc": 0.61228228, + "epoch": 0.01991374936580416, + "grad_norm": 6.75, + "learning_rate": 1.991374936580416e-06, + "loss": 1.89397602, + "memory(GiB)": 38.53, + "step": 785, + "train_speed(iter/s)": 1.654584 + }, + { + "acc": 0.59674997, + "epoch": 0.020040588533739218, + "grad_norm": 6.625, + "learning_rate": 2.004058853373922e-06, + "loss": 2.0047966, + "memory(GiB)": 38.53, + "step": 790, + "train_speed(iter/s)": 1.655673 + }, + { + "acc": 0.60163665, + "epoch": 0.020167427701674276, + "grad_norm": 8.6875, + "learning_rate": 2.016742770167428e-06, + "loss": 1.93692665, + "memory(GiB)": 38.53, + "step": 795, + "train_speed(iter/s)": 1.656734 + }, + { + "acc": 0.60401855, + "epoch": 0.020294266869609334, + "grad_norm": 6.09375, + "learning_rate": 2.0294266869609335e-06, + "loss": 1.92346802, + "memory(GiB)": 38.53, + "step": 800, + "train_speed(iter/s)": 1.657753 + }, + { + "acc": 0.58779025, + "epoch": 0.020421106037544393, + "grad_norm": 6.84375, + "learning_rate": 2.0421106037544395e-06, + "loss": 2.02111969, + "memory(GiB)": 38.53, + "step": 805, + "train_speed(iter/s)": 1.658819 + }, + { + "acc": 0.60377512, + "epoch": 0.02054794520547945, + "grad_norm": 7.59375, + "learning_rate": 2.0547945205479454e-06, + "loss": 1.95429077, + "memory(GiB)": 38.53, + "step": 810, + "train_speed(iter/s)": 1.659819 + }, + { + "acc": 0.60720506, + "epoch": 0.02067478437341451, + "grad_norm": 6.78125, + "learning_rate": 2.067478437341451e-06, + "loss": 2.03128891, + "memory(GiB)": 38.53, + "step": 815, + "train_speed(iter/s)": 1.660861 + }, + { + "acc": 0.59849381, + "epoch": 0.020801623541349568, + "grad_norm": 7.65625, + "learning_rate": 2.080162354134957e-06, + "loss": 2.00123291, + "memory(GiB)": 38.53, + "step": 820, + "train_speed(iter/s)": 1.661747 + }, + { + "acc": 0.57835402, + "epoch": 0.020928462709284626, + "grad_norm": 5.96875, + "learning_rate": 2.092846270928463e-06, + "loss": 2.04284058, + "memory(GiB)": 38.53, + "step": 825, + "train_speed(iter/s)": 1.662649 + }, + { + "acc": 0.58629961, + "epoch": 0.021055301877219684, + "grad_norm": 6.71875, + "learning_rate": 2.1055301877219685e-06, + "loss": 2.1173254, + "memory(GiB)": 38.53, + "step": 830, + "train_speed(iter/s)": 1.663679 + }, + { + "acc": 0.60785379, + "epoch": 0.021182141045154743, + "grad_norm": 5.65625, + "learning_rate": 2.1182141045154744e-06, + "loss": 1.94242649, + "memory(GiB)": 38.53, + "step": 835, + "train_speed(iter/s)": 1.664639 + }, + { + "acc": 0.60013657, + "epoch": 0.0213089802130898, + "grad_norm": 8.3125, + "learning_rate": 2.1308980213089804e-06, + "loss": 1.96173286, + "memory(GiB)": 47.04, + "step": 840, + "train_speed(iter/s)": 1.665496 + }, + { + "acc": 0.59912806, + "epoch": 0.02143581938102486, + "grad_norm": 7.03125, + "learning_rate": 2.143581938102486e-06, + "loss": 1.94571953, + "memory(GiB)": 47.04, + "step": 845, + "train_speed(iter/s)": 1.666446 + }, + { + "acc": 0.61398377, + "epoch": 0.021562658548959918, + "grad_norm": 6.25, + "learning_rate": 2.156265854895992e-06, + "loss": 1.97966213, + "memory(GiB)": 47.04, + "step": 850, + "train_speed(iter/s)": 1.667412 + }, + { + "acc": 0.57985878, + "epoch": 0.021689497716894976, + "grad_norm": 7.4375, + "learning_rate": 2.168949771689498e-06, + "loss": 2.04382324, + "memory(GiB)": 47.04, + "step": 855, + "train_speed(iter/s)": 1.668301 + }, + { + "acc": 0.60781727, + "epoch": 0.021816336884830034, + "grad_norm": 6.90625, + "learning_rate": 2.1816336884830034e-06, + "loss": 2.01011772, + "memory(GiB)": 47.04, + "step": 860, + "train_speed(iter/s)": 1.669175 + }, + { + "acc": 0.58056746, + "epoch": 0.021943176052765093, + "grad_norm": 8.25, + "learning_rate": 2.1943176052765094e-06, + "loss": 1.98371544, + "memory(GiB)": 47.04, + "step": 865, + "train_speed(iter/s)": 1.670081 + }, + { + "acc": 0.59575796, + "epoch": 0.02207001522070015, + "grad_norm": 7.21875, + "learning_rate": 2.2070015220700153e-06, + "loss": 2.07190132, + "memory(GiB)": 47.04, + "step": 870, + "train_speed(iter/s)": 1.670983 + }, + { + "acc": 0.59166737, + "epoch": 0.02219685438863521, + "grad_norm": 8.1875, + "learning_rate": 2.2196854388635213e-06, + "loss": 2.0117609, + "memory(GiB)": 47.04, + "step": 875, + "train_speed(iter/s)": 1.671883 + }, + { + "acc": 0.60695286, + "epoch": 0.022323693556570268, + "grad_norm": 8.125, + "learning_rate": 2.232369355657027e-06, + "loss": 1.95530815, + "memory(GiB)": 47.04, + "step": 880, + "train_speed(iter/s)": 1.672678 + }, + { + "acc": 0.59405203, + "epoch": 0.022450532724505326, + "grad_norm": 7.96875, + "learning_rate": 2.245053272450533e-06, + "loss": 2.03537674, + "memory(GiB)": 47.04, + "step": 885, + "train_speed(iter/s)": 1.673527 + }, + { + "acc": 0.5937891, + "epoch": 0.022577371892440384, + "grad_norm": 7.15625, + "learning_rate": 2.2577371892440388e-06, + "loss": 2.01818981, + "memory(GiB)": 47.04, + "step": 890, + "train_speed(iter/s)": 1.67439 + }, + { + "acc": 0.61011105, + "epoch": 0.022704211060375443, + "grad_norm": 5.5, + "learning_rate": 2.2704211060375443e-06, + "loss": 2.00760136, + "memory(GiB)": 47.04, + "step": 895, + "train_speed(iter/s)": 1.675213 + }, + { + "acc": 0.60563164, + "epoch": 0.0228310502283105, + "grad_norm": 7.28125, + "learning_rate": 2.2831050228310503e-06, + "loss": 2.01051254, + "memory(GiB)": 47.04, + "step": 900, + "train_speed(iter/s)": 1.676019 + }, + { + "acc": 0.5876997, + "epoch": 0.02295788939624556, + "grad_norm": 6.6875, + "learning_rate": 2.2957889396245563e-06, + "loss": 2.02636986, + "memory(GiB)": 47.04, + "step": 905, + "train_speed(iter/s)": 1.676833 + }, + { + "acc": 0.58789148, + "epoch": 0.023084728564180618, + "grad_norm": 6.3125, + "learning_rate": 2.308472856418062e-06, + "loss": 2.01125336, + "memory(GiB)": 47.04, + "step": 910, + "train_speed(iter/s)": 1.677702 + }, + { + "acc": 0.60056543, + "epoch": 0.023211567732115676, + "grad_norm": 5.40625, + "learning_rate": 2.3211567732115678e-06, + "loss": 1.96869583, + "memory(GiB)": 47.04, + "step": 915, + "train_speed(iter/s)": 1.67851 + }, + { + "acc": 0.62972975, + "epoch": 0.023338406900050734, + "grad_norm": 5.21875, + "learning_rate": 2.3338406900050737e-06, + "loss": 1.89889717, + "memory(GiB)": 47.04, + "step": 920, + "train_speed(iter/s)": 1.679346 + }, + { + "acc": 0.59116406, + "epoch": 0.023465246067985793, + "grad_norm": 6.0, + "learning_rate": 2.3465246067985793e-06, + "loss": 2.0276123, + "memory(GiB)": 47.04, + "step": 925, + "train_speed(iter/s)": 1.680174 + }, + { + "acc": 0.60044823, + "epoch": 0.02359208523592085, + "grad_norm": 12.25, + "learning_rate": 2.3592085235920852e-06, + "loss": 1.98527832, + "memory(GiB)": 47.04, + "step": 930, + "train_speed(iter/s)": 1.681039 + }, + { + "acc": 0.60560637, + "epoch": 0.02371892440385591, + "grad_norm": 8.25, + "learning_rate": 2.371892440385591e-06, + "loss": 1.90980549, + "memory(GiB)": 47.04, + "step": 935, + "train_speed(iter/s)": 1.681786 + }, + { + "acc": 0.59694967, + "epoch": 0.023845763571790968, + "grad_norm": 6.46875, + "learning_rate": 2.384576357179097e-06, + "loss": 1.9686676, + "memory(GiB)": 47.04, + "step": 940, + "train_speed(iter/s)": 1.682594 + }, + { + "acc": 0.60201035, + "epoch": 0.023972602739726026, + "grad_norm": 7.84375, + "learning_rate": 2.3972602739726027e-06, + "loss": 1.98535118, + "memory(GiB)": 47.04, + "step": 945, + "train_speed(iter/s)": 1.683453 + }, + { + "acc": 0.59199324, + "epoch": 0.024099441907661084, + "grad_norm": 7.46875, + "learning_rate": 2.4099441907661087e-06, + "loss": 2.01176834, + "memory(GiB)": 47.04, + "step": 950, + "train_speed(iter/s)": 1.684276 + }, + { + "acc": 0.61499534, + "epoch": 0.024226281075596143, + "grad_norm": 10.625, + "learning_rate": 2.4226281075596147e-06, + "loss": 1.94725342, + "memory(GiB)": 47.04, + "step": 955, + "train_speed(iter/s)": 1.685038 + }, + { + "acc": 0.59902039, + "epoch": 0.0243531202435312, + "grad_norm": 8.6875, + "learning_rate": 2.43531202435312e-06, + "loss": 2.03876152, + "memory(GiB)": 47.04, + "step": 960, + "train_speed(iter/s)": 1.68579 + }, + { + "acc": 0.59024591, + "epoch": 0.02447995941146626, + "grad_norm": 6.46875, + "learning_rate": 2.447995941146626e-06, + "loss": 2.04958992, + "memory(GiB)": 47.04, + "step": 965, + "train_speed(iter/s)": 1.686533 + }, + { + "acc": 0.59412632, + "epoch": 0.024606798579401318, + "grad_norm": 5.84375, + "learning_rate": 2.460679857940132e-06, + "loss": 2.03471642, + "memory(GiB)": 47.04, + "step": 970, + "train_speed(iter/s)": 1.687248 + }, + { + "acc": 0.59009824, + "epoch": 0.024733637747336376, + "grad_norm": 6.59375, + "learning_rate": 2.4733637747336377e-06, + "loss": 2.05138016, + "memory(GiB)": 47.04, + "step": 975, + "train_speed(iter/s)": 1.688001 + }, + { + "acc": 0.62229795, + "epoch": 0.024860476915271434, + "grad_norm": 6.1875, + "learning_rate": 2.4860476915271436e-06, + "loss": 1.94331303, + "memory(GiB)": 47.04, + "step": 980, + "train_speed(iter/s)": 1.688725 + }, + { + "acc": 0.60050669, + "epoch": 0.024987316083206493, + "grad_norm": 7.0, + "learning_rate": 2.4987316083206496e-06, + "loss": 2.00850029, + "memory(GiB)": 47.04, + "step": 985, + "train_speed(iter/s)": 1.689395 + }, + { + "acc": 0.58993878, + "epoch": 0.02511415525114155, + "grad_norm": 6.625, + "learning_rate": 2.511415525114155e-06, + "loss": 2.06494789, + "memory(GiB)": 47.04, + "step": 990, + "train_speed(iter/s)": 1.690082 + }, + { + "acc": 0.58899655, + "epoch": 0.02524099441907661, + "grad_norm": 5.6875, + "learning_rate": 2.5240994419076615e-06, + "loss": 1.99828987, + "memory(GiB)": 47.04, + "step": 995, + "train_speed(iter/s)": 1.690771 + }, + { + "acc": 0.60925131, + "epoch": 0.025367833587011668, + "grad_norm": 7.9375, + "learning_rate": 2.536783358701167e-06, + "loss": 1.91453819, + "memory(GiB)": 47.04, + "step": 1000, + "train_speed(iter/s)": 1.691461 + }, + { + "epoch": 0.025367833587011668, + "eval_acc": 0.5865617501675866, + "eval_loss": 1.9484902620315552, + "eval_runtime": 58.4022, + "eval_samples_per_second": 109.071, + "eval_steps_per_second": 27.276, + "step": 1000 + }, + { + "acc": 0.59601641, + "epoch": 0.025494672754946726, + "grad_norm": 6.65625, + "learning_rate": 2.549467275494673e-06, + "loss": 2.02587528, + "memory(GiB)": 52.96, + "step": 1005, + "train_speed(iter/s)": 1.531002 + }, + { + "acc": 0.60238309, + "epoch": 0.025621511922881784, + "grad_norm": 8.6875, + "learning_rate": 2.5621511922881786e-06, + "loss": 2.07746582, + "memory(GiB)": 52.96, + "step": 1010, + "train_speed(iter/s)": 1.532202 + }, + { + "acc": 0.59062929, + "epoch": 0.025748351090816843, + "grad_norm": 6.96875, + "learning_rate": 2.5748351090816846e-06, + "loss": 2.02643967, + "memory(GiB)": 52.96, + "step": 1015, + "train_speed(iter/s)": 1.53335 + }, + { + "acc": 0.59743261, + "epoch": 0.0258751902587519, + "grad_norm": 7.46875, + "learning_rate": 2.5875190258751905e-06, + "loss": 2.0047905, + "memory(GiB)": 52.96, + "step": 1020, + "train_speed(iter/s)": 1.534553 + }, + { + "acc": 0.61405964, + "epoch": 0.02600202942668696, + "grad_norm": 6.34375, + "learning_rate": 2.6002029426686965e-06, + "loss": 1.98388634, + "memory(GiB)": 52.96, + "step": 1025, + "train_speed(iter/s)": 1.535725 + }, + { + "acc": 0.5995717, + "epoch": 0.026128868594622018, + "grad_norm": 5.5625, + "learning_rate": 2.612886859462202e-06, + "loss": 1.96609459, + "memory(GiB)": 52.96, + "step": 1030, + "train_speed(iter/s)": 1.536888 + }, + { + "acc": 0.61300802, + "epoch": 0.026255707762557076, + "grad_norm": 7.03125, + "learning_rate": 2.625570776255708e-06, + "loss": 1.98041763, + "memory(GiB)": 52.96, + "step": 1035, + "train_speed(iter/s)": 1.538054 + }, + { + "acc": 0.59687238, + "epoch": 0.026382546930492135, + "grad_norm": 8.6875, + "learning_rate": 2.6382546930492135e-06, + "loss": 1.96327171, + "memory(GiB)": 52.96, + "step": 1040, + "train_speed(iter/s)": 1.53921 + }, + { + "acc": 0.61108966, + "epoch": 0.026509386098427193, + "grad_norm": 5.875, + "learning_rate": 2.65093860984272e-06, + "loss": 2.03963013, + "memory(GiB)": 52.96, + "step": 1045, + "train_speed(iter/s)": 1.54031 + }, + { + "acc": 0.61035624, + "epoch": 0.02663622526636225, + "grad_norm": 6.65625, + "learning_rate": 2.6636225266362255e-06, + "loss": 1.94076576, + "memory(GiB)": 52.96, + "step": 1050, + "train_speed(iter/s)": 1.541463 + }, + { + "acc": 0.59615335, + "epoch": 0.02676306443429731, + "grad_norm": 7.375, + "learning_rate": 2.6763064434297314e-06, + "loss": 1.9409626, + "memory(GiB)": 52.96, + "step": 1055, + "train_speed(iter/s)": 1.542592 + }, + { + "acc": 0.59662423, + "epoch": 0.026889903602232368, + "grad_norm": 5.96875, + "learning_rate": 2.688990360223237e-06, + "loss": 1.9571476, + "memory(GiB)": 52.96, + "step": 1060, + "train_speed(iter/s)": 1.543681 + }, + { + "acc": 0.60188313, + "epoch": 0.027016742770167426, + "grad_norm": 5.6875, + "learning_rate": 2.701674277016743e-06, + "loss": 1.95284061, + "memory(GiB)": 52.96, + "step": 1065, + "train_speed(iter/s)": 1.544773 + }, + { + "acc": 0.6035326, + "epoch": 0.027143581938102485, + "grad_norm": 6.75, + "learning_rate": 2.7143581938102485e-06, + "loss": 2.00576782, + "memory(GiB)": 52.96, + "step": 1070, + "train_speed(iter/s)": 1.54589 + }, + { + "acc": 0.59823966, + "epoch": 0.027270421106037543, + "grad_norm": 6.5, + "learning_rate": 2.727042110603755e-06, + "loss": 1.98063354, + "memory(GiB)": 52.96, + "step": 1075, + "train_speed(iter/s)": 1.546951 + }, + { + "acc": 0.580969, + "epoch": 0.0273972602739726, + "grad_norm": 5.90625, + "learning_rate": 2.7397260273972604e-06, + "loss": 2.08416595, + "memory(GiB)": 52.96, + "step": 1080, + "train_speed(iter/s)": 1.548025 + }, + { + "acc": 0.61635456, + "epoch": 0.02752409944190766, + "grad_norm": 6.5, + "learning_rate": 2.7524099441907664e-06, + "loss": 1.94256172, + "memory(GiB)": 52.96, + "step": 1085, + "train_speed(iter/s)": 1.549137 + }, + { + "acc": 0.61907287, + "epoch": 0.027650938609842718, + "grad_norm": 5.65625, + "learning_rate": 2.765093860984272e-06, + "loss": 1.92250996, + "memory(GiB)": 52.96, + "step": 1090, + "train_speed(iter/s)": 1.550195 + }, + { + "acc": 0.58657169, + "epoch": 0.027777777777777776, + "grad_norm": 6.84375, + "learning_rate": 2.7777777777777783e-06, + "loss": 1.98541336, + "memory(GiB)": 52.96, + "step": 1095, + "train_speed(iter/s)": 1.551289 + }, + { + "acc": 0.60796785, + "epoch": 0.027904616945712835, + "grad_norm": 6.8125, + "learning_rate": 2.790461694571284e-06, + "loss": 1.93136368, + "memory(GiB)": 52.96, + "step": 1100, + "train_speed(iter/s)": 1.552339 + }, + { + "acc": 0.60143771, + "epoch": 0.028031456113647893, + "grad_norm": 6.6875, + "learning_rate": 2.80314561136479e-06, + "loss": 1.98077202, + "memory(GiB)": 52.96, + "step": 1105, + "train_speed(iter/s)": 1.5534 + }, + { + "acc": 0.60933809, + "epoch": 0.02815829528158295, + "grad_norm": 7.03125, + "learning_rate": 2.8158295281582954e-06, + "loss": 1.92722111, + "memory(GiB)": 52.96, + "step": 1110, + "train_speed(iter/s)": 1.554441 + }, + { + "acc": 0.59950509, + "epoch": 0.02828513444951801, + "grad_norm": 6.40625, + "learning_rate": 2.8285134449518014e-06, + "loss": 2.03983688, + "memory(GiB)": 52.96, + "step": 1115, + "train_speed(iter/s)": 1.555466 + }, + { + "acc": 0.58854389, + "epoch": 0.028411973617453068, + "grad_norm": 6.78125, + "learning_rate": 2.841197361745307e-06, + "loss": 2.03416462, + "memory(GiB)": 52.96, + "step": 1120, + "train_speed(iter/s)": 1.55649 + }, + { + "acc": 0.58544197, + "epoch": 0.028538812785388126, + "grad_norm": 6.15625, + "learning_rate": 2.8538812785388133e-06, + "loss": 1.98238449, + "memory(GiB)": 52.96, + "step": 1125, + "train_speed(iter/s)": 1.557527 + }, + { + "acc": 0.60424638, + "epoch": 0.028665651953323185, + "grad_norm": 7.125, + "learning_rate": 2.866565195332319e-06, + "loss": 1.94077835, + "memory(GiB)": 52.96, + "step": 1130, + "train_speed(iter/s)": 1.558518 + }, + { + "acc": 0.60898528, + "epoch": 0.028792491121258243, + "grad_norm": 6.34375, + "learning_rate": 2.879249112125825e-06, + "loss": 1.92646523, + "memory(GiB)": 52.96, + "step": 1135, + "train_speed(iter/s)": 1.559489 + }, + { + "acc": 0.59700308, + "epoch": 0.0289193302891933, + "grad_norm": 6.96875, + "learning_rate": 2.8919330289193303e-06, + "loss": 2.03535538, + "memory(GiB)": 52.96, + "step": 1140, + "train_speed(iter/s)": 1.560425 + }, + { + "acc": 0.59102287, + "epoch": 0.02904616945712836, + "grad_norm": 5.3125, + "learning_rate": 2.9046169457128363e-06, + "loss": 1.96916809, + "memory(GiB)": 52.96, + "step": 1145, + "train_speed(iter/s)": 1.561401 + }, + { + "acc": 0.59929562, + "epoch": 0.029173008625063418, + "grad_norm": 6.53125, + "learning_rate": 2.917300862506342e-06, + "loss": 1.99325848, + "memory(GiB)": 52.96, + "step": 1150, + "train_speed(iter/s)": 1.562381 + }, + { + "acc": 0.59558825, + "epoch": 0.029299847792998476, + "grad_norm": 6.90625, + "learning_rate": 2.9299847792998482e-06, + "loss": 1.96991768, + "memory(GiB)": 52.96, + "step": 1155, + "train_speed(iter/s)": 1.56342 + }, + { + "acc": 0.61323395, + "epoch": 0.029426686960933535, + "grad_norm": 6.71875, + "learning_rate": 2.9426686960933538e-06, + "loss": 1.96245213, + "memory(GiB)": 52.96, + "step": 1160, + "train_speed(iter/s)": 1.564392 + }, + { + "acc": 0.60217309, + "epoch": 0.029553526128868593, + "grad_norm": 7.21875, + "learning_rate": 2.9553526128868598e-06, + "loss": 1.93401833, + "memory(GiB)": 52.96, + "step": 1165, + "train_speed(iter/s)": 1.56536 + }, + { + "acc": 0.59514866, + "epoch": 0.02968036529680365, + "grad_norm": 5.90625, + "learning_rate": 2.9680365296803653e-06, + "loss": 2.00032043, + "memory(GiB)": 52.96, + "step": 1170, + "train_speed(iter/s)": 1.56618 + }, + { + "acc": 0.60530767, + "epoch": 0.02980720446473871, + "grad_norm": 6.03125, + "learning_rate": 2.9807204464738717e-06, + "loss": 1.95743942, + "memory(GiB)": 52.96, + "step": 1175, + "train_speed(iter/s)": 1.56715 + }, + { + "acc": 0.61269002, + "epoch": 0.029934043632673768, + "grad_norm": 6.25, + "learning_rate": 2.9934043632673772e-06, + "loss": 1.88470802, + "memory(GiB)": 52.96, + "step": 1180, + "train_speed(iter/s)": 1.568079 + }, + { + "acc": 0.59085321, + "epoch": 0.030060882800608826, + "grad_norm": 6.125, + "learning_rate": 3.006088280060883e-06, + "loss": 1.98053436, + "memory(GiB)": 52.96, + "step": 1185, + "train_speed(iter/s)": 1.569022 + }, + { + "acc": 0.59253659, + "epoch": 0.030187721968543885, + "grad_norm": 6.8125, + "learning_rate": 3.0187721968543887e-06, + "loss": 1.98493576, + "memory(GiB)": 55.86, + "step": 1190, + "train_speed(iter/s)": 1.56996 + }, + { + "acc": 0.59364595, + "epoch": 0.030314561136478943, + "grad_norm": 6.65625, + "learning_rate": 3.0314561136478947e-06, + "loss": 1.98580132, + "memory(GiB)": 55.86, + "step": 1195, + "train_speed(iter/s)": 1.570865 + }, + { + "acc": 0.63210149, + "epoch": 0.030441400304414, + "grad_norm": 6.78125, + "learning_rate": 3.0441400304414002e-06, + "loss": 1.91288757, + "memory(GiB)": 55.86, + "step": 1200, + "train_speed(iter/s)": 1.571737 + }, + { + "acc": 0.59039435, + "epoch": 0.03056823947234906, + "grad_norm": 11.5, + "learning_rate": 3.0568239472349066e-06, + "loss": 2.07269115, + "memory(GiB)": 55.86, + "step": 1205, + "train_speed(iter/s)": 1.572636 + }, + { + "acc": 0.59262409, + "epoch": 0.030695078640284118, + "grad_norm": 7.4375, + "learning_rate": 3.069507864028412e-06, + "loss": 1.94299812, + "memory(GiB)": 55.86, + "step": 1210, + "train_speed(iter/s)": 1.573537 + }, + { + "acc": 0.60261555, + "epoch": 0.030821917808219176, + "grad_norm": 7.0625, + "learning_rate": 3.082191780821918e-06, + "loss": 1.97984161, + "memory(GiB)": 55.86, + "step": 1215, + "train_speed(iter/s)": 1.574387 + }, + { + "acc": 0.59777422, + "epoch": 0.030948756976154235, + "grad_norm": 7.4375, + "learning_rate": 3.0948756976154237e-06, + "loss": 1.96874218, + "memory(GiB)": 55.86, + "step": 1220, + "train_speed(iter/s)": 1.575294 + }, + { + "acc": 0.6122798, + "epoch": 0.031075596144089293, + "grad_norm": 5.78125, + "learning_rate": 3.1075596144089297e-06, + "loss": 1.97807961, + "memory(GiB)": 55.86, + "step": 1225, + "train_speed(iter/s)": 1.576191 + }, + { + "acc": 0.60784287, + "epoch": 0.03120243531202435, + "grad_norm": 6.25, + "learning_rate": 3.120243531202435e-06, + "loss": 1.99195366, + "memory(GiB)": 55.86, + "step": 1230, + "train_speed(iter/s)": 1.577021 + }, + { + "acc": 0.59283824, + "epoch": 0.03132927447995941, + "grad_norm": 6.375, + "learning_rate": 3.1329274479959416e-06, + "loss": 2.04663258, + "memory(GiB)": 55.86, + "step": 1235, + "train_speed(iter/s)": 1.577879 + }, + { + "acc": 0.60226316, + "epoch": 0.03145611364789447, + "grad_norm": 6.34375, + "learning_rate": 3.145611364789447e-06, + "loss": 1.97379227, + "memory(GiB)": 55.86, + "step": 1240, + "train_speed(iter/s)": 1.578801 + }, + { + "acc": 0.60626063, + "epoch": 0.031582952815829526, + "grad_norm": 6.84375, + "learning_rate": 3.158295281582953e-06, + "loss": 2.02573814, + "memory(GiB)": 55.86, + "step": 1245, + "train_speed(iter/s)": 1.579659 + }, + { + "acc": 0.6025857, + "epoch": 0.031709791983764585, + "grad_norm": 6.90625, + "learning_rate": 3.1709791983764586e-06, + "loss": 1.97974224, + "memory(GiB)": 55.86, + "step": 1250, + "train_speed(iter/s)": 1.580496 + }, + { + "acc": 0.60810347, + "epoch": 0.03183663115169964, + "grad_norm": 6.28125, + "learning_rate": 3.183663115169965e-06, + "loss": 1.98515491, + "memory(GiB)": 55.86, + "step": 1255, + "train_speed(iter/s)": 1.58135 + }, + { + "acc": 0.60704136, + "epoch": 0.0319634703196347, + "grad_norm": 5.875, + "learning_rate": 3.1963470319634706e-06, + "loss": 2.01439991, + "memory(GiB)": 55.86, + "step": 1260, + "train_speed(iter/s)": 1.582191 + }, + { + "acc": 0.60790958, + "epoch": 0.03209030948756976, + "grad_norm": 6.15625, + "learning_rate": 3.2090309487569765e-06, + "loss": 1.93412552, + "memory(GiB)": 55.86, + "step": 1265, + "train_speed(iter/s)": 1.583018 + }, + { + "acc": 0.60558281, + "epoch": 0.03221714865550482, + "grad_norm": 6.3125, + "learning_rate": 3.221714865550482e-06, + "loss": 2.01308784, + "memory(GiB)": 55.86, + "step": 1270, + "train_speed(iter/s)": 1.583856 + }, + { + "acc": 0.60381126, + "epoch": 0.032343987823439876, + "grad_norm": 5.5625, + "learning_rate": 3.234398782343988e-06, + "loss": 2.00786781, + "memory(GiB)": 55.86, + "step": 1275, + "train_speed(iter/s)": 1.584656 + }, + { + "acc": 0.6057766, + "epoch": 0.032470826991374935, + "grad_norm": 5.28125, + "learning_rate": 3.2470826991374936e-06, + "loss": 1.9677681, + "memory(GiB)": 55.86, + "step": 1280, + "train_speed(iter/s)": 1.585491 + }, + { + "acc": 0.6083662, + "epoch": 0.03259766615930999, + "grad_norm": 6.46875, + "learning_rate": 3.259766615931e-06, + "loss": 1.98414745, + "memory(GiB)": 55.86, + "step": 1285, + "train_speed(iter/s)": 1.586271 + }, + { + "acc": 0.61575689, + "epoch": 0.03272450532724505, + "grad_norm": 5.8125, + "learning_rate": 3.2724505327245055e-06, + "loss": 1.91510162, + "memory(GiB)": 55.86, + "step": 1290, + "train_speed(iter/s)": 1.587107 + }, + { + "acc": 0.59676428, + "epoch": 0.03285134449518011, + "grad_norm": 6.4375, + "learning_rate": 3.2851344495180115e-06, + "loss": 1.98412094, + "memory(GiB)": 55.86, + "step": 1295, + "train_speed(iter/s)": 1.587839 + }, + { + "acc": 0.59345059, + "epoch": 0.03297818366311517, + "grad_norm": 6.0, + "learning_rate": 3.297818366311517e-06, + "loss": 2.06957817, + "memory(GiB)": 55.86, + "step": 1300, + "train_speed(iter/s)": 1.588593 + }, + { + "acc": 0.61568213, + "epoch": 0.033105022831050226, + "grad_norm": 9.125, + "learning_rate": 3.310502283105023e-06, + "loss": 1.91323452, + "memory(GiB)": 55.86, + "step": 1305, + "train_speed(iter/s)": 1.589424 + }, + { + "acc": 0.60652704, + "epoch": 0.033231861998985285, + "grad_norm": 5.8125, + "learning_rate": 3.3231861998985286e-06, + "loss": 1.94738922, + "memory(GiB)": 55.86, + "step": 1310, + "train_speed(iter/s)": 1.590254 + }, + { + "acc": 0.62636309, + "epoch": 0.03335870116692034, + "grad_norm": 5.71875, + "learning_rate": 3.335870116692035e-06, + "loss": 1.83315792, + "memory(GiB)": 55.86, + "step": 1315, + "train_speed(iter/s)": 1.591055 + }, + { + "acc": 0.60056524, + "epoch": 0.0334855403348554, + "grad_norm": 6.34375, + "learning_rate": 3.3485540334855405e-06, + "loss": 1.95279179, + "memory(GiB)": 55.86, + "step": 1320, + "train_speed(iter/s)": 1.591818 + }, + { + "acc": 0.6080533, + "epoch": 0.03361237950279046, + "grad_norm": 7.15625, + "learning_rate": 3.3612379502790465e-06, + "loss": 1.84236565, + "memory(GiB)": 55.86, + "step": 1325, + "train_speed(iter/s)": 1.592603 + }, + { + "acc": 0.61878386, + "epoch": 0.03373921867072552, + "grad_norm": 8.125, + "learning_rate": 3.373921867072552e-06, + "loss": 1.9659586, + "memory(GiB)": 55.86, + "step": 1330, + "train_speed(iter/s)": 1.593399 + }, + { + "acc": 0.60569687, + "epoch": 0.033866057838660577, + "grad_norm": 5.9375, + "learning_rate": 3.3866057838660584e-06, + "loss": 2.02585487, + "memory(GiB)": 55.86, + "step": 1335, + "train_speed(iter/s)": 1.594149 + }, + { + "acc": 0.60939317, + "epoch": 0.033992897006595635, + "grad_norm": 6.1875, + "learning_rate": 3.399289700659564e-06, + "loss": 1.92616482, + "memory(GiB)": 55.86, + "step": 1340, + "train_speed(iter/s)": 1.594921 + }, + { + "acc": 0.62465162, + "epoch": 0.03411973617453069, + "grad_norm": 5.625, + "learning_rate": 3.41197361745307e-06, + "loss": 1.93077335, + "memory(GiB)": 55.86, + "step": 1345, + "train_speed(iter/s)": 1.595681 + }, + { + "acc": 0.59684696, + "epoch": 0.03424657534246575, + "grad_norm": 5.71875, + "learning_rate": 3.4246575342465754e-06, + "loss": 1.93622208, + "memory(GiB)": 55.86, + "step": 1350, + "train_speed(iter/s)": 1.596412 + }, + { + "acc": 0.6266459, + "epoch": 0.03437341451040081, + "grad_norm": 6.59375, + "learning_rate": 3.4373414510400814e-06, + "loss": 1.88011494, + "memory(GiB)": 55.86, + "step": 1355, + "train_speed(iter/s)": 1.59716 + }, + { + "acc": 0.61261072, + "epoch": 0.03450025367833587, + "grad_norm": 7.4375, + "learning_rate": 3.450025367833587e-06, + "loss": 1.95805855, + "memory(GiB)": 55.86, + "step": 1360, + "train_speed(iter/s)": 1.597891 + }, + { + "acc": 0.60318518, + "epoch": 0.03462709284627093, + "grad_norm": 6.15625, + "learning_rate": 3.4627092846270933e-06, + "loss": 1.96464386, + "memory(GiB)": 55.86, + "step": 1365, + "train_speed(iter/s)": 1.598665 + }, + { + "acc": 0.59814706, + "epoch": 0.034753932014205985, + "grad_norm": 5.96875, + "learning_rate": 3.475393201420599e-06, + "loss": 1.96369362, + "memory(GiB)": 55.86, + "step": 1370, + "train_speed(iter/s)": 1.599384 + }, + { + "acc": 0.60429707, + "epoch": 0.03488077118214104, + "grad_norm": 5.84375, + "learning_rate": 3.488077118214105e-06, + "loss": 1.91435337, + "memory(GiB)": 55.86, + "step": 1375, + "train_speed(iter/s)": 1.600112 + }, + { + "acc": 0.61875076, + "epoch": 0.0350076103500761, + "grad_norm": 5.65625, + "learning_rate": 3.5007610350076104e-06, + "loss": 1.95240536, + "memory(GiB)": 55.86, + "step": 1380, + "train_speed(iter/s)": 1.60083 + }, + { + "acc": 0.61646738, + "epoch": 0.03513444951801116, + "grad_norm": 6.15625, + "learning_rate": 3.5134449518011164e-06, + "loss": 1.91961479, + "memory(GiB)": 55.86, + "step": 1385, + "train_speed(iter/s)": 1.601568 + }, + { + "acc": 0.59834375, + "epoch": 0.03526128868594622, + "grad_norm": 5.8125, + "learning_rate": 3.5261288685946223e-06, + "loss": 2.00084267, + "memory(GiB)": 55.86, + "step": 1390, + "train_speed(iter/s)": 1.602225 + }, + { + "acc": 0.58981743, + "epoch": 0.03538812785388128, + "grad_norm": 6.0625, + "learning_rate": 3.5388127853881283e-06, + "loss": 2.05951519, + "memory(GiB)": 55.86, + "step": 1395, + "train_speed(iter/s)": 1.60292 + }, + { + "acc": 0.59085937, + "epoch": 0.035514967021816335, + "grad_norm": 5.84375, + "learning_rate": 3.551496702181634e-06, + "loss": 1.9258213, + "memory(GiB)": 55.86, + "step": 1400, + "train_speed(iter/s)": 1.60363 + }, + { + "acc": 0.59757051, + "epoch": 0.03564180618975139, + "grad_norm": 7.53125, + "learning_rate": 3.56418061897514e-06, + "loss": 2.03836746, + "memory(GiB)": 55.86, + "step": 1405, + "train_speed(iter/s)": 1.604323 + }, + { + "acc": 0.5950069, + "epoch": 0.03576864535768645, + "grad_norm": 6.84375, + "learning_rate": 3.5768645357686453e-06, + "loss": 2.00823231, + "memory(GiB)": 55.86, + "step": 1410, + "train_speed(iter/s)": 1.605035 + }, + { + "acc": 0.60393982, + "epoch": 0.03589548452562151, + "grad_norm": 5.78125, + "learning_rate": 3.5895484525621517e-06, + "loss": 1.97716637, + "memory(GiB)": 55.86, + "step": 1415, + "train_speed(iter/s)": 1.605676 + }, + { + "acc": 0.61806784, + "epoch": 0.03602232369355657, + "grad_norm": 7.21875, + "learning_rate": 3.6022323693556573e-06, + "loss": 1.90677795, + "memory(GiB)": 55.86, + "step": 1420, + "train_speed(iter/s)": 1.606377 + }, + { + "acc": 0.60860581, + "epoch": 0.03614916286149163, + "grad_norm": 6.46875, + "learning_rate": 3.6149162861491632e-06, + "loss": 1.98624191, + "memory(GiB)": 55.86, + "step": 1425, + "train_speed(iter/s)": 1.607012 + }, + { + "acc": 0.61801763, + "epoch": 0.036276002029426685, + "grad_norm": 5.625, + "learning_rate": 3.6276002029426688e-06, + "loss": 1.89556847, + "memory(GiB)": 55.86, + "step": 1430, + "train_speed(iter/s)": 1.607647 + }, + { + "acc": 0.59453616, + "epoch": 0.03640284119736174, + "grad_norm": 6.34375, + "learning_rate": 3.6402841197361748e-06, + "loss": 1.99719849, + "memory(GiB)": 55.86, + "step": 1435, + "train_speed(iter/s)": 1.608291 + }, + { + "acc": 0.61071548, + "epoch": 0.0365296803652968, + "grad_norm": 7.6875, + "learning_rate": 3.6529680365296803e-06, + "loss": 1.98920689, + "memory(GiB)": 55.86, + "step": 1440, + "train_speed(iter/s)": 1.608964 + }, + { + "acc": 0.62530737, + "epoch": 0.03665651953323186, + "grad_norm": 5.6875, + "learning_rate": 3.6656519533231867e-06, + "loss": 1.93359718, + "memory(GiB)": 55.86, + "step": 1445, + "train_speed(iter/s)": 1.609581 + }, + { + "acc": 0.59595771, + "epoch": 0.03678335870116692, + "grad_norm": 5.5625, + "learning_rate": 3.6783358701166922e-06, + "loss": 2.05783997, + "memory(GiB)": 55.86, + "step": 1450, + "train_speed(iter/s)": 1.610219 + }, + { + "acc": 0.62097073, + "epoch": 0.03691019786910198, + "grad_norm": 5.53125, + "learning_rate": 3.691019786910198e-06, + "loss": 1.9678236, + "memory(GiB)": 55.86, + "step": 1455, + "train_speed(iter/s)": 1.610914 + }, + { + "acc": 0.61242352, + "epoch": 0.037037037037037035, + "grad_norm": 6.46875, + "learning_rate": 3.7037037037037037e-06, + "loss": 1.84934578, + "memory(GiB)": 55.86, + "step": 1460, + "train_speed(iter/s)": 1.611589 + }, + { + "acc": 0.60875711, + "epoch": 0.03716387620497209, + "grad_norm": 6.59375, + "learning_rate": 3.7163876204972097e-06, + "loss": 1.97669029, + "memory(GiB)": 55.86, + "step": 1465, + "train_speed(iter/s)": 1.612199 + }, + { + "acc": 0.60609407, + "epoch": 0.03729071537290715, + "grad_norm": 5.53125, + "learning_rate": 3.7290715372907157e-06, + "loss": 1.96448975, + "memory(GiB)": 55.86, + "step": 1470, + "train_speed(iter/s)": 1.612835 + }, + { + "acc": 0.60854063, + "epoch": 0.03741755454084221, + "grad_norm": 6.6875, + "learning_rate": 3.7417554540842216e-06, + "loss": 1.9614727, + "memory(GiB)": 55.86, + "step": 1475, + "train_speed(iter/s)": 1.613463 + }, + { + "acc": 0.6250205, + "epoch": 0.03754439370877727, + "grad_norm": 5.53125, + "learning_rate": 3.754439370877727e-06, + "loss": 1.88623543, + "memory(GiB)": 55.86, + "step": 1480, + "train_speed(iter/s)": 1.614113 + }, + { + "acc": 0.6044075, + "epoch": 0.03767123287671233, + "grad_norm": 6.90625, + "learning_rate": 3.767123287671233e-06, + "loss": 1.8949955, + "memory(GiB)": 55.86, + "step": 1485, + "train_speed(iter/s)": 1.614737 + }, + { + "acc": 0.60617695, + "epoch": 0.037798072044647385, + "grad_norm": 6.625, + "learning_rate": 3.7798072044647387e-06, + "loss": 1.95957718, + "memory(GiB)": 55.86, + "step": 1490, + "train_speed(iter/s)": 1.615374 + }, + { + "acc": 0.58851228, + "epoch": 0.03792491121258244, + "grad_norm": 7.71875, + "learning_rate": 3.792491121258245e-06, + "loss": 2.05935116, + "memory(GiB)": 55.86, + "step": 1495, + "train_speed(iter/s)": 1.615982 + }, + { + "acc": 0.59769611, + "epoch": 0.0380517503805175, + "grad_norm": 5.96875, + "learning_rate": 3.8051750380517506e-06, + "loss": 1.96287575, + "memory(GiB)": 55.86, + "step": 1500, + "train_speed(iter/s)": 1.61658 + }, + { + "acc": 0.60486722, + "epoch": 0.03817858954845256, + "grad_norm": 5.71875, + "learning_rate": 3.817858954845256e-06, + "loss": 1.93798161, + "memory(GiB)": 55.86, + "step": 1505, + "train_speed(iter/s)": 1.617179 + }, + { + "acc": 0.60161924, + "epoch": 0.03830542871638762, + "grad_norm": 6.9375, + "learning_rate": 3.830542871638762e-06, + "loss": 1.95662956, + "memory(GiB)": 55.86, + "step": 1510, + "train_speed(iter/s)": 1.617764 + }, + { + "acc": 0.60661659, + "epoch": 0.03843226788432268, + "grad_norm": 5.8125, + "learning_rate": 3.843226788432268e-06, + "loss": 1.90788918, + "memory(GiB)": 55.86, + "step": 1515, + "train_speed(iter/s)": 1.618326 + }, + { + "acc": 0.61499333, + "epoch": 0.038559107052257735, + "grad_norm": 9.75, + "learning_rate": 3.855910705225774e-06, + "loss": 1.90940704, + "memory(GiB)": 55.86, + "step": 1520, + "train_speed(iter/s)": 1.618894 + }, + { + "acc": 0.60315471, + "epoch": 0.03868594622019279, + "grad_norm": 5.65625, + "learning_rate": 3.86859462201928e-06, + "loss": 1.92785397, + "memory(GiB)": 55.86, + "step": 1525, + "train_speed(iter/s)": 1.619498 + }, + { + "acc": 0.59764004, + "epoch": 0.03881278538812785, + "grad_norm": 6.03125, + "learning_rate": 3.881278538812785e-06, + "loss": 1.95198708, + "memory(GiB)": 55.86, + "step": 1530, + "train_speed(iter/s)": 1.620092 + }, + { + "acc": 0.61831112, + "epoch": 0.03893962455606291, + "grad_norm": 6.34375, + "learning_rate": 3.893962455606292e-06, + "loss": 1.94009991, + "memory(GiB)": 55.86, + "step": 1535, + "train_speed(iter/s)": 1.620668 + }, + { + "acc": 0.62329388, + "epoch": 0.03906646372399797, + "grad_norm": 7.40625, + "learning_rate": 3.906646372399797e-06, + "loss": 1.85862293, + "memory(GiB)": 55.86, + "step": 1540, + "train_speed(iter/s)": 1.621287 + }, + { + "acc": 0.5830049, + "epoch": 0.03919330289193303, + "grad_norm": 6.15625, + "learning_rate": 3.919330289193303e-06, + "loss": 2.00897255, + "memory(GiB)": 55.86, + "step": 1545, + "train_speed(iter/s)": 1.621806 + }, + { + "acc": 0.5943553, + "epoch": 0.039320142059868085, + "grad_norm": 6.25, + "learning_rate": 3.932014205986809e-06, + "loss": 1.96419506, + "memory(GiB)": 55.86, + "step": 1550, + "train_speed(iter/s)": 1.62245 + }, + { + "acc": 0.62053938, + "epoch": 0.03944698122780314, + "grad_norm": 6.625, + "learning_rate": 3.944698122780315e-06, + "loss": 1.90389671, + "memory(GiB)": 55.86, + "step": 1555, + "train_speed(iter/s)": 1.623037 + }, + { + "acc": 0.60830789, + "epoch": 0.0395738203957382, + "grad_norm": 9.5, + "learning_rate": 3.95738203957382e-06, + "loss": 1.97414207, + "memory(GiB)": 55.86, + "step": 1560, + "train_speed(iter/s)": 1.623615 + }, + { + "acc": 0.61599102, + "epoch": 0.03970065956367326, + "grad_norm": 6.59375, + "learning_rate": 3.970065956367327e-06, + "loss": 1.9083416, + "memory(GiB)": 55.86, + "step": 1565, + "train_speed(iter/s)": 1.624233 + }, + { + "acc": 0.60456533, + "epoch": 0.03982749873160832, + "grad_norm": 6.09375, + "learning_rate": 3.982749873160832e-06, + "loss": 1.94531078, + "memory(GiB)": 55.86, + "step": 1570, + "train_speed(iter/s)": 1.624847 + }, + { + "acc": 0.58893738, + "epoch": 0.03995433789954338, + "grad_norm": 8.5, + "learning_rate": 3.995433789954338e-06, + "loss": 2.01982937, + "memory(GiB)": 55.86, + "step": 1575, + "train_speed(iter/s)": 1.625461 + }, + { + "acc": 0.61439967, + "epoch": 0.040081177067478435, + "grad_norm": 5.1875, + "learning_rate": 4.008117706747844e-06, + "loss": 1.96854458, + "memory(GiB)": 55.86, + "step": 1580, + "train_speed(iter/s)": 1.626016 + }, + { + "acc": 0.60537062, + "epoch": 0.04020801623541349, + "grad_norm": 7.65625, + "learning_rate": 4.02080162354135e-06, + "loss": 1.90504494, + "memory(GiB)": 55.86, + "step": 1585, + "train_speed(iter/s)": 1.626596 + }, + { + "acc": 0.62116857, + "epoch": 0.04033485540334855, + "grad_norm": 6.6875, + "learning_rate": 4.033485540334856e-06, + "loss": 1.89018879, + "memory(GiB)": 55.86, + "step": 1590, + "train_speed(iter/s)": 1.627163 + }, + { + "acc": 0.60072103, + "epoch": 0.04046169457128361, + "grad_norm": 5.4375, + "learning_rate": 4.046169457128362e-06, + "loss": 1.97833405, + "memory(GiB)": 55.86, + "step": 1595, + "train_speed(iter/s)": 1.627719 + }, + { + "acc": 0.5929966, + "epoch": 0.04058853373921867, + "grad_norm": 6.84375, + "learning_rate": 4.058853373921867e-06, + "loss": 1.9963623, + "memory(GiB)": 55.86, + "step": 1600, + "train_speed(iter/s)": 1.628297 + }, + { + "acc": 0.62433748, + "epoch": 0.04071537290715373, + "grad_norm": 5.75, + "learning_rate": 4.071537290715373e-06, + "loss": 1.83874855, + "memory(GiB)": 55.86, + "step": 1605, + "train_speed(iter/s)": 1.628839 + }, + { + "acc": 0.60330362, + "epoch": 0.040842212075088785, + "grad_norm": 7.78125, + "learning_rate": 4.084221207508879e-06, + "loss": 1.93582878, + "memory(GiB)": 55.86, + "step": 1610, + "train_speed(iter/s)": 1.629402 + }, + { + "acc": 0.60428982, + "epoch": 0.040969051243023843, + "grad_norm": 8.5625, + "learning_rate": 4.096905124302385e-06, + "loss": 1.9523243, + "memory(GiB)": 55.86, + "step": 1615, + "train_speed(iter/s)": 1.629974 + }, + { + "acc": 0.61570668, + "epoch": 0.0410958904109589, + "grad_norm": 6.0, + "learning_rate": 4.109589041095891e-06, + "loss": 1.8981863, + "memory(GiB)": 55.86, + "step": 1620, + "train_speed(iter/s)": 1.630511 + }, + { + "acc": 0.59638033, + "epoch": 0.04122272957889396, + "grad_norm": 6.25, + "learning_rate": 4.122272957889397e-06, + "loss": 2.06798515, + "memory(GiB)": 55.86, + "step": 1625, + "train_speed(iter/s)": 1.631054 + }, + { + "acc": 0.59134579, + "epoch": 0.04134956874682902, + "grad_norm": 7.875, + "learning_rate": 4.134956874682902e-06, + "loss": 1.95156212, + "memory(GiB)": 55.86, + "step": 1630, + "train_speed(iter/s)": 1.631589 + }, + { + "acc": 0.63578887, + "epoch": 0.04147640791476408, + "grad_norm": 6.28125, + "learning_rate": 4.147640791476408e-06, + "loss": 1.8266901, + "memory(GiB)": 55.86, + "step": 1635, + "train_speed(iter/s)": 1.632146 + }, + { + "acc": 0.60775814, + "epoch": 0.041603247082699135, + "grad_norm": 5.53125, + "learning_rate": 4.160324708269914e-06, + "loss": 1.98309479, + "memory(GiB)": 55.86, + "step": 1640, + "train_speed(iter/s)": 1.632649 + }, + { + "acc": 0.6237278, + "epoch": 0.041730086250634194, + "grad_norm": 5.625, + "learning_rate": 4.17300862506342e-06, + "loss": 1.82640667, + "memory(GiB)": 55.86, + "step": 1645, + "train_speed(iter/s)": 1.633162 + }, + { + "acc": 0.61207809, + "epoch": 0.04185692541856925, + "grad_norm": 6.21875, + "learning_rate": 4.185692541856926e-06, + "loss": 1.84851265, + "memory(GiB)": 55.86, + "step": 1650, + "train_speed(iter/s)": 1.633689 + }, + { + "acc": 0.61386757, + "epoch": 0.04198376458650431, + "grad_norm": 7.5625, + "learning_rate": 4.198376458650432e-06, + "loss": 1.88960037, + "memory(GiB)": 55.86, + "step": 1655, + "train_speed(iter/s)": 1.634223 + }, + { + "acc": 0.61731625, + "epoch": 0.04211060375443937, + "grad_norm": 6.96875, + "learning_rate": 4.211060375443937e-06, + "loss": 1.85162468, + "memory(GiB)": 55.86, + "step": 1660, + "train_speed(iter/s)": 1.63477 + }, + { + "acc": 0.59406047, + "epoch": 0.04223744292237443, + "grad_norm": 6.4375, + "learning_rate": 4.223744292237444e-06, + "loss": 2.05021172, + "memory(GiB)": 55.86, + "step": 1665, + "train_speed(iter/s)": 1.635297 + }, + { + "acc": 0.60849686, + "epoch": 0.042364282090309485, + "grad_norm": 5.8125, + "learning_rate": 4.236428209030949e-06, + "loss": 1.91598587, + "memory(GiB)": 55.86, + "step": 1670, + "train_speed(iter/s)": 1.635803 + }, + { + "acc": 0.599365, + "epoch": 0.042491121258244544, + "grad_norm": 7.28125, + "learning_rate": 4.249112125824455e-06, + "loss": 1.9791338, + "memory(GiB)": 55.86, + "step": 1675, + "train_speed(iter/s)": 1.636318 + }, + { + "acc": 0.6116539, + "epoch": 0.0426179604261796, + "grad_norm": 9.75, + "learning_rate": 4.261796042617961e-06, + "loss": 1.91320686, + "memory(GiB)": 55.86, + "step": 1680, + "train_speed(iter/s)": 1.636832 + }, + { + "acc": 0.63092976, + "epoch": 0.04274479959411466, + "grad_norm": 7.53125, + "learning_rate": 4.274479959411467e-06, + "loss": 1.83175011, + "memory(GiB)": 55.86, + "step": 1685, + "train_speed(iter/s)": 1.637347 + }, + { + "acc": 0.61188216, + "epoch": 0.04287163876204972, + "grad_norm": 5.75, + "learning_rate": 4.287163876204972e-06, + "loss": 1.84597168, + "memory(GiB)": 55.86, + "step": 1690, + "train_speed(iter/s)": 1.637889 + }, + { + "acc": 0.6056201, + "epoch": 0.04299847792998478, + "grad_norm": 6.125, + "learning_rate": 4.299847792998479e-06, + "loss": 1.87785301, + "memory(GiB)": 55.86, + "step": 1695, + "train_speed(iter/s)": 1.638433 + }, + { + "acc": 0.61106491, + "epoch": 0.043125317097919835, + "grad_norm": 7.4375, + "learning_rate": 4.312531709791984e-06, + "loss": 1.91492901, + "memory(GiB)": 55.86, + "step": 1700, + "train_speed(iter/s)": 1.638945 + }, + { + "acc": 0.59373679, + "epoch": 0.043252156265854894, + "grad_norm": 8.1875, + "learning_rate": 4.32521562658549e-06, + "loss": 1.9386467, + "memory(GiB)": 55.86, + "step": 1705, + "train_speed(iter/s)": 1.639446 + }, + { + "acc": 0.60725961, + "epoch": 0.04337899543378995, + "grad_norm": 6.65625, + "learning_rate": 4.337899543378996e-06, + "loss": 1.97910957, + "memory(GiB)": 55.86, + "step": 1710, + "train_speed(iter/s)": 1.639955 + }, + { + "acc": 0.58542371, + "epoch": 0.04350583460172501, + "grad_norm": 5.46875, + "learning_rate": 4.350583460172502e-06, + "loss": 1.97228317, + "memory(GiB)": 55.86, + "step": 1715, + "train_speed(iter/s)": 1.640438 + }, + { + "acc": 0.62461662, + "epoch": 0.04363267376966007, + "grad_norm": 6.6875, + "learning_rate": 4.363267376966007e-06, + "loss": 1.87988815, + "memory(GiB)": 55.86, + "step": 1720, + "train_speed(iter/s)": 1.64093 + }, + { + "acc": 0.60323882, + "epoch": 0.04375951293759513, + "grad_norm": 5.875, + "learning_rate": 4.375951293759514e-06, + "loss": 1.95035305, + "memory(GiB)": 55.86, + "step": 1725, + "train_speed(iter/s)": 1.64145 + }, + { + "acc": 0.61045351, + "epoch": 0.043886352105530185, + "grad_norm": 6.625, + "learning_rate": 4.388635210553019e-06, + "loss": 1.90547295, + "memory(GiB)": 55.86, + "step": 1730, + "train_speed(iter/s)": 1.64195 + }, + { + "acc": 0.60988708, + "epoch": 0.044013191273465244, + "grad_norm": 5.71875, + "learning_rate": 4.401319127346525e-06, + "loss": 1.92009602, + "memory(GiB)": 55.86, + "step": 1735, + "train_speed(iter/s)": 1.642447 + }, + { + "acc": 0.61460381, + "epoch": 0.0441400304414003, + "grad_norm": 8.875, + "learning_rate": 4.414003044140031e-06, + "loss": 1.88086433, + "memory(GiB)": 55.86, + "step": 1740, + "train_speed(iter/s)": 1.642955 + }, + { + "acc": 0.61463203, + "epoch": 0.04426686960933536, + "grad_norm": 6.40625, + "learning_rate": 4.426686960933537e-06, + "loss": 1.896945, + "memory(GiB)": 55.86, + "step": 1745, + "train_speed(iter/s)": 1.643396 + }, + { + "acc": 0.59492693, + "epoch": 0.04439370877727042, + "grad_norm": 6.28125, + "learning_rate": 4.439370877727043e-06, + "loss": 1.99612389, + "memory(GiB)": 55.86, + "step": 1750, + "train_speed(iter/s)": 1.643863 + }, + { + "acc": 0.60498633, + "epoch": 0.04452054794520548, + "grad_norm": 6.84375, + "learning_rate": 4.4520547945205486e-06, + "loss": 1.92270393, + "memory(GiB)": 55.86, + "step": 1755, + "train_speed(iter/s)": 1.644334 + }, + { + "acc": 0.62942419, + "epoch": 0.044647387113140535, + "grad_norm": 5.96875, + "learning_rate": 4.464738711314054e-06, + "loss": 1.85165215, + "memory(GiB)": 55.86, + "step": 1760, + "train_speed(iter/s)": 1.644771 + }, + { + "acc": 0.60646572, + "epoch": 0.044774226281075594, + "grad_norm": 6.0, + "learning_rate": 4.47742262810756e-06, + "loss": 1.96096783, + "memory(GiB)": 55.86, + "step": 1765, + "train_speed(iter/s)": 1.64526 + }, + { + "acc": 0.61349754, + "epoch": 0.04490106544901065, + "grad_norm": 5.4375, + "learning_rate": 4.490106544901066e-06, + "loss": 1.86078606, + "memory(GiB)": 55.86, + "step": 1770, + "train_speed(iter/s)": 1.645699 + }, + { + "acc": 0.61186399, + "epoch": 0.04502790461694571, + "grad_norm": 7.625, + "learning_rate": 4.502790461694572e-06, + "loss": 1.96916714, + "memory(GiB)": 55.86, + "step": 1775, + "train_speed(iter/s)": 1.64612 + }, + { + "acc": 0.60810881, + "epoch": 0.04515474378488077, + "grad_norm": 6.5625, + "learning_rate": 4.5154743784880776e-06, + "loss": 1.99163055, + "memory(GiB)": 55.86, + "step": 1780, + "train_speed(iter/s)": 1.646604 + }, + { + "acc": 0.61640735, + "epoch": 0.04528158295281583, + "grad_norm": 5.28125, + "learning_rate": 4.5281582952815835e-06, + "loss": 1.90900517, + "memory(GiB)": 55.86, + "step": 1785, + "train_speed(iter/s)": 1.647036 + }, + { + "acc": 0.61566954, + "epoch": 0.045408422120750885, + "grad_norm": 6.21875, + "learning_rate": 4.540842212075089e-06, + "loss": 1.86947441, + "memory(GiB)": 55.86, + "step": 1790, + "train_speed(iter/s)": 1.647483 + }, + { + "acc": 0.61108809, + "epoch": 0.045535261288685944, + "grad_norm": 6.65625, + "learning_rate": 4.553526128868595e-06, + "loss": 1.94466934, + "memory(GiB)": 55.86, + "step": 1795, + "train_speed(iter/s)": 1.647932 + }, + { + "acc": 0.60261269, + "epoch": 0.045662100456621, + "grad_norm": 4.65625, + "learning_rate": 4.566210045662101e-06, + "loss": 1.94084854, + "memory(GiB)": 55.86, + "step": 1800, + "train_speed(iter/s)": 1.648346 + }, + { + "acc": 0.63324766, + "epoch": 0.04578893962455606, + "grad_norm": 6.375, + "learning_rate": 4.5788939624556065e-06, + "loss": 1.78107758, + "memory(GiB)": 55.86, + "step": 1805, + "train_speed(iter/s)": 1.648777 + }, + { + "acc": 0.60749745, + "epoch": 0.04591577879249112, + "grad_norm": 5.3125, + "learning_rate": 4.5915778792491125e-06, + "loss": 1.91480827, + "memory(GiB)": 55.86, + "step": 1810, + "train_speed(iter/s)": 1.649204 + }, + { + "acc": 0.58367977, + "epoch": 0.04604261796042618, + "grad_norm": 8.3125, + "learning_rate": 4.6042617960426185e-06, + "loss": 2.02885132, + "memory(GiB)": 55.86, + "step": 1815, + "train_speed(iter/s)": 1.649647 + }, + { + "acc": 0.60988832, + "epoch": 0.046169457128361235, + "grad_norm": 7.0, + "learning_rate": 4.616945712836124e-06, + "loss": 1.90552158, + "memory(GiB)": 55.86, + "step": 1820, + "train_speed(iter/s)": 1.650116 + }, + { + "acc": 0.6232265, + "epoch": 0.046296296296296294, + "grad_norm": 7.5, + "learning_rate": 4.62962962962963e-06, + "loss": 1.86238213, + "memory(GiB)": 55.86, + "step": 1825, + "train_speed(iter/s)": 1.650541 + }, + { + "acc": 0.6233623, + "epoch": 0.04642313546423135, + "grad_norm": 7.125, + "learning_rate": 4.6423135464231355e-06, + "loss": 1.95918674, + "memory(GiB)": 55.86, + "step": 1830, + "train_speed(iter/s)": 1.650936 + }, + { + "acc": 0.63595634, + "epoch": 0.04654997463216641, + "grad_norm": 8.6875, + "learning_rate": 4.6549974632166415e-06, + "loss": 1.90061665, + "memory(GiB)": 55.86, + "step": 1835, + "train_speed(iter/s)": 1.651376 + }, + { + "acc": 0.63078909, + "epoch": 0.04667681380010147, + "grad_norm": 5.75, + "learning_rate": 4.6676813800101475e-06, + "loss": 1.84006805, + "memory(GiB)": 55.86, + "step": 1840, + "train_speed(iter/s)": 1.651804 + }, + { + "acc": 0.60643654, + "epoch": 0.04680365296803653, + "grad_norm": 6.3125, + "learning_rate": 4.6803652968036534e-06, + "loss": 1.89833241, + "memory(GiB)": 55.86, + "step": 1845, + "train_speed(iter/s)": 1.65222 + }, + { + "acc": 0.62370415, + "epoch": 0.046930492135971585, + "grad_norm": 6.3125, + "learning_rate": 4.6930492135971586e-06, + "loss": 1.85581799, + "memory(GiB)": 55.86, + "step": 1850, + "train_speed(iter/s)": 1.652628 + }, + { + "acc": 0.60888762, + "epoch": 0.047057331303906644, + "grad_norm": 7.03125, + "learning_rate": 4.705733130390665e-06, + "loss": 1.87097797, + "memory(GiB)": 55.86, + "step": 1855, + "train_speed(iter/s)": 1.653063 + }, + { + "acc": 0.59038219, + "epoch": 0.0471841704718417, + "grad_norm": 5.25, + "learning_rate": 4.7184170471841705e-06, + "loss": 1.92111855, + "memory(GiB)": 55.86, + "step": 1860, + "train_speed(iter/s)": 1.653456 + }, + { + "acc": 0.60695114, + "epoch": 0.04731100963977676, + "grad_norm": 6.28125, + "learning_rate": 4.7311009639776765e-06, + "loss": 1.93966103, + "memory(GiB)": 55.86, + "step": 1865, + "train_speed(iter/s)": 1.653841 + }, + { + "acc": 0.59265776, + "epoch": 0.04743784880771182, + "grad_norm": 6.5625, + "learning_rate": 4.743784880771182e-06, + "loss": 1.93526154, + "memory(GiB)": 55.86, + "step": 1870, + "train_speed(iter/s)": 1.654252 + }, + { + "acc": 0.60292034, + "epoch": 0.04756468797564688, + "grad_norm": 6.25, + "learning_rate": 4.756468797564688e-06, + "loss": 1.88684444, + "memory(GiB)": 55.86, + "step": 1875, + "train_speed(iter/s)": 1.654652 + }, + { + "acc": 0.62602434, + "epoch": 0.047691527143581935, + "grad_norm": 6.125, + "learning_rate": 4.769152714358194e-06, + "loss": 1.94059296, + "memory(GiB)": 55.86, + "step": 1880, + "train_speed(iter/s)": 1.65503 + }, + { + "acc": 0.62489433, + "epoch": 0.047818366311516994, + "grad_norm": 5.125, + "learning_rate": 4.7818366311517e-06, + "loss": 1.88831844, + "memory(GiB)": 55.86, + "step": 1885, + "train_speed(iter/s)": 1.655445 + }, + { + "acc": 0.61385436, + "epoch": 0.04794520547945205, + "grad_norm": 5.84375, + "learning_rate": 4.7945205479452054e-06, + "loss": 1.90837364, + "memory(GiB)": 55.86, + "step": 1890, + "train_speed(iter/s)": 1.655874 + }, + { + "acc": 0.62878113, + "epoch": 0.04807204464738711, + "grad_norm": 7.0, + "learning_rate": 4.807204464738711e-06, + "loss": 1.8503315, + "memory(GiB)": 55.86, + "step": 1895, + "train_speed(iter/s)": 1.656288 + }, + { + "acc": 0.61581697, + "epoch": 0.04819888381532217, + "grad_norm": 10.75, + "learning_rate": 4.819888381532217e-06, + "loss": 1.87218151, + "memory(GiB)": 55.86, + "step": 1900, + "train_speed(iter/s)": 1.656681 + }, + { + "acc": 0.61975689, + "epoch": 0.04832572298325723, + "grad_norm": 7.8125, + "learning_rate": 4.832572298325723e-06, + "loss": 1.8513237, + "memory(GiB)": 55.86, + "step": 1905, + "train_speed(iter/s)": 1.657069 + }, + { + "acc": 0.61313229, + "epoch": 0.048452562151192285, + "grad_norm": 7.46875, + "learning_rate": 4.845256215119229e-06, + "loss": 1.85294476, + "memory(GiB)": 55.86, + "step": 1910, + "train_speed(iter/s)": 1.65742 + }, + { + "acc": 0.61485438, + "epoch": 0.048579401319127344, + "grad_norm": 5.4375, + "learning_rate": 4.857940131912735e-06, + "loss": 1.87055016, + "memory(GiB)": 55.86, + "step": 1915, + "train_speed(iter/s)": 1.65782 + }, + { + "acc": 0.62736216, + "epoch": 0.0487062404870624, + "grad_norm": 6.0625, + "learning_rate": 4.87062404870624e-06, + "loss": 1.77576618, + "memory(GiB)": 55.86, + "step": 1920, + "train_speed(iter/s)": 1.658199 + }, + { + "acc": 0.60139713, + "epoch": 0.04883307965499746, + "grad_norm": 6.03125, + "learning_rate": 4.883307965499746e-06, + "loss": 1.91723747, + "memory(GiB)": 55.86, + "step": 1925, + "train_speed(iter/s)": 1.658582 + }, + { + "acc": 0.59416075, + "epoch": 0.04895991882293252, + "grad_norm": 6.40625, + "learning_rate": 4.895991882293252e-06, + "loss": 2.01217003, + "memory(GiB)": 55.86, + "step": 1930, + "train_speed(iter/s)": 1.658987 + }, + { + "acc": 0.61094227, + "epoch": 0.04908675799086758, + "grad_norm": 6.53125, + "learning_rate": 4.908675799086758e-06, + "loss": 1.86329765, + "memory(GiB)": 55.86, + "step": 1935, + "train_speed(iter/s)": 1.659354 + }, + { + "acc": 0.61396599, + "epoch": 0.049213597158802636, + "grad_norm": 6.46875, + "learning_rate": 4.921359715880264e-06, + "loss": 1.9199873, + "memory(GiB)": 55.86, + "step": 1940, + "train_speed(iter/s)": 1.65971 + }, + { + "acc": 0.61482, + "epoch": 0.049340436326737694, + "grad_norm": 5.71875, + "learning_rate": 4.93404363267377e-06, + "loss": 1.88470364, + "memory(GiB)": 55.86, + "step": 1945, + "train_speed(iter/s)": 1.660082 + }, + { + "acc": 0.63160276, + "epoch": 0.04946727549467275, + "grad_norm": 5.25, + "learning_rate": 4.946727549467275e-06, + "loss": 1.85397835, + "memory(GiB)": 55.86, + "step": 1950, + "train_speed(iter/s)": 1.660474 + }, + { + "acc": 0.61834173, + "epoch": 0.04959411466260781, + "grad_norm": 5.75, + "learning_rate": 4.959411466260781e-06, + "loss": 1.80206661, + "memory(GiB)": 55.86, + "step": 1955, + "train_speed(iter/s)": 1.66085 + }, + { + "acc": 0.61714339, + "epoch": 0.04972095383054287, + "grad_norm": 7.34375, + "learning_rate": 4.972095383054287e-06, + "loss": 1.91153755, + "memory(GiB)": 55.86, + "step": 1960, + "train_speed(iter/s)": 1.661242 + }, + { + "acc": 0.61616869, + "epoch": 0.04984779299847793, + "grad_norm": 6.1875, + "learning_rate": 4.984779299847793e-06, + "loss": 1.87368698, + "memory(GiB)": 55.86, + "step": 1965, + "train_speed(iter/s)": 1.661633 + }, + { + "acc": 0.61846704, + "epoch": 0.049974632166412986, + "grad_norm": 8.0625, + "learning_rate": 4.997463216641299e-06, + "loss": 1.90532875, + "memory(GiB)": 55.86, + "step": 1970, + "train_speed(iter/s)": 1.662005 + }, + { + "acc": 0.62661562, + "epoch": 0.050101471334348044, + "grad_norm": 6.46875, + "learning_rate": 5.010147133434805e-06, + "loss": 1.84291344, + "memory(GiB)": 55.86, + "step": 1975, + "train_speed(iter/s)": 1.66236 + }, + { + "acc": 0.62849011, + "epoch": 0.0502283105022831, + "grad_norm": 8.1875, + "learning_rate": 5.02283105022831e-06, + "loss": 1.80382233, + "memory(GiB)": 55.86, + "step": 1980, + "train_speed(iter/s)": 1.662726 + }, + { + "acc": 0.59887094, + "epoch": 0.05035514967021816, + "grad_norm": 7.28125, + "learning_rate": 5.035514967021817e-06, + "loss": 1.96077232, + "memory(GiB)": 55.86, + "step": 1985, + "train_speed(iter/s)": 1.663082 + }, + { + "acc": 0.61298971, + "epoch": 0.05048198883815322, + "grad_norm": 6.125, + "learning_rate": 5.048198883815323e-06, + "loss": 1.87825432, + "memory(GiB)": 55.86, + "step": 1990, + "train_speed(iter/s)": 1.663439 + }, + { + "acc": 0.59730964, + "epoch": 0.05060882800608828, + "grad_norm": 6.15625, + "learning_rate": 5.060882800608828e-06, + "loss": 1.9565403, + "memory(GiB)": 55.86, + "step": 1995, + "train_speed(iter/s)": 1.663826 + }, + { + "acc": 0.61201725, + "epoch": 0.050735667174023336, + "grad_norm": 6.8125, + "learning_rate": 5.073566717402334e-06, + "loss": 1.85511951, + "memory(GiB)": 55.86, + "step": 2000, + "train_speed(iter/s)": 1.664195 + }, + { + "epoch": 0.050735667174023336, + "eval_acc": 0.601371181793312, + "eval_loss": 1.8402973413467407, + "eval_runtime": 58.5184, + "eval_samples_per_second": 108.855, + "eval_steps_per_second": 27.222, + "step": 2000 + }, + { + "acc": 0.60759439, + "epoch": 0.050862506341958394, + "grad_norm": 7.03125, + "learning_rate": 5.086250634195841e-06, + "loss": 1.88536301, + "memory(GiB)": 55.86, + "step": 2005, + "train_speed(iter/s)": 1.582812 + }, + { + "acc": 0.60362034, + "epoch": 0.05098934550989345, + "grad_norm": 6.0625, + "learning_rate": 5.098934550989346e-06, + "loss": 1.85925827, + "memory(GiB)": 55.86, + "step": 2010, + "train_speed(iter/s)": 1.583373 + }, + { + "acc": 0.62760158, + "epoch": 0.05111618467782851, + "grad_norm": 8.375, + "learning_rate": 5.111618467782852e-06, + "loss": 1.84095268, + "memory(GiB)": 55.86, + "step": 2015, + "train_speed(iter/s)": 1.583881 + }, + { + "acc": 0.62201629, + "epoch": 0.05124302384576357, + "grad_norm": 5.53125, + "learning_rate": 5.124302384576357e-06, + "loss": 1.81402969, + "memory(GiB)": 55.86, + "step": 2020, + "train_speed(iter/s)": 1.584416 + }, + { + "acc": 0.62154894, + "epoch": 0.05136986301369863, + "grad_norm": 5.59375, + "learning_rate": 5.136986301369864e-06, + "loss": 1.83564453, + "memory(GiB)": 55.86, + "step": 2025, + "train_speed(iter/s)": 1.5849 + }, + { + "acc": 0.62282491, + "epoch": 0.051496702181633686, + "grad_norm": 6.71875, + "learning_rate": 5.149670218163369e-06, + "loss": 1.83527508, + "memory(GiB)": 55.86, + "step": 2030, + "train_speed(iter/s)": 1.585429 + }, + { + "acc": 0.62803555, + "epoch": 0.051623541349568744, + "grad_norm": 7.875, + "learning_rate": 5.162354134956875e-06, + "loss": 1.87986412, + "memory(GiB)": 55.86, + "step": 2035, + "train_speed(iter/s)": 1.585932 + }, + { + "acc": 0.62820811, + "epoch": 0.0517503805175038, + "grad_norm": 6.28125, + "learning_rate": 5.175038051750381e-06, + "loss": 1.85619965, + "memory(GiB)": 55.86, + "step": 2040, + "train_speed(iter/s)": 1.586465 + }, + { + "acc": 0.62108946, + "epoch": 0.05187721968543886, + "grad_norm": 10.5, + "learning_rate": 5.187721968543887e-06, + "loss": 1.90872402, + "memory(GiB)": 55.86, + "step": 2045, + "train_speed(iter/s)": 1.586983 + }, + { + "acc": 0.61195669, + "epoch": 0.05200405885337392, + "grad_norm": 6.59375, + "learning_rate": 5.200405885337393e-06, + "loss": 1.82115593, + "memory(GiB)": 58.86, + "step": 2050, + "train_speed(iter/s)": 1.587455 + }, + { + "acc": 0.62076006, + "epoch": 0.05213089802130898, + "grad_norm": 6.8125, + "learning_rate": 5.213089802130898e-06, + "loss": 1.85543823, + "memory(GiB)": 58.86, + "step": 2055, + "train_speed(iter/s)": 1.587975 + }, + { + "acc": 0.6161839, + "epoch": 0.052257737189244036, + "grad_norm": 5.65625, + "learning_rate": 5.225773718924404e-06, + "loss": 1.8410347, + "memory(GiB)": 58.86, + "step": 2060, + "train_speed(iter/s)": 1.588462 + }, + { + "acc": 0.58664618, + "epoch": 0.052384576357179094, + "grad_norm": 6.8125, + "learning_rate": 5.238457635717911e-06, + "loss": 2.01187592, + "memory(GiB)": 58.86, + "step": 2065, + "train_speed(iter/s)": 1.588928 + }, + { + "acc": 0.62029772, + "epoch": 0.05251141552511415, + "grad_norm": 7.125, + "learning_rate": 5.251141552511416e-06, + "loss": 1.8977087, + "memory(GiB)": 58.86, + "step": 2070, + "train_speed(iter/s)": 1.589409 + }, + { + "acc": 0.6000257, + "epoch": 0.05263825469304921, + "grad_norm": 5.5625, + "learning_rate": 5.263825469304922e-06, + "loss": 1.95394592, + "memory(GiB)": 58.86, + "step": 2075, + "train_speed(iter/s)": 1.589895 + }, + { + "acc": 0.59658971, + "epoch": 0.05276509386098427, + "grad_norm": 6.28125, + "learning_rate": 5.276509386098427e-06, + "loss": 1.94618568, + "memory(GiB)": 58.86, + "step": 2080, + "train_speed(iter/s)": 1.5904 + }, + { + "acc": 0.61407924, + "epoch": 0.05289193302891933, + "grad_norm": 6.75, + "learning_rate": 5.289193302891934e-06, + "loss": 1.87246552, + "memory(GiB)": 58.86, + "step": 2085, + "train_speed(iter/s)": 1.590786 + }, + { + "acc": 0.62276363, + "epoch": 0.053018772196854386, + "grad_norm": 5.5625, + "learning_rate": 5.30187721968544e-06, + "loss": 1.86005402, + "memory(GiB)": 58.86, + "step": 2090, + "train_speed(iter/s)": 1.591286 + }, + { + "acc": 0.6083406, + "epoch": 0.053145611364789444, + "grad_norm": 5.65625, + "learning_rate": 5.314561136478945e-06, + "loss": 1.90465031, + "memory(GiB)": 58.86, + "step": 2095, + "train_speed(iter/s)": 1.59175 + }, + { + "acc": 0.61667757, + "epoch": 0.0532724505327245, + "grad_norm": 6.21875, + "learning_rate": 5.327245053272451e-06, + "loss": 1.90533409, + "memory(GiB)": 58.86, + "step": 2100, + "train_speed(iter/s)": 1.592255 + }, + { + "acc": 0.64405117, + "epoch": 0.05339928970065956, + "grad_norm": 6.09375, + "learning_rate": 5.339928970065957e-06, + "loss": 1.75282745, + "memory(GiB)": 58.86, + "step": 2105, + "train_speed(iter/s)": 1.592748 + }, + { + "acc": 0.61783571, + "epoch": 0.05352612886859462, + "grad_norm": 7.75, + "learning_rate": 5.352612886859463e-06, + "loss": 1.91949978, + "memory(GiB)": 58.86, + "step": 2110, + "train_speed(iter/s)": 1.59325 + }, + { + "acc": 0.6170114, + "epoch": 0.05365296803652968, + "grad_norm": 7.09375, + "learning_rate": 5.365296803652969e-06, + "loss": 1.85956173, + "memory(GiB)": 58.86, + "step": 2115, + "train_speed(iter/s)": 1.593743 + }, + { + "acc": 0.61532836, + "epoch": 0.053779807204464736, + "grad_norm": 7.71875, + "learning_rate": 5.377980720446474e-06, + "loss": 1.81922073, + "memory(GiB)": 58.86, + "step": 2120, + "train_speed(iter/s)": 1.594196 + }, + { + "acc": 0.60407591, + "epoch": 0.053906646372399794, + "grad_norm": 6.8125, + "learning_rate": 5.390664637239981e-06, + "loss": 1.92536125, + "memory(GiB)": 58.86, + "step": 2125, + "train_speed(iter/s)": 1.594668 + }, + { + "acc": 0.61450729, + "epoch": 0.05403348554033485, + "grad_norm": 6.8125, + "learning_rate": 5.403348554033486e-06, + "loss": 1.8827652, + "memory(GiB)": 58.86, + "step": 2130, + "train_speed(iter/s)": 1.59517 + }, + { + "acc": 0.62003036, + "epoch": 0.05416032470826991, + "grad_norm": 6.09375, + "learning_rate": 5.416032470826992e-06, + "loss": 1.81562634, + "memory(GiB)": 58.86, + "step": 2135, + "train_speed(iter/s)": 1.595655 + }, + { + "acc": 0.62902613, + "epoch": 0.05428716387620497, + "grad_norm": 6.65625, + "learning_rate": 5.428716387620497e-06, + "loss": 1.82815914, + "memory(GiB)": 58.86, + "step": 2140, + "train_speed(iter/s)": 1.596142 + }, + { + "acc": 0.61863089, + "epoch": 0.05441400304414003, + "grad_norm": 6.21875, + "learning_rate": 5.441400304414004e-06, + "loss": 1.85329399, + "memory(GiB)": 58.86, + "step": 2145, + "train_speed(iter/s)": 1.596633 + }, + { + "acc": 0.61467247, + "epoch": 0.054540842212075086, + "grad_norm": 5.25, + "learning_rate": 5.45408422120751e-06, + "loss": 1.88357754, + "memory(GiB)": 58.86, + "step": 2150, + "train_speed(iter/s)": 1.597122 + }, + { + "acc": 0.61463175, + "epoch": 0.054667681380010144, + "grad_norm": 5.90625, + "learning_rate": 5.466768138001015e-06, + "loss": 1.94149857, + "memory(GiB)": 58.86, + "step": 2155, + "train_speed(iter/s)": 1.5976 + }, + { + "acc": 0.62385445, + "epoch": 0.0547945205479452, + "grad_norm": 5.28125, + "learning_rate": 5.479452054794521e-06, + "loss": 1.93745651, + "memory(GiB)": 58.86, + "step": 2160, + "train_speed(iter/s)": 1.598066 + }, + { + "acc": 0.60461903, + "epoch": 0.05492135971588026, + "grad_norm": 7.28125, + "learning_rate": 5.492135971588028e-06, + "loss": 1.86028633, + "memory(GiB)": 58.86, + "step": 2165, + "train_speed(iter/s)": 1.59853 + }, + { + "acc": 0.592383, + "epoch": 0.05504819888381532, + "grad_norm": 5.46875, + "learning_rate": 5.504819888381533e-06, + "loss": 1.97254333, + "memory(GiB)": 58.86, + "step": 2170, + "train_speed(iter/s)": 1.598994 + }, + { + "acc": 0.62739601, + "epoch": 0.05517503805175038, + "grad_norm": 5.53125, + "learning_rate": 5.517503805175039e-06, + "loss": 1.83993111, + "memory(GiB)": 58.86, + "step": 2175, + "train_speed(iter/s)": 1.599423 + }, + { + "acc": 0.61081505, + "epoch": 0.055301877219685436, + "grad_norm": 6.375, + "learning_rate": 5.530187721968544e-06, + "loss": 1.87527924, + "memory(GiB)": 58.86, + "step": 2180, + "train_speed(iter/s)": 1.599862 + }, + { + "acc": 0.6117691, + "epoch": 0.055428716387620494, + "grad_norm": 7.90625, + "learning_rate": 5.542871638762051e-06, + "loss": 1.89598579, + "memory(GiB)": 58.86, + "step": 2185, + "train_speed(iter/s)": 1.600323 + }, + { + "acc": 0.61759844, + "epoch": 0.05555555555555555, + "grad_norm": 6.46875, + "learning_rate": 5.555555555555557e-06, + "loss": 1.9203064, + "memory(GiB)": 58.86, + "step": 2190, + "train_speed(iter/s)": 1.600771 + }, + { + "acc": 0.62582135, + "epoch": 0.05568239472349061, + "grad_norm": 4.59375, + "learning_rate": 5.568239472349062e-06, + "loss": 1.8298502, + "memory(GiB)": 58.86, + "step": 2195, + "train_speed(iter/s)": 1.601227 + }, + { + "acc": 0.61843352, + "epoch": 0.05580923389142567, + "grad_norm": 8.125, + "learning_rate": 5.580923389142568e-06, + "loss": 1.88783493, + "memory(GiB)": 58.86, + "step": 2200, + "train_speed(iter/s)": 1.601684 + }, + { + "acc": 0.61050396, + "epoch": 0.05593607305936073, + "grad_norm": 6.59375, + "learning_rate": 5.593607305936074e-06, + "loss": 1.8683567, + "memory(GiB)": 58.86, + "step": 2205, + "train_speed(iter/s)": 1.602111 + }, + { + "acc": 0.61746693, + "epoch": 0.056062912227295786, + "grad_norm": 6.3125, + "learning_rate": 5.60629122272958e-06, + "loss": 1.84496689, + "memory(GiB)": 58.86, + "step": 2210, + "train_speed(iter/s)": 1.602564 + }, + { + "acc": 0.61757865, + "epoch": 0.056189751395230844, + "grad_norm": 6.71875, + "learning_rate": 5.618975139523085e-06, + "loss": 1.88137474, + "memory(GiB)": 58.86, + "step": 2215, + "train_speed(iter/s)": 1.603012 + }, + { + "acc": 0.62015538, + "epoch": 0.0563165905631659, + "grad_norm": 6.1875, + "learning_rate": 5.631659056316591e-06, + "loss": 1.81494751, + "memory(GiB)": 58.86, + "step": 2220, + "train_speed(iter/s)": 1.603463 + }, + { + "acc": 0.62243133, + "epoch": 0.05644342973110096, + "grad_norm": 6.5, + "learning_rate": 5.644342973110098e-06, + "loss": 1.85718117, + "memory(GiB)": 58.86, + "step": 2225, + "train_speed(iter/s)": 1.60388 + }, + { + "acc": 0.62325392, + "epoch": 0.05657026889903602, + "grad_norm": 7.3125, + "learning_rate": 5.657026889903603e-06, + "loss": 1.82916603, + "memory(GiB)": 58.86, + "step": 2230, + "train_speed(iter/s)": 1.604302 + }, + { + "acc": 0.60833759, + "epoch": 0.05669710806697108, + "grad_norm": 5.375, + "learning_rate": 5.669710806697109e-06, + "loss": 1.93776398, + "memory(GiB)": 58.86, + "step": 2235, + "train_speed(iter/s)": 1.604749 + }, + { + "acc": 0.61601906, + "epoch": 0.056823947234906136, + "grad_norm": 5.5625, + "learning_rate": 5.682394723490614e-06, + "loss": 1.86586227, + "memory(GiB)": 58.86, + "step": 2240, + "train_speed(iter/s)": 1.605201 + }, + { + "acc": 0.60844631, + "epoch": 0.056950786402841194, + "grad_norm": 5.8125, + "learning_rate": 5.695078640284121e-06, + "loss": 1.89643288, + "memory(GiB)": 58.86, + "step": 2245, + "train_speed(iter/s)": 1.605632 + }, + { + "acc": 0.61506748, + "epoch": 0.05707762557077625, + "grad_norm": 6.21875, + "learning_rate": 5.7077625570776266e-06, + "loss": 1.84293365, + "memory(GiB)": 58.86, + "step": 2250, + "train_speed(iter/s)": 1.606108 + }, + { + "acc": 0.61220007, + "epoch": 0.05720446473871131, + "grad_norm": 5.40625, + "learning_rate": 5.720446473871132e-06, + "loss": 1.8711916, + "memory(GiB)": 58.86, + "step": 2255, + "train_speed(iter/s)": 1.60655 + }, + { + "acc": 0.62895617, + "epoch": 0.05733130390664637, + "grad_norm": 9.3125, + "learning_rate": 5.733130390664638e-06, + "loss": 1.77090511, + "memory(GiB)": 58.86, + "step": 2260, + "train_speed(iter/s)": 1.607017 + }, + { + "acc": 0.611234, + "epoch": 0.05745814307458143, + "grad_norm": 6.59375, + "learning_rate": 5.7458143074581445e-06, + "loss": 1.87374725, + "memory(GiB)": 58.86, + "step": 2265, + "train_speed(iter/s)": 1.607444 + }, + { + "acc": 0.60035524, + "epoch": 0.057584982242516486, + "grad_norm": 6.25, + "learning_rate": 5.75849822425165e-06, + "loss": 1.92587185, + "memory(GiB)": 58.86, + "step": 2270, + "train_speed(iter/s)": 1.607901 + }, + { + "acc": 0.60545778, + "epoch": 0.057711821410451544, + "grad_norm": 6.625, + "learning_rate": 5.7711821410451556e-06, + "loss": 1.93162308, + "memory(GiB)": 58.86, + "step": 2275, + "train_speed(iter/s)": 1.608325 + }, + { + "acc": 0.61990809, + "epoch": 0.0578386605783866, + "grad_norm": 6.0625, + "learning_rate": 5.783866057838661e-06, + "loss": 1.83121796, + "memory(GiB)": 58.86, + "step": 2280, + "train_speed(iter/s)": 1.608759 + }, + { + "acc": 0.59965029, + "epoch": 0.05796549974632166, + "grad_norm": 6.375, + "learning_rate": 5.7965499746321675e-06, + "loss": 1.96522408, + "memory(GiB)": 58.86, + "step": 2285, + "train_speed(iter/s)": 1.609193 + }, + { + "acc": 0.61757164, + "epoch": 0.05809233891425672, + "grad_norm": 7.0625, + "learning_rate": 5.809233891425673e-06, + "loss": 1.86750145, + "memory(GiB)": 58.86, + "step": 2290, + "train_speed(iter/s)": 1.60959 + }, + { + "acc": 0.61162195, + "epoch": 0.05821917808219178, + "grad_norm": 6.25, + "learning_rate": 5.821917808219179e-06, + "loss": 1.86003838, + "memory(GiB)": 58.86, + "step": 2295, + "train_speed(iter/s)": 1.61003 + }, + { + "acc": 0.61908388, + "epoch": 0.058346017250126836, + "grad_norm": 7.1875, + "learning_rate": 5.834601725012684e-06, + "loss": 1.84045887, + "memory(GiB)": 58.86, + "step": 2300, + "train_speed(iter/s)": 1.610449 + }, + { + "acc": 0.61458035, + "epoch": 0.058472856418061894, + "grad_norm": 7.65625, + "learning_rate": 5.8472856418061905e-06, + "loss": 1.88287506, + "memory(GiB)": 58.86, + "step": 2305, + "train_speed(iter/s)": 1.610903 + }, + { + "acc": 0.61573048, + "epoch": 0.05859969558599695, + "grad_norm": 6.1875, + "learning_rate": 5.8599695585996965e-06, + "loss": 1.84793434, + "memory(GiB)": 58.86, + "step": 2310, + "train_speed(iter/s)": 1.61134 + }, + { + "acc": 0.61905918, + "epoch": 0.05872653475393201, + "grad_norm": 5.28125, + "learning_rate": 5.872653475393202e-06, + "loss": 1.80390949, + "memory(GiB)": 58.86, + "step": 2315, + "train_speed(iter/s)": 1.611776 + }, + { + "acc": 0.62466311, + "epoch": 0.05885337392186707, + "grad_norm": 6.0, + "learning_rate": 5.8853373921867076e-06, + "loss": 1.91816978, + "memory(GiB)": 58.86, + "step": 2320, + "train_speed(iter/s)": 1.612206 + }, + { + "acc": 0.63113375, + "epoch": 0.05898021308980213, + "grad_norm": 6.3125, + "learning_rate": 5.898021308980214e-06, + "loss": 1.79769382, + "memory(GiB)": 58.86, + "step": 2325, + "train_speed(iter/s)": 1.612635 + }, + { + "acc": 0.60280986, + "epoch": 0.059107052257737186, + "grad_norm": 6.34375, + "learning_rate": 5.9107052257737195e-06, + "loss": 1.98647652, + "memory(GiB)": 58.86, + "step": 2330, + "train_speed(iter/s)": 1.61304 + }, + { + "acc": 0.60971193, + "epoch": 0.059233891425672244, + "grad_norm": 5.46875, + "learning_rate": 5.9233891425672255e-06, + "loss": 1.92672958, + "memory(GiB)": 58.86, + "step": 2335, + "train_speed(iter/s)": 1.613439 + }, + { + "acc": 0.62279806, + "epoch": 0.0593607305936073, + "grad_norm": 5.6875, + "learning_rate": 5.936073059360731e-06, + "loss": 1.81108055, + "memory(GiB)": 58.86, + "step": 2340, + "train_speed(iter/s)": 1.613866 + }, + { + "acc": 0.61838765, + "epoch": 0.05948756976154236, + "grad_norm": 6.15625, + "learning_rate": 5.948756976154237e-06, + "loss": 1.8841713, + "memory(GiB)": 58.86, + "step": 2345, + "train_speed(iter/s)": 1.614266 + }, + { + "acc": 0.62070599, + "epoch": 0.05961440892947742, + "grad_norm": 6.34375, + "learning_rate": 5.961440892947743e-06, + "loss": 1.87567253, + "memory(GiB)": 58.86, + "step": 2350, + "train_speed(iter/s)": 1.614679 + }, + { + "acc": 0.63126111, + "epoch": 0.05974124809741248, + "grad_norm": 7.25, + "learning_rate": 5.9741248097412485e-06, + "loss": 1.7787096, + "memory(GiB)": 58.86, + "step": 2355, + "train_speed(iter/s)": 1.615087 + }, + { + "acc": 0.61884856, + "epoch": 0.059868087265347536, + "grad_norm": 5.75, + "learning_rate": 5.9868087265347545e-06, + "loss": 1.83313141, + "memory(GiB)": 58.86, + "step": 2360, + "train_speed(iter/s)": 1.615467 + }, + { + "acc": 0.61990161, + "epoch": 0.059994926433282594, + "grad_norm": 5.96875, + "learning_rate": 5.99949264332826e-06, + "loss": 1.80100517, + "memory(GiB)": 58.86, + "step": 2365, + "train_speed(iter/s)": 1.615866 + }, + { + "acc": 0.61794252, + "epoch": 0.06012176560121765, + "grad_norm": 5.28125, + "learning_rate": 6.012176560121766e-06, + "loss": 1.89589882, + "memory(GiB)": 58.86, + "step": 2370, + "train_speed(iter/s)": 1.616262 + }, + { + "acc": 0.62806449, + "epoch": 0.06024860476915271, + "grad_norm": 8.4375, + "learning_rate": 6.0248604769152715e-06, + "loss": 1.83353176, + "memory(GiB)": 58.86, + "step": 2375, + "train_speed(iter/s)": 1.616668 + }, + { + "acc": 0.62483759, + "epoch": 0.06037544393708777, + "grad_norm": 7.375, + "learning_rate": 6.0375443937087775e-06, + "loss": 1.77357826, + "memory(GiB)": 58.86, + "step": 2380, + "train_speed(iter/s)": 1.617065 + }, + { + "acc": 0.61796074, + "epoch": 0.06050228310502283, + "grad_norm": 5.78125, + "learning_rate": 6.050228310502284e-06, + "loss": 1.94466896, + "memory(GiB)": 58.86, + "step": 2385, + "train_speed(iter/s)": 1.617489 + }, + { + "acc": 0.62225666, + "epoch": 0.060629122272957886, + "grad_norm": 5.3125, + "learning_rate": 6.062912227295789e-06, + "loss": 1.85230179, + "memory(GiB)": 58.86, + "step": 2390, + "train_speed(iter/s)": 1.617878 + }, + { + "acc": 0.61804338, + "epoch": 0.060755961440892944, + "grad_norm": 6.59375, + "learning_rate": 6.075596144089295e-06, + "loss": 1.77972374, + "memory(GiB)": 58.86, + "step": 2395, + "train_speed(iter/s)": 1.618289 + }, + { + "acc": 0.61175203, + "epoch": 0.060882800608828, + "grad_norm": 7.28125, + "learning_rate": 6.0882800608828005e-06, + "loss": 1.92963619, + "memory(GiB)": 58.86, + "step": 2400, + "train_speed(iter/s)": 1.618705 + }, + { + "acc": 0.60799284, + "epoch": 0.06100963977676306, + "grad_norm": 5.34375, + "learning_rate": 6.100963977676307e-06, + "loss": 1.9112608, + "memory(GiB)": 58.86, + "step": 2405, + "train_speed(iter/s)": 1.619081 + }, + { + "acc": 0.60597339, + "epoch": 0.06113647894469812, + "grad_norm": 6.03125, + "learning_rate": 6.113647894469813e-06, + "loss": 1.90033302, + "memory(GiB)": 58.86, + "step": 2410, + "train_speed(iter/s)": 1.619485 + }, + { + "acc": 0.63157434, + "epoch": 0.06126331811263318, + "grad_norm": 5.6875, + "learning_rate": 6.126331811263318e-06, + "loss": 1.7876379, + "memory(GiB)": 58.86, + "step": 2415, + "train_speed(iter/s)": 1.619862 + }, + { + "acc": 0.63476391, + "epoch": 0.061390157280568236, + "grad_norm": 5.9375, + "learning_rate": 6.139015728056824e-06, + "loss": 1.80197964, + "memory(GiB)": 58.86, + "step": 2420, + "train_speed(iter/s)": 1.620255 + }, + { + "acc": 0.61584435, + "epoch": 0.061516996448503294, + "grad_norm": 5.96875, + "learning_rate": 6.151699644850331e-06, + "loss": 1.8763895, + "memory(GiB)": 58.86, + "step": 2425, + "train_speed(iter/s)": 1.620642 + }, + { + "acc": 0.63030863, + "epoch": 0.06164383561643835, + "grad_norm": 5.09375, + "learning_rate": 6.164383561643836e-06, + "loss": 1.77596703, + "memory(GiB)": 58.86, + "step": 2430, + "train_speed(iter/s)": 1.621042 + }, + { + "acc": 0.60907698, + "epoch": 0.06177067478437341, + "grad_norm": 5.9375, + "learning_rate": 6.177067478437342e-06, + "loss": 1.89386005, + "memory(GiB)": 58.86, + "step": 2435, + "train_speed(iter/s)": 1.621415 + }, + { + "acc": 0.61367273, + "epoch": 0.06189751395230847, + "grad_norm": 5.78125, + "learning_rate": 6.189751395230847e-06, + "loss": 1.88750191, + "memory(GiB)": 58.86, + "step": 2440, + "train_speed(iter/s)": 1.621802 + }, + { + "acc": 0.60630226, + "epoch": 0.06202435312024353, + "grad_norm": 10.125, + "learning_rate": 6.202435312024354e-06, + "loss": 1.90389862, + "memory(GiB)": 58.86, + "step": 2445, + "train_speed(iter/s)": 1.622167 + }, + { + "acc": 0.63596616, + "epoch": 0.062151192288178586, + "grad_norm": 6.875, + "learning_rate": 6.215119228817859e-06, + "loss": 1.74295464, + "memory(GiB)": 58.86, + "step": 2450, + "train_speed(iter/s)": 1.622545 + }, + { + "acc": 0.60849113, + "epoch": 0.062278031456113644, + "grad_norm": 5.5625, + "learning_rate": 6.227803145611365e-06, + "loss": 1.89324112, + "memory(GiB)": 58.86, + "step": 2455, + "train_speed(iter/s)": 1.622946 + }, + { + "acc": 0.60429263, + "epoch": 0.0624048706240487, + "grad_norm": 6.65625, + "learning_rate": 6.24048706240487e-06, + "loss": 1.89511909, + "memory(GiB)": 58.86, + "step": 2460, + "train_speed(iter/s)": 1.623309 + }, + { + "acc": 0.62859902, + "epoch": 0.06253170979198376, + "grad_norm": 8.1875, + "learning_rate": 6.253170979198377e-06, + "loss": 1.83797894, + "memory(GiB)": 58.86, + "step": 2465, + "train_speed(iter/s)": 1.623669 + }, + { + "acc": 0.61564684, + "epoch": 0.06265854895991882, + "grad_norm": 6.71875, + "learning_rate": 6.265854895991883e-06, + "loss": 1.86227913, + "memory(GiB)": 58.86, + "step": 2470, + "train_speed(iter/s)": 1.624028 + }, + { + "acc": 0.62765665, + "epoch": 0.06278538812785388, + "grad_norm": 6.5625, + "learning_rate": 6.278538812785388e-06, + "loss": 1.87822113, + "memory(GiB)": 58.86, + "step": 2475, + "train_speed(iter/s)": 1.624398 + }, + { + "acc": 0.62202787, + "epoch": 0.06291222729578894, + "grad_norm": 6.78125, + "learning_rate": 6.291222729578894e-06, + "loss": 1.84174767, + "memory(GiB)": 58.86, + "step": 2480, + "train_speed(iter/s)": 1.624764 + }, + { + "acc": 0.61823549, + "epoch": 0.063039066463724, + "grad_norm": 5.34375, + "learning_rate": 6.303906646372401e-06, + "loss": 1.92524567, + "memory(GiB)": 58.86, + "step": 2485, + "train_speed(iter/s)": 1.625121 + }, + { + "acc": 0.62868605, + "epoch": 0.06316590563165905, + "grad_norm": 7.75, + "learning_rate": 6.316590563165906e-06, + "loss": 1.82348976, + "memory(GiB)": 58.86, + "step": 2490, + "train_speed(iter/s)": 1.625492 + }, + { + "acc": 0.61429405, + "epoch": 0.06329274479959411, + "grad_norm": 6.625, + "learning_rate": 6.329274479959412e-06, + "loss": 1.84833794, + "memory(GiB)": 58.86, + "step": 2495, + "train_speed(iter/s)": 1.625888 + }, + { + "acc": 0.6139091, + "epoch": 0.06341958396752917, + "grad_norm": 5.84375, + "learning_rate": 6.341958396752917e-06, + "loss": 1.91333733, + "memory(GiB)": 58.86, + "step": 2500, + "train_speed(iter/s)": 1.626232 + }, + { + "acc": 0.62977324, + "epoch": 0.06354642313546423, + "grad_norm": 7.0, + "learning_rate": 6.354642313546424e-06, + "loss": 1.86739655, + "memory(GiB)": 58.86, + "step": 2505, + "train_speed(iter/s)": 1.626627 + }, + { + "acc": 0.61065321, + "epoch": 0.06367326230339929, + "grad_norm": 5.28125, + "learning_rate": 6.36732623033993e-06, + "loss": 1.8769371, + "memory(GiB)": 58.86, + "step": 2510, + "train_speed(iter/s)": 1.626989 + }, + { + "acc": 0.62435513, + "epoch": 0.06380010147133434, + "grad_norm": 5.25, + "learning_rate": 6.380010147133435e-06, + "loss": 1.76357212, + "memory(GiB)": 58.86, + "step": 2515, + "train_speed(iter/s)": 1.627336 + }, + { + "acc": 0.62186022, + "epoch": 0.0639269406392694, + "grad_norm": 5.8125, + "learning_rate": 6.392694063926941e-06, + "loss": 1.87364273, + "memory(GiB)": 58.86, + "step": 2520, + "train_speed(iter/s)": 1.627699 + }, + { + "acc": 0.60377526, + "epoch": 0.06405377980720446, + "grad_norm": 5.8125, + "learning_rate": 6.405377980720447e-06, + "loss": 1.89314537, + "memory(GiB)": 58.86, + "step": 2525, + "train_speed(iter/s)": 1.628049 + }, + { + "acc": 0.63855057, + "epoch": 0.06418061897513952, + "grad_norm": 7.75, + "learning_rate": 6.418061897513953e-06, + "loss": 1.75219975, + "memory(GiB)": 58.86, + "step": 2530, + "train_speed(iter/s)": 1.628442 + }, + { + "acc": 0.61520262, + "epoch": 0.06430745814307458, + "grad_norm": 7.90625, + "learning_rate": 6.430745814307458e-06, + "loss": 1.82468681, + "memory(GiB)": 58.86, + "step": 2535, + "train_speed(iter/s)": 1.628795 + }, + { + "acc": 0.6168025, + "epoch": 0.06443429731100964, + "grad_norm": 5.6875, + "learning_rate": 6.443429731100964e-06, + "loss": 1.86709976, + "memory(GiB)": 58.86, + "step": 2540, + "train_speed(iter/s)": 1.629145 + }, + { + "acc": 0.64176402, + "epoch": 0.0645611364789447, + "grad_norm": 6.15625, + "learning_rate": 6.456113647894471e-06, + "loss": 1.78402672, + "memory(GiB)": 58.86, + "step": 2545, + "train_speed(iter/s)": 1.629506 + }, + { + "acc": 0.62245908, + "epoch": 0.06468797564687975, + "grad_norm": 4.71875, + "learning_rate": 6.468797564687976e-06, + "loss": 1.80975246, + "memory(GiB)": 58.86, + "step": 2550, + "train_speed(iter/s)": 1.629843 + }, + { + "acc": 0.64164104, + "epoch": 0.06481481481481481, + "grad_norm": 6.78125, + "learning_rate": 6.481481481481482e-06, + "loss": 1.72489376, + "memory(GiB)": 58.86, + "step": 2555, + "train_speed(iter/s)": 1.6302 + }, + { + "acc": 0.62354574, + "epoch": 0.06494165398274987, + "grad_norm": 5.46875, + "learning_rate": 6.494165398274987e-06, + "loss": 1.83460884, + "memory(GiB)": 58.86, + "step": 2560, + "train_speed(iter/s)": 1.63058 + }, + { + "acc": 0.60753074, + "epoch": 0.06506849315068493, + "grad_norm": 5.96875, + "learning_rate": 6.506849315068494e-06, + "loss": 1.86012058, + "memory(GiB)": 58.86, + "step": 2565, + "train_speed(iter/s)": 1.630945 + }, + { + "acc": 0.61494865, + "epoch": 0.06519533231861999, + "grad_norm": 5.84375, + "learning_rate": 6.519533231862e-06, + "loss": 1.8817318, + "memory(GiB)": 58.86, + "step": 2570, + "train_speed(iter/s)": 1.631325 + }, + { + "acc": 0.62781019, + "epoch": 0.06532217148655504, + "grad_norm": 6.3125, + "learning_rate": 6.532217148655505e-06, + "loss": 1.79104652, + "memory(GiB)": 58.86, + "step": 2575, + "train_speed(iter/s)": 1.631683 + }, + { + "acc": 0.61440473, + "epoch": 0.0654490106544901, + "grad_norm": 5.34375, + "learning_rate": 6.544901065449011e-06, + "loss": 1.99972744, + "memory(GiB)": 58.86, + "step": 2580, + "train_speed(iter/s)": 1.632039 + }, + { + "acc": 0.60830774, + "epoch": 0.06557584982242516, + "grad_norm": 10.5, + "learning_rate": 6.557584982242518e-06, + "loss": 1.91051674, + "memory(GiB)": 58.86, + "step": 2585, + "train_speed(iter/s)": 1.632391 + }, + { + "acc": 0.620649, + "epoch": 0.06570268899036022, + "grad_norm": 5.78125, + "learning_rate": 6.570268899036023e-06, + "loss": 1.86778793, + "memory(GiB)": 58.86, + "step": 2590, + "train_speed(iter/s)": 1.63273 + }, + { + "acc": 0.61801691, + "epoch": 0.06582952815829528, + "grad_norm": 5.90625, + "learning_rate": 6.582952815829529e-06, + "loss": 1.8691227, + "memory(GiB)": 58.86, + "step": 2595, + "train_speed(iter/s)": 1.633066 + }, + { + "acc": 0.62306376, + "epoch": 0.06595636732623034, + "grad_norm": 6.65625, + "learning_rate": 6.595636732623034e-06, + "loss": 1.92729607, + "memory(GiB)": 58.86, + "step": 2600, + "train_speed(iter/s)": 1.633407 + }, + { + "acc": 0.62467918, + "epoch": 0.0660832064941654, + "grad_norm": 7.84375, + "learning_rate": 6.608320649416541e-06, + "loss": 1.80098896, + "memory(GiB)": 58.86, + "step": 2605, + "train_speed(iter/s)": 1.633762 + }, + { + "acc": 0.61991296, + "epoch": 0.06621004566210045, + "grad_norm": 6.9375, + "learning_rate": 6.621004566210046e-06, + "loss": 1.8717968, + "memory(GiB)": 58.86, + "step": 2610, + "train_speed(iter/s)": 1.634118 + }, + { + "acc": 0.62227192, + "epoch": 0.06633688483003551, + "grad_norm": 8.4375, + "learning_rate": 6.633688483003552e-06, + "loss": 1.82289581, + "memory(GiB)": 58.86, + "step": 2615, + "train_speed(iter/s)": 1.634445 + }, + { + "acc": 0.61607914, + "epoch": 0.06646372399797057, + "grad_norm": 8.875, + "learning_rate": 6.646372399797057e-06, + "loss": 1.83439407, + "memory(GiB)": 58.86, + "step": 2620, + "train_speed(iter/s)": 1.634792 + }, + { + "acc": 0.62091675, + "epoch": 0.06659056316590563, + "grad_norm": 7.25, + "learning_rate": 6.659056316590564e-06, + "loss": 1.84438725, + "memory(GiB)": 58.86, + "step": 2625, + "train_speed(iter/s)": 1.635129 + }, + { + "acc": 0.61466908, + "epoch": 0.06671740233384069, + "grad_norm": 7.9375, + "learning_rate": 6.67174023338407e-06, + "loss": 1.85825348, + "memory(GiB)": 58.86, + "step": 2630, + "train_speed(iter/s)": 1.635488 + }, + { + "acc": 0.62345629, + "epoch": 0.06684424150177574, + "grad_norm": 7.84375, + "learning_rate": 6.684424150177575e-06, + "loss": 1.85583801, + "memory(GiB)": 58.86, + "step": 2635, + "train_speed(iter/s)": 1.635806 + }, + { + "acc": 0.62290425, + "epoch": 0.0669710806697108, + "grad_norm": 7.0, + "learning_rate": 6.697108066971081e-06, + "loss": 1.89310856, + "memory(GiB)": 58.86, + "step": 2640, + "train_speed(iter/s)": 1.636168 + }, + { + "acc": 0.62458086, + "epoch": 0.06709791983764586, + "grad_norm": 7.0, + "learning_rate": 6.709791983764588e-06, + "loss": 1.79221916, + "memory(GiB)": 58.86, + "step": 2645, + "train_speed(iter/s)": 1.636509 + }, + { + "acc": 0.62424927, + "epoch": 0.06722475900558092, + "grad_norm": 7.375, + "learning_rate": 6.722475900558093e-06, + "loss": 1.82292728, + "memory(GiB)": 58.86, + "step": 2650, + "train_speed(iter/s)": 1.636837 + }, + { + "acc": 0.63885889, + "epoch": 0.06735159817351598, + "grad_norm": 10.8125, + "learning_rate": 6.735159817351599e-06, + "loss": 1.7846859, + "memory(GiB)": 58.86, + "step": 2655, + "train_speed(iter/s)": 1.637161 + }, + { + "acc": 0.63184495, + "epoch": 0.06747843734145104, + "grad_norm": 7.4375, + "learning_rate": 6.747843734145104e-06, + "loss": 1.83132782, + "memory(GiB)": 58.86, + "step": 2660, + "train_speed(iter/s)": 1.637523 + }, + { + "acc": 0.62372584, + "epoch": 0.0676052765093861, + "grad_norm": 6.1875, + "learning_rate": 6.760527650938611e-06, + "loss": 1.79030361, + "memory(GiB)": 58.86, + "step": 2665, + "train_speed(iter/s)": 1.637871 + }, + { + "acc": 0.61351156, + "epoch": 0.06773211567732115, + "grad_norm": 6.125, + "learning_rate": 6.773211567732117e-06, + "loss": 1.86900635, + "memory(GiB)": 58.86, + "step": 2670, + "train_speed(iter/s)": 1.638227 + }, + { + "acc": 0.6281414, + "epoch": 0.06785895484525621, + "grad_norm": 5.4375, + "learning_rate": 6.785895484525622e-06, + "loss": 1.90075188, + "memory(GiB)": 58.86, + "step": 2675, + "train_speed(iter/s)": 1.638557 + }, + { + "acc": 0.6361618, + "epoch": 0.06798579401319127, + "grad_norm": 6.3125, + "learning_rate": 6.798579401319128e-06, + "loss": 1.78285904, + "memory(GiB)": 58.86, + "step": 2680, + "train_speed(iter/s)": 1.638903 + }, + { + "acc": 0.62580462, + "epoch": 0.06811263318112633, + "grad_norm": 6.25, + "learning_rate": 6.811263318112634e-06, + "loss": 1.87772331, + "memory(GiB)": 58.86, + "step": 2685, + "train_speed(iter/s)": 1.639252 + }, + { + "acc": 0.6119997, + "epoch": 0.06823947234906139, + "grad_norm": 6.5, + "learning_rate": 6.82394723490614e-06, + "loss": 1.84669895, + "memory(GiB)": 58.86, + "step": 2690, + "train_speed(iter/s)": 1.639594 + }, + { + "acc": 0.62462873, + "epoch": 0.06836631151699644, + "grad_norm": 5.46875, + "learning_rate": 6.836631151699645e-06, + "loss": 1.86291733, + "memory(GiB)": 58.86, + "step": 2695, + "train_speed(iter/s)": 1.639927 + }, + { + "acc": 0.60608716, + "epoch": 0.0684931506849315, + "grad_norm": 7.0625, + "learning_rate": 6.849315068493151e-06, + "loss": 1.9127264, + "memory(GiB)": 58.86, + "step": 2700, + "train_speed(iter/s)": 1.640256 + }, + { + "acc": 0.61509128, + "epoch": 0.06861998985286656, + "grad_norm": 6.59375, + "learning_rate": 6.861998985286658e-06, + "loss": 1.89054337, + "memory(GiB)": 58.86, + "step": 2705, + "train_speed(iter/s)": 1.640593 + }, + { + "acc": 0.61531215, + "epoch": 0.06874682902080162, + "grad_norm": 6.8125, + "learning_rate": 6.874682902080163e-06, + "loss": 1.94848938, + "memory(GiB)": 58.86, + "step": 2710, + "train_speed(iter/s)": 1.640941 + }, + { + "acc": 0.62668724, + "epoch": 0.06887366818873668, + "grad_norm": 6.03125, + "learning_rate": 6.887366818873669e-06, + "loss": 1.79132462, + "memory(GiB)": 58.86, + "step": 2715, + "train_speed(iter/s)": 1.641258 + }, + { + "acc": 0.62493963, + "epoch": 0.06900050735667174, + "grad_norm": 6.96875, + "learning_rate": 6.900050735667174e-06, + "loss": 1.8403141, + "memory(GiB)": 58.86, + "step": 2720, + "train_speed(iter/s)": 1.6416 + }, + { + "acc": 0.62243023, + "epoch": 0.0691273465246068, + "grad_norm": 5.8125, + "learning_rate": 6.912734652460681e-06, + "loss": 1.82509041, + "memory(GiB)": 58.86, + "step": 2725, + "train_speed(iter/s)": 1.641932 + }, + { + "acc": 0.61616879, + "epoch": 0.06925418569254185, + "grad_norm": 9.875, + "learning_rate": 6.925418569254187e-06, + "loss": 1.81707802, + "memory(GiB)": 58.86, + "step": 2730, + "train_speed(iter/s)": 1.64227 + }, + { + "acc": 0.61606154, + "epoch": 0.06938102486047691, + "grad_norm": 6.21875, + "learning_rate": 6.938102486047692e-06, + "loss": 1.80503426, + "memory(GiB)": 58.86, + "step": 2735, + "train_speed(iter/s)": 1.642583 + }, + { + "acc": 0.61662331, + "epoch": 0.06950786402841197, + "grad_norm": 5.71875, + "learning_rate": 6.950786402841198e-06, + "loss": 1.82287674, + "memory(GiB)": 58.86, + "step": 2740, + "train_speed(iter/s)": 1.642902 + }, + { + "acc": 0.63214316, + "epoch": 0.06963470319634703, + "grad_norm": 5.9375, + "learning_rate": 6.9634703196347046e-06, + "loss": 1.77682457, + "memory(GiB)": 58.86, + "step": 2745, + "train_speed(iter/s)": 1.643234 + }, + { + "acc": 0.62459421, + "epoch": 0.06976154236428209, + "grad_norm": 5.78125, + "learning_rate": 6.97615423642821e-06, + "loss": 1.86816711, + "memory(GiB)": 58.86, + "step": 2750, + "train_speed(iter/s)": 1.643574 + }, + { + "acc": 0.61551743, + "epoch": 0.06988838153221714, + "grad_norm": 6.09375, + "learning_rate": 6.988838153221716e-06, + "loss": 1.86809044, + "memory(GiB)": 58.86, + "step": 2755, + "train_speed(iter/s)": 1.643898 + }, + { + "acc": 0.61875954, + "epoch": 0.0700152207001522, + "grad_norm": 5.59375, + "learning_rate": 7.001522070015221e-06, + "loss": 1.8576107, + "memory(GiB)": 58.86, + "step": 2760, + "train_speed(iter/s)": 1.644181 + }, + { + "acc": 0.63215113, + "epoch": 0.07014205986808726, + "grad_norm": 6.375, + "learning_rate": 7.014205986808728e-06, + "loss": 1.89916, + "memory(GiB)": 58.86, + "step": 2765, + "train_speed(iter/s)": 1.644487 + }, + { + "acc": 0.62541895, + "epoch": 0.07026889903602232, + "grad_norm": 6.40625, + "learning_rate": 7.026889903602233e-06, + "loss": 1.84470425, + "memory(GiB)": 58.86, + "step": 2770, + "train_speed(iter/s)": 1.6448 + }, + { + "acc": 0.62770796, + "epoch": 0.07039573820395738, + "grad_norm": 7.84375, + "learning_rate": 7.039573820395739e-06, + "loss": 1.83992119, + "memory(GiB)": 58.86, + "step": 2775, + "train_speed(iter/s)": 1.645114 + }, + { + "acc": 0.6036365, + "epoch": 0.07052257737189244, + "grad_norm": 6.1875, + "learning_rate": 7.052257737189245e-06, + "loss": 1.89441395, + "memory(GiB)": 58.86, + "step": 2780, + "train_speed(iter/s)": 1.64544 + }, + { + "acc": 0.62922888, + "epoch": 0.0706494165398275, + "grad_norm": 6.59375, + "learning_rate": 7.064941653982751e-06, + "loss": 1.83304195, + "memory(GiB)": 58.86, + "step": 2785, + "train_speed(iter/s)": 1.645744 + }, + { + "acc": 0.614466, + "epoch": 0.07077625570776255, + "grad_norm": 6.28125, + "learning_rate": 7.077625570776257e-06, + "loss": 1.87623177, + "memory(GiB)": 58.86, + "step": 2790, + "train_speed(iter/s)": 1.646046 + }, + { + "acc": 0.6314908, + "epoch": 0.07090309487569761, + "grad_norm": 6.90625, + "learning_rate": 7.090309487569762e-06, + "loss": 1.75695553, + "memory(GiB)": 58.86, + "step": 2795, + "train_speed(iter/s)": 1.646371 + }, + { + "acc": 0.64543848, + "epoch": 0.07102993404363267, + "grad_norm": 6.6875, + "learning_rate": 7.102993404363268e-06, + "loss": 1.75454407, + "memory(GiB)": 58.86, + "step": 2800, + "train_speed(iter/s)": 1.646667 + }, + { + "acc": 0.63171625, + "epoch": 0.07115677321156773, + "grad_norm": 6.5, + "learning_rate": 7.1156773211567745e-06, + "loss": 1.83182487, + "memory(GiB)": 58.86, + "step": 2805, + "train_speed(iter/s)": 1.646961 + }, + { + "acc": 0.63060598, + "epoch": 0.07128361237950279, + "grad_norm": 7.25, + "learning_rate": 7.12836123795028e-06, + "loss": 1.85082245, + "memory(GiB)": 58.86, + "step": 2810, + "train_speed(iter/s)": 1.647279 + }, + { + "acc": 0.60732064, + "epoch": 0.07141045154743784, + "grad_norm": 5.28125, + "learning_rate": 7.1410451547437856e-06, + "loss": 1.85291862, + "memory(GiB)": 58.86, + "step": 2815, + "train_speed(iter/s)": 1.647595 + }, + { + "acc": 0.61993437, + "epoch": 0.0715372907153729, + "grad_norm": 6.65625, + "learning_rate": 7.153729071537291e-06, + "loss": 1.76142082, + "memory(GiB)": 58.86, + "step": 2820, + "train_speed(iter/s)": 1.647889 + }, + { + "acc": 0.63077378, + "epoch": 0.07166412988330796, + "grad_norm": 5.875, + "learning_rate": 7.1664129883307975e-06, + "loss": 1.80787868, + "memory(GiB)": 58.86, + "step": 2825, + "train_speed(iter/s)": 1.648201 + }, + { + "acc": 0.64034786, + "epoch": 0.07179096905124302, + "grad_norm": 5.75, + "learning_rate": 7.1790969051243035e-06, + "loss": 1.76725464, + "memory(GiB)": 58.86, + "step": 2830, + "train_speed(iter/s)": 1.648504 + }, + { + "acc": 0.60083098, + "epoch": 0.07191780821917808, + "grad_norm": 8.0, + "learning_rate": 7.191780821917809e-06, + "loss": 1.93156967, + "memory(GiB)": 58.86, + "step": 2835, + "train_speed(iter/s)": 1.648821 + }, + { + "acc": 0.61838884, + "epoch": 0.07204464738711314, + "grad_norm": 6.625, + "learning_rate": 7.2044647387113146e-06, + "loss": 1.87825241, + "memory(GiB)": 58.86, + "step": 2840, + "train_speed(iter/s)": 1.649119 + }, + { + "acc": 0.62993474, + "epoch": 0.0721714865550482, + "grad_norm": 5.375, + "learning_rate": 7.2171486555048205e-06, + "loss": 1.78725872, + "memory(GiB)": 58.86, + "step": 2845, + "train_speed(iter/s)": 1.649423 + }, + { + "acc": 0.61560841, + "epoch": 0.07229832572298325, + "grad_norm": 6.0625, + "learning_rate": 7.2298325722983265e-06, + "loss": 1.83525448, + "memory(GiB)": 58.86, + "step": 2850, + "train_speed(iter/s)": 1.649738 + }, + { + "acc": 0.64150791, + "epoch": 0.07242516489091831, + "grad_norm": 6.4375, + "learning_rate": 7.242516489091832e-06, + "loss": 1.83049316, + "memory(GiB)": 58.86, + "step": 2855, + "train_speed(iter/s)": 1.650053 + }, + { + "acc": 0.62604313, + "epoch": 0.07255200405885337, + "grad_norm": 6.25, + "learning_rate": 7.2552004058853376e-06, + "loss": 1.77583008, + "memory(GiB)": 58.86, + "step": 2860, + "train_speed(iter/s)": 1.650342 + }, + { + "acc": 0.62669916, + "epoch": 0.07267884322678843, + "grad_norm": 6.03125, + "learning_rate": 7.267884322678844e-06, + "loss": 1.89665585, + "memory(GiB)": 58.86, + "step": 2865, + "train_speed(iter/s)": 1.650624 + }, + { + "acc": 0.62018375, + "epoch": 0.07280568239472349, + "grad_norm": 7.21875, + "learning_rate": 7.2805682394723495e-06, + "loss": 1.8666666, + "memory(GiB)": 58.86, + "step": 2870, + "train_speed(iter/s)": 1.650898 + }, + { + "acc": 0.61261616, + "epoch": 0.07293252156265854, + "grad_norm": 6.1875, + "learning_rate": 7.2932521562658555e-06, + "loss": 1.85211277, + "memory(GiB)": 58.86, + "step": 2875, + "train_speed(iter/s)": 1.651206 + }, + { + "acc": 0.6122263, + "epoch": 0.0730593607305936, + "grad_norm": 5.40625, + "learning_rate": 7.305936073059361e-06, + "loss": 1.83267956, + "memory(GiB)": 58.86, + "step": 2880, + "train_speed(iter/s)": 1.651484 + }, + { + "acc": 0.63585782, + "epoch": 0.07318619989852866, + "grad_norm": 5.15625, + "learning_rate": 7.318619989852867e-06, + "loss": 1.83922138, + "memory(GiB)": 58.86, + "step": 2885, + "train_speed(iter/s)": 1.651781 + }, + { + "acc": 0.61210012, + "epoch": 0.07331303906646372, + "grad_norm": 7.375, + "learning_rate": 7.331303906646373e-06, + "loss": 1.82524376, + "memory(GiB)": 58.86, + "step": 2890, + "train_speed(iter/s)": 1.652078 + }, + { + "acc": 0.63380165, + "epoch": 0.07343987823439878, + "grad_norm": 5.6875, + "learning_rate": 7.3439878234398785e-06, + "loss": 1.80623512, + "memory(GiB)": 58.86, + "step": 2895, + "train_speed(iter/s)": 1.652358 + }, + { + "acc": 0.61072969, + "epoch": 0.07356671740233384, + "grad_norm": 6.21875, + "learning_rate": 7.3566717402333845e-06, + "loss": 1.95809822, + "memory(GiB)": 58.86, + "step": 2900, + "train_speed(iter/s)": 1.652628 + }, + { + "acc": 0.62094526, + "epoch": 0.0736935565702689, + "grad_norm": 6.34375, + "learning_rate": 7.369355657026891e-06, + "loss": 1.83135757, + "memory(GiB)": 58.86, + "step": 2905, + "train_speed(iter/s)": 1.652915 + }, + { + "acc": 0.59886999, + "epoch": 0.07382039573820395, + "grad_norm": 4.9375, + "learning_rate": 7.382039573820396e-06, + "loss": 1.9506773, + "memory(GiB)": 58.86, + "step": 2910, + "train_speed(iter/s)": 1.653203 + }, + { + "acc": 0.62148046, + "epoch": 0.07394723490613901, + "grad_norm": 7.59375, + "learning_rate": 7.394723490613902e-06, + "loss": 1.86128883, + "memory(GiB)": 58.86, + "step": 2915, + "train_speed(iter/s)": 1.653505 + }, + { + "acc": 0.60896997, + "epoch": 0.07407407407407407, + "grad_norm": 5.9375, + "learning_rate": 7.4074074074074075e-06, + "loss": 1.9052927, + "memory(GiB)": 58.86, + "step": 2920, + "train_speed(iter/s)": 1.653763 + }, + { + "acc": 0.61220808, + "epoch": 0.07420091324200913, + "grad_norm": 6.0625, + "learning_rate": 7.420091324200914e-06, + "loss": 1.87214355, + "memory(GiB)": 58.86, + "step": 2925, + "train_speed(iter/s)": 1.654055 + }, + { + "acc": 0.61939154, + "epoch": 0.07432775240994419, + "grad_norm": 7.5, + "learning_rate": 7.432775240994419e-06, + "loss": 1.87435989, + "memory(GiB)": 58.86, + "step": 2930, + "train_speed(iter/s)": 1.654343 + }, + { + "acc": 0.63307371, + "epoch": 0.07445459157787924, + "grad_norm": 6.625, + "learning_rate": 7.445459157787925e-06, + "loss": 1.74041977, + "memory(GiB)": 58.86, + "step": 2935, + "train_speed(iter/s)": 1.65463 + }, + { + "acc": 0.631357, + "epoch": 0.0745814307458143, + "grad_norm": 6.3125, + "learning_rate": 7.458143074581431e-06, + "loss": 1.80306778, + "memory(GiB)": 58.86, + "step": 2940, + "train_speed(iter/s)": 1.654906 + }, + { + "acc": 0.64976625, + "epoch": 0.07470826991374936, + "grad_norm": 7.5625, + "learning_rate": 7.470826991374937e-06, + "loss": 1.70913353, + "memory(GiB)": 58.86, + "step": 2945, + "train_speed(iter/s)": 1.655179 + }, + { + "acc": 0.60800514, + "epoch": 0.07483510908168442, + "grad_norm": 8.4375, + "learning_rate": 7.483510908168443e-06, + "loss": 1.92534523, + "memory(GiB)": 58.86, + "step": 2950, + "train_speed(iter/s)": 1.655468 + }, + { + "acc": 0.63667126, + "epoch": 0.07496194824961948, + "grad_norm": 5.59375, + "learning_rate": 7.496194824961948e-06, + "loss": 1.7739357, + "memory(GiB)": 58.86, + "step": 2955, + "train_speed(iter/s)": 1.655722 + }, + { + "acc": 0.6314599, + "epoch": 0.07508878741755454, + "grad_norm": 5.5625, + "learning_rate": 7.508878741755454e-06, + "loss": 1.76697655, + "memory(GiB)": 58.86, + "step": 2960, + "train_speed(iter/s)": 1.656003 + }, + { + "acc": 0.62168779, + "epoch": 0.0752156265854896, + "grad_norm": 6.25, + "learning_rate": 7.521562658548961e-06, + "loss": 1.80277596, + "memory(GiB)": 58.86, + "step": 2965, + "train_speed(iter/s)": 1.656278 + }, + { + "acc": 0.63421741, + "epoch": 0.07534246575342465, + "grad_norm": 6.3125, + "learning_rate": 7.534246575342466e-06, + "loss": 1.79069939, + "memory(GiB)": 58.86, + "step": 2970, + "train_speed(iter/s)": 1.656559 + }, + { + "acc": 0.63341808, + "epoch": 0.07546930492135971, + "grad_norm": 5.46875, + "learning_rate": 7.546930492135972e-06, + "loss": 1.768433, + "memory(GiB)": 58.86, + "step": 2975, + "train_speed(iter/s)": 1.656832 + }, + { + "acc": 0.62834415, + "epoch": 0.07559614408929477, + "grad_norm": 8.0625, + "learning_rate": 7.559614408929477e-06, + "loss": 1.76345177, + "memory(GiB)": 58.86, + "step": 2980, + "train_speed(iter/s)": 1.657111 + }, + { + "acc": 0.62464895, + "epoch": 0.07572298325722983, + "grad_norm": 6.3125, + "learning_rate": 7.572298325722984e-06, + "loss": 1.82218533, + "memory(GiB)": 58.86, + "step": 2985, + "train_speed(iter/s)": 1.657393 + }, + { + "acc": 0.63267317, + "epoch": 0.07584982242516489, + "grad_norm": 6.15625, + "learning_rate": 7.58498224251649e-06, + "loss": 1.80164795, + "memory(GiB)": 58.86, + "step": 2990, + "train_speed(iter/s)": 1.657635 + }, + { + "acc": 0.61835299, + "epoch": 0.07597666159309995, + "grad_norm": 5.59375, + "learning_rate": 7.597666159309995e-06, + "loss": 1.84410496, + "memory(GiB)": 58.86, + "step": 2995, + "train_speed(iter/s)": 1.657911 + }, + { + "acc": 0.62701006, + "epoch": 0.076103500761035, + "grad_norm": 6.34375, + "learning_rate": 7.610350076103501e-06, + "loss": 1.81502705, + "memory(GiB)": 58.86, + "step": 3000, + "train_speed(iter/s)": 1.658185 + }, + { + "epoch": 0.076103500761035, + "eval_acc": 0.611511160952017, + "eval_loss": 1.7780622243881226, + "eval_runtime": 58.2332, + "eval_samples_per_second": 109.388, + "eval_steps_per_second": 27.356, + "step": 3000 + }, + { + "acc": 0.63783178, + "epoch": 0.07623033992897006, + "grad_norm": 4.90625, + "learning_rate": 7.623033992897007e-06, + "loss": 1.73620682, + "memory(GiB)": 58.86, + "step": 3005, + "train_speed(iter/s)": 1.603207 + }, + { + "acc": 0.61852002, + "epoch": 0.07635717909690512, + "grad_norm": 6.71875, + "learning_rate": 7.635717909690512e-06, + "loss": 1.84506187, + "memory(GiB)": 58.86, + "step": 3010, + "train_speed(iter/s)": 1.603545 + }, + { + "acc": 0.62819405, + "epoch": 0.07648401826484018, + "grad_norm": 6.71875, + "learning_rate": 7.648401826484018e-06, + "loss": 1.79380951, + "memory(GiB)": 58.86, + "step": 3015, + "train_speed(iter/s)": 1.603885 + }, + { + "acc": 0.63974152, + "epoch": 0.07661085743277524, + "grad_norm": 6.0, + "learning_rate": 7.661085743277524e-06, + "loss": 1.77939548, + "memory(GiB)": 58.86, + "step": 3020, + "train_speed(iter/s)": 1.604229 + }, + { + "acc": 0.63172846, + "epoch": 0.0767376966007103, + "grad_norm": 5.96875, + "learning_rate": 7.67376966007103e-06, + "loss": 1.81084213, + "memory(GiB)": 58.86, + "step": 3025, + "train_speed(iter/s)": 1.604564 + }, + { + "acc": 0.62624235, + "epoch": 0.07686453576864535, + "grad_norm": 5.5625, + "learning_rate": 7.686453576864536e-06, + "loss": 1.76887817, + "memory(GiB)": 58.86, + "step": 3030, + "train_speed(iter/s)": 1.604882 + }, + { + "acc": 0.61102958, + "epoch": 0.07699137493658041, + "grad_norm": 6.3125, + "learning_rate": 7.699137493658042e-06, + "loss": 1.83594704, + "memory(GiB)": 58.86, + "step": 3035, + "train_speed(iter/s)": 1.605209 + }, + { + "acc": 0.63113852, + "epoch": 0.07711821410451547, + "grad_norm": 6.1875, + "learning_rate": 7.711821410451548e-06, + "loss": 1.7567131, + "memory(GiB)": 58.86, + "step": 3040, + "train_speed(iter/s)": 1.605541 + }, + { + "acc": 0.6191988, + "epoch": 0.07724505327245053, + "grad_norm": 7.0, + "learning_rate": 7.724505327245054e-06, + "loss": 1.80660229, + "memory(GiB)": 58.86, + "step": 3045, + "train_speed(iter/s)": 1.605879 + }, + { + "acc": 0.62958927, + "epoch": 0.07737189244038559, + "grad_norm": 7.1875, + "learning_rate": 7.73718924403856e-06, + "loss": 1.79680157, + "memory(GiB)": 58.86, + "step": 3050, + "train_speed(iter/s)": 1.606219 + }, + { + "acc": 0.63014517, + "epoch": 0.07749873160832065, + "grad_norm": 5.21875, + "learning_rate": 7.749873160832066e-06, + "loss": 1.81305237, + "memory(GiB)": 58.86, + "step": 3055, + "train_speed(iter/s)": 1.606538 + }, + { + "acc": 0.62149038, + "epoch": 0.0776255707762557, + "grad_norm": 5.09375, + "learning_rate": 7.76255707762557e-06, + "loss": 1.79317703, + "memory(GiB)": 58.86, + "step": 3060, + "train_speed(iter/s)": 1.606861 + }, + { + "acc": 0.63585677, + "epoch": 0.07775240994419076, + "grad_norm": 6.71875, + "learning_rate": 7.775240994419078e-06, + "loss": 1.76710701, + "memory(GiB)": 58.86, + "step": 3065, + "train_speed(iter/s)": 1.607193 + }, + { + "acc": 0.62253685, + "epoch": 0.07787924911212582, + "grad_norm": 6.84375, + "learning_rate": 7.787924911212584e-06, + "loss": 1.83646851, + "memory(GiB)": 58.86, + "step": 3070, + "train_speed(iter/s)": 1.607499 + }, + { + "acc": 0.62994585, + "epoch": 0.07800608828006088, + "grad_norm": 7.25, + "learning_rate": 7.800608828006088e-06, + "loss": 1.83255272, + "memory(GiB)": 58.86, + "step": 3075, + "train_speed(iter/s)": 1.607822 + }, + { + "acc": 0.62628317, + "epoch": 0.07813292744799594, + "grad_norm": 5.9375, + "learning_rate": 7.813292744799594e-06, + "loss": 1.79040089, + "memory(GiB)": 58.86, + "step": 3080, + "train_speed(iter/s)": 1.608151 + }, + { + "acc": 0.6179585, + "epoch": 0.078259766615931, + "grad_norm": 6.15625, + "learning_rate": 7.8259766615931e-06, + "loss": 1.77247314, + "memory(GiB)": 58.86, + "step": 3085, + "train_speed(iter/s)": 1.608442 + }, + { + "acc": 0.62175183, + "epoch": 0.07838660578386605, + "grad_norm": 6.625, + "learning_rate": 7.838660578386606e-06, + "loss": 1.87037334, + "memory(GiB)": 58.86, + "step": 3090, + "train_speed(iter/s)": 1.608778 + }, + { + "acc": 0.62725267, + "epoch": 0.07851344495180111, + "grad_norm": 5.375, + "learning_rate": 7.851344495180112e-06, + "loss": 1.85108929, + "memory(GiB)": 58.86, + "step": 3095, + "train_speed(iter/s)": 1.609085 + }, + { + "acc": 0.62076268, + "epoch": 0.07864028411973617, + "grad_norm": 5.84375, + "learning_rate": 7.864028411973618e-06, + "loss": 1.83174133, + "memory(GiB)": 58.86, + "step": 3100, + "train_speed(iter/s)": 1.609377 + }, + { + "acc": 0.62446389, + "epoch": 0.07876712328767123, + "grad_norm": 6.28125, + "learning_rate": 7.876712328767124e-06, + "loss": 1.79310532, + "memory(GiB)": 58.86, + "step": 3105, + "train_speed(iter/s)": 1.609705 + }, + { + "acc": 0.61990042, + "epoch": 0.07889396245560629, + "grad_norm": 6.125, + "learning_rate": 7.88939624556063e-06, + "loss": 1.84493599, + "memory(GiB)": 58.86, + "step": 3110, + "train_speed(iter/s)": 1.610017 + }, + { + "acc": 0.63923712, + "epoch": 0.07902080162354135, + "grad_norm": 6.375, + "learning_rate": 7.902080162354136e-06, + "loss": 1.71876297, + "memory(GiB)": 58.86, + "step": 3115, + "train_speed(iter/s)": 1.610349 + }, + { + "acc": 0.65725875, + "epoch": 0.0791476407914764, + "grad_norm": 6.40625, + "learning_rate": 7.91476407914764e-06, + "loss": 1.68763828, + "memory(GiB)": 58.86, + "step": 3120, + "train_speed(iter/s)": 1.610685 + }, + { + "acc": 0.6308207, + "epoch": 0.07927447995941146, + "grad_norm": 7.0, + "learning_rate": 7.927447995941148e-06, + "loss": 1.7824398, + "memory(GiB)": 58.86, + "step": 3125, + "train_speed(iter/s)": 1.610988 + }, + { + "acc": 0.62523913, + "epoch": 0.07940131912734652, + "grad_norm": 6.71875, + "learning_rate": 7.940131912734654e-06, + "loss": 1.81446304, + "memory(GiB)": 58.86, + "step": 3130, + "train_speed(iter/s)": 1.611317 + }, + { + "acc": 0.63848977, + "epoch": 0.07952815829528158, + "grad_norm": 5.21875, + "learning_rate": 7.952815829528158e-06, + "loss": 1.73555603, + "memory(GiB)": 58.86, + "step": 3135, + "train_speed(iter/s)": 1.61164 + }, + { + "acc": 0.63784361, + "epoch": 0.07965499746321664, + "grad_norm": 6.375, + "learning_rate": 7.965499746321664e-06, + "loss": 1.82037811, + "memory(GiB)": 58.86, + "step": 3140, + "train_speed(iter/s)": 1.611936 + }, + { + "acc": 0.61641026, + "epoch": 0.0797818366311517, + "grad_norm": 5.71875, + "learning_rate": 7.978183663115172e-06, + "loss": 1.91095695, + "memory(GiB)": 58.86, + "step": 3145, + "train_speed(iter/s)": 1.612258 + }, + { + "acc": 0.6256465, + "epoch": 0.07990867579908675, + "grad_norm": 4.71875, + "learning_rate": 7.990867579908676e-06, + "loss": 1.82138786, + "memory(GiB)": 58.86, + "step": 3150, + "train_speed(iter/s)": 1.61257 + }, + { + "acc": 0.61393795, + "epoch": 0.08003551496702181, + "grad_norm": 5.625, + "learning_rate": 8.003551496702182e-06, + "loss": 1.85157642, + "memory(GiB)": 58.86, + "step": 3155, + "train_speed(iter/s)": 1.612871 + }, + { + "acc": 0.62110429, + "epoch": 0.08016235413495687, + "grad_norm": 5.40625, + "learning_rate": 8.016235413495688e-06, + "loss": 1.79541931, + "memory(GiB)": 58.86, + "step": 3160, + "train_speed(iter/s)": 1.613198 + }, + { + "acc": 0.61994252, + "epoch": 0.08028919330289193, + "grad_norm": 7.375, + "learning_rate": 8.028919330289194e-06, + "loss": 1.91337032, + "memory(GiB)": 58.86, + "step": 3165, + "train_speed(iter/s)": 1.61349 + }, + { + "acc": 0.63742933, + "epoch": 0.08041603247082699, + "grad_norm": 5.0, + "learning_rate": 8.0416032470827e-06, + "loss": 1.78433266, + "memory(GiB)": 58.86, + "step": 3170, + "train_speed(iter/s)": 1.613807 + }, + { + "acc": 0.61900368, + "epoch": 0.08054287163876205, + "grad_norm": 5.5, + "learning_rate": 8.054287163876206e-06, + "loss": 1.82017097, + "memory(GiB)": 58.86, + "step": 3175, + "train_speed(iter/s)": 1.61411 + }, + { + "acc": 0.62221527, + "epoch": 0.0806697108066971, + "grad_norm": 5.6875, + "learning_rate": 8.066971080669712e-06, + "loss": 1.80164413, + "memory(GiB)": 58.86, + "step": 3180, + "train_speed(iter/s)": 1.614429 + }, + { + "acc": 0.61236415, + "epoch": 0.08079654997463216, + "grad_norm": 7.34375, + "learning_rate": 8.079654997463218e-06, + "loss": 1.87927094, + "memory(GiB)": 58.86, + "step": 3185, + "train_speed(iter/s)": 1.614761 + }, + { + "acc": 0.62563963, + "epoch": 0.08092338914256722, + "grad_norm": 6.84375, + "learning_rate": 8.092338914256724e-06, + "loss": 1.78968449, + "memory(GiB)": 58.86, + "step": 3190, + "train_speed(iter/s)": 1.615053 + }, + { + "acc": 0.61603351, + "epoch": 0.08105022831050228, + "grad_norm": 6.625, + "learning_rate": 8.105022831050228e-06, + "loss": 1.75521469, + "memory(GiB)": 58.86, + "step": 3195, + "train_speed(iter/s)": 1.615353 + }, + { + "acc": 0.63678412, + "epoch": 0.08117706747843734, + "grad_norm": 5.59375, + "learning_rate": 8.117706747843734e-06, + "loss": 1.75136757, + "memory(GiB)": 58.86, + "step": 3200, + "train_speed(iter/s)": 1.61565 + }, + { + "acc": 0.6231986, + "epoch": 0.0813039066463724, + "grad_norm": 6.09375, + "learning_rate": 8.130390664637242e-06, + "loss": 1.77493153, + "memory(GiB)": 58.86, + "step": 3205, + "train_speed(iter/s)": 1.615954 + }, + { + "acc": 0.63598914, + "epoch": 0.08143074581430745, + "grad_norm": 5.46875, + "learning_rate": 8.143074581430746e-06, + "loss": 1.75733223, + "memory(GiB)": 58.86, + "step": 3210, + "train_speed(iter/s)": 1.616264 + }, + { + "acc": 0.63642306, + "epoch": 0.08155758498224251, + "grad_norm": 5.78125, + "learning_rate": 8.155758498224252e-06, + "loss": 1.78879604, + "memory(GiB)": 58.86, + "step": 3215, + "train_speed(iter/s)": 1.616574 + }, + { + "acc": 0.63304596, + "epoch": 0.08168442415017757, + "grad_norm": 6.5, + "learning_rate": 8.168442415017758e-06, + "loss": 1.7665226, + "memory(GiB)": 58.86, + "step": 3220, + "train_speed(iter/s)": 1.616879 + }, + { + "acc": 0.62088284, + "epoch": 0.08181126331811263, + "grad_norm": 6.375, + "learning_rate": 8.181126331811264e-06, + "loss": 1.85610085, + "memory(GiB)": 58.86, + "step": 3225, + "train_speed(iter/s)": 1.617189 + }, + { + "acc": 0.62163486, + "epoch": 0.08193810248604769, + "grad_norm": 5.1875, + "learning_rate": 8.19381024860477e-06, + "loss": 1.76511917, + "memory(GiB)": 58.86, + "step": 3230, + "train_speed(iter/s)": 1.617504 + }, + { + "acc": 0.62488856, + "epoch": 0.08206494165398275, + "grad_norm": 5.59375, + "learning_rate": 8.206494165398276e-06, + "loss": 1.69906731, + "memory(GiB)": 58.86, + "step": 3235, + "train_speed(iter/s)": 1.617814 + }, + { + "acc": 0.62290168, + "epoch": 0.0821917808219178, + "grad_norm": 6.6875, + "learning_rate": 8.219178082191782e-06, + "loss": 1.81975422, + "memory(GiB)": 58.86, + "step": 3240, + "train_speed(iter/s)": 1.61812 + }, + { + "acc": 0.62643986, + "epoch": 0.08231861998985286, + "grad_norm": 5.71875, + "learning_rate": 8.231861998985288e-06, + "loss": 1.77693825, + "memory(GiB)": 58.86, + "step": 3245, + "train_speed(iter/s)": 1.618406 + }, + { + "acc": 0.63714232, + "epoch": 0.08244545915778792, + "grad_norm": 5.75, + "learning_rate": 8.244545915778794e-06, + "loss": 1.71134491, + "memory(GiB)": 58.86, + "step": 3250, + "train_speed(iter/s)": 1.618702 + }, + { + "acc": 0.61583309, + "epoch": 0.08257229832572298, + "grad_norm": 4.90625, + "learning_rate": 8.2572298325723e-06, + "loss": 1.83567562, + "memory(GiB)": 58.86, + "step": 3255, + "train_speed(iter/s)": 1.61898 + }, + { + "acc": 0.6344367, + "epoch": 0.08269913749365804, + "grad_norm": 5.59375, + "learning_rate": 8.269913749365804e-06, + "loss": 1.73675213, + "memory(GiB)": 58.86, + "step": 3260, + "train_speed(iter/s)": 1.61927 + }, + { + "acc": 0.61676769, + "epoch": 0.0828259766615931, + "grad_norm": 8.3125, + "learning_rate": 8.282597666159312e-06, + "loss": 1.76578865, + "memory(GiB)": 58.86, + "step": 3265, + "train_speed(iter/s)": 1.619492 + }, + { + "acc": 0.6272892, + "epoch": 0.08295281582952815, + "grad_norm": 6.25, + "learning_rate": 8.295281582952816e-06, + "loss": 1.82483997, + "memory(GiB)": 58.86, + "step": 3270, + "train_speed(iter/s)": 1.619789 + }, + { + "acc": 0.6161901, + "epoch": 0.08307965499746321, + "grad_norm": 6.53125, + "learning_rate": 8.307965499746322e-06, + "loss": 1.76700649, + "memory(GiB)": 58.86, + "step": 3275, + "train_speed(iter/s)": 1.620067 + }, + { + "acc": 0.610918, + "epoch": 0.08320649416539827, + "grad_norm": 5.125, + "learning_rate": 8.320649416539828e-06, + "loss": 1.89990501, + "memory(GiB)": 58.86, + "step": 3280, + "train_speed(iter/s)": 1.620366 + }, + { + "acc": 0.6342813, + "epoch": 0.08333333333333333, + "grad_norm": 5.46875, + "learning_rate": 8.333333333333334e-06, + "loss": 1.7141571, + "memory(GiB)": 58.86, + "step": 3285, + "train_speed(iter/s)": 1.620671 + }, + { + "acc": 0.63332539, + "epoch": 0.08346017250126839, + "grad_norm": 7.21875, + "learning_rate": 8.34601725012684e-06, + "loss": 1.6897047, + "memory(GiB)": 58.86, + "step": 3290, + "train_speed(iter/s)": 1.620973 + }, + { + "acc": 0.6279213, + "epoch": 0.08358701166920345, + "grad_norm": 8.5625, + "learning_rate": 8.358701166920346e-06, + "loss": 1.83149261, + "memory(GiB)": 58.86, + "step": 3295, + "train_speed(iter/s)": 1.621263 + }, + { + "acc": 0.60932512, + "epoch": 0.0837138508371385, + "grad_norm": 5.875, + "learning_rate": 8.371385083713852e-06, + "loss": 1.89715729, + "memory(GiB)": 58.86, + "step": 3300, + "train_speed(iter/s)": 1.621549 + }, + { + "acc": 0.61546173, + "epoch": 0.08384069000507356, + "grad_norm": 6.71875, + "learning_rate": 8.384069000507358e-06, + "loss": 1.84220676, + "memory(GiB)": 58.86, + "step": 3305, + "train_speed(iter/s)": 1.621842 + }, + { + "acc": 0.61634483, + "epoch": 0.08396752917300862, + "grad_norm": 5.15625, + "learning_rate": 8.396752917300864e-06, + "loss": 1.82034645, + "memory(GiB)": 58.86, + "step": 3310, + "train_speed(iter/s)": 1.622119 + }, + { + "acc": 0.61833067, + "epoch": 0.08409436834094368, + "grad_norm": 5.71875, + "learning_rate": 8.40943683409437e-06, + "loss": 1.78616543, + "memory(GiB)": 58.86, + "step": 3315, + "train_speed(iter/s)": 1.622398 + }, + { + "acc": 0.62271051, + "epoch": 0.08422120750887874, + "grad_norm": 6.46875, + "learning_rate": 8.422120750887874e-06, + "loss": 1.83358517, + "memory(GiB)": 58.86, + "step": 3320, + "train_speed(iter/s)": 1.622639 + }, + { + "acc": 0.62470541, + "epoch": 0.0843480466768138, + "grad_norm": 5.59375, + "learning_rate": 8.434804667681381e-06, + "loss": 1.74936008, + "memory(GiB)": 58.86, + "step": 3325, + "train_speed(iter/s)": 1.62293 + }, + { + "acc": 0.62520065, + "epoch": 0.08447488584474885, + "grad_norm": 6.28125, + "learning_rate": 8.447488584474887e-06, + "loss": 1.81033707, + "memory(GiB)": 58.86, + "step": 3330, + "train_speed(iter/s)": 1.623192 + }, + { + "acc": 0.6237464, + "epoch": 0.08460172501268391, + "grad_norm": 7.375, + "learning_rate": 8.460172501268392e-06, + "loss": 1.81594048, + "memory(GiB)": 58.86, + "step": 3335, + "train_speed(iter/s)": 1.62346 + }, + { + "acc": 0.62431679, + "epoch": 0.08472856418061897, + "grad_norm": 5.15625, + "learning_rate": 8.472856418061898e-06, + "loss": 1.77585602, + "memory(GiB)": 58.86, + "step": 3340, + "train_speed(iter/s)": 1.623721 + }, + { + "acc": 0.64794264, + "epoch": 0.08485540334855403, + "grad_norm": 5.71875, + "learning_rate": 8.485540334855404e-06, + "loss": 1.729105, + "memory(GiB)": 58.86, + "step": 3345, + "train_speed(iter/s)": 1.624004 + }, + { + "acc": 0.64635601, + "epoch": 0.08498224251648909, + "grad_norm": 4.4375, + "learning_rate": 8.49822425164891e-06, + "loss": 1.70486851, + "memory(GiB)": 58.86, + "step": 3350, + "train_speed(iter/s)": 1.62426 + }, + { + "acc": 0.62374363, + "epoch": 0.08510908168442415, + "grad_norm": 5.15625, + "learning_rate": 8.510908168442416e-06, + "loss": 1.81973915, + "memory(GiB)": 58.86, + "step": 3355, + "train_speed(iter/s)": 1.624535 + }, + { + "acc": 0.60954399, + "epoch": 0.0852359208523592, + "grad_norm": 6.15625, + "learning_rate": 8.523592085235922e-06, + "loss": 1.87591019, + "memory(GiB)": 58.86, + "step": 3360, + "train_speed(iter/s)": 1.62481 + }, + { + "acc": 0.61560974, + "epoch": 0.08536276002029426, + "grad_norm": 5.28125, + "learning_rate": 8.536276002029428e-06, + "loss": 1.78173676, + "memory(GiB)": 58.86, + "step": 3365, + "train_speed(iter/s)": 1.625082 + }, + { + "acc": 0.62815838, + "epoch": 0.08548959918822932, + "grad_norm": 7.25, + "learning_rate": 8.548959918822933e-06, + "loss": 1.86471519, + "memory(GiB)": 58.86, + "step": 3370, + "train_speed(iter/s)": 1.625369 + }, + { + "acc": 0.61993814, + "epoch": 0.08561643835616438, + "grad_norm": 5.59375, + "learning_rate": 8.56164383561644e-06, + "loss": 1.8225563, + "memory(GiB)": 58.86, + "step": 3375, + "train_speed(iter/s)": 1.625629 + }, + { + "acc": 0.63935537, + "epoch": 0.08574327752409944, + "grad_norm": 7.1875, + "learning_rate": 8.574327752409944e-06, + "loss": 1.7402483, + "memory(GiB)": 58.86, + "step": 3380, + "train_speed(iter/s)": 1.625892 + }, + { + "acc": 0.61767402, + "epoch": 0.0858701166920345, + "grad_norm": 6.28125, + "learning_rate": 8.587011669203451e-06, + "loss": 1.80435104, + "memory(GiB)": 58.86, + "step": 3385, + "train_speed(iter/s)": 1.62616 + }, + { + "acc": 0.64225984, + "epoch": 0.08599695585996955, + "grad_norm": 6.59375, + "learning_rate": 8.599695585996957e-06, + "loss": 1.71389217, + "memory(GiB)": 58.86, + "step": 3390, + "train_speed(iter/s)": 1.626428 + }, + { + "acc": 0.62173142, + "epoch": 0.08612379502790461, + "grad_norm": 6.28125, + "learning_rate": 8.612379502790462e-06, + "loss": 1.85017319, + "memory(GiB)": 58.86, + "step": 3395, + "train_speed(iter/s)": 1.626673 + }, + { + "acc": 0.63053288, + "epoch": 0.08625063419583967, + "grad_norm": 6.0, + "learning_rate": 8.625063419583968e-06, + "loss": 1.80282078, + "memory(GiB)": 58.86, + "step": 3400, + "train_speed(iter/s)": 1.626941 + }, + { + "acc": 0.63656359, + "epoch": 0.08637747336377473, + "grad_norm": 6.15625, + "learning_rate": 8.637747336377475e-06, + "loss": 1.81014576, + "memory(GiB)": 58.86, + "step": 3405, + "train_speed(iter/s)": 1.627219 + }, + { + "acc": 0.63236113, + "epoch": 0.08650431253170979, + "grad_norm": 6.625, + "learning_rate": 8.65043125317098e-06, + "loss": 1.8170887, + "memory(GiB)": 58.86, + "step": 3410, + "train_speed(iter/s)": 1.627505 + }, + { + "acc": 0.6143199, + "epoch": 0.08663115169964485, + "grad_norm": 6.59375, + "learning_rate": 8.663115169964485e-06, + "loss": 1.77595348, + "memory(GiB)": 58.86, + "step": 3415, + "train_speed(iter/s)": 1.627779 + }, + { + "acc": 0.60883551, + "epoch": 0.0867579908675799, + "grad_norm": 7.96875, + "learning_rate": 8.675799086757991e-06, + "loss": 1.87337475, + "memory(GiB)": 58.86, + "step": 3420, + "train_speed(iter/s)": 1.628045 + }, + { + "acc": 0.62656479, + "epoch": 0.08688483003551496, + "grad_norm": 6.46875, + "learning_rate": 8.688483003551497e-06, + "loss": 1.85432644, + "memory(GiB)": 58.86, + "step": 3425, + "train_speed(iter/s)": 1.628306 + }, + { + "acc": 0.63665709, + "epoch": 0.08701166920345002, + "grad_norm": 5.53125, + "learning_rate": 8.701166920345003e-06, + "loss": 1.76383171, + "memory(GiB)": 58.86, + "step": 3430, + "train_speed(iter/s)": 1.62857 + }, + { + "acc": 0.62604742, + "epoch": 0.08713850837138508, + "grad_norm": 5.71875, + "learning_rate": 8.71385083713851e-06, + "loss": 1.82312603, + "memory(GiB)": 67.97, + "step": 3435, + "train_speed(iter/s)": 1.628831 + }, + { + "acc": 0.6141346, + "epoch": 0.08726534753932014, + "grad_norm": 6.375, + "learning_rate": 8.726534753932014e-06, + "loss": 1.90559444, + "memory(GiB)": 67.97, + "step": 3440, + "train_speed(iter/s)": 1.629102 + }, + { + "acc": 0.62872844, + "epoch": 0.0873921867072552, + "grad_norm": 6.6875, + "learning_rate": 8.739218670725521e-06, + "loss": 1.76434479, + "memory(GiB)": 67.97, + "step": 3445, + "train_speed(iter/s)": 1.629372 + }, + { + "acc": 0.6223835, + "epoch": 0.08751902587519025, + "grad_norm": 5.9375, + "learning_rate": 8.751902587519027e-06, + "loss": 1.7607605, + "memory(GiB)": 67.97, + "step": 3450, + "train_speed(iter/s)": 1.629637 + }, + { + "acc": 0.62013903, + "epoch": 0.08764586504312531, + "grad_norm": 6.21875, + "learning_rate": 8.764586504312532e-06, + "loss": 1.87243843, + "memory(GiB)": 67.97, + "step": 3455, + "train_speed(iter/s)": 1.629888 + }, + { + "acc": 0.64138584, + "epoch": 0.08777270421106037, + "grad_norm": 5.71875, + "learning_rate": 8.777270421106037e-06, + "loss": 1.82841682, + "memory(GiB)": 67.97, + "step": 3460, + "train_speed(iter/s)": 1.630133 + }, + { + "acc": 0.63943453, + "epoch": 0.08789954337899543, + "grad_norm": 5.53125, + "learning_rate": 8.789954337899545e-06, + "loss": 1.76984653, + "memory(GiB)": 67.97, + "step": 3465, + "train_speed(iter/s)": 1.630394 + }, + { + "acc": 0.62812405, + "epoch": 0.08802638254693049, + "grad_norm": 6.3125, + "learning_rate": 8.80263825469305e-06, + "loss": 1.78545189, + "memory(GiB)": 67.97, + "step": 3470, + "train_speed(iter/s)": 1.630663 + }, + { + "acc": 0.64027443, + "epoch": 0.08815322171486555, + "grad_norm": 5.46875, + "learning_rate": 8.815322171486555e-06, + "loss": 1.7096199, + "memory(GiB)": 67.97, + "step": 3475, + "train_speed(iter/s)": 1.630922 + }, + { + "acc": 0.63905883, + "epoch": 0.0882800608828006, + "grad_norm": 6.09375, + "learning_rate": 8.828006088280061e-06, + "loss": 1.79863663, + "memory(GiB)": 67.97, + "step": 3480, + "train_speed(iter/s)": 1.631198 + }, + { + "acc": 0.62990875, + "epoch": 0.08840690005073566, + "grad_norm": 5.625, + "learning_rate": 8.840690005073567e-06, + "loss": 1.77944374, + "memory(GiB)": 67.97, + "step": 3485, + "train_speed(iter/s)": 1.631441 + }, + { + "acc": 0.61861358, + "epoch": 0.08853373921867072, + "grad_norm": 5.90625, + "learning_rate": 8.853373921867073e-06, + "loss": 1.75925274, + "memory(GiB)": 67.97, + "step": 3490, + "train_speed(iter/s)": 1.631688 + }, + { + "acc": 0.63133311, + "epoch": 0.08866057838660578, + "grad_norm": 5.4375, + "learning_rate": 8.86605783866058e-06, + "loss": 1.81406441, + "memory(GiB)": 67.97, + "step": 3495, + "train_speed(iter/s)": 1.631948 + }, + { + "acc": 0.63282757, + "epoch": 0.08878741755454084, + "grad_norm": 5.0625, + "learning_rate": 8.878741755454085e-06, + "loss": 1.73094482, + "memory(GiB)": 67.97, + "step": 3500, + "train_speed(iter/s)": 1.632196 + }, + { + "acc": 0.62975836, + "epoch": 0.0889142567224759, + "grad_norm": 4.875, + "learning_rate": 8.891425672247591e-06, + "loss": 1.73807812, + "memory(GiB)": 67.97, + "step": 3505, + "train_speed(iter/s)": 1.632456 + }, + { + "acc": 0.62790809, + "epoch": 0.08904109589041095, + "grad_norm": 5.5, + "learning_rate": 8.904109589041097e-06, + "loss": 1.79924641, + "memory(GiB)": 67.97, + "step": 3510, + "train_speed(iter/s)": 1.632705 + }, + { + "acc": 0.61818233, + "epoch": 0.08916793505834601, + "grad_norm": 8.5, + "learning_rate": 8.916793505834601e-06, + "loss": 1.83528461, + "memory(GiB)": 67.97, + "step": 3515, + "train_speed(iter/s)": 1.632974 + }, + { + "acc": 0.63320665, + "epoch": 0.08929477422628107, + "grad_norm": 5.1875, + "learning_rate": 8.929477422628107e-06, + "loss": 1.80401611, + "memory(GiB)": 67.97, + "step": 3520, + "train_speed(iter/s)": 1.633232 + }, + { + "acc": 0.63903503, + "epoch": 0.08942161339421613, + "grad_norm": 7.0, + "learning_rate": 8.942161339421615e-06, + "loss": 1.77746773, + "memory(GiB)": 67.97, + "step": 3525, + "train_speed(iter/s)": 1.633483 + }, + { + "acc": 0.63534675, + "epoch": 0.08954845256215119, + "grad_norm": 7.71875, + "learning_rate": 8.95484525621512e-06, + "loss": 1.77584343, + "memory(GiB)": 67.97, + "step": 3530, + "train_speed(iter/s)": 1.633718 + }, + { + "acc": 0.61750698, + "epoch": 0.08967529173008625, + "grad_norm": 5.71875, + "learning_rate": 8.967529173008625e-06, + "loss": 1.8113636, + "memory(GiB)": 67.97, + "step": 3535, + "train_speed(iter/s)": 1.633956 + }, + { + "acc": 0.62888799, + "epoch": 0.0898021308980213, + "grad_norm": 5.0625, + "learning_rate": 8.980213089802131e-06, + "loss": 1.72378292, + "memory(GiB)": 67.97, + "step": 3540, + "train_speed(iter/s)": 1.634199 + }, + { + "acc": 0.63384509, + "epoch": 0.08992897006595636, + "grad_norm": 5.65625, + "learning_rate": 8.992897006595637e-06, + "loss": 1.70481033, + "memory(GiB)": 67.97, + "step": 3545, + "train_speed(iter/s)": 1.634448 + }, + { + "acc": 0.63124871, + "epoch": 0.09005580923389142, + "grad_norm": 5.96875, + "learning_rate": 9.005580923389143e-06, + "loss": 1.74125862, + "memory(GiB)": 67.97, + "step": 3550, + "train_speed(iter/s)": 1.634689 + }, + { + "acc": 0.61392713, + "epoch": 0.09018264840182648, + "grad_norm": 5.46875, + "learning_rate": 9.01826484018265e-06, + "loss": 1.80518646, + "memory(GiB)": 67.97, + "step": 3555, + "train_speed(iter/s)": 1.63494 + }, + { + "acc": 0.61655688, + "epoch": 0.09030948756976154, + "grad_norm": 8.0, + "learning_rate": 9.030948756976155e-06, + "loss": 1.87416306, + "memory(GiB)": 67.97, + "step": 3560, + "train_speed(iter/s)": 1.635193 + }, + { + "acc": 0.62979341, + "epoch": 0.0904363267376966, + "grad_norm": 5.75, + "learning_rate": 9.043632673769661e-06, + "loss": 1.79071503, + "memory(GiB)": 67.97, + "step": 3565, + "train_speed(iter/s)": 1.635452 + }, + { + "acc": 0.62730169, + "epoch": 0.09056316590563165, + "grad_norm": 5.9375, + "learning_rate": 9.056316590563167e-06, + "loss": 1.73049717, + "memory(GiB)": 67.97, + "step": 3570, + "train_speed(iter/s)": 1.63571 + }, + { + "acc": 0.62824545, + "epoch": 0.09069000507356671, + "grad_norm": 5.09375, + "learning_rate": 9.069000507356673e-06, + "loss": 1.73457069, + "memory(GiB)": 67.97, + "step": 3575, + "train_speed(iter/s)": 1.635959 + }, + { + "acc": 0.63004065, + "epoch": 0.09081684424150177, + "grad_norm": 5.71875, + "learning_rate": 9.081684424150177e-06, + "loss": 1.76967869, + "memory(GiB)": 67.97, + "step": 3580, + "train_speed(iter/s)": 1.636201 + }, + { + "acc": 0.63578882, + "epoch": 0.09094368340943683, + "grad_norm": 5.25, + "learning_rate": 9.094368340943685e-06, + "loss": 1.70772362, + "memory(GiB)": 67.97, + "step": 3585, + "train_speed(iter/s)": 1.63646 + }, + { + "acc": 0.63399067, + "epoch": 0.09107052257737189, + "grad_norm": 8.1875, + "learning_rate": 9.10705225773719e-06, + "loss": 1.69498749, + "memory(GiB)": 67.97, + "step": 3590, + "train_speed(iter/s)": 1.636722 + }, + { + "acc": 0.62841349, + "epoch": 0.09119736174530695, + "grad_norm": 5.90625, + "learning_rate": 9.119736174530695e-06, + "loss": 1.82112465, + "memory(GiB)": 67.97, + "step": 3595, + "train_speed(iter/s)": 1.636979 + }, + { + "acc": 0.64118824, + "epoch": 0.091324200913242, + "grad_norm": 6.09375, + "learning_rate": 9.132420091324201e-06, + "loss": 1.7731266, + "memory(GiB)": 67.97, + "step": 3600, + "train_speed(iter/s)": 1.637212 + }, + { + "acc": 0.64473572, + "epoch": 0.09145104008117706, + "grad_norm": 6.4375, + "learning_rate": 9.145104008117707e-06, + "loss": 1.71745529, + "memory(GiB)": 67.97, + "step": 3605, + "train_speed(iter/s)": 1.637449 + }, + { + "acc": 0.62576127, + "epoch": 0.09157787924911212, + "grad_norm": 4.78125, + "learning_rate": 9.157787924911213e-06, + "loss": 1.83209114, + "memory(GiB)": 67.97, + "step": 3610, + "train_speed(iter/s)": 1.63769 + }, + { + "acc": 0.63057089, + "epoch": 0.09170471841704718, + "grad_norm": 4.90625, + "learning_rate": 9.170471841704719e-06, + "loss": 1.81908989, + "memory(GiB)": 67.97, + "step": 3615, + "train_speed(iter/s)": 1.637931 + }, + { + "acc": 0.61526289, + "epoch": 0.09183155758498224, + "grad_norm": 7.125, + "learning_rate": 9.183155758498225e-06, + "loss": 1.81995583, + "memory(GiB)": 67.97, + "step": 3620, + "train_speed(iter/s)": 1.638178 + }, + { + "acc": 0.62104368, + "epoch": 0.0919583967529173, + "grad_norm": 5.375, + "learning_rate": 9.195839675291731e-06, + "loss": 1.83020267, + "memory(GiB)": 67.97, + "step": 3625, + "train_speed(iter/s)": 1.638414 + }, + { + "acc": 0.63465476, + "epoch": 0.09208523592085235, + "grad_norm": 5.9375, + "learning_rate": 9.208523592085237e-06, + "loss": 1.76404343, + "memory(GiB)": 67.97, + "step": 3630, + "train_speed(iter/s)": 1.638661 + }, + { + "acc": 0.64479656, + "epoch": 0.09221207508878741, + "grad_norm": 6.5625, + "learning_rate": 9.221207508878743e-06, + "loss": 1.71783867, + "memory(GiB)": 67.97, + "step": 3635, + "train_speed(iter/s)": 1.638906 + }, + { + "acc": 0.62756414, + "epoch": 0.09233891425672247, + "grad_norm": 7.125, + "learning_rate": 9.233891425672247e-06, + "loss": 1.81249371, + "memory(GiB)": 67.97, + "step": 3640, + "train_speed(iter/s)": 1.639138 + }, + { + "acc": 0.63336759, + "epoch": 0.09246575342465753, + "grad_norm": 6.5, + "learning_rate": 9.246575342465755e-06, + "loss": 1.78633366, + "memory(GiB)": 67.97, + "step": 3645, + "train_speed(iter/s)": 1.639376 + }, + { + "acc": 0.6273036, + "epoch": 0.09259259259259259, + "grad_norm": 5.8125, + "learning_rate": 9.25925925925926e-06, + "loss": 1.81920395, + "memory(GiB)": 67.97, + "step": 3650, + "train_speed(iter/s)": 1.639596 + }, + { + "acc": 0.62608423, + "epoch": 0.09271943176052765, + "grad_norm": 6.5, + "learning_rate": 9.271943176052765e-06, + "loss": 1.75691261, + "memory(GiB)": 67.97, + "step": 3655, + "train_speed(iter/s)": 1.639835 + }, + { + "acc": 0.6471118, + "epoch": 0.0928462709284627, + "grad_norm": 6.46875, + "learning_rate": 9.284627092846271e-06, + "loss": 1.73089218, + "memory(GiB)": 67.97, + "step": 3660, + "train_speed(iter/s)": 1.640077 + }, + { + "acc": 0.6465004, + "epoch": 0.09297311009639776, + "grad_norm": 6.6875, + "learning_rate": 9.297311009639777e-06, + "loss": 1.79161148, + "memory(GiB)": 67.97, + "step": 3665, + "train_speed(iter/s)": 1.640319 + }, + { + "acc": 0.63040805, + "epoch": 0.09309994926433282, + "grad_norm": 6.21875, + "learning_rate": 9.309994926433283e-06, + "loss": 1.85037575, + "memory(GiB)": 67.97, + "step": 3670, + "train_speed(iter/s)": 1.640551 + }, + { + "acc": 0.61671419, + "epoch": 0.09322678843226788, + "grad_norm": 6.375, + "learning_rate": 9.322678843226789e-06, + "loss": 1.85424042, + "memory(GiB)": 67.97, + "step": 3675, + "train_speed(iter/s)": 1.64081 + }, + { + "acc": 0.62683759, + "epoch": 0.09335362760020294, + "grad_norm": 8.3125, + "learning_rate": 9.335362760020295e-06, + "loss": 1.86326561, + "memory(GiB)": 67.97, + "step": 3680, + "train_speed(iter/s)": 1.641043 + }, + { + "acc": 0.6272747, + "epoch": 0.093480466768138, + "grad_norm": 5.96875, + "learning_rate": 9.348046676813801e-06, + "loss": 1.75050812, + "memory(GiB)": 67.97, + "step": 3685, + "train_speed(iter/s)": 1.641284 + }, + { + "acc": 0.6240057, + "epoch": 0.09360730593607305, + "grad_norm": 4.625, + "learning_rate": 9.360730593607307e-06, + "loss": 1.79427795, + "memory(GiB)": 67.97, + "step": 3690, + "train_speed(iter/s)": 1.641524 + }, + { + "acc": 0.64045963, + "epoch": 0.09373414510400811, + "grad_norm": 6.125, + "learning_rate": 9.373414510400813e-06, + "loss": 1.68342381, + "memory(GiB)": 67.97, + "step": 3695, + "train_speed(iter/s)": 1.641765 + }, + { + "acc": 0.61470656, + "epoch": 0.09386098427194317, + "grad_norm": 7.375, + "learning_rate": 9.386098427194317e-06, + "loss": 1.89201355, + "memory(GiB)": 67.97, + "step": 3700, + "train_speed(iter/s)": 1.642002 + }, + { + "acc": 0.61346741, + "epoch": 0.09398782343987823, + "grad_norm": 5.625, + "learning_rate": 9.398782343987825e-06, + "loss": 1.87396717, + "memory(GiB)": 67.97, + "step": 3705, + "train_speed(iter/s)": 1.642228 + }, + { + "acc": 0.63009262, + "epoch": 0.09411466260781329, + "grad_norm": 6.15625, + "learning_rate": 9.41146626078133e-06, + "loss": 1.72186909, + "memory(GiB)": 67.97, + "step": 3710, + "train_speed(iter/s)": 1.642475 + }, + { + "acc": 0.64259787, + "epoch": 0.09424150177574835, + "grad_norm": 7.65625, + "learning_rate": 9.424150177574835e-06, + "loss": 1.76672134, + "memory(GiB)": 67.97, + "step": 3715, + "train_speed(iter/s)": 1.642712 + }, + { + "acc": 0.61804214, + "epoch": 0.0943683409436834, + "grad_norm": 7.59375, + "learning_rate": 9.436834094368341e-06, + "loss": 1.79548759, + "memory(GiB)": 67.97, + "step": 3720, + "train_speed(iter/s)": 1.642951 + }, + { + "acc": 0.647159, + "epoch": 0.09449518011161846, + "grad_norm": 7.28125, + "learning_rate": 9.449518011161849e-06, + "loss": 1.6965374, + "memory(GiB)": 67.97, + "step": 3725, + "train_speed(iter/s)": 1.643183 + }, + { + "acc": 0.64615951, + "epoch": 0.09462201927955352, + "grad_norm": 6.0, + "learning_rate": 9.462201927955353e-06, + "loss": 1.69098835, + "memory(GiB)": 67.97, + "step": 3730, + "train_speed(iter/s)": 1.643423 + }, + { + "acc": 0.62432489, + "epoch": 0.09474885844748858, + "grad_norm": 5.34375, + "learning_rate": 9.474885844748859e-06, + "loss": 1.80043926, + "memory(GiB)": 67.97, + "step": 3735, + "train_speed(iter/s)": 1.643661 + }, + { + "acc": 0.6286046, + "epoch": 0.09487569761542364, + "grad_norm": 5.53125, + "learning_rate": 9.487569761542365e-06, + "loss": 1.75099373, + "memory(GiB)": 67.97, + "step": 3740, + "train_speed(iter/s)": 1.643905 + }, + { + "acc": 0.6401032, + "epoch": 0.0950025367833587, + "grad_norm": 5.40625, + "learning_rate": 9.50025367833587e-06, + "loss": 1.75601978, + "memory(GiB)": 67.97, + "step": 3745, + "train_speed(iter/s)": 1.644131 + }, + { + "acc": 0.622264, + "epoch": 0.09512937595129375, + "grad_norm": 5.1875, + "learning_rate": 9.512937595129377e-06, + "loss": 1.78772736, + "memory(GiB)": 67.97, + "step": 3750, + "train_speed(iter/s)": 1.64438 + }, + { + "acc": 0.63415251, + "epoch": 0.09525621511922881, + "grad_norm": 7.0, + "learning_rate": 9.525621511922883e-06, + "loss": 1.83068581, + "memory(GiB)": 67.97, + "step": 3755, + "train_speed(iter/s)": 1.644605 + }, + { + "acc": 0.61628761, + "epoch": 0.09538305428716387, + "grad_norm": 6.21875, + "learning_rate": 9.538305428716389e-06, + "loss": 1.82850704, + "memory(GiB)": 67.97, + "step": 3760, + "train_speed(iter/s)": 1.644837 + }, + { + "acc": 0.63104143, + "epoch": 0.09550989345509893, + "grad_norm": 6.125, + "learning_rate": 9.550989345509895e-06, + "loss": 1.81773071, + "memory(GiB)": 67.97, + "step": 3765, + "train_speed(iter/s)": 1.645072 + }, + { + "acc": 0.62594171, + "epoch": 0.09563673262303399, + "grad_norm": 5.9375, + "learning_rate": 9.5636732623034e-06, + "loss": 1.80854778, + "memory(GiB)": 67.97, + "step": 3770, + "train_speed(iter/s)": 1.645287 + }, + { + "acc": 0.63547354, + "epoch": 0.09576357179096905, + "grad_norm": 5.6875, + "learning_rate": 9.576357179096905e-06, + "loss": 1.73742657, + "memory(GiB)": 67.97, + "step": 3775, + "train_speed(iter/s)": 1.645499 + }, + { + "acc": 0.6176383, + "epoch": 0.0958904109589041, + "grad_norm": 7.84375, + "learning_rate": 9.589041095890411e-06, + "loss": 1.84551468, + "memory(GiB)": 67.97, + "step": 3780, + "train_speed(iter/s)": 1.645712 + }, + { + "acc": 0.6130053, + "epoch": 0.09601725012683916, + "grad_norm": 5.71875, + "learning_rate": 9.601725012683919e-06, + "loss": 1.84195786, + "memory(GiB)": 67.97, + "step": 3785, + "train_speed(iter/s)": 1.645931 + }, + { + "acc": 0.63320622, + "epoch": 0.09614408929477422, + "grad_norm": 6.0625, + "learning_rate": 9.614408929477423e-06, + "loss": 1.7915554, + "memory(GiB)": 67.97, + "step": 3790, + "train_speed(iter/s)": 1.646163 + }, + { + "acc": 0.61723895, + "epoch": 0.09627092846270928, + "grad_norm": 4.8125, + "learning_rate": 9.627092846270929e-06, + "loss": 1.82622223, + "memory(GiB)": 67.97, + "step": 3795, + "train_speed(iter/s)": 1.646367 + }, + { + "acc": 0.61042767, + "epoch": 0.09639776763064434, + "grad_norm": 6.0625, + "learning_rate": 9.639776763064435e-06, + "loss": 1.83352184, + "memory(GiB)": 67.97, + "step": 3800, + "train_speed(iter/s)": 1.646589 + }, + { + "acc": 0.63187056, + "epoch": 0.0965246067985794, + "grad_norm": 4.625, + "learning_rate": 9.65246067985794e-06, + "loss": 1.82001038, + "memory(GiB)": 67.97, + "step": 3805, + "train_speed(iter/s)": 1.646788 + }, + { + "acc": 0.6299325, + "epoch": 0.09665144596651445, + "grad_norm": 5.5, + "learning_rate": 9.665144596651447e-06, + "loss": 1.72859917, + "memory(GiB)": 67.97, + "step": 3810, + "train_speed(iter/s)": 1.647014 + }, + { + "acc": 0.59229765, + "epoch": 0.09677828513444951, + "grad_norm": 7.96875, + "learning_rate": 9.677828513444953e-06, + "loss": 1.85296059, + "memory(GiB)": 67.97, + "step": 3815, + "train_speed(iter/s)": 1.647241 + }, + { + "acc": 0.61764145, + "epoch": 0.09690512430238457, + "grad_norm": 7.4375, + "learning_rate": 9.690512430238459e-06, + "loss": 1.87241898, + "memory(GiB)": 67.97, + "step": 3820, + "train_speed(iter/s)": 1.647446 + }, + { + "acc": 0.6548358, + "epoch": 0.09703196347031963, + "grad_norm": 5.15625, + "learning_rate": 9.703196347031965e-06, + "loss": 1.67850609, + "memory(GiB)": 67.97, + "step": 3825, + "train_speed(iter/s)": 1.647647 + }, + { + "acc": 0.64989834, + "epoch": 0.09715880263825469, + "grad_norm": 5.78125, + "learning_rate": 9.71588026382547e-06, + "loss": 1.70219784, + "memory(GiB)": 67.97, + "step": 3830, + "train_speed(iter/s)": 1.647848 + }, + { + "acc": 0.60952578, + "epoch": 0.09728564180618975, + "grad_norm": 5.71875, + "learning_rate": 9.728564180618977e-06, + "loss": 1.85248871, + "memory(GiB)": 67.97, + "step": 3835, + "train_speed(iter/s)": 1.648052 + }, + { + "acc": 0.62997289, + "epoch": 0.0974124809741248, + "grad_norm": 5.84375, + "learning_rate": 9.74124809741248e-06, + "loss": 1.81175308, + "memory(GiB)": 67.97, + "step": 3840, + "train_speed(iter/s)": 1.64827 + }, + { + "acc": 0.62509851, + "epoch": 0.09753932014205986, + "grad_norm": 5.59375, + "learning_rate": 9.753932014205988e-06, + "loss": 1.77528763, + "memory(GiB)": 67.97, + "step": 3845, + "train_speed(iter/s)": 1.648473 + }, + { + "acc": 0.64610667, + "epoch": 0.09766615930999492, + "grad_norm": 5.21875, + "learning_rate": 9.766615930999493e-06, + "loss": 1.7297596, + "memory(GiB)": 67.97, + "step": 3850, + "train_speed(iter/s)": 1.648688 + }, + { + "acc": 0.64355602, + "epoch": 0.09779299847792998, + "grad_norm": 5.4375, + "learning_rate": 9.779299847792999e-06, + "loss": 1.72099247, + "memory(GiB)": 67.97, + "step": 3855, + "train_speed(iter/s)": 1.648881 + }, + { + "acc": 0.63804913, + "epoch": 0.09791983764586504, + "grad_norm": 6.3125, + "learning_rate": 9.791983764586505e-06, + "loss": 1.70037785, + "memory(GiB)": 67.97, + "step": 3860, + "train_speed(iter/s)": 1.649086 + }, + { + "acc": 0.63336425, + "epoch": 0.0980466768138001, + "grad_norm": 6.34375, + "learning_rate": 9.80466768138001e-06, + "loss": 1.79189491, + "memory(GiB)": 67.97, + "step": 3865, + "train_speed(iter/s)": 1.649293 + }, + { + "acc": 0.62268028, + "epoch": 0.09817351598173515, + "grad_norm": 6.375, + "learning_rate": 9.817351598173517e-06, + "loss": 1.83122673, + "memory(GiB)": 67.97, + "step": 3870, + "train_speed(iter/s)": 1.649501 + }, + { + "acc": 0.62709866, + "epoch": 0.09830035514967021, + "grad_norm": 4.9375, + "learning_rate": 9.830035514967023e-06, + "loss": 1.82139587, + "memory(GiB)": 67.97, + "step": 3875, + "train_speed(iter/s)": 1.649688 + }, + { + "acc": 0.61775036, + "epoch": 0.09842719431760527, + "grad_norm": 5.59375, + "learning_rate": 9.842719431760529e-06, + "loss": 1.87538872, + "memory(GiB)": 67.97, + "step": 3880, + "train_speed(iter/s)": 1.649888 + }, + { + "acc": 0.63216476, + "epoch": 0.09855403348554033, + "grad_norm": 6.65625, + "learning_rate": 9.855403348554034e-06, + "loss": 1.7553051, + "memory(GiB)": 67.97, + "step": 3885, + "train_speed(iter/s)": 1.650098 + }, + { + "acc": 0.60969896, + "epoch": 0.09868087265347539, + "grad_norm": 5.96875, + "learning_rate": 9.86808726534754e-06, + "loss": 1.83891125, + "memory(GiB)": 67.97, + "step": 3890, + "train_speed(iter/s)": 1.650325 + }, + { + "acc": 0.62578363, + "epoch": 0.09880771182141045, + "grad_norm": 5.21875, + "learning_rate": 9.880771182141046e-06, + "loss": 1.85703278, + "memory(GiB)": 67.97, + "step": 3895, + "train_speed(iter/s)": 1.650527 + }, + { + "acc": 0.63930302, + "epoch": 0.0989345509893455, + "grad_norm": 5.09375, + "learning_rate": 9.89345509893455e-06, + "loss": 1.75358734, + "memory(GiB)": 67.97, + "step": 3900, + "train_speed(iter/s)": 1.650726 + }, + { + "acc": 0.63745022, + "epoch": 0.09906139015728056, + "grad_norm": 5.90625, + "learning_rate": 9.906139015728058e-06, + "loss": 1.76766968, + "memory(GiB)": 67.97, + "step": 3905, + "train_speed(iter/s)": 1.650928 + }, + { + "acc": 0.6174964, + "epoch": 0.09918822932521562, + "grad_norm": 5.0625, + "learning_rate": 9.918822932521563e-06, + "loss": 1.81346703, + "memory(GiB)": 67.97, + "step": 3910, + "train_speed(iter/s)": 1.651113 + }, + { + "acc": 0.63096094, + "epoch": 0.09931506849315068, + "grad_norm": 5.1875, + "learning_rate": 9.931506849315069e-06, + "loss": 1.79535713, + "memory(GiB)": 67.97, + "step": 3915, + "train_speed(iter/s)": 1.651319 + }, + { + "acc": 0.62452917, + "epoch": 0.09944190766108574, + "grad_norm": 5.28125, + "learning_rate": 9.944190766108575e-06, + "loss": 1.83531189, + "memory(GiB)": 67.97, + "step": 3920, + "train_speed(iter/s)": 1.651524 + }, + { + "acc": 0.63546438, + "epoch": 0.0995687468290208, + "grad_norm": 6.09375, + "learning_rate": 9.95687468290208e-06, + "loss": 1.67707253, + "memory(GiB)": 67.97, + "step": 3925, + "train_speed(iter/s)": 1.651732 + }, + { + "acc": 0.63164568, + "epoch": 0.09969558599695585, + "grad_norm": 6.5, + "learning_rate": 9.969558599695586e-06, + "loss": 1.77427979, + "memory(GiB)": 67.97, + "step": 3930, + "train_speed(iter/s)": 1.651923 + }, + { + "acc": 0.62451582, + "epoch": 0.09982242516489091, + "grad_norm": 6.09375, + "learning_rate": 9.982242516489092e-06, + "loss": 1.81063709, + "memory(GiB)": 67.97, + "step": 3935, + "train_speed(iter/s)": 1.652129 + }, + { + "acc": 0.64227715, + "epoch": 0.09994926433282597, + "grad_norm": 5.625, + "learning_rate": 9.994926433282598e-06, + "loss": 1.71588745, + "memory(GiB)": 67.97, + "step": 3940, + "train_speed(iter/s)": 1.652325 + }, + { + "acc": 0.65402422, + "epoch": 0.10007610350076103, + "grad_norm": 5.09375, + "learning_rate": 9.999999960413982e-06, + "loss": 1.69254265, + "memory(GiB)": 67.97, + "step": 3945, + "train_speed(iter/s)": 1.652535 + }, + { + "acc": 0.62749376, + "epoch": 0.10020294266869609, + "grad_norm": 9.375, + "learning_rate": 9.99999971849943e-06, + "loss": 1.82273827, + "memory(GiB)": 67.97, + "step": 3950, + "train_speed(iter/s)": 1.652739 + }, + { + "acc": 0.63605247, + "epoch": 0.10032978183663115, + "grad_norm": 5.65625, + "learning_rate": 9.999999256662563e-06, + "loss": 1.81794357, + "memory(GiB)": 67.97, + "step": 3955, + "train_speed(iter/s)": 1.652957 + }, + { + "acc": 0.64998913, + "epoch": 0.1004566210045662, + "grad_norm": 6.40625, + "learning_rate": 9.999998574903408e-06, + "loss": 1.7013237, + "memory(GiB)": 67.97, + "step": 3960, + "train_speed(iter/s)": 1.653166 + }, + { + "acc": 0.63812647, + "epoch": 0.10058346017250126, + "grad_norm": 6.40625, + "learning_rate": 9.99999767322199e-06, + "loss": 1.7743, + "memory(GiB)": 67.97, + "step": 3965, + "train_speed(iter/s)": 1.653357 + }, + { + "acc": 0.62548313, + "epoch": 0.10071029934043632, + "grad_norm": 5.09375, + "learning_rate": 9.999996551618353e-06, + "loss": 1.72225342, + "memory(GiB)": 67.97, + "step": 3970, + "train_speed(iter/s)": 1.653561 + }, + { + "acc": 0.61485071, + "epoch": 0.10083713850837138, + "grad_norm": 6.125, + "learning_rate": 9.999995210092545e-06, + "loss": 1.7970808, + "memory(GiB)": 67.97, + "step": 3975, + "train_speed(iter/s)": 1.653754 + }, + { + "acc": 0.62164564, + "epoch": 0.10096397767630644, + "grad_norm": 6.8125, + "learning_rate": 9.999993648644622e-06, + "loss": 1.75651016, + "memory(GiB)": 67.97, + "step": 3980, + "train_speed(iter/s)": 1.653948 + }, + { + "acc": 0.63222899, + "epoch": 0.1010908168442415, + "grad_norm": 6.25, + "learning_rate": 9.999991867274656e-06, + "loss": 1.78450928, + "memory(GiB)": 67.97, + "step": 3985, + "train_speed(iter/s)": 1.65414 + }, + { + "acc": 0.62905321, + "epoch": 0.10121765601217655, + "grad_norm": 6.4375, + "learning_rate": 9.999989865982725e-06, + "loss": 1.71516094, + "memory(GiB)": 67.97, + "step": 3990, + "train_speed(iter/s)": 1.654343 + }, + { + "acc": 0.62270198, + "epoch": 0.10134449518011161, + "grad_norm": 5.25, + "learning_rate": 9.999987644768917e-06, + "loss": 1.72050247, + "memory(GiB)": 67.97, + "step": 3995, + "train_speed(iter/s)": 1.654539 + }, + { + "acc": 0.63560915, + "epoch": 0.10147133434804667, + "grad_norm": 6.3125, + "learning_rate": 9.999985203633327e-06, + "loss": 1.78750134, + "memory(GiB)": 67.97, + "step": 4000, + "train_speed(iter/s)": 1.654737 + }, + { + "epoch": 0.10147133434804667, + "eval_acc": 0.621197142361916, + "eval_loss": 1.7191544771194458, + "eval_runtime": 58.6125, + "eval_samples_per_second": 108.68, + "eval_steps_per_second": 27.178, + "step": 4000 + }, + { + "acc": 0.62880044, + "epoch": 0.10159817351598173, + "grad_norm": 6.9375, + "learning_rate": 9.999982542576065e-06, + "loss": 1.74402771, + "memory(GiB)": 67.97, + "step": 4005, + "train_speed(iter/s)": 1.613029 + }, + { + "acc": 0.63192768, + "epoch": 0.10172501268391679, + "grad_norm": 7.03125, + "learning_rate": 9.999979661597247e-06, + "loss": 1.79344215, + "memory(GiB)": 67.97, + "step": 4010, + "train_speed(iter/s)": 1.613274 + }, + { + "acc": 0.63948364, + "epoch": 0.10185185185185185, + "grad_norm": 6.125, + "learning_rate": 9.999976560697002e-06, + "loss": 1.7201767, + "memory(GiB)": 67.97, + "step": 4015, + "train_speed(iter/s)": 1.613509 + }, + { + "acc": 0.64680686, + "epoch": 0.1019786910197869, + "grad_norm": 6.875, + "learning_rate": 9.999973239875462e-06, + "loss": 1.73223801, + "memory(GiB)": 67.97, + "step": 4020, + "train_speed(iter/s)": 1.613747 + }, + { + "acc": 0.63398533, + "epoch": 0.10210553018772196, + "grad_norm": 6.625, + "learning_rate": 9.999969699132776e-06, + "loss": 1.7380722, + "memory(GiB)": 67.97, + "step": 4025, + "train_speed(iter/s)": 1.613985 + }, + { + "acc": 0.61618924, + "epoch": 0.10223236935565702, + "grad_norm": 4.8125, + "learning_rate": 9.999965938469102e-06, + "loss": 1.82005501, + "memory(GiB)": 67.97, + "step": 4030, + "train_speed(iter/s)": 1.614221 + }, + { + "acc": 0.62525425, + "epoch": 0.10235920852359208, + "grad_norm": 6.03125, + "learning_rate": 9.9999619578846e-06, + "loss": 1.78214111, + "memory(GiB)": 67.97, + "step": 4035, + "train_speed(iter/s)": 1.614463 + }, + { + "acc": 0.63352356, + "epoch": 0.10248604769152714, + "grad_norm": 6.71875, + "learning_rate": 9.999957757379451e-06, + "loss": 1.75636902, + "memory(GiB)": 67.97, + "step": 4040, + "train_speed(iter/s)": 1.614706 + }, + { + "acc": 0.64604177, + "epoch": 0.1026128868594622, + "grad_norm": 8.0, + "learning_rate": 9.999953336953834e-06, + "loss": 1.66816387, + "memory(GiB)": 67.97, + "step": 4045, + "train_speed(iter/s)": 1.61493 + }, + { + "acc": 0.62357917, + "epoch": 0.10273972602739725, + "grad_norm": 5.4375, + "learning_rate": 9.999948696607946e-06, + "loss": 1.77020607, + "memory(GiB)": 67.97, + "step": 4050, + "train_speed(iter/s)": 1.615162 + }, + { + "acc": 0.63200035, + "epoch": 0.10286656519533231, + "grad_norm": 5.4375, + "learning_rate": 9.999943836341992e-06, + "loss": 1.76096439, + "memory(GiB)": 67.97, + "step": 4055, + "train_speed(iter/s)": 1.61539 + }, + { + "acc": 0.65547247, + "epoch": 0.10299340436326737, + "grad_norm": 5.65625, + "learning_rate": 9.999938756156185e-06, + "loss": 1.64716663, + "memory(GiB)": 67.97, + "step": 4060, + "train_speed(iter/s)": 1.615613 + }, + { + "acc": 0.62961836, + "epoch": 0.10312024353120243, + "grad_norm": 5.78125, + "learning_rate": 9.999933456050747e-06, + "loss": 1.80200157, + "memory(GiB)": 67.97, + "step": 4065, + "train_speed(iter/s)": 1.615838 + }, + { + "acc": 0.62936592, + "epoch": 0.10324708269913749, + "grad_norm": 5.5, + "learning_rate": 9.999927936025914e-06, + "loss": 1.81737175, + "memory(GiB)": 67.97, + "step": 4070, + "train_speed(iter/s)": 1.616064 + }, + { + "acc": 0.63939648, + "epoch": 0.10337392186707255, + "grad_norm": 6.4375, + "learning_rate": 9.999922196081928e-06, + "loss": 1.72342014, + "memory(GiB)": 67.97, + "step": 4075, + "train_speed(iter/s)": 1.616312 + }, + { + "acc": 0.62949147, + "epoch": 0.1035007610350076, + "grad_norm": 5.9375, + "learning_rate": 9.99991623621904e-06, + "loss": 1.75348682, + "memory(GiB)": 67.97, + "step": 4080, + "train_speed(iter/s)": 1.616523 + }, + { + "acc": 0.63353796, + "epoch": 0.10362760020294266, + "grad_norm": 6.3125, + "learning_rate": 9.999910056437512e-06, + "loss": 1.76169186, + "memory(GiB)": 67.97, + "step": 4085, + "train_speed(iter/s)": 1.616759 + }, + { + "acc": 0.63861713, + "epoch": 0.10375443937087772, + "grad_norm": 4.875, + "learning_rate": 9.999903656737618e-06, + "loss": 1.71907082, + "memory(GiB)": 67.97, + "step": 4090, + "train_speed(iter/s)": 1.61699 + }, + { + "acc": 0.62957411, + "epoch": 0.10388127853881278, + "grad_norm": 6.28125, + "learning_rate": 9.999897037119637e-06, + "loss": 1.73045578, + "memory(GiB)": 67.97, + "step": 4095, + "train_speed(iter/s)": 1.617219 + }, + { + "acc": 0.62866564, + "epoch": 0.10400811770674784, + "grad_norm": 6.40625, + "learning_rate": 9.999890197583862e-06, + "loss": 1.77654228, + "memory(GiB)": 67.97, + "step": 4100, + "train_speed(iter/s)": 1.617445 + }, + { + "acc": 0.6204422, + "epoch": 0.1041349568746829, + "grad_norm": 7.0625, + "learning_rate": 9.999883138130593e-06, + "loss": 1.80975266, + "memory(GiB)": 67.97, + "step": 4105, + "train_speed(iter/s)": 1.617658 + }, + { + "acc": 0.64044752, + "epoch": 0.10426179604261795, + "grad_norm": 5.375, + "learning_rate": 9.999875858760143e-06, + "loss": 1.74375191, + "memory(GiB)": 67.97, + "step": 4110, + "train_speed(iter/s)": 1.617877 + }, + { + "acc": 0.628263, + "epoch": 0.10438863521055301, + "grad_norm": 5.84375, + "learning_rate": 9.999868359472826e-06, + "loss": 1.83561497, + "memory(GiB)": 67.97, + "step": 4115, + "train_speed(iter/s)": 1.618091 + }, + { + "acc": 0.63449526, + "epoch": 0.10451547437848807, + "grad_norm": 5.40625, + "learning_rate": 9.999860640268977e-06, + "loss": 1.77878284, + "memory(GiB)": 67.97, + "step": 4120, + "train_speed(iter/s)": 1.61832 + }, + { + "acc": 0.63982677, + "epoch": 0.10464231354642313, + "grad_norm": 5.90625, + "learning_rate": 9.999852701148935e-06, + "loss": 1.74853573, + "memory(GiB)": 67.97, + "step": 4125, + "train_speed(iter/s)": 1.618538 + }, + { + "acc": 0.65120993, + "epoch": 0.10476915271435819, + "grad_norm": 7.3125, + "learning_rate": 9.999844542113049e-06, + "loss": 1.75328979, + "memory(GiB)": 67.97, + "step": 4130, + "train_speed(iter/s)": 1.61877 + }, + { + "acc": 0.61673384, + "epoch": 0.10489599188229325, + "grad_norm": 5.40625, + "learning_rate": 9.999836163161675e-06, + "loss": 1.83802223, + "memory(GiB)": 67.97, + "step": 4135, + "train_speed(iter/s)": 1.618999 + }, + { + "acc": 0.66415944, + "epoch": 0.1050228310502283, + "grad_norm": 6.25, + "learning_rate": 9.999827564295187e-06, + "loss": 1.63727341, + "memory(GiB)": 67.97, + "step": 4140, + "train_speed(iter/s)": 1.619197 + }, + { + "acc": 0.64094048, + "epoch": 0.10514967021816336, + "grad_norm": 5.28125, + "learning_rate": 9.999818745513958e-06, + "loss": 1.73015862, + "memory(GiB)": 67.97, + "step": 4145, + "train_speed(iter/s)": 1.619426 + }, + { + "acc": 0.63930955, + "epoch": 0.10527650938609842, + "grad_norm": 5.6875, + "learning_rate": 9.99980970681838e-06, + "loss": 1.75726871, + "memory(GiB)": 67.97, + "step": 4150, + "train_speed(iter/s)": 1.619641 + }, + { + "acc": 0.64495535, + "epoch": 0.10540334855403348, + "grad_norm": 6.4375, + "learning_rate": 9.999800448208846e-06, + "loss": 1.69719219, + "memory(GiB)": 67.97, + "step": 4155, + "train_speed(iter/s)": 1.619867 + }, + { + "acc": 0.62620168, + "epoch": 0.10553018772196854, + "grad_norm": 5.1875, + "learning_rate": 9.999790969685767e-06, + "loss": 1.74291191, + "memory(GiB)": 67.97, + "step": 4160, + "train_speed(iter/s)": 1.620084 + }, + { + "acc": 0.6214191, + "epoch": 0.1056570268899036, + "grad_norm": 7.40625, + "learning_rate": 9.999781271249559e-06, + "loss": 1.76673584, + "memory(GiB)": 67.97, + "step": 4165, + "train_speed(iter/s)": 1.620306 + }, + { + "acc": 0.64081421, + "epoch": 0.10578386605783865, + "grad_norm": 5.21875, + "learning_rate": 9.999771352900647e-06, + "loss": 1.71336899, + "memory(GiB)": 67.97, + "step": 4170, + "train_speed(iter/s)": 1.62052 + }, + { + "acc": 0.64104505, + "epoch": 0.10591070522577371, + "grad_norm": 5.6875, + "learning_rate": 9.999761214639469e-06, + "loss": 1.63464375, + "memory(GiB)": 67.97, + "step": 4175, + "train_speed(iter/s)": 1.620736 + }, + { + "acc": 0.64937425, + "epoch": 0.10603754439370877, + "grad_norm": 5.59375, + "learning_rate": 9.999750856466472e-06, + "loss": 1.63867226, + "memory(GiB)": 67.97, + "step": 4180, + "train_speed(iter/s)": 1.620903 + }, + { + "acc": 0.63505259, + "epoch": 0.10616438356164383, + "grad_norm": 5.375, + "learning_rate": 9.99974027838211e-06, + "loss": 1.68519821, + "memory(GiB)": 67.97, + "step": 4185, + "train_speed(iter/s)": 1.621141 + }, + { + "acc": 0.64382148, + "epoch": 0.10629122272957889, + "grad_norm": 4.4375, + "learning_rate": 9.999729480386846e-06, + "loss": 1.69050255, + "memory(GiB)": 67.97, + "step": 4190, + "train_speed(iter/s)": 1.62136 + }, + { + "acc": 0.6431694, + "epoch": 0.10641806189751395, + "grad_norm": 5.59375, + "learning_rate": 9.999718462481157e-06, + "loss": 1.69474392, + "memory(GiB)": 67.97, + "step": 4195, + "train_speed(iter/s)": 1.621585 + }, + { + "acc": 0.64862137, + "epoch": 0.106544901065449, + "grad_norm": 5.15625, + "learning_rate": 9.99970722466553e-06, + "loss": 1.7278923, + "memory(GiB)": 67.97, + "step": 4200, + "train_speed(iter/s)": 1.621792 + }, + { + "acc": 0.64202552, + "epoch": 0.10667174023338406, + "grad_norm": 7.5, + "learning_rate": 9.999695766940458e-06, + "loss": 1.70533943, + "memory(GiB)": 67.97, + "step": 4205, + "train_speed(iter/s)": 1.622012 + }, + { + "acc": 0.62963924, + "epoch": 0.10679857940131912, + "grad_norm": 5.25, + "learning_rate": 9.999684089306442e-06, + "loss": 1.75145111, + "memory(GiB)": 67.97, + "step": 4210, + "train_speed(iter/s)": 1.62223 + }, + { + "acc": 0.63862095, + "epoch": 0.10692541856925418, + "grad_norm": 5.03125, + "learning_rate": 9.999672191763999e-06, + "loss": 1.71522713, + "memory(GiB)": 67.97, + "step": 4215, + "train_speed(iter/s)": 1.62243 + }, + { + "acc": 0.64715567, + "epoch": 0.10705225773718924, + "grad_norm": 6.71875, + "learning_rate": 9.99966007431365e-06, + "loss": 1.62397194, + "memory(GiB)": 67.97, + "step": 4220, + "train_speed(iter/s)": 1.622625 + }, + { + "acc": 0.63859258, + "epoch": 0.1071790969051243, + "grad_norm": 4.5625, + "learning_rate": 9.99964773695593e-06, + "loss": 1.73200054, + "memory(GiB)": 67.97, + "step": 4225, + "train_speed(iter/s)": 1.622831 + }, + { + "acc": 0.64712224, + "epoch": 0.10730593607305935, + "grad_norm": 6.125, + "learning_rate": 9.999635179691381e-06, + "loss": 1.77615089, + "memory(GiB)": 67.97, + "step": 4230, + "train_speed(iter/s)": 1.623055 + }, + { + "acc": 0.63496761, + "epoch": 0.10743277524099441, + "grad_norm": 5.40625, + "learning_rate": 9.999622402520553e-06, + "loss": 1.77117615, + "memory(GiB)": 67.97, + "step": 4235, + "train_speed(iter/s)": 1.623269 + }, + { + "acc": 0.6219996, + "epoch": 0.10755961440892947, + "grad_norm": 6.71875, + "learning_rate": 9.999609405444012e-06, + "loss": 1.77838955, + "memory(GiB)": 67.97, + "step": 4240, + "train_speed(iter/s)": 1.623476 + }, + { + "acc": 0.63712187, + "epoch": 0.10768645357686453, + "grad_norm": 5.84375, + "learning_rate": 9.999596188462328e-06, + "loss": 1.73780289, + "memory(GiB)": 67.97, + "step": 4245, + "train_speed(iter/s)": 1.623676 + }, + { + "acc": 0.64100833, + "epoch": 0.10781329274479959, + "grad_norm": 4.65625, + "learning_rate": 9.99958275157608e-06, + "loss": 1.71376457, + "memory(GiB)": 67.97, + "step": 4250, + "train_speed(iter/s)": 1.623885 + }, + { + "acc": 0.64295502, + "epoch": 0.10794013191273465, + "grad_norm": 5.96875, + "learning_rate": 9.999569094785862e-06, + "loss": 1.78116894, + "memory(GiB)": 67.97, + "step": 4255, + "train_speed(iter/s)": 1.6241 + }, + { + "acc": 0.64984322, + "epoch": 0.1080669710806697, + "grad_norm": 6.40625, + "learning_rate": 9.999555218092273e-06, + "loss": 1.7100069, + "memory(GiB)": 67.97, + "step": 4260, + "train_speed(iter/s)": 1.624316 + }, + { + "acc": 0.64320669, + "epoch": 0.10819381024860476, + "grad_norm": 5.6875, + "learning_rate": 9.999541121495926e-06, + "loss": 1.73749485, + "memory(GiB)": 67.97, + "step": 4265, + "train_speed(iter/s)": 1.624532 + }, + { + "acc": 0.6280302, + "epoch": 0.10832064941653982, + "grad_norm": 7.09375, + "learning_rate": 9.999526804997439e-06, + "loss": 1.77813206, + "memory(GiB)": 67.97, + "step": 4270, + "train_speed(iter/s)": 1.624748 + }, + { + "acc": 0.62221069, + "epoch": 0.10844748858447488, + "grad_norm": 5.4375, + "learning_rate": 9.99951226859744e-06, + "loss": 1.83957424, + "memory(GiB)": 67.97, + "step": 4275, + "train_speed(iter/s)": 1.624963 + }, + { + "acc": 0.64271383, + "epoch": 0.10857432775240994, + "grad_norm": 6.375, + "learning_rate": 9.999497512296572e-06, + "loss": 1.73349743, + "memory(GiB)": 67.97, + "step": 4280, + "train_speed(iter/s)": 1.625174 + }, + { + "acc": 0.6472898, + "epoch": 0.108701166920345, + "grad_norm": 10.25, + "learning_rate": 9.999482536095483e-06, + "loss": 1.73808861, + "memory(GiB)": 67.97, + "step": 4285, + "train_speed(iter/s)": 1.625395 + }, + { + "acc": 0.63170528, + "epoch": 0.10882800608828005, + "grad_norm": 4.78125, + "learning_rate": 9.999467339994827e-06, + "loss": 1.73651104, + "memory(GiB)": 67.97, + "step": 4290, + "train_speed(iter/s)": 1.625598 + }, + { + "acc": 0.64741445, + "epoch": 0.10895484525621511, + "grad_norm": 7.03125, + "learning_rate": 9.99945192399528e-06, + "loss": 1.71542015, + "memory(GiB)": 67.97, + "step": 4295, + "train_speed(iter/s)": 1.625806 + }, + { + "acc": 0.62867537, + "epoch": 0.10908168442415017, + "grad_norm": 6.09375, + "learning_rate": 9.999436288097515e-06, + "loss": 1.73983574, + "memory(GiB)": 67.97, + "step": 4300, + "train_speed(iter/s)": 1.626007 + }, + { + "acc": 0.63284001, + "epoch": 0.10920852359208523, + "grad_norm": 6.71875, + "learning_rate": 9.99942043230222e-06, + "loss": 1.77272644, + "memory(GiB)": 67.97, + "step": 4305, + "train_speed(iter/s)": 1.626206 + }, + { + "acc": 0.63630443, + "epoch": 0.10933536276002029, + "grad_norm": 6.90625, + "learning_rate": 9.999404356610095e-06, + "loss": 1.6563343, + "memory(GiB)": 67.97, + "step": 4310, + "train_speed(iter/s)": 1.626398 + }, + { + "acc": 0.6436305, + "epoch": 0.10946220192795535, + "grad_norm": 6.65625, + "learning_rate": 9.999388061021846e-06, + "loss": 1.74741096, + "memory(GiB)": 67.97, + "step": 4315, + "train_speed(iter/s)": 1.626618 + }, + { + "acc": 0.62731266, + "epoch": 0.1095890410958904, + "grad_norm": 5.65625, + "learning_rate": 9.99937154553819e-06, + "loss": 1.77855453, + "memory(GiB)": 67.97, + "step": 4320, + "train_speed(iter/s)": 1.62683 + }, + { + "acc": 0.61831484, + "epoch": 0.10971588026382546, + "grad_norm": 5.96875, + "learning_rate": 9.999354810159852e-06, + "loss": 1.74897957, + "memory(GiB)": 67.97, + "step": 4325, + "train_speed(iter/s)": 1.627037 + }, + { + "acc": 0.6361763, + "epoch": 0.10984271943176052, + "grad_norm": 5.5, + "learning_rate": 9.999337854887567e-06, + "loss": 1.7201046, + "memory(GiB)": 67.97, + "step": 4330, + "train_speed(iter/s)": 1.627238 + }, + { + "acc": 0.64260607, + "epoch": 0.10996955859969558, + "grad_norm": 6.0, + "learning_rate": 9.999320679722086e-06, + "loss": 1.7507761, + "memory(GiB)": 67.97, + "step": 4335, + "train_speed(iter/s)": 1.627454 + }, + { + "acc": 0.6355649, + "epoch": 0.11009639776763064, + "grad_norm": 5.625, + "learning_rate": 9.999303284664159e-06, + "loss": 1.71949749, + "memory(GiB)": 67.97, + "step": 4340, + "train_speed(iter/s)": 1.627654 + }, + { + "acc": 0.65132828, + "epoch": 0.1102232369355657, + "grad_norm": 6.0, + "learning_rate": 9.999285669714555e-06, + "loss": 1.68354759, + "memory(GiB)": 67.97, + "step": 4345, + "train_speed(iter/s)": 1.627861 + }, + { + "acc": 0.6522799, + "epoch": 0.11035007610350075, + "grad_norm": 5.5, + "learning_rate": 9.999267834874044e-06, + "loss": 1.61790657, + "memory(GiB)": 67.97, + "step": 4350, + "train_speed(iter/s)": 1.628069 + }, + { + "acc": 0.6344573, + "epoch": 0.11047691527143581, + "grad_norm": 6.875, + "learning_rate": 9.999249780143416e-06, + "loss": 1.69082794, + "memory(GiB)": 67.97, + "step": 4355, + "train_speed(iter/s)": 1.628275 + }, + { + "acc": 0.63763828, + "epoch": 0.11060375443937087, + "grad_norm": 5.15625, + "learning_rate": 9.999231505523463e-06, + "loss": 1.74555588, + "memory(GiB)": 67.97, + "step": 4360, + "train_speed(iter/s)": 1.628477 + }, + { + "acc": 0.61145363, + "epoch": 0.11073059360730593, + "grad_norm": 5.59375, + "learning_rate": 9.999213011014987e-06, + "loss": 1.79144897, + "memory(GiB)": 67.97, + "step": 4365, + "train_speed(iter/s)": 1.628679 + }, + { + "acc": 0.63161697, + "epoch": 0.11085743277524099, + "grad_norm": 5.125, + "learning_rate": 9.999194296618805e-06, + "loss": 1.75264168, + "memory(GiB)": 67.97, + "step": 4370, + "train_speed(iter/s)": 1.628875 + }, + { + "acc": 0.64262152, + "epoch": 0.11098427194317605, + "grad_norm": 11.0, + "learning_rate": 9.999175362335735e-06, + "loss": 1.7129652, + "memory(GiB)": 67.97, + "step": 4375, + "train_speed(iter/s)": 1.62909 + }, + { + "acc": 0.64140253, + "epoch": 0.1111111111111111, + "grad_norm": 5.9375, + "learning_rate": 9.999156208166614e-06, + "loss": 1.73836594, + "memory(GiB)": 67.97, + "step": 4380, + "train_speed(iter/s)": 1.629305 + }, + { + "acc": 0.63255754, + "epoch": 0.11123795027904616, + "grad_norm": 5.78125, + "learning_rate": 9.999136834112284e-06, + "loss": 1.71961479, + "memory(GiB)": 67.97, + "step": 4385, + "train_speed(iter/s)": 1.629511 + }, + { + "acc": 0.63558664, + "epoch": 0.11136478944698122, + "grad_norm": 4.8125, + "learning_rate": 9.999117240173597e-06, + "loss": 1.76141224, + "memory(GiB)": 67.97, + "step": 4390, + "train_speed(iter/s)": 1.629715 + }, + { + "acc": 0.62749786, + "epoch": 0.11149162861491628, + "grad_norm": 6.09375, + "learning_rate": 9.999097426351412e-06, + "loss": 1.80290337, + "memory(GiB)": 67.97, + "step": 4395, + "train_speed(iter/s)": 1.629938 + }, + { + "acc": 0.63315077, + "epoch": 0.11161846778285134, + "grad_norm": 5.84375, + "learning_rate": 9.999077392646606e-06, + "loss": 1.72891464, + "memory(GiB)": 67.97, + "step": 4400, + "train_speed(iter/s)": 1.630144 + }, + { + "acc": 0.64550276, + "epoch": 0.1117453069507864, + "grad_norm": 6.375, + "learning_rate": 9.999057139060055e-06, + "loss": 1.73157539, + "memory(GiB)": 67.97, + "step": 4405, + "train_speed(iter/s)": 1.630344 + }, + { + "acc": 0.64680681, + "epoch": 0.11187214611872145, + "grad_norm": 5.34375, + "learning_rate": 9.999036665592653e-06, + "loss": 1.69201775, + "memory(GiB)": 67.97, + "step": 4410, + "train_speed(iter/s)": 1.630547 + }, + { + "acc": 0.6476182, + "epoch": 0.11199898528665651, + "grad_norm": 5.28125, + "learning_rate": 9.999015972245298e-06, + "loss": 1.72968769, + "memory(GiB)": 67.97, + "step": 4415, + "train_speed(iter/s)": 1.630743 + }, + { + "acc": 0.63964167, + "epoch": 0.11212582445459157, + "grad_norm": 6.46875, + "learning_rate": 9.998995059018901e-06, + "loss": 1.72246742, + "memory(GiB)": 67.97, + "step": 4420, + "train_speed(iter/s)": 1.630941 + }, + { + "acc": 0.62957039, + "epoch": 0.11225266362252663, + "grad_norm": 6.0, + "learning_rate": 9.998973925914384e-06, + "loss": 1.69691658, + "memory(GiB)": 67.97, + "step": 4425, + "train_speed(iter/s)": 1.631131 + }, + { + "acc": 0.63208961, + "epoch": 0.11237950279046169, + "grad_norm": 5.34375, + "learning_rate": 9.998952572932675e-06, + "loss": 1.75270672, + "memory(GiB)": 67.97, + "step": 4430, + "train_speed(iter/s)": 1.631324 + }, + { + "acc": 0.61630292, + "epoch": 0.11250634195839675, + "grad_norm": 5.96875, + "learning_rate": 9.998931000074712e-06, + "loss": 1.83897057, + "memory(GiB)": 67.97, + "step": 4435, + "train_speed(iter/s)": 1.631537 + }, + { + "acc": 0.63067393, + "epoch": 0.1126331811263318, + "grad_norm": 5.75, + "learning_rate": 9.998909207341446e-06, + "loss": 1.72502518, + "memory(GiB)": 67.97, + "step": 4440, + "train_speed(iter/s)": 1.631739 + }, + { + "acc": 0.62769232, + "epoch": 0.11276002029426686, + "grad_norm": 5.5625, + "learning_rate": 9.998887194733833e-06, + "loss": 1.81189041, + "memory(GiB)": 67.97, + "step": 4445, + "train_speed(iter/s)": 1.631941 + }, + { + "acc": 0.64530735, + "epoch": 0.11288685946220192, + "grad_norm": 5.8125, + "learning_rate": 9.998864962252843e-06, + "loss": 1.66652012, + "memory(GiB)": 67.97, + "step": 4450, + "train_speed(iter/s)": 1.632139 + }, + { + "acc": 0.62774019, + "epoch": 0.11301369863013698, + "grad_norm": 5.40625, + "learning_rate": 9.998842509899456e-06, + "loss": 1.78872833, + "memory(GiB)": 67.97, + "step": 4455, + "train_speed(iter/s)": 1.632349 + }, + { + "acc": 0.63577385, + "epoch": 0.11314053779807204, + "grad_norm": 8.625, + "learning_rate": 9.998819837674655e-06, + "loss": 1.7395277, + "memory(GiB)": 67.97, + "step": 4460, + "train_speed(iter/s)": 1.632552 + }, + { + "acc": 0.64892306, + "epoch": 0.1132673769660071, + "grad_norm": 9.0, + "learning_rate": 9.99879694557944e-06, + "loss": 1.70531273, + "memory(GiB)": 67.97, + "step": 4465, + "train_speed(iter/s)": 1.632742 + }, + { + "acc": 0.6506196, + "epoch": 0.11339421613394216, + "grad_norm": 5.53125, + "learning_rate": 9.998773833614816e-06, + "loss": 1.72897644, + "memory(GiB)": 67.97, + "step": 4470, + "train_speed(iter/s)": 1.632947 + }, + { + "acc": 0.65738592, + "epoch": 0.11352105530187721, + "grad_norm": 6.09375, + "learning_rate": 9.998750501781803e-06, + "loss": 1.62091389, + "memory(GiB)": 67.97, + "step": 4475, + "train_speed(iter/s)": 1.633152 + }, + { + "acc": 0.64763403, + "epoch": 0.11364789446981227, + "grad_norm": 6.4375, + "learning_rate": 9.998726950081425e-06, + "loss": 1.73866997, + "memory(GiB)": 67.97, + "step": 4480, + "train_speed(iter/s)": 1.633341 + }, + { + "acc": 0.63893633, + "epoch": 0.11377473363774733, + "grad_norm": 6.9375, + "learning_rate": 9.998703178514717e-06, + "loss": 1.7975563, + "memory(GiB)": 67.97, + "step": 4485, + "train_speed(iter/s)": 1.63354 + }, + { + "acc": 0.63117728, + "epoch": 0.11390157280568239, + "grad_norm": 6.5625, + "learning_rate": 9.998679187082724e-06, + "loss": 1.70226288, + "memory(GiB)": 67.97, + "step": 4490, + "train_speed(iter/s)": 1.633732 + }, + { + "acc": 0.62577353, + "epoch": 0.11402841197361745, + "grad_norm": 5.5625, + "learning_rate": 9.998654975786506e-06, + "loss": 1.71917152, + "memory(GiB)": 67.97, + "step": 4495, + "train_speed(iter/s)": 1.633929 + }, + { + "acc": 0.63941765, + "epoch": 0.1141552511415525, + "grad_norm": 6.34375, + "learning_rate": 9.998630544627123e-06, + "loss": 1.68745232, + "memory(GiB)": 67.97, + "step": 4500, + "train_speed(iter/s)": 1.634134 + }, + { + "acc": 0.64434137, + "epoch": 0.11428209030948756, + "grad_norm": 4.90625, + "learning_rate": 9.998605893605653e-06, + "loss": 1.68942833, + "memory(GiB)": 67.97, + "step": 4505, + "train_speed(iter/s)": 1.634309 + }, + { + "acc": 0.64555216, + "epoch": 0.11440892947742262, + "grad_norm": 5.375, + "learning_rate": 9.998581022723178e-06, + "loss": 1.75617676, + "memory(GiB)": 67.97, + "step": 4510, + "train_speed(iter/s)": 1.634497 + }, + { + "acc": 0.61802735, + "epoch": 0.11453576864535768, + "grad_norm": 5.5, + "learning_rate": 9.998555931980792e-06, + "loss": 1.77623138, + "memory(GiB)": 67.97, + "step": 4515, + "train_speed(iter/s)": 1.634674 + }, + { + "acc": 0.62451324, + "epoch": 0.11466260781329274, + "grad_norm": 5.125, + "learning_rate": 9.998530621379599e-06, + "loss": 1.76006927, + "memory(GiB)": 67.97, + "step": 4520, + "train_speed(iter/s)": 1.634847 + }, + { + "acc": 0.64568024, + "epoch": 0.1147894469812278, + "grad_norm": 5.21875, + "learning_rate": 9.998505090920713e-06, + "loss": 1.71749382, + "memory(GiB)": 67.97, + "step": 4525, + "train_speed(iter/s)": 1.635037 + }, + { + "acc": 0.64712114, + "epoch": 0.11491628614916286, + "grad_norm": 6.125, + "learning_rate": 9.998479340605257e-06, + "loss": 1.70768147, + "memory(GiB)": 67.97, + "step": 4530, + "train_speed(iter/s)": 1.635245 + }, + { + "acc": 0.63834581, + "epoch": 0.11504312531709791, + "grad_norm": 5.0625, + "learning_rate": 9.99845337043436e-06, + "loss": 1.71314068, + "memory(GiB)": 67.97, + "step": 4535, + "train_speed(iter/s)": 1.635435 + }, + { + "acc": 0.61966105, + "epoch": 0.11516996448503297, + "grad_norm": 6.5625, + "learning_rate": 9.998427180409171e-06, + "loss": 1.83528786, + "memory(GiB)": 67.97, + "step": 4540, + "train_speed(iter/s)": 1.635619 + }, + { + "acc": 0.63189793, + "epoch": 0.11529680365296803, + "grad_norm": 8.0625, + "learning_rate": 9.998400770530836e-06, + "loss": 1.74503021, + "memory(GiB)": 67.97, + "step": 4545, + "train_speed(iter/s)": 1.635808 + }, + { + "acc": 0.63483181, + "epoch": 0.11542364282090309, + "grad_norm": 5.40625, + "learning_rate": 9.99837414080052e-06, + "loss": 1.71271553, + "memory(GiB)": 67.97, + "step": 4550, + "train_speed(iter/s)": 1.636013 + }, + { + "acc": 0.62522945, + "epoch": 0.11555048198883815, + "grad_norm": 5.625, + "learning_rate": 9.998347291219393e-06, + "loss": 1.75175209, + "memory(GiB)": 67.97, + "step": 4555, + "train_speed(iter/s)": 1.636215 + }, + { + "acc": 0.64219851, + "epoch": 0.1156773211567732, + "grad_norm": 5.34375, + "learning_rate": 9.998320221788635e-06, + "loss": 1.71677856, + "memory(GiB)": 67.97, + "step": 4560, + "train_speed(iter/s)": 1.636405 + }, + { + "acc": 0.61600294, + "epoch": 0.11580416032470826, + "grad_norm": 6.78125, + "learning_rate": 9.998292932509438e-06, + "loss": 1.89740715, + "memory(GiB)": 67.97, + "step": 4565, + "train_speed(iter/s)": 1.636596 + }, + { + "acc": 0.63052859, + "epoch": 0.11593099949264332, + "grad_norm": 5.34375, + "learning_rate": 9.998265423383003e-06, + "loss": 1.78069267, + "memory(GiB)": 67.97, + "step": 4570, + "train_speed(iter/s)": 1.636785 + }, + { + "acc": 0.63983808, + "epoch": 0.11605783866057838, + "grad_norm": 6.875, + "learning_rate": 9.998237694410537e-06, + "loss": 1.73593369, + "memory(GiB)": 67.97, + "step": 4575, + "train_speed(iter/s)": 1.636975 + }, + { + "acc": 0.64412074, + "epoch": 0.11618467782851344, + "grad_norm": 6.5, + "learning_rate": 9.998209745593264e-06, + "loss": 1.68864708, + "memory(GiB)": 67.97, + "step": 4580, + "train_speed(iter/s)": 1.637169 + }, + { + "acc": 0.63633366, + "epoch": 0.1163115169964485, + "grad_norm": 5.8125, + "learning_rate": 9.99818157693241e-06, + "loss": 1.73843613, + "memory(GiB)": 67.97, + "step": 4585, + "train_speed(iter/s)": 1.637363 + }, + { + "acc": 0.65344076, + "epoch": 0.11643835616438356, + "grad_norm": 5.375, + "learning_rate": 9.998153188429216e-06, + "loss": 1.60083752, + "memory(GiB)": 67.97, + "step": 4590, + "train_speed(iter/s)": 1.63754 + }, + { + "acc": 0.64183235, + "epoch": 0.11656519533231861, + "grad_norm": 5.625, + "learning_rate": 9.99812458008493e-06, + "loss": 1.6896965, + "memory(GiB)": 67.97, + "step": 4595, + "train_speed(iter/s)": 1.637728 + }, + { + "acc": 0.63621726, + "epoch": 0.11669203450025367, + "grad_norm": 4.8125, + "learning_rate": 9.998095751900806e-06, + "loss": 1.76160297, + "memory(GiB)": 67.97, + "step": 4600, + "train_speed(iter/s)": 1.637911 + }, + { + "acc": 0.62720642, + "epoch": 0.11681887366818873, + "grad_norm": 4.46875, + "learning_rate": 9.99806670387812e-06, + "loss": 1.74097443, + "memory(GiB)": 67.97, + "step": 4605, + "train_speed(iter/s)": 1.638106 + }, + { + "acc": 0.63699913, + "epoch": 0.11694571283612379, + "grad_norm": 6.1875, + "learning_rate": 9.998037436018144e-06, + "loss": 1.74612026, + "memory(GiB)": 67.97, + "step": 4610, + "train_speed(iter/s)": 1.638299 + }, + { + "acc": 0.63713827, + "epoch": 0.11707255200405885, + "grad_norm": 7.84375, + "learning_rate": 9.998007948322168e-06, + "loss": 1.66562538, + "memory(GiB)": 67.97, + "step": 4615, + "train_speed(iter/s)": 1.638474 + }, + { + "acc": 0.63443341, + "epoch": 0.1171993911719939, + "grad_norm": 5.34375, + "learning_rate": 9.997978240791487e-06, + "loss": 1.75145226, + "memory(GiB)": 67.97, + "step": 4620, + "train_speed(iter/s)": 1.638672 + }, + { + "acc": 0.63157201, + "epoch": 0.11732623033992896, + "grad_norm": 8.625, + "learning_rate": 9.99794831342741e-06, + "loss": 1.75364857, + "memory(GiB)": 67.97, + "step": 4625, + "train_speed(iter/s)": 1.638874 + }, + { + "acc": 0.63413148, + "epoch": 0.11745306950786402, + "grad_norm": 5.375, + "learning_rate": 9.99791816623125e-06, + "loss": 1.75627136, + "memory(GiB)": 67.97, + "step": 4630, + "train_speed(iter/s)": 1.63907 + }, + { + "acc": 0.6432013, + "epoch": 0.11757990867579908, + "grad_norm": 4.8125, + "learning_rate": 9.997887799204335e-06, + "loss": 1.67022057, + "memory(GiB)": 67.97, + "step": 4635, + "train_speed(iter/s)": 1.639261 + }, + { + "acc": 0.62581482, + "epoch": 0.11770674784373414, + "grad_norm": 5.3125, + "learning_rate": 9.997857212348e-06, + "loss": 1.7720356, + "memory(GiB)": 67.97, + "step": 4640, + "train_speed(iter/s)": 1.639439 + }, + { + "acc": 0.64077492, + "epoch": 0.1178335870116692, + "grad_norm": 5.09375, + "learning_rate": 9.997826405663593e-06, + "loss": 1.72522316, + "memory(GiB)": 67.97, + "step": 4645, + "train_speed(iter/s)": 1.639626 + }, + { + "acc": 0.65947065, + "epoch": 0.11796042617960426, + "grad_norm": 6.09375, + "learning_rate": 9.997795379152468e-06, + "loss": 1.58124809, + "memory(GiB)": 67.97, + "step": 4650, + "train_speed(iter/s)": 1.639802 + }, + { + "acc": 0.61063266, + "epoch": 0.11808726534753931, + "grad_norm": 5.8125, + "learning_rate": 9.997764132815985e-06, + "loss": 1.7946537, + "memory(GiB)": 67.97, + "step": 4655, + "train_speed(iter/s)": 1.639985 + }, + { + "acc": 0.6272243, + "epoch": 0.11821410451547437, + "grad_norm": 5.8125, + "learning_rate": 9.997732666655524e-06, + "loss": 1.79654083, + "memory(GiB)": 67.97, + "step": 4660, + "train_speed(iter/s)": 1.640177 + }, + { + "acc": 0.6152998, + "epoch": 0.11834094368340943, + "grad_norm": 5.25, + "learning_rate": 9.997700980672469e-06, + "loss": 1.77889538, + "memory(GiB)": 67.97, + "step": 4665, + "train_speed(iter/s)": 1.64038 + }, + { + "acc": 0.64574528, + "epoch": 0.11846778285134449, + "grad_norm": 5.71875, + "learning_rate": 9.997669074868208e-06, + "loss": 1.67501144, + "memory(GiB)": 67.97, + "step": 4670, + "train_speed(iter/s)": 1.640572 + }, + { + "acc": 0.64163051, + "epoch": 0.11859462201927955, + "grad_norm": 6.0625, + "learning_rate": 9.997636949244151e-06, + "loss": 1.72753601, + "memory(GiB)": 67.97, + "step": 4675, + "train_speed(iter/s)": 1.640769 + }, + { + "acc": 0.63248835, + "epoch": 0.1187214611872146, + "grad_norm": 5.4375, + "learning_rate": 9.997604603801707e-06, + "loss": 1.76771755, + "memory(GiB)": 67.97, + "step": 4680, + "train_speed(iter/s)": 1.640963 + }, + { + "acc": 0.63034139, + "epoch": 0.11884830035514966, + "grad_norm": 5.875, + "learning_rate": 9.9975720385423e-06, + "loss": 1.81456032, + "memory(GiB)": 67.97, + "step": 4685, + "train_speed(iter/s)": 1.641155 + }, + { + "acc": 0.64110069, + "epoch": 0.11897513952308472, + "grad_norm": 6.5, + "learning_rate": 9.997539253467361e-06, + "loss": 1.7475235, + "memory(GiB)": 67.97, + "step": 4690, + "train_speed(iter/s)": 1.641342 + }, + { + "acc": 0.63641825, + "epoch": 0.11910197869101978, + "grad_norm": 5.46875, + "learning_rate": 9.997506248578334e-06, + "loss": 1.73779526, + "memory(GiB)": 67.97, + "step": 4695, + "train_speed(iter/s)": 1.641525 + }, + { + "acc": 0.64413662, + "epoch": 0.11922881785895484, + "grad_norm": 7.3125, + "learning_rate": 9.997473023876671e-06, + "loss": 1.78399849, + "memory(GiB)": 67.97, + "step": 4700, + "train_speed(iter/s)": 1.641712 + }, + { + "acc": 0.63945947, + "epoch": 0.1193556570268899, + "grad_norm": 5.03125, + "learning_rate": 9.997439579363831e-06, + "loss": 1.79095154, + "memory(GiB)": 67.97, + "step": 4705, + "train_speed(iter/s)": 1.641897 + }, + { + "acc": 0.63179083, + "epoch": 0.11948249619482496, + "grad_norm": 6.0625, + "learning_rate": 9.997405915041288e-06, + "loss": 1.72449188, + "memory(GiB)": 67.97, + "step": 4710, + "train_speed(iter/s)": 1.642066 + }, + { + "acc": 0.64959311, + "epoch": 0.11960933536276001, + "grad_norm": 6.375, + "learning_rate": 9.99737203091052e-06, + "loss": 1.68974953, + "memory(GiB)": 67.97, + "step": 4715, + "train_speed(iter/s)": 1.642225 + }, + { + "acc": 0.65050182, + "epoch": 0.11973617453069507, + "grad_norm": 5.75, + "learning_rate": 9.997337926973018e-06, + "loss": 1.70047817, + "memory(GiB)": 67.97, + "step": 4720, + "train_speed(iter/s)": 1.642401 + }, + { + "acc": 0.63357754, + "epoch": 0.11986301369863013, + "grad_norm": 5.78125, + "learning_rate": 9.997303603230282e-06, + "loss": 1.7486412, + "memory(GiB)": 67.97, + "step": 4725, + "train_speed(iter/s)": 1.642576 + }, + { + "acc": 0.64942546, + "epoch": 0.11998985286656519, + "grad_norm": 6.4375, + "learning_rate": 9.997269059683822e-06, + "loss": 1.68743744, + "memory(GiB)": 67.97, + "step": 4730, + "train_speed(iter/s)": 1.642746 + }, + { + "acc": 0.62710915, + "epoch": 0.12011669203450025, + "grad_norm": 6.84375, + "learning_rate": 9.997234296335159e-06, + "loss": 1.75961323, + "memory(GiB)": 67.97, + "step": 4735, + "train_speed(iter/s)": 1.642937 + }, + { + "acc": 0.64256353, + "epoch": 0.1202435312024353, + "grad_norm": 5.34375, + "learning_rate": 9.997199313185821e-06, + "loss": 1.70316162, + "memory(GiB)": 67.97, + "step": 4740, + "train_speed(iter/s)": 1.643104 + }, + { + "acc": 0.65233736, + "epoch": 0.12037037037037036, + "grad_norm": 5.40625, + "learning_rate": 9.997164110237345e-06, + "loss": 1.60657234, + "memory(GiB)": 67.97, + "step": 4745, + "train_speed(iter/s)": 1.643284 + }, + { + "acc": 0.64171166, + "epoch": 0.12049720953830542, + "grad_norm": 6.53125, + "learning_rate": 9.99712868749128e-06, + "loss": 1.70478973, + "memory(GiB)": 67.97, + "step": 4750, + "train_speed(iter/s)": 1.643465 + }, + { + "acc": 0.66235285, + "epoch": 0.12062404870624048, + "grad_norm": 6.78125, + "learning_rate": 9.997093044949186e-06, + "loss": 1.71249313, + "memory(GiB)": 67.97, + "step": 4755, + "train_speed(iter/s)": 1.643645 + }, + { + "acc": 0.64120498, + "epoch": 0.12075088787417554, + "grad_norm": 6.0, + "learning_rate": 9.997057182612631e-06, + "loss": 1.67925282, + "memory(GiB)": 67.97, + "step": 4760, + "train_speed(iter/s)": 1.643808 + }, + { + "acc": 0.63244638, + "epoch": 0.1208777270421106, + "grad_norm": 6.5625, + "learning_rate": 9.997021100483188e-06, + "loss": 1.72408867, + "memory(GiB)": 67.97, + "step": 4765, + "train_speed(iter/s)": 1.643983 + }, + { + "acc": 0.64735184, + "epoch": 0.12100456621004566, + "grad_norm": 6.4375, + "learning_rate": 9.996984798562448e-06, + "loss": 1.65241432, + "memory(GiB)": 67.97, + "step": 4770, + "train_speed(iter/s)": 1.644146 + }, + { + "acc": 0.63440666, + "epoch": 0.12113140537798071, + "grad_norm": 6.09375, + "learning_rate": 9.996948276852008e-06, + "loss": 1.70845795, + "memory(GiB)": 67.97, + "step": 4775, + "train_speed(iter/s)": 1.644322 + }, + { + "acc": 0.63711462, + "epoch": 0.12125824454591577, + "grad_norm": 5.46875, + "learning_rate": 9.99691153535347e-06, + "loss": 1.7778965, + "memory(GiB)": 67.97, + "step": 4780, + "train_speed(iter/s)": 1.644505 + }, + { + "acc": 0.64322777, + "epoch": 0.12138508371385083, + "grad_norm": 5.34375, + "learning_rate": 9.996874574068457e-06, + "loss": 1.69959793, + "memory(GiB)": 67.97, + "step": 4785, + "train_speed(iter/s)": 1.644671 + }, + { + "acc": 0.6406477, + "epoch": 0.12151192288178589, + "grad_norm": 4.71875, + "learning_rate": 9.996837392998586e-06, + "loss": 1.72130203, + "memory(GiB)": 67.97, + "step": 4790, + "train_speed(iter/s)": 1.644846 + }, + { + "acc": 0.64000039, + "epoch": 0.12163876204972095, + "grad_norm": 6.21875, + "learning_rate": 9.996799992145501e-06, + "loss": 1.72441673, + "memory(GiB)": 67.97, + "step": 4795, + "train_speed(iter/s)": 1.645014 + }, + { + "acc": 0.64301157, + "epoch": 0.121765601217656, + "grad_norm": 5.40625, + "learning_rate": 9.996762371510843e-06, + "loss": 1.76455154, + "memory(GiB)": 67.97, + "step": 4800, + "train_speed(iter/s)": 1.645192 + }, + { + "acc": 0.64327803, + "epoch": 0.12189244038559106, + "grad_norm": 5.8125, + "learning_rate": 9.996724531096264e-06, + "loss": 1.72058315, + "memory(GiB)": 67.97, + "step": 4805, + "train_speed(iter/s)": 1.645363 + }, + { + "acc": 0.64176278, + "epoch": 0.12201927955352612, + "grad_norm": 6.34375, + "learning_rate": 9.996686470903434e-06, + "loss": 1.73159981, + "memory(GiB)": 67.97, + "step": 4810, + "train_speed(iter/s)": 1.645528 + }, + { + "acc": 0.64407272, + "epoch": 0.12214611872146118, + "grad_norm": 5.4375, + "learning_rate": 9.996648190934025e-06, + "loss": 1.6728405, + "memory(GiB)": 67.97, + "step": 4815, + "train_speed(iter/s)": 1.645701 + }, + { + "acc": 0.64219403, + "epoch": 0.12227295788939624, + "grad_norm": 6.1875, + "learning_rate": 9.996609691189718e-06, + "loss": 1.6965086, + "memory(GiB)": 67.97, + "step": 4820, + "train_speed(iter/s)": 1.645873 + }, + { + "acc": 0.64908476, + "epoch": 0.1223997970573313, + "grad_norm": 5.34375, + "learning_rate": 9.996570971672209e-06, + "loss": 1.68272762, + "memory(GiB)": 67.97, + "step": 4825, + "train_speed(iter/s)": 1.646047 + }, + { + "acc": 0.62543702, + "epoch": 0.12252663622526636, + "grad_norm": 6.0, + "learning_rate": 9.996532032383202e-06, + "loss": 1.81016521, + "memory(GiB)": 67.97, + "step": 4830, + "train_speed(iter/s)": 1.646202 + }, + { + "acc": 0.63629918, + "epoch": 0.12265347539320141, + "grad_norm": 7.03125, + "learning_rate": 9.996492873324406e-06, + "loss": 1.79810486, + "memory(GiB)": 67.97, + "step": 4835, + "train_speed(iter/s)": 1.646374 + }, + { + "acc": 0.63945713, + "epoch": 0.12278031456113647, + "grad_norm": 5.875, + "learning_rate": 9.996453494497546e-06, + "loss": 1.7556982, + "memory(GiB)": 67.97, + "step": 4840, + "train_speed(iter/s)": 1.646542 + }, + { + "acc": 0.63499227, + "epoch": 0.12290715372907153, + "grad_norm": 7.15625, + "learning_rate": 9.996413895904355e-06, + "loss": 1.72905922, + "memory(GiB)": 67.97, + "step": 4845, + "train_speed(iter/s)": 1.646724 + }, + { + "acc": 0.64340296, + "epoch": 0.12303399289700659, + "grad_norm": 5.875, + "learning_rate": 9.996374077546573e-06, + "loss": 1.67888451, + "memory(GiB)": 67.97, + "step": 4850, + "train_speed(iter/s)": 1.646888 + }, + { + "acc": 0.63953152, + "epoch": 0.12316083206494165, + "grad_norm": 5.53125, + "learning_rate": 9.996334039425952e-06, + "loss": 1.71106052, + "memory(GiB)": 67.97, + "step": 4855, + "train_speed(iter/s)": 1.647057 + }, + { + "acc": 0.65192795, + "epoch": 0.1232876712328767, + "grad_norm": 7.15625, + "learning_rate": 9.996293781544255e-06, + "loss": 1.65116768, + "memory(GiB)": 67.97, + "step": 4860, + "train_speed(iter/s)": 1.647223 + }, + { + "acc": 0.63779411, + "epoch": 0.12341451040081176, + "grad_norm": 5.21875, + "learning_rate": 9.996253303903247e-06, + "loss": 1.6962162, + "memory(GiB)": 67.97, + "step": 4865, + "train_speed(iter/s)": 1.647387 + }, + { + "acc": 0.63340173, + "epoch": 0.12354134956874682, + "grad_norm": 5.21875, + "learning_rate": 9.996212606504713e-06, + "loss": 1.68488808, + "memory(GiB)": 67.97, + "step": 4870, + "train_speed(iter/s)": 1.647569 + }, + { + "acc": 0.65302467, + "epoch": 0.12366818873668188, + "grad_norm": 5.71875, + "learning_rate": 9.996171689350444e-06, + "loss": 1.72270584, + "memory(GiB)": 67.97, + "step": 4875, + "train_speed(iter/s)": 1.647726 + }, + { + "acc": 0.65180416, + "epoch": 0.12379502790461694, + "grad_norm": 5.1875, + "learning_rate": 9.996130552442237e-06, + "loss": 1.64479866, + "memory(GiB)": 67.97, + "step": 4880, + "train_speed(iter/s)": 1.647895 + }, + { + "acc": 0.63972445, + "epoch": 0.123921867072552, + "grad_norm": 6.40625, + "learning_rate": 9.996089195781902e-06, + "loss": 1.61258755, + "memory(GiB)": 67.97, + "step": 4885, + "train_speed(iter/s)": 1.64805 + }, + { + "acc": 0.63272543, + "epoch": 0.12404870624048706, + "grad_norm": 5.71875, + "learning_rate": 9.996047619371256e-06, + "loss": 1.74801426, + "memory(GiB)": 67.97, + "step": 4890, + "train_speed(iter/s)": 1.648203 + }, + { + "acc": 0.64775581, + "epoch": 0.12417554540842211, + "grad_norm": 7.125, + "learning_rate": 9.996005823212132e-06, + "loss": 1.71116333, + "memory(GiB)": 67.97, + "step": 4895, + "train_speed(iter/s)": 1.648357 + }, + { + "acc": 0.63807197, + "epoch": 0.12430238457635717, + "grad_norm": 8.125, + "learning_rate": 9.995963807306368e-06, + "loss": 1.69778404, + "memory(GiB)": 67.97, + "step": 4900, + "train_speed(iter/s)": 1.648531 + }, + { + "acc": 0.64420915, + "epoch": 0.12442922374429223, + "grad_norm": 5.28125, + "learning_rate": 9.995921571655808e-06, + "loss": 1.70640068, + "memory(GiB)": 67.97, + "step": 4905, + "train_speed(iter/s)": 1.648694 + }, + { + "acc": 0.62597256, + "epoch": 0.12455606291222729, + "grad_norm": 6.9375, + "learning_rate": 9.995879116262312e-06, + "loss": 1.75829544, + "memory(GiB)": 67.97, + "step": 4910, + "train_speed(iter/s)": 1.648874 + }, + { + "acc": 0.65001144, + "epoch": 0.12468290208016235, + "grad_norm": 5.28125, + "learning_rate": 9.995836441127749e-06, + "loss": 1.62578773, + "memory(GiB)": 67.97, + "step": 4915, + "train_speed(iter/s)": 1.649025 + }, + { + "acc": 0.64603391, + "epoch": 0.1248097412480974, + "grad_norm": 5.40625, + "learning_rate": 9.995793546253993e-06, + "loss": 1.63102798, + "memory(GiB)": 67.97, + "step": 4920, + "train_speed(iter/s)": 1.649174 + }, + { + "acc": 0.64495444, + "epoch": 0.12493658041603246, + "grad_norm": 5.8125, + "learning_rate": 9.995750431642933e-06, + "loss": 1.71283379, + "memory(GiB)": 67.97, + "step": 4925, + "train_speed(iter/s)": 1.649334 + }, + { + "acc": 0.63811507, + "epoch": 0.12506341958396752, + "grad_norm": 6.84375, + "learning_rate": 9.995707097296465e-06, + "loss": 1.72495098, + "memory(GiB)": 67.97, + "step": 4930, + "train_speed(iter/s)": 1.649506 + }, + { + "acc": 0.63929243, + "epoch": 0.1251902587519026, + "grad_norm": 5.625, + "learning_rate": 9.995663543216493e-06, + "loss": 1.75177402, + "memory(GiB)": 67.97, + "step": 4935, + "train_speed(iter/s)": 1.649671 + }, + { + "acc": 0.64940729, + "epoch": 0.12531709791983764, + "grad_norm": 5.34375, + "learning_rate": 9.995619769404936e-06, + "loss": 1.71212883, + "memory(GiB)": 67.97, + "step": 4940, + "train_speed(iter/s)": 1.649835 + }, + { + "acc": 0.62320843, + "epoch": 0.1254439370877727, + "grad_norm": 7.125, + "learning_rate": 9.995575775863717e-06, + "loss": 1.79358368, + "memory(GiB)": 67.97, + "step": 4945, + "train_speed(iter/s)": 1.649988 + }, + { + "acc": 0.66100016, + "epoch": 0.12557077625570776, + "grad_norm": 6.90625, + "learning_rate": 9.995531562594773e-06, + "loss": 1.69016685, + "memory(GiB)": 67.97, + "step": 4950, + "train_speed(iter/s)": 1.650137 + }, + { + "acc": 0.65467863, + "epoch": 0.12569761542364283, + "grad_norm": 6.21875, + "learning_rate": 9.995487129600046e-06, + "loss": 1.64531898, + "memory(GiB)": 67.97, + "step": 4955, + "train_speed(iter/s)": 1.650292 + }, + { + "acc": 0.64707317, + "epoch": 0.12582445459157787, + "grad_norm": 5.40625, + "learning_rate": 9.995442476881491e-06, + "loss": 1.67819366, + "memory(GiB)": 67.97, + "step": 4960, + "train_speed(iter/s)": 1.650443 + }, + { + "acc": 0.64336152, + "epoch": 0.12595129375951294, + "grad_norm": 4.8125, + "learning_rate": 9.995397604441076e-06, + "loss": 1.68882675, + "memory(GiB)": 67.97, + "step": 4965, + "train_speed(iter/s)": 1.650604 + }, + { + "acc": 0.64805965, + "epoch": 0.126078132927448, + "grad_norm": 6.09375, + "learning_rate": 9.995352512280767e-06, + "loss": 1.73531609, + "memory(GiB)": 67.97, + "step": 4970, + "train_speed(iter/s)": 1.650772 + }, + { + "acc": 0.64736953, + "epoch": 0.12620497209538306, + "grad_norm": 6.5, + "learning_rate": 9.995307200402555e-06, + "loss": 1.72024765, + "memory(GiB)": 67.97, + "step": 4975, + "train_speed(iter/s)": 1.650916 + }, + { + "acc": 0.64368896, + "epoch": 0.1263318112633181, + "grad_norm": 6.09375, + "learning_rate": 9.995261668808429e-06, + "loss": 1.63274879, + "memory(GiB)": 67.97, + "step": 4980, + "train_speed(iter/s)": 1.651074 + }, + { + "acc": 0.6580523, + "epoch": 0.12645865043125318, + "grad_norm": 5.5, + "learning_rate": 9.995215917500395e-06, + "loss": 1.64459648, + "memory(GiB)": 67.97, + "step": 4985, + "train_speed(iter/s)": 1.65123 + }, + { + "acc": 0.62267323, + "epoch": 0.12658548959918822, + "grad_norm": 5.1875, + "learning_rate": 9.995169946480459e-06, + "loss": 1.70902634, + "memory(GiB)": 67.97, + "step": 4990, + "train_speed(iter/s)": 1.651386 + }, + { + "acc": 0.63211341, + "epoch": 0.1267123287671233, + "grad_norm": 6.375, + "learning_rate": 9.99512375575065e-06, + "loss": 1.69203529, + "memory(GiB)": 67.97, + "step": 4995, + "train_speed(iter/s)": 1.651543 + }, + { + "acc": 0.6414547, + "epoch": 0.12683916793505834, + "grad_norm": 6.28125, + "learning_rate": 9.995077345312994e-06, + "loss": 1.70896854, + "memory(GiB)": 67.97, + "step": 5000, + "train_speed(iter/s)": 1.651709 + }, + { + "epoch": 0.12683916793505834, + "eval_acc": 0.6274574664252585, + "eval_loss": 1.6814990043640137, + "eval_runtime": 58.1476, + "eval_samples_per_second": 109.549, + "eval_steps_per_second": 27.396, + "step": 5000 + }, + { + "acc": 0.63395348, + "epoch": 0.1269660071029934, + "grad_norm": 6.6875, + "learning_rate": 9.995030715169535e-06, + "loss": 1.74872665, + "memory(GiB)": 67.97, + "step": 5005, + "train_speed(iter/s)": 1.618533 + }, + { + "acc": 0.62984314, + "epoch": 0.12709284627092846, + "grad_norm": 5.40625, + "learning_rate": 9.994983865322327e-06, + "loss": 1.79818287, + "memory(GiB)": 67.97, + "step": 5010, + "train_speed(iter/s)": 1.618712 + }, + { + "acc": 0.63942041, + "epoch": 0.12721968543886353, + "grad_norm": 7.40625, + "learning_rate": 9.994936795773424e-06, + "loss": 1.71386108, + "memory(GiB)": 67.97, + "step": 5015, + "train_speed(iter/s)": 1.61889 + }, + { + "acc": 0.64456043, + "epoch": 0.12734652460679857, + "grad_norm": 4.65625, + "learning_rate": 9.994889506524903e-06, + "loss": 1.70381527, + "memory(GiB)": 67.97, + "step": 5020, + "train_speed(iter/s)": 1.619065 + }, + { + "acc": 0.64319324, + "epoch": 0.12747336377473364, + "grad_norm": 6.59375, + "learning_rate": 9.994841997578839e-06, + "loss": 1.7647501, + "memory(GiB)": 67.97, + "step": 5025, + "train_speed(iter/s)": 1.619243 + }, + { + "acc": 0.64262609, + "epoch": 0.1276002029426687, + "grad_norm": 4.4375, + "learning_rate": 9.994794268937325e-06, + "loss": 1.72183475, + "memory(GiB)": 67.97, + "step": 5030, + "train_speed(iter/s)": 1.619405 + }, + { + "acc": 0.6308147, + "epoch": 0.12772704211060376, + "grad_norm": 8.1875, + "learning_rate": 9.994746320602457e-06, + "loss": 1.77366085, + "memory(GiB)": 67.97, + "step": 5035, + "train_speed(iter/s)": 1.619597 + }, + { + "acc": 0.64514465, + "epoch": 0.1278538812785388, + "grad_norm": 4.875, + "learning_rate": 9.994698152576347e-06, + "loss": 1.73413086, + "memory(GiB)": 67.97, + "step": 5040, + "train_speed(iter/s)": 1.619782 + }, + { + "acc": 0.64518166, + "epoch": 0.12798072044647388, + "grad_norm": 5.875, + "learning_rate": 9.994649764861114e-06, + "loss": 1.69440193, + "memory(GiB)": 67.97, + "step": 5045, + "train_speed(iter/s)": 1.619967 + }, + { + "acc": 0.63295403, + "epoch": 0.12810755961440892, + "grad_norm": 5.5625, + "learning_rate": 9.994601157458882e-06, + "loss": 1.70097904, + "memory(GiB)": 67.97, + "step": 5050, + "train_speed(iter/s)": 1.620153 + }, + { + "acc": 0.61968193, + "epoch": 0.128234398782344, + "grad_norm": 5.8125, + "learning_rate": 9.994552330371792e-06, + "loss": 1.78249187, + "memory(GiB)": 67.97, + "step": 5055, + "train_speed(iter/s)": 1.620332 + }, + { + "acc": 0.62847509, + "epoch": 0.12836123795027904, + "grad_norm": 5.0625, + "learning_rate": 9.994503283601993e-06, + "loss": 1.73668785, + "memory(GiB)": 67.97, + "step": 5060, + "train_speed(iter/s)": 1.620509 + }, + { + "acc": 0.62697902, + "epoch": 0.1284880771182141, + "grad_norm": 5.0, + "learning_rate": 9.99445401715164e-06, + "loss": 1.75026798, + "memory(GiB)": 67.97, + "step": 5065, + "train_speed(iter/s)": 1.620676 + }, + { + "acc": 0.64302969, + "epoch": 0.12861491628614916, + "grad_norm": 5.21875, + "learning_rate": 9.994404531022901e-06, + "loss": 1.6923233, + "memory(GiB)": 67.97, + "step": 5070, + "train_speed(iter/s)": 1.620845 + }, + { + "acc": 0.64488964, + "epoch": 0.12874175545408423, + "grad_norm": 6.78125, + "learning_rate": 9.994354825217954e-06, + "loss": 1.66673298, + "memory(GiB)": 67.97, + "step": 5075, + "train_speed(iter/s)": 1.621022 + }, + { + "acc": 0.63519349, + "epoch": 0.12886859462201927, + "grad_norm": 6.6875, + "learning_rate": 9.99430489973898e-06, + "loss": 1.73534412, + "memory(GiB)": 67.97, + "step": 5080, + "train_speed(iter/s)": 1.621195 + }, + { + "acc": 0.62550731, + "epoch": 0.12899543378995434, + "grad_norm": 8.25, + "learning_rate": 9.994254754588182e-06, + "loss": 1.80104256, + "memory(GiB)": 67.97, + "step": 5085, + "train_speed(iter/s)": 1.621379 + }, + { + "acc": 0.64148254, + "epoch": 0.1291222729578894, + "grad_norm": 5.125, + "learning_rate": 9.99420438976776e-06, + "loss": 1.63720379, + "memory(GiB)": 67.97, + "step": 5090, + "train_speed(iter/s)": 1.621502 + }, + { + "acc": 0.63018656, + "epoch": 0.12924911212582446, + "grad_norm": 6.0, + "learning_rate": 9.994153805279932e-06, + "loss": 1.72082748, + "memory(GiB)": 67.97, + "step": 5095, + "train_speed(iter/s)": 1.621638 + }, + { + "acc": 0.64703856, + "epoch": 0.1293759512937595, + "grad_norm": 5.875, + "learning_rate": 9.994103001126923e-06, + "loss": 1.65867615, + "memory(GiB)": 67.97, + "step": 5100, + "train_speed(iter/s)": 1.621809 + }, + { + "acc": 0.64282503, + "epoch": 0.12950279046169458, + "grad_norm": 6.21875, + "learning_rate": 9.994051977310966e-06, + "loss": 1.69839401, + "memory(GiB)": 67.97, + "step": 5105, + "train_speed(iter/s)": 1.621992 + }, + { + "acc": 0.62093878, + "epoch": 0.12962962962962962, + "grad_norm": 5.53125, + "learning_rate": 9.994000733834307e-06, + "loss": 1.80523911, + "memory(GiB)": 67.97, + "step": 5110, + "train_speed(iter/s)": 1.622167 + }, + { + "acc": 0.6438539, + "epoch": 0.1297564687975647, + "grad_norm": 4.71875, + "learning_rate": 9.993949270699197e-06, + "loss": 1.69229202, + "memory(GiB)": 67.97, + "step": 5115, + "train_speed(iter/s)": 1.622334 + }, + { + "acc": 0.65608463, + "epoch": 0.12988330796549974, + "grad_norm": 5.4375, + "learning_rate": 9.993897587907904e-06, + "loss": 1.63800278, + "memory(GiB)": 67.97, + "step": 5120, + "train_speed(iter/s)": 1.622521 + }, + { + "acc": 0.63888769, + "epoch": 0.1300101471334348, + "grad_norm": 6.125, + "learning_rate": 9.993845685462697e-06, + "loss": 1.71679993, + "memory(GiB)": 67.97, + "step": 5125, + "train_speed(iter/s)": 1.622697 + }, + { + "acc": 0.61410284, + "epoch": 0.13013698630136986, + "grad_norm": 7.03125, + "learning_rate": 9.993793563365864e-06, + "loss": 1.76411209, + "memory(GiB)": 67.97, + "step": 5130, + "train_speed(iter/s)": 1.622872 + }, + { + "acc": 0.63688469, + "epoch": 0.13026382546930493, + "grad_norm": 5.375, + "learning_rate": 9.993741221619692e-06, + "loss": 1.69519062, + "memory(GiB)": 67.97, + "step": 5135, + "train_speed(iter/s)": 1.623041 + }, + { + "acc": 0.64492025, + "epoch": 0.13039066463723997, + "grad_norm": 5.46875, + "learning_rate": 9.993688660226486e-06, + "loss": 1.71028175, + "memory(GiB)": 67.97, + "step": 5140, + "train_speed(iter/s)": 1.623214 + }, + { + "acc": 0.65607786, + "epoch": 0.13051750380517504, + "grad_norm": 8.75, + "learning_rate": 9.993635879188557e-06, + "loss": 1.63757133, + "memory(GiB)": 67.97, + "step": 5145, + "train_speed(iter/s)": 1.623372 + }, + { + "acc": 0.64062729, + "epoch": 0.1306443429731101, + "grad_norm": 6.09375, + "learning_rate": 9.993582878508229e-06, + "loss": 1.69953728, + "memory(GiB)": 67.97, + "step": 5150, + "train_speed(iter/s)": 1.623553 + }, + { + "acc": 0.62717762, + "epoch": 0.13077118214104516, + "grad_norm": 5.4375, + "learning_rate": 9.993529658187829e-06, + "loss": 1.739147, + "memory(GiB)": 67.97, + "step": 5155, + "train_speed(iter/s)": 1.623731 + }, + { + "acc": 0.64492464, + "epoch": 0.1308980213089802, + "grad_norm": 5.59375, + "learning_rate": 9.9934762182297e-06, + "loss": 1.69992256, + "memory(GiB)": 67.97, + "step": 5160, + "train_speed(iter/s)": 1.623903 + }, + { + "acc": 0.6508604, + "epoch": 0.13102486047691528, + "grad_norm": 5.6875, + "learning_rate": 9.993422558636194e-06, + "loss": 1.6587532, + "memory(GiB)": 67.97, + "step": 5165, + "train_speed(iter/s)": 1.624087 + }, + { + "acc": 0.64501171, + "epoch": 0.13115169964485032, + "grad_norm": 5.0625, + "learning_rate": 9.99336867940967e-06, + "loss": 1.6909111, + "memory(GiB)": 67.97, + "step": 5170, + "train_speed(iter/s)": 1.624255 + }, + { + "acc": 0.65164576, + "epoch": 0.1312785388127854, + "grad_norm": 6.28125, + "learning_rate": 9.993314580552497e-06, + "loss": 1.63433113, + "memory(GiB)": 67.97, + "step": 5175, + "train_speed(iter/s)": 1.624426 + }, + { + "acc": 0.63783703, + "epoch": 0.13140537798072044, + "grad_norm": 7.71875, + "learning_rate": 9.993260262067054e-06, + "loss": 1.66052856, + "memory(GiB)": 67.97, + "step": 5180, + "train_speed(iter/s)": 1.624587 + }, + { + "acc": 0.62946987, + "epoch": 0.1315322171486555, + "grad_norm": 6.65625, + "learning_rate": 9.993205723955734e-06, + "loss": 1.76554985, + "memory(GiB)": 67.97, + "step": 5185, + "train_speed(iter/s)": 1.624754 + }, + { + "acc": 0.63935471, + "epoch": 0.13165905631659056, + "grad_norm": 7.125, + "learning_rate": 9.993150966220933e-06, + "loss": 1.65852242, + "memory(GiB)": 67.97, + "step": 5190, + "train_speed(iter/s)": 1.624926 + }, + { + "acc": 0.63495178, + "epoch": 0.13178589548452563, + "grad_norm": 5.5, + "learning_rate": 9.993095988865057e-06, + "loss": 1.73110809, + "memory(GiB)": 67.97, + "step": 5195, + "train_speed(iter/s)": 1.625111 + }, + { + "acc": 0.6334991, + "epoch": 0.13191273465246067, + "grad_norm": 4.75, + "learning_rate": 9.99304079189053e-06, + "loss": 1.75615196, + "memory(GiB)": 67.97, + "step": 5200, + "train_speed(iter/s)": 1.625279 + }, + { + "acc": 0.64708085, + "epoch": 0.13203957382039574, + "grad_norm": 16.125, + "learning_rate": 9.992985375299775e-06, + "loss": 1.64645767, + "memory(GiB)": 67.97, + "step": 5205, + "train_speed(iter/s)": 1.62545 + }, + { + "acc": 0.62608576, + "epoch": 0.1321664129883308, + "grad_norm": 7.71875, + "learning_rate": 9.992929739095232e-06, + "loss": 1.72588921, + "memory(GiB)": 67.97, + "step": 5210, + "train_speed(iter/s)": 1.625619 + }, + { + "acc": 0.62113743, + "epoch": 0.13229325215626586, + "grad_norm": 6.5625, + "learning_rate": 9.992873883279345e-06, + "loss": 1.79214783, + "memory(GiB)": 67.97, + "step": 5215, + "train_speed(iter/s)": 1.625798 + }, + { + "acc": 0.62433376, + "epoch": 0.1324200913242009, + "grad_norm": 5.125, + "learning_rate": 9.992817807854575e-06, + "loss": 1.72761688, + "memory(GiB)": 67.97, + "step": 5220, + "train_speed(iter/s)": 1.625978 + }, + { + "acc": 0.6437819, + "epoch": 0.13254693049213598, + "grad_norm": 5.53125, + "learning_rate": 9.992761512823386e-06, + "loss": 1.70482292, + "memory(GiB)": 67.97, + "step": 5225, + "train_speed(iter/s)": 1.626153 + }, + { + "acc": 0.63523312, + "epoch": 0.13267376966007102, + "grad_norm": 6.4375, + "learning_rate": 9.992704998188255e-06, + "loss": 1.7106987, + "memory(GiB)": 67.97, + "step": 5230, + "train_speed(iter/s)": 1.626319 + }, + { + "acc": 0.61373291, + "epoch": 0.1328006088280061, + "grad_norm": 6.125, + "learning_rate": 9.992648263951668e-06, + "loss": 1.82784958, + "memory(GiB)": 67.97, + "step": 5235, + "train_speed(iter/s)": 1.626497 + }, + { + "acc": 0.64305134, + "epoch": 0.13292744799594114, + "grad_norm": 5.4375, + "learning_rate": 9.992591310116118e-06, + "loss": 1.73701458, + "memory(GiB)": 67.97, + "step": 5240, + "train_speed(iter/s)": 1.62667 + }, + { + "acc": 0.63666525, + "epoch": 0.1330542871638762, + "grad_norm": 5.84375, + "learning_rate": 9.992534136684112e-06, + "loss": 1.67466583, + "memory(GiB)": 67.97, + "step": 5245, + "train_speed(iter/s)": 1.626844 + }, + { + "acc": 0.63887062, + "epoch": 0.13318112633181126, + "grad_norm": 5.65625, + "learning_rate": 9.992476743658165e-06, + "loss": 1.70007858, + "memory(GiB)": 67.97, + "step": 5250, + "train_speed(iter/s)": 1.627002 + }, + { + "acc": 0.65181856, + "epoch": 0.13330796549974633, + "grad_norm": 5.0, + "learning_rate": 9.992419131040803e-06, + "loss": 1.62725792, + "memory(GiB)": 67.97, + "step": 5255, + "train_speed(iter/s)": 1.62717 + }, + { + "acc": 0.65718045, + "epoch": 0.13343480466768137, + "grad_norm": 5.96875, + "learning_rate": 9.992361298834555e-06, + "loss": 1.6195652, + "memory(GiB)": 67.97, + "step": 5260, + "train_speed(iter/s)": 1.627347 + }, + { + "acc": 0.63881211, + "epoch": 0.13356164383561644, + "grad_norm": 5.5, + "learning_rate": 9.99230324704197e-06, + "loss": 1.76107922, + "memory(GiB)": 67.97, + "step": 5265, + "train_speed(iter/s)": 1.627522 + }, + { + "acc": 0.6444293, + "epoch": 0.1336884830035515, + "grad_norm": 6.78125, + "learning_rate": 9.992244975665598e-06, + "loss": 1.69464779, + "memory(GiB)": 67.97, + "step": 5270, + "train_speed(iter/s)": 1.627692 + }, + { + "acc": 0.64365602, + "epoch": 0.13381532217148656, + "grad_norm": 4.96875, + "learning_rate": 9.992186484708003e-06, + "loss": 1.72151871, + "memory(GiB)": 67.97, + "step": 5275, + "train_speed(iter/s)": 1.62786 + }, + { + "acc": 0.64648089, + "epoch": 0.1339421613394216, + "grad_norm": 5.90625, + "learning_rate": 9.992127774171759e-06, + "loss": 1.65160332, + "memory(GiB)": 67.97, + "step": 5280, + "train_speed(iter/s)": 1.628024 + }, + { + "acc": 0.62512159, + "epoch": 0.13406900050735668, + "grad_norm": 5.875, + "learning_rate": 9.992068844059446e-06, + "loss": 1.83518524, + "memory(GiB)": 67.97, + "step": 5285, + "train_speed(iter/s)": 1.628187 + }, + { + "acc": 0.62819347, + "epoch": 0.13419583967529172, + "grad_norm": 7.21875, + "learning_rate": 9.992009694373658e-06, + "loss": 1.70785141, + "memory(GiB)": 67.97, + "step": 5290, + "train_speed(iter/s)": 1.628364 + }, + { + "acc": 0.64102268, + "epoch": 0.1343226788432268, + "grad_norm": 7.53125, + "learning_rate": 9.991950325116995e-06, + "loss": 1.72000961, + "memory(GiB)": 67.97, + "step": 5295, + "train_speed(iter/s)": 1.628533 + }, + { + "acc": 0.63425627, + "epoch": 0.13444951801116184, + "grad_norm": 5.3125, + "learning_rate": 9.99189073629207e-06, + "loss": 1.6980072, + "memory(GiB)": 67.97, + "step": 5300, + "train_speed(iter/s)": 1.628696 + }, + { + "acc": 0.62713289, + "epoch": 0.1345763571790969, + "grad_norm": 7.5625, + "learning_rate": 9.991830927901505e-06, + "loss": 1.81331329, + "memory(GiB)": 67.97, + "step": 5305, + "train_speed(iter/s)": 1.628864 + }, + { + "acc": 0.64455872, + "epoch": 0.13470319634703196, + "grad_norm": 5.75, + "learning_rate": 9.991770899947925e-06, + "loss": 1.64396591, + "memory(GiB)": 67.97, + "step": 5310, + "train_speed(iter/s)": 1.629023 + }, + { + "acc": 0.64155502, + "epoch": 0.13483003551496703, + "grad_norm": 7.5, + "learning_rate": 9.991710652433977e-06, + "loss": 1.71283951, + "memory(GiB)": 67.97, + "step": 5315, + "train_speed(iter/s)": 1.629201 + }, + { + "acc": 0.63127637, + "epoch": 0.13495687468290207, + "grad_norm": 5.75, + "learning_rate": 9.991650185362308e-06, + "loss": 1.72396545, + "memory(GiB)": 67.97, + "step": 5320, + "train_speed(iter/s)": 1.629372 + }, + { + "acc": 0.63823328, + "epoch": 0.13508371385083714, + "grad_norm": 6.0, + "learning_rate": 9.991589498735577e-06, + "loss": 1.67893753, + "memory(GiB)": 67.97, + "step": 5325, + "train_speed(iter/s)": 1.629541 + }, + { + "acc": 0.64746094, + "epoch": 0.1352105530187722, + "grad_norm": 7.375, + "learning_rate": 9.991528592556454e-06, + "loss": 1.65668163, + "memory(GiB)": 67.97, + "step": 5330, + "train_speed(iter/s)": 1.629711 + }, + { + "acc": 0.62300062, + "epoch": 0.13533739218670726, + "grad_norm": 6.71875, + "learning_rate": 9.991467466827618e-06, + "loss": 1.7772522, + "memory(GiB)": 67.97, + "step": 5335, + "train_speed(iter/s)": 1.629871 + }, + { + "acc": 0.64545665, + "epoch": 0.1354642313546423, + "grad_norm": 8.1875, + "learning_rate": 9.99140612155176e-06, + "loss": 1.73008785, + "memory(GiB)": 67.97, + "step": 5340, + "train_speed(iter/s)": 1.630035 + }, + { + "acc": 0.62435536, + "epoch": 0.13559107052257738, + "grad_norm": 6.03125, + "learning_rate": 9.991344556731572e-06, + "loss": 1.75409298, + "memory(GiB)": 67.97, + "step": 5345, + "train_speed(iter/s)": 1.630215 + }, + { + "acc": 0.65214739, + "epoch": 0.13571790969051242, + "grad_norm": 5.4375, + "learning_rate": 9.991282772369766e-06, + "loss": 1.66706543, + "memory(GiB)": 67.97, + "step": 5350, + "train_speed(iter/s)": 1.630395 + }, + { + "acc": 0.63869658, + "epoch": 0.1358447488584475, + "grad_norm": 6.5625, + "learning_rate": 9.99122076846906e-06, + "loss": 1.67649441, + "memory(GiB)": 67.97, + "step": 5355, + "train_speed(iter/s)": 1.630559 + }, + { + "acc": 0.64688721, + "epoch": 0.13597158802638254, + "grad_norm": 5.21875, + "learning_rate": 9.991158545032181e-06, + "loss": 1.69243584, + "memory(GiB)": 67.97, + "step": 5360, + "train_speed(iter/s)": 1.630734 + }, + { + "acc": 0.65743103, + "epoch": 0.1360984271943176, + "grad_norm": 5.5, + "learning_rate": 9.991096102061865e-06, + "loss": 1.63055229, + "memory(GiB)": 67.97, + "step": 5365, + "train_speed(iter/s)": 1.630901 + }, + { + "acc": 0.64526978, + "epoch": 0.13622526636225266, + "grad_norm": 6.84375, + "learning_rate": 9.991033439560858e-06, + "loss": 1.71617832, + "memory(GiB)": 67.97, + "step": 5370, + "train_speed(iter/s)": 1.631076 + }, + { + "acc": 0.63090262, + "epoch": 0.13635210553018773, + "grad_norm": 5.0625, + "learning_rate": 9.990970557531918e-06, + "loss": 1.78805809, + "memory(GiB)": 67.97, + "step": 5375, + "train_speed(iter/s)": 1.631237 + }, + { + "acc": 0.6560832, + "epoch": 0.13647894469812277, + "grad_norm": 6.0625, + "learning_rate": 9.990907455977809e-06, + "loss": 1.66885719, + "memory(GiB)": 67.97, + "step": 5380, + "train_speed(iter/s)": 1.631399 + }, + { + "acc": 0.63162436, + "epoch": 0.13660578386605784, + "grad_norm": 4.65625, + "learning_rate": 9.990844134901308e-06, + "loss": 1.75449448, + "memory(GiB)": 67.97, + "step": 5385, + "train_speed(iter/s)": 1.631556 + }, + { + "acc": 0.66275859, + "epoch": 0.1367326230339929, + "grad_norm": 5.3125, + "learning_rate": 9.9907805943052e-06, + "loss": 1.61374607, + "memory(GiB)": 67.97, + "step": 5390, + "train_speed(iter/s)": 1.631707 + }, + { + "acc": 0.63994455, + "epoch": 0.13685946220192796, + "grad_norm": 4.90625, + "learning_rate": 9.990716834192278e-06, + "loss": 1.64527721, + "memory(GiB)": 67.97, + "step": 5395, + "train_speed(iter/s)": 1.631855 + }, + { + "acc": 0.62512465, + "epoch": 0.136986301369863, + "grad_norm": 5.71875, + "learning_rate": 9.990652854565348e-06, + "loss": 1.7353159, + "memory(GiB)": 67.97, + "step": 5400, + "train_speed(iter/s)": 1.632022 + }, + { + "acc": 0.6311697, + "epoch": 0.13711314053779808, + "grad_norm": 6.21875, + "learning_rate": 9.990588655427225e-06, + "loss": 1.78660164, + "memory(GiB)": 67.97, + "step": 5405, + "train_speed(iter/s)": 1.632179 + }, + { + "acc": 0.64658265, + "epoch": 0.13723997970573312, + "grad_norm": 6.53125, + "learning_rate": 9.99052423678073e-06, + "loss": 1.69756222, + "memory(GiB)": 67.97, + "step": 5410, + "train_speed(iter/s)": 1.632339 + }, + { + "acc": 0.64642596, + "epoch": 0.1373668188736682, + "grad_norm": 5.75, + "learning_rate": 9.990459598628697e-06, + "loss": 1.65353832, + "memory(GiB)": 67.97, + "step": 5415, + "train_speed(iter/s)": 1.632504 + }, + { + "acc": 0.63903875, + "epoch": 0.13749365804160324, + "grad_norm": 4.75, + "learning_rate": 9.990394740973972e-06, + "loss": 1.63889236, + "memory(GiB)": 67.97, + "step": 5420, + "train_speed(iter/s)": 1.632666 + }, + { + "acc": 0.64777212, + "epoch": 0.1376204972095383, + "grad_norm": 5.84375, + "learning_rate": 9.990329663819405e-06, + "loss": 1.72402916, + "memory(GiB)": 67.97, + "step": 5425, + "train_speed(iter/s)": 1.632839 + }, + { + "acc": 0.64259725, + "epoch": 0.13774733637747336, + "grad_norm": 5.53125, + "learning_rate": 9.99026436716786e-06, + "loss": 1.6757103, + "memory(GiB)": 67.97, + "step": 5430, + "train_speed(iter/s)": 1.632997 + }, + { + "acc": 0.63455553, + "epoch": 0.13787417554540843, + "grad_norm": 4.6875, + "learning_rate": 9.990198851022207e-06, + "loss": 1.69476357, + "memory(GiB)": 67.97, + "step": 5435, + "train_speed(iter/s)": 1.63316 + }, + { + "acc": 0.64285312, + "epoch": 0.13800101471334347, + "grad_norm": 5.84375, + "learning_rate": 9.99013311538533e-06, + "loss": 1.69048004, + "memory(GiB)": 67.97, + "step": 5440, + "train_speed(iter/s)": 1.633312 + }, + { + "acc": 0.64904666, + "epoch": 0.13812785388127855, + "grad_norm": 6.53125, + "learning_rate": 9.99006716026012e-06, + "loss": 1.71266918, + "memory(GiB)": 67.97, + "step": 5445, + "train_speed(iter/s)": 1.633475 + }, + { + "acc": 0.64135752, + "epoch": 0.1382546930492136, + "grad_norm": 5.25, + "learning_rate": 9.990000985649475e-06, + "loss": 1.73248444, + "memory(GiB)": 67.97, + "step": 5450, + "train_speed(iter/s)": 1.633645 + }, + { + "acc": 0.63617897, + "epoch": 0.13838153221714866, + "grad_norm": 6.71875, + "learning_rate": 9.989934591556308e-06, + "loss": 1.69448586, + "memory(GiB)": 67.97, + "step": 5455, + "train_speed(iter/s)": 1.633809 + }, + { + "acc": 0.62789268, + "epoch": 0.1385083713850837, + "grad_norm": 6.65625, + "learning_rate": 9.98986797798354e-06, + "loss": 1.75231285, + "memory(GiB)": 67.97, + "step": 5460, + "train_speed(iter/s)": 1.633962 + }, + { + "acc": 0.64234023, + "epoch": 0.13863521055301878, + "grad_norm": 7.0, + "learning_rate": 9.989801144934102e-06, + "loss": 1.68317013, + "memory(GiB)": 67.97, + "step": 5465, + "train_speed(iter/s)": 1.634124 + }, + { + "acc": 0.64935846, + "epoch": 0.13876204972095382, + "grad_norm": 6.375, + "learning_rate": 9.98973409241093e-06, + "loss": 1.68969078, + "memory(GiB)": 67.97, + "step": 5470, + "train_speed(iter/s)": 1.634283 + }, + { + "acc": 0.63298841, + "epoch": 0.1388888888888889, + "grad_norm": 5.40625, + "learning_rate": 9.989666820416974e-06, + "loss": 1.71576881, + "memory(GiB)": 67.97, + "step": 5475, + "train_speed(iter/s)": 1.634457 + }, + { + "acc": 0.6435339, + "epoch": 0.13901572805682394, + "grad_norm": 7.8125, + "learning_rate": 9.989599328955195e-06, + "loss": 1.73315163, + "memory(GiB)": 67.97, + "step": 5480, + "train_speed(iter/s)": 1.634623 + }, + { + "acc": 0.64531841, + "epoch": 0.139142567224759, + "grad_norm": 4.75, + "learning_rate": 9.98953161802856e-06, + "loss": 1.68681412, + "memory(GiB)": 67.97, + "step": 5485, + "train_speed(iter/s)": 1.634772 + }, + { + "acc": 0.65644813, + "epoch": 0.13926940639269406, + "grad_norm": 6.59375, + "learning_rate": 9.98946368764005e-06, + "loss": 1.65063095, + "memory(GiB)": 67.97, + "step": 5490, + "train_speed(iter/s)": 1.634937 + }, + { + "acc": 0.66878214, + "epoch": 0.13939624556062913, + "grad_norm": 7.0625, + "learning_rate": 9.989395537792647e-06, + "loss": 1.66400394, + "memory(GiB)": 67.97, + "step": 5495, + "train_speed(iter/s)": 1.635093 + }, + { + "acc": 0.63075151, + "epoch": 0.13952308472856417, + "grad_norm": 6.1875, + "learning_rate": 9.989327168489356e-06, + "loss": 1.7121563, + "memory(GiB)": 67.97, + "step": 5500, + "train_speed(iter/s)": 1.635255 + }, + { + "acc": 0.64085975, + "epoch": 0.13964992389649925, + "grad_norm": 5.15625, + "learning_rate": 9.989258579733179e-06, + "loss": 1.71124191, + "memory(GiB)": 67.97, + "step": 5505, + "train_speed(iter/s)": 1.635401 + }, + { + "acc": 0.63293228, + "epoch": 0.1397767630644343, + "grad_norm": 5.78125, + "learning_rate": 9.989189771527133e-06, + "loss": 1.67303009, + "memory(GiB)": 67.97, + "step": 5510, + "train_speed(iter/s)": 1.63556 + }, + { + "acc": 0.63710937, + "epoch": 0.13990360223236936, + "grad_norm": 5.15625, + "learning_rate": 9.989120743874248e-06, + "loss": 1.73125992, + "memory(GiB)": 67.97, + "step": 5515, + "train_speed(iter/s)": 1.635716 + }, + { + "acc": 0.62786932, + "epoch": 0.1400304414003044, + "grad_norm": 8.625, + "learning_rate": 9.989051496777556e-06, + "loss": 1.7746994, + "memory(GiB)": 67.97, + "step": 5520, + "train_speed(iter/s)": 1.635886 + }, + { + "acc": 0.63814631, + "epoch": 0.14015728056823948, + "grad_norm": 5.5625, + "learning_rate": 9.988982030240104e-06, + "loss": 1.73342056, + "memory(GiB)": 67.97, + "step": 5525, + "train_speed(iter/s)": 1.636032 + }, + { + "acc": 0.64379349, + "epoch": 0.14028411973617452, + "grad_norm": 6.78125, + "learning_rate": 9.988912344264949e-06, + "loss": 1.77013035, + "memory(GiB)": 67.97, + "step": 5530, + "train_speed(iter/s)": 1.636192 + }, + { + "acc": 0.63793926, + "epoch": 0.1404109589041096, + "grad_norm": 4.875, + "learning_rate": 9.988842438855156e-06, + "loss": 1.63599529, + "memory(GiB)": 67.97, + "step": 5535, + "train_speed(iter/s)": 1.636346 + }, + { + "acc": 0.63129282, + "epoch": 0.14053779807204464, + "grad_norm": 6.4375, + "learning_rate": 9.988772314013799e-06, + "loss": 1.70913277, + "memory(GiB)": 67.97, + "step": 5540, + "train_speed(iter/s)": 1.636497 + }, + { + "acc": 0.63681321, + "epoch": 0.1406646372399797, + "grad_norm": 5.96875, + "learning_rate": 9.988701969743961e-06, + "loss": 1.70088158, + "memory(GiB)": 67.97, + "step": 5545, + "train_speed(iter/s)": 1.636668 + }, + { + "acc": 0.63065233, + "epoch": 0.14079147640791476, + "grad_norm": 4.46875, + "learning_rate": 9.98863140604874e-06, + "loss": 1.73746414, + "memory(GiB)": 67.97, + "step": 5550, + "train_speed(iter/s)": 1.636826 + }, + { + "acc": 0.64270663, + "epoch": 0.14091831557584983, + "grad_norm": 5.40625, + "learning_rate": 9.988560622931233e-06, + "loss": 1.6762867, + "memory(GiB)": 67.97, + "step": 5555, + "train_speed(iter/s)": 1.636971 + }, + { + "acc": 0.65530734, + "epoch": 0.14104515474378487, + "grad_norm": 5.375, + "learning_rate": 9.988489620394562e-06, + "loss": 1.63868313, + "memory(GiB)": 67.97, + "step": 5560, + "train_speed(iter/s)": 1.637134 + }, + { + "acc": 0.65375457, + "epoch": 0.14117199391171995, + "grad_norm": 5.4375, + "learning_rate": 9.988418398441842e-06, + "loss": 1.65380325, + "memory(GiB)": 67.97, + "step": 5565, + "train_speed(iter/s)": 1.637292 + }, + { + "acc": 0.6210989, + "epoch": 0.141298833079655, + "grad_norm": 6.1875, + "learning_rate": 9.98834695707621e-06, + "loss": 1.80401039, + "memory(GiB)": 67.97, + "step": 5570, + "train_speed(iter/s)": 1.637461 + }, + { + "acc": 0.62638574, + "epoch": 0.14142567224759006, + "grad_norm": 4.96875, + "learning_rate": 9.98827529630081e-06, + "loss": 1.72670822, + "memory(GiB)": 67.97, + "step": 5575, + "train_speed(iter/s)": 1.637621 + }, + { + "acc": 0.64285045, + "epoch": 0.1415525114155251, + "grad_norm": 5.40625, + "learning_rate": 9.988203416118788e-06, + "loss": 1.63737946, + "memory(GiB)": 67.97, + "step": 5580, + "train_speed(iter/s)": 1.637773 + }, + { + "acc": 0.65205193, + "epoch": 0.14167935058346018, + "grad_norm": 5.40625, + "learning_rate": 9.98813131653331e-06, + "loss": 1.6648632, + "memory(GiB)": 67.97, + "step": 5585, + "train_speed(iter/s)": 1.637925 + }, + { + "acc": 0.63377504, + "epoch": 0.14180618975139522, + "grad_norm": 5.65625, + "learning_rate": 9.988058997547548e-06, + "loss": 1.70777187, + "memory(GiB)": 67.97, + "step": 5590, + "train_speed(iter/s)": 1.63807 + }, + { + "acc": 0.63157835, + "epoch": 0.1419330289193303, + "grad_norm": 10.75, + "learning_rate": 9.987986459164678e-06, + "loss": 1.80104675, + "memory(GiB)": 67.97, + "step": 5595, + "train_speed(iter/s)": 1.638239 + }, + { + "acc": 0.6438448, + "epoch": 0.14205986808726534, + "grad_norm": 6.3125, + "learning_rate": 9.987913701387897e-06, + "loss": 1.69255219, + "memory(GiB)": 67.97, + "step": 5600, + "train_speed(iter/s)": 1.638393 + }, + { + "acc": 0.63091969, + "epoch": 0.1421867072552004, + "grad_norm": 5.15625, + "learning_rate": 9.9878407242204e-06, + "loss": 1.75262699, + "memory(GiB)": 67.97, + "step": 5605, + "train_speed(iter/s)": 1.63855 + }, + { + "acc": 0.63085594, + "epoch": 0.14231354642313546, + "grad_norm": 6.3125, + "learning_rate": 9.9877675276654e-06, + "loss": 1.6860054, + "memory(GiB)": 67.97, + "step": 5610, + "train_speed(iter/s)": 1.638696 + }, + { + "acc": 0.64327564, + "epoch": 0.14244038559107053, + "grad_norm": 5.09375, + "learning_rate": 9.987694111726114e-06, + "loss": 1.64752007, + "memory(GiB)": 67.97, + "step": 5615, + "train_speed(iter/s)": 1.638851 + }, + { + "acc": 0.64797969, + "epoch": 0.14256722475900557, + "grad_norm": 5.15625, + "learning_rate": 9.987620476405774e-06, + "loss": 1.66869946, + "memory(GiB)": 67.97, + "step": 5620, + "train_speed(iter/s)": 1.638997 + }, + { + "acc": 0.65582399, + "epoch": 0.14269406392694065, + "grad_norm": 5.40625, + "learning_rate": 9.987546621707616e-06, + "loss": 1.67556496, + "memory(GiB)": 67.97, + "step": 5625, + "train_speed(iter/s)": 1.639138 + }, + { + "acc": 0.63089805, + "epoch": 0.1428209030948757, + "grad_norm": 6.0, + "learning_rate": 9.98747254763489e-06, + "loss": 1.71389046, + "memory(GiB)": 67.97, + "step": 5630, + "train_speed(iter/s)": 1.639295 + }, + { + "acc": 0.62215295, + "epoch": 0.14294774226281076, + "grad_norm": 5.84375, + "learning_rate": 9.987398254190855e-06, + "loss": 1.75646858, + "memory(GiB)": 67.97, + "step": 5635, + "train_speed(iter/s)": 1.639445 + }, + { + "acc": 0.64388542, + "epoch": 0.1430745814307458, + "grad_norm": 5.75, + "learning_rate": 9.987323741378777e-06, + "loss": 1.72612591, + "memory(GiB)": 67.97, + "step": 5640, + "train_speed(iter/s)": 1.639604 + }, + { + "acc": 0.66521358, + "epoch": 0.14320142059868088, + "grad_norm": 6.125, + "learning_rate": 9.987249009201934e-06, + "loss": 1.62808895, + "memory(GiB)": 67.97, + "step": 5645, + "train_speed(iter/s)": 1.639755 + }, + { + "acc": 0.64707842, + "epoch": 0.14332825976661592, + "grad_norm": 7.59375, + "learning_rate": 9.987174057663613e-06, + "loss": 1.65816956, + "memory(GiB)": 67.97, + "step": 5650, + "train_speed(iter/s)": 1.639911 + }, + { + "acc": 0.63463721, + "epoch": 0.143455098934551, + "grad_norm": 6.46875, + "learning_rate": 9.987098886767111e-06, + "loss": 1.697579, + "memory(GiB)": 67.97, + "step": 5655, + "train_speed(iter/s)": 1.640061 + }, + { + "acc": 0.62340398, + "epoch": 0.14358193810248604, + "grad_norm": 5.84375, + "learning_rate": 9.987023496515734e-06, + "loss": 1.80572548, + "memory(GiB)": 67.97, + "step": 5660, + "train_speed(iter/s)": 1.640203 + }, + { + "acc": 0.64071021, + "epoch": 0.1437087772704211, + "grad_norm": 8.1875, + "learning_rate": 9.9869478869128e-06, + "loss": 1.71325188, + "memory(GiB)": 67.97, + "step": 5665, + "train_speed(iter/s)": 1.640358 + }, + { + "acc": 0.63145103, + "epoch": 0.14383561643835616, + "grad_norm": 5.625, + "learning_rate": 9.98687205796163e-06, + "loss": 1.76592846, + "memory(GiB)": 67.97, + "step": 5670, + "train_speed(iter/s)": 1.640488 + }, + { + "acc": 0.64574471, + "epoch": 0.14396245560629123, + "grad_norm": 6.25, + "learning_rate": 9.986796009665562e-06, + "loss": 1.64609814, + "memory(GiB)": 67.97, + "step": 5675, + "train_speed(iter/s)": 1.640646 + }, + { + "acc": 0.64121971, + "epoch": 0.14408929477422627, + "grad_norm": 4.78125, + "learning_rate": 9.986719742027944e-06, + "loss": 1.63788223, + "memory(GiB)": 67.97, + "step": 5680, + "train_speed(iter/s)": 1.640796 + }, + { + "acc": 0.6402359, + "epoch": 0.14421613394216135, + "grad_norm": 7.09375, + "learning_rate": 9.986643255052125e-06, + "loss": 1.67876205, + "memory(GiB)": 67.97, + "step": 5685, + "train_speed(iter/s)": 1.640942 + }, + { + "acc": 0.63779802, + "epoch": 0.1443429731100964, + "grad_norm": 5.0, + "learning_rate": 9.986566548741473e-06, + "loss": 1.69805088, + "memory(GiB)": 67.97, + "step": 5690, + "train_speed(iter/s)": 1.641099 + }, + { + "acc": 0.65231657, + "epoch": 0.14446981227803146, + "grad_norm": 5.5625, + "learning_rate": 9.98648962309936e-06, + "loss": 1.68096237, + "memory(GiB)": 67.97, + "step": 5695, + "train_speed(iter/s)": 1.64124 + }, + { + "acc": 0.64773283, + "epoch": 0.1445966514459665, + "grad_norm": 6.84375, + "learning_rate": 9.986412478129171e-06, + "loss": 1.61987782, + "memory(GiB)": 67.97, + "step": 5700, + "train_speed(iter/s)": 1.64139 + }, + { + "acc": 0.64352512, + "epoch": 0.14472349061390158, + "grad_norm": 8.4375, + "learning_rate": 9.9863351138343e-06, + "loss": 1.68878593, + "memory(GiB)": 67.97, + "step": 5705, + "train_speed(iter/s)": 1.641544 + }, + { + "acc": 0.64809437, + "epoch": 0.14485032978183662, + "grad_norm": 6.25, + "learning_rate": 9.986257530218146e-06, + "loss": 1.6550457, + "memory(GiB)": 67.97, + "step": 5710, + "train_speed(iter/s)": 1.641698 + }, + { + "acc": 0.6398253, + "epoch": 0.1449771689497717, + "grad_norm": 5.25, + "learning_rate": 9.986179727284124e-06, + "loss": 1.6624176, + "memory(GiB)": 67.97, + "step": 5715, + "train_speed(iter/s)": 1.641846 + }, + { + "acc": 0.62591176, + "epoch": 0.14510400811770674, + "grad_norm": 7.75, + "learning_rate": 9.986101705035656e-06, + "loss": 1.75093708, + "memory(GiB)": 67.97, + "step": 5720, + "train_speed(iter/s)": 1.641979 + }, + { + "acc": 0.62979212, + "epoch": 0.1452308472856418, + "grad_norm": 5.8125, + "learning_rate": 9.986023463476175e-06, + "loss": 1.76803741, + "memory(GiB)": 67.97, + "step": 5725, + "train_speed(iter/s)": 1.642124 + }, + { + "acc": 0.63339853, + "epoch": 0.14535768645357686, + "grad_norm": 6.3125, + "learning_rate": 9.985945002609119e-06, + "loss": 1.7575386, + "memory(GiB)": 67.97, + "step": 5730, + "train_speed(iter/s)": 1.642276 + }, + { + "acc": 0.63349552, + "epoch": 0.14548452562151193, + "grad_norm": 6.84375, + "learning_rate": 9.985866322437942e-06, + "loss": 1.77933197, + "memory(GiB)": 67.97, + "step": 5735, + "train_speed(iter/s)": 1.642426 + }, + { + "acc": 0.64729791, + "epoch": 0.14561136478944697, + "grad_norm": 7.1875, + "learning_rate": 9.985787422966105e-06, + "loss": 1.67173462, + "memory(GiB)": 67.97, + "step": 5740, + "train_speed(iter/s)": 1.642566 + }, + { + "acc": 0.64463506, + "epoch": 0.14573820395738205, + "grad_norm": 6.71875, + "learning_rate": 9.985708304197075e-06, + "loss": 1.78628616, + "memory(GiB)": 67.97, + "step": 5745, + "train_speed(iter/s)": 1.642714 + }, + { + "acc": 0.64048452, + "epoch": 0.1458650431253171, + "grad_norm": 5.34375, + "learning_rate": 9.985628966134336e-06, + "loss": 1.70549278, + "memory(GiB)": 67.97, + "step": 5750, + "train_speed(iter/s)": 1.642866 + }, + { + "acc": 0.63640237, + "epoch": 0.14599188229325216, + "grad_norm": 7.0625, + "learning_rate": 9.985549408781377e-06, + "loss": 1.69229431, + "memory(GiB)": 67.97, + "step": 5755, + "train_speed(iter/s)": 1.642996 + }, + { + "acc": 0.63343587, + "epoch": 0.1461187214611872, + "grad_norm": 5.28125, + "learning_rate": 9.985469632141693e-06, + "loss": 1.7327282, + "memory(GiB)": 67.97, + "step": 5760, + "train_speed(iter/s)": 1.643145 + }, + { + "acc": 0.635888, + "epoch": 0.14624556062912228, + "grad_norm": 5.15625, + "learning_rate": 9.985389636218797e-06, + "loss": 1.69949722, + "memory(GiB)": 67.97, + "step": 5765, + "train_speed(iter/s)": 1.64328 + }, + { + "acc": 0.6408618, + "epoch": 0.14637239979705732, + "grad_norm": 5.8125, + "learning_rate": 9.985309421016207e-06, + "loss": 1.70303917, + "memory(GiB)": 67.97, + "step": 5770, + "train_speed(iter/s)": 1.64342 + }, + { + "acc": 0.65503311, + "epoch": 0.1464992389649924, + "grad_norm": 5.6875, + "learning_rate": 9.985228986537451e-06, + "loss": 1.67711391, + "memory(GiB)": 67.97, + "step": 5775, + "train_speed(iter/s)": 1.643574 + }, + { + "acc": 0.64052763, + "epoch": 0.14662607813292744, + "grad_norm": 5.71875, + "learning_rate": 9.985148332786068e-06, + "loss": 1.6546669, + "memory(GiB)": 67.97, + "step": 5780, + "train_speed(iter/s)": 1.643721 + }, + { + "acc": 0.63725519, + "epoch": 0.1467529173008625, + "grad_norm": 6.0, + "learning_rate": 9.985067459765603e-06, + "loss": 1.72509747, + "memory(GiB)": 67.97, + "step": 5785, + "train_speed(iter/s)": 1.643875 + }, + { + "acc": 0.64202337, + "epoch": 0.14687975646879756, + "grad_norm": 5.75, + "learning_rate": 9.984986367479615e-06, + "loss": 1.73176651, + "memory(GiB)": 67.97, + "step": 5790, + "train_speed(iter/s)": 1.644012 + }, + { + "acc": 0.64810824, + "epoch": 0.14700659563673263, + "grad_norm": 5.8125, + "learning_rate": 9.984905055931668e-06, + "loss": 1.67020073, + "memory(GiB)": 67.97, + "step": 5795, + "train_speed(iter/s)": 1.644161 + }, + { + "acc": 0.61541805, + "epoch": 0.14713343480466767, + "grad_norm": 5.1875, + "learning_rate": 9.984823525125342e-06, + "loss": 1.7698822, + "memory(GiB)": 67.97, + "step": 5800, + "train_speed(iter/s)": 1.644316 + }, + { + "acc": 0.63629761, + "epoch": 0.14726027397260275, + "grad_norm": 5.25, + "learning_rate": 9.984741775064222e-06, + "loss": 1.72454681, + "memory(GiB)": 67.97, + "step": 5805, + "train_speed(iter/s)": 1.64446 + }, + { + "acc": 0.62756248, + "epoch": 0.1473871131405378, + "grad_norm": 5.03125, + "learning_rate": 9.984659805751904e-06, + "loss": 1.77317219, + "memory(GiB)": 67.97, + "step": 5810, + "train_speed(iter/s)": 1.644601 + }, + { + "acc": 0.65102062, + "epoch": 0.14751395230847286, + "grad_norm": 5.25, + "learning_rate": 9.984577617191993e-06, + "loss": 1.66598091, + "memory(GiB)": 67.97, + "step": 5815, + "train_speed(iter/s)": 1.644747 + }, + { + "acc": 0.63081174, + "epoch": 0.1476407914764079, + "grad_norm": 6.28125, + "learning_rate": 9.984495209388102e-06, + "loss": 1.74582272, + "memory(GiB)": 67.97, + "step": 5820, + "train_speed(iter/s)": 1.644897 + }, + { + "acc": 0.63116512, + "epoch": 0.14776763064434298, + "grad_norm": 6.34375, + "learning_rate": 9.984412582343859e-06, + "loss": 1.76524906, + "memory(GiB)": 67.97, + "step": 5825, + "train_speed(iter/s)": 1.645042 + }, + { + "acc": 0.64548478, + "epoch": 0.14789446981227802, + "grad_norm": 6.0, + "learning_rate": 9.984329736062896e-06, + "loss": 1.64662361, + "memory(GiB)": 67.97, + "step": 5830, + "train_speed(iter/s)": 1.645193 + }, + { + "acc": 0.62500458, + "epoch": 0.1480213089802131, + "grad_norm": 5.46875, + "learning_rate": 9.984246670548858e-06, + "loss": 1.72222424, + "memory(GiB)": 67.97, + "step": 5835, + "train_speed(iter/s)": 1.645333 + }, + { + "acc": 0.640452, + "epoch": 0.14814814814814814, + "grad_norm": 5.15625, + "learning_rate": 9.984163385805398e-06, + "loss": 1.74711456, + "memory(GiB)": 67.97, + "step": 5840, + "train_speed(iter/s)": 1.645473 + }, + { + "acc": 0.63930683, + "epoch": 0.1482749873160832, + "grad_norm": 7.28125, + "learning_rate": 9.984079881836182e-06, + "loss": 1.71177349, + "memory(GiB)": 67.97, + "step": 5845, + "train_speed(iter/s)": 1.645621 + }, + { + "acc": 0.63935652, + "epoch": 0.14840182648401826, + "grad_norm": 6.9375, + "learning_rate": 9.983996158644877e-06, + "loss": 1.6764328, + "memory(GiB)": 67.97, + "step": 5850, + "train_speed(iter/s)": 1.645755 + }, + { + "acc": 0.64308186, + "epoch": 0.14852866565195333, + "grad_norm": 4.5, + "learning_rate": 9.983912216235172e-06, + "loss": 1.60972023, + "memory(GiB)": 67.97, + "step": 5855, + "train_speed(iter/s)": 1.645904 + }, + { + "acc": 0.64627028, + "epoch": 0.14865550481988837, + "grad_norm": 8.6875, + "learning_rate": 9.983828054610754e-06, + "loss": 1.69884377, + "memory(GiB)": 67.97, + "step": 5860, + "train_speed(iter/s)": 1.646051 + }, + { + "acc": 0.64437056, + "epoch": 0.14878234398782345, + "grad_norm": 5.75, + "learning_rate": 9.983743673775328e-06, + "loss": 1.73473854, + "memory(GiB)": 67.97, + "step": 5865, + "train_speed(iter/s)": 1.646185 + }, + { + "acc": 0.64316578, + "epoch": 0.1489091831557585, + "grad_norm": 7.0, + "learning_rate": 9.983659073732604e-06, + "loss": 1.67449303, + "memory(GiB)": 67.97, + "step": 5870, + "train_speed(iter/s)": 1.646339 + }, + { + "acc": 0.64334426, + "epoch": 0.14903602232369356, + "grad_norm": 6.78125, + "learning_rate": 9.983574254486303e-06, + "loss": 1.70057468, + "memory(GiB)": 67.97, + "step": 5875, + "train_speed(iter/s)": 1.646484 + }, + { + "acc": 0.6575243, + "epoch": 0.1491628614916286, + "grad_norm": 4.8125, + "learning_rate": 9.983489216040158e-06, + "loss": 1.67150078, + "memory(GiB)": 67.97, + "step": 5880, + "train_speed(iter/s)": 1.646621 + }, + { + "acc": 0.64971218, + "epoch": 0.14928970065956368, + "grad_norm": 5.40625, + "learning_rate": 9.983403958397907e-06, + "loss": 1.69433556, + "memory(GiB)": 67.97, + "step": 5885, + "train_speed(iter/s)": 1.646766 + }, + { + "acc": 0.63386822, + "epoch": 0.14941653982749872, + "grad_norm": 5.65625, + "learning_rate": 9.9833184815633e-06, + "loss": 1.86196461, + "memory(GiB)": 67.97, + "step": 5890, + "train_speed(iter/s)": 1.646909 + }, + { + "acc": 0.63559723, + "epoch": 0.1495433789954338, + "grad_norm": 5.1875, + "learning_rate": 9.983232785540097e-06, + "loss": 1.79789734, + "memory(GiB)": 67.97, + "step": 5895, + "train_speed(iter/s)": 1.647054 + }, + { + "acc": 0.63170757, + "epoch": 0.14967021816336884, + "grad_norm": 5.9375, + "learning_rate": 9.983146870332068e-06, + "loss": 1.69956932, + "memory(GiB)": 67.97, + "step": 5900, + "train_speed(iter/s)": 1.647196 + }, + { + "acc": 0.63182459, + "epoch": 0.1497970573313039, + "grad_norm": 10.8125, + "learning_rate": 9.98306073594299e-06, + "loss": 1.73979073, + "memory(GiB)": 67.97, + "step": 5905, + "train_speed(iter/s)": 1.647341 + }, + { + "acc": 0.65083694, + "epoch": 0.14992389649923896, + "grad_norm": 6.84375, + "learning_rate": 9.982974382376656e-06, + "loss": 1.67259693, + "memory(GiB)": 67.97, + "step": 5910, + "train_speed(iter/s)": 1.647474 + }, + { + "acc": 0.65537863, + "epoch": 0.15005073566717403, + "grad_norm": 5.9375, + "learning_rate": 9.98288780963686e-06, + "loss": 1.65226727, + "memory(GiB)": 67.97, + "step": 5915, + "train_speed(iter/s)": 1.64761 + }, + { + "acc": 0.63600521, + "epoch": 0.15017757483510907, + "grad_norm": 4.6875, + "learning_rate": 9.98280101772741e-06, + "loss": 1.71045208, + "memory(GiB)": 67.97, + "step": 5920, + "train_speed(iter/s)": 1.647745 + }, + { + "acc": 0.63900399, + "epoch": 0.15030441400304415, + "grad_norm": 5.25, + "learning_rate": 9.982714006652126e-06, + "loss": 1.7458086, + "memory(GiB)": 67.97, + "step": 5925, + "train_speed(iter/s)": 1.647878 + }, + { + "acc": 0.63540921, + "epoch": 0.1504312531709792, + "grad_norm": 5.0, + "learning_rate": 9.982626776414834e-06, + "loss": 1.68440323, + "memory(GiB)": 67.97, + "step": 5930, + "train_speed(iter/s)": 1.648008 + }, + { + "acc": 0.62862368, + "epoch": 0.15055809233891426, + "grad_norm": 5.21875, + "learning_rate": 9.98253932701937e-06, + "loss": 1.70194435, + "memory(GiB)": 67.97, + "step": 5935, + "train_speed(iter/s)": 1.648144 + }, + { + "acc": 0.64620218, + "epoch": 0.1506849315068493, + "grad_norm": 7.0625, + "learning_rate": 9.98245165846958e-06, + "loss": 1.75366306, + "memory(GiB)": 67.97, + "step": 5940, + "train_speed(iter/s)": 1.648281 + }, + { + "acc": 0.64474354, + "epoch": 0.15081177067478438, + "grad_norm": 7.875, + "learning_rate": 9.982363770769323e-06, + "loss": 1.67810211, + "memory(GiB)": 67.97, + "step": 5945, + "train_speed(iter/s)": 1.648428 + }, + { + "acc": 0.64211788, + "epoch": 0.15093860984271942, + "grad_norm": 5.78125, + "learning_rate": 9.98227566392246e-06, + "loss": 1.68489761, + "memory(GiB)": 67.97, + "step": 5950, + "train_speed(iter/s)": 1.648557 + }, + { + "acc": 0.64047265, + "epoch": 0.1510654490106545, + "grad_norm": 7.25, + "learning_rate": 9.982187337932871e-06, + "loss": 1.67838707, + "memory(GiB)": 67.97, + "step": 5955, + "train_speed(iter/s)": 1.648693 + }, + { + "acc": 0.62222662, + "epoch": 0.15119228817858954, + "grad_norm": 4.96875, + "learning_rate": 9.98209879280444e-06, + "loss": 1.76731186, + "memory(GiB)": 67.97, + "step": 5960, + "train_speed(iter/s)": 1.648837 + }, + { + "acc": 0.63446589, + "epoch": 0.1513191273465246, + "grad_norm": 6.15625, + "learning_rate": 9.982010028541057e-06, + "loss": 1.73758335, + "memory(GiB)": 67.97, + "step": 5965, + "train_speed(iter/s)": 1.64898 + }, + { + "acc": 0.64323282, + "epoch": 0.15144596651445966, + "grad_norm": 6.46875, + "learning_rate": 9.981921045146633e-06, + "loss": 1.66124744, + "memory(GiB)": 67.97, + "step": 5970, + "train_speed(iter/s)": 1.649117 + }, + { + "acc": 0.64692993, + "epoch": 0.15157280568239473, + "grad_norm": 5.59375, + "learning_rate": 9.981831842625079e-06, + "loss": 1.64683552, + "memory(GiB)": 67.97, + "step": 5975, + "train_speed(iter/s)": 1.649254 + }, + { + "acc": 0.64014034, + "epoch": 0.15169964485032977, + "grad_norm": 5.59375, + "learning_rate": 9.981742420980316e-06, + "loss": 1.7176815, + "memory(GiB)": 67.97, + "step": 5980, + "train_speed(iter/s)": 1.649399 + }, + { + "acc": 0.64055929, + "epoch": 0.15182648401826485, + "grad_norm": 6.8125, + "learning_rate": 9.981652780216281e-06, + "loss": 1.70856915, + "memory(GiB)": 67.97, + "step": 5985, + "train_speed(iter/s)": 1.649535 + }, + { + "acc": 0.6607892, + "epoch": 0.1519533231861999, + "grad_norm": 8.0, + "learning_rate": 9.981562920336915e-06, + "loss": 1.61497307, + "memory(GiB)": 67.97, + "step": 5990, + "train_speed(iter/s)": 1.64968 + }, + { + "acc": 0.62692347, + "epoch": 0.15208016235413496, + "grad_norm": 5.96875, + "learning_rate": 9.98147284134617e-06, + "loss": 1.7615097, + "memory(GiB)": 67.97, + "step": 5995, + "train_speed(iter/s)": 1.649817 + }, + { + "acc": 0.63626213, + "epoch": 0.15220700152207, + "grad_norm": 6.4375, + "learning_rate": 9.981382543248011e-06, + "loss": 1.73356247, + "memory(GiB)": 67.97, + "step": 6000, + "train_speed(iter/s)": 1.649948 + }, + { + "epoch": 0.15220700152207, + "eval_acc": 0.631341715794068, + "eval_loss": 1.6586743593215942, + "eval_runtime": 58.2386, + "eval_samples_per_second": 109.378, + "eval_steps_per_second": 27.353, + "step": 6000 + }, + { + "acc": 0.64247751, + "epoch": 0.15233384069000508, + "grad_norm": 5.65625, + "learning_rate": 9.981292026046406e-06, + "loss": 1.67188072, + "memory(GiB)": 67.97, + "step": 6005, + "train_speed(iter/s)": 1.622245 + }, + { + "acc": 0.63753009, + "epoch": 0.15246067985794012, + "grad_norm": 6.46875, + "learning_rate": 9.981201289745337e-06, + "loss": 1.69720078, + "memory(GiB)": 67.97, + "step": 6010, + "train_speed(iter/s)": 1.622371 + }, + { + "acc": 0.63781967, + "epoch": 0.1525875190258752, + "grad_norm": 5.71875, + "learning_rate": 9.981110334348796e-06, + "loss": 1.73426647, + "memory(GiB)": 67.97, + "step": 6015, + "train_speed(iter/s)": 1.622529 + }, + { + "acc": 0.65269566, + "epoch": 0.15271435819381024, + "grad_norm": 6.84375, + "learning_rate": 9.981019159860782e-06, + "loss": 1.63263931, + "memory(GiB)": 67.97, + "step": 6020, + "train_speed(iter/s)": 1.62268 + }, + { + "acc": 0.64456654, + "epoch": 0.1528411973617453, + "grad_norm": 7.65625, + "learning_rate": 9.98092776628531e-06, + "loss": 1.69595184, + "memory(GiB)": 67.97, + "step": 6025, + "train_speed(iter/s)": 1.622826 + }, + { + "acc": 0.64955573, + "epoch": 0.15296803652968036, + "grad_norm": 4.96875, + "learning_rate": 9.980836153626396e-06, + "loss": 1.64122925, + "memory(GiB)": 67.97, + "step": 6030, + "train_speed(iter/s)": 1.622974 + }, + { + "acc": 0.65374017, + "epoch": 0.15309487569761543, + "grad_norm": 5.75, + "learning_rate": 9.980744321888068e-06, + "loss": 1.68965683, + "memory(GiB)": 67.97, + "step": 6035, + "train_speed(iter/s)": 1.623118 + }, + { + "acc": 0.63661962, + "epoch": 0.15322171486555047, + "grad_norm": 5.15625, + "learning_rate": 9.98065227107437e-06, + "loss": 1.78869629, + "memory(GiB)": 67.97, + "step": 6040, + "train_speed(iter/s)": 1.623279 + }, + { + "acc": 0.63980098, + "epoch": 0.15334855403348555, + "grad_norm": 5.15625, + "learning_rate": 9.980560001189346e-06, + "loss": 1.71096458, + "memory(GiB)": 67.97, + "step": 6045, + "train_speed(iter/s)": 1.623415 + }, + { + "acc": 0.62682972, + "epoch": 0.1534753932014206, + "grad_norm": 5.5, + "learning_rate": 9.980467512237058e-06, + "loss": 1.76352234, + "memory(GiB)": 67.97, + "step": 6050, + "train_speed(iter/s)": 1.623576 + }, + { + "acc": 0.63099384, + "epoch": 0.15360223236935566, + "grad_norm": 6.40625, + "learning_rate": 9.98037480422157e-06, + "loss": 1.69793587, + "memory(GiB)": 77.59, + "step": 6055, + "train_speed(iter/s)": 1.623726 + }, + { + "acc": 0.65371971, + "epoch": 0.1537290715372907, + "grad_norm": 6.34375, + "learning_rate": 9.980281877146964e-06, + "loss": 1.68482018, + "memory(GiB)": 77.59, + "step": 6060, + "train_speed(iter/s)": 1.623876 + }, + { + "acc": 0.64076004, + "epoch": 0.15385591070522578, + "grad_norm": 5.09375, + "learning_rate": 9.980188731017327e-06, + "loss": 1.7227747, + "memory(GiB)": 77.59, + "step": 6065, + "train_speed(iter/s)": 1.624032 + }, + { + "acc": 0.65239887, + "epoch": 0.15398274987316082, + "grad_norm": 5.96875, + "learning_rate": 9.980095365836753e-06, + "loss": 1.65667305, + "memory(GiB)": 77.59, + "step": 6070, + "train_speed(iter/s)": 1.624173 + }, + { + "acc": 0.64093332, + "epoch": 0.1541095890410959, + "grad_norm": 6.28125, + "learning_rate": 9.980001781609353e-06, + "loss": 1.7067337, + "memory(GiB)": 77.59, + "step": 6075, + "train_speed(iter/s)": 1.62433 + }, + { + "acc": 0.63491664, + "epoch": 0.15423642820903094, + "grad_norm": 5.9375, + "learning_rate": 9.979907978339236e-06, + "loss": 1.71492023, + "memory(GiB)": 77.59, + "step": 6080, + "train_speed(iter/s)": 1.624484 + }, + { + "acc": 0.6486021, + "epoch": 0.154363267376966, + "grad_norm": 6.375, + "learning_rate": 9.979813956030535e-06, + "loss": 1.65320702, + "memory(GiB)": 77.59, + "step": 6085, + "train_speed(iter/s)": 1.624629 + }, + { + "acc": 0.6361412, + "epoch": 0.15449010654490106, + "grad_norm": 6.34375, + "learning_rate": 9.979719714687384e-06, + "loss": 1.68614998, + "memory(GiB)": 77.59, + "step": 6090, + "train_speed(iter/s)": 1.624776 + }, + { + "acc": 0.63105526, + "epoch": 0.15461694571283613, + "grad_norm": 5.1875, + "learning_rate": 9.979625254313924e-06, + "loss": 1.78558273, + "memory(GiB)": 77.59, + "step": 6095, + "train_speed(iter/s)": 1.624916 + }, + { + "acc": 0.63914604, + "epoch": 0.15474378488077117, + "grad_norm": 7.21875, + "learning_rate": 9.979530574914316e-06, + "loss": 1.69465752, + "memory(GiB)": 77.59, + "step": 6100, + "train_speed(iter/s)": 1.625067 + }, + { + "acc": 0.64542117, + "epoch": 0.15487062404870625, + "grad_norm": 6.90625, + "learning_rate": 9.97943567649272e-06, + "loss": 1.65333576, + "memory(GiB)": 77.59, + "step": 6105, + "train_speed(iter/s)": 1.625207 + }, + { + "acc": 0.63427954, + "epoch": 0.1549974632166413, + "grad_norm": 5.9375, + "learning_rate": 9.979340559053311e-06, + "loss": 1.65659218, + "memory(GiB)": 77.59, + "step": 6110, + "train_speed(iter/s)": 1.625348 + }, + { + "acc": 0.63331575, + "epoch": 0.15512430238457636, + "grad_norm": 4.59375, + "learning_rate": 9.979245222600273e-06, + "loss": 1.70867023, + "memory(GiB)": 77.59, + "step": 6115, + "train_speed(iter/s)": 1.625493 + }, + { + "acc": 0.6256835, + "epoch": 0.1552511415525114, + "grad_norm": 6.09375, + "learning_rate": 9.979149667137801e-06, + "loss": 1.76142693, + "memory(GiB)": 77.59, + "step": 6120, + "train_speed(iter/s)": 1.625644 + }, + { + "acc": 0.62907953, + "epoch": 0.15537798072044648, + "grad_norm": 5.8125, + "learning_rate": 9.979053892670094e-06, + "loss": 1.73337746, + "memory(GiB)": 77.59, + "step": 6125, + "train_speed(iter/s)": 1.625785 + }, + { + "acc": 0.63550453, + "epoch": 0.15550481988838152, + "grad_norm": 5.1875, + "learning_rate": 9.978957899201369e-06, + "loss": 1.7557785, + "memory(GiB)": 77.59, + "step": 6130, + "train_speed(iter/s)": 1.625932 + }, + { + "acc": 0.64810534, + "epoch": 0.1556316590563166, + "grad_norm": 4.96875, + "learning_rate": 9.978861686735845e-06, + "loss": 1.69074402, + "memory(GiB)": 77.59, + "step": 6135, + "train_speed(iter/s)": 1.626081 + }, + { + "acc": 0.64021478, + "epoch": 0.15575849822425164, + "grad_norm": 7.65625, + "learning_rate": 9.978765255277756e-06, + "loss": 1.7314827, + "memory(GiB)": 77.59, + "step": 6140, + "train_speed(iter/s)": 1.626227 + }, + { + "acc": 0.64839134, + "epoch": 0.1558853373921867, + "grad_norm": 5.09375, + "learning_rate": 9.97866860483134e-06, + "loss": 1.70773048, + "memory(GiB)": 77.59, + "step": 6145, + "train_speed(iter/s)": 1.626378 + }, + { + "acc": 0.65473585, + "epoch": 0.15601217656012176, + "grad_norm": 5.5625, + "learning_rate": 9.978571735400853e-06, + "loss": 1.65567513, + "memory(GiB)": 77.59, + "step": 6150, + "train_speed(iter/s)": 1.626529 + }, + { + "acc": 0.63746767, + "epoch": 0.15613901572805683, + "grad_norm": 6.625, + "learning_rate": 9.978474646990552e-06, + "loss": 1.66863117, + "memory(GiB)": 77.59, + "step": 6155, + "train_speed(iter/s)": 1.626681 + }, + { + "acc": 0.64006233, + "epoch": 0.15626585489599187, + "grad_norm": 6.28125, + "learning_rate": 9.97837733960471e-06, + "loss": 1.73263168, + "memory(GiB)": 77.59, + "step": 6160, + "train_speed(iter/s)": 1.626811 + }, + { + "acc": 0.65888743, + "epoch": 0.15639269406392695, + "grad_norm": 6.53125, + "learning_rate": 9.978279813247605e-06, + "loss": 1.64967499, + "memory(GiB)": 77.59, + "step": 6165, + "train_speed(iter/s)": 1.626952 + }, + { + "acc": 0.64943757, + "epoch": 0.156519533231862, + "grad_norm": 5.03125, + "learning_rate": 9.978182067923528e-06, + "loss": 1.65488949, + "memory(GiB)": 77.59, + "step": 6170, + "train_speed(iter/s)": 1.627093 + }, + { + "acc": 0.63030982, + "epoch": 0.15664637239979706, + "grad_norm": 4.71875, + "learning_rate": 9.978084103636778e-06, + "loss": 1.79546776, + "memory(GiB)": 77.59, + "step": 6175, + "train_speed(iter/s)": 1.627229 + }, + { + "acc": 0.65145741, + "epoch": 0.1567732115677321, + "grad_norm": 6.21875, + "learning_rate": 9.977985920391661e-06, + "loss": 1.67122612, + "memory(GiB)": 77.59, + "step": 6180, + "train_speed(iter/s)": 1.627374 + }, + { + "acc": 0.64416523, + "epoch": 0.15690005073566718, + "grad_norm": 5.71875, + "learning_rate": 9.977887518192501e-06, + "loss": 1.74429035, + "memory(GiB)": 77.59, + "step": 6185, + "train_speed(iter/s)": 1.627524 + }, + { + "acc": 0.61545467, + "epoch": 0.15702688990360222, + "grad_norm": 4.65625, + "learning_rate": 9.977788897043622e-06, + "loss": 1.7866478, + "memory(GiB)": 77.59, + "step": 6190, + "train_speed(iter/s)": 1.627676 + }, + { + "acc": 0.64552336, + "epoch": 0.1571537290715373, + "grad_norm": 5.90625, + "learning_rate": 9.977690056949363e-06, + "loss": 1.68018341, + "memory(GiB)": 77.59, + "step": 6195, + "train_speed(iter/s)": 1.62782 + }, + { + "acc": 0.6357573, + "epoch": 0.15728056823947234, + "grad_norm": 7.15625, + "learning_rate": 9.977590997914072e-06, + "loss": 1.72546806, + "memory(GiB)": 77.59, + "step": 6200, + "train_speed(iter/s)": 1.627954 + }, + { + "acc": 0.63645563, + "epoch": 0.1574074074074074, + "grad_norm": 5.3125, + "learning_rate": 9.977491719942106e-06, + "loss": 1.67075195, + "memory(GiB)": 77.59, + "step": 6205, + "train_speed(iter/s)": 1.628094 + }, + { + "acc": 0.63589287, + "epoch": 0.15753424657534246, + "grad_norm": 6.125, + "learning_rate": 9.97739222303783e-06, + "loss": 1.71371651, + "memory(GiB)": 77.59, + "step": 6210, + "train_speed(iter/s)": 1.628236 + }, + { + "acc": 0.6513371, + "epoch": 0.15766108574327753, + "grad_norm": 6.8125, + "learning_rate": 9.977292507205623e-06, + "loss": 1.67914371, + "memory(GiB)": 77.59, + "step": 6215, + "train_speed(iter/s)": 1.628381 + }, + { + "acc": 0.61512079, + "epoch": 0.15778792491121257, + "grad_norm": 6.34375, + "learning_rate": 9.977192572449868e-06, + "loss": 1.77316208, + "memory(GiB)": 77.59, + "step": 6220, + "train_speed(iter/s)": 1.628521 + }, + { + "acc": 0.64252591, + "epoch": 0.15791476407914765, + "grad_norm": 5.9375, + "learning_rate": 9.977092418774962e-06, + "loss": 1.69516239, + "memory(GiB)": 77.59, + "step": 6225, + "train_speed(iter/s)": 1.628678 + }, + { + "acc": 0.62649403, + "epoch": 0.1580416032470827, + "grad_norm": 5.90625, + "learning_rate": 9.976992046185313e-06, + "loss": 1.76648808, + "memory(GiB)": 77.59, + "step": 6230, + "train_speed(iter/s)": 1.628821 + }, + { + "acc": 0.634657, + "epoch": 0.15816844241501776, + "grad_norm": 5.09375, + "learning_rate": 9.97689145468533e-06, + "loss": 1.72153091, + "memory(GiB)": 77.59, + "step": 6235, + "train_speed(iter/s)": 1.628963 + }, + { + "acc": 0.65538073, + "epoch": 0.1582952815829528, + "grad_norm": 5.71875, + "learning_rate": 9.976790644279442e-06, + "loss": 1.62780285, + "memory(GiB)": 77.59, + "step": 6240, + "train_speed(iter/s)": 1.629107 + }, + { + "acc": 0.64440994, + "epoch": 0.15842212075088788, + "grad_norm": 5.65625, + "learning_rate": 9.976689614972082e-06, + "loss": 1.63728638, + "memory(GiB)": 77.59, + "step": 6245, + "train_speed(iter/s)": 1.62925 + }, + { + "acc": 0.64431801, + "epoch": 0.15854895991882292, + "grad_norm": 7.5, + "learning_rate": 9.976588366767693e-06, + "loss": 1.71953621, + "memory(GiB)": 77.59, + "step": 6250, + "train_speed(iter/s)": 1.629398 + }, + { + "acc": 0.63995633, + "epoch": 0.158675799086758, + "grad_norm": 6.21875, + "learning_rate": 9.976486899670729e-06, + "loss": 1.6575222, + "memory(GiB)": 77.59, + "step": 6255, + "train_speed(iter/s)": 1.629546 + }, + { + "acc": 0.64340801, + "epoch": 0.15880263825469304, + "grad_norm": 5.75, + "learning_rate": 9.976385213685652e-06, + "loss": 1.7093338, + "memory(GiB)": 77.59, + "step": 6260, + "train_speed(iter/s)": 1.629692 + }, + { + "acc": 0.63619251, + "epoch": 0.1589294774226281, + "grad_norm": 5.84375, + "learning_rate": 9.976283308816937e-06, + "loss": 1.70851097, + "memory(GiB)": 77.59, + "step": 6265, + "train_speed(iter/s)": 1.629836 + }, + { + "acc": 0.65177736, + "epoch": 0.15905631659056316, + "grad_norm": 6.6875, + "learning_rate": 9.976181185069063e-06, + "loss": 1.62205124, + "memory(GiB)": 77.59, + "step": 6270, + "train_speed(iter/s)": 1.629983 + }, + { + "acc": 0.65391154, + "epoch": 0.15918315575849823, + "grad_norm": 4.28125, + "learning_rate": 9.976078842446522e-06, + "loss": 1.64793148, + "memory(GiB)": 77.59, + "step": 6275, + "train_speed(iter/s)": 1.63012 + }, + { + "acc": 0.6346415, + "epoch": 0.15930999492643327, + "grad_norm": 5.15625, + "learning_rate": 9.97597628095382e-06, + "loss": 1.8150074, + "memory(GiB)": 77.59, + "step": 6280, + "train_speed(iter/s)": 1.630261 + }, + { + "acc": 0.62557349, + "epoch": 0.15943683409436835, + "grad_norm": 6.21875, + "learning_rate": 9.975873500595464e-06, + "loss": 1.81581287, + "memory(GiB)": 77.59, + "step": 6285, + "train_speed(iter/s)": 1.630399 + }, + { + "acc": 0.64790401, + "epoch": 0.1595636732623034, + "grad_norm": 6.0, + "learning_rate": 9.975770501375974e-06, + "loss": 1.72666969, + "memory(GiB)": 77.59, + "step": 6290, + "train_speed(iter/s)": 1.630542 + }, + { + "acc": 0.64370136, + "epoch": 0.15969051243023846, + "grad_norm": 5.8125, + "learning_rate": 9.975667283299884e-06, + "loss": 1.68165207, + "memory(GiB)": 77.59, + "step": 6295, + "train_speed(iter/s)": 1.630673 + }, + { + "acc": 0.6499074, + "epoch": 0.1598173515981735, + "grad_norm": 6.65625, + "learning_rate": 9.975563846371732e-06, + "loss": 1.68750763, + "memory(GiB)": 77.59, + "step": 6300, + "train_speed(iter/s)": 1.630803 + }, + { + "acc": 0.64219446, + "epoch": 0.15994419076610858, + "grad_norm": 5.21875, + "learning_rate": 9.975460190596068e-06, + "loss": 1.69849129, + "memory(GiB)": 77.59, + "step": 6305, + "train_speed(iter/s)": 1.630933 + }, + { + "acc": 0.63774624, + "epoch": 0.16007102993404362, + "grad_norm": 7.09375, + "learning_rate": 9.975356315977451e-06, + "loss": 1.70805016, + "memory(GiB)": 77.59, + "step": 6310, + "train_speed(iter/s)": 1.631077 + }, + { + "acc": 0.64253635, + "epoch": 0.1601978691019787, + "grad_norm": 5.78125, + "learning_rate": 9.975252222520449e-06, + "loss": 1.66821289, + "memory(GiB)": 77.59, + "step": 6315, + "train_speed(iter/s)": 1.631216 + }, + { + "acc": 0.63843169, + "epoch": 0.16032470826991374, + "grad_norm": 4.9375, + "learning_rate": 9.97514791022964e-06, + "loss": 1.70587273, + "memory(GiB)": 77.59, + "step": 6320, + "train_speed(iter/s)": 1.63135 + }, + { + "acc": 0.64853153, + "epoch": 0.1604515474378488, + "grad_norm": 5.65625, + "learning_rate": 9.975043379109617e-06, + "loss": 1.68425255, + "memory(GiB)": 77.59, + "step": 6325, + "train_speed(iter/s)": 1.631478 + }, + { + "acc": 0.6516242, + "epoch": 0.16057838660578386, + "grad_norm": 4.96875, + "learning_rate": 9.974938629164973e-06, + "loss": 1.66455574, + "memory(GiB)": 77.59, + "step": 6330, + "train_speed(iter/s)": 1.631611 + }, + { + "acc": 0.64136925, + "epoch": 0.16070522577371893, + "grad_norm": 4.65625, + "learning_rate": 9.974833660400315e-06, + "loss": 1.72513733, + "memory(GiB)": 77.59, + "step": 6335, + "train_speed(iter/s)": 1.631749 + }, + { + "acc": 0.63354859, + "epoch": 0.16083206494165397, + "grad_norm": 6.4375, + "learning_rate": 9.974728472820264e-06, + "loss": 1.6803978, + "memory(GiB)": 77.59, + "step": 6340, + "train_speed(iter/s)": 1.631896 + }, + { + "acc": 0.63999481, + "epoch": 0.16095890410958905, + "grad_norm": 4.5625, + "learning_rate": 9.97462306642944e-06, + "loss": 1.70625572, + "memory(GiB)": 77.59, + "step": 6345, + "train_speed(iter/s)": 1.632022 + }, + { + "acc": 0.66038437, + "epoch": 0.1610857432775241, + "grad_norm": 5.09375, + "learning_rate": 9.974517441232487e-06, + "loss": 1.61866665, + "memory(GiB)": 77.59, + "step": 6350, + "train_speed(iter/s)": 1.632153 + }, + { + "acc": 0.63697634, + "epoch": 0.16121258244545916, + "grad_norm": 5.90625, + "learning_rate": 9.974411597234046e-06, + "loss": 1.70938187, + "memory(GiB)": 77.59, + "step": 6355, + "train_speed(iter/s)": 1.632282 + }, + { + "acc": 0.63928413, + "epoch": 0.1613394216133942, + "grad_norm": 6.71875, + "learning_rate": 9.974305534438774e-06, + "loss": 1.649403, + "memory(GiB)": 77.59, + "step": 6360, + "train_speed(iter/s)": 1.632414 + }, + { + "acc": 0.65040388, + "epoch": 0.16146626078132928, + "grad_norm": 5.4375, + "learning_rate": 9.974199252851338e-06, + "loss": 1.68915596, + "memory(GiB)": 77.59, + "step": 6365, + "train_speed(iter/s)": 1.632562 + }, + { + "acc": 0.65095468, + "epoch": 0.16159309994926432, + "grad_norm": 6.0625, + "learning_rate": 9.974092752476408e-06, + "loss": 1.65609608, + "memory(GiB)": 77.59, + "step": 6370, + "train_speed(iter/s)": 1.632692 + }, + { + "acc": 0.65658998, + "epoch": 0.1617199391171994, + "grad_norm": 5.46875, + "learning_rate": 9.973986033318673e-06, + "loss": 1.65106831, + "memory(GiB)": 77.59, + "step": 6375, + "train_speed(iter/s)": 1.632822 + }, + { + "acc": 0.66161289, + "epoch": 0.16184677828513444, + "grad_norm": 5.75, + "learning_rate": 9.973879095382824e-06, + "loss": 1.59875546, + "memory(GiB)": 77.59, + "step": 6380, + "train_speed(iter/s)": 1.632944 + }, + { + "acc": 0.65416441, + "epoch": 0.1619736174530695, + "grad_norm": 6.375, + "learning_rate": 9.973771938673564e-06, + "loss": 1.71432629, + "memory(GiB)": 77.59, + "step": 6385, + "train_speed(iter/s)": 1.633085 + }, + { + "acc": 0.64049058, + "epoch": 0.16210045662100456, + "grad_norm": 6.125, + "learning_rate": 9.973664563195609e-06, + "loss": 1.75938148, + "memory(GiB)": 77.59, + "step": 6390, + "train_speed(iter/s)": 1.633218 + }, + { + "acc": 0.64086409, + "epoch": 0.16222729578893963, + "grad_norm": 6.125, + "learning_rate": 9.973556968953682e-06, + "loss": 1.69508114, + "memory(GiB)": 77.59, + "step": 6395, + "train_speed(iter/s)": 1.633346 + }, + { + "acc": 0.63516617, + "epoch": 0.16235413495687467, + "grad_norm": 5.65625, + "learning_rate": 9.973449155952512e-06, + "loss": 1.6902504, + "memory(GiB)": 77.59, + "step": 6400, + "train_speed(iter/s)": 1.633477 + }, + { + "acc": 0.64499741, + "epoch": 0.16248097412480975, + "grad_norm": 5.53125, + "learning_rate": 9.973341124196847e-06, + "loss": 1.64758034, + "memory(GiB)": 77.59, + "step": 6405, + "train_speed(iter/s)": 1.633611 + }, + { + "acc": 0.65194874, + "epoch": 0.1626078132927448, + "grad_norm": 5.0625, + "learning_rate": 9.973232873691431e-06, + "loss": 1.62782841, + "memory(GiB)": 77.59, + "step": 6410, + "train_speed(iter/s)": 1.633744 + }, + { + "acc": 0.63939381, + "epoch": 0.16273465246067986, + "grad_norm": 4.90625, + "learning_rate": 9.973124404441031e-06, + "loss": 1.67741985, + "memory(GiB)": 77.59, + "step": 6415, + "train_speed(iter/s)": 1.633866 + }, + { + "acc": 0.63335905, + "epoch": 0.1628614916286149, + "grad_norm": 5.59375, + "learning_rate": 9.973015716450416e-06, + "loss": 1.79659672, + "memory(GiB)": 77.59, + "step": 6420, + "train_speed(iter/s)": 1.633989 + }, + { + "acc": 0.65132799, + "epoch": 0.16298833079654998, + "grad_norm": 5.84375, + "learning_rate": 9.972906809724367e-06, + "loss": 1.63987732, + "memory(GiB)": 77.59, + "step": 6425, + "train_speed(iter/s)": 1.634111 + }, + { + "acc": 0.65189471, + "epoch": 0.16311516996448502, + "grad_norm": 7.59375, + "learning_rate": 9.972797684267674e-06, + "loss": 1.60783386, + "memory(GiB)": 77.59, + "step": 6430, + "train_speed(iter/s)": 1.634248 + }, + { + "acc": 0.65726333, + "epoch": 0.1632420091324201, + "grad_norm": 5.25, + "learning_rate": 9.972688340085137e-06, + "loss": 1.61662788, + "memory(GiB)": 77.59, + "step": 6435, + "train_speed(iter/s)": 1.634369 + }, + { + "acc": 0.64713655, + "epoch": 0.16336884830035514, + "grad_norm": 7.15625, + "learning_rate": 9.972578777181565e-06, + "loss": 1.6686718, + "memory(GiB)": 77.59, + "step": 6440, + "train_speed(iter/s)": 1.63449 + }, + { + "acc": 0.6449069, + "epoch": 0.1634956874682902, + "grad_norm": 5.84375, + "learning_rate": 9.972468995561778e-06, + "loss": 1.65861053, + "memory(GiB)": 77.59, + "step": 6445, + "train_speed(iter/s)": 1.634632 + }, + { + "acc": 0.64491663, + "epoch": 0.16362252663622526, + "grad_norm": 5.5625, + "learning_rate": 9.972358995230604e-06, + "loss": 1.60342979, + "memory(GiB)": 77.59, + "step": 6450, + "train_speed(iter/s)": 1.634765 + }, + { + "acc": 0.65160785, + "epoch": 0.16374936580416033, + "grad_norm": 6.59375, + "learning_rate": 9.97224877619288e-06, + "loss": 1.68284607, + "memory(GiB)": 77.59, + "step": 6455, + "train_speed(iter/s)": 1.634898 + }, + { + "acc": 0.63881249, + "epoch": 0.16387620497209537, + "grad_norm": 5.5625, + "learning_rate": 9.972138338453457e-06, + "loss": 1.73002262, + "memory(GiB)": 77.59, + "step": 6460, + "train_speed(iter/s)": 1.635021 + }, + { + "acc": 0.6422462, + "epoch": 0.16400304414003045, + "grad_norm": 5.46875, + "learning_rate": 9.972027682017191e-06, + "loss": 1.68600445, + "memory(GiB)": 77.59, + "step": 6465, + "train_speed(iter/s)": 1.635156 + }, + { + "acc": 0.63023891, + "epoch": 0.1641298833079655, + "grad_norm": 4.6875, + "learning_rate": 9.971916806888948e-06, + "loss": 1.76903152, + "memory(GiB)": 77.59, + "step": 6470, + "train_speed(iter/s)": 1.635292 + }, + { + "acc": 0.63143888, + "epoch": 0.16425672247590056, + "grad_norm": 7.5, + "learning_rate": 9.971805713073606e-06, + "loss": 1.75191917, + "memory(GiB)": 77.59, + "step": 6475, + "train_speed(iter/s)": 1.635419 + }, + { + "acc": 0.63922138, + "epoch": 0.1643835616438356, + "grad_norm": 4.78125, + "learning_rate": 9.971694400576053e-06, + "loss": 1.64910049, + "memory(GiB)": 77.59, + "step": 6480, + "train_speed(iter/s)": 1.635545 + }, + { + "acc": 0.64952583, + "epoch": 0.16451040081177068, + "grad_norm": 4.78125, + "learning_rate": 9.971582869401182e-06, + "loss": 1.68407669, + "memory(GiB)": 77.59, + "step": 6485, + "train_speed(iter/s)": 1.635665 + }, + { + "acc": 0.64700904, + "epoch": 0.16463723997970572, + "grad_norm": 5.03125, + "learning_rate": 9.9714711195539e-06, + "loss": 1.67678223, + "memory(GiB)": 77.59, + "step": 6490, + "train_speed(iter/s)": 1.635802 + }, + { + "acc": 0.63749566, + "epoch": 0.1647640791476408, + "grad_norm": 7.375, + "learning_rate": 9.97135915103912e-06, + "loss": 1.68261909, + "memory(GiB)": 77.59, + "step": 6495, + "train_speed(iter/s)": 1.635945 + }, + { + "acc": 0.6402482, + "epoch": 0.16489091831557584, + "grad_norm": 5.6875, + "learning_rate": 9.971246963861772e-06, + "loss": 1.6840168, + "memory(GiB)": 77.59, + "step": 6500, + "train_speed(iter/s)": 1.636065 + }, + { + "acc": 0.63845644, + "epoch": 0.1650177574835109, + "grad_norm": 4.875, + "learning_rate": 9.971134558026786e-06, + "loss": 1.67110195, + "memory(GiB)": 77.59, + "step": 6505, + "train_speed(iter/s)": 1.636196 + }, + { + "acc": 0.64929929, + "epoch": 0.16514459665144596, + "grad_norm": 5.625, + "learning_rate": 9.971021933539108e-06, + "loss": 1.64482193, + "memory(GiB)": 77.59, + "step": 6510, + "train_speed(iter/s)": 1.636321 + }, + { + "acc": 0.65398116, + "epoch": 0.16527143581938103, + "grad_norm": 6.21875, + "learning_rate": 9.97090909040369e-06, + "loss": 1.64348888, + "memory(GiB)": 77.59, + "step": 6515, + "train_speed(iter/s)": 1.636455 + }, + { + "acc": 0.64077563, + "epoch": 0.16539827498731607, + "grad_norm": 5.09375, + "learning_rate": 9.970796028625499e-06, + "loss": 1.71602669, + "memory(GiB)": 77.59, + "step": 6520, + "train_speed(iter/s)": 1.636582 + }, + { + "acc": 0.64576349, + "epoch": 0.16552511415525115, + "grad_norm": 6.4375, + "learning_rate": 9.970682748209505e-06, + "loss": 1.71717949, + "memory(GiB)": 77.59, + "step": 6525, + "train_speed(iter/s)": 1.636705 + }, + { + "acc": 0.64918218, + "epoch": 0.1656519533231862, + "grad_norm": 4.65625, + "learning_rate": 9.97056924916069e-06, + "loss": 1.67604523, + "memory(GiB)": 77.59, + "step": 6530, + "train_speed(iter/s)": 1.636822 + }, + { + "acc": 0.66022949, + "epoch": 0.16577879249112126, + "grad_norm": 5.5, + "learning_rate": 9.970455531484049e-06, + "loss": 1.6372364, + "memory(GiB)": 77.59, + "step": 6535, + "train_speed(iter/s)": 1.63695 + }, + { + "acc": 0.64934483, + "epoch": 0.1659056316590563, + "grad_norm": 5.71875, + "learning_rate": 9.97034159518458e-06, + "loss": 1.66848183, + "memory(GiB)": 77.59, + "step": 6540, + "train_speed(iter/s)": 1.637073 + }, + { + "acc": 0.64757705, + "epoch": 0.16603247082699138, + "grad_norm": 6.875, + "learning_rate": 9.9702274402673e-06, + "loss": 1.68137646, + "memory(GiB)": 77.59, + "step": 6545, + "train_speed(iter/s)": 1.637207 + }, + { + "acc": 0.65756893, + "epoch": 0.16615930999492642, + "grad_norm": 5.40625, + "learning_rate": 9.970113066737223e-06, + "loss": 1.67345085, + "memory(GiB)": 77.59, + "step": 6550, + "train_speed(iter/s)": 1.637336 + }, + { + "acc": 0.64228902, + "epoch": 0.1662861491628615, + "grad_norm": 7.03125, + "learning_rate": 9.969998474599386e-06, + "loss": 1.63843899, + "memory(GiB)": 77.59, + "step": 6555, + "train_speed(iter/s)": 1.637455 + }, + { + "acc": 0.65614533, + "epoch": 0.16641298833079654, + "grad_norm": 6.90625, + "learning_rate": 9.969883663858826e-06, + "loss": 1.61360035, + "memory(GiB)": 77.59, + "step": 6560, + "train_speed(iter/s)": 1.637579 + }, + { + "acc": 0.65491066, + "epoch": 0.1665398274987316, + "grad_norm": 6.21875, + "learning_rate": 9.969768634520593e-06, + "loss": 1.66851044, + "memory(GiB)": 77.59, + "step": 6565, + "train_speed(iter/s)": 1.637713 + }, + { + "acc": 0.66400509, + "epoch": 0.16666666666666666, + "grad_norm": 4.78125, + "learning_rate": 9.969653386589749e-06, + "loss": 1.58823271, + "memory(GiB)": 77.59, + "step": 6570, + "train_speed(iter/s)": 1.637838 + }, + { + "acc": 0.65091476, + "epoch": 0.16679350583460173, + "grad_norm": 5.4375, + "learning_rate": 9.96953792007136e-06, + "loss": 1.68033352, + "memory(GiB)": 77.59, + "step": 6575, + "train_speed(iter/s)": 1.637971 + }, + { + "acc": 0.63005195, + "epoch": 0.16692034500253677, + "grad_norm": 4.59375, + "learning_rate": 9.969422234970506e-06, + "loss": 1.72654209, + "memory(GiB)": 77.59, + "step": 6580, + "train_speed(iter/s)": 1.638099 + }, + { + "acc": 0.6641572, + "epoch": 0.16704718417047185, + "grad_norm": 5.34375, + "learning_rate": 9.969306331292273e-06, + "loss": 1.61240158, + "memory(GiB)": 77.59, + "step": 6585, + "train_speed(iter/s)": 1.638216 + }, + { + "acc": 0.6374177, + "epoch": 0.1671740233384069, + "grad_norm": 5.78125, + "learning_rate": 9.969190209041764e-06, + "loss": 1.75764637, + "memory(GiB)": 77.59, + "step": 6590, + "train_speed(iter/s)": 1.638344 + }, + { + "acc": 0.63630428, + "epoch": 0.16730086250634196, + "grad_norm": 6.75, + "learning_rate": 9.969073868224082e-06, + "loss": 1.70809555, + "memory(GiB)": 77.59, + "step": 6595, + "train_speed(iter/s)": 1.638476 + }, + { + "acc": 0.63657174, + "epoch": 0.167427701674277, + "grad_norm": 5.125, + "learning_rate": 9.968957308844346e-06, + "loss": 1.77841892, + "memory(GiB)": 77.59, + "step": 6600, + "train_speed(iter/s)": 1.638607 + }, + { + "acc": 0.64168973, + "epoch": 0.16755454084221208, + "grad_norm": 6.625, + "learning_rate": 9.968840530907684e-06, + "loss": 1.71487465, + "memory(GiB)": 77.59, + "step": 6605, + "train_speed(iter/s)": 1.638724 + }, + { + "acc": 0.62594371, + "epoch": 0.16768138001014712, + "grad_norm": 7.625, + "learning_rate": 9.96872353441923e-06, + "loss": 1.73303661, + "memory(GiB)": 77.59, + "step": 6610, + "train_speed(iter/s)": 1.638842 + }, + { + "acc": 0.6411756, + "epoch": 0.1678082191780822, + "grad_norm": 6.75, + "learning_rate": 9.968606319384131e-06, + "loss": 1.73566246, + "memory(GiB)": 77.59, + "step": 6615, + "train_speed(iter/s)": 1.63896 + }, + { + "acc": 0.64131818, + "epoch": 0.16793505834601724, + "grad_norm": 7.3125, + "learning_rate": 9.968488885807544e-06, + "loss": 1.77030144, + "memory(GiB)": 77.59, + "step": 6620, + "train_speed(iter/s)": 1.639089 + }, + { + "acc": 0.64899211, + "epoch": 0.1680618975139523, + "grad_norm": 4.96875, + "learning_rate": 9.968371233694633e-06, + "loss": 1.67177773, + "memory(GiB)": 77.59, + "step": 6625, + "train_speed(iter/s)": 1.639219 + }, + { + "acc": 0.64630604, + "epoch": 0.16818873668188736, + "grad_norm": 5.75, + "learning_rate": 9.968253363050573e-06, + "loss": 1.69663029, + "memory(GiB)": 77.59, + "step": 6630, + "train_speed(iter/s)": 1.63935 + }, + { + "acc": 0.64754329, + "epoch": 0.16831557584982243, + "grad_norm": 4.625, + "learning_rate": 9.968135273880547e-06, + "loss": 1.64831924, + "memory(GiB)": 77.59, + "step": 6635, + "train_speed(iter/s)": 1.639468 + }, + { + "acc": 0.64454737, + "epoch": 0.16844241501775747, + "grad_norm": 6.75, + "learning_rate": 9.968016966189753e-06, + "loss": 1.67571716, + "memory(GiB)": 77.59, + "step": 6640, + "train_speed(iter/s)": 1.639597 + }, + { + "acc": 0.64654708, + "epoch": 0.16856925418569255, + "grad_norm": 6.3125, + "learning_rate": 9.96789843998339e-06, + "loss": 1.59341631, + "memory(GiB)": 77.59, + "step": 6645, + "train_speed(iter/s)": 1.639715 + }, + { + "acc": 0.64665756, + "epoch": 0.1686960933536276, + "grad_norm": 7.21875, + "learning_rate": 9.967779695266675e-06, + "loss": 1.66016655, + "memory(GiB)": 77.59, + "step": 6650, + "train_speed(iter/s)": 1.639837 + }, + { + "acc": 0.64671593, + "epoch": 0.16882293252156266, + "grad_norm": 5.96875, + "learning_rate": 9.967660732044828e-06, + "loss": 1.68118477, + "memory(GiB)": 77.59, + "step": 6655, + "train_speed(iter/s)": 1.639958 + }, + { + "acc": 0.645193, + "epoch": 0.1689497716894977, + "grad_norm": 5.0, + "learning_rate": 9.967541550323085e-06, + "loss": 1.69771557, + "memory(GiB)": 77.59, + "step": 6660, + "train_speed(iter/s)": 1.640085 + }, + { + "acc": 0.65040121, + "epoch": 0.16907661085743278, + "grad_norm": 6.21875, + "learning_rate": 9.967422150106685e-06, + "loss": 1.69082069, + "memory(GiB)": 77.59, + "step": 6665, + "train_speed(iter/s)": 1.640218 + }, + { + "acc": 0.64032044, + "epoch": 0.16920345002536782, + "grad_norm": 5.0625, + "learning_rate": 9.96730253140088e-06, + "loss": 1.67483864, + "memory(GiB)": 77.59, + "step": 6670, + "train_speed(iter/s)": 1.640341 + }, + { + "acc": 0.64160852, + "epoch": 0.1693302891933029, + "grad_norm": 5.96875, + "learning_rate": 9.967182694210933e-06, + "loss": 1.71246223, + "memory(GiB)": 77.59, + "step": 6675, + "train_speed(iter/s)": 1.640468 + }, + { + "acc": 0.6497797, + "epoch": 0.16945712836123794, + "grad_norm": 4.71875, + "learning_rate": 9.967062638542116e-06, + "loss": 1.67274075, + "memory(GiB)": 77.59, + "step": 6680, + "train_speed(iter/s)": 1.640595 + }, + { + "acc": 0.64639378, + "epoch": 0.169583967529173, + "grad_norm": 5.625, + "learning_rate": 9.966942364399706e-06, + "loss": 1.69283409, + "memory(GiB)": 77.59, + "step": 6685, + "train_speed(iter/s)": 1.640716 + }, + { + "acc": 0.63445115, + "epoch": 0.16971080669710806, + "grad_norm": 5.75, + "learning_rate": 9.966821871788995e-06, + "loss": 1.72606163, + "memory(GiB)": 77.59, + "step": 6690, + "train_speed(iter/s)": 1.640843 + }, + { + "acc": 0.64796643, + "epoch": 0.16983764586504313, + "grad_norm": 6.0625, + "learning_rate": 9.966701160715283e-06, + "loss": 1.67875748, + "memory(GiB)": 77.59, + "step": 6695, + "train_speed(iter/s)": 1.640951 + }, + { + "acc": 0.65945635, + "epoch": 0.16996448503297817, + "grad_norm": 6.28125, + "learning_rate": 9.96658023118388e-06, + "loss": 1.64298096, + "memory(GiB)": 77.59, + "step": 6700, + "train_speed(iter/s)": 1.641083 + }, + { + "acc": 0.64791965, + "epoch": 0.17009132420091325, + "grad_norm": 6.09375, + "learning_rate": 9.966459083200102e-06, + "loss": 1.70205956, + "memory(GiB)": 77.59, + "step": 6705, + "train_speed(iter/s)": 1.641207 + }, + { + "acc": 0.63008356, + "epoch": 0.1702181633688483, + "grad_norm": 5.03125, + "learning_rate": 9.966337716769283e-06, + "loss": 1.74058895, + "memory(GiB)": 77.59, + "step": 6710, + "train_speed(iter/s)": 1.641333 + }, + { + "acc": 0.63434973, + "epoch": 0.17034500253678336, + "grad_norm": 6.0, + "learning_rate": 9.966216131896755e-06, + "loss": 1.73385658, + "memory(GiB)": 77.59, + "step": 6715, + "train_speed(iter/s)": 1.641439 + }, + { + "acc": 0.63197145, + "epoch": 0.1704718417047184, + "grad_norm": 6.5625, + "learning_rate": 9.966094328587871e-06, + "loss": 1.70180206, + "memory(GiB)": 77.59, + "step": 6720, + "train_speed(iter/s)": 1.641552 + }, + { + "acc": 0.63856096, + "epoch": 0.17059868087265348, + "grad_norm": 5.375, + "learning_rate": 9.965972306847986e-06, + "loss": 1.74872761, + "memory(GiB)": 77.59, + "step": 6725, + "train_speed(iter/s)": 1.641663 + }, + { + "acc": 0.66174555, + "epoch": 0.17072552004058852, + "grad_norm": 5.71875, + "learning_rate": 9.965850066682468e-06, + "loss": 1.57597809, + "memory(GiB)": 77.59, + "step": 6730, + "train_speed(iter/s)": 1.641786 + }, + { + "acc": 0.63363771, + "epoch": 0.1708523592085236, + "grad_norm": 5.28125, + "learning_rate": 9.965727608096692e-06, + "loss": 1.67796688, + "memory(GiB)": 77.59, + "step": 6735, + "train_speed(iter/s)": 1.641917 + }, + { + "acc": 0.65201859, + "epoch": 0.17097919837645864, + "grad_norm": 4.5625, + "learning_rate": 9.965604931096045e-06, + "loss": 1.72040901, + "memory(GiB)": 77.59, + "step": 6740, + "train_speed(iter/s)": 1.642042 + }, + { + "acc": 0.63816962, + "epoch": 0.1711060375443937, + "grad_norm": 6.6875, + "learning_rate": 9.965482035685925e-06, + "loss": 1.7437851, + "memory(GiB)": 77.59, + "step": 6745, + "train_speed(iter/s)": 1.642158 + }, + { + "acc": 0.65284352, + "epoch": 0.17123287671232876, + "grad_norm": 5.5625, + "learning_rate": 9.965358921871735e-06, + "loss": 1.64097538, + "memory(GiB)": 77.59, + "step": 6750, + "train_speed(iter/s)": 1.642289 + }, + { + "acc": 0.65702648, + "epoch": 0.17135971588026383, + "grad_norm": 5.21875, + "learning_rate": 9.965235589658891e-06, + "loss": 1.61880646, + "memory(GiB)": 77.59, + "step": 6755, + "train_speed(iter/s)": 1.64242 + }, + { + "acc": 0.64878421, + "epoch": 0.17148655504819887, + "grad_norm": 4.15625, + "learning_rate": 9.965112039052817e-06, + "loss": 1.67541847, + "memory(GiB)": 77.59, + "step": 6760, + "train_speed(iter/s)": 1.64254 + }, + { + "acc": 0.63958874, + "epoch": 0.17161339421613395, + "grad_norm": 5.875, + "learning_rate": 9.964988270058948e-06, + "loss": 1.72893887, + "memory(GiB)": 77.59, + "step": 6765, + "train_speed(iter/s)": 1.642656 + }, + { + "acc": 0.63764372, + "epoch": 0.171740233384069, + "grad_norm": 5.34375, + "learning_rate": 9.96486428268273e-06, + "loss": 1.65488224, + "memory(GiB)": 77.59, + "step": 6770, + "train_speed(iter/s)": 1.642774 + }, + { + "acc": 0.64337626, + "epoch": 0.17186707255200406, + "grad_norm": 6.375, + "learning_rate": 9.964740076929612e-06, + "loss": 1.66642056, + "memory(GiB)": 77.59, + "step": 6775, + "train_speed(iter/s)": 1.642889 + }, + { + "acc": 0.63530145, + "epoch": 0.1719939117199391, + "grad_norm": 6.1875, + "learning_rate": 9.964615652805059e-06, + "loss": 1.72618942, + "memory(GiB)": 77.59, + "step": 6780, + "train_speed(iter/s)": 1.642997 + }, + { + "acc": 0.65174122, + "epoch": 0.17212075088787418, + "grad_norm": 6.0625, + "learning_rate": 9.964491010314545e-06, + "loss": 1.65879211, + "memory(GiB)": 77.59, + "step": 6785, + "train_speed(iter/s)": 1.643111 + }, + { + "acc": 0.64636569, + "epoch": 0.17224759005580922, + "grad_norm": 5.40625, + "learning_rate": 9.964366149463552e-06, + "loss": 1.69336262, + "memory(GiB)": 77.59, + "step": 6790, + "train_speed(iter/s)": 1.643226 + }, + { + "acc": 0.66044865, + "epoch": 0.1723744292237443, + "grad_norm": 5.03125, + "learning_rate": 9.96424107025757e-06, + "loss": 1.61055202, + "memory(GiB)": 77.59, + "step": 6795, + "train_speed(iter/s)": 1.643337 + }, + { + "acc": 0.6528614, + "epoch": 0.17250126839167934, + "grad_norm": 4.84375, + "learning_rate": 9.964115772702104e-06, + "loss": 1.61430187, + "memory(GiB)": 77.59, + "step": 6800, + "train_speed(iter/s)": 1.643454 + }, + { + "acc": 0.64503965, + "epoch": 0.1726281075596144, + "grad_norm": 5.4375, + "learning_rate": 9.963990256802662e-06, + "loss": 1.72916031, + "memory(GiB)": 77.59, + "step": 6805, + "train_speed(iter/s)": 1.64357 + }, + { + "acc": 0.6588366, + "epoch": 0.17275494672754946, + "grad_norm": 7.34375, + "learning_rate": 9.963864522564765e-06, + "loss": 1.61271057, + "memory(GiB)": 77.59, + "step": 6810, + "train_speed(iter/s)": 1.643692 + }, + { + "acc": 0.6429822, + "epoch": 0.17288178589548453, + "grad_norm": 5.09375, + "learning_rate": 9.963738569993945e-06, + "loss": 1.71824512, + "memory(GiB)": 77.59, + "step": 6815, + "train_speed(iter/s)": 1.643811 + }, + { + "acc": 0.65590649, + "epoch": 0.17300862506341957, + "grad_norm": 7.03125, + "learning_rate": 9.963612399095743e-06, + "loss": 1.67555981, + "memory(GiB)": 77.59, + "step": 6820, + "train_speed(iter/s)": 1.643927 + }, + { + "acc": 0.629282, + "epoch": 0.17313546423135465, + "grad_norm": 6.125, + "learning_rate": 9.963486009875705e-06, + "loss": 1.75797329, + "memory(GiB)": 77.59, + "step": 6825, + "train_speed(iter/s)": 1.644038 + }, + { + "acc": 0.64625211, + "epoch": 0.1732623033992897, + "grad_norm": 5.65625, + "learning_rate": 9.963359402339393e-06, + "loss": 1.68408127, + "memory(GiB)": 77.59, + "step": 6830, + "train_speed(iter/s)": 1.644154 + }, + { + "acc": 0.64075089, + "epoch": 0.17338914256722476, + "grad_norm": 7.375, + "learning_rate": 9.963232576492373e-06, + "loss": 1.75103569, + "memory(GiB)": 77.59, + "step": 6835, + "train_speed(iter/s)": 1.644274 + }, + { + "acc": 0.63705473, + "epoch": 0.1735159817351598, + "grad_norm": 5.25, + "learning_rate": 9.963105532340226e-06, + "loss": 1.75616817, + "memory(GiB)": 77.59, + "step": 6840, + "train_speed(iter/s)": 1.644384 + }, + { + "acc": 0.634588, + "epoch": 0.17364282090309488, + "grad_norm": 6.40625, + "learning_rate": 9.962978269888538e-06, + "loss": 1.7275013, + "memory(GiB)": 77.59, + "step": 6845, + "train_speed(iter/s)": 1.644502 + }, + { + "acc": 0.63084083, + "epoch": 0.17376966007102992, + "grad_norm": 4.71875, + "learning_rate": 9.96285078914291e-06, + "loss": 1.67146854, + "memory(GiB)": 77.59, + "step": 6850, + "train_speed(iter/s)": 1.644614 + }, + { + "acc": 0.63827963, + "epoch": 0.173896499238965, + "grad_norm": 5.4375, + "learning_rate": 9.962723090108944e-06, + "loss": 1.70191002, + "memory(GiB)": 77.59, + "step": 6855, + "train_speed(iter/s)": 1.644737 + }, + { + "acc": 0.64026279, + "epoch": 0.17402333840690004, + "grad_norm": 5.625, + "learning_rate": 9.962595172792261e-06, + "loss": 1.70468407, + "memory(GiB)": 77.59, + "step": 6860, + "train_speed(iter/s)": 1.644858 + }, + { + "acc": 0.64106016, + "epoch": 0.1741501775748351, + "grad_norm": 5.0, + "learning_rate": 9.962467037198487e-06, + "loss": 1.63270378, + "memory(GiB)": 77.59, + "step": 6865, + "train_speed(iter/s)": 1.644964 + }, + { + "acc": 0.64408913, + "epoch": 0.17427701674277016, + "grad_norm": 5.03125, + "learning_rate": 9.962338683333254e-06, + "loss": 1.7004734, + "memory(GiB)": 77.59, + "step": 6870, + "train_speed(iter/s)": 1.645078 + }, + { + "acc": 0.63785305, + "epoch": 0.17440385591070523, + "grad_norm": 5.0, + "learning_rate": 9.962210111202212e-06, + "loss": 1.77585373, + "memory(GiB)": 77.59, + "step": 6875, + "train_speed(iter/s)": 1.645195 + }, + { + "acc": 0.64036989, + "epoch": 0.17453069507864027, + "grad_norm": 6.90625, + "learning_rate": 9.962081320811015e-06, + "loss": 1.68967934, + "memory(GiB)": 77.59, + "step": 6880, + "train_speed(iter/s)": 1.645308 + }, + { + "acc": 0.65590191, + "epoch": 0.17465753424657535, + "grad_norm": 5.84375, + "learning_rate": 9.961952312165327e-06, + "loss": 1.67022934, + "memory(GiB)": 77.59, + "step": 6885, + "train_speed(iter/s)": 1.645427 + }, + { + "acc": 0.6510498, + "epoch": 0.1747843734145104, + "grad_norm": 4.875, + "learning_rate": 9.961823085270823e-06, + "loss": 1.65727654, + "memory(GiB)": 77.59, + "step": 6890, + "train_speed(iter/s)": 1.645527 + }, + { + "acc": 0.65662556, + "epoch": 0.17491121258244546, + "grad_norm": 5.09375, + "learning_rate": 9.961693640133187e-06, + "loss": 1.5743947, + "memory(GiB)": 77.59, + "step": 6895, + "train_speed(iter/s)": 1.645636 + }, + { + "acc": 0.64352493, + "epoch": 0.1750380517503805, + "grad_norm": 5.40625, + "learning_rate": 9.961563976758112e-06, + "loss": 1.67347393, + "memory(GiB)": 77.59, + "step": 6900, + "train_speed(iter/s)": 1.645748 + }, + { + "acc": 0.63922806, + "epoch": 0.17516489091831558, + "grad_norm": 6.0, + "learning_rate": 9.961434095151301e-06, + "loss": 1.71597672, + "memory(GiB)": 77.59, + "step": 6905, + "train_speed(iter/s)": 1.645871 + }, + { + "acc": 0.64139996, + "epoch": 0.17529173008625062, + "grad_norm": 4.375, + "learning_rate": 9.961303995318467e-06, + "loss": 1.63297272, + "memory(GiB)": 77.59, + "step": 6910, + "train_speed(iter/s)": 1.645984 + }, + { + "acc": 0.65589032, + "epoch": 0.1754185692541857, + "grad_norm": 18.0, + "learning_rate": 9.961173677265334e-06, + "loss": 1.6429985, + "memory(GiB)": 77.59, + "step": 6915, + "train_speed(iter/s)": 1.646098 + }, + { + "acc": 0.6408843, + "epoch": 0.17554540842212074, + "grad_norm": 4.78125, + "learning_rate": 9.961043140997632e-06, + "loss": 1.65616589, + "memory(GiB)": 77.59, + "step": 6920, + "train_speed(iter/s)": 1.646209 + }, + { + "acc": 0.63362069, + "epoch": 0.1756722475900558, + "grad_norm": 6.125, + "learning_rate": 9.960912386521104e-06, + "loss": 1.6962431, + "memory(GiB)": 77.59, + "step": 6925, + "train_speed(iter/s)": 1.646316 + }, + { + "acc": 0.64199467, + "epoch": 0.17579908675799086, + "grad_norm": 5.75, + "learning_rate": 9.9607814138415e-06, + "loss": 1.69304123, + "memory(GiB)": 77.59, + "step": 6930, + "train_speed(iter/s)": 1.646432 + }, + { + "acc": 0.64293036, + "epoch": 0.17592592592592593, + "grad_norm": 7.8125, + "learning_rate": 9.96065022296458e-06, + "loss": 1.70573483, + "memory(GiB)": 77.59, + "step": 6935, + "train_speed(iter/s)": 1.646544 + }, + { + "acc": 0.65458007, + "epoch": 0.17605276509386097, + "grad_norm": 6.1875, + "learning_rate": 9.960518813896117e-06, + "loss": 1.62631474, + "memory(GiB)": 77.59, + "step": 6940, + "train_speed(iter/s)": 1.64369 + }, + { + "acc": 0.64092197, + "epoch": 0.17617960426179605, + "grad_norm": 5.9375, + "learning_rate": 9.960387186641887e-06, + "loss": 1.6703289, + "memory(GiB)": 77.59, + "step": 6945, + "train_speed(iter/s)": 1.643804 + }, + { + "acc": 0.64519463, + "epoch": 0.1763064434297311, + "grad_norm": 6.125, + "learning_rate": 9.960255341207686e-06, + "loss": 1.63410873, + "memory(GiB)": 77.59, + "step": 6950, + "train_speed(iter/s)": 1.641823 + }, + { + "acc": 0.66517429, + "epoch": 0.17643328259766616, + "grad_norm": 5.1875, + "learning_rate": 9.960123277599305e-06, + "loss": 1.63524628, + "memory(GiB)": 77.59, + "step": 6955, + "train_speed(iter/s)": 1.641933 + }, + { + "acc": 0.63006377, + "epoch": 0.1765601217656012, + "grad_norm": 6.40625, + "learning_rate": 9.959990995822559e-06, + "loss": 1.70026054, + "memory(GiB)": 77.59, + "step": 6960, + "train_speed(iter/s)": 1.642052 + }, + { + "acc": 0.65416689, + "epoch": 0.17668696093353628, + "grad_norm": 4.59375, + "learning_rate": 9.959858495883263e-06, + "loss": 1.64462242, + "memory(GiB)": 77.59, + "step": 6965, + "train_speed(iter/s)": 1.642174 + }, + { + "acc": 0.64853497, + "epoch": 0.17681380010147132, + "grad_norm": 4.75, + "learning_rate": 9.959725777787249e-06, + "loss": 1.61428261, + "memory(GiB)": 77.59, + "step": 6970, + "train_speed(iter/s)": 1.64229 + }, + { + "acc": 0.64460812, + "epoch": 0.1769406392694064, + "grad_norm": 6.09375, + "learning_rate": 9.959592841540349e-06, + "loss": 1.68385105, + "memory(GiB)": 77.59, + "step": 6975, + "train_speed(iter/s)": 1.642392 + }, + { + "acc": 0.63454218, + "epoch": 0.17706747843734144, + "grad_norm": 5.5625, + "learning_rate": 9.959459687148414e-06, + "loss": 1.66434441, + "memory(GiB)": 77.59, + "step": 6980, + "train_speed(iter/s)": 1.642514 + }, + { + "acc": 0.62219305, + "epoch": 0.1771943176052765, + "grad_norm": 6.40625, + "learning_rate": 9.959326314617299e-06, + "loss": 1.72570801, + "memory(GiB)": 77.59, + "step": 6985, + "train_speed(iter/s)": 1.642618 + }, + { + "acc": 0.63553267, + "epoch": 0.17732115677321156, + "grad_norm": 6.0, + "learning_rate": 9.95919272395287e-06, + "loss": 1.71557579, + "memory(GiB)": 77.59, + "step": 6990, + "train_speed(iter/s)": 1.64272 + }, + { + "acc": 0.63350801, + "epoch": 0.17744799594114663, + "grad_norm": 6.1875, + "learning_rate": 9.959058915161006e-06, + "loss": 1.79244995, + "memory(GiB)": 77.59, + "step": 6995, + "train_speed(iter/s)": 1.642833 + }, + { + "acc": 0.62339063, + "epoch": 0.17757483510908167, + "grad_norm": 5.3125, + "learning_rate": 9.95892488824759e-06, + "loss": 1.75188446, + "memory(GiB)": 77.59, + "step": 7000, + "train_speed(iter/s)": 1.64294 + }, + { + "epoch": 0.17757483510908167, + "eval_acc": 0.633713196215154, + "eval_loss": 1.6434897184371948, + "eval_runtime": 58.7221, + "eval_samples_per_second": 108.477, + "eval_steps_per_second": 27.128, + "step": 7000 + }, + { + "acc": 0.66383352, + "epoch": 0.17770167427701675, + "grad_norm": 5.96875, + "learning_rate": 9.958790643218515e-06, + "loss": 1.55810013, + "memory(GiB)": 77.59, + "step": 7005, + "train_speed(iter/s)": 1.616317 + }, + { + "acc": 0.64305129, + "epoch": 0.1778285134449518, + "grad_norm": 4.4375, + "learning_rate": 9.95865618007969e-06, + "loss": 1.63794308, + "memory(GiB)": 77.59, + "step": 7010, + "train_speed(iter/s)": 1.616434 + }, + { + "acc": 0.64964619, + "epoch": 0.17795535261288686, + "grad_norm": 5.1875, + "learning_rate": 9.958521498837029e-06, + "loss": 1.67577801, + "memory(GiB)": 77.59, + "step": 7015, + "train_speed(iter/s)": 1.616566 + }, + { + "acc": 0.64845467, + "epoch": 0.1780821917808219, + "grad_norm": 6.875, + "learning_rate": 9.95838659949645e-06, + "loss": 1.7144001, + "memory(GiB)": 77.59, + "step": 7020, + "train_speed(iter/s)": 1.61669 + }, + { + "acc": 0.66691551, + "epoch": 0.17820903094875698, + "grad_norm": 5.9375, + "learning_rate": 9.958251482063894e-06, + "loss": 1.64887276, + "memory(GiB)": 77.59, + "step": 7025, + "train_speed(iter/s)": 1.616815 + }, + { + "acc": 0.63838854, + "epoch": 0.17833587011669202, + "grad_norm": 5.75, + "learning_rate": 9.9581161465453e-06, + "loss": 1.69448376, + "memory(GiB)": 77.59, + "step": 7030, + "train_speed(iter/s)": 1.616931 + }, + { + "acc": 0.63211827, + "epoch": 0.1784627092846271, + "grad_norm": 6.09375, + "learning_rate": 9.957980592946621e-06, + "loss": 1.69647293, + "memory(GiB)": 77.59, + "step": 7035, + "train_speed(iter/s)": 1.617065 + }, + { + "acc": 0.66187387, + "epoch": 0.17858954845256214, + "grad_norm": 5.53125, + "learning_rate": 9.957844821273822e-06, + "loss": 1.61571198, + "memory(GiB)": 77.59, + "step": 7040, + "train_speed(iter/s)": 1.617194 + }, + { + "acc": 0.65497189, + "epoch": 0.1787163876204972, + "grad_norm": 5.1875, + "learning_rate": 9.95770883153287e-06, + "loss": 1.58073425, + "memory(GiB)": 77.59, + "step": 7045, + "train_speed(iter/s)": 1.617318 + }, + { + "acc": 0.64070177, + "epoch": 0.17884322678843226, + "grad_norm": 5.6875, + "learning_rate": 9.957572623729749e-06, + "loss": 1.7325016, + "memory(GiB)": 77.59, + "step": 7050, + "train_speed(iter/s)": 1.617443 + }, + { + "acc": 0.63557272, + "epoch": 0.17897006595636733, + "grad_norm": 6.71875, + "learning_rate": 9.957436197870451e-06, + "loss": 1.69866238, + "memory(GiB)": 77.59, + "step": 7055, + "train_speed(iter/s)": 1.617574 + }, + { + "acc": 0.64438353, + "epoch": 0.17909690512430237, + "grad_norm": 7.09375, + "learning_rate": 9.957299553960975e-06, + "loss": 1.73529491, + "memory(GiB)": 77.59, + "step": 7060, + "train_speed(iter/s)": 1.617703 + }, + { + "acc": 0.64680119, + "epoch": 0.17922374429223745, + "grad_norm": 6.125, + "learning_rate": 9.957162692007334e-06, + "loss": 1.62102757, + "memory(GiB)": 77.59, + "step": 7065, + "train_speed(iter/s)": 1.617841 + }, + { + "acc": 0.65200863, + "epoch": 0.1793505834601725, + "grad_norm": 6.25, + "learning_rate": 9.957025612015543e-06, + "loss": 1.68627396, + "memory(GiB)": 77.59, + "step": 7070, + "train_speed(iter/s)": 1.61797 + }, + { + "acc": 0.65332608, + "epoch": 0.17947742262810756, + "grad_norm": 5.0, + "learning_rate": 9.956888313991636e-06, + "loss": 1.61336212, + "memory(GiB)": 77.59, + "step": 7075, + "train_speed(iter/s)": 1.618104 + }, + { + "acc": 0.64589319, + "epoch": 0.1796042617960426, + "grad_norm": 5.96875, + "learning_rate": 9.956750797941648e-06, + "loss": 1.69664879, + "memory(GiB)": 77.59, + "step": 7080, + "train_speed(iter/s)": 1.618232 + }, + { + "acc": 0.63732195, + "epoch": 0.17973110096397768, + "grad_norm": 5.0625, + "learning_rate": 9.95661306387163e-06, + "loss": 1.73729515, + "memory(GiB)": 77.59, + "step": 7085, + "train_speed(iter/s)": 1.618362 + }, + { + "acc": 0.64629126, + "epoch": 0.17985794013191272, + "grad_norm": 5.03125, + "learning_rate": 9.95647511178764e-06, + "loss": 1.5979641, + "memory(GiB)": 77.59, + "step": 7090, + "train_speed(iter/s)": 1.618487 + }, + { + "acc": 0.64954901, + "epoch": 0.1799847792998478, + "grad_norm": 6.34375, + "learning_rate": 9.956336941695747e-06, + "loss": 1.70026016, + "memory(GiB)": 77.59, + "step": 7095, + "train_speed(iter/s)": 1.618612 + }, + { + "acc": 0.64863548, + "epoch": 0.18011161846778284, + "grad_norm": 6.3125, + "learning_rate": 9.956198553602026e-06, + "loss": 1.64024582, + "memory(GiB)": 77.59, + "step": 7100, + "train_speed(iter/s)": 1.618741 + }, + { + "acc": 0.61645384, + "epoch": 0.1802384576357179, + "grad_norm": 5.1875, + "learning_rate": 9.956059947512563e-06, + "loss": 1.78149529, + "memory(GiB)": 77.59, + "step": 7105, + "train_speed(iter/s)": 1.618868 + }, + { + "acc": 0.63665099, + "epoch": 0.18036529680365296, + "grad_norm": 5.84375, + "learning_rate": 9.95592112343346e-06, + "loss": 1.67411232, + "memory(GiB)": 77.59, + "step": 7110, + "train_speed(iter/s)": 1.618993 + }, + { + "acc": 0.64703655, + "epoch": 0.18049213597158803, + "grad_norm": 5.9375, + "learning_rate": 9.955782081370818e-06, + "loss": 1.62678795, + "memory(GiB)": 77.59, + "step": 7115, + "train_speed(iter/s)": 1.619117 + }, + { + "acc": 0.65716386, + "epoch": 0.18061897513952307, + "grad_norm": 5.0625, + "learning_rate": 9.955642821330752e-06, + "loss": 1.67519836, + "memory(GiB)": 77.59, + "step": 7120, + "train_speed(iter/s)": 1.61924 + }, + { + "acc": 0.66499462, + "epoch": 0.18074581430745815, + "grad_norm": 5.53125, + "learning_rate": 9.95550334331939e-06, + "loss": 1.59537916, + "memory(GiB)": 77.59, + "step": 7125, + "train_speed(iter/s)": 1.619351 + }, + { + "acc": 0.65082536, + "epoch": 0.1808726534753932, + "grad_norm": 5.28125, + "learning_rate": 9.955363647342868e-06, + "loss": 1.60954227, + "memory(GiB)": 77.59, + "step": 7130, + "train_speed(iter/s)": 1.619464 + }, + { + "acc": 0.63223467, + "epoch": 0.18099949264332826, + "grad_norm": 6.5, + "learning_rate": 9.955223733407327e-06, + "loss": 1.71288204, + "memory(GiB)": 77.59, + "step": 7135, + "train_speed(iter/s)": 1.619588 + }, + { + "acc": 0.64555407, + "epoch": 0.1811263318112633, + "grad_norm": 5.5, + "learning_rate": 9.955083601518924e-06, + "loss": 1.71014099, + "memory(GiB)": 77.59, + "step": 7140, + "train_speed(iter/s)": 1.619707 + }, + { + "acc": 0.62587733, + "epoch": 0.18125317097919838, + "grad_norm": 5.5, + "learning_rate": 9.95494325168382e-06, + "loss": 1.74481926, + "memory(GiB)": 77.59, + "step": 7145, + "train_speed(iter/s)": 1.619828 + }, + { + "acc": 0.63875594, + "epoch": 0.18138001014713342, + "grad_norm": 5.28125, + "learning_rate": 9.954802683908192e-06, + "loss": 1.74068375, + "memory(GiB)": 77.59, + "step": 7150, + "train_speed(iter/s)": 1.619954 + }, + { + "acc": 0.64579372, + "epoch": 0.1815068493150685, + "grad_norm": 5.28125, + "learning_rate": 9.954661898198216e-06, + "loss": 1.66888332, + "memory(GiB)": 77.59, + "step": 7155, + "train_speed(iter/s)": 1.620074 + }, + { + "acc": 0.63566279, + "epoch": 0.18163368848300354, + "grad_norm": 5.9375, + "learning_rate": 9.954520894560092e-06, + "loss": 1.71961861, + "memory(GiB)": 77.59, + "step": 7160, + "train_speed(iter/s)": 1.620204 + }, + { + "acc": 0.63956718, + "epoch": 0.1817605276509386, + "grad_norm": 4.6875, + "learning_rate": 9.954379673000018e-06, + "loss": 1.70595989, + "memory(GiB)": 77.59, + "step": 7165, + "train_speed(iter/s)": 1.620323 + }, + { + "acc": 0.64019213, + "epoch": 0.18188736681887366, + "grad_norm": 5.3125, + "learning_rate": 9.954238233524208e-06, + "loss": 1.72009506, + "memory(GiB)": 77.59, + "step": 7170, + "train_speed(iter/s)": 1.620443 + }, + { + "acc": 0.64163303, + "epoch": 0.18201420598680873, + "grad_norm": 5.78125, + "learning_rate": 9.954096576138879e-06, + "loss": 1.73176422, + "memory(GiB)": 77.59, + "step": 7175, + "train_speed(iter/s)": 1.620558 + }, + { + "acc": 0.6387723, + "epoch": 0.18214104515474377, + "grad_norm": 5.375, + "learning_rate": 9.953954700850264e-06, + "loss": 1.71393185, + "memory(GiB)": 77.59, + "step": 7180, + "train_speed(iter/s)": 1.620686 + }, + { + "acc": 0.63552141, + "epoch": 0.18226788432267885, + "grad_norm": 4.84375, + "learning_rate": 9.953812607664607e-06, + "loss": 1.72599831, + "memory(GiB)": 77.59, + "step": 7185, + "train_speed(iter/s)": 1.6208 + }, + { + "acc": 0.65808129, + "epoch": 0.1823947234906139, + "grad_norm": 6.15625, + "learning_rate": 9.95367029658815e-06, + "loss": 1.67959442, + "memory(GiB)": 77.59, + "step": 7190, + "train_speed(iter/s)": 1.620908 + }, + { + "acc": 0.63875608, + "epoch": 0.18252156265854896, + "grad_norm": 6.4375, + "learning_rate": 9.953527767627159e-06, + "loss": 1.62620068, + "memory(GiB)": 77.59, + "step": 7195, + "train_speed(iter/s)": 1.621024 + }, + { + "acc": 0.66544304, + "epoch": 0.182648401826484, + "grad_norm": 4.96875, + "learning_rate": 9.9533850207879e-06, + "loss": 1.60160732, + "memory(GiB)": 77.59, + "step": 7200, + "train_speed(iter/s)": 1.621144 + }, + { + "acc": 0.65402861, + "epoch": 0.18277524099441908, + "grad_norm": 6.125, + "learning_rate": 9.953242056076652e-06, + "loss": 1.6854044, + "memory(GiB)": 77.59, + "step": 7205, + "train_speed(iter/s)": 1.621263 + }, + { + "acc": 0.63643265, + "epoch": 0.18290208016235412, + "grad_norm": 6.34375, + "learning_rate": 9.953098873499705e-06, + "loss": 1.73724136, + "memory(GiB)": 77.59, + "step": 7210, + "train_speed(iter/s)": 1.62139 + }, + { + "acc": 0.650736, + "epoch": 0.1830289193302892, + "grad_norm": 5.15625, + "learning_rate": 9.952955473063356e-06, + "loss": 1.61893234, + "memory(GiB)": 77.59, + "step": 7215, + "train_speed(iter/s)": 1.621515 + }, + { + "acc": 0.63614092, + "epoch": 0.18315575849822424, + "grad_norm": 5.40625, + "learning_rate": 9.952811854773911e-06, + "loss": 1.72106552, + "memory(GiB)": 77.59, + "step": 7220, + "train_speed(iter/s)": 1.621638 + }, + { + "acc": 0.63416362, + "epoch": 0.1832825976661593, + "grad_norm": 6.3125, + "learning_rate": 9.952668018637687e-06, + "loss": 1.7122797, + "memory(GiB)": 77.59, + "step": 7225, + "train_speed(iter/s)": 1.621759 + }, + { + "acc": 0.64628859, + "epoch": 0.18340943683409436, + "grad_norm": 5.625, + "learning_rate": 9.952523964661014e-06, + "loss": 1.66565475, + "memory(GiB)": 77.59, + "step": 7230, + "train_speed(iter/s)": 1.621883 + }, + { + "acc": 0.64756575, + "epoch": 0.18353627600202943, + "grad_norm": 6.15625, + "learning_rate": 9.952379692850222e-06, + "loss": 1.6767374, + "memory(GiB)": 77.59, + "step": 7235, + "train_speed(iter/s)": 1.622007 + }, + { + "acc": 0.65332966, + "epoch": 0.18366311516996447, + "grad_norm": 9.3125, + "learning_rate": 9.952235203211663e-06, + "loss": 1.62064381, + "memory(GiB)": 77.59, + "step": 7240, + "train_speed(iter/s)": 1.622129 + }, + { + "acc": 0.65426602, + "epoch": 0.18378995433789955, + "grad_norm": 5.25, + "learning_rate": 9.952090495751689e-06, + "loss": 1.64998131, + "memory(GiB)": 77.59, + "step": 7245, + "train_speed(iter/s)": 1.622253 + }, + { + "acc": 0.64379072, + "epoch": 0.1839167935058346, + "grad_norm": 6.0625, + "learning_rate": 9.951945570476666e-06, + "loss": 1.65906219, + "memory(GiB)": 77.59, + "step": 7250, + "train_speed(iter/s)": 1.622381 + }, + { + "acc": 0.62811546, + "epoch": 0.18404363267376966, + "grad_norm": 6.0, + "learning_rate": 9.951800427392968e-06, + "loss": 1.74646511, + "memory(GiB)": 77.59, + "step": 7255, + "train_speed(iter/s)": 1.622507 + }, + { + "acc": 0.64205914, + "epoch": 0.1841704718417047, + "grad_norm": 5.78125, + "learning_rate": 9.951655066506977e-06, + "loss": 1.75297375, + "memory(GiB)": 77.59, + "step": 7260, + "train_speed(iter/s)": 1.622634 + }, + { + "acc": 0.63896675, + "epoch": 0.18429731100963978, + "grad_norm": 5.4375, + "learning_rate": 9.951509487825091e-06, + "loss": 1.64735317, + "memory(GiB)": 77.59, + "step": 7265, + "train_speed(iter/s)": 1.622749 + }, + { + "acc": 0.6320116, + "epoch": 0.18442415017757482, + "grad_norm": 6.59375, + "learning_rate": 9.95136369135371e-06, + "loss": 1.73418388, + "memory(GiB)": 77.59, + "step": 7270, + "train_speed(iter/s)": 1.622871 + }, + { + "acc": 0.64414997, + "epoch": 0.1845509893455099, + "grad_norm": 5.25, + "learning_rate": 9.951217677099248e-06, + "loss": 1.69243927, + "memory(GiB)": 77.59, + "step": 7275, + "train_speed(iter/s)": 1.622996 + }, + { + "acc": 0.6418952, + "epoch": 0.18467782851344494, + "grad_norm": 5.28125, + "learning_rate": 9.951071445068125e-06, + "loss": 1.65538559, + "memory(GiB)": 77.59, + "step": 7280, + "train_speed(iter/s)": 1.623118 + }, + { + "acc": 0.63668184, + "epoch": 0.18480466768138, + "grad_norm": 6.1875, + "learning_rate": 9.950924995266778e-06, + "loss": 1.70533619, + "memory(GiB)": 77.59, + "step": 7285, + "train_speed(iter/s)": 1.623243 + }, + { + "acc": 0.6396327, + "epoch": 0.18493150684931506, + "grad_norm": 6.0, + "learning_rate": 9.950778327701643e-06, + "loss": 1.6748024, + "memory(GiB)": 77.59, + "step": 7290, + "train_speed(iter/s)": 1.623367 + }, + { + "acc": 0.65611219, + "epoch": 0.18505834601725013, + "grad_norm": 4.6875, + "learning_rate": 9.950631442379175e-06, + "loss": 1.66338844, + "memory(GiB)": 77.59, + "step": 7295, + "train_speed(iter/s)": 1.623488 + }, + { + "acc": 0.64618397, + "epoch": 0.18518518518518517, + "grad_norm": 6.03125, + "learning_rate": 9.950484339305832e-06, + "loss": 1.64510574, + "memory(GiB)": 77.59, + "step": 7300, + "train_speed(iter/s)": 1.623595 + }, + { + "acc": 0.65088601, + "epoch": 0.18531202435312025, + "grad_norm": 5.6875, + "learning_rate": 9.950337018488086e-06, + "loss": 1.70006027, + "memory(GiB)": 77.59, + "step": 7305, + "train_speed(iter/s)": 1.623709 + }, + { + "acc": 0.63807602, + "epoch": 0.1854388635210553, + "grad_norm": 5.0, + "learning_rate": 9.950189479932417e-06, + "loss": 1.69802361, + "memory(GiB)": 77.59, + "step": 7310, + "train_speed(iter/s)": 1.623823 + }, + { + "acc": 0.64677753, + "epoch": 0.18556570268899036, + "grad_norm": 5.28125, + "learning_rate": 9.950041723645312e-06, + "loss": 1.66775341, + "memory(GiB)": 77.59, + "step": 7315, + "train_speed(iter/s)": 1.623944 + }, + { + "acc": 0.64219332, + "epoch": 0.1856925418569254, + "grad_norm": 5.0, + "learning_rate": 9.949893749633273e-06, + "loss": 1.71438885, + "memory(GiB)": 77.59, + "step": 7320, + "train_speed(iter/s)": 1.62406 + }, + { + "acc": 0.64291296, + "epoch": 0.18581938102486048, + "grad_norm": 5.8125, + "learning_rate": 9.949745557902806e-06, + "loss": 1.73521843, + "memory(GiB)": 77.59, + "step": 7325, + "train_speed(iter/s)": 1.624177 + }, + { + "acc": 0.64869943, + "epoch": 0.18594622019279552, + "grad_norm": 4.53125, + "learning_rate": 9.949597148460433e-06, + "loss": 1.69890499, + "memory(GiB)": 77.59, + "step": 7330, + "train_speed(iter/s)": 1.624298 + }, + { + "acc": 0.63394394, + "epoch": 0.1860730593607306, + "grad_norm": 5.71875, + "learning_rate": 9.949448521312676e-06, + "loss": 1.79771175, + "memory(GiB)": 77.59, + "step": 7335, + "train_speed(iter/s)": 1.624418 + }, + { + "acc": 0.65008349, + "epoch": 0.18619989852866564, + "grad_norm": 7.4375, + "learning_rate": 9.949299676466077e-06, + "loss": 1.70881271, + "memory(GiB)": 77.59, + "step": 7340, + "train_speed(iter/s)": 1.624544 + }, + { + "acc": 0.6357439, + "epoch": 0.1863267376966007, + "grad_norm": 6.15625, + "learning_rate": 9.94915061392718e-06, + "loss": 1.66827774, + "memory(GiB)": 77.59, + "step": 7345, + "train_speed(iter/s)": 1.624663 + }, + { + "acc": 0.65615187, + "epoch": 0.18645357686453576, + "grad_norm": 5.40625, + "learning_rate": 9.949001333702543e-06, + "loss": 1.63151321, + "memory(GiB)": 77.59, + "step": 7350, + "train_speed(iter/s)": 1.624785 + }, + { + "acc": 0.63377533, + "epoch": 0.18658041603247083, + "grad_norm": 5.90625, + "learning_rate": 9.948851835798732e-06, + "loss": 1.7970108, + "memory(GiB)": 77.59, + "step": 7355, + "train_speed(iter/s)": 1.624897 + }, + { + "acc": 0.62856464, + "epoch": 0.18670725520040587, + "grad_norm": 5.03125, + "learning_rate": 9.948702120222323e-06, + "loss": 1.73523903, + "memory(GiB)": 77.59, + "step": 7360, + "train_speed(iter/s)": 1.625016 + }, + { + "acc": 0.66290793, + "epoch": 0.18683409436834095, + "grad_norm": 6.0, + "learning_rate": 9.9485521869799e-06, + "loss": 1.63394623, + "memory(GiB)": 77.59, + "step": 7365, + "train_speed(iter/s)": 1.625131 + }, + { + "acc": 0.66382289, + "epoch": 0.186960933536276, + "grad_norm": 5.28125, + "learning_rate": 9.948402036078057e-06, + "loss": 1.57046251, + "memory(GiB)": 77.59, + "step": 7370, + "train_speed(iter/s)": 1.62526 + }, + { + "acc": 0.63160882, + "epoch": 0.18708777270421106, + "grad_norm": 6.5625, + "learning_rate": 9.948251667523401e-06, + "loss": 1.72319946, + "memory(GiB)": 77.59, + "step": 7375, + "train_speed(iter/s)": 1.625377 + }, + { + "acc": 0.63488054, + "epoch": 0.1872146118721461, + "grad_norm": 5.375, + "learning_rate": 9.948101081322544e-06, + "loss": 1.62635155, + "memory(GiB)": 77.59, + "step": 7380, + "train_speed(iter/s)": 1.625497 + }, + { + "acc": 0.6456624, + "epoch": 0.18734145104008118, + "grad_norm": 4.9375, + "learning_rate": 9.947950277482109e-06, + "loss": 1.65658779, + "memory(GiB)": 77.59, + "step": 7385, + "train_speed(iter/s)": 1.62562 + }, + { + "acc": 0.64808559, + "epoch": 0.18746829020801623, + "grad_norm": 5.75, + "learning_rate": 9.94779925600873e-06, + "loss": 1.68540001, + "memory(GiB)": 77.59, + "step": 7390, + "train_speed(iter/s)": 1.625739 + }, + { + "acc": 0.66459303, + "epoch": 0.1875951293759513, + "grad_norm": 5.25, + "learning_rate": 9.947648016909048e-06, + "loss": 1.63555069, + "memory(GiB)": 77.59, + "step": 7395, + "train_speed(iter/s)": 1.625863 + }, + { + "acc": 0.64715314, + "epoch": 0.18772196854388634, + "grad_norm": 6.3125, + "learning_rate": 9.947496560189717e-06, + "loss": 1.66416512, + "memory(GiB)": 77.59, + "step": 7400, + "train_speed(iter/s)": 1.625984 + }, + { + "acc": 0.64939599, + "epoch": 0.18784880771182141, + "grad_norm": 5.25, + "learning_rate": 9.9473448858574e-06, + "loss": 1.68471889, + "memory(GiB)": 77.59, + "step": 7405, + "train_speed(iter/s)": 1.626106 + }, + { + "acc": 0.62002168, + "epoch": 0.18797564687975646, + "grad_norm": 5.96875, + "learning_rate": 9.947192993918765e-06, + "loss": 1.78501797, + "memory(GiB)": 77.59, + "step": 7410, + "train_speed(iter/s)": 1.626217 + }, + { + "acc": 0.632866, + "epoch": 0.18810248604769153, + "grad_norm": 5.96875, + "learning_rate": 9.947040884380496e-06, + "loss": 1.70102005, + "memory(GiB)": 77.59, + "step": 7415, + "train_speed(iter/s)": 1.62633 + }, + { + "acc": 0.65509653, + "epoch": 0.18822932521562658, + "grad_norm": 6.40625, + "learning_rate": 9.946888557249281e-06, + "loss": 1.65170479, + "memory(GiB)": 77.59, + "step": 7420, + "train_speed(iter/s)": 1.626448 + }, + { + "acc": 0.64921708, + "epoch": 0.18835616438356165, + "grad_norm": 5.40625, + "learning_rate": 9.946736012531821e-06, + "loss": 1.66297741, + "memory(GiB)": 77.59, + "step": 7425, + "train_speed(iter/s)": 1.626577 + }, + { + "acc": 0.64842544, + "epoch": 0.1884830035514967, + "grad_norm": 5.1875, + "learning_rate": 9.946583250234826e-06, + "loss": 1.65580101, + "memory(GiB)": 77.59, + "step": 7430, + "train_speed(iter/s)": 1.626698 + }, + { + "acc": 0.64534473, + "epoch": 0.18860984271943176, + "grad_norm": 4.84375, + "learning_rate": 9.946430270365015e-06, + "loss": 1.61940098, + "memory(GiB)": 77.59, + "step": 7435, + "train_speed(iter/s)": 1.626814 + }, + { + "acc": 0.65221057, + "epoch": 0.1887366818873668, + "grad_norm": 5.71875, + "learning_rate": 9.946277072929115e-06, + "loss": 1.64499588, + "memory(GiB)": 77.59, + "step": 7440, + "train_speed(iter/s)": 1.626932 + }, + { + "acc": 0.64241719, + "epoch": 0.18886352105530188, + "grad_norm": 5.3125, + "learning_rate": 9.946123657933867e-06, + "loss": 1.69488716, + "memory(GiB)": 77.59, + "step": 7445, + "train_speed(iter/s)": 1.627039 + }, + { + "acc": 0.64012222, + "epoch": 0.18899036022323693, + "grad_norm": 5.71875, + "learning_rate": 9.945970025386018e-06, + "loss": 1.62594109, + "memory(GiB)": 77.59, + "step": 7450, + "train_speed(iter/s)": 1.627153 + }, + { + "acc": 0.64713979, + "epoch": 0.189117199391172, + "grad_norm": 6.46875, + "learning_rate": 9.945816175292326e-06, + "loss": 1.6352438, + "memory(GiB)": 77.59, + "step": 7455, + "train_speed(iter/s)": 1.627272 + }, + { + "acc": 0.64514918, + "epoch": 0.18924403855910704, + "grad_norm": 5.3125, + "learning_rate": 9.945662107659554e-06, + "loss": 1.71115894, + "memory(GiB)": 77.59, + "step": 7460, + "train_speed(iter/s)": 1.627388 + }, + { + "acc": 0.65765886, + "epoch": 0.18937087772704211, + "grad_norm": 7.625, + "learning_rate": 9.945507822494485e-06, + "loss": 1.63517151, + "memory(GiB)": 77.59, + "step": 7465, + "train_speed(iter/s)": 1.627502 + }, + { + "acc": 0.64353065, + "epoch": 0.18949771689497716, + "grad_norm": 5.96875, + "learning_rate": 9.9453533198039e-06, + "loss": 1.68638687, + "memory(GiB)": 77.59, + "step": 7470, + "train_speed(iter/s)": 1.627615 + }, + { + "acc": 0.63392801, + "epoch": 0.18962455606291223, + "grad_norm": 5.3125, + "learning_rate": 9.945198599594598e-06, + "loss": 1.66831703, + "memory(GiB)": 77.59, + "step": 7475, + "train_speed(iter/s)": 1.627736 + }, + { + "acc": 0.63619165, + "epoch": 0.18975139523084728, + "grad_norm": 4.84375, + "learning_rate": 9.945043661873381e-06, + "loss": 1.73271637, + "memory(GiB)": 77.59, + "step": 7480, + "train_speed(iter/s)": 1.627849 + }, + { + "acc": 0.65396185, + "epoch": 0.18987823439878235, + "grad_norm": 4.84375, + "learning_rate": 9.944888506647066e-06, + "loss": 1.6267189, + "memory(GiB)": 77.59, + "step": 7485, + "train_speed(iter/s)": 1.627967 + }, + { + "acc": 0.65445547, + "epoch": 0.1900050735667174, + "grad_norm": 5.875, + "learning_rate": 9.944733133922479e-06, + "loss": 1.65859509, + "memory(GiB)": 77.59, + "step": 7490, + "train_speed(iter/s)": 1.628086 + }, + { + "acc": 0.64349761, + "epoch": 0.19013191273465246, + "grad_norm": 6.0625, + "learning_rate": 9.944577543706451e-06, + "loss": 1.66820087, + "memory(GiB)": 77.59, + "step": 7495, + "train_speed(iter/s)": 1.628209 + }, + { + "acc": 0.63627405, + "epoch": 0.1902587519025875, + "grad_norm": 5.40625, + "learning_rate": 9.944421736005825e-06, + "loss": 1.66228065, + "memory(GiB)": 77.59, + "step": 7500, + "train_speed(iter/s)": 1.628326 + }, + { + "acc": 0.63509989, + "epoch": 0.19038559107052258, + "grad_norm": 5.1875, + "learning_rate": 9.944265710827459e-06, + "loss": 1.75539074, + "memory(GiB)": 77.59, + "step": 7505, + "train_speed(iter/s)": 1.628447 + }, + { + "acc": 0.64721031, + "epoch": 0.19051243023845763, + "grad_norm": 5.15625, + "learning_rate": 9.944109468178208e-06, + "loss": 1.64554634, + "memory(GiB)": 77.59, + "step": 7510, + "train_speed(iter/s)": 1.628563 + }, + { + "acc": 0.64745188, + "epoch": 0.1906392694063927, + "grad_norm": 5.0, + "learning_rate": 9.943953008064953e-06, + "loss": 1.67710266, + "memory(GiB)": 77.59, + "step": 7515, + "train_speed(iter/s)": 1.62868 + }, + { + "acc": 0.64476213, + "epoch": 0.19076610857432774, + "grad_norm": 6.34375, + "learning_rate": 9.94379633049457e-06, + "loss": 1.68845291, + "memory(GiB)": 77.59, + "step": 7520, + "train_speed(iter/s)": 1.6288 + }, + { + "acc": 0.6497797, + "epoch": 0.19089294774226281, + "grad_norm": 4.9375, + "learning_rate": 9.943639435473952e-06, + "loss": 1.69892578, + "memory(GiB)": 77.59, + "step": 7525, + "train_speed(iter/s)": 1.628916 + }, + { + "acc": 0.65878482, + "epoch": 0.19101978691019786, + "grad_norm": 5.40625, + "learning_rate": 9.94348232301e-06, + "loss": 1.63249588, + "memory(GiB)": 77.59, + "step": 7530, + "train_speed(iter/s)": 1.629027 + }, + { + "acc": 0.63181257, + "epoch": 0.19114662607813293, + "grad_norm": 6.0625, + "learning_rate": 9.943324993109624e-06, + "loss": 1.71280022, + "memory(GiB)": 77.59, + "step": 7535, + "train_speed(iter/s)": 1.629141 + }, + { + "acc": 0.65265141, + "epoch": 0.19127346524606798, + "grad_norm": 7.25, + "learning_rate": 9.943167445779745e-06, + "loss": 1.61639557, + "memory(GiB)": 77.59, + "step": 7540, + "train_speed(iter/s)": 1.629258 + }, + { + "acc": 0.67400455, + "epoch": 0.19140030441400305, + "grad_norm": 6.5625, + "learning_rate": 9.94300968102729e-06, + "loss": 1.60792465, + "memory(GiB)": 77.59, + "step": 7545, + "train_speed(iter/s)": 1.629374 + }, + { + "acc": 0.65444889, + "epoch": 0.1915271435819381, + "grad_norm": 5.53125, + "learning_rate": 9.942851698859204e-06, + "loss": 1.64874229, + "memory(GiB)": 77.59, + "step": 7550, + "train_speed(iter/s)": 1.629491 + }, + { + "acc": 0.65789003, + "epoch": 0.19165398274987316, + "grad_norm": 6.0625, + "learning_rate": 9.94269349928243e-06, + "loss": 1.60055962, + "memory(GiB)": 77.59, + "step": 7555, + "train_speed(iter/s)": 1.629602 + }, + { + "acc": 0.6613224, + "epoch": 0.1917808219178082, + "grad_norm": 5.1875, + "learning_rate": 9.942535082303927e-06, + "loss": 1.58959074, + "memory(GiB)": 77.59, + "step": 7560, + "train_speed(iter/s)": 1.629712 + }, + { + "acc": 0.64439492, + "epoch": 0.19190766108574328, + "grad_norm": 5.75, + "learning_rate": 9.942376447930666e-06, + "loss": 1.68322124, + "memory(GiB)": 77.59, + "step": 7565, + "train_speed(iter/s)": 1.629823 + }, + { + "acc": 0.65919485, + "epoch": 0.19203450025367833, + "grad_norm": 4.9375, + "learning_rate": 9.942217596169623e-06, + "loss": 1.57794476, + "memory(GiB)": 77.59, + "step": 7570, + "train_speed(iter/s)": 1.629936 + }, + { + "acc": 0.62779818, + "epoch": 0.1921613394216134, + "grad_norm": 4.96875, + "learning_rate": 9.942058527027785e-06, + "loss": 1.73289337, + "memory(GiB)": 77.59, + "step": 7575, + "train_speed(iter/s)": 1.630051 + }, + { + "acc": 0.62824359, + "epoch": 0.19228817858954844, + "grad_norm": 6.65625, + "learning_rate": 9.941899240512147e-06, + "loss": 1.69513054, + "memory(GiB)": 77.59, + "step": 7580, + "train_speed(iter/s)": 1.630168 + }, + { + "acc": 0.63809719, + "epoch": 0.19241501775748351, + "grad_norm": 5.5, + "learning_rate": 9.941739736629716e-06, + "loss": 1.67402, + "memory(GiB)": 77.59, + "step": 7585, + "train_speed(iter/s)": 1.630282 + }, + { + "acc": 0.64863114, + "epoch": 0.19254185692541856, + "grad_norm": 5.375, + "learning_rate": 9.941580015387509e-06, + "loss": 1.61916904, + "memory(GiB)": 77.59, + "step": 7590, + "train_speed(iter/s)": 1.630397 + }, + { + "acc": 0.62263918, + "epoch": 0.19266869609335363, + "grad_norm": 8.9375, + "learning_rate": 9.94142007679255e-06, + "loss": 1.76624832, + "memory(GiB)": 77.59, + "step": 7595, + "train_speed(iter/s)": 1.630518 + }, + { + "acc": 0.66040039, + "epoch": 0.19279553526128868, + "grad_norm": 7.21875, + "learning_rate": 9.941259920851874e-06, + "loss": 1.6323246, + "memory(GiB)": 77.59, + "step": 7600, + "train_speed(iter/s)": 1.630631 + }, + { + "acc": 0.64601893, + "epoch": 0.19292237442922375, + "grad_norm": 7.6875, + "learning_rate": 9.941099547572527e-06, + "loss": 1.67999172, + "memory(GiB)": 77.59, + "step": 7605, + "train_speed(iter/s)": 1.630755 + }, + { + "acc": 0.65333796, + "epoch": 0.1930492135971588, + "grad_norm": 5.09375, + "learning_rate": 9.94093895696156e-06, + "loss": 1.65717735, + "memory(GiB)": 77.59, + "step": 7610, + "train_speed(iter/s)": 1.630869 + }, + { + "acc": 0.63890438, + "epoch": 0.19317605276509386, + "grad_norm": 6.15625, + "learning_rate": 9.940778149026038e-06, + "loss": 1.69429169, + "memory(GiB)": 77.59, + "step": 7615, + "train_speed(iter/s)": 1.630981 + }, + { + "acc": 0.6394372, + "epoch": 0.1933028919330289, + "grad_norm": 5.4375, + "learning_rate": 9.940617123773036e-06, + "loss": 1.69591408, + "memory(GiB)": 77.59, + "step": 7620, + "train_speed(iter/s)": 1.631098 + }, + { + "acc": 0.65708766, + "epoch": 0.19342973110096398, + "grad_norm": 5.75, + "learning_rate": 9.940455881209632e-06, + "loss": 1.57650871, + "memory(GiB)": 77.59, + "step": 7625, + "train_speed(iter/s)": 1.631219 + }, + { + "acc": 0.64314799, + "epoch": 0.19355657026889903, + "grad_norm": 5.59375, + "learning_rate": 9.940294421342922e-06, + "loss": 1.60519085, + "memory(GiB)": 77.59, + "step": 7630, + "train_speed(iter/s)": 1.631332 + }, + { + "acc": 0.6334199, + "epoch": 0.1936834094368341, + "grad_norm": 6.15625, + "learning_rate": 9.940132744180007e-06, + "loss": 1.7166523, + "memory(GiB)": 77.59, + "step": 7635, + "train_speed(iter/s)": 1.631445 + }, + { + "acc": 0.65944061, + "epoch": 0.19381024860476914, + "grad_norm": 5.5625, + "learning_rate": 9.939970849727995e-06, + "loss": 1.61455383, + "memory(GiB)": 77.59, + "step": 7640, + "train_speed(iter/s)": 1.631554 + }, + { + "acc": 0.63408394, + "epoch": 0.19393708777270421, + "grad_norm": 5.6875, + "learning_rate": 9.939808737994013e-06, + "loss": 1.72968178, + "memory(GiB)": 77.59, + "step": 7645, + "train_speed(iter/s)": 1.631674 + }, + { + "acc": 0.6466785, + "epoch": 0.19406392694063926, + "grad_norm": 5.90625, + "learning_rate": 9.939646408985186e-06, + "loss": 1.61666393, + "memory(GiB)": 77.59, + "step": 7650, + "train_speed(iter/s)": 1.631785 + }, + { + "acc": 0.65346975, + "epoch": 0.19419076610857433, + "grad_norm": 5.9375, + "learning_rate": 9.939483862708658e-06, + "loss": 1.62012177, + "memory(GiB)": 77.59, + "step": 7655, + "train_speed(iter/s)": 1.631903 + }, + { + "acc": 0.64003558, + "epoch": 0.19431760527650938, + "grad_norm": 5.75, + "learning_rate": 9.939321099171575e-06, + "loss": 1.64041882, + "memory(GiB)": 77.59, + "step": 7660, + "train_speed(iter/s)": 1.632007 + }, + { + "acc": 0.64093351, + "epoch": 0.19444444444444445, + "grad_norm": 4.9375, + "learning_rate": 9.939158118381097e-06, + "loss": 1.72127285, + "memory(GiB)": 77.59, + "step": 7665, + "train_speed(iter/s)": 1.63212 + }, + { + "acc": 0.63992558, + "epoch": 0.1945712836123795, + "grad_norm": 6.0625, + "learning_rate": 9.938994920344395e-06, + "loss": 1.68768444, + "memory(GiB)": 77.59, + "step": 7670, + "train_speed(iter/s)": 1.632232 + }, + { + "acc": 0.64807968, + "epoch": 0.19469812278031456, + "grad_norm": 5.0, + "learning_rate": 9.938831505068645e-06, + "loss": 1.62658596, + "memory(GiB)": 77.59, + "step": 7675, + "train_speed(iter/s)": 1.632355 + }, + { + "acc": 0.64729881, + "epoch": 0.1948249619482496, + "grad_norm": 5.375, + "learning_rate": 9.938667872561035e-06, + "loss": 1.58645487, + "memory(GiB)": 77.59, + "step": 7680, + "train_speed(iter/s)": 1.632472 + }, + { + "acc": 0.6605937, + "epoch": 0.19495180111618468, + "grad_norm": 5.46875, + "learning_rate": 9.938504022828762e-06, + "loss": 1.58746214, + "memory(GiB)": 77.59, + "step": 7685, + "train_speed(iter/s)": 1.632586 + }, + { + "acc": 0.64638605, + "epoch": 0.19507864028411973, + "grad_norm": 4.53125, + "learning_rate": 9.938339955879033e-06, + "loss": 1.6381422, + "memory(GiB)": 77.59, + "step": 7690, + "train_speed(iter/s)": 1.632697 + }, + { + "acc": 0.64010434, + "epoch": 0.1952054794520548, + "grad_norm": 5.21875, + "learning_rate": 9.938175671719064e-06, + "loss": 1.67854919, + "memory(GiB)": 77.59, + "step": 7695, + "train_speed(iter/s)": 1.632807 + }, + { + "acc": 0.62463942, + "epoch": 0.19533231861998984, + "grad_norm": 5.78125, + "learning_rate": 9.938011170356083e-06, + "loss": 1.65680466, + "memory(GiB)": 77.59, + "step": 7700, + "train_speed(iter/s)": 1.63292 + }, + { + "acc": 0.63911147, + "epoch": 0.19545915778792491, + "grad_norm": 8.5, + "learning_rate": 9.937846451797324e-06, + "loss": 1.67437267, + "memory(GiB)": 77.59, + "step": 7705, + "train_speed(iter/s)": 1.633029 + }, + { + "acc": 0.65099292, + "epoch": 0.19558599695585996, + "grad_norm": 7.03125, + "learning_rate": 9.93768151605003e-06, + "loss": 1.61787701, + "memory(GiB)": 77.59, + "step": 7710, + "train_speed(iter/s)": 1.633139 + }, + { + "acc": 0.65595174, + "epoch": 0.19571283612379503, + "grad_norm": 6.125, + "learning_rate": 9.93751636312146e-06, + "loss": 1.62009678, + "memory(GiB)": 77.59, + "step": 7715, + "train_speed(iter/s)": 1.63325 + }, + { + "acc": 0.63047824, + "epoch": 0.19583967529173008, + "grad_norm": 6.90625, + "learning_rate": 9.937350993018875e-06, + "loss": 1.75598621, + "memory(GiB)": 77.59, + "step": 7720, + "train_speed(iter/s)": 1.633362 + }, + { + "acc": 0.6408926, + "epoch": 0.19596651445966515, + "grad_norm": 4.84375, + "learning_rate": 9.93718540574955e-06, + "loss": 1.65693874, + "memory(GiB)": 77.59, + "step": 7725, + "train_speed(iter/s)": 1.633478 + }, + { + "acc": 0.63349919, + "epoch": 0.1960933536276002, + "grad_norm": 5.0, + "learning_rate": 9.937019601320768e-06, + "loss": 1.72434883, + "memory(GiB)": 77.59, + "step": 7730, + "train_speed(iter/s)": 1.633592 + }, + { + "acc": 0.62782717, + "epoch": 0.19622019279553526, + "grad_norm": 5.71875, + "learning_rate": 9.936853579739823e-06, + "loss": 1.67874832, + "memory(GiB)": 77.59, + "step": 7735, + "train_speed(iter/s)": 1.633704 + }, + { + "acc": 0.62826281, + "epoch": 0.1963470319634703, + "grad_norm": 6.9375, + "learning_rate": 9.936687341014015e-06, + "loss": 1.75626316, + "memory(GiB)": 77.59, + "step": 7740, + "train_speed(iter/s)": 1.633807 + }, + { + "acc": 0.65873413, + "epoch": 0.19647387113140538, + "grad_norm": 5.40625, + "learning_rate": 9.936520885150655e-06, + "loss": 1.5933773, + "memory(GiB)": 77.59, + "step": 7745, + "train_speed(iter/s)": 1.633913 + }, + { + "acc": 0.64515133, + "epoch": 0.19660071029934043, + "grad_norm": 5.1875, + "learning_rate": 9.936354212157068e-06, + "loss": 1.71064873, + "memory(GiB)": 77.59, + "step": 7750, + "train_speed(iter/s)": 1.634017 + }, + { + "acc": 0.64298263, + "epoch": 0.1967275494672755, + "grad_norm": 4.5, + "learning_rate": 9.936187322040584e-06, + "loss": 1.68684635, + "memory(GiB)": 77.59, + "step": 7755, + "train_speed(iter/s)": 1.634132 + }, + { + "acc": 0.63161044, + "epoch": 0.19685438863521054, + "grad_norm": 5.125, + "learning_rate": 9.936020214808544e-06, + "loss": 1.72244186, + "memory(GiB)": 77.59, + "step": 7760, + "train_speed(iter/s)": 1.634242 + }, + { + "acc": 0.65020905, + "epoch": 0.19698122780314561, + "grad_norm": 5.5, + "learning_rate": 9.935852890468297e-06, + "loss": 1.6733654, + "memory(GiB)": 77.59, + "step": 7765, + "train_speed(iter/s)": 1.634352 + }, + { + "acc": 0.63110623, + "epoch": 0.19710806697108066, + "grad_norm": 8.125, + "learning_rate": 9.935685349027201e-06, + "loss": 1.68286629, + "memory(GiB)": 77.59, + "step": 7770, + "train_speed(iter/s)": 1.634463 + }, + { + "acc": 0.63578558, + "epoch": 0.19723490613901573, + "grad_norm": 5.34375, + "learning_rate": 9.935517590492627e-06, + "loss": 1.67285881, + "memory(GiB)": 77.59, + "step": 7775, + "train_speed(iter/s)": 1.634571 + }, + { + "acc": 0.6535295, + "epoch": 0.19736174530695078, + "grad_norm": 5.25, + "learning_rate": 9.935349614871957e-06, + "loss": 1.61393757, + "memory(GiB)": 77.59, + "step": 7780, + "train_speed(iter/s)": 1.634683 + }, + { + "acc": 0.65299621, + "epoch": 0.19748858447488585, + "grad_norm": 5.3125, + "learning_rate": 9.935181422172574e-06, + "loss": 1.66883202, + "memory(GiB)": 77.59, + "step": 7785, + "train_speed(iter/s)": 1.634797 + }, + { + "acc": 0.63826895, + "epoch": 0.1976154236428209, + "grad_norm": 6.5625, + "learning_rate": 9.935013012401878e-06, + "loss": 1.77757244, + "memory(GiB)": 77.59, + "step": 7790, + "train_speed(iter/s)": 1.634914 + }, + { + "acc": 0.65492325, + "epoch": 0.19774226281075596, + "grad_norm": 5.75, + "learning_rate": 9.934844385567275e-06, + "loss": 1.61685257, + "memory(GiB)": 77.59, + "step": 7795, + "train_speed(iter/s)": 1.635024 + }, + { + "acc": 0.64512129, + "epoch": 0.197869101978691, + "grad_norm": 6.3125, + "learning_rate": 9.934675541676186e-06, + "loss": 1.65668201, + "memory(GiB)": 77.59, + "step": 7800, + "train_speed(iter/s)": 1.63514 + }, + { + "acc": 0.65817451, + "epoch": 0.19799594114662608, + "grad_norm": 6.84375, + "learning_rate": 9.934506480736034e-06, + "loss": 1.62153244, + "memory(GiB)": 77.59, + "step": 7805, + "train_speed(iter/s)": 1.635252 + }, + { + "acc": 0.65557733, + "epoch": 0.19812278031456113, + "grad_norm": 5.6875, + "learning_rate": 9.934337202754257e-06, + "loss": 1.6400877, + "memory(GiB)": 77.59, + "step": 7810, + "train_speed(iter/s)": 1.635368 + }, + { + "acc": 0.65059519, + "epoch": 0.1982496194824962, + "grad_norm": 6.25, + "learning_rate": 9.934167707738298e-06, + "loss": 1.71084137, + "memory(GiB)": 77.59, + "step": 7815, + "train_speed(iter/s)": 1.635488 + }, + { + "acc": 0.64949894, + "epoch": 0.19837645865043124, + "grad_norm": 6.875, + "learning_rate": 9.933997995695615e-06, + "loss": 1.66692123, + "memory(GiB)": 77.59, + "step": 7820, + "train_speed(iter/s)": 1.635587 + }, + { + "acc": 0.6385705, + "epoch": 0.19850329781836631, + "grad_norm": 5.28125, + "learning_rate": 9.93382806663367e-06, + "loss": 1.71105919, + "memory(GiB)": 77.59, + "step": 7825, + "train_speed(iter/s)": 1.635702 + }, + { + "acc": 0.64712896, + "epoch": 0.19863013698630136, + "grad_norm": 5.65625, + "learning_rate": 9.933657920559939e-06, + "loss": 1.65789585, + "memory(GiB)": 77.59, + "step": 7830, + "train_speed(iter/s)": 1.635812 + }, + { + "acc": 0.64820595, + "epoch": 0.19875697615423643, + "grad_norm": 5.34375, + "learning_rate": 9.933487557481905e-06, + "loss": 1.67896671, + "memory(GiB)": 77.59, + "step": 7835, + "train_speed(iter/s)": 1.635923 + }, + { + "acc": 0.65184479, + "epoch": 0.19888381532217148, + "grad_norm": 5.28125, + "learning_rate": 9.933316977407063e-06, + "loss": 1.72082691, + "memory(GiB)": 77.59, + "step": 7840, + "train_speed(iter/s)": 1.636028 + }, + { + "acc": 0.66542158, + "epoch": 0.19901065449010655, + "grad_norm": 5.0625, + "learning_rate": 9.933146180342914e-06, + "loss": 1.56001062, + "memory(GiB)": 77.59, + "step": 7845, + "train_speed(iter/s)": 1.636133 + }, + { + "acc": 0.65702295, + "epoch": 0.1991374936580416, + "grad_norm": 5.15625, + "learning_rate": 9.932975166296972e-06, + "loss": 1.65709152, + "memory(GiB)": 77.59, + "step": 7850, + "train_speed(iter/s)": 1.636236 + }, + { + "acc": 0.64513555, + "epoch": 0.19926433282597666, + "grad_norm": 6.5, + "learning_rate": 9.932803935276757e-06, + "loss": 1.6050312, + "memory(GiB)": 77.59, + "step": 7855, + "train_speed(iter/s)": 1.636336 + }, + { + "acc": 0.63779025, + "epoch": 0.1993911719939117, + "grad_norm": 6.59375, + "learning_rate": 9.932632487289802e-06, + "loss": 1.73334351, + "memory(GiB)": 77.59, + "step": 7860, + "train_speed(iter/s)": 1.636443 + }, + { + "acc": 0.64397364, + "epoch": 0.19951801116184678, + "grad_norm": 5.4375, + "learning_rate": 9.932460822343649e-06, + "loss": 1.66076736, + "memory(GiB)": 77.59, + "step": 7865, + "train_speed(iter/s)": 1.636548 + }, + { + "acc": 0.63360085, + "epoch": 0.19964485032978183, + "grad_norm": 6.03125, + "learning_rate": 9.932288940445845e-06, + "loss": 1.79628563, + "memory(GiB)": 77.59, + "step": 7870, + "train_speed(iter/s)": 1.636653 + }, + { + "acc": 0.6418045, + "epoch": 0.1997716894977169, + "grad_norm": 6.5625, + "learning_rate": 9.932116841603954e-06, + "loss": 1.61534061, + "memory(GiB)": 77.59, + "step": 7875, + "train_speed(iter/s)": 1.636764 + }, + { + "acc": 0.65211296, + "epoch": 0.19989852866565194, + "grad_norm": 7.21875, + "learning_rate": 9.931944525825542e-06, + "loss": 1.64864426, + "memory(GiB)": 77.59, + "step": 7880, + "train_speed(iter/s)": 1.636877 + }, + { + "acc": 0.64422231, + "epoch": 0.20002536783358701, + "grad_norm": 6.53125, + "learning_rate": 9.931771993118191e-06, + "loss": 1.61573906, + "memory(GiB)": 77.59, + "step": 7885, + "train_speed(iter/s)": 1.636974 + }, + { + "acc": 0.64482603, + "epoch": 0.20015220700152206, + "grad_norm": 4.46875, + "learning_rate": 9.931599243489489e-06, + "loss": 1.69481201, + "memory(GiB)": 77.59, + "step": 7890, + "train_speed(iter/s)": 1.637078 + }, + { + "acc": 0.64780335, + "epoch": 0.20027904616945713, + "grad_norm": 5.78125, + "learning_rate": 9.931426276947037e-06, + "loss": 1.65161324, + "memory(GiB)": 77.59, + "step": 7895, + "train_speed(iter/s)": 1.637189 + }, + { + "acc": 0.63699322, + "epoch": 0.20040588533739218, + "grad_norm": 6.0625, + "learning_rate": 9.931253093498437e-06, + "loss": 1.65363426, + "memory(GiB)": 77.59, + "step": 7900, + "train_speed(iter/s)": 1.637298 + }, + { + "acc": 0.63848782, + "epoch": 0.20053272450532725, + "grad_norm": 4.34375, + "learning_rate": 9.93107969315131e-06, + "loss": 1.6675106, + "memory(GiB)": 77.59, + "step": 7905, + "train_speed(iter/s)": 1.637403 + }, + { + "acc": 0.65617108, + "epoch": 0.2006595636732623, + "grad_norm": 5.09375, + "learning_rate": 9.930906075913281e-06, + "loss": 1.66813869, + "memory(GiB)": 77.59, + "step": 7910, + "train_speed(iter/s)": 1.637508 + }, + { + "acc": 0.65991898, + "epoch": 0.20078640284119736, + "grad_norm": 4.8125, + "learning_rate": 9.93073224179199e-06, + "loss": 1.62511177, + "memory(GiB)": 77.59, + "step": 7915, + "train_speed(iter/s)": 1.637602 + }, + { + "acc": 0.64154482, + "epoch": 0.2009132420091324, + "grad_norm": 6.40625, + "learning_rate": 9.93055819079508e-06, + "loss": 1.72127247, + "memory(GiB)": 77.59, + "step": 7920, + "train_speed(iter/s)": 1.637705 + }, + { + "acc": 0.65048213, + "epoch": 0.20104008117706748, + "grad_norm": 4.59375, + "learning_rate": 9.930383922930207e-06, + "loss": 1.72336464, + "memory(GiB)": 77.59, + "step": 7925, + "train_speed(iter/s)": 1.637805 + }, + { + "acc": 0.63912992, + "epoch": 0.20116692034500253, + "grad_norm": 5.8125, + "learning_rate": 9.930209438205038e-06, + "loss": 1.70175743, + "memory(GiB)": 77.59, + "step": 7930, + "train_speed(iter/s)": 1.637907 + }, + { + "acc": 0.64162908, + "epoch": 0.2012937595129376, + "grad_norm": 5.0625, + "learning_rate": 9.930034736627245e-06, + "loss": 1.65782051, + "memory(GiB)": 77.59, + "step": 7935, + "train_speed(iter/s)": 1.638017 + }, + { + "acc": 0.64904346, + "epoch": 0.20142059868087264, + "grad_norm": 5.5, + "learning_rate": 9.929859818204514e-06, + "loss": 1.6428133, + "memory(GiB)": 77.59, + "step": 7940, + "train_speed(iter/s)": 1.638115 + }, + { + "acc": 0.66294374, + "epoch": 0.20154743784880771, + "grad_norm": 5.0, + "learning_rate": 9.929684682944538e-06, + "loss": 1.51369743, + "memory(GiB)": 77.59, + "step": 7945, + "train_speed(iter/s)": 1.638226 + }, + { + "acc": 0.63926039, + "epoch": 0.20167427701674276, + "grad_norm": 5.46875, + "learning_rate": 9.929509330855018e-06, + "loss": 1.70389633, + "memory(GiB)": 77.59, + "step": 7950, + "train_speed(iter/s)": 1.638328 + }, + { + "acc": 0.64644952, + "epoch": 0.20180111618467783, + "grad_norm": 5.3125, + "learning_rate": 9.929333761943672e-06, + "loss": 1.72466908, + "memory(GiB)": 77.59, + "step": 7955, + "train_speed(iter/s)": 1.63843 + }, + { + "acc": 0.63113956, + "epoch": 0.20192795535261288, + "grad_norm": 6.1875, + "learning_rate": 9.929157976218218e-06, + "loss": 1.75687828, + "memory(GiB)": 77.59, + "step": 7960, + "train_speed(iter/s)": 1.638536 + }, + { + "acc": 0.64263639, + "epoch": 0.20205479452054795, + "grad_norm": 6.375, + "learning_rate": 9.928981973686388e-06, + "loss": 1.72279224, + "memory(GiB)": 77.59, + "step": 7965, + "train_speed(iter/s)": 1.638639 + }, + { + "acc": 0.65860739, + "epoch": 0.202181633688483, + "grad_norm": 4.6875, + "learning_rate": 9.928805754355926e-06, + "loss": 1.57535896, + "memory(GiB)": 77.59, + "step": 7970, + "train_speed(iter/s)": 1.638737 + }, + { + "acc": 0.64154053, + "epoch": 0.20230847285641806, + "grad_norm": 6.59375, + "learning_rate": 9.92862931823458e-06, + "loss": 1.65813713, + "memory(GiB)": 77.59, + "step": 7975, + "train_speed(iter/s)": 1.638847 + }, + { + "acc": 0.65051098, + "epoch": 0.2024353120243531, + "grad_norm": 5.84375, + "learning_rate": 9.928452665330113e-06, + "loss": 1.68977509, + "memory(GiB)": 77.59, + "step": 7980, + "train_speed(iter/s)": 1.638947 + }, + { + "acc": 0.63657284, + "epoch": 0.20256215119228818, + "grad_norm": 5.4375, + "learning_rate": 9.928275795650293e-06, + "loss": 1.70972595, + "memory(GiB)": 77.59, + "step": 7985, + "train_speed(iter/s)": 1.639046 + }, + { + "acc": 0.65277157, + "epoch": 0.20268899036022323, + "grad_norm": 5.21875, + "learning_rate": 9.928098709202901e-06, + "loss": 1.6477808, + "memory(GiB)": 77.59, + "step": 7990, + "train_speed(iter/s)": 1.639146 + }, + { + "acc": 0.63990397, + "epoch": 0.2028158295281583, + "grad_norm": 7.6875, + "learning_rate": 9.927921405995727e-06, + "loss": 1.70589218, + "memory(GiB)": 77.59, + "step": 7995, + "train_speed(iter/s)": 1.639244 + }, + { + "acc": 0.6625926, + "epoch": 0.20294266869609334, + "grad_norm": 5.4375, + "learning_rate": 9.927743886036566e-06, + "loss": 1.67445526, + "memory(GiB)": 77.59, + "step": 8000, + "train_speed(iter/s)": 1.639354 + }, + { + "epoch": 0.20294266869609334, + "eval_acc": 0.6357697600745108, + "eval_loss": 1.6331746578216553, + "eval_runtime": 57.8788, + "eval_samples_per_second": 110.058, + "eval_steps_per_second": 27.523, + "step": 8000 + }, + { + "acc": 0.65977569, + "epoch": 0.20306950786402841, + "grad_norm": 5.1875, + "learning_rate": 9.927566149333228e-06, + "loss": 1.58529568, + "memory(GiB)": 77.59, + "step": 8005, + "train_speed(iter/s)": 1.618865 + }, + { + "acc": 0.63945141, + "epoch": 0.20319634703196346, + "grad_norm": 5.78125, + "learning_rate": 9.92738819589353e-06, + "loss": 1.66217422, + "memory(GiB)": 77.59, + "step": 8010, + "train_speed(iter/s)": 1.618984 + }, + { + "acc": 0.6445693, + "epoch": 0.20332318619989853, + "grad_norm": 6.09375, + "learning_rate": 9.927210025725301e-06, + "loss": 1.64672394, + "memory(GiB)": 77.59, + "step": 8015, + "train_speed(iter/s)": 1.619103 + }, + { + "acc": 0.64517527, + "epoch": 0.20345002536783358, + "grad_norm": 6.34375, + "learning_rate": 9.927031638836377e-06, + "loss": 1.68545208, + "memory(GiB)": 77.59, + "step": 8020, + "train_speed(iter/s)": 1.619226 + }, + { + "acc": 0.64619102, + "epoch": 0.20357686453576865, + "grad_norm": 6.53125, + "learning_rate": 9.926853035234603e-06, + "loss": 1.67822075, + "memory(GiB)": 77.59, + "step": 8025, + "train_speed(iter/s)": 1.619352 + }, + { + "acc": 0.65307193, + "epoch": 0.2037037037037037, + "grad_norm": 6.25, + "learning_rate": 9.926674214927836e-06, + "loss": 1.72326927, + "memory(GiB)": 77.59, + "step": 8030, + "train_speed(iter/s)": 1.619473 + }, + { + "acc": 0.64413338, + "epoch": 0.20383054287163876, + "grad_norm": 4.71875, + "learning_rate": 9.926495177923941e-06, + "loss": 1.63698082, + "memory(GiB)": 77.59, + "step": 8035, + "train_speed(iter/s)": 1.61959 + }, + { + "acc": 0.64417505, + "epoch": 0.2039573820395738, + "grad_norm": 6.15625, + "learning_rate": 9.926315924230794e-06, + "loss": 1.67357693, + "memory(GiB)": 77.59, + "step": 8040, + "train_speed(iter/s)": 1.619706 + }, + { + "acc": 0.64143553, + "epoch": 0.20408422120750888, + "grad_norm": 5.40625, + "learning_rate": 9.926136453856277e-06, + "loss": 1.6808012, + "memory(GiB)": 77.59, + "step": 8045, + "train_speed(iter/s)": 1.61982 + }, + { + "acc": 0.6483737, + "epoch": 0.20421106037544393, + "grad_norm": 5.4375, + "learning_rate": 9.925956766808286e-06, + "loss": 1.68206596, + "memory(GiB)": 77.59, + "step": 8050, + "train_speed(iter/s)": 1.61994 + }, + { + "acc": 0.63125157, + "epoch": 0.204337899543379, + "grad_norm": 4.8125, + "learning_rate": 9.925776863094723e-06, + "loss": 1.7031517, + "memory(GiB)": 77.59, + "step": 8055, + "train_speed(iter/s)": 1.620065 + }, + { + "acc": 0.63740573, + "epoch": 0.20446473871131404, + "grad_norm": 5.375, + "learning_rate": 9.9255967427235e-06, + "loss": 1.71787262, + "memory(GiB)": 77.59, + "step": 8060, + "train_speed(iter/s)": 1.620187 + }, + { + "acc": 0.64998183, + "epoch": 0.20459157787924911, + "grad_norm": 5.46875, + "learning_rate": 9.925416405702544e-06, + "loss": 1.65524864, + "memory(GiB)": 77.59, + "step": 8065, + "train_speed(iter/s)": 1.62031 + }, + { + "acc": 0.64478092, + "epoch": 0.20471841704718416, + "grad_norm": 5.34375, + "learning_rate": 9.925235852039783e-06, + "loss": 1.69534702, + "memory(GiB)": 77.59, + "step": 8070, + "train_speed(iter/s)": 1.620429 + }, + { + "acc": 0.63976612, + "epoch": 0.20484525621511923, + "grad_norm": 8.5, + "learning_rate": 9.92505508174316e-06, + "loss": 1.73726997, + "memory(GiB)": 77.59, + "step": 8075, + "train_speed(iter/s)": 1.620547 + }, + { + "acc": 0.65227184, + "epoch": 0.20497209538305428, + "grad_norm": 8.5, + "learning_rate": 9.924874094820625e-06, + "loss": 1.64289227, + "memory(GiB)": 77.59, + "step": 8080, + "train_speed(iter/s)": 1.620659 + }, + { + "acc": 0.64804726, + "epoch": 0.20509893455098935, + "grad_norm": 7.5625, + "learning_rate": 9.924692891280139e-06, + "loss": 1.68954716, + "memory(GiB)": 77.59, + "step": 8085, + "train_speed(iter/s)": 1.620774 + }, + { + "acc": 0.65150881, + "epoch": 0.2052257737189244, + "grad_norm": 5.5625, + "learning_rate": 9.924511471129673e-06, + "loss": 1.63041382, + "memory(GiB)": 77.59, + "step": 8090, + "train_speed(iter/s)": 1.62089 + }, + { + "acc": 0.64884038, + "epoch": 0.20535261288685946, + "grad_norm": 5.4375, + "learning_rate": 9.924329834377206e-06, + "loss": 1.64233952, + "memory(GiB)": 77.59, + "step": 8095, + "train_speed(iter/s)": 1.621001 + }, + { + "acc": 0.64271021, + "epoch": 0.2054794520547945, + "grad_norm": 6.5, + "learning_rate": 9.924147981030728e-06, + "loss": 1.65378113, + "memory(GiB)": 77.59, + "step": 8100, + "train_speed(iter/s)": 1.621119 + }, + { + "acc": 0.66110854, + "epoch": 0.20560629122272958, + "grad_norm": 5.1875, + "learning_rate": 9.923965911098235e-06, + "loss": 1.61195717, + "memory(GiB)": 77.59, + "step": 8105, + "train_speed(iter/s)": 1.621205 + }, + { + "acc": 0.63061852, + "epoch": 0.20573313039066463, + "grad_norm": 7.15625, + "learning_rate": 9.92378362458774e-06, + "loss": 1.75626564, + "memory(GiB)": 77.59, + "step": 8110, + "train_speed(iter/s)": 1.621326 + }, + { + "acc": 0.65140057, + "epoch": 0.2058599695585997, + "grad_norm": 5.1875, + "learning_rate": 9.923601121507256e-06, + "loss": 1.67508411, + "memory(GiB)": 77.59, + "step": 8115, + "train_speed(iter/s)": 1.621448 + }, + { + "acc": 0.63245001, + "epoch": 0.20598680872653474, + "grad_norm": 6.34375, + "learning_rate": 9.923418401864812e-06, + "loss": 1.69823914, + "memory(GiB)": 77.59, + "step": 8120, + "train_speed(iter/s)": 1.62157 + }, + { + "acc": 0.64150181, + "epoch": 0.20611364789446981, + "grad_norm": 5.875, + "learning_rate": 9.923235465668447e-06, + "loss": 1.63199577, + "memory(GiB)": 77.59, + "step": 8125, + "train_speed(iter/s)": 1.621693 + }, + { + "acc": 0.65600052, + "epoch": 0.20624048706240486, + "grad_norm": 5.8125, + "learning_rate": 9.923052312926204e-06, + "loss": 1.65171051, + "memory(GiB)": 77.59, + "step": 8130, + "train_speed(iter/s)": 1.621816 + }, + { + "acc": 0.64646649, + "epoch": 0.20636732623033993, + "grad_norm": 5.0625, + "learning_rate": 9.922868943646142e-06, + "loss": 1.62688026, + "memory(GiB)": 77.59, + "step": 8135, + "train_speed(iter/s)": 1.621935 + }, + { + "acc": 0.6379848, + "epoch": 0.20649416539827498, + "grad_norm": 7.21875, + "learning_rate": 9.922685357836324e-06, + "loss": 1.67142372, + "memory(GiB)": 77.59, + "step": 8140, + "train_speed(iter/s)": 1.622048 + }, + { + "acc": 0.64262109, + "epoch": 0.20662100456621005, + "grad_norm": 5.71875, + "learning_rate": 9.922501555504827e-06, + "loss": 1.67993088, + "memory(GiB)": 77.59, + "step": 8145, + "train_speed(iter/s)": 1.622164 + }, + { + "acc": 0.64655733, + "epoch": 0.2067478437341451, + "grad_norm": 5.5625, + "learning_rate": 9.922317536659733e-06, + "loss": 1.6251852, + "memory(GiB)": 77.59, + "step": 8150, + "train_speed(iter/s)": 1.622278 + }, + { + "acc": 0.65152316, + "epoch": 0.20687468290208016, + "grad_norm": 5.125, + "learning_rate": 9.922133301309136e-06, + "loss": 1.67091179, + "memory(GiB)": 77.59, + "step": 8155, + "train_speed(iter/s)": 1.622401 + }, + { + "acc": 0.6381959, + "epoch": 0.2070015220700152, + "grad_norm": 6.4375, + "learning_rate": 9.921948849461142e-06, + "loss": 1.67875366, + "memory(GiB)": 77.59, + "step": 8160, + "train_speed(iter/s)": 1.622511 + }, + { + "acc": 0.65721121, + "epoch": 0.20712836123795028, + "grad_norm": 5.40625, + "learning_rate": 9.921764181123864e-06, + "loss": 1.5984417, + "memory(GiB)": 77.59, + "step": 8165, + "train_speed(iter/s)": 1.622626 + }, + { + "acc": 0.66035037, + "epoch": 0.20725520040588533, + "grad_norm": 7.40625, + "learning_rate": 9.921579296305421e-06, + "loss": 1.60754204, + "memory(GiB)": 77.59, + "step": 8170, + "train_speed(iter/s)": 1.622728 + }, + { + "acc": 0.64061174, + "epoch": 0.2073820395738204, + "grad_norm": 6.4375, + "learning_rate": 9.921394195013949e-06, + "loss": 1.62264938, + "memory(GiB)": 77.59, + "step": 8175, + "train_speed(iter/s)": 1.622846 + }, + { + "acc": 0.64834986, + "epoch": 0.20750887874175544, + "grad_norm": 5.21875, + "learning_rate": 9.921208877257586e-06, + "loss": 1.68956242, + "memory(GiB)": 77.59, + "step": 8180, + "train_speed(iter/s)": 1.622959 + }, + { + "acc": 0.63978705, + "epoch": 0.20763571790969051, + "grad_norm": 8.3125, + "learning_rate": 9.921023343044486e-06, + "loss": 1.6844799, + "memory(GiB)": 77.59, + "step": 8185, + "train_speed(iter/s)": 1.623068 + }, + { + "acc": 0.6520762, + "epoch": 0.20776255707762556, + "grad_norm": 7.28125, + "learning_rate": 9.92083759238281e-06, + "loss": 1.58932476, + "memory(GiB)": 77.59, + "step": 8190, + "train_speed(iter/s)": 1.623191 + }, + { + "acc": 0.64670382, + "epoch": 0.20788939624556063, + "grad_norm": 5.4375, + "learning_rate": 9.920651625280725e-06, + "loss": 1.7187542, + "memory(GiB)": 77.59, + "step": 8195, + "train_speed(iter/s)": 1.623307 + }, + { + "acc": 0.64202294, + "epoch": 0.20801623541349568, + "grad_norm": 6.09375, + "learning_rate": 9.920465441746412e-06, + "loss": 1.68442764, + "memory(GiB)": 77.59, + "step": 8200, + "train_speed(iter/s)": 1.62343 + }, + { + "acc": 0.64233842, + "epoch": 0.20814307458143075, + "grad_norm": 5.09375, + "learning_rate": 9.920279041788062e-06, + "loss": 1.69617329, + "memory(GiB)": 77.59, + "step": 8205, + "train_speed(iter/s)": 1.623548 + }, + { + "acc": 0.65611525, + "epoch": 0.2082699137493658, + "grad_norm": 5.59375, + "learning_rate": 9.920092425413871e-06, + "loss": 1.65394211, + "memory(GiB)": 77.59, + "step": 8210, + "train_speed(iter/s)": 1.623658 + }, + { + "acc": 0.65303197, + "epoch": 0.20839675291730086, + "grad_norm": 6.0, + "learning_rate": 9.919905592632048e-06, + "loss": 1.56160412, + "memory(GiB)": 77.59, + "step": 8215, + "train_speed(iter/s)": 1.623766 + }, + { + "acc": 0.63766236, + "epoch": 0.2085235920852359, + "grad_norm": 6.28125, + "learning_rate": 9.919718543450813e-06, + "loss": 1.63620415, + "memory(GiB)": 77.59, + "step": 8220, + "train_speed(iter/s)": 1.623885 + }, + { + "acc": 0.64577684, + "epoch": 0.20865043125317098, + "grad_norm": 7.34375, + "learning_rate": 9.919531277878391e-06, + "loss": 1.67722473, + "memory(GiB)": 77.59, + "step": 8225, + "train_speed(iter/s)": 1.623994 + }, + { + "acc": 0.64646959, + "epoch": 0.20877727042110603, + "grad_norm": 9.4375, + "learning_rate": 9.91934379592302e-06, + "loss": 1.57216797, + "memory(GiB)": 77.59, + "step": 8230, + "train_speed(iter/s)": 1.624109 + }, + { + "acc": 0.65957918, + "epoch": 0.2089041095890411, + "grad_norm": 4.9375, + "learning_rate": 9.919156097592944e-06, + "loss": 1.55535278, + "memory(GiB)": 77.59, + "step": 8235, + "train_speed(iter/s)": 1.624221 + }, + { + "acc": 0.65380173, + "epoch": 0.20903094875697614, + "grad_norm": 6.0625, + "learning_rate": 9.91896818289642e-06, + "loss": 1.60029678, + "memory(GiB)": 77.59, + "step": 8240, + "train_speed(iter/s)": 1.624338 + }, + { + "acc": 0.65884752, + "epoch": 0.20915778792491121, + "grad_norm": 5.34375, + "learning_rate": 9.918780051841716e-06, + "loss": 1.65002918, + "memory(GiB)": 77.59, + "step": 8245, + "train_speed(iter/s)": 1.624457 + }, + { + "acc": 0.64723539, + "epoch": 0.20928462709284626, + "grad_norm": 5.21875, + "learning_rate": 9.918591704437103e-06, + "loss": 1.65435219, + "memory(GiB)": 77.59, + "step": 8250, + "train_speed(iter/s)": 1.624565 + }, + { + "acc": 0.64758787, + "epoch": 0.20941146626078133, + "grad_norm": 5.53125, + "learning_rate": 9.918403140690866e-06, + "loss": 1.6532177, + "memory(GiB)": 77.59, + "step": 8255, + "train_speed(iter/s)": 1.624688 + }, + { + "acc": 0.64280815, + "epoch": 0.20953830542871638, + "grad_norm": 5.125, + "learning_rate": 9.918214360611302e-06, + "loss": 1.68372955, + "memory(GiB)": 77.59, + "step": 8260, + "train_speed(iter/s)": 1.624807 + }, + { + "acc": 0.6420639, + "epoch": 0.20966514459665145, + "grad_norm": 7.09375, + "learning_rate": 9.918025364206712e-06, + "loss": 1.6575304, + "memory(GiB)": 77.59, + "step": 8265, + "train_speed(iter/s)": 1.624924 + }, + { + "acc": 0.64520044, + "epoch": 0.2097919837645865, + "grad_norm": 4.96875, + "learning_rate": 9.917836151485407e-06, + "loss": 1.65246849, + "memory(GiB)": 77.59, + "step": 8270, + "train_speed(iter/s)": 1.62503 + }, + { + "acc": 0.65880404, + "epoch": 0.20991882293252156, + "grad_norm": 5.0, + "learning_rate": 9.917646722455713e-06, + "loss": 1.6775856, + "memory(GiB)": 77.59, + "step": 8275, + "train_speed(iter/s)": 1.62515 + }, + { + "acc": 0.639642, + "epoch": 0.2100456621004566, + "grad_norm": 5.9375, + "learning_rate": 9.91745707712596e-06, + "loss": 1.62629128, + "memory(GiB)": 77.59, + "step": 8280, + "train_speed(iter/s)": 1.625265 + }, + { + "acc": 0.64745445, + "epoch": 0.21017250126839168, + "grad_norm": 6.0625, + "learning_rate": 9.91726721550449e-06, + "loss": 1.66536331, + "memory(GiB)": 77.59, + "step": 8285, + "train_speed(iter/s)": 1.625385 + }, + { + "acc": 0.63763294, + "epoch": 0.21029934043632673, + "grad_norm": 4.625, + "learning_rate": 9.917077137599653e-06, + "loss": 1.62223969, + "memory(GiB)": 77.59, + "step": 8290, + "train_speed(iter/s)": 1.625505 + }, + { + "acc": 0.63691978, + "epoch": 0.2104261796042618, + "grad_norm": 4.90625, + "learning_rate": 9.916886843419811e-06, + "loss": 1.72010422, + "memory(GiB)": 77.59, + "step": 8295, + "train_speed(iter/s)": 1.625615 + }, + { + "acc": 0.66086044, + "epoch": 0.21055301877219684, + "grad_norm": 5.96875, + "learning_rate": 9.916696332973334e-06, + "loss": 1.67197609, + "memory(GiB)": 77.59, + "step": 8300, + "train_speed(iter/s)": 1.62573 + }, + { + "acc": 0.6437602, + "epoch": 0.21067985794013191, + "grad_norm": 5.625, + "learning_rate": 9.9165056062686e-06, + "loss": 1.64001007, + "memory(GiB)": 77.59, + "step": 8305, + "train_speed(iter/s)": 1.625846 + }, + { + "acc": 0.63959589, + "epoch": 0.21080669710806696, + "grad_norm": 7.25, + "learning_rate": 9.916314663314e-06, + "loss": 1.68658485, + "memory(GiB)": 77.59, + "step": 8310, + "train_speed(iter/s)": 1.625961 + }, + { + "acc": 0.65336423, + "epoch": 0.21093353627600203, + "grad_norm": 5.0625, + "learning_rate": 9.91612350411793e-06, + "loss": 1.64935341, + "memory(GiB)": 77.59, + "step": 8315, + "train_speed(iter/s)": 1.626078 + }, + { + "acc": 0.66239243, + "epoch": 0.21106037544393708, + "grad_norm": 6.3125, + "learning_rate": 9.9159321286888e-06, + "loss": 1.60674877, + "memory(GiB)": 77.59, + "step": 8320, + "train_speed(iter/s)": 1.626188 + }, + { + "acc": 0.63801651, + "epoch": 0.21118721461187215, + "grad_norm": 5.28125, + "learning_rate": 9.915740537035026e-06, + "loss": 1.70842514, + "memory(GiB)": 77.59, + "step": 8325, + "train_speed(iter/s)": 1.6263 + }, + { + "acc": 0.64135876, + "epoch": 0.2113140537798072, + "grad_norm": 6.0625, + "learning_rate": 9.915548729165036e-06, + "loss": 1.70464973, + "memory(GiB)": 77.59, + "step": 8330, + "train_speed(iter/s)": 1.62642 + }, + { + "acc": 0.67257271, + "epoch": 0.21144089294774226, + "grad_norm": 5.96875, + "learning_rate": 9.915356705087269e-06, + "loss": 1.5961998, + "memory(GiB)": 77.59, + "step": 8335, + "train_speed(iter/s)": 1.62653 + }, + { + "acc": 0.63645921, + "epoch": 0.2115677321156773, + "grad_norm": 5.625, + "learning_rate": 9.915164464810166e-06, + "loss": 1.7077179, + "memory(GiB)": 77.59, + "step": 8340, + "train_speed(iter/s)": 1.626642 + }, + { + "acc": 0.65207982, + "epoch": 0.21169457128361238, + "grad_norm": 5.84375, + "learning_rate": 9.914972008342186e-06, + "loss": 1.63905373, + "memory(GiB)": 77.59, + "step": 8345, + "train_speed(iter/s)": 1.626755 + }, + { + "acc": 0.63130016, + "epoch": 0.21182141045154743, + "grad_norm": 5.375, + "learning_rate": 9.914779335691793e-06, + "loss": 1.72059326, + "memory(GiB)": 77.59, + "step": 8350, + "train_speed(iter/s)": 1.626863 + }, + { + "acc": 0.63687859, + "epoch": 0.2119482496194825, + "grad_norm": 5.0625, + "learning_rate": 9.914586446867463e-06, + "loss": 1.71129837, + "memory(GiB)": 77.59, + "step": 8355, + "train_speed(iter/s)": 1.62697 + }, + { + "acc": 0.65402355, + "epoch": 0.21207508878741754, + "grad_norm": 5.46875, + "learning_rate": 9.914393341877678e-06, + "loss": 1.67361412, + "memory(GiB)": 77.59, + "step": 8360, + "train_speed(iter/s)": 1.627085 + }, + { + "acc": 0.6403204, + "epoch": 0.21220192795535261, + "grad_norm": 6.28125, + "learning_rate": 9.914200020730932e-06, + "loss": 1.68642654, + "memory(GiB)": 77.59, + "step": 8365, + "train_speed(iter/s)": 1.627195 + }, + { + "acc": 0.65126219, + "epoch": 0.21232876712328766, + "grad_norm": 6.4375, + "learning_rate": 9.914006483435732e-06, + "loss": 1.63516178, + "memory(GiB)": 77.59, + "step": 8370, + "train_speed(iter/s)": 1.627308 + }, + { + "acc": 0.63316717, + "epoch": 0.21245560629122273, + "grad_norm": 5.40625, + "learning_rate": 9.913812730000585e-06, + "loss": 1.7060955, + "memory(GiB)": 77.59, + "step": 8375, + "train_speed(iter/s)": 1.627413 + }, + { + "acc": 0.65344415, + "epoch": 0.21258244545915778, + "grad_norm": 6.5625, + "learning_rate": 9.913618760434015e-06, + "loss": 1.66577759, + "memory(GiB)": 77.59, + "step": 8380, + "train_speed(iter/s)": 1.627532 + }, + { + "acc": 0.65479565, + "epoch": 0.21270928462709285, + "grad_norm": 5.78125, + "learning_rate": 9.913424574744555e-06, + "loss": 1.58334064, + "memory(GiB)": 77.59, + "step": 8385, + "train_speed(iter/s)": 1.627642 + }, + { + "acc": 0.65342941, + "epoch": 0.2128361237950279, + "grad_norm": 6.34375, + "learning_rate": 9.913230172940744e-06, + "loss": 1.64900208, + "memory(GiB)": 77.59, + "step": 8390, + "train_speed(iter/s)": 1.627743 + }, + { + "acc": 0.63078222, + "epoch": 0.21296296296296297, + "grad_norm": 5.3125, + "learning_rate": 9.913035555031136e-06, + "loss": 1.69271832, + "memory(GiB)": 77.59, + "step": 8395, + "train_speed(iter/s)": 1.627863 + }, + { + "acc": 0.6467721, + "epoch": 0.213089802130898, + "grad_norm": 5.59375, + "learning_rate": 9.912840721024288e-06, + "loss": 1.61202888, + "memory(GiB)": 77.59, + "step": 8400, + "train_speed(iter/s)": 1.627975 + }, + { + "acc": 0.66350784, + "epoch": 0.21321664129883308, + "grad_norm": 5.1875, + "learning_rate": 9.91264567092877e-06, + "loss": 1.53365574, + "memory(GiB)": 77.59, + "step": 8405, + "train_speed(iter/s)": 1.62809 + }, + { + "acc": 0.63582897, + "epoch": 0.21334348046676813, + "grad_norm": 6.71875, + "learning_rate": 9.912450404753164e-06, + "loss": 1.73253746, + "memory(GiB)": 77.59, + "step": 8410, + "train_speed(iter/s)": 1.628207 + }, + { + "acc": 0.65780821, + "epoch": 0.2134703196347032, + "grad_norm": 6.34375, + "learning_rate": 9.912254922506057e-06, + "loss": 1.63765602, + "memory(GiB)": 77.59, + "step": 8415, + "train_speed(iter/s)": 1.628321 + }, + { + "acc": 0.64422302, + "epoch": 0.21359715880263824, + "grad_norm": 5.40625, + "learning_rate": 9.912059224196044e-06, + "loss": 1.67829494, + "memory(GiB)": 77.59, + "step": 8420, + "train_speed(iter/s)": 1.62844 + }, + { + "acc": 0.64829693, + "epoch": 0.21372399797057332, + "grad_norm": 4.875, + "learning_rate": 9.911863309831738e-06, + "loss": 1.60129166, + "memory(GiB)": 77.59, + "step": 8425, + "train_speed(iter/s)": 1.628549 + }, + { + "acc": 0.6283473, + "epoch": 0.21385083713850836, + "grad_norm": 5.78125, + "learning_rate": 9.911667179421753e-06, + "loss": 1.76669254, + "memory(GiB)": 77.59, + "step": 8430, + "train_speed(iter/s)": 1.628662 + }, + { + "acc": 0.65893593, + "epoch": 0.21397767630644343, + "grad_norm": 4.875, + "learning_rate": 9.911470832974717e-06, + "loss": 1.60862846, + "memory(GiB)": 77.59, + "step": 8435, + "train_speed(iter/s)": 1.628764 + }, + { + "acc": 0.64142599, + "epoch": 0.21410451547437848, + "grad_norm": 5.21875, + "learning_rate": 9.911274270499265e-06, + "loss": 1.61372776, + "memory(GiB)": 77.59, + "step": 8440, + "train_speed(iter/s)": 1.628875 + }, + { + "acc": 0.65471873, + "epoch": 0.21423135464231355, + "grad_norm": 7.25, + "learning_rate": 9.911077492004044e-06, + "loss": 1.63499527, + "memory(GiB)": 77.59, + "step": 8445, + "train_speed(iter/s)": 1.628982 + }, + { + "acc": 0.64031177, + "epoch": 0.2143581938102486, + "grad_norm": 6.125, + "learning_rate": 9.910880497497707e-06, + "loss": 1.64578476, + "memory(GiB)": 77.59, + "step": 8450, + "train_speed(iter/s)": 1.629098 + }, + { + "acc": 0.64969273, + "epoch": 0.21448503297818367, + "grad_norm": 5.28125, + "learning_rate": 9.910683286988922e-06, + "loss": 1.64888611, + "memory(GiB)": 77.59, + "step": 8455, + "train_speed(iter/s)": 1.629208 + }, + { + "acc": 0.64128742, + "epoch": 0.2146118721461187, + "grad_norm": 5.78125, + "learning_rate": 9.910485860486361e-06, + "loss": 1.66696815, + "memory(GiB)": 77.59, + "step": 8460, + "train_speed(iter/s)": 1.629319 + }, + { + "acc": 0.63944397, + "epoch": 0.21473871131405378, + "grad_norm": 5.40625, + "learning_rate": 9.910288217998707e-06, + "loss": 1.70761452, + "memory(GiB)": 77.59, + "step": 8465, + "train_speed(iter/s)": 1.629434 + }, + { + "acc": 0.6441371, + "epoch": 0.21486555048198883, + "grad_norm": 5.6875, + "learning_rate": 9.910090359534654e-06, + "loss": 1.68477287, + "memory(GiB)": 77.59, + "step": 8470, + "train_speed(iter/s)": 1.629546 + }, + { + "acc": 0.65118241, + "epoch": 0.2149923896499239, + "grad_norm": 6.25, + "learning_rate": 9.909892285102907e-06, + "loss": 1.66041737, + "memory(GiB)": 77.59, + "step": 8475, + "train_speed(iter/s)": 1.62966 + }, + { + "acc": 0.65761256, + "epoch": 0.21511922881785894, + "grad_norm": 6.09375, + "learning_rate": 9.909693994712174e-06, + "loss": 1.61552715, + "memory(GiB)": 77.59, + "step": 8480, + "train_speed(iter/s)": 1.629775 + }, + { + "acc": 0.66708546, + "epoch": 0.21524606798579402, + "grad_norm": 5.5, + "learning_rate": 9.909495488371181e-06, + "loss": 1.56115389, + "memory(GiB)": 77.59, + "step": 8485, + "train_speed(iter/s)": 1.629876 + }, + { + "acc": 0.64924803, + "epoch": 0.21537290715372906, + "grad_norm": 6.0, + "learning_rate": 9.909296766088657e-06, + "loss": 1.64076824, + "memory(GiB)": 77.59, + "step": 8490, + "train_speed(iter/s)": 1.629989 + }, + { + "acc": 0.66240573, + "epoch": 0.21549974632166413, + "grad_norm": 6.21875, + "learning_rate": 9.909097827873341e-06, + "loss": 1.62462959, + "memory(GiB)": 77.59, + "step": 8495, + "train_speed(iter/s)": 1.6301 + }, + { + "acc": 0.65771203, + "epoch": 0.21562658548959918, + "grad_norm": 6.375, + "learning_rate": 9.908898673733986e-06, + "loss": 1.68465729, + "memory(GiB)": 77.59, + "step": 8500, + "train_speed(iter/s)": 1.630212 + }, + { + "acc": 0.65662766, + "epoch": 0.21575342465753425, + "grad_norm": 5.59375, + "learning_rate": 9.90869930367935e-06, + "loss": 1.62575798, + "memory(GiB)": 77.59, + "step": 8505, + "train_speed(iter/s)": 1.63032 + }, + { + "acc": 0.63108907, + "epoch": 0.2158802638254693, + "grad_norm": 6.46875, + "learning_rate": 9.908499717718203e-06, + "loss": 1.73423691, + "memory(GiB)": 77.59, + "step": 8510, + "train_speed(iter/s)": 1.630434 + }, + { + "acc": 0.65553608, + "epoch": 0.21600710299340437, + "grad_norm": 5.84375, + "learning_rate": 9.908299915859325e-06, + "loss": 1.66251068, + "memory(GiB)": 77.59, + "step": 8515, + "train_speed(iter/s)": 1.63054 + }, + { + "acc": 0.65806885, + "epoch": 0.2161339421613394, + "grad_norm": 5.25, + "learning_rate": 9.908099898111502e-06, + "loss": 1.61349487, + "memory(GiB)": 77.59, + "step": 8520, + "train_speed(iter/s)": 1.630652 + }, + { + "acc": 0.64546289, + "epoch": 0.21626078132927448, + "grad_norm": 5.90625, + "learning_rate": 9.907899664483533e-06, + "loss": 1.61516914, + "memory(GiB)": 77.59, + "step": 8525, + "train_speed(iter/s)": 1.63076 + }, + { + "acc": 0.62854815, + "epoch": 0.21638762049720953, + "grad_norm": 5.09375, + "learning_rate": 9.907699214984223e-06, + "loss": 1.68960381, + "memory(GiB)": 77.59, + "step": 8530, + "train_speed(iter/s)": 1.63087 + }, + { + "acc": 0.63813071, + "epoch": 0.2165144596651446, + "grad_norm": 5.5, + "learning_rate": 9.90749854962239e-06, + "loss": 1.66673012, + "memory(GiB)": 77.59, + "step": 8535, + "train_speed(iter/s)": 1.630976 + }, + { + "acc": 0.65178452, + "epoch": 0.21664129883307964, + "grad_norm": 6.53125, + "learning_rate": 9.907297668406863e-06, + "loss": 1.60834808, + "memory(GiB)": 77.59, + "step": 8540, + "train_speed(iter/s)": 1.631087 + }, + { + "acc": 0.63977785, + "epoch": 0.21676813800101472, + "grad_norm": 4.96875, + "learning_rate": 9.907096571346474e-06, + "loss": 1.66329422, + "memory(GiB)": 77.59, + "step": 8545, + "train_speed(iter/s)": 1.631205 + }, + { + "acc": 0.64295607, + "epoch": 0.21689497716894976, + "grad_norm": 4.75, + "learning_rate": 9.906895258450067e-06, + "loss": 1.6192564, + "memory(GiB)": 77.59, + "step": 8550, + "train_speed(iter/s)": 1.631317 + }, + { + "acc": 0.65605288, + "epoch": 0.21702181633688483, + "grad_norm": 4.875, + "learning_rate": 9.9066937297265e-06, + "loss": 1.60834084, + "memory(GiB)": 77.59, + "step": 8555, + "train_speed(iter/s)": 1.631423 + }, + { + "acc": 0.6422637, + "epoch": 0.21714865550481988, + "grad_norm": 5.40625, + "learning_rate": 9.906491985184637e-06, + "loss": 1.59745216, + "memory(GiB)": 77.59, + "step": 8560, + "train_speed(iter/s)": 1.631526 + }, + { + "acc": 0.62723713, + "epoch": 0.21727549467275495, + "grad_norm": 6.0, + "learning_rate": 9.906290024833349e-06, + "loss": 1.69743805, + "memory(GiB)": 77.59, + "step": 8565, + "train_speed(iter/s)": 1.631633 + }, + { + "acc": 0.64682379, + "epoch": 0.21740233384069, + "grad_norm": 6.5625, + "learning_rate": 9.906087848681523e-06, + "loss": 1.64991112, + "memory(GiB)": 77.59, + "step": 8570, + "train_speed(iter/s)": 1.631743 + }, + { + "acc": 0.64697676, + "epoch": 0.21752917300862507, + "grad_norm": 6.03125, + "learning_rate": 9.905885456738046e-06, + "loss": 1.66882057, + "memory(GiB)": 77.59, + "step": 8575, + "train_speed(iter/s)": 1.631859 + }, + { + "acc": 0.65591955, + "epoch": 0.2176560121765601, + "grad_norm": 5.90625, + "learning_rate": 9.905682849011826e-06, + "loss": 1.65404167, + "memory(GiB)": 77.59, + "step": 8580, + "train_speed(iter/s)": 1.631971 + }, + { + "acc": 0.64565945, + "epoch": 0.21778285134449518, + "grad_norm": 6.15625, + "learning_rate": 9.905480025511772e-06, + "loss": 1.70297546, + "memory(GiB)": 77.59, + "step": 8585, + "train_speed(iter/s)": 1.632079 + }, + { + "acc": 0.63967342, + "epoch": 0.21790969051243023, + "grad_norm": 5.40625, + "learning_rate": 9.905276986246804e-06, + "loss": 1.67091541, + "memory(GiB)": 77.59, + "step": 8590, + "train_speed(iter/s)": 1.632187 + }, + { + "acc": 0.64475427, + "epoch": 0.2180365296803653, + "grad_norm": 5.1875, + "learning_rate": 9.905073731225854e-06, + "loss": 1.5832325, + "memory(GiB)": 77.59, + "step": 8595, + "train_speed(iter/s)": 1.632305 + }, + { + "acc": 0.62980385, + "epoch": 0.21816336884830034, + "grad_norm": 5.9375, + "learning_rate": 9.904870260457861e-06, + "loss": 1.66110249, + "memory(GiB)": 77.59, + "step": 8600, + "train_speed(iter/s)": 1.632419 + }, + { + "acc": 0.6292099, + "epoch": 0.21829020801623542, + "grad_norm": 6.5, + "learning_rate": 9.904666573951777e-06, + "loss": 1.6999754, + "memory(GiB)": 77.59, + "step": 8605, + "train_speed(iter/s)": 1.632536 + }, + { + "acc": 0.63838439, + "epoch": 0.21841704718417046, + "grad_norm": 5.8125, + "learning_rate": 9.904462671716559e-06, + "loss": 1.70130424, + "memory(GiB)": 77.59, + "step": 8610, + "train_speed(iter/s)": 1.632652 + }, + { + "acc": 0.62956901, + "epoch": 0.21854388635210553, + "grad_norm": 5.21875, + "learning_rate": 9.904258553761175e-06, + "loss": 1.7441967, + "memory(GiB)": 77.59, + "step": 8615, + "train_speed(iter/s)": 1.63277 + }, + { + "acc": 0.64285345, + "epoch": 0.21867072552004058, + "grad_norm": 4.875, + "learning_rate": 9.904054220094603e-06, + "loss": 1.69572258, + "memory(GiB)": 77.59, + "step": 8620, + "train_speed(iter/s)": 1.632882 + }, + { + "acc": 0.63723063, + "epoch": 0.21879756468797565, + "grad_norm": 5.46875, + "learning_rate": 9.903849670725833e-06, + "loss": 1.68363056, + "memory(GiB)": 77.59, + "step": 8625, + "train_speed(iter/s)": 1.632995 + }, + { + "acc": 0.6453342, + "epoch": 0.2189244038559107, + "grad_norm": 5.75, + "learning_rate": 9.903644905663861e-06, + "loss": 1.63896809, + "memory(GiB)": 77.59, + "step": 8630, + "train_speed(iter/s)": 1.633113 + }, + { + "acc": 0.64380865, + "epoch": 0.21905124302384577, + "grad_norm": 7.1875, + "learning_rate": 9.90343992491769e-06, + "loss": 1.64266586, + "memory(GiB)": 77.59, + "step": 8635, + "train_speed(iter/s)": 1.63322 + }, + { + "acc": 0.6507225, + "epoch": 0.2191780821917808, + "grad_norm": 5.25, + "learning_rate": 9.903234728496341e-06, + "loss": 1.64144497, + "memory(GiB)": 77.59, + "step": 8640, + "train_speed(iter/s)": 1.633332 + }, + { + "acc": 0.63628936, + "epoch": 0.21930492135971588, + "grad_norm": 6.125, + "learning_rate": 9.903029316408838e-06, + "loss": 1.72334366, + "memory(GiB)": 77.59, + "step": 8645, + "train_speed(iter/s)": 1.633444 + }, + { + "acc": 0.65579467, + "epoch": 0.21943176052765093, + "grad_norm": 5.28125, + "learning_rate": 9.902823688664214e-06, + "loss": 1.62671928, + "memory(GiB)": 77.59, + "step": 8650, + "train_speed(iter/s)": 1.633561 + }, + { + "acc": 0.64384308, + "epoch": 0.219558599695586, + "grad_norm": 6.1875, + "learning_rate": 9.902617845271514e-06, + "loss": 1.63062286, + "memory(GiB)": 77.59, + "step": 8655, + "train_speed(iter/s)": 1.633677 + }, + { + "acc": 0.66443839, + "epoch": 0.21968543886352104, + "grad_norm": 5.03125, + "learning_rate": 9.902411786239794e-06, + "loss": 1.58239136, + "memory(GiB)": 77.59, + "step": 8660, + "train_speed(iter/s)": 1.633788 + }, + { + "acc": 0.63960915, + "epoch": 0.21981227803145612, + "grad_norm": 4.84375, + "learning_rate": 9.902205511578114e-06, + "loss": 1.66505508, + "memory(GiB)": 77.59, + "step": 8665, + "train_speed(iter/s)": 1.633899 + }, + { + "acc": 0.64205399, + "epoch": 0.21993911719939116, + "grad_norm": 6.75, + "learning_rate": 9.90199902129555e-06, + "loss": 1.6725647, + "memory(GiB)": 77.59, + "step": 8670, + "train_speed(iter/s)": 1.634007 + }, + { + "acc": 0.63029222, + "epoch": 0.22006595636732623, + "grad_norm": 5.71875, + "learning_rate": 9.901792315401184e-06, + "loss": 1.7220562, + "memory(GiB)": 77.59, + "step": 8675, + "train_speed(iter/s)": 1.634111 + }, + { + "acc": 0.65128794, + "epoch": 0.22019279553526128, + "grad_norm": 5.21875, + "learning_rate": 9.901585393904104e-06, + "loss": 1.57617397, + "memory(GiB)": 77.59, + "step": 8680, + "train_speed(iter/s)": 1.634221 + }, + { + "acc": 0.64128423, + "epoch": 0.22031963470319635, + "grad_norm": 5.09375, + "learning_rate": 9.901378256813418e-06, + "loss": 1.69571953, + "memory(GiB)": 77.59, + "step": 8685, + "train_speed(iter/s)": 1.634332 + }, + { + "acc": 0.65720778, + "epoch": 0.2204464738711314, + "grad_norm": 5.84375, + "learning_rate": 9.901170904138232e-06, + "loss": 1.60900707, + "memory(GiB)": 77.59, + "step": 8690, + "train_speed(iter/s)": 1.634438 + }, + { + "acc": 0.62904091, + "epoch": 0.22057331303906647, + "grad_norm": 5.3125, + "learning_rate": 9.900963335887667e-06, + "loss": 1.74302502, + "memory(GiB)": 77.59, + "step": 8695, + "train_speed(iter/s)": 1.634543 + }, + { + "acc": 0.64808598, + "epoch": 0.2207001522070015, + "grad_norm": 6.375, + "learning_rate": 9.900755552070852e-06, + "loss": 1.6748003, + "memory(GiB)": 77.59, + "step": 8700, + "train_speed(iter/s)": 1.634646 + }, + { + "acc": 0.65508118, + "epoch": 0.22082699137493658, + "grad_norm": 5.5, + "learning_rate": 9.900547552696931e-06, + "loss": 1.64467583, + "memory(GiB)": 77.59, + "step": 8705, + "train_speed(iter/s)": 1.63475 + }, + { + "acc": 0.65405536, + "epoch": 0.22095383054287163, + "grad_norm": 5.125, + "learning_rate": 9.900339337775046e-06, + "loss": 1.69861526, + "memory(GiB)": 77.59, + "step": 8710, + "train_speed(iter/s)": 1.63486 + }, + { + "acc": 0.64679093, + "epoch": 0.2210806697108067, + "grad_norm": 6.3125, + "learning_rate": 9.90013090731436e-06, + "loss": 1.67696075, + "memory(GiB)": 77.59, + "step": 8715, + "train_speed(iter/s)": 1.634972 + }, + { + "acc": 0.63485146, + "epoch": 0.22120750887874174, + "grad_norm": 6.5625, + "learning_rate": 9.89992226132404e-06, + "loss": 1.7033226, + "memory(GiB)": 77.59, + "step": 8720, + "train_speed(iter/s)": 1.635081 + }, + { + "acc": 0.64462056, + "epoch": 0.22133434804667682, + "grad_norm": 4.65625, + "learning_rate": 9.899713399813261e-06, + "loss": 1.64769821, + "memory(GiB)": 77.59, + "step": 8725, + "train_speed(iter/s)": 1.635187 + }, + { + "acc": 0.65201559, + "epoch": 0.22146118721461186, + "grad_norm": 5.6875, + "learning_rate": 9.899504322791212e-06, + "loss": 1.63786049, + "memory(GiB)": 77.59, + "step": 8730, + "train_speed(iter/s)": 1.635296 + }, + { + "acc": 0.64604015, + "epoch": 0.22158802638254693, + "grad_norm": 5.34375, + "learning_rate": 9.899295030267086e-06, + "loss": 1.70348663, + "memory(GiB)": 77.59, + "step": 8735, + "train_speed(iter/s)": 1.635403 + }, + { + "acc": 0.63902173, + "epoch": 0.22171486555048198, + "grad_norm": 5.6875, + "learning_rate": 9.899085522250094e-06, + "loss": 1.63704987, + "memory(GiB)": 77.59, + "step": 8740, + "train_speed(iter/s)": 1.635511 + }, + { + "acc": 0.65556135, + "epoch": 0.22184170471841705, + "grad_norm": 5.375, + "learning_rate": 9.898875798749446e-06, + "loss": 1.650494, + "memory(GiB)": 77.59, + "step": 8745, + "train_speed(iter/s)": 1.63562 + }, + { + "acc": 0.6451849, + "epoch": 0.2219685438863521, + "grad_norm": 5.65625, + "learning_rate": 9.898665859774367e-06, + "loss": 1.66049042, + "memory(GiB)": 77.59, + "step": 8750, + "train_speed(iter/s)": 1.635732 + }, + { + "acc": 0.64999762, + "epoch": 0.22209538305428717, + "grad_norm": 7.8125, + "learning_rate": 9.898455705334095e-06, + "loss": 1.6913868, + "memory(GiB)": 77.59, + "step": 8755, + "train_speed(iter/s)": 1.63584 + }, + { + "acc": 0.64375086, + "epoch": 0.2222222222222222, + "grad_norm": 5.15625, + "learning_rate": 9.89824533543787e-06, + "loss": 1.66471634, + "memory(GiB)": 77.59, + "step": 8760, + "train_speed(iter/s)": 1.635948 + }, + { + "acc": 0.63135786, + "epoch": 0.22234906139015728, + "grad_norm": 5.5, + "learning_rate": 9.898034750094946e-06, + "loss": 1.70278244, + "memory(GiB)": 77.59, + "step": 8765, + "train_speed(iter/s)": 1.636057 + }, + { + "acc": 0.64137568, + "epoch": 0.22247590055809233, + "grad_norm": 6.15625, + "learning_rate": 9.897823949314586e-06, + "loss": 1.71330738, + "memory(GiB)": 77.59, + "step": 8770, + "train_speed(iter/s)": 1.636167 + }, + { + "acc": 0.64283295, + "epoch": 0.2226027397260274, + "grad_norm": 6.09375, + "learning_rate": 9.897612933106061e-06, + "loss": 1.71886826, + "memory(GiB)": 77.59, + "step": 8775, + "train_speed(iter/s)": 1.636282 + }, + { + "acc": 0.64763718, + "epoch": 0.22272957889396244, + "grad_norm": 5.46875, + "learning_rate": 9.897401701478654e-06, + "loss": 1.62709312, + "memory(GiB)": 77.59, + "step": 8780, + "train_speed(iter/s)": 1.636388 + }, + { + "acc": 0.64721255, + "epoch": 0.22285641806189752, + "grad_norm": 6.375, + "learning_rate": 9.897190254441653e-06, + "loss": 1.67772217, + "memory(GiB)": 77.59, + "step": 8785, + "train_speed(iter/s)": 1.636495 + }, + { + "acc": 0.63037624, + "epoch": 0.22298325722983256, + "grad_norm": 5.375, + "learning_rate": 9.896978592004363e-06, + "loss": 1.70336437, + "memory(GiB)": 77.59, + "step": 8790, + "train_speed(iter/s)": 1.636599 + }, + { + "acc": 0.63616052, + "epoch": 0.22311009639776763, + "grad_norm": 7.0, + "learning_rate": 9.896766714176089e-06, + "loss": 1.7008009, + "memory(GiB)": 77.59, + "step": 8795, + "train_speed(iter/s)": 1.636709 + }, + { + "acc": 0.64856014, + "epoch": 0.22323693556570268, + "grad_norm": 6.0625, + "learning_rate": 9.896554620966152e-06, + "loss": 1.67676983, + "memory(GiB)": 77.59, + "step": 8800, + "train_speed(iter/s)": 1.636819 + }, + { + "acc": 0.65209675, + "epoch": 0.22336377473363775, + "grad_norm": 6.5, + "learning_rate": 9.896342312383883e-06, + "loss": 1.64270687, + "memory(GiB)": 77.59, + "step": 8805, + "train_speed(iter/s)": 1.636929 + }, + { + "acc": 0.65134401, + "epoch": 0.2234906139015728, + "grad_norm": 6.65625, + "learning_rate": 9.896129788438617e-06, + "loss": 1.65449104, + "memory(GiB)": 77.59, + "step": 8810, + "train_speed(iter/s)": 1.637029 + }, + { + "acc": 0.65437455, + "epoch": 0.22361745306950787, + "grad_norm": 4.96875, + "learning_rate": 9.895917049139704e-06, + "loss": 1.62094345, + "memory(GiB)": 77.59, + "step": 8815, + "train_speed(iter/s)": 1.637136 + }, + { + "acc": 0.6425766, + "epoch": 0.2237442922374429, + "grad_norm": 7.0, + "learning_rate": 9.895704094496502e-06, + "loss": 1.70247765, + "memory(GiB)": 77.59, + "step": 8820, + "train_speed(iter/s)": 1.637252 + }, + { + "acc": 0.666113, + "epoch": 0.22387113140537798, + "grad_norm": 4.84375, + "learning_rate": 9.895490924518372e-06, + "loss": 1.58082705, + "memory(GiB)": 77.59, + "step": 8825, + "train_speed(iter/s)": 1.637358 + }, + { + "acc": 0.65249109, + "epoch": 0.22399797057331303, + "grad_norm": 5.03125, + "learning_rate": 9.895277539214698e-06, + "loss": 1.57726479, + "memory(GiB)": 77.59, + "step": 8830, + "train_speed(iter/s)": 1.637458 + }, + { + "acc": 0.64011312, + "epoch": 0.2241248097412481, + "grad_norm": 5.46875, + "learning_rate": 9.895063938594859e-06, + "loss": 1.71025105, + "memory(GiB)": 77.59, + "step": 8835, + "train_speed(iter/s)": 1.637557 + }, + { + "acc": 0.6358242, + "epoch": 0.22425164890918314, + "grad_norm": 4.6875, + "learning_rate": 9.894850122668256e-06, + "loss": 1.69120293, + "memory(GiB)": 77.59, + "step": 8840, + "train_speed(iter/s)": 1.63766 + }, + { + "acc": 0.65193229, + "epoch": 0.22437848807711822, + "grad_norm": 4.78125, + "learning_rate": 9.89463609144429e-06, + "loss": 1.61133499, + "memory(GiB)": 77.59, + "step": 8845, + "train_speed(iter/s)": 1.637764 + }, + { + "acc": 0.65598588, + "epoch": 0.22450532724505326, + "grad_norm": 6.3125, + "learning_rate": 9.894421844932375e-06, + "loss": 1.65222321, + "memory(GiB)": 77.59, + "step": 8850, + "train_speed(iter/s)": 1.637875 + }, + { + "acc": 0.67114768, + "epoch": 0.22463216641298833, + "grad_norm": 6.40625, + "learning_rate": 9.894207383141937e-06, + "loss": 1.58363075, + "memory(GiB)": 77.59, + "step": 8855, + "train_speed(iter/s)": 1.637981 + }, + { + "acc": 0.64559302, + "epoch": 0.22475900558092338, + "grad_norm": 6.46875, + "learning_rate": 9.893992706082405e-06, + "loss": 1.60230484, + "memory(GiB)": 77.59, + "step": 8860, + "train_speed(iter/s)": 1.638089 + }, + { + "acc": 0.64186382, + "epoch": 0.22488584474885845, + "grad_norm": 5.125, + "learning_rate": 9.893777813763223e-06, + "loss": 1.65982876, + "memory(GiB)": 77.59, + "step": 8865, + "train_speed(iter/s)": 1.638192 + }, + { + "acc": 0.64359713, + "epoch": 0.2250126839167935, + "grad_norm": 7.6875, + "learning_rate": 9.893562706193847e-06, + "loss": 1.74115181, + "memory(GiB)": 77.59, + "step": 8870, + "train_speed(iter/s)": 1.638307 + }, + { + "acc": 0.6513104, + "epoch": 0.22513952308472857, + "grad_norm": 4.96875, + "learning_rate": 9.893347383383732e-06, + "loss": 1.6545517, + "memory(GiB)": 77.59, + "step": 8875, + "train_speed(iter/s)": 1.638421 + }, + { + "acc": 0.66517544, + "epoch": 0.2252663622526636, + "grad_norm": 5.71875, + "learning_rate": 9.893131845342352e-06, + "loss": 1.6524456, + "memory(GiB)": 77.59, + "step": 8880, + "train_speed(iter/s)": 1.638526 + }, + { + "acc": 0.65319042, + "epoch": 0.22539320142059868, + "grad_norm": 5.28125, + "learning_rate": 9.892916092079188e-06, + "loss": 1.67773113, + "memory(GiB)": 77.59, + "step": 8885, + "train_speed(iter/s)": 1.638626 + }, + { + "acc": 0.64998131, + "epoch": 0.22552004058853373, + "grad_norm": 5.15625, + "learning_rate": 9.89270012360373e-06, + "loss": 1.67427521, + "memory(GiB)": 77.59, + "step": 8890, + "train_speed(iter/s)": 1.638731 + }, + { + "acc": 0.65385656, + "epoch": 0.2256468797564688, + "grad_norm": 5.28125, + "learning_rate": 9.892483939925476e-06, + "loss": 1.63581619, + "memory(GiB)": 77.59, + "step": 8895, + "train_speed(iter/s)": 1.638832 + }, + { + "acc": 0.63542638, + "epoch": 0.22577371892440384, + "grad_norm": 6.40625, + "learning_rate": 9.892267541053933e-06, + "loss": 1.67770729, + "memory(GiB)": 77.59, + "step": 8900, + "train_speed(iter/s)": 1.638932 + }, + { + "acc": 0.6464417, + "epoch": 0.22590055809233892, + "grad_norm": 6.0, + "learning_rate": 9.892050926998624e-06, + "loss": 1.6212553, + "memory(GiB)": 77.59, + "step": 8905, + "train_speed(iter/s)": 1.639027 + }, + { + "acc": 0.64251628, + "epoch": 0.22602739726027396, + "grad_norm": 5.1875, + "learning_rate": 9.891834097769071e-06, + "loss": 1.71904182, + "memory(GiB)": 77.59, + "step": 8910, + "train_speed(iter/s)": 1.639131 + }, + { + "acc": 0.63100786, + "epoch": 0.22615423642820903, + "grad_norm": 4.96875, + "learning_rate": 9.891617053374816e-06, + "loss": 1.74044991, + "memory(GiB)": 77.59, + "step": 8915, + "train_speed(iter/s)": 1.639227 + }, + { + "acc": 0.64144549, + "epoch": 0.22628107559614408, + "grad_norm": 6.1875, + "learning_rate": 9.891399793825403e-06, + "loss": 1.69350529, + "memory(GiB)": 77.59, + "step": 8920, + "train_speed(iter/s)": 1.639323 + }, + { + "acc": 0.64767647, + "epoch": 0.22640791476407915, + "grad_norm": 5.25, + "learning_rate": 9.891182319130387e-06, + "loss": 1.72190552, + "memory(GiB)": 77.59, + "step": 8925, + "train_speed(iter/s)": 1.639426 + }, + { + "acc": 0.64755979, + "epoch": 0.2265347539320142, + "grad_norm": 5.90625, + "learning_rate": 9.890964629299336e-06, + "loss": 1.70096512, + "memory(GiB)": 77.59, + "step": 8930, + "train_speed(iter/s)": 1.639526 + }, + { + "acc": 0.64155512, + "epoch": 0.22666159309994927, + "grad_norm": 5.15625, + "learning_rate": 9.890746724341825e-06, + "loss": 1.73209248, + "memory(GiB)": 77.59, + "step": 8935, + "train_speed(iter/s)": 1.639627 + }, + { + "acc": 0.64422998, + "epoch": 0.2267884322678843, + "grad_norm": 5.59375, + "learning_rate": 9.890528604267436e-06, + "loss": 1.58922834, + "memory(GiB)": 77.59, + "step": 8940, + "train_speed(iter/s)": 1.639722 + }, + { + "acc": 0.65478997, + "epoch": 0.22691527143581938, + "grad_norm": 6.71875, + "learning_rate": 9.890310269085765e-06, + "loss": 1.63509693, + "memory(GiB)": 77.59, + "step": 8945, + "train_speed(iter/s)": 1.639821 + }, + { + "acc": 0.63246875, + "epoch": 0.22704211060375443, + "grad_norm": 5.40625, + "learning_rate": 9.890091718806414e-06, + "loss": 1.70619316, + "memory(GiB)": 77.59, + "step": 8950, + "train_speed(iter/s)": 1.639924 + }, + { + "acc": 0.65057049, + "epoch": 0.2271689497716895, + "grad_norm": 6.09375, + "learning_rate": 9.889872953438996e-06, + "loss": 1.65867653, + "memory(GiB)": 77.59, + "step": 8955, + "train_speed(iter/s)": 1.640024 + }, + { + "acc": 0.64201136, + "epoch": 0.22729578893962454, + "grad_norm": 8.25, + "learning_rate": 9.889653972993136e-06, + "loss": 1.68201637, + "memory(GiB)": 77.59, + "step": 8960, + "train_speed(iter/s)": 1.640134 + }, + { + "acc": 0.64295058, + "epoch": 0.22742262810755962, + "grad_norm": 5.21875, + "learning_rate": 9.889434777478464e-06, + "loss": 1.67074699, + "memory(GiB)": 77.59, + "step": 8965, + "train_speed(iter/s)": 1.640239 + }, + { + "acc": 0.65191102, + "epoch": 0.22754946727549466, + "grad_norm": 5.3125, + "learning_rate": 9.88921536690462e-06, + "loss": 1.68216286, + "memory(GiB)": 77.59, + "step": 8970, + "train_speed(iter/s)": 1.640342 + }, + { + "acc": 0.63716831, + "epoch": 0.22767630644342973, + "grad_norm": 5.21875, + "learning_rate": 9.888995741281252e-06, + "loss": 1.73848877, + "memory(GiB)": 77.59, + "step": 8975, + "train_speed(iter/s)": 1.640437 + }, + { + "acc": 0.64836941, + "epoch": 0.22780314561136478, + "grad_norm": 5.8125, + "learning_rate": 9.888775900618028e-06, + "loss": 1.605476, + "memory(GiB)": 77.59, + "step": 8980, + "train_speed(iter/s)": 1.640539 + }, + { + "acc": 0.644102, + "epoch": 0.22792998477929985, + "grad_norm": 6.0, + "learning_rate": 9.88855584492461e-06, + "loss": 1.67533607, + "memory(GiB)": 77.59, + "step": 8985, + "train_speed(iter/s)": 1.640647 + }, + { + "acc": 0.64010086, + "epoch": 0.2280568239472349, + "grad_norm": 6.71875, + "learning_rate": 9.888335574210681e-06, + "loss": 1.68984547, + "memory(GiB)": 77.59, + "step": 8990, + "train_speed(iter/s)": 1.640745 + }, + { + "acc": 0.64795132, + "epoch": 0.22818366311516997, + "grad_norm": 6.34375, + "learning_rate": 9.888115088485931e-06, + "loss": 1.55505962, + "memory(GiB)": 77.59, + "step": 8995, + "train_speed(iter/s)": 1.640843 + }, + { + "acc": 0.6501296, + "epoch": 0.228310502283105, + "grad_norm": 5.40625, + "learning_rate": 9.887894387760053e-06, + "loss": 1.64787827, + "memory(GiB)": 77.59, + "step": 9000, + "train_speed(iter/s)": 1.640949 + }, + { + "epoch": 0.228310502283105, + "eval_acc": 0.6373861090054025, + "eval_loss": 1.6237483024597168, + "eval_runtime": 58.6039, + "eval_samples_per_second": 108.696, + "eval_steps_per_second": 27.182, + "step": 9000 + }, + { + "acc": 0.64845643, + "epoch": 0.22843734145104008, + "grad_norm": 7.375, + "learning_rate": 9.887673472042757e-06, + "loss": 1.68617744, + "memory(GiB)": 77.59, + "step": 9005, + "train_speed(iter/s)": 1.622451 + }, + { + "acc": 0.65458899, + "epoch": 0.22856418061897513, + "grad_norm": 5.71875, + "learning_rate": 9.88745234134376e-06, + "loss": 1.66633224, + "memory(GiB)": 77.59, + "step": 9010, + "train_speed(iter/s)": 1.622546 + }, + { + "acc": 0.63907366, + "epoch": 0.2286910197869102, + "grad_norm": 5.21875, + "learning_rate": 9.887230995672789e-06, + "loss": 1.66835785, + "memory(GiB)": 77.59, + "step": 9015, + "train_speed(iter/s)": 1.622638 + }, + { + "acc": 0.64539042, + "epoch": 0.22881785895484524, + "grad_norm": 6.09375, + "learning_rate": 9.887009435039578e-06, + "loss": 1.59948788, + "memory(GiB)": 77.59, + "step": 9020, + "train_speed(iter/s)": 1.622714 + }, + { + "acc": 0.63725467, + "epoch": 0.22894469812278032, + "grad_norm": 5.96875, + "learning_rate": 9.886787659453873e-06, + "loss": 1.65513573, + "memory(GiB)": 77.59, + "step": 9025, + "train_speed(iter/s)": 1.622807 + }, + { + "acc": 0.64495363, + "epoch": 0.22907153729071536, + "grad_norm": 5.46875, + "learning_rate": 9.886565668925429e-06, + "loss": 1.64530029, + "memory(GiB)": 77.59, + "step": 9030, + "train_speed(iter/s)": 1.622904 + }, + { + "acc": 0.64777966, + "epoch": 0.22919837645865043, + "grad_norm": 10.0625, + "learning_rate": 9.88634346346401e-06, + "loss": 1.64854698, + "memory(GiB)": 77.59, + "step": 9035, + "train_speed(iter/s)": 1.623001 + }, + { + "acc": 0.64281402, + "epoch": 0.22932521562658548, + "grad_norm": 5.21875, + "learning_rate": 9.88612104307939e-06, + "loss": 1.61526184, + "memory(GiB)": 77.59, + "step": 9040, + "train_speed(iter/s)": 1.623108 + }, + { + "acc": 0.66017203, + "epoch": 0.22945205479452055, + "grad_norm": 5.21875, + "learning_rate": 9.885898407781352e-06, + "loss": 1.6112999, + "memory(GiB)": 77.59, + "step": 9045, + "train_speed(iter/s)": 1.623206 + }, + { + "acc": 0.65367327, + "epoch": 0.2295788939624556, + "grad_norm": 5.75, + "learning_rate": 9.885675557579686e-06, + "loss": 1.72248211, + "memory(GiB)": 77.59, + "step": 9050, + "train_speed(iter/s)": 1.62331 + }, + { + "acc": 0.65201654, + "epoch": 0.22970573313039067, + "grad_norm": 6.25, + "learning_rate": 9.885452492484198e-06, + "loss": 1.57434149, + "memory(GiB)": 77.59, + "step": 9055, + "train_speed(iter/s)": 1.623417 + }, + { + "acc": 0.64312749, + "epoch": 0.2298325722983257, + "grad_norm": 5.8125, + "learning_rate": 9.885229212504697e-06, + "loss": 1.66974411, + "memory(GiB)": 77.59, + "step": 9060, + "train_speed(iter/s)": 1.623519 + }, + { + "acc": 0.64915266, + "epoch": 0.22995941146626078, + "grad_norm": 4.75, + "learning_rate": 9.885005717651002e-06, + "loss": 1.62329941, + "memory(GiB)": 77.59, + "step": 9065, + "train_speed(iter/s)": 1.623617 + }, + { + "acc": 0.65288315, + "epoch": 0.23008625063419583, + "grad_norm": 6.125, + "learning_rate": 9.88478200793295e-06, + "loss": 1.63091946, + "memory(GiB)": 77.59, + "step": 9070, + "train_speed(iter/s)": 1.623699 + }, + { + "acc": 0.64600859, + "epoch": 0.2302130898021309, + "grad_norm": 5.59375, + "learning_rate": 9.884558083360372e-06, + "loss": 1.62570229, + "memory(GiB)": 77.59, + "step": 9075, + "train_speed(iter/s)": 1.6238 + }, + { + "acc": 0.6642405, + "epoch": 0.23033992897006594, + "grad_norm": 5.28125, + "learning_rate": 9.884333943943123e-06, + "loss": 1.55125132, + "memory(GiB)": 77.59, + "step": 9080, + "train_speed(iter/s)": 1.623893 + }, + { + "acc": 0.64701395, + "epoch": 0.23046676813800102, + "grad_norm": 5.34375, + "learning_rate": 9.884109589691062e-06, + "loss": 1.66710243, + "memory(GiB)": 77.59, + "step": 9085, + "train_speed(iter/s)": 1.623996 + }, + { + "acc": 0.64406352, + "epoch": 0.23059360730593606, + "grad_norm": 5.3125, + "learning_rate": 9.883885020614052e-06, + "loss": 1.68455009, + "memory(GiB)": 77.59, + "step": 9090, + "train_speed(iter/s)": 1.624103 + }, + { + "acc": 0.66169415, + "epoch": 0.23072044647387113, + "grad_norm": 4.9375, + "learning_rate": 9.883660236721977e-06, + "loss": 1.5562604, + "memory(GiB)": 77.59, + "step": 9095, + "train_speed(iter/s)": 1.624204 + }, + { + "acc": 0.6408246, + "epoch": 0.23084728564180618, + "grad_norm": 5.28125, + "learning_rate": 9.883435238024718e-06, + "loss": 1.67281418, + "memory(GiB)": 77.59, + "step": 9100, + "train_speed(iter/s)": 1.624303 + }, + { + "acc": 0.66595106, + "epoch": 0.23097412480974125, + "grad_norm": 5.34375, + "learning_rate": 9.883210024532176e-06, + "loss": 1.57440062, + "memory(GiB)": 77.59, + "step": 9105, + "train_speed(iter/s)": 1.624397 + }, + { + "acc": 0.65892649, + "epoch": 0.2311009639776763, + "grad_norm": 5.75, + "learning_rate": 9.882984596254255e-06, + "loss": 1.65532417, + "memory(GiB)": 77.59, + "step": 9110, + "train_speed(iter/s)": 1.6245 + }, + { + "acc": 0.64969063, + "epoch": 0.23122780314561137, + "grad_norm": 5.625, + "learning_rate": 9.88275895320087e-06, + "loss": 1.68483906, + "memory(GiB)": 77.59, + "step": 9115, + "train_speed(iter/s)": 1.624599 + }, + { + "acc": 0.64919991, + "epoch": 0.2313546423135464, + "grad_norm": 6.03125, + "learning_rate": 9.882533095381947e-06, + "loss": 1.67741852, + "memory(GiB)": 77.59, + "step": 9120, + "train_speed(iter/s)": 1.624699 + }, + { + "acc": 0.63494534, + "epoch": 0.23148148148148148, + "grad_norm": 7.625, + "learning_rate": 9.882307022807419e-06, + "loss": 1.64587288, + "memory(GiB)": 77.59, + "step": 9125, + "train_speed(iter/s)": 1.624799 + }, + { + "acc": 0.6446043, + "epoch": 0.23160832064941653, + "grad_norm": 4.90625, + "learning_rate": 9.88208073548723e-06, + "loss": 1.73853683, + "memory(GiB)": 77.59, + "step": 9130, + "train_speed(iter/s)": 1.624897 + }, + { + "acc": 0.64634643, + "epoch": 0.2317351598173516, + "grad_norm": 5.28125, + "learning_rate": 9.881854233431333e-06, + "loss": 1.68168411, + "memory(GiB)": 77.59, + "step": 9135, + "train_speed(iter/s)": 1.624992 + }, + { + "acc": 0.6489604, + "epoch": 0.23186199898528664, + "grad_norm": 4.96875, + "learning_rate": 9.881627516649692e-06, + "loss": 1.65658894, + "memory(GiB)": 77.59, + "step": 9140, + "train_speed(iter/s)": 1.625089 + }, + { + "acc": 0.63461447, + "epoch": 0.23198883815322172, + "grad_norm": 4.9375, + "learning_rate": 9.881400585152278e-06, + "loss": 1.73371582, + "memory(GiB)": 77.59, + "step": 9145, + "train_speed(iter/s)": 1.62519 + }, + { + "acc": 0.66043954, + "epoch": 0.23211567732115676, + "grad_norm": 4.84375, + "learning_rate": 9.881173438949072e-06, + "loss": 1.67182083, + "memory(GiB)": 77.59, + "step": 9150, + "train_speed(iter/s)": 1.625293 + }, + { + "acc": 0.65812225, + "epoch": 0.23224251648909183, + "grad_norm": 5.0, + "learning_rate": 9.880946078050064e-06, + "loss": 1.62039642, + "memory(GiB)": 77.59, + "step": 9155, + "train_speed(iter/s)": 1.625395 + }, + { + "acc": 0.63304405, + "epoch": 0.23236935565702688, + "grad_norm": 5.40625, + "learning_rate": 9.880718502465258e-06, + "loss": 1.70877666, + "memory(GiB)": 77.59, + "step": 9160, + "train_speed(iter/s)": 1.625498 + }, + { + "acc": 0.6330555, + "epoch": 0.23249619482496195, + "grad_norm": 5.125, + "learning_rate": 9.88049071220466e-06, + "loss": 1.71098824, + "memory(GiB)": 77.59, + "step": 9165, + "train_speed(iter/s)": 1.625597 + }, + { + "acc": 0.64827271, + "epoch": 0.232623033992897, + "grad_norm": 5.3125, + "learning_rate": 9.88026270727829e-06, + "loss": 1.67315445, + "memory(GiB)": 77.59, + "step": 9170, + "train_speed(iter/s)": 1.625693 + }, + { + "acc": 0.64434195, + "epoch": 0.23274987316083207, + "grad_norm": 5.03125, + "learning_rate": 9.880034487696179e-06, + "loss": 1.64116821, + "memory(GiB)": 77.59, + "step": 9175, + "train_speed(iter/s)": 1.625793 + }, + { + "acc": 0.64183378, + "epoch": 0.2328767123287671, + "grad_norm": 9.3125, + "learning_rate": 9.879806053468361e-06, + "loss": 1.68195076, + "memory(GiB)": 77.59, + "step": 9180, + "train_speed(iter/s)": 1.625894 + }, + { + "acc": 0.64677091, + "epoch": 0.23300355149670218, + "grad_norm": 5.15625, + "learning_rate": 9.879577404604889e-06, + "loss": 1.65706902, + "memory(GiB)": 77.59, + "step": 9185, + "train_speed(iter/s)": 1.625998 + }, + { + "acc": 0.63322067, + "epoch": 0.23313039066463723, + "grad_norm": 5.1875, + "learning_rate": 9.879348541115816e-06, + "loss": 1.67011566, + "memory(GiB)": 77.59, + "step": 9190, + "train_speed(iter/s)": 1.626102 + }, + { + "acc": 0.64351492, + "epoch": 0.2332572298325723, + "grad_norm": 4.71875, + "learning_rate": 9.879119463011208e-06, + "loss": 1.65603466, + "memory(GiB)": 77.59, + "step": 9195, + "train_speed(iter/s)": 1.626206 + }, + { + "acc": 0.65794177, + "epoch": 0.23338406900050734, + "grad_norm": 5.875, + "learning_rate": 9.878890170301143e-06, + "loss": 1.61584663, + "memory(GiB)": 77.59, + "step": 9200, + "train_speed(iter/s)": 1.62631 + }, + { + "acc": 0.66397114, + "epoch": 0.23351090816844242, + "grad_norm": 4.71875, + "learning_rate": 9.878660662995706e-06, + "loss": 1.60551891, + "memory(GiB)": 77.59, + "step": 9205, + "train_speed(iter/s)": 1.626408 + }, + { + "acc": 0.64669518, + "epoch": 0.23363774733637746, + "grad_norm": 6.9375, + "learning_rate": 9.878430941104991e-06, + "loss": 1.67409706, + "memory(GiB)": 77.59, + "step": 9210, + "train_speed(iter/s)": 1.626516 + }, + { + "acc": 0.64231701, + "epoch": 0.23376458650431253, + "grad_norm": 6.1875, + "learning_rate": 9.878201004639104e-06, + "loss": 1.73436699, + "memory(GiB)": 77.59, + "step": 9215, + "train_speed(iter/s)": 1.626623 + }, + { + "acc": 0.6697144, + "epoch": 0.23389142567224758, + "grad_norm": 5.65625, + "learning_rate": 9.877970853608156e-06, + "loss": 1.61370983, + "memory(GiB)": 77.59, + "step": 9220, + "train_speed(iter/s)": 1.626735 + }, + { + "acc": 0.64825511, + "epoch": 0.23401826484018265, + "grad_norm": 6.5, + "learning_rate": 9.87774048802227e-06, + "loss": 1.66075726, + "memory(GiB)": 77.59, + "step": 9225, + "train_speed(iter/s)": 1.626832 + }, + { + "acc": 0.6384212, + "epoch": 0.2341451040081177, + "grad_norm": 5.71875, + "learning_rate": 9.877509907891583e-06, + "loss": 1.57854786, + "memory(GiB)": 77.59, + "step": 9230, + "train_speed(iter/s)": 1.626941 + }, + { + "acc": 0.65710521, + "epoch": 0.23427194317605277, + "grad_norm": 5.65625, + "learning_rate": 9.877279113226232e-06, + "loss": 1.56658764, + "memory(GiB)": 77.59, + "step": 9235, + "train_speed(iter/s)": 1.627041 + }, + { + "acc": 0.65100546, + "epoch": 0.2343987823439878, + "grad_norm": 5.34375, + "learning_rate": 9.87704810403637e-06, + "loss": 1.61526814, + "memory(GiB)": 77.59, + "step": 9240, + "train_speed(iter/s)": 1.627141 + }, + { + "acc": 0.64556274, + "epoch": 0.23452562151192288, + "grad_norm": 5.40625, + "learning_rate": 9.876816880332157e-06, + "loss": 1.68021069, + "memory(GiB)": 77.59, + "step": 9245, + "train_speed(iter/s)": 1.627245 + }, + { + "acc": 0.65085897, + "epoch": 0.23465246067985793, + "grad_norm": 4.25, + "learning_rate": 9.876585442123765e-06, + "loss": 1.64631863, + "memory(GiB)": 77.59, + "step": 9250, + "train_speed(iter/s)": 1.627347 + }, + { + "acc": 0.64963069, + "epoch": 0.234779299847793, + "grad_norm": 5.96875, + "learning_rate": 9.876353789421373e-06, + "loss": 1.67465096, + "memory(GiB)": 77.59, + "step": 9255, + "train_speed(iter/s)": 1.627448 + }, + { + "acc": 0.63810825, + "epoch": 0.23490613901572804, + "grad_norm": 7.3125, + "learning_rate": 9.876121922235171e-06, + "loss": 1.72027836, + "memory(GiB)": 77.59, + "step": 9260, + "train_speed(iter/s)": 1.627557 + }, + { + "acc": 0.64873571, + "epoch": 0.23503297818366312, + "grad_norm": 5.3125, + "learning_rate": 9.875889840575356e-06, + "loss": 1.71790314, + "memory(GiB)": 77.59, + "step": 9265, + "train_speed(iter/s)": 1.627658 + }, + { + "acc": 0.64158063, + "epoch": 0.23515981735159816, + "grad_norm": 4.8125, + "learning_rate": 9.875657544452135e-06, + "loss": 1.65621204, + "memory(GiB)": 77.59, + "step": 9270, + "train_speed(iter/s)": 1.627759 + }, + { + "acc": 0.63539329, + "epoch": 0.23528665651953323, + "grad_norm": 5.78125, + "learning_rate": 9.875425033875728e-06, + "loss": 1.67092934, + "memory(GiB)": 77.59, + "step": 9275, + "train_speed(iter/s)": 1.627861 + }, + { + "acc": 0.66319242, + "epoch": 0.23541349568746828, + "grad_norm": 7.21875, + "learning_rate": 9.875192308856363e-06, + "loss": 1.65409985, + "memory(GiB)": 77.59, + "step": 9280, + "train_speed(iter/s)": 1.627961 + }, + { + "acc": 0.66097026, + "epoch": 0.23554033485540335, + "grad_norm": 6.03125, + "learning_rate": 9.87495936940427e-06, + "loss": 1.6012291, + "memory(GiB)": 77.59, + "step": 9285, + "train_speed(iter/s)": 1.628068 + }, + { + "acc": 0.63480844, + "epoch": 0.2356671740233384, + "grad_norm": 5.59375, + "learning_rate": 9.874726215529702e-06, + "loss": 1.70720577, + "memory(GiB)": 77.59, + "step": 9290, + "train_speed(iter/s)": 1.628177 + }, + { + "acc": 0.64903154, + "epoch": 0.23579401319127347, + "grad_norm": 6.25, + "learning_rate": 9.87449284724291e-06, + "loss": 1.61668167, + "memory(GiB)": 77.59, + "step": 9295, + "train_speed(iter/s)": 1.628279 + }, + { + "acc": 0.65418267, + "epoch": 0.2359208523592085, + "grad_norm": 5.59375, + "learning_rate": 9.874259264554159e-06, + "loss": 1.68040714, + "memory(GiB)": 77.59, + "step": 9300, + "train_speed(iter/s)": 1.628383 + }, + { + "acc": 0.66054163, + "epoch": 0.23604769152714358, + "grad_norm": 5.21875, + "learning_rate": 9.874025467473722e-06, + "loss": 1.5883213, + "memory(GiB)": 77.59, + "step": 9305, + "train_speed(iter/s)": 1.628484 + }, + { + "acc": 0.62926149, + "epoch": 0.23617453069507863, + "grad_norm": 5.8125, + "learning_rate": 9.873791456011887e-06, + "loss": 1.70828228, + "memory(GiB)": 77.59, + "step": 9310, + "train_speed(iter/s)": 1.628584 + }, + { + "acc": 0.65225201, + "epoch": 0.2363013698630137, + "grad_norm": 6.25, + "learning_rate": 9.873557230178942e-06, + "loss": 1.64824638, + "memory(GiB)": 77.59, + "step": 9315, + "train_speed(iter/s)": 1.628683 + }, + { + "acc": 0.64674082, + "epoch": 0.23642820903094874, + "grad_norm": 5.375, + "learning_rate": 9.873322789985191e-06, + "loss": 1.62088299, + "memory(GiB)": 77.59, + "step": 9320, + "train_speed(iter/s)": 1.628784 + }, + { + "acc": 0.63966265, + "epoch": 0.23655504819888382, + "grad_norm": 6.0625, + "learning_rate": 9.873088135440949e-06, + "loss": 1.69843178, + "memory(GiB)": 77.59, + "step": 9325, + "train_speed(iter/s)": 1.628886 + }, + { + "acc": 0.6216156, + "epoch": 0.23668188736681886, + "grad_norm": 5.5625, + "learning_rate": 9.87285326655653e-06, + "loss": 1.73059177, + "memory(GiB)": 77.59, + "step": 9330, + "train_speed(iter/s)": 1.628988 + }, + { + "acc": 0.64705725, + "epoch": 0.23680872653475393, + "grad_norm": 5.28125, + "learning_rate": 9.87261818334227e-06, + "loss": 1.62914925, + "memory(GiB)": 77.59, + "step": 9335, + "train_speed(iter/s)": 1.62909 + }, + { + "acc": 0.62429399, + "epoch": 0.23693556570268898, + "grad_norm": 5.59375, + "learning_rate": 9.87238288580851e-06, + "loss": 1.7366457, + "memory(GiB)": 77.59, + "step": 9340, + "train_speed(iter/s)": 1.629192 + }, + { + "acc": 0.64672909, + "epoch": 0.23706240487062405, + "grad_norm": 5.21875, + "learning_rate": 9.872147373965594e-06, + "loss": 1.65063057, + "memory(GiB)": 77.59, + "step": 9345, + "train_speed(iter/s)": 1.629295 + }, + { + "acc": 0.6331954, + "epoch": 0.2371892440385591, + "grad_norm": 8.3125, + "learning_rate": 9.871911647823884e-06, + "loss": 1.77910805, + "memory(GiB)": 77.59, + "step": 9350, + "train_speed(iter/s)": 1.629405 + }, + { + "acc": 0.65076671, + "epoch": 0.23731608320649417, + "grad_norm": 5.0, + "learning_rate": 9.871675707393749e-06, + "loss": 1.64002972, + "memory(GiB)": 77.59, + "step": 9355, + "train_speed(iter/s)": 1.629505 + }, + { + "acc": 0.6542901, + "epoch": 0.2374429223744292, + "grad_norm": 5.84375, + "learning_rate": 9.871439552685566e-06, + "loss": 1.66089725, + "memory(GiB)": 77.59, + "step": 9360, + "train_speed(iter/s)": 1.629606 + }, + { + "acc": 0.64613533, + "epoch": 0.23756976154236428, + "grad_norm": 5.03125, + "learning_rate": 9.871203183709723e-06, + "loss": 1.66794357, + "memory(GiB)": 77.59, + "step": 9365, + "train_speed(iter/s)": 1.629705 + }, + { + "acc": 0.65607357, + "epoch": 0.23769660071029933, + "grad_norm": 6.59375, + "learning_rate": 9.870966600476614e-06, + "loss": 1.6301157, + "memory(GiB)": 77.59, + "step": 9370, + "train_speed(iter/s)": 1.629808 + }, + { + "acc": 0.66108446, + "epoch": 0.2378234398782344, + "grad_norm": 6.71875, + "learning_rate": 9.870729802996647e-06, + "loss": 1.64777222, + "memory(GiB)": 77.59, + "step": 9375, + "train_speed(iter/s)": 1.629912 + }, + { + "acc": 0.6409246, + "epoch": 0.23795027904616944, + "grad_norm": 6.5, + "learning_rate": 9.870492791280239e-06, + "loss": 1.63623047, + "memory(GiB)": 77.59, + "step": 9380, + "train_speed(iter/s)": 1.630012 + }, + { + "acc": 0.64289203, + "epoch": 0.23807711821410452, + "grad_norm": 9.4375, + "learning_rate": 9.87025556533781e-06, + "loss": 1.63623619, + "memory(GiB)": 77.59, + "step": 9385, + "train_speed(iter/s)": 1.630111 + }, + { + "acc": 0.672299, + "epoch": 0.23820395738203956, + "grad_norm": 6.5625, + "learning_rate": 9.870018125179799e-06, + "loss": 1.53563261, + "memory(GiB)": 77.59, + "step": 9390, + "train_speed(iter/s)": 1.630213 + }, + { + "acc": 0.66684465, + "epoch": 0.23833079654997463, + "grad_norm": 6.96875, + "learning_rate": 9.869780470816647e-06, + "loss": 1.53617125, + "memory(GiB)": 77.59, + "step": 9395, + "train_speed(iter/s)": 1.630318 + }, + { + "acc": 0.66304359, + "epoch": 0.23845763571790968, + "grad_norm": 4.59375, + "learning_rate": 9.869542602258809e-06, + "loss": 1.57127762, + "memory(GiB)": 77.59, + "step": 9400, + "train_speed(iter/s)": 1.630418 + }, + { + "acc": 0.64068289, + "epoch": 0.23858447488584475, + "grad_norm": 7.375, + "learning_rate": 9.869304519516745e-06, + "loss": 1.75879402, + "memory(GiB)": 77.59, + "step": 9405, + "train_speed(iter/s)": 1.630517 + }, + { + "acc": 0.65158987, + "epoch": 0.2387113140537798, + "grad_norm": 5.15625, + "learning_rate": 9.869066222600928e-06, + "loss": 1.68611336, + "memory(GiB)": 77.59, + "step": 9410, + "train_speed(iter/s)": 1.630614 + }, + { + "acc": 0.64545851, + "epoch": 0.23883815322171487, + "grad_norm": 4.8125, + "learning_rate": 9.86882771152184e-06, + "loss": 1.65368767, + "memory(GiB)": 77.59, + "step": 9415, + "train_speed(iter/s)": 1.630707 + }, + { + "acc": 0.63275785, + "epoch": 0.2389649923896499, + "grad_norm": 5.28125, + "learning_rate": 9.868588986289973e-06, + "loss": 1.79201832, + "memory(GiB)": 77.59, + "step": 9420, + "train_speed(iter/s)": 1.630804 + }, + { + "acc": 0.63314781, + "epoch": 0.23909183155758498, + "grad_norm": 5.40625, + "learning_rate": 9.868350046915825e-06, + "loss": 1.73071785, + "memory(GiB)": 77.59, + "step": 9425, + "train_speed(iter/s)": 1.630902 + }, + { + "acc": 0.63926659, + "epoch": 0.23921867072552003, + "grad_norm": 6.15625, + "learning_rate": 9.868110893409906e-06, + "loss": 1.72271461, + "memory(GiB)": 77.59, + "step": 9430, + "train_speed(iter/s)": 1.631011 + }, + { + "acc": 0.67679691, + "epoch": 0.2393455098934551, + "grad_norm": 6.8125, + "learning_rate": 9.867871525782735e-06, + "loss": 1.58200169, + "memory(GiB)": 77.59, + "step": 9435, + "train_speed(iter/s)": 1.631114 + }, + { + "acc": 0.65555964, + "epoch": 0.23947234906139014, + "grad_norm": 5.125, + "learning_rate": 9.86763194404484e-06, + "loss": 1.65161896, + "memory(GiB)": 77.59, + "step": 9440, + "train_speed(iter/s)": 1.631221 + }, + { + "acc": 0.63352251, + "epoch": 0.23959918822932522, + "grad_norm": 5.59375, + "learning_rate": 9.867392148206762e-06, + "loss": 1.70929241, + "memory(GiB)": 77.59, + "step": 9445, + "train_speed(iter/s)": 1.63132 + }, + { + "acc": 0.64765925, + "epoch": 0.23972602739726026, + "grad_norm": 6.375, + "learning_rate": 9.867152138279043e-06, + "loss": 1.66059628, + "memory(GiB)": 77.59, + "step": 9450, + "train_speed(iter/s)": 1.631419 + }, + { + "acc": 0.64454942, + "epoch": 0.23985286656519533, + "grad_norm": 4.84375, + "learning_rate": 9.866911914272246e-06, + "loss": 1.63672409, + "memory(GiB)": 77.59, + "step": 9455, + "train_speed(iter/s)": 1.631519 + }, + { + "acc": 0.65048313, + "epoch": 0.23997970573313038, + "grad_norm": 5.28125, + "learning_rate": 9.866671476196931e-06, + "loss": 1.64810219, + "memory(GiB)": 77.59, + "step": 9460, + "train_speed(iter/s)": 1.631628 + }, + { + "acc": 0.66130323, + "epoch": 0.24010654490106545, + "grad_norm": 5.0, + "learning_rate": 9.866430824063678e-06, + "loss": 1.60275383, + "memory(GiB)": 77.59, + "step": 9465, + "train_speed(iter/s)": 1.631727 + }, + { + "acc": 0.65562611, + "epoch": 0.2402333840690005, + "grad_norm": 6.46875, + "learning_rate": 9.86618995788307e-06, + "loss": 1.60493183, + "memory(GiB)": 87.33, + "step": 9470, + "train_speed(iter/s)": 1.631826 + }, + { + "acc": 0.62938862, + "epoch": 0.24036022323693557, + "grad_norm": 6.1875, + "learning_rate": 9.865948877665702e-06, + "loss": 1.6870388, + "memory(GiB)": 87.33, + "step": 9475, + "train_speed(iter/s)": 1.631931 + }, + { + "acc": 0.64937038, + "epoch": 0.2404870624048706, + "grad_norm": 6.71875, + "learning_rate": 9.865707583422178e-06, + "loss": 1.69814262, + "memory(GiB)": 87.33, + "step": 9480, + "train_speed(iter/s)": 1.632035 + }, + { + "acc": 0.64474349, + "epoch": 0.24061390157280568, + "grad_norm": 5.5625, + "learning_rate": 9.865466075163108e-06, + "loss": 1.66721516, + "memory(GiB)": 87.33, + "step": 9485, + "train_speed(iter/s)": 1.632134 + }, + { + "acc": 0.65736027, + "epoch": 0.24074074074074073, + "grad_norm": 5.0, + "learning_rate": 9.86522435289912e-06, + "loss": 1.60760689, + "memory(GiB)": 87.33, + "step": 9490, + "train_speed(iter/s)": 1.632235 + }, + { + "acc": 0.64089384, + "epoch": 0.2408675799086758, + "grad_norm": 5.40625, + "learning_rate": 9.864982416640843e-06, + "loss": 1.72783279, + "memory(GiB)": 87.33, + "step": 9495, + "train_speed(iter/s)": 1.632331 + }, + { + "acc": 0.64851279, + "epoch": 0.24099441907661084, + "grad_norm": 6.5625, + "learning_rate": 9.864740266398918e-06, + "loss": 1.66511307, + "memory(GiB)": 87.33, + "step": 9500, + "train_speed(iter/s)": 1.632435 + }, + { + "acc": 0.65543299, + "epoch": 0.24112125824454592, + "grad_norm": 5.5625, + "learning_rate": 9.864497902183996e-06, + "loss": 1.59703064, + "memory(GiB)": 87.33, + "step": 9505, + "train_speed(iter/s)": 1.632534 + }, + { + "acc": 0.65599937, + "epoch": 0.24124809741248096, + "grad_norm": 6.75, + "learning_rate": 9.864255324006738e-06, + "loss": 1.62961349, + "memory(GiB)": 87.33, + "step": 9510, + "train_speed(iter/s)": 1.632635 + }, + { + "acc": 0.63429327, + "epoch": 0.24137493658041603, + "grad_norm": 5.96875, + "learning_rate": 9.864012531877814e-06, + "loss": 1.73694382, + "memory(GiB)": 87.33, + "step": 9515, + "train_speed(iter/s)": 1.632737 + }, + { + "acc": 0.63781829, + "epoch": 0.24150177574835108, + "grad_norm": 5.25, + "learning_rate": 9.863769525807903e-06, + "loss": 1.72291622, + "memory(GiB)": 87.33, + "step": 9520, + "train_speed(iter/s)": 1.632841 + }, + { + "acc": 0.66068192, + "epoch": 0.24162861491628615, + "grad_norm": 6.0625, + "learning_rate": 9.863526305807694e-06, + "loss": 1.56554413, + "memory(GiB)": 87.33, + "step": 9525, + "train_speed(iter/s)": 1.632947 + }, + { + "acc": 0.64753299, + "epoch": 0.2417554540842212, + "grad_norm": 8.1875, + "learning_rate": 9.863282871887882e-06, + "loss": 1.73369293, + "memory(GiB)": 87.33, + "step": 9530, + "train_speed(iter/s)": 1.633048 + }, + { + "acc": 0.65124426, + "epoch": 0.24188229325215627, + "grad_norm": 5.1875, + "learning_rate": 9.863039224059177e-06, + "loss": 1.64500675, + "memory(GiB)": 87.33, + "step": 9535, + "train_speed(iter/s)": 1.633144 + }, + { + "acc": 0.65118494, + "epoch": 0.2420091324200913, + "grad_norm": 4.90625, + "learning_rate": 9.862795362332293e-06, + "loss": 1.60982132, + "memory(GiB)": 87.33, + "step": 9540, + "train_speed(iter/s)": 1.633235 + }, + { + "acc": 0.66644897, + "epoch": 0.24213597158802638, + "grad_norm": 5.59375, + "learning_rate": 9.862551286717961e-06, + "loss": 1.59954548, + "memory(GiB)": 87.33, + "step": 9545, + "train_speed(iter/s)": 1.63333 + }, + { + "acc": 0.65461111, + "epoch": 0.24226281075596143, + "grad_norm": 8.1875, + "learning_rate": 9.862306997226914e-06, + "loss": 1.62875366, + "memory(GiB)": 87.33, + "step": 9550, + "train_speed(iter/s)": 1.633421 + }, + { + "acc": 0.64794388, + "epoch": 0.2423896499238965, + "grad_norm": 6.65625, + "learning_rate": 9.862062493869895e-06, + "loss": 1.68707466, + "memory(GiB)": 87.33, + "step": 9555, + "train_speed(iter/s)": 1.633522 + }, + { + "acc": 0.64705133, + "epoch": 0.24251648909183154, + "grad_norm": 5.90625, + "learning_rate": 9.861817776657661e-06, + "loss": 1.60952816, + "memory(GiB)": 87.33, + "step": 9560, + "train_speed(iter/s)": 1.633619 + }, + { + "acc": 0.62973495, + "epoch": 0.24264332825976662, + "grad_norm": 7.40625, + "learning_rate": 9.861572845600973e-06, + "loss": 1.68779659, + "memory(GiB)": 87.33, + "step": 9565, + "train_speed(iter/s)": 1.633721 + }, + { + "acc": 0.64386315, + "epoch": 0.24277016742770166, + "grad_norm": 4.84375, + "learning_rate": 9.861327700710608e-06, + "loss": 1.61891365, + "memory(GiB)": 87.33, + "step": 9570, + "train_speed(iter/s)": 1.633817 + }, + { + "acc": 0.64163079, + "epoch": 0.24289700659563673, + "grad_norm": 5.59375, + "learning_rate": 9.861082341997345e-06, + "loss": 1.70166588, + "memory(GiB)": 87.33, + "step": 9575, + "train_speed(iter/s)": 1.633921 + }, + { + "acc": 0.65208187, + "epoch": 0.24302384576357178, + "grad_norm": 4.75, + "learning_rate": 9.860836769471977e-06, + "loss": 1.63839931, + "memory(GiB)": 87.33, + "step": 9580, + "train_speed(iter/s)": 1.634017 + }, + { + "acc": 0.63927927, + "epoch": 0.24315068493150685, + "grad_norm": 5.3125, + "learning_rate": 9.860590983145307e-06, + "loss": 1.72652149, + "memory(GiB)": 87.33, + "step": 9585, + "train_speed(iter/s)": 1.634115 + }, + { + "acc": 0.65039825, + "epoch": 0.2432775240994419, + "grad_norm": 6.34375, + "learning_rate": 9.860344983028146e-06, + "loss": 1.68072662, + "memory(GiB)": 87.33, + "step": 9590, + "train_speed(iter/s)": 1.634214 + }, + { + "acc": 0.64354124, + "epoch": 0.24340436326737697, + "grad_norm": 5.21875, + "learning_rate": 9.86009876913131e-06, + "loss": 1.68386497, + "memory(GiB)": 87.33, + "step": 9595, + "train_speed(iter/s)": 1.634308 + }, + { + "acc": 0.64121289, + "epoch": 0.243531202435312, + "grad_norm": 4.1875, + "learning_rate": 9.859852341465633e-06, + "loss": 1.70940952, + "memory(GiB)": 87.33, + "step": 9600, + "train_speed(iter/s)": 1.634407 + }, + { + "acc": 0.64821434, + "epoch": 0.24365804160324708, + "grad_norm": 5.28125, + "learning_rate": 9.859605700041951e-06, + "loss": 1.67825699, + "memory(GiB)": 87.33, + "step": 9605, + "train_speed(iter/s)": 1.634502 + }, + { + "acc": 0.66226764, + "epoch": 0.24378488077118213, + "grad_norm": 5.90625, + "learning_rate": 9.859358844871113e-06, + "loss": 1.58269835, + "memory(GiB)": 87.33, + "step": 9610, + "train_speed(iter/s)": 1.634599 + }, + { + "acc": 0.65524387, + "epoch": 0.2439117199391172, + "grad_norm": 5.40625, + "learning_rate": 9.859111775963981e-06, + "loss": 1.65098114, + "memory(GiB)": 87.33, + "step": 9615, + "train_speed(iter/s)": 1.634698 + }, + { + "acc": 0.66100526, + "epoch": 0.24403855910705224, + "grad_norm": 5.28125, + "learning_rate": 9.858864493331417e-06, + "loss": 1.61300449, + "memory(GiB)": 87.33, + "step": 9620, + "train_speed(iter/s)": 1.634794 + }, + { + "acc": 0.63697333, + "epoch": 0.24416539827498732, + "grad_norm": 7.875, + "learning_rate": 9.858616996984297e-06, + "loss": 1.72463856, + "memory(GiB)": 87.33, + "step": 9625, + "train_speed(iter/s)": 1.634896 + }, + { + "acc": 0.63661032, + "epoch": 0.24429223744292236, + "grad_norm": 5.90625, + "learning_rate": 9.858369286933513e-06, + "loss": 1.68524361, + "memory(GiB)": 87.33, + "step": 9630, + "train_speed(iter/s)": 1.634988 + }, + { + "acc": 0.65131092, + "epoch": 0.24441907661085743, + "grad_norm": 5.59375, + "learning_rate": 9.858121363189954e-06, + "loss": 1.66075993, + "memory(GiB)": 87.33, + "step": 9635, + "train_speed(iter/s)": 1.635081 + }, + { + "acc": 0.64018021, + "epoch": 0.24454591577879248, + "grad_norm": 5.90625, + "learning_rate": 9.85787322576453e-06, + "loss": 1.73212318, + "memory(GiB)": 87.33, + "step": 9640, + "train_speed(iter/s)": 1.635185 + }, + { + "acc": 0.65091796, + "epoch": 0.24467275494672755, + "grad_norm": 4.9375, + "learning_rate": 9.85762487466815e-06, + "loss": 1.65248299, + "memory(GiB)": 87.33, + "step": 9645, + "train_speed(iter/s)": 1.635277 + }, + { + "acc": 0.66229463, + "epoch": 0.2447995941146626, + "grad_norm": 6.0625, + "learning_rate": 9.857376309911741e-06, + "loss": 1.53893776, + "memory(GiB)": 87.33, + "step": 9650, + "train_speed(iter/s)": 1.635371 + }, + { + "acc": 0.64031124, + "epoch": 0.24492643328259767, + "grad_norm": 4.90625, + "learning_rate": 9.857127531506237e-06, + "loss": 1.72968102, + "memory(GiB)": 87.33, + "step": 9655, + "train_speed(iter/s)": 1.635463 + }, + { + "acc": 0.64228048, + "epoch": 0.2450532724505327, + "grad_norm": 5.9375, + "learning_rate": 9.856878539462577e-06, + "loss": 1.66113605, + "memory(GiB)": 87.33, + "step": 9660, + "train_speed(iter/s)": 1.635554 + }, + { + "acc": 0.66556864, + "epoch": 0.24518011161846778, + "grad_norm": 5.5, + "learning_rate": 9.856629333791716e-06, + "loss": 1.58413582, + "memory(GiB)": 87.33, + "step": 9665, + "train_speed(iter/s)": 1.635645 + }, + { + "acc": 0.64543605, + "epoch": 0.24530695078640283, + "grad_norm": 5.125, + "learning_rate": 9.856379914504612e-06, + "loss": 1.68589516, + "memory(GiB)": 87.33, + "step": 9670, + "train_speed(iter/s)": 1.63574 + }, + { + "acc": 0.65295658, + "epoch": 0.2454337899543379, + "grad_norm": 4.8125, + "learning_rate": 9.856130281612237e-06, + "loss": 1.69418297, + "memory(GiB)": 87.33, + "step": 9675, + "train_speed(iter/s)": 1.635839 + }, + { + "acc": 0.63720455, + "epoch": 0.24556062912227294, + "grad_norm": 5.25, + "learning_rate": 9.855880435125572e-06, + "loss": 1.61891174, + "memory(GiB)": 87.33, + "step": 9680, + "train_speed(iter/s)": 1.635939 + }, + { + "acc": 0.64129586, + "epoch": 0.24568746829020802, + "grad_norm": 5.90625, + "learning_rate": 9.855630375055604e-06, + "loss": 1.63581123, + "memory(GiB)": 87.33, + "step": 9685, + "train_speed(iter/s)": 1.636032 + }, + { + "acc": 0.64883432, + "epoch": 0.24581430745814306, + "grad_norm": 5.28125, + "learning_rate": 9.855380101413336e-06, + "loss": 1.65804214, + "memory(GiB)": 87.33, + "step": 9690, + "train_speed(iter/s)": 1.636121 + }, + { + "acc": 0.64615369, + "epoch": 0.24594114662607813, + "grad_norm": 5.4375, + "learning_rate": 9.855129614209771e-06, + "loss": 1.6511528, + "memory(GiB)": 87.33, + "step": 9695, + "train_speed(iter/s)": 1.636212 + }, + { + "acc": 0.64521065, + "epoch": 0.24606798579401318, + "grad_norm": 6.125, + "learning_rate": 9.85487891345593e-06, + "loss": 1.68964539, + "memory(GiB)": 87.33, + "step": 9700, + "train_speed(iter/s)": 1.636309 + }, + { + "acc": 0.63514032, + "epoch": 0.24619482496194825, + "grad_norm": 7.03125, + "learning_rate": 9.85462799916284e-06, + "loss": 1.70887966, + "memory(GiB)": 87.33, + "step": 9705, + "train_speed(iter/s)": 1.636403 + }, + { + "acc": 0.63588052, + "epoch": 0.2463216641298833, + "grad_norm": 5.625, + "learning_rate": 9.854376871341535e-06, + "loss": 1.70032406, + "memory(GiB)": 87.33, + "step": 9710, + "train_speed(iter/s)": 1.636491 + }, + { + "acc": 0.63490038, + "epoch": 0.24644850329781837, + "grad_norm": 6.875, + "learning_rate": 9.854125530003063e-06, + "loss": 1.70630455, + "memory(GiB)": 87.33, + "step": 9715, + "train_speed(iter/s)": 1.63659 + }, + { + "acc": 0.65200148, + "epoch": 0.2465753424657534, + "grad_norm": 6.34375, + "learning_rate": 9.853873975158476e-06, + "loss": 1.60985165, + "memory(GiB)": 87.33, + "step": 9720, + "train_speed(iter/s)": 1.636691 + }, + { + "acc": 0.63832979, + "epoch": 0.24670218163368848, + "grad_norm": 5.65625, + "learning_rate": 9.853622206818842e-06, + "loss": 1.72486095, + "memory(GiB)": 87.33, + "step": 9725, + "train_speed(iter/s)": 1.636782 + }, + { + "acc": 0.64781246, + "epoch": 0.24682902080162353, + "grad_norm": 4.875, + "learning_rate": 9.853370224995233e-06, + "loss": 1.65095253, + "memory(GiB)": 87.33, + "step": 9730, + "train_speed(iter/s)": 1.636868 + }, + { + "acc": 0.66477175, + "epoch": 0.2469558599695586, + "grad_norm": 5.46875, + "learning_rate": 9.853118029698733e-06, + "loss": 1.58915806, + "memory(GiB)": 87.33, + "step": 9735, + "train_speed(iter/s)": 1.636949 + }, + { + "acc": 0.6595706, + "epoch": 0.24708269913749364, + "grad_norm": 5.96875, + "learning_rate": 9.852865620940436e-06, + "loss": 1.66983242, + "memory(GiB)": 87.33, + "step": 9740, + "train_speed(iter/s)": 1.637039 + }, + { + "acc": 0.6453341, + "epoch": 0.24720953830542872, + "grad_norm": 4.8125, + "learning_rate": 9.85261299873144e-06, + "loss": 1.65589428, + "memory(GiB)": 87.33, + "step": 9745, + "train_speed(iter/s)": 1.637135 + }, + { + "acc": 0.6633359, + "epoch": 0.24733637747336376, + "grad_norm": 5.25, + "learning_rate": 9.85236016308286e-06, + "loss": 1.61111488, + "memory(GiB)": 87.33, + "step": 9750, + "train_speed(iter/s)": 1.637229 + }, + { + "acc": 0.6529355, + "epoch": 0.24746321664129883, + "grad_norm": 5.4375, + "learning_rate": 9.852107114005816e-06, + "loss": 1.66232491, + "memory(GiB)": 87.33, + "step": 9755, + "train_speed(iter/s)": 1.637313 + }, + { + "acc": 0.66931586, + "epoch": 0.24759005580923388, + "grad_norm": 5.34375, + "learning_rate": 9.851853851511437e-06, + "loss": 1.60879478, + "memory(GiB)": 87.33, + "step": 9760, + "train_speed(iter/s)": 1.637408 + }, + { + "acc": 0.64560871, + "epoch": 0.24771689497716895, + "grad_norm": 6.34375, + "learning_rate": 9.851600375610864e-06, + "loss": 1.62980614, + "memory(GiB)": 87.33, + "step": 9765, + "train_speed(iter/s)": 1.6375 + }, + { + "acc": 0.64487877, + "epoch": 0.247843734145104, + "grad_norm": 6.125, + "learning_rate": 9.851346686315246e-06, + "loss": 1.67241478, + "memory(GiB)": 87.33, + "step": 9770, + "train_speed(iter/s)": 1.637592 + }, + { + "acc": 0.64298697, + "epoch": 0.24797057331303907, + "grad_norm": 5.0, + "learning_rate": 9.851092783635742e-06, + "loss": 1.72505569, + "memory(GiB)": 87.33, + "step": 9775, + "train_speed(iter/s)": 1.637682 + }, + { + "acc": 0.64806018, + "epoch": 0.2480974124809741, + "grad_norm": 6.28125, + "learning_rate": 9.850838667583518e-06, + "loss": 1.62771339, + "memory(GiB)": 87.33, + "step": 9780, + "train_speed(iter/s)": 1.637775 + }, + { + "acc": 0.65341506, + "epoch": 0.24822425164890918, + "grad_norm": 5.59375, + "learning_rate": 9.850584338169752e-06, + "loss": 1.61090584, + "memory(GiB)": 87.33, + "step": 9785, + "train_speed(iter/s)": 1.637866 + }, + { + "acc": 0.64384122, + "epoch": 0.24835109081684423, + "grad_norm": 5.8125, + "learning_rate": 9.85032979540563e-06, + "loss": 1.66910725, + "memory(GiB)": 87.33, + "step": 9790, + "train_speed(iter/s)": 1.637957 + }, + { + "acc": 0.63692422, + "epoch": 0.2484779299847793, + "grad_norm": 5.1875, + "learning_rate": 9.85007503930235e-06, + "loss": 1.69653721, + "memory(GiB)": 87.33, + "step": 9795, + "train_speed(iter/s)": 1.63805 + }, + { + "acc": 0.65492153, + "epoch": 0.24860476915271434, + "grad_norm": 5.34375, + "learning_rate": 9.849820069871114e-06, + "loss": 1.6327713, + "memory(GiB)": 87.33, + "step": 9800, + "train_speed(iter/s)": 1.638138 + }, + { + "acc": 0.64292245, + "epoch": 0.24873160832064942, + "grad_norm": 6.625, + "learning_rate": 9.849564887123138e-06, + "loss": 1.67443733, + "memory(GiB)": 87.33, + "step": 9805, + "train_speed(iter/s)": 1.638226 + }, + { + "acc": 0.63440418, + "epoch": 0.24885844748858446, + "grad_norm": 5.21875, + "learning_rate": 9.849309491069647e-06, + "loss": 1.674436, + "memory(GiB)": 87.33, + "step": 9810, + "train_speed(iter/s)": 1.638317 + }, + { + "acc": 0.64905891, + "epoch": 0.24898528665651953, + "grad_norm": 7.15625, + "learning_rate": 9.849053881721876e-06, + "loss": 1.59621582, + "memory(GiB)": 87.33, + "step": 9815, + "train_speed(iter/s)": 1.638409 + }, + { + "acc": 0.65379725, + "epoch": 0.24911212582445458, + "grad_norm": 5.75, + "learning_rate": 9.848798059091064e-06, + "loss": 1.69068108, + "memory(GiB)": 87.33, + "step": 9820, + "train_speed(iter/s)": 1.638499 + }, + { + "acc": 0.64928665, + "epoch": 0.24923896499238965, + "grad_norm": 5.90625, + "learning_rate": 9.848542023188466e-06, + "loss": 1.69615974, + "memory(GiB)": 87.33, + "step": 9825, + "train_speed(iter/s)": 1.63858 + }, + { + "acc": 0.66876736, + "epoch": 0.2493658041603247, + "grad_norm": 6.34375, + "learning_rate": 9.848285774025342e-06, + "loss": 1.52942619, + "memory(GiB)": 87.33, + "step": 9830, + "train_speed(iter/s)": 1.638666 + }, + { + "acc": 0.64703846, + "epoch": 0.24949264332825977, + "grad_norm": 8.125, + "learning_rate": 9.848029311612963e-06, + "loss": 1.64497204, + "memory(GiB)": 87.33, + "step": 9835, + "train_speed(iter/s)": 1.638753 + }, + { + "acc": 0.65347133, + "epoch": 0.2496194824961948, + "grad_norm": 6.9375, + "learning_rate": 9.84777263596261e-06, + "loss": 1.6713623, + "memory(GiB)": 87.33, + "step": 9840, + "train_speed(iter/s)": 1.638848 + }, + { + "acc": 0.643645, + "epoch": 0.24974632166412988, + "grad_norm": 5.21875, + "learning_rate": 9.847515747085573e-06, + "loss": 1.6845932, + "memory(GiB)": 87.33, + "step": 9845, + "train_speed(iter/s)": 1.638938 + }, + { + "acc": 0.64831305, + "epoch": 0.24987316083206493, + "grad_norm": 4.90625, + "learning_rate": 9.847258644993151e-06, + "loss": 1.67672653, + "memory(GiB)": 87.33, + "step": 9850, + "train_speed(iter/s)": 1.639026 + }, + { + "acc": 0.63253489, + "epoch": 0.25, + "grad_norm": 4.875, + "learning_rate": 9.847001329696653e-06, + "loss": 1.71245689, + "memory(GiB)": 87.33, + "step": 9855, + "train_speed(iter/s)": 1.639113 + }, + { + "acc": 0.64879866, + "epoch": 0.25012683916793504, + "grad_norm": 5.125, + "learning_rate": 9.846743801207395e-06, + "loss": 1.64888535, + "memory(GiB)": 87.33, + "step": 9860, + "train_speed(iter/s)": 1.6392 + }, + { + "acc": 0.62758493, + "epoch": 0.25025367833587014, + "grad_norm": 5.65625, + "learning_rate": 9.846486059536706e-06, + "loss": 1.70128193, + "memory(GiB)": 97.12, + "step": 9865, + "train_speed(iter/s)": 1.639281 + }, + { + "acc": 0.64468579, + "epoch": 0.2503805175038052, + "grad_norm": 5.0, + "learning_rate": 9.846228104695922e-06, + "loss": 1.6634449, + "memory(GiB)": 97.12, + "step": 9870, + "train_speed(iter/s)": 1.639375 + }, + { + "acc": 0.63298979, + "epoch": 0.25050735667174023, + "grad_norm": 5.71875, + "learning_rate": 9.84596993669639e-06, + "loss": 1.76489449, + "memory(GiB)": 97.12, + "step": 9875, + "train_speed(iter/s)": 1.639467 + }, + { + "acc": 0.65223188, + "epoch": 0.2506341958396753, + "grad_norm": 11.8125, + "learning_rate": 9.845711555549464e-06, + "loss": 1.61567135, + "memory(GiB)": 97.12, + "step": 9880, + "train_speed(iter/s)": 1.639558 + }, + { + "acc": 0.647159, + "epoch": 0.2507610350076104, + "grad_norm": 5.21875, + "learning_rate": 9.845452961266509e-06, + "loss": 1.63127213, + "memory(GiB)": 97.12, + "step": 9885, + "train_speed(iter/s)": 1.639645 + }, + { + "acc": 0.65376697, + "epoch": 0.2508878741755454, + "grad_norm": 6.0, + "learning_rate": 9.845194153858899e-06, + "loss": 1.63588867, + "memory(GiB)": 97.12, + "step": 9890, + "train_speed(iter/s)": 1.639732 + }, + { + "acc": 0.64734588, + "epoch": 0.25101471334348047, + "grad_norm": 5.0, + "learning_rate": 9.844935133338018e-06, + "loss": 1.59693937, + "memory(GiB)": 97.12, + "step": 9895, + "train_speed(iter/s)": 1.639813 + }, + { + "acc": 0.63036313, + "epoch": 0.2511415525114155, + "grad_norm": 5.125, + "learning_rate": 9.84467589971526e-06, + "loss": 1.7552578, + "memory(GiB)": 97.12, + "step": 9900, + "train_speed(iter/s)": 1.639896 + }, + { + "acc": 0.64947052, + "epoch": 0.2512683916793506, + "grad_norm": 6.53125, + "learning_rate": 9.844416453002027e-06, + "loss": 1.67681274, + "memory(GiB)": 97.12, + "step": 9905, + "train_speed(iter/s)": 1.639984 + }, + { + "acc": 0.64688048, + "epoch": 0.25139523084728566, + "grad_norm": 5.15625, + "learning_rate": 9.844156793209725e-06, + "loss": 1.63420868, + "memory(GiB)": 97.12, + "step": 9910, + "train_speed(iter/s)": 1.640075 + }, + { + "acc": 0.64966488, + "epoch": 0.2515220700152207, + "grad_norm": 6.125, + "learning_rate": 9.843896920349783e-06, + "loss": 1.65382786, + "memory(GiB)": 97.12, + "step": 9915, + "train_speed(iter/s)": 1.640165 + }, + { + "acc": 0.64519625, + "epoch": 0.25164890918315574, + "grad_norm": 4.6875, + "learning_rate": 9.843636834433627e-06, + "loss": 1.68276272, + "memory(GiB)": 97.12, + "step": 9920, + "train_speed(iter/s)": 1.640259 + }, + { + "acc": 0.64749675, + "epoch": 0.25177574835109084, + "grad_norm": 5.65625, + "learning_rate": 9.843376535472698e-06, + "loss": 1.64375229, + "memory(GiB)": 97.12, + "step": 9925, + "train_speed(iter/s)": 1.640357 + }, + { + "acc": 0.65989466, + "epoch": 0.2519025875190259, + "grad_norm": 4.4375, + "learning_rate": 9.843116023478445e-06, + "loss": 1.57091999, + "memory(GiB)": 97.12, + "step": 9930, + "train_speed(iter/s)": 1.640444 + }, + { + "acc": 0.6505558, + "epoch": 0.25202942668696093, + "grad_norm": 5.09375, + "learning_rate": 9.842855298462327e-06, + "loss": 1.68581715, + "memory(GiB)": 97.12, + "step": 9935, + "train_speed(iter/s)": 1.640529 + }, + { + "acc": 0.64149489, + "epoch": 0.252156265854896, + "grad_norm": 6.28125, + "learning_rate": 9.84259436043581e-06, + "loss": 1.64889679, + "memory(GiB)": 97.12, + "step": 9940, + "train_speed(iter/s)": 1.64062 + }, + { + "acc": 0.64940042, + "epoch": 0.2522831050228311, + "grad_norm": 5.90625, + "learning_rate": 9.842333209410372e-06, + "loss": 1.62770901, + "memory(GiB)": 97.12, + "step": 9945, + "train_speed(iter/s)": 1.640706 + }, + { + "acc": 0.65872822, + "epoch": 0.2524099441907661, + "grad_norm": 5.28125, + "learning_rate": 9.842071845397502e-06, + "loss": 1.5567337, + "memory(GiB)": 97.12, + "step": 9950, + "train_speed(iter/s)": 1.640796 + }, + { + "acc": 0.63890805, + "epoch": 0.25253678335870117, + "grad_norm": 5.5, + "learning_rate": 9.841810268408692e-06, + "loss": 1.6995182, + "memory(GiB)": 97.12, + "step": 9955, + "train_speed(iter/s)": 1.640886 + }, + { + "acc": 0.65421972, + "epoch": 0.2526636225266362, + "grad_norm": 5.0625, + "learning_rate": 9.841548478455451e-06, + "loss": 1.71245689, + "memory(GiB)": 97.12, + "step": 9960, + "train_speed(iter/s)": 1.640977 + }, + { + "acc": 0.6484355, + "epoch": 0.2527904616945713, + "grad_norm": 5.96875, + "learning_rate": 9.841286475549291e-06, + "loss": 1.6949028, + "memory(GiB)": 97.12, + "step": 9965, + "train_speed(iter/s)": 1.641062 + }, + { + "acc": 0.64724045, + "epoch": 0.25291730086250636, + "grad_norm": 5.75, + "learning_rate": 9.841024259701737e-06, + "loss": 1.62486916, + "memory(GiB)": 97.12, + "step": 9970, + "train_speed(iter/s)": 1.641149 + }, + { + "acc": 0.64365444, + "epoch": 0.2530441400304414, + "grad_norm": 5.15625, + "learning_rate": 9.840761830924323e-06, + "loss": 1.66178799, + "memory(GiB)": 97.12, + "step": 9975, + "train_speed(iter/s)": 1.641243 + }, + { + "acc": 0.64171257, + "epoch": 0.25317097919837644, + "grad_norm": 7.15625, + "learning_rate": 9.84049918922859e-06, + "loss": 1.68366013, + "memory(GiB)": 97.12, + "step": 9980, + "train_speed(iter/s)": 1.641335 + }, + { + "acc": 0.63384504, + "epoch": 0.25329781836631154, + "grad_norm": 5.625, + "learning_rate": 9.840236334626091e-06, + "loss": 1.66978188, + "memory(GiB)": 97.12, + "step": 9985, + "train_speed(iter/s)": 1.64142 + }, + { + "acc": 0.64788871, + "epoch": 0.2534246575342466, + "grad_norm": 4.46875, + "learning_rate": 9.83997326712839e-06, + "loss": 1.63131371, + "memory(GiB)": 97.12, + "step": 9990, + "train_speed(iter/s)": 1.64151 + }, + { + "acc": 0.66368647, + "epoch": 0.25355149670218163, + "grad_norm": 4.84375, + "learning_rate": 9.839709986747054e-06, + "loss": 1.56834793, + "memory(GiB)": 97.12, + "step": 9995, + "train_speed(iter/s)": 1.641598 + }, + { + "acc": 0.65299573, + "epoch": 0.2536783358701167, + "grad_norm": 5.03125, + "learning_rate": 9.839446493493667e-06, + "loss": 1.57435932, + "memory(GiB)": 97.12, + "step": 10000, + "train_speed(iter/s)": 1.641683 + }, + { + "epoch": 0.2536783358701167, + "eval_acc": 0.6385112883386899, + "eval_loss": 1.6177022457122803, + "eval_runtime": 58.776, + "eval_samples_per_second": 108.378, + "eval_steps_per_second": 27.103, + "step": 10000 + }, + { + "acc": 0.64907846, + "epoch": 0.2538051750380518, + "grad_norm": 5.0, + "learning_rate": 9.839182787379815e-06, + "loss": 1.58981676, + "memory(GiB)": 97.12, + "step": 10005, + "train_speed(iter/s)": 1.624967 + }, + { + "acc": 0.64923172, + "epoch": 0.2539320142059868, + "grad_norm": 5.53125, + "learning_rate": 9.8389188684171e-06, + "loss": 1.60591602, + "memory(GiB)": 97.12, + "step": 10010, + "train_speed(iter/s)": 1.625059 + }, + { + "acc": 0.65199113, + "epoch": 0.25405885337392187, + "grad_norm": 5.46875, + "learning_rate": 9.838654736617128e-06, + "loss": 1.55235291, + "memory(GiB)": 97.12, + "step": 10015, + "train_speed(iter/s)": 1.625155 + }, + { + "acc": 0.64934359, + "epoch": 0.2541856925418569, + "grad_norm": 5.65625, + "learning_rate": 9.838390391991517e-06, + "loss": 1.62928314, + "memory(GiB)": 97.12, + "step": 10020, + "train_speed(iter/s)": 1.625253 + }, + { + "acc": 0.64461603, + "epoch": 0.254312531709792, + "grad_norm": 5.65625, + "learning_rate": 9.838125834551895e-06, + "loss": 1.67571144, + "memory(GiB)": 97.12, + "step": 10025, + "train_speed(iter/s)": 1.625348 + }, + { + "acc": 0.6496933, + "epoch": 0.25443937087772706, + "grad_norm": 5.4375, + "learning_rate": 9.837861064309899e-06, + "loss": 1.67061977, + "memory(GiB)": 97.12, + "step": 10030, + "train_speed(iter/s)": 1.625444 + }, + { + "acc": 0.64946065, + "epoch": 0.2545662100456621, + "grad_norm": 5.46875, + "learning_rate": 9.837596081277173e-06, + "loss": 1.58180618, + "memory(GiB)": 97.12, + "step": 10035, + "train_speed(iter/s)": 1.625539 + }, + { + "acc": 0.65233727, + "epoch": 0.25469304921359714, + "grad_norm": 5.875, + "learning_rate": 9.837330885465373e-06, + "loss": 1.65300846, + "memory(GiB)": 97.12, + "step": 10040, + "train_speed(iter/s)": 1.625636 + }, + { + "acc": 0.64740419, + "epoch": 0.25481988838153224, + "grad_norm": 5.09375, + "learning_rate": 9.837065476886163e-06, + "loss": 1.66153011, + "memory(GiB)": 97.12, + "step": 10045, + "train_speed(iter/s)": 1.625731 + }, + { + "acc": 0.644137, + "epoch": 0.2549467275494673, + "grad_norm": 7.15625, + "learning_rate": 9.83679985555122e-06, + "loss": 1.64038887, + "memory(GiB)": 97.12, + "step": 10050, + "train_speed(iter/s)": 1.625823 + }, + { + "acc": 0.64094353, + "epoch": 0.25507356671740233, + "grad_norm": 5.75, + "learning_rate": 9.836534021472222e-06, + "loss": 1.67950783, + "memory(GiB)": 97.12, + "step": 10055, + "train_speed(iter/s)": 1.625919 + }, + { + "acc": 0.65049376, + "epoch": 0.2552004058853374, + "grad_norm": 5.96875, + "learning_rate": 9.836267974660866e-06, + "loss": 1.67010841, + "memory(GiB)": 97.12, + "step": 10060, + "train_speed(iter/s)": 1.626009 + }, + { + "acc": 0.65028048, + "epoch": 0.2553272450532725, + "grad_norm": 5.46875, + "learning_rate": 9.836001715128851e-06, + "loss": 1.66651993, + "memory(GiB)": 97.12, + "step": 10065, + "train_speed(iter/s)": 1.6261 + }, + { + "acc": 0.64809399, + "epoch": 0.2554540842212075, + "grad_norm": 6.28125, + "learning_rate": 9.835735242887889e-06, + "loss": 1.61924095, + "memory(GiB)": 97.12, + "step": 10070, + "train_speed(iter/s)": 1.626198 + }, + { + "acc": 0.65004787, + "epoch": 0.25558092338914257, + "grad_norm": 5.34375, + "learning_rate": 9.835468557949701e-06, + "loss": 1.62342682, + "memory(GiB)": 97.12, + "step": 10075, + "train_speed(iter/s)": 1.626294 + }, + { + "acc": 0.64583135, + "epoch": 0.2557077625570776, + "grad_norm": 5.0, + "learning_rate": 9.83520166032602e-06, + "loss": 1.65300865, + "memory(GiB)": 97.12, + "step": 10080, + "train_speed(iter/s)": 1.626388 + }, + { + "acc": 0.64032602, + "epoch": 0.2558346017250127, + "grad_norm": 6.34375, + "learning_rate": 9.834934550028579e-06, + "loss": 1.63845844, + "memory(GiB)": 97.12, + "step": 10085, + "train_speed(iter/s)": 1.626482 + }, + { + "acc": 0.64456515, + "epoch": 0.25596144089294776, + "grad_norm": 6.34375, + "learning_rate": 9.83466722706913e-06, + "loss": 1.68282528, + "memory(GiB)": 97.12, + "step": 10090, + "train_speed(iter/s)": 1.626573 + }, + { + "acc": 0.64471092, + "epoch": 0.2560882800608828, + "grad_norm": 5.71875, + "learning_rate": 9.834399691459433e-06, + "loss": 1.67646332, + "memory(GiB)": 97.12, + "step": 10095, + "train_speed(iter/s)": 1.626662 + }, + { + "acc": 0.64106884, + "epoch": 0.25621511922881784, + "grad_norm": 5.5625, + "learning_rate": 9.83413194321125e-06, + "loss": 1.62911358, + "memory(GiB)": 97.12, + "step": 10100, + "train_speed(iter/s)": 1.626753 + }, + { + "acc": 0.65749855, + "epoch": 0.25634195839675294, + "grad_norm": 6.78125, + "learning_rate": 9.833863982336365e-06, + "loss": 1.60307426, + "memory(GiB)": 97.12, + "step": 10105, + "train_speed(iter/s)": 1.626844 + }, + { + "acc": 0.66185999, + "epoch": 0.256468797564688, + "grad_norm": 4.90625, + "learning_rate": 9.83359580884656e-06, + "loss": 1.64014359, + "memory(GiB)": 97.12, + "step": 10110, + "train_speed(iter/s)": 1.626938 + }, + { + "acc": 0.66246119, + "epoch": 0.25659563673262303, + "grad_norm": 5.34375, + "learning_rate": 9.83332742275363e-06, + "loss": 1.63745556, + "memory(GiB)": 97.12, + "step": 10115, + "train_speed(iter/s)": 1.627026 + }, + { + "acc": 0.63125472, + "epoch": 0.2567224759005581, + "grad_norm": 7.3125, + "learning_rate": 9.833058824069382e-06, + "loss": 1.69879913, + "memory(GiB)": 97.12, + "step": 10120, + "train_speed(iter/s)": 1.627117 + }, + { + "acc": 0.65348196, + "epoch": 0.2568493150684932, + "grad_norm": 6.375, + "learning_rate": 9.832790012805626e-06, + "loss": 1.6033205, + "memory(GiB)": 97.12, + "step": 10125, + "train_speed(iter/s)": 1.627209 + }, + { + "acc": 0.65726557, + "epoch": 0.2569761542364282, + "grad_norm": 6.90625, + "learning_rate": 9.832520988974191e-06, + "loss": 1.57655811, + "memory(GiB)": 97.12, + "step": 10130, + "train_speed(iter/s)": 1.627302 + }, + { + "acc": 0.65688319, + "epoch": 0.25710299340436327, + "grad_norm": 5.3125, + "learning_rate": 9.832251752586907e-06, + "loss": 1.58754587, + "memory(GiB)": 97.12, + "step": 10135, + "train_speed(iter/s)": 1.627391 + }, + { + "acc": 0.65311823, + "epoch": 0.2572298325722983, + "grad_norm": 5.0, + "learning_rate": 9.831982303655617e-06, + "loss": 1.59830036, + "memory(GiB)": 97.12, + "step": 10140, + "train_speed(iter/s)": 1.627481 + }, + { + "acc": 0.65430326, + "epoch": 0.2573566717402334, + "grad_norm": 5.65625, + "learning_rate": 9.83171264219217e-06, + "loss": 1.62602406, + "memory(GiB)": 97.12, + "step": 10145, + "train_speed(iter/s)": 1.627566 + }, + { + "acc": 0.6770484, + "epoch": 0.25748351090816846, + "grad_norm": 5.71875, + "learning_rate": 9.831442768208429e-06, + "loss": 1.4724946, + "memory(GiB)": 97.12, + "step": 10150, + "train_speed(iter/s)": 1.627653 + }, + { + "acc": 0.67030296, + "epoch": 0.2576103500761035, + "grad_norm": 4.75, + "learning_rate": 9.831172681716265e-06, + "loss": 1.60321083, + "memory(GiB)": 97.12, + "step": 10155, + "train_speed(iter/s)": 1.627739 + }, + { + "acc": 0.6637229, + "epoch": 0.25773718924403854, + "grad_norm": 5.0, + "learning_rate": 9.830902382727556e-06, + "loss": 1.60972404, + "memory(GiB)": 97.12, + "step": 10160, + "train_speed(iter/s)": 1.627829 + }, + { + "acc": 0.66137056, + "epoch": 0.25786402841197364, + "grad_norm": 5.0625, + "learning_rate": 9.830631871254193e-06, + "loss": 1.59377689, + "memory(GiB)": 97.12, + "step": 10165, + "train_speed(iter/s)": 1.62792 + }, + { + "acc": 0.65381718, + "epoch": 0.2579908675799087, + "grad_norm": 5.28125, + "learning_rate": 9.830361147308074e-06, + "loss": 1.6218132, + "memory(GiB)": 97.12, + "step": 10170, + "train_speed(iter/s)": 1.628017 + }, + { + "acc": 0.64129176, + "epoch": 0.25811770674784373, + "grad_norm": 4.6875, + "learning_rate": 9.830090210901104e-06, + "loss": 1.6643589, + "memory(GiB)": 97.12, + "step": 10175, + "train_speed(iter/s)": 1.628111 + }, + { + "acc": 0.64200764, + "epoch": 0.2582445459157788, + "grad_norm": 6.4375, + "learning_rate": 9.829819062045203e-06, + "loss": 1.67048416, + "memory(GiB)": 97.12, + "step": 10180, + "train_speed(iter/s)": 1.628199 + }, + { + "acc": 0.66616635, + "epoch": 0.2583713850837139, + "grad_norm": 5.65625, + "learning_rate": 9.829547700752295e-06, + "loss": 1.54576893, + "memory(GiB)": 97.12, + "step": 10185, + "train_speed(iter/s)": 1.628293 + }, + { + "acc": 0.63299341, + "epoch": 0.2584982242516489, + "grad_norm": 5.28125, + "learning_rate": 9.829276127034315e-06, + "loss": 1.71097012, + "memory(GiB)": 97.12, + "step": 10190, + "train_speed(iter/s)": 1.628384 + }, + { + "acc": 0.63835087, + "epoch": 0.25862506341958397, + "grad_norm": 6.125, + "learning_rate": 9.829004340903214e-06, + "loss": 1.67719135, + "memory(GiB)": 97.12, + "step": 10195, + "train_speed(iter/s)": 1.628477 + }, + { + "acc": 0.64494486, + "epoch": 0.258751902587519, + "grad_norm": 4.4375, + "learning_rate": 9.82873234237094e-06, + "loss": 1.6942091, + "memory(GiB)": 97.12, + "step": 10200, + "train_speed(iter/s)": 1.628546 + }, + { + "acc": 0.63703461, + "epoch": 0.2588787417554541, + "grad_norm": 7.3125, + "learning_rate": 9.828460131449457e-06, + "loss": 1.72451572, + "memory(GiB)": 97.12, + "step": 10205, + "train_speed(iter/s)": 1.628641 + }, + { + "acc": 0.63502483, + "epoch": 0.25900558092338916, + "grad_norm": 5.65625, + "learning_rate": 9.828187708150743e-06, + "loss": 1.67805843, + "memory(GiB)": 97.12, + "step": 10210, + "train_speed(iter/s)": 1.628727 + }, + { + "acc": 0.66034803, + "epoch": 0.2591324200913242, + "grad_norm": 5.28125, + "learning_rate": 9.827915072486776e-06, + "loss": 1.59761457, + "memory(GiB)": 97.12, + "step": 10215, + "train_speed(iter/s)": 1.628817 + }, + { + "acc": 0.65768232, + "epoch": 0.25925925925925924, + "grad_norm": 6.0625, + "learning_rate": 9.827642224469547e-06, + "loss": 1.6083252, + "memory(GiB)": 97.12, + "step": 10220, + "train_speed(iter/s)": 1.62891 + }, + { + "acc": 0.64734974, + "epoch": 0.25938609842719434, + "grad_norm": 6.6875, + "learning_rate": 9.827369164111062e-06, + "loss": 1.69996796, + "memory(GiB)": 97.12, + "step": 10225, + "train_speed(iter/s)": 1.629003 + }, + { + "acc": 0.64079723, + "epoch": 0.2595129375951294, + "grad_norm": 6.28125, + "learning_rate": 9.827095891423328e-06, + "loss": 1.73891068, + "memory(GiB)": 97.12, + "step": 10230, + "train_speed(iter/s)": 1.629102 + }, + { + "acc": 0.66326094, + "epoch": 0.25963977676306443, + "grad_norm": 6.28125, + "learning_rate": 9.826822406418366e-06, + "loss": 1.53361692, + "memory(GiB)": 97.12, + "step": 10235, + "train_speed(iter/s)": 1.629191 + }, + { + "acc": 0.6562459, + "epoch": 0.2597666159309995, + "grad_norm": 4.875, + "learning_rate": 9.826548709108202e-06, + "loss": 1.55373507, + "memory(GiB)": 97.12, + "step": 10240, + "train_speed(iter/s)": 1.62928 + }, + { + "acc": 0.64401865, + "epoch": 0.2598934550989346, + "grad_norm": 5.8125, + "learning_rate": 9.826274799504878e-06, + "loss": 1.61399059, + "memory(GiB)": 97.12, + "step": 10245, + "train_speed(iter/s)": 1.629372 + }, + { + "acc": 0.65200453, + "epoch": 0.2600202942668696, + "grad_norm": 4.96875, + "learning_rate": 9.82600067762044e-06, + "loss": 1.63602333, + "memory(GiB)": 97.12, + "step": 10250, + "train_speed(iter/s)": 1.629468 + }, + { + "acc": 0.66021328, + "epoch": 0.26014713343480467, + "grad_norm": 5.90625, + "learning_rate": 9.825726343466947e-06, + "loss": 1.6139225, + "memory(GiB)": 97.12, + "step": 10255, + "train_speed(iter/s)": 1.629559 + }, + { + "acc": 0.64531975, + "epoch": 0.2602739726027397, + "grad_norm": 5.5, + "learning_rate": 9.825451797056462e-06, + "loss": 1.66134338, + "memory(GiB)": 97.12, + "step": 10260, + "train_speed(iter/s)": 1.629651 + }, + { + "acc": 0.64689102, + "epoch": 0.2604008117706748, + "grad_norm": 6.40625, + "learning_rate": 9.825177038401064e-06, + "loss": 1.64150772, + "memory(GiB)": 97.12, + "step": 10265, + "train_speed(iter/s)": 1.629742 + }, + { + "acc": 0.64313393, + "epoch": 0.26052765093860986, + "grad_norm": 5.03125, + "learning_rate": 9.824902067512838e-06, + "loss": 1.67537346, + "memory(GiB)": 97.12, + "step": 10270, + "train_speed(iter/s)": 1.629834 + }, + { + "acc": 0.65759106, + "epoch": 0.2606544901065449, + "grad_norm": 5.15625, + "learning_rate": 9.824626884403877e-06, + "loss": 1.62237453, + "memory(GiB)": 97.12, + "step": 10275, + "train_speed(iter/s)": 1.629926 + }, + { + "acc": 0.64620781, + "epoch": 0.26078132927447994, + "grad_norm": 5.125, + "learning_rate": 9.824351489086283e-06, + "loss": 1.62887878, + "memory(GiB)": 97.12, + "step": 10280, + "train_speed(iter/s)": 1.630012 + }, + { + "acc": 0.65930338, + "epoch": 0.26090816844241504, + "grad_norm": 7.5625, + "learning_rate": 9.824075881572176e-06, + "loss": 1.63321438, + "memory(GiB)": 97.12, + "step": 10285, + "train_speed(iter/s)": 1.630105 + }, + { + "acc": 0.66241989, + "epoch": 0.2610350076103501, + "grad_norm": 5.71875, + "learning_rate": 9.823800061873669e-06, + "loss": 1.63174801, + "memory(GiB)": 97.12, + "step": 10290, + "train_speed(iter/s)": 1.630197 + }, + { + "acc": 0.655688, + "epoch": 0.26116184677828513, + "grad_norm": 6.78125, + "learning_rate": 9.8235240300029e-06, + "loss": 1.68890839, + "memory(GiB)": 97.12, + "step": 10295, + "train_speed(iter/s)": 1.630291 + }, + { + "acc": 0.65024319, + "epoch": 0.2612886859462202, + "grad_norm": 5.09375, + "learning_rate": 9.82324778597201e-06, + "loss": 1.62787571, + "memory(GiB)": 97.12, + "step": 10300, + "train_speed(iter/s)": 1.630383 + }, + { + "acc": 0.64887404, + "epoch": 0.2614155251141553, + "grad_norm": 5.46875, + "learning_rate": 9.822971329793147e-06, + "loss": 1.6628849, + "memory(GiB)": 97.12, + "step": 10305, + "train_speed(iter/s)": 1.630478 + }, + { + "acc": 0.66143398, + "epoch": 0.2615423642820903, + "grad_norm": 5.0625, + "learning_rate": 9.822694661478471e-06, + "loss": 1.55211201, + "memory(GiB)": 97.12, + "step": 10310, + "train_speed(iter/s)": 1.630566 + }, + { + "acc": 0.64220591, + "epoch": 0.26166920345002537, + "grad_norm": 6.28125, + "learning_rate": 9.822417781040154e-06, + "loss": 1.66868706, + "memory(GiB)": 97.12, + "step": 10315, + "train_speed(iter/s)": 1.630655 + }, + { + "acc": 0.64102364, + "epoch": 0.2617960426179604, + "grad_norm": 4.8125, + "learning_rate": 9.822140688490372e-06, + "loss": 1.63731155, + "memory(GiB)": 97.12, + "step": 10320, + "train_speed(iter/s)": 1.630744 + }, + { + "acc": 0.65031204, + "epoch": 0.2619228817858955, + "grad_norm": 5.875, + "learning_rate": 9.821863383841312e-06, + "loss": 1.62481346, + "memory(GiB)": 97.12, + "step": 10325, + "train_speed(iter/s)": 1.630833 + }, + { + "acc": 0.66170759, + "epoch": 0.26204972095383056, + "grad_norm": 5.75, + "learning_rate": 9.821585867105173e-06, + "loss": 1.62501106, + "memory(GiB)": 97.12, + "step": 10330, + "train_speed(iter/s)": 1.630925 + }, + { + "acc": 0.63578653, + "epoch": 0.2621765601217656, + "grad_norm": 5.8125, + "learning_rate": 9.821308138294162e-06, + "loss": 1.6882782, + "memory(GiB)": 97.12, + "step": 10335, + "train_speed(iter/s)": 1.631012 + }, + { + "acc": 0.64705033, + "epoch": 0.26230339928970065, + "grad_norm": 6.03125, + "learning_rate": 9.821030197420492e-06, + "loss": 1.6899147, + "memory(GiB)": 97.12, + "step": 10340, + "train_speed(iter/s)": 1.631102 + }, + { + "acc": 0.64936504, + "epoch": 0.26243023845763574, + "grad_norm": 5.65625, + "learning_rate": 9.820752044496389e-06, + "loss": 1.63202305, + "memory(GiB)": 97.12, + "step": 10345, + "train_speed(iter/s)": 1.63119 + }, + { + "acc": 0.63620114, + "epoch": 0.2625570776255708, + "grad_norm": 5.90625, + "learning_rate": 9.82047367953409e-06, + "loss": 1.60717659, + "memory(GiB)": 97.12, + "step": 10350, + "train_speed(iter/s)": 1.631271 + }, + { + "acc": 0.66164856, + "epoch": 0.26268391679350583, + "grad_norm": 6.125, + "learning_rate": 9.820195102545835e-06, + "loss": 1.59535751, + "memory(GiB)": 97.12, + "step": 10355, + "train_speed(iter/s)": 1.631362 + }, + { + "acc": 0.63654194, + "epoch": 0.2628107559614409, + "grad_norm": 4.75, + "learning_rate": 9.81991631354388e-06, + "loss": 1.63763847, + "memory(GiB)": 97.12, + "step": 10360, + "train_speed(iter/s)": 1.631455 + }, + { + "acc": 0.64154792, + "epoch": 0.262937595129376, + "grad_norm": 5.75, + "learning_rate": 9.819637312540485e-06, + "loss": 1.71660652, + "memory(GiB)": 97.12, + "step": 10365, + "train_speed(iter/s)": 1.631539 + }, + { + "acc": 0.65010271, + "epoch": 0.263064434297311, + "grad_norm": 5.03125, + "learning_rate": 9.819358099547923e-06, + "loss": 1.62426643, + "memory(GiB)": 97.12, + "step": 10370, + "train_speed(iter/s)": 1.631625 + }, + { + "acc": 0.63507938, + "epoch": 0.26319127346524607, + "grad_norm": 5.34375, + "learning_rate": 9.819078674578474e-06, + "loss": 1.73412094, + "memory(GiB)": 97.12, + "step": 10375, + "train_speed(iter/s)": 1.631717 + }, + { + "acc": 0.63094172, + "epoch": 0.2633181126331811, + "grad_norm": 5.5625, + "learning_rate": 9.818799037644432e-06, + "loss": 1.71045914, + "memory(GiB)": 97.12, + "step": 10380, + "train_speed(iter/s)": 1.631811 + }, + { + "acc": 0.65303602, + "epoch": 0.2634449518011162, + "grad_norm": 4.9375, + "learning_rate": 9.818519188758092e-06, + "loss": 1.65938244, + "memory(GiB)": 97.12, + "step": 10385, + "train_speed(iter/s)": 1.631901 + }, + { + "acc": 0.65113139, + "epoch": 0.26357179096905126, + "grad_norm": 5.375, + "learning_rate": 9.818239127931765e-06, + "loss": 1.64128151, + "memory(GiB)": 97.12, + "step": 10390, + "train_speed(iter/s)": 1.631993 + }, + { + "acc": 0.63585367, + "epoch": 0.2636986301369863, + "grad_norm": 5.4375, + "learning_rate": 9.81795885517777e-06, + "loss": 1.7380724, + "memory(GiB)": 97.12, + "step": 10395, + "train_speed(iter/s)": 1.63208 + }, + { + "acc": 0.65230999, + "epoch": 0.26382546930492135, + "grad_norm": 4.625, + "learning_rate": 9.817678370508434e-06, + "loss": 1.67239571, + "memory(GiB)": 97.12, + "step": 10400, + "train_speed(iter/s)": 1.632172 + }, + { + "acc": 0.63712001, + "epoch": 0.26395230847285645, + "grad_norm": 5.71875, + "learning_rate": 9.817397673936093e-06, + "loss": 1.67624626, + "memory(GiB)": 97.12, + "step": 10405, + "train_speed(iter/s)": 1.632264 + }, + { + "acc": 0.6386425, + "epoch": 0.2640791476407915, + "grad_norm": 6.1875, + "learning_rate": 9.817116765473095e-06, + "loss": 1.71765099, + "memory(GiB)": 97.12, + "step": 10410, + "train_speed(iter/s)": 1.63235 + }, + { + "acc": 0.64774165, + "epoch": 0.26420598680872653, + "grad_norm": 5.28125, + "learning_rate": 9.816835645131795e-06, + "loss": 1.60468521, + "memory(GiB)": 97.12, + "step": 10415, + "train_speed(iter/s)": 1.632442 + }, + { + "acc": 0.64888167, + "epoch": 0.2643328259766616, + "grad_norm": 5.84375, + "learning_rate": 9.816554312924555e-06, + "loss": 1.61209583, + "memory(GiB)": 97.12, + "step": 10420, + "train_speed(iter/s)": 1.632531 + }, + { + "acc": 0.66007862, + "epoch": 0.2644596651445967, + "grad_norm": 5.53125, + "learning_rate": 9.816272768863756e-06, + "loss": 1.60714874, + "memory(GiB)": 97.12, + "step": 10425, + "train_speed(iter/s)": 1.632613 + }, + { + "acc": 0.65386915, + "epoch": 0.2645865043125317, + "grad_norm": 5.5625, + "learning_rate": 9.815991012961773e-06, + "loss": 1.60328808, + "memory(GiB)": 97.12, + "step": 10430, + "train_speed(iter/s)": 1.632699 + }, + { + "acc": 0.65707402, + "epoch": 0.26471334348046677, + "grad_norm": 5.09375, + "learning_rate": 9.815709045231008e-06, + "loss": 1.69138889, + "memory(GiB)": 97.12, + "step": 10435, + "train_speed(iter/s)": 1.632783 + }, + { + "acc": 0.64251747, + "epoch": 0.2648401826484018, + "grad_norm": 4.875, + "learning_rate": 9.815426865683858e-06, + "loss": 1.74192123, + "memory(GiB)": 97.12, + "step": 10440, + "train_speed(iter/s)": 1.632875 + }, + { + "acc": 0.65555048, + "epoch": 0.2649670218163369, + "grad_norm": 5.3125, + "learning_rate": 9.815144474332732e-06, + "loss": 1.60484638, + "memory(GiB)": 97.12, + "step": 10445, + "train_speed(iter/s)": 1.632959 + }, + { + "acc": 0.65817327, + "epoch": 0.26509386098427196, + "grad_norm": 5.1875, + "learning_rate": 9.814861871190056e-06, + "loss": 1.60427895, + "memory(GiB)": 97.12, + "step": 10450, + "train_speed(iter/s)": 1.633043 + }, + { + "acc": 0.6541358, + "epoch": 0.265220700152207, + "grad_norm": 6.46875, + "learning_rate": 9.814579056268256e-06, + "loss": 1.5771595, + "memory(GiB)": 97.12, + "step": 10455, + "train_speed(iter/s)": 1.633126 + }, + { + "acc": 0.65767775, + "epoch": 0.26534753932014205, + "grad_norm": 5.59375, + "learning_rate": 9.814296029579776e-06, + "loss": 1.61042709, + "memory(GiB)": 97.12, + "step": 10460, + "train_speed(iter/s)": 1.633217 + }, + { + "acc": 0.62915583, + "epoch": 0.26547437848807715, + "grad_norm": 5.21875, + "learning_rate": 9.814012791137063e-06, + "loss": 1.75139866, + "memory(GiB)": 97.12, + "step": 10465, + "train_speed(iter/s)": 1.633303 + }, + { + "acc": 0.64266024, + "epoch": 0.2656012176560122, + "grad_norm": 5.625, + "learning_rate": 9.81372934095257e-06, + "loss": 1.68595734, + "memory(GiB)": 97.12, + "step": 10470, + "train_speed(iter/s)": 1.633381 + }, + { + "acc": 0.62889385, + "epoch": 0.26572805682394723, + "grad_norm": 12.625, + "learning_rate": 9.813445679038773e-06, + "loss": 1.71039658, + "memory(GiB)": 97.12, + "step": 10475, + "train_speed(iter/s)": 1.633468 + }, + { + "acc": 0.63495979, + "epoch": 0.2658548959918823, + "grad_norm": 9.1875, + "learning_rate": 9.813161805408145e-06, + "loss": 1.70312405, + "memory(GiB)": 97.12, + "step": 10480, + "train_speed(iter/s)": 1.633555 + }, + { + "acc": 0.65271301, + "epoch": 0.2659817351598174, + "grad_norm": 5.6875, + "learning_rate": 9.812877720073169e-06, + "loss": 1.64671345, + "memory(GiB)": 97.12, + "step": 10485, + "train_speed(iter/s)": 1.633641 + }, + { + "acc": 0.64352012, + "epoch": 0.2661085743277524, + "grad_norm": 6.65625, + "learning_rate": 9.812593423046344e-06, + "loss": 1.69086628, + "memory(GiB)": 97.12, + "step": 10490, + "train_speed(iter/s)": 1.633727 + }, + { + "acc": 0.63739977, + "epoch": 0.26623541349568747, + "grad_norm": 4.9375, + "learning_rate": 9.812308914340174e-06, + "loss": 1.76656399, + "memory(GiB)": 97.12, + "step": 10495, + "train_speed(iter/s)": 1.633812 + }, + { + "acc": 0.64444456, + "epoch": 0.2663622526636225, + "grad_norm": 6.59375, + "learning_rate": 9.812024193967171e-06, + "loss": 1.70287209, + "memory(GiB)": 97.12, + "step": 10500, + "train_speed(iter/s)": 1.633903 + }, + { + "acc": 0.6491879, + "epoch": 0.2664890918315576, + "grad_norm": 5.46875, + "learning_rate": 9.811739261939861e-06, + "loss": 1.69029388, + "memory(GiB)": 97.12, + "step": 10505, + "train_speed(iter/s)": 1.63399 + }, + { + "acc": 0.65156522, + "epoch": 0.26661593099949266, + "grad_norm": 4.53125, + "learning_rate": 9.811454118270775e-06, + "loss": 1.67729492, + "memory(GiB)": 97.12, + "step": 10510, + "train_speed(iter/s)": 1.634068 + }, + { + "acc": 0.63503504, + "epoch": 0.2667427701674277, + "grad_norm": 5.59375, + "learning_rate": 9.811168762972457e-06, + "loss": 1.6670002, + "memory(GiB)": 97.12, + "step": 10515, + "train_speed(iter/s)": 1.634155 + }, + { + "acc": 0.64255152, + "epoch": 0.26686960933536275, + "grad_norm": 6.90625, + "learning_rate": 9.810883196057454e-06, + "loss": 1.66788483, + "memory(GiB)": 97.12, + "step": 10520, + "train_speed(iter/s)": 1.63424 + }, + { + "acc": 0.65614009, + "epoch": 0.26699644850329785, + "grad_norm": 5.40625, + "learning_rate": 9.81059741753833e-06, + "loss": 1.61547508, + "memory(GiB)": 97.12, + "step": 10525, + "train_speed(iter/s)": 1.634326 + }, + { + "acc": 0.64814548, + "epoch": 0.2671232876712329, + "grad_norm": 5.5625, + "learning_rate": 9.810311427427653e-06, + "loss": 1.64961472, + "memory(GiB)": 97.12, + "step": 10530, + "train_speed(iter/s)": 1.634414 + }, + { + "acc": 0.65829902, + "epoch": 0.26725012683916793, + "grad_norm": 5.03125, + "learning_rate": 9.810025225738005e-06, + "loss": 1.64391594, + "memory(GiB)": 97.12, + "step": 10535, + "train_speed(iter/s)": 1.634502 + }, + { + "acc": 0.61984053, + "epoch": 0.267376966007103, + "grad_norm": 7.28125, + "learning_rate": 9.809738812481971e-06, + "loss": 1.7640873, + "memory(GiB)": 97.12, + "step": 10540, + "train_speed(iter/s)": 1.634585 + }, + { + "acc": 0.64768834, + "epoch": 0.2675038051750381, + "grad_norm": 5.40625, + "learning_rate": 9.809452187672149e-06, + "loss": 1.63494072, + "memory(GiB)": 97.12, + "step": 10545, + "train_speed(iter/s)": 1.634673 + }, + { + "acc": 0.63901339, + "epoch": 0.2676306443429731, + "grad_norm": 4.9375, + "learning_rate": 9.809165351321149e-06, + "loss": 1.66816654, + "memory(GiB)": 97.12, + "step": 10550, + "train_speed(iter/s)": 1.634759 + }, + { + "acc": 0.66669645, + "epoch": 0.26775748351090817, + "grad_norm": 6.5625, + "learning_rate": 9.808878303441585e-06, + "loss": 1.59799433, + "memory(GiB)": 97.12, + "step": 10555, + "train_speed(iter/s)": 1.634845 + }, + { + "acc": 0.65254521, + "epoch": 0.2678843226788432, + "grad_norm": 6.9375, + "learning_rate": 9.808591044046083e-06, + "loss": 1.68372135, + "memory(GiB)": 97.12, + "step": 10560, + "train_speed(iter/s)": 1.634929 + }, + { + "acc": 0.65123339, + "epoch": 0.2680111618467783, + "grad_norm": 5.96875, + "learning_rate": 9.808303573147277e-06, + "loss": 1.66323242, + "memory(GiB)": 97.17, + "step": 10565, + "train_speed(iter/s)": 1.635007 + }, + { + "acc": 0.65460091, + "epoch": 0.26813800101471336, + "grad_norm": 6.46875, + "learning_rate": 9.808015890757812e-06, + "loss": 1.63129311, + "memory(GiB)": 97.17, + "step": 10570, + "train_speed(iter/s)": 1.635097 + }, + { + "acc": 0.65706034, + "epoch": 0.2682648401826484, + "grad_norm": 6.0, + "learning_rate": 9.807727996890343e-06, + "loss": 1.60857773, + "memory(GiB)": 97.17, + "step": 10575, + "train_speed(iter/s)": 1.635179 + }, + { + "acc": 0.64872966, + "epoch": 0.26839167935058345, + "grad_norm": 5.84375, + "learning_rate": 9.807439891557533e-06, + "loss": 1.64379444, + "memory(GiB)": 97.17, + "step": 10580, + "train_speed(iter/s)": 1.635264 + }, + { + "acc": 0.63933663, + "epoch": 0.26851851851851855, + "grad_norm": 6.1875, + "learning_rate": 9.80715157477205e-06, + "loss": 1.70501518, + "memory(GiB)": 97.17, + "step": 10585, + "train_speed(iter/s)": 1.635347 + }, + { + "acc": 0.64208035, + "epoch": 0.2686453576864536, + "grad_norm": 5.09375, + "learning_rate": 9.806863046546581e-06, + "loss": 1.69165344, + "memory(GiB)": 97.17, + "step": 10590, + "train_speed(iter/s)": 1.635432 + }, + { + "acc": 0.64665227, + "epoch": 0.26877219685438863, + "grad_norm": 4.6875, + "learning_rate": 9.806574306893814e-06, + "loss": 1.66041985, + "memory(GiB)": 97.17, + "step": 10595, + "train_speed(iter/s)": 1.635518 + }, + { + "acc": 0.66367922, + "epoch": 0.2688990360223237, + "grad_norm": 5.8125, + "learning_rate": 9.806285355826447e-06, + "loss": 1.60019341, + "memory(GiB)": 97.17, + "step": 10600, + "train_speed(iter/s)": 1.635606 + }, + { + "acc": 0.66562657, + "epoch": 0.2690258751902588, + "grad_norm": 6.0625, + "learning_rate": 9.805996193357194e-06, + "loss": 1.56760168, + "memory(GiB)": 97.17, + "step": 10605, + "train_speed(iter/s)": 1.635693 + }, + { + "acc": 0.62523212, + "epoch": 0.2691527143581938, + "grad_norm": 5.65625, + "learning_rate": 9.80570681949877e-06, + "loss": 1.68235588, + "memory(GiB)": 97.17, + "step": 10610, + "train_speed(iter/s)": 1.635779 + }, + { + "acc": 0.64439225, + "epoch": 0.26927955352612887, + "grad_norm": 5.21875, + "learning_rate": 9.805417234263905e-06, + "loss": 1.65805702, + "memory(GiB)": 97.17, + "step": 10615, + "train_speed(iter/s)": 1.635869 + }, + { + "acc": 0.63883467, + "epoch": 0.2694063926940639, + "grad_norm": 5.15625, + "learning_rate": 9.805127437665333e-06, + "loss": 1.75823822, + "memory(GiB)": 97.17, + "step": 10620, + "train_speed(iter/s)": 1.635957 + }, + { + "acc": 0.63620129, + "epoch": 0.269533231861999, + "grad_norm": 5.0625, + "learning_rate": 9.804837429715805e-06, + "loss": 1.70582237, + "memory(GiB)": 97.17, + "step": 10625, + "train_speed(iter/s)": 1.636039 + }, + { + "acc": 0.66414537, + "epoch": 0.26966007102993406, + "grad_norm": 5.09375, + "learning_rate": 9.804547210428074e-06, + "loss": 1.61211872, + "memory(GiB)": 97.17, + "step": 10630, + "train_speed(iter/s)": 1.636122 + }, + { + "acc": 0.63372812, + "epoch": 0.2697869101978691, + "grad_norm": 5.90625, + "learning_rate": 9.804256779814906e-06, + "loss": 1.69291878, + "memory(GiB)": 97.17, + "step": 10635, + "train_speed(iter/s)": 1.636209 + }, + { + "acc": 0.6433568, + "epoch": 0.26991374936580415, + "grad_norm": 5.375, + "learning_rate": 9.803966137889076e-06, + "loss": 1.6116333, + "memory(GiB)": 97.17, + "step": 10640, + "train_speed(iter/s)": 1.636287 + }, + { + "acc": 0.63128929, + "epoch": 0.27004058853373925, + "grad_norm": 5.75, + "learning_rate": 9.803675284663368e-06, + "loss": 1.72813683, + "memory(GiB)": 97.17, + "step": 10645, + "train_speed(iter/s)": 1.636373 + }, + { + "acc": 0.65008564, + "epoch": 0.2701674277016743, + "grad_norm": 6.28125, + "learning_rate": 9.803384220150571e-06, + "loss": 1.68646221, + "memory(GiB)": 97.17, + "step": 10650, + "train_speed(iter/s)": 1.636463 + }, + { + "acc": 0.63832312, + "epoch": 0.27029426686960933, + "grad_norm": 7.0, + "learning_rate": 9.803092944363493e-06, + "loss": 1.7142189, + "memory(GiB)": 97.17, + "step": 10655, + "train_speed(iter/s)": 1.636553 + }, + { + "acc": 0.66242409, + "epoch": 0.2704211060375444, + "grad_norm": 6.4375, + "learning_rate": 9.802801457314943e-06, + "loss": 1.61483803, + "memory(GiB)": 97.17, + "step": 10660, + "train_speed(iter/s)": 1.636643 + }, + { + "acc": 0.65640869, + "epoch": 0.2705479452054795, + "grad_norm": 7.3125, + "learning_rate": 9.802509759017741e-06, + "loss": 1.60437355, + "memory(GiB)": 97.17, + "step": 10665, + "train_speed(iter/s)": 1.636727 + }, + { + "acc": 0.63736796, + "epoch": 0.2706747843734145, + "grad_norm": 4.46875, + "learning_rate": 9.802217849484719e-06, + "loss": 1.72520905, + "memory(GiB)": 97.17, + "step": 10670, + "train_speed(iter/s)": 1.636814 + }, + { + "acc": 0.64779425, + "epoch": 0.27080162354134957, + "grad_norm": 6.1875, + "learning_rate": 9.801925728728715e-06, + "loss": 1.64617748, + "memory(GiB)": 97.17, + "step": 10675, + "train_speed(iter/s)": 1.6369 + }, + { + "acc": 0.6488193, + "epoch": 0.2709284627092846, + "grad_norm": 5.21875, + "learning_rate": 9.801633396762577e-06, + "loss": 1.66686363, + "memory(GiB)": 97.17, + "step": 10680, + "train_speed(iter/s)": 1.636983 + }, + { + "acc": 0.65254855, + "epoch": 0.2710553018772197, + "grad_norm": 6.1875, + "learning_rate": 9.801340853599167e-06, + "loss": 1.7029892, + "memory(GiB)": 97.17, + "step": 10685, + "train_speed(iter/s)": 1.637065 + }, + { + "acc": 0.65923319, + "epoch": 0.27118214104515476, + "grad_norm": 7.09375, + "learning_rate": 9.801048099251348e-06, + "loss": 1.60697441, + "memory(GiB)": 97.17, + "step": 10690, + "train_speed(iter/s)": 1.637149 + }, + { + "acc": 0.64430847, + "epoch": 0.2713089802130898, + "grad_norm": 5.125, + "learning_rate": 9.800755133731999e-06, + "loss": 1.6458149, + "memory(GiB)": 97.17, + "step": 10695, + "train_speed(iter/s)": 1.637232 + }, + { + "acc": 0.65353289, + "epoch": 0.27143581938102485, + "grad_norm": 5.5, + "learning_rate": 9.800461957054006e-06, + "loss": 1.63329773, + "memory(GiB)": 97.17, + "step": 10700, + "train_speed(iter/s)": 1.637312 + }, + { + "acc": 0.65184431, + "epoch": 0.27156265854895995, + "grad_norm": 5.9375, + "learning_rate": 9.800168569230261e-06, + "loss": 1.61327991, + "memory(GiB)": 97.17, + "step": 10705, + "train_speed(iter/s)": 1.637394 + }, + { + "acc": 0.65758886, + "epoch": 0.271689497716895, + "grad_norm": 5.125, + "learning_rate": 9.799874970273674e-06, + "loss": 1.59366589, + "memory(GiB)": 97.17, + "step": 10710, + "train_speed(iter/s)": 1.637476 + }, + { + "acc": 0.65175881, + "epoch": 0.27181633688483003, + "grad_norm": 5.0625, + "learning_rate": 9.799581160197156e-06, + "loss": 1.65314426, + "memory(GiB)": 97.17, + "step": 10715, + "train_speed(iter/s)": 1.63756 + }, + { + "acc": 0.65863752, + "epoch": 0.2719431760527651, + "grad_norm": 4.34375, + "learning_rate": 9.799287139013628e-06, + "loss": 1.59951286, + "memory(GiB)": 97.17, + "step": 10720, + "train_speed(iter/s)": 1.637634 + }, + { + "acc": 0.66159692, + "epoch": 0.2720700152207002, + "grad_norm": 5.53125, + "learning_rate": 9.798992906736028e-06, + "loss": 1.56686249, + "memory(GiB)": 97.17, + "step": 10725, + "train_speed(iter/s)": 1.637716 + }, + { + "acc": 0.64639587, + "epoch": 0.2721968543886352, + "grad_norm": 5.34375, + "learning_rate": 9.79869846337729e-06, + "loss": 1.67068996, + "memory(GiB)": 97.17, + "step": 10730, + "train_speed(iter/s)": 1.637797 + }, + { + "acc": 0.64133353, + "epoch": 0.27232369355657027, + "grad_norm": 6.40625, + "learning_rate": 9.79840380895037e-06, + "loss": 1.66402054, + "memory(GiB)": 97.17, + "step": 10735, + "train_speed(iter/s)": 1.637874 + }, + { + "acc": 0.65808792, + "epoch": 0.2724505327245053, + "grad_norm": 5.125, + "learning_rate": 9.798108943468228e-06, + "loss": 1.55847158, + "memory(GiB)": 97.17, + "step": 10740, + "train_speed(iter/s)": 1.637956 + }, + { + "acc": 0.645502, + "epoch": 0.2725773718924404, + "grad_norm": 4.96875, + "learning_rate": 9.797813866943832e-06, + "loss": 1.61261215, + "memory(GiB)": 97.17, + "step": 10745, + "train_speed(iter/s)": 1.638035 + }, + { + "acc": 0.6569036, + "epoch": 0.27270421106037546, + "grad_norm": 6.75, + "learning_rate": 9.797518579390162e-06, + "loss": 1.52561388, + "memory(GiB)": 97.17, + "step": 10750, + "train_speed(iter/s)": 1.638118 + }, + { + "acc": 0.65607042, + "epoch": 0.2728310502283105, + "grad_norm": 4.5625, + "learning_rate": 9.797223080820204e-06, + "loss": 1.63400593, + "memory(GiB)": 97.17, + "step": 10755, + "train_speed(iter/s)": 1.638192 + }, + { + "acc": 0.64432902, + "epoch": 0.27295788939624555, + "grad_norm": 5.5625, + "learning_rate": 9.796927371246958e-06, + "loss": 1.6504652, + "memory(GiB)": 97.17, + "step": 10760, + "train_speed(iter/s)": 1.638276 + }, + { + "acc": 0.64416261, + "epoch": 0.27308472856418065, + "grad_norm": 5.34375, + "learning_rate": 9.796631450683431e-06, + "loss": 1.63592777, + "memory(GiB)": 97.17, + "step": 10765, + "train_speed(iter/s)": 1.638359 + }, + { + "acc": 0.63731375, + "epoch": 0.2732115677321157, + "grad_norm": 5.28125, + "learning_rate": 9.796335319142637e-06, + "loss": 1.64401073, + "memory(GiB)": 97.17, + "step": 10770, + "train_speed(iter/s)": 1.638439 + }, + { + "acc": 0.64967432, + "epoch": 0.27333840690005073, + "grad_norm": 5.90625, + "learning_rate": 9.796038976637599e-06, + "loss": 1.63275318, + "memory(GiB)": 97.17, + "step": 10775, + "train_speed(iter/s)": 1.63852 + }, + { + "acc": 0.65172482, + "epoch": 0.2734652460679858, + "grad_norm": 5.3125, + "learning_rate": 9.795742423181355e-06, + "loss": 1.6356472, + "memory(GiB)": 97.17, + "step": 10780, + "train_speed(iter/s)": 1.638603 + }, + { + "acc": 0.63355989, + "epoch": 0.2735920852359209, + "grad_norm": 5.3125, + "learning_rate": 9.795445658786948e-06, + "loss": 1.74606743, + "memory(GiB)": 97.17, + "step": 10785, + "train_speed(iter/s)": 1.638681 + }, + { + "acc": 0.65085659, + "epoch": 0.2737189244038559, + "grad_norm": 5.09375, + "learning_rate": 9.795148683467431e-06, + "loss": 1.64198532, + "memory(GiB)": 97.17, + "step": 10790, + "train_speed(iter/s)": 1.63876 + }, + { + "acc": 0.64056921, + "epoch": 0.27384576357179097, + "grad_norm": 5.5, + "learning_rate": 9.794851497235866e-06, + "loss": 1.66883278, + "memory(GiB)": 97.17, + "step": 10795, + "train_speed(iter/s)": 1.63884 + }, + { + "acc": 0.63935785, + "epoch": 0.273972602739726, + "grad_norm": 6.21875, + "learning_rate": 9.794554100105325e-06, + "loss": 1.64258156, + "memory(GiB)": 97.17, + "step": 10800, + "train_speed(iter/s)": 1.63892 + }, + { + "acc": 0.6495461, + "epoch": 0.2740994419076611, + "grad_norm": 4.84375, + "learning_rate": 9.794256492088888e-06, + "loss": 1.6080349, + "memory(GiB)": 97.17, + "step": 10805, + "train_speed(iter/s)": 1.639002 + }, + { + "acc": 0.64967103, + "epoch": 0.27422628107559616, + "grad_norm": 6.09375, + "learning_rate": 9.793958673199647e-06, + "loss": 1.68081093, + "memory(GiB)": 97.17, + "step": 10810, + "train_speed(iter/s)": 1.639082 + }, + { + "acc": 0.6587369, + "epoch": 0.2743531202435312, + "grad_norm": 5.0, + "learning_rate": 9.793660643450697e-06, + "loss": 1.61782837, + "memory(GiB)": 97.17, + "step": 10815, + "train_speed(iter/s)": 1.63916 + }, + { + "acc": 0.64281974, + "epoch": 0.27447995941146625, + "grad_norm": 5.53125, + "learning_rate": 9.793362402855152e-06, + "loss": 1.64244423, + "memory(GiB)": 97.17, + "step": 10820, + "train_speed(iter/s)": 1.639235 + }, + { + "acc": 0.64013367, + "epoch": 0.27460679857940135, + "grad_norm": 5.5, + "learning_rate": 9.79306395142613e-06, + "loss": 1.63845024, + "memory(GiB)": 97.17, + "step": 10825, + "train_speed(iter/s)": 1.639315 + }, + { + "acc": 0.65530014, + "epoch": 0.2747336377473364, + "grad_norm": 4.625, + "learning_rate": 9.792765289176751e-06, + "loss": 1.60434837, + "memory(GiB)": 97.17, + "step": 10830, + "train_speed(iter/s)": 1.639391 + }, + { + "acc": 0.64279089, + "epoch": 0.27486047691527143, + "grad_norm": 5.34375, + "learning_rate": 9.79246641612016e-06, + "loss": 1.63921108, + "memory(GiB)": 97.17, + "step": 10835, + "train_speed(iter/s)": 1.639471 + }, + { + "acc": 0.64567795, + "epoch": 0.2749873160832065, + "grad_norm": 5.1875, + "learning_rate": 9.792167332269498e-06, + "loss": 1.62604542, + "memory(GiB)": 97.17, + "step": 10840, + "train_speed(iter/s)": 1.63955 + }, + { + "acc": 0.65950603, + "epoch": 0.2751141552511416, + "grad_norm": 4.84375, + "learning_rate": 9.791868037637922e-06, + "loss": 1.61371078, + "memory(GiB)": 97.17, + "step": 10845, + "train_speed(iter/s)": 1.639627 + }, + { + "acc": 0.64927797, + "epoch": 0.2752409944190766, + "grad_norm": 5.90625, + "learning_rate": 9.791568532238594e-06, + "loss": 1.66386395, + "memory(GiB)": 97.17, + "step": 10850, + "train_speed(iter/s)": 1.639705 + }, + { + "acc": 0.65427227, + "epoch": 0.27536783358701167, + "grad_norm": 5.75, + "learning_rate": 9.79126881608469e-06, + "loss": 1.66656418, + "memory(GiB)": 97.17, + "step": 10855, + "train_speed(iter/s)": 1.63978 + }, + { + "acc": 0.64983964, + "epoch": 0.2754946727549467, + "grad_norm": 6.15625, + "learning_rate": 9.790968889189392e-06, + "loss": 1.64085808, + "memory(GiB)": 97.17, + "step": 10860, + "train_speed(iter/s)": 1.639858 + }, + { + "acc": 0.65658627, + "epoch": 0.2756215119228818, + "grad_norm": 6.1875, + "learning_rate": 9.790668751565893e-06, + "loss": 1.65282135, + "memory(GiB)": 97.17, + "step": 10865, + "train_speed(iter/s)": 1.639937 + }, + { + "acc": 0.65698934, + "epoch": 0.27574835109081686, + "grad_norm": 4.875, + "learning_rate": 9.790368403227391e-06, + "loss": 1.67522392, + "memory(GiB)": 97.17, + "step": 10870, + "train_speed(iter/s)": 1.640015 + }, + { + "acc": 0.64279351, + "epoch": 0.2758751902587519, + "grad_norm": 4.65625, + "learning_rate": 9.7900678441871e-06, + "loss": 1.60776596, + "memory(GiB)": 97.17, + "step": 10875, + "train_speed(iter/s)": 1.640092 + }, + { + "acc": 0.64378433, + "epoch": 0.27600202942668695, + "grad_norm": 5.65625, + "learning_rate": 9.78976707445824e-06, + "loss": 1.625037, + "memory(GiB)": 97.17, + "step": 10880, + "train_speed(iter/s)": 1.640171 + }, + { + "acc": 0.65321908, + "epoch": 0.27612886859462205, + "grad_norm": 6.1875, + "learning_rate": 9.78946609405404e-06, + "loss": 1.65869293, + "memory(GiB)": 97.17, + "step": 10885, + "train_speed(iter/s)": 1.640242 + }, + { + "acc": 0.66312265, + "epoch": 0.2762557077625571, + "grad_norm": 8.875, + "learning_rate": 9.789164902987738e-06, + "loss": 1.61773643, + "memory(GiB)": 97.17, + "step": 10890, + "train_speed(iter/s)": 1.640322 + }, + { + "acc": 0.66834092, + "epoch": 0.27638254693049213, + "grad_norm": 5.03125, + "learning_rate": 9.78886350127258e-06, + "loss": 1.59298992, + "memory(GiB)": 97.17, + "step": 10895, + "train_speed(iter/s)": 1.640399 + }, + { + "acc": 0.64224973, + "epoch": 0.2765093860984272, + "grad_norm": 5.3125, + "learning_rate": 9.788561888921825e-06, + "loss": 1.69609795, + "memory(GiB)": 97.17, + "step": 10900, + "train_speed(iter/s)": 1.64048 + }, + { + "acc": 0.66138825, + "epoch": 0.2766362252663623, + "grad_norm": 6.625, + "learning_rate": 9.788260065948738e-06, + "loss": 1.65636482, + "memory(GiB)": 97.17, + "step": 10905, + "train_speed(iter/s)": 1.640551 + }, + { + "acc": 0.62622252, + "epoch": 0.2767630644342973, + "grad_norm": 6.03125, + "learning_rate": 9.787958032366596e-06, + "loss": 1.71186295, + "memory(GiB)": 97.17, + "step": 10910, + "train_speed(iter/s)": 1.640623 + }, + { + "acc": 0.6585865, + "epoch": 0.27688990360223237, + "grad_norm": 6.0, + "learning_rate": 9.787655788188684e-06, + "loss": 1.58614922, + "memory(GiB)": 97.17, + "step": 10915, + "train_speed(iter/s)": 1.640697 + }, + { + "acc": 0.66387935, + "epoch": 0.2770167427701674, + "grad_norm": 5.53125, + "learning_rate": 9.787353333428293e-06, + "loss": 1.54813852, + "memory(GiB)": 97.17, + "step": 10920, + "train_speed(iter/s)": 1.640774 + }, + { + "acc": 0.64432573, + "epoch": 0.2771435819381025, + "grad_norm": 6.09375, + "learning_rate": 9.78705066809873e-06, + "loss": 1.6542614, + "memory(GiB)": 97.17, + "step": 10925, + "train_speed(iter/s)": 1.640848 + }, + { + "acc": 0.63740487, + "epoch": 0.27727042110603756, + "grad_norm": 6.0, + "learning_rate": 9.786747792213304e-06, + "loss": 1.72560329, + "memory(GiB)": 97.17, + "step": 10930, + "train_speed(iter/s)": 1.640921 + }, + { + "acc": 0.65823984, + "epoch": 0.2773972602739726, + "grad_norm": 6.1875, + "learning_rate": 9.78644470578534e-06, + "loss": 1.62464523, + "memory(GiB)": 97.17, + "step": 10935, + "train_speed(iter/s)": 1.640998 + }, + { + "acc": 0.63928123, + "epoch": 0.27752409944190765, + "grad_norm": 5.09375, + "learning_rate": 9.78614140882817e-06, + "loss": 1.72192078, + "memory(GiB)": 97.17, + "step": 10940, + "train_speed(iter/s)": 1.641074 + }, + { + "acc": 0.66831856, + "epoch": 0.27765093860984275, + "grad_norm": 5.25, + "learning_rate": 9.78583790135513e-06, + "loss": 1.58727722, + "memory(GiB)": 97.17, + "step": 10945, + "train_speed(iter/s)": 1.641145 + }, + { + "acc": 0.65687213, + "epoch": 0.2777777777777778, + "grad_norm": 5.3125, + "learning_rate": 9.785534183379571e-06, + "loss": 1.6255949, + "memory(GiB)": 97.17, + "step": 10950, + "train_speed(iter/s)": 1.641223 + }, + { + "acc": 0.64482126, + "epoch": 0.27790461694571283, + "grad_norm": 5.25, + "learning_rate": 9.785230254914855e-06, + "loss": 1.68952808, + "memory(GiB)": 97.17, + "step": 10955, + "train_speed(iter/s)": 1.641295 + }, + { + "acc": 0.64171743, + "epoch": 0.2780314561136479, + "grad_norm": 5.0625, + "learning_rate": 9.784926115974346e-06, + "loss": 1.67399292, + "memory(GiB)": 97.17, + "step": 10960, + "train_speed(iter/s)": 1.641376 + }, + { + "acc": 0.65712643, + "epoch": 0.278158295281583, + "grad_norm": 4.9375, + "learning_rate": 9.784621766571424e-06, + "loss": 1.63215904, + "memory(GiB)": 97.17, + "step": 10965, + "train_speed(iter/s)": 1.641448 + }, + { + "acc": 0.64262872, + "epoch": 0.278285134449518, + "grad_norm": 5.5, + "learning_rate": 9.784317206719475e-06, + "loss": 1.73264351, + "memory(GiB)": 97.17, + "step": 10970, + "train_speed(iter/s)": 1.64153 + }, + { + "acc": 0.64218521, + "epoch": 0.27841197361745307, + "grad_norm": 4.59375, + "learning_rate": 9.784012436431896e-06, + "loss": 1.66874371, + "memory(GiB)": 97.17, + "step": 10975, + "train_speed(iter/s)": 1.641607 + }, + { + "acc": 0.65534229, + "epoch": 0.2785388127853881, + "grad_norm": 4.9375, + "learning_rate": 9.78370745572209e-06, + "loss": 1.66249256, + "memory(GiB)": 97.17, + "step": 10980, + "train_speed(iter/s)": 1.641686 + }, + { + "acc": 0.64304752, + "epoch": 0.2786656519533232, + "grad_norm": 6.4375, + "learning_rate": 9.783402264603471e-06, + "loss": 1.6038517, + "memory(GiB)": 97.17, + "step": 10985, + "train_speed(iter/s)": 1.641764 + }, + { + "acc": 0.64331083, + "epoch": 0.27879249112125826, + "grad_norm": 4.6875, + "learning_rate": 9.783096863089465e-06, + "loss": 1.6379612, + "memory(GiB)": 97.17, + "step": 10990, + "train_speed(iter/s)": 1.641839 + }, + { + "acc": 0.66518965, + "epoch": 0.2789193302891933, + "grad_norm": 5.875, + "learning_rate": 9.782791251193505e-06, + "loss": 1.51117487, + "memory(GiB)": 97.17, + "step": 10995, + "train_speed(iter/s)": 1.641917 + }, + { + "acc": 0.6459259, + "epoch": 0.27904616945712835, + "grad_norm": 4.59375, + "learning_rate": 9.782485428929032e-06, + "loss": 1.63293686, + "memory(GiB)": 97.17, + "step": 11000, + "train_speed(iter/s)": 1.641992 + }, + { + "epoch": 0.27904616945712835, + "eval_acc": 0.6396231025128587, + "eval_loss": 1.6118441820144653, + "eval_runtime": 58.6012, + "eval_samples_per_second": 108.701, + "eval_steps_per_second": 27.184, + "step": 11000 + }, + { + "acc": 0.66373024, + "epoch": 0.27917300862506345, + "grad_norm": 5.65625, + "learning_rate": 9.782179396309496e-06, + "loss": 1.65504322, + "memory(GiB)": 97.17, + "step": 11005, + "train_speed(iter/s)": 1.626816 + }, + { + "acc": 0.63609104, + "epoch": 0.2792998477929985, + "grad_norm": 4.78125, + "learning_rate": 9.78187315334836e-06, + "loss": 1.66823139, + "memory(GiB)": 97.17, + "step": 11010, + "train_speed(iter/s)": 1.626897 + }, + { + "acc": 0.64733672, + "epoch": 0.27942668696093353, + "grad_norm": 5.875, + "learning_rate": 9.781566700059094e-06, + "loss": 1.65000801, + "memory(GiB)": 97.17, + "step": 11015, + "train_speed(iter/s)": 1.626978 + }, + { + "acc": 0.64460173, + "epoch": 0.2795535261288686, + "grad_norm": 7.34375, + "learning_rate": 9.781260036455176e-06, + "loss": 1.71171131, + "memory(GiB)": 97.17, + "step": 11020, + "train_speed(iter/s)": 1.627058 + }, + { + "acc": 0.66845303, + "epoch": 0.2796803652968037, + "grad_norm": 4.875, + "learning_rate": 9.780953162550093e-06, + "loss": 1.59873047, + "memory(GiB)": 97.17, + "step": 11025, + "train_speed(iter/s)": 1.627134 + }, + { + "acc": 0.64273496, + "epoch": 0.2798072044647387, + "grad_norm": 5.78125, + "learning_rate": 9.780646078357346e-06, + "loss": 1.65467281, + "memory(GiB)": 97.17, + "step": 11030, + "train_speed(iter/s)": 1.627211 + }, + { + "acc": 0.65384021, + "epoch": 0.27993404363267377, + "grad_norm": 5.5, + "learning_rate": 9.78033878389044e-06, + "loss": 1.6958107, + "memory(GiB)": 97.17, + "step": 11035, + "train_speed(iter/s)": 1.627297 + }, + { + "acc": 0.65496006, + "epoch": 0.2800608828006088, + "grad_norm": 6.34375, + "learning_rate": 9.780031279162892e-06, + "loss": 1.66777267, + "memory(GiB)": 97.17, + "step": 11040, + "train_speed(iter/s)": 1.627372 + }, + { + "acc": 0.66064839, + "epoch": 0.2801877219685439, + "grad_norm": 8.4375, + "learning_rate": 9.779723564188228e-06, + "loss": 1.67831955, + "memory(GiB)": 97.17, + "step": 11045, + "train_speed(iter/s)": 1.627458 + }, + { + "acc": 0.64634714, + "epoch": 0.28031456113647896, + "grad_norm": 6.59375, + "learning_rate": 9.77941563897998e-06, + "loss": 1.67265205, + "memory(GiB)": 97.17, + "step": 11050, + "train_speed(iter/s)": 1.62754 + }, + { + "acc": 0.65311151, + "epoch": 0.280441400304414, + "grad_norm": 5.125, + "learning_rate": 9.779107503551695e-06, + "loss": 1.66472244, + "memory(GiB)": 97.17, + "step": 11055, + "train_speed(iter/s)": 1.627623 + }, + { + "acc": 0.66278992, + "epoch": 0.28056823947234905, + "grad_norm": 6.46875, + "learning_rate": 9.778799157916926e-06, + "loss": 1.61254082, + "memory(GiB)": 97.17, + "step": 11060, + "train_speed(iter/s)": 1.627701 + }, + { + "acc": 0.64478445, + "epoch": 0.28069507864028415, + "grad_norm": 4.65625, + "learning_rate": 9.77849060208923e-06, + "loss": 1.64988365, + "memory(GiB)": 97.17, + "step": 11065, + "train_speed(iter/s)": 1.627753 + }, + { + "acc": 0.63668852, + "epoch": 0.2808219178082192, + "grad_norm": 4.90625, + "learning_rate": 9.778181836082185e-06, + "loss": 1.68410778, + "memory(GiB)": 97.17, + "step": 11070, + "train_speed(iter/s)": 1.627832 + }, + { + "acc": 0.65798497, + "epoch": 0.28094875697615423, + "grad_norm": 6.40625, + "learning_rate": 9.777872859909373e-06, + "loss": 1.63516426, + "memory(GiB)": 97.17, + "step": 11075, + "train_speed(iter/s)": 1.62791 + }, + { + "acc": 0.67312264, + "epoch": 0.2810755961440893, + "grad_norm": 6.0, + "learning_rate": 9.777563673584376e-06, + "loss": 1.5473176, + "memory(GiB)": 97.17, + "step": 11080, + "train_speed(iter/s)": 1.62799 + }, + { + "acc": 0.65361595, + "epoch": 0.2812024353120244, + "grad_norm": 5.9375, + "learning_rate": 9.777254277120801e-06, + "loss": 1.60058556, + "memory(GiB)": 97.17, + "step": 11085, + "train_speed(iter/s)": 1.628067 + }, + { + "acc": 0.66371036, + "epoch": 0.2813292744799594, + "grad_norm": 5.125, + "learning_rate": 9.776944670532253e-06, + "loss": 1.62192726, + "memory(GiB)": 97.17, + "step": 11090, + "train_speed(iter/s)": 1.628151 + }, + { + "acc": 0.65502048, + "epoch": 0.28145611364789447, + "grad_norm": 5.96875, + "learning_rate": 9.776634853832352e-06, + "loss": 1.6127655, + "memory(GiB)": 97.17, + "step": 11095, + "train_speed(iter/s)": 1.628232 + }, + { + "acc": 0.65877752, + "epoch": 0.2815829528158295, + "grad_norm": 6.21875, + "learning_rate": 9.776324827034724e-06, + "loss": 1.62296791, + "memory(GiB)": 97.17, + "step": 11100, + "train_speed(iter/s)": 1.628309 + }, + { + "acc": 0.64850435, + "epoch": 0.2817097919837646, + "grad_norm": 4.78125, + "learning_rate": 9.776014590153005e-06, + "loss": 1.64487839, + "memory(GiB)": 97.17, + "step": 11105, + "train_speed(iter/s)": 1.628393 + }, + { + "acc": 0.66627798, + "epoch": 0.28183663115169966, + "grad_norm": 6.1875, + "learning_rate": 9.77570414320084e-06, + "loss": 1.59494495, + "memory(GiB)": 97.17, + "step": 11110, + "train_speed(iter/s)": 1.628473 + }, + { + "acc": 0.65642114, + "epoch": 0.2819634703196347, + "grad_norm": 6.03125, + "learning_rate": 9.775393486191884e-06, + "loss": 1.61409416, + "memory(GiB)": 97.17, + "step": 11115, + "train_speed(iter/s)": 1.628541 + }, + { + "acc": 0.64382234, + "epoch": 0.28209030948756975, + "grad_norm": 6.5, + "learning_rate": 9.775082619139805e-06, + "loss": 1.67371445, + "memory(GiB)": 97.17, + "step": 11120, + "train_speed(iter/s)": 1.628624 + }, + { + "acc": 0.6633831, + "epoch": 0.28221714865550485, + "grad_norm": 6.21875, + "learning_rate": 9.77477154205827e-06, + "loss": 1.60559349, + "memory(GiB)": 97.17, + "step": 11125, + "train_speed(iter/s)": 1.628711 + }, + { + "acc": 0.64146528, + "epoch": 0.2823439878234399, + "grad_norm": 5.4375, + "learning_rate": 9.774460254960968e-06, + "loss": 1.66024208, + "memory(GiB)": 97.17, + "step": 11130, + "train_speed(iter/s)": 1.628797 + }, + { + "acc": 0.660462, + "epoch": 0.28247082699137493, + "grad_norm": 5.96875, + "learning_rate": 9.774148757861584e-06, + "loss": 1.57980824, + "memory(GiB)": 97.17, + "step": 11135, + "train_speed(iter/s)": 1.628877 + }, + { + "acc": 0.64353399, + "epoch": 0.28259766615931, + "grad_norm": 6.40625, + "learning_rate": 9.773837050773824e-06, + "loss": 1.6802948, + "memory(GiB)": 97.17, + "step": 11140, + "train_speed(iter/s)": 1.628959 + }, + { + "acc": 0.63185968, + "epoch": 0.2827245053272451, + "grad_norm": 5.8125, + "learning_rate": 9.773525133711399e-06, + "loss": 1.69713783, + "memory(GiB)": 97.17, + "step": 11145, + "train_speed(iter/s)": 1.629041 + }, + { + "acc": 0.65354457, + "epoch": 0.2828513444951801, + "grad_norm": 5.09375, + "learning_rate": 9.773213006688024e-06, + "loss": 1.68390293, + "memory(GiB)": 97.17, + "step": 11150, + "train_speed(iter/s)": 1.62912 + }, + { + "acc": 0.63266058, + "epoch": 0.28297818366311517, + "grad_norm": 5.9375, + "learning_rate": 9.77290066971743e-06, + "loss": 1.74093666, + "memory(GiB)": 97.17, + "step": 11155, + "train_speed(iter/s)": 1.629202 + }, + { + "acc": 0.64206438, + "epoch": 0.2831050228310502, + "grad_norm": 4.75, + "learning_rate": 9.772588122813358e-06, + "loss": 1.66956558, + "memory(GiB)": 97.17, + "step": 11160, + "train_speed(iter/s)": 1.629282 + }, + { + "acc": 0.66281791, + "epoch": 0.2832318619989853, + "grad_norm": 6.03125, + "learning_rate": 9.772275365989548e-06, + "loss": 1.65986023, + "memory(GiB)": 97.17, + "step": 11165, + "train_speed(iter/s)": 1.629362 + }, + { + "acc": 0.66160192, + "epoch": 0.28335870116692036, + "grad_norm": 5.625, + "learning_rate": 9.771962399259764e-06, + "loss": 1.6244503, + "memory(GiB)": 97.17, + "step": 11170, + "train_speed(iter/s)": 1.629443 + }, + { + "acc": 0.66339049, + "epoch": 0.2834855403348554, + "grad_norm": 5.1875, + "learning_rate": 9.771649222637767e-06, + "loss": 1.55549812, + "memory(GiB)": 97.17, + "step": 11175, + "train_speed(iter/s)": 1.629523 + }, + { + "acc": 0.62669697, + "epoch": 0.28361237950279045, + "grad_norm": 6.375, + "learning_rate": 9.771335836137332e-06, + "loss": 1.69616547, + "memory(GiB)": 97.17, + "step": 11180, + "train_speed(iter/s)": 1.629605 + }, + { + "acc": 0.65840244, + "epoch": 0.28373921867072555, + "grad_norm": 5.28125, + "learning_rate": 9.771022239772248e-06, + "loss": 1.5628808, + "memory(GiB)": 97.17, + "step": 11185, + "train_speed(iter/s)": 1.629681 + }, + { + "acc": 0.64685392, + "epoch": 0.2838660578386606, + "grad_norm": 5.84375, + "learning_rate": 9.770708433556302e-06, + "loss": 1.69844799, + "memory(GiB)": 97.17, + "step": 11190, + "train_speed(iter/s)": 1.629762 + }, + { + "acc": 0.64578285, + "epoch": 0.28399289700659563, + "grad_norm": 7.0, + "learning_rate": 9.7703944175033e-06, + "loss": 1.63361492, + "memory(GiB)": 97.17, + "step": 11195, + "train_speed(iter/s)": 1.629846 + }, + { + "acc": 0.65484905, + "epoch": 0.2841197361745307, + "grad_norm": 5.28125, + "learning_rate": 9.770080191627054e-06, + "loss": 1.60732327, + "memory(GiB)": 97.17, + "step": 11200, + "train_speed(iter/s)": 1.629927 + }, + { + "acc": 0.65470614, + "epoch": 0.2842465753424658, + "grad_norm": 4.71875, + "learning_rate": 9.769765755941383e-06, + "loss": 1.62965069, + "memory(GiB)": 97.17, + "step": 11205, + "train_speed(iter/s)": 1.630006 + }, + { + "acc": 0.64792995, + "epoch": 0.2843734145104008, + "grad_norm": 5.8125, + "learning_rate": 9.76945111046012e-06, + "loss": 1.63702583, + "memory(GiB)": 97.17, + "step": 11210, + "train_speed(iter/s)": 1.630085 + }, + { + "acc": 0.63567381, + "epoch": 0.28450025367833587, + "grad_norm": 5.6875, + "learning_rate": 9.769136255197103e-06, + "loss": 1.73173981, + "memory(GiB)": 97.17, + "step": 11215, + "train_speed(iter/s)": 1.630161 + }, + { + "acc": 0.64610329, + "epoch": 0.2846270928462709, + "grad_norm": 6.1875, + "learning_rate": 9.768821190166179e-06, + "loss": 1.63170204, + "memory(GiB)": 97.17, + "step": 11220, + "train_speed(iter/s)": 1.63025 + }, + { + "acc": 0.64216576, + "epoch": 0.284753932014206, + "grad_norm": 5.78125, + "learning_rate": 9.76850591538121e-06, + "loss": 1.5620615, + "memory(GiB)": 97.17, + "step": 11225, + "train_speed(iter/s)": 1.630327 + }, + { + "acc": 0.64817505, + "epoch": 0.28488077118214106, + "grad_norm": 5.59375, + "learning_rate": 9.76819043085606e-06, + "loss": 1.63430252, + "memory(GiB)": 97.17, + "step": 11230, + "train_speed(iter/s)": 1.63041 + }, + { + "acc": 0.65770745, + "epoch": 0.2850076103500761, + "grad_norm": 4.625, + "learning_rate": 9.767874736604605e-06, + "loss": 1.58754339, + "memory(GiB)": 97.17, + "step": 11235, + "train_speed(iter/s)": 1.630487 + }, + { + "acc": 0.66049709, + "epoch": 0.28513444951801115, + "grad_norm": 5.5625, + "learning_rate": 9.767558832640734e-06, + "loss": 1.55478992, + "memory(GiB)": 97.17, + "step": 11240, + "train_speed(iter/s)": 1.630569 + }, + { + "acc": 0.6559062, + "epoch": 0.28526128868594625, + "grad_norm": 5.53125, + "learning_rate": 9.76724271897834e-06, + "loss": 1.56323986, + "memory(GiB)": 97.17, + "step": 11245, + "train_speed(iter/s)": 1.630647 + }, + { + "acc": 0.6375628, + "epoch": 0.2853881278538813, + "grad_norm": 5.625, + "learning_rate": 9.766926395631326e-06, + "loss": 1.68773232, + "memory(GiB)": 97.17, + "step": 11250, + "train_speed(iter/s)": 1.630729 + }, + { + "acc": 0.65749111, + "epoch": 0.28551496702181633, + "grad_norm": 6.4375, + "learning_rate": 9.766609862613607e-06, + "loss": 1.59407263, + "memory(GiB)": 97.17, + "step": 11255, + "train_speed(iter/s)": 1.630807 + }, + { + "acc": 0.65684752, + "epoch": 0.2856418061897514, + "grad_norm": 5.84375, + "learning_rate": 9.766293119939104e-06, + "loss": 1.67577019, + "memory(GiB)": 97.17, + "step": 11260, + "train_speed(iter/s)": 1.630889 + }, + { + "acc": 0.65764813, + "epoch": 0.2857686453576865, + "grad_norm": 5.15625, + "learning_rate": 9.76597616762175e-06, + "loss": 1.61688423, + "memory(GiB)": 97.17, + "step": 11265, + "train_speed(iter/s)": 1.63096 + }, + { + "acc": 0.64011869, + "epoch": 0.2858954845256215, + "grad_norm": 4.71875, + "learning_rate": 9.765659005675488e-06, + "loss": 1.62728672, + "memory(GiB)": 97.17, + "step": 11270, + "train_speed(iter/s)": 1.631034 + }, + { + "acc": 0.6329874, + "epoch": 0.28602232369355657, + "grad_norm": 5.375, + "learning_rate": 9.765341634114263e-06, + "loss": 1.69099216, + "memory(GiB)": 97.17, + "step": 11275, + "train_speed(iter/s)": 1.631112 + }, + { + "acc": 0.64708767, + "epoch": 0.2861491628614916, + "grad_norm": 5.25, + "learning_rate": 9.765024052952037e-06, + "loss": 1.65065842, + "memory(GiB)": 97.17, + "step": 11280, + "train_speed(iter/s)": 1.631191 + }, + { + "acc": 0.64189854, + "epoch": 0.2862760020294267, + "grad_norm": 5.0625, + "learning_rate": 9.76470626220278e-06, + "loss": 1.69097328, + "memory(GiB)": 97.17, + "step": 11285, + "train_speed(iter/s)": 1.63127 + }, + { + "acc": 0.64078856, + "epoch": 0.28640284119736176, + "grad_norm": 5.5625, + "learning_rate": 9.76438826188047e-06, + "loss": 1.65014381, + "memory(GiB)": 97.17, + "step": 11290, + "train_speed(iter/s)": 1.631341 + }, + { + "acc": 0.6390677, + "epoch": 0.2865296803652968, + "grad_norm": 7.28125, + "learning_rate": 9.76407005199909e-06, + "loss": 1.68696251, + "memory(GiB)": 97.17, + "step": 11295, + "train_speed(iter/s)": 1.631417 + }, + { + "acc": 0.64552989, + "epoch": 0.28665651953323185, + "grad_norm": 4.4375, + "learning_rate": 9.76375163257264e-06, + "loss": 1.61886024, + "memory(GiB)": 97.17, + "step": 11300, + "train_speed(iter/s)": 1.631493 + }, + { + "acc": 0.65681577, + "epoch": 0.28678335870116695, + "grad_norm": 5.5, + "learning_rate": 9.763433003615124e-06, + "loss": 1.62201767, + "memory(GiB)": 97.17, + "step": 11305, + "train_speed(iter/s)": 1.631578 + }, + { + "acc": 0.64618597, + "epoch": 0.286910197869102, + "grad_norm": 6.25, + "learning_rate": 9.763114165140559e-06, + "loss": 1.72712708, + "memory(GiB)": 97.17, + "step": 11310, + "train_speed(iter/s)": 1.631655 + }, + { + "acc": 0.63099065, + "epoch": 0.28703703703703703, + "grad_norm": 5.28125, + "learning_rate": 9.762795117162967e-06, + "loss": 1.69247704, + "memory(GiB)": 97.17, + "step": 11315, + "train_speed(iter/s)": 1.631743 + }, + { + "acc": 0.66388273, + "epoch": 0.2871638762049721, + "grad_norm": 5.875, + "learning_rate": 9.76247585969638e-06, + "loss": 1.58248329, + "memory(GiB)": 97.17, + "step": 11320, + "train_speed(iter/s)": 1.631819 + }, + { + "acc": 0.65471501, + "epoch": 0.2872907153729072, + "grad_norm": 5.5625, + "learning_rate": 9.762156392754842e-06, + "loss": 1.58320112, + "memory(GiB)": 97.17, + "step": 11325, + "train_speed(iter/s)": 1.631896 + }, + { + "acc": 0.6289319, + "epoch": 0.2874175545408422, + "grad_norm": 5.6875, + "learning_rate": 9.761836716352405e-06, + "loss": 1.67280121, + "memory(GiB)": 97.17, + "step": 11330, + "train_speed(iter/s)": 1.631969 + }, + { + "acc": 0.63756342, + "epoch": 0.28754439370877727, + "grad_norm": 5.25, + "learning_rate": 9.761516830503128e-06, + "loss": 1.71608143, + "memory(GiB)": 97.17, + "step": 11335, + "train_speed(iter/s)": 1.632053 + }, + { + "acc": 0.65393553, + "epoch": 0.2876712328767123, + "grad_norm": 5.4375, + "learning_rate": 9.761196735221083e-06, + "loss": 1.66383781, + "memory(GiB)": 97.17, + "step": 11340, + "train_speed(iter/s)": 1.63213 + }, + { + "acc": 0.64664373, + "epoch": 0.2877980720446474, + "grad_norm": 5.71875, + "learning_rate": 9.76087643052035e-06, + "loss": 1.63629303, + "memory(GiB)": 97.17, + "step": 11345, + "train_speed(iter/s)": 1.632205 + }, + { + "acc": 0.64661798, + "epoch": 0.28792491121258246, + "grad_norm": 5.21875, + "learning_rate": 9.760555916415015e-06, + "loss": 1.66401596, + "memory(GiB)": 97.17, + "step": 11350, + "train_speed(iter/s)": 1.632279 + }, + { + "acc": 0.65716586, + "epoch": 0.2880517503805175, + "grad_norm": 5.78125, + "learning_rate": 9.760235192919175e-06, + "loss": 1.58799686, + "memory(GiB)": 97.17, + "step": 11355, + "train_speed(iter/s)": 1.632355 + }, + { + "acc": 0.64715643, + "epoch": 0.28817858954845255, + "grad_norm": 6.09375, + "learning_rate": 9.75991426004694e-06, + "loss": 1.65637703, + "memory(GiB)": 97.17, + "step": 11360, + "train_speed(iter/s)": 1.632434 + }, + { + "acc": 0.64976225, + "epoch": 0.28830542871638765, + "grad_norm": 5.1875, + "learning_rate": 9.759593117812423e-06, + "loss": 1.63441086, + "memory(GiB)": 97.17, + "step": 11365, + "train_speed(iter/s)": 1.632505 + }, + { + "acc": 0.67269187, + "epoch": 0.2884322678843227, + "grad_norm": 5.65625, + "learning_rate": 9.75927176622975e-06, + "loss": 1.56246796, + "memory(GiB)": 97.17, + "step": 11370, + "train_speed(iter/s)": 1.632582 + }, + { + "acc": 0.6465395, + "epoch": 0.28855910705225774, + "grad_norm": 5.09375, + "learning_rate": 9.758950205313057e-06, + "loss": 1.68576775, + "memory(GiB)": 97.17, + "step": 11375, + "train_speed(iter/s)": 1.632654 + }, + { + "acc": 0.66221781, + "epoch": 0.2886859462201928, + "grad_norm": 4.40625, + "learning_rate": 9.758628435076488e-06, + "loss": 1.59709435, + "memory(GiB)": 97.17, + "step": 11380, + "train_speed(iter/s)": 1.632726 + }, + { + "acc": 0.64337535, + "epoch": 0.2888127853881279, + "grad_norm": 5.3125, + "learning_rate": 9.758306455534193e-06, + "loss": 1.65946236, + "memory(GiB)": 97.17, + "step": 11385, + "train_speed(iter/s)": 1.632798 + }, + { + "acc": 0.63714933, + "epoch": 0.2889396245560629, + "grad_norm": 5.65625, + "learning_rate": 9.757984266700336e-06, + "loss": 1.68031082, + "memory(GiB)": 97.17, + "step": 11390, + "train_speed(iter/s)": 1.632868 + }, + { + "acc": 0.65039325, + "epoch": 0.28906646372399797, + "grad_norm": 7.21875, + "learning_rate": 9.75766186858909e-06, + "loss": 1.62771111, + "memory(GiB)": 97.17, + "step": 11395, + "train_speed(iter/s)": 1.632943 + }, + { + "acc": 0.6509644, + "epoch": 0.289193302891933, + "grad_norm": 5.125, + "learning_rate": 9.757339261214631e-06, + "loss": 1.69709454, + "memory(GiB)": 97.17, + "step": 11400, + "train_speed(iter/s)": 1.633024 + }, + { + "acc": 0.64038453, + "epoch": 0.2893201420598681, + "grad_norm": 5.4375, + "learning_rate": 9.757016444591152e-06, + "loss": 1.64617481, + "memory(GiB)": 97.17, + "step": 11405, + "train_speed(iter/s)": 1.633097 + }, + { + "acc": 0.65103588, + "epoch": 0.28944698122780316, + "grad_norm": 4.6875, + "learning_rate": 9.756693418732852e-06, + "loss": 1.64224205, + "memory(GiB)": 97.17, + "step": 11410, + "train_speed(iter/s)": 1.633172 + }, + { + "acc": 0.65074749, + "epoch": 0.2895738203957382, + "grad_norm": 5.65625, + "learning_rate": 9.756370183653938e-06, + "loss": 1.7003973, + "memory(GiB)": 97.17, + "step": 11415, + "train_speed(iter/s)": 1.633249 + }, + { + "acc": 0.64227662, + "epoch": 0.28970065956367325, + "grad_norm": 6.25, + "learning_rate": 9.756046739368628e-06, + "loss": 1.63268166, + "memory(GiB)": 97.17, + "step": 11420, + "train_speed(iter/s)": 1.633334 + }, + { + "acc": 0.63798075, + "epoch": 0.28982749873160835, + "grad_norm": 6.1875, + "learning_rate": 9.755723085891147e-06, + "loss": 1.68317146, + "memory(GiB)": 97.17, + "step": 11425, + "train_speed(iter/s)": 1.633415 + }, + { + "acc": 0.66065168, + "epoch": 0.2899543378995434, + "grad_norm": 5.8125, + "learning_rate": 9.755399223235734e-06, + "loss": 1.58074322, + "memory(GiB)": 97.17, + "step": 11430, + "train_speed(iter/s)": 1.633486 + }, + { + "acc": 0.64656153, + "epoch": 0.29008117706747844, + "grad_norm": 6.90625, + "learning_rate": 9.75507515141663e-06, + "loss": 1.64568062, + "memory(GiB)": 97.17, + "step": 11435, + "train_speed(iter/s)": 1.633562 + }, + { + "acc": 0.64046583, + "epoch": 0.2902080162354135, + "grad_norm": 6.28125, + "learning_rate": 9.75475087044809e-06, + "loss": 1.64064331, + "memory(GiB)": 97.17, + "step": 11440, + "train_speed(iter/s)": 1.633635 + }, + { + "acc": 0.64787884, + "epoch": 0.2903348554033486, + "grad_norm": 5.625, + "learning_rate": 9.754426380344382e-06, + "loss": 1.67387772, + "memory(GiB)": 97.17, + "step": 11445, + "train_speed(iter/s)": 1.633712 + }, + { + "acc": 0.64361849, + "epoch": 0.2904616945712836, + "grad_norm": 5.3125, + "learning_rate": 9.754101681119772e-06, + "loss": 1.71696663, + "memory(GiB)": 97.17, + "step": 11450, + "train_speed(iter/s)": 1.633796 + }, + { + "acc": 0.63113894, + "epoch": 0.29058853373921867, + "grad_norm": 6.28125, + "learning_rate": 9.753776772788545e-06, + "loss": 1.69180794, + "memory(GiB)": 97.17, + "step": 11455, + "train_speed(iter/s)": 1.63388 + }, + { + "acc": 0.64941807, + "epoch": 0.2907153729071537, + "grad_norm": 4.875, + "learning_rate": 9.753451655364992e-06, + "loss": 1.64529076, + "memory(GiB)": 97.17, + "step": 11460, + "train_speed(iter/s)": 1.63396 + }, + { + "acc": 0.6493063, + "epoch": 0.2908422120750888, + "grad_norm": 5.65625, + "learning_rate": 9.75312632886341e-06, + "loss": 1.71305027, + "memory(GiB)": 97.17, + "step": 11465, + "train_speed(iter/s)": 1.634035 + }, + { + "acc": 0.67065487, + "epoch": 0.29096905124302386, + "grad_norm": 5.0, + "learning_rate": 9.752800793298113e-06, + "loss": 1.56441851, + "memory(GiB)": 97.17, + "step": 11470, + "train_speed(iter/s)": 1.634108 + }, + { + "acc": 0.6549777, + "epoch": 0.2910958904109589, + "grad_norm": 5.59375, + "learning_rate": 9.752475048683419e-06, + "loss": 1.62848644, + "memory(GiB)": 97.17, + "step": 11475, + "train_speed(iter/s)": 1.634183 + }, + { + "acc": 0.65016327, + "epoch": 0.29122272957889395, + "grad_norm": 5.28125, + "learning_rate": 9.752149095033651e-06, + "loss": 1.61026001, + "memory(GiB)": 97.17, + "step": 11480, + "train_speed(iter/s)": 1.634257 + }, + { + "acc": 0.65454712, + "epoch": 0.29134956874682905, + "grad_norm": 6.375, + "learning_rate": 9.75182293236315e-06, + "loss": 1.61261635, + "memory(GiB)": 97.17, + "step": 11485, + "train_speed(iter/s)": 1.634337 + }, + { + "acc": 0.64424939, + "epoch": 0.2914764079147641, + "grad_norm": 6.34375, + "learning_rate": 9.751496560686262e-06, + "loss": 1.56798515, + "memory(GiB)": 97.17, + "step": 11490, + "train_speed(iter/s)": 1.634417 + }, + { + "acc": 0.65181317, + "epoch": 0.29160324708269914, + "grad_norm": 5.40625, + "learning_rate": 9.751169980017341e-06, + "loss": 1.61228981, + "memory(GiB)": 97.17, + "step": 11495, + "train_speed(iter/s)": 1.634489 + }, + { + "acc": 0.65896001, + "epoch": 0.2917300862506342, + "grad_norm": 4.9375, + "learning_rate": 9.750843190370752e-06, + "loss": 1.59504261, + "memory(GiB)": 97.17, + "step": 11500, + "train_speed(iter/s)": 1.634557 + }, + { + "acc": 0.67066407, + "epoch": 0.2918569254185693, + "grad_norm": 4.125, + "learning_rate": 9.750516191760868e-06, + "loss": 1.57905722, + "memory(GiB)": 97.17, + "step": 11505, + "train_speed(iter/s)": 1.634633 + }, + { + "acc": 0.64843054, + "epoch": 0.2919837645865043, + "grad_norm": 4.375, + "learning_rate": 9.750188984202073e-06, + "loss": 1.62277679, + "memory(GiB)": 97.17, + "step": 11510, + "train_speed(iter/s)": 1.634711 + }, + { + "acc": 0.6408927, + "epoch": 0.29211060375443937, + "grad_norm": 5.875, + "learning_rate": 9.749861567708759e-06, + "loss": 1.71921844, + "memory(GiB)": 97.17, + "step": 11515, + "train_speed(iter/s)": 1.634789 + }, + { + "acc": 0.6344408, + "epoch": 0.2922374429223744, + "grad_norm": 6.1875, + "learning_rate": 9.749533942295323e-06, + "loss": 1.68303127, + "memory(GiB)": 97.17, + "step": 11520, + "train_speed(iter/s)": 1.634865 + }, + { + "acc": 0.66041121, + "epoch": 0.2923642820903095, + "grad_norm": 6.78125, + "learning_rate": 9.749206107976183e-06, + "loss": 1.59183712, + "memory(GiB)": 97.17, + "step": 11525, + "train_speed(iter/s)": 1.634947 + }, + { + "acc": 0.64058123, + "epoch": 0.29249112125824456, + "grad_norm": 6.0, + "learning_rate": 9.748878064765753e-06, + "loss": 1.66036873, + "memory(GiB)": 97.17, + "step": 11530, + "train_speed(iter/s)": 1.635021 + }, + { + "acc": 0.64665055, + "epoch": 0.2926179604261796, + "grad_norm": 6.78125, + "learning_rate": 9.748549812678466e-06, + "loss": 1.68230362, + "memory(GiB)": 97.17, + "step": 11535, + "train_speed(iter/s)": 1.635089 + }, + { + "acc": 0.65084305, + "epoch": 0.29274479959411465, + "grad_norm": 5.125, + "learning_rate": 9.748221351728754e-06, + "loss": 1.68193531, + "memory(GiB)": 97.17, + "step": 11540, + "train_speed(iter/s)": 1.635162 + }, + { + "acc": 0.65032053, + "epoch": 0.29287163876204975, + "grad_norm": 6.0625, + "learning_rate": 9.747892681931067e-06, + "loss": 1.62254906, + "memory(GiB)": 97.17, + "step": 11545, + "train_speed(iter/s)": 1.635233 + }, + { + "acc": 0.64764028, + "epoch": 0.2929984779299848, + "grad_norm": 5.78125, + "learning_rate": 9.747563803299865e-06, + "loss": 1.63244286, + "memory(GiB)": 97.17, + "step": 11550, + "train_speed(iter/s)": 1.635312 + }, + { + "acc": 0.65060344, + "epoch": 0.29312531709791984, + "grad_norm": 5.5625, + "learning_rate": 9.74723471584961e-06, + "loss": 1.62937889, + "memory(GiB)": 97.17, + "step": 11555, + "train_speed(iter/s)": 1.635386 + }, + { + "acc": 0.65269713, + "epoch": 0.2932521562658549, + "grad_norm": 5.78125, + "learning_rate": 9.746905419594777e-06, + "loss": 1.59821739, + "memory(GiB)": 97.17, + "step": 11560, + "train_speed(iter/s)": 1.635467 + }, + { + "acc": 0.64498816, + "epoch": 0.29337899543379, + "grad_norm": 5.6875, + "learning_rate": 9.746575914549851e-06, + "loss": 1.63843441, + "memory(GiB)": 97.17, + "step": 11565, + "train_speed(iter/s)": 1.635545 + }, + { + "acc": 0.62795815, + "epoch": 0.293505834601725, + "grad_norm": 5.375, + "learning_rate": 9.746246200729323e-06, + "loss": 1.74897957, + "memory(GiB)": 97.17, + "step": 11570, + "train_speed(iter/s)": 1.635617 + }, + { + "acc": 0.65472836, + "epoch": 0.29363267376966007, + "grad_norm": 5.15625, + "learning_rate": 9.745916278147696e-06, + "loss": 1.61250858, + "memory(GiB)": 97.17, + "step": 11575, + "train_speed(iter/s)": 1.635691 + }, + { + "acc": 0.65050497, + "epoch": 0.2937595129375951, + "grad_norm": 7.46875, + "learning_rate": 9.745586146819484e-06, + "loss": 1.64993591, + "memory(GiB)": 97.17, + "step": 11580, + "train_speed(iter/s)": 1.635762 + }, + { + "acc": 0.64963841, + "epoch": 0.2938863521055302, + "grad_norm": 4.90625, + "learning_rate": 9.745255806759205e-06, + "loss": 1.72799568, + "memory(GiB)": 97.17, + "step": 11585, + "train_speed(iter/s)": 1.635835 + }, + { + "acc": 0.64927502, + "epoch": 0.29401319127346526, + "grad_norm": 6.125, + "learning_rate": 9.74492525798139e-06, + "loss": 1.67259102, + "memory(GiB)": 97.17, + "step": 11590, + "train_speed(iter/s)": 1.635911 + }, + { + "acc": 0.63580432, + "epoch": 0.2941400304414003, + "grad_norm": 5.15625, + "learning_rate": 9.744594500500578e-06, + "loss": 1.67914238, + "memory(GiB)": 97.17, + "step": 11595, + "train_speed(iter/s)": 1.635988 + }, + { + "acc": 0.64402461, + "epoch": 0.29426686960933535, + "grad_norm": 6.53125, + "learning_rate": 9.744263534331315e-06, + "loss": 1.61763115, + "memory(GiB)": 97.17, + "step": 11600, + "train_speed(iter/s)": 1.636069 + }, + { + "acc": 0.65339947, + "epoch": 0.29439370877727045, + "grad_norm": 5.6875, + "learning_rate": 9.743932359488161e-06, + "loss": 1.63013649, + "memory(GiB)": 97.17, + "step": 11605, + "train_speed(iter/s)": 1.636144 + }, + { + "acc": 0.64586911, + "epoch": 0.2945205479452055, + "grad_norm": 4.6875, + "learning_rate": 9.743600975985681e-06, + "loss": 1.6654829, + "memory(GiB)": 97.17, + "step": 11610, + "train_speed(iter/s)": 1.636221 + }, + { + "acc": 0.65215054, + "epoch": 0.29464738711314054, + "grad_norm": 5.53125, + "learning_rate": 9.743269383838452e-06, + "loss": 1.66870823, + "memory(GiB)": 97.17, + "step": 11615, + "train_speed(iter/s)": 1.636295 + }, + { + "acc": 0.65218196, + "epoch": 0.2947742262810756, + "grad_norm": 6.15625, + "learning_rate": 9.74293758306106e-06, + "loss": 1.630896, + "memory(GiB)": 97.17, + "step": 11620, + "train_speed(iter/s)": 1.63637 + }, + { + "acc": 0.627877, + "epoch": 0.2949010654490107, + "grad_norm": 8.0625, + "learning_rate": 9.742605573668096e-06, + "loss": 1.68542633, + "memory(GiB)": 97.17, + "step": 11625, + "train_speed(iter/s)": 1.63645 + }, + { + "acc": 0.64570208, + "epoch": 0.2950279046169457, + "grad_norm": 7.1875, + "learning_rate": 9.742273355674164e-06, + "loss": 1.68923283, + "memory(GiB)": 97.17, + "step": 11630, + "train_speed(iter/s)": 1.636527 + }, + { + "acc": 0.66054845, + "epoch": 0.29515474378488077, + "grad_norm": 5.1875, + "learning_rate": 9.741940929093879e-06, + "loss": 1.63594322, + "memory(GiB)": 97.17, + "step": 11635, + "train_speed(iter/s)": 1.636598 + }, + { + "acc": 0.65538964, + "epoch": 0.2952815829528158, + "grad_norm": 5.46875, + "learning_rate": 9.741608293941858e-06, + "loss": 1.60515289, + "memory(GiB)": 97.17, + "step": 11640, + "train_speed(iter/s)": 1.636668 + }, + { + "acc": 0.65831141, + "epoch": 0.2954084221207509, + "grad_norm": 5.625, + "learning_rate": 9.741275450232736e-06, + "loss": 1.63779564, + "memory(GiB)": 97.17, + "step": 11645, + "train_speed(iter/s)": 1.636746 + }, + { + "acc": 0.6579608, + "epoch": 0.29553526128868596, + "grad_norm": 5.53125, + "learning_rate": 9.740942397981151e-06, + "loss": 1.55092669, + "memory(GiB)": 97.17, + "step": 11650, + "train_speed(iter/s)": 1.636819 + }, + { + "acc": 0.649158, + "epoch": 0.295662100456621, + "grad_norm": 5.25, + "learning_rate": 9.740609137201752e-06, + "loss": 1.6361805, + "memory(GiB)": 97.17, + "step": 11655, + "train_speed(iter/s)": 1.636895 + }, + { + "acc": 0.64914727, + "epoch": 0.29578893962455605, + "grad_norm": 5.4375, + "learning_rate": 9.7402756679092e-06, + "loss": 1.64583282, + "memory(GiB)": 97.17, + "step": 11660, + "train_speed(iter/s)": 1.636966 + }, + { + "acc": 0.64234037, + "epoch": 0.29591577879249115, + "grad_norm": 5.71875, + "learning_rate": 9.739941990118157e-06, + "loss": 1.60633354, + "memory(GiB)": 97.17, + "step": 11665, + "train_speed(iter/s)": 1.637037 + }, + { + "acc": 0.65490937, + "epoch": 0.2960426179604262, + "grad_norm": 4.8125, + "learning_rate": 9.739608103843306e-06, + "loss": 1.63323059, + "memory(GiB)": 97.17, + "step": 11670, + "train_speed(iter/s)": 1.637105 + }, + { + "acc": 0.65202107, + "epoch": 0.29616945712836124, + "grad_norm": 4.875, + "learning_rate": 9.739274009099328e-06, + "loss": 1.64343796, + "memory(GiB)": 97.17, + "step": 11675, + "train_speed(iter/s)": 1.637178 + }, + { + "acc": 0.66397929, + "epoch": 0.2962962962962963, + "grad_norm": 5.53125, + "learning_rate": 9.738939705900922e-06, + "loss": 1.62341652, + "memory(GiB)": 97.17, + "step": 11680, + "train_speed(iter/s)": 1.637258 + }, + { + "acc": 0.6575644, + "epoch": 0.2964231354642314, + "grad_norm": 4.75, + "learning_rate": 9.738605194262787e-06, + "loss": 1.59835129, + "memory(GiB)": 97.17, + "step": 11685, + "train_speed(iter/s)": 1.637333 + }, + { + "acc": 0.649084, + "epoch": 0.2965499746321664, + "grad_norm": 4.78125, + "learning_rate": 9.738270474199641e-06, + "loss": 1.62513294, + "memory(GiB)": 97.17, + "step": 11690, + "train_speed(iter/s)": 1.637407 + }, + { + "acc": 0.63300428, + "epoch": 0.29667681380010147, + "grad_norm": 6.875, + "learning_rate": 9.737935545726205e-06, + "loss": 1.69159355, + "memory(GiB)": 97.17, + "step": 11695, + "train_speed(iter/s)": 1.637488 + }, + { + "acc": 0.65190229, + "epoch": 0.2968036529680365, + "grad_norm": 5.75, + "learning_rate": 9.737600408857208e-06, + "loss": 1.63336754, + "memory(GiB)": 97.17, + "step": 11700, + "train_speed(iter/s)": 1.637569 + }, + { + "acc": 0.63793454, + "epoch": 0.2969304921359716, + "grad_norm": 6.09375, + "learning_rate": 9.737265063607395e-06, + "loss": 1.6612709, + "memory(GiB)": 97.17, + "step": 11705, + "train_speed(iter/s)": 1.637641 + }, + { + "acc": 0.6532095, + "epoch": 0.29705733130390666, + "grad_norm": 5.875, + "learning_rate": 9.736929509991515e-06, + "loss": 1.59731693, + "memory(GiB)": 97.17, + "step": 11710, + "train_speed(iter/s)": 1.637714 + }, + { + "acc": 0.65627689, + "epoch": 0.2971841704718417, + "grad_norm": 6.0625, + "learning_rate": 9.736593748024325e-06, + "loss": 1.65486469, + "memory(GiB)": 97.17, + "step": 11715, + "train_speed(iter/s)": 1.637797 + }, + { + "acc": 0.65927405, + "epoch": 0.29731100963977675, + "grad_norm": 4.875, + "learning_rate": 9.736257777720595e-06, + "loss": 1.55941372, + "memory(GiB)": 97.17, + "step": 11720, + "train_speed(iter/s)": 1.637872 + }, + { + "acc": 0.6624898, + "epoch": 0.29743784880771185, + "grad_norm": 7.21875, + "learning_rate": 9.735921599095101e-06, + "loss": 1.57044315, + "memory(GiB)": 97.17, + "step": 11725, + "train_speed(iter/s)": 1.637951 + }, + { + "acc": 0.6622478, + "epoch": 0.2975646879756469, + "grad_norm": 5.09375, + "learning_rate": 9.735585212162633e-06, + "loss": 1.58110027, + "memory(GiB)": 97.17, + "step": 11730, + "train_speed(iter/s)": 1.638026 + }, + { + "acc": 0.64733858, + "epoch": 0.29769152714358194, + "grad_norm": 6.40625, + "learning_rate": 9.735248616937983e-06, + "loss": 1.66352272, + "memory(GiB)": 97.17, + "step": 11735, + "train_speed(iter/s)": 1.6381 + }, + { + "acc": 0.65417871, + "epoch": 0.297818366311517, + "grad_norm": 5.5625, + "learning_rate": 9.734911813435957e-06, + "loss": 1.59013805, + "memory(GiB)": 97.17, + "step": 11740, + "train_speed(iter/s)": 1.638175 + }, + { + "acc": 0.65507183, + "epoch": 0.2979452054794521, + "grad_norm": 4.90625, + "learning_rate": 9.73457480167137e-06, + "loss": 1.74874783, + "memory(GiB)": 97.17, + "step": 11745, + "train_speed(iter/s)": 1.638253 + }, + { + "acc": 0.66042709, + "epoch": 0.2980720446473871, + "grad_norm": 5.6875, + "learning_rate": 9.734237581659045e-06, + "loss": 1.63711433, + "memory(GiB)": 97.17, + "step": 11750, + "train_speed(iter/s)": 1.63833 + }, + { + "acc": 0.64553061, + "epoch": 0.29819888381532217, + "grad_norm": 5.53125, + "learning_rate": 9.733900153413813e-06, + "loss": 1.62274132, + "memory(GiB)": 97.17, + "step": 11755, + "train_speed(iter/s)": 1.638408 + }, + { + "acc": 0.65123997, + "epoch": 0.2983257229832572, + "grad_norm": 6.40625, + "learning_rate": 9.733562516950519e-06, + "loss": 1.61130638, + "memory(GiB)": 97.17, + "step": 11760, + "train_speed(iter/s)": 1.638479 + }, + { + "acc": 0.63914804, + "epoch": 0.2984525621511923, + "grad_norm": 6.96875, + "learning_rate": 9.73322467228401e-06, + "loss": 1.64715347, + "memory(GiB)": 97.17, + "step": 11765, + "train_speed(iter/s)": 1.638558 + }, + { + "acc": 0.64855762, + "epoch": 0.29857940131912736, + "grad_norm": 7.1875, + "learning_rate": 9.73288661942915e-06, + "loss": 1.62235718, + "memory(GiB)": 97.17, + "step": 11770, + "train_speed(iter/s)": 1.638634 + }, + { + "acc": 0.65780964, + "epoch": 0.2987062404870624, + "grad_norm": 6.15625, + "learning_rate": 9.732548358400802e-06, + "loss": 1.60995064, + "memory(GiB)": 97.17, + "step": 11775, + "train_speed(iter/s)": 1.638707 + }, + { + "acc": 0.652736, + "epoch": 0.29883307965499745, + "grad_norm": 4.84375, + "learning_rate": 9.73220988921385e-06, + "loss": 1.59062061, + "memory(GiB)": 97.17, + "step": 11780, + "train_speed(iter/s)": 1.638777 + }, + { + "acc": 0.65027757, + "epoch": 0.29895991882293255, + "grad_norm": 5.46875, + "learning_rate": 9.73187121188318e-06, + "loss": 1.60554485, + "memory(GiB)": 97.17, + "step": 11785, + "train_speed(iter/s)": 1.638853 + }, + { + "acc": 0.66155748, + "epoch": 0.2990867579908676, + "grad_norm": 7.03125, + "learning_rate": 9.731532326423686e-06, + "loss": 1.60177994, + "memory(GiB)": 97.17, + "step": 11790, + "train_speed(iter/s)": 1.638928 + }, + { + "acc": 0.66709805, + "epoch": 0.29921359715880264, + "grad_norm": 4.8125, + "learning_rate": 9.731193232850277e-06, + "loss": 1.56361628, + "memory(GiB)": 97.17, + "step": 11795, + "train_speed(iter/s)": 1.639002 + }, + { + "acc": 0.64233665, + "epoch": 0.2993404363267377, + "grad_norm": 5.03125, + "learning_rate": 9.730853931177866e-06, + "loss": 1.66935577, + "memory(GiB)": 97.17, + "step": 11800, + "train_speed(iter/s)": 1.639074 + }, + { + "acc": 0.65680666, + "epoch": 0.2994672754946728, + "grad_norm": 6.40625, + "learning_rate": 9.730514421421378e-06, + "loss": 1.60994625, + "memory(GiB)": 97.17, + "step": 11805, + "train_speed(iter/s)": 1.63915 + }, + { + "acc": 0.64663, + "epoch": 0.2995941146626078, + "grad_norm": 4.78125, + "learning_rate": 9.730174703595745e-06, + "loss": 1.64971447, + "memory(GiB)": 97.17, + "step": 11810, + "train_speed(iter/s)": 1.639216 + }, + { + "acc": 0.65689802, + "epoch": 0.29972095383054287, + "grad_norm": 4.9375, + "learning_rate": 9.72983477771591e-06, + "loss": 1.57845268, + "memory(GiB)": 97.17, + "step": 11815, + "train_speed(iter/s)": 1.639285 + }, + { + "acc": 0.6412138, + "epoch": 0.2998477929984779, + "grad_norm": 4.90625, + "learning_rate": 9.729494643796823e-06, + "loss": 1.68806343, + "memory(GiB)": 97.17, + "step": 11820, + "train_speed(iter/s)": 1.639357 + }, + { + "acc": 0.65105066, + "epoch": 0.299974632166413, + "grad_norm": 5.25, + "learning_rate": 9.729154301853448e-06, + "loss": 1.62693596, + "memory(GiB)": 97.17, + "step": 11825, + "train_speed(iter/s)": 1.639429 + }, + { + "acc": 0.6450696, + "epoch": 0.30010147133434806, + "grad_norm": 7.71875, + "learning_rate": 9.72881375190075e-06, + "loss": 1.68029881, + "memory(GiB)": 97.17, + "step": 11830, + "train_speed(iter/s)": 1.639502 + }, + { + "acc": 0.66626835, + "epoch": 0.3002283105022831, + "grad_norm": 4.65625, + "learning_rate": 9.728472993953712e-06, + "loss": 1.60636997, + "memory(GiB)": 97.17, + "step": 11835, + "train_speed(iter/s)": 1.639571 + }, + { + "acc": 0.64302607, + "epoch": 0.30035514967021815, + "grad_norm": 4.96875, + "learning_rate": 9.728132028027323e-06, + "loss": 1.58945427, + "memory(GiB)": 97.17, + "step": 11840, + "train_speed(iter/s)": 1.639646 + }, + { + "acc": 0.67251196, + "epoch": 0.30048198883815325, + "grad_norm": 4.875, + "learning_rate": 9.727790854136573e-06, + "loss": 1.62307892, + "memory(GiB)": 97.17, + "step": 11845, + "train_speed(iter/s)": 1.639711 + }, + { + "acc": 0.65551519, + "epoch": 0.3006088280060883, + "grad_norm": 4.6875, + "learning_rate": 9.727449472296476e-06, + "loss": 1.57311468, + "memory(GiB)": 97.17, + "step": 11850, + "train_speed(iter/s)": 1.63978 + }, + { + "acc": 0.64782877, + "epoch": 0.30073566717402334, + "grad_norm": 6.46875, + "learning_rate": 9.727107882522045e-06, + "loss": 1.67652302, + "memory(GiB)": 97.17, + "step": 11855, + "train_speed(iter/s)": 1.639856 + }, + { + "acc": 0.66182756, + "epoch": 0.3008625063419584, + "grad_norm": 5.65625, + "learning_rate": 9.726766084828303e-06, + "loss": 1.60665207, + "memory(GiB)": 97.17, + "step": 11860, + "train_speed(iter/s)": 1.639926 + }, + { + "acc": 0.64263477, + "epoch": 0.3009893455098935, + "grad_norm": 5.3125, + "learning_rate": 9.726424079230286e-06, + "loss": 1.62117882, + "memory(GiB)": 97.17, + "step": 11865, + "train_speed(iter/s)": 1.639989 + }, + { + "acc": 0.65106149, + "epoch": 0.3011161846778285, + "grad_norm": 5.46875, + "learning_rate": 9.726081865743036e-06, + "loss": 1.63567924, + "memory(GiB)": 97.17, + "step": 11870, + "train_speed(iter/s)": 1.640061 + }, + { + "acc": 0.67203779, + "epoch": 0.30124302384576357, + "grad_norm": 5.09375, + "learning_rate": 9.725739444381603e-06, + "loss": 1.60356159, + "memory(GiB)": 97.17, + "step": 11875, + "train_speed(iter/s)": 1.640138 + }, + { + "acc": 0.65460844, + "epoch": 0.3013698630136986, + "grad_norm": 5.125, + "learning_rate": 9.725396815161053e-06, + "loss": 1.65225525, + "memory(GiB)": 97.17, + "step": 11880, + "train_speed(iter/s)": 1.640207 + }, + { + "acc": 0.64531651, + "epoch": 0.3014967021816337, + "grad_norm": 5.40625, + "learning_rate": 9.725053978096453e-06, + "loss": 1.67289333, + "memory(GiB)": 97.17, + "step": 11885, + "train_speed(iter/s)": 1.640271 + }, + { + "acc": 0.6419682, + "epoch": 0.30162354134956876, + "grad_norm": 5.15625, + "learning_rate": 9.724710933202884e-06, + "loss": 1.71273708, + "memory(GiB)": 97.17, + "step": 11890, + "train_speed(iter/s)": 1.640338 + }, + { + "acc": 0.65387602, + "epoch": 0.3017503805175038, + "grad_norm": 5.71875, + "learning_rate": 9.724367680495432e-06, + "loss": 1.64997559, + "memory(GiB)": 97.17, + "step": 11895, + "train_speed(iter/s)": 1.64041 + }, + { + "acc": 0.662253, + "epoch": 0.30187721968543885, + "grad_norm": 5.0625, + "learning_rate": 9.724024219989198e-06, + "loss": 1.58682127, + "memory(GiB)": 97.17, + "step": 11900, + "train_speed(iter/s)": 1.640479 + }, + { + "acc": 0.64213529, + "epoch": 0.30200405885337395, + "grad_norm": 5.90625, + "learning_rate": 9.723680551699286e-06, + "loss": 1.63422241, + "memory(GiB)": 97.17, + "step": 11905, + "train_speed(iter/s)": 1.640547 + }, + { + "acc": 0.63804383, + "epoch": 0.302130898021309, + "grad_norm": 6.3125, + "learning_rate": 9.723336675640815e-06, + "loss": 1.740905, + "memory(GiB)": 97.17, + "step": 11910, + "train_speed(iter/s)": 1.640617 + }, + { + "acc": 0.64859419, + "epoch": 0.30225773718924404, + "grad_norm": 5.40625, + "learning_rate": 9.722992591828908e-06, + "loss": 1.62869473, + "memory(GiB)": 97.17, + "step": 11915, + "train_speed(iter/s)": 1.640691 + }, + { + "acc": 0.66343718, + "epoch": 0.3023845763571791, + "grad_norm": 5.8125, + "learning_rate": 9.722648300278701e-06, + "loss": 1.54543724, + "memory(GiB)": 97.17, + "step": 11920, + "train_speed(iter/s)": 1.640763 + }, + { + "acc": 0.66775374, + "epoch": 0.3025114155251142, + "grad_norm": 5.59375, + "learning_rate": 9.722303801005338e-06, + "loss": 1.62005329, + "memory(GiB)": 97.17, + "step": 11925, + "train_speed(iter/s)": 1.640829 + }, + { + "acc": 0.65400538, + "epoch": 0.3026382546930492, + "grad_norm": 6.28125, + "learning_rate": 9.721959094023968e-06, + "loss": 1.68701725, + "memory(GiB)": 97.17, + "step": 11930, + "train_speed(iter/s)": 1.6409 + }, + { + "acc": 0.67039242, + "epoch": 0.30276509386098427, + "grad_norm": 6.1875, + "learning_rate": 9.721614179349754e-06, + "loss": 1.54521427, + "memory(GiB)": 97.17, + "step": 11935, + "train_speed(iter/s)": 1.640972 + }, + { + "acc": 0.66024427, + "epoch": 0.3028919330289193, + "grad_norm": 6.03125, + "learning_rate": 9.72126905699787e-06, + "loss": 1.66078053, + "memory(GiB)": 97.17, + "step": 11940, + "train_speed(iter/s)": 1.64105 + }, + { + "acc": 0.64067478, + "epoch": 0.3030187721968544, + "grad_norm": 4.90625, + "learning_rate": 9.720923726983493e-06, + "loss": 1.62818756, + "memory(GiB)": 97.17, + "step": 11945, + "train_speed(iter/s)": 1.641124 + }, + { + "acc": 0.64188647, + "epoch": 0.30314561136478946, + "grad_norm": 5.0, + "learning_rate": 9.720578189321814e-06, + "loss": 1.63102531, + "memory(GiB)": 97.17, + "step": 11950, + "train_speed(iter/s)": 1.641192 + }, + { + "acc": 0.66136594, + "epoch": 0.3032724505327245, + "grad_norm": 6.28125, + "learning_rate": 9.72023244402803e-06, + "loss": 1.61931629, + "memory(GiB)": 97.17, + "step": 11955, + "train_speed(iter/s)": 1.64126 + }, + { + "acc": 0.6492928, + "epoch": 0.30339928970065955, + "grad_norm": 5.3125, + "learning_rate": 9.719886491117348e-06, + "loss": 1.62725468, + "memory(GiB)": 97.17, + "step": 11960, + "train_speed(iter/s)": 1.641334 + }, + { + "acc": 0.64850187, + "epoch": 0.30352612886859465, + "grad_norm": 6.25, + "learning_rate": 9.719540330604986e-06, + "loss": 1.58523388, + "memory(GiB)": 97.17, + "step": 11965, + "train_speed(iter/s)": 1.641401 + }, + { + "acc": 0.65585322, + "epoch": 0.3036529680365297, + "grad_norm": 5.90625, + "learning_rate": 9.71919396250617e-06, + "loss": 1.6394001, + "memory(GiB)": 97.17, + "step": 11970, + "train_speed(iter/s)": 1.641473 + }, + { + "acc": 0.62977915, + "epoch": 0.30377980720446474, + "grad_norm": 5.3125, + "learning_rate": 9.718847386836131e-06, + "loss": 1.70795479, + "memory(GiB)": 97.17, + "step": 11975, + "train_speed(iter/s)": 1.641546 + }, + { + "acc": 0.65073948, + "epoch": 0.3039066463723998, + "grad_norm": 5.34375, + "learning_rate": 9.718500603610119e-06, + "loss": 1.66167107, + "memory(GiB)": 97.17, + "step": 11980, + "train_speed(iter/s)": 1.641623 + }, + { + "acc": 0.6401463, + "epoch": 0.3040334855403349, + "grad_norm": 6.9375, + "learning_rate": 9.718153612843382e-06, + "loss": 1.69964352, + "memory(GiB)": 97.17, + "step": 11985, + "train_speed(iter/s)": 1.641699 + }, + { + "acc": 0.64130259, + "epoch": 0.3041603247082699, + "grad_norm": 4.75, + "learning_rate": 9.717806414551186e-06, + "loss": 1.68308411, + "memory(GiB)": 97.17, + "step": 11990, + "train_speed(iter/s)": 1.641768 + }, + { + "acc": 0.66283197, + "epoch": 0.30428716387620497, + "grad_norm": 6.96875, + "learning_rate": 9.717459008748798e-06, + "loss": 1.62097893, + "memory(GiB)": 97.17, + "step": 11995, + "train_speed(iter/s)": 1.641843 + }, + { + "acc": 0.65009527, + "epoch": 0.30441400304414, + "grad_norm": 6.5, + "learning_rate": 9.717111395451501e-06, + "loss": 1.62187538, + "memory(GiB)": 97.17, + "step": 12000, + "train_speed(iter/s)": 1.64192 + }, + { + "epoch": 0.30441400304414, + "eval_acc": 0.6403051432891239, + "eval_loss": 1.6072824001312256, + "eval_runtime": 58.6693, + "eval_samples_per_second": 108.575, + "eval_steps_per_second": 27.152, + "step": 12000 + }, + { + "acc": 0.63487883, + "epoch": 0.3045408422120751, + "grad_norm": 5.0625, + "learning_rate": 9.716763574674586e-06, + "loss": 1.70920391, + "memory(GiB)": 97.17, + "step": 12005, + "train_speed(iter/s)": 1.627986 + }, + { + "acc": 0.66409378, + "epoch": 0.30466768138001016, + "grad_norm": 5.15625, + "learning_rate": 9.71641554643335e-06, + "loss": 1.53675585, + "memory(GiB)": 97.17, + "step": 12010, + "train_speed(iter/s)": 1.628052 + }, + { + "acc": 0.65317812, + "epoch": 0.3047945205479452, + "grad_norm": 5.21875, + "learning_rate": 9.7160673107431e-06, + "loss": 1.59409924, + "memory(GiB)": 97.17, + "step": 12015, + "train_speed(iter/s)": 1.628124 + }, + { + "acc": 0.62879562, + "epoch": 0.30492135971588025, + "grad_norm": 6.53125, + "learning_rate": 9.715718867619155e-06, + "loss": 1.72604752, + "memory(GiB)": 97.17, + "step": 12020, + "train_speed(iter/s)": 1.628197 + }, + { + "acc": 0.6527308, + "epoch": 0.30504819888381535, + "grad_norm": 5.0, + "learning_rate": 9.715370217076838e-06, + "loss": 1.59955444, + "memory(GiB)": 97.17, + "step": 12025, + "train_speed(iter/s)": 1.62827 + }, + { + "acc": 0.64126039, + "epoch": 0.3051750380517504, + "grad_norm": 5.59375, + "learning_rate": 9.715021359131489e-06, + "loss": 1.66279678, + "memory(GiB)": 97.17, + "step": 12030, + "train_speed(iter/s)": 1.628327 + }, + { + "acc": 0.63692665, + "epoch": 0.30530187721968544, + "grad_norm": 8.4375, + "learning_rate": 9.714672293798449e-06, + "loss": 1.69898586, + "memory(GiB)": 97.17, + "step": 12035, + "train_speed(iter/s)": 1.628403 + }, + { + "acc": 0.639188, + "epoch": 0.3054287163876205, + "grad_norm": 5.125, + "learning_rate": 9.71432302109307e-06, + "loss": 1.68704529, + "memory(GiB)": 97.17, + "step": 12040, + "train_speed(iter/s)": 1.62847 + }, + { + "acc": 0.6301796, + "epoch": 0.3055555555555556, + "grad_norm": 6.21875, + "learning_rate": 9.713973541030716e-06, + "loss": 1.65257874, + "memory(GiB)": 97.17, + "step": 12045, + "train_speed(iter/s)": 1.628544 + }, + { + "acc": 0.65060339, + "epoch": 0.3056823947234906, + "grad_norm": 5.6875, + "learning_rate": 9.713623853626763e-06, + "loss": 1.5629858, + "memory(GiB)": 97.17, + "step": 12050, + "train_speed(iter/s)": 1.62862 + }, + { + "acc": 0.65652709, + "epoch": 0.30580923389142567, + "grad_norm": 5.1875, + "learning_rate": 9.713273958896586e-06, + "loss": 1.63734894, + "memory(GiB)": 97.17, + "step": 12055, + "train_speed(iter/s)": 1.628696 + }, + { + "acc": 0.6524848, + "epoch": 0.3059360730593607, + "grad_norm": 4.59375, + "learning_rate": 9.712923856855578e-06, + "loss": 1.58321218, + "memory(GiB)": 97.17, + "step": 12060, + "train_speed(iter/s)": 1.628771 + }, + { + "acc": 0.63657403, + "epoch": 0.3060629122272958, + "grad_norm": 6.34375, + "learning_rate": 9.712573547519134e-06, + "loss": 1.66819191, + "memory(GiB)": 97.17, + "step": 12065, + "train_speed(iter/s)": 1.628845 + }, + { + "acc": 0.63482022, + "epoch": 0.30618975139523086, + "grad_norm": 7.3125, + "learning_rate": 9.712223030902668e-06, + "loss": 1.72340755, + "memory(GiB)": 97.17, + "step": 12070, + "train_speed(iter/s)": 1.628919 + }, + { + "acc": 0.65714412, + "epoch": 0.3063165905631659, + "grad_norm": 6.40625, + "learning_rate": 9.711872307021594e-06, + "loss": 1.62071381, + "memory(GiB)": 97.17, + "step": 12075, + "train_speed(iter/s)": 1.628993 + }, + { + "acc": 0.65527635, + "epoch": 0.30644342973110095, + "grad_norm": 6.875, + "learning_rate": 9.711521375891339e-06, + "loss": 1.67614136, + "memory(GiB)": 97.17, + "step": 12080, + "train_speed(iter/s)": 1.629065 + }, + { + "acc": 0.64842501, + "epoch": 0.30657026889903605, + "grad_norm": 5.875, + "learning_rate": 9.71117023752734e-06, + "loss": 1.66528587, + "memory(GiB)": 97.17, + "step": 12085, + "train_speed(iter/s)": 1.629137 + }, + { + "acc": 0.6477582, + "epoch": 0.3066971080669711, + "grad_norm": 5.5, + "learning_rate": 9.710818891945037e-06, + "loss": 1.61027756, + "memory(GiB)": 97.17, + "step": 12090, + "train_speed(iter/s)": 1.629206 + }, + { + "acc": 0.65924463, + "epoch": 0.30682394723490614, + "grad_norm": 5.78125, + "learning_rate": 9.71046733915989e-06, + "loss": 1.58891096, + "memory(GiB)": 97.17, + "step": 12095, + "train_speed(iter/s)": 1.62928 + }, + { + "acc": 0.64791431, + "epoch": 0.3069507864028412, + "grad_norm": 4.84375, + "learning_rate": 9.710115579187356e-06, + "loss": 1.64345722, + "memory(GiB)": 97.17, + "step": 12100, + "train_speed(iter/s)": 1.629357 + }, + { + "acc": 0.63740606, + "epoch": 0.3070776255707763, + "grad_norm": 6.09375, + "learning_rate": 9.709763612042911e-06, + "loss": 1.66559505, + "memory(GiB)": 97.17, + "step": 12105, + "train_speed(iter/s)": 1.629435 + }, + { + "acc": 0.65210972, + "epoch": 0.3072044647387113, + "grad_norm": 5.4375, + "learning_rate": 9.709411437742035e-06, + "loss": 1.56379681, + "memory(GiB)": 97.17, + "step": 12110, + "train_speed(iter/s)": 1.629503 + }, + { + "acc": 0.65660381, + "epoch": 0.30733130390664637, + "grad_norm": 6.40625, + "learning_rate": 9.709059056300218e-06, + "loss": 1.59955273, + "memory(GiB)": 97.17, + "step": 12115, + "train_speed(iter/s)": 1.629565 + }, + { + "acc": 0.65138302, + "epoch": 0.3074581430745814, + "grad_norm": 5.46875, + "learning_rate": 9.708706467732958e-06, + "loss": 1.59153452, + "memory(GiB)": 97.17, + "step": 12120, + "train_speed(iter/s)": 1.629637 + }, + { + "acc": 0.65156617, + "epoch": 0.3075849822425165, + "grad_norm": 5.53125, + "learning_rate": 9.708353672055766e-06, + "loss": 1.67782021, + "memory(GiB)": 97.17, + "step": 12125, + "train_speed(iter/s)": 1.629714 + }, + { + "acc": 0.65236835, + "epoch": 0.30771182141045156, + "grad_norm": 5.9375, + "learning_rate": 9.708000669284158e-06, + "loss": 1.66671524, + "memory(GiB)": 97.17, + "step": 12130, + "train_speed(iter/s)": 1.629787 + }, + { + "acc": 0.652806, + "epoch": 0.3078386605783866, + "grad_norm": 5.96875, + "learning_rate": 9.707647459433661e-06, + "loss": 1.64346886, + "memory(GiB)": 97.17, + "step": 12135, + "train_speed(iter/s)": 1.629866 + }, + { + "acc": 0.64888916, + "epoch": 0.30796549974632165, + "grad_norm": 6.40625, + "learning_rate": 9.707294042519808e-06, + "loss": 1.6143465, + "memory(GiB)": 97.17, + "step": 12140, + "train_speed(iter/s)": 1.629939 + }, + { + "acc": 0.65195875, + "epoch": 0.30809233891425675, + "grad_norm": 5.5, + "learning_rate": 9.70694041855815e-06, + "loss": 1.71236172, + "memory(GiB)": 97.17, + "step": 12145, + "train_speed(iter/s)": 1.630014 + }, + { + "acc": 0.65103683, + "epoch": 0.3082191780821918, + "grad_norm": 4.6875, + "learning_rate": 9.706586587564236e-06, + "loss": 1.68275299, + "memory(GiB)": 97.17, + "step": 12150, + "train_speed(iter/s)": 1.630087 + }, + { + "acc": 0.64341426, + "epoch": 0.30834601725012684, + "grad_norm": 6.28125, + "learning_rate": 9.70623254955363e-06, + "loss": 1.63932571, + "memory(GiB)": 97.17, + "step": 12155, + "train_speed(iter/s)": 1.630159 + }, + { + "acc": 0.66488438, + "epoch": 0.3084728564180619, + "grad_norm": 7.78125, + "learning_rate": 9.705878304541905e-06, + "loss": 1.55229368, + "memory(GiB)": 97.17, + "step": 12160, + "train_speed(iter/s)": 1.630233 + }, + { + "acc": 0.64901409, + "epoch": 0.308599695585997, + "grad_norm": 5.46875, + "learning_rate": 9.705523852544643e-06, + "loss": 1.61777954, + "memory(GiB)": 97.17, + "step": 12165, + "train_speed(iter/s)": 1.630305 + }, + { + "acc": 0.64030151, + "epoch": 0.308726534753932, + "grad_norm": 6.625, + "learning_rate": 9.705169193577434e-06, + "loss": 1.70685806, + "memory(GiB)": 97.17, + "step": 12170, + "train_speed(iter/s)": 1.630374 + }, + { + "acc": 0.65600266, + "epoch": 0.30885337392186707, + "grad_norm": 7.3125, + "learning_rate": 9.704814327655874e-06, + "loss": 1.63442345, + "memory(GiB)": 97.17, + "step": 12175, + "train_speed(iter/s)": 1.630447 + }, + { + "acc": 0.66261511, + "epoch": 0.3089802130898021, + "grad_norm": 5.5625, + "learning_rate": 9.704459254795575e-06, + "loss": 1.61930466, + "memory(GiB)": 97.17, + "step": 12180, + "train_speed(iter/s)": 1.630516 + }, + { + "acc": 0.62956772, + "epoch": 0.3091070522577372, + "grad_norm": 5.09375, + "learning_rate": 9.704103975012155e-06, + "loss": 1.68084412, + "memory(GiB)": 97.17, + "step": 12185, + "train_speed(iter/s)": 1.630589 + }, + { + "acc": 0.65555058, + "epoch": 0.30923389142567226, + "grad_norm": 5.5, + "learning_rate": 9.703748488321238e-06, + "loss": 1.58282928, + "memory(GiB)": 97.17, + "step": 12190, + "train_speed(iter/s)": 1.630663 + }, + { + "acc": 0.67006302, + "epoch": 0.3093607305936073, + "grad_norm": 5.96875, + "learning_rate": 9.703392794738464e-06, + "loss": 1.57716703, + "memory(GiB)": 97.17, + "step": 12195, + "train_speed(iter/s)": 1.630732 + }, + { + "acc": 0.64453869, + "epoch": 0.30948756976154235, + "grad_norm": 6.0625, + "learning_rate": 9.703036894279476e-06, + "loss": 1.6787096, + "memory(GiB)": 97.17, + "step": 12200, + "train_speed(iter/s)": 1.630804 + }, + { + "acc": 0.66845036, + "epoch": 0.30961440892947745, + "grad_norm": 6.375, + "learning_rate": 9.702680786959925e-06, + "loss": 1.54558868, + "memory(GiB)": 97.17, + "step": 12205, + "train_speed(iter/s)": 1.630872 + }, + { + "acc": 0.65537148, + "epoch": 0.3097412480974125, + "grad_norm": 5.21875, + "learning_rate": 9.702324472795481e-06, + "loss": 1.62108536, + "memory(GiB)": 97.17, + "step": 12210, + "train_speed(iter/s)": 1.630943 + }, + { + "acc": 0.64351082, + "epoch": 0.30986808726534754, + "grad_norm": 5.5625, + "learning_rate": 9.70196795180181e-06, + "loss": 1.64338341, + "memory(GiB)": 97.17, + "step": 12215, + "train_speed(iter/s)": 1.631015 + }, + { + "acc": 0.65183668, + "epoch": 0.3099949264332826, + "grad_norm": 6.65625, + "learning_rate": 9.701611223994596e-06, + "loss": 1.63149185, + "memory(GiB)": 97.17, + "step": 12220, + "train_speed(iter/s)": 1.631088 + }, + { + "acc": 0.6511651, + "epoch": 0.3101217656012177, + "grad_norm": 4.78125, + "learning_rate": 9.701254289389529e-06, + "loss": 1.60745201, + "memory(GiB)": 97.17, + "step": 12225, + "train_speed(iter/s)": 1.631161 + }, + { + "acc": 0.65827971, + "epoch": 0.3102486047691527, + "grad_norm": 6.6875, + "learning_rate": 9.700897148002308e-06, + "loss": 1.65823116, + "memory(GiB)": 97.17, + "step": 12230, + "train_speed(iter/s)": 1.631231 + }, + { + "acc": 0.65242763, + "epoch": 0.31037544393708777, + "grad_norm": 5.46875, + "learning_rate": 9.700539799848645e-06, + "loss": 1.66842232, + "memory(GiB)": 97.17, + "step": 12235, + "train_speed(iter/s)": 1.631302 + }, + { + "acc": 0.65059233, + "epoch": 0.3105022831050228, + "grad_norm": 4.96875, + "learning_rate": 9.700182244944252e-06, + "loss": 1.62010841, + "memory(GiB)": 97.17, + "step": 12240, + "train_speed(iter/s)": 1.631377 + }, + { + "acc": 0.6480607, + "epoch": 0.3106291222729579, + "grad_norm": 5.375, + "learning_rate": 9.69982448330486e-06, + "loss": 1.66656265, + "memory(GiB)": 97.17, + "step": 12245, + "train_speed(iter/s)": 1.631447 + }, + { + "acc": 0.64326897, + "epoch": 0.31075596144089296, + "grad_norm": 5.8125, + "learning_rate": 9.699466514946206e-06, + "loss": 1.75429573, + "memory(GiB)": 97.17, + "step": 12250, + "train_speed(iter/s)": 1.631516 + }, + { + "acc": 0.66316671, + "epoch": 0.310882800608828, + "grad_norm": 6.75, + "learning_rate": 9.699108339884032e-06, + "loss": 1.57260742, + "memory(GiB)": 97.17, + "step": 12255, + "train_speed(iter/s)": 1.631586 + }, + { + "acc": 0.63127174, + "epoch": 0.31100963977676305, + "grad_norm": 5.46875, + "learning_rate": 9.698749958134093e-06, + "loss": 1.7066433, + "memory(GiB)": 97.17, + "step": 12260, + "train_speed(iter/s)": 1.631652 + }, + { + "acc": 0.64093122, + "epoch": 0.31113647894469815, + "grad_norm": 6.0, + "learning_rate": 9.698391369712152e-06, + "loss": 1.72350273, + "memory(GiB)": 97.17, + "step": 12265, + "train_speed(iter/s)": 1.631722 + }, + { + "acc": 0.65811157, + "epoch": 0.3112633181126332, + "grad_norm": 5.28125, + "learning_rate": 9.698032574633982e-06, + "loss": 1.62617531, + "memory(GiB)": 97.17, + "step": 12270, + "train_speed(iter/s)": 1.631793 + }, + { + "acc": 0.65103946, + "epoch": 0.31139015728056824, + "grad_norm": 5.25, + "learning_rate": 9.697673572915364e-06, + "loss": 1.62757626, + "memory(GiB)": 97.17, + "step": 12275, + "train_speed(iter/s)": 1.631868 + }, + { + "acc": 0.6472517, + "epoch": 0.3115169964485033, + "grad_norm": 5.4375, + "learning_rate": 9.697314364572087e-06, + "loss": 1.64757442, + "memory(GiB)": 97.17, + "step": 12280, + "train_speed(iter/s)": 1.631934 + }, + { + "acc": 0.65679131, + "epoch": 0.3116438356164384, + "grad_norm": 5.625, + "learning_rate": 9.696954949619955e-06, + "loss": 1.67645168, + "memory(GiB)": 97.17, + "step": 12285, + "train_speed(iter/s)": 1.632006 + }, + { + "acc": 0.65980196, + "epoch": 0.3117706747843734, + "grad_norm": 5.09375, + "learning_rate": 9.696595328074774e-06, + "loss": 1.60186901, + "memory(GiB)": 97.17, + "step": 12290, + "train_speed(iter/s)": 1.632071 + }, + { + "acc": 0.6604579, + "epoch": 0.31189751395230847, + "grad_norm": 5.8125, + "learning_rate": 9.69623549995236e-06, + "loss": 1.63972206, + "memory(GiB)": 97.17, + "step": 12295, + "train_speed(iter/s)": 1.632142 + }, + { + "acc": 0.64192972, + "epoch": 0.3120243531202435, + "grad_norm": 5.96875, + "learning_rate": 9.695875465268543e-06, + "loss": 1.68872604, + "memory(GiB)": 97.17, + "step": 12300, + "train_speed(iter/s)": 1.632215 + }, + { + "acc": 0.65491533, + "epoch": 0.3121511922881786, + "grad_norm": 7.25, + "learning_rate": 9.695515224039156e-06, + "loss": 1.66243534, + "memory(GiB)": 97.17, + "step": 12305, + "train_speed(iter/s)": 1.632283 + }, + { + "acc": 0.6556776, + "epoch": 0.31227803145611366, + "grad_norm": 5.875, + "learning_rate": 9.695154776280047e-06, + "loss": 1.60202942, + "memory(GiB)": 97.17, + "step": 12310, + "train_speed(iter/s)": 1.63235 + }, + { + "acc": 0.64802794, + "epoch": 0.3124048706240487, + "grad_norm": 5.625, + "learning_rate": 9.694794122007067e-06, + "loss": 1.68175659, + "memory(GiB)": 97.17, + "step": 12315, + "train_speed(iter/s)": 1.632423 + }, + { + "acc": 0.64842658, + "epoch": 0.31253170979198375, + "grad_norm": 5.40625, + "learning_rate": 9.694433261236083e-06, + "loss": 1.6339119, + "memory(GiB)": 97.17, + "step": 12320, + "train_speed(iter/s)": 1.632493 + }, + { + "acc": 0.64380045, + "epoch": 0.31265854895991885, + "grad_norm": 5.5, + "learning_rate": 9.694072193982962e-06, + "loss": 1.7081356, + "memory(GiB)": 97.17, + "step": 12325, + "train_speed(iter/s)": 1.632561 + }, + { + "acc": 0.65466681, + "epoch": 0.3127853881278539, + "grad_norm": 5.5625, + "learning_rate": 9.69371092026359e-06, + "loss": 1.64989376, + "memory(GiB)": 97.17, + "step": 12330, + "train_speed(iter/s)": 1.632635 + }, + { + "acc": 0.67976747, + "epoch": 0.31291222729578894, + "grad_norm": 5.3125, + "learning_rate": 9.693349440093855e-06, + "loss": 1.58285637, + "memory(GiB)": 97.17, + "step": 12335, + "train_speed(iter/s)": 1.632708 + }, + { + "acc": 0.66369729, + "epoch": 0.313039066463724, + "grad_norm": 4.6875, + "learning_rate": 9.69298775348966e-06, + "loss": 1.52019424, + "memory(GiB)": 97.17, + "step": 12340, + "train_speed(iter/s)": 1.632773 + }, + { + "acc": 0.65198488, + "epoch": 0.3131659056316591, + "grad_norm": 6.0625, + "learning_rate": 9.69262586046691e-06, + "loss": 1.63689671, + "memory(GiB)": 97.17, + "step": 12345, + "train_speed(iter/s)": 1.632844 + }, + { + "acc": 0.65919633, + "epoch": 0.3132927447995941, + "grad_norm": 5.84375, + "learning_rate": 9.692263761041521e-06, + "loss": 1.65497913, + "memory(GiB)": 97.17, + "step": 12350, + "train_speed(iter/s)": 1.632912 + }, + { + "acc": 0.64915676, + "epoch": 0.31341958396752917, + "grad_norm": 4.59375, + "learning_rate": 9.691901455229425e-06, + "loss": 1.62881203, + "memory(GiB)": 97.17, + "step": 12355, + "train_speed(iter/s)": 1.632984 + }, + { + "acc": 0.6485455, + "epoch": 0.3135464231354642, + "grad_norm": 5.40625, + "learning_rate": 9.691538943046552e-06, + "loss": 1.64240589, + "memory(GiB)": 97.17, + "step": 12360, + "train_speed(iter/s)": 1.633054 + }, + { + "acc": 0.6553206, + "epoch": 0.3136732623033993, + "grad_norm": 4.75, + "learning_rate": 9.691176224508853e-06, + "loss": 1.66156693, + "memory(GiB)": 97.17, + "step": 12365, + "train_speed(iter/s)": 1.633121 + }, + { + "acc": 0.64690952, + "epoch": 0.31380010147133436, + "grad_norm": 5.78125, + "learning_rate": 9.690813299632278e-06, + "loss": 1.60425549, + "memory(GiB)": 97.17, + "step": 12370, + "train_speed(iter/s)": 1.633194 + }, + { + "acc": 0.65716491, + "epoch": 0.3139269406392694, + "grad_norm": 5.1875, + "learning_rate": 9.690450168432793e-06, + "loss": 1.66567345, + "memory(GiB)": 97.17, + "step": 12375, + "train_speed(iter/s)": 1.63326 + }, + { + "acc": 0.63407702, + "epoch": 0.31405377980720445, + "grad_norm": 5.0625, + "learning_rate": 9.690086830926366e-06, + "loss": 1.64914207, + "memory(GiB)": 97.17, + "step": 12380, + "train_speed(iter/s)": 1.633327 + }, + { + "acc": 0.63304691, + "epoch": 0.31418061897513955, + "grad_norm": 5.125, + "learning_rate": 9.689723287128981e-06, + "loss": 1.68027935, + "memory(GiB)": 97.17, + "step": 12385, + "train_speed(iter/s)": 1.633393 + }, + { + "acc": 0.65252461, + "epoch": 0.3143074581430746, + "grad_norm": 5.9375, + "learning_rate": 9.689359537056628e-06, + "loss": 1.63740005, + "memory(GiB)": 97.17, + "step": 12390, + "train_speed(iter/s)": 1.633464 + }, + { + "acc": 0.64774771, + "epoch": 0.31443429731100964, + "grad_norm": 5.71875, + "learning_rate": 9.688995580725304e-06, + "loss": 1.59514341, + "memory(GiB)": 97.17, + "step": 12395, + "train_speed(iter/s)": 1.633533 + }, + { + "acc": 0.65391817, + "epoch": 0.3145611364789447, + "grad_norm": 5.84375, + "learning_rate": 9.688631418151022e-06, + "loss": 1.58650837, + "memory(GiB)": 97.17, + "step": 12400, + "train_speed(iter/s)": 1.633604 + }, + { + "acc": 0.66186643, + "epoch": 0.3146879756468798, + "grad_norm": 5.90625, + "learning_rate": 9.688267049349796e-06, + "loss": 1.59120388, + "memory(GiB)": 97.17, + "step": 12405, + "train_speed(iter/s)": 1.633675 + }, + { + "acc": 0.64244699, + "epoch": 0.3148148148148148, + "grad_norm": 5.3125, + "learning_rate": 9.687902474337654e-06, + "loss": 1.72289429, + "memory(GiB)": 97.17, + "step": 12410, + "train_speed(iter/s)": 1.63374 + }, + { + "acc": 0.64908428, + "epoch": 0.31494165398274987, + "grad_norm": 5.0625, + "learning_rate": 9.687537693130631e-06, + "loss": 1.6042099, + "memory(GiB)": 97.17, + "step": 12415, + "train_speed(iter/s)": 1.63381 + }, + { + "acc": 0.64052429, + "epoch": 0.3150684931506849, + "grad_norm": 5.4375, + "learning_rate": 9.687172705744773e-06, + "loss": 1.68112698, + "memory(GiB)": 97.17, + "step": 12420, + "train_speed(iter/s)": 1.633878 + }, + { + "acc": 0.63257399, + "epoch": 0.31519533231862, + "grad_norm": 5.875, + "learning_rate": 9.686807512196132e-06, + "loss": 1.7116066, + "memory(GiB)": 97.17, + "step": 12425, + "train_speed(iter/s)": 1.633949 + }, + { + "acc": 0.65500231, + "epoch": 0.31532217148655506, + "grad_norm": 5.875, + "learning_rate": 9.68644211250077e-06, + "loss": 1.61961002, + "memory(GiB)": 97.17, + "step": 12430, + "train_speed(iter/s)": 1.634019 + }, + { + "acc": 0.62910328, + "epoch": 0.3154490106544901, + "grad_norm": 6.15625, + "learning_rate": 9.686076506674761e-06, + "loss": 1.68652954, + "memory(GiB)": 97.17, + "step": 12435, + "train_speed(iter/s)": 1.634088 + }, + { + "acc": 0.64076643, + "epoch": 0.31557584982242515, + "grad_norm": 5.09375, + "learning_rate": 9.685710694734187e-06, + "loss": 1.68172646, + "memory(GiB)": 97.17, + "step": 12440, + "train_speed(iter/s)": 1.634161 + }, + { + "acc": 0.64843788, + "epoch": 0.31570268899036025, + "grad_norm": 5.78125, + "learning_rate": 9.685344676695135e-06, + "loss": 1.63500252, + "memory(GiB)": 97.17, + "step": 12445, + "train_speed(iter/s)": 1.63423 + }, + { + "acc": 0.6416647, + "epoch": 0.3158295281582953, + "grad_norm": 7.28125, + "learning_rate": 9.684978452573706e-06, + "loss": 1.67504749, + "memory(GiB)": 97.17, + "step": 12450, + "train_speed(iter/s)": 1.634295 + }, + { + "acc": 0.6630147, + "epoch": 0.31595636732623034, + "grad_norm": 4.78125, + "learning_rate": 9.684612022386008e-06, + "loss": 1.5468152, + "memory(GiB)": 97.17, + "step": 12455, + "train_speed(iter/s)": 1.634358 + }, + { + "acc": 0.63974771, + "epoch": 0.3160832064941654, + "grad_norm": 4.6875, + "learning_rate": 9.68424538614816e-06, + "loss": 1.67091637, + "memory(GiB)": 97.17, + "step": 12460, + "train_speed(iter/s)": 1.634426 + }, + { + "acc": 0.64996028, + "epoch": 0.3162100456621005, + "grad_norm": 5.53125, + "learning_rate": 9.683878543876281e-06, + "loss": 1.66914978, + "memory(GiB)": 97.17, + "step": 12465, + "train_speed(iter/s)": 1.634497 + }, + { + "acc": 0.65390582, + "epoch": 0.3163368848300355, + "grad_norm": 5.90625, + "learning_rate": 9.683511495586516e-06, + "loss": 1.60467873, + "memory(GiB)": 97.17, + "step": 12470, + "train_speed(iter/s)": 1.63457 + }, + { + "acc": 0.65536156, + "epoch": 0.31646372399797057, + "grad_norm": 5.46875, + "learning_rate": 9.683144241295003e-06, + "loss": 1.57642612, + "memory(GiB)": 97.17, + "step": 12475, + "train_speed(iter/s)": 1.634627 + }, + { + "acc": 0.65573554, + "epoch": 0.3165905631659056, + "grad_norm": 7.40625, + "learning_rate": 9.682776781017899e-06, + "loss": 1.61298599, + "memory(GiB)": 97.17, + "step": 12480, + "train_speed(iter/s)": 1.634694 + }, + { + "acc": 0.63949137, + "epoch": 0.3167174023338407, + "grad_norm": 6.125, + "learning_rate": 9.682409114771364e-06, + "loss": 1.67794991, + "memory(GiB)": 97.17, + "step": 12485, + "train_speed(iter/s)": 1.634759 + }, + { + "acc": 0.63329649, + "epoch": 0.31684424150177576, + "grad_norm": 5.5625, + "learning_rate": 9.682041242571571e-06, + "loss": 1.64192829, + "memory(GiB)": 97.17, + "step": 12490, + "train_speed(iter/s)": 1.634828 + }, + { + "acc": 0.65123, + "epoch": 0.3169710806697108, + "grad_norm": 6.125, + "learning_rate": 9.681673164434701e-06, + "loss": 1.65969276, + "memory(GiB)": 97.17, + "step": 12495, + "train_speed(iter/s)": 1.6349 + }, + { + "acc": 0.660251, + "epoch": 0.31709791983764585, + "grad_norm": 4.53125, + "learning_rate": 9.681304880376942e-06, + "loss": 1.60781631, + "memory(GiB)": 97.17, + "step": 12500, + "train_speed(iter/s)": 1.634967 + }, + { + "acc": 0.65085888, + "epoch": 0.31722475900558095, + "grad_norm": 5.5, + "learning_rate": 9.680936390414495e-06, + "loss": 1.73761711, + "memory(GiB)": 97.17, + "step": 12505, + "train_speed(iter/s)": 1.635035 + }, + { + "acc": 0.64507508, + "epoch": 0.317351598173516, + "grad_norm": 6.125, + "learning_rate": 9.680567694563566e-06, + "loss": 1.64682598, + "memory(GiB)": 97.17, + "step": 12510, + "train_speed(iter/s)": 1.635099 + }, + { + "acc": 0.65353298, + "epoch": 0.31747843734145104, + "grad_norm": 5.625, + "learning_rate": 9.680198792840371e-06, + "loss": 1.64877968, + "memory(GiB)": 97.17, + "step": 12515, + "train_speed(iter/s)": 1.635166 + }, + { + "acc": 0.6431283, + "epoch": 0.3176052765093861, + "grad_norm": 5.4375, + "learning_rate": 9.67982968526114e-06, + "loss": 1.63001709, + "memory(GiB)": 97.17, + "step": 12520, + "train_speed(iter/s)": 1.635227 + }, + { + "acc": 0.66182041, + "epoch": 0.3177321156773212, + "grad_norm": 5.53125, + "learning_rate": 9.679460371842104e-06, + "loss": 1.57468204, + "memory(GiB)": 97.17, + "step": 12525, + "train_speed(iter/s)": 1.635293 + }, + { + "acc": 0.64837599, + "epoch": 0.3178589548452562, + "grad_norm": 5.6875, + "learning_rate": 9.679090852599508e-06, + "loss": 1.65184116, + "memory(GiB)": 97.17, + "step": 12530, + "train_speed(iter/s)": 1.635362 + }, + { + "acc": 0.64117403, + "epoch": 0.31798579401319127, + "grad_norm": 5.53125, + "learning_rate": 9.678721127549608e-06, + "loss": 1.67416115, + "memory(GiB)": 97.17, + "step": 12535, + "train_speed(iter/s)": 1.635433 + }, + { + "acc": 0.63163776, + "epoch": 0.3181126331811263, + "grad_norm": 6.84375, + "learning_rate": 9.678351196708662e-06, + "loss": 1.76682434, + "memory(GiB)": 97.17, + "step": 12540, + "train_speed(iter/s)": 1.6355 + }, + { + "acc": 0.64566784, + "epoch": 0.3182394723490614, + "grad_norm": 6.25, + "learning_rate": 9.677981060092943e-06, + "loss": 1.69713898, + "memory(GiB)": 97.17, + "step": 12545, + "train_speed(iter/s)": 1.635566 + }, + { + "acc": 0.63155918, + "epoch": 0.31836631151699646, + "grad_norm": 4.53125, + "learning_rate": 9.677610717718732e-06, + "loss": 1.71420555, + "memory(GiB)": 97.17, + "step": 12550, + "train_speed(iter/s)": 1.635628 + }, + { + "acc": 0.62976179, + "epoch": 0.3184931506849315, + "grad_norm": 6.6875, + "learning_rate": 9.677240169602317e-06, + "loss": 1.70730667, + "memory(GiB)": 97.17, + "step": 12555, + "train_speed(iter/s)": 1.635697 + }, + { + "acc": 0.65112371, + "epoch": 0.31861998985286655, + "grad_norm": 6.625, + "learning_rate": 9.676869415759999e-06, + "loss": 1.67480469, + "memory(GiB)": 97.17, + "step": 12560, + "train_speed(iter/s)": 1.63576 + }, + { + "acc": 0.63670521, + "epoch": 0.31874682902080165, + "grad_norm": 5.4375, + "learning_rate": 9.67649845620808e-06, + "loss": 1.7223175, + "memory(GiB)": 97.17, + "step": 12565, + "train_speed(iter/s)": 1.635829 + }, + { + "acc": 0.65497069, + "epoch": 0.3188736681887367, + "grad_norm": 5.21875, + "learning_rate": 9.676127290962883e-06, + "loss": 1.67975903, + "memory(GiB)": 97.17, + "step": 12570, + "train_speed(iter/s)": 1.635893 + }, + { + "acc": 0.65211391, + "epoch": 0.31900050735667174, + "grad_norm": 5.8125, + "learning_rate": 9.675755920040728e-06, + "loss": 1.62615433, + "memory(GiB)": 97.17, + "step": 12575, + "train_speed(iter/s)": 1.635957 + }, + { + "acc": 0.66108007, + "epoch": 0.3191273465246068, + "grad_norm": 5.09375, + "learning_rate": 9.675384343457954e-06, + "loss": 1.63112106, + "memory(GiB)": 97.17, + "step": 12580, + "train_speed(iter/s)": 1.636016 + }, + { + "acc": 0.64188585, + "epoch": 0.3192541856925419, + "grad_norm": 5.125, + "learning_rate": 9.675012561230901e-06, + "loss": 1.69303932, + "memory(GiB)": 97.17, + "step": 12585, + "train_speed(iter/s)": 1.636082 + }, + { + "acc": 0.65562472, + "epoch": 0.3193810248604769, + "grad_norm": 5.90625, + "learning_rate": 9.674640573375924e-06, + "loss": 1.65416985, + "memory(GiB)": 97.17, + "step": 12590, + "train_speed(iter/s)": 1.636152 + }, + { + "acc": 0.64916744, + "epoch": 0.31950786402841197, + "grad_norm": 6.6875, + "learning_rate": 9.674268379909383e-06, + "loss": 1.68290672, + "memory(GiB)": 97.17, + "step": 12595, + "train_speed(iter/s)": 1.636221 + }, + { + "acc": 0.64814253, + "epoch": 0.319634703196347, + "grad_norm": 7.15625, + "learning_rate": 9.67389598084765e-06, + "loss": 1.63947773, + "memory(GiB)": 97.17, + "step": 12600, + "train_speed(iter/s)": 1.636285 + }, + { + "acc": 0.64805546, + "epoch": 0.3197615423642821, + "grad_norm": 7.46875, + "learning_rate": 9.673523376207103e-06, + "loss": 1.60097141, + "memory(GiB)": 97.17, + "step": 12605, + "train_speed(iter/s)": 1.636353 + }, + { + "acc": 0.64015789, + "epoch": 0.31988838153221716, + "grad_norm": 7.0, + "learning_rate": 9.673150566004135e-06, + "loss": 1.66025257, + "memory(GiB)": 97.17, + "step": 12610, + "train_speed(iter/s)": 1.636425 + }, + { + "acc": 0.64231443, + "epoch": 0.3200152207001522, + "grad_norm": 5.34375, + "learning_rate": 9.672777550255137e-06, + "loss": 1.72642555, + "memory(GiB)": 97.17, + "step": 12615, + "train_speed(iter/s)": 1.636493 + }, + { + "acc": 0.65578756, + "epoch": 0.32014205986808725, + "grad_norm": 5.625, + "learning_rate": 9.672404328976523e-06, + "loss": 1.66687927, + "memory(GiB)": 97.17, + "step": 12620, + "train_speed(iter/s)": 1.636556 + }, + { + "acc": 0.65339026, + "epoch": 0.32026889903602235, + "grad_norm": 5.71875, + "learning_rate": 9.672030902184706e-06, + "loss": 1.64034901, + "memory(GiB)": 97.17, + "step": 12625, + "train_speed(iter/s)": 1.636618 + }, + { + "acc": 0.64897451, + "epoch": 0.3203957382039574, + "grad_norm": 5.0, + "learning_rate": 9.671657269896108e-06, + "loss": 1.72187843, + "memory(GiB)": 97.17, + "step": 12630, + "train_speed(iter/s)": 1.636681 + }, + { + "acc": 0.65473375, + "epoch": 0.32052257737189244, + "grad_norm": 4.84375, + "learning_rate": 9.671283432127169e-06, + "loss": 1.63135338, + "memory(GiB)": 97.17, + "step": 12635, + "train_speed(iter/s)": 1.636748 + }, + { + "acc": 0.65806017, + "epoch": 0.3206494165398275, + "grad_norm": 5.5625, + "learning_rate": 9.670909388894328e-06, + "loss": 1.58408194, + "memory(GiB)": 97.17, + "step": 12640, + "train_speed(iter/s)": 1.636818 + }, + { + "acc": 0.64611273, + "epoch": 0.3207762557077626, + "grad_norm": 4.6875, + "learning_rate": 9.670535140214037e-06, + "loss": 1.66608925, + "memory(GiB)": 97.17, + "step": 12645, + "train_speed(iter/s)": 1.636885 + }, + { + "acc": 0.65427494, + "epoch": 0.3209030948756976, + "grad_norm": 5.65625, + "learning_rate": 9.670160686102759e-06, + "loss": 1.65067062, + "memory(GiB)": 97.17, + "step": 12650, + "train_speed(iter/s)": 1.636951 + }, + { + "acc": 0.64160805, + "epoch": 0.32102993404363267, + "grad_norm": 5.40625, + "learning_rate": 9.669786026576962e-06, + "loss": 1.68313408, + "memory(GiB)": 97.17, + "step": 12655, + "train_speed(iter/s)": 1.637009 + }, + { + "acc": 0.6566082, + "epoch": 0.3211567732115677, + "grad_norm": 5.46875, + "learning_rate": 9.669411161653127e-06, + "loss": 1.63150597, + "memory(GiB)": 97.17, + "step": 12660, + "train_speed(iter/s)": 1.637074 + }, + { + "acc": 0.64243369, + "epoch": 0.3212836123795028, + "grad_norm": 5.3125, + "learning_rate": 9.669036091347742e-06, + "loss": 1.61782188, + "memory(GiB)": 97.17, + "step": 12665, + "train_speed(iter/s)": 1.637142 + }, + { + "acc": 0.65017185, + "epoch": 0.32141045154743786, + "grad_norm": 6.4375, + "learning_rate": 9.668660815677304e-06, + "loss": 1.59754953, + "memory(GiB)": 97.17, + "step": 12670, + "train_speed(iter/s)": 1.637204 + }, + { + "acc": 0.67116165, + "epoch": 0.3215372907153729, + "grad_norm": 6.53125, + "learning_rate": 9.668285334658319e-06, + "loss": 1.57749071, + "memory(GiB)": 97.17, + "step": 12675, + "train_speed(iter/s)": 1.637272 + }, + { + "acc": 0.6393919, + "epoch": 0.32166412988330795, + "grad_norm": 5.1875, + "learning_rate": 9.667909648307302e-06, + "loss": 1.68731289, + "memory(GiB)": 97.17, + "step": 12680, + "train_speed(iter/s)": 1.637337 + }, + { + "acc": 0.66831617, + "epoch": 0.32179096905124305, + "grad_norm": 7.0625, + "learning_rate": 9.66753375664078e-06, + "loss": 1.59963207, + "memory(GiB)": 97.17, + "step": 12685, + "train_speed(iter/s)": 1.637401 + }, + { + "acc": 0.6519949, + "epoch": 0.3219178082191781, + "grad_norm": 6.0625, + "learning_rate": 9.667157659675284e-06, + "loss": 1.63812599, + "memory(GiB)": 97.17, + "step": 12690, + "train_speed(iter/s)": 1.637464 + }, + { + "acc": 0.65419474, + "epoch": 0.32204464738711314, + "grad_norm": 5.46875, + "learning_rate": 9.666781357427355e-06, + "loss": 1.65822525, + "memory(GiB)": 97.17, + "step": 12695, + "train_speed(iter/s)": 1.637528 + }, + { + "acc": 0.64320679, + "epoch": 0.3221714865550482, + "grad_norm": 5.5, + "learning_rate": 9.666404849913546e-06, + "loss": 1.65864391, + "memory(GiB)": 97.17, + "step": 12700, + "train_speed(iter/s)": 1.637593 + }, + { + "acc": 0.65752296, + "epoch": 0.3222983257229833, + "grad_norm": 6.6875, + "learning_rate": 9.66602813715042e-06, + "loss": 1.60788002, + "memory(GiB)": 97.17, + "step": 12705, + "train_speed(iter/s)": 1.637662 + }, + { + "acc": 0.65540419, + "epoch": 0.3224251648909183, + "grad_norm": 5.65625, + "learning_rate": 9.665651219154543e-06, + "loss": 1.6908247, + "memory(GiB)": 97.17, + "step": 12710, + "train_speed(iter/s)": 1.637727 + }, + { + "acc": 0.6586338, + "epoch": 0.32255200405885337, + "grad_norm": 4.875, + "learning_rate": 9.665274095942495e-06, + "loss": 1.6695631, + "memory(GiB)": 97.17, + "step": 12715, + "train_speed(iter/s)": 1.637792 + }, + { + "acc": 0.65187531, + "epoch": 0.3226788432267884, + "grad_norm": 7.0, + "learning_rate": 9.664896767530862e-06, + "loss": 1.68884411, + "memory(GiB)": 97.17, + "step": 12720, + "train_speed(iter/s)": 1.637858 + }, + { + "acc": 0.65139108, + "epoch": 0.3228056823947235, + "grad_norm": 5.125, + "learning_rate": 9.664519233936242e-06, + "loss": 1.67368927, + "memory(GiB)": 97.17, + "step": 12725, + "train_speed(iter/s)": 1.637926 + }, + { + "acc": 0.64907026, + "epoch": 0.32293252156265856, + "grad_norm": 4.90625, + "learning_rate": 9.664141495175242e-06, + "loss": 1.57588377, + "memory(GiB)": 97.17, + "step": 12730, + "train_speed(iter/s)": 1.637988 + }, + { + "acc": 0.65204868, + "epoch": 0.3230593607305936, + "grad_norm": 5.09375, + "learning_rate": 9.663763551264476e-06, + "loss": 1.66170807, + "memory(GiB)": 97.17, + "step": 12735, + "train_speed(iter/s)": 1.638046 + }, + { + "acc": 0.65029244, + "epoch": 0.32318619989852865, + "grad_norm": 6.1875, + "learning_rate": 9.663385402220565e-06, + "loss": 1.60605583, + "memory(GiB)": 97.17, + "step": 12740, + "train_speed(iter/s)": 1.638107 + }, + { + "acc": 0.65202026, + "epoch": 0.32331303906646375, + "grad_norm": 6.3125, + "learning_rate": 9.663007048060144e-06, + "loss": 1.70589123, + "memory(GiB)": 97.17, + "step": 12745, + "train_speed(iter/s)": 1.638169 + }, + { + "acc": 0.66016808, + "epoch": 0.3234398782343988, + "grad_norm": 5.34375, + "learning_rate": 9.662628488799854e-06, + "loss": 1.5867878, + "memory(GiB)": 97.17, + "step": 12750, + "train_speed(iter/s)": 1.638229 + }, + { + "acc": 0.64225903, + "epoch": 0.32356671740233384, + "grad_norm": 6.03125, + "learning_rate": 9.662249724456346e-06, + "loss": 1.62411652, + "memory(GiB)": 97.17, + "step": 12755, + "train_speed(iter/s)": 1.638292 + }, + { + "acc": 0.65363932, + "epoch": 0.3236935565702689, + "grad_norm": 6.34375, + "learning_rate": 9.661870755046278e-06, + "loss": 1.64809647, + "memory(GiB)": 97.17, + "step": 12760, + "train_speed(iter/s)": 1.63836 + }, + { + "acc": 0.64653487, + "epoch": 0.323820395738204, + "grad_norm": 4.84375, + "learning_rate": 9.661491580586322e-06, + "loss": 1.58892832, + "memory(GiB)": 97.17, + "step": 12765, + "train_speed(iter/s)": 1.638417 + }, + { + "acc": 0.64173794, + "epoch": 0.323947234906139, + "grad_norm": 6.4375, + "learning_rate": 9.661112201093156e-06, + "loss": 1.65680523, + "memory(GiB)": 97.17, + "step": 12770, + "train_speed(iter/s)": 1.638486 + }, + { + "acc": 0.64310226, + "epoch": 0.32407407407407407, + "grad_norm": 5.71875, + "learning_rate": 9.660732616583463e-06, + "loss": 1.6770319, + "memory(GiB)": 97.17, + "step": 12775, + "train_speed(iter/s)": 1.638548 + }, + { + "acc": 0.66438527, + "epoch": 0.3242009132420091, + "grad_norm": 5.6875, + "learning_rate": 9.660352827073941e-06, + "loss": 1.60030174, + "memory(GiB)": 97.17, + "step": 12780, + "train_speed(iter/s)": 1.63861 + }, + { + "acc": 0.63418293, + "epoch": 0.3243277524099442, + "grad_norm": 5.3125, + "learning_rate": 9.659972832581295e-06, + "loss": 1.66988316, + "memory(GiB)": 97.17, + "step": 12785, + "train_speed(iter/s)": 1.638677 + }, + { + "acc": 0.64672856, + "epoch": 0.32445459157787926, + "grad_norm": 5.46875, + "learning_rate": 9.659592633122238e-06, + "loss": 1.62647552, + "memory(GiB)": 97.17, + "step": 12790, + "train_speed(iter/s)": 1.638746 + }, + { + "acc": 0.64901643, + "epoch": 0.3245814307458143, + "grad_norm": 8.625, + "learning_rate": 9.659212228713495e-06, + "loss": 1.66549568, + "memory(GiB)": 97.17, + "step": 12795, + "train_speed(iter/s)": 1.638812 + }, + { + "acc": 0.6582293, + "epoch": 0.32470826991374935, + "grad_norm": 6.125, + "learning_rate": 9.658831619371793e-06, + "loss": 1.61192818, + "memory(GiB)": 97.17, + "step": 12800, + "train_speed(iter/s)": 1.638876 + }, + { + "acc": 0.64622264, + "epoch": 0.32483510908168445, + "grad_norm": 6.21875, + "learning_rate": 9.658450805113879e-06, + "loss": 1.70059204, + "memory(GiB)": 97.17, + "step": 12805, + "train_speed(iter/s)": 1.638943 + }, + { + "acc": 0.66009068, + "epoch": 0.3249619482496195, + "grad_norm": 4.78125, + "learning_rate": 9.6580697859565e-06, + "loss": 1.58308096, + "memory(GiB)": 97.17, + "step": 12810, + "train_speed(iter/s)": 1.639009 + }, + { + "acc": 0.67253685, + "epoch": 0.32508878741755454, + "grad_norm": 4.84375, + "learning_rate": 9.657688561916414e-06, + "loss": 1.49973269, + "memory(GiB)": 97.17, + "step": 12815, + "train_speed(iter/s)": 1.639066 + }, + { + "acc": 0.66676235, + "epoch": 0.3252156265854896, + "grad_norm": 6.03125, + "learning_rate": 9.65730713301039e-06, + "loss": 1.57210884, + "memory(GiB)": 97.17, + "step": 12820, + "train_speed(iter/s)": 1.639131 + }, + { + "acc": 0.64397898, + "epoch": 0.3253424657534247, + "grad_norm": 6.375, + "learning_rate": 9.656925499255206e-06, + "loss": 1.63543816, + "memory(GiB)": 97.17, + "step": 12825, + "train_speed(iter/s)": 1.639201 + }, + { + "acc": 0.65438166, + "epoch": 0.3254693049213597, + "grad_norm": 5.0625, + "learning_rate": 9.656543660667646e-06, + "loss": 1.59673157, + "memory(GiB)": 97.17, + "step": 12830, + "train_speed(iter/s)": 1.639267 + }, + { + "acc": 0.66472788, + "epoch": 0.32559614408929477, + "grad_norm": 5.75, + "learning_rate": 9.656161617264507e-06, + "loss": 1.5379261, + "memory(GiB)": 97.17, + "step": 12835, + "train_speed(iter/s)": 1.639332 + }, + { + "acc": 0.63419309, + "epoch": 0.3257229832572298, + "grad_norm": 6.625, + "learning_rate": 9.65577936906259e-06, + "loss": 1.64864674, + "memory(GiB)": 97.17, + "step": 12840, + "train_speed(iter/s)": 1.639392 + }, + { + "acc": 0.63277168, + "epoch": 0.3258498224251649, + "grad_norm": 6.90625, + "learning_rate": 9.65539691607871e-06, + "loss": 1.64743996, + "memory(GiB)": 97.17, + "step": 12845, + "train_speed(iter/s)": 1.639456 + }, + { + "acc": 0.64508448, + "epoch": 0.32597666159309996, + "grad_norm": 4.8125, + "learning_rate": 9.65501425832969e-06, + "loss": 1.677705, + "memory(GiB)": 97.17, + "step": 12850, + "train_speed(iter/s)": 1.639514 + }, + { + "acc": 0.64293809, + "epoch": 0.326103500761035, + "grad_norm": 5.3125, + "learning_rate": 9.65463139583236e-06, + "loss": 1.65923309, + "memory(GiB)": 97.17, + "step": 12855, + "train_speed(iter/s)": 1.639579 + }, + { + "acc": 0.64264169, + "epoch": 0.32623033992897005, + "grad_norm": 5.6875, + "learning_rate": 9.65424832860356e-06, + "loss": 1.65921211, + "memory(GiB)": 97.17, + "step": 12860, + "train_speed(iter/s)": 1.639647 + }, + { + "acc": 0.64991336, + "epoch": 0.32635717909690515, + "grad_norm": 5.625, + "learning_rate": 9.653865056660136e-06, + "loss": 1.64876823, + "memory(GiB)": 97.17, + "step": 12865, + "train_speed(iter/s)": 1.63971 + }, + { + "acc": 0.64246111, + "epoch": 0.3264840182648402, + "grad_norm": 5.5625, + "learning_rate": 9.653481580018951e-06, + "loss": 1.6958704, + "memory(GiB)": 97.17, + "step": 12870, + "train_speed(iter/s)": 1.639775 + }, + { + "acc": 0.6485539, + "epoch": 0.32661085743277524, + "grad_norm": 6.375, + "learning_rate": 9.653097898696869e-06, + "loss": 1.69870033, + "memory(GiB)": 97.17, + "step": 12875, + "train_speed(iter/s)": 1.639837 + }, + { + "acc": 0.64272871, + "epoch": 0.3267376966007103, + "grad_norm": 6.46875, + "learning_rate": 9.652714012710766e-06, + "loss": 1.67902279, + "memory(GiB)": 97.17, + "step": 12880, + "train_speed(iter/s)": 1.639903 + }, + { + "acc": 0.64827604, + "epoch": 0.3268645357686454, + "grad_norm": 6.875, + "learning_rate": 9.65232992207753e-06, + "loss": 1.7218359, + "memory(GiB)": 97.17, + "step": 12885, + "train_speed(iter/s)": 1.639972 + }, + { + "acc": 0.64713454, + "epoch": 0.3269913749365804, + "grad_norm": 4.75, + "learning_rate": 9.651945626814052e-06, + "loss": 1.68011665, + "memory(GiB)": 97.17, + "step": 12890, + "train_speed(iter/s)": 1.64004 + }, + { + "acc": 0.65753012, + "epoch": 0.32711821410451547, + "grad_norm": 6.53125, + "learning_rate": 9.651561126937236e-06, + "loss": 1.61671028, + "memory(GiB)": 97.17, + "step": 12895, + "train_speed(iter/s)": 1.640102 + }, + { + "acc": 0.64125328, + "epoch": 0.3272450532724505, + "grad_norm": 6.625, + "learning_rate": 9.651176422463994e-06, + "loss": 1.71301308, + "memory(GiB)": 97.17, + "step": 12900, + "train_speed(iter/s)": 1.640165 + }, + { + "acc": 0.65696259, + "epoch": 0.3273718924403856, + "grad_norm": 4.75, + "learning_rate": 9.650791513411246e-06, + "loss": 1.59470024, + "memory(GiB)": 97.17, + "step": 12905, + "train_speed(iter/s)": 1.640227 + }, + { + "acc": 0.63237667, + "epoch": 0.32749873160832066, + "grad_norm": 4.65625, + "learning_rate": 9.650406399795924e-06, + "loss": 1.69231205, + "memory(GiB)": 97.17, + "step": 12910, + "train_speed(iter/s)": 1.640295 + }, + { + "acc": 0.64365764, + "epoch": 0.3276255707762557, + "grad_norm": 7.40625, + "learning_rate": 9.650021081634965e-06, + "loss": 1.68579178, + "memory(GiB)": 97.17, + "step": 12915, + "train_speed(iter/s)": 1.640364 + }, + { + "acc": 0.63840742, + "epoch": 0.32775240994419075, + "grad_norm": 5.8125, + "learning_rate": 9.649635558945318e-06, + "loss": 1.65430431, + "memory(GiB)": 97.17, + "step": 12920, + "train_speed(iter/s)": 1.640429 + }, + { + "acc": 0.64144392, + "epoch": 0.32787924911212585, + "grad_norm": 4.59375, + "learning_rate": 9.649249831743941e-06, + "loss": 1.68406563, + "memory(GiB)": 97.17, + "step": 12925, + "train_speed(iter/s)": 1.640488 + }, + { + "acc": 0.64603863, + "epoch": 0.3280060882800609, + "grad_norm": 5.34375, + "learning_rate": 9.6488639000478e-06, + "loss": 1.68732719, + "memory(GiB)": 97.17, + "step": 12930, + "train_speed(iter/s)": 1.640549 + }, + { + "acc": 0.6613368, + "epoch": 0.32813292744799594, + "grad_norm": 5.65625, + "learning_rate": 9.648477763873868e-06, + "loss": 1.64164238, + "memory(GiB)": 97.17, + "step": 12935, + "train_speed(iter/s)": 1.640607 + }, + { + "acc": 0.6546916, + "epoch": 0.328259766615931, + "grad_norm": 5.75, + "learning_rate": 9.648091423239128e-06, + "loss": 1.5988987, + "memory(GiB)": 97.17, + "step": 12940, + "train_speed(iter/s)": 1.640673 + }, + { + "acc": 0.65834322, + "epoch": 0.3283866057838661, + "grad_norm": 5.59375, + "learning_rate": 9.647704878160576e-06, + "loss": 1.57230587, + "memory(GiB)": 97.17, + "step": 12945, + "train_speed(iter/s)": 1.640735 + }, + { + "acc": 0.63474984, + "epoch": 0.3285134449518011, + "grad_norm": 5.8125, + "learning_rate": 9.647318128655213e-06, + "loss": 1.68502693, + "memory(GiB)": 97.17, + "step": 12950, + "train_speed(iter/s)": 1.640797 + }, + { + "acc": 0.66642752, + "epoch": 0.32864028411973617, + "grad_norm": 4.84375, + "learning_rate": 9.64693117474005e-06, + "loss": 1.58760185, + "memory(GiB)": 97.17, + "step": 12955, + "train_speed(iter/s)": 1.64085 + }, + { + "acc": 0.65560608, + "epoch": 0.3287671232876712, + "grad_norm": 5.84375, + "learning_rate": 9.646544016432109e-06, + "loss": 1.64631081, + "memory(GiB)": 97.17, + "step": 12960, + "train_speed(iter/s)": 1.640916 + }, + { + "acc": 0.66386285, + "epoch": 0.3288939624556063, + "grad_norm": 6.15625, + "learning_rate": 9.646156653748415e-06, + "loss": 1.61627159, + "memory(GiB)": 97.17, + "step": 12965, + "train_speed(iter/s)": 1.640978 + }, + { + "acc": 0.65677385, + "epoch": 0.32902080162354136, + "grad_norm": 5.84375, + "learning_rate": 9.645769086706008e-06, + "loss": 1.690905, + "memory(GiB)": 97.17, + "step": 12970, + "train_speed(iter/s)": 1.641036 + }, + { + "acc": 0.65197349, + "epoch": 0.3291476407914764, + "grad_norm": 4.5625, + "learning_rate": 9.645381315321934e-06, + "loss": 1.63398762, + "memory(GiB)": 97.17, + "step": 12975, + "train_speed(iter/s)": 1.641102 + }, + { + "acc": 0.64948888, + "epoch": 0.32927447995941145, + "grad_norm": 4.84375, + "learning_rate": 9.64499333961325e-06, + "loss": 1.66944027, + "memory(GiB)": 97.17, + "step": 12980, + "train_speed(iter/s)": 1.641166 + }, + { + "acc": 0.66175227, + "epoch": 0.32940131912734655, + "grad_norm": 5.09375, + "learning_rate": 9.64460515959702e-06, + "loss": 1.53600693, + "memory(GiB)": 97.17, + "step": 12985, + "train_speed(iter/s)": 1.64123 + }, + { + "acc": 0.64427519, + "epoch": 0.3295281582952816, + "grad_norm": 6.0625, + "learning_rate": 9.64421677529032e-06, + "loss": 1.65408115, + "memory(GiB)": 97.17, + "step": 12990, + "train_speed(iter/s)": 1.641296 + }, + { + "acc": 0.6672967, + "epoch": 0.32965499746321664, + "grad_norm": 5.09375, + "learning_rate": 9.64382818671023e-06, + "loss": 1.55482807, + "memory(GiB)": 97.17, + "step": 12995, + "train_speed(iter/s)": 1.641357 + }, + { + "acc": 0.65891047, + "epoch": 0.3297818366311517, + "grad_norm": 6.28125, + "learning_rate": 9.643439393873844e-06, + "loss": 1.57503719, + "memory(GiB)": 97.17, + "step": 13000, + "train_speed(iter/s)": 1.641425 + }, + { + "epoch": 0.3297818366311517, + "eval_acc": 0.6410164203509607, + "eval_loss": 1.6034586429595947, + "eval_runtime": 58.9433, + "eval_samples_per_second": 108.07, + "eval_steps_per_second": 27.026, + "step": 13000 + }, + { + "acc": 0.65530634, + "epoch": 0.3299086757990868, + "grad_norm": 5.15625, + "learning_rate": 9.643050396798262e-06, + "loss": 1.66781387, + "memory(GiB)": 97.17, + "step": 13005, + "train_speed(iter/s)": 1.628505 + }, + { + "acc": 0.66238599, + "epoch": 0.3300355149670218, + "grad_norm": 8.125, + "learning_rate": 9.642661195500593e-06, + "loss": 1.53265553, + "memory(GiB)": 97.17, + "step": 13010, + "train_speed(iter/s)": 1.628571 + }, + { + "acc": 0.6384572, + "epoch": 0.33016235413495687, + "grad_norm": 6.15625, + "learning_rate": 9.642271789997956e-06, + "loss": 1.6549612, + "memory(GiB)": 97.17, + "step": 13015, + "train_speed(iter/s)": 1.628634 + }, + { + "acc": 0.65347447, + "epoch": 0.3302891933028919, + "grad_norm": 4.8125, + "learning_rate": 9.64188218030748e-06, + "loss": 1.56046085, + "memory(GiB)": 97.17, + "step": 13020, + "train_speed(iter/s)": 1.628699 + }, + { + "acc": 0.66249228, + "epoch": 0.330416032470827, + "grad_norm": 5.1875, + "learning_rate": 9.641492366446301e-06, + "loss": 1.62477837, + "memory(GiB)": 97.17, + "step": 13025, + "train_speed(iter/s)": 1.628764 + }, + { + "acc": 0.64906745, + "epoch": 0.33054287163876206, + "grad_norm": 5.09375, + "learning_rate": 9.641102348431565e-06, + "loss": 1.62433739, + "memory(GiB)": 97.17, + "step": 13030, + "train_speed(iter/s)": 1.628827 + }, + { + "acc": 0.64215651, + "epoch": 0.3306697108066971, + "grad_norm": 5.34375, + "learning_rate": 9.640712126280429e-06, + "loss": 1.65913658, + "memory(GiB)": 97.17, + "step": 13035, + "train_speed(iter/s)": 1.62889 + }, + { + "acc": 0.64990587, + "epoch": 0.33079654997463215, + "grad_norm": 5.125, + "learning_rate": 9.640321700010053e-06, + "loss": 1.61545315, + "memory(GiB)": 97.17, + "step": 13040, + "train_speed(iter/s)": 1.628955 + }, + { + "acc": 0.6462347, + "epoch": 0.33092338914256725, + "grad_norm": 5.5625, + "learning_rate": 9.63993106963761e-06, + "loss": 1.67008781, + "memory(GiB)": 97.17, + "step": 13045, + "train_speed(iter/s)": 1.629019 + }, + { + "acc": 0.64295092, + "epoch": 0.3310502283105023, + "grad_norm": 5.125, + "learning_rate": 9.639540235180283e-06, + "loss": 1.63737774, + "memory(GiB)": 97.17, + "step": 13050, + "train_speed(iter/s)": 1.629081 + }, + { + "acc": 0.66302996, + "epoch": 0.33117706747843734, + "grad_norm": 6.5625, + "learning_rate": 9.639149196655263e-06, + "loss": 1.59176388, + "memory(GiB)": 97.17, + "step": 13055, + "train_speed(iter/s)": 1.629143 + }, + { + "acc": 0.64858561, + "epoch": 0.3313039066463724, + "grad_norm": 5.3125, + "learning_rate": 9.638757954079749e-06, + "loss": 1.60935822, + "memory(GiB)": 97.17, + "step": 13060, + "train_speed(iter/s)": 1.629206 + }, + { + "acc": 0.65634975, + "epoch": 0.3314307458143075, + "grad_norm": 7.15625, + "learning_rate": 9.638366507470948e-06, + "loss": 1.60039139, + "memory(GiB)": 97.17, + "step": 13065, + "train_speed(iter/s)": 1.629272 + }, + { + "acc": 0.65855961, + "epoch": 0.3315575849822425, + "grad_norm": 5.9375, + "learning_rate": 9.637974856846082e-06, + "loss": 1.62336674, + "memory(GiB)": 97.17, + "step": 13070, + "train_speed(iter/s)": 1.629333 + }, + { + "acc": 0.65982628, + "epoch": 0.33168442415017757, + "grad_norm": 6.21875, + "learning_rate": 9.637583002222373e-06, + "loss": 1.54557648, + "memory(GiB)": 97.17, + "step": 13075, + "train_speed(iter/s)": 1.629392 + }, + { + "acc": 0.65409746, + "epoch": 0.3318112633181126, + "grad_norm": 5.4375, + "learning_rate": 9.637190943617059e-06, + "loss": 1.63411102, + "memory(GiB)": 97.17, + "step": 13080, + "train_speed(iter/s)": 1.629456 + }, + { + "acc": 0.64977961, + "epoch": 0.3319381024860477, + "grad_norm": 6.21875, + "learning_rate": 9.636798681047383e-06, + "loss": 1.59947138, + "memory(GiB)": 97.17, + "step": 13085, + "train_speed(iter/s)": 1.629506 + }, + { + "acc": 0.6693881, + "epoch": 0.33206494165398276, + "grad_norm": 6.65625, + "learning_rate": 9.6364062145306e-06, + "loss": 1.602742, + "memory(GiB)": 97.17, + "step": 13090, + "train_speed(iter/s)": 1.62957 + }, + { + "acc": 0.6565311, + "epoch": 0.3321917808219178, + "grad_norm": 4.75, + "learning_rate": 9.636013544083971e-06, + "loss": 1.62767162, + "memory(GiB)": 97.17, + "step": 13095, + "train_speed(iter/s)": 1.629627 + }, + { + "acc": 0.6459856, + "epoch": 0.33231861998985285, + "grad_norm": 5.5, + "learning_rate": 9.635620669724768e-06, + "loss": 1.65949135, + "memory(GiB)": 97.17, + "step": 13100, + "train_speed(iter/s)": 1.629699 + }, + { + "acc": 0.65224152, + "epoch": 0.33244545915778795, + "grad_norm": 5.6875, + "learning_rate": 9.635227591470272e-06, + "loss": 1.65984306, + "memory(GiB)": 97.17, + "step": 13105, + "train_speed(iter/s)": 1.629768 + }, + { + "acc": 0.66255636, + "epoch": 0.332572298325723, + "grad_norm": 5.5625, + "learning_rate": 9.63483430933777e-06, + "loss": 1.59273062, + "memory(GiB)": 97.17, + "step": 13110, + "train_speed(iter/s)": 1.629828 + }, + { + "acc": 0.65655255, + "epoch": 0.33269913749365804, + "grad_norm": 5.8125, + "learning_rate": 9.634440823344565e-06, + "loss": 1.65388527, + "memory(GiB)": 97.17, + "step": 13115, + "train_speed(iter/s)": 1.629894 + }, + { + "acc": 0.66799397, + "epoch": 0.3328259766615931, + "grad_norm": 6.03125, + "learning_rate": 9.634047133507959e-06, + "loss": 1.55609608, + "memory(GiB)": 97.17, + "step": 13120, + "train_speed(iter/s)": 1.629964 + }, + { + "acc": 0.66454654, + "epoch": 0.3329528158295282, + "grad_norm": 5.625, + "learning_rate": 9.63365323984527e-06, + "loss": 1.5684514, + "memory(GiB)": 97.17, + "step": 13125, + "train_speed(iter/s)": 1.630027 + }, + { + "acc": 0.66204042, + "epoch": 0.3330796549974632, + "grad_norm": 5.875, + "learning_rate": 9.633259142373825e-06, + "loss": 1.58791218, + "memory(GiB)": 97.17, + "step": 13130, + "train_speed(iter/s)": 1.630093 + }, + { + "acc": 0.63796101, + "epoch": 0.33320649416539827, + "grad_norm": 6.21875, + "learning_rate": 9.632864841110957e-06, + "loss": 1.64469719, + "memory(GiB)": 97.17, + "step": 13135, + "train_speed(iter/s)": 1.630156 + }, + { + "acc": 0.6486742, + "epoch": 0.3333333333333333, + "grad_norm": 6.46875, + "learning_rate": 9.632470336074009e-06, + "loss": 1.69275875, + "memory(GiB)": 97.17, + "step": 13140, + "train_speed(iter/s)": 1.630225 + }, + { + "acc": 0.65933766, + "epoch": 0.3334601725012684, + "grad_norm": 6.125, + "learning_rate": 9.632075627280333e-06, + "loss": 1.57961845, + "memory(GiB)": 97.17, + "step": 13145, + "train_speed(iter/s)": 1.630293 + }, + { + "acc": 0.64420061, + "epoch": 0.33358701166920346, + "grad_norm": 5.71875, + "learning_rate": 9.631680714747292e-06, + "loss": 1.65871258, + "memory(GiB)": 97.17, + "step": 13150, + "train_speed(iter/s)": 1.630361 + }, + { + "acc": 0.66268587, + "epoch": 0.3337138508371385, + "grad_norm": 8.0625, + "learning_rate": 9.63128559849225e-06, + "loss": 1.57028494, + "memory(GiB)": 97.17, + "step": 13155, + "train_speed(iter/s)": 1.630429 + }, + { + "acc": 0.6471487, + "epoch": 0.33384069000507355, + "grad_norm": 6.6875, + "learning_rate": 9.630890278532594e-06, + "loss": 1.59411259, + "memory(GiB)": 97.17, + "step": 13160, + "train_speed(iter/s)": 1.630498 + }, + { + "acc": 0.6281713, + "epoch": 0.33396752917300865, + "grad_norm": 5.0625, + "learning_rate": 9.630494754885706e-06, + "loss": 1.72221336, + "memory(GiB)": 97.17, + "step": 13165, + "train_speed(iter/s)": 1.630562 + }, + { + "acc": 0.65375352, + "epoch": 0.3340943683409437, + "grad_norm": 5.375, + "learning_rate": 9.630099027568986e-06, + "loss": 1.61772003, + "memory(GiB)": 97.17, + "step": 13170, + "train_speed(iter/s)": 1.630629 + }, + { + "acc": 0.65680413, + "epoch": 0.33422120750887874, + "grad_norm": 5.3125, + "learning_rate": 9.629703096599839e-06, + "loss": 1.65999069, + "memory(GiB)": 97.17, + "step": 13175, + "train_speed(iter/s)": 1.6307 + }, + { + "acc": 0.65099554, + "epoch": 0.3343480466768138, + "grad_norm": 5.09375, + "learning_rate": 9.629306961995678e-06, + "loss": 1.63689957, + "memory(GiB)": 97.17, + "step": 13180, + "train_speed(iter/s)": 1.630761 + }, + { + "acc": 0.6357625, + "epoch": 0.3344748858447489, + "grad_norm": 5.40625, + "learning_rate": 9.62891062377393e-06, + "loss": 1.67008762, + "memory(GiB)": 97.17, + "step": 13185, + "train_speed(iter/s)": 1.630828 + }, + { + "acc": 0.6549716, + "epoch": 0.3346017250126839, + "grad_norm": 6.125, + "learning_rate": 9.628514081952026e-06, + "loss": 1.60908508, + "memory(GiB)": 97.17, + "step": 13190, + "train_speed(iter/s)": 1.630891 + }, + { + "acc": 0.62954302, + "epoch": 0.33472856418061897, + "grad_norm": 5.8125, + "learning_rate": 9.628117336547408e-06, + "loss": 1.72531414, + "memory(GiB)": 97.17, + "step": 13195, + "train_speed(iter/s)": 1.63096 + }, + { + "acc": 0.64352145, + "epoch": 0.334855403348554, + "grad_norm": 4.53125, + "learning_rate": 9.627720387577525e-06, + "loss": 1.6556284, + "memory(GiB)": 97.17, + "step": 13200, + "train_speed(iter/s)": 1.631022 + }, + { + "acc": 0.64894171, + "epoch": 0.3349822425164891, + "grad_norm": 6.46875, + "learning_rate": 9.62732323505984e-06, + "loss": 1.69405785, + "memory(GiB)": 97.17, + "step": 13205, + "train_speed(iter/s)": 1.631092 + }, + { + "acc": 0.65751781, + "epoch": 0.33510908168442416, + "grad_norm": 5.09375, + "learning_rate": 9.62692587901182e-06, + "loss": 1.57094851, + "memory(GiB)": 97.17, + "step": 13210, + "train_speed(iter/s)": 1.631155 + }, + { + "acc": 0.6518158, + "epoch": 0.3352359208523592, + "grad_norm": 5.90625, + "learning_rate": 9.62652831945094e-06, + "loss": 1.65788956, + "memory(GiB)": 97.17, + "step": 13215, + "train_speed(iter/s)": 1.631202 + }, + { + "acc": 0.6636971, + "epoch": 0.33536276002029425, + "grad_norm": 5.75, + "learning_rate": 9.626130556394689e-06, + "loss": 1.6660244, + "memory(GiB)": 97.17, + "step": 13220, + "train_speed(iter/s)": 1.631264 + }, + { + "acc": 0.66110892, + "epoch": 0.33548959918822935, + "grad_norm": 5.125, + "learning_rate": 9.625732589860562e-06, + "loss": 1.6646822, + "memory(GiB)": 97.17, + "step": 13225, + "train_speed(iter/s)": 1.631333 + }, + { + "acc": 0.63690033, + "epoch": 0.3356164383561644, + "grad_norm": 6.09375, + "learning_rate": 9.625334419866064e-06, + "loss": 1.71548882, + "memory(GiB)": 97.17, + "step": 13230, + "train_speed(iter/s)": 1.631403 + }, + { + "acc": 0.64992247, + "epoch": 0.33574327752409944, + "grad_norm": 5.625, + "learning_rate": 9.624936046428708e-06, + "loss": 1.61509514, + "memory(GiB)": 97.17, + "step": 13235, + "train_speed(iter/s)": 1.631472 + }, + { + "acc": 0.66669917, + "epoch": 0.3358701166920345, + "grad_norm": 5.65625, + "learning_rate": 9.624537469566015e-06, + "loss": 1.61054688, + "memory(GiB)": 97.17, + "step": 13240, + "train_speed(iter/s)": 1.631541 + }, + { + "acc": 0.65492659, + "epoch": 0.3359969558599696, + "grad_norm": 5.9375, + "learning_rate": 9.624138689295516e-06, + "loss": 1.58358002, + "memory(GiB)": 97.17, + "step": 13245, + "train_speed(iter/s)": 1.631608 + }, + { + "acc": 0.6432312, + "epoch": 0.3361237950279046, + "grad_norm": 5.53125, + "learning_rate": 9.623739705634753e-06, + "loss": 1.60147266, + "memory(GiB)": 97.17, + "step": 13250, + "train_speed(iter/s)": 1.631678 + }, + { + "acc": 0.65860195, + "epoch": 0.33625063419583967, + "grad_norm": 6.34375, + "learning_rate": 9.623340518601274e-06, + "loss": 1.63152981, + "memory(GiB)": 97.17, + "step": 13255, + "train_speed(iter/s)": 1.631741 + }, + { + "acc": 0.64756298, + "epoch": 0.3363774733637747, + "grad_norm": 7.3125, + "learning_rate": 9.622941128212639e-06, + "loss": 1.72195683, + "memory(GiB)": 97.17, + "step": 13260, + "train_speed(iter/s)": 1.631811 + }, + { + "acc": 0.65460567, + "epoch": 0.3365043125317098, + "grad_norm": 6.09375, + "learning_rate": 9.622541534486411e-06, + "loss": 1.61770439, + "memory(GiB)": 97.17, + "step": 13265, + "train_speed(iter/s)": 1.631884 + }, + { + "acc": 0.6486124, + "epoch": 0.33663115169964486, + "grad_norm": 6.625, + "learning_rate": 9.62214173744017e-06, + "loss": 1.64007339, + "memory(GiB)": 97.17, + "step": 13270, + "train_speed(iter/s)": 1.631948 + }, + { + "acc": 0.64311457, + "epoch": 0.3367579908675799, + "grad_norm": 5.90625, + "learning_rate": 9.6217417370915e-06, + "loss": 1.63710213, + "memory(GiB)": 97.17, + "step": 13275, + "train_speed(iter/s)": 1.63201 + }, + { + "acc": 0.65351524, + "epoch": 0.33688483003551495, + "grad_norm": 6.0625, + "learning_rate": 9.62134153345799e-06, + "loss": 1.59720058, + "memory(GiB)": 97.17, + "step": 13280, + "train_speed(iter/s)": 1.632073 + }, + { + "acc": 0.65047779, + "epoch": 0.33701166920345005, + "grad_norm": 5.96875, + "learning_rate": 9.620941126557248e-06, + "loss": 1.57065411, + "memory(GiB)": 97.17, + "step": 13285, + "train_speed(iter/s)": 1.632136 + }, + { + "acc": 0.66537809, + "epoch": 0.3371385083713851, + "grad_norm": 4.75, + "learning_rate": 9.620540516406885e-06, + "loss": 1.54066076, + "memory(GiB)": 97.17, + "step": 13290, + "train_speed(iter/s)": 1.6322 + }, + { + "acc": 0.64236665, + "epoch": 0.33726534753932014, + "grad_norm": 5.09375, + "learning_rate": 9.620139703024522e-06, + "loss": 1.62466125, + "memory(GiB)": 97.17, + "step": 13295, + "train_speed(iter/s)": 1.63227 + }, + { + "acc": 0.64019966, + "epoch": 0.3373921867072552, + "grad_norm": 6.46875, + "learning_rate": 9.619738686427785e-06, + "loss": 1.68296986, + "memory(GiB)": 97.17, + "step": 13300, + "train_speed(iter/s)": 1.632339 + }, + { + "acc": 0.6494771, + "epoch": 0.3375190258751903, + "grad_norm": 5.28125, + "learning_rate": 9.619337466634317e-06, + "loss": 1.61415253, + "memory(GiB)": 97.17, + "step": 13305, + "train_speed(iter/s)": 1.632408 + }, + { + "acc": 0.66605744, + "epoch": 0.3376458650431253, + "grad_norm": 5.75, + "learning_rate": 9.618936043661762e-06, + "loss": 1.58365126, + "memory(GiB)": 97.17, + "step": 13310, + "train_speed(iter/s)": 1.632472 + }, + { + "acc": 0.63992095, + "epoch": 0.33777270421106037, + "grad_norm": 7.96875, + "learning_rate": 9.618534417527779e-06, + "loss": 1.64826775, + "memory(GiB)": 97.17, + "step": 13315, + "train_speed(iter/s)": 1.632534 + }, + { + "acc": 0.65570612, + "epoch": 0.3378995433789954, + "grad_norm": 6.09375, + "learning_rate": 9.61813258825003e-06, + "loss": 1.5715662, + "memory(GiB)": 97.17, + "step": 13320, + "train_speed(iter/s)": 1.632597 + }, + { + "acc": 0.64682083, + "epoch": 0.3380263825469305, + "grad_norm": 5.53125, + "learning_rate": 9.617730555846191e-06, + "loss": 1.64066086, + "memory(GiB)": 97.17, + "step": 13325, + "train_speed(iter/s)": 1.632659 + }, + { + "acc": 0.64868999, + "epoch": 0.33815322171486556, + "grad_norm": 6.25, + "learning_rate": 9.617328320333947e-06, + "loss": 1.62098942, + "memory(GiB)": 97.17, + "step": 13330, + "train_speed(iter/s)": 1.632724 + }, + { + "acc": 0.67242332, + "epoch": 0.3382800608828006, + "grad_norm": 5.65625, + "learning_rate": 9.616925881730989e-06, + "loss": 1.51666565, + "memory(GiB)": 97.17, + "step": 13335, + "train_speed(iter/s)": 1.632784 + }, + { + "acc": 0.65879402, + "epoch": 0.33840690005073565, + "grad_norm": 5.40625, + "learning_rate": 9.616523240055017e-06, + "loss": 1.6054718, + "memory(GiB)": 97.17, + "step": 13340, + "train_speed(iter/s)": 1.632849 + }, + { + "acc": 0.63616529, + "epoch": 0.33853373921867075, + "grad_norm": 5.34375, + "learning_rate": 9.616120395323743e-06, + "loss": 1.69087791, + "memory(GiB)": 97.17, + "step": 13345, + "train_speed(iter/s)": 1.632914 + }, + { + "acc": 0.65138173, + "epoch": 0.3386605783866058, + "grad_norm": 5.09375, + "learning_rate": 9.615717347554882e-06, + "loss": 1.58977051, + "memory(GiB)": 97.17, + "step": 13350, + "train_speed(iter/s)": 1.632978 + }, + { + "acc": 0.63408489, + "epoch": 0.33878741755454084, + "grad_norm": 5.15625, + "learning_rate": 9.615314096766166e-06, + "loss": 1.6426239, + "memory(GiB)": 97.17, + "step": 13355, + "train_speed(iter/s)": 1.633044 + }, + { + "acc": 0.64465322, + "epoch": 0.3389142567224759, + "grad_norm": 6.5625, + "learning_rate": 9.61491064297533e-06, + "loss": 1.67834702, + "memory(GiB)": 97.17, + "step": 13360, + "train_speed(iter/s)": 1.633109 + }, + { + "acc": 0.64876709, + "epoch": 0.339041095890411, + "grad_norm": 6.09375, + "learning_rate": 9.614506986200119e-06, + "loss": 1.67453194, + "memory(GiB)": 97.17, + "step": 13365, + "train_speed(iter/s)": 1.633175 + }, + { + "acc": 0.64880538, + "epoch": 0.339167935058346, + "grad_norm": 7.1875, + "learning_rate": 9.61410312645829e-06, + "loss": 1.63719215, + "memory(GiB)": 97.17, + "step": 13370, + "train_speed(iter/s)": 1.633238 + }, + { + "acc": 0.65017805, + "epoch": 0.33929477422628107, + "grad_norm": 6.4375, + "learning_rate": 9.613699063767603e-06, + "loss": 1.67882271, + "memory(GiB)": 97.17, + "step": 13375, + "train_speed(iter/s)": 1.633303 + }, + { + "acc": 0.64997711, + "epoch": 0.3394216133942161, + "grad_norm": 5.65625, + "learning_rate": 9.613294798145833e-06, + "loss": 1.66933441, + "memory(GiB)": 97.17, + "step": 13380, + "train_speed(iter/s)": 1.633369 + }, + { + "acc": 0.6599268, + "epoch": 0.3395484525621512, + "grad_norm": 6.25, + "learning_rate": 9.612890329610762e-06, + "loss": 1.58168192, + "memory(GiB)": 97.17, + "step": 13385, + "train_speed(iter/s)": 1.633432 + }, + { + "acc": 0.65642452, + "epoch": 0.33967529173008626, + "grad_norm": 5.6875, + "learning_rate": 9.612485658180178e-06, + "loss": 1.62011013, + "memory(GiB)": 97.17, + "step": 13390, + "train_speed(iter/s)": 1.6335 + }, + { + "acc": 0.65850906, + "epoch": 0.3398021308980213, + "grad_norm": 5.34375, + "learning_rate": 9.612080783871882e-06, + "loss": 1.60980778, + "memory(GiB)": 97.17, + "step": 13395, + "train_speed(iter/s)": 1.633563 + }, + { + "acc": 0.65071831, + "epoch": 0.33992897006595635, + "grad_norm": 5.125, + "learning_rate": 9.611675706703682e-06, + "loss": 1.59563951, + "memory(GiB)": 97.17, + "step": 13400, + "train_speed(iter/s)": 1.633627 + }, + { + "acc": 0.64234285, + "epoch": 0.34005580923389145, + "grad_norm": 6.59375, + "learning_rate": 9.611270426693395e-06, + "loss": 1.67684059, + "memory(GiB)": 97.17, + "step": 13405, + "train_speed(iter/s)": 1.63369 + }, + { + "acc": 0.62538528, + "epoch": 0.3401826484018265, + "grad_norm": 6.65625, + "learning_rate": 9.610864943858847e-06, + "loss": 1.66686058, + "memory(GiB)": 97.17, + "step": 13410, + "train_speed(iter/s)": 1.633758 + }, + { + "acc": 0.65809946, + "epoch": 0.34030948756976154, + "grad_norm": 6.09375, + "learning_rate": 9.61045925821787e-06, + "loss": 1.62358341, + "memory(GiB)": 97.17, + "step": 13415, + "train_speed(iter/s)": 1.633829 + }, + { + "acc": 0.64416828, + "epoch": 0.3404363267376966, + "grad_norm": 4.9375, + "learning_rate": 9.610053369788314e-06, + "loss": 1.64812698, + "memory(GiB)": 97.17, + "step": 13420, + "train_speed(iter/s)": 1.633891 + }, + { + "acc": 0.64484787, + "epoch": 0.3405631659056317, + "grad_norm": 5.46875, + "learning_rate": 9.609647278588027e-06, + "loss": 1.72029305, + "memory(GiB)": 97.17, + "step": 13425, + "train_speed(iter/s)": 1.633958 + }, + { + "acc": 0.65401459, + "epoch": 0.3406900050735667, + "grad_norm": 4.34375, + "learning_rate": 9.609240984634871e-06, + "loss": 1.59744959, + "memory(GiB)": 97.17, + "step": 13430, + "train_speed(iter/s)": 1.63402 + }, + { + "acc": 0.65944738, + "epoch": 0.34081684424150177, + "grad_norm": 5.4375, + "learning_rate": 9.608834487946719e-06, + "loss": 1.63598385, + "memory(GiB)": 97.17, + "step": 13435, + "train_speed(iter/s)": 1.634085 + }, + { + "acc": 0.63552823, + "epoch": 0.3409436834094368, + "grad_norm": 6.1875, + "learning_rate": 9.60842778854145e-06, + "loss": 1.64073486, + "memory(GiB)": 97.17, + "step": 13440, + "train_speed(iter/s)": 1.634152 + }, + { + "acc": 0.65797157, + "epoch": 0.3410705225773719, + "grad_norm": 5.0625, + "learning_rate": 9.60802088643695e-06, + "loss": 1.61512642, + "memory(GiB)": 97.17, + "step": 13445, + "train_speed(iter/s)": 1.634217 + }, + { + "acc": 0.64022861, + "epoch": 0.34119736174530696, + "grad_norm": 4.75, + "learning_rate": 9.60761378165112e-06, + "loss": 1.6373558, + "memory(GiB)": 97.17, + "step": 13450, + "train_speed(iter/s)": 1.63428 + }, + { + "acc": 0.67306585, + "epoch": 0.341324200913242, + "grad_norm": 5.4375, + "learning_rate": 9.607206474201863e-06, + "loss": 1.58249474, + "memory(GiB)": 97.17, + "step": 13455, + "train_speed(iter/s)": 1.634341 + }, + { + "acc": 0.6459002, + "epoch": 0.34145104008117705, + "grad_norm": 5.875, + "learning_rate": 9.606798964107096e-06, + "loss": 1.69725952, + "memory(GiB)": 97.17, + "step": 13460, + "train_speed(iter/s)": 1.634406 + }, + { + "acc": 0.66193967, + "epoch": 0.34157787924911215, + "grad_norm": 6.15625, + "learning_rate": 9.60639125138474e-06, + "loss": 1.54741488, + "memory(GiB)": 97.17, + "step": 13465, + "train_speed(iter/s)": 1.634474 + }, + { + "acc": 0.65342951, + "epoch": 0.3417047184170472, + "grad_norm": 6.0625, + "learning_rate": 9.605983336052735e-06, + "loss": 1.63188744, + "memory(GiB)": 97.17, + "step": 13470, + "train_speed(iter/s)": 1.634538 + }, + { + "acc": 0.63303299, + "epoch": 0.34183155758498224, + "grad_norm": 5.4375, + "learning_rate": 9.605575218129017e-06, + "loss": 1.65524559, + "memory(GiB)": 97.17, + "step": 13475, + "train_speed(iter/s)": 1.634607 + }, + { + "acc": 0.63365688, + "epoch": 0.3419583967529173, + "grad_norm": 5.375, + "learning_rate": 9.605166897631539e-06, + "loss": 1.70623589, + "memory(GiB)": 97.17, + "step": 13480, + "train_speed(iter/s)": 1.634672 + }, + { + "acc": 0.65946827, + "epoch": 0.3420852359208524, + "grad_norm": 5.6875, + "learning_rate": 9.604758374578259e-06, + "loss": 1.60951061, + "memory(GiB)": 97.17, + "step": 13485, + "train_speed(iter/s)": 1.634734 + }, + { + "acc": 0.6486948, + "epoch": 0.3422120750887874, + "grad_norm": 6.9375, + "learning_rate": 9.604349648987148e-06, + "loss": 1.62078304, + "memory(GiB)": 97.17, + "step": 13490, + "train_speed(iter/s)": 1.6348 + }, + { + "acc": 0.64911952, + "epoch": 0.34233891425672247, + "grad_norm": 6.09375, + "learning_rate": 9.603940720876181e-06, + "loss": 1.66483097, + "memory(GiB)": 97.17, + "step": 13495, + "train_speed(iter/s)": 1.634861 + }, + { + "acc": 0.64871492, + "epoch": 0.3424657534246575, + "grad_norm": 5.59375, + "learning_rate": 9.603531590263348e-06, + "loss": 1.63723221, + "memory(GiB)": 97.17, + "step": 13500, + "train_speed(iter/s)": 1.634923 + }, + { + "acc": 0.64357762, + "epoch": 0.3425925925925926, + "grad_norm": 4.8125, + "learning_rate": 9.603122257166641e-06, + "loss": 1.64757099, + "memory(GiB)": 97.17, + "step": 13505, + "train_speed(iter/s)": 1.634985 + }, + { + "acc": 0.66512899, + "epoch": 0.34271943176052766, + "grad_norm": 5.375, + "learning_rate": 9.602712721604066e-06, + "loss": 1.58341637, + "memory(GiB)": 97.17, + "step": 13510, + "train_speed(iter/s)": 1.635046 + }, + { + "acc": 0.63635159, + "epoch": 0.3428462709284627, + "grad_norm": 6.21875, + "learning_rate": 9.602302983593637e-06, + "loss": 1.68867359, + "memory(GiB)": 97.17, + "step": 13515, + "train_speed(iter/s)": 1.635115 + }, + { + "acc": 0.64827604, + "epoch": 0.34297311009639775, + "grad_norm": 6.625, + "learning_rate": 9.601893043153372e-06, + "loss": 1.64419937, + "memory(GiB)": 97.17, + "step": 13520, + "train_speed(iter/s)": 1.635181 + }, + { + "acc": 0.64873228, + "epoch": 0.34309994926433285, + "grad_norm": 5.25, + "learning_rate": 9.601482900301308e-06, + "loss": 1.62032547, + "memory(GiB)": 97.17, + "step": 13525, + "train_speed(iter/s)": 1.635243 + }, + { + "acc": 0.65557809, + "epoch": 0.3432267884322679, + "grad_norm": 5.625, + "learning_rate": 9.60107255505548e-06, + "loss": 1.64220734, + "memory(GiB)": 97.17, + "step": 13530, + "train_speed(iter/s)": 1.635311 + }, + { + "acc": 0.63922901, + "epoch": 0.34335362760020294, + "grad_norm": 4.9375, + "learning_rate": 9.60066200743394e-06, + "loss": 1.75861874, + "memory(GiB)": 97.17, + "step": 13535, + "train_speed(iter/s)": 1.635379 + }, + { + "acc": 0.65102186, + "epoch": 0.343480466768138, + "grad_norm": 7.6875, + "learning_rate": 9.600251257454744e-06, + "loss": 1.61872768, + "memory(GiB)": 97.17, + "step": 13540, + "train_speed(iter/s)": 1.635447 + }, + { + "acc": 0.63279428, + "epoch": 0.3436073059360731, + "grad_norm": 5.15625, + "learning_rate": 9.599840305135959e-06, + "loss": 1.66210136, + "memory(GiB)": 97.17, + "step": 13545, + "train_speed(iter/s)": 1.63551 + }, + { + "acc": 0.65107059, + "epoch": 0.3437341451040081, + "grad_norm": 5.4375, + "learning_rate": 9.59942915049566e-06, + "loss": 1.56801577, + "memory(GiB)": 97.17, + "step": 13550, + "train_speed(iter/s)": 1.635574 + }, + { + "acc": 0.63942881, + "epoch": 0.34386098427194317, + "grad_norm": 5.3125, + "learning_rate": 9.599017793551933e-06, + "loss": 1.6313673, + "memory(GiB)": 97.17, + "step": 13555, + "train_speed(iter/s)": 1.635642 + }, + { + "acc": 0.6361475, + "epoch": 0.3439878234398782, + "grad_norm": 5.5625, + "learning_rate": 9.598606234322869e-06, + "loss": 1.74004631, + "memory(GiB)": 97.17, + "step": 13560, + "train_speed(iter/s)": 1.635705 + }, + { + "acc": 0.65669527, + "epoch": 0.3441146626078133, + "grad_norm": 5.15625, + "learning_rate": 9.598194472826574e-06, + "loss": 1.6196373, + "memory(GiB)": 97.17, + "step": 13565, + "train_speed(iter/s)": 1.635769 + }, + { + "acc": 0.64783645, + "epoch": 0.34424150177574836, + "grad_norm": 6.3125, + "learning_rate": 9.597782509081154e-06, + "loss": 1.65286465, + "memory(GiB)": 97.17, + "step": 13570, + "train_speed(iter/s)": 1.635833 + }, + { + "acc": 0.65066686, + "epoch": 0.3443683409436834, + "grad_norm": 4.9375, + "learning_rate": 9.597370343104733e-06, + "loss": 1.58463345, + "memory(GiB)": 97.17, + "step": 13575, + "train_speed(iter/s)": 1.6359 + }, + { + "acc": 0.6642622, + "epoch": 0.34449518011161845, + "grad_norm": 6.90625, + "learning_rate": 9.596957974915438e-06, + "loss": 1.5500495, + "memory(GiB)": 97.17, + "step": 13580, + "train_speed(iter/s)": 1.635958 + }, + { + "acc": 0.64914742, + "epoch": 0.34462201927955355, + "grad_norm": 4.40625, + "learning_rate": 9.596545404531408e-06, + "loss": 1.63522263, + "memory(GiB)": 97.17, + "step": 13585, + "train_speed(iter/s)": 1.636017 + }, + { + "acc": 0.63693576, + "epoch": 0.3447488584474886, + "grad_norm": 4.875, + "learning_rate": 9.596132631970788e-06, + "loss": 1.61731949, + "memory(GiB)": 97.17, + "step": 13590, + "train_speed(iter/s)": 1.636079 + }, + { + "acc": 0.66151857, + "epoch": 0.34487569761542364, + "grad_norm": 5.25, + "learning_rate": 9.595719657251735e-06, + "loss": 1.57577553, + "memory(GiB)": 97.17, + "step": 13595, + "train_speed(iter/s)": 1.636145 + }, + { + "acc": 0.66294804, + "epoch": 0.3450025367833587, + "grad_norm": 5.8125, + "learning_rate": 9.595306480392413e-06, + "loss": 1.57756386, + "memory(GiB)": 97.17, + "step": 13600, + "train_speed(iter/s)": 1.636202 + }, + { + "acc": 0.64581871, + "epoch": 0.3451293759512938, + "grad_norm": 6.96875, + "learning_rate": 9.594893101410995e-06, + "loss": 1.66540737, + "memory(GiB)": 97.17, + "step": 13605, + "train_speed(iter/s)": 1.636264 + }, + { + "acc": 0.64619398, + "epoch": 0.3452562151192288, + "grad_norm": 5.09375, + "learning_rate": 9.594479520325665e-06, + "loss": 1.63915825, + "memory(GiB)": 97.17, + "step": 13610, + "train_speed(iter/s)": 1.636322 + }, + { + "acc": 0.6353097, + "epoch": 0.34538305428716387, + "grad_norm": 5.90625, + "learning_rate": 9.594065737154611e-06, + "loss": 1.60954819, + "memory(GiB)": 97.17, + "step": 13615, + "train_speed(iter/s)": 1.636384 + }, + { + "acc": 0.65606308, + "epoch": 0.3455098934550989, + "grad_norm": 6.1875, + "learning_rate": 9.593651751916037e-06, + "loss": 1.59537392, + "memory(GiB)": 97.17, + "step": 13620, + "train_speed(iter/s)": 1.636442 + }, + { + "acc": 0.66525683, + "epoch": 0.345636732623034, + "grad_norm": 6.25, + "learning_rate": 9.593237564628149e-06, + "loss": 1.52252274, + "memory(GiB)": 97.17, + "step": 13625, + "train_speed(iter/s)": 1.636507 + }, + { + "acc": 0.66184902, + "epoch": 0.34576357179096906, + "grad_norm": 5.5625, + "learning_rate": 9.592823175309164e-06, + "loss": 1.50504761, + "memory(GiB)": 97.17, + "step": 13630, + "train_speed(iter/s)": 1.63657 + }, + { + "acc": 0.65679488, + "epoch": 0.3458904109589041, + "grad_norm": 4.625, + "learning_rate": 9.592408583977311e-06, + "loss": 1.59319515, + "memory(GiB)": 97.17, + "step": 13635, + "train_speed(iter/s)": 1.636632 + }, + { + "acc": 0.6349916, + "epoch": 0.34601725012683915, + "grad_norm": 5.6875, + "learning_rate": 9.591993790650826e-06, + "loss": 1.60629158, + "memory(GiB)": 97.17, + "step": 13640, + "train_speed(iter/s)": 1.636695 + }, + { + "acc": 0.66123581, + "epoch": 0.34614408929477425, + "grad_norm": 6.15625, + "learning_rate": 9.591578795347952e-06, + "loss": 1.66320419, + "memory(GiB)": 97.17, + "step": 13645, + "train_speed(iter/s)": 1.636758 + }, + { + "acc": 0.65161381, + "epoch": 0.3462709284627093, + "grad_norm": 6.4375, + "learning_rate": 9.591163598086943e-06, + "loss": 1.63374386, + "memory(GiB)": 97.17, + "step": 13650, + "train_speed(iter/s)": 1.636825 + }, + { + "acc": 0.65236111, + "epoch": 0.34639776763064434, + "grad_norm": 5.1875, + "learning_rate": 9.59074819888606e-06, + "loss": 1.64519844, + "memory(GiB)": 97.17, + "step": 13655, + "train_speed(iter/s)": 1.636892 + }, + { + "acc": 0.65328779, + "epoch": 0.3465246067985794, + "grad_norm": 5.375, + "learning_rate": 9.590332597763575e-06, + "loss": 1.59348421, + "memory(GiB)": 97.17, + "step": 13660, + "train_speed(iter/s)": 1.636954 + }, + { + "acc": 0.65040708, + "epoch": 0.3466514459665145, + "grad_norm": 5.5, + "learning_rate": 9.589916794737768e-06, + "loss": 1.67796631, + "memory(GiB)": 97.17, + "step": 13665, + "train_speed(iter/s)": 1.637015 + }, + { + "acc": 0.65535231, + "epoch": 0.3467782851344495, + "grad_norm": 5.84375, + "learning_rate": 9.589500789826927e-06, + "loss": 1.69573822, + "memory(GiB)": 97.17, + "step": 13670, + "train_speed(iter/s)": 1.637075 + }, + { + "acc": 0.6386796, + "epoch": 0.34690512430238457, + "grad_norm": 5.84375, + "learning_rate": 9.589084583049353e-06, + "loss": 1.73981438, + "memory(GiB)": 97.17, + "step": 13675, + "train_speed(iter/s)": 1.637138 + }, + { + "acc": 0.6500874, + "epoch": 0.3470319634703196, + "grad_norm": 6.59375, + "learning_rate": 9.588668174423348e-06, + "loss": 1.62884007, + "memory(GiB)": 97.17, + "step": 13680, + "train_speed(iter/s)": 1.637203 + }, + { + "acc": 0.64499726, + "epoch": 0.3471588026382547, + "grad_norm": 5.28125, + "learning_rate": 9.588251563967232e-06, + "loss": 1.70470772, + "memory(GiB)": 97.17, + "step": 13685, + "train_speed(iter/s)": 1.63726 + }, + { + "acc": 0.64006367, + "epoch": 0.34728564180618976, + "grad_norm": 5.28125, + "learning_rate": 9.587834751699326e-06, + "loss": 1.71168442, + "memory(GiB)": 97.17, + "step": 13690, + "train_speed(iter/s)": 1.637322 + }, + { + "acc": 0.64713793, + "epoch": 0.3474124809741248, + "grad_norm": 6.4375, + "learning_rate": 9.587417737637963e-06, + "loss": 1.67513752, + "memory(GiB)": 97.17, + "step": 13695, + "train_speed(iter/s)": 1.637386 + }, + { + "acc": 0.65330906, + "epoch": 0.34753932014205985, + "grad_norm": 5.65625, + "learning_rate": 9.587000521801488e-06, + "loss": 1.60202026, + "memory(GiB)": 97.17, + "step": 13700, + "train_speed(iter/s)": 1.637444 + }, + { + "acc": 0.65934496, + "epoch": 0.34766615930999495, + "grad_norm": 5.90625, + "learning_rate": 9.58658310420825e-06, + "loss": 1.61946106, + "memory(GiB)": 97.17, + "step": 13705, + "train_speed(iter/s)": 1.637507 + }, + { + "acc": 0.64947195, + "epoch": 0.34779299847793, + "grad_norm": 5.875, + "learning_rate": 9.58616548487661e-06, + "loss": 1.60600033, + "memory(GiB)": 97.17, + "step": 13710, + "train_speed(iter/s)": 1.63757 + }, + { + "acc": 0.6499331, + "epoch": 0.34791983764586504, + "grad_norm": 6.0, + "learning_rate": 9.585747663824936e-06, + "loss": 1.68437023, + "memory(GiB)": 97.17, + "step": 13715, + "train_speed(iter/s)": 1.637627 + }, + { + "acc": 0.65571041, + "epoch": 0.3480466768138001, + "grad_norm": 5.75, + "learning_rate": 9.585329641071606e-06, + "loss": 1.60613651, + "memory(GiB)": 97.17, + "step": 13720, + "train_speed(iter/s)": 1.637686 + }, + { + "acc": 0.65194998, + "epoch": 0.3481735159817352, + "grad_norm": 6.84375, + "learning_rate": 9.584911416635007e-06, + "loss": 1.63271618, + "memory(GiB)": 97.17, + "step": 13725, + "train_speed(iter/s)": 1.637746 + }, + { + "acc": 0.64861517, + "epoch": 0.3483003551496702, + "grad_norm": 5.0625, + "learning_rate": 9.584492990533533e-06, + "loss": 1.64106178, + "memory(GiB)": 97.17, + "step": 13730, + "train_speed(iter/s)": 1.637806 + }, + { + "acc": 0.65483723, + "epoch": 0.34842719431760527, + "grad_norm": 5.65625, + "learning_rate": 9.58407436278559e-06, + "loss": 1.5321784, + "memory(GiB)": 97.17, + "step": 13735, + "train_speed(iter/s)": 1.637866 + }, + { + "acc": 0.64420929, + "epoch": 0.3485540334855403, + "grad_norm": 5.1875, + "learning_rate": 9.583655533409588e-06, + "loss": 1.62746487, + "memory(GiB)": 97.17, + "step": 13740, + "train_speed(iter/s)": 1.637924 + }, + { + "acc": 0.6508462, + "epoch": 0.3486808726534754, + "grad_norm": 4.53125, + "learning_rate": 9.583236502423952e-06, + "loss": 1.63266335, + "memory(GiB)": 97.17, + "step": 13745, + "train_speed(iter/s)": 1.637986 + }, + { + "acc": 0.6501246, + "epoch": 0.34880771182141046, + "grad_norm": 5.59375, + "learning_rate": 9.582817269847112e-06, + "loss": 1.69108353, + "memory(GiB)": 97.17, + "step": 13750, + "train_speed(iter/s)": 1.63805 + }, + { + "acc": 0.65709333, + "epoch": 0.3489345509893455, + "grad_norm": 5.90625, + "learning_rate": 9.582397835697509e-06, + "loss": 1.57833624, + "memory(GiB)": 97.17, + "step": 13755, + "train_speed(iter/s)": 1.638106 + }, + { + "acc": 0.65828834, + "epoch": 0.34906139015728055, + "grad_norm": 5.0625, + "learning_rate": 9.581978199993587e-06, + "loss": 1.60937443, + "memory(GiB)": 97.17, + "step": 13760, + "train_speed(iter/s)": 1.638162 + }, + { + "acc": 0.64878697, + "epoch": 0.34918822932521565, + "grad_norm": 4.84375, + "learning_rate": 9.58155836275381e-06, + "loss": 1.6178339, + "memory(GiB)": 97.17, + "step": 13765, + "train_speed(iter/s)": 1.638224 + }, + { + "acc": 0.64724798, + "epoch": 0.3493150684931507, + "grad_norm": 6.0625, + "learning_rate": 9.581138323996639e-06, + "loss": 1.71036758, + "memory(GiB)": 97.17, + "step": 13770, + "train_speed(iter/s)": 1.638288 + }, + { + "acc": 0.66152434, + "epoch": 0.34944190766108574, + "grad_norm": 5.5, + "learning_rate": 9.580718083740553e-06, + "loss": 1.68143501, + "memory(GiB)": 97.17, + "step": 13775, + "train_speed(iter/s)": 1.638352 + }, + { + "acc": 0.68205414, + "epoch": 0.3495687468290208, + "grad_norm": 5.28125, + "learning_rate": 9.580297642004032e-06, + "loss": 1.54240494, + "memory(GiB)": 97.17, + "step": 13780, + "train_speed(iter/s)": 1.638412 + }, + { + "acc": 0.65155163, + "epoch": 0.3496955859969559, + "grad_norm": 4.90625, + "learning_rate": 9.579876998805573e-06, + "loss": 1.6943512, + "memory(GiB)": 97.17, + "step": 13785, + "train_speed(iter/s)": 1.638475 + }, + { + "acc": 0.65310907, + "epoch": 0.3498224251648909, + "grad_norm": 5.84375, + "learning_rate": 9.579456154163676e-06, + "loss": 1.64239845, + "memory(GiB)": 97.17, + "step": 13790, + "train_speed(iter/s)": 1.638537 + }, + { + "acc": 0.6457129, + "epoch": 0.34994926433282597, + "grad_norm": 5.28125, + "learning_rate": 9.57903510809685e-06, + "loss": 1.60808601, + "memory(GiB)": 97.17, + "step": 13795, + "train_speed(iter/s)": 1.638596 + }, + { + "acc": 0.65088377, + "epoch": 0.350076103500761, + "grad_norm": 6.28125, + "learning_rate": 9.578613860623617e-06, + "loss": 1.60699387, + "memory(GiB)": 97.17, + "step": 13800, + "train_speed(iter/s)": 1.638658 + }, + { + "acc": 0.66416445, + "epoch": 0.3502029426686961, + "grad_norm": 5.875, + "learning_rate": 9.578192411762503e-06, + "loss": 1.53634377, + "memory(GiB)": 97.17, + "step": 13805, + "train_speed(iter/s)": 1.638723 + }, + { + "acc": 0.64890122, + "epoch": 0.35032978183663116, + "grad_norm": 4.5625, + "learning_rate": 9.577770761532049e-06, + "loss": 1.66492214, + "memory(GiB)": 97.17, + "step": 13810, + "train_speed(iter/s)": 1.638784 + }, + { + "acc": 0.64871078, + "epoch": 0.3504566210045662, + "grad_norm": 6.375, + "learning_rate": 9.577348909950797e-06, + "loss": 1.65300541, + "memory(GiB)": 97.17, + "step": 13815, + "train_speed(iter/s)": 1.638839 + }, + { + "acc": 0.66125431, + "epoch": 0.35058346017250125, + "grad_norm": 5.34375, + "learning_rate": 9.576926857037303e-06, + "loss": 1.4876543, + "memory(GiB)": 97.17, + "step": 13820, + "train_speed(iter/s)": 1.638902 + }, + { + "acc": 0.66586304, + "epoch": 0.35071029934043635, + "grad_norm": 5.71875, + "learning_rate": 9.576504602810133e-06, + "loss": 1.63209648, + "memory(GiB)": 97.17, + "step": 13825, + "train_speed(iter/s)": 1.638967 + }, + { + "acc": 0.65355177, + "epoch": 0.3508371385083714, + "grad_norm": 5.28125, + "learning_rate": 9.576082147287858e-06, + "loss": 1.6247776, + "memory(GiB)": 97.17, + "step": 13830, + "train_speed(iter/s)": 1.63903 + }, + { + "acc": 0.65186634, + "epoch": 0.35096397767630644, + "grad_norm": 5.96875, + "learning_rate": 9.575659490489058e-06, + "loss": 1.58020687, + "memory(GiB)": 97.17, + "step": 13835, + "train_speed(iter/s)": 1.639093 + }, + { + "acc": 0.65629144, + "epoch": 0.3510908168442415, + "grad_norm": 5.5, + "learning_rate": 9.575236632432325e-06, + "loss": 1.63607521, + "memory(GiB)": 97.17, + "step": 13840, + "train_speed(iter/s)": 1.639153 + }, + { + "acc": 0.66264906, + "epoch": 0.3512176560121766, + "grad_norm": 6.125, + "learning_rate": 9.574813573136259e-06, + "loss": 1.62069511, + "memory(GiB)": 97.17, + "step": 13845, + "train_speed(iter/s)": 1.639213 + }, + { + "acc": 0.6588851, + "epoch": 0.3513444951801116, + "grad_norm": 4.78125, + "learning_rate": 9.574390312619466e-06, + "loss": 1.62078972, + "memory(GiB)": 97.17, + "step": 13850, + "train_speed(iter/s)": 1.639271 + }, + { + "acc": 0.66158595, + "epoch": 0.35147133434804667, + "grad_norm": 5.125, + "learning_rate": 9.573966850900565e-06, + "loss": 1.61515694, + "memory(GiB)": 97.17, + "step": 13855, + "train_speed(iter/s)": 1.639328 + }, + { + "acc": 0.64539061, + "epoch": 0.3515981735159817, + "grad_norm": 5.90625, + "learning_rate": 9.57354318799818e-06, + "loss": 1.58629074, + "memory(GiB)": 97.17, + "step": 13860, + "train_speed(iter/s)": 1.639389 + }, + { + "acc": 0.6522891, + "epoch": 0.3517250126839168, + "grad_norm": 5.46875, + "learning_rate": 9.573119323930946e-06, + "loss": 1.62132435, + "memory(GiB)": 97.17, + "step": 13865, + "train_speed(iter/s)": 1.639449 + }, + { + "acc": 0.65135703, + "epoch": 0.35185185185185186, + "grad_norm": 5.3125, + "learning_rate": 9.572695258717507e-06, + "loss": 1.64454937, + "memory(GiB)": 97.17, + "step": 13870, + "train_speed(iter/s)": 1.63951 + }, + { + "acc": 0.65078793, + "epoch": 0.3519786910197869, + "grad_norm": 5.25, + "learning_rate": 9.572270992376513e-06, + "loss": 1.6489296, + "memory(GiB)": 97.17, + "step": 13875, + "train_speed(iter/s)": 1.639573 + }, + { + "acc": 0.65659442, + "epoch": 0.35210553018772195, + "grad_norm": 5.03125, + "learning_rate": 9.571846524926629e-06, + "loss": 1.62971611, + "memory(GiB)": 97.17, + "step": 13880, + "train_speed(iter/s)": 1.639633 + }, + { + "acc": 0.65870566, + "epoch": 0.35223236935565705, + "grad_norm": 5.03125, + "learning_rate": 9.571421856386522e-06, + "loss": 1.67318134, + "memory(GiB)": 97.17, + "step": 13885, + "train_speed(iter/s)": 1.639693 + }, + { + "acc": 0.65042315, + "epoch": 0.3523592085235921, + "grad_norm": 5.25, + "learning_rate": 9.570996986774872e-06, + "loss": 1.63607979, + "memory(GiB)": 97.17, + "step": 13890, + "train_speed(iter/s)": 1.638787 + }, + { + "acc": 0.64825239, + "epoch": 0.35248604769152714, + "grad_norm": 6.875, + "learning_rate": 9.570571916110366e-06, + "loss": 1.66116676, + "memory(GiB)": 97.17, + "step": 13895, + "train_speed(iter/s)": 1.638846 + }, + { + "acc": 0.65396852, + "epoch": 0.3526128868594622, + "grad_norm": 5.09375, + "learning_rate": 9.570146644411705e-06, + "loss": 1.62385712, + "memory(GiB)": 97.17, + "step": 13900, + "train_speed(iter/s)": 1.637907 + }, + { + "acc": 0.65529709, + "epoch": 0.3527397260273973, + "grad_norm": 5.5625, + "learning_rate": 9.569721171697587e-06, + "loss": 1.63701954, + "memory(GiB)": 97.17, + "step": 13905, + "train_speed(iter/s)": 1.637964 + }, + { + "acc": 0.64981155, + "epoch": 0.3528665651953323, + "grad_norm": 5.21875, + "learning_rate": 9.569295497986727e-06, + "loss": 1.68096714, + "memory(GiB)": 97.17, + "step": 13910, + "train_speed(iter/s)": 1.636975 + }, + { + "acc": 0.66631312, + "epoch": 0.35299340436326737, + "grad_norm": 6.375, + "learning_rate": 9.568869623297855e-06, + "loss": 1.60080128, + "memory(GiB)": 97.17, + "step": 13915, + "train_speed(iter/s)": 1.637038 + }, + { + "acc": 0.64119601, + "epoch": 0.3531202435312024, + "grad_norm": 5.1875, + "learning_rate": 9.568443547649697e-06, + "loss": 1.72888298, + "memory(GiB)": 97.17, + "step": 13920, + "train_speed(iter/s)": 1.637102 + }, + { + "acc": 0.6534049, + "epoch": 0.3532470826991375, + "grad_norm": 6.125, + "learning_rate": 9.568017271060994e-06, + "loss": 1.64671173, + "memory(GiB)": 97.17, + "step": 13925, + "train_speed(iter/s)": 1.637161 + }, + { + "acc": 0.66404309, + "epoch": 0.35337392186707256, + "grad_norm": 7.125, + "learning_rate": 9.567590793550498e-06, + "loss": 1.58683472, + "memory(GiB)": 97.17, + "step": 13930, + "train_speed(iter/s)": 1.637219 + }, + { + "acc": 0.65419855, + "epoch": 0.3535007610350076, + "grad_norm": 6.40625, + "learning_rate": 9.567164115136965e-06, + "loss": 1.62939644, + "memory(GiB)": 97.17, + "step": 13935, + "train_speed(iter/s)": 1.637283 + }, + { + "acc": 0.66233878, + "epoch": 0.35362760020294265, + "grad_norm": 5.1875, + "learning_rate": 9.566737235839166e-06, + "loss": 1.61837234, + "memory(GiB)": 97.17, + "step": 13940, + "train_speed(iter/s)": 1.637346 + }, + { + "acc": 0.64619679, + "epoch": 0.35375443937087775, + "grad_norm": 6.5625, + "learning_rate": 9.566310155675871e-06, + "loss": 1.64337883, + "memory(GiB)": 97.17, + "step": 13945, + "train_speed(iter/s)": 1.637411 + }, + { + "acc": 0.64525166, + "epoch": 0.3538812785388128, + "grad_norm": 4.25, + "learning_rate": 9.56588287466587e-06, + "loss": 1.58622475, + "memory(GiB)": 97.17, + "step": 13950, + "train_speed(iter/s)": 1.637471 + }, + { + "acc": 0.6659534, + "epoch": 0.35400811770674784, + "grad_norm": 5.96875, + "learning_rate": 9.565455392827954e-06, + "loss": 1.58146667, + "memory(GiB)": 97.17, + "step": 13955, + "train_speed(iter/s)": 1.637531 + }, + { + "acc": 0.64464064, + "epoch": 0.3541349568746829, + "grad_norm": 5.15625, + "learning_rate": 9.565027710180927e-06, + "loss": 1.65124092, + "memory(GiB)": 97.17, + "step": 13960, + "train_speed(iter/s)": 1.637589 + }, + { + "acc": 0.66620688, + "epoch": 0.354261796042618, + "grad_norm": 5.9375, + "learning_rate": 9.5645998267436e-06, + "loss": 1.51923618, + "memory(GiB)": 97.17, + "step": 13965, + "train_speed(iter/s)": 1.637652 + }, + { + "acc": 0.64231234, + "epoch": 0.354388635210553, + "grad_norm": 4.90625, + "learning_rate": 9.564171742534794e-06, + "loss": 1.64211617, + "memory(GiB)": 97.17, + "step": 13970, + "train_speed(iter/s)": 1.637712 + }, + { + "acc": 0.65112343, + "epoch": 0.35451547437848807, + "grad_norm": 6.28125, + "learning_rate": 9.563743457573336e-06, + "loss": 1.67483883, + "memory(GiB)": 97.17, + "step": 13975, + "train_speed(iter/s)": 1.637769 + }, + { + "acc": 0.64085951, + "epoch": 0.3546423135464231, + "grad_norm": 5.1875, + "learning_rate": 9.563314971878065e-06, + "loss": 1.65576439, + "memory(GiB)": 97.17, + "step": 13980, + "train_speed(iter/s)": 1.637828 + }, + { + "acc": 0.64888682, + "epoch": 0.3547691527143582, + "grad_norm": 5.5625, + "learning_rate": 9.562886285467828e-06, + "loss": 1.6142662, + "memory(GiB)": 97.17, + "step": 13985, + "train_speed(iter/s)": 1.637892 + }, + { + "acc": 0.64186583, + "epoch": 0.35489599188229326, + "grad_norm": 5.75, + "learning_rate": 9.56245739836148e-06, + "loss": 1.6557024, + "memory(GiB)": 97.17, + "step": 13990, + "train_speed(iter/s)": 1.637949 + }, + { + "acc": 0.6515049, + "epoch": 0.3550228310502283, + "grad_norm": 6.21875, + "learning_rate": 9.562028310577887e-06, + "loss": 1.62775726, + "memory(GiB)": 97.17, + "step": 13995, + "train_speed(iter/s)": 1.638012 + }, + { + "acc": 0.64320784, + "epoch": 0.35514967021816335, + "grad_norm": 5.5625, + "learning_rate": 9.56159902213592e-06, + "loss": 1.65720387, + "memory(GiB)": 97.17, + "step": 14000, + "train_speed(iter/s)": 1.638075 + }, + { + "epoch": 0.35514967021816335, + "eval_acc": 0.6414904658384445, + "eval_loss": 1.6004287004470825, + "eval_runtime": 59.2539, + "eval_samples_per_second": 107.503, + "eval_steps_per_second": 26.884, + "step": 14000 + }, + { + "acc": 0.65292993, + "epoch": 0.35527650938609845, + "grad_norm": 5.875, + "learning_rate": 9.561169533054462e-06, + "loss": 1.63042793, + "memory(GiB)": 97.17, + "step": 14005, + "train_speed(iter/s)": 1.626063 + }, + { + "acc": 0.66362543, + "epoch": 0.3554033485540335, + "grad_norm": 6.84375, + "learning_rate": 9.560739843352404e-06, + "loss": 1.62679939, + "memory(GiB)": 97.17, + "step": 14010, + "train_speed(iter/s)": 1.626126 + }, + { + "acc": 0.64804759, + "epoch": 0.35553018772196854, + "grad_norm": 6.09375, + "learning_rate": 9.560309953048645e-06, + "loss": 1.6837923, + "memory(GiB)": 97.17, + "step": 14015, + "train_speed(iter/s)": 1.625266 + }, + { + "acc": 0.63858299, + "epoch": 0.3556570268899036, + "grad_norm": 5.3125, + "learning_rate": 9.559879862162095e-06, + "loss": 1.69130516, + "memory(GiB)": 97.17, + "step": 14020, + "train_speed(iter/s)": 1.625333 + }, + { + "acc": 0.65001059, + "epoch": 0.3557838660578387, + "grad_norm": 5.46875, + "learning_rate": 9.55944957071167e-06, + "loss": 1.66215382, + "memory(GiB)": 97.17, + "step": 14025, + "train_speed(iter/s)": 1.625399 + }, + { + "acc": 0.64804235, + "epoch": 0.3559107052257737, + "grad_norm": 6.875, + "learning_rate": 9.559019078716295e-06, + "loss": 1.65577374, + "memory(GiB)": 97.17, + "step": 14030, + "train_speed(iter/s)": 1.625471 + }, + { + "acc": 0.65114422, + "epoch": 0.35603754439370877, + "grad_norm": 7.09375, + "learning_rate": 9.558588386194907e-06, + "loss": 1.68736877, + "memory(GiB)": 97.17, + "step": 14035, + "train_speed(iter/s)": 1.625539 + }, + { + "acc": 0.64714065, + "epoch": 0.3561643835616438, + "grad_norm": 6.875, + "learning_rate": 9.55815749316645e-06, + "loss": 1.59792538, + "memory(GiB)": 97.17, + "step": 14040, + "train_speed(iter/s)": 1.625605 + }, + { + "acc": 0.63453717, + "epoch": 0.3562912227295789, + "grad_norm": 5.5, + "learning_rate": 9.557726399649875e-06, + "loss": 1.66835327, + "memory(GiB)": 97.17, + "step": 14045, + "train_speed(iter/s)": 1.625672 + }, + { + "acc": 0.64820013, + "epoch": 0.35641806189751396, + "grad_norm": 7.1875, + "learning_rate": 9.557295105664144e-06, + "loss": 1.64806557, + "memory(GiB)": 97.17, + "step": 14050, + "train_speed(iter/s)": 1.625739 + }, + { + "acc": 0.63990793, + "epoch": 0.356544901065449, + "grad_norm": 5.8125, + "learning_rate": 9.556863611228228e-06, + "loss": 1.69207649, + "memory(GiB)": 97.17, + "step": 14055, + "train_speed(iter/s)": 1.625806 + }, + { + "acc": 0.65620365, + "epoch": 0.35667174023338405, + "grad_norm": 5.78125, + "learning_rate": 9.556431916361105e-06, + "loss": 1.6343586, + "memory(GiB)": 97.17, + "step": 14060, + "train_speed(iter/s)": 1.625874 + }, + { + "acc": 0.63775182, + "epoch": 0.35679857940131915, + "grad_norm": 5.3125, + "learning_rate": 9.556000021081764e-06, + "loss": 1.63741646, + "memory(GiB)": 97.17, + "step": 14065, + "train_speed(iter/s)": 1.625941 + }, + { + "acc": 0.65458846, + "epoch": 0.3569254185692542, + "grad_norm": 7.25, + "learning_rate": 9.5555679254092e-06, + "loss": 1.55885248, + "memory(GiB)": 97.17, + "step": 14070, + "train_speed(iter/s)": 1.626004 + }, + { + "acc": 0.64213009, + "epoch": 0.35705225773718924, + "grad_norm": 5.875, + "learning_rate": 9.55513562936242e-06, + "loss": 1.7240551, + "memory(GiB)": 97.17, + "step": 14075, + "train_speed(iter/s)": 1.626071 + }, + { + "acc": 0.63884268, + "epoch": 0.3571790969051243, + "grad_norm": 6.09375, + "learning_rate": 9.554703132960437e-06, + "loss": 1.74014511, + "memory(GiB)": 97.17, + "step": 14080, + "train_speed(iter/s)": 1.626137 + }, + { + "acc": 0.6602293, + "epoch": 0.3573059360730594, + "grad_norm": 5.09375, + "learning_rate": 9.554270436222277e-06, + "loss": 1.56141319, + "memory(GiB)": 97.17, + "step": 14085, + "train_speed(iter/s)": 1.6262 + }, + { + "acc": 0.64819026, + "epoch": 0.3574327752409944, + "grad_norm": 5.25, + "learning_rate": 9.553837539166969e-06, + "loss": 1.60231209, + "memory(GiB)": 97.17, + "step": 14090, + "train_speed(iter/s)": 1.626265 + }, + { + "acc": 0.64707174, + "epoch": 0.35755961440892947, + "grad_norm": 6.0, + "learning_rate": 9.553404441813554e-06, + "loss": 1.69079437, + "memory(GiB)": 97.17, + "step": 14095, + "train_speed(iter/s)": 1.626332 + }, + { + "acc": 0.6546186, + "epoch": 0.3576864535768645, + "grad_norm": 7.84375, + "learning_rate": 9.552971144181083e-06, + "loss": 1.58403511, + "memory(GiB)": 97.17, + "step": 14100, + "train_speed(iter/s)": 1.626398 + }, + { + "acc": 0.65222545, + "epoch": 0.3578132927447996, + "grad_norm": 5.28125, + "learning_rate": 9.552537646288612e-06, + "loss": 1.6568119, + "memory(GiB)": 97.17, + "step": 14105, + "train_speed(iter/s)": 1.626462 + }, + { + "acc": 0.66831675, + "epoch": 0.35794013191273466, + "grad_norm": 6.46875, + "learning_rate": 9.552103948155211e-06, + "loss": 1.57768946, + "memory(GiB)": 97.17, + "step": 14110, + "train_speed(iter/s)": 1.626529 + }, + { + "acc": 0.66277895, + "epoch": 0.3580669710806697, + "grad_norm": 6.59375, + "learning_rate": 9.551670049799954e-06, + "loss": 1.62563763, + "memory(GiB)": 97.17, + "step": 14115, + "train_speed(iter/s)": 1.626598 + }, + { + "acc": 0.64569521, + "epoch": 0.35819381024860475, + "grad_norm": 5.6875, + "learning_rate": 9.551235951241927e-06, + "loss": 1.62170048, + "memory(GiB)": 97.17, + "step": 14120, + "train_speed(iter/s)": 1.626663 + }, + { + "acc": 0.64057965, + "epoch": 0.35832064941653985, + "grad_norm": 6.03125, + "learning_rate": 9.550801652500223e-06, + "loss": 1.72616081, + "memory(GiB)": 97.17, + "step": 14125, + "train_speed(iter/s)": 1.626714 + }, + { + "acc": 0.65643935, + "epoch": 0.3584474885844749, + "grad_norm": 6.1875, + "learning_rate": 9.550367153593944e-06, + "loss": 1.64241409, + "memory(GiB)": 97.17, + "step": 14130, + "train_speed(iter/s)": 1.626783 + }, + { + "acc": 0.6423604, + "epoch": 0.35857432775240994, + "grad_norm": 4.875, + "learning_rate": 9.549932454542202e-06, + "loss": 1.64936028, + "memory(GiB)": 97.17, + "step": 14135, + "train_speed(iter/s)": 1.626848 + }, + { + "acc": 0.66251836, + "epoch": 0.358701166920345, + "grad_norm": 7.75, + "learning_rate": 9.549497555364115e-06, + "loss": 1.58814211, + "memory(GiB)": 97.17, + "step": 14140, + "train_speed(iter/s)": 1.626917 + }, + { + "acc": 0.65246286, + "epoch": 0.3588280060882801, + "grad_norm": 6.53125, + "learning_rate": 9.549062456078816e-06, + "loss": 1.60987892, + "memory(GiB)": 97.17, + "step": 14145, + "train_speed(iter/s)": 1.626982 + }, + { + "acc": 0.65147219, + "epoch": 0.3589548452562151, + "grad_norm": 5.5, + "learning_rate": 9.54862715670544e-06, + "loss": 1.62243233, + "memory(GiB)": 97.17, + "step": 14150, + "train_speed(iter/s)": 1.62705 + }, + { + "acc": 0.65065045, + "epoch": 0.35908168442415017, + "grad_norm": 4.9375, + "learning_rate": 9.548191657263132e-06, + "loss": 1.61188564, + "memory(GiB)": 97.17, + "step": 14155, + "train_speed(iter/s)": 1.627118 + }, + { + "acc": 0.66279345, + "epoch": 0.3592085235920852, + "grad_norm": 5.15625, + "learning_rate": 9.547755957771049e-06, + "loss": 1.62165356, + "memory(GiB)": 97.17, + "step": 14160, + "train_speed(iter/s)": 1.627184 + }, + { + "acc": 0.65560017, + "epoch": 0.3593353627600203, + "grad_norm": 5.03125, + "learning_rate": 9.547320058248356e-06, + "loss": 1.59486837, + "memory(GiB)": 97.17, + "step": 14165, + "train_speed(iter/s)": 1.627253 + }, + { + "acc": 0.65943356, + "epoch": 0.35946220192795536, + "grad_norm": 6.1875, + "learning_rate": 9.546883958714223e-06, + "loss": 1.66743088, + "memory(GiB)": 97.17, + "step": 14170, + "train_speed(iter/s)": 1.627325 + }, + { + "acc": 0.62036667, + "epoch": 0.3595890410958904, + "grad_norm": 5.75, + "learning_rate": 9.546447659187834e-06, + "loss": 1.71007957, + "memory(GiB)": 97.17, + "step": 14175, + "train_speed(iter/s)": 1.627394 + }, + { + "acc": 0.65520248, + "epoch": 0.35971588026382545, + "grad_norm": 5.15625, + "learning_rate": 9.546011159688377e-06, + "loss": 1.60203037, + "memory(GiB)": 97.17, + "step": 14180, + "train_speed(iter/s)": 1.627459 + }, + { + "acc": 0.66012106, + "epoch": 0.35984271943176055, + "grad_norm": 5.1875, + "learning_rate": 9.545574460235055e-06, + "loss": 1.63000755, + "memory(GiB)": 97.17, + "step": 14185, + "train_speed(iter/s)": 1.627527 + }, + { + "acc": 0.64758515, + "epoch": 0.3599695585996956, + "grad_norm": 5.4375, + "learning_rate": 9.545137560847071e-06, + "loss": 1.5833518, + "memory(GiB)": 97.17, + "step": 14190, + "train_speed(iter/s)": 1.627596 + }, + { + "acc": 0.64470348, + "epoch": 0.36009639776763064, + "grad_norm": 5.78125, + "learning_rate": 9.544700461543647e-06, + "loss": 1.64939404, + "memory(GiB)": 97.17, + "step": 14195, + "train_speed(iter/s)": 1.627666 + }, + { + "acc": 0.66848049, + "epoch": 0.3602232369355657, + "grad_norm": 5.28125, + "learning_rate": 9.544263162344005e-06, + "loss": 1.58201141, + "memory(GiB)": 97.17, + "step": 14200, + "train_speed(iter/s)": 1.627735 + }, + { + "acc": 0.64273958, + "epoch": 0.3603500761035008, + "grad_norm": 5.84375, + "learning_rate": 9.54382566326738e-06, + "loss": 1.68439293, + "memory(GiB)": 97.17, + "step": 14205, + "train_speed(iter/s)": 1.627798 + }, + { + "acc": 0.64344196, + "epoch": 0.3604769152714358, + "grad_norm": 5.0625, + "learning_rate": 9.543387964333018e-06, + "loss": 1.6235239, + "memory(GiB)": 97.17, + "step": 14210, + "train_speed(iter/s)": 1.627864 + }, + { + "acc": 0.63097067, + "epoch": 0.36060375443937087, + "grad_norm": 5.84375, + "learning_rate": 9.542950065560165e-06, + "loss": 1.6433712, + "memory(GiB)": 97.17, + "step": 14215, + "train_speed(iter/s)": 1.627928 + }, + { + "acc": 0.62943425, + "epoch": 0.3607305936073059, + "grad_norm": 5.125, + "learning_rate": 9.542511966968087e-06, + "loss": 1.74006004, + "memory(GiB)": 97.17, + "step": 14220, + "train_speed(iter/s)": 1.627993 + }, + { + "acc": 0.65337658, + "epoch": 0.360857432775241, + "grad_norm": 5.9375, + "learning_rate": 9.542073668576052e-06, + "loss": 1.55032177, + "memory(GiB)": 97.17, + "step": 14225, + "train_speed(iter/s)": 1.628058 + }, + { + "acc": 0.65861611, + "epoch": 0.36098427194317606, + "grad_norm": 4.59375, + "learning_rate": 9.541635170403338e-06, + "loss": 1.58314495, + "memory(GiB)": 97.17, + "step": 14230, + "train_speed(iter/s)": 1.628127 + }, + { + "acc": 0.64714174, + "epoch": 0.3611111111111111, + "grad_norm": 5.21875, + "learning_rate": 9.541196472469234e-06, + "loss": 1.62513771, + "memory(GiB)": 97.17, + "step": 14235, + "train_speed(iter/s)": 1.628191 + }, + { + "acc": 0.65190058, + "epoch": 0.36123795027904615, + "grad_norm": 6.6875, + "learning_rate": 9.540757574793032e-06, + "loss": 1.6331768, + "memory(GiB)": 97.17, + "step": 14240, + "train_speed(iter/s)": 1.628257 + }, + { + "acc": 0.64696169, + "epoch": 0.36136478944698125, + "grad_norm": 6.03125, + "learning_rate": 9.540318477394039e-06, + "loss": 1.64407692, + "memory(GiB)": 97.17, + "step": 14245, + "train_speed(iter/s)": 1.628328 + }, + { + "acc": 0.66525965, + "epoch": 0.3614916286149163, + "grad_norm": 4.8125, + "learning_rate": 9.539879180291568e-06, + "loss": 1.58592291, + "memory(GiB)": 97.17, + "step": 14250, + "train_speed(iter/s)": 1.628393 + }, + { + "acc": 0.6540174, + "epoch": 0.36161846778285134, + "grad_norm": 5.3125, + "learning_rate": 9.539439683504943e-06, + "loss": 1.61532097, + "memory(GiB)": 97.17, + "step": 14255, + "train_speed(iter/s)": 1.62846 + }, + { + "acc": 0.64410844, + "epoch": 0.3617453069507864, + "grad_norm": 5.46875, + "learning_rate": 9.538999987053492e-06, + "loss": 1.70699844, + "memory(GiB)": 97.17, + "step": 14260, + "train_speed(iter/s)": 1.628529 + }, + { + "acc": 0.64135828, + "epoch": 0.3618721461187215, + "grad_norm": 5.03125, + "learning_rate": 9.538560090956557e-06, + "loss": 1.67186012, + "memory(GiB)": 97.17, + "step": 14265, + "train_speed(iter/s)": 1.628594 + }, + { + "acc": 0.6543262, + "epoch": 0.3619989852866565, + "grad_norm": 5.78125, + "learning_rate": 9.538119995233485e-06, + "loss": 1.61801453, + "memory(GiB)": 97.17, + "step": 14270, + "train_speed(iter/s)": 1.628661 + }, + { + "acc": 0.6599596, + "epoch": 0.36212582445459157, + "grad_norm": 5.03125, + "learning_rate": 9.537679699903637e-06, + "loss": 1.64361725, + "memory(GiB)": 97.17, + "step": 14275, + "train_speed(iter/s)": 1.628721 + }, + { + "acc": 0.67413931, + "epoch": 0.3622526636225266, + "grad_norm": 7.0, + "learning_rate": 9.537239204986375e-06, + "loss": 1.48503141, + "memory(GiB)": 97.17, + "step": 14280, + "train_speed(iter/s)": 1.628788 + }, + { + "acc": 0.63579888, + "epoch": 0.3623795027904617, + "grad_norm": 5.03125, + "learning_rate": 9.536798510501075e-06, + "loss": 1.65280933, + "memory(GiB)": 97.17, + "step": 14285, + "train_speed(iter/s)": 1.628854 + }, + { + "acc": 0.65138206, + "epoch": 0.36250634195839676, + "grad_norm": 9.625, + "learning_rate": 9.536357616467123e-06, + "loss": 1.60710125, + "memory(GiB)": 97.17, + "step": 14290, + "train_speed(iter/s)": 1.628924 + }, + { + "acc": 0.65690031, + "epoch": 0.3626331811263318, + "grad_norm": 5.03125, + "learning_rate": 9.535916522903908e-06, + "loss": 1.59372406, + "memory(GiB)": 97.17, + "step": 14295, + "train_speed(iter/s)": 1.62899 + }, + { + "acc": 0.64046726, + "epoch": 0.36276002029426685, + "grad_norm": 6.40625, + "learning_rate": 9.535475229830832e-06, + "loss": 1.69992867, + "memory(GiB)": 97.17, + "step": 14300, + "train_speed(iter/s)": 1.629054 + }, + { + "acc": 0.64613805, + "epoch": 0.36288685946220195, + "grad_norm": 5.09375, + "learning_rate": 9.535033737267308e-06, + "loss": 1.59447412, + "memory(GiB)": 97.17, + "step": 14305, + "train_speed(iter/s)": 1.629118 + }, + { + "acc": 0.64742351, + "epoch": 0.363013698630137, + "grad_norm": 5.9375, + "learning_rate": 9.534592045232752e-06, + "loss": 1.56169319, + "memory(GiB)": 97.17, + "step": 14310, + "train_speed(iter/s)": 1.629182 + }, + { + "acc": 0.64363832, + "epoch": 0.36314053779807204, + "grad_norm": 5.6875, + "learning_rate": 9.534150153746591e-06, + "loss": 1.70888824, + "memory(GiB)": 97.17, + "step": 14315, + "train_speed(iter/s)": 1.629244 + }, + { + "acc": 0.64335051, + "epoch": 0.3632673769660071, + "grad_norm": 6.4375, + "learning_rate": 9.533708062828264e-06, + "loss": 1.62780361, + "memory(GiB)": 97.17, + "step": 14320, + "train_speed(iter/s)": 1.629314 + }, + { + "acc": 0.65207286, + "epoch": 0.3633942161339422, + "grad_norm": 5.40625, + "learning_rate": 9.533265772497216e-06, + "loss": 1.67987556, + "memory(GiB)": 97.17, + "step": 14325, + "train_speed(iter/s)": 1.629379 + }, + { + "acc": 0.65923223, + "epoch": 0.3635210553018772, + "grad_norm": 5.875, + "learning_rate": 9.532823282772899e-06, + "loss": 1.58446646, + "memory(GiB)": 97.17, + "step": 14330, + "train_speed(iter/s)": 1.629448 + }, + { + "acc": 0.64248061, + "epoch": 0.36364789446981227, + "grad_norm": 6.15625, + "learning_rate": 9.532380593674775e-06, + "loss": 1.61728401, + "memory(GiB)": 97.17, + "step": 14335, + "train_speed(iter/s)": 1.629513 + }, + { + "acc": 0.64716191, + "epoch": 0.3637747336377473, + "grad_norm": 6.9375, + "learning_rate": 9.531937705222319e-06, + "loss": 1.67005825, + "memory(GiB)": 97.17, + "step": 14340, + "train_speed(iter/s)": 1.629582 + }, + { + "acc": 0.6364892, + "epoch": 0.3639015728056824, + "grad_norm": 5.625, + "learning_rate": 9.531494617435006e-06, + "loss": 1.63972435, + "memory(GiB)": 97.17, + "step": 14345, + "train_speed(iter/s)": 1.629645 + }, + { + "acc": 0.65665822, + "epoch": 0.36402841197361746, + "grad_norm": 7.4375, + "learning_rate": 9.531051330332331e-06, + "loss": 1.5889123, + "memory(GiB)": 97.17, + "step": 14350, + "train_speed(iter/s)": 1.629712 + }, + { + "acc": 0.63490801, + "epoch": 0.3641552511415525, + "grad_norm": 5.65625, + "learning_rate": 9.530607843933788e-06, + "loss": 1.67954674, + "memory(GiB)": 97.17, + "step": 14355, + "train_speed(iter/s)": 1.629778 + }, + { + "acc": 0.65125618, + "epoch": 0.36428209030948755, + "grad_norm": 5.46875, + "learning_rate": 9.530164158258883e-06, + "loss": 1.57851257, + "memory(GiB)": 97.17, + "step": 14360, + "train_speed(iter/s)": 1.629843 + }, + { + "acc": 0.66065435, + "epoch": 0.36440892947742265, + "grad_norm": 5.59375, + "learning_rate": 9.529720273327135e-06, + "loss": 1.5982132, + "memory(GiB)": 97.17, + "step": 14365, + "train_speed(iter/s)": 1.629907 + }, + { + "acc": 0.65221481, + "epoch": 0.3645357686453577, + "grad_norm": 6.46875, + "learning_rate": 9.529276189158063e-06, + "loss": 1.67781277, + "memory(GiB)": 97.17, + "step": 14370, + "train_speed(iter/s)": 1.629972 + }, + { + "acc": 0.65666084, + "epoch": 0.36466260781329274, + "grad_norm": 5.09375, + "learning_rate": 9.528831905771205e-06, + "loss": 1.63739128, + "memory(GiB)": 97.17, + "step": 14375, + "train_speed(iter/s)": 1.630032 + }, + { + "acc": 0.64627666, + "epoch": 0.3647894469812278, + "grad_norm": 6.96875, + "learning_rate": 9.528387423186098e-06, + "loss": 1.66223564, + "memory(GiB)": 97.17, + "step": 14380, + "train_speed(iter/s)": 1.630098 + }, + { + "acc": 0.65122247, + "epoch": 0.3649162861491629, + "grad_norm": 5.46875, + "learning_rate": 9.527942741422297e-06, + "loss": 1.60981102, + "memory(GiB)": 97.17, + "step": 14385, + "train_speed(iter/s)": 1.630165 + }, + { + "acc": 0.64303856, + "epoch": 0.3650431253170979, + "grad_norm": 5.71875, + "learning_rate": 9.527497860499355e-06, + "loss": 1.70701275, + "memory(GiB)": 97.17, + "step": 14390, + "train_speed(iter/s)": 1.63023 + }, + { + "acc": 0.65450912, + "epoch": 0.36516996448503297, + "grad_norm": 5.9375, + "learning_rate": 9.527052780436845e-06, + "loss": 1.54058809, + "memory(GiB)": 97.17, + "step": 14395, + "train_speed(iter/s)": 1.630296 + }, + { + "acc": 0.64673252, + "epoch": 0.365296803652968, + "grad_norm": 5.9375, + "learning_rate": 9.52660750125434e-06, + "loss": 1.62171021, + "memory(GiB)": 97.17, + "step": 14400, + "train_speed(iter/s)": 1.630358 + }, + { + "acc": 0.64762077, + "epoch": 0.3654236428209031, + "grad_norm": 5.0625, + "learning_rate": 9.52616202297143e-06, + "loss": 1.64411507, + "memory(GiB)": 97.17, + "step": 14405, + "train_speed(iter/s)": 1.630421 + }, + { + "acc": 0.64639888, + "epoch": 0.36555048198883816, + "grad_norm": 4.65625, + "learning_rate": 9.525716345607706e-06, + "loss": 1.62094383, + "memory(GiB)": 97.17, + "step": 14410, + "train_speed(iter/s)": 1.630484 + }, + { + "acc": 0.62937675, + "epoch": 0.3656773211567732, + "grad_norm": 4.78125, + "learning_rate": 9.52527046918277e-06, + "loss": 1.70464325, + "memory(GiB)": 97.17, + "step": 14415, + "train_speed(iter/s)": 1.630548 + }, + { + "acc": 0.63535228, + "epoch": 0.36580416032470825, + "grad_norm": 4.8125, + "learning_rate": 9.524824393716235e-06, + "loss": 1.70969715, + "memory(GiB)": 97.17, + "step": 14420, + "train_speed(iter/s)": 1.630611 + }, + { + "acc": 0.646453, + "epoch": 0.36593099949264335, + "grad_norm": 6.125, + "learning_rate": 9.524378119227722e-06, + "loss": 1.62970886, + "memory(GiB)": 97.17, + "step": 14425, + "train_speed(iter/s)": 1.630675 + }, + { + "acc": 0.65223055, + "epoch": 0.3660578386605784, + "grad_norm": 5.65625, + "learning_rate": 9.523931645736858e-06, + "loss": 1.63869209, + "memory(GiB)": 97.17, + "step": 14430, + "train_speed(iter/s)": 1.63074 + }, + { + "acc": 0.65391784, + "epoch": 0.36618467782851344, + "grad_norm": 5.75, + "learning_rate": 9.523484973263283e-06, + "loss": 1.65556507, + "memory(GiB)": 97.17, + "step": 14435, + "train_speed(iter/s)": 1.630804 + }, + { + "acc": 0.66809335, + "epoch": 0.3663115169964485, + "grad_norm": 6.40625, + "learning_rate": 9.523038101826644e-06, + "loss": 1.58158026, + "memory(GiB)": 97.17, + "step": 14440, + "train_speed(iter/s)": 1.630871 + }, + { + "acc": 0.62958465, + "epoch": 0.3664383561643836, + "grad_norm": 5.78125, + "learning_rate": 9.522591031446596e-06, + "loss": 1.71249962, + "memory(GiB)": 97.17, + "step": 14445, + "train_speed(iter/s)": 1.630934 + }, + { + "acc": 0.65568352, + "epoch": 0.3665651953323186, + "grad_norm": 5.84375, + "learning_rate": 9.522143762142801e-06, + "loss": 1.59135056, + "memory(GiB)": 97.17, + "step": 14450, + "train_speed(iter/s)": 1.630998 + }, + { + "acc": 0.66998911, + "epoch": 0.36669203450025367, + "grad_norm": 5.71875, + "learning_rate": 9.521696293934934e-06, + "loss": 1.59900875, + "memory(GiB)": 97.17, + "step": 14455, + "train_speed(iter/s)": 1.631058 + }, + { + "acc": 0.65012841, + "epoch": 0.3668188736681887, + "grad_norm": 5.21875, + "learning_rate": 9.521248626842676e-06, + "loss": 1.66891575, + "memory(GiB)": 97.17, + "step": 14460, + "train_speed(iter/s)": 1.631121 + }, + { + "acc": 0.63980789, + "epoch": 0.3669457128361238, + "grad_norm": 4.875, + "learning_rate": 9.520800760885716e-06, + "loss": 1.69302425, + "memory(GiB)": 97.17, + "step": 14465, + "train_speed(iter/s)": 1.631182 + }, + { + "acc": 0.65004869, + "epoch": 0.36707255200405886, + "grad_norm": 5.34375, + "learning_rate": 9.520352696083756e-06, + "loss": 1.58977261, + "memory(GiB)": 97.17, + "step": 14470, + "train_speed(iter/s)": 1.631241 + }, + { + "acc": 0.65741978, + "epoch": 0.3671993911719939, + "grad_norm": 5.875, + "learning_rate": 9.519904432456504e-06, + "loss": 1.5539629, + "memory(GiB)": 97.17, + "step": 14475, + "train_speed(iter/s)": 1.631306 + }, + { + "acc": 0.67306805, + "epoch": 0.36732623033992895, + "grad_norm": 5.96875, + "learning_rate": 9.519455970023672e-06, + "loss": 1.52765646, + "memory(GiB)": 97.17, + "step": 14480, + "train_speed(iter/s)": 1.631369 + }, + { + "acc": 0.64803562, + "epoch": 0.36745306950786405, + "grad_norm": 5.90625, + "learning_rate": 9.519007308804991e-06, + "loss": 1.63630543, + "memory(GiB)": 97.17, + "step": 14485, + "train_speed(iter/s)": 1.631431 + }, + { + "acc": 0.65242558, + "epoch": 0.3675799086757991, + "grad_norm": 4.78125, + "learning_rate": 9.518558448820193e-06, + "loss": 1.5788681, + "memory(GiB)": 97.17, + "step": 14490, + "train_speed(iter/s)": 1.63149 + }, + { + "acc": 0.64776564, + "epoch": 0.36770674784373414, + "grad_norm": 6.625, + "learning_rate": 9.518109390089017e-06, + "loss": 1.64999542, + "memory(GiB)": 97.17, + "step": 14495, + "train_speed(iter/s)": 1.631549 + }, + { + "acc": 0.64250813, + "epoch": 0.3678335870116692, + "grad_norm": 7.0, + "learning_rate": 9.517660132631222e-06, + "loss": 1.68008728, + "memory(GiB)": 97.17, + "step": 14500, + "train_speed(iter/s)": 1.63161 + }, + { + "acc": 0.66387734, + "epoch": 0.3679604261796043, + "grad_norm": 5.53125, + "learning_rate": 9.517210676466561e-06, + "loss": 1.63639526, + "memory(GiB)": 97.17, + "step": 14505, + "train_speed(iter/s)": 1.631675 + }, + { + "acc": 0.65841141, + "epoch": 0.3680872653475393, + "grad_norm": 4.34375, + "learning_rate": 9.516761021614809e-06, + "loss": 1.60073128, + "memory(GiB)": 97.17, + "step": 14510, + "train_speed(iter/s)": 1.631736 + }, + { + "acc": 0.65091815, + "epoch": 0.36821410451547437, + "grad_norm": 4.96875, + "learning_rate": 9.51631116809574e-06, + "loss": 1.62698765, + "memory(GiB)": 97.17, + "step": 14515, + "train_speed(iter/s)": 1.631797 + }, + { + "acc": 0.64248943, + "epoch": 0.3683409436834094, + "grad_norm": 5.46875, + "learning_rate": 9.515861115929144e-06, + "loss": 1.69752483, + "memory(GiB)": 97.17, + "step": 14520, + "train_speed(iter/s)": 1.63186 + }, + { + "acc": 0.64379282, + "epoch": 0.3684677828513445, + "grad_norm": 8.3125, + "learning_rate": 9.515410865134812e-06, + "loss": 1.62120342, + "memory(GiB)": 97.17, + "step": 14525, + "train_speed(iter/s)": 1.631923 + }, + { + "acc": 0.66134501, + "epoch": 0.36859462201927956, + "grad_norm": 5.59375, + "learning_rate": 9.514960415732551e-06, + "loss": 1.56136761, + "memory(GiB)": 97.17, + "step": 14530, + "train_speed(iter/s)": 1.631988 + }, + { + "acc": 0.64463539, + "epoch": 0.3687214611872146, + "grad_norm": 6.1875, + "learning_rate": 9.514509767742172e-06, + "loss": 1.68267403, + "memory(GiB)": 97.17, + "step": 14535, + "train_speed(iter/s)": 1.632052 + }, + { + "acc": 0.65353851, + "epoch": 0.36884830035514965, + "grad_norm": 8.0625, + "learning_rate": 9.5140589211835e-06, + "loss": 1.63590584, + "memory(GiB)": 97.17, + "step": 14540, + "train_speed(iter/s)": 1.632115 + }, + { + "acc": 0.65548134, + "epoch": 0.36897513952308475, + "grad_norm": 6.59375, + "learning_rate": 9.513607876076363e-06, + "loss": 1.6759716, + "memory(GiB)": 97.17, + "step": 14545, + "train_speed(iter/s)": 1.632179 + }, + { + "acc": 0.65960293, + "epoch": 0.3691019786910198, + "grad_norm": 6.90625, + "learning_rate": 9.513156632440598e-06, + "loss": 1.58911648, + "memory(GiB)": 97.17, + "step": 14550, + "train_speed(iter/s)": 1.632244 + }, + { + "acc": 0.65421047, + "epoch": 0.36922881785895484, + "grad_norm": 5.09375, + "learning_rate": 9.512705190296055e-06, + "loss": 1.63908081, + "memory(GiB)": 97.17, + "step": 14555, + "train_speed(iter/s)": 1.632302 + }, + { + "acc": 0.65789351, + "epoch": 0.3693556570268899, + "grad_norm": 5.4375, + "learning_rate": 9.512253549662588e-06, + "loss": 1.58075638, + "memory(GiB)": 97.17, + "step": 14560, + "train_speed(iter/s)": 1.632364 + }, + { + "acc": 0.6556778, + "epoch": 0.369482496194825, + "grad_norm": 5.71875, + "learning_rate": 9.511801710560066e-06, + "loss": 1.68199329, + "memory(GiB)": 97.17, + "step": 14565, + "train_speed(iter/s)": 1.632427 + }, + { + "acc": 0.65963564, + "epoch": 0.36960933536276, + "grad_norm": 6.625, + "learning_rate": 9.511349673008364e-06, + "loss": 1.584758, + "memory(GiB)": 97.17, + "step": 14570, + "train_speed(iter/s)": 1.632488 + }, + { + "acc": 0.6469985, + "epoch": 0.36973617453069507, + "grad_norm": 8.8125, + "learning_rate": 9.510897437027358e-06, + "loss": 1.64005203, + "memory(GiB)": 97.17, + "step": 14575, + "train_speed(iter/s)": 1.632548 + }, + { + "acc": 0.6561111, + "epoch": 0.3698630136986301, + "grad_norm": 6.0, + "learning_rate": 9.510445002636943e-06, + "loss": 1.62161083, + "memory(GiB)": 97.17, + "step": 14580, + "train_speed(iter/s)": 1.632609 + }, + { + "acc": 0.67015715, + "epoch": 0.3699898528665652, + "grad_norm": 5.8125, + "learning_rate": 9.50999236985702e-06, + "loss": 1.60314808, + "memory(GiB)": 97.17, + "step": 14585, + "train_speed(iter/s)": 1.632669 + }, + { + "acc": 0.65853643, + "epoch": 0.37011669203450026, + "grad_norm": 6.28125, + "learning_rate": 9.509539538707497e-06, + "loss": 1.68197575, + "memory(GiB)": 97.17, + "step": 14590, + "train_speed(iter/s)": 1.632729 + }, + { + "acc": 0.64739695, + "epoch": 0.3702435312024353, + "grad_norm": 7.3125, + "learning_rate": 9.50908650920829e-06, + "loss": 1.64297028, + "memory(GiB)": 97.17, + "step": 14595, + "train_speed(iter/s)": 1.632788 + }, + { + "acc": 0.64998484, + "epoch": 0.37037037037037035, + "grad_norm": 4.84375, + "learning_rate": 9.50863328137933e-06, + "loss": 1.60108604, + "memory(GiB)": 97.17, + "step": 14600, + "train_speed(iter/s)": 1.632848 + }, + { + "acc": 0.65740128, + "epoch": 0.37049720953830545, + "grad_norm": 5.5, + "learning_rate": 9.508179855240545e-06, + "loss": 1.59114885, + "memory(GiB)": 97.17, + "step": 14605, + "train_speed(iter/s)": 1.632908 + }, + { + "acc": 0.65801759, + "epoch": 0.3706240487062405, + "grad_norm": 5.375, + "learning_rate": 9.507726230811884e-06, + "loss": 1.68662777, + "memory(GiB)": 97.17, + "step": 14610, + "train_speed(iter/s)": 1.632971 + }, + { + "acc": 0.64627829, + "epoch": 0.37075088787417554, + "grad_norm": 5.78125, + "learning_rate": 9.507272408113298e-06, + "loss": 1.60258789, + "memory(GiB)": 97.17, + "step": 14615, + "train_speed(iter/s)": 1.633027 + }, + { + "acc": 0.63934393, + "epoch": 0.3708777270421106, + "grad_norm": 4.59375, + "learning_rate": 9.506818387164748e-06, + "loss": 1.67579861, + "memory(GiB)": 97.17, + "step": 14620, + "train_speed(iter/s)": 1.633087 + }, + { + "acc": 0.65653715, + "epoch": 0.3710045662100457, + "grad_norm": 5.71875, + "learning_rate": 9.506364167986204e-06, + "loss": 1.65439224, + "memory(GiB)": 97.17, + "step": 14625, + "train_speed(iter/s)": 1.633145 + }, + { + "acc": 0.65933456, + "epoch": 0.3711314053779807, + "grad_norm": 6.1875, + "learning_rate": 9.505909750597644e-06, + "loss": 1.65304337, + "memory(GiB)": 97.17, + "step": 14630, + "train_speed(iter/s)": 1.63321 + }, + { + "acc": 0.64524608, + "epoch": 0.37125824454591577, + "grad_norm": 5.90625, + "learning_rate": 9.505455135019055e-06, + "loss": 1.60034714, + "memory(GiB)": 97.17, + "step": 14635, + "train_speed(iter/s)": 1.633273 + }, + { + "acc": 0.6503952, + "epoch": 0.3713850837138508, + "grad_norm": 8.6875, + "learning_rate": 9.505000321270435e-06, + "loss": 1.59648886, + "memory(GiB)": 97.17, + "step": 14640, + "train_speed(iter/s)": 1.633336 + }, + { + "acc": 0.6597218, + "epoch": 0.3715119228817859, + "grad_norm": 6.28125, + "learning_rate": 9.504545309371786e-06, + "loss": 1.62139435, + "memory(GiB)": 97.17, + "step": 14645, + "train_speed(iter/s)": 1.633397 + }, + { + "acc": 0.63369408, + "epoch": 0.37163876204972096, + "grad_norm": 4.96875, + "learning_rate": 9.504090099343125e-06, + "loss": 1.68813248, + "memory(GiB)": 97.17, + "step": 14650, + "train_speed(iter/s)": 1.633462 + }, + { + "acc": 0.67516389, + "epoch": 0.371765601217656, + "grad_norm": 5.96875, + "learning_rate": 9.50363469120447e-06, + "loss": 1.54997845, + "memory(GiB)": 97.17, + "step": 14655, + "train_speed(iter/s)": 1.633521 + }, + { + "acc": 0.62684927, + "epoch": 0.37189244038559105, + "grad_norm": 5.59375, + "learning_rate": 9.503179084975855e-06, + "loss": 1.68973427, + "memory(GiB)": 97.17, + "step": 14660, + "train_speed(iter/s)": 1.633586 + }, + { + "acc": 0.64531069, + "epoch": 0.37201927955352615, + "grad_norm": 5.25, + "learning_rate": 9.502723280677319e-06, + "loss": 1.67327309, + "memory(GiB)": 97.17, + "step": 14665, + "train_speed(iter/s)": 1.633646 + }, + { + "acc": 0.66482682, + "epoch": 0.3721461187214612, + "grad_norm": 6.625, + "learning_rate": 9.50226727832891e-06, + "loss": 1.67276001, + "memory(GiB)": 97.17, + "step": 14670, + "train_speed(iter/s)": 1.633707 + }, + { + "acc": 0.63452954, + "epoch": 0.37227295788939624, + "grad_norm": 6.375, + "learning_rate": 9.501811077950685e-06, + "loss": 1.68142776, + "memory(GiB)": 97.17, + "step": 14675, + "train_speed(iter/s)": 1.633768 + }, + { + "acc": 0.65397015, + "epoch": 0.3723997970573313, + "grad_norm": 5.78125, + "learning_rate": 9.501354679562708e-06, + "loss": 1.62402115, + "memory(GiB)": 97.17, + "step": 14680, + "train_speed(iter/s)": 1.63383 + }, + { + "acc": 0.65664773, + "epoch": 0.3725266362252664, + "grad_norm": 5.75, + "learning_rate": 9.500898083185058e-06, + "loss": 1.6172451, + "memory(GiB)": 97.17, + "step": 14685, + "train_speed(iter/s)": 1.63389 + }, + { + "acc": 0.65130043, + "epoch": 0.3726534753932014, + "grad_norm": 5.15625, + "learning_rate": 9.500441288837812e-06, + "loss": 1.59556112, + "memory(GiB)": 97.17, + "step": 14690, + "train_speed(iter/s)": 1.633947 + }, + { + "acc": 0.64774413, + "epoch": 0.37278031456113647, + "grad_norm": 6.125, + "learning_rate": 9.499984296541066e-06, + "loss": 1.67401886, + "memory(GiB)": 97.17, + "step": 14695, + "train_speed(iter/s)": 1.634009 + }, + { + "acc": 0.63403406, + "epoch": 0.3729071537290715, + "grad_norm": 6.25, + "learning_rate": 9.49952710631492e-06, + "loss": 1.76184998, + "memory(GiB)": 97.17, + "step": 14700, + "train_speed(iter/s)": 1.63407 + }, + { + "acc": 0.64683886, + "epoch": 0.3730339928970066, + "grad_norm": 6.0, + "learning_rate": 9.499069718179484e-06, + "loss": 1.62151756, + "memory(GiB)": 97.17, + "step": 14705, + "train_speed(iter/s)": 1.634131 + }, + { + "acc": 0.66225176, + "epoch": 0.37316083206494166, + "grad_norm": 5.96875, + "learning_rate": 9.498612132154874e-06, + "loss": 1.56371212, + "memory(GiB)": 97.17, + "step": 14710, + "train_speed(iter/s)": 1.634192 + }, + { + "acc": 0.65066142, + "epoch": 0.3732876712328767, + "grad_norm": 4.15625, + "learning_rate": 9.498154348261217e-06, + "loss": 1.60862694, + "memory(GiB)": 97.17, + "step": 14715, + "train_speed(iter/s)": 1.634253 + }, + { + "acc": 0.65171728, + "epoch": 0.37341451040081175, + "grad_norm": 5.71875, + "learning_rate": 9.497696366518649e-06, + "loss": 1.63322906, + "memory(GiB)": 97.17, + "step": 14720, + "train_speed(iter/s)": 1.634314 + }, + { + "acc": 0.64669833, + "epoch": 0.37354134956874685, + "grad_norm": 4.90625, + "learning_rate": 9.497238186947315e-06, + "loss": 1.66693687, + "memory(GiB)": 97.17, + "step": 14725, + "train_speed(iter/s)": 1.634375 + }, + { + "acc": 0.63985062, + "epoch": 0.3736681887366819, + "grad_norm": 5.6875, + "learning_rate": 9.496779809567367e-06, + "loss": 1.69202881, + "memory(GiB)": 97.17, + "step": 14730, + "train_speed(iter/s)": 1.634436 + }, + { + "acc": 0.6521153, + "epoch": 0.37379502790461694, + "grad_norm": 4.625, + "learning_rate": 9.496321234398967e-06, + "loss": 1.56692915, + "memory(GiB)": 97.17, + "step": 14735, + "train_speed(iter/s)": 1.634496 + }, + { + "acc": 0.65274858, + "epoch": 0.373921867072552, + "grad_norm": 4.5625, + "learning_rate": 9.495862461462282e-06, + "loss": 1.62136841, + "memory(GiB)": 97.17, + "step": 14740, + "train_speed(iter/s)": 1.634554 + }, + { + "acc": 0.65033445, + "epoch": 0.3740487062404871, + "grad_norm": 5.03125, + "learning_rate": 9.495403490777495e-06, + "loss": 1.6391552, + "memory(GiB)": 97.17, + "step": 14745, + "train_speed(iter/s)": 1.634608 + }, + { + "acc": 0.67271242, + "epoch": 0.3741755454084221, + "grad_norm": 5.46875, + "learning_rate": 9.49494432236479e-06, + "loss": 1.56324282, + "memory(GiB)": 97.17, + "step": 14750, + "train_speed(iter/s)": 1.634667 + }, + { + "acc": 0.65324721, + "epoch": 0.37430238457635717, + "grad_norm": 5.25, + "learning_rate": 9.494484956244368e-06, + "loss": 1.63767223, + "memory(GiB)": 97.17, + "step": 14755, + "train_speed(iter/s)": 1.634727 + }, + { + "acc": 0.64981432, + "epoch": 0.3744292237442922, + "grad_norm": 6.5625, + "learning_rate": 9.49402539243643e-06, + "loss": 1.65517578, + "memory(GiB)": 97.17, + "step": 14760, + "train_speed(iter/s)": 1.634791 + }, + { + "acc": 0.64459352, + "epoch": 0.3745560629122273, + "grad_norm": 5.3125, + "learning_rate": 9.49356563096119e-06, + "loss": 1.64013443, + "memory(GiB)": 97.17, + "step": 14765, + "train_speed(iter/s)": 1.634848 + }, + { + "acc": 0.65208397, + "epoch": 0.37468290208016236, + "grad_norm": 5.09375, + "learning_rate": 9.49310567183887e-06, + "loss": 1.60190315, + "memory(GiB)": 97.17, + "step": 14770, + "train_speed(iter/s)": 1.63491 + }, + { + "acc": 0.65274296, + "epoch": 0.3748097412480974, + "grad_norm": 6.0625, + "learning_rate": 9.492645515089706e-06, + "loss": 1.62087288, + "memory(GiB)": 97.17, + "step": 14775, + "train_speed(iter/s)": 1.634972 + }, + { + "acc": 0.65257802, + "epoch": 0.37493658041603245, + "grad_norm": 6.375, + "learning_rate": 9.492185160733934e-06, + "loss": 1.62115135, + "memory(GiB)": 97.17, + "step": 14780, + "train_speed(iter/s)": 1.635033 + }, + { + "acc": 0.64833503, + "epoch": 0.37506341958396755, + "grad_norm": 6.65625, + "learning_rate": 9.491724608791798e-06, + "loss": 1.65103951, + "memory(GiB)": 97.17, + "step": 14785, + "train_speed(iter/s)": 1.635093 + }, + { + "acc": 0.64485178, + "epoch": 0.3751902587519026, + "grad_norm": 5.5625, + "learning_rate": 9.491263859283563e-06, + "loss": 1.58244686, + "memory(GiB)": 97.17, + "step": 14790, + "train_speed(iter/s)": 1.635151 + }, + { + "acc": 0.65921292, + "epoch": 0.37531709791983764, + "grad_norm": 4.375, + "learning_rate": 9.490802912229491e-06, + "loss": 1.61254768, + "memory(GiB)": 97.17, + "step": 14795, + "train_speed(iter/s)": 1.635211 + }, + { + "acc": 0.63699598, + "epoch": 0.3754439370877727, + "grad_norm": 4.59375, + "learning_rate": 9.490341767649858e-06, + "loss": 1.69989891, + "memory(GiB)": 97.17, + "step": 14800, + "train_speed(iter/s)": 1.635272 + }, + { + "acc": 0.65551848, + "epoch": 0.3755707762557078, + "grad_norm": 6.0, + "learning_rate": 9.489880425564944e-06, + "loss": 1.57976818, + "memory(GiB)": 97.17, + "step": 14805, + "train_speed(iter/s)": 1.635332 + }, + { + "acc": 0.64182749, + "epoch": 0.37569761542364283, + "grad_norm": 4.71875, + "learning_rate": 9.489418885995043e-06, + "loss": 1.62107086, + "memory(GiB)": 97.17, + "step": 14810, + "train_speed(iter/s)": 1.635393 + }, + { + "acc": 0.63403397, + "epoch": 0.3758244545915779, + "grad_norm": 5.375, + "learning_rate": 9.488957148960457e-06, + "loss": 1.72306919, + "memory(GiB)": 97.17, + "step": 14815, + "train_speed(iter/s)": 1.635455 + }, + { + "acc": 0.63592906, + "epoch": 0.3759512937595129, + "grad_norm": 5.28125, + "learning_rate": 9.488495214481494e-06, + "loss": 1.61382446, + "memory(GiB)": 97.17, + "step": 14820, + "train_speed(iter/s)": 1.635516 + }, + { + "acc": 0.65910792, + "epoch": 0.376078132927448, + "grad_norm": 5.28125, + "learning_rate": 9.48803308257847e-06, + "loss": 1.62439651, + "memory(GiB)": 97.17, + "step": 14825, + "train_speed(iter/s)": 1.635574 + }, + { + "acc": 0.66102428, + "epoch": 0.37620497209538306, + "grad_norm": 5.6875, + "learning_rate": 9.487570753271716e-06, + "loss": 1.59010477, + "memory(GiB)": 97.17, + "step": 14830, + "train_speed(iter/s)": 1.635637 + }, + { + "acc": 0.64529419, + "epoch": 0.3763318112633181, + "grad_norm": 6.71875, + "learning_rate": 9.487108226581564e-06, + "loss": 1.65264645, + "memory(GiB)": 97.17, + "step": 14835, + "train_speed(iter/s)": 1.6357 + }, + { + "acc": 0.65747051, + "epoch": 0.37645865043125315, + "grad_norm": 4.84375, + "learning_rate": 9.486645502528355e-06, + "loss": 1.62201042, + "memory(GiB)": 97.17, + "step": 14840, + "train_speed(iter/s)": 1.635758 + }, + { + "acc": 0.65689774, + "epoch": 0.37658548959918825, + "grad_norm": 4.84375, + "learning_rate": 9.486182581132449e-06, + "loss": 1.58362923, + "memory(GiB)": 97.17, + "step": 14845, + "train_speed(iter/s)": 1.635818 + }, + { + "acc": 0.651126, + "epoch": 0.3767123287671233, + "grad_norm": 6.4375, + "learning_rate": 9.485719462414202e-06, + "loss": 1.57980499, + "memory(GiB)": 97.17, + "step": 14850, + "train_speed(iter/s)": 1.63588 + }, + { + "acc": 0.6693182, + "epoch": 0.37683916793505834, + "grad_norm": 7.15625, + "learning_rate": 9.485256146393987e-06, + "loss": 1.66769447, + "memory(GiB)": 97.17, + "step": 14855, + "train_speed(iter/s)": 1.635943 + }, + { + "acc": 0.65831304, + "epoch": 0.3769660071029934, + "grad_norm": 6.40625, + "learning_rate": 9.484792633092182e-06, + "loss": 1.62320271, + "memory(GiB)": 97.17, + "step": 14860, + "train_speed(iter/s)": 1.636 + }, + { + "acc": 0.66238065, + "epoch": 0.3770928462709285, + "grad_norm": 5.84375, + "learning_rate": 9.484328922529172e-06, + "loss": 1.55749445, + "memory(GiB)": 97.17, + "step": 14865, + "train_speed(iter/s)": 1.636061 + }, + { + "acc": 0.64541454, + "epoch": 0.37721968543886353, + "grad_norm": 5.03125, + "learning_rate": 9.483865014725356e-06, + "loss": 1.54721661, + "memory(GiB)": 97.17, + "step": 14870, + "train_speed(iter/s)": 1.63612 + }, + { + "acc": 0.65263486, + "epoch": 0.3773465246067986, + "grad_norm": 7.46875, + "learning_rate": 9.483400909701139e-06, + "loss": 1.65839577, + "memory(GiB)": 97.17, + "step": 14875, + "train_speed(iter/s)": 1.63618 + }, + { + "acc": 0.64702287, + "epoch": 0.3774733637747336, + "grad_norm": 5.65625, + "learning_rate": 9.482936607476931e-06, + "loss": 1.64633999, + "memory(GiB)": 97.17, + "step": 14880, + "train_speed(iter/s)": 1.63624 + }, + { + "acc": 0.64630709, + "epoch": 0.3776002029426687, + "grad_norm": 6.4375, + "learning_rate": 9.482472108073157e-06, + "loss": 1.69578705, + "memory(GiB)": 97.17, + "step": 14885, + "train_speed(iter/s)": 1.636303 + }, + { + "acc": 0.64325972, + "epoch": 0.37772704211060376, + "grad_norm": 5.78125, + "learning_rate": 9.482007411510245e-06, + "loss": 1.62746124, + "memory(GiB)": 97.17, + "step": 14890, + "train_speed(iter/s)": 1.63636 + }, + { + "acc": 0.65429587, + "epoch": 0.3778538812785388, + "grad_norm": 5.375, + "learning_rate": 9.48154251780864e-06, + "loss": 1.57414808, + "memory(GiB)": 97.17, + "step": 14895, + "train_speed(iter/s)": 1.636419 + }, + { + "acc": 0.65453629, + "epoch": 0.37798072044647385, + "grad_norm": 5.75, + "learning_rate": 9.481077426988782e-06, + "loss": 1.64011116, + "memory(GiB)": 97.17, + "step": 14900, + "train_speed(iter/s)": 1.63648 + }, + { + "acc": 0.63851681, + "epoch": 0.37810755961440895, + "grad_norm": 5.6875, + "learning_rate": 9.480612139071134e-06, + "loss": 1.64953651, + "memory(GiB)": 97.17, + "step": 14905, + "train_speed(iter/s)": 1.636538 + }, + { + "acc": 0.64264445, + "epoch": 0.378234398782344, + "grad_norm": 6.03125, + "learning_rate": 9.48014665407616e-06, + "loss": 1.65910034, + "memory(GiB)": 97.17, + "step": 14910, + "train_speed(iter/s)": 1.636597 + }, + { + "acc": 0.6703825, + "epoch": 0.37836123795027904, + "grad_norm": 5.65625, + "learning_rate": 9.479680972024334e-06, + "loss": 1.59870377, + "memory(GiB)": 97.17, + "step": 14915, + "train_speed(iter/s)": 1.636656 + }, + { + "acc": 0.66856813, + "epoch": 0.3784880771182141, + "grad_norm": 6.125, + "learning_rate": 9.47921509293614e-06, + "loss": 1.57689972, + "memory(GiB)": 97.17, + "step": 14920, + "train_speed(iter/s)": 1.636717 + }, + { + "acc": 0.65607595, + "epoch": 0.3786149162861492, + "grad_norm": 5.5, + "learning_rate": 9.478749016832066e-06, + "loss": 1.67704678, + "memory(GiB)": 97.17, + "step": 14925, + "train_speed(iter/s)": 1.636775 + }, + { + "acc": 0.65648675, + "epoch": 0.37874175545408423, + "grad_norm": 4.625, + "learning_rate": 9.478282743732613e-06, + "loss": 1.57121677, + "memory(GiB)": 97.17, + "step": 14930, + "train_speed(iter/s)": 1.63683 + }, + { + "acc": 0.65071754, + "epoch": 0.3788685946220193, + "grad_norm": 5.78125, + "learning_rate": 9.477816273658293e-06, + "loss": 1.63837528, + "memory(GiB)": 97.17, + "step": 14935, + "train_speed(iter/s)": 1.636884 + }, + { + "acc": 0.64984107, + "epoch": 0.3789954337899543, + "grad_norm": 5.34375, + "learning_rate": 9.47734960662962e-06, + "loss": 1.67655354, + "memory(GiB)": 97.17, + "step": 14940, + "train_speed(iter/s)": 1.63694 + }, + { + "acc": 0.65793409, + "epoch": 0.3791222729578894, + "grad_norm": 8.125, + "learning_rate": 9.476882742667122e-06, + "loss": 1.60423203, + "memory(GiB)": 97.17, + "step": 14945, + "train_speed(iter/s)": 1.636998 + }, + { + "acc": 0.64778004, + "epoch": 0.37924911212582446, + "grad_norm": 5.5, + "learning_rate": 9.476415681791333e-06, + "loss": 1.59187326, + "memory(GiB)": 97.17, + "step": 14950, + "train_speed(iter/s)": 1.637054 + }, + { + "acc": 0.65125122, + "epoch": 0.3793759512937595, + "grad_norm": 5.78125, + "learning_rate": 9.475948424022798e-06, + "loss": 1.59747, + "memory(GiB)": 97.17, + "step": 14955, + "train_speed(iter/s)": 1.637113 + }, + { + "acc": 0.66960096, + "epoch": 0.37950279046169455, + "grad_norm": 5.15625, + "learning_rate": 9.475480969382065e-06, + "loss": 1.56869583, + "memory(GiB)": 97.17, + "step": 14960, + "train_speed(iter/s)": 1.637171 + }, + { + "acc": 0.6516613, + "epoch": 0.37962962962962965, + "grad_norm": 5.21875, + "learning_rate": 9.475013317889699e-06, + "loss": 1.60754051, + "memory(GiB)": 97.17, + "step": 14965, + "train_speed(iter/s)": 1.637229 + }, + { + "acc": 0.64712429, + "epoch": 0.3797564687975647, + "grad_norm": 5.5625, + "learning_rate": 9.474545469566267e-06, + "loss": 1.60855618, + "memory(GiB)": 97.17, + "step": 14970, + "train_speed(iter/s)": 1.63729 + }, + { + "acc": 0.64248848, + "epoch": 0.37988330796549974, + "grad_norm": 5.3125, + "learning_rate": 9.474077424432348e-06, + "loss": 1.63334675, + "memory(GiB)": 97.17, + "step": 14975, + "train_speed(iter/s)": 1.637349 + }, + { + "acc": 0.64489765, + "epoch": 0.3800101471334348, + "grad_norm": 5.875, + "learning_rate": 9.47360918250853e-06, + "loss": 1.70478287, + "memory(GiB)": 97.17, + "step": 14980, + "train_speed(iter/s)": 1.637406 + }, + { + "acc": 0.64993458, + "epoch": 0.3801369863013699, + "grad_norm": 6.90625, + "learning_rate": 9.473140743815405e-06, + "loss": 1.70229359, + "memory(GiB)": 97.17, + "step": 14985, + "train_speed(iter/s)": 1.637467 + }, + { + "acc": 0.67129154, + "epoch": 0.38026382546930493, + "grad_norm": 5.1875, + "learning_rate": 9.47267210837358e-06, + "loss": 1.57331486, + "memory(GiB)": 97.17, + "step": 14990, + "train_speed(iter/s)": 1.637527 + }, + { + "acc": 0.65323563, + "epoch": 0.38039066463724, + "grad_norm": 6.78125, + "learning_rate": 9.472203276203667e-06, + "loss": 1.641152, + "memory(GiB)": 97.17, + "step": 14995, + "train_speed(iter/s)": 1.637588 + }, + { + "acc": 0.65414772, + "epoch": 0.380517503805175, + "grad_norm": 5.46875, + "learning_rate": 9.471734247326284e-06, + "loss": 1.66306038, + "memory(GiB)": 97.17, + "step": 15000, + "train_speed(iter/s)": 1.637649 + }, + { + "epoch": 0.380517503805175, + "eval_acc": 0.6420889743702191, + "eval_loss": 1.5977007150650024, + "eval_runtime": 58.6084, + "eval_samples_per_second": 108.687, + "eval_steps_per_second": 27.18, + "step": 15000 + }, + { + "acc": 0.65199609, + "epoch": 0.3806443429731101, + "grad_norm": 6.03125, + "learning_rate": 9.471265021762067e-06, + "loss": 1.61440811, + "memory(GiB)": 97.17, + "step": 15005, + "train_speed(iter/s)": 1.626547 + }, + { + "acc": 0.67658763, + "epoch": 0.38077118214104516, + "grad_norm": 6.5, + "learning_rate": 9.47079559953165e-06, + "loss": 1.50812092, + "memory(GiB)": 97.17, + "step": 15010, + "train_speed(iter/s)": 1.626606 + }, + { + "acc": 0.63453341, + "epoch": 0.3808980213089802, + "grad_norm": 5.9375, + "learning_rate": 9.470325980655683e-06, + "loss": 1.68023796, + "memory(GiB)": 97.17, + "step": 15015, + "train_speed(iter/s)": 1.626665 + }, + { + "acc": 0.6445303, + "epoch": 0.38102486047691525, + "grad_norm": 8.25, + "learning_rate": 9.46985616515482e-06, + "loss": 1.68390846, + "memory(GiB)": 97.17, + "step": 15020, + "train_speed(iter/s)": 1.626722 + }, + { + "acc": 0.64478121, + "epoch": 0.38115169964485035, + "grad_norm": 6.0, + "learning_rate": 9.469386153049727e-06, + "loss": 1.62552166, + "memory(GiB)": 97.17, + "step": 15025, + "train_speed(iter/s)": 1.626778 + }, + { + "acc": 0.65908871, + "epoch": 0.3812785388127854, + "grad_norm": 7.59375, + "learning_rate": 9.468915944361076e-06, + "loss": 1.55891399, + "memory(GiB)": 97.17, + "step": 15030, + "train_speed(iter/s)": 1.626836 + }, + { + "acc": 0.65880489, + "epoch": 0.38140537798072044, + "grad_norm": 4.53125, + "learning_rate": 9.468445539109551e-06, + "loss": 1.63597965, + "memory(GiB)": 97.17, + "step": 15035, + "train_speed(iter/s)": 1.626895 + }, + { + "acc": 0.6407299, + "epoch": 0.3815322171486555, + "grad_norm": 5.59375, + "learning_rate": 9.46797493731584e-06, + "loss": 1.70256844, + "memory(GiB)": 97.17, + "step": 15040, + "train_speed(iter/s)": 1.62694 + }, + { + "acc": 0.66172566, + "epoch": 0.3816590563165906, + "grad_norm": 4.875, + "learning_rate": 9.467504139000642e-06, + "loss": 1.61686726, + "memory(GiB)": 97.17, + "step": 15045, + "train_speed(iter/s)": 1.626995 + }, + { + "acc": 0.65152993, + "epoch": 0.38178589548452563, + "grad_norm": 4.84375, + "learning_rate": 9.467033144184667e-06, + "loss": 1.67275543, + "memory(GiB)": 97.17, + "step": 15050, + "train_speed(iter/s)": 1.627045 + }, + { + "acc": 0.6585288, + "epoch": 0.3819127346524607, + "grad_norm": 6.46875, + "learning_rate": 9.466561952888632e-06, + "loss": 1.6463604, + "memory(GiB)": 97.17, + "step": 15055, + "train_speed(iter/s)": 1.627096 + }, + { + "acc": 0.64626746, + "epoch": 0.3820395738203957, + "grad_norm": 6.28125, + "learning_rate": 9.466090565133259e-06, + "loss": 1.62370682, + "memory(GiB)": 97.17, + "step": 15060, + "train_speed(iter/s)": 1.627154 + }, + { + "acc": 0.65925994, + "epoch": 0.3821664129883308, + "grad_norm": 4.75, + "learning_rate": 9.465618980939284e-06, + "loss": 1.59799404, + "memory(GiB)": 97.17, + "step": 15065, + "train_speed(iter/s)": 1.627212 + }, + { + "acc": 0.66021891, + "epoch": 0.38229325215626586, + "grad_norm": 5.5, + "learning_rate": 9.465147200327446e-06, + "loss": 1.65960159, + "memory(GiB)": 97.17, + "step": 15070, + "train_speed(iter/s)": 1.627264 + }, + { + "acc": 0.65524716, + "epoch": 0.3824200913242009, + "grad_norm": 6.09375, + "learning_rate": 9.464675223318503e-06, + "loss": 1.55825253, + "memory(GiB)": 97.17, + "step": 15075, + "train_speed(iter/s)": 1.627323 + }, + { + "acc": 0.66851449, + "epoch": 0.38254693049213595, + "grad_norm": 6.40625, + "learning_rate": 9.464203049933207e-06, + "loss": 1.60118256, + "memory(GiB)": 97.17, + "step": 15080, + "train_speed(iter/s)": 1.627377 + }, + { + "acc": 0.68179879, + "epoch": 0.38267376966007105, + "grad_norm": 6.125, + "learning_rate": 9.463730680192332e-06, + "loss": 1.57144289, + "memory(GiB)": 97.17, + "step": 15085, + "train_speed(iter/s)": 1.627435 + }, + { + "acc": 0.64977884, + "epoch": 0.3828006088280061, + "grad_norm": 5.375, + "learning_rate": 9.46325811411665e-06, + "loss": 1.67484436, + "memory(GiB)": 97.17, + "step": 15090, + "train_speed(iter/s)": 1.62749 + }, + { + "acc": 0.65589213, + "epoch": 0.38292744799594114, + "grad_norm": 6.9375, + "learning_rate": 9.462785351726951e-06, + "loss": 1.61144466, + "memory(GiB)": 97.17, + "step": 15095, + "train_speed(iter/s)": 1.627548 + }, + { + "acc": 0.65560389, + "epoch": 0.3830542871638762, + "grad_norm": 5.375, + "learning_rate": 9.462312393044027e-06, + "loss": 1.60794601, + "memory(GiB)": 97.17, + "step": 15100, + "train_speed(iter/s)": 1.627603 + }, + { + "acc": 0.66775174, + "epoch": 0.3831811263318113, + "grad_norm": 5.5, + "learning_rate": 9.46183923808868e-06, + "loss": 1.58283987, + "memory(GiB)": 97.17, + "step": 15105, + "train_speed(iter/s)": 1.627658 + }, + { + "acc": 0.65754728, + "epoch": 0.38330796549974633, + "grad_norm": 5.46875, + "learning_rate": 9.461365886881724e-06, + "loss": 1.64416447, + "memory(GiB)": 97.17, + "step": 15110, + "train_speed(iter/s)": 1.627709 + }, + { + "acc": 0.65614381, + "epoch": 0.3834348046676814, + "grad_norm": 5.375, + "learning_rate": 9.460892339443977e-06, + "loss": 1.67810593, + "memory(GiB)": 97.17, + "step": 15115, + "train_speed(iter/s)": 1.627766 + }, + { + "acc": 0.65124025, + "epoch": 0.3835616438356164, + "grad_norm": 7.5625, + "learning_rate": 9.460418595796268e-06, + "loss": 1.61333828, + "memory(GiB)": 97.17, + "step": 15120, + "train_speed(iter/s)": 1.62782 + }, + { + "acc": 0.63805156, + "epoch": 0.3836884830035515, + "grad_norm": 4.71875, + "learning_rate": 9.459944655959437e-06, + "loss": 1.6731926, + "memory(GiB)": 97.17, + "step": 15125, + "train_speed(iter/s)": 1.627875 + }, + { + "acc": 0.66052732, + "epoch": 0.38381532217148656, + "grad_norm": 5.625, + "learning_rate": 9.459470519954325e-06, + "loss": 1.54127674, + "memory(GiB)": 97.17, + "step": 15130, + "train_speed(iter/s)": 1.627933 + }, + { + "acc": 0.65779576, + "epoch": 0.3839421613394216, + "grad_norm": 6.875, + "learning_rate": 9.458996187801791e-06, + "loss": 1.59846478, + "memory(GiB)": 97.17, + "step": 15135, + "train_speed(iter/s)": 1.627992 + }, + { + "acc": 0.63993168, + "epoch": 0.38406900050735665, + "grad_norm": 5.96875, + "learning_rate": 9.458521659522697e-06, + "loss": 1.67858791, + "memory(GiB)": 97.17, + "step": 15140, + "train_speed(iter/s)": 1.628052 + }, + { + "acc": 0.64192643, + "epoch": 0.38419583967529175, + "grad_norm": 5.46875, + "learning_rate": 9.458046935137913e-06, + "loss": 1.70274601, + "memory(GiB)": 97.17, + "step": 15145, + "train_speed(iter/s)": 1.628112 + }, + { + "acc": 0.66031561, + "epoch": 0.3843226788432268, + "grad_norm": 5.34375, + "learning_rate": 9.457572014668323e-06, + "loss": 1.63343391, + "memory(GiB)": 97.17, + "step": 15150, + "train_speed(iter/s)": 1.62817 + }, + { + "acc": 0.65354128, + "epoch": 0.38444951801116184, + "grad_norm": 4.96875, + "learning_rate": 9.457096898134813e-06, + "loss": 1.61082172, + "memory(GiB)": 97.17, + "step": 15155, + "train_speed(iter/s)": 1.628224 + }, + { + "acc": 0.66112552, + "epoch": 0.3845763571790969, + "grad_norm": 7.125, + "learning_rate": 9.45662158555828e-06, + "loss": 1.63481731, + "memory(GiB)": 97.17, + "step": 15160, + "train_speed(iter/s)": 1.628283 + }, + { + "acc": 0.65172977, + "epoch": 0.384703196347032, + "grad_norm": 7.9375, + "learning_rate": 9.456146076959636e-06, + "loss": 1.63406162, + "memory(GiB)": 97.17, + "step": 15165, + "train_speed(iter/s)": 1.628343 + }, + { + "acc": 0.66097212, + "epoch": 0.38483003551496703, + "grad_norm": 5.3125, + "learning_rate": 9.455670372359791e-06, + "loss": 1.66054554, + "memory(GiB)": 97.17, + "step": 15170, + "train_speed(iter/s)": 1.628396 + }, + { + "acc": 0.64635258, + "epoch": 0.3849568746829021, + "grad_norm": 4.96875, + "learning_rate": 9.45519447177967e-06, + "loss": 1.64068184, + "memory(GiB)": 97.17, + "step": 15175, + "train_speed(iter/s)": 1.62845 + }, + { + "acc": 0.65124454, + "epoch": 0.3850837138508371, + "grad_norm": 5.5625, + "learning_rate": 9.454718375240204e-06, + "loss": 1.67692146, + "memory(GiB)": 97.17, + "step": 15180, + "train_speed(iter/s)": 1.628506 + }, + { + "acc": 0.66304941, + "epoch": 0.3852105530187722, + "grad_norm": 6.875, + "learning_rate": 9.454242082762336e-06, + "loss": 1.58205814, + "memory(GiB)": 97.17, + "step": 15185, + "train_speed(iter/s)": 1.628567 + }, + { + "acc": 0.66110382, + "epoch": 0.38533739218670726, + "grad_norm": 7.0, + "learning_rate": 9.453765594367014e-06, + "loss": 1.62625275, + "memory(GiB)": 97.17, + "step": 15190, + "train_speed(iter/s)": 1.628625 + }, + { + "acc": 0.65365376, + "epoch": 0.3854642313546423, + "grad_norm": 5.40625, + "learning_rate": 9.453288910075196e-06, + "loss": 1.60047874, + "memory(GiB)": 97.17, + "step": 15195, + "train_speed(iter/s)": 1.628684 + }, + { + "acc": 0.65300121, + "epoch": 0.38559107052257735, + "grad_norm": 6.5, + "learning_rate": 9.452812029907849e-06, + "loss": 1.65511322, + "memory(GiB)": 97.17, + "step": 15200, + "train_speed(iter/s)": 1.628738 + }, + { + "acc": 0.64740038, + "epoch": 0.38571790969051245, + "grad_norm": 5.46875, + "learning_rate": 9.452334953885951e-06, + "loss": 1.64176769, + "memory(GiB)": 97.17, + "step": 15205, + "train_speed(iter/s)": 1.628796 + }, + { + "acc": 0.64705348, + "epoch": 0.3858447488584475, + "grad_norm": 6.0, + "learning_rate": 9.451857682030481e-06, + "loss": 1.66733418, + "memory(GiB)": 97.17, + "step": 15210, + "train_speed(iter/s)": 1.628851 + }, + { + "acc": 0.66407166, + "epoch": 0.38597158802638254, + "grad_norm": 6.84375, + "learning_rate": 9.451380214362436e-06, + "loss": 1.64468498, + "memory(GiB)": 97.17, + "step": 15215, + "train_speed(iter/s)": 1.628906 + }, + { + "acc": 0.65807333, + "epoch": 0.3860984271943176, + "grad_norm": 4.84375, + "learning_rate": 9.450902550902814e-06, + "loss": 1.61301842, + "memory(GiB)": 97.17, + "step": 15220, + "train_speed(iter/s)": 1.62896 + }, + { + "acc": 0.64157362, + "epoch": 0.3862252663622527, + "grad_norm": 5.34375, + "learning_rate": 9.450424691672626e-06, + "loss": 1.70941715, + "memory(GiB)": 97.17, + "step": 15225, + "train_speed(iter/s)": 1.629015 + }, + { + "acc": 0.65545411, + "epoch": 0.38635210553018773, + "grad_norm": 5.8125, + "learning_rate": 9.449946636692891e-06, + "loss": 1.59882326, + "memory(GiB)": 97.17, + "step": 15230, + "train_speed(iter/s)": 1.629069 + }, + { + "acc": 0.65746436, + "epoch": 0.3864789446981228, + "grad_norm": 6.125, + "learning_rate": 9.449468385984634e-06, + "loss": 1.60253696, + "memory(GiB)": 97.17, + "step": 15235, + "train_speed(iter/s)": 1.629128 + }, + { + "acc": 0.66188722, + "epoch": 0.3866057838660578, + "grad_norm": 6.71875, + "learning_rate": 9.448989939568892e-06, + "loss": 1.61767769, + "memory(GiB)": 97.17, + "step": 15240, + "train_speed(iter/s)": 1.629186 + }, + { + "acc": 0.63285279, + "epoch": 0.3867326230339929, + "grad_norm": 5.125, + "learning_rate": 9.448511297466708e-06, + "loss": 1.69206161, + "memory(GiB)": 97.17, + "step": 15245, + "train_speed(iter/s)": 1.62924 + }, + { + "acc": 0.6526329, + "epoch": 0.38685946220192796, + "grad_norm": 5.3125, + "learning_rate": 9.448032459699139e-06, + "loss": 1.59504023, + "memory(GiB)": 97.17, + "step": 15250, + "train_speed(iter/s)": 1.629297 + }, + { + "acc": 0.63574696, + "epoch": 0.386986301369863, + "grad_norm": 5.28125, + "learning_rate": 9.447553426287244e-06, + "loss": 1.70259533, + "memory(GiB)": 97.17, + "step": 15255, + "train_speed(iter/s)": 1.629353 + }, + { + "acc": 0.65058651, + "epoch": 0.38711314053779805, + "grad_norm": 4.78125, + "learning_rate": 9.44707419725209e-06, + "loss": 1.59770164, + "memory(GiB)": 97.17, + "step": 15260, + "train_speed(iter/s)": 1.629407 + }, + { + "acc": 0.6444521, + "epoch": 0.38723997970573315, + "grad_norm": 5.90625, + "learning_rate": 9.446594772614759e-06, + "loss": 1.69053574, + "memory(GiB)": 97.17, + "step": 15265, + "train_speed(iter/s)": 1.629462 + }, + { + "acc": 0.64778633, + "epoch": 0.3873668188736682, + "grad_norm": 5.28125, + "learning_rate": 9.446115152396335e-06, + "loss": 1.6015049, + "memory(GiB)": 97.17, + "step": 15270, + "train_speed(iter/s)": 1.629516 + }, + { + "acc": 0.66978006, + "epoch": 0.38749365804160324, + "grad_norm": 7.34375, + "learning_rate": 9.445635336617919e-06, + "loss": 1.55565643, + "memory(GiB)": 97.17, + "step": 15275, + "train_speed(iter/s)": 1.629566 + }, + { + "acc": 0.64932642, + "epoch": 0.3876204972095383, + "grad_norm": 4.21875, + "learning_rate": 9.445155325300612e-06, + "loss": 1.65540466, + "memory(GiB)": 97.17, + "step": 15280, + "train_speed(iter/s)": 1.629619 + }, + { + "acc": 0.65902166, + "epoch": 0.3877473363774734, + "grad_norm": 5.8125, + "learning_rate": 9.444675118465528e-06, + "loss": 1.64618721, + "memory(GiB)": 97.17, + "step": 15285, + "train_speed(iter/s)": 1.629671 + }, + { + "acc": 0.64727697, + "epoch": 0.38787417554540843, + "grad_norm": 5.15625, + "learning_rate": 9.444194716133785e-06, + "loss": 1.69308586, + "memory(GiB)": 97.17, + "step": 15290, + "train_speed(iter/s)": 1.629722 + }, + { + "acc": 0.65821066, + "epoch": 0.3880010147133435, + "grad_norm": 5.34375, + "learning_rate": 9.44371411832652e-06, + "loss": 1.57123346, + "memory(GiB)": 97.17, + "step": 15295, + "train_speed(iter/s)": 1.629777 + }, + { + "acc": 0.63835669, + "epoch": 0.3881278538812785, + "grad_norm": 5.4375, + "learning_rate": 9.443233325064867e-06, + "loss": 1.6920599, + "memory(GiB)": 97.17, + "step": 15300, + "train_speed(iter/s)": 1.629828 + }, + { + "acc": 0.64770451, + "epoch": 0.3882546930492136, + "grad_norm": 4.875, + "learning_rate": 9.442752336369976e-06, + "loss": 1.60383701, + "memory(GiB)": 97.17, + "step": 15305, + "train_speed(iter/s)": 1.629884 + }, + { + "acc": 0.67474632, + "epoch": 0.38838153221714866, + "grad_norm": 5.125, + "learning_rate": 9.442271152263e-06, + "loss": 1.54049063, + "memory(GiB)": 97.17, + "step": 15310, + "train_speed(iter/s)": 1.629934 + }, + { + "acc": 0.65135574, + "epoch": 0.3885083713850837, + "grad_norm": 4.96875, + "learning_rate": 9.441789772765107e-06, + "loss": 1.64121819, + "memory(GiB)": 97.17, + "step": 15315, + "train_speed(iter/s)": 1.62999 + }, + { + "acc": 0.64553738, + "epoch": 0.38863521055301875, + "grad_norm": 5.3125, + "learning_rate": 9.441308197897467e-06, + "loss": 1.63902283, + "memory(GiB)": 97.17, + "step": 15320, + "train_speed(iter/s)": 1.630046 + }, + { + "acc": 0.66621041, + "epoch": 0.38876204972095385, + "grad_norm": 6.375, + "learning_rate": 9.440826427681264e-06, + "loss": 1.66489449, + "memory(GiB)": 97.17, + "step": 15325, + "train_speed(iter/s)": 1.630099 + }, + { + "acc": 0.6516644, + "epoch": 0.3888888888888889, + "grad_norm": 6.6875, + "learning_rate": 9.44034446213769e-06, + "loss": 1.62715588, + "memory(GiB)": 97.17, + "step": 15330, + "train_speed(iter/s)": 1.630155 + }, + { + "acc": 0.65201325, + "epoch": 0.38901572805682394, + "grad_norm": 5.8125, + "learning_rate": 9.439862301287939e-06, + "loss": 1.61978149, + "memory(GiB)": 97.17, + "step": 15335, + "train_speed(iter/s)": 1.630212 + }, + { + "acc": 0.64715352, + "epoch": 0.389142567224759, + "grad_norm": 6.03125, + "learning_rate": 9.439379945153223e-06, + "loss": 1.69087639, + "memory(GiB)": 97.17, + "step": 15340, + "train_speed(iter/s)": 1.630272 + }, + { + "acc": 0.66509418, + "epoch": 0.3892694063926941, + "grad_norm": 6.34375, + "learning_rate": 9.438897393754755e-06, + "loss": 1.54576006, + "memory(GiB)": 97.17, + "step": 15345, + "train_speed(iter/s)": 1.630333 + }, + { + "acc": 0.6554986, + "epoch": 0.38939624556062913, + "grad_norm": 5.21875, + "learning_rate": 9.438414647113762e-06, + "loss": 1.60549564, + "memory(GiB)": 97.17, + "step": 15350, + "train_speed(iter/s)": 1.63039 + }, + { + "acc": 0.64943905, + "epoch": 0.3895230847285642, + "grad_norm": 5.46875, + "learning_rate": 9.437931705251478e-06, + "loss": 1.62872963, + "memory(GiB)": 97.17, + "step": 15355, + "train_speed(iter/s)": 1.630448 + }, + { + "acc": 0.65276423, + "epoch": 0.3896499238964992, + "grad_norm": 4.96875, + "learning_rate": 9.437448568189142e-06, + "loss": 1.55095549, + "memory(GiB)": 97.17, + "step": 15360, + "train_speed(iter/s)": 1.630502 + }, + { + "acc": 0.65781813, + "epoch": 0.3897767630644343, + "grad_norm": 5.03125, + "learning_rate": 9.436965235948008e-06, + "loss": 1.57868671, + "memory(GiB)": 97.17, + "step": 15365, + "train_speed(iter/s)": 1.630559 + }, + { + "acc": 0.65866489, + "epoch": 0.38990360223236936, + "grad_norm": 4.96875, + "learning_rate": 9.436481708549332e-06, + "loss": 1.64397202, + "memory(GiB)": 97.17, + "step": 15370, + "train_speed(iter/s)": 1.630617 + }, + { + "acc": 0.64708924, + "epoch": 0.3900304414003044, + "grad_norm": 5.40625, + "learning_rate": 9.435997986014382e-06, + "loss": 1.69053593, + "memory(GiB)": 97.17, + "step": 15375, + "train_speed(iter/s)": 1.630675 + }, + { + "acc": 0.6348774, + "epoch": 0.39015728056823945, + "grad_norm": 6.1875, + "learning_rate": 9.435514068364437e-06, + "loss": 1.72300053, + "memory(GiB)": 97.17, + "step": 15380, + "train_speed(iter/s)": 1.630728 + }, + { + "acc": 0.65201349, + "epoch": 0.39028411973617455, + "grad_norm": 5.28125, + "learning_rate": 9.43502995562078e-06, + "loss": 1.59301605, + "memory(GiB)": 97.17, + "step": 15385, + "train_speed(iter/s)": 1.630781 + }, + { + "acc": 0.65294523, + "epoch": 0.3904109589041096, + "grad_norm": 7.03125, + "learning_rate": 9.434545647804703e-06, + "loss": 1.65312443, + "memory(GiB)": 97.17, + "step": 15390, + "train_speed(iter/s)": 1.63084 + }, + { + "acc": 0.66024895, + "epoch": 0.39053779807204464, + "grad_norm": 6.3125, + "learning_rate": 9.434061144937512e-06, + "loss": 1.58912821, + "memory(GiB)": 97.17, + "step": 15395, + "train_speed(iter/s)": 1.630895 + }, + { + "acc": 0.66142116, + "epoch": 0.3906646372399797, + "grad_norm": 6.03125, + "learning_rate": 9.433576447040513e-06, + "loss": 1.56118126, + "memory(GiB)": 97.17, + "step": 15400, + "train_speed(iter/s)": 1.630951 + }, + { + "acc": 0.65852776, + "epoch": 0.3907914764079148, + "grad_norm": 5.1875, + "learning_rate": 9.433091554135029e-06, + "loss": 1.66822338, + "memory(GiB)": 97.17, + "step": 15405, + "train_speed(iter/s)": 1.631006 + }, + { + "acc": 0.64996738, + "epoch": 0.39091831557584983, + "grad_norm": 5.84375, + "learning_rate": 9.432606466242384e-06, + "loss": 1.68477383, + "memory(GiB)": 97.17, + "step": 15410, + "train_speed(iter/s)": 1.63106 + }, + { + "acc": 0.65065851, + "epoch": 0.3910451547437849, + "grad_norm": 4.875, + "learning_rate": 9.43212118338392e-06, + "loss": 1.59774828, + "memory(GiB)": 97.17, + "step": 15415, + "train_speed(iter/s)": 1.631117 + }, + { + "acc": 0.65187855, + "epoch": 0.3911719939117199, + "grad_norm": 5.25, + "learning_rate": 9.431635705580975e-06, + "loss": 1.61890888, + "memory(GiB)": 97.17, + "step": 15420, + "train_speed(iter/s)": 1.631172 + }, + { + "acc": 0.65411873, + "epoch": 0.391298833079655, + "grad_norm": 6.9375, + "learning_rate": 9.431150032854907e-06, + "loss": 1.61599655, + "memory(GiB)": 97.17, + "step": 15425, + "train_speed(iter/s)": 1.631232 + }, + { + "acc": 0.65631323, + "epoch": 0.39142567224759006, + "grad_norm": 6.375, + "learning_rate": 9.430664165227077e-06, + "loss": 1.55557957, + "memory(GiB)": 97.17, + "step": 15430, + "train_speed(iter/s)": 1.631287 + }, + { + "acc": 0.65505667, + "epoch": 0.3915525114155251, + "grad_norm": 5.28125, + "learning_rate": 9.430178102718857e-06, + "loss": 1.64248047, + "memory(GiB)": 97.17, + "step": 15435, + "train_speed(iter/s)": 1.631342 + }, + { + "acc": 0.66421671, + "epoch": 0.39167935058346015, + "grad_norm": 5.8125, + "learning_rate": 9.429691845351623e-06, + "loss": 1.57862911, + "memory(GiB)": 97.17, + "step": 15440, + "train_speed(iter/s)": 1.631397 + }, + { + "acc": 0.65027418, + "epoch": 0.39180618975139525, + "grad_norm": 5.1875, + "learning_rate": 9.429205393146763e-06, + "loss": 1.56260729, + "memory(GiB)": 97.17, + "step": 15445, + "train_speed(iter/s)": 1.631451 + }, + { + "acc": 0.64492798, + "epoch": 0.3919330289193303, + "grad_norm": 5.5625, + "learning_rate": 9.428718746125678e-06, + "loss": 1.67092323, + "memory(GiB)": 97.17, + "step": 15450, + "train_speed(iter/s)": 1.63151 + }, + { + "acc": 0.65922537, + "epoch": 0.39205986808726534, + "grad_norm": 5.5, + "learning_rate": 9.428231904309768e-06, + "loss": 1.66560287, + "memory(GiB)": 97.17, + "step": 15455, + "train_speed(iter/s)": 1.631564 + }, + { + "acc": 0.64279432, + "epoch": 0.3921867072552004, + "grad_norm": 5.46875, + "learning_rate": 9.427744867720448e-06, + "loss": 1.67148151, + "memory(GiB)": 97.17, + "step": 15460, + "train_speed(iter/s)": 1.631617 + }, + { + "acc": 0.65166764, + "epoch": 0.3923135464231355, + "grad_norm": 5.71875, + "learning_rate": 9.42725763637914e-06, + "loss": 1.63665276, + "memory(GiB)": 97.17, + "step": 15465, + "train_speed(iter/s)": 1.631671 + }, + { + "acc": 0.66557617, + "epoch": 0.39244038559107053, + "grad_norm": 5.65625, + "learning_rate": 9.426770210307277e-06, + "loss": 1.58535719, + "memory(GiB)": 97.17, + "step": 15470, + "train_speed(iter/s)": 1.631726 + }, + { + "acc": 0.64976187, + "epoch": 0.3925672247590056, + "grad_norm": 5.71875, + "learning_rate": 9.426282589526294e-06, + "loss": 1.64792614, + "memory(GiB)": 97.17, + "step": 15475, + "train_speed(iter/s)": 1.631779 + }, + { + "acc": 0.6637393, + "epoch": 0.3926940639269406, + "grad_norm": 4.90625, + "learning_rate": 9.425794774057641e-06, + "loss": 1.5826395, + "memory(GiB)": 97.17, + "step": 15480, + "train_speed(iter/s)": 1.631836 + }, + { + "acc": 0.65702786, + "epoch": 0.3928209030948757, + "grad_norm": 5.0625, + "learning_rate": 9.425306763922775e-06, + "loss": 1.61587639, + "memory(GiB)": 97.17, + "step": 15485, + "train_speed(iter/s)": 1.631893 + }, + { + "acc": 0.63265066, + "epoch": 0.39294774226281076, + "grad_norm": 7.125, + "learning_rate": 9.42481855914316e-06, + "loss": 1.67759323, + "memory(GiB)": 97.17, + "step": 15490, + "train_speed(iter/s)": 1.631951 + }, + { + "acc": 0.63959403, + "epoch": 0.3930745814307458, + "grad_norm": 7.1875, + "learning_rate": 9.424330159740269e-06, + "loss": 1.64781876, + "memory(GiB)": 97.17, + "step": 15495, + "train_speed(iter/s)": 1.632001 + }, + { + "acc": 0.6650527, + "epoch": 0.39320142059868085, + "grad_norm": 5.03125, + "learning_rate": 9.423841565735582e-06, + "loss": 1.50305843, + "memory(GiB)": 97.17, + "step": 15500, + "train_speed(iter/s)": 1.632054 + }, + { + "acc": 0.65494719, + "epoch": 0.39332825976661595, + "grad_norm": 5.4375, + "learning_rate": 9.423352777150597e-06, + "loss": 1.67718582, + "memory(GiB)": 97.17, + "step": 15505, + "train_speed(iter/s)": 1.632108 + }, + { + "acc": 0.66393356, + "epoch": 0.393455098934551, + "grad_norm": 6.53125, + "learning_rate": 9.422863794006804e-06, + "loss": 1.59034262, + "memory(GiB)": 97.17, + "step": 15510, + "train_speed(iter/s)": 1.632163 + }, + { + "acc": 0.64157934, + "epoch": 0.39358193810248604, + "grad_norm": 5.0625, + "learning_rate": 9.422374616325716e-06, + "loss": 1.67116318, + "memory(GiB)": 97.17, + "step": 15515, + "train_speed(iter/s)": 1.632218 + }, + { + "acc": 0.65052934, + "epoch": 0.3937087772704211, + "grad_norm": 6.0, + "learning_rate": 9.421885244128847e-06, + "loss": 1.64150238, + "memory(GiB)": 97.17, + "step": 15520, + "train_speed(iter/s)": 1.632269 + }, + { + "acc": 0.65489674, + "epoch": 0.3938356164383562, + "grad_norm": 5.71875, + "learning_rate": 9.421395677437724e-06, + "loss": 1.60311394, + "memory(GiB)": 97.17, + "step": 15525, + "train_speed(iter/s)": 1.632321 + }, + { + "acc": 0.66780558, + "epoch": 0.39396245560629123, + "grad_norm": 6.53125, + "learning_rate": 9.42090591627388e-06, + "loss": 1.58155308, + "memory(GiB)": 97.17, + "step": 15530, + "train_speed(iter/s)": 1.632374 + }, + { + "acc": 0.65116453, + "epoch": 0.3940892947742263, + "grad_norm": 6.6875, + "learning_rate": 9.420415960658853e-06, + "loss": 1.60542164, + "memory(GiB)": 97.17, + "step": 15535, + "train_speed(iter/s)": 1.632422 + }, + { + "acc": 0.65156603, + "epoch": 0.3942161339421613, + "grad_norm": 5.59375, + "learning_rate": 9.419925810614196e-06, + "loss": 1.63556404, + "memory(GiB)": 97.17, + "step": 15540, + "train_speed(iter/s)": 1.632473 + }, + { + "acc": 0.64251719, + "epoch": 0.3943429731100964, + "grad_norm": 8.8125, + "learning_rate": 9.419435466161471e-06, + "loss": 1.63585396, + "memory(GiB)": 97.17, + "step": 15545, + "train_speed(iter/s)": 1.632529 + }, + { + "acc": 0.64899192, + "epoch": 0.39446981227803146, + "grad_norm": 5.59375, + "learning_rate": 9.418944927322242e-06, + "loss": 1.70206203, + "memory(GiB)": 97.17, + "step": 15550, + "train_speed(iter/s)": 1.632587 + }, + { + "acc": 0.65994377, + "epoch": 0.3945966514459665, + "grad_norm": 5.875, + "learning_rate": 9.418454194118085e-06, + "loss": 1.5923357, + "memory(GiB)": 97.17, + "step": 15555, + "train_speed(iter/s)": 1.632644 + }, + { + "acc": 0.6623775, + "epoch": 0.39472349061390155, + "grad_norm": 5.78125, + "learning_rate": 9.417963266570587e-06, + "loss": 1.66040497, + "memory(GiB)": 97.17, + "step": 15560, + "train_speed(iter/s)": 1.632696 + }, + { + "acc": 0.65084915, + "epoch": 0.39485032978183665, + "grad_norm": 4.53125, + "learning_rate": 9.417472144701338e-06, + "loss": 1.61573162, + "memory(GiB)": 97.17, + "step": 15565, + "train_speed(iter/s)": 1.632753 + }, + { + "acc": 0.66019616, + "epoch": 0.3949771689497717, + "grad_norm": 5.5, + "learning_rate": 9.416980828531944e-06, + "loss": 1.5662322, + "memory(GiB)": 97.17, + "step": 15570, + "train_speed(iter/s)": 1.632813 + }, + { + "acc": 0.64340067, + "epoch": 0.39510400811770674, + "grad_norm": 5.78125, + "learning_rate": 9.41648931808401e-06, + "loss": 1.66267509, + "memory(GiB)": 97.17, + "step": 15575, + "train_speed(iter/s)": 1.632871 + }, + { + "acc": 0.64938602, + "epoch": 0.3952308472856418, + "grad_norm": 5.53125, + "learning_rate": 9.415997613379159e-06, + "loss": 1.61800346, + "memory(GiB)": 97.17, + "step": 15580, + "train_speed(iter/s)": 1.632926 + }, + { + "acc": 0.65750771, + "epoch": 0.3953576864535769, + "grad_norm": 5.8125, + "learning_rate": 9.415505714439016e-06, + "loss": 1.60547295, + "memory(GiB)": 97.17, + "step": 15585, + "train_speed(iter/s)": 1.632984 + }, + { + "acc": 0.64576778, + "epoch": 0.39548452562151193, + "grad_norm": 4.875, + "learning_rate": 9.415013621285219e-06, + "loss": 1.64392223, + "memory(GiB)": 97.17, + "step": 15590, + "train_speed(iter/s)": 1.633039 + }, + { + "acc": 0.6579072, + "epoch": 0.395611364789447, + "grad_norm": 6.09375, + "learning_rate": 9.41452133393941e-06, + "loss": 1.54888668, + "memory(GiB)": 97.17, + "step": 15595, + "train_speed(iter/s)": 1.633095 + }, + { + "acc": 0.66567664, + "epoch": 0.395738203957382, + "grad_norm": 4.78125, + "learning_rate": 9.414028852423245e-06, + "loss": 1.5840951, + "memory(GiB)": 97.17, + "step": 15600, + "train_speed(iter/s)": 1.633147 + }, + { + "acc": 0.66496782, + "epoch": 0.3958650431253171, + "grad_norm": 7.625, + "learning_rate": 9.413536176758384e-06, + "loss": 1.57238636, + "memory(GiB)": 97.17, + "step": 15605, + "train_speed(iter/s)": 1.633205 + }, + { + "acc": 0.64426823, + "epoch": 0.39599188229325216, + "grad_norm": 5.75, + "learning_rate": 9.413043306966496e-06, + "loss": 1.64144936, + "memory(GiB)": 97.17, + "step": 15610, + "train_speed(iter/s)": 1.633264 + }, + { + "acc": 0.67001653, + "epoch": 0.3961187214611872, + "grad_norm": 6.59375, + "learning_rate": 9.41255024306926e-06, + "loss": 1.57618198, + "memory(GiB)": 97.17, + "step": 15615, + "train_speed(iter/s)": 1.633323 + }, + { + "acc": 0.6681159, + "epoch": 0.39624556062912225, + "grad_norm": 5.90625, + "learning_rate": 9.412056985088364e-06, + "loss": 1.6572546, + "memory(GiB)": 97.17, + "step": 15620, + "train_speed(iter/s)": 1.63338 + }, + { + "acc": 0.64795952, + "epoch": 0.39637239979705735, + "grad_norm": 6.5, + "learning_rate": 9.411563533045505e-06, + "loss": 1.63303547, + "memory(GiB)": 97.17, + "step": 15625, + "train_speed(iter/s)": 1.633437 + }, + { + "acc": 0.64200125, + "epoch": 0.3964992389649924, + "grad_norm": 6.21875, + "learning_rate": 9.411069886962383e-06, + "loss": 1.68100014, + "memory(GiB)": 97.17, + "step": 15630, + "train_speed(iter/s)": 1.63349 + }, + { + "acc": 0.63818402, + "epoch": 0.39662607813292744, + "grad_norm": 7.1875, + "learning_rate": 9.410576046860716e-06, + "loss": 1.65536861, + "memory(GiB)": 97.17, + "step": 15635, + "train_speed(iter/s)": 1.633549 + }, + { + "acc": 0.6690546, + "epoch": 0.3967529173008625, + "grad_norm": 5.46875, + "learning_rate": 9.41008201276222e-06, + "loss": 1.58703251, + "memory(GiB)": 97.17, + "step": 15640, + "train_speed(iter/s)": 1.633603 + }, + { + "acc": 0.66416597, + "epoch": 0.3968797564687976, + "grad_norm": 5.3125, + "learning_rate": 9.409587784688629e-06, + "loss": 1.64223862, + "memory(GiB)": 97.17, + "step": 15645, + "train_speed(iter/s)": 1.633657 + }, + { + "acc": 0.62723894, + "epoch": 0.39700659563673263, + "grad_norm": 5.46875, + "learning_rate": 9.409093362661678e-06, + "loss": 1.7032486, + "memory(GiB)": 97.17, + "step": 15650, + "train_speed(iter/s)": 1.633713 + }, + { + "acc": 0.65629663, + "epoch": 0.3971334348046677, + "grad_norm": 6.15625, + "learning_rate": 9.408598746703119e-06, + "loss": 1.60428791, + "memory(GiB)": 97.17, + "step": 15655, + "train_speed(iter/s)": 1.633768 + }, + { + "acc": 0.65424824, + "epoch": 0.3972602739726027, + "grad_norm": 6.28125, + "learning_rate": 9.408103936834703e-06, + "loss": 1.65331917, + "memory(GiB)": 97.17, + "step": 15660, + "train_speed(iter/s)": 1.633825 + }, + { + "acc": 0.65117049, + "epoch": 0.3973871131405378, + "grad_norm": 5.6875, + "learning_rate": 9.407608933078194e-06, + "loss": 1.65230179, + "memory(GiB)": 97.17, + "step": 15665, + "train_speed(iter/s)": 1.633878 + }, + { + "acc": 0.65586481, + "epoch": 0.39751395230847286, + "grad_norm": 7.0, + "learning_rate": 9.407113735455366e-06, + "loss": 1.61515751, + "memory(GiB)": 97.17, + "step": 15670, + "train_speed(iter/s)": 1.633933 + }, + { + "acc": 0.65823298, + "epoch": 0.3976407914764079, + "grad_norm": 6.34375, + "learning_rate": 9.406618343988e-06, + "loss": 1.59263668, + "memory(GiB)": 97.17, + "step": 15675, + "train_speed(iter/s)": 1.633988 + }, + { + "acc": 0.65400338, + "epoch": 0.39776763064434295, + "grad_norm": 5.53125, + "learning_rate": 9.406122758697885e-06, + "loss": 1.63175106, + "memory(GiB)": 97.17, + "step": 15680, + "train_speed(iter/s)": 1.634041 + }, + { + "acc": 0.65228043, + "epoch": 0.39789446981227805, + "grad_norm": 4.90625, + "learning_rate": 9.405626979606819e-06, + "loss": 1.68690357, + "memory(GiB)": 97.17, + "step": 15685, + "train_speed(iter/s)": 1.634093 + }, + { + "acc": 0.65005369, + "epoch": 0.3980213089802131, + "grad_norm": 5.09375, + "learning_rate": 9.405131006736608e-06, + "loss": 1.61571674, + "memory(GiB)": 97.17, + "step": 15690, + "train_speed(iter/s)": 1.634152 + }, + { + "acc": 0.66304731, + "epoch": 0.39814814814814814, + "grad_norm": 5.59375, + "learning_rate": 9.404634840109069e-06, + "loss": 1.63206615, + "memory(GiB)": 97.17, + "step": 15695, + "train_speed(iter/s)": 1.634209 + }, + { + "acc": 0.6460711, + "epoch": 0.3982749873160832, + "grad_norm": 5.34375, + "learning_rate": 9.404138479746022e-06, + "loss": 1.63198643, + "memory(GiB)": 97.17, + "step": 15700, + "train_speed(iter/s)": 1.63426 + }, + { + "acc": 0.65446758, + "epoch": 0.3984018264840183, + "grad_norm": 5.875, + "learning_rate": 9.403641925669304e-06, + "loss": 1.61774788, + "memory(GiB)": 97.17, + "step": 15705, + "train_speed(iter/s)": 1.634313 + }, + { + "acc": 0.65057478, + "epoch": 0.39852866565195333, + "grad_norm": 4.28125, + "learning_rate": 9.403145177900752e-06, + "loss": 1.66977463, + "memory(GiB)": 97.17, + "step": 15710, + "train_speed(iter/s)": 1.634365 + }, + { + "acc": 0.6444972, + "epoch": 0.3986555048198884, + "grad_norm": 6.0, + "learning_rate": 9.402648236462217e-06, + "loss": 1.62636948, + "memory(GiB)": 97.17, + "step": 15715, + "train_speed(iter/s)": 1.634423 + }, + { + "acc": 0.65888119, + "epoch": 0.3987823439878234, + "grad_norm": 4.5625, + "learning_rate": 9.402151101375557e-06, + "loss": 1.63502407, + "memory(GiB)": 97.17, + "step": 15720, + "train_speed(iter/s)": 1.634476 + }, + { + "acc": 0.64015155, + "epoch": 0.3989091831557585, + "grad_norm": 5.1875, + "learning_rate": 9.401653772662638e-06, + "loss": 1.62309017, + "memory(GiB)": 97.17, + "step": 15725, + "train_speed(iter/s)": 1.634532 + }, + { + "acc": 0.65206957, + "epoch": 0.39903602232369356, + "grad_norm": 5.03125, + "learning_rate": 9.401156250345331e-06, + "loss": 1.6193222, + "memory(GiB)": 97.17, + "step": 15730, + "train_speed(iter/s)": 1.634587 + }, + { + "acc": 0.65838399, + "epoch": 0.3991628614916286, + "grad_norm": 5.46875, + "learning_rate": 9.400658534445524e-06, + "loss": 1.59883022, + "memory(GiB)": 97.17, + "step": 15735, + "train_speed(iter/s)": 1.634638 + }, + { + "acc": 0.63716321, + "epoch": 0.39928970065956365, + "grad_norm": 5.84375, + "learning_rate": 9.40016062498511e-06, + "loss": 1.63768959, + "memory(GiB)": 97.17, + "step": 15740, + "train_speed(iter/s)": 1.63469 + }, + { + "acc": 0.64570274, + "epoch": 0.39941653982749875, + "grad_norm": 6.0625, + "learning_rate": 9.399662521985982e-06, + "loss": 1.62347889, + "memory(GiB)": 97.17, + "step": 15745, + "train_speed(iter/s)": 1.634736 + }, + { + "acc": 0.65868735, + "epoch": 0.3995433789954338, + "grad_norm": 8.125, + "learning_rate": 9.399164225470055e-06, + "loss": 1.65992889, + "memory(GiB)": 97.17, + "step": 15750, + "train_speed(iter/s)": 1.634786 + }, + { + "acc": 0.65334015, + "epoch": 0.39967021816336884, + "grad_norm": 5.84375, + "learning_rate": 9.398665735459245e-06, + "loss": 1.65760117, + "memory(GiB)": 97.17, + "step": 15755, + "train_speed(iter/s)": 1.634837 + }, + { + "acc": 0.63274751, + "epoch": 0.3997970573313039, + "grad_norm": 4.59375, + "learning_rate": 9.398167051975475e-06, + "loss": 1.67615051, + "memory(GiB)": 97.17, + "step": 15760, + "train_speed(iter/s)": 1.634888 + }, + { + "acc": 0.6517416, + "epoch": 0.399923896499239, + "grad_norm": 6.40625, + "learning_rate": 9.397668175040684e-06, + "loss": 1.61494884, + "memory(GiB)": 97.17, + "step": 15765, + "train_speed(iter/s)": 1.634939 + }, + { + "acc": 0.63603888, + "epoch": 0.40005073566717403, + "grad_norm": 5.1875, + "learning_rate": 9.397169104676813e-06, + "loss": 1.66938114, + "memory(GiB)": 97.17, + "step": 15770, + "train_speed(iter/s)": 1.634993 + }, + { + "acc": 0.63894658, + "epoch": 0.4001775748351091, + "grad_norm": 5.4375, + "learning_rate": 9.39666984090581e-06, + "loss": 1.6665844, + "memory(GiB)": 97.17, + "step": 15775, + "train_speed(iter/s)": 1.635045 + }, + { + "acc": 0.66393557, + "epoch": 0.4003044140030441, + "grad_norm": 5.375, + "learning_rate": 9.396170383749642e-06, + "loss": 1.61611996, + "memory(GiB)": 97.17, + "step": 15780, + "train_speed(iter/s)": 1.635099 + }, + { + "acc": 0.65847626, + "epoch": 0.4004312531709792, + "grad_norm": 4.84375, + "learning_rate": 9.39567073323027e-06, + "loss": 1.60259666, + "memory(GiB)": 97.17, + "step": 15785, + "train_speed(iter/s)": 1.63515 + }, + { + "acc": 0.63672585, + "epoch": 0.40055809233891426, + "grad_norm": 5.4375, + "learning_rate": 9.395170889369674e-06, + "loss": 1.71591072, + "memory(GiB)": 97.17, + "step": 15790, + "train_speed(iter/s)": 1.635205 + }, + { + "acc": 0.65618362, + "epoch": 0.4006849315068493, + "grad_norm": 6.0, + "learning_rate": 9.39467085218984e-06, + "loss": 1.64862404, + "memory(GiB)": 97.17, + "step": 15795, + "train_speed(iter/s)": 1.635256 + }, + { + "acc": 0.64691854, + "epoch": 0.40081177067478435, + "grad_norm": 5.03125, + "learning_rate": 9.39417062171276e-06, + "loss": 1.62944794, + "memory(GiB)": 97.17, + "step": 15800, + "train_speed(iter/s)": 1.635311 + }, + { + "acc": 0.65817442, + "epoch": 0.40093860984271945, + "grad_norm": 5.875, + "learning_rate": 9.393670197960439e-06, + "loss": 1.63574867, + "memory(GiB)": 97.17, + "step": 15805, + "train_speed(iter/s)": 1.635361 + }, + { + "acc": 0.64882078, + "epoch": 0.4010654490106545, + "grad_norm": 5.1875, + "learning_rate": 9.393169580954884e-06, + "loss": 1.59303999, + "memory(GiB)": 97.17, + "step": 15810, + "train_speed(iter/s)": 1.635415 + }, + { + "acc": 0.64674039, + "epoch": 0.40119228817858954, + "grad_norm": 4.59375, + "learning_rate": 9.392668770718118e-06, + "loss": 1.68373737, + "memory(GiB)": 97.17, + "step": 15815, + "train_speed(iter/s)": 1.635471 + }, + { + "acc": 0.65925636, + "epoch": 0.4013191273465246, + "grad_norm": 5.5625, + "learning_rate": 9.392167767272169e-06, + "loss": 1.64430122, + "memory(GiB)": 97.17, + "step": 15820, + "train_speed(iter/s)": 1.635527 + }, + { + "acc": 0.64930334, + "epoch": 0.4014459665144597, + "grad_norm": 5.0625, + "learning_rate": 9.39166657063907e-06, + "loss": 1.66579933, + "memory(GiB)": 97.17, + "step": 15825, + "train_speed(iter/s)": 1.635579 + }, + { + "acc": 0.64079461, + "epoch": 0.40157280568239473, + "grad_norm": 4.90625, + "learning_rate": 9.391165180840869e-06, + "loss": 1.64372616, + "memory(GiB)": 97.17, + "step": 15830, + "train_speed(iter/s)": 1.635633 + }, + { + "acc": 0.65208101, + "epoch": 0.4016996448503298, + "grad_norm": 6.5, + "learning_rate": 9.390663597899619e-06, + "loss": 1.6628273, + "memory(GiB)": 97.17, + "step": 15835, + "train_speed(iter/s)": 1.635687 + }, + { + "acc": 0.66142225, + "epoch": 0.4018264840182648, + "grad_norm": 5.5, + "learning_rate": 9.39016182183738e-06, + "loss": 1.55630836, + "memory(GiB)": 97.17, + "step": 15840, + "train_speed(iter/s)": 1.635743 + }, + { + "acc": 0.66288266, + "epoch": 0.4019533231861999, + "grad_norm": 6.0, + "learning_rate": 9.389659852676223e-06, + "loss": 1.60005302, + "memory(GiB)": 97.17, + "step": 15845, + "train_speed(iter/s)": 1.635794 + }, + { + "acc": 0.67552366, + "epoch": 0.40208016235413496, + "grad_norm": 6.34375, + "learning_rate": 9.389157690438228e-06, + "loss": 1.50747881, + "memory(GiB)": 97.17, + "step": 15850, + "train_speed(iter/s)": 1.635846 + }, + { + "acc": 0.65831046, + "epoch": 0.40220700152207, + "grad_norm": 5.375, + "learning_rate": 9.38865533514548e-06, + "loss": 1.55039349, + "memory(GiB)": 97.17, + "step": 15855, + "train_speed(iter/s)": 1.635901 + }, + { + "acc": 0.66160603, + "epoch": 0.40233384069000505, + "grad_norm": 5.0625, + "learning_rate": 9.388152786820078e-06, + "loss": 1.55197296, + "memory(GiB)": 97.17, + "step": 15860, + "train_speed(iter/s)": 1.635953 + }, + { + "acc": 0.63546963, + "epoch": 0.40246067985794015, + "grad_norm": 5.40625, + "learning_rate": 9.387650045484124e-06, + "loss": 1.75525208, + "memory(GiB)": 97.17, + "step": 15865, + "train_speed(iter/s)": 1.636003 + }, + { + "acc": 0.6519228, + "epoch": 0.4025875190258752, + "grad_norm": 5.9375, + "learning_rate": 9.387147111159734e-06, + "loss": 1.61350021, + "memory(GiB)": 97.17, + "step": 15870, + "train_speed(iter/s)": 1.636059 + }, + { + "acc": 0.64786277, + "epoch": 0.40271435819381024, + "grad_norm": 7.15625, + "learning_rate": 9.386643983869025e-06, + "loss": 1.62597084, + "memory(GiB)": 97.17, + "step": 15875, + "train_speed(iter/s)": 1.636115 + }, + { + "acc": 0.65504198, + "epoch": 0.4028411973617453, + "grad_norm": 7.46875, + "learning_rate": 9.386140663634127e-06, + "loss": 1.68262711, + "memory(GiB)": 97.17, + "step": 15880, + "train_speed(iter/s)": 1.636171 + }, + { + "acc": 0.64914284, + "epoch": 0.4029680365296804, + "grad_norm": 5.625, + "learning_rate": 9.385637150477182e-06, + "loss": 1.6466259, + "memory(GiB)": 97.17, + "step": 15885, + "train_speed(iter/s)": 1.636223 + }, + { + "acc": 0.64096508, + "epoch": 0.40309487569761543, + "grad_norm": 6.25, + "learning_rate": 9.385133444420333e-06, + "loss": 1.71891117, + "memory(GiB)": 97.17, + "step": 15890, + "train_speed(iter/s)": 1.636279 + }, + { + "acc": 0.66728516, + "epoch": 0.4032217148655505, + "grad_norm": 6.5625, + "learning_rate": 9.384629545485738e-06, + "loss": 1.51819839, + "memory(GiB)": 97.17, + "step": 15895, + "train_speed(iter/s)": 1.636334 + }, + { + "acc": 0.64217792, + "epoch": 0.4033485540334855, + "grad_norm": 5.34375, + "learning_rate": 9.38412545369556e-06, + "loss": 1.63828697, + "memory(GiB)": 97.17, + "step": 15900, + "train_speed(iter/s)": 1.636384 + }, + { + "acc": 0.64461389, + "epoch": 0.4034753932014206, + "grad_norm": 7.8125, + "learning_rate": 9.383621169071971e-06, + "loss": 1.61118851, + "memory(GiB)": 97.17, + "step": 15905, + "train_speed(iter/s)": 1.636441 + }, + { + "acc": 0.66804571, + "epoch": 0.40360223236935566, + "grad_norm": 5.21875, + "learning_rate": 9.38311669163715e-06, + "loss": 1.64611588, + "memory(GiB)": 97.17, + "step": 15910, + "train_speed(iter/s)": 1.636491 + }, + { + "acc": 0.65931616, + "epoch": 0.4037290715372907, + "grad_norm": 6.84375, + "learning_rate": 9.38261202141329e-06, + "loss": 1.60034523, + "memory(GiB)": 97.17, + "step": 15915, + "train_speed(iter/s)": 1.636546 + }, + { + "acc": 0.64861155, + "epoch": 0.40385591070522575, + "grad_norm": 6.75, + "learning_rate": 9.382107158422585e-06, + "loss": 1.63590164, + "memory(GiB)": 97.17, + "step": 15920, + "train_speed(iter/s)": 1.6366 + }, + { + "acc": 0.65154324, + "epoch": 0.40398274987316085, + "grad_norm": 5.78125, + "learning_rate": 9.381602102687241e-06, + "loss": 1.6102478, + "memory(GiB)": 97.17, + "step": 15925, + "train_speed(iter/s)": 1.636649 + }, + { + "acc": 0.66373606, + "epoch": 0.4041095890410959, + "grad_norm": 5.9375, + "learning_rate": 9.381096854229476e-06, + "loss": 1.62204723, + "memory(GiB)": 97.17, + "step": 15930, + "train_speed(iter/s)": 1.636701 + }, + { + "acc": 0.66498594, + "epoch": 0.40423642820903094, + "grad_norm": 5.0625, + "learning_rate": 9.38059141307151e-06, + "loss": 1.60091343, + "memory(GiB)": 97.17, + "step": 15935, + "train_speed(iter/s)": 1.636751 + }, + { + "acc": 0.65378904, + "epoch": 0.404363267376966, + "grad_norm": 5.21875, + "learning_rate": 9.380085779235577e-06, + "loss": 1.60037174, + "memory(GiB)": 97.17, + "step": 15940, + "train_speed(iter/s)": 1.636806 + }, + { + "acc": 0.6509573, + "epoch": 0.4044901065449011, + "grad_norm": 4.75, + "learning_rate": 9.379579952743916e-06, + "loss": 1.63471603, + "memory(GiB)": 97.17, + "step": 15945, + "train_speed(iter/s)": 1.63686 + }, + { + "acc": 0.64465246, + "epoch": 0.40461694571283613, + "grad_norm": 5.875, + "learning_rate": 9.379073933618774e-06, + "loss": 1.61435032, + "memory(GiB)": 97.17, + "step": 15950, + "train_speed(iter/s)": 1.636918 + }, + { + "acc": 0.64981737, + "epoch": 0.4047437848807712, + "grad_norm": 5.28125, + "learning_rate": 9.37856772188241e-06, + "loss": 1.65799713, + "memory(GiB)": 97.17, + "step": 15955, + "train_speed(iter/s)": 1.636972 + }, + { + "acc": 0.64226127, + "epoch": 0.4048706240487062, + "grad_norm": 5.4375, + "learning_rate": 9.378061317557088e-06, + "loss": 1.60751457, + "memory(GiB)": 97.17, + "step": 15960, + "train_speed(iter/s)": 1.637026 + }, + { + "acc": 0.67459617, + "epoch": 0.4049974632166413, + "grad_norm": 5.875, + "learning_rate": 9.377554720665083e-06, + "loss": 1.60703316, + "memory(GiB)": 97.17, + "step": 15965, + "train_speed(iter/s)": 1.637077 + }, + { + "acc": 0.64980779, + "epoch": 0.40512430238457636, + "grad_norm": 5.09375, + "learning_rate": 9.377047931228677e-06, + "loss": 1.67774754, + "memory(GiB)": 97.17, + "step": 15970, + "train_speed(iter/s)": 1.637131 + }, + { + "acc": 0.66006041, + "epoch": 0.4052511415525114, + "grad_norm": 6.875, + "learning_rate": 9.376540949270161e-06, + "loss": 1.58426991, + "memory(GiB)": 97.17, + "step": 15975, + "train_speed(iter/s)": 1.637185 + }, + { + "acc": 0.65663538, + "epoch": 0.40537798072044645, + "grad_norm": 5.40625, + "learning_rate": 9.376033774811833e-06, + "loss": 1.5896637, + "memory(GiB)": 97.17, + "step": 15980, + "train_speed(iter/s)": 1.637242 + }, + { + "acc": 0.66919851, + "epoch": 0.40550481988838155, + "grad_norm": 6.0, + "learning_rate": 9.375526407876003e-06, + "loss": 1.57763729, + "memory(GiB)": 97.17, + "step": 15985, + "train_speed(iter/s)": 1.637295 + }, + { + "acc": 0.64241471, + "epoch": 0.4056316590563166, + "grad_norm": 6.09375, + "learning_rate": 9.375018848484987e-06, + "loss": 1.67906704, + "memory(GiB)": 97.17, + "step": 15990, + "train_speed(iter/s)": 1.637346 + }, + { + "acc": 0.66374674, + "epoch": 0.40575849822425164, + "grad_norm": 4.75, + "learning_rate": 9.374511096661108e-06, + "loss": 1.60427475, + "memory(GiB)": 97.17, + "step": 15995, + "train_speed(iter/s)": 1.637396 + }, + { + "acc": 0.64524317, + "epoch": 0.4058853373921867, + "grad_norm": 5.0, + "learning_rate": 9.374003152426701e-06, + "loss": 1.64672585, + "memory(GiB)": 97.17, + "step": 16000, + "train_speed(iter/s)": 1.637447 + }, + { + "epoch": 0.4058853373921867, + "eval_acc": 0.6424895114825512, + "eval_loss": 1.5949453115463257, + "eval_runtime": 58.7857, + "eval_samples_per_second": 108.36, + "eval_steps_per_second": 27.098, + "step": 16000 + }, + { + "acc": 0.64903059, + "epoch": 0.4060121765601218, + "grad_norm": 6.09375, + "learning_rate": 9.373495015804106e-06, + "loss": 1.60470886, + "memory(GiB)": 97.17, + "step": 16005, + "train_speed(iter/s)": 1.627009 + }, + { + "acc": 0.64625101, + "epoch": 0.40613901572805683, + "grad_norm": 5.75, + "learning_rate": 9.372986686815674e-06, + "loss": 1.66825714, + "memory(GiB)": 97.17, + "step": 16010, + "train_speed(iter/s)": 1.627063 + }, + { + "acc": 0.6507658, + "epoch": 0.4062658548959919, + "grad_norm": 6.09375, + "learning_rate": 9.372478165483763e-06, + "loss": 1.59801474, + "memory(GiB)": 97.17, + "step": 16015, + "train_speed(iter/s)": 1.627117 + }, + { + "acc": 0.65511169, + "epoch": 0.4063926940639269, + "grad_norm": 5.5625, + "learning_rate": 9.371969451830743e-06, + "loss": 1.6586689, + "memory(GiB)": 97.17, + "step": 16020, + "train_speed(iter/s)": 1.627168 + }, + { + "acc": 0.65719781, + "epoch": 0.406519533231862, + "grad_norm": 5.375, + "learning_rate": 9.371460545878986e-06, + "loss": 1.60247536, + "memory(GiB)": 97.17, + "step": 16025, + "train_speed(iter/s)": 1.627223 + }, + { + "acc": 0.65161324, + "epoch": 0.40664637239979706, + "grad_norm": 5.9375, + "learning_rate": 9.370951447650875e-06, + "loss": 1.57683468, + "memory(GiB)": 97.17, + "step": 16030, + "train_speed(iter/s)": 1.627273 + }, + { + "acc": 0.63173494, + "epoch": 0.4067732115677321, + "grad_norm": 5.28125, + "learning_rate": 9.370442157168806e-06, + "loss": 1.68955822, + "memory(GiB)": 97.17, + "step": 16035, + "train_speed(iter/s)": 1.627326 + }, + { + "acc": 0.64423733, + "epoch": 0.40690005073566715, + "grad_norm": 5.21875, + "learning_rate": 9.369932674455177e-06, + "loss": 1.65101261, + "memory(GiB)": 97.17, + "step": 16040, + "train_speed(iter/s)": 1.62738 + }, + { + "acc": 0.65857348, + "epoch": 0.40702688990360225, + "grad_norm": 5.53125, + "learning_rate": 9.3694229995324e-06, + "loss": 1.5409668, + "memory(GiB)": 97.17, + "step": 16045, + "train_speed(iter/s)": 1.627428 + }, + { + "acc": 0.66051173, + "epoch": 0.4071537290715373, + "grad_norm": 6.3125, + "learning_rate": 9.368913132422891e-06, + "loss": 1.60270748, + "memory(GiB)": 97.17, + "step": 16050, + "train_speed(iter/s)": 1.627482 + }, + { + "acc": 0.65240541, + "epoch": 0.40728056823947234, + "grad_norm": 5.875, + "learning_rate": 9.368403073149079e-06, + "loss": 1.65346661, + "memory(GiB)": 97.17, + "step": 16055, + "train_speed(iter/s)": 1.627534 + }, + { + "acc": 0.64385419, + "epoch": 0.4074074074074074, + "grad_norm": 6.65625, + "learning_rate": 9.367892821733393e-06, + "loss": 1.60929718, + "memory(GiB)": 97.17, + "step": 16060, + "train_speed(iter/s)": 1.627583 + }, + { + "acc": 0.65275679, + "epoch": 0.4075342465753425, + "grad_norm": 5.4375, + "learning_rate": 9.367382378198282e-06, + "loss": 1.62133484, + "memory(GiB)": 97.17, + "step": 16065, + "train_speed(iter/s)": 1.627638 + }, + { + "acc": 0.65248361, + "epoch": 0.40766108574327753, + "grad_norm": 6.96875, + "learning_rate": 9.366871742566193e-06, + "loss": 1.58335047, + "memory(GiB)": 97.17, + "step": 16070, + "train_speed(iter/s)": 1.627689 + }, + { + "acc": 0.63841887, + "epoch": 0.4077879249112126, + "grad_norm": 5.96875, + "learning_rate": 9.36636091485959e-06, + "loss": 1.65226517, + "memory(GiB)": 97.17, + "step": 16075, + "train_speed(iter/s)": 1.627743 + }, + { + "acc": 0.65547657, + "epoch": 0.4079147640791476, + "grad_norm": 5.46875, + "learning_rate": 9.365849895100939e-06, + "loss": 1.59378586, + "memory(GiB)": 97.17, + "step": 16080, + "train_speed(iter/s)": 1.627795 + }, + { + "acc": 0.66481771, + "epoch": 0.4080416032470827, + "grad_norm": 7.78125, + "learning_rate": 9.365338683312714e-06, + "loss": 1.58041439, + "memory(GiB)": 97.17, + "step": 16085, + "train_speed(iter/s)": 1.627852 + }, + { + "acc": 0.65277824, + "epoch": 0.40816844241501776, + "grad_norm": 4.96875, + "learning_rate": 9.364827279517408e-06, + "loss": 1.62052765, + "memory(GiB)": 97.17, + "step": 16090, + "train_speed(iter/s)": 1.627907 + }, + { + "acc": 0.64046726, + "epoch": 0.4082952815829528, + "grad_norm": 4.96875, + "learning_rate": 9.36431568373751e-06, + "loss": 1.67497082, + "memory(GiB)": 97.17, + "step": 16095, + "train_speed(iter/s)": 1.627959 + }, + { + "acc": 0.65935326, + "epoch": 0.40842212075088785, + "grad_norm": 5.78125, + "learning_rate": 9.363803895995522e-06, + "loss": 1.57289524, + "memory(GiB)": 97.17, + "step": 16100, + "train_speed(iter/s)": 1.628014 + }, + { + "acc": 0.6536623, + "epoch": 0.40854895991882295, + "grad_norm": 5.5625, + "learning_rate": 9.363291916313955e-06, + "loss": 1.60950966, + "memory(GiB)": 97.17, + "step": 16105, + "train_speed(iter/s)": 1.628068 + }, + { + "acc": 0.64895964, + "epoch": 0.408675799086758, + "grad_norm": 4.84375, + "learning_rate": 9.362779744715332e-06, + "loss": 1.63531952, + "memory(GiB)": 97.17, + "step": 16110, + "train_speed(iter/s)": 1.628119 + }, + { + "acc": 0.6486104, + "epoch": 0.40880263825469304, + "grad_norm": 6.03125, + "learning_rate": 9.362267381222174e-06, + "loss": 1.65392284, + "memory(GiB)": 97.17, + "step": 16115, + "train_speed(iter/s)": 1.628175 + }, + { + "acc": 0.65669584, + "epoch": 0.4089294774226281, + "grad_norm": 5.09375, + "learning_rate": 9.361754825857022e-06, + "loss": 1.62155495, + "memory(GiB)": 97.17, + "step": 16120, + "train_speed(iter/s)": 1.628226 + }, + { + "acc": 0.6453598, + "epoch": 0.4090563165905632, + "grad_norm": 5.5, + "learning_rate": 9.36124207864242e-06, + "loss": 1.64764557, + "memory(GiB)": 97.17, + "step": 16125, + "train_speed(iter/s)": 1.628281 + }, + { + "acc": 0.64444523, + "epoch": 0.40918315575849823, + "grad_norm": 5.6875, + "learning_rate": 9.360729139600917e-06, + "loss": 1.65023556, + "memory(GiB)": 97.17, + "step": 16130, + "train_speed(iter/s)": 1.62834 + }, + { + "acc": 0.65969701, + "epoch": 0.4093099949264333, + "grad_norm": 5.25, + "learning_rate": 9.36021600875508e-06, + "loss": 1.63509293, + "memory(GiB)": 97.17, + "step": 16135, + "train_speed(iter/s)": 1.628394 + }, + { + "acc": 0.6343359, + "epoch": 0.4094368340943683, + "grad_norm": 5.625, + "learning_rate": 9.359702686127474e-06, + "loss": 1.68029633, + "memory(GiB)": 97.17, + "step": 16140, + "train_speed(iter/s)": 1.628448 + }, + { + "acc": 0.65928364, + "epoch": 0.4095636732623034, + "grad_norm": 5.15625, + "learning_rate": 9.359189171740679e-06, + "loss": 1.63962669, + "memory(GiB)": 97.17, + "step": 16145, + "train_speed(iter/s)": 1.628504 + }, + { + "acc": 0.64124012, + "epoch": 0.40969051243023846, + "grad_norm": 5.75, + "learning_rate": 9.358675465617283e-06, + "loss": 1.66341, + "memory(GiB)": 97.17, + "step": 16150, + "train_speed(iter/s)": 1.62856 + }, + { + "acc": 0.64791241, + "epoch": 0.4098173515981735, + "grad_norm": 5.75, + "learning_rate": 9.35816156777988e-06, + "loss": 1.65935631, + "memory(GiB)": 97.17, + "step": 16155, + "train_speed(iter/s)": 1.628614 + }, + { + "acc": 0.6422976, + "epoch": 0.40994419076610855, + "grad_norm": 6.96875, + "learning_rate": 9.357647478251072e-06, + "loss": 1.64075851, + "memory(GiB)": 97.17, + "step": 16160, + "train_speed(iter/s)": 1.628669 + }, + { + "acc": 0.64418387, + "epoch": 0.41007102993404365, + "grad_norm": 6.1875, + "learning_rate": 9.357133197053475e-06, + "loss": 1.69670563, + "memory(GiB)": 97.17, + "step": 16165, + "train_speed(iter/s)": 1.628727 + }, + { + "acc": 0.66096764, + "epoch": 0.4101978691019787, + "grad_norm": 9.4375, + "learning_rate": 9.356618724209704e-06, + "loss": 1.60868568, + "memory(GiB)": 97.17, + "step": 16170, + "train_speed(iter/s)": 1.628781 + }, + { + "acc": 0.65348921, + "epoch": 0.41032470826991374, + "grad_norm": 6.90625, + "learning_rate": 9.356104059742392e-06, + "loss": 1.59801893, + "memory(GiB)": 97.17, + "step": 16175, + "train_speed(iter/s)": 1.628836 + }, + { + "acc": 0.65164595, + "epoch": 0.4104515474378488, + "grad_norm": 5.46875, + "learning_rate": 9.355589203674175e-06, + "loss": 1.6303215, + "memory(GiB)": 97.17, + "step": 16180, + "train_speed(iter/s)": 1.628889 + }, + { + "acc": 0.66172752, + "epoch": 0.4105783866057839, + "grad_norm": 6.84375, + "learning_rate": 9.355074156027699e-06, + "loss": 1.58874168, + "memory(GiB)": 97.17, + "step": 16185, + "train_speed(iter/s)": 1.628946 + }, + { + "acc": 0.64384103, + "epoch": 0.41070522577371893, + "grad_norm": 5.5625, + "learning_rate": 9.354558916825616e-06, + "loss": 1.68263836, + "memory(GiB)": 97.17, + "step": 16190, + "train_speed(iter/s)": 1.628999 + }, + { + "acc": 0.65688691, + "epoch": 0.410832064941654, + "grad_norm": 5.03125, + "learning_rate": 9.354043486090592e-06, + "loss": 1.59741135, + "memory(GiB)": 97.17, + "step": 16195, + "train_speed(iter/s)": 1.62905 + }, + { + "acc": 0.64088583, + "epoch": 0.410958904109589, + "grad_norm": 5.59375, + "learning_rate": 9.353527863845296e-06, + "loss": 1.63057804, + "memory(GiB)": 97.17, + "step": 16200, + "train_speed(iter/s)": 1.629105 + }, + { + "acc": 0.66965623, + "epoch": 0.4110857432775241, + "grad_norm": 4.96875, + "learning_rate": 9.353012050112405e-06, + "loss": 1.5989255, + "memory(GiB)": 97.17, + "step": 16205, + "train_speed(iter/s)": 1.629159 + }, + { + "acc": 0.64907036, + "epoch": 0.41121258244545916, + "grad_norm": 5.25, + "learning_rate": 9.352496044914611e-06, + "loss": 1.63676128, + "memory(GiB)": 97.17, + "step": 16210, + "train_speed(iter/s)": 1.629209 + }, + { + "acc": 0.65607471, + "epoch": 0.4113394216133942, + "grad_norm": 7.375, + "learning_rate": 9.351979848274608e-06, + "loss": 1.62216969, + "memory(GiB)": 97.17, + "step": 16215, + "train_speed(iter/s)": 1.629262 + }, + { + "acc": 0.66906185, + "epoch": 0.41146626078132925, + "grad_norm": 4.875, + "learning_rate": 9.351463460215102e-06, + "loss": 1.52141571, + "memory(GiB)": 97.17, + "step": 16220, + "train_speed(iter/s)": 1.629303 + }, + { + "acc": 0.66006007, + "epoch": 0.41159309994926435, + "grad_norm": 6.4375, + "learning_rate": 9.350946880758804e-06, + "loss": 1.56992044, + "memory(GiB)": 97.17, + "step": 16225, + "train_speed(iter/s)": 1.629356 + }, + { + "acc": 0.65177946, + "epoch": 0.4117199391171994, + "grad_norm": 5.3125, + "learning_rate": 9.350430109928437e-06, + "loss": 1.61403236, + "memory(GiB)": 97.17, + "step": 16230, + "train_speed(iter/s)": 1.62941 + }, + { + "acc": 0.65621557, + "epoch": 0.41184677828513444, + "grad_norm": 6.78125, + "learning_rate": 9.349913147746731e-06, + "loss": 1.68774567, + "memory(GiB)": 97.17, + "step": 16235, + "train_speed(iter/s)": 1.629464 + }, + { + "acc": 0.66037645, + "epoch": 0.4119736174530695, + "grad_norm": 5.1875, + "learning_rate": 9.349395994236423e-06, + "loss": 1.57108192, + "memory(GiB)": 97.17, + "step": 16240, + "train_speed(iter/s)": 1.629517 + }, + { + "acc": 0.6542994, + "epoch": 0.4121004566210046, + "grad_norm": 5.4375, + "learning_rate": 9.348878649420262e-06, + "loss": 1.69590435, + "memory(GiB)": 97.17, + "step": 16245, + "train_speed(iter/s)": 1.629573 + }, + { + "acc": 0.64440928, + "epoch": 0.41222729578893963, + "grad_norm": 5.9375, + "learning_rate": 9.348361113321e-06, + "loss": 1.64741974, + "memory(GiB)": 97.17, + "step": 16250, + "train_speed(iter/s)": 1.629623 + }, + { + "acc": 0.65193729, + "epoch": 0.4123541349568747, + "grad_norm": 5.4375, + "learning_rate": 9.347843385961403e-06, + "loss": 1.64539108, + "memory(GiB)": 97.17, + "step": 16255, + "train_speed(iter/s)": 1.629677 + }, + { + "acc": 0.63490973, + "epoch": 0.4124809741248097, + "grad_norm": 5.875, + "learning_rate": 9.347325467364242e-06, + "loss": 1.6491951, + "memory(GiB)": 97.17, + "step": 16260, + "train_speed(iter/s)": 1.629731 + }, + { + "acc": 0.66243963, + "epoch": 0.4126078132927448, + "grad_norm": 5.28125, + "learning_rate": 9.346807357552296e-06, + "loss": 1.59512444, + "memory(GiB)": 97.17, + "step": 16265, + "train_speed(iter/s)": 1.629784 + }, + { + "acc": 0.64454498, + "epoch": 0.41273465246067986, + "grad_norm": 4.96875, + "learning_rate": 9.346289056548357e-06, + "loss": 1.65488186, + "memory(GiB)": 97.17, + "step": 16270, + "train_speed(iter/s)": 1.62984 + }, + { + "acc": 0.66899223, + "epoch": 0.4128614916286149, + "grad_norm": 5.5625, + "learning_rate": 9.345770564375221e-06, + "loss": 1.565625, + "memory(GiB)": 97.17, + "step": 16275, + "train_speed(iter/s)": 1.629891 + }, + { + "acc": 0.64188261, + "epoch": 0.41298833079654995, + "grad_norm": 4.90625, + "learning_rate": 9.345251881055692e-06, + "loss": 1.63279953, + "memory(GiB)": 97.17, + "step": 16280, + "train_speed(iter/s)": 1.629946 + }, + { + "acc": 0.65170283, + "epoch": 0.41311516996448505, + "grad_norm": 6.21875, + "learning_rate": 9.344733006612585e-06, + "loss": 1.67423515, + "memory(GiB)": 97.17, + "step": 16285, + "train_speed(iter/s)": 1.629999 + }, + { + "acc": 0.65542493, + "epoch": 0.4132420091324201, + "grad_norm": 5.53125, + "learning_rate": 9.344213941068724e-06, + "loss": 1.61397209, + "memory(GiB)": 97.17, + "step": 16290, + "train_speed(iter/s)": 1.630053 + }, + { + "acc": 0.64678516, + "epoch": 0.41336884830035514, + "grad_norm": 5.875, + "learning_rate": 9.343694684446937e-06, + "loss": 1.64910469, + "memory(GiB)": 97.17, + "step": 16295, + "train_speed(iter/s)": 1.630105 + }, + { + "acc": 0.6524024, + "epoch": 0.4134956874682902, + "grad_norm": 5.1875, + "learning_rate": 9.343175236770065e-06, + "loss": 1.56770487, + "memory(GiB)": 97.17, + "step": 16300, + "train_speed(iter/s)": 1.630159 + }, + { + "acc": 0.66147881, + "epoch": 0.4136225266362253, + "grad_norm": 4.46875, + "learning_rate": 9.342655598060955e-06, + "loss": 1.59089117, + "memory(GiB)": 97.17, + "step": 16305, + "train_speed(iter/s)": 1.630213 + }, + { + "acc": 0.6551693, + "epoch": 0.41374936580416033, + "grad_norm": 4.5625, + "learning_rate": 9.342135768342464e-06, + "loss": 1.58866253, + "memory(GiB)": 97.17, + "step": 16310, + "train_speed(iter/s)": 1.630265 + }, + { + "acc": 0.66414738, + "epoch": 0.4138762049720954, + "grad_norm": 5.5, + "learning_rate": 9.341615747637454e-06, + "loss": 1.58338203, + "memory(GiB)": 97.17, + "step": 16315, + "train_speed(iter/s)": 1.630319 + }, + { + "acc": 0.64451828, + "epoch": 0.4140030441400304, + "grad_norm": 5.09375, + "learning_rate": 9.3410955359688e-06, + "loss": 1.66825867, + "memory(GiB)": 97.17, + "step": 16320, + "train_speed(iter/s)": 1.630374 + }, + { + "acc": 0.66090703, + "epoch": 0.4141298833079655, + "grad_norm": 6.90625, + "learning_rate": 9.340575133359385e-06, + "loss": 1.59784298, + "memory(GiB)": 97.17, + "step": 16325, + "train_speed(iter/s)": 1.630426 + }, + { + "acc": 0.65299778, + "epoch": 0.41425672247590056, + "grad_norm": 5.6875, + "learning_rate": 9.340054539832095e-06, + "loss": 1.5794241, + "memory(GiB)": 97.17, + "step": 16330, + "train_speed(iter/s)": 1.630477 + }, + { + "acc": 0.64211025, + "epoch": 0.4143835616438356, + "grad_norm": 6.78125, + "learning_rate": 9.339533755409828e-06, + "loss": 1.68089371, + "memory(GiB)": 97.17, + "step": 16335, + "train_speed(iter/s)": 1.630532 + }, + { + "acc": 0.64001141, + "epoch": 0.41451040081177065, + "grad_norm": 5.5, + "learning_rate": 9.339012780115492e-06, + "loss": 1.6290781, + "memory(GiB)": 97.17, + "step": 16340, + "train_speed(iter/s)": 1.630583 + }, + { + "acc": 0.6527792, + "epoch": 0.41463723997970575, + "grad_norm": 5.03125, + "learning_rate": 9.338491613972002e-06, + "loss": 1.59525042, + "memory(GiB)": 97.17, + "step": 16345, + "train_speed(iter/s)": 1.630632 + }, + { + "acc": 0.67494831, + "epoch": 0.4147640791476408, + "grad_norm": 5.46875, + "learning_rate": 9.337970257002282e-06, + "loss": 1.60703545, + "memory(GiB)": 97.17, + "step": 16350, + "train_speed(iter/s)": 1.630682 + }, + { + "acc": 0.64884949, + "epoch": 0.41489091831557584, + "grad_norm": 6.53125, + "learning_rate": 9.337448709229261e-06, + "loss": 1.70602894, + "memory(GiB)": 97.17, + "step": 16355, + "train_speed(iter/s)": 1.630739 + }, + { + "acc": 0.66629572, + "epoch": 0.4150177574835109, + "grad_norm": 5.96875, + "learning_rate": 9.336926970675883e-06, + "loss": 1.62642403, + "memory(GiB)": 97.17, + "step": 16360, + "train_speed(iter/s)": 1.630795 + }, + { + "acc": 0.6303978, + "epoch": 0.415144596651446, + "grad_norm": 5.34375, + "learning_rate": 9.33640504136509e-06, + "loss": 1.71301994, + "memory(GiB)": 97.17, + "step": 16365, + "train_speed(iter/s)": 1.630847 + }, + { + "acc": 0.65087194, + "epoch": 0.41527143581938103, + "grad_norm": 5.84375, + "learning_rate": 9.335882921319845e-06, + "loss": 1.61233215, + "memory(GiB)": 97.17, + "step": 16370, + "train_speed(iter/s)": 1.630902 + }, + { + "acc": 0.66915307, + "epoch": 0.4153982749873161, + "grad_norm": 5.21875, + "learning_rate": 9.335360610563111e-06, + "loss": 1.62349396, + "memory(GiB)": 97.17, + "step": 16375, + "train_speed(iter/s)": 1.630954 + }, + { + "acc": 0.65744815, + "epoch": 0.4155251141552511, + "grad_norm": 7.40625, + "learning_rate": 9.33483810911786e-06, + "loss": 1.6356617, + "memory(GiB)": 97.17, + "step": 16380, + "train_speed(iter/s)": 1.631004 + }, + { + "acc": 0.63689585, + "epoch": 0.4156519533231862, + "grad_norm": 5.71875, + "learning_rate": 9.334315417007079e-06, + "loss": 1.69260674, + "memory(GiB)": 97.17, + "step": 16385, + "train_speed(iter/s)": 1.631052 + }, + { + "acc": 0.64266686, + "epoch": 0.41577879249112126, + "grad_norm": 4.96875, + "learning_rate": 9.333792534253751e-06, + "loss": 1.63932495, + "memory(GiB)": 97.17, + "step": 16390, + "train_speed(iter/s)": 1.631106 + }, + { + "acc": 0.64490991, + "epoch": 0.4159056316590563, + "grad_norm": 5.40625, + "learning_rate": 9.333269460880879e-06, + "loss": 1.62875023, + "memory(GiB)": 97.17, + "step": 16395, + "train_speed(iter/s)": 1.631158 + }, + { + "acc": 0.65804968, + "epoch": 0.41603247082699135, + "grad_norm": 7.375, + "learning_rate": 9.33274619691147e-06, + "loss": 1.5533864, + "memory(GiB)": 97.17, + "step": 16400, + "train_speed(iter/s)": 1.631211 + }, + { + "acc": 0.649788, + "epoch": 0.41615930999492645, + "grad_norm": 6.125, + "learning_rate": 9.332222742368537e-06, + "loss": 1.58417892, + "memory(GiB)": 97.17, + "step": 16405, + "train_speed(iter/s)": 1.631264 + }, + { + "acc": 0.64279757, + "epoch": 0.4162861491628615, + "grad_norm": 5.25, + "learning_rate": 9.331699097275108e-06, + "loss": 1.66128044, + "memory(GiB)": 97.17, + "step": 16410, + "train_speed(iter/s)": 1.631319 + }, + { + "acc": 0.65956573, + "epoch": 0.41641298833079654, + "grad_norm": 7.875, + "learning_rate": 9.331175261654213e-06, + "loss": 1.61678085, + "memory(GiB)": 97.17, + "step": 16415, + "train_speed(iter/s)": 1.631374 + }, + { + "acc": 0.64912167, + "epoch": 0.4165398274987316, + "grad_norm": 5.4375, + "learning_rate": 9.330651235528891e-06, + "loss": 1.64932976, + "memory(GiB)": 97.17, + "step": 16420, + "train_speed(iter/s)": 1.631424 + }, + { + "acc": 0.64392099, + "epoch": 0.4166666666666667, + "grad_norm": 5.375, + "learning_rate": 9.330127018922195e-06, + "loss": 1.66445293, + "memory(GiB)": 97.17, + "step": 16425, + "train_speed(iter/s)": 1.631475 + }, + { + "acc": 0.66941986, + "epoch": 0.41679350583460173, + "grad_norm": 5.5, + "learning_rate": 9.329602611857179e-06, + "loss": 1.53443508, + "memory(GiB)": 97.17, + "step": 16430, + "train_speed(iter/s)": 1.631525 + }, + { + "acc": 0.65015574, + "epoch": 0.4169203450025368, + "grad_norm": 5.875, + "learning_rate": 9.329078014356909e-06, + "loss": 1.59227047, + "memory(GiB)": 97.17, + "step": 16435, + "train_speed(iter/s)": 1.631578 + }, + { + "acc": 0.67195711, + "epoch": 0.4170471841704718, + "grad_norm": 6.21875, + "learning_rate": 9.32855322644446e-06, + "loss": 1.52868013, + "memory(GiB)": 97.17, + "step": 16440, + "train_speed(iter/s)": 1.63163 + }, + { + "acc": 0.65971193, + "epoch": 0.4171740233384069, + "grad_norm": 5.78125, + "learning_rate": 9.328028248142916e-06, + "loss": 1.55929604, + "memory(GiB)": 97.17, + "step": 16445, + "train_speed(iter/s)": 1.63168 + }, + { + "acc": 0.64351778, + "epoch": 0.41730086250634196, + "grad_norm": 4.71875, + "learning_rate": 9.327503079475365e-06, + "loss": 1.67029533, + "memory(GiB)": 97.17, + "step": 16450, + "train_speed(iter/s)": 1.631732 + }, + { + "acc": 0.64811516, + "epoch": 0.417427701674277, + "grad_norm": 6.125, + "learning_rate": 9.326977720464908e-06, + "loss": 1.61634636, + "memory(GiB)": 97.17, + "step": 16455, + "train_speed(iter/s)": 1.631787 + }, + { + "acc": 0.65875635, + "epoch": 0.41755454084221205, + "grad_norm": 6.1875, + "learning_rate": 9.326452171134652e-06, + "loss": 1.61829872, + "memory(GiB)": 97.17, + "step": 16460, + "train_speed(iter/s)": 1.631836 + }, + { + "acc": 0.63802261, + "epoch": 0.41768138001014715, + "grad_norm": 6.5, + "learning_rate": 9.325926431507714e-06, + "loss": 1.64991493, + "memory(GiB)": 97.17, + "step": 16465, + "train_speed(iter/s)": 1.631886 + }, + { + "acc": 0.65828586, + "epoch": 0.4178082191780822, + "grad_norm": 5.625, + "learning_rate": 9.325400501607218e-06, + "loss": 1.61011219, + "memory(GiB)": 97.17, + "step": 16470, + "train_speed(iter/s)": 1.631935 + }, + { + "acc": 0.64187222, + "epoch": 0.41793505834601724, + "grad_norm": 6.125, + "learning_rate": 9.324874381456295e-06, + "loss": 1.66432953, + "memory(GiB)": 97.17, + "step": 16475, + "train_speed(iter/s)": 1.631986 + }, + { + "acc": 0.65641108, + "epoch": 0.4180618975139523, + "grad_norm": 7.1875, + "learning_rate": 9.324348071078088e-06, + "loss": 1.60426331, + "memory(GiB)": 97.17, + "step": 16480, + "train_speed(iter/s)": 1.632041 + }, + { + "acc": 0.64060287, + "epoch": 0.4181887366818874, + "grad_norm": 5.15625, + "learning_rate": 9.323821570495748e-06, + "loss": 1.66343651, + "memory(GiB)": 97.17, + "step": 16485, + "train_speed(iter/s)": 1.63209 + }, + { + "acc": 0.64553232, + "epoch": 0.41831557584982243, + "grad_norm": 5.75, + "learning_rate": 9.32329487973243e-06, + "loss": 1.63939247, + "memory(GiB)": 97.17, + "step": 16490, + "train_speed(iter/s)": 1.63214 + }, + { + "acc": 0.66361637, + "epoch": 0.4184424150177575, + "grad_norm": 5.5, + "learning_rate": 9.3227679988113e-06, + "loss": 1.63096085, + "memory(GiB)": 97.17, + "step": 16495, + "train_speed(iter/s)": 1.632191 + }, + { + "acc": 0.66125531, + "epoch": 0.4185692541856925, + "grad_norm": 6.5625, + "learning_rate": 9.322240927755534e-06, + "loss": 1.60955887, + "memory(GiB)": 97.17, + "step": 16500, + "train_speed(iter/s)": 1.632239 + }, + { + "acc": 0.64777756, + "epoch": 0.4186960933536276, + "grad_norm": 7.4375, + "learning_rate": 9.321713666588314e-06, + "loss": 1.62595806, + "memory(GiB)": 97.17, + "step": 16505, + "train_speed(iter/s)": 1.632289 + }, + { + "acc": 0.65439548, + "epoch": 0.41882293252156266, + "grad_norm": 5.1875, + "learning_rate": 9.321186215332833e-06, + "loss": 1.67922421, + "memory(GiB)": 97.17, + "step": 16510, + "train_speed(iter/s)": 1.632342 + }, + { + "acc": 0.65504904, + "epoch": 0.4189497716894977, + "grad_norm": 5.25, + "learning_rate": 9.320658574012289e-06, + "loss": 1.63084793, + "memory(GiB)": 97.17, + "step": 16515, + "train_speed(iter/s)": 1.632397 + }, + { + "acc": 0.65674686, + "epoch": 0.41907661085743275, + "grad_norm": 5.5, + "learning_rate": 9.32013074264989e-06, + "loss": 1.61073532, + "memory(GiB)": 97.17, + "step": 16520, + "train_speed(iter/s)": 1.632445 + }, + { + "acc": 0.65759077, + "epoch": 0.41920345002536785, + "grad_norm": 6.09375, + "learning_rate": 9.319602721268853e-06, + "loss": 1.59918833, + "memory(GiB)": 97.17, + "step": 16525, + "train_speed(iter/s)": 1.632498 + }, + { + "acc": 0.6814713, + "epoch": 0.4193302891933029, + "grad_norm": 5.15625, + "learning_rate": 9.319074509892403e-06, + "loss": 1.53227873, + "memory(GiB)": 97.17, + "step": 16530, + "train_speed(iter/s)": 1.632549 + }, + { + "acc": 0.65880942, + "epoch": 0.41945712836123794, + "grad_norm": 4.9375, + "learning_rate": 9.318546108543774e-06, + "loss": 1.55095558, + "memory(GiB)": 97.17, + "step": 16535, + "train_speed(iter/s)": 1.632603 + }, + { + "acc": 0.63179932, + "epoch": 0.419583967529173, + "grad_norm": 5.5, + "learning_rate": 9.318017517246205e-06, + "loss": 1.68021927, + "memory(GiB)": 97.17, + "step": 16540, + "train_speed(iter/s)": 1.632653 + }, + { + "acc": 0.6292191, + "epoch": 0.4197108066971081, + "grad_norm": 6.53125, + "learning_rate": 9.317488736022948e-06, + "loss": 1.69453812, + "memory(GiB)": 97.17, + "step": 16545, + "train_speed(iter/s)": 1.632705 + }, + { + "acc": 0.65382562, + "epoch": 0.41983764586504313, + "grad_norm": 6.75, + "learning_rate": 9.316959764897259e-06, + "loss": 1.64304657, + "memory(GiB)": 97.17, + "step": 16550, + "train_speed(iter/s)": 1.632759 + }, + { + "acc": 0.64357138, + "epoch": 0.4199644850329782, + "grad_norm": 5.875, + "learning_rate": 9.316430603892406e-06, + "loss": 1.61357231, + "memory(GiB)": 97.17, + "step": 16555, + "train_speed(iter/s)": 1.632814 + }, + { + "acc": 0.64302549, + "epoch": 0.4200913242009132, + "grad_norm": 4.9375, + "learning_rate": 9.315901253031663e-06, + "loss": 1.64227886, + "memory(GiB)": 97.17, + "step": 16560, + "train_speed(iter/s)": 1.632866 + }, + { + "acc": 0.64157295, + "epoch": 0.4202181633688483, + "grad_norm": 6.6875, + "learning_rate": 9.315371712338315e-06, + "loss": 1.7676487, + "memory(GiB)": 97.17, + "step": 16565, + "train_speed(iter/s)": 1.632919 + }, + { + "acc": 0.65801296, + "epoch": 0.42034500253678336, + "grad_norm": 5.40625, + "learning_rate": 9.314841981835652e-06, + "loss": 1.59332809, + "memory(GiB)": 97.17, + "step": 16570, + "train_speed(iter/s)": 1.632974 + }, + { + "acc": 0.64057236, + "epoch": 0.4204718417047184, + "grad_norm": 5.75, + "learning_rate": 9.314312061546974e-06, + "loss": 1.6918026, + "memory(GiB)": 97.17, + "step": 16575, + "train_speed(iter/s)": 1.633024 + }, + { + "acc": 0.65233259, + "epoch": 0.42059868087265345, + "grad_norm": 4.5625, + "learning_rate": 9.313781951495588e-06, + "loss": 1.66234646, + "memory(GiB)": 97.17, + "step": 16580, + "train_speed(iter/s)": 1.633077 + }, + { + "acc": 0.65132189, + "epoch": 0.42072552004058855, + "grad_norm": 5.46875, + "learning_rate": 9.313251651704816e-06, + "loss": 1.64941978, + "memory(GiB)": 97.17, + "step": 16585, + "train_speed(iter/s)": 1.63313 + }, + { + "acc": 0.6595892, + "epoch": 0.4208523592085236, + "grad_norm": 5.0625, + "learning_rate": 9.312721162197975e-06, + "loss": 1.60508728, + "memory(GiB)": 97.17, + "step": 16590, + "train_speed(iter/s)": 1.633182 + }, + { + "acc": 0.66196313, + "epoch": 0.42097919837645864, + "grad_norm": 5.25, + "learning_rate": 9.312190482998405e-06, + "loss": 1.59453812, + "memory(GiB)": 97.17, + "step": 16595, + "train_speed(iter/s)": 1.633241 + }, + { + "acc": 0.64996533, + "epoch": 0.4211060375443937, + "grad_norm": 5.65625, + "learning_rate": 9.311659614129443e-06, + "loss": 1.66943264, + "memory(GiB)": 97.17, + "step": 16600, + "train_speed(iter/s)": 1.633295 + }, + { + "acc": 0.64626307, + "epoch": 0.4212328767123288, + "grad_norm": 6.125, + "learning_rate": 9.311128555614443e-06, + "loss": 1.65645447, + "memory(GiB)": 97.17, + "step": 16605, + "train_speed(iter/s)": 1.633353 + }, + { + "acc": 0.65463581, + "epoch": 0.42135971588026383, + "grad_norm": 5.0, + "learning_rate": 9.31059730747676e-06, + "loss": 1.55136127, + "memory(GiB)": 97.17, + "step": 16610, + "train_speed(iter/s)": 1.633404 + }, + { + "acc": 0.65890212, + "epoch": 0.4214865550481989, + "grad_norm": 5.5, + "learning_rate": 9.310065869739763e-06, + "loss": 1.6205616, + "memory(GiB)": 97.17, + "step": 16615, + "train_speed(iter/s)": 1.633459 + }, + { + "acc": 0.6442235, + "epoch": 0.4216133942161339, + "grad_norm": 5.34375, + "learning_rate": 9.309534242426826e-06, + "loss": 1.59124451, + "memory(GiB)": 97.17, + "step": 16620, + "train_speed(iter/s)": 1.633512 + }, + { + "acc": 0.65964456, + "epoch": 0.421740233384069, + "grad_norm": 4.78125, + "learning_rate": 9.30900242556133e-06, + "loss": 1.58672934, + "memory(GiB)": 97.17, + "step": 16625, + "train_speed(iter/s)": 1.633566 + }, + { + "acc": 0.65840259, + "epoch": 0.42186707255200406, + "grad_norm": 6.15625, + "learning_rate": 9.308470419166672e-06, + "loss": 1.55596886, + "memory(GiB)": 97.17, + "step": 16630, + "train_speed(iter/s)": 1.633617 + }, + { + "acc": 0.64595361, + "epoch": 0.4219939117199391, + "grad_norm": 5.28125, + "learning_rate": 9.307938223266247e-06, + "loss": 1.63031864, + "memory(GiB)": 97.17, + "step": 16635, + "train_speed(iter/s)": 1.633672 + }, + { + "acc": 0.63774099, + "epoch": 0.42212075088787415, + "grad_norm": 4.59375, + "learning_rate": 9.307405837883467e-06, + "loss": 1.70536747, + "memory(GiB)": 97.17, + "step": 16640, + "train_speed(iter/s)": 1.633725 + }, + { + "acc": 0.64718413, + "epoch": 0.42224759005580925, + "grad_norm": 5.25, + "learning_rate": 9.306873263041745e-06, + "loss": 1.62035789, + "memory(GiB)": 97.17, + "step": 16645, + "train_speed(iter/s)": 1.633774 + }, + { + "acc": 0.65107799, + "epoch": 0.4223744292237443, + "grad_norm": 6.625, + "learning_rate": 9.30634049876451e-06, + "loss": 1.6117794, + "memory(GiB)": 97.17, + "step": 16650, + "train_speed(iter/s)": 1.633829 + }, + { + "acc": 0.64570847, + "epoch": 0.42250126839167934, + "grad_norm": 5.96875, + "learning_rate": 9.305807545075194e-06, + "loss": 1.60762367, + "memory(GiB)": 97.17, + "step": 16655, + "train_speed(iter/s)": 1.633881 + }, + { + "acc": 0.65536709, + "epoch": 0.4226281075596144, + "grad_norm": 4.78125, + "learning_rate": 9.305274401997237e-06, + "loss": 1.62804165, + "memory(GiB)": 97.17, + "step": 16660, + "train_speed(iter/s)": 1.633935 + }, + { + "acc": 0.65016775, + "epoch": 0.4227549467275495, + "grad_norm": 5.3125, + "learning_rate": 9.304741069554088e-06, + "loss": 1.59783401, + "memory(GiB)": 97.17, + "step": 16665, + "train_speed(iter/s)": 1.633985 + }, + { + "acc": 0.66279917, + "epoch": 0.42288178589548453, + "grad_norm": 5.28125, + "learning_rate": 9.304207547769211e-06, + "loss": 1.57599201, + "memory(GiB)": 97.17, + "step": 16670, + "train_speed(iter/s)": 1.634035 + }, + { + "acc": 0.66747637, + "epoch": 0.4230086250634196, + "grad_norm": 6.46875, + "learning_rate": 9.303673836666067e-06, + "loss": 1.61142273, + "memory(GiB)": 97.17, + "step": 16675, + "train_speed(iter/s)": 1.634087 + }, + { + "acc": 0.64322491, + "epoch": 0.4231354642313546, + "grad_norm": 5.25, + "learning_rate": 9.303139936268133e-06, + "loss": 1.66456013, + "memory(GiB)": 97.17, + "step": 16680, + "train_speed(iter/s)": 1.634144 + }, + { + "acc": 0.63724728, + "epoch": 0.4232623033992897, + "grad_norm": 5.25, + "learning_rate": 9.302605846598894e-06, + "loss": 1.6882061, + "memory(GiB)": 97.17, + "step": 16685, + "train_speed(iter/s)": 1.634198 + }, + { + "acc": 0.64323635, + "epoch": 0.42338914256722476, + "grad_norm": 6.6875, + "learning_rate": 9.30207156768184e-06, + "loss": 1.62905502, + "memory(GiB)": 97.17, + "step": 16690, + "train_speed(iter/s)": 1.634252 + }, + { + "acc": 0.65594854, + "epoch": 0.4235159817351598, + "grad_norm": 6.09375, + "learning_rate": 9.30153709954047e-06, + "loss": 1.56245403, + "memory(GiB)": 97.17, + "step": 16695, + "train_speed(iter/s)": 1.634306 + }, + { + "acc": 0.65969977, + "epoch": 0.42364282090309485, + "grad_norm": 5.0, + "learning_rate": 9.301002442198294e-06, + "loss": 1.55928516, + "memory(GiB)": 97.17, + "step": 16700, + "train_speed(iter/s)": 1.634358 + }, + { + "acc": 0.65157671, + "epoch": 0.42376966007102995, + "grad_norm": 5.8125, + "learning_rate": 9.300467595678829e-06, + "loss": 1.6262455, + "memory(GiB)": 97.17, + "step": 16705, + "train_speed(iter/s)": 1.634412 + }, + { + "acc": 0.64952097, + "epoch": 0.423896499238965, + "grad_norm": 7.09375, + "learning_rate": 9.299932560005596e-06, + "loss": 1.65484352, + "memory(GiB)": 97.17, + "step": 16710, + "train_speed(iter/s)": 1.634467 + }, + { + "acc": 0.65549059, + "epoch": 0.42402333840690004, + "grad_norm": 5.90625, + "learning_rate": 9.299397335202133e-06, + "loss": 1.64444237, + "memory(GiB)": 97.17, + "step": 16715, + "train_speed(iter/s)": 1.634521 + }, + { + "acc": 0.66014671, + "epoch": 0.4241501775748351, + "grad_norm": 5.90625, + "learning_rate": 9.29886192129198e-06, + "loss": 1.61765747, + "memory(GiB)": 97.17, + "step": 16720, + "train_speed(iter/s)": 1.634571 + }, + { + "acc": 0.63968897, + "epoch": 0.4242770167427702, + "grad_norm": 6.15625, + "learning_rate": 9.298326318298688e-06, + "loss": 1.656534, + "memory(GiB)": 97.17, + "step": 16725, + "train_speed(iter/s)": 1.634623 + }, + { + "acc": 0.647507, + "epoch": 0.42440385591070523, + "grad_norm": 5.46875, + "learning_rate": 9.29779052624581e-06, + "loss": 1.58004341, + "memory(GiB)": 97.17, + "step": 16730, + "train_speed(iter/s)": 1.634678 + }, + { + "acc": 0.65747089, + "epoch": 0.4245306950786403, + "grad_norm": 6.8125, + "learning_rate": 9.29725454515692e-06, + "loss": 1.62379761, + "memory(GiB)": 97.17, + "step": 16735, + "train_speed(iter/s)": 1.634732 + }, + { + "acc": 0.65892248, + "epoch": 0.4246575342465753, + "grad_norm": 5.625, + "learning_rate": 9.296718375055587e-06, + "loss": 1.54168205, + "memory(GiB)": 97.17, + "step": 16740, + "train_speed(iter/s)": 1.634786 + }, + { + "acc": 0.65943336, + "epoch": 0.4247843734145104, + "grad_norm": 5.5, + "learning_rate": 9.296182015965399e-06, + "loss": 1.63737831, + "memory(GiB)": 97.17, + "step": 16745, + "train_speed(iter/s)": 1.634839 + }, + { + "acc": 0.65610476, + "epoch": 0.42491121258244546, + "grad_norm": 7.71875, + "learning_rate": 9.295645467909942e-06, + "loss": 1.60306702, + "memory(GiB)": 97.17, + "step": 16750, + "train_speed(iter/s)": 1.634894 + }, + { + "acc": 0.64639149, + "epoch": 0.4250380517503805, + "grad_norm": 5.25, + "learning_rate": 9.29510873091282e-06, + "loss": 1.55421333, + "memory(GiB)": 97.17, + "step": 16755, + "train_speed(iter/s)": 1.634946 + }, + { + "acc": 0.64904027, + "epoch": 0.42516489091831555, + "grad_norm": 6.28125, + "learning_rate": 9.29457180499764e-06, + "loss": 1.66660004, + "memory(GiB)": 97.17, + "step": 16760, + "train_speed(iter/s)": 1.635001 + }, + { + "acc": 0.64320278, + "epoch": 0.42529173008625065, + "grad_norm": 5.71875, + "learning_rate": 9.294034690188016e-06, + "loss": 1.63236961, + "memory(GiB)": 97.17, + "step": 16765, + "train_speed(iter/s)": 1.635054 + }, + { + "acc": 0.64747686, + "epoch": 0.4254185692541857, + "grad_norm": 5.0, + "learning_rate": 9.293497386507577e-06, + "loss": 1.66572227, + "memory(GiB)": 97.17, + "step": 16770, + "train_speed(iter/s)": 1.635108 + }, + { + "acc": 0.66596174, + "epoch": 0.42554540842212074, + "grad_norm": 6.28125, + "learning_rate": 9.292959893979953e-06, + "loss": 1.60522346, + "memory(GiB)": 97.17, + "step": 16775, + "train_speed(iter/s)": 1.635163 + }, + { + "acc": 0.65274973, + "epoch": 0.4256722475900558, + "grad_norm": 5.25, + "learning_rate": 9.292422212628786e-06, + "loss": 1.65038452, + "memory(GiB)": 97.17, + "step": 16780, + "train_speed(iter/s)": 1.635218 + }, + { + "acc": 0.65924911, + "epoch": 0.4257990867579909, + "grad_norm": 5.40625, + "learning_rate": 9.291884342477728e-06, + "loss": 1.62604218, + "memory(GiB)": 97.17, + "step": 16785, + "train_speed(iter/s)": 1.635269 + }, + { + "acc": 0.65498648, + "epoch": 0.42592592592592593, + "grad_norm": 5.21875, + "learning_rate": 9.291346283550433e-06, + "loss": 1.58802414, + "memory(GiB)": 97.17, + "step": 16790, + "train_speed(iter/s)": 1.635325 + }, + { + "acc": 0.64843564, + "epoch": 0.426052765093861, + "grad_norm": 6.40625, + "learning_rate": 9.290808035870569e-06, + "loss": 1.67216606, + "memory(GiB)": 97.17, + "step": 16795, + "train_speed(iter/s)": 1.635381 + }, + { + "acc": 0.64500399, + "epoch": 0.426179604261796, + "grad_norm": 6.09375, + "learning_rate": 9.29026959946181e-06, + "loss": 1.68752384, + "memory(GiB)": 97.17, + "step": 16800, + "train_speed(iter/s)": 1.635432 + }, + { + "acc": 0.6609199, + "epoch": 0.4263064434297311, + "grad_norm": 4.875, + "learning_rate": 9.289730974347841e-06, + "loss": 1.60022697, + "memory(GiB)": 97.17, + "step": 16805, + "train_speed(iter/s)": 1.635486 + }, + { + "acc": 0.67437744, + "epoch": 0.42643328259766616, + "grad_norm": 5.5625, + "learning_rate": 9.28919216055235e-06, + "loss": 1.56150808, + "memory(GiB)": 97.17, + "step": 16810, + "train_speed(iter/s)": 1.635539 + }, + { + "acc": 0.66691504, + "epoch": 0.4265601217656012, + "grad_norm": 8.0625, + "learning_rate": 9.288653158099038e-06, + "loss": 1.54448881, + "memory(GiB)": 97.17, + "step": 16815, + "train_speed(iter/s)": 1.635592 + }, + { + "acc": 0.65150204, + "epoch": 0.42668696093353625, + "grad_norm": 5.375, + "learning_rate": 9.288113967011612e-06, + "loss": 1.65020218, + "memory(GiB)": 97.17, + "step": 16820, + "train_speed(iter/s)": 1.635642 + }, + { + "acc": 0.66859961, + "epoch": 0.42681380010147135, + "grad_norm": 5.78125, + "learning_rate": 9.28757458731379e-06, + "loss": 1.57901583, + "memory(GiB)": 97.17, + "step": 16825, + "train_speed(iter/s)": 1.635694 + }, + { + "acc": 0.64868131, + "epoch": 0.4269406392694064, + "grad_norm": 5.59375, + "learning_rate": 9.287035019029295e-06, + "loss": 1.67384911, + "memory(GiB)": 97.17, + "step": 16830, + "train_speed(iter/s)": 1.635746 + }, + { + "acc": 0.65630088, + "epoch": 0.42706747843734144, + "grad_norm": 5.625, + "learning_rate": 9.286495262181859e-06, + "loss": 1.53766232, + "memory(GiB)": 97.17, + "step": 16835, + "train_speed(iter/s)": 1.6358 + }, + { + "acc": 0.64981813, + "epoch": 0.4271943176052765, + "grad_norm": 7.4375, + "learning_rate": 9.285955316795224e-06, + "loss": 1.66251774, + "memory(GiB)": 97.17, + "step": 16840, + "train_speed(iter/s)": 1.635856 + }, + { + "acc": 0.65139971, + "epoch": 0.4273211567732116, + "grad_norm": 7.84375, + "learning_rate": 9.285415182893138e-06, + "loss": 1.6517189, + "memory(GiB)": 97.17, + "step": 16845, + "train_speed(iter/s)": 1.635909 + }, + { + "acc": 0.65064836, + "epoch": 0.42744799594114663, + "grad_norm": 6.4375, + "learning_rate": 9.28487486049936e-06, + "loss": 1.62671642, + "memory(GiB)": 97.17, + "step": 16850, + "train_speed(iter/s)": 1.635964 + }, + { + "acc": 0.65457125, + "epoch": 0.4275748351090817, + "grad_norm": 7.90625, + "learning_rate": 9.284334349637655e-06, + "loss": 1.68887558, + "memory(GiB)": 97.17, + "step": 16855, + "train_speed(iter/s)": 1.636019 + }, + { + "acc": 0.66219091, + "epoch": 0.4277016742770167, + "grad_norm": 5.46875, + "learning_rate": 9.283793650331798e-06, + "loss": 1.56025734, + "memory(GiB)": 97.17, + "step": 16860, + "train_speed(iter/s)": 1.636073 + }, + { + "acc": 0.66432199, + "epoch": 0.4278285134449518, + "grad_norm": 5.625, + "learning_rate": 9.283252762605568e-06, + "loss": 1.6029665, + "memory(GiB)": 97.17, + "step": 16865, + "train_speed(iter/s)": 1.636124 + }, + { + "acc": 0.64984684, + "epoch": 0.42795535261288686, + "grad_norm": 5.375, + "learning_rate": 9.28271168648276e-06, + "loss": 1.64032516, + "memory(GiB)": 97.17, + "step": 16870, + "train_speed(iter/s)": 1.636177 + }, + { + "acc": 0.65118694, + "epoch": 0.4280821917808219, + "grad_norm": 6.9375, + "learning_rate": 9.282170421987171e-06, + "loss": 1.65879517, + "memory(GiB)": 97.17, + "step": 16875, + "train_speed(iter/s)": 1.636232 + }, + { + "acc": 0.64793534, + "epoch": 0.42820903094875695, + "grad_norm": 6.15625, + "learning_rate": 9.281628969142609e-06, + "loss": 1.73152161, + "memory(GiB)": 97.17, + "step": 16880, + "train_speed(iter/s)": 1.63629 + }, + { + "acc": 0.6323853, + "epoch": 0.42833587011669205, + "grad_norm": 5.90625, + "learning_rate": 9.281087327972886e-06, + "loss": 1.70044098, + "memory(GiB)": 97.17, + "step": 16885, + "train_speed(iter/s)": 1.636345 + }, + { + "acc": 0.65641103, + "epoch": 0.4284627092846271, + "grad_norm": 5.875, + "learning_rate": 9.280545498501832e-06, + "loss": 1.56100769, + "memory(GiB)": 97.17, + "step": 16890, + "train_speed(iter/s)": 1.636401 + }, + { + "acc": 0.64588432, + "epoch": 0.42858954845256214, + "grad_norm": 5.46875, + "learning_rate": 9.280003480753274e-06, + "loss": 1.68182926, + "memory(GiB)": 97.17, + "step": 16895, + "train_speed(iter/s)": 1.636457 + }, + { + "acc": 0.63259392, + "epoch": 0.4287163876204972, + "grad_norm": 5.53125, + "learning_rate": 9.279461274751054e-06, + "loss": 1.72251854, + "memory(GiB)": 97.17, + "step": 16900, + "train_speed(iter/s)": 1.636513 + }, + { + "acc": 0.64388452, + "epoch": 0.4288432267884323, + "grad_norm": 5.4375, + "learning_rate": 9.27891888051902e-06, + "loss": 1.63054733, + "memory(GiB)": 97.17, + "step": 16905, + "train_speed(iter/s)": 1.636564 + }, + { + "acc": 0.65210323, + "epoch": 0.42897006595636733, + "grad_norm": 5.90625, + "learning_rate": 9.278376298081032e-06, + "loss": 1.6496582, + "memory(GiB)": 97.17, + "step": 16910, + "train_speed(iter/s)": 1.636621 + }, + { + "acc": 0.65076361, + "epoch": 0.4290969051243024, + "grad_norm": 5.78125, + "learning_rate": 9.277833527460952e-06, + "loss": 1.6364645, + "memory(GiB)": 97.17, + "step": 16915, + "train_speed(iter/s)": 1.636679 + }, + { + "acc": 0.66106238, + "epoch": 0.4292237442922374, + "grad_norm": 5.8125, + "learning_rate": 9.277290568682653e-06, + "loss": 1.63757401, + "memory(GiB)": 97.17, + "step": 16920, + "train_speed(iter/s)": 1.636733 + }, + { + "acc": 0.63203897, + "epoch": 0.4293505834601725, + "grad_norm": 5.59375, + "learning_rate": 9.27674742177002e-06, + "loss": 1.70032616, + "memory(GiB)": 97.17, + "step": 16925, + "train_speed(iter/s)": 1.636791 + }, + { + "acc": 0.6439805, + "epoch": 0.42947742262810756, + "grad_norm": 5.21875, + "learning_rate": 9.27620408674694e-06, + "loss": 1.59769163, + "memory(GiB)": 97.17, + "step": 16930, + "train_speed(iter/s)": 1.636845 + }, + { + "acc": 0.64007516, + "epoch": 0.4296042617960426, + "grad_norm": 10.0625, + "learning_rate": 9.275660563637313e-06, + "loss": 1.62902184, + "memory(GiB)": 97.17, + "step": 16935, + "train_speed(iter/s)": 1.636897 + }, + { + "acc": 0.65972977, + "epoch": 0.42973110096397765, + "grad_norm": 5.5625, + "learning_rate": 9.275116852465043e-06, + "loss": 1.61088524, + "memory(GiB)": 97.17, + "step": 16940, + "train_speed(iter/s)": 1.636954 + }, + { + "acc": 0.6521903, + "epoch": 0.42985794013191275, + "grad_norm": 6.375, + "learning_rate": 9.274572953254048e-06, + "loss": 1.58857517, + "memory(GiB)": 97.17, + "step": 16945, + "train_speed(iter/s)": 1.637008 + }, + { + "acc": 0.6719595, + "epoch": 0.4299847792998478, + "grad_norm": 7.125, + "learning_rate": 9.27402886602825e-06, + "loss": 1.49877234, + "memory(GiB)": 97.17, + "step": 16950, + "train_speed(iter/s)": 1.637063 + }, + { + "acc": 0.64124508, + "epoch": 0.43011161846778284, + "grad_norm": 6.125, + "learning_rate": 9.27348459081158e-06, + "loss": 1.65098991, + "memory(GiB)": 97.17, + "step": 16955, + "train_speed(iter/s)": 1.637116 + }, + { + "acc": 0.65731564, + "epoch": 0.4302384576357179, + "grad_norm": 5.0, + "learning_rate": 9.272940127627979e-06, + "loss": 1.60240383, + "memory(GiB)": 97.17, + "step": 16960, + "train_speed(iter/s)": 1.637172 + }, + { + "acc": 0.65151868, + "epoch": 0.430365296803653, + "grad_norm": 6.53125, + "learning_rate": 9.272395476501392e-06, + "loss": 1.6307972, + "memory(GiB)": 97.17, + "step": 16965, + "train_speed(iter/s)": 1.637231 + }, + { + "acc": 0.63347569, + "epoch": 0.43049213597158803, + "grad_norm": 4.59375, + "learning_rate": 9.27185063745578e-06, + "loss": 1.71275272, + "memory(GiB)": 97.17, + "step": 16970, + "train_speed(iter/s)": 1.637282 + }, + { + "acc": 0.65819225, + "epoch": 0.4306189751395231, + "grad_norm": 5.3125, + "learning_rate": 9.271305610515103e-06, + "loss": 1.58094749, + "memory(GiB)": 97.17, + "step": 16975, + "train_speed(iter/s)": 1.637333 + }, + { + "acc": 0.65439162, + "epoch": 0.4307458143074581, + "grad_norm": 5.09375, + "learning_rate": 9.270760395703334e-06, + "loss": 1.65464859, + "memory(GiB)": 97.17, + "step": 16980, + "train_speed(iter/s)": 1.637385 + }, + { + "acc": 0.64985428, + "epoch": 0.4308726534753932, + "grad_norm": 5.625, + "learning_rate": 9.270214993044456e-06, + "loss": 1.67589989, + "memory(GiB)": 97.17, + "step": 16985, + "train_speed(iter/s)": 1.637438 + }, + { + "acc": 0.64864516, + "epoch": 0.43099949264332826, + "grad_norm": 5.53125, + "learning_rate": 9.269669402562458e-06, + "loss": 1.62970772, + "memory(GiB)": 97.17, + "step": 16990, + "train_speed(iter/s)": 1.637491 + }, + { + "acc": 0.66255465, + "epoch": 0.4311263318112633, + "grad_norm": 5.59375, + "learning_rate": 9.269123624281336e-06, + "loss": 1.60183716, + "memory(GiB)": 97.17, + "step": 16995, + "train_speed(iter/s)": 1.637544 + }, + { + "acc": 0.67056694, + "epoch": 0.43125317097919835, + "grad_norm": 4.96875, + "learning_rate": 9.268577658225097e-06, + "loss": 1.50912457, + "memory(GiB)": 97.17, + "step": 17000, + "train_speed(iter/s)": 1.637596 + }, + { + "epoch": 0.43125317097919835, + "eval_acc": 0.6428900485948832, + "eval_loss": 1.5926954746246338, + "eval_runtime": 58.4038, + "eval_samples_per_second": 109.068, + "eval_steps_per_second": 27.276, + "step": 17000 + }, + { + "acc": 0.66105733, + "epoch": 0.43138001014713345, + "grad_norm": 5.96875, + "learning_rate": 9.268031504417756e-06, + "loss": 1.60161285, + "memory(GiB)": 97.17, + "step": 17005, + "train_speed(iter/s)": 1.627824 + }, + { + "acc": 0.65026212, + "epoch": 0.4315068493150685, + "grad_norm": 6.59375, + "learning_rate": 9.267485162883334e-06, + "loss": 1.62716045, + "memory(GiB)": 97.17, + "step": 17010, + "train_speed(iter/s)": 1.627868 + }, + { + "acc": 0.63613806, + "epoch": 0.43163368848300354, + "grad_norm": 6.0, + "learning_rate": 9.266938633645861e-06, + "loss": 1.71146259, + "memory(GiB)": 97.17, + "step": 17015, + "train_speed(iter/s)": 1.627925 + }, + { + "acc": 0.66908288, + "epoch": 0.4317605276509386, + "grad_norm": 8.0625, + "learning_rate": 9.266391916729376e-06, + "loss": 1.5233408, + "memory(GiB)": 97.17, + "step": 17020, + "train_speed(iter/s)": 1.62798 + }, + { + "acc": 0.64691744, + "epoch": 0.4318873668188737, + "grad_norm": 5.59375, + "learning_rate": 9.265845012157926e-06, + "loss": 1.65524063, + "memory(GiB)": 97.17, + "step": 17025, + "train_speed(iter/s)": 1.628033 + }, + { + "acc": 0.64269838, + "epoch": 0.43201420598680873, + "grad_norm": 6.6875, + "learning_rate": 9.265297919955566e-06, + "loss": 1.61516838, + "memory(GiB)": 97.17, + "step": 17030, + "train_speed(iter/s)": 1.628084 + }, + { + "acc": 0.64917049, + "epoch": 0.4321410451547438, + "grad_norm": 5.875, + "learning_rate": 9.264750640146363e-06, + "loss": 1.64190254, + "memory(GiB)": 97.17, + "step": 17035, + "train_speed(iter/s)": 1.628141 + }, + { + "acc": 0.65181785, + "epoch": 0.4322678843226788, + "grad_norm": 4.375, + "learning_rate": 9.264203172754384e-06, + "loss": 1.61965599, + "memory(GiB)": 97.17, + "step": 17040, + "train_speed(iter/s)": 1.628193 + }, + { + "acc": 0.65951385, + "epoch": 0.4323947234906139, + "grad_norm": 5.875, + "learning_rate": 9.263655517803713e-06, + "loss": 1.64219055, + "memory(GiB)": 97.17, + "step": 17045, + "train_speed(iter/s)": 1.628244 + }, + { + "acc": 0.64553347, + "epoch": 0.43252156265854896, + "grad_norm": 4.8125, + "learning_rate": 9.263107675318434e-06, + "loss": 1.68419914, + "memory(GiB)": 97.17, + "step": 17050, + "train_speed(iter/s)": 1.628294 + }, + { + "acc": 0.66039534, + "epoch": 0.432648401826484, + "grad_norm": 5.125, + "learning_rate": 9.262559645322648e-06, + "loss": 1.5950983, + "memory(GiB)": 97.17, + "step": 17055, + "train_speed(iter/s)": 1.628347 + }, + { + "acc": 0.65618653, + "epoch": 0.43277524099441905, + "grad_norm": 5.78125, + "learning_rate": 9.262011427840459e-06, + "loss": 1.61158066, + "memory(GiB)": 97.17, + "step": 17060, + "train_speed(iter/s)": 1.628398 + }, + { + "acc": 0.66023817, + "epoch": 0.43290208016235415, + "grad_norm": 6.375, + "learning_rate": 9.261463022895976e-06, + "loss": 1.5364913, + "memory(GiB)": 97.17, + "step": 17065, + "train_speed(iter/s)": 1.628454 + }, + { + "acc": 0.66091561, + "epoch": 0.4330289193302892, + "grad_norm": 6.125, + "learning_rate": 9.260914430513325e-06, + "loss": 1.6025919, + "memory(GiB)": 97.17, + "step": 17070, + "train_speed(iter/s)": 1.628506 + }, + { + "acc": 0.63900504, + "epoch": 0.43315575849822424, + "grad_norm": 5.4375, + "learning_rate": 9.260365650716632e-06, + "loss": 1.6923912, + "memory(GiB)": 97.17, + "step": 17075, + "train_speed(iter/s)": 1.628559 + }, + { + "acc": 0.65349422, + "epoch": 0.4332825976661593, + "grad_norm": 6.1875, + "learning_rate": 9.259816683530038e-06, + "loss": 1.65538883, + "memory(GiB)": 97.17, + "step": 17080, + "train_speed(iter/s)": 1.628612 + }, + { + "acc": 0.66349411, + "epoch": 0.4334094368340944, + "grad_norm": 5.78125, + "learning_rate": 9.259267528977687e-06, + "loss": 1.5489666, + "memory(GiB)": 97.17, + "step": 17085, + "train_speed(iter/s)": 1.628666 + }, + { + "acc": 0.65586414, + "epoch": 0.43353627600202943, + "grad_norm": 5.53125, + "learning_rate": 9.258718187083734e-06, + "loss": 1.61791191, + "memory(GiB)": 97.17, + "step": 17090, + "train_speed(iter/s)": 1.628716 + }, + { + "acc": 0.660045, + "epoch": 0.4336631151699645, + "grad_norm": 6.125, + "learning_rate": 9.258168657872341e-06, + "loss": 1.59707642, + "memory(GiB)": 97.17, + "step": 17095, + "train_speed(iter/s)": 1.628769 + }, + { + "acc": 0.65517187, + "epoch": 0.4337899543378995, + "grad_norm": 5.6875, + "learning_rate": 9.25761894136768e-06, + "loss": 1.65323601, + "memory(GiB)": 97.17, + "step": 17100, + "train_speed(iter/s)": 1.628824 + }, + { + "acc": 0.64673848, + "epoch": 0.4339167935058346, + "grad_norm": 4.6875, + "learning_rate": 9.25706903759393e-06, + "loss": 1.64696388, + "memory(GiB)": 97.17, + "step": 17105, + "train_speed(iter/s)": 1.628881 + }, + { + "acc": 0.6620564, + "epoch": 0.43404363267376966, + "grad_norm": 5.84375, + "learning_rate": 9.256518946575274e-06, + "loss": 1.57205925, + "memory(GiB)": 97.17, + "step": 17110, + "train_speed(iter/s)": 1.628932 + }, + { + "acc": 0.65434246, + "epoch": 0.4341704718417047, + "grad_norm": 7.0, + "learning_rate": 9.255968668335912e-06, + "loss": 1.65409088, + "memory(GiB)": 97.17, + "step": 17115, + "train_speed(iter/s)": 1.628986 + }, + { + "acc": 0.65264697, + "epoch": 0.43429731100963975, + "grad_norm": 7.59375, + "learning_rate": 9.255418202900048e-06, + "loss": 1.59731989, + "memory(GiB)": 97.17, + "step": 17120, + "train_speed(iter/s)": 1.629038 + }, + { + "acc": 0.64172454, + "epoch": 0.43442415017757485, + "grad_norm": 5.34375, + "learning_rate": 9.254867550291891e-06, + "loss": 1.65201607, + "memory(GiB)": 97.17, + "step": 17125, + "train_speed(iter/s)": 1.629095 + }, + { + "acc": 0.65509071, + "epoch": 0.4345509893455099, + "grad_norm": 5.71875, + "learning_rate": 9.254316710535662e-06, + "loss": 1.55219383, + "memory(GiB)": 97.17, + "step": 17130, + "train_speed(iter/s)": 1.629151 + }, + { + "acc": 0.6626318, + "epoch": 0.43467782851344494, + "grad_norm": 6.65625, + "learning_rate": 9.253765683655591e-06, + "loss": 1.61080475, + "memory(GiB)": 97.17, + "step": 17135, + "train_speed(iter/s)": 1.629193 + }, + { + "acc": 0.6753911, + "epoch": 0.43480466768138, + "grad_norm": 6.65625, + "learning_rate": 9.253214469675913e-06, + "loss": 1.54352493, + "memory(GiB)": 97.17, + "step": 17140, + "train_speed(iter/s)": 1.629248 + }, + { + "acc": 0.66376657, + "epoch": 0.4349315068493151, + "grad_norm": 5.84375, + "learning_rate": 9.252663068620874e-06, + "loss": 1.59380894, + "memory(GiB)": 97.17, + "step": 17145, + "train_speed(iter/s)": 1.629304 + }, + { + "acc": 0.64371247, + "epoch": 0.43505834601725013, + "grad_norm": 4.78125, + "learning_rate": 9.252111480514726e-06, + "loss": 1.66468639, + "memory(GiB)": 97.17, + "step": 17150, + "train_speed(iter/s)": 1.629355 + }, + { + "acc": 0.65283666, + "epoch": 0.4351851851851852, + "grad_norm": 5.9375, + "learning_rate": 9.251559705381731e-06, + "loss": 1.66845894, + "memory(GiB)": 97.17, + "step": 17155, + "train_speed(iter/s)": 1.629409 + }, + { + "acc": 0.62987647, + "epoch": 0.4353120243531202, + "grad_norm": 6.625, + "learning_rate": 9.251007743246159e-06, + "loss": 1.68390102, + "memory(GiB)": 97.17, + "step": 17160, + "train_speed(iter/s)": 1.629466 + }, + { + "acc": 0.66484098, + "epoch": 0.4354388635210553, + "grad_norm": 7.375, + "learning_rate": 9.250455594132286e-06, + "loss": 1.6065155, + "memory(GiB)": 97.17, + "step": 17165, + "train_speed(iter/s)": 1.629522 + }, + { + "acc": 0.65618491, + "epoch": 0.43556570268899036, + "grad_norm": 6.0, + "learning_rate": 9.249903258064399e-06, + "loss": 1.66336746, + "memory(GiB)": 97.17, + "step": 17170, + "train_speed(iter/s)": 1.629576 + }, + { + "acc": 0.66518464, + "epoch": 0.4356925418569254, + "grad_norm": 5.65625, + "learning_rate": 9.249350735066792e-06, + "loss": 1.55276699, + "memory(GiB)": 97.17, + "step": 17175, + "train_speed(iter/s)": 1.629631 + }, + { + "acc": 0.63644385, + "epoch": 0.43581938102486045, + "grad_norm": 5.875, + "learning_rate": 9.248798025163768e-06, + "loss": 1.63725357, + "memory(GiB)": 97.17, + "step": 17180, + "train_speed(iter/s)": 1.629687 + }, + { + "acc": 0.66316242, + "epoch": 0.43594622019279555, + "grad_norm": 7.9375, + "learning_rate": 9.248245128379638e-06, + "loss": 1.56775265, + "memory(GiB)": 97.17, + "step": 17185, + "train_speed(iter/s)": 1.629742 + }, + { + "acc": 0.65070972, + "epoch": 0.4360730593607306, + "grad_norm": 5.4375, + "learning_rate": 9.24769204473872e-06, + "loss": 1.60765514, + "memory(GiB)": 97.17, + "step": 17190, + "train_speed(iter/s)": 1.629794 + }, + { + "acc": 0.65628014, + "epoch": 0.43619989852866564, + "grad_norm": 5.3125, + "learning_rate": 9.24713877426534e-06, + "loss": 1.58805046, + "memory(GiB)": 97.17, + "step": 17195, + "train_speed(iter/s)": 1.629845 + }, + { + "acc": 0.66126528, + "epoch": 0.4363267376966007, + "grad_norm": 5.09375, + "learning_rate": 9.246585316983837e-06, + "loss": 1.61996746, + "memory(GiB)": 97.17, + "step": 17200, + "train_speed(iter/s)": 1.629898 + }, + { + "acc": 0.63899622, + "epoch": 0.4364535768645358, + "grad_norm": 5.0625, + "learning_rate": 9.24603167291855e-06, + "loss": 1.69389324, + "memory(GiB)": 97.17, + "step": 17205, + "train_speed(iter/s)": 1.629951 + }, + { + "acc": 0.6473176, + "epoch": 0.43658041603247083, + "grad_norm": 5.65625, + "learning_rate": 9.245477842093832e-06, + "loss": 1.62571335, + "memory(GiB)": 97.17, + "step": 17210, + "train_speed(iter/s)": 1.630003 + }, + { + "acc": 0.64487653, + "epoch": 0.4367072552004059, + "grad_norm": 5.8125, + "learning_rate": 9.244923824534046e-06, + "loss": 1.62782249, + "memory(GiB)": 97.17, + "step": 17215, + "train_speed(iter/s)": 1.630059 + }, + { + "acc": 0.64285207, + "epoch": 0.4368340943683409, + "grad_norm": 6.34375, + "learning_rate": 9.244369620263558e-06, + "loss": 1.64927273, + "memory(GiB)": 97.17, + "step": 17220, + "train_speed(iter/s)": 1.630113 + }, + { + "acc": 0.65320253, + "epoch": 0.436960933536276, + "grad_norm": 5.78125, + "learning_rate": 9.243815229306746e-06, + "loss": 1.57669182, + "memory(GiB)": 97.17, + "step": 17225, + "train_speed(iter/s)": 1.630164 + }, + { + "acc": 0.64941335, + "epoch": 0.43708777270421106, + "grad_norm": 6.34375, + "learning_rate": 9.243260651687989e-06, + "loss": 1.64794197, + "memory(GiB)": 97.17, + "step": 17230, + "train_speed(iter/s)": 1.630217 + }, + { + "acc": 0.64359121, + "epoch": 0.4372146118721461, + "grad_norm": 5.34375, + "learning_rate": 9.242705887431685e-06, + "loss": 1.66660576, + "memory(GiB)": 97.17, + "step": 17235, + "train_speed(iter/s)": 1.630271 + }, + { + "acc": 0.65126886, + "epoch": 0.43734145104008115, + "grad_norm": 6.78125, + "learning_rate": 9.242150936562235e-06, + "loss": 1.6832901, + "memory(GiB)": 97.17, + "step": 17240, + "train_speed(iter/s)": 1.630324 + }, + { + "acc": 0.65410557, + "epoch": 0.43746829020801625, + "grad_norm": 5.0625, + "learning_rate": 9.241595799104046e-06, + "loss": 1.58133001, + "memory(GiB)": 97.17, + "step": 17245, + "train_speed(iter/s)": 1.630376 + }, + { + "acc": 0.65065088, + "epoch": 0.4375951293759513, + "grad_norm": 5.3125, + "learning_rate": 9.241040475081537e-06, + "loss": 1.63557606, + "memory(GiB)": 97.17, + "step": 17250, + "train_speed(iter/s)": 1.630427 + }, + { + "acc": 0.64462276, + "epoch": 0.43772196854388634, + "grad_norm": 5.28125, + "learning_rate": 9.240484964519131e-06, + "loss": 1.65170746, + "memory(GiB)": 97.17, + "step": 17255, + "train_speed(iter/s)": 1.630479 + }, + { + "acc": 0.65563655, + "epoch": 0.4378488077118214, + "grad_norm": 6.625, + "learning_rate": 9.239929267441267e-06, + "loss": 1.59051075, + "memory(GiB)": 97.17, + "step": 17260, + "train_speed(iter/s)": 1.630528 + }, + { + "acc": 0.66627102, + "epoch": 0.4379756468797565, + "grad_norm": 6.03125, + "learning_rate": 9.239373383872382e-06, + "loss": 1.6809948, + "memory(GiB)": 97.17, + "step": 17265, + "train_speed(iter/s)": 1.630583 + }, + { + "acc": 0.63808298, + "epoch": 0.43810248604769153, + "grad_norm": 5.0, + "learning_rate": 9.238817313836927e-06, + "loss": 1.59716835, + "memory(GiB)": 97.17, + "step": 17270, + "train_speed(iter/s)": 1.630633 + }, + { + "acc": 0.65963092, + "epoch": 0.4382293252156266, + "grad_norm": 6.53125, + "learning_rate": 9.238261057359365e-06, + "loss": 1.59871511, + "memory(GiB)": 97.17, + "step": 17275, + "train_speed(iter/s)": 1.630685 + }, + { + "acc": 0.6534812, + "epoch": 0.4383561643835616, + "grad_norm": 5.15625, + "learning_rate": 9.237704614464157e-06, + "loss": 1.63023701, + "memory(GiB)": 97.17, + "step": 17280, + "train_speed(iter/s)": 1.630739 + }, + { + "acc": 0.64379129, + "epoch": 0.4384830035514967, + "grad_norm": 4.96875, + "learning_rate": 9.237147985175781e-06, + "loss": 1.67299042, + "memory(GiB)": 97.17, + "step": 17285, + "train_speed(iter/s)": 1.630792 + }, + { + "acc": 0.65906954, + "epoch": 0.43860984271943176, + "grad_norm": 4.96875, + "learning_rate": 9.236591169518717e-06, + "loss": 1.61831512, + "memory(GiB)": 97.17, + "step": 17290, + "train_speed(iter/s)": 1.630841 + }, + { + "acc": 0.65493207, + "epoch": 0.4387366818873668, + "grad_norm": 5.75, + "learning_rate": 9.236034167517461e-06, + "loss": 1.63975716, + "memory(GiB)": 97.17, + "step": 17295, + "train_speed(iter/s)": 1.630893 + }, + { + "acc": 0.65858755, + "epoch": 0.43886352105530185, + "grad_norm": 7.75, + "learning_rate": 9.235476979196507e-06, + "loss": 1.61957054, + "memory(GiB)": 97.17, + "step": 17300, + "train_speed(iter/s)": 1.630945 + }, + { + "acc": 0.66521482, + "epoch": 0.43899036022323695, + "grad_norm": 5.78125, + "learning_rate": 9.234919604580368e-06, + "loss": 1.62613564, + "memory(GiB)": 97.17, + "step": 17305, + "train_speed(iter/s)": 1.630997 + }, + { + "acc": 0.66867867, + "epoch": 0.439117199391172, + "grad_norm": 5.28125, + "learning_rate": 9.234362043693556e-06, + "loss": 1.54498463, + "memory(GiB)": 97.17, + "step": 17310, + "train_speed(iter/s)": 1.631051 + }, + { + "acc": 0.64163628, + "epoch": 0.43924403855910704, + "grad_norm": 5.03125, + "learning_rate": 9.233804296560596e-06, + "loss": 1.63727856, + "memory(GiB)": 97.17, + "step": 17315, + "train_speed(iter/s)": 1.631102 + }, + { + "acc": 0.63664804, + "epoch": 0.4393708777270421, + "grad_norm": 5.875, + "learning_rate": 9.233246363206021e-06, + "loss": 1.70027809, + "memory(GiB)": 97.17, + "step": 17320, + "train_speed(iter/s)": 1.631157 + }, + { + "acc": 0.6491395, + "epoch": 0.4394977168949772, + "grad_norm": 5.53125, + "learning_rate": 9.232688243654371e-06, + "loss": 1.68130474, + "memory(GiB)": 97.17, + "step": 17325, + "train_speed(iter/s)": 1.631211 + }, + { + "acc": 0.64025698, + "epoch": 0.43962455606291223, + "grad_norm": 5.53125, + "learning_rate": 9.232129937930194e-06, + "loss": 1.6935976, + "memory(GiB)": 97.17, + "step": 17330, + "train_speed(iter/s)": 1.631263 + }, + { + "acc": 0.65890064, + "epoch": 0.4397513952308473, + "grad_norm": 4.96875, + "learning_rate": 9.231571446058047e-06, + "loss": 1.66459961, + "memory(GiB)": 97.17, + "step": 17335, + "train_speed(iter/s)": 1.631316 + }, + { + "acc": 0.65500379, + "epoch": 0.4398782343987823, + "grad_norm": 4.84375, + "learning_rate": 9.231012768062497e-06, + "loss": 1.61756439, + "memory(GiB)": 97.17, + "step": 17340, + "train_speed(iter/s)": 1.631368 + }, + { + "acc": 0.65212064, + "epoch": 0.4400050735667174, + "grad_norm": 5.0625, + "learning_rate": 9.230453903968112e-06, + "loss": 1.64284668, + "memory(GiB)": 97.17, + "step": 17345, + "train_speed(iter/s)": 1.63142 + }, + { + "acc": 0.66191378, + "epoch": 0.44013191273465246, + "grad_norm": 6.75, + "learning_rate": 9.22989485379948e-06, + "loss": 1.58716145, + "memory(GiB)": 97.17, + "step": 17350, + "train_speed(iter/s)": 1.631473 + }, + { + "acc": 0.65656586, + "epoch": 0.4402587519025875, + "grad_norm": 5.15625, + "learning_rate": 9.229335617581187e-06, + "loss": 1.52349739, + "memory(GiB)": 97.17, + "step": 17355, + "train_speed(iter/s)": 1.631523 + }, + { + "acc": 0.6607399, + "epoch": 0.44038559107052255, + "grad_norm": 5.09375, + "learning_rate": 9.22877619533783e-06, + "loss": 1.58022556, + "memory(GiB)": 97.17, + "step": 17360, + "train_speed(iter/s)": 1.631576 + }, + { + "acc": 0.65099888, + "epoch": 0.44051243023845765, + "grad_norm": 4.78125, + "learning_rate": 9.228216587094014e-06, + "loss": 1.59342184, + "memory(GiB)": 97.17, + "step": 17365, + "train_speed(iter/s)": 1.631625 + }, + { + "acc": 0.66549187, + "epoch": 0.4406392694063927, + "grad_norm": 7.0625, + "learning_rate": 9.227656792874358e-06, + "loss": 1.53481331, + "memory(GiB)": 97.17, + "step": 17370, + "train_speed(iter/s)": 1.631675 + }, + { + "acc": 0.65343962, + "epoch": 0.44076610857432774, + "grad_norm": 6.0, + "learning_rate": 9.227096812703479e-06, + "loss": 1.65888443, + "memory(GiB)": 97.17, + "step": 17375, + "train_speed(iter/s)": 1.631723 + }, + { + "acc": 0.65020285, + "epoch": 0.4408929477422628, + "grad_norm": 6.53125, + "learning_rate": 9.22653664660601e-06, + "loss": 1.62792549, + "memory(GiB)": 97.17, + "step": 17380, + "train_speed(iter/s)": 1.631774 + }, + { + "acc": 0.6598793, + "epoch": 0.4410197869101979, + "grad_norm": 4.9375, + "learning_rate": 9.225976294606589e-06, + "loss": 1.62354546, + "memory(GiB)": 97.17, + "step": 17385, + "train_speed(iter/s)": 1.631824 + }, + { + "acc": 0.64714398, + "epoch": 0.44114662607813293, + "grad_norm": 6.78125, + "learning_rate": 9.225415756729863e-06, + "loss": 1.62764759, + "memory(GiB)": 97.17, + "step": 17390, + "train_speed(iter/s)": 1.631874 + }, + { + "acc": 0.63898187, + "epoch": 0.441273465246068, + "grad_norm": 6.4375, + "learning_rate": 9.224855033000489e-06, + "loss": 1.64272556, + "memory(GiB)": 97.17, + "step": 17395, + "train_speed(iter/s)": 1.631924 + }, + { + "acc": 0.66961508, + "epoch": 0.441400304414003, + "grad_norm": 4.96875, + "learning_rate": 9.224294123443125e-06, + "loss": 1.55725889, + "memory(GiB)": 97.17, + "step": 17400, + "train_speed(iter/s)": 1.631965 + }, + { + "acc": 0.66209025, + "epoch": 0.4415271435819381, + "grad_norm": 5.09375, + "learning_rate": 9.223733028082447e-06, + "loss": 1.58789091, + "memory(GiB)": 97.17, + "step": 17405, + "train_speed(iter/s)": 1.632016 + }, + { + "acc": 0.65001025, + "epoch": 0.44165398274987316, + "grad_norm": 5.84375, + "learning_rate": 9.223171746943132e-06, + "loss": 1.61291885, + "memory(GiB)": 97.17, + "step": 17410, + "train_speed(iter/s)": 1.632067 + }, + { + "acc": 0.66293631, + "epoch": 0.4417808219178082, + "grad_norm": 5.375, + "learning_rate": 9.222610280049868e-06, + "loss": 1.65044289, + "memory(GiB)": 97.17, + "step": 17415, + "train_speed(iter/s)": 1.632117 + }, + { + "acc": 0.64517002, + "epoch": 0.44190766108574325, + "grad_norm": 5.84375, + "learning_rate": 9.222048627427352e-06, + "loss": 1.67645073, + "memory(GiB)": 97.17, + "step": 17420, + "train_speed(iter/s)": 1.63217 + }, + { + "acc": 0.6596745, + "epoch": 0.44203450025367835, + "grad_norm": 5.59375, + "learning_rate": 9.221486789100288e-06, + "loss": 1.60908546, + "memory(GiB)": 97.17, + "step": 17425, + "train_speed(iter/s)": 1.632218 + }, + { + "acc": 0.64544168, + "epoch": 0.4421613394216134, + "grad_norm": 6.65625, + "learning_rate": 9.220924765093386e-06, + "loss": 1.62117519, + "memory(GiB)": 97.17, + "step": 17430, + "train_speed(iter/s)": 1.632268 + }, + { + "acc": 0.63961639, + "epoch": 0.44228817858954844, + "grad_norm": 5.0, + "learning_rate": 9.220362555431369e-06, + "loss": 1.6463419, + "memory(GiB)": 97.17, + "step": 17435, + "train_speed(iter/s)": 1.632321 + }, + { + "acc": 0.64414148, + "epoch": 0.4424150177574835, + "grad_norm": 5.03125, + "learning_rate": 9.219800160138964e-06, + "loss": 1.58510227, + "memory(GiB)": 97.17, + "step": 17440, + "train_speed(iter/s)": 1.632374 + }, + { + "acc": 0.64963026, + "epoch": 0.4425418569254186, + "grad_norm": 6.28125, + "learning_rate": 9.219237579240907e-06, + "loss": 1.61883717, + "memory(GiB)": 97.17, + "step": 17445, + "train_speed(iter/s)": 1.632424 + }, + { + "acc": 0.6608654, + "epoch": 0.44266869609335363, + "grad_norm": 5.5, + "learning_rate": 9.218674812761946e-06, + "loss": 1.58943939, + "memory(GiB)": 97.17, + "step": 17450, + "train_speed(iter/s)": 1.632472 + }, + { + "acc": 0.64905262, + "epoch": 0.4427955352612887, + "grad_norm": 6.03125, + "learning_rate": 9.21811186072683e-06, + "loss": 1.59389238, + "memory(GiB)": 97.17, + "step": 17455, + "train_speed(iter/s)": 1.632521 + }, + { + "acc": 0.66097002, + "epoch": 0.4429223744292237, + "grad_norm": 6.0625, + "learning_rate": 9.21754872316032e-06, + "loss": 1.61188049, + "memory(GiB)": 97.17, + "step": 17460, + "train_speed(iter/s)": 1.63257 + }, + { + "acc": 0.67582827, + "epoch": 0.4430492135971588, + "grad_norm": 6.46875, + "learning_rate": 9.21698540008719e-06, + "loss": 1.54886074, + "memory(GiB)": 97.17, + "step": 17465, + "train_speed(iter/s)": 1.63262 + }, + { + "acc": 0.64947634, + "epoch": 0.44317605276509386, + "grad_norm": 6.03125, + "learning_rate": 9.216421891532214e-06, + "loss": 1.61587067, + "memory(GiB)": 97.17, + "step": 17470, + "train_speed(iter/s)": 1.632669 + }, + { + "acc": 0.66888046, + "epoch": 0.4433028919330289, + "grad_norm": 6.46875, + "learning_rate": 9.215858197520178e-06, + "loss": 1.6213028, + "memory(GiB)": 97.17, + "step": 17475, + "train_speed(iter/s)": 1.632719 + }, + { + "acc": 0.65724154, + "epoch": 0.44342973110096395, + "grad_norm": 5.71875, + "learning_rate": 9.215294318075876e-06, + "loss": 1.62744942, + "memory(GiB)": 97.17, + "step": 17480, + "train_speed(iter/s)": 1.632774 + }, + { + "acc": 0.66109142, + "epoch": 0.44355657026889905, + "grad_norm": 7.71875, + "learning_rate": 9.21473025322411e-06, + "loss": 1.55315933, + "memory(GiB)": 97.17, + "step": 17485, + "train_speed(iter/s)": 1.632825 + }, + { + "acc": 0.65149584, + "epoch": 0.4436834094368341, + "grad_norm": 5.0625, + "learning_rate": 9.21416600298969e-06, + "loss": 1.65997868, + "memory(GiB)": 97.17, + "step": 17490, + "train_speed(iter/s)": 1.632877 + }, + { + "acc": 0.64868312, + "epoch": 0.44381024860476914, + "grad_norm": 5.1875, + "learning_rate": 9.213601567397434e-06, + "loss": 1.55312653, + "memory(GiB)": 97.17, + "step": 17495, + "train_speed(iter/s)": 1.632927 + }, + { + "acc": 0.64261456, + "epoch": 0.4439370877727042, + "grad_norm": 6.8125, + "learning_rate": 9.213036946472169e-06, + "loss": 1.70766525, + "memory(GiB)": 97.17, + "step": 17500, + "train_speed(iter/s)": 1.632976 + }, + { + "acc": 0.63818736, + "epoch": 0.4440639269406393, + "grad_norm": 5.625, + "learning_rate": 9.212472140238729e-06, + "loss": 1.63124352, + "memory(GiB)": 97.17, + "step": 17505, + "train_speed(iter/s)": 1.633028 + }, + { + "acc": 0.6509151, + "epoch": 0.44419076610857433, + "grad_norm": 6.25, + "learning_rate": 9.211907148721958e-06, + "loss": 1.62345543, + "memory(GiB)": 97.17, + "step": 17510, + "train_speed(iter/s)": 1.63308 + }, + { + "acc": 0.65588999, + "epoch": 0.4443176052765094, + "grad_norm": 5.0, + "learning_rate": 9.211341971946705e-06, + "loss": 1.62372894, + "memory(GiB)": 97.17, + "step": 17515, + "train_speed(iter/s)": 1.633127 + }, + { + "acc": 0.66911678, + "epoch": 0.4444444444444444, + "grad_norm": 5.15625, + "learning_rate": 9.21077660993783e-06, + "loss": 1.605439, + "memory(GiB)": 97.17, + "step": 17520, + "train_speed(iter/s)": 1.633178 + }, + { + "acc": 0.65478215, + "epoch": 0.4445712836123795, + "grad_norm": 6.53125, + "learning_rate": 9.210211062720198e-06, + "loss": 1.64492722, + "memory(GiB)": 97.17, + "step": 17525, + "train_speed(iter/s)": 1.633225 + }, + { + "acc": 0.63903246, + "epoch": 0.44469812278031456, + "grad_norm": 4.875, + "learning_rate": 9.209645330318689e-06, + "loss": 1.68473778, + "memory(GiB)": 97.17, + "step": 17530, + "train_speed(iter/s)": 1.633277 + }, + { + "acc": 0.65429549, + "epoch": 0.4448249619482496, + "grad_norm": 6.875, + "learning_rate": 9.209079412758183e-06, + "loss": 1.6484724, + "memory(GiB)": 97.17, + "step": 17535, + "train_speed(iter/s)": 1.633324 + }, + { + "acc": 0.65037231, + "epoch": 0.44495180111618465, + "grad_norm": 7.9375, + "learning_rate": 9.208513310063572e-06, + "loss": 1.65741959, + "memory(GiB)": 97.17, + "step": 17540, + "train_speed(iter/s)": 1.633375 + }, + { + "acc": 0.66384783, + "epoch": 0.44507864028411975, + "grad_norm": 6.5625, + "learning_rate": 9.207947022259755e-06, + "loss": 1.6025013, + "memory(GiB)": 97.17, + "step": 17545, + "train_speed(iter/s)": 1.633426 + }, + { + "acc": 0.6584012, + "epoch": 0.4452054794520548, + "grad_norm": 6.90625, + "learning_rate": 9.207380549371642e-06, + "loss": 1.56964865, + "memory(GiB)": 97.17, + "step": 17550, + "train_speed(iter/s)": 1.633474 + }, + { + "acc": 0.64631844, + "epoch": 0.44533231861998984, + "grad_norm": 6.40625, + "learning_rate": 9.206813891424147e-06, + "loss": 1.65718536, + "memory(GiB)": 97.17, + "step": 17555, + "train_speed(iter/s)": 1.63352 + }, + { + "acc": 0.64426107, + "epoch": 0.4454591577879249, + "grad_norm": 5.3125, + "learning_rate": 9.206247048442196e-06, + "loss": 1.68766079, + "memory(GiB)": 97.17, + "step": 17560, + "train_speed(iter/s)": 1.633571 + }, + { + "acc": 0.65517244, + "epoch": 0.44558599695586, + "grad_norm": 5.1875, + "learning_rate": 9.20568002045072e-06, + "loss": 1.61405602, + "memory(GiB)": 97.17, + "step": 17565, + "train_speed(iter/s)": 1.633623 + }, + { + "acc": 0.64193468, + "epoch": 0.44571283612379503, + "grad_norm": 4.6875, + "learning_rate": 9.20511280747466e-06, + "loss": 1.63794937, + "memory(GiB)": 97.17, + "step": 17570, + "train_speed(iter/s)": 1.633672 + }, + { + "acc": 0.65987406, + "epoch": 0.4458396752917301, + "grad_norm": 4.9375, + "learning_rate": 9.204545409538962e-06, + "loss": 1.55015793, + "memory(GiB)": 97.17, + "step": 17575, + "train_speed(iter/s)": 1.633724 + }, + { + "acc": 0.64122486, + "epoch": 0.4459665144596651, + "grad_norm": 5.03125, + "learning_rate": 9.203977826668587e-06, + "loss": 1.72442589, + "memory(GiB)": 97.17, + "step": 17580, + "train_speed(iter/s)": 1.633774 + }, + { + "acc": 0.66856356, + "epoch": 0.4460933536276002, + "grad_norm": 5.34375, + "learning_rate": 9.203410058888498e-06, + "loss": 1.54310093, + "memory(GiB)": 97.17, + "step": 17585, + "train_speed(iter/s)": 1.633822 + }, + { + "acc": 0.6521203, + "epoch": 0.44622019279553526, + "grad_norm": 5.71875, + "learning_rate": 9.202842106223667e-06, + "loss": 1.62952347, + "memory(GiB)": 97.17, + "step": 17590, + "train_speed(iter/s)": 1.63387 + }, + { + "acc": 0.64071207, + "epoch": 0.4463470319634703, + "grad_norm": 5.875, + "learning_rate": 9.202273968699075e-06, + "loss": 1.68462486, + "memory(GiB)": 97.17, + "step": 17595, + "train_speed(iter/s)": 1.633923 + }, + { + "acc": 0.66767244, + "epoch": 0.44647387113140535, + "grad_norm": 7.65625, + "learning_rate": 9.201705646339714e-06, + "loss": 1.61972485, + "memory(GiB)": 97.17, + "step": 17600, + "train_speed(iter/s)": 1.633974 + }, + { + "acc": 0.65168958, + "epoch": 0.44660071029934045, + "grad_norm": 4.96875, + "learning_rate": 9.201137139170578e-06, + "loss": 1.5934412, + "memory(GiB)": 97.17, + "step": 17605, + "train_speed(iter/s)": 1.634024 + }, + { + "acc": 0.66115465, + "epoch": 0.4467275494672755, + "grad_norm": 5.0625, + "learning_rate": 9.200568447216673e-06, + "loss": 1.58206282, + "memory(GiB)": 97.17, + "step": 17610, + "train_speed(iter/s)": 1.634073 + }, + { + "acc": 0.65404058, + "epoch": 0.44685438863521054, + "grad_norm": 5.0625, + "learning_rate": 9.199999570503015e-06, + "loss": 1.57423792, + "memory(GiB)": 97.17, + "step": 17615, + "train_speed(iter/s)": 1.634124 + }, + { + "acc": 0.6270751, + "epoch": 0.4469812278031456, + "grad_norm": 4.375, + "learning_rate": 9.199430509054625e-06, + "loss": 1.74342499, + "memory(GiB)": 97.17, + "step": 17620, + "train_speed(iter/s)": 1.634175 + }, + { + "acc": 0.65442171, + "epoch": 0.4471080669710807, + "grad_norm": 6.0625, + "learning_rate": 9.19886126289653e-06, + "loss": 1.61158905, + "memory(GiB)": 97.17, + "step": 17625, + "train_speed(iter/s)": 1.63422 + }, + { + "acc": 0.6561059, + "epoch": 0.44723490613901573, + "grad_norm": 5.46875, + "learning_rate": 9.198291832053771e-06, + "loss": 1.65528717, + "memory(GiB)": 97.17, + "step": 17630, + "train_speed(iter/s)": 1.634267 + }, + { + "acc": 0.65104151, + "epoch": 0.4473617453069508, + "grad_norm": 5.8125, + "learning_rate": 9.197722216551395e-06, + "loss": 1.63396149, + "memory(GiB)": 97.17, + "step": 17635, + "train_speed(iter/s)": 1.634317 + }, + { + "acc": 0.6543983, + "epoch": 0.4474885844748858, + "grad_norm": 4.59375, + "learning_rate": 9.197152416414452e-06, + "loss": 1.58556957, + "memory(GiB)": 97.17, + "step": 17640, + "train_speed(iter/s)": 1.634368 + }, + { + "acc": 0.64443326, + "epoch": 0.4476154236428209, + "grad_norm": 5.3125, + "learning_rate": 9.196582431668007e-06, + "loss": 1.71702366, + "memory(GiB)": 97.17, + "step": 17645, + "train_speed(iter/s)": 1.634419 + }, + { + "acc": 0.66325607, + "epoch": 0.44774226281075596, + "grad_norm": 6.0625, + "learning_rate": 9.196012262337131e-06, + "loss": 1.59066172, + "memory(GiB)": 97.17, + "step": 17650, + "train_speed(iter/s)": 1.634468 + }, + { + "acc": 0.64031544, + "epoch": 0.447869101978691, + "grad_norm": 5.96875, + "learning_rate": 9.195441908446902e-06, + "loss": 1.70693684, + "memory(GiB)": 97.17, + "step": 17655, + "train_speed(iter/s)": 1.634518 + }, + { + "acc": 0.65541973, + "epoch": 0.44799594114662605, + "grad_norm": 5.53125, + "learning_rate": 9.194871370022407e-06, + "loss": 1.63178329, + "memory(GiB)": 97.17, + "step": 17660, + "train_speed(iter/s)": 1.634568 + }, + { + "acc": 0.66145759, + "epoch": 0.44812278031456115, + "grad_norm": 5.4375, + "learning_rate": 9.194300647088739e-06, + "loss": 1.60694675, + "memory(GiB)": 97.17, + "step": 17665, + "train_speed(iter/s)": 1.634616 + }, + { + "acc": 0.63905807, + "epoch": 0.4482496194824962, + "grad_norm": 5.40625, + "learning_rate": 9.193729739671002e-06, + "loss": 1.65575333, + "memory(GiB)": 97.17, + "step": 17670, + "train_speed(iter/s)": 1.634666 + }, + { + "acc": 0.64483628, + "epoch": 0.44837645865043124, + "grad_norm": 4.5, + "learning_rate": 9.193158647794308e-06, + "loss": 1.61878014, + "memory(GiB)": 97.17, + "step": 17675, + "train_speed(iter/s)": 1.634713 + }, + { + "acc": 0.64103479, + "epoch": 0.4485032978183663, + "grad_norm": 6.03125, + "learning_rate": 9.192587371483777e-06, + "loss": 1.64115047, + "memory(GiB)": 97.17, + "step": 17680, + "train_speed(iter/s)": 1.634763 + }, + { + "acc": 0.63627243, + "epoch": 0.4486301369863014, + "grad_norm": 6.25, + "learning_rate": 9.192015910764535e-06, + "loss": 1.69046936, + "memory(GiB)": 97.17, + "step": 17685, + "train_speed(iter/s)": 1.63481 + }, + { + "acc": 0.6593894, + "epoch": 0.44875697615423643, + "grad_norm": 5.96875, + "learning_rate": 9.191444265661715e-06, + "loss": 1.64550056, + "memory(GiB)": 97.17, + "step": 17690, + "train_speed(iter/s)": 1.634861 + }, + { + "acc": 0.64664779, + "epoch": 0.4488838153221715, + "grad_norm": 5.125, + "learning_rate": 9.190872436200464e-06, + "loss": 1.65807114, + "memory(GiB)": 97.17, + "step": 17695, + "train_speed(iter/s)": 1.63491 + }, + { + "acc": 0.63693523, + "epoch": 0.4490106544901065, + "grad_norm": 6.40625, + "learning_rate": 9.190300422405933e-06, + "loss": 1.61289158, + "memory(GiB)": 97.17, + "step": 17700, + "train_speed(iter/s)": 1.634959 + }, + { + "acc": 0.6512929, + "epoch": 0.4491374936580416, + "grad_norm": 5.6875, + "learning_rate": 9.18972822430328e-06, + "loss": 1.661796, + "memory(GiB)": 97.17, + "step": 17705, + "train_speed(iter/s)": 1.635012 + }, + { + "acc": 0.65816598, + "epoch": 0.44926433282597666, + "grad_norm": 5.5, + "learning_rate": 9.189155841917675e-06, + "loss": 1.55127144, + "memory(GiB)": 97.17, + "step": 17710, + "train_speed(iter/s)": 1.635059 + }, + { + "acc": 0.64287744, + "epoch": 0.4493911719939117, + "grad_norm": 5.6875, + "learning_rate": 9.18858327527429e-06, + "loss": 1.6665947, + "memory(GiB)": 97.17, + "step": 17715, + "train_speed(iter/s)": 1.635111 + }, + { + "acc": 0.63745298, + "epoch": 0.44951801116184675, + "grad_norm": 5.40625, + "learning_rate": 9.188010524398314e-06, + "loss": 1.62430573, + "memory(GiB)": 97.17, + "step": 17720, + "train_speed(iter/s)": 1.635161 + }, + { + "acc": 0.68063426, + "epoch": 0.44964485032978185, + "grad_norm": 6.4375, + "learning_rate": 9.187437589314939e-06, + "loss": 1.51655159, + "memory(GiB)": 97.17, + "step": 17725, + "train_speed(iter/s)": 1.635209 + }, + { + "acc": 0.65839486, + "epoch": 0.4497716894977169, + "grad_norm": 5.6875, + "learning_rate": 9.186864470049358e-06, + "loss": 1.5760601, + "memory(GiB)": 97.17, + "step": 17730, + "train_speed(iter/s)": 1.635259 + }, + { + "acc": 0.66085672, + "epoch": 0.44989852866565194, + "grad_norm": 4.78125, + "learning_rate": 9.186291166626789e-06, + "loss": 1.59891024, + "memory(GiB)": 97.17, + "step": 17735, + "train_speed(iter/s)": 1.63531 + }, + { + "acc": 0.6589829, + "epoch": 0.450025367833587, + "grad_norm": 5.59375, + "learning_rate": 9.185717679072444e-06, + "loss": 1.61997185, + "memory(GiB)": 97.17, + "step": 17740, + "train_speed(iter/s)": 1.635359 + }, + { + "acc": 0.6514677, + "epoch": 0.4501522070015221, + "grad_norm": 5.71875, + "learning_rate": 9.185144007411547e-06, + "loss": 1.61622772, + "memory(GiB)": 97.17, + "step": 17745, + "train_speed(iter/s)": 1.63541 + }, + { + "acc": 0.63201022, + "epoch": 0.45027904616945713, + "grad_norm": 4.9375, + "learning_rate": 9.18457015166933e-06, + "loss": 1.62467995, + "memory(GiB)": 97.17, + "step": 17750, + "train_speed(iter/s)": 1.635458 + }, + { + "acc": 0.65665436, + "epoch": 0.4504058853373922, + "grad_norm": 6.40625, + "learning_rate": 9.183996111871034e-06, + "loss": 1.55665665, + "memory(GiB)": 97.17, + "step": 17755, + "train_speed(iter/s)": 1.635507 + }, + { + "acc": 0.67177896, + "epoch": 0.4505327245053272, + "grad_norm": 5.875, + "learning_rate": 9.18342188804191e-06, + "loss": 1.65101967, + "memory(GiB)": 97.17, + "step": 17760, + "train_speed(iter/s)": 1.635557 + }, + { + "acc": 0.64590473, + "epoch": 0.4506595636732623, + "grad_norm": 12.4375, + "learning_rate": 9.182847480207215e-06, + "loss": 1.62943039, + "memory(GiB)": 97.17, + "step": 17765, + "train_speed(iter/s)": 1.635606 + }, + { + "acc": 0.6614017, + "epoch": 0.45078640284119736, + "grad_norm": 4.9375, + "learning_rate": 9.182272888392211e-06, + "loss": 1.60033531, + "memory(GiB)": 97.17, + "step": 17770, + "train_speed(iter/s)": 1.635654 + }, + { + "acc": 0.64702106, + "epoch": 0.4509132420091324, + "grad_norm": 6.84375, + "learning_rate": 9.181698112622175e-06, + "loss": 1.63443604, + "memory(GiB)": 97.17, + "step": 17775, + "train_speed(iter/s)": 1.635706 + }, + { + "acc": 0.6286911, + "epoch": 0.45104008117706745, + "grad_norm": 5.15625, + "learning_rate": 9.181123152922384e-06, + "loss": 1.69296875, + "memory(GiB)": 97.17, + "step": 17780, + "train_speed(iter/s)": 1.635752 + }, + { + "acc": 0.64293375, + "epoch": 0.45116692034500255, + "grad_norm": 5.03125, + "learning_rate": 9.18054800931813e-06, + "loss": 1.63769188, + "memory(GiB)": 97.17, + "step": 17785, + "train_speed(iter/s)": 1.635799 + }, + { + "acc": 0.64541492, + "epoch": 0.4512937595129376, + "grad_norm": 5.65625, + "learning_rate": 9.17997268183471e-06, + "loss": 1.58755579, + "memory(GiB)": 97.17, + "step": 17790, + "train_speed(iter/s)": 1.635848 + }, + { + "acc": 0.63184462, + "epoch": 0.45142059868087264, + "grad_norm": 5.3125, + "learning_rate": 9.17939717049743e-06, + "loss": 1.67936535, + "memory(GiB)": 97.17, + "step": 17795, + "train_speed(iter/s)": 1.6359 + }, + { + "acc": 0.65373926, + "epoch": 0.4515474378488077, + "grad_norm": 6.4375, + "learning_rate": 9.1788214753316e-06, + "loss": 1.64679909, + "memory(GiB)": 97.17, + "step": 17800, + "train_speed(iter/s)": 1.635947 + }, + { + "acc": 0.65907717, + "epoch": 0.4516742770167428, + "grad_norm": 5.96875, + "learning_rate": 9.178245596362546e-06, + "loss": 1.60675106, + "memory(GiB)": 97.17, + "step": 17805, + "train_speed(iter/s)": 1.635995 + }, + { + "acc": 0.6508554, + "epoch": 0.45180111618467783, + "grad_norm": 5.9375, + "learning_rate": 9.177669533615599e-06, + "loss": 1.61277542, + "memory(GiB)": 97.17, + "step": 17810, + "train_speed(iter/s)": 1.636043 + }, + { + "acc": 0.65542326, + "epoch": 0.4519279553526129, + "grad_norm": 5.03125, + "learning_rate": 9.17709328711609e-06, + "loss": 1.56318274, + "memory(GiB)": 97.17, + "step": 17815, + "train_speed(iter/s)": 1.636092 + }, + { + "acc": 0.63743639, + "epoch": 0.4520547945205479, + "grad_norm": 5.375, + "learning_rate": 9.17651685688937e-06, + "loss": 1.66922951, + "memory(GiB)": 97.17, + "step": 17820, + "train_speed(iter/s)": 1.636138 + }, + { + "acc": 0.65172262, + "epoch": 0.452181633688483, + "grad_norm": 5.59375, + "learning_rate": 9.175940242960792e-06, + "loss": 1.66571579, + "memory(GiB)": 97.17, + "step": 17825, + "train_speed(iter/s)": 1.636191 + }, + { + "acc": 0.65301452, + "epoch": 0.45230847285641806, + "grad_norm": 4.90625, + "learning_rate": 9.175363445355718e-06, + "loss": 1.6732338, + "memory(GiB)": 97.17, + "step": 17830, + "train_speed(iter/s)": 1.63624 + }, + { + "acc": 0.66110291, + "epoch": 0.4524353120243531, + "grad_norm": 5.53125, + "learning_rate": 9.174786464099519e-06, + "loss": 1.60229473, + "memory(GiB)": 97.17, + "step": 17835, + "train_speed(iter/s)": 1.636288 + }, + { + "acc": 0.64350657, + "epoch": 0.45256215119228815, + "grad_norm": 5.65625, + "learning_rate": 9.17420929921757e-06, + "loss": 1.64239502, + "memory(GiB)": 97.17, + "step": 17840, + "train_speed(iter/s)": 1.636336 + }, + { + "acc": 0.66892071, + "epoch": 0.45268899036022325, + "grad_norm": 6.65625, + "learning_rate": 9.173631950735262e-06, + "loss": 1.56081057, + "memory(GiB)": 97.17, + "step": 17845, + "train_speed(iter/s)": 1.636383 + }, + { + "acc": 0.65648899, + "epoch": 0.4528158295281583, + "grad_norm": 5.84375, + "learning_rate": 9.173054418677985e-06, + "loss": 1.64969406, + "memory(GiB)": 97.17, + "step": 17850, + "train_speed(iter/s)": 1.636434 + }, + { + "acc": 0.6633276, + "epoch": 0.45294266869609334, + "grad_norm": 6.53125, + "learning_rate": 9.172476703071145e-06, + "loss": 1.58426533, + "memory(GiB)": 97.17, + "step": 17855, + "train_speed(iter/s)": 1.63648 + }, + { + "acc": 0.66323233, + "epoch": 0.4530695078640284, + "grad_norm": 5.78125, + "learning_rate": 9.171898803940148e-06, + "loss": 1.56856079, + "memory(GiB)": 97.17, + "step": 17860, + "train_speed(iter/s)": 1.636528 + }, + { + "acc": 0.66387711, + "epoch": 0.4531963470319635, + "grad_norm": 4.84375, + "learning_rate": 9.17132072131042e-06, + "loss": 1.56297207, + "memory(GiB)": 97.17, + "step": 17865, + "train_speed(iter/s)": 1.636575 + }, + { + "acc": 0.65439901, + "epoch": 0.45332318619989853, + "grad_norm": 5.9375, + "learning_rate": 9.170742455207378e-06, + "loss": 1.60466576, + "memory(GiB)": 97.17, + "step": 17870, + "train_speed(iter/s)": 1.636621 + }, + { + "acc": 0.65063219, + "epoch": 0.4534500253678336, + "grad_norm": 7.125, + "learning_rate": 9.170164005656465e-06, + "loss": 1.61257973, + "memory(GiB)": 97.17, + "step": 17875, + "train_speed(iter/s)": 1.636669 + }, + { + "acc": 0.67193336, + "epoch": 0.4535768645357686, + "grad_norm": 6.46875, + "learning_rate": 9.169585372683118e-06, + "loss": 1.49588842, + "memory(GiB)": 97.17, + "step": 17880, + "train_speed(iter/s)": 1.636715 + }, + { + "acc": 0.66043739, + "epoch": 0.4537037037037037, + "grad_norm": 6.6875, + "learning_rate": 9.169006556312794e-06, + "loss": 1.61532135, + "memory(GiB)": 97.17, + "step": 17885, + "train_speed(iter/s)": 1.636763 + }, + { + "acc": 0.64468875, + "epoch": 0.45383054287163876, + "grad_norm": 5.65625, + "learning_rate": 9.168427556570946e-06, + "loss": 1.66786633, + "memory(GiB)": 97.17, + "step": 17890, + "train_speed(iter/s)": 1.636808 + }, + { + "acc": 0.66313658, + "epoch": 0.4539573820395738, + "grad_norm": 6.1875, + "learning_rate": 9.167848373483044e-06, + "loss": 1.61062508, + "memory(GiB)": 97.17, + "step": 17895, + "train_speed(iter/s)": 1.636859 + }, + { + "acc": 0.65240016, + "epoch": 0.45408422120750885, + "grad_norm": 5.34375, + "learning_rate": 9.167269007074561e-06, + "loss": 1.63836269, + "memory(GiB)": 97.17, + "step": 17900, + "train_speed(iter/s)": 1.636905 + }, + { + "acc": 0.65331364, + "epoch": 0.45421106037544395, + "grad_norm": 4.15625, + "learning_rate": 9.166689457370983e-06, + "loss": 1.6184988, + "memory(GiB)": 97.17, + "step": 17905, + "train_speed(iter/s)": 1.636951 + }, + { + "acc": 0.63214364, + "epoch": 0.454337899543379, + "grad_norm": 5.40625, + "learning_rate": 9.166109724397801e-06, + "loss": 1.65282021, + "memory(GiB)": 97.17, + "step": 17910, + "train_speed(iter/s)": 1.637001 + }, + { + "acc": 0.65271082, + "epoch": 0.45446473871131404, + "grad_norm": 7.84375, + "learning_rate": 9.165529808180511e-06, + "loss": 1.61452484, + "memory(GiB)": 97.17, + "step": 17915, + "train_speed(iter/s)": 1.637052 + }, + { + "acc": 0.6501111, + "epoch": 0.4545915778792491, + "grad_norm": 4.65625, + "learning_rate": 9.164949708744622e-06, + "loss": 1.62055683, + "memory(GiB)": 97.17, + "step": 17920, + "train_speed(iter/s)": 1.637099 + }, + { + "acc": 0.6526082, + "epoch": 0.4547184170471842, + "grad_norm": 8.25, + "learning_rate": 9.164369426115652e-06, + "loss": 1.58178997, + "memory(GiB)": 97.17, + "step": 17925, + "train_speed(iter/s)": 1.637146 + }, + { + "acc": 0.65028062, + "epoch": 0.45484525621511923, + "grad_norm": 5.1875, + "learning_rate": 9.16378896031912e-06, + "loss": 1.65476837, + "memory(GiB)": 97.17, + "step": 17930, + "train_speed(iter/s)": 1.637195 + }, + { + "acc": 0.65379977, + "epoch": 0.4549720953830543, + "grad_norm": 5.6875, + "learning_rate": 9.163208311380561e-06, + "loss": 1.60218658, + "memory(GiB)": 97.17, + "step": 17935, + "train_speed(iter/s)": 1.637244 + }, + { + "acc": 0.65739164, + "epoch": 0.4550989345509893, + "grad_norm": 4.8125, + "learning_rate": 9.162627479325511e-06, + "loss": 1.59945431, + "memory(GiB)": 97.17, + "step": 17940, + "train_speed(iter/s)": 1.637293 + }, + { + "acc": 0.66322875, + "epoch": 0.4552257737189244, + "grad_norm": 6.0, + "learning_rate": 9.16204646417952e-06, + "loss": 1.60096436, + "memory(GiB)": 97.17, + "step": 17945, + "train_speed(iter/s)": 1.637342 + }, + { + "acc": 0.65176544, + "epoch": 0.45535261288685946, + "grad_norm": 5.3125, + "learning_rate": 9.161465265968145e-06, + "loss": 1.56856689, + "memory(GiB)": 97.17, + "step": 17950, + "train_speed(iter/s)": 1.637389 + }, + { + "acc": 0.66775045, + "epoch": 0.4554794520547945, + "grad_norm": 6.4375, + "learning_rate": 9.160883884716948e-06, + "loss": 1.54070654, + "memory(GiB)": 97.17, + "step": 17955, + "train_speed(iter/s)": 1.637436 + }, + { + "acc": 0.65757189, + "epoch": 0.45560629122272955, + "grad_norm": 4.875, + "learning_rate": 9.1603023204515e-06, + "loss": 1.65287247, + "memory(GiB)": 97.17, + "step": 17960, + "train_speed(iter/s)": 1.637485 + }, + { + "acc": 0.64055328, + "epoch": 0.45573313039066465, + "grad_norm": 4.78125, + "learning_rate": 9.15972057319738e-06, + "loss": 1.62745132, + "memory(GiB)": 97.17, + "step": 17965, + "train_speed(iter/s)": 1.63753 + }, + { + "acc": 0.66283889, + "epoch": 0.4558599695585997, + "grad_norm": 7.59375, + "learning_rate": 9.159138642980178e-06, + "loss": 1.63845444, + "memory(GiB)": 97.17, + "step": 17970, + "train_speed(iter/s)": 1.637574 + }, + { + "acc": 0.64528112, + "epoch": 0.45598680872653474, + "grad_norm": 6.78125, + "learning_rate": 9.15855652982549e-06, + "loss": 1.66781158, + "memory(GiB)": 97.17, + "step": 17975, + "train_speed(iter/s)": 1.637619 + }, + { + "acc": 0.66027126, + "epoch": 0.4561136478944698, + "grad_norm": 7.3125, + "learning_rate": 9.15797423375892e-06, + "loss": 1.62811623, + "memory(GiB)": 97.17, + "step": 17980, + "train_speed(iter/s)": 1.637667 + }, + { + "acc": 0.65096769, + "epoch": 0.4562404870624049, + "grad_norm": 6.0625, + "learning_rate": 9.157391754806079e-06, + "loss": 1.65528755, + "memory(GiB)": 97.17, + "step": 17985, + "train_speed(iter/s)": 1.637712 + }, + { + "acc": 0.66090035, + "epoch": 0.45636732623033993, + "grad_norm": 7.40625, + "learning_rate": 9.156809092992588e-06, + "loss": 1.59680519, + "memory(GiB)": 97.17, + "step": 17990, + "train_speed(iter/s)": 1.637757 + }, + { + "acc": 0.6504416, + "epoch": 0.456494165398275, + "grad_norm": 6.625, + "learning_rate": 9.156226248344072e-06, + "loss": 1.62218189, + "memory(GiB)": 97.17, + "step": 17995, + "train_speed(iter/s)": 1.6378 + }, + { + "acc": 0.66787395, + "epoch": 0.45662100456621, + "grad_norm": 4.9375, + "learning_rate": 9.15564322088617e-06, + "loss": 1.57303944, + "memory(GiB)": 97.17, + "step": 18000, + "train_speed(iter/s)": 1.637847 + }, + { + "epoch": 0.45662100456621, + "eval_acc": 0.6431803231444878, + "eval_loss": 1.5907191038131714, + "eval_runtime": 58.3705, + "eval_samples_per_second": 109.131, + "eval_steps_per_second": 27.291, + "step": 18000 + }, + { + "acc": 0.64999857, + "epoch": 0.4567478437341451, + "grad_norm": 5.40625, + "learning_rate": 9.155060010644525e-06, + "loss": 1.62140446, + "memory(GiB)": 97.17, + "step": 18005, + "train_speed(iter/s)": 1.628615 + }, + { + "acc": 0.65930071, + "epoch": 0.45687468290208016, + "grad_norm": 6.84375, + "learning_rate": 9.154476617644792e-06, + "loss": 1.57437725, + "memory(GiB)": 97.17, + "step": 18010, + "train_speed(iter/s)": 1.628661 + }, + { + "acc": 0.65113735, + "epoch": 0.4570015220700152, + "grad_norm": 9.3125, + "learning_rate": 9.153893041912627e-06, + "loss": 1.56833649, + "memory(GiB)": 97.17, + "step": 18015, + "train_speed(iter/s)": 1.628707 + }, + { + "acc": 0.66707897, + "epoch": 0.45712836123795025, + "grad_norm": 6.34375, + "learning_rate": 9.1533092834737e-06, + "loss": 1.56586494, + "memory(GiB)": 97.17, + "step": 18020, + "train_speed(iter/s)": 1.628755 + }, + { + "acc": 0.65480509, + "epoch": 0.45725520040588535, + "grad_norm": 6.0, + "learning_rate": 9.152725342353688e-06, + "loss": 1.62384796, + "memory(GiB)": 97.17, + "step": 18025, + "train_speed(iter/s)": 1.6288 + }, + { + "acc": 0.63864136, + "epoch": 0.4573820395738204, + "grad_norm": 7.21875, + "learning_rate": 9.152141218578276e-06, + "loss": 1.69390335, + "memory(GiB)": 97.17, + "step": 18030, + "train_speed(iter/s)": 1.628846 + }, + { + "acc": 0.64268441, + "epoch": 0.45750887874175544, + "grad_norm": 6.71875, + "learning_rate": 9.151556912173154e-06, + "loss": 1.70281296, + "memory(GiB)": 97.17, + "step": 18035, + "train_speed(iter/s)": 1.628891 + }, + { + "acc": 0.65205879, + "epoch": 0.4576357179096905, + "grad_norm": 6.75, + "learning_rate": 9.150972423164024e-06, + "loss": 1.62303314, + "memory(GiB)": 97.17, + "step": 18040, + "train_speed(iter/s)": 1.628936 + }, + { + "acc": 0.65466805, + "epoch": 0.4577625570776256, + "grad_norm": 6.8125, + "learning_rate": 9.150387751576594e-06, + "loss": 1.67525177, + "memory(GiB)": 97.17, + "step": 18045, + "train_speed(iter/s)": 1.628984 + }, + { + "acc": 0.64855871, + "epoch": 0.45788939624556063, + "grad_norm": 4.3125, + "learning_rate": 9.14980289743658e-06, + "loss": 1.62913437, + "memory(GiB)": 97.17, + "step": 18050, + "train_speed(iter/s)": 1.629017 + }, + { + "acc": 0.64263763, + "epoch": 0.4580162354134957, + "grad_norm": 5.90625, + "learning_rate": 9.149217860769708e-06, + "loss": 1.66544647, + "memory(GiB)": 97.17, + "step": 18055, + "train_speed(iter/s)": 1.629067 + }, + { + "acc": 0.66130505, + "epoch": 0.4581430745814307, + "grad_norm": 6.25, + "learning_rate": 9.14863264160171e-06, + "loss": 1.57379818, + "memory(GiB)": 97.17, + "step": 18060, + "train_speed(iter/s)": 1.629116 + }, + { + "acc": 0.64522581, + "epoch": 0.4582699137493658, + "grad_norm": 5.59375, + "learning_rate": 9.148047239958325e-06, + "loss": 1.6959671, + "memory(GiB)": 97.17, + "step": 18065, + "train_speed(iter/s)": 1.629168 + }, + { + "acc": 0.64630213, + "epoch": 0.45839675291730086, + "grad_norm": 5.875, + "learning_rate": 9.147461655865302e-06, + "loss": 1.62077827, + "memory(GiB)": 97.17, + "step": 18070, + "train_speed(iter/s)": 1.629219 + }, + { + "acc": 0.66259346, + "epoch": 0.4585235920852359, + "grad_norm": 5.71875, + "learning_rate": 9.1468758893484e-06, + "loss": 1.65996361, + "memory(GiB)": 97.17, + "step": 18075, + "train_speed(iter/s)": 1.629271 + }, + { + "acc": 0.66683135, + "epoch": 0.45865043125317095, + "grad_norm": 5.03125, + "learning_rate": 9.14628994043338e-06, + "loss": 1.64738579, + "memory(GiB)": 97.17, + "step": 18080, + "train_speed(iter/s)": 1.629322 + }, + { + "acc": 0.65946379, + "epoch": 0.45877727042110605, + "grad_norm": 5.1875, + "learning_rate": 9.145703809146018e-06, + "loss": 1.63190613, + "memory(GiB)": 97.17, + "step": 18085, + "train_speed(iter/s)": 1.629373 + }, + { + "acc": 0.63743377, + "epoch": 0.4589041095890411, + "grad_norm": 5.875, + "learning_rate": 9.145117495512092e-06, + "loss": 1.65002594, + "memory(GiB)": 97.17, + "step": 18090, + "train_speed(iter/s)": 1.62942 + }, + { + "acc": 0.65873318, + "epoch": 0.45903094875697614, + "grad_norm": 5.3125, + "learning_rate": 9.144530999557393e-06, + "loss": 1.70841599, + "memory(GiB)": 97.17, + "step": 18095, + "train_speed(iter/s)": 1.629472 + }, + { + "acc": 0.6582479, + "epoch": 0.4591577879249112, + "grad_norm": 7.4375, + "learning_rate": 9.143944321307718e-06, + "loss": 1.62365761, + "memory(GiB)": 97.17, + "step": 18100, + "train_speed(iter/s)": 1.629524 + }, + { + "acc": 0.65006514, + "epoch": 0.4592846270928463, + "grad_norm": 5.03125, + "learning_rate": 9.143357460788868e-06, + "loss": 1.65395966, + "memory(GiB)": 97.17, + "step": 18105, + "train_speed(iter/s)": 1.629568 + }, + { + "acc": 0.64196391, + "epoch": 0.45941146626078133, + "grad_norm": 6.15625, + "learning_rate": 9.14277041802666e-06, + "loss": 1.70032959, + "memory(GiB)": 97.17, + "step": 18110, + "train_speed(iter/s)": 1.629617 + }, + { + "acc": 0.6500701, + "epoch": 0.4595383054287164, + "grad_norm": 5.65625, + "learning_rate": 9.142183193046912e-06, + "loss": 1.61194954, + "memory(GiB)": 97.17, + "step": 18115, + "train_speed(iter/s)": 1.629665 + }, + { + "acc": 0.66789994, + "epoch": 0.4596651445966514, + "grad_norm": 5.28125, + "learning_rate": 9.141595785875453e-06, + "loss": 1.53755598, + "memory(GiB)": 97.17, + "step": 18120, + "train_speed(iter/s)": 1.629715 + }, + { + "acc": 0.65315056, + "epoch": 0.4597919837645865, + "grad_norm": 4.65625, + "learning_rate": 9.141008196538122e-06, + "loss": 1.60123291, + "memory(GiB)": 97.17, + "step": 18125, + "train_speed(iter/s)": 1.629768 + }, + { + "acc": 0.64036112, + "epoch": 0.45991882293252156, + "grad_norm": 5.625, + "learning_rate": 9.14042042506076e-06, + "loss": 1.6665062, + "memory(GiB)": 97.17, + "step": 18130, + "train_speed(iter/s)": 1.629821 + }, + { + "acc": 0.65954399, + "epoch": 0.4600456621004566, + "grad_norm": 6.34375, + "learning_rate": 9.139832471469224e-06, + "loss": 1.62609215, + "memory(GiB)": 97.17, + "step": 18135, + "train_speed(iter/s)": 1.629867 + }, + { + "acc": 0.66797457, + "epoch": 0.46017250126839165, + "grad_norm": 4.6875, + "learning_rate": 9.13924433578937e-06, + "loss": 1.58874931, + "memory(GiB)": 97.17, + "step": 18140, + "train_speed(iter/s)": 1.629916 + }, + { + "acc": 0.65389252, + "epoch": 0.46029934043632675, + "grad_norm": 5.1875, + "learning_rate": 9.138656018047074e-06, + "loss": 1.60943718, + "memory(GiB)": 97.17, + "step": 18145, + "train_speed(iter/s)": 1.629967 + }, + { + "acc": 0.66221528, + "epoch": 0.4604261796042618, + "grad_norm": 5.21875, + "learning_rate": 9.138067518268206e-06, + "loss": 1.59948177, + "memory(GiB)": 97.17, + "step": 18150, + "train_speed(iter/s)": 1.630015 + }, + { + "acc": 0.66288638, + "epoch": 0.46055301877219684, + "grad_norm": 5.03125, + "learning_rate": 9.137478836478654e-06, + "loss": 1.60354652, + "memory(GiB)": 97.17, + "step": 18155, + "train_speed(iter/s)": 1.630066 + }, + { + "acc": 0.65707946, + "epoch": 0.4606798579401319, + "grad_norm": 5.96875, + "learning_rate": 9.13688997270431e-06, + "loss": 1.594314, + "memory(GiB)": 97.17, + "step": 18160, + "train_speed(iter/s)": 1.630117 + }, + { + "acc": 0.67923374, + "epoch": 0.460806697108067, + "grad_norm": 5.53125, + "learning_rate": 9.136300926971076e-06, + "loss": 1.55084715, + "memory(GiB)": 97.17, + "step": 18165, + "train_speed(iter/s)": 1.630169 + }, + { + "acc": 0.65642824, + "epoch": 0.46093353627600203, + "grad_norm": 4.84375, + "learning_rate": 9.135711699304858e-06, + "loss": 1.66619377, + "memory(GiB)": 97.17, + "step": 18170, + "train_speed(iter/s)": 1.630222 + }, + { + "acc": 0.65295868, + "epoch": 0.4610603754439371, + "grad_norm": 5.40625, + "learning_rate": 9.135122289731575e-06, + "loss": 1.6186924, + "memory(GiB)": 97.17, + "step": 18175, + "train_speed(iter/s)": 1.630275 + }, + { + "acc": 0.65446663, + "epoch": 0.4611872146118721, + "grad_norm": 6.3125, + "learning_rate": 9.134532698277154e-06, + "loss": 1.57828321, + "memory(GiB)": 97.17, + "step": 18180, + "train_speed(iter/s)": 1.630329 + }, + { + "acc": 0.65571055, + "epoch": 0.4613140537798072, + "grad_norm": 7.5, + "learning_rate": 9.133942924967524e-06, + "loss": 1.63390808, + "memory(GiB)": 97.17, + "step": 18185, + "train_speed(iter/s)": 1.630381 + }, + { + "acc": 0.6724884, + "epoch": 0.46144089294774226, + "grad_norm": 7.9375, + "learning_rate": 9.133352969828628e-06, + "loss": 1.55879192, + "memory(GiB)": 97.17, + "step": 18190, + "train_speed(iter/s)": 1.630433 + }, + { + "acc": 0.64934139, + "epoch": 0.4615677321156773, + "grad_norm": 5.34375, + "learning_rate": 9.132762832886416e-06, + "loss": 1.57403498, + "memory(GiB)": 97.17, + "step": 18195, + "train_speed(iter/s)": 1.630485 + }, + { + "acc": 0.64723287, + "epoch": 0.46169457128361235, + "grad_norm": 6.53125, + "learning_rate": 9.13217251416684e-06, + "loss": 1.69276886, + "memory(GiB)": 97.17, + "step": 18200, + "train_speed(iter/s)": 1.630539 + }, + { + "acc": 0.64785681, + "epoch": 0.46182141045154745, + "grad_norm": 6.5, + "learning_rate": 9.131582013695867e-06, + "loss": 1.63392258, + "memory(GiB)": 97.17, + "step": 18205, + "train_speed(iter/s)": 1.630589 + }, + { + "acc": 0.66743007, + "epoch": 0.4619482496194825, + "grad_norm": 6.25, + "learning_rate": 9.130991331499474e-06, + "loss": 1.67314148, + "memory(GiB)": 97.17, + "step": 18210, + "train_speed(iter/s)": 1.630644 + }, + { + "acc": 0.67090573, + "epoch": 0.46207508878741754, + "grad_norm": 6.21875, + "learning_rate": 9.130400467603637e-06, + "loss": 1.55846739, + "memory(GiB)": 97.17, + "step": 18215, + "train_speed(iter/s)": 1.630696 + }, + { + "acc": 0.63479166, + "epoch": 0.4622019279553526, + "grad_norm": 6.40625, + "learning_rate": 9.129809422034349e-06, + "loss": 1.66420631, + "memory(GiB)": 97.17, + "step": 18220, + "train_speed(iter/s)": 1.630752 + }, + { + "acc": 0.65594912, + "epoch": 0.4623287671232877, + "grad_norm": 6.3125, + "learning_rate": 9.129218194817601e-06, + "loss": 1.6711628, + "memory(GiB)": 97.17, + "step": 18225, + "train_speed(iter/s)": 1.630805 + }, + { + "acc": 0.64859838, + "epoch": 0.46245560629122273, + "grad_norm": 5.6875, + "learning_rate": 9.128626785979404e-06, + "loss": 1.67346287, + "memory(GiB)": 97.17, + "step": 18230, + "train_speed(iter/s)": 1.630857 + }, + { + "acc": 0.64274359, + "epoch": 0.4625824454591578, + "grad_norm": 5.625, + "learning_rate": 9.128035195545766e-06, + "loss": 1.64927006, + "memory(GiB)": 97.17, + "step": 18235, + "train_speed(iter/s)": 1.630911 + }, + { + "acc": 0.63747911, + "epoch": 0.4627092846270928, + "grad_norm": 5.59375, + "learning_rate": 9.12744342354271e-06, + "loss": 1.74260864, + "memory(GiB)": 97.17, + "step": 18240, + "train_speed(iter/s)": 1.630961 + }, + { + "acc": 0.65783386, + "epoch": 0.4628361237950279, + "grad_norm": 6.03125, + "learning_rate": 9.126851469996265e-06, + "loss": 1.60981178, + "memory(GiB)": 97.17, + "step": 18245, + "train_speed(iter/s)": 1.631013 + }, + { + "acc": 0.64982357, + "epoch": 0.46296296296296297, + "grad_norm": 6.5, + "learning_rate": 9.126259334932467e-06, + "loss": 1.62623215, + "memory(GiB)": 97.17, + "step": 18250, + "train_speed(iter/s)": 1.631068 + }, + { + "acc": 0.64053478, + "epoch": 0.463089802130898, + "grad_norm": 7.15625, + "learning_rate": 9.125667018377362e-06, + "loss": 1.71250153, + "memory(GiB)": 97.17, + "step": 18255, + "train_speed(iter/s)": 1.631122 + }, + { + "acc": 0.65603752, + "epoch": 0.46321664129883305, + "grad_norm": 6.9375, + "learning_rate": 9.125074520357002e-06, + "loss": 1.6324482, + "memory(GiB)": 97.17, + "step": 18260, + "train_speed(iter/s)": 1.631173 + }, + { + "acc": 0.65509129, + "epoch": 0.46334348046676815, + "grad_norm": 5.25, + "learning_rate": 9.124481840897446e-06, + "loss": 1.5972024, + "memory(GiB)": 97.17, + "step": 18265, + "train_speed(iter/s)": 1.631223 + }, + { + "acc": 0.64845858, + "epoch": 0.4634703196347032, + "grad_norm": 5.0625, + "learning_rate": 9.123888980024765e-06, + "loss": 1.58163042, + "memory(GiB)": 97.17, + "step": 18270, + "train_speed(iter/s)": 1.631272 + }, + { + "acc": 0.6682056, + "epoch": 0.46359715880263824, + "grad_norm": 4.625, + "learning_rate": 9.123295937765034e-06, + "loss": 1.57059727, + "memory(GiB)": 97.17, + "step": 18275, + "train_speed(iter/s)": 1.631325 + }, + { + "acc": 0.67809019, + "epoch": 0.4637239979705733, + "grad_norm": 6.25, + "learning_rate": 9.122702714144339e-06, + "loss": 1.53272963, + "memory(GiB)": 97.17, + "step": 18280, + "train_speed(iter/s)": 1.631376 + }, + { + "acc": 0.65309391, + "epoch": 0.4638508371385084, + "grad_norm": 5.375, + "learning_rate": 9.122109309188772e-06, + "loss": 1.61041451, + "memory(GiB)": 97.17, + "step": 18285, + "train_speed(iter/s)": 1.631429 + }, + { + "acc": 0.67439466, + "epoch": 0.46397767630644343, + "grad_norm": 7.375, + "learning_rate": 9.121515722924435e-06, + "loss": 1.58415031, + "memory(GiB)": 97.17, + "step": 18290, + "train_speed(iter/s)": 1.631482 + }, + { + "acc": 0.66338997, + "epoch": 0.4641045154743785, + "grad_norm": 5.375, + "learning_rate": 9.120921955377433e-06, + "loss": 1.57285643, + "memory(GiB)": 97.17, + "step": 18295, + "train_speed(iter/s)": 1.631535 + }, + { + "acc": 0.6486599, + "epoch": 0.4642313546423135, + "grad_norm": 5.5, + "learning_rate": 9.120328006573887e-06, + "loss": 1.62669716, + "memory(GiB)": 97.17, + "step": 18300, + "train_speed(iter/s)": 1.631584 + }, + { + "acc": 0.65634398, + "epoch": 0.4643581938102486, + "grad_norm": 5.46875, + "learning_rate": 9.119733876539916e-06, + "loss": 1.55204182, + "memory(GiB)": 97.17, + "step": 18305, + "train_speed(iter/s)": 1.631635 + }, + { + "acc": 0.65420036, + "epoch": 0.46448503297818367, + "grad_norm": 6.125, + "learning_rate": 9.119139565301658e-06, + "loss": 1.59699821, + "memory(GiB)": 97.17, + "step": 18310, + "train_speed(iter/s)": 1.631687 + }, + { + "acc": 0.65560145, + "epoch": 0.4646118721461187, + "grad_norm": 5.71875, + "learning_rate": 9.118545072885253e-06, + "loss": 1.6689415, + "memory(GiB)": 97.17, + "step": 18315, + "train_speed(iter/s)": 1.631737 + }, + { + "acc": 0.66651087, + "epoch": 0.46473871131405375, + "grad_norm": 8.4375, + "learning_rate": 9.117950399316845e-06, + "loss": 1.57884626, + "memory(GiB)": 97.17, + "step": 18320, + "train_speed(iter/s)": 1.631786 + }, + { + "acc": 0.66842513, + "epoch": 0.46486555048198885, + "grad_norm": 6.0625, + "learning_rate": 9.117355544622595e-06, + "loss": 1.58536873, + "memory(GiB)": 97.17, + "step": 18325, + "train_speed(iter/s)": 1.631835 + }, + { + "acc": 0.64968004, + "epoch": 0.4649923896499239, + "grad_norm": 5.59375, + "learning_rate": 9.116760508828664e-06, + "loss": 1.55995865, + "memory(GiB)": 97.17, + "step": 18330, + "train_speed(iter/s)": 1.631884 + }, + { + "acc": 0.66877708, + "epoch": 0.46511922881785894, + "grad_norm": 5.34375, + "learning_rate": 9.116165291961225e-06, + "loss": 1.59292078, + "memory(GiB)": 97.17, + "step": 18335, + "train_speed(iter/s)": 1.631931 + }, + { + "acc": 0.64873347, + "epoch": 0.465246067985794, + "grad_norm": 5.0625, + "learning_rate": 9.11556989404646e-06, + "loss": 1.60118408, + "memory(GiB)": 97.17, + "step": 18340, + "train_speed(iter/s)": 1.631979 + }, + { + "acc": 0.64336104, + "epoch": 0.4653729071537291, + "grad_norm": 5.84375, + "learning_rate": 9.114974315110558e-06, + "loss": 1.68564701, + "memory(GiB)": 97.17, + "step": 18345, + "train_speed(iter/s)": 1.632027 + }, + { + "acc": 0.66673598, + "epoch": 0.46549974632166413, + "grad_norm": 6.3125, + "learning_rate": 9.114378555179712e-06, + "loss": 1.61895199, + "memory(GiB)": 97.17, + "step": 18350, + "train_speed(iter/s)": 1.632076 + }, + { + "acc": 0.66504173, + "epoch": 0.4656265854895992, + "grad_norm": 5.78125, + "learning_rate": 9.11378261428013e-06, + "loss": 1.56538124, + "memory(GiB)": 97.17, + "step": 18355, + "train_speed(iter/s)": 1.632125 + }, + { + "acc": 0.65559959, + "epoch": 0.4657534246575342, + "grad_norm": 5.71875, + "learning_rate": 9.11318649243802e-06, + "loss": 1.62557507, + "memory(GiB)": 97.17, + "step": 18360, + "train_speed(iter/s)": 1.632174 + }, + { + "acc": 0.65828705, + "epoch": 0.4658802638254693, + "grad_norm": 5.75, + "learning_rate": 9.112590189679604e-06, + "loss": 1.61647129, + "memory(GiB)": 97.17, + "step": 18365, + "train_speed(iter/s)": 1.632228 + }, + { + "acc": 0.64962778, + "epoch": 0.46600710299340437, + "grad_norm": 4.6875, + "learning_rate": 9.111993706031109e-06, + "loss": 1.60085125, + "memory(GiB)": 97.17, + "step": 18370, + "train_speed(iter/s)": 1.632275 + }, + { + "acc": 0.66088781, + "epoch": 0.4661339421613394, + "grad_norm": 5.53125, + "learning_rate": 9.111397041518774e-06, + "loss": 1.61294212, + "memory(GiB)": 97.17, + "step": 18375, + "train_speed(iter/s)": 1.632322 + }, + { + "acc": 0.65508494, + "epoch": 0.46626078132927445, + "grad_norm": 5.5625, + "learning_rate": 9.11080019616884e-06, + "loss": 1.59683514, + "memory(GiB)": 97.17, + "step": 18380, + "train_speed(iter/s)": 1.632373 + }, + { + "acc": 0.66044416, + "epoch": 0.46638762049720955, + "grad_norm": 6.03125, + "learning_rate": 9.11020317000756e-06, + "loss": 1.63567581, + "memory(GiB)": 97.17, + "step": 18385, + "train_speed(iter/s)": 1.632422 + }, + { + "acc": 0.63134689, + "epoch": 0.4665144596651446, + "grad_norm": 5.71875, + "learning_rate": 9.109605963061194e-06, + "loss": 1.73026295, + "memory(GiB)": 97.17, + "step": 18390, + "train_speed(iter/s)": 1.632468 + }, + { + "acc": 0.66334658, + "epoch": 0.46664129883307964, + "grad_norm": 6.375, + "learning_rate": 9.10900857535601e-06, + "loss": 1.573347, + "memory(GiB)": 97.17, + "step": 18395, + "train_speed(iter/s)": 1.632518 + }, + { + "acc": 0.63498793, + "epoch": 0.4667681380010147, + "grad_norm": 6.125, + "learning_rate": 9.108411006918283e-06, + "loss": 1.68694935, + "memory(GiB)": 97.17, + "step": 18400, + "train_speed(iter/s)": 1.632569 + }, + { + "acc": 0.65896225, + "epoch": 0.4668949771689498, + "grad_norm": 6.6875, + "learning_rate": 9.107813257774298e-06, + "loss": 1.63279266, + "memory(GiB)": 97.17, + "step": 18405, + "train_speed(iter/s)": 1.632621 + }, + { + "acc": 0.64386454, + "epoch": 0.46702181633688483, + "grad_norm": 5.9375, + "learning_rate": 9.107215327950345e-06, + "loss": 1.62228317, + "memory(GiB)": 97.17, + "step": 18410, + "train_speed(iter/s)": 1.632669 + }, + { + "acc": 0.65005245, + "epoch": 0.4671486555048199, + "grad_norm": 6.125, + "learning_rate": 9.106617217472724e-06, + "loss": 1.6490984, + "memory(GiB)": 97.17, + "step": 18415, + "train_speed(iter/s)": 1.63272 + }, + { + "acc": 0.64562855, + "epoch": 0.4672754946727549, + "grad_norm": 5.34375, + "learning_rate": 9.106018926367744e-06, + "loss": 1.63344383, + "memory(GiB)": 97.17, + "step": 18420, + "train_speed(iter/s)": 1.63277 + }, + { + "acc": 0.65251913, + "epoch": 0.46740233384069, + "grad_norm": 5.46875, + "learning_rate": 9.10542045466172e-06, + "loss": 1.63472099, + "memory(GiB)": 97.17, + "step": 18425, + "train_speed(iter/s)": 1.632817 + }, + { + "acc": 0.65662622, + "epoch": 0.46752917300862507, + "grad_norm": 5.78125, + "learning_rate": 9.104821802380974e-06, + "loss": 1.66220341, + "memory(GiB)": 97.17, + "step": 18430, + "train_speed(iter/s)": 1.632866 + }, + { + "acc": 0.65970116, + "epoch": 0.4676560121765601, + "grad_norm": 5.25, + "learning_rate": 9.104222969551838e-06, + "loss": 1.53581562, + "memory(GiB)": 97.17, + "step": 18435, + "train_speed(iter/s)": 1.632914 + }, + { + "acc": 0.66083355, + "epoch": 0.46778285134449515, + "grad_norm": 5.9375, + "learning_rate": 9.103623956200654e-06, + "loss": 1.63388538, + "memory(GiB)": 97.17, + "step": 18440, + "train_speed(iter/s)": 1.632962 + }, + { + "acc": 0.648207, + "epoch": 0.46790969051243025, + "grad_norm": 6.0, + "learning_rate": 9.103024762353766e-06, + "loss": 1.69622955, + "memory(GiB)": 97.17, + "step": 18445, + "train_speed(iter/s)": 1.633007 + }, + { + "acc": 0.66191196, + "epoch": 0.4680365296803653, + "grad_norm": 4.9375, + "learning_rate": 9.102425388037527e-06, + "loss": 1.58526535, + "memory(GiB)": 97.17, + "step": 18450, + "train_speed(iter/s)": 1.633057 + }, + { + "acc": 0.65277543, + "epoch": 0.46816336884830034, + "grad_norm": 5.6875, + "learning_rate": 9.101825833278308e-06, + "loss": 1.62531395, + "memory(GiB)": 97.17, + "step": 18455, + "train_speed(iter/s)": 1.633106 + }, + { + "acc": 0.65889072, + "epoch": 0.4682902080162354, + "grad_norm": 5.5625, + "learning_rate": 9.101226098102473e-06, + "loss": 1.56215038, + "memory(GiB)": 97.17, + "step": 18460, + "train_speed(iter/s)": 1.633156 + }, + { + "acc": 0.65938935, + "epoch": 0.4684170471841705, + "grad_norm": 8.0, + "learning_rate": 9.100626182536405e-06, + "loss": 1.64778728, + "memory(GiB)": 97.17, + "step": 18465, + "train_speed(iter/s)": 1.633204 + }, + { + "acc": 0.65993805, + "epoch": 0.46854388635210553, + "grad_norm": 5.84375, + "learning_rate": 9.100026086606488e-06, + "loss": 1.58330822, + "memory(GiB)": 97.17, + "step": 18470, + "train_speed(iter/s)": 1.633255 + }, + { + "acc": 0.67940564, + "epoch": 0.4686707255200406, + "grad_norm": 5.90625, + "learning_rate": 9.09942581033912e-06, + "loss": 1.5305685, + "memory(GiB)": 97.17, + "step": 18475, + "train_speed(iter/s)": 1.633303 + }, + { + "acc": 0.66762815, + "epoch": 0.4687975646879756, + "grad_norm": 5.90625, + "learning_rate": 9.0988253537607e-06, + "loss": 1.52450085, + "memory(GiB)": 97.17, + "step": 18480, + "train_speed(iter/s)": 1.633352 + }, + { + "acc": 0.66718826, + "epoch": 0.4689244038559107, + "grad_norm": 4.90625, + "learning_rate": 9.098224716897644e-06, + "loss": 1.59068661, + "memory(GiB)": 97.17, + "step": 18485, + "train_speed(iter/s)": 1.633399 + }, + { + "acc": 0.66398401, + "epoch": 0.46905124302384577, + "grad_norm": 5.6875, + "learning_rate": 9.097623899776366e-06, + "loss": 1.58663921, + "memory(GiB)": 97.17, + "step": 18490, + "train_speed(iter/s)": 1.633448 + }, + { + "acc": 0.64091969, + "epoch": 0.4691780821917808, + "grad_norm": 6.25, + "learning_rate": 9.097022902423294e-06, + "loss": 1.59719944, + "memory(GiB)": 97.17, + "step": 18495, + "train_speed(iter/s)": 1.633498 + }, + { + "acc": 0.65153608, + "epoch": 0.46930492135971585, + "grad_norm": 5.21875, + "learning_rate": 9.096421724864864e-06, + "loss": 1.63566704, + "memory(GiB)": 97.17, + "step": 18500, + "train_speed(iter/s)": 1.633546 + }, + { + "acc": 0.65336642, + "epoch": 0.46943176052765095, + "grad_norm": 5.28125, + "learning_rate": 9.095820367127517e-06, + "loss": 1.64004707, + "memory(GiB)": 97.17, + "step": 18505, + "train_speed(iter/s)": 1.633593 + }, + { + "acc": 0.64430351, + "epoch": 0.469558599695586, + "grad_norm": 5.5, + "learning_rate": 9.095218829237703e-06, + "loss": 1.64820118, + "memory(GiB)": 97.17, + "step": 18510, + "train_speed(iter/s)": 1.633642 + }, + { + "acc": 0.63728056, + "epoch": 0.46968543886352104, + "grad_norm": 5.5625, + "learning_rate": 9.094617111221881e-06, + "loss": 1.6593441, + "memory(GiB)": 97.17, + "step": 18515, + "train_speed(iter/s)": 1.63369 + }, + { + "acc": 0.64452839, + "epoch": 0.4698122780314561, + "grad_norm": 4.65625, + "learning_rate": 9.09401521310652e-06, + "loss": 1.58186817, + "memory(GiB)": 97.17, + "step": 18520, + "train_speed(iter/s)": 1.633741 + }, + { + "acc": 0.67075787, + "epoch": 0.4699391171993912, + "grad_norm": 5.1875, + "learning_rate": 9.093413134918088e-06, + "loss": 1.55130606, + "memory(GiB)": 97.17, + "step": 18525, + "train_speed(iter/s)": 1.633792 + }, + { + "acc": 0.66500759, + "epoch": 0.47006595636732623, + "grad_norm": 5.21875, + "learning_rate": 9.09281087668307e-06, + "loss": 1.60559177, + "memory(GiB)": 97.17, + "step": 18530, + "train_speed(iter/s)": 1.63384 + }, + { + "acc": 0.6658442, + "epoch": 0.4701927955352613, + "grad_norm": 6.09375, + "learning_rate": 9.09220843842796e-06, + "loss": 1.58783302, + "memory(GiB)": 97.17, + "step": 18535, + "train_speed(iter/s)": 1.63389 + }, + { + "acc": 0.64500914, + "epoch": 0.4703196347031963, + "grad_norm": 5.09375, + "learning_rate": 9.09160582017925e-06, + "loss": 1.60298882, + "memory(GiB)": 97.17, + "step": 18540, + "train_speed(iter/s)": 1.63394 + }, + { + "acc": 0.63264341, + "epoch": 0.4704464738711314, + "grad_norm": 5.25, + "learning_rate": 9.091003021963449e-06, + "loss": 1.68952332, + "memory(GiB)": 97.17, + "step": 18545, + "train_speed(iter/s)": 1.633991 + }, + { + "acc": 0.65284557, + "epoch": 0.47057331303906647, + "grad_norm": 5.5, + "learning_rate": 9.09040004380707e-06, + "loss": 1.6526825, + "memory(GiB)": 97.17, + "step": 18550, + "train_speed(iter/s)": 1.634037 + }, + { + "acc": 0.64369583, + "epoch": 0.4707001522070015, + "grad_norm": 5.46875, + "learning_rate": 9.089796885736637e-06, + "loss": 1.62356949, + "memory(GiB)": 97.17, + "step": 18555, + "train_speed(iter/s)": 1.634085 + }, + { + "acc": 0.66493397, + "epoch": 0.47082699137493655, + "grad_norm": 4.71875, + "learning_rate": 9.089193547778674e-06, + "loss": 1.5830575, + "memory(GiB)": 97.17, + "step": 18560, + "train_speed(iter/s)": 1.634132 + }, + { + "acc": 0.64886351, + "epoch": 0.47095383054287165, + "grad_norm": 5.375, + "learning_rate": 9.088590029959724e-06, + "loss": 1.604776, + "memory(GiB)": 97.17, + "step": 18565, + "train_speed(iter/s)": 1.63418 + }, + { + "acc": 0.64658508, + "epoch": 0.4710806697108067, + "grad_norm": 6.125, + "learning_rate": 9.08798633230633e-06, + "loss": 1.70517406, + "memory(GiB)": 97.17, + "step": 18570, + "train_speed(iter/s)": 1.634228 + }, + { + "acc": 0.66426773, + "epoch": 0.47120750887874174, + "grad_norm": 4.875, + "learning_rate": 9.087382454845044e-06, + "loss": 1.66563606, + "memory(GiB)": 97.17, + "step": 18575, + "train_speed(iter/s)": 1.634276 + }, + { + "acc": 0.64232955, + "epoch": 0.4713343480466768, + "grad_norm": 5.15625, + "learning_rate": 9.08677839760243e-06, + "loss": 1.67427292, + "memory(GiB)": 97.17, + "step": 18580, + "train_speed(iter/s)": 1.634327 + }, + { + "acc": 0.63752437, + "epoch": 0.4714611872146119, + "grad_norm": 5.5625, + "learning_rate": 9.086174160605055e-06, + "loss": 1.62534065, + "memory(GiB)": 97.17, + "step": 18585, + "train_speed(iter/s)": 1.634379 + }, + { + "acc": 0.64555354, + "epoch": 0.47158802638254693, + "grad_norm": 4.84375, + "learning_rate": 9.085569743879498e-06, + "loss": 1.65529881, + "memory(GiB)": 97.17, + "step": 18590, + "train_speed(iter/s)": 1.634424 + }, + { + "acc": 0.65081263, + "epoch": 0.471714865550482, + "grad_norm": 5.0, + "learning_rate": 9.084965147452342e-06, + "loss": 1.61330452, + "memory(GiB)": 97.17, + "step": 18595, + "train_speed(iter/s)": 1.634474 + }, + { + "acc": 0.65549164, + "epoch": 0.471841704718417, + "grad_norm": 5.15625, + "learning_rate": 9.084360371350182e-06, + "loss": 1.60776005, + "memory(GiB)": 97.17, + "step": 18600, + "train_speed(iter/s)": 1.63452 + }, + { + "acc": 0.65038986, + "epoch": 0.4719685438863521, + "grad_norm": 5.09375, + "learning_rate": 9.083755415599617e-06, + "loss": 1.61372547, + "memory(GiB)": 97.17, + "step": 18605, + "train_speed(iter/s)": 1.63457 + }, + { + "acc": 0.65071692, + "epoch": 0.47209538305428717, + "grad_norm": 5.65625, + "learning_rate": 9.083150280227255e-06, + "loss": 1.58352623, + "memory(GiB)": 97.17, + "step": 18610, + "train_speed(iter/s)": 1.63462 + }, + { + "acc": 0.65439391, + "epoch": 0.4722222222222222, + "grad_norm": 5.28125, + "learning_rate": 9.082544965259716e-06, + "loss": 1.59898672, + "memory(GiB)": 97.17, + "step": 18615, + "train_speed(iter/s)": 1.634667 + }, + { + "acc": 0.66267424, + "epoch": 0.47234906139015725, + "grad_norm": 5.34375, + "learning_rate": 9.081939470723619e-06, + "loss": 1.61046295, + "memory(GiB)": 97.17, + "step": 18620, + "train_speed(iter/s)": 1.634718 + }, + { + "acc": 0.63989754, + "epoch": 0.47247590055809235, + "grad_norm": 6.5, + "learning_rate": 9.081333796645603e-06, + "loss": 1.62868729, + "memory(GiB)": 97.17, + "step": 18625, + "train_speed(iter/s)": 1.63477 + }, + { + "acc": 0.65537705, + "epoch": 0.4726027397260274, + "grad_norm": 6.0625, + "learning_rate": 9.080727943052304e-06, + "loss": 1.59164009, + "memory(GiB)": 97.17, + "step": 18630, + "train_speed(iter/s)": 1.634822 + }, + { + "acc": 0.64680939, + "epoch": 0.47272957889396244, + "grad_norm": 5.71875, + "learning_rate": 9.080121909970369e-06, + "loss": 1.63066101, + "memory(GiB)": 97.17, + "step": 18635, + "train_speed(iter/s)": 1.634869 + }, + { + "acc": 0.65633183, + "epoch": 0.4728564180618975, + "grad_norm": 6.4375, + "learning_rate": 9.079515697426457e-06, + "loss": 1.57321014, + "memory(GiB)": 97.17, + "step": 18640, + "train_speed(iter/s)": 1.634918 + }, + { + "acc": 0.65313835, + "epoch": 0.4729832572298326, + "grad_norm": 5.84375, + "learning_rate": 9.078909305447231e-06, + "loss": 1.62922554, + "memory(GiB)": 97.17, + "step": 18645, + "train_speed(iter/s)": 1.634968 + }, + { + "acc": 0.66686578, + "epoch": 0.47311009639776763, + "grad_norm": 5.1875, + "learning_rate": 9.078302734059363e-06, + "loss": 1.5349041, + "memory(GiB)": 97.17, + "step": 18650, + "train_speed(iter/s)": 1.635018 + }, + { + "acc": 0.66314716, + "epoch": 0.4732369355657027, + "grad_norm": 6.1875, + "learning_rate": 9.077695983289531e-06, + "loss": 1.60294647, + "memory(GiB)": 97.17, + "step": 18655, + "train_speed(iter/s)": 1.635065 + }, + { + "acc": 0.64946804, + "epoch": 0.4733637747336377, + "grad_norm": 4.90625, + "learning_rate": 9.077089053164426e-06, + "loss": 1.62236404, + "memory(GiB)": 97.17, + "step": 18660, + "train_speed(iter/s)": 1.635116 + }, + { + "acc": 0.65981016, + "epoch": 0.4734906139015728, + "grad_norm": 4.875, + "learning_rate": 9.076481943710742e-06, + "loss": 1.56549664, + "memory(GiB)": 97.17, + "step": 18665, + "train_speed(iter/s)": 1.635163 + }, + { + "acc": 0.65033021, + "epoch": 0.47361745306950787, + "grad_norm": 6.3125, + "learning_rate": 9.075874654955178e-06, + "loss": 1.61561012, + "memory(GiB)": 97.17, + "step": 18670, + "train_speed(iter/s)": 1.635214 + }, + { + "acc": 0.64855671, + "epoch": 0.4737442922374429, + "grad_norm": 4.6875, + "learning_rate": 9.075267186924453e-06, + "loss": 1.65048275, + "memory(GiB)": 97.17, + "step": 18675, + "train_speed(iter/s)": 1.635263 + }, + { + "acc": 0.64401731, + "epoch": 0.47387113140537795, + "grad_norm": 5.53125, + "learning_rate": 9.074659539645281e-06, + "loss": 1.60564995, + "memory(GiB)": 97.17, + "step": 18680, + "train_speed(iter/s)": 1.63531 + }, + { + "acc": 0.64505358, + "epoch": 0.47399797057331305, + "grad_norm": 5.8125, + "learning_rate": 9.074051713144392e-06, + "loss": 1.69649239, + "memory(GiB)": 97.17, + "step": 18685, + "train_speed(iter/s)": 1.635362 + }, + { + "acc": 0.65669184, + "epoch": 0.4741248097412481, + "grad_norm": 6.03125, + "learning_rate": 9.073443707448517e-06, + "loss": 1.63959579, + "memory(GiB)": 97.17, + "step": 18690, + "train_speed(iter/s)": 1.635412 + }, + { + "acc": 0.64006977, + "epoch": 0.47425164890918314, + "grad_norm": 6.625, + "learning_rate": 9.072835522584402e-06, + "loss": 1.67402325, + "memory(GiB)": 97.17, + "step": 18695, + "train_speed(iter/s)": 1.635463 + }, + { + "acc": 0.66570463, + "epoch": 0.4743784880771182, + "grad_norm": 6.25, + "learning_rate": 9.072227158578798e-06, + "loss": 1.56113482, + "memory(GiB)": 97.17, + "step": 18700, + "train_speed(iter/s)": 1.635509 + }, + { + "acc": 0.64029751, + "epoch": 0.4745053272450533, + "grad_norm": 6.59375, + "learning_rate": 9.07161861545846e-06, + "loss": 1.61435833, + "memory(GiB)": 97.17, + "step": 18705, + "train_speed(iter/s)": 1.635557 + }, + { + "acc": 0.64389362, + "epoch": 0.47463216641298833, + "grad_norm": 7.09375, + "learning_rate": 9.071009893250158e-06, + "loss": 1.70898781, + "memory(GiB)": 97.17, + "step": 18710, + "train_speed(iter/s)": 1.635608 + }, + { + "acc": 0.64417443, + "epoch": 0.4747590055809234, + "grad_norm": 6.15625, + "learning_rate": 9.070400991980666e-06, + "loss": 1.64150505, + "memory(GiB)": 97.17, + "step": 18715, + "train_speed(iter/s)": 1.635656 + }, + { + "acc": 0.66944103, + "epoch": 0.4748858447488584, + "grad_norm": 5.28125, + "learning_rate": 9.069791911676765e-06, + "loss": 1.60371284, + "memory(GiB)": 97.17, + "step": 18720, + "train_speed(iter/s)": 1.635705 + }, + { + "acc": 0.669384, + "epoch": 0.4750126839167935, + "grad_norm": 5.59375, + "learning_rate": 9.069182652365245e-06, + "loss": 1.57409821, + "memory(GiB)": 97.17, + "step": 18725, + "train_speed(iter/s)": 1.635755 + }, + { + "acc": 0.6405951, + "epoch": 0.47513952308472857, + "grad_norm": 6.03125, + "learning_rate": 9.068573214072904e-06, + "loss": 1.71492043, + "memory(GiB)": 97.17, + "step": 18730, + "train_speed(iter/s)": 1.635806 + }, + { + "acc": 0.64446325, + "epoch": 0.4752663622526636, + "grad_norm": 4.6875, + "learning_rate": 9.067963596826547e-06, + "loss": 1.65332108, + "memory(GiB)": 97.17, + "step": 18735, + "train_speed(iter/s)": 1.635854 + }, + { + "acc": 0.65595675, + "epoch": 0.47539320142059865, + "grad_norm": 5.25, + "learning_rate": 9.067353800652991e-06, + "loss": 1.64717388, + "memory(GiB)": 97.17, + "step": 18740, + "train_speed(iter/s)": 1.635903 + }, + { + "acc": 0.65872364, + "epoch": 0.47552004058853375, + "grad_norm": 5.75, + "learning_rate": 9.066743825579056e-06, + "loss": 1.61537857, + "memory(GiB)": 97.17, + "step": 18745, + "train_speed(iter/s)": 1.635951 + }, + { + "acc": 0.66051412, + "epoch": 0.4756468797564688, + "grad_norm": 6.21875, + "learning_rate": 9.06613367163157e-06, + "loss": 1.54608269, + "memory(GiB)": 97.17, + "step": 18750, + "train_speed(iter/s)": 1.636001 + }, + { + "acc": 0.6519444, + "epoch": 0.47577371892440384, + "grad_norm": 5.75, + "learning_rate": 9.06552333883737e-06, + "loss": 1.61317806, + "memory(GiB)": 97.17, + "step": 18755, + "train_speed(iter/s)": 1.636049 + }, + { + "acc": 0.65443516, + "epoch": 0.4759005580923389, + "grad_norm": 7.0, + "learning_rate": 9.064912827223303e-06, + "loss": 1.62242889, + "memory(GiB)": 97.17, + "step": 18760, + "train_speed(iter/s)": 1.636098 + }, + { + "acc": 0.65507555, + "epoch": 0.476027397260274, + "grad_norm": 5.21875, + "learning_rate": 9.06430213681622e-06, + "loss": 1.61009979, + "memory(GiB)": 97.17, + "step": 18765, + "train_speed(iter/s)": 1.636147 + }, + { + "acc": 0.63461132, + "epoch": 0.47615423642820903, + "grad_norm": 5.1875, + "learning_rate": 9.063691267642987e-06, + "loss": 1.70338039, + "memory(GiB)": 97.17, + "step": 18770, + "train_speed(iter/s)": 1.636198 + }, + { + "acc": 0.65015697, + "epoch": 0.4762810755961441, + "grad_norm": 5.625, + "learning_rate": 9.063080219730467e-06, + "loss": 1.64285851, + "memory(GiB)": 97.17, + "step": 18775, + "train_speed(iter/s)": 1.636251 + }, + { + "acc": 0.64973965, + "epoch": 0.4764079147640791, + "grad_norm": 5.84375, + "learning_rate": 9.062468993105538e-06, + "loss": 1.61367493, + "memory(GiB)": 97.17, + "step": 18780, + "train_speed(iter/s)": 1.636299 + }, + { + "acc": 0.67192078, + "epoch": 0.4765347539320142, + "grad_norm": 5.75, + "learning_rate": 9.061857587795084e-06, + "loss": 1.52915649, + "memory(GiB)": 97.17, + "step": 18785, + "train_speed(iter/s)": 1.636349 + }, + { + "acc": 0.65035758, + "epoch": 0.47666159309994927, + "grad_norm": 6.0, + "learning_rate": 9.061246003826e-06, + "loss": 1.71583366, + "memory(GiB)": 97.17, + "step": 18790, + "train_speed(iter/s)": 1.636399 + }, + { + "acc": 0.64987903, + "epoch": 0.4767884322678843, + "grad_norm": 6.125, + "learning_rate": 9.060634241225184e-06, + "loss": 1.64932785, + "memory(GiB)": 97.17, + "step": 18795, + "train_speed(iter/s)": 1.636446 + }, + { + "acc": 0.65373144, + "epoch": 0.47691527143581935, + "grad_norm": 5.65625, + "learning_rate": 9.060022300019546e-06, + "loss": 1.61680775, + "memory(GiB)": 97.17, + "step": 18800, + "train_speed(iter/s)": 1.636499 + }, + { + "acc": 0.65803919, + "epoch": 0.47704211060375445, + "grad_norm": 5.875, + "learning_rate": 9.059410180236e-06, + "loss": 1.57195053, + "memory(GiB)": 97.17, + "step": 18805, + "train_speed(iter/s)": 1.636546 + }, + { + "acc": 0.65733013, + "epoch": 0.4771689497716895, + "grad_norm": 4.90625, + "learning_rate": 9.058797881901469e-06, + "loss": 1.5844697, + "memory(GiB)": 97.17, + "step": 18810, + "train_speed(iter/s)": 1.636592 + }, + { + "acc": 0.6689486, + "epoch": 0.47729578893962454, + "grad_norm": 6.34375, + "learning_rate": 9.058185405042886e-06, + "loss": 1.5118207, + "memory(GiB)": 97.17, + "step": 18815, + "train_speed(iter/s)": 1.636642 + }, + { + "acc": 0.65490046, + "epoch": 0.4774226281075596, + "grad_norm": 5.21875, + "learning_rate": 9.05757274968719e-06, + "loss": 1.69743958, + "memory(GiB)": 97.17, + "step": 18820, + "train_speed(iter/s)": 1.636688 + }, + { + "acc": 0.65074377, + "epoch": 0.4775494672754947, + "grad_norm": 6.09375, + "learning_rate": 9.056959915861331e-06, + "loss": 1.58289957, + "memory(GiB)": 97.17, + "step": 18825, + "train_speed(iter/s)": 1.636736 + }, + { + "acc": 0.64664888, + "epoch": 0.47767630644342973, + "grad_norm": 6.4375, + "learning_rate": 9.056346903592262e-06, + "loss": 1.6505188, + "memory(GiB)": 97.17, + "step": 18830, + "train_speed(iter/s)": 1.636784 + }, + { + "acc": 0.64677014, + "epoch": 0.4778031456113648, + "grad_norm": 4.84375, + "learning_rate": 9.055733712906943e-06, + "loss": 1.61426468, + "memory(GiB)": 97.17, + "step": 18835, + "train_speed(iter/s)": 1.636832 + }, + { + "acc": 0.64425516, + "epoch": 0.4779299847792998, + "grad_norm": 5.84375, + "learning_rate": 9.05512034383235e-06, + "loss": 1.62231998, + "memory(GiB)": 97.17, + "step": 18840, + "train_speed(iter/s)": 1.63688 + }, + { + "acc": 0.63516402, + "epoch": 0.4780568239472349, + "grad_norm": 5.53125, + "learning_rate": 9.054506796395458e-06, + "loss": 1.64320946, + "memory(GiB)": 97.17, + "step": 18845, + "train_speed(iter/s)": 1.636929 + }, + { + "acc": 0.65735517, + "epoch": 0.47818366311516997, + "grad_norm": 5.75, + "learning_rate": 9.053893070623256e-06, + "loss": 1.65515213, + "memory(GiB)": 97.17, + "step": 18850, + "train_speed(iter/s)": 1.636978 + }, + { + "acc": 0.66131325, + "epoch": 0.478310502283105, + "grad_norm": 6.09375, + "learning_rate": 9.053279166542738e-06, + "loss": 1.63779106, + "memory(GiB)": 97.17, + "step": 18855, + "train_speed(iter/s)": 1.637027 + }, + { + "acc": 0.66859837, + "epoch": 0.47843734145104005, + "grad_norm": 5.84375, + "learning_rate": 9.052665084180906e-06, + "loss": 1.58122597, + "memory(GiB)": 97.17, + "step": 18860, + "train_speed(iter/s)": 1.637074 + }, + { + "acc": 0.64981456, + "epoch": 0.47856418061897515, + "grad_norm": 5.5, + "learning_rate": 9.052050823564767e-06, + "loss": 1.63787308, + "memory(GiB)": 97.17, + "step": 18865, + "train_speed(iter/s)": 1.637123 + }, + { + "acc": 0.66255226, + "epoch": 0.4786910197869102, + "grad_norm": 4.9375, + "learning_rate": 9.051436384721344e-06, + "loss": 1.59685802, + "memory(GiB)": 97.17, + "step": 18870, + "train_speed(iter/s)": 1.637172 + }, + { + "acc": 0.67633996, + "epoch": 0.47881785895484524, + "grad_norm": 5.4375, + "learning_rate": 9.05082176767766e-06, + "loss": 1.52323971, + "memory(GiB)": 97.17, + "step": 18875, + "train_speed(iter/s)": 1.637223 + }, + { + "acc": 0.6529479, + "epoch": 0.4789446981227803, + "grad_norm": 4.5625, + "learning_rate": 9.050206972460749e-06, + "loss": 1.61393414, + "memory(GiB)": 97.17, + "step": 18880, + "train_speed(iter/s)": 1.637273 + }, + { + "acc": 0.65651464, + "epoch": 0.4790715372907154, + "grad_norm": 6.09375, + "learning_rate": 9.049591999097651e-06, + "loss": 1.65935745, + "memory(GiB)": 97.17, + "step": 18885, + "train_speed(iter/s)": 1.637323 + }, + { + "acc": 0.63807549, + "epoch": 0.47919837645865043, + "grad_norm": 5.96875, + "learning_rate": 9.048976847615418e-06, + "loss": 1.65334721, + "memory(GiB)": 97.17, + "step": 18890, + "train_speed(iter/s)": 1.637373 + }, + { + "acc": 0.64474735, + "epoch": 0.4793252156265855, + "grad_norm": 5.09375, + "learning_rate": 9.048361518041107e-06, + "loss": 1.64946785, + "memory(GiB)": 97.17, + "step": 18895, + "train_speed(iter/s)": 1.63742 + }, + { + "acc": 0.66650367, + "epoch": 0.4794520547945205, + "grad_norm": 4.8125, + "learning_rate": 9.04774601040178e-06, + "loss": 1.53804979, + "memory(GiB)": 97.17, + "step": 18900, + "train_speed(iter/s)": 1.637471 + }, + { + "acc": 0.64678864, + "epoch": 0.4795788939624556, + "grad_norm": 7.34375, + "learning_rate": 9.047130324724513e-06, + "loss": 1.61122131, + "memory(GiB)": 97.17, + "step": 18905, + "train_speed(iter/s)": 1.637519 + }, + { + "acc": 0.64797907, + "epoch": 0.47970573313039067, + "grad_norm": 8.5, + "learning_rate": 9.046514461036385e-06, + "loss": 1.63961678, + "memory(GiB)": 97.17, + "step": 18910, + "train_speed(iter/s)": 1.637561 + }, + { + "acc": 0.64692249, + "epoch": 0.4798325722983257, + "grad_norm": 4.90625, + "learning_rate": 9.045898419364483e-06, + "loss": 1.6343441, + "memory(GiB)": 97.17, + "step": 18915, + "train_speed(iter/s)": 1.637611 + }, + { + "acc": 0.6598197, + "epoch": 0.47995941146626075, + "grad_norm": 4.5, + "learning_rate": 9.045282199735906e-06, + "loss": 1.58556423, + "memory(GiB)": 97.17, + "step": 18920, + "train_speed(iter/s)": 1.63766 + }, + { + "acc": 0.63780127, + "epoch": 0.48008625063419585, + "grad_norm": 5.90625, + "learning_rate": 9.044665802177756e-06, + "loss": 1.75557442, + "memory(GiB)": 97.17, + "step": 18925, + "train_speed(iter/s)": 1.637705 + }, + { + "acc": 0.65323682, + "epoch": 0.4802130898021309, + "grad_norm": 9.0625, + "learning_rate": 9.044049226717148e-06, + "loss": 1.60862904, + "memory(GiB)": 97.17, + "step": 18930, + "train_speed(iter/s)": 1.637752 + }, + { + "acc": 0.6711524, + "epoch": 0.48033992897006594, + "grad_norm": 5.375, + "learning_rate": 9.043432473381198e-06, + "loss": 1.58349142, + "memory(GiB)": 97.17, + "step": 18935, + "train_speed(iter/s)": 1.637799 + }, + { + "acc": 0.6529037, + "epoch": 0.480466768138001, + "grad_norm": 5.78125, + "learning_rate": 9.042815542197037e-06, + "loss": 1.64929562, + "memory(GiB)": 97.17, + "step": 18940, + "train_speed(iter/s)": 1.637847 + }, + { + "acc": 0.64694366, + "epoch": 0.4805936073059361, + "grad_norm": 5.4375, + "learning_rate": 9.042198433191796e-06, + "loss": 1.57790012, + "memory(GiB)": 97.17, + "step": 18945, + "train_speed(iter/s)": 1.637896 + }, + { + "acc": 0.64720192, + "epoch": 0.48072044647387113, + "grad_norm": 6.5625, + "learning_rate": 9.041581146392621e-06, + "loss": 1.65945187, + "memory(GiB)": 97.17, + "step": 18950, + "train_speed(iter/s)": 1.637943 + }, + { + "acc": 0.64622927, + "epoch": 0.4808472856418062, + "grad_norm": 6.21875, + "learning_rate": 9.040963681826665e-06, + "loss": 1.5980423, + "memory(GiB)": 97.17, + "step": 18955, + "train_speed(iter/s)": 1.637996 + }, + { + "acc": 0.65423183, + "epoch": 0.4809741248097412, + "grad_norm": 5.71875, + "learning_rate": 9.040346039521085e-06, + "loss": 1.63318958, + "memory(GiB)": 97.17, + "step": 18960, + "train_speed(iter/s)": 1.638046 + }, + { + "acc": 0.66483846, + "epoch": 0.4811009639776763, + "grad_norm": 5.0625, + "learning_rate": 9.039728219503044e-06, + "loss": 1.56115227, + "memory(GiB)": 97.17, + "step": 18965, + "train_speed(iter/s)": 1.638095 + }, + { + "acc": 0.66229806, + "epoch": 0.48122780314561137, + "grad_norm": 4.9375, + "learning_rate": 9.039110221799721e-06, + "loss": 1.57973385, + "memory(GiB)": 97.17, + "step": 18970, + "train_speed(iter/s)": 1.638144 + }, + { + "acc": 0.65636435, + "epoch": 0.4813546423135464, + "grad_norm": 5.6875, + "learning_rate": 9.038492046438298e-06, + "loss": 1.62687855, + "memory(GiB)": 97.17, + "step": 18975, + "train_speed(iter/s)": 1.638193 + }, + { + "acc": 0.65604401, + "epoch": 0.48148148148148145, + "grad_norm": 6.4375, + "learning_rate": 9.037873693445965e-06, + "loss": 1.63516674, + "memory(GiB)": 97.17, + "step": 18980, + "train_speed(iter/s)": 1.63824 + }, + { + "acc": 0.65956035, + "epoch": 0.48160832064941655, + "grad_norm": 5.125, + "learning_rate": 9.037255162849918e-06, + "loss": 1.61233826, + "memory(GiB)": 97.17, + "step": 18985, + "train_speed(iter/s)": 1.63829 + }, + { + "acc": 0.65382061, + "epoch": 0.4817351598173516, + "grad_norm": 5.90625, + "learning_rate": 9.036636454677363e-06, + "loss": 1.60012722, + "memory(GiB)": 97.17, + "step": 18990, + "train_speed(iter/s)": 1.63834 + }, + { + "acc": 0.64359808, + "epoch": 0.48186199898528664, + "grad_norm": 6.0625, + "learning_rate": 9.036017568955516e-06, + "loss": 1.63891335, + "memory(GiB)": 97.17, + "step": 18995, + "train_speed(iter/s)": 1.638387 + }, + { + "acc": 0.65544543, + "epoch": 0.4819888381532217, + "grad_norm": 4.5, + "learning_rate": 9.035398505711597e-06, + "loss": 1.64487362, + "memory(GiB)": 97.17, + "step": 19000, + "train_speed(iter/s)": 1.638436 + }, + { + "epoch": 0.4819888381532217, + "eval_acc": 0.6435002516408865, + "eval_loss": 1.5891553163528442, + "eval_runtime": 58.1883, + "eval_samples_per_second": 109.472, + "eval_steps_per_second": 27.377, + "step": 19000 + }, + { + "acc": 0.65876441, + "epoch": 0.4821156773211568, + "grad_norm": 6.125, + "learning_rate": 9.034779264972834e-06, + "loss": 1.55358868, + "memory(GiB)": 97.17, + "step": 19005, + "train_speed(iter/s)": 1.629709 + }, + { + "acc": 0.66507697, + "epoch": 0.48224251648909183, + "grad_norm": 5.15625, + "learning_rate": 9.034159846766464e-06, + "loss": 1.59579582, + "memory(GiB)": 97.17, + "step": 19010, + "train_speed(iter/s)": 1.62976 + }, + { + "acc": 0.65994821, + "epoch": 0.4823693556570269, + "grad_norm": 5.8125, + "learning_rate": 9.033540251119734e-06, + "loss": 1.5259758, + "memory(GiB)": 97.17, + "step": 19015, + "train_speed(iter/s)": 1.629813 + }, + { + "acc": 0.6565865, + "epoch": 0.4824961948249619, + "grad_norm": 5.125, + "learning_rate": 9.032920478059897e-06, + "loss": 1.57405233, + "memory(GiB)": 97.17, + "step": 19020, + "train_speed(iter/s)": 1.629862 + }, + { + "acc": 0.65229812, + "epoch": 0.482623033992897, + "grad_norm": 5.25, + "learning_rate": 9.032300527614209e-06, + "loss": 1.67209702, + "memory(GiB)": 97.17, + "step": 19025, + "train_speed(iter/s)": 1.629915 + }, + { + "acc": 0.64681187, + "epoch": 0.48274987316083207, + "grad_norm": 5.875, + "learning_rate": 9.031680399809941e-06, + "loss": 1.66641884, + "memory(GiB)": 97.17, + "step": 19030, + "train_speed(iter/s)": 1.629967 + }, + { + "acc": 0.63678856, + "epoch": 0.4828767123287671, + "grad_norm": 5.03125, + "learning_rate": 9.031060094674371e-06, + "loss": 1.70992813, + "memory(GiB)": 97.17, + "step": 19035, + "train_speed(iter/s)": 1.630018 + }, + { + "acc": 0.64104605, + "epoch": 0.48300355149670215, + "grad_norm": 7.21875, + "learning_rate": 9.030439612234778e-06, + "loss": 1.60441246, + "memory(GiB)": 97.17, + "step": 19040, + "train_speed(iter/s)": 1.630072 + }, + { + "acc": 0.66000285, + "epoch": 0.48313039066463725, + "grad_norm": 5.03125, + "learning_rate": 9.029818952518458e-06, + "loss": 1.59520874, + "memory(GiB)": 97.17, + "step": 19045, + "train_speed(iter/s)": 1.630118 + }, + { + "acc": 0.67154431, + "epoch": 0.4832572298325723, + "grad_norm": 5.34375, + "learning_rate": 9.029198115552708e-06, + "loss": 1.54623356, + "memory(GiB)": 97.17, + "step": 19050, + "train_speed(iter/s)": 1.630168 + }, + { + "acc": 0.64939351, + "epoch": 0.48338406900050734, + "grad_norm": 5.4375, + "learning_rate": 9.028577101364837e-06, + "loss": 1.59926214, + "memory(GiB)": 97.17, + "step": 19055, + "train_speed(iter/s)": 1.630218 + }, + { + "acc": 0.66098413, + "epoch": 0.4835109081684424, + "grad_norm": 5.78125, + "learning_rate": 9.027955909982157e-06, + "loss": 1.58970289, + "memory(GiB)": 97.17, + "step": 19060, + "train_speed(iter/s)": 1.630268 + }, + { + "acc": 0.68105259, + "epoch": 0.4836377473363775, + "grad_norm": 5.0625, + "learning_rate": 9.027334541431993e-06, + "loss": 1.49056139, + "memory(GiB)": 97.17, + "step": 19065, + "train_speed(iter/s)": 1.630313 + }, + { + "acc": 0.65410118, + "epoch": 0.48376458650431253, + "grad_norm": 7.65625, + "learning_rate": 9.026712995741676e-06, + "loss": 1.59510307, + "memory(GiB)": 97.17, + "step": 19070, + "train_speed(iter/s)": 1.630361 + }, + { + "acc": 0.63339963, + "epoch": 0.4838914256722476, + "grad_norm": 5.71875, + "learning_rate": 9.026091272938543e-06, + "loss": 1.66985054, + "memory(GiB)": 97.17, + "step": 19075, + "train_speed(iter/s)": 1.630409 + }, + { + "acc": 0.65780506, + "epoch": 0.4840182648401826, + "grad_norm": 5.09375, + "learning_rate": 9.02546937304994e-06, + "loss": 1.59855032, + "memory(GiB)": 97.17, + "step": 19080, + "train_speed(iter/s)": 1.630461 + }, + { + "acc": 0.65102997, + "epoch": 0.4841451040081177, + "grad_norm": 5.5625, + "learning_rate": 9.02484729610322e-06, + "loss": 1.64531765, + "memory(GiB)": 97.17, + "step": 19085, + "train_speed(iter/s)": 1.630509 + }, + { + "acc": 0.64340091, + "epoch": 0.48427194317605277, + "grad_norm": 5.65625, + "learning_rate": 9.02422504212575e-06, + "loss": 1.68798866, + "memory(GiB)": 97.17, + "step": 19090, + "train_speed(iter/s)": 1.63056 + }, + { + "acc": 0.66295605, + "epoch": 0.4843987823439878, + "grad_norm": 4.90625, + "learning_rate": 9.023602611144893e-06, + "loss": 1.59011669, + "memory(GiB)": 97.17, + "step": 19095, + "train_speed(iter/s)": 1.63061 + }, + { + "acc": 0.64069386, + "epoch": 0.48452562151192285, + "grad_norm": 5.0625, + "learning_rate": 9.02298000318803e-06, + "loss": 1.6068161, + "memory(GiB)": 97.17, + "step": 19100, + "train_speed(iter/s)": 1.630656 + }, + { + "acc": 0.64794445, + "epoch": 0.48465246067985795, + "grad_norm": 5.75, + "learning_rate": 9.022357218282546e-06, + "loss": 1.60515022, + "memory(GiB)": 97.17, + "step": 19105, + "train_speed(iter/s)": 1.630703 + }, + { + "acc": 0.64888821, + "epoch": 0.484779299847793, + "grad_norm": 5.625, + "learning_rate": 9.021734256455832e-06, + "loss": 1.65507488, + "memory(GiB)": 97.17, + "step": 19110, + "train_speed(iter/s)": 1.63075 + }, + { + "acc": 0.68182416, + "epoch": 0.48490613901572804, + "grad_norm": 5.21875, + "learning_rate": 9.02111111773529e-06, + "loss": 1.537393, + "memory(GiB)": 97.17, + "step": 19115, + "train_speed(iter/s)": 1.630797 + }, + { + "acc": 0.65830941, + "epoch": 0.4850329781836631, + "grad_norm": 5.6875, + "learning_rate": 9.020487802148328e-06, + "loss": 1.54871597, + "memory(GiB)": 97.17, + "step": 19120, + "train_speed(iter/s)": 1.630849 + }, + { + "acc": 0.64492812, + "epoch": 0.4851598173515982, + "grad_norm": 5.46875, + "learning_rate": 9.019864309722362e-06, + "loss": 1.6327282, + "memory(GiB)": 97.17, + "step": 19125, + "train_speed(iter/s)": 1.630897 + }, + { + "acc": 0.65784407, + "epoch": 0.48528665651953323, + "grad_norm": 5.125, + "learning_rate": 9.019240640484816e-06, + "loss": 1.58626938, + "memory(GiB)": 97.17, + "step": 19130, + "train_speed(iter/s)": 1.630944 + }, + { + "acc": 0.65162892, + "epoch": 0.4854134956874683, + "grad_norm": 5.875, + "learning_rate": 9.018616794463124e-06, + "loss": 1.57989788, + "memory(GiB)": 97.17, + "step": 19135, + "train_speed(iter/s)": 1.630993 + }, + { + "acc": 0.64719133, + "epoch": 0.4855403348554033, + "grad_norm": 5.21875, + "learning_rate": 9.017992771684722e-06, + "loss": 1.65698051, + "memory(GiB)": 97.17, + "step": 19140, + "train_speed(iter/s)": 1.631041 + }, + { + "acc": 0.63881626, + "epoch": 0.4856671740233384, + "grad_norm": 8.0625, + "learning_rate": 9.017368572177058e-06, + "loss": 1.61152592, + "memory(GiB)": 97.17, + "step": 19145, + "train_speed(iter/s)": 1.631085 + }, + { + "acc": 0.64969001, + "epoch": 0.48579401319127347, + "grad_norm": 6.125, + "learning_rate": 9.016744195967588e-06, + "loss": 1.6037796, + "memory(GiB)": 97.17, + "step": 19150, + "train_speed(iter/s)": 1.63113 + }, + { + "acc": 0.66961908, + "epoch": 0.4859208523592085, + "grad_norm": 6.15625, + "learning_rate": 9.016119643083777e-06, + "loss": 1.64942417, + "memory(GiB)": 97.17, + "step": 19155, + "train_speed(iter/s)": 1.631177 + }, + { + "acc": 0.65805769, + "epoch": 0.48604769152714355, + "grad_norm": 6.09375, + "learning_rate": 9.015494913553091e-06, + "loss": 1.61547852, + "memory(GiB)": 97.17, + "step": 19160, + "train_speed(iter/s)": 1.631228 + }, + { + "acc": 0.64842701, + "epoch": 0.48617453069507865, + "grad_norm": 5.09375, + "learning_rate": 9.014870007403012e-06, + "loss": 1.61685257, + "memory(GiB)": 97.17, + "step": 19165, + "train_speed(iter/s)": 1.631279 + }, + { + "acc": 0.65189915, + "epoch": 0.4863013698630137, + "grad_norm": 6.53125, + "learning_rate": 9.014244924661026e-06, + "loss": 1.56398201, + "memory(GiB)": 97.17, + "step": 19170, + "train_speed(iter/s)": 1.631329 + }, + { + "acc": 0.65040245, + "epoch": 0.48642820903094874, + "grad_norm": 5.375, + "learning_rate": 9.013619665354626e-06, + "loss": 1.63303699, + "memory(GiB)": 97.17, + "step": 19175, + "train_speed(iter/s)": 1.631376 + }, + { + "acc": 0.64636779, + "epoch": 0.4865550481988838, + "grad_norm": 5.875, + "learning_rate": 9.01299422951131e-06, + "loss": 1.68049507, + "memory(GiB)": 97.17, + "step": 19180, + "train_speed(iter/s)": 1.631428 + }, + { + "acc": 0.64332356, + "epoch": 0.4866818873668189, + "grad_norm": 5.84375, + "learning_rate": 9.012368617158593e-06, + "loss": 1.65196228, + "memory(GiB)": 97.17, + "step": 19185, + "train_speed(iter/s)": 1.63148 + }, + { + "acc": 0.64770594, + "epoch": 0.48680872653475393, + "grad_norm": 5.3125, + "learning_rate": 9.01174282832399e-06, + "loss": 1.60900192, + "memory(GiB)": 97.17, + "step": 19190, + "train_speed(iter/s)": 1.631528 + }, + { + "acc": 0.66381068, + "epoch": 0.486935565702689, + "grad_norm": 5.65625, + "learning_rate": 9.011116863035027e-06, + "loss": 1.57273817, + "memory(GiB)": 97.17, + "step": 19195, + "train_speed(iter/s)": 1.631575 + }, + { + "acc": 0.64555984, + "epoch": 0.487062404870624, + "grad_norm": 5.59375, + "learning_rate": 9.010490721319237e-06, + "loss": 1.64356728, + "memory(GiB)": 97.17, + "step": 19200, + "train_speed(iter/s)": 1.631623 + }, + { + "acc": 0.64967284, + "epoch": 0.4871892440385591, + "grad_norm": 6.0625, + "learning_rate": 9.009864403204157e-06, + "loss": 1.60948715, + "memory(GiB)": 97.17, + "step": 19205, + "train_speed(iter/s)": 1.631668 + }, + { + "acc": 0.65512981, + "epoch": 0.48731608320649417, + "grad_norm": 5.0, + "learning_rate": 9.00923790871734e-06, + "loss": 1.57831182, + "memory(GiB)": 97.17, + "step": 19210, + "train_speed(iter/s)": 1.631715 + }, + { + "acc": 0.65080533, + "epoch": 0.4874429223744292, + "grad_norm": 6.375, + "learning_rate": 9.008611237886339e-06, + "loss": 1.65522251, + "memory(GiB)": 97.17, + "step": 19215, + "train_speed(iter/s)": 1.631763 + }, + { + "acc": 0.65541615, + "epoch": 0.48756976154236426, + "grad_norm": 6.53125, + "learning_rate": 9.00798439073872e-06, + "loss": 1.62413025, + "memory(GiB)": 97.17, + "step": 19220, + "train_speed(iter/s)": 1.631808 + }, + { + "acc": 0.65070553, + "epoch": 0.48769660071029935, + "grad_norm": 7.3125, + "learning_rate": 9.007357367302052e-06, + "loss": 1.62216053, + "memory(GiB)": 97.17, + "step": 19225, + "train_speed(iter/s)": 1.631855 + }, + { + "acc": 0.64507093, + "epoch": 0.4878234398782344, + "grad_norm": 5.46875, + "learning_rate": 9.006730167603914e-06, + "loss": 1.67657089, + "memory(GiB)": 97.17, + "step": 19230, + "train_speed(iter/s)": 1.631902 + }, + { + "acc": 0.65183091, + "epoch": 0.48795027904616944, + "grad_norm": 6.03125, + "learning_rate": 9.006102791671896e-06, + "loss": 1.63866215, + "memory(GiB)": 97.17, + "step": 19235, + "train_speed(iter/s)": 1.631937 + }, + { + "acc": 0.66255231, + "epoch": 0.4880771182141045, + "grad_norm": 5.96875, + "learning_rate": 9.005475239533591e-06, + "loss": 1.60908585, + "memory(GiB)": 97.17, + "step": 19240, + "train_speed(iter/s)": 1.631985 + }, + { + "acc": 0.64312086, + "epoch": 0.4882039573820396, + "grad_norm": 6.96875, + "learning_rate": 9.0048475112166e-06, + "loss": 1.62154655, + "memory(GiB)": 97.17, + "step": 19245, + "train_speed(iter/s)": 1.632031 + }, + { + "acc": 0.64979849, + "epoch": 0.48833079654997463, + "grad_norm": 5.5625, + "learning_rate": 9.00421960674854e-06, + "loss": 1.63101864, + "memory(GiB)": 97.17, + "step": 19250, + "train_speed(iter/s)": 1.632079 + }, + { + "acc": 0.64713917, + "epoch": 0.4884576357179097, + "grad_norm": 5.5625, + "learning_rate": 9.003591526157021e-06, + "loss": 1.65120621, + "memory(GiB)": 97.17, + "step": 19255, + "train_speed(iter/s)": 1.632122 + }, + { + "acc": 0.64968815, + "epoch": 0.4885844748858447, + "grad_norm": 6.15625, + "learning_rate": 9.002963269469672e-06, + "loss": 1.65066643, + "memory(GiB)": 97.17, + "step": 19260, + "train_speed(iter/s)": 1.632165 + }, + { + "acc": 0.66717105, + "epoch": 0.4887113140537798, + "grad_norm": 5.125, + "learning_rate": 9.002334836714126e-06, + "loss": 1.55717783, + "memory(GiB)": 97.17, + "step": 19265, + "train_speed(iter/s)": 1.632207 + }, + { + "acc": 0.65099764, + "epoch": 0.48883815322171487, + "grad_norm": 5.21875, + "learning_rate": 9.001706227918023e-06, + "loss": 1.60708771, + "memory(GiB)": 97.17, + "step": 19270, + "train_speed(iter/s)": 1.632247 + }, + { + "acc": 0.64583077, + "epoch": 0.4889649923896499, + "grad_norm": 5.21875, + "learning_rate": 9.001077443109016e-06, + "loss": 1.68178291, + "memory(GiB)": 97.17, + "step": 19275, + "train_speed(iter/s)": 1.632295 + }, + { + "acc": 0.64101601, + "epoch": 0.48909183155758496, + "grad_norm": 4.8125, + "learning_rate": 9.00044848231476e-06, + "loss": 1.63091698, + "memory(GiB)": 97.17, + "step": 19280, + "train_speed(iter/s)": 1.63234 + }, + { + "acc": 0.67217836, + "epoch": 0.48921867072552006, + "grad_norm": 4.90625, + "learning_rate": 8.999819345562919e-06, + "loss": 1.55919819, + "memory(GiB)": 97.17, + "step": 19285, + "train_speed(iter/s)": 1.632386 + }, + { + "acc": 0.65007591, + "epoch": 0.4893455098934551, + "grad_norm": 5.625, + "learning_rate": 8.999190032881165e-06, + "loss": 1.65904083, + "memory(GiB)": 97.17, + "step": 19290, + "train_speed(iter/s)": 1.632427 + }, + { + "acc": 0.67074599, + "epoch": 0.48947234906139014, + "grad_norm": 5.375, + "learning_rate": 8.998560544297176e-06, + "loss": 1.55652294, + "memory(GiB)": 97.17, + "step": 19295, + "train_speed(iter/s)": 1.632469 + }, + { + "acc": 0.6557714, + "epoch": 0.4895991882293252, + "grad_norm": 5.125, + "learning_rate": 8.997930879838646e-06, + "loss": 1.60311966, + "memory(GiB)": 97.17, + "step": 19300, + "train_speed(iter/s)": 1.632513 + }, + { + "acc": 0.65521827, + "epoch": 0.4897260273972603, + "grad_norm": 5.15625, + "learning_rate": 8.997301039533264e-06, + "loss": 1.62269154, + "memory(GiB)": 97.17, + "step": 19305, + "train_speed(iter/s)": 1.632558 + }, + { + "acc": 0.64769917, + "epoch": 0.48985286656519533, + "grad_norm": 5.6875, + "learning_rate": 8.996671023408737e-06, + "loss": 1.63032913, + "memory(GiB)": 97.17, + "step": 19310, + "train_speed(iter/s)": 1.632597 + }, + { + "acc": 0.64696217, + "epoch": 0.4899797057331304, + "grad_norm": 5.15625, + "learning_rate": 8.996040831492772e-06, + "loss": 1.63778114, + "memory(GiB)": 97.17, + "step": 19315, + "train_speed(iter/s)": 1.632639 + }, + { + "acc": 0.63116865, + "epoch": 0.4901065449010654, + "grad_norm": 4.8125, + "learning_rate": 8.995410463813093e-06, + "loss": 1.60176773, + "memory(GiB)": 97.17, + "step": 19320, + "train_speed(iter/s)": 1.632686 + }, + { + "acc": 0.65650911, + "epoch": 0.4902333840690005, + "grad_norm": 4.6875, + "learning_rate": 8.99477992039742e-06, + "loss": 1.57855148, + "memory(GiB)": 97.17, + "step": 19325, + "train_speed(iter/s)": 1.632728 + }, + { + "acc": 0.65348015, + "epoch": 0.49036022323693557, + "grad_norm": 6.78125, + "learning_rate": 8.994149201273495e-06, + "loss": 1.6068161, + "memory(GiB)": 97.17, + "step": 19330, + "train_speed(iter/s)": 1.632774 + }, + { + "acc": 0.6420002, + "epoch": 0.4904870624048706, + "grad_norm": 6.21875, + "learning_rate": 8.993518306469052e-06, + "loss": 1.58773594, + "memory(GiB)": 97.17, + "step": 19335, + "train_speed(iter/s)": 1.632818 + }, + { + "acc": 0.63002582, + "epoch": 0.49061390157280566, + "grad_norm": 7.0625, + "learning_rate": 8.992887236011847e-06, + "loss": 1.7034874, + "memory(GiB)": 97.17, + "step": 19340, + "train_speed(iter/s)": 1.632861 + }, + { + "acc": 0.64082441, + "epoch": 0.49074074074074076, + "grad_norm": 5.375, + "learning_rate": 8.992255989929632e-06, + "loss": 1.64837914, + "memory(GiB)": 97.17, + "step": 19345, + "train_speed(iter/s)": 1.632907 + }, + { + "acc": 0.67334785, + "epoch": 0.4908675799086758, + "grad_norm": 5.0625, + "learning_rate": 8.991624568250175e-06, + "loss": 1.55427952, + "memory(GiB)": 97.17, + "step": 19350, + "train_speed(iter/s)": 1.63295 + }, + { + "acc": 0.65000949, + "epoch": 0.49099441907661084, + "grad_norm": 6.21875, + "learning_rate": 8.99099297100125e-06, + "loss": 1.68209858, + "memory(GiB)": 97.17, + "step": 19355, + "train_speed(iter/s)": 1.632991 + }, + { + "acc": 0.6570837, + "epoch": 0.4911212582445459, + "grad_norm": 7.78125, + "learning_rate": 8.990361198210634e-06, + "loss": 1.61916962, + "memory(GiB)": 97.17, + "step": 19360, + "train_speed(iter/s)": 1.633036 + }, + { + "acc": 0.64149675, + "epoch": 0.491248097412481, + "grad_norm": 4.65625, + "learning_rate": 8.989729249906116e-06, + "loss": 1.70096588, + "memory(GiB)": 97.17, + "step": 19365, + "train_speed(iter/s)": 1.633075 + }, + { + "acc": 0.6558856, + "epoch": 0.49137493658041603, + "grad_norm": 5.09375, + "learning_rate": 8.989097126115493e-06, + "loss": 1.58833504, + "memory(GiB)": 97.17, + "step": 19370, + "train_speed(iter/s)": 1.633118 + }, + { + "acc": 0.64996033, + "epoch": 0.4915017757483511, + "grad_norm": 5.5, + "learning_rate": 8.98846482686657e-06, + "loss": 1.66905727, + "memory(GiB)": 97.17, + "step": 19375, + "train_speed(iter/s)": 1.633162 + }, + { + "acc": 0.66091418, + "epoch": 0.4916286149162861, + "grad_norm": 6.0625, + "learning_rate": 8.987832352187156e-06, + "loss": 1.64019527, + "memory(GiB)": 97.17, + "step": 19380, + "train_speed(iter/s)": 1.633208 + }, + { + "acc": 0.65745149, + "epoch": 0.4917554540842212, + "grad_norm": 5.84375, + "learning_rate": 8.987199702105071e-06, + "loss": 1.66534767, + "memory(GiB)": 97.17, + "step": 19385, + "train_speed(iter/s)": 1.633252 + }, + { + "acc": 0.64414577, + "epoch": 0.49188229325215627, + "grad_norm": 5.65625, + "learning_rate": 8.986566876648141e-06, + "loss": 1.66644592, + "memory(GiB)": 97.17, + "step": 19390, + "train_speed(iter/s)": 1.633297 + }, + { + "acc": 0.64303875, + "epoch": 0.4920091324200913, + "grad_norm": 6.46875, + "learning_rate": 8.985933875844202e-06, + "loss": 1.62279053, + "memory(GiB)": 97.17, + "step": 19395, + "train_speed(iter/s)": 1.633341 + }, + { + "acc": 0.67910142, + "epoch": 0.49213597158802636, + "grad_norm": 5.375, + "learning_rate": 8.985300699721094e-06, + "loss": 1.55852766, + "memory(GiB)": 97.17, + "step": 19400, + "train_speed(iter/s)": 1.633384 + }, + { + "acc": 0.65786572, + "epoch": 0.49226281075596146, + "grad_norm": 5.5, + "learning_rate": 8.984667348306669e-06, + "loss": 1.64398079, + "memory(GiB)": 97.17, + "step": 19405, + "train_speed(iter/s)": 1.633429 + }, + { + "acc": 0.66132321, + "epoch": 0.4923896499238965, + "grad_norm": 5.65625, + "learning_rate": 8.984033821628782e-06, + "loss": 1.57578688, + "memory(GiB)": 97.17, + "step": 19410, + "train_speed(iter/s)": 1.633473 + }, + { + "acc": 0.66964483, + "epoch": 0.49251648909183154, + "grad_norm": 5.34375, + "learning_rate": 8.983400119715303e-06, + "loss": 1.54320574, + "memory(GiB)": 97.17, + "step": 19415, + "train_speed(iter/s)": 1.633518 + }, + { + "acc": 0.65455723, + "epoch": 0.4926433282597666, + "grad_norm": 5.96875, + "learning_rate": 8.982766242594099e-06, + "loss": 1.61968479, + "memory(GiB)": 97.17, + "step": 19420, + "train_speed(iter/s)": 1.633563 + }, + { + "acc": 0.67131414, + "epoch": 0.4927701674277017, + "grad_norm": 7.21875, + "learning_rate": 8.982132190293056e-06, + "loss": 1.61747837, + "memory(GiB)": 97.17, + "step": 19425, + "train_speed(iter/s)": 1.63361 + }, + { + "acc": 0.65977802, + "epoch": 0.49289700659563673, + "grad_norm": 5.78125, + "learning_rate": 8.98149796284006e-06, + "loss": 1.58889999, + "memory(GiB)": 97.17, + "step": 19430, + "train_speed(iter/s)": 1.633653 + }, + { + "acc": 0.653619, + "epoch": 0.4930238457635718, + "grad_norm": 6.84375, + "learning_rate": 8.980863560263007e-06, + "loss": 1.60346317, + "memory(GiB)": 97.17, + "step": 19435, + "train_speed(iter/s)": 1.6337 + }, + { + "acc": 0.65869341, + "epoch": 0.4931506849315068, + "grad_norm": 5.6875, + "learning_rate": 8.980228982589802e-06, + "loss": 1.59405651, + "memory(GiB)": 97.17, + "step": 19440, + "train_speed(iter/s)": 1.633742 + }, + { + "acc": 0.66383777, + "epoch": 0.4932775240994419, + "grad_norm": 5.40625, + "learning_rate": 8.979594229848355e-06, + "loss": 1.61059952, + "memory(GiB)": 97.17, + "step": 19445, + "train_speed(iter/s)": 1.633786 + }, + { + "acc": 0.65295792, + "epoch": 0.49340436326737697, + "grad_norm": 5.78125, + "learning_rate": 8.978959302066587e-06, + "loss": 1.66142769, + "memory(GiB)": 97.17, + "step": 19450, + "train_speed(iter/s)": 1.633834 + }, + { + "acc": 0.64952669, + "epoch": 0.493531202435312, + "grad_norm": 5.0, + "learning_rate": 8.978324199272423e-06, + "loss": 1.60217705, + "memory(GiB)": 97.17, + "step": 19455, + "train_speed(iter/s)": 1.633881 + }, + { + "acc": 0.66519308, + "epoch": 0.49365804160324706, + "grad_norm": 6.1875, + "learning_rate": 8.977688921493799e-06, + "loss": 1.56891441, + "memory(GiB)": 97.17, + "step": 19460, + "train_speed(iter/s)": 1.633929 + }, + { + "acc": 0.66260753, + "epoch": 0.49378488077118216, + "grad_norm": 5.09375, + "learning_rate": 8.977053468758659e-06, + "loss": 1.56150455, + "memory(GiB)": 97.17, + "step": 19465, + "train_speed(iter/s)": 1.633974 + }, + { + "acc": 0.66025896, + "epoch": 0.4939117199391172, + "grad_norm": 5.65625, + "learning_rate": 8.976417841094949e-06, + "loss": 1.61268768, + "memory(GiB)": 97.17, + "step": 19470, + "train_speed(iter/s)": 1.63402 + }, + { + "acc": 0.65946765, + "epoch": 0.49403855910705224, + "grad_norm": 5.875, + "learning_rate": 8.97578203853063e-06, + "loss": 1.64046898, + "memory(GiB)": 97.17, + "step": 19475, + "train_speed(iter/s)": 1.634066 + }, + { + "acc": 0.65515213, + "epoch": 0.4941653982749873, + "grad_norm": 4.9375, + "learning_rate": 8.975146061093667e-06, + "loss": 1.60014114, + "memory(GiB)": 97.17, + "step": 19480, + "train_speed(iter/s)": 1.634106 + }, + { + "acc": 0.64681988, + "epoch": 0.4942922374429224, + "grad_norm": 5.3125, + "learning_rate": 8.97450990881203e-06, + "loss": 1.69216728, + "memory(GiB)": 97.17, + "step": 19485, + "train_speed(iter/s)": 1.634151 + }, + { + "acc": 0.66483059, + "epoch": 0.49441907661085743, + "grad_norm": 9.1875, + "learning_rate": 8.973873581713705e-06, + "loss": 1.59817677, + "memory(GiB)": 97.17, + "step": 19490, + "train_speed(iter/s)": 1.634198 + }, + { + "acc": 0.65917969, + "epoch": 0.4945459157787925, + "grad_norm": 5.59375, + "learning_rate": 8.973237079826676e-06, + "loss": 1.62293243, + "memory(GiB)": 97.17, + "step": 19495, + "train_speed(iter/s)": 1.634239 + }, + { + "acc": 0.6548646, + "epoch": 0.4946727549467275, + "grad_norm": 6.84375, + "learning_rate": 8.972600403178941e-06, + "loss": 1.63616447, + "memory(GiB)": 97.17, + "step": 19500, + "train_speed(iter/s)": 1.634281 + }, + { + "acc": 0.65196009, + "epoch": 0.4947995941146626, + "grad_norm": 6.25, + "learning_rate": 8.971963551798506e-06, + "loss": 1.56809616, + "memory(GiB)": 97.17, + "step": 19505, + "train_speed(iter/s)": 1.634322 + }, + { + "acc": 0.65663686, + "epoch": 0.49492643328259767, + "grad_norm": 6.3125, + "learning_rate": 8.971326525713378e-06, + "loss": 1.54991417, + "memory(GiB)": 97.17, + "step": 19510, + "train_speed(iter/s)": 1.634365 + }, + { + "acc": 0.64974566, + "epoch": 0.4950532724505327, + "grad_norm": 6.46875, + "learning_rate": 8.97068932495158e-06, + "loss": 1.65213737, + "memory(GiB)": 97.17, + "step": 19515, + "train_speed(iter/s)": 1.634409 + }, + { + "acc": 0.65483007, + "epoch": 0.49518011161846776, + "grad_norm": 9.25, + "learning_rate": 8.970051949541137e-06, + "loss": 1.6299305, + "memory(GiB)": 97.17, + "step": 19520, + "train_speed(iter/s)": 1.634456 + }, + { + "acc": 0.65145993, + "epoch": 0.49530695078640286, + "grad_norm": 5.875, + "learning_rate": 8.969414399510085e-06, + "loss": 1.60536118, + "memory(GiB)": 97.17, + "step": 19525, + "train_speed(iter/s)": 1.634498 + }, + { + "acc": 0.66421452, + "epoch": 0.4954337899543379, + "grad_norm": 5.375, + "learning_rate": 8.968776674886466e-06, + "loss": 1.61416245, + "memory(GiB)": 97.17, + "step": 19530, + "train_speed(iter/s)": 1.634543 + }, + { + "acc": 0.63569074, + "epoch": 0.49556062912227294, + "grad_norm": 8.75, + "learning_rate": 8.968138775698328e-06, + "loss": 1.70665836, + "memory(GiB)": 97.17, + "step": 19535, + "train_speed(iter/s)": 1.634593 + }, + { + "acc": 0.66848354, + "epoch": 0.495687468290208, + "grad_norm": 6.1875, + "learning_rate": 8.96750070197373e-06, + "loss": 1.57719469, + "memory(GiB)": 97.17, + "step": 19540, + "train_speed(iter/s)": 1.63464 + }, + { + "acc": 0.64501586, + "epoch": 0.4958143074581431, + "grad_norm": 5.90625, + "learning_rate": 8.966862453740738e-06, + "loss": 1.60796223, + "memory(GiB)": 97.17, + "step": 19545, + "train_speed(iter/s)": 1.634687 + }, + { + "acc": 0.65066652, + "epoch": 0.49594114662607813, + "grad_norm": 5.15625, + "learning_rate": 8.966224031027426e-06, + "loss": 1.66179066, + "memory(GiB)": 97.17, + "step": 19550, + "train_speed(iter/s)": 1.63473 + }, + { + "acc": 0.64585896, + "epoch": 0.4960679857940132, + "grad_norm": 5.5625, + "learning_rate": 8.965585433861871e-06, + "loss": 1.6374012, + "memory(GiB)": 97.17, + "step": 19555, + "train_speed(iter/s)": 1.63478 + }, + { + "acc": 0.65385785, + "epoch": 0.4961948249619482, + "grad_norm": 5.59375, + "learning_rate": 8.964946662272167e-06, + "loss": 1.5826478, + "memory(GiB)": 97.17, + "step": 19560, + "train_speed(iter/s)": 1.634827 + }, + { + "acc": 0.65989742, + "epoch": 0.4963216641298833, + "grad_norm": 5.59375, + "learning_rate": 8.964307716286404e-06, + "loss": 1.58067436, + "memory(GiB)": 97.17, + "step": 19565, + "train_speed(iter/s)": 1.63487 + }, + { + "acc": 0.66022587, + "epoch": 0.49644850329781837, + "grad_norm": 6.21875, + "learning_rate": 8.963668595932689e-06, + "loss": 1.646978, + "memory(GiB)": 97.17, + "step": 19570, + "train_speed(iter/s)": 1.634917 + }, + { + "acc": 0.65500035, + "epoch": 0.4965753424657534, + "grad_norm": 5.6875, + "learning_rate": 8.96302930123913e-06, + "loss": 1.6167778, + "memory(GiB)": 97.17, + "step": 19575, + "train_speed(iter/s)": 1.634962 + }, + { + "acc": 0.66047921, + "epoch": 0.49670218163368846, + "grad_norm": 5.15625, + "learning_rate": 8.962389832233853e-06, + "loss": 1.57737408, + "memory(GiB)": 97.17, + "step": 19580, + "train_speed(iter/s)": 1.635008 + }, + { + "acc": 0.64837852, + "epoch": 0.49682902080162356, + "grad_norm": 7.90625, + "learning_rate": 8.961750188944978e-06, + "loss": 1.69679546, + "memory(GiB)": 97.17, + "step": 19585, + "train_speed(iter/s)": 1.635057 + }, + { + "acc": 0.6454319, + "epoch": 0.4969558599695586, + "grad_norm": 5.625, + "learning_rate": 8.96111037140064e-06, + "loss": 1.61879864, + "memory(GiB)": 97.17, + "step": 19590, + "train_speed(iter/s)": 1.6351 + }, + { + "acc": 0.65377541, + "epoch": 0.49708269913749364, + "grad_norm": 4.78125, + "learning_rate": 8.960470379628986e-06, + "loss": 1.5757411, + "memory(GiB)": 97.17, + "step": 19595, + "train_speed(iter/s)": 1.635146 + }, + { + "acc": 0.64588599, + "epoch": 0.4972095383054287, + "grad_norm": 6.0, + "learning_rate": 8.959830213658161e-06, + "loss": 1.6123106, + "memory(GiB)": 97.17, + "step": 19600, + "train_speed(iter/s)": 1.635192 + }, + { + "acc": 0.64932814, + "epoch": 0.4973363774733638, + "grad_norm": 5.46875, + "learning_rate": 8.959189873516324e-06, + "loss": 1.67450199, + "memory(GiB)": 97.17, + "step": 19605, + "train_speed(iter/s)": 1.635236 + }, + { + "acc": 0.64351878, + "epoch": 0.49746321664129883, + "grad_norm": 5.15625, + "learning_rate": 8.95854935923164e-06, + "loss": 1.6929636, + "memory(GiB)": 97.17, + "step": 19610, + "train_speed(iter/s)": 1.635281 + }, + { + "acc": 0.65900097, + "epoch": 0.4975900558092339, + "grad_norm": 7.09375, + "learning_rate": 8.95790867083228e-06, + "loss": 1.67759018, + "memory(GiB)": 97.17, + "step": 19615, + "train_speed(iter/s)": 1.635327 + }, + { + "acc": 0.63874559, + "epoch": 0.4977168949771689, + "grad_norm": 6.6875, + "learning_rate": 8.957267808346428e-06, + "loss": 1.67475052, + "memory(GiB)": 97.17, + "step": 19620, + "train_speed(iter/s)": 1.635372 + }, + { + "acc": 0.66545205, + "epoch": 0.497843734145104, + "grad_norm": 5.59375, + "learning_rate": 8.95662677180227e-06, + "loss": 1.55602398, + "memory(GiB)": 97.17, + "step": 19625, + "train_speed(iter/s)": 1.635419 + }, + { + "acc": 0.66076527, + "epoch": 0.49797057331303907, + "grad_norm": 5.53125, + "learning_rate": 8.955985561228e-06, + "loss": 1.59383564, + "memory(GiB)": 97.17, + "step": 19630, + "train_speed(iter/s)": 1.635463 + }, + { + "acc": 0.66685324, + "epoch": 0.4980974124809741, + "grad_norm": 6.25, + "learning_rate": 8.955344176651824e-06, + "loss": 1.60934792, + "memory(GiB)": 97.17, + "step": 19635, + "train_speed(iter/s)": 1.635509 + }, + { + "acc": 0.66330113, + "epoch": 0.49822425164890916, + "grad_norm": 6.625, + "learning_rate": 8.954702618101952e-06, + "loss": 1.57660933, + "memory(GiB)": 97.17, + "step": 19640, + "train_speed(iter/s)": 1.635555 + }, + { + "acc": 0.64868908, + "epoch": 0.49835109081684426, + "grad_norm": 7.3125, + "learning_rate": 8.9540608856066e-06, + "loss": 1.63900948, + "memory(GiB)": 97.17, + "step": 19645, + "train_speed(iter/s)": 1.635599 + }, + { + "acc": 0.64220729, + "epoch": 0.4984779299847793, + "grad_norm": 7.625, + "learning_rate": 8.953418979194e-06, + "loss": 1.71169529, + "memory(GiB)": 97.17, + "step": 19650, + "train_speed(iter/s)": 1.635641 + }, + { + "acc": 0.6536541, + "epoch": 0.49860476915271434, + "grad_norm": 5.90625, + "learning_rate": 8.95277689889238e-06, + "loss": 1.58922529, + "memory(GiB)": 97.17, + "step": 19655, + "train_speed(iter/s)": 1.635686 + }, + { + "acc": 0.64948592, + "epoch": 0.4987316083206494, + "grad_norm": 6.09375, + "learning_rate": 8.952134644729985e-06, + "loss": 1.63966732, + "memory(GiB)": 97.17, + "step": 19660, + "train_speed(iter/s)": 1.635731 + }, + { + "acc": 0.66129317, + "epoch": 0.4988584474885845, + "grad_norm": 5.9375, + "learning_rate": 8.951492216735062e-06, + "loss": 1.58415575, + "memory(GiB)": 97.17, + "step": 19665, + "train_speed(iter/s)": 1.635777 + }, + { + "acc": 0.63886471, + "epoch": 0.49898528665651953, + "grad_norm": 5.625, + "learning_rate": 8.950849614935872e-06, + "loss": 1.65664673, + "memory(GiB)": 97.17, + "step": 19670, + "train_speed(iter/s)": 1.635822 + }, + { + "acc": 0.64195886, + "epoch": 0.4991121258244546, + "grad_norm": 6.3125, + "learning_rate": 8.950206839360674e-06, + "loss": 1.69861031, + "memory(GiB)": 97.17, + "step": 19675, + "train_speed(iter/s)": 1.635865 + }, + { + "acc": 0.64352846, + "epoch": 0.4992389649923896, + "grad_norm": 4.71875, + "learning_rate": 8.949563890037745e-06, + "loss": 1.67011986, + "memory(GiB)": 97.17, + "step": 19680, + "train_speed(iter/s)": 1.635908 + }, + { + "acc": 0.64192257, + "epoch": 0.4993658041603247, + "grad_norm": 5.59375, + "learning_rate": 8.948920766995362e-06, + "loss": 1.6732151, + "memory(GiB)": 97.17, + "step": 19685, + "train_speed(iter/s)": 1.635954 + }, + { + "acc": 0.6566762, + "epoch": 0.49949264332825977, + "grad_norm": 5.875, + "learning_rate": 8.948277470261812e-06, + "loss": 1.59000168, + "memory(GiB)": 97.17, + "step": 19690, + "train_speed(iter/s)": 1.635996 + }, + { + "acc": 0.63902798, + "epoch": 0.4996194824961948, + "grad_norm": 5.96875, + "learning_rate": 8.94763399986539e-06, + "loss": 1.63371792, + "memory(GiB)": 97.17, + "step": 19695, + "train_speed(iter/s)": 1.636039 + }, + { + "acc": 0.66027279, + "epoch": 0.49974632166412986, + "grad_norm": 4.625, + "learning_rate": 8.946990355834401e-06, + "loss": 1.58149529, + "memory(GiB)": 97.17, + "step": 19700, + "train_speed(iter/s)": 1.636085 + }, + { + "acc": 0.65635209, + "epoch": 0.49987316083206496, + "grad_norm": 5.75, + "learning_rate": 8.946346538197156e-06, + "loss": 1.55523911, + "memory(GiB)": 97.17, + "step": 19705, + "train_speed(iter/s)": 1.63613 + }, + { + "acc": 0.64486589, + "epoch": 0.5, + "grad_norm": 5.15625, + "learning_rate": 8.94570254698197e-06, + "loss": 1.64925861, + "memory(GiB)": 97.17, + "step": 19710, + "train_speed(iter/s)": 1.636173 + }, + { + "acc": 0.6442924, + "epoch": 0.5001268391679351, + "grad_norm": 5.59375, + "learning_rate": 8.945058382217168e-06, + "loss": 1.63886719, + "memory(GiB)": 97.17, + "step": 19715, + "train_speed(iter/s)": 1.636219 + }, + { + "acc": 0.64146109, + "epoch": 0.5002536783358701, + "grad_norm": 5.34375, + "learning_rate": 8.944414043931086e-06, + "loss": 1.68201981, + "memory(GiB)": 97.17, + "step": 19720, + "train_speed(iter/s)": 1.63626 + }, + { + "acc": 0.65783596, + "epoch": 0.5003805175038052, + "grad_norm": 4.4375, + "learning_rate": 8.943769532152065e-06, + "loss": 1.56636171, + "memory(GiB)": 97.17, + "step": 19725, + "train_speed(iter/s)": 1.636302 + }, + { + "acc": 0.64758997, + "epoch": 0.5005073566717403, + "grad_norm": 5.25, + "learning_rate": 8.94312484690845e-06, + "loss": 1.60370312, + "memory(GiB)": 97.17, + "step": 19730, + "train_speed(iter/s)": 1.63635 + }, + { + "acc": 0.65292311, + "epoch": 0.5006341958396753, + "grad_norm": 5.5, + "learning_rate": 8.9424799882286e-06, + "loss": 1.62388172, + "memory(GiB)": 97.17, + "step": 19735, + "train_speed(iter/s)": 1.636395 + }, + { + "acc": 0.66491714, + "epoch": 0.5007610350076104, + "grad_norm": 4.6875, + "learning_rate": 8.94183495614088e-06, + "loss": 1.58442364, + "memory(GiB)": 97.17, + "step": 19740, + "train_speed(iter/s)": 1.636436 + }, + { + "acc": 0.67173519, + "epoch": 0.5008878741755454, + "grad_norm": 5.28125, + "learning_rate": 8.941189750673658e-06, + "loss": 1.56227512, + "memory(GiB)": 97.17, + "step": 19745, + "train_speed(iter/s)": 1.636478 + }, + { + "acc": 0.64533234, + "epoch": 0.5010147133434805, + "grad_norm": 5.3125, + "learning_rate": 8.940544371855315e-06, + "loss": 1.60845604, + "memory(GiB)": 97.17, + "step": 19750, + "train_speed(iter/s)": 1.636525 + }, + { + "acc": 0.66256008, + "epoch": 0.5011415525114156, + "grad_norm": 6.65625, + "learning_rate": 8.939898819714237e-06, + "loss": 1.53998079, + "memory(GiB)": 97.17, + "step": 19755, + "train_speed(iter/s)": 1.63657 + }, + { + "acc": 0.65779552, + "epoch": 0.5012683916793506, + "grad_norm": 5.25, + "learning_rate": 8.93925309427882e-06, + "loss": 1.58629084, + "memory(GiB)": 97.17, + "step": 19760, + "train_speed(iter/s)": 1.636618 + }, + { + "acc": 0.652987, + "epoch": 0.5013952308472857, + "grad_norm": 5.65625, + "learning_rate": 8.938607195577462e-06, + "loss": 1.66644135, + "memory(GiB)": 97.17, + "step": 19765, + "train_speed(iter/s)": 1.63666 + }, + { + "acc": 0.64125495, + "epoch": 0.5015220700152208, + "grad_norm": 4.90625, + "learning_rate": 8.937961123638577e-06, + "loss": 1.67882233, + "memory(GiB)": 97.17, + "step": 19770, + "train_speed(iter/s)": 1.636706 + }, + { + "acc": 0.64762201, + "epoch": 0.5016489091831557, + "grad_norm": 5.78125, + "learning_rate": 8.93731487849058e-06, + "loss": 1.65156441, + "memory(GiB)": 97.17, + "step": 19775, + "train_speed(iter/s)": 1.636749 + }, + { + "acc": 0.63744946, + "epoch": 0.5017757483510908, + "grad_norm": 6.9375, + "learning_rate": 8.936668460161895e-06, + "loss": 1.66271744, + "memory(GiB)": 97.17, + "step": 19780, + "train_speed(iter/s)": 1.636796 + }, + { + "acc": 0.65909729, + "epoch": 0.5019025875190258, + "grad_norm": 6.34375, + "learning_rate": 8.936021868680956e-06, + "loss": 1.68622627, + "memory(GiB)": 97.17, + "step": 19785, + "train_speed(iter/s)": 1.636837 + }, + { + "acc": 0.64431381, + "epoch": 0.5020294266869609, + "grad_norm": 6.09375, + "learning_rate": 8.935375104076201e-06, + "loss": 1.68079185, + "memory(GiB)": 97.17, + "step": 19790, + "train_speed(iter/s)": 1.636882 + }, + { + "acc": 0.65269499, + "epoch": 0.502156265854896, + "grad_norm": 5.59375, + "learning_rate": 8.93472816637608e-06, + "loss": 1.61054878, + "memory(GiB)": 97.17, + "step": 19795, + "train_speed(iter/s)": 1.636925 + }, + { + "acc": 0.65919523, + "epoch": 0.502283105022831, + "grad_norm": 5.5625, + "learning_rate": 8.934081055609046e-06, + "loss": 1.63678875, + "memory(GiB)": 97.17, + "step": 19800, + "train_speed(iter/s)": 1.63697 + }, + { + "acc": 0.66355925, + "epoch": 0.5024099441907661, + "grad_norm": 5.75, + "learning_rate": 8.933433771803562e-06, + "loss": 1.58405828, + "memory(GiB)": 97.17, + "step": 19805, + "train_speed(iter/s)": 1.637015 + }, + { + "acc": 0.65147705, + "epoch": 0.5025367833587012, + "grad_norm": 5.25, + "learning_rate": 8.932786314988099e-06, + "loss": 1.61141987, + "memory(GiB)": 97.17, + "step": 19810, + "train_speed(iter/s)": 1.637057 + }, + { + "acc": 0.65990744, + "epoch": 0.5026636225266362, + "grad_norm": 6.84375, + "learning_rate": 8.932138685191136e-06, + "loss": 1.60928097, + "memory(GiB)": 97.17, + "step": 19815, + "train_speed(iter/s)": 1.637099 + }, + { + "acc": 0.66688509, + "epoch": 0.5027904616945713, + "grad_norm": 6.90625, + "learning_rate": 8.931490882441159e-06, + "loss": 1.55720978, + "memory(GiB)": 97.17, + "step": 19820, + "train_speed(iter/s)": 1.637143 + }, + { + "acc": 0.64496918, + "epoch": 0.5029173008625063, + "grad_norm": 7.09375, + "learning_rate": 8.930842906766659e-06, + "loss": 1.65523148, + "memory(GiB)": 97.17, + "step": 19825, + "train_speed(iter/s)": 1.637188 + }, + { + "acc": 0.66474676, + "epoch": 0.5030441400304414, + "grad_norm": 6.46875, + "learning_rate": 8.930194758196138e-06, + "loss": 1.56693897, + "memory(GiB)": 97.17, + "step": 19830, + "train_speed(iter/s)": 1.637232 + }, + { + "acc": 0.64261909, + "epoch": 0.5031709791983765, + "grad_norm": 4.9375, + "learning_rate": 8.929546436758105e-06, + "loss": 1.6713604, + "memory(GiB)": 97.17, + "step": 19835, + "train_speed(iter/s)": 1.637276 + }, + { + "acc": 0.65234485, + "epoch": 0.5032978183663115, + "grad_norm": 5.09375, + "learning_rate": 8.928897942481075e-06, + "loss": 1.58415432, + "memory(GiB)": 97.17, + "step": 19840, + "train_speed(iter/s)": 1.637318 + }, + { + "acc": 0.64369764, + "epoch": 0.5034246575342466, + "grad_norm": 7.46875, + "learning_rate": 8.928249275393572e-06, + "loss": 1.61291428, + "memory(GiB)": 97.17, + "step": 19845, + "train_speed(iter/s)": 1.637362 + }, + { + "acc": 0.65665665, + "epoch": 0.5035514967021817, + "grad_norm": 5.28125, + "learning_rate": 8.927600435524129e-06, + "loss": 1.57517281, + "memory(GiB)": 97.17, + "step": 19850, + "train_speed(iter/s)": 1.637404 + }, + { + "acc": 0.64826097, + "epoch": 0.5036783358701167, + "grad_norm": 5.625, + "learning_rate": 8.926951422901282e-06, + "loss": 1.67055874, + "memory(GiB)": 97.17, + "step": 19855, + "train_speed(iter/s)": 1.637446 + }, + { + "acc": 0.66410532, + "epoch": 0.5038051750380518, + "grad_norm": 8.25, + "learning_rate": 8.926302237553578e-06, + "loss": 1.59546185, + "memory(GiB)": 97.17, + "step": 19860, + "train_speed(iter/s)": 1.63749 + }, + { + "acc": 0.64603043, + "epoch": 0.5039320142059868, + "grad_norm": 6.375, + "learning_rate": 8.925652879509575e-06, + "loss": 1.61109486, + "memory(GiB)": 97.17, + "step": 19865, + "train_speed(iter/s)": 1.637533 + }, + { + "acc": 0.65816126, + "epoch": 0.5040588533739219, + "grad_norm": 6.875, + "learning_rate": 8.925003348797829e-06, + "loss": 1.68585777, + "memory(GiB)": 97.17, + "step": 19870, + "train_speed(iter/s)": 1.637578 + }, + { + "acc": 0.65360856, + "epoch": 0.504185692541857, + "grad_norm": 5.25, + "learning_rate": 8.924353645446912e-06, + "loss": 1.55985394, + "memory(GiB)": 97.17, + "step": 19875, + "train_speed(iter/s)": 1.637622 + }, + { + "acc": 0.65019102, + "epoch": 0.504312531709792, + "grad_norm": 5.6875, + "learning_rate": 8.923703769485403e-06, + "loss": 1.62335587, + "memory(GiB)": 97.17, + "step": 19880, + "train_speed(iter/s)": 1.637665 + }, + { + "acc": 0.65725064, + "epoch": 0.5044393708777271, + "grad_norm": 6.28125, + "learning_rate": 8.92305372094188e-06, + "loss": 1.62615643, + "memory(GiB)": 97.17, + "step": 19885, + "train_speed(iter/s)": 1.637708 + }, + { + "acc": 0.66691246, + "epoch": 0.5045662100456622, + "grad_norm": 6.5625, + "learning_rate": 8.922403499844943e-06, + "loss": 1.56492739, + "memory(GiB)": 97.17, + "step": 19890, + "train_speed(iter/s)": 1.63775 + }, + { + "acc": 0.65334301, + "epoch": 0.5046930492135971, + "grad_norm": 5.78125, + "learning_rate": 8.921753106223186e-06, + "loss": 1.58829002, + "memory(GiB)": 97.17, + "step": 19895, + "train_speed(iter/s)": 1.637793 + }, + { + "acc": 0.62701683, + "epoch": 0.5048198883815322, + "grad_norm": 5.78125, + "learning_rate": 8.92110254010522e-06, + "loss": 1.7714592, + "memory(GiB)": 97.17, + "step": 19900, + "train_speed(iter/s)": 1.637837 + }, + { + "acc": 0.63515863, + "epoch": 0.5049467275494672, + "grad_norm": 4.5, + "learning_rate": 8.920451801519656e-06, + "loss": 1.70014973, + "memory(GiB)": 97.17, + "step": 19905, + "train_speed(iter/s)": 1.637881 + }, + { + "acc": 0.65417986, + "epoch": 0.5050735667174023, + "grad_norm": 6.78125, + "learning_rate": 8.919800890495118e-06, + "loss": 1.62693977, + "memory(GiB)": 97.17, + "step": 19910, + "train_speed(iter/s)": 1.637922 + }, + { + "acc": 0.6486495, + "epoch": 0.5052004058853374, + "grad_norm": 5.25, + "learning_rate": 8.919149807060237e-06, + "loss": 1.65757732, + "memory(GiB)": 97.17, + "step": 19915, + "train_speed(iter/s)": 1.637964 + }, + { + "acc": 0.64851494, + "epoch": 0.5053272450532724, + "grad_norm": 5.75, + "learning_rate": 8.918498551243649e-06, + "loss": 1.67344894, + "memory(GiB)": 97.17, + "step": 19920, + "train_speed(iter/s)": 1.638007 + }, + { + "acc": 0.64613094, + "epoch": 0.5054540842212075, + "grad_norm": 5.03125, + "learning_rate": 8.917847123073999e-06, + "loss": 1.63346424, + "memory(GiB)": 97.17, + "step": 19925, + "train_speed(iter/s)": 1.638051 + }, + { + "acc": 0.64864411, + "epoch": 0.5055809233891426, + "grad_norm": 8.1875, + "learning_rate": 8.917195522579943e-06, + "loss": 1.5969182, + "memory(GiB)": 97.17, + "step": 19930, + "train_speed(iter/s)": 1.638098 + }, + { + "acc": 0.65429907, + "epoch": 0.5057077625570776, + "grad_norm": 5.96875, + "learning_rate": 8.916543749790139e-06, + "loss": 1.63215523, + "memory(GiB)": 97.17, + "step": 19935, + "train_speed(iter/s)": 1.638144 + }, + { + "acc": 0.64883585, + "epoch": 0.5058346017250127, + "grad_norm": 7.25, + "learning_rate": 8.915891804733253e-06, + "loss": 1.64344845, + "memory(GiB)": 97.17, + "step": 19940, + "train_speed(iter/s)": 1.63819 + }, + { + "acc": 0.66215897, + "epoch": 0.5059614408929477, + "grad_norm": 5.375, + "learning_rate": 8.915239687437963e-06, + "loss": 1.5686039, + "memory(GiB)": 97.17, + "step": 19945, + "train_speed(iter/s)": 1.638233 + }, + { + "acc": 0.65325317, + "epoch": 0.5060882800608828, + "grad_norm": 5.125, + "learning_rate": 8.91458739793295e-06, + "loss": 1.61301193, + "memory(GiB)": 97.17, + "step": 19950, + "train_speed(iter/s)": 1.638275 + }, + { + "acc": 0.65407767, + "epoch": 0.5062151192288179, + "grad_norm": 5.65625, + "learning_rate": 8.913934936246907e-06, + "loss": 1.65999413, + "memory(GiB)": 97.17, + "step": 19955, + "train_speed(iter/s)": 1.638319 + }, + { + "acc": 0.63604031, + "epoch": 0.5063419583967529, + "grad_norm": 5.71875, + "learning_rate": 8.91328230240853e-06, + "loss": 1.71096077, + "memory(GiB)": 97.17, + "step": 19960, + "train_speed(iter/s)": 1.638362 + }, + { + "acc": 0.65486422, + "epoch": 0.506468797564688, + "grad_norm": 5.78125, + "learning_rate": 8.912629496446528e-06, + "loss": 1.56815701, + "memory(GiB)": 97.17, + "step": 19965, + "train_speed(iter/s)": 1.638406 + }, + { + "acc": 0.66075773, + "epoch": 0.5065956367326231, + "grad_norm": 7.5, + "learning_rate": 8.911976518389612e-06, + "loss": 1.51954403, + "memory(GiB)": 97.17, + "step": 19970, + "train_speed(iter/s)": 1.638449 + }, + { + "acc": 0.66865425, + "epoch": 0.5067224759005581, + "grad_norm": 5.875, + "learning_rate": 8.9113233682665e-06, + "loss": 1.57399855, + "memory(GiB)": 97.17, + "step": 19975, + "train_speed(iter/s)": 1.638493 + }, + { + "acc": 0.64773932, + "epoch": 0.5068493150684932, + "grad_norm": 5.25, + "learning_rate": 8.910670046105927e-06, + "loss": 1.60767498, + "memory(GiB)": 97.17, + "step": 19980, + "train_speed(iter/s)": 1.63854 + }, + { + "acc": 0.65828819, + "epoch": 0.5069761542364282, + "grad_norm": 5.9375, + "learning_rate": 8.910016551936623e-06, + "loss": 1.63433437, + "memory(GiB)": 97.17, + "step": 19985, + "train_speed(iter/s)": 1.638583 + }, + { + "acc": 0.64350533, + "epoch": 0.5071029934043633, + "grad_norm": 6.0, + "learning_rate": 8.909362885787333e-06, + "loss": 1.66422157, + "memory(GiB)": 97.17, + "step": 19990, + "train_speed(iter/s)": 1.638626 + }, + { + "acc": 0.64171085, + "epoch": 0.5072298325722984, + "grad_norm": 5.25, + "learning_rate": 8.908709047686813e-06, + "loss": 1.67835464, + "memory(GiB)": 97.17, + "step": 19995, + "train_speed(iter/s)": 1.63867 + }, + { + "acc": 0.65791407, + "epoch": 0.5073566717402334, + "grad_norm": 6.625, + "learning_rate": 8.908055037663815e-06, + "loss": 1.63178844, + "memory(GiB)": 97.17, + "step": 20000, + "train_speed(iter/s)": 1.638715 + }, + { + "epoch": 0.5073566717402334, + "eval_acc": 0.6437379008764621, + "eval_loss": 1.5866650342941284, + "eval_runtime": 58.3523, + "eval_samples_per_second": 109.164, + "eval_steps_per_second": 27.3, + "step": 20000 + }, + { + "acc": 0.66467433, + "epoch": 0.5074835109081685, + "grad_norm": 6.25, + "learning_rate": 8.907400855747111e-06, + "loss": 1.60907555, + "memory(GiB)": 97.17, + "step": 20005, + "train_speed(iter/s)": 1.6304 + }, + { + "acc": 0.62223005, + "epoch": 0.5076103500761036, + "grad_norm": 5.4375, + "learning_rate": 8.906746501965468e-06, + "loss": 1.74345722, + "memory(GiB)": 97.17, + "step": 20010, + "train_speed(iter/s)": 1.630442 + }, + { + "acc": 0.67485809, + "epoch": 0.5077371892440385, + "grad_norm": 6.53125, + "learning_rate": 8.906091976347675e-06, + "loss": 1.53418055, + "memory(GiB)": 97.17, + "step": 20015, + "train_speed(iter/s)": 1.630483 + }, + { + "acc": 0.64181247, + "epoch": 0.5078640284119736, + "grad_norm": 5.0625, + "learning_rate": 8.905437278922516e-06, + "loss": 1.62519455, + "memory(GiB)": 97.17, + "step": 20020, + "train_speed(iter/s)": 1.630524 + }, + { + "acc": 0.657902, + "epoch": 0.5079908675799086, + "grad_norm": 6.3125, + "learning_rate": 8.90478240971879e-06, + "loss": 1.61010838, + "memory(GiB)": 97.17, + "step": 20025, + "train_speed(iter/s)": 1.630571 + }, + { + "acc": 0.65885143, + "epoch": 0.5081177067478437, + "grad_norm": 5.34375, + "learning_rate": 8.904127368765298e-06, + "loss": 1.62665596, + "memory(GiB)": 97.17, + "step": 20030, + "train_speed(iter/s)": 1.630613 + }, + { + "acc": 0.63172417, + "epoch": 0.5082445459157788, + "grad_norm": 6.625, + "learning_rate": 8.903472156090856e-06, + "loss": 1.62974129, + "memory(GiB)": 97.17, + "step": 20035, + "train_speed(iter/s)": 1.630658 + }, + { + "acc": 0.66237855, + "epoch": 0.5083713850837138, + "grad_norm": 5.53125, + "learning_rate": 8.902816771724279e-06, + "loss": 1.56528263, + "memory(GiB)": 97.17, + "step": 20040, + "train_speed(iter/s)": 1.6307 + }, + { + "acc": 0.64765735, + "epoch": 0.5084982242516489, + "grad_norm": 5.28125, + "learning_rate": 8.902161215694396e-06, + "loss": 1.59422989, + "memory(GiB)": 97.17, + "step": 20045, + "train_speed(iter/s)": 1.630745 + }, + { + "acc": 0.65953779, + "epoch": 0.508625063419584, + "grad_norm": 5.4375, + "learning_rate": 8.901505488030042e-06, + "loss": 1.61008282, + "memory(GiB)": 97.17, + "step": 20050, + "train_speed(iter/s)": 1.630789 + }, + { + "acc": 0.63473411, + "epoch": 0.508751902587519, + "grad_norm": 6.09375, + "learning_rate": 8.900849588760057e-06, + "loss": 1.6123127, + "memory(GiB)": 97.17, + "step": 20055, + "train_speed(iter/s)": 1.630833 + }, + { + "acc": 0.6434783, + "epoch": 0.5088787417554541, + "grad_norm": 5.46875, + "learning_rate": 8.90019351791329e-06, + "loss": 1.65244026, + "memory(GiB)": 97.17, + "step": 20060, + "train_speed(iter/s)": 1.630869 + }, + { + "acc": 0.65200148, + "epoch": 0.5090055809233891, + "grad_norm": 7.15625, + "learning_rate": 8.8995372755186e-06, + "loss": 1.59405804, + "memory(GiB)": 97.17, + "step": 20065, + "train_speed(iter/s)": 1.630912 + }, + { + "acc": 0.64505138, + "epoch": 0.5091324200913242, + "grad_norm": 8.4375, + "learning_rate": 8.89888086160485e-06, + "loss": 1.60657616, + "memory(GiB)": 97.17, + "step": 20070, + "train_speed(iter/s)": 1.630955 + }, + { + "acc": 0.64989672, + "epoch": 0.5092592592592593, + "grad_norm": 5.46875, + "learning_rate": 8.898224276200913e-06, + "loss": 1.63871651, + "memory(GiB)": 97.17, + "step": 20075, + "train_speed(iter/s)": 1.630998 + }, + { + "acc": 0.6588295, + "epoch": 0.5093860984271943, + "grad_norm": 5.34375, + "learning_rate": 8.897567519335669e-06, + "loss": 1.59247589, + "memory(GiB)": 97.17, + "step": 20080, + "train_speed(iter/s)": 1.631041 + }, + { + "acc": 0.64510145, + "epoch": 0.5095129375951294, + "grad_norm": 5.84375, + "learning_rate": 8.896910591038002e-06, + "loss": 1.61216774, + "memory(GiB)": 97.17, + "step": 20085, + "train_speed(iter/s)": 1.631087 + }, + { + "acc": 0.64206228, + "epoch": 0.5096397767630645, + "grad_norm": 8.0625, + "learning_rate": 8.896253491336809e-06, + "loss": 1.69341984, + "memory(GiB)": 97.17, + "step": 20090, + "train_speed(iter/s)": 1.631132 + }, + { + "acc": 0.63603344, + "epoch": 0.5097666159309995, + "grad_norm": 5.875, + "learning_rate": 8.895596220260993e-06, + "loss": 1.64787731, + "memory(GiB)": 97.17, + "step": 20095, + "train_speed(iter/s)": 1.631175 + }, + { + "acc": 0.65597162, + "epoch": 0.5098934550989346, + "grad_norm": 7.0625, + "learning_rate": 8.894938777839462e-06, + "loss": 1.63399353, + "memory(GiB)": 97.17, + "step": 20100, + "train_speed(iter/s)": 1.631216 + }, + { + "acc": 0.65730529, + "epoch": 0.5100202942668696, + "grad_norm": 5.8125, + "learning_rate": 8.894281164101134e-06, + "loss": 1.60963593, + "memory(GiB)": 97.17, + "step": 20105, + "train_speed(iter/s)": 1.631256 + }, + { + "acc": 0.63457851, + "epoch": 0.5101471334348047, + "grad_norm": 5.1875, + "learning_rate": 8.893623379074934e-06, + "loss": 1.64609795, + "memory(GiB)": 97.17, + "step": 20110, + "train_speed(iter/s)": 1.631297 + }, + { + "acc": 0.65438733, + "epoch": 0.5102739726027398, + "grad_norm": 5.75, + "learning_rate": 8.892965422789793e-06, + "loss": 1.61619415, + "memory(GiB)": 97.17, + "step": 20115, + "train_speed(iter/s)": 1.631344 + }, + { + "acc": 0.65672588, + "epoch": 0.5104008117706748, + "grad_norm": 6.15625, + "learning_rate": 8.892307295274654e-06, + "loss": 1.65938377, + "memory(GiB)": 97.17, + "step": 20120, + "train_speed(iter/s)": 1.63139 + }, + { + "acc": 0.65587487, + "epoch": 0.5105276509386099, + "grad_norm": 6.78125, + "learning_rate": 8.89164899655846e-06, + "loss": 1.62233658, + "memory(GiB)": 97.17, + "step": 20125, + "train_speed(iter/s)": 1.631434 + }, + { + "acc": 0.67769499, + "epoch": 0.510654490106545, + "grad_norm": 5.46875, + "learning_rate": 8.89099052667017e-06, + "loss": 1.50240917, + "memory(GiB)": 97.17, + "step": 20130, + "train_speed(iter/s)": 1.631474 + }, + { + "acc": 0.64973674, + "epoch": 0.51078132927448, + "grad_norm": 5.34375, + "learning_rate": 8.890331885638744e-06, + "loss": 1.65610256, + "memory(GiB)": 97.17, + "step": 20135, + "train_speed(iter/s)": 1.63152 + }, + { + "acc": 0.65786638, + "epoch": 0.510908168442415, + "grad_norm": 4.96875, + "learning_rate": 8.889673073493151e-06, + "loss": 1.59287968, + "memory(GiB)": 97.17, + "step": 20140, + "train_speed(iter/s)": 1.631562 + }, + { + "acc": 0.65741429, + "epoch": 0.51103500761035, + "grad_norm": 6.125, + "learning_rate": 8.889014090262371e-06, + "loss": 1.66737022, + "memory(GiB)": 97.17, + "step": 20145, + "train_speed(iter/s)": 1.631592 + }, + { + "acc": 0.64493418, + "epoch": 0.5111618467782851, + "grad_norm": 5.5625, + "learning_rate": 8.888354935975388e-06, + "loss": 1.56829071, + "memory(GiB)": 97.17, + "step": 20150, + "train_speed(iter/s)": 1.631633 + }, + { + "acc": 0.65764189, + "epoch": 0.5112886859462202, + "grad_norm": 4.4375, + "learning_rate": 8.887695610661196e-06, + "loss": 1.58194008, + "memory(GiB)": 97.17, + "step": 20155, + "train_speed(iter/s)": 1.631675 + }, + { + "acc": 0.65194035, + "epoch": 0.5114155251141552, + "grad_norm": 5.40625, + "learning_rate": 8.887036114348792e-06, + "loss": 1.67314415, + "memory(GiB)": 97.17, + "step": 20160, + "train_speed(iter/s)": 1.631718 + }, + { + "acc": 0.6675209, + "epoch": 0.5115423642820903, + "grad_norm": 5.40625, + "learning_rate": 8.886376447067186e-06, + "loss": 1.63950768, + "memory(GiB)": 97.17, + "step": 20165, + "train_speed(iter/s)": 1.631762 + }, + { + "acc": 0.64694824, + "epoch": 0.5116692034500254, + "grad_norm": 6.59375, + "learning_rate": 8.885716608845394e-06, + "loss": 1.64419384, + "memory(GiB)": 97.17, + "step": 20170, + "train_speed(iter/s)": 1.631803 + }, + { + "acc": 0.65578842, + "epoch": 0.5117960426179604, + "grad_norm": 4.8125, + "learning_rate": 8.885056599712436e-06, + "loss": 1.58569765, + "memory(GiB)": 97.17, + "step": 20175, + "train_speed(iter/s)": 1.631846 + }, + { + "acc": 0.64497242, + "epoch": 0.5119228817858955, + "grad_norm": 5.8125, + "learning_rate": 8.884396419697343e-06, + "loss": 1.65522842, + "memory(GiB)": 97.17, + "step": 20180, + "train_speed(iter/s)": 1.631887 + }, + { + "acc": 0.65047626, + "epoch": 0.5120497209538305, + "grad_norm": 5.875, + "learning_rate": 8.883736068829151e-06, + "loss": 1.58411598, + "memory(GiB)": 97.17, + "step": 20185, + "train_speed(iter/s)": 1.631933 + }, + { + "acc": 0.65407124, + "epoch": 0.5121765601217656, + "grad_norm": 5.34375, + "learning_rate": 8.88307554713691e-06, + "loss": 1.59185514, + "memory(GiB)": 97.17, + "step": 20190, + "train_speed(iter/s)": 1.631977 + }, + { + "acc": 0.66403227, + "epoch": 0.5123033992897007, + "grad_norm": 5.625, + "learning_rate": 8.882414854649667e-06, + "loss": 1.58980694, + "memory(GiB)": 97.17, + "step": 20195, + "train_speed(iter/s)": 1.632023 + }, + { + "acc": 0.65741043, + "epoch": 0.5124302384576357, + "grad_norm": 5.375, + "learning_rate": 8.881753991396488e-06, + "loss": 1.64481468, + "memory(GiB)": 97.17, + "step": 20200, + "train_speed(iter/s)": 1.632064 + }, + { + "acc": 0.65740623, + "epoch": 0.5125570776255708, + "grad_norm": 5.8125, + "learning_rate": 8.881092957406436e-06, + "loss": 1.5566761, + "memory(GiB)": 97.17, + "step": 20205, + "train_speed(iter/s)": 1.632106 + }, + { + "acc": 0.65452919, + "epoch": 0.5126839167935059, + "grad_norm": 6.0625, + "learning_rate": 8.880431752708588e-06, + "loss": 1.5680829, + "memory(GiB)": 97.17, + "step": 20210, + "train_speed(iter/s)": 1.632146 + }, + { + "acc": 0.67387424, + "epoch": 0.5128107559614409, + "grad_norm": 5.75, + "learning_rate": 8.879770377332026e-06, + "loss": 1.55310574, + "memory(GiB)": 97.17, + "step": 20215, + "train_speed(iter/s)": 1.632191 + }, + { + "acc": 0.656464, + "epoch": 0.512937595129376, + "grad_norm": 6.40625, + "learning_rate": 8.879108831305842e-06, + "loss": 1.60006008, + "memory(GiB)": 97.17, + "step": 20220, + "train_speed(iter/s)": 1.632233 + }, + { + "acc": 0.6463912, + "epoch": 0.513064434297311, + "grad_norm": 5.9375, + "learning_rate": 8.878447114659131e-06, + "loss": 1.61846733, + "memory(GiB)": 97.17, + "step": 20225, + "train_speed(iter/s)": 1.632278 + }, + { + "acc": 0.67182813, + "epoch": 0.5131912734652461, + "grad_norm": 7.5, + "learning_rate": 8.877785227421003e-06, + "loss": 1.53214722, + "memory(GiB)": 97.17, + "step": 20230, + "train_speed(iter/s)": 1.632324 + }, + { + "acc": 0.66676221, + "epoch": 0.5133181126331812, + "grad_norm": 6.375, + "learning_rate": 8.877123169620565e-06, + "loss": 1.59670267, + "memory(GiB)": 97.17, + "step": 20235, + "train_speed(iter/s)": 1.632369 + }, + { + "acc": 0.64495134, + "epoch": 0.5134449518011162, + "grad_norm": 5.0, + "learning_rate": 8.876460941286941e-06, + "loss": 1.62505302, + "memory(GiB)": 97.17, + "step": 20240, + "train_speed(iter/s)": 1.632411 + }, + { + "acc": 0.63942709, + "epoch": 0.5135717909690513, + "grad_norm": 5.96875, + "learning_rate": 8.875798542449256e-06, + "loss": 1.66225662, + "memory(GiB)": 97.17, + "step": 20245, + "train_speed(iter/s)": 1.632457 + }, + { + "acc": 0.63722963, + "epoch": 0.5136986301369864, + "grad_norm": 5.5, + "learning_rate": 8.87513597313665e-06, + "loss": 1.67115879, + "memory(GiB)": 97.17, + "step": 20250, + "train_speed(iter/s)": 1.632501 + }, + { + "acc": 0.64086881, + "epoch": 0.5138254693049213, + "grad_norm": 6.75, + "learning_rate": 8.87447323337826e-06, + "loss": 1.6811409, + "memory(GiB)": 97.17, + "step": 20255, + "train_speed(iter/s)": 1.632544 + }, + { + "acc": 0.67046118, + "epoch": 0.5139523084728564, + "grad_norm": 5.71875, + "learning_rate": 8.87381032320324e-06, + "loss": 1.5784833, + "memory(GiB)": 97.17, + "step": 20260, + "train_speed(iter/s)": 1.632584 + }, + { + "acc": 0.64712725, + "epoch": 0.5140791476407914, + "grad_norm": 5.46875, + "learning_rate": 8.873147242640746e-06, + "loss": 1.63043079, + "memory(GiB)": 97.17, + "step": 20265, + "train_speed(iter/s)": 1.632628 + }, + { + "acc": 0.64511232, + "epoch": 0.5142059868087265, + "grad_norm": 4.59375, + "learning_rate": 8.872483991719944e-06, + "loss": 1.65729504, + "memory(GiB)": 97.17, + "step": 20270, + "train_speed(iter/s)": 1.632671 + }, + { + "acc": 0.63720894, + "epoch": 0.5143328259766616, + "grad_norm": 5.125, + "learning_rate": 8.871820570470009e-06, + "loss": 1.67285767, + "memory(GiB)": 97.17, + "step": 20275, + "train_speed(iter/s)": 1.63271 + }, + { + "acc": 0.65420094, + "epoch": 0.5144596651445966, + "grad_norm": 8.6875, + "learning_rate": 8.871156978920116e-06, + "loss": 1.65575237, + "memory(GiB)": 97.17, + "step": 20280, + "train_speed(iter/s)": 1.632756 + }, + { + "acc": 0.66418495, + "epoch": 0.5145865043125317, + "grad_norm": 5.6875, + "learning_rate": 8.870493217099456e-06, + "loss": 1.50092735, + "memory(GiB)": 97.17, + "step": 20285, + "train_speed(iter/s)": 1.632797 + }, + { + "acc": 0.65824113, + "epoch": 0.5147133434804668, + "grad_norm": 5.34375, + "learning_rate": 8.869829285037224e-06, + "loss": 1.53967924, + "memory(GiB)": 97.17, + "step": 20290, + "train_speed(iter/s)": 1.632841 + }, + { + "acc": 0.64342742, + "epoch": 0.5148401826484018, + "grad_norm": 6.59375, + "learning_rate": 8.869165182762623e-06, + "loss": 1.66172791, + "memory(GiB)": 97.17, + "step": 20295, + "train_speed(iter/s)": 1.632884 + }, + { + "acc": 0.65398817, + "epoch": 0.5149670218163369, + "grad_norm": 5.53125, + "learning_rate": 8.868500910304863e-06, + "loss": 1.65386562, + "memory(GiB)": 97.17, + "step": 20300, + "train_speed(iter/s)": 1.632927 + }, + { + "acc": 0.65053244, + "epoch": 0.5150938609842719, + "grad_norm": 4.9375, + "learning_rate": 8.86783646769316e-06, + "loss": 1.59877939, + "memory(GiB)": 97.17, + "step": 20305, + "train_speed(iter/s)": 1.63297 + }, + { + "acc": 0.66531544, + "epoch": 0.515220700152207, + "grad_norm": 5.21875, + "learning_rate": 8.867171854956742e-06, + "loss": 1.6009716, + "memory(GiB)": 97.17, + "step": 20310, + "train_speed(iter/s)": 1.633011 + }, + { + "acc": 0.64752612, + "epoch": 0.5153475393201421, + "grad_norm": 8.3125, + "learning_rate": 8.86650707212484e-06, + "loss": 1.62827721, + "memory(GiB)": 97.17, + "step": 20315, + "train_speed(iter/s)": 1.633054 + }, + { + "acc": 0.65645323, + "epoch": 0.5154743784880771, + "grad_norm": 5.65625, + "learning_rate": 8.865842119226693e-06, + "loss": 1.63121605, + "memory(GiB)": 97.17, + "step": 20320, + "train_speed(iter/s)": 1.6331 + }, + { + "acc": 0.64139228, + "epoch": 0.5156012176560122, + "grad_norm": 6.15625, + "learning_rate": 8.86517699629155e-06, + "loss": 1.68119621, + "memory(GiB)": 97.17, + "step": 20325, + "train_speed(iter/s)": 1.633144 + }, + { + "acc": 0.64034743, + "epoch": 0.5157280568239473, + "grad_norm": 5.09375, + "learning_rate": 8.864511703348666e-06, + "loss": 1.65172272, + "memory(GiB)": 97.17, + "step": 20330, + "train_speed(iter/s)": 1.633186 + }, + { + "acc": 0.64373608, + "epoch": 0.5158548959918823, + "grad_norm": 6.5, + "learning_rate": 8.863846240427306e-06, + "loss": 1.68302612, + "memory(GiB)": 97.17, + "step": 20335, + "train_speed(iter/s)": 1.633227 + }, + { + "acc": 0.66251297, + "epoch": 0.5159817351598174, + "grad_norm": 6.28125, + "learning_rate": 8.863180607556733e-06, + "loss": 1.51638041, + "memory(GiB)": 97.17, + "step": 20340, + "train_speed(iter/s)": 1.633269 + }, + { + "acc": 0.64889421, + "epoch": 0.5161085743277524, + "grad_norm": 6.5, + "learning_rate": 8.862514804766234e-06, + "loss": 1.57986498, + "memory(GiB)": 97.17, + "step": 20345, + "train_speed(iter/s)": 1.633314 + }, + { + "acc": 0.64768066, + "epoch": 0.5162354134956875, + "grad_norm": 6.03125, + "learning_rate": 8.861848832085084e-06, + "loss": 1.62796288, + "memory(GiB)": 97.17, + "step": 20350, + "train_speed(iter/s)": 1.63336 + }, + { + "acc": 0.65385237, + "epoch": 0.5163622526636226, + "grad_norm": 5.78125, + "learning_rate": 8.861182689542585e-06, + "loss": 1.60219955, + "memory(GiB)": 97.17, + "step": 20355, + "train_speed(iter/s)": 1.633401 + }, + { + "acc": 0.65883746, + "epoch": 0.5164890918315576, + "grad_norm": 5.90625, + "learning_rate": 8.86051637716803e-06, + "loss": 1.55787125, + "memory(GiB)": 97.17, + "step": 20360, + "train_speed(iter/s)": 1.633445 + }, + { + "acc": 0.64599462, + "epoch": 0.5166159309994927, + "grad_norm": 5.4375, + "learning_rate": 8.859849894990728e-06, + "loss": 1.66964417, + "memory(GiB)": 97.17, + "step": 20365, + "train_speed(iter/s)": 1.633488 + }, + { + "acc": 0.64820824, + "epoch": 0.5167427701674278, + "grad_norm": 5.53125, + "learning_rate": 8.859183243039995e-06, + "loss": 1.65293179, + "memory(GiB)": 97.17, + "step": 20370, + "train_speed(iter/s)": 1.63353 + }, + { + "acc": 0.63783326, + "epoch": 0.5168696093353627, + "grad_norm": 6.03125, + "learning_rate": 8.858516421345154e-06, + "loss": 1.63754845, + "memory(GiB)": 97.17, + "step": 20375, + "train_speed(iter/s)": 1.633574 + }, + { + "acc": 0.65004282, + "epoch": 0.5169964485032978, + "grad_norm": 5.65625, + "learning_rate": 8.857849429935534e-06, + "loss": 1.66275406, + "memory(GiB)": 97.17, + "step": 20380, + "train_speed(iter/s)": 1.633617 + }, + { + "acc": 0.65984912, + "epoch": 0.5171232876712328, + "grad_norm": 6.3125, + "learning_rate": 8.857182268840471e-06, + "loss": 1.6439785, + "memory(GiB)": 97.17, + "step": 20385, + "train_speed(iter/s)": 1.633661 + }, + { + "acc": 0.65073705, + "epoch": 0.5172501268391679, + "grad_norm": 5.1875, + "learning_rate": 8.856514938089312e-06, + "loss": 1.60009708, + "memory(GiB)": 97.17, + "step": 20390, + "train_speed(iter/s)": 1.633701 + }, + { + "acc": 0.6440773, + "epoch": 0.517376966007103, + "grad_norm": 5.59375, + "learning_rate": 8.855847437711407e-06, + "loss": 1.70226879, + "memory(GiB)": 97.17, + "step": 20395, + "train_speed(iter/s)": 1.633744 + }, + { + "acc": 0.64767084, + "epoch": 0.517503805175038, + "grad_norm": 6.25, + "learning_rate": 8.855179767736117e-06, + "loss": 1.65001278, + "memory(GiB)": 97.17, + "step": 20400, + "train_speed(iter/s)": 1.633788 + }, + { + "acc": 0.64709287, + "epoch": 0.5176306443429731, + "grad_norm": 5.34375, + "learning_rate": 8.854511928192808e-06, + "loss": 1.60745468, + "memory(GiB)": 97.17, + "step": 20405, + "train_speed(iter/s)": 1.63383 + }, + { + "acc": 0.64274278, + "epoch": 0.5177574835109082, + "grad_norm": 5.65625, + "learning_rate": 8.853843919110856e-06, + "loss": 1.57408152, + "memory(GiB)": 97.17, + "step": 20410, + "train_speed(iter/s)": 1.633871 + }, + { + "acc": 0.64803972, + "epoch": 0.5178843226788432, + "grad_norm": 6.25, + "learning_rate": 8.853175740519642e-06, + "loss": 1.6757473, + "memory(GiB)": 97.17, + "step": 20415, + "train_speed(iter/s)": 1.633918 + }, + { + "acc": 0.6625638, + "epoch": 0.5180111618467783, + "grad_norm": 6.09375, + "learning_rate": 8.852507392448555e-06, + "loss": 1.58051805, + "memory(GiB)": 97.17, + "step": 20420, + "train_speed(iter/s)": 1.633961 + }, + { + "acc": 0.65181627, + "epoch": 0.5181380010147133, + "grad_norm": 6.9375, + "learning_rate": 8.851838874926994e-06, + "loss": 1.63936996, + "memory(GiB)": 97.17, + "step": 20425, + "train_speed(iter/s)": 1.634003 + }, + { + "acc": 0.64780436, + "epoch": 0.5182648401826484, + "grad_norm": 6.65625, + "learning_rate": 8.851170187984362e-06, + "loss": 1.61182823, + "memory(GiB)": 97.17, + "step": 20430, + "train_speed(iter/s)": 1.634043 + }, + { + "acc": 0.65935707, + "epoch": 0.5183916793505835, + "grad_norm": 6.34375, + "learning_rate": 8.850501331650069e-06, + "loss": 1.57177334, + "memory(GiB)": 97.17, + "step": 20435, + "train_speed(iter/s)": 1.634087 + }, + { + "acc": 0.65161285, + "epoch": 0.5185185185185185, + "grad_norm": 5.71875, + "learning_rate": 8.849832305953536e-06, + "loss": 1.66026421, + "memory(GiB)": 97.17, + "step": 20440, + "train_speed(iter/s)": 1.634129 + }, + { + "acc": 0.65268941, + "epoch": 0.5186453576864536, + "grad_norm": 4.59375, + "learning_rate": 8.849163110924193e-06, + "loss": 1.61938477, + "memory(GiB)": 97.17, + "step": 20445, + "train_speed(iter/s)": 1.634172 + }, + { + "acc": 0.64094872, + "epoch": 0.5187721968543887, + "grad_norm": 5.0625, + "learning_rate": 8.84849374659147e-06, + "loss": 1.67266521, + "memory(GiB)": 97.17, + "step": 20450, + "train_speed(iter/s)": 1.634212 + }, + { + "acc": 0.65699377, + "epoch": 0.5188990360223237, + "grad_norm": 4.59375, + "learning_rate": 8.847824212984807e-06, + "loss": 1.55342484, + "memory(GiB)": 97.17, + "step": 20455, + "train_speed(iter/s)": 1.634258 + }, + { + "acc": 0.66278982, + "epoch": 0.5190258751902588, + "grad_norm": 5.3125, + "learning_rate": 8.84715451013366e-06, + "loss": 1.50579128, + "memory(GiB)": 97.17, + "step": 20460, + "train_speed(iter/s)": 1.6343 + }, + { + "acc": 0.65456476, + "epoch": 0.5191527143581938, + "grad_norm": 6.59375, + "learning_rate": 8.846484638067478e-06, + "loss": 1.5660162, + "memory(GiB)": 97.17, + "step": 20465, + "train_speed(iter/s)": 1.634344 + }, + { + "acc": 0.64687386, + "epoch": 0.5192795535261289, + "grad_norm": 5.21875, + "learning_rate": 8.84581459681573e-06, + "loss": 1.62442856, + "memory(GiB)": 97.17, + "step": 20470, + "train_speed(iter/s)": 1.634388 + }, + { + "acc": 0.66794186, + "epoch": 0.519406392694064, + "grad_norm": 6.15625, + "learning_rate": 8.845144386407884e-06, + "loss": 1.59475269, + "memory(GiB)": 97.17, + "step": 20475, + "train_speed(iter/s)": 1.634424 + }, + { + "acc": 0.64997854, + "epoch": 0.519533231861999, + "grad_norm": 5.0625, + "learning_rate": 8.844474006873422e-06, + "loss": 1.63259506, + "memory(GiB)": 97.17, + "step": 20480, + "train_speed(iter/s)": 1.634466 + }, + { + "acc": 0.6645627, + "epoch": 0.5196600710299341, + "grad_norm": 6.03125, + "learning_rate": 8.84380345824183e-06, + "loss": 1.55536518, + "memory(GiB)": 97.17, + "step": 20485, + "train_speed(iter/s)": 1.634508 + }, + { + "acc": 0.6481904, + "epoch": 0.5197869101978692, + "grad_norm": 6.34375, + "learning_rate": 8.843132740542599e-06, + "loss": 1.63135262, + "memory(GiB)": 97.17, + "step": 20490, + "train_speed(iter/s)": 1.634552 + }, + { + "acc": 0.66286187, + "epoch": 0.5199137493658041, + "grad_norm": 12.875, + "learning_rate": 8.842461853805232e-06, + "loss": 1.52390251, + "memory(GiB)": 97.17, + "step": 20495, + "train_speed(iter/s)": 1.634593 + }, + { + "acc": 0.65427513, + "epoch": 0.5200405885337392, + "grad_norm": 5.65625, + "learning_rate": 8.841790798059237e-06, + "loss": 1.60269718, + "memory(GiB)": 97.17, + "step": 20500, + "train_speed(iter/s)": 1.634634 + }, + { + "acc": 0.67601938, + "epoch": 0.5201674277016742, + "grad_norm": 6.6875, + "learning_rate": 8.84111957333413e-06, + "loss": 1.52135229, + "memory(GiB)": 97.17, + "step": 20505, + "train_speed(iter/s)": 1.634678 + }, + { + "acc": 0.63770156, + "epoch": 0.5202942668696093, + "grad_norm": 6.0, + "learning_rate": 8.840448179659436e-06, + "loss": 1.74112816, + "memory(GiB)": 97.17, + "step": 20510, + "train_speed(iter/s)": 1.634722 + }, + { + "acc": 0.64741273, + "epoch": 0.5204211060375444, + "grad_norm": 5.03125, + "learning_rate": 8.839776617064683e-06, + "loss": 1.63785095, + "memory(GiB)": 97.17, + "step": 20515, + "train_speed(iter/s)": 1.634763 + }, + { + "acc": 0.66083527, + "epoch": 0.5205479452054794, + "grad_norm": 4.96875, + "learning_rate": 8.839104885579413e-06, + "loss": 1.61422634, + "memory(GiB)": 97.17, + "step": 20520, + "train_speed(iter/s)": 1.634806 + }, + { + "acc": 0.64398055, + "epoch": 0.5206747843734145, + "grad_norm": 6.4375, + "learning_rate": 8.83843298523317e-06, + "loss": 1.65961456, + "memory(GiB)": 97.17, + "step": 20525, + "train_speed(iter/s)": 1.63485 + }, + { + "acc": 0.64862199, + "epoch": 0.5208016235413496, + "grad_norm": 6.0625, + "learning_rate": 8.837760916055505e-06, + "loss": 1.64898834, + "memory(GiB)": 97.17, + "step": 20530, + "train_speed(iter/s)": 1.634891 + }, + { + "acc": 0.64246101, + "epoch": 0.5209284627092846, + "grad_norm": 5.5625, + "learning_rate": 8.837088678075983e-06, + "loss": 1.69156151, + "memory(GiB)": 97.17, + "step": 20535, + "train_speed(iter/s)": 1.634933 + }, + { + "acc": 0.6487082, + "epoch": 0.5210553018772197, + "grad_norm": 4.78125, + "learning_rate": 8.836416271324166e-06, + "loss": 1.62909603, + "memory(GiB)": 97.17, + "step": 20540, + "train_speed(iter/s)": 1.634976 + }, + { + "acc": 0.67679324, + "epoch": 0.5211821410451547, + "grad_norm": 4.5, + "learning_rate": 8.835743695829635e-06, + "loss": 1.49806566, + "memory(GiB)": 97.17, + "step": 20545, + "train_speed(iter/s)": 1.635021 + }, + { + "acc": 0.64529123, + "epoch": 0.5213089802130898, + "grad_norm": 6.78125, + "learning_rate": 8.835070951621971e-06, + "loss": 1.66490364, + "memory(GiB)": 97.17, + "step": 20550, + "train_speed(iter/s)": 1.635065 + }, + { + "acc": 0.65202789, + "epoch": 0.5214358193810249, + "grad_norm": 6.03125, + "learning_rate": 8.834398038730765e-06, + "loss": 1.61711502, + "memory(GiB)": 97.17, + "step": 20555, + "train_speed(iter/s)": 1.635108 + }, + { + "acc": 0.68759584, + "epoch": 0.5215626585489599, + "grad_norm": 6.5, + "learning_rate": 8.833724957185612e-06, + "loss": 1.49708958, + "memory(GiB)": 97.17, + "step": 20560, + "train_speed(iter/s)": 1.635152 + }, + { + "acc": 0.64750705, + "epoch": 0.521689497716895, + "grad_norm": 6.6875, + "learning_rate": 8.83305170701612e-06, + "loss": 1.67592335, + "memory(GiB)": 97.17, + "step": 20565, + "train_speed(iter/s)": 1.635195 + }, + { + "acc": 0.66132841, + "epoch": 0.5218163368848301, + "grad_norm": 5.53125, + "learning_rate": 8.832378288251902e-06, + "loss": 1.61383629, + "memory(GiB)": 97.17, + "step": 20570, + "train_speed(iter/s)": 1.635241 + }, + { + "acc": 0.65200987, + "epoch": 0.5219431760527651, + "grad_norm": 5.8125, + "learning_rate": 8.831704700922574e-06, + "loss": 1.64687843, + "memory(GiB)": 97.17, + "step": 20575, + "train_speed(iter/s)": 1.635282 + }, + { + "acc": 0.65637298, + "epoch": 0.5220700152207002, + "grad_norm": 5.96875, + "learning_rate": 8.831030945057767e-06, + "loss": 1.60000801, + "memory(GiB)": 97.17, + "step": 20580, + "train_speed(iter/s)": 1.635324 + }, + { + "acc": 0.66913633, + "epoch": 0.5221968543886352, + "grad_norm": 5.375, + "learning_rate": 8.830357020687115e-06, + "loss": 1.52004366, + "memory(GiB)": 97.17, + "step": 20585, + "train_speed(iter/s)": 1.635364 + }, + { + "acc": 0.65937729, + "epoch": 0.5223236935565703, + "grad_norm": 5.8125, + "learning_rate": 8.82968292784026e-06, + "loss": 1.62688942, + "memory(GiB)": 97.17, + "step": 20590, + "train_speed(iter/s)": 1.635407 + }, + { + "acc": 0.65578389, + "epoch": 0.5224505327245054, + "grad_norm": 6.40625, + "learning_rate": 8.82900866654685e-06, + "loss": 1.60818672, + "memory(GiB)": 97.17, + "step": 20595, + "train_speed(iter/s)": 1.63545 + }, + { + "acc": 0.65954952, + "epoch": 0.5225773718924404, + "grad_norm": 4.875, + "learning_rate": 8.828334236836546e-06, + "loss": 1.59411736, + "memory(GiB)": 97.17, + "step": 20600, + "train_speed(iter/s)": 1.635489 + }, + { + "acc": 0.64722118, + "epoch": 0.5227042110603755, + "grad_norm": 5.0625, + "learning_rate": 8.827659638739007e-06, + "loss": 1.66816025, + "memory(GiB)": 97.17, + "step": 20605, + "train_speed(iter/s)": 1.635531 + }, + { + "acc": 0.65049634, + "epoch": 0.5228310502283106, + "grad_norm": 5.40625, + "learning_rate": 8.82698487228391e-06, + "loss": 1.58742905, + "memory(GiB)": 97.17, + "step": 20610, + "train_speed(iter/s)": 1.635574 + }, + { + "acc": 0.64607558, + "epoch": 0.5229578893962455, + "grad_norm": 5.5625, + "learning_rate": 8.826309937500932e-06, + "loss": 1.60876427, + "memory(GiB)": 97.17, + "step": 20615, + "train_speed(iter/s)": 1.635616 + }, + { + "acc": 0.64737964, + "epoch": 0.5230847285641806, + "grad_norm": 5.1875, + "learning_rate": 8.825634834419758e-06, + "loss": 1.66019268, + "memory(GiB)": 97.17, + "step": 20620, + "train_speed(iter/s)": 1.635659 + }, + { + "acc": 0.66241264, + "epoch": 0.5232115677321156, + "grad_norm": 7.03125, + "learning_rate": 8.824959563070085e-06, + "loss": 1.5532692, + "memory(GiB)": 97.17, + "step": 20625, + "train_speed(iter/s)": 1.6357 + }, + { + "acc": 0.6638639, + "epoch": 0.5233384069000507, + "grad_norm": 5.1875, + "learning_rate": 8.824284123481614e-06, + "loss": 1.57181873, + "memory(GiB)": 97.17, + "step": 20630, + "train_speed(iter/s)": 1.635744 + }, + { + "acc": 0.64825201, + "epoch": 0.5234652460679858, + "grad_norm": 5.40625, + "learning_rate": 8.823608515684053e-06, + "loss": 1.64821243, + "memory(GiB)": 97.17, + "step": 20635, + "train_speed(iter/s)": 1.635787 + }, + { + "acc": 0.64459519, + "epoch": 0.5235920852359208, + "grad_norm": 4.65625, + "learning_rate": 8.822932739707118e-06, + "loss": 1.67736187, + "memory(GiB)": 97.17, + "step": 20640, + "train_speed(iter/s)": 1.635827 + }, + { + "acc": 0.64496059, + "epoch": 0.5237189244038559, + "grad_norm": 5.75, + "learning_rate": 8.822256795580532e-06, + "loss": 1.65869064, + "memory(GiB)": 97.17, + "step": 20645, + "train_speed(iter/s)": 1.635868 + }, + { + "acc": 0.66012797, + "epoch": 0.523845763571791, + "grad_norm": 5.96875, + "learning_rate": 8.821580683334027e-06, + "loss": 1.60667648, + "memory(GiB)": 97.17, + "step": 20650, + "train_speed(iter/s)": 1.635909 + }, + { + "acc": 0.64167061, + "epoch": 0.523972602739726, + "grad_norm": 6.40625, + "learning_rate": 8.820904402997343e-06, + "loss": 1.64902191, + "memory(GiB)": 97.17, + "step": 20655, + "train_speed(iter/s)": 1.635952 + }, + { + "acc": 0.65029426, + "epoch": 0.5240994419076611, + "grad_norm": 5.375, + "learning_rate": 8.820227954600222e-06, + "loss": 1.55934143, + "memory(GiB)": 97.17, + "step": 20660, + "train_speed(iter/s)": 1.635994 + }, + { + "acc": 0.63539953, + "epoch": 0.5242262810755961, + "grad_norm": 5.1875, + "learning_rate": 8.819551338172421e-06, + "loss": 1.69757156, + "memory(GiB)": 97.17, + "step": 20665, + "train_speed(iter/s)": 1.636034 + }, + { + "acc": 0.66527739, + "epoch": 0.5243531202435312, + "grad_norm": 5.125, + "learning_rate": 8.8188745537437e-06, + "loss": 1.61686554, + "memory(GiB)": 97.17, + "step": 20670, + "train_speed(iter/s)": 1.636075 + }, + { + "acc": 0.6516263, + "epoch": 0.5244799594114663, + "grad_norm": 5.875, + "learning_rate": 8.818197601343822e-06, + "loss": 1.60345631, + "memory(GiB)": 97.17, + "step": 20675, + "train_speed(iter/s)": 1.63612 + }, + { + "acc": 0.65005751, + "epoch": 0.5246067985794013, + "grad_norm": 6.96875, + "learning_rate": 8.81752048100257e-06, + "loss": 1.59206944, + "memory(GiB)": 97.17, + "step": 20680, + "train_speed(iter/s)": 1.636163 + }, + { + "acc": 0.63440671, + "epoch": 0.5247336377473364, + "grad_norm": 6.03125, + "learning_rate": 8.816843192749724e-06, + "loss": 1.68802929, + "memory(GiB)": 97.17, + "step": 20685, + "train_speed(iter/s)": 1.636204 + }, + { + "acc": 0.65044222, + "epoch": 0.5248604769152715, + "grad_norm": 6.125, + "learning_rate": 8.816165736615072e-06, + "loss": 1.61524734, + "memory(GiB)": 97.17, + "step": 20690, + "train_speed(iter/s)": 1.636247 + }, + { + "acc": 0.65094843, + "epoch": 0.5249873160832065, + "grad_norm": 6.5625, + "learning_rate": 8.815488112628412e-06, + "loss": 1.62463493, + "memory(GiB)": 97.17, + "step": 20695, + "train_speed(iter/s)": 1.636289 + }, + { + "acc": 0.66658874, + "epoch": 0.5251141552511416, + "grad_norm": 5.71875, + "learning_rate": 8.814810320819551e-06, + "loss": 1.56076031, + "memory(GiB)": 97.17, + "step": 20700, + "train_speed(iter/s)": 1.636328 + }, + { + "acc": 0.65842886, + "epoch": 0.5252409944190766, + "grad_norm": 5.6875, + "learning_rate": 8.8141323612183e-06, + "loss": 1.61330528, + "memory(GiB)": 97.17, + "step": 20705, + "train_speed(iter/s)": 1.636371 + }, + { + "acc": 0.64636955, + "epoch": 0.5253678335870117, + "grad_norm": 7.40625, + "learning_rate": 8.813454233854479e-06, + "loss": 1.62866688, + "memory(GiB)": 97.17, + "step": 20710, + "train_speed(iter/s)": 1.636414 + }, + { + "acc": 0.65614576, + "epoch": 0.5254946727549468, + "grad_norm": 5.59375, + "learning_rate": 8.812775938757914e-06, + "loss": 1.61141777, + "memory(GiB)": 97.17, + "step": 20715, + "train_speed(iter/s)": 1.636455 + }, + { + "acc": 0.67145171, + "epoch": 0.5256215119228818, + "grad_norm": 6.25, + "learning_rate": 8.812097475958442e-06, + "loss": 1.56528406, + "memory(GiB)": 97.17, + "step": 20720, + "train_speed(iter/s)": 1.636499 + }, + { + "acc": 0.64792519, + "epoch": 0.5257483510908169, + "grad_norm": 5.625, + "learning_rate": 8.811418845485902e-06, + "loss": 1.65422173, + "memory(GiB)": 97.17, + "step": 20725, + "train_speed(iter/s)": 1.636544 + }, + { + "acc": 0.65836978, + "epoch": 0.525875190258752, + "grad_norm": 5.5625, + "learning_rate": 8.810740047370146e-06, + "loss": 1.64090042, + "memory(GiB)": 97.17, + "step": 20730, + "train_speed(iter/s)": 1.636586 + }, + { + "acc": 0.67015996, + "epoch": 0.526002029426687, + "grad_norm": 5.625, + "learning_rate": 8.810061081641026e-06, + "loss": 1.62822876, + "memory(GiB)": 97.17, + "step": 20735, + "train_speed(iter/s)": 1.636627 + }, + { + "acc": 0.66730595, + "epoch": 0.526128868594622, + "grad_norm": 4.625, + "learning_rate": 8.809381948328412e-06, + "loss": 1.58117638, + "memory(GiB)": 97.17, + "step": 20740, + "train_speed(iter/s)": 1.636668 + }, + { + "acc": 0.65899262, + "epoch": 0.526255707762557, + "grad_norm": 5.125, + "learning_rate": 8.80870264746217e-06, + "loss": 1.64216938, + "memory(GiB)": 97.17, + "step": 20745, + "train_speed(iter/s)": 1.636709 + }, + { + "acc": 0.64683342, + "epoch": 0.5263825469304921, + "grad_norm": 4.8125, + "learning_rate": 8.808023179072183e-06, + "loss": 1.607337, + "memory(GiB)": 97.17, + "step": 20750, + "train_speed(iter/s)": 1.636749 + }, + { + "acc": 0.6462841, + "epoch": 0.5265093860984272, + "grad_norm": 5.3125, + "learning_rate": 8.807343543188333e-06, + "loss": 1.68285522, + "memory(GiB)": 97.17, + "step": 20755, + "train_speed(iter/s)": 1.636791 + }, + { + "acc": 0.66174536, + "epoch": 0.5266362252663622, + "grad_norm": 6.03125, + "learning_rate": 8.806663739840515e-06, + "loss": 1.61272812, + "memory(GiB)": 97.17, + "step": 20760, + "train_speed(iter/s)": 1.636835 + }, + { + "acc": 0.64558692, + "epoch": 0.5267630644342973, + "grad_norm": 5.78125, + "learning_rate": 8.805983769058633e-06, + "loss": 1.65259418, + "memory(GiB)": 97.17, + "step": 20765, + "train_speed(iter/s)": 1.636874 + }, + { + "acc": 0.66302023, + "epoch": 0.5268899036022324, + "grad_norm": 6.6875, + "learning_rate": 8.80530363087259e-06, + "loss": 1.61344185, + "memory(GiB)": 97.17, + "step": 20770, + "train_speed(iter/s)": 1.636916 + }, + { + "acc": 0.65623193, + "epoch": 0.5270167427701674, + "grad_norm": 5.40625, + "learning_rate": 8.804623325312305e-06, + "loss": 1.66651974, + "memory(GiB)": 97.17, + "step": 20775, + "train_speed(iter/s)": 1.636958 + }, + { + "acc": 0.64837036, + "epoch": 0.5271435819381025, + "grad_norm": 5.21875, + "learning_rate": 8.8039428524077e-06, + "loss": 1.59288073, + "memory(GiB)": 97.17, + "step": 20780, + "train_speed(iter/s)": 1.636999 + }, + { + "acc": 0.66401401, + "epoch": 0.5272704211060375, + "grad_norm": 6.375, + "learning_rate": 8.803262212188703e-06, + "loss": 1.65262375, + "memory(GiB)": 97.17, + "step": 20785, + "train_speed(iter/s)": 1.637042 + }, + { + "acc": 0.65952148, + "epoch": 0.5273972602739726, + "grad_norm": 6.53125, + "learning_rate": 8.802581404685255e-06, + "loss": 1.62571354, + "memory(GiB)": 97.17, + "step": 20790, + "train_speed(iter/s)": 1.637084 + }, + { + "acc": 0.63022528, + "epoch": 0.5275240994419077, + "grad_norm": 5.71875, + "learning_rate": 8.8019004299273e-06, + "loss": 1.67102699, + "memory(GiB)": 97.17, + "step": 20795, + "train_speed(iter/s)": 1.637126 + }, + { + "acc": 0.64707384, + "epoch": 0.5276509386098427, + "grad_norm": 7.3125, + "learning_rate": 8.801219287944788e-06, + "loss": 1.64225349, + "memory(GiB)": 97.17, + "step": 20800, + "train_speed(iter/s)": 1.637168 + }, + { + "acc": 0.65275025, + "epoch": 0.5277777777777778, + "grad_norm": 6.28125, + "learning_rate": 8.800537978767683e-06, + "loss": 1.6052227, + "memory(GiB)": 97.17, + "step": 20805, + "train_speed(iter/s)": 1.637211 + }, + { + "acc": 0.66931887, + "epoch": 0.5279046169457129, + "grad_norm": 6.75, + "learning_rate": 8.79985650242595e-06, + "loss": 1.56394033, + "memory(GiB)": 97.17, + "step": 20810, + "train_speed(iter/s)": 1.637255 + }, + { + "acc": 0.64282403, + "epoch": 0.5280314561136479, + "grad_norm": 5.0, + "learning_rate": 8.79917485894956e-06, + "loss": 1.66500397, + "memory(GiB)": 97.17, + "step": 20815, + "train_speed(iter/s)": 1.637295 + }, + { + "acc": 0.65252514, + "epoch": 0.528158295281583, + "grad_norm": 5.15625, + "learning_rate": 8.798493048368498e-06, + "loss": 1.63258247, + "memory(GiB)": 97.17, + "step": 20820, + "train_speed(iter/s)": 1.637337 + }, + { + "acc": 0.65473585, + "epoch": 0.528285134449518, + "grad_norm": 4.59375, + "learning_rate": 8.797811070712754e-06, + "loss": 1.57983236, + "memory(GiB)": 97.17, + "step": 20825, + "train_speed(iter/s)": 1.637377 + }, + { + "acc": 0.64744306, + "epoch": 0.5284119736174531, + "grad_norm": 5.65625, + "learning_rate": 8.797128926012323e-06, + "loss": 1.6679306, + "memory(GiB)": 97.17, + "step": 20830, + "train_speed(iter/s)": 1.637418 + }, + { + "acc": 0.65198712, + "epoch": 0.5285388127853882, + "grad_norm": 6.96875, + "learning_rate": 8.796446614297208e-06, + "loss": 1.61573257, + "memory(GiB)": 97.17, + "step": 20835, + "train_speed(iter/s)": 1.637459 + }, + { + "acc": 0.65284791, + "epoch": 0.5286656519533232, + "grad_norm": 6.0625, + "learning_rate": 8.795764135597421e-06, + "loss": 1.62490902, + "memory(GiB)": 97.17, + "step": 20840, + "train_speed(iter/s)": 1.637498 + }, + { + "acc": 0.65560951, + "epoch": 0.5287924911212583, + "grad_norm": 4.78125, + "learning_rate": 8.79508148994298e-06, + "loss": 1.62684975, + "memory(GiB)": 97.17, + "step": 20845, + "train_speed(iter/s)": 1.637539 + }, + { + "acc": 0.64139962, + "epoch": 0.5289193302891934, + "grad_norm": 5.65625, + "learning_rate": 8.794398677363913e-06, + "loss": 1.642066, + "memory(GiB)": 97.17, + "step": 20850, + "train_speed(iter/s)": 1.636889 + }, + { + "acc": 0.66660652, + "epoch": 0.5290461694571283, + "grad_norm": 5.6875, + "learning_rate": 8.79371569789025e-06, + "loss": 1.55384274, + "memory(GiB)": 97.17, + "step": 20855, + "train_speed(iter/s)": 1.63693 + }, + { + "acc": 0.65368643, + "epoch": 0.5291730086250634, + "grad_norm": 6.15625, + "learning_rate": 8.793032551552034e-06, + "loss": 1.62286797, + "memory(GiB)": 97.17, + "step": 20860, + "train_speed(iter/s)": 1.636973 + }, + { + "acc": 0.6497963, + "epoch": 0.5292998477929984, + "grad_norm": 5.875, + "learning_rate": 8.792349238379311e-06, + "loss": 1.64692841, + "memory(GiB)": 97.17, + "step": 20865, + "train_speed(iter/s)": 1.637015 + }, + { + "acc": 0.63457546, + "epoch": 0.5294266869609335, + "grad_norm": 5.65625, + "learning_rate": 8.791665758402137e-06, + "loss": 1.6945919, + "memory(GiB)": 97.17, + "step": 20870, + "train_speed(iter/s)": 1.637055 + }, + { + "acc": 0.66274824, + "epoch": 0.5295535261288686, + "grad_norm": 5.875, + "learning_rate": 8.790982111650574e-06, + "loss": 1.60294247, + "memory(GiB)": 97.17, + "step": 20875, + "train_speed(iter/s)": 1.637097 + }, + { + "acc": 0.65766006, + "epoch": 0.5296803652968036, + "grad_norm": 5.09375, + "learning_rate": 8.790298298154694e-06, + "loss": 1.56191568, + "memory(GiB)": 97.17, + "step": 20880, + "train_speed(iter/s)": 1.636424 + }, + { + "acc": 0.64680061, + "epoch": 0.5298072044647387, + "grad_norm": 4.71875, + "learning_rate": 8.78961431794457e-06, + "loss": 1.63118896, + "memory(GiB)": 97.17, + "step": 20885, + "train_speed(iter/s)": 1.635778 + }, + { + "acc": 0.65813251, + "epoch": 0.5299340436326738, + "grad_norm": 5.375, + "learning_rate": 8.78893017105029e-06, + "loss": 1.59893961, + "memory(GiB)": 97.17, + "step": 20890, + "train_speed(iter/s)": 1.635818 + }, + { + "acc": 0.64986544, + "epoch": 0.5300608828006088, + "grad_norm": 5.6875, + "learning_rate": 8.788245857501944e-06, + "loss": 1.58806782, + "memory(GiB)": 97.17, + "step": 20895, + "train_speed(iter/s)": 1.635863 + }, + { + "acc": 0.64958639, + "epoch": 0.5301877219685439, + "grad_norm": 5.65625, + "learning_rate": 8.787561377329633e-06, + "loss": 1.66650715, + "memory(GiB)": 97.17, + "step": 20900, + "train_speed(iter/s)": 1.635905 + }, + { + "acc": 0.6603848, + "epoch": 0.5303145611364789, + "grad_norm": 5.5625, + "learning_rate": 8.786876730563462e-06, + "loss": 1.63459206, + "memory(GiB)": 97.17, + "step": 20905, + "train_speed(iter/s)": 1.635948 + }, + { + "acc": 0.65420365, + "epoch": 0.530441400304414, + "grad_norm": 7.3125, + "learning_rate": 8.786191917233545e-06, + "loss": 1.6224205, + "memory(GiB)": 97.17, + "step": 20910, + "train_speed(iter/s)": 1.635989 + }, + { + "acc": 0.66148334, + "epoch": 0.5305682394723491, + "grad_norm": 5.59375, + "learning_rate": 8.785506937370003e-06, + "loss": 1.57055511, + "memory(GiB)": 97.17, + "step": 20915, + "train_speed(iter/s)": 1.636032 + }, + { + "acc": 0.66252542, + "epoch": 0.5306950786402841, + "grad_norm": 6.84375, + "learning_rate": 8.784821791002965e-06, + "loss": 1.63754253, + "memory(GiB)": 97.17, + "step": 20920, + "train_speed(iter/s)": 1.636074 + }, + { + "acc": 0.6593853, + "epoch": 0.5308219178082192, + "grad_norm": 7.28125, + "learning_rate": 8.784136478162567e-06, + "loss": 1.6666605, + "memory(GiB)": 97.17, + "step": 20925, + "train_speed(iter/s)": 1.636117 + }, + { + "acc": 0.66064806, + "epoch": 0.5309487569761543, + "grad_norm": 6.40625, + "learning_rate": 8.783450998878951e-06, + "loss": 1.5791708, + "memory(GiB)": 97.17, + "step": 20930, + "train_speed(iter/s)": 1.636159 + }, + { + "acc": 0.65172501, + "epoch": 0.5310755961440893, + "grad_norm": 5.0, + "learning_rate": 8.78276535318227e-06, + "loss": 1.61124516, + "memory(GiB)": 97.17, + "step": 20935, + "train_speed(iter/s)": 1.636199 + }, + { + "acc": 0.63314943, + "epoch": 0.5312024353120244, + "grad_norm": 6.5625, + "learning_rate": 8.782079541102678e-06, + "loss": 1.68262787, + "memory(GiB)": 97.17, + "step": 20940, + "train_speed(iter/s)": 1.63624 + }, + { + "acc": 0.64838591, + "epoch": 0.5313292744799594, + "grad_norm": 4.96875, + "learning_rate": 8.781393562670342e-06, + "loss": 1.6325592, + "memory(GiB)": 97.17, + "step": 20945, + "train_speed(iter/s)": 1.636279 + }, + { + "acc": 0.63982205, + "epoch": 0.5314561136478945, + "grad_norm": 5.25, + "learning_rate": 8.780707417915436e-06, + "loss": 1.66564236, + "memory(GiB)": 97.17, + "step": 20950, + "train_speed(iter/s)": 1.63632 + }, + { + "acc": 0.66419077, + "epoch": 0.5315829528158296, + "grad_norm": 5.1875, + "learning_rate": 8.780021106868138e-06, + "loss": 1.50687103, + "memory(GiB)": 97.17, + "step": 20955, + "train_speed(iter/s)": 1.636361 + }, + { + "acc": 0.64624062, + "epoch": 0.5317097919837646, + "grad_norm": 6.71875, + "learning_rate": 8.779334629558633e-06, + "loss": 1.67514515, + "memory(GiB)": 97.17, + "step": 20960, + "train_speed(iter/s)": 1.636399 + }, + { + "acc": 0.65233459, + "epoch": 0.5318366311516997, + "grad_norm": 7.65625, + "learning_rate": 8.77864798601712e-06, + "loss": 1.63725204, + "memory(GiB)": 97.17, + "step": 20965, + "train_speed(iter/s)": 1.63644 + }, + { + "acc": 0.65923095, + "epoch": 0.5319634703196348, + "grad_norm": 5.1875, + "learning_rate": 8.777961176273795e-06, + "loss": 1.62402458, + "memory(GiB)": 97.17, + "step": 20970, + "train_speed(iter/s)": 1.636479 + }, + { + "acc": 0.65343065, + "epoch": 0.5320903094875697, + "grad_norm": 7.34375, + "learning_rate": 8.777274200358873e-06, + "loss": 1.66068497, + "memory(GiB)": 97.17, + "step": 20975, + "train_speed(iter/s)": 1.636522 + }, + { + "acc": 0.66354771, + "epoch": 0.5322171486555048, + "grad_norm": 5.40625, + "learning_rate": 8.776587058302566e-06, + "loss": 1.61062431, + "memory(GiB)": 97.17, + "step": 20980, + "train_speed(iter/s)": 1.636564 + }, + { + "acc": 0.65908747, + "epoch": 0.5323439878234398, + "grad_norm": 5.0, + "learning_rate": 8.7758997501351e-06, + "loss": 1.57970219, + "memory(GiB)": 97.17, + "step": 20985, + "train_speed(iter/s)": 1.636606 + }, + { + "acc": 0.66279473, + "epoch": 0.5324708269913749, + "grad_norm": 6.0, + "learning_rate": 8.775212275886705e-06, + "loss": 1.60897942, + "memory(GiB)": 97.17, + "step": 20990, + "train_speed(iter/s)": 1.636649 + }, + { + "acc": 0.63885632, + "epoch": 0.53259766615931, + "grad_norm": 6.5, + "learning_rate": 8.774524635587617e-06, + "loss": 1.66984577, + "memory(GiB)": 97.17, + "step": 20995, + "train_speed(iter/s)": 1.636691 + }, + { + "acc": 0.65384226, + "epoch": 0.532724505327245, + "grad_norm": 5.21875, + "learning_rate": 8.773836829268084e-06, + "loss": 1.63063908, + "memory(GiB)": 97.17, + "step": 21000, + "train_speed(iter/s)": 1.636734 + }, + { + "epoch": 0.532724505327245, + "eval_acc": 0.6440223281689523, + "eval_loss": 1.5857573747634888, + "eval_runtime": 58.3644, + "eval_samples_per_second": 109.142, + "eval_steps_per_second": 27.294, + "step": 21000 + }, + { + "acc": 0.64876814, + "epoch": 0.5328513444951801, + "grad_norm": 5.46875, + "learning_rate": 8.77314885695836e-06, + "loss": 1.59493008, + "memory(GiB)": 97.17, + "step": 21005, + "train_speed(iter/s)": 1.628842 + }, + { + "acc": 0.65038333, + "epoch": 0.5329781836631152, + "grad_norm": 5.40625, + "learning_rate": 8.772460718688702e-06, + "loss": 1.59678936, + "memory(GiB)": 97.17, + "step": 21010, + "train_speed(iter/s)": 1.628888 + }, + { + "acc": 0.66392612, + "epoch": 0.5331050228310502, + "grad_norm": 6.1875, + "learning_rate": 8.771772414489379e-06, + "loss": 1.57642765, + "memory(GiB)": 97.17, + "step": 21015, + "train_speed(iter/s)": 1.628936 + }, + { + "acc": 0.64752407, + "epoch": 0.5332318619989853, + "grad_norm": 6.40625, + "learning_rate": 8.771083944390665e-06, + "loss": 1.64965706, + "memory(GiB)": 97.17, + "step": 21020, + "train_speed(iter/s)": 1.628983 + }, + { + "acc": 0.64618082, + "epoch": 0.5333587011669203, + "grad_norm": 4.84375, + "learning_rate": 8.770395308422842e-06, + "loss": 1.67151146, + "memory(GiB)": 97.17, + "step": 21025, + "train_speed(iter/s)": 1.629026 + }, + { + "acc": 0.65674653, + "epoch": 0.5334855403348554, + "grad_norm": 6.0625, + "learning_rate": 8.769706506616201e-06, + "loss": 1.6231432, + "memory(GiB)": 97.17, + "step": 21030, + "train_speed(iter/s)": 1.629074 + }, + { + "acc": 0.66036348, + "epoch": 0.5336123795027905, + "grad_norm": 4.3125, + "learning_rate": 8.769017539001037e-06, + "loss": 1.58847218, + "memory(GiB)": 97.17, + "step": 21035, + "train_speed(iter/s)": 1.62912 + }, + { + "acc": 0.65948958, + "epoch": 0.5337392186707255, + "grad_norm": 5.875, + "learning_rate": 8.768328405607655e-06, + "loss": 1.63342094, + "memory(GiB)": 97.17, + "step": 21040, + "train_speed(iter/s)": 1.629166 + }, + { + "acc": 0.64525099, + "epoch": 0.5338660578386606, + "grad_norm": 5.25, + "learning_rate": 8.767639106466364e-06, + "loss": 1.69602432, + "memory(GiB)": 97.17, + "step": 21045, + "train_speed(iter/s)": 1.629213 + }, + { + "acc": 0.65140467, + "epoch": 0.5339928970065957, + "grad_norm": 4.96875, + "learning_rate": 8.766949641607484e-06, + "loss": 1.62470818, + "memory(GiB)": 97.17, + "step": 21050, + "train_speed(iter/s)": 1.629259 + }, + { + "acc": 0.65899959, + "epoch": 0.5341197361745307, + "grad_norm": 6.125, + "learning_rate": 8.76626001106134e-06, + "loss": 1.61285095, + "memory(GiB)": 97.17, + "step": 21055, + "train_speed(iter/s)": 1.629304 + }, + { + "acc": 0.66049242, + "epoch": 0.5342465753424658, + "grad_norm": 5.46875, + "learning_rate": 8.765570214858268e-06, + "loss": 1.543892, + "memory(GiB)": 97.17, + "step": 21060, + "train_speed(iter/s)": 1.62934 + }, + { + "acc": 0.65370889, + "epoch": 0.5343734145104008, + "grad_norm": 6.0, + "learning_rate": 8.764880253028604e-06, + "loss": 1.57371101, + "memory(GiB)": 97.17, + "step": 21065, + "train_speed(iter/s)": 1.629386 + }, + { + "acc": 0.64696016, + "epoch": 0.5345002536783359, + "grad_norm": 4.4375, + "learning_rate": 8.764190125602698e-06, + "loss": 1.59636192, + "memory(GiB)": 97.17, + "step": 21070, + "train_speed(iter/s)": 1.62943 + }, + { + "acc": 0.65373049, + "epoch": 0.534627092846271, + "grad_norm": 5.875, + "learning_rate": 8.763499832610904e-06, + "loss": 1.64795647, + "memory(GiB)": 97.17, + "step": 21075, + "train_speed(iter/s)": 1.629478 + }, + { + "acc": 0.66816235, + "epoch": 0.534753932014206, + "grad_norm": 4.90625, + "learning_rate": 8.762809374083585e-06, + "loss": 1.54908638, + "memory(GiB)": 97.17, + "step": 21080, + "train_speed(iter/s)": 1.629522 + }, + { + "acc": 0.64169078, + "epoch": 0.5348807711821411, + "grad_norm": 7.5, + "learning_rate": 8.76211875005111e-06, + "loss": 1.65061417, + "memory(GiB)": 97.17, + "step": 21085, + "train_speed(iter/s)": 1.62957 + }, + { + "acc": 0.67778893, + "epoch": 0.5350076103500762, + "grad_norm": 7.15625, + "learning_rate": 8.761427960543854e-06, + "loss": 1.52431011, + "memory(GiB)": 97.17, + "step": 21090, + "train_speed(iter/s)": 1.629617 + }, + { + "acc": 0.65496731, + "epoch": 0.5351344495180111, + "grad_norm": 5.625, + "learning_rate": 8.760737005592205e-06, + "loss": 1.62793102, + "memory(GiB)": 97.17, + "step": 21095, + "train_speed(iter/s)": 1.629663 + }, + { + "acc": 0.68127284, + "epoch": 0.5352612886859462, + "grad_norm": 4.6875, + "learning_rate": 8.760045885226551e-06, + "loss": 1.5818656, + "memory(GiB)": 97.17, + "step": 21100, + "train_speed(iter/s)": 1.629075 + }, + { + "acc": 0.66091471, + "epoch": 0.5353881278538812, + "grad_norm": 5.875, + "learning_rate": 8.759354599477293e-06, + "loss": 1.62469769, + "memory(GiB)": 97.17, + "step": 21105, + "train_speed(iter/s)": 1.629122 + }, + { + "acc": 0.6668292, + "epoch": 0.5355149670218163, + "grad_norm": 5.75, + "learning_rate": 8.758663148374833e-06, + "loss": 1.53694038, + "memory(GiB)": 97.17, + "step": 21110, + "train_speed(iter/s)": 1.629169 + }, + { + "acc": 0.66185007, + "epoch": 0.5356418061897514, + "grad_norm": 5.40625, + "learning_rate": 8.757971531949587e-06, + "loss": 1.63506279, + "memory(GiB)": 97.17, + "step": 21115, + "train_speed(iter/s)": 1.629214 + }, + { + "acc": 0.65972338, + "epoch": 0.5357686453576864, + "grad_norm": 5.0, + "learning_rate": 8.757279750231977e-06, + "loss": 1.58701916, + "memory(GiB)": 97.17, + "step": 21120, + "train_speed(iter/s)": 1.629261 + }, + { + "acc": 0.66191249, + "epoch": 0.5358954845256215, + "grad_norm": 5.25, + "learning_rate": 8.756587803252426e-06, + "loss": 1.55750599, + "memory(GiB)": 97.17, + "step": 21125, + "train_speed(iter/s)": 1.629307 + }, + { + "acc": 0.65840745, + "epoch": 0.5360223236935566, + "grad_norm": 5.53125, + "learning_rate": 8.755895691041373e-06, + "loss": 1.61100273, + "memory(GiB)": 97.17, + "step": 21130, + "train_speed(iter/s)": 1.629353 + }, + { + "acc": 0.65723963, + "epoch": 0.5361491628614916, + "grad_norm": 4.96875, + "learning_rate": 8.755203413629257e-06, + "loss": 1.61247616, + "memory(GiB)": 97.17, + "step": 21135, + "train_speed(iter/s)": 1.629398 + }, + { + "acc": 0.66113396, + "epoch": 0.5362760020294267, + "grad_norm": 5.34375, + "learning_rate": 8.75451097104653e-06, + "loss": 1.5241684, + "memory(GiB)": 97.17, + "step": 21140, + "train_speed(iter/s)": 1.629445 + }, + { + "acc": 0.65825863, + "epoch": 0.5364028411973617, + "grad_norm": 5.40625, + "learning_rate": 8.75381836332365e-06, + "loss": 1.65647945, + "memory(GiB)": 97.17, + "step": 21145, + "train_speed(iter/s)": 1.62949 + }, + { + "acc": 0.63270392, + "epoch": 0.5365296803652968, + "grad_norm": 5.3125, + "learning_rate": 8.753125590491077e-06, + "loss": 1.64848175, + "memory(GiB)": 97.17, + "step": 21150, + "train_speed(iter/s)": 1.629537 + }, + { + "acc": 0.66235132, + "epoch": 0.5366565195332319, + "grad_norm": 4.78125, + "learning_rate": 8.752432652579284e-06, + "loss": 1.56671915, + "memory(GiB)": 97.17, + "step": 21155, + "train_speed(iter/s)": 1.629583 + }, + { + "acc": 0.64957347, + "epoch": 0.5367833587011669, + "grad_norm": 6.53125, + "learning_rate": 8.751739549618749e-06, + "loss": 1.64176674, + "memory(GiB)": 97.17, + "step": 21160, + "train_speed(iter/s)": 1.629629 + }, + { + "acc": 0.65781221, + "epoch": 0.536910197869102, + "grad_norm": 6.5, + "learning_rate": 8.751046281639958e-06, + "loss": 1.57351341, + "memory(GiB)": 97.17, + "step": 21165, + "train_speed(iter/s)": 1.629675 + }, + { + "acc": 0.65551076, + "epoch": 0.5370370370370371, + "grad_norm": 4.875, + "learning_rate": 8.750352848673405e-06, + "loss": 1.64728775, + "memory(GiB)": 97.17, + "step": 21170, + "train_speed(iter/s)": 1.629722 + }, + { + "acc": 0.64609766, + "epoch": 0.5371638762049721, + "grad_norm": 5.6875, + "learning_rate": 8.749659250749589e-06, + "loss": 1.6993351, + "memory(GiB)": 97.17, + "step": 21175, + "train_speed(iter/s)": 1.629768 + }, + { + "acc": 0.66735706, + "epoch": 0.5372907153729072, + "grad_norm": 6.78125, + "learning_rate": 8.748965487899019e-06, + "loss": 1.57400665, + "memory(GiB)": 97.17, + "step": 21180, + "train_speed(iter/s)": 1.629815 + }, + { + "acc": 0.65819178, + "epoch": 0.5374175545408422, + "grad_norm": 6.25, + "learning_rate": 8.748271560152208e-06, + "loss": 1.59758873, + "memory(GiB)": 97.17, + "step": 21185, + "train_speed(iter/s)": 1.629863 + }, + { + "acc": 0.6487606, + "epoch": 0.5375443937087773, + "grad_norm": 5.53125, + "learning_rate": 8.74757746753968e-06, + "loss": 1.64695549, + "memory(GiB)": 97.17, + "step": 21190, + "train_speed(iter/s)": 1.629912 + }, + { + "acc": 0.65843048, + "epoch": 0.5376712328767124, + "grad_norm": 6.375, + "learning_rate": 8.746883210091963e-06, + "loss": 1.52678165, + "memory(GiB)": 97.17, + "step": 21195, + "train_speed(iter/s)": 1.62996 + }, + { + "acc": 0.67876468, + "epoch": 0.5377980720446474, + "grad_norm": 4.875, + "learning_rate": 8.746188787839593e-06, + "loss": 1.49834757, + "memory(GiB)": 97.17, + "step": 21200, + "train_speed(iter/s)": 1.630008 + }, + { + "acc": 0.65346189, + "epoch": 0.5379249112125825, + "grad_norm": 5.125, + "learning_rate": 8.745494200813116e-06, + "loss": 1.62275333, + "memory(GiB)": 97.17, + "step": 21205, + "train_speed(iter/s)": 1.630054 + }, + { + "acc": 0.64756126, + "epoch": 0.5380517503805176, + "grad_norm": 7.375, + "learning_rate": 8.74479944904308e-06, + "loss": 1.62635345, + "memory(GiB)": 97.17, + "step": 21210, + "train_speed(iter/s)": 1.6301 + }, + { + "acc": 0.64901819, + "epoch": 0.5381785895484525, + "grad_norm": 5.59375, + "learning_rate": 8.744104532560047e-06, + "loss": 1.62624359, + "memory(GiB)": 97.17, + "step": 21215, + "train_speed(iter/s)": 1.630145 + }, + { + "acc": 0.64173245, + "epoch": 0.5383054287163876, + "grad_norm": 5.53125, + "learning_rate": 8.74340945139458e-06, + "loss": 1.72811394, + "memory(GiB)": 97.17, + "step": 21220, + "train_speed(iter/s)": 1.630192 + }, + { + "acc": 0.65192585, + "epoch": 0.5384322678843226, + "grad_norm": 8.375, + "learning_rate": 8.742714205577251e-06, + "loss": 1.63894882, + "memory(GiB)": 97.17, + "step": 21225, + "train_speed(iter/s)": 1.630238 + }, + { + "acc": 0.66877565, + "epoch": 0.5385591070522577, + "grad_norm": 4.96875, + "learning_rate": 8.742018795138642e-06, + "loss": 1.49626751, + "memory(GiB)": 97.17, + "step": 21230, + "train_speed(iter/s)": 1.630284 + }, + { + "acc": 0.65010915, + "epoch": 0.5386859462201928, + "grad_norm": 5.65625, + "learning_rate": 8.74132322010934e-06, + "loss": 1.6074913, + "memory(GiB)": 97.17, + "step": 21235, + "train_speed(iter/s)": 1.630327 + }, + { + "acc": 0.64931459, + "epoch": 0.5388127853881278, + "grad_norm": 5.1875, + "learning_rate": 8.740627480519937e-06, + "loss": 1.58404293, + "memory(GiB)": 97.17, + "step": 21240, + "train_speed(iter/s)": 1.630372 + }, + { + "acc": 0.64686995, + "epoch": 0.5389396245560629, + "grad_norm": 6.375, + "learning_rate": 8.739931576401037e-06, + "loss": 1.62725353, + "memory(GiB)": 97.17, + "step": 21245, + "train_speed(iter/s)": 1.630417 + }, + { + "acc": 0.65141277, + "epoch": 0.539066463723998, + "grad_norm": 5.3125, + "learning_rate": 8.73923550778325e-06, + "loss": 1.54246845, + "memory(GiB)": 97.17, + "step": 21250, + "train_speed(iter/s)": 1.630464 + }, + { + "acc": 0.66038904, + "epoch": 0.539193302891933, + "grad_norm": 5.84375, + "learning_rate": 8.73853927469719e-06, + "loss": 1.63263245, + "memory(GiB)": 97.17, + "step": 21255, + "train_speed(iter/s)": 1.630507 + }, + { + "acc": 0.65081282, + "epoch": 0.5393201420598681, + "grad_norm": 6.625, + "learning_rate": 8.73784287717348e-06, + "loss": 1.6154726, + "memory(GiB)": 97.17, + "step": 21260, + "train_speed(iter/s)": 1.630554 + }, + { + "acc": 0.62822676, + "epoch": 0.5394469812278031, + "grad_norm": 5.3125, + "learning_rate": 8.737146315242755e-06, + "loss": 1.71875343, + "memory(GiB)": 97.17, + "step": 21265, + "train_speed(iter/s)": 1.630599 + }, + { + "acc": 0.64272547, + "epoch": 0.5395738203957382, + "grad_norm": 5.4375, + "learning_rate": 8.73644958893565e-06, + "loss": 1.67719517, + "memory(GiB)": 97.17, + "step": 21270, + "train_speed(iter/s)": 1.630648 + }, + { + "acc": 0.66212773, + "epoch": 0.5397006595636733, + "grad_norm": 4.90625, + "learning_rate": 8.735752698282807e-06, + "loss": 1.57490864, + "memory(GiB)": 97.17, + "step": 21275, + "train_speed(iter/s)": 1.630693 + }, + { + "acc": 0.67078214, + "epoch": 0.5398274987316083, + "grad_norm": 5.25, + "learning_rate": 8.735055643314883e-06, + "loss": 1.66050434, + "memory(GiB)": 97.17, + "step": 21280, + "train_speed(iter/s)": 1.63074 + }, + { + "acc": 0.64633322, + "epoch": 0.5399543378995434, + "grad_norm": 5.3125, + "learning_rate": 8.734358424062536e-06, + "loss": 1.66605797, + "memory(GiB)": 97.17, + "step": 21285, + "train_speed(iter/s)": 1.630782 + }, + { + "acc": 0.66149521, + "epoch": 0.5400811770674785, + "grad_norm": 4.9375, + "learning_rate": 8.733661040556433e-06, + "loss": 1.60061684, + "memory(GiB)": 97.17, + "step": 21290, + "train_speed(iter/s)": 1.630828 + }, + { + "acc": 0.638094, + "epoch": 0.5402080162354135, + "grad_norm": 5.53125, + "learning_rate": 8.732963492827248e-06, + "loss": 1.64067459, + "memory(GiB)": 97.17, + "step": 21295, + "train_speed(iter/s)": 1.630873 + }, + { + "acc": 0.64380012, + "epoch": 0.5403348554033486, + "grad_norm": 5.09375, + "learning_rate": 8.732265780905661e-06, + "loss": 1.64288254, + "memory(GiB)": 97.17, + "step": 21300, + "train_speed(iter/s)": 1.630917 + }, + { + "acc": 0.64279027, + "epoch": 0.5404616945712836, + "grad_norm": 5.1875, + "learning_rate": 8.731567904822362e-06, + "loss": 1.6680933, + "memory(GiB)": 97.17, + "step": 21305, + "train_speed(iter/s)": 1.630962 + }, + { + "acc": 0.66126328, + "epoch": 0.5405885337392187, + "grad_norm": 5.28125, + "learning_rate": 8.730869864608047e-06, + "loss": 1.48987989, + "memory(GiB)": 97.17, + "step": 21310, + "train_speed(iter/s)": 1.631008 + }, + { + "acc": 0.64659724, + "epoch": 0.5407153729071538, + "grad_norm": 6.0625, + "learning_rate": 8.730171660293418e-06, + "loss": 1.70109501, + "memory(GiB)": 97.17, + "step": 21315, + "train_speed(iter/s)": 1.631055 + }, + { + "acc": 0.65225129, + "epoch": 0.5408422120750888, + "grad_norm": 4.625, + "learning_rate": 8.729473291909185e-06, + "loss": 1.60611572, + "memory(GiB)": 97.17, + "step": 21320, + "train_speed(iter/s)": 1.631102 + }, + { + "acc": 0.64276223, + "epoch": 0.5409690512430239, + "grad_norm": 5.90625, + "learning_rate": 8.728774759486065e-06, + "loss": 1.63817921, + "memory(GiB)": 97.17, + "step": 21325, + "train_speed(iter/s)": 1.63115 + }, + { + "acc": 0.64491348, + "epoch": 0.541095890410959, + "grad_norm": 5.09375, + "learning_rate": 8.728076063054786e-06, + "loss": 1.60837955, + "memory(GiB)": 97.17, + "step": 21330, + "train_speed(iter/s)": 1.631196 + }, + { + "acc": 0.65019789, + "epoch": 0.541222729578894, + "grad_norm": 6.03125, + "learning_rate": 8.727377202646074e-06, + "loss": 1.71281738, + "memory(GiB)": 97.17, + "step": 21335, + "train_speed(iter/s)": 1.631244 + }, + { + "acc": 0.66167102, + "epoch": 0.541349568746829, + "grad_norm": 7.15625, + "learning_rate": 8.726678178290673e-06, + "loss": 1.65932713, + "memory(GiB)": 97.17, + "step": 21340, + "train_speed(iter/s)": 1.631289 + }, + { + "acc": 0.6633606, + "epoch": 0.541476407914764, + "grad_norm": 6.3125, + "learning_rate": 8.725978990019326e-06, + "loss": 1.60574989, + "memory(GiB)": 97.17, + "step": 21345, + "train_speed(iter/s)": 1.631338 + }, + { + "acc": 0.6633574, + "epoch": 0.5416032470826991, + "grad_norm": 4.90625, + "learning_rate": 8.72527963786279e-06, + "loss": 1.58699722, + "memory(GiB)": 97.17, + "step": 21350, + "train_speed(iter/s)": 1.631381 + }, + { + "acc": 0.65025959, + "epoch": 0.5417300862506342, + "grad_norm": 4.90625, + "learning_rate": 8.72458012185182e-06, + "loss": 1.69510059, + "memory(GiB)": 97.17, + "step": 21355, + "train_speed(iter/s)": 1.631423 + }, + { + "acc": 0.65867424, + "epoch": 0.5418569254185692, + "grad_norm": 5.15625, + "learning_rate": 8.72388044201719e-06, + "loss": 1.57125511, + "memory(GiB)": 97.17, + "step": 21360, + "train_speed(iter/s)": 1.631468 + }, + { + "acc": 0.67092819, + "epoch": 0.5419837645865043, + "grad_norm": 5.90625, + "learning_rate": 8.723180598389671e-06, + "loss": 1.50119743, + "memory(GiB)": 97.17, + "step": 21365, + "train_speed(iter/s)": 1.631512 + }, + { + "acc": 0.64988117, + "epoch": 0.5421106037544394, + "grad_norm": 4.375, + "learning_rate": 8.722480591000046e-06, + "loss": 1.63654594, + "memory(GiB)": 97.17, + "step": 21370, + "train_speed(iter/s)": 1.631556 + }, + { + "acc": 0.67209129, + "epoch": 0.5422374429223744, + "grad_norm": 8.5, + "learning_rate": 8.721780419879106e-06, + "loss": 1.52114229, + "memory(GiB)": 97.17, + "step": 21375, + "train_speed(iter/s)": 1.631603 + }, + { + "acc": 0.65188808, + "epoch": 0.5423642820903095, + "grad_norm": 4.875, + "learning_rate": 8.721080085057646e-06, + "loss": 1.64187241, + "memory(GiB)": 97.17, + "step": 21380, + "train_speed(iter/s)": 1.63165 + }, + { + "acc": 0.6517755, + "epoch": 0.5424911212582445, + "grad_norm": 7.96875, + "learning_rate": 8.72037958656647e-06, + "loss": 1.70594139, + "memory(GiB)": 97.17, + "step": 21385, + "train_speed(iter/s)": 1.631697 + }, + { + "acc": 0.6375782, + "epoch": 0.5426179604261796, + "grad_norm": 5.125, + "learning_rate": 8.71967892443639e-06, + "loss": 1.64758358, + "memory(GiB)": 97.17, + "step": 21390, + "train_speed(iter/s)": 1.631742 + }, + { + "acc": 0.66278372, + "epoch": 0.5427447995941147, + "grad_norm": 6.34375, + "learning_rate": 8.718978098698226e-06, + "loss": 1.59483213, + "memory(GiB)": 97.17, + "step": 21395, + "train_speed(iter/s)": 1.631787 + }, + { + "acc": 0.65543542, + "epoch": 0.5428716387620497, + "grad_norm": 6.59375, + "learning_rate": 8.718277109382799e-06, + "loss": 1.64448128, + "memory(GiB)": 97.17, + "step": 21400, + "train_speed(iter/s)": 1.631834 + }, + { + "acc": 0.65426188, + "epoch": 0.5429984779299848, + "grad_norm": 6.75, + "learning_rate": 8.717575956520942e-06, + "loss": 1.66474266, + "memory(GiB)": 97.17, + "step": 21405, + "train_speed(iter/s)": 1.631882 + }, + { + "acc": 0.6654305, + "epoch": 0.5431253170979199, + "grad_norm": 4.78125, + "learning_rate": 8.716874640143498e-06, + "loss": 1.55772953, + "memory(GiB)": 97.17, + "step": 21410, + "train_speed(iter/s)": 1.631927 + }, + { + "acc": 0.65467491, + "epoch": 0.5432521562658549, + "grad_norm": 7.875, + "learning_rate": 8.716173160281315e-06, + "loss": 1.66262207, + "memory(GiB)": 97.17, + "step": 21415, + "train_speed(iter/s)": 1.631971 + }, + { + "acc": 0.63358483, + "epoch": 0.54337899543379, + "grad_norm": 7.21875, + "learning_rate": 8.715471516965242e-06, + "loss": 1.69256821, + "memory(GiB)": 97.17, + "step": 21420, + "train_speed(iter/s)": 1.632016 + }, + { + "acc": 0.64574442, + "epoch": 0.543505834601725, + "grad_norm": 6.15625, + "learning_rate": 8.714769710226144e-06, + "loss": 1.65064392, + "memory(GiB)": 97.17, + "step": 21425, + "train_speed(iter/s)": 1.632061 + }, + { + "acc": 0.6545187, + "epoch": 0.5436326737696601, + "grad_norm": 5.75, + "learning_rate": 8.714067740094888e-06, + "loss": 1.63890152, + "memory(GiB)": 97.17, + "step": 21430, + "train_speed(iter/s)": 1.632107 + }, + { + "acc": 0.67132926, + "epoch": 0.5437595129375952, + "grad_norm": 8.4375, + "learning_rate": 8.713365606602353e-06, + "loss": 1.58325291, + "memory(GiB)": 97.17, + "step": 21435, + "train_speed(iter/s)": 1.632154 + }, + { + "acc": 0.67222395, + "epoch": 0.5438863521055302, + "grad_norm": 5.9375, + "learning_rate": 8.71266330977942e-06, + "loss": 1.50973949, + "memory(GiB)": 97.17, + "step": 21440, + "train_speed(iter/s)": 1.632196 + }, + { + "acc": 0.67877855, + "epoch": 0.5440131912734653, + "grad_norm": 5.28125, + "learning_rate": 8.711960849656975e-06, + "loss": 1.52223024, + "memory(GiB)": 97.17, + "step": 21445, + "train_speed(iter/s)": 1.63224 + }, + { + "acc": 0.64687319, + "epoch": 0.5441400304414004, + "grad_norm": 6.75, + "learning_rate": 8.711258226265922e-06, + "loss": 1.62021103, + "memory(GiB)": 97.17, + "step": 21450, + "train_speed(iter/s)": 1.632283 + }, + { + "acc": 0.66055937, + "epoch": 0.5442668696093353, + "grad_norm": 5.875, + "learning_rate": 8.710555439637163e-06, + "loss": 1.65854111, + "memory(GiB)": 97.17, + "step": 21455, + "train_speed(iter/s)": 1.63233 + }, + { + "acc": 0.64457908, + "epoch": 0.5443937087772704, + "grad_norm": 6.65625, + "learning_rate": 8.709852489801608e-06, + "loss": 1.58159885, + "memory(GiB)": 97.17, + "step": 21460, + "train_speed(iter/s)": 1.632375 + }, + { + "acc": 0.6482645, + "epoch": 0.5445205479452054, + "grad_norm": 6.28125, + "learning_rate": 8.709149376790177e-06, + "loss": 1.64121399, + "memory(GiB)": 97.17, + "step": 21465, + "train_speed(iter/s)": 1.632422 + }, + { + "acc": 0.65086312, + "epoch": 0.5446473871131405, + "grad_norm": 5.71875, + "learning_rate": 8.708446100633796e-06, + "loss": 1.64037914, + "memory(GiB)": 97.17, + "step": 21470, + "train_speed(iter/s)": 1.632467 + }, + { + "acc": 0.68844948, + "epoch": 0.5447742262810756, + "grad_norm": 6.46875, + "learning_rate": 8.707742661363401e-06, + "loss": 1.47648354, + "memory(GiB)": 97.17, + "step": 21475, + "train_speed(iter/s)": 1.632515 + }, + { + "acc": 0.63630157, + "epoch": 0.5449010654490106, + "grad_norm": 6.40625, + "learning_rate": 8.707039059009927e-06, + "loss": 1.72131348, + "memory(GiB)": 97.17, + "step": 21480, + "train_speed(iter/s)": 1.632558 + }, + { + "acc": 0.66428332, + "epoch": 0.5450279046169457, + "grad_norm": 6.46875, + "learning_rate": 8.706335293604326e-06, + "loss": 1.60624733, + "memory(GiB)": 97.17, + "step": 21485, + "train_speed(iter/s)": 1.632602 + }, + { + "acc": 0.65580721, + "epoch": 0.5451547437848808, + "grad_norm": 5.34375, + "learning_rate": 8.705631365177552e-06, + "loss": 1.65646057, + "memory(GiB)": 97.17, + "step": 21490, + "train_speed(iter/s)": 1.632649 + }, + { + "acc": 0.65630984, + "epoch": 0.5452815829528158, + "grad_norm": 6.3125, + "learning_rate": 8.704927273760563e-06, + "loss": 1.58290529, + "memory(GiB)": 97.17, + "step": 21495, + "train_speed(iter/s)": 1.632695 + }, + { + "acc": 0.65003924, + "epoch": 0.5454084221207509, + "grad_norm": 5.125, + "learning_rate": 8.704223019384334e-06, + "loss": 1.63767433, + "memory(GiB)": 97.17, + "step": 21500, + "train_speed(iter/s)": 1.632739 + }, + { + "acc": 0.66435966, + "epoch": 0.5455352612886859, + "grad_norm": 6.375, + "learning_rate": 8.703518602079836e-06, + "loss": 1.57400694, + "memory(GiB)": 97.17, + "step": 21505, + "train_speed(iter/s)": 1.632784 + }, + { + "acc": 0.64868693, + "epoch": 0.545662100456621, + "grad_norm": 5.5625, + "learning_rate": 8.702814021878057e-06, + "loss": 1.6368927, + "memory(GiB)": 97.17, + "step": 21510, + "train_speed(iter/s)": 1.63283 + }, + { + "acc": 0.64857607, + "epoch": 0.5457889396245561, + "grad_norm": 5.21875, + "learning_rate": 8.702109278809985e-06, + "loss": 1.66303978, + "memory(GiB)": 97.17, + "step": 21515, + "train_speed(iter/s)": 1.632876 + }, + { + "acc": 0.66084538, + "epoch": 0.5459157787924911, + "grad_norm": 4.75, + "learning_rate": 8.70140437290662e-06, + "loss": 1.61717606, + "memory(GiB)": 97.17, + "step": 21520, + "train_speed(iter/s)": 1.632918 + }, + { + "acc": 0.64758983, + "epoch": 0.5460426179604262, + "grad_norm": 5.4375, + "learning_rate": 8.700699304198963e-06, + "loss": 1.68869362, + "memory(GiB)": 97.17, + "step": 21525, + "train_speed(iter/s)": 1.632963 + }, + { + "acc": 0.65050988, + "epoch": 0.5461694571283613, + "grad_norm": 5.25, + "learning_rate": 8.699994072718026e-06, + "loss": 1.65058823, + "memory(GiB)": 97.17, + "step": 21530, + "train_speed(iter/s)": 1.633006 + }, + { + "acc": 0.66107302, + "epoch": 0.5462962962962963, + "grad_norm": 5.96875, + "learning_rate": 8.699288678494833e-06, + "loss": 1.55940018, + "memory(GiB)": 97.17, + "step": 21535, + "train_speed(iter/s)": 1.633052 + }, + { + "acc": 0.64216099, + "epoch": 0.5464231354642314, + "grad_norm": 5.34375, + "learning_rate": 8.698583121560407e-06, + "loss": 1.6853466, + "memory(GiB)": 97.17, + "step": 21540, + "train_speed(iter/s)": 1.633098 + }, + { + "acc": 0.65122237, + "epoch": 0.5465499746321664, + "grad_norm": 5.625, + "learning_rate": 8.697877401945784e-06, + "loss": 1.62169094, + "memory(GiB)": 97.17, + "step": 21545, + "train_speed(iter/s)": 1.633143 + }, + { + "acc": 0.66671586, + "epoch": 0.5466768138001015, + "grad_norm": 5.40625, + "learning_rate": 8.697171519682002e-06, + "loss": 1.56995211, + "memory(GiB)": 97.17, + "step": 21550, + "train_speed(iter/s)": 1.633189 + }, + { + "acc": 0.65399394, + "epoch": 0.5468036529680366, + "grad_norm": 4.875, + "learning_rate": 8.696465474800109e-06, + "loss": 1.60602703, + "memory(GiB)": 97.17, + "step": 21555, + "train_speed(iter/s)": 1.633232 + }, + { + "acc": 0.65313334, + "epoch": 0.5469304921359716, + "grad_norm": 5.4375, + "learning_rate": 8.695759267331162e-06, + "loss": 1.63794708, + "memory(GiB)": 97.17, + "step": 21560, + "train_speed(iter/s)": 1.633277 + }, + { + "acc": 0.65162725, + "epoch": 0.5470573313039067, + "grad_norm": 4.84375, + "learning_rate": 8.69505289730622e-06, + "loss": 1.62994118, + "memory(GiB)": 97.17, + "step": 21565, + "train_speed(iter/s)": 1.633321 + }, + { + "acc": 0.66639128, + "epoch": 0.5471841704718418, + "grad_norm": 6.96875, + "learning_rate": 8.694346364756356e-06, + "loss": 1.604953, + "memory(GiB)": 97.17, + "step": 21570, + "train_speed(iter/s)": 1.633365 + }, + { + "acc": 0.65384355, + "epoch": 0.5473110096397767, + "grad_norm": 5.03125, + "learning_rate": 8.693639669712645e-06, + "loss": 1.63351345, + "memory(GiB)": 97.17, + "step": 21575, + "train_speed(iter/s)": 1.633407 + }, + { + "acc": 0.65933371, + "epoch": 0.5474378488077118, + "grad_norm": 6.53125, + "learning_rate": 8.692932812206171e-06, + "loss": 1.56650009, + "memory(GiB)": 97.17, + "step": 21580, + "train_speed(iter/s)": 1.633449 + }, + { + "acc": 0.64212232, + "epoch": 0.5475646879756468, + "grad_norm": 5.09375, + "learning_rate": 8.692225792268023e-06, + "loss": 1.62280388, + "memory(GiB)": 97.17, + "step": 21585, + "train_speed(iter/s)": 1.633494 + }, + { + "acc": 0.65291519, + "epoch": 0.5476915271435819, + "grad_norm": 6.4375, + "learning_rate": 8.691518609929302e-06, + "loss": 1.64678917, + "memory(GiB)": 97.17, + "step": 21590, + "train_speed(iter/s)": 1.633538 + }, + { + "acc": 0.6526032, + "epoch": 0.547818366311517, + "grad_norm": 5.03125, + "learning_rate": 8.690811265221108e-06, + "loss": 1.59640846, + "memory(GiB)": 97.17, + "step": 21595, + "train_speed(iter/s)": 1.633579 + }, + { + "acc": 0.66505475, + "epoch": 0.547945205479452, + "grad_norm": 5.375, + "learning_rate": 8.690103758174558e-06, + "loss": 1.60830574, + "memory(GiB)": 97.17, + "step": 21600, + "train_speed(iter/s)": 1.633624 + }, + { + "acc": 0.62413301, + "epoch": 0.5480720446473871, + "grad_norm": 6.09375, + "learning_rate": 8.68939608882077e-06, + "loss": 1.66306019, + "memory(GiB)": 97.17, + "step": 21605, + "train_speed(iter/s)": 1.633669 + }, + { + "acc": 0.64723406, + "epoch": 0.5481988838153222, + "grad_norm": 6.15625, + "learning_rate": 8.688688257190869e-06, + "loss": 1.63699303, + "memory(GiB)": 97.17, + "step": 21610, + "train_speed(iter/s)": 1.633714 + }, + { + "acc": 0.67335544, + "epoch": 0.5483257229832572, + "grad_norm": 7.6875, + "learning_rate": 8.68798026331599e-06, + "loss": 1.51924582, + "memory(GiB)": 97.17, + "step": 21615, + "train_speed(iter/s)": 1.633759 + }, + { + "acc": 0.65182371, + "epoch": 0.5484525621511923, + "grad_norm": 5.1875, + "learning_rate": 8.687272107227274e-06, + "loss": 1.6607935, + "memory(GiB)": 97.17, + "step": 21620, + "train_speed(iter/s)": 1.633802 + }, + { + "acc": 0.63714275, + "epoch": 0.5485794013191273, + "grad_norm": 4.53125, + "learning_rate": 8.686563788955867e-06, + "loss": 1.66709213, + "memory(GiB)": 97.17, + "step": 21625, + "train_speed(iter/s)": 1.633847 + }, + { + "acc": 0.65819426, + "epoch": 0.5487062404870624, + "grad_norm": 5.53125, + "learning_rate": 8.685855308532926e-06, + "loss": 1.58025532, + "memory(GiB)": 97.17, + "step": 21630, + "train_speed(iter/s)": 1.633892 + }, + { + "acc": 0.65978975, + "epoch": 0.5488330796549975, + "grad_norm": 6.8125, + "learning_rate": 8.685146665989613e-06, + "loss": 1.60376511, + "memory(GiB)": 97.17, + "step": 21635, + "train_speed(iter/s)": 1.633937 + }, + { + "acc": 0.64683123, + "epoch": 0.5489599188229325, + "grad_norm": 5.6875, + "learning_rate": 8.684437861357095e-06, + "loss": 1.60137978, + "memory(GiB)": 97.17, + "step": 21640, + "train_speed(iter/s)": 1.633983 + }, + { + "acc": 0.65049801, + "epoch": 0.5490867579908676, + "grad_norm": 5.46875, + "learning_rate": 8.683728894666551e-06, + "loss": 1.56815739, + "memory(GiB)": 97.17, + "step": 21645, + "train_speed(iter/s)": 1.634029 + }, + { + "acc": 0.65585585, + "epoch": 0.5492135971588027, + "grad_norm": 5.46875, + "learning_rate": 8.683019765949163e-06, + "loss": 1.62692013, + "memory(GiB)": 97.17, + "step": 21650, + "train_speed(iter/s)": 1.634074 + }, + { + "acc": 0.64181685, + "epoch": 0.5493404363267377, + "grad_norm": 5.84375, + "learning_rate": 8.682310475236123e-06, + "loss": 1.61168747, + "memory(GiB)": 97.17, + "step": 21655, + "train_speed(iter/s)": 1.634118 + }, + { + "acc": 0.64873557, + "epoch": 0.5494672754946728, + "grad_norm": 6.15625, + "learning_rate": 8.681601022558628e-06, + "loss": 1.64048157, + "memory(GiB)": 97.17, + "step": 21660, + "train_speed(iter/s)": 1.634163 + }, + { + "acc": 0.65478134, + "epoch": 0.5495941146626078, + "grad_norm": 5.28125, + "learning_rate": 8.680891407947882e-06, + "loss": 1.59257851, + "memory(GiB)": 97.17, + "step": 21665, + "train_speed(iter/s)": 1.634208 + }, + { + "acc": 0.64225292, + "epoch": 0.5497209538305429, + "grad_norm": 5.375, + "learning_rate": 8.680181631435098e-06, + "loss": 1.72277775, + "memory(GiB)": 97.17, + "step": 21670, + "train_speed(iter/s)": 1.634253 + }, + { + "acc": 0.65724702, + "epoch": 0.549847792998478, + "grad_norm": 6.90625, + "learning_rate": 8.679471693051495e-06, + "loss": 1.61637383, + "memory(GiB)": 97.17, + "step": 21675, + "train_speed(iter/s)": 1.634297 + }, + { + "acc": 0.66019516, + "epoch": 0.549974632166413, + "grad_norm": 5.6875, + "learning_rate": 8.678761592828301e-06, + "loss": 1.59520149, + "memory(GiB)": 97.17, + "step": 21680, + "train_speed(iter/s)": 1.63434 + }, + { + "acc": 0.65677004, + "epoch": 0.5501014713343481, + "grad_norm": 5.5, + "learning_rate": 8.678051330796746e-06, + "loss": 1.56796055, + "memory(GiB)": 97.17, + "step": 21685, + "train_speed(iter/s)": 1.634378 + }, + { + "acc": 0.63619041, + "epoch": 0.5502283105022832, + "grad_norm": 6.1875, + "learning_rate": 8.677340906988072e-06, + "loss": 1.62516232, + "memory(GiB)": 97.17, + "step": 21690, + "train_speed(iter/s)": 1.634422 + }, + { + "acc": 0.65493727, + "epoch": 0.5503551496702181, + "grad_norm": 5.8125, + "learning_rate": 8.676630321433528e-06, + "loss": 1.58001223, + "memory(GiB)": 97.17, + "step": 21695, + "train_speed(iter/s)": 1.634466 + }, + { + "acc": 0.65300012, + "epoch": 0.5504819888381532, + "grad_norm": 5.3125, + "learning_rate": 8.675919574164366e-06, + "loss": 1.62814541, + "memory(GiB)": 97.17, + "step": 21700, + "train_speed(iter/s)": 1.634509 + }, + { + "acc": 0.65359364, + "epoch": 0.5506088280060882, + "grad_norm": 4.75, + "learning_rate": 8.675208665211851e-06, + "loss": 1.60846596, + "memory(GiB)": 97.17, + "step": 21705, + "train_speed(iter/s)": 1.634552 + }, + { + "acc": 0.64589133, + "epoch": 0.5507356671740233, + "grad_norm": 8.375, + "learning_rate": 8.674497594607249e-06, + "loss": 1.63216724, + "memory(GiB)": 97.17, + "step": 21710, + "train_speed(iter/s)": 1.634595 + }, + { + "acc": 0.64513469, + "epoch": 0.5508625063419584, + "grad_norm": 7.03125, + "learning_rate": 8.673786362381837e-06, + "loss": 1.67052593, + "memory(GiB)": 97.17, + "step": 21715, + "train_speed(iter/s)": 1.63464 + }, + { + "acc": 0.64713788, + "epoch": 0.5509893455098934, + "grad_norm": 4.6875, + "learning_rate": 8.673074968566899e-06, + "loss": 1.58162365, + "memory(GiB)": 97.17, + "step": 21720, + "train_speed(iter/s)": 1.634684 + }, + { + "acc": 0.63161049, + "epoch": 0.5511161846778285, + "grad_norm": 6.21875, + "learning_rate": 8.672363413193724e-06, + "loss": 1.70378265, + "memory(GiB)": 97.17, + "step": 21725, + "train_speed(iter/s)": 1.634727 + }, + { + "acc": 0.65538349, + "epoch": 0.5512430238457636, + "grad_norm": 4.625, + "learning_rate": 8.671651696293613e-06, + "loss": 1.59071178, + "memory(GiB)": 97.17, + "step": 21730, + "train_speed(iter/s)": 1.634771 + }, + { + "acc": 0.64878416, + "epoch": 0.5513698630136986, + "grad_norm": 5.9375, + "learning_rate": 8.670939817897865e-06, + "loss": 1.66455212, + "memory(GiB)": 97.17, + "step": 21735, + "train_speed(iter/s)": 1.634816 + }, + { + "acc": 0.64152088, + "epoch": 0.5514967021816337, + "grad_norm": 4.53125, + "learning_rate": 8.670227778037796e-06, + "loss": 1.64212399, + "memory(GiB)": 97.17, + "step": 21740, + "train_speed(iter/s)": 1.634858 + }, + { + "acc": 0.65355515, + "epoch": 0.5516235413495687, + "grad_norm": 6.40625, + "learning_rate": 8.669515576744722e-06, + "loss": 1.6184082, + "memory(GiB)": 97.17, + "step": 21745, + "train_speed(iter/s)": 1.6349 + }, + { + "acc": 0.6453692, + "epoch": 0.5517503805175038, + "grad_norm": 5.8125, + "learning_rate": 8.66880321404997e-06, + "loss": 1.64840527, + "memory(GiB)": 97.17, + "step": 21750, + "train_speed(iter/s)": 1.634944 + }, + { + "acc": 0.66755161, + "epoch": 0.5518772196854389, + "grad_norm": 6.75, + "learning_rate": 8.668090689984872e-06, + "loss": 1.5847641, + "memory(GiB)": 97.17, + "step": 21755, + "train_speed(iter/s)": 1.634989 + }, + { + "acc": 0.64707861, + "epoch": 0.5520040588533739, + "grad_norm": 5.75, + "learning_rate": 8.667378004580769e-06, + "loss": 1.59607029, + "memory(GiB)": 97.17, + "step": 21760, + "train_speed(iter/s)": 1.635033 + }, + { + "acc": 0.66107101, + "epoch": 0.552130898021309, + "grad_norm": 5.4375, + "learning_rate": 8.666665157869007e-06, + "loss": 1.58932552, + "memory(GiB)": 97.17, + "step": 21765, + "train_speed(iter/s)": 1.635079 + }, + { + "acc": 0.65089655, + "epoch": 0.5522577371892441, + "grad_norm": 5.4375, + "learning_rate": 8.665952149880942e-06, + "loss": 1.66242371, + "memory(GiB)": 97.17, + "step": 21770, + "train_speed(iter/s)": 1.635122 + }, + { + "acc": 0.666045, + "epoch": 0.5523845763571791, + "grad_norm": 5.90625, + "learning_rate": 8.665238980647934e-06, + "loss": 1.5191741, + "memory(GiB)": 97.17, + "step": 21775, + "train_speed(iter/s)": 1.635165 + }, + { + "acc": 0.64814553, + "epoch": 0.5525114155251142, + "grad_norm": 6.03125, + "learning_rate": 8.66452565020135e-06, + "loss": 1.65093784, + "memory(GiB)": 97.17, + "step": 21780, + "train_speed(iter/s)": 1.635207 + }, + { + "acc": 0.65193038, + "epoch": 0.5526382546930492, + "grad_norm": 6.25, + "learning_rate": 8.663812158572568e-06, + "loss": 1.63222961, + "memory(GiB)": 97.17, + "step": 21785, + "train_speed(iter/s)": 1.63525 + }, + { + "acc": 0.6616415, + "epoch": 0.5527650938609843, + "grad_norm": 7.09375, + "learning_rate": 8.663098505792971e-06, + "loss": 1.56274929, + "memory(GiB)": 97.17, + "step": 21790, + "train_speed(iter/s)": 1.635294 + }, + { + "acc": 0.64735346, + "epoch": 0.5528919330289194, + "grad_norm": 5.25, + "learning_rate": 8.662384691893947e-06, + "loss": 1.59049015, + "memory(GiB)": 97.17, + "step": 21795, + "train_speed(iter/s)": 1.635337 + }, + { + "acc": 0.65642405, + "epoch": 0.5530187721968544, + "grad_norm": 5.40625, + "learning_rate": 8.661670716906889e-06, + "loss": 1.52714348, + "memory(GiB)": 97.17, + "step": 21800, + "train_speed(iter/s)": 1.63538 + }, + { + "acc": 0.65531926, + "epoch": 0.5531456113647895, + "grad_norm": 5.96875, + "learning_rate": 8.66095658086321e-06, + "loss": 1.67893257, + "memory(GiB)": 97.17, + "step": 21805, + "train_speed(iter/s)": 1.635422 + }, + { + "acc": 0.64629855, + "epoch": 0.5532724505327246, + "grad_norm": 5.71875, + "learning_rate": 8.660242283794312e-06, + "loss": 1.60194397, + "memory(GiB)": 97.17, + "step": 21810, + "train_speed(iter/s)": 1.635466 + }, + { + "acc": 0.64400001, + "epoch": 0.5533992897006595, + "grad_norm": 6.5625, + "learning_rate": 8.659527825731617e-06, + "loss": 1.66130409, + "memory(GiB)": 97.17, + "step": 21815, + "train_speed(iter/s)": 1.635508 + }, + { + "acc": 0.64950542, + "epoch": 0.5535261288685946, + "grad_norm": 6.21875, + "learning_rate": 8.65881320670655e-06, + "loss": 1.60069313, + "memory(GiB)": 97.17, + "step": 21820, + "train_speed(iter/s)": 1.635549 + }, + { + "acc": 0.65981803, + "epoch": 0.5536529680365296, + "grad_norm": 5.375, + "learning_rate": 8.658098426750543e-06, + "loss": 1.61864071, + "memory(GiB)": 97.17, + "step": 21825, + "train_speed(iter/s)": 1.635593 + }, + { + "acc": 0.64138765, + "epoch": 0.5537798072044647, + "grad_norm": 4.9375, + "learning_rate": 8.657383485895034e-06, + "loss": 1.70235882, + "memory(GiB)": 97.17, + "step": 21830, + "train_speed(iter/s)": 1.635638 + }, + { + "acc": 0.6707552, + "epoch": 0.5539066463723998, + "grad_norm": 5.90625, + "learning_rate": 8.656668384171472e-06, + "loss": 1.6362215, + "memory(GiB)": 97.17, + "step": 21835, + "train_speed(iter/s)": 1.635681 + }, + { + "acc": 0.67066364, + "epoch": 0.5540334855403348, + "grad_norm": 7.125, + "learning_rate": 8.655953121611307e-06, + "loss": 1.58275146, + "memory(GiB)": 97.17, + "step": 21840, + "train_speed(iter/s)": 1.635725 + }, + { + "acc": 0.65124965, + "epoch": 0.5541603247082699, + "grad_norm": 5.125, + "learning_rate": 8.655237698246002e-06, + "loss": 1.62097054, + "memory(GiB)": 97.17, + "step": 21845, + "train_speed(iter/s)": 1.63577 + }, + { + "acc": 0.65502911, + "epoch": 0.554287163876205, + "grad_norm": 6.375, + "learning_rate": 8.654522114107024e-06, + "loss": 1.56840305, + "memory(GiB)": 97.17, + "step": 21850, + "train_speed(iter/s)": 1.635812 + }, + { + "acc": 0.65515871, + "epoch": 0.55441400304414, + "grad_norm": 5.53125, + "learning_rate": 8.653806369225846e-06, + "loss": 1.60781307, + "memory(GiB)": 97.17, + "step": 21855, + "train_speed(iter/s)": 1.635855 + }, + { + "acc": 0.64324269, + "epoch": 0.5545408422120751, + "grad_norm": 5.875, + "learning_rate": 8.65309046363395e-06, + "loss": 1.60527706, + "memory(GiB)": 97.17, + "step": 21860, + "train_speed(iter/s)": 1.635897 + }, + { + "acc": 0.6480732, + "epoch": 0.5546676813800101, + "grad_norm": 5.53125, + "learning_rate": 8.652374397362828e-06, + "loss": 1.59619026, + "memory(GiB)": 97.17, + "step": 21865, + "train_speed(iter/s)": 1.635937 + }, + { + "acc": 0.65826082, + "epoch": 0.5547945205479452, + "grad_norm": 4.65625, + "learning_rate": 8.651658170443972e-06, + "loss": 1.51217985, + "memory(GiB)": 97.17, + "step": 21870, + "train_speed(iter/s)": 1.635978 + }, + { + "acc": 0.65649519, + "epoch": 0.5549213597158803, + "grad_norm": 6.15625, + "learning_rate": 8.650941782908886e-06, + "loss": 1.58350449, + "memory(GiB)": 97.17, + "step": 21875, + "train_speed(iter/s)": 1.636023 + }, + { + "acc": 0.66670685, + "epoch": 0.5550481988838153, + "grad_norm": 4.34375, + "learning_rate": 8.65022523478908e-06, + "loss": 1.53149109, + "memory(GiB)": 97.17, + "step": 21880, + "train_speed(iter/s)": 1.636066 + }, + { + "acc": 0.65694299, + "epoch": 0.5551750380517504, + "grad_norm": 5.5, + "learning_rate": 8.649508526116073e-06, + "loss": 1.57991695, + "memory(GiB)": 97.17, + "step": 21885, + "train_speed(iter/s)": 1.63611 + }, + { + "acc": 0.64611058, + "epoch": 0.5553018772196855, + "grad_norm": 6.03125, + "learning_rate": 8.648791656921384e-06, + "loss": 1.57871151, + "memory(GiB)": 97.17, + "step": 21890, + "train_speed(iter/s)": 1.636155 + }, + { + "acc": 0.65757208, + "epoch": 0.5554287163876205, + "grad_norm": 5.6875, + "learning_rate": 8.648074627236549e-06, + "loss": 1.60997257, + "memory(GiB)": 97.17, + "step": 21895, + "train_speed(iter/s)": 1.636195 + }, + { + "acc": 0.65856309, + "epoch": 0.5555555555555556, + "grad_norm": 7.34375, + "learning_rate": 8.647357437093104e-06, + "loss": 1.60625763, + "memory(GiB)": 97.17, + "step": 21900, + "train_speed(iter/s)": 1.636238 + }, + { + "acc": 0.66049991, + "epoch": 0.5556823947234906, + "grad_norm": 5.1875, + "learning_rate": 8.646640086522595e-06, + "loss": 1.63476467, + "memory(GiB)": 97.17, + "step": 21905, + "train_speed(iter/s)": 1.636282 + }, + { + "acc": 0.65205097, + "epoch": 0.5558092338914257, + "grad_norm": 5.90625, + "learning_rate": 8.645922575556575e-06, + "loss": 1.68087444, + "memory(GiB)": 97.17, + "step": 21910, + "train_speed(iter/s)": 1.636327 + }, + { + "acc": 0.65282979, + "epoch": 0.5559360730593608, + "grad_norm": 4.9375, + "learning_rate": 8.645204904226601e-06, + "loss": 1.63369827, + "memory(GiB)": 97.17, + "step": 21915, + "train_speed(iter/s)": 1.63637 + }, + { + "acc": 0.66559925, + "epoch": 0.5560629122272958, + "grad_norm": 6.375, + "learning_rate": 8.64448707256424e-06, + "loss": 1.54760084, + "memory(GiB)": 97.17, + "step": 21920, + "train_speed(iter/s)": 1.636416 + }, + { + "acc": 0.66131949, + "epoch": 0.5561897513952309, + "grad_norm": 4.65625, + "learning_rate": 8.643769080601067e-06, + "loss": 1.56847315, + "memory(GiB)": 97.17, + "step": 21925, + "train_speed(iter/s)": 1.63646 + }, + { + "acc": 0.66901631, + "epoch": 0.556316590563166, + "grad_norm": 6.03125, + "learning_rate": 8.643050928368661e-06, + "loss": 1.54112282, + "memory(GiB)": 97.17, + "step": 21930, + "train_speed(iter/s)": 1.636504 + }, + { + "acc": 0.6561614, + "epoch": 0.556443429731101, + "grad_norm": 6.21875, + "learning_rate": 8.642332615898611e-06, + "loss": 1.59263697, + "memory(GiB)": 97.17, + "step": 21935, + "train_speed(iter/s)": 1.636545 + }, + { + "acc": 0.65573463, + "epoch": 0.556570268899036, + "grad_norm": 5.53125, + "learning_rate": 8.64161414322251e-06, + "loss": 1.5521946, + "memory(GiB)": 97.17, + "step": 21940, + "train_speed(iter/s)": 1.636586 + }, + { + "acc": 0.67051821, + "epoch": 0.556697108066971, + "grad_norm": 6.0, + "learning_rate": 8.64089551037196e-06, + "loss": 1.55594463, + "memory(GiB)": 97.17, + "step": 21945, + "train_speed(iter/s)": 1.636628 + }, + { + "acc": 0.65225635, + "epoch": 0.5568239472349061, + "grad_norm": 4.6875, + "learning_rate": 8.640176717378573e-06, + "loss": 1.58434238, + "memory(GiB)": 97.17, + "step": 21950, + "train_speed(iter/s)": 1.636668 + }, + { + "acc": 0.66207571, + "epoch": 0.5569507864028412, + "grad_norm": 5.9375, + "learning_rate": 8.639457764273957e-06, + "loss": 1.63933716, + "memory(GiB)": 97.17, + "step": 21955, + "train_speed(iter/s)": 1.636711 + }, + { + "acc": 0.63854728, + "epoch": 0.5570776255707762, + "grad_norm": 6.53125, + "learning_rate": 8.638738651089744e-06, + "loss": 1.63071098, + "memory(GiB)": 97.17, + "step": 21960, + "train_speed(iter/s)": 1.636754 + }, + { + "acc": 0.64369378, + "epoch": 0.5572044647387113, + "grad_norm": 5.9375, + "learning_rate": 8.638019377857555e-06, + "loss": 1.6409647, + "memory(GiB)": 97.17, + "step": 21965, + "train_speed(iter/s)": 1.636796 + }, + { + "acc": 0.65234113, + "epoch": 0.5573313039066464, + "grad_norm": 7.8125, + "learning_rate": 8.637299944609034e-06, + "loss": 1.69292946, + "memory(GiB)": 97.17, + "step": 21970, + "train_speed(iter/s)": 1.636837 + }, + { + "acc": 0.66183348, + "epoch": 0.5574581430745814, + "grad_norm": 6.9375, + "learning_rate": 8.636580351375821e-06, + "loss": 1.62541199, + "memory(GiB)": 97.17, + "step": 21975, + "train_speed(iter/s)": 1.636879 + }, + { + "acc": 0.64304957, + "epoch": 0.5575849822425165, + "grad_norm": 6.5625, + "learning_rate": 8.635860598189569e-06, + "loss": 1.70774117, + "memory(GiB)": 97.17, + "step": 21980, + "train_speed(iter/s)": 1.63692 + }, + { + "acc": 0.64546146, + "epoch": 0.5577118214104515, + "grad_norm": 5.1875, + "learning_rate": 8.635140685081936e-06, + "loss": 1.66176224, + "memory(GiB)": 97.17, + "step": 21985, + "train_speed(iter/s)": 1.636964 + }, + { + "acc": 0.65537148, + "epoch": 0.5578386605783866, + "grad_norm": 6.125, + "learning_rate": 8.634420612084583e-06, + "loss": 1.57952919, + "memory(GiB)": 97.17, + "step": 21990, + "train_speed(iter/s)": 1.637004 + }, + { + "acc": 0.64952412, + "epoch": 0.5579654997463217, + "grad_norm": 5.25, + "learning_rate": 8.633700379229187e-06, + "loss": 1.63313599, + "memory(GiB)": 97.17, + "step": 21995, + "train_speed(iter/s)": 1.637047 + }, + { + "acc": 0.65133219, + "epoch": 0.5580923389142567, + "grad_norm": 6.125, + "learning_rate": 8.632979986547423e-06, + "loss": 1.69516029, + "memory(GiB)": 97.17, + "step": 22000, + "train_speed(iter/s)": 1.63709 + }, + { + "epoch": 0.5580923389142567, + "eval_acc": 0.6442837840942076, + "eval_loss": 1.584275245666504, + "eval_runtime": 57.9033, + "eval_samples_per_second": 110.011, + "eval_steps_per_second": 27.511, + "step": 22000 + }, + { + "acc": 0.64766521, + "epoch": 0.5582191780821918, + "grad_norm": 5.21875, + "learning_rate": 8.632259434070982e-06, + "loss": 1.65357399, + "memory(GiB)": 97.17, + "step": 22005, + "train_speed(iter/s)": 1.629594 + }, + { + "acc": 0.67265396, + "epoch": 0.5583460172501269, + "grad_norm": 5.46875, + "learning_rate": 8.631538721831551e-06, + "loss": 1.59529648, + "memory(GiB)": 97.17, + "step": 22010, + "train_speed(iter/s)": 1.62964 + }, + { + "acc": 0.64182444, + "epoch": 0.5584728564180619, + "grad_norm": 4.9375, + "learning_rate": 8.630817849860835e-06, + "loss": 1.67003822, + "memory(GiB)": 97.17, + "step": 22015, + "train_speed(iter/s)": 1.629678 + }, + { + "acc": 0.65431232, + "epoch": 0.558599695585997, + "grad_norm": 6.65625, + "learning_rate": 8.63009681819054e-06, + "loss": 1.59051275, + "memory(GiB)": 97.17, + "step": 22020, + "train_speed(iter/s)": 1.629718 + }, + { + "acc": 0.64806523, + "epoch": 0.558726534753932, + "grad_norm": 5.34375, + "learning_rate": 8.629375626852378e-06, + "loss": 1.58648911, + "memory(GiB)": 97.17, + "step": 22025, + "train_speed(iter/s)": 1.629757 + }, + { + "acc": 0.64907637, + "epoch": 0.5588533739218671, + "grad_norm": 4.5, + "learning_rate": 8.628654275878074e-06, + "loss": 1.64522209, + "memory(GiB)": 97.17, + "step": 22030, + "train_speed(iter/s)": 1.629796 + }, + { + "acc": 0.65537271, + "epoch": 0.5589802130898022, + "grad_norm": 5.03125, + "learning_rate": 8.627932765299353e-06, + "loss": 1.58960819, + "memory(GiB)": 97.17, + "step": 22035, + "train_speed(iter/s)": 1.629835 + }, + { + "acc": 0.65957298, + "epoch": 0.5591070522577372, + "grad_norm": 5.3125, + "learning_rate": 8.627211095147952e-06, + "loss": 1.56123638, + "memory(GiB)": 97.17, + "step": 22040, + "train_speed(iter/s)": 1.62988 + }, + { + "acc": 0.66828861, + "epoch": 0.5592338914256723, + "grad_norm": 6.15625, + "learning_rate": 8.626489265455614e-06, + "loss": 1.67097893, + "memory(GiB)": 97.17, + "step": 22045, + "train_speed(iter/s)": 1.629919 + }, + { + "acc": 0.6471066, + "epoch": 0.5593607305936074, + "grad_norm": 6.4375, + "learning_rate": 8.625767276254084e-06, + "loss": 1.64611778, + "memory(GiB)": 97.17, + "step": 22050, + "train_speed(iter/s)": 1.629961 + }, + { + "acc": 0.64427881, + "epoch": 0.5594875697615423, + "grad_norm": 5.4375, + "learning_rate": 8.625045127575123e-06, + "loss": 1.65598946, + "memory(GiB)": 97.17, + "step": 22055, + "train_speed(iter/s)": 1.63 + }, + { + "acc": 0.66052151, + "epoch": 0.5596144089294774, + "grad_norm": 8.8125, + "learning_rate": 8.624322819450493e-06, + "loss": 1.65792542, + "memory(GiB)": 97.17, + "step": 22060, + "train_speed(iter/s)": 1.630041 + }, + { + "acc": 0.67088552, + "epoch": 0.5597412480974124, + "grad_norm": 8.625, + "learning_rate": 8.623600351911962e-06, + "loss": 1.54815693, + "memory(GiB)": 97.17, + "step": 22065, + "train_speed(iter/s)": 1.630082 + }, + { + "acc": 0.66115246, + "epoch": 0.5598680872653475, + "grad_norm": 5.0, + "learning_rate": 8.622877724991312e-06, + "loss": 1.62382908, + "memory(GiB)": 97.17, + "step": 22070, + "train_speed(iter/s)": 1.630107 + }, + { + "acc": 0.66171041, + "epoch": 0.5599949264332826, + "grad_norm": 4.90625, + "learning_rate": 8.622154938720323e-06, + "loss": 1.56196642, + "memory(GiB)": 97.17, + "step": 22075, + "train_speed(iter/s)": 1.630146 + }, + { + "acc": 0.64730992, + "epoch": 0.5601217656012176, + "grad_norm": 5.125, + "learning_rate": 8.621431993130787e-06, + "loss": 1.64873505, + "memory(GiB)": 97.17, + "step": 22080, + "train_speed(iter/s)": 1.630185 + }, + { + "acc": 0.66908636, + "epoch": 0.5602486047691527, + "grad_norm": 4.96875, + "learning_rate": 8.620708888254506e-06, + "loss": 1.57481318, + "memory(GiB)": 97.17, + "step": 22085, + "train_speed(iter/s)": 1.630224 + }, + { + "acc": 0.66095982, + "epoch": 0.5603754439370878, + "grad_norm": 5.65625, + "learning_rate": 8.619985624123282e-06, + "loss": 1.5818306, + "memory(GiB)": 97.17, + "step": 22090, + "train_speed(iter/s)": 1.630265 + }, + { + "acc": 0.66841831, + "epoch": 0.5605022831050228, + "grad_norm": 4.8125, + "learning_rate": 8.619262200768928e-06, + "loss": 1.53226871, + "memory(GiB)": 97.17, + "step": 22095, + "train_speed(iter/s)": 1.630305 + }, + { + "acc": 0.66509643, + "epoch": 0.5606291222729579, + "grad_norm": 5.03125, + "learning_rate": 8.618538618223262e-06, + "loss": 1.56957045, + "memory(GiB)": 97.17, + "step": 22100, + "train_speed(iter/s)": 1.630342 + }, + { + "acc": 0.66304598, + "epoch": 0.5607559614408929, + "grad_norm": 5.84375, + "learning_rate": 8.617814876518114e-06, + "loss": 1.59483376, + "memory(GiB)": 97.17, + "step": 22105, + "train_speed(iter/s)": 1.630384 + }, + { + "acc": 0.65765972, + "epoch": 0.560882800608828, + "grad_norm": 8.25, + "learning_rate": 8.617090975685314e-06, + "loss": 1.65395584, + "memory(GiB)": 97.17, + "step": 22110, + "train_speed(iter/s)": 1.630425 + }, + { + "acc": 0.65512762, + "epoch": 0.5610096397767631, + "grad_norm": 5.59375, + "learning_rate": 8.616366915756704e-06, + "loss": 1.61534538, + "memory(GiB)": 97.17, + "step": 22115, + "train_speed(iter/s)": 1.630465 + }, + { + "acc": 0.65322418, + "epoch": 0.5611364789446981, + "grad_norm": 5.0, + "learning_rate": 8.615642696764131e-06, + "loss": 1.62809296, + "memory(GiB)": 97.17, + "step": 22120, + "train_speed(iter/s)": 1.630508 + }, + { + "acc": 0.65729551, + "epoch": 0.5612633181126332, + "grad_norm": 5.34375, + "learning_rate": 8.614918318739452e-06, + "loss": 1.62419853, + "memory(GiB)": 97.17, + "step": 22125, + "train_speed(iter/s)": 1.630547 + }, + { + "acc": 0.66303067, + "epoch": 0.5613901572805683, + "grad_norm": 5.875, + "learning_rate": 8.614193781714522e-06, + "loss": 1.61998672, + "memory(GiB)": 97.17, + "step": 22130, + "train_speed(iter/s)": 1.630588 + }, + { + "acc": 0.65157919, + "epoch": 0.5615169964485033, + "grad_norm": 5.5625, + "learning_rate": 8.613469085721215e-06, + "loss": 1.61684551, + "memory(GiB)": 97.17, + "step": 22135, + "train_speed(iter/s)": 1.630632 + }, + { + "acc": 0.65204177, + "epoch": 0.5616438356164384, + "grad_norm": 5.53125, + "learning_rate": 8.612744230791406e-06, + "loss": 1.67990074, + "memory(GiB)": 97.17, + "step": 22140, + "train_speed(iter/s)": 1.63067 + }, + { + "acc": 0.66377344, + "epoch": 0.5617706747843734, + "grad_norm": 8.125, + "learning_rate": 8.612019216956975e-06, + "loss": 1.55788307, + "memory(GiB)": 97.17, + "step": 22145, + "train_speed(iter/s)": 1.63071 + }, + { + "acc": 0.64535675, + "epoch": 0.5618975139523085, + "grad_norm": 5.3125, + "learning_rate": 8.611294044249811e-06, + "loss": 1.60758018, + "memory(GiB)": 97.17, + "step": 22150, + "train_speed(iter/s)": 1.630748 + }, + { + "acc": 0.65348024, + "epoch": 0.5620243531202436, + "grad_norm": 5.8125, + "learning_rate": 8.610568712701814e-06, + "loss": 1.60966167, + "memory(GiB)": 97.17, + "step": 22155, + "train_speed(iter/s)": 1.630786 + }, + { + "acc": 0.64837885, + "epoch": 0.5621511922881786, + "grad_norm": 5.09375, + "learning_rate": 8.609843222344883e-06, + "loss": 1.61449966, + "memory(GiB)": 97.17, + "step": 22160, + "train_speed(iter/s)": 1.63083 + }, + { + "acc": 0.66407118, + "epoch": 0.5622780314561137, + "grad_norm": 5.25, + "learning_rate": 8.609117573210931e-06, + "loss": 1.52226009, + "memory(GiB)": 97.17, + "step": 22165, + "train_speed(iter/s)": 1.63087 + }, + { + "acc": 0.6528976, + "epoch": 0.5624048706240488, + "grad_norm": 5.59375, + "learning_rate": 8.608391765331875e-06, + "loss": 1.58448124, + "memory(GiB)": 97.17, + "step": 22170, + "train_speed(iter/s)": 1.630915 + }, + { + "acc": 0.66446767, + "epoch": 0.5625317097919837, + "grad_norm": 5.15625, + "learning_rate": 8.607665798739638e-06, + "loss": 1.60372181, + "memory(GiB)": 97.17, + "step": 22175, + "train_speed(iter/s)": 1.630954 + }, + { + "acc": 0.6596478, + "epoch": 0.5626585489599188, + "grad_norm": 5.78125, + "learning_rate": 8.606939673466153e-06, + "loss": 1.59313192, + "memory(GiB)": 97.17, + "step": 22180, + "train_speed(iter/s)": 1.630994 + }, + { + "acc": 0.65518103, + "epoch": 0.5627853881278538, + "grad_norm": 6.875, + "learning_rate": 8.606213389543356e-06, + "loss": 1.58872013, + "memory(GiB)": 97.17, + "step": 22185, + "train_speed(iter/s)": 1.631035 + }, + { + "acc": 0.65328484, + "epoch": 0.5629122272957889, + "grad_norm": 6.40625, + "learning_rate": 8.605486947003194e-06, + "loss": 1.65926418, + "memory(GiB)": 97.17, + "step": 22190, + "train_speed(iter/s)": 1.631077 + }, + { + "acc": 0.64201903, + "epoch": 0.563039066463724, + "grad_norm": 4.875, + "learning_rate": 8.60476034587762e-06, + "loss": 1.68698311, + "memory(GiB)": 97.17, + "step": 22195, + "train_speed(iter/s)": 1.631113 + }, + { + "acc": 0.66494994, + "epoch": 0.563165905631659, + "grad_norm": 5.40625, + "learning_rate": 8.604033586198592e-06, + "loss": 1.57686749, + "memory(GiB)": 97.17, + "step": 22200, + "train_speed(iter/s)": 1.631154 + }, + { + "acc": 0.63865528, + "epoch": 0.5632927447995941, + "grad_norm": 5.96875, + "learning_rate": 8.603306667998074e-06, + "loss": 1.67061462, + "memory(GiB)": 97.17, + "step": 22205, + "train_speed(iter/s)": 1.631194 + }, + { + "acc": 0.6500452, + "epoch": 0.5634195839675292, + "grad_norm": 6.03125, + "learning_rate": 8.602579591308043e-06, + "loss": 1.64475994, + "memory(GiB)": 97.17, + "step": 22210, + "train_speed(iter/s)": 1.631233 + }, + { + "acc": 0.63216305, + "epoch": 0.5635464231354642, + "grad_norm": 5.34375, + "learning_rate": 8.601852356160476e-06, + "loss": 1.68564835, + "memory(GiB)": 97.17, + "step": 22215, + "train_speed(iter/s)": 1.631273 + }, + { + "acc": 0.66152973, + "epoch": 0.5636732623033993, + "grad_norm": 5.28125, + "learning_rate": 8.60112496258736e-06, + "loss": 1.59963875, + "memory(GiB)": 97.17, + "step": 22220, + "train_speed(iter/s)": 1.631314 + }, + { + "acc": 0.64364147, + "epoch": 0.5638001014713343, + "grad_norm": 5.125, + "learning_rate": 8.600397410620693e-06, + "loss": 1.65571842, + "memory(GiB)": 97.17, + "step": 22225, + "train_speed(iter/s)": 1.631352 + }, + { + "acc": 0.64903145, + "epoch": 0.5639269406392694, + "grad_norm": 6.03125, + "learning_rate": 8.599669700292472e-06, + "loss": 1.62756386, + "memory(GiB)": 97.17, + "step": 22230, + "train_speed(iter/s)": 1.631392 + }, + { + "acc": 0.66847501, + "epoch": 0.5640537798072045, + "grad_norm": 5.40625, + "learning_rate": 8.598941831634707e-06, + "loss": 1.64775505, + "memory(GiB)": 97.17, + "step": 22235, + "train_speed(iter/s)": 1.631432 + }, + { + "acc": 0.63823862, + "epoch": 0.5641806189751395, + "grad_norm": 5.25, + "learning_rate": 8.598213804679412e-06, + "loss": 1.65832157, + "memory(GiB)": 97.17, + "step": 22240, + "train_speed(iter/s)": 1.631473 + }, + { + "acc": 0.64000854, + "epoch": 0.5643074581430746, + "grad_norm": 5.28125, + "learning_rate": 8.597485619458609e-06, + "loss": 1.67096252, + "memory(GiB)": 97.17, + "step": 22245, + "train_speed(iter/s)": 1.631503 + }, + { + "acc": 0.65146627, + "epoch": 0.5644342973110097, + "grad_norm": 5.4375, + "learning_rate": 8.596757276004327e-06, + "loss": 1.6283865, + "memory(GiB)": 97.17, + "step": 22250, + "train_speed(iter/s)": 1.631543 + }, + { + "acc": 0.66809855, + "epoch": 0.5645611364789447, + "grad_norm": 6.40625, + "learning_rate": 8.5960287743486e-06, + "loss": 1.66621056, + "memory(GiB)": 97.17, + "step": 22255, + "train_speed(iter/s)": 1.631588 + }, + { + "acc": 0.6549943, + "epoch": 0.5646879756468798, + "grad_norm": 5.15625, + "learning_rate": 8.595300114523473e-06, + "loss": 1.57259054, + "memory(GiB)": 97.17, + "step": 22260, + "train_speed(iter/s)": 1.631631 + }, + { + "acc": 0.65657454, + "epoch": 0.5648148148148148, + "grad_norm": 5.3125, + "learning_rate": 8.594571296560997e-06, + "loss": 1.67733383, + "memory(GiB)": 97.17, + "step": 22265, + "train_speed(iter/s)": 1.631672 + }, + { + "acc": 0.64633541, + "epoch": 0.5649416539827499, + "grad_norm": 4.90625, + "learning_rate": 8.593842320493224e-06, + "loss": 1.68772202, + "memory(GiB)": 97.17, + "step": 22270, + "train_speed(iter/s)": 1.631712 + }, + { + "acc": 0.64857774, + "epoch": 0.565068493150685, + "grad_norm": 5.625, + "learning_rate": 8.593113186352222e-06, + "loss": 1.68307953, + "memory(GiB)": 97.17, + "step": 22275, + "train_speed(iter/s)": 1.631755 + }, + { + "acc": 0.63205538, + "epoch": 0.56519533231862, + "grad_norm": 5.625, + "learning_rate": 8.592383894170059e-06, + "loss": 1.68476067, + "memory(GiB)": 97.17, + "step": 22280, + "train_speed(iter/s)": 1.631799 + }, + { + "acc": 0.66157036, + "epoch": 0.5653221714865551, + "grad_norm": 4.625, + "learning_rate": 8.591654443978815e-06, + "loss": 1.59444284, + "memory(GiB)": 97.17, + "step": 22285, + "train_speed(iter/s)": 1.63184 + }, + { + "acc": 0.65190382, + "epoch": 0.5654490106544902, + "grad_norm": 6.125, + "learning_rate": 8.590924835810572e-06, + "loss": 1.61812744, + "memory(GiB)": 97.17, + "step": 22290, + "train_speed(iter/s)": 1.631885 + }, + { + "acc": 0.65776253, + "epoch": 0.5655758498224251, + "grad_norm": 6.25, + "learning_rate": 8.590195069697423e-06, + "loss": 1.56847477, + "memory(GiB)": 97.17, + "step": 22295, + "train_speed(iter/s)": 1.631928 + }, + { + "acc": 0.64089284, + "epoch": 0.5657026889903602, + "grad_norm": 5.875, + "learning_rate": 8.589465145671465e-06, + "loss": 1.63800068, + "memory(GiB)": 97.17, + "step": 22300, + "train_speed(iter/s)": 1.631969 + }, + { + "acc": 0.6443018, + "epoch": 0.5658295281582952, + "grad_norm": 5.21875, + "learning_rate": 8.588735063764803e-06, + "loss": 1.61475258, + "memory(GiB)": 97.17, + "step": 22305, + "train_speed(iter/s)": 1.63201 + }, + { + "acc": 0.65383434, + "epoch": 0.5659563673262303, + "grad_norm": 4.96875, + "learning_rate": 8.588004824009552e-06, + "loss": 1.68520279, + "memory(GiB)": 97.17, + "step": 22310, + "train_speed(iter/s)": 1.632051 + }, + { + "acc": 0.6525898, + "epoch": 0.5660832064941654, + "grad_norm": 6.0625, + "learning_rate": 8.58727442643783e-06, + "loss": 1.67151756, + "memory(GiB)": 97.17, + "step": 22315, + "train_speed(iter/s)": 1.632089 + }, + { + "acc": 0.64363141, + "epoch": 0.5662100456621004, + "grad_norm": 6.40625, + "learning_rate": 8.586543871081764e-06, + "loss": 1.63221016, + "memory(GiB)": 97.17, + "step": 22320, + "train_speed(iter/s)": 1.63213 + }, + { + "acc": 0.67659674, + "epoch": 0.5663368848300355, + "grad_norm": 5.1875, + "learning_rate": 8.585813157973482e-06, + "loss": 1.53684492, + "memory(GiB)": 97.17, + "step": 22325, + "train_speed(iter/s)": 1.632171 + }, + { + "acc": 0.64077759, + "epoch": 0.5664637239979706, + "grad_norm": 6.53125, + "learning_rate": 8.58508228714513e-06, + "loss": 1.59774857, + "memory(GiB)": 97.17, + "step": 22330, + "train_speed(iter/s)": 1.632214 + }, + { + "acc": 0.65861969, + "epoch": 0.5665905631659056, + "grad_norm": 6.40625, + "learning_rate": 8.584351258628852e-06, + "loss": 1.58166103, + "memory(GiB)": 97.17, + "step": 22335, + "train_speed(iter/s)": 1.632254 + }, + { + "acc": 0.6759655, + "epoch": 0.5667174023338407, + "grad_norm": 5.375, + "learning_rate": 8.583620072456803e-06, + "loss": 1.48119345, + "memory(GiB)": 97.17, + "step": 22340, + "train_speed(iter/s)": 1.632295 + }, + { + "acc": 0.66174827, + "epoch": 0.5668442415017757, + "grad_norm": 5.34375, + "learning_rate": 8.582888728661142e-06, + "loss": 1.59675293, + "memory(GiB)": 97.17, + "step": 22345, + "train_speed(iter/s)": 1.632337 + }, + { + "acc": 0.64102669, + "epoch": 0.5669710806697108, + "grad_norm": 5.875, + "learning_rate": 8.582157227274042e-06, + "loss": 1.68126278, + "memory(GiB)": 97.17, + "step": 22350, + "train_speed(iter/s)": 1.632378 + }, + { + "acc": 0.65328808, + "epoch": 0.5670979198376459, + "grad_norm": 6.75, + "learning_rate": 8.581425568327671e-06, + "loss": 1.62424183, + "memory(GiB)": 97.17, + "step": 22355, + "train_speed(iter/s)": 1.632416 + }, + { + "acc": 0.64292893, + "epoch": 0.5672247590055809, + "grad_norm": 5.34375, + "learning_rate": 8.580693751854215e-06, + "loss": 1.65654449, + "memory(GiB)": 97.17, + "step": 22360, + "train_speed(iter/s)": 1.632459 + }, + { + "acc": 0.64939899, + "epoch": 0.567351598173516, + "grad_norm": 5.0, + "learning_rate": 8.57996177788586e-06, + "loss": 1.5958786, + "memory(GiB)": 97.17, + "step": 22365, + "train_speed(iter/s)": 1.632501 + }, + { + "acc": 0.64354606, + "epoch": 0.5674784373414511, + "grad_norm": 7.09375, + "learning_rate": 8.579229646454803e-06, + "loss": 1.63148613, + "memory(GiB)": 97.17, + "step": 22370, + "train_speed(iter/s)": 1.632543 + }, + { + "acc": 0.67264624, + "epoch": 0.5676052765093861, + "grad_norm": 8.4375, + "learning_rate": 8.578497357593246e-06, + "loss": 1.5287075, + "memory(GiB)": 97.17, + "step": 22375, + "train_speed(iter/s)": 1.632585 + }, + { + "acc": 0.66717358, + "epoch": 0.5677321156773212, + "grad_norm": 5.34375, + "learning_rate": 8.5777649113334e-06, + "loss": 1.49966593, + "memory(GiB)": 97.17, + "step": 22380, + "train_speed(iter/s)": 1.632623 + }, + { + "acc": 0.65953608, + "epoch": 0.5678589548452562, + "grad_norm": 5.28125, + "learning_rate": 8.577032307707476e-06, + "loss": 1.57445078, + "memory(GiB)": 97.17, + "step": 22385, + "train_speed(iter/s)": 1.632665 + }, + { + "acc": 0.63723788, + "epoch": 0.5679857940131913, + "grad_norm": 5.75, + "learning_rate": 8.576299546747704e-06, + "loss": 1.70777283, + "memory(GiB)": 97.17, + "step": 22390, + "train_speed(iter/s)": 1.632708 + }, + { + "acc": 0.65935545, + "epoch": 0.5681126331811264, + "grad_norm": 5.625, + "learning_rate": 8.575566628486309e-06, + "loss": 1.63568611, + "memory(GiB)": 97.17, + "step": 22395, + "train_speed(iter/s)": 1.632748 + }, + { + "acc": 0.63334737, + "epoch": 0.5682394723490614, + "grad_norm": 5.5, + "learning_rate": 8.574833552955532e-06, + "loss": 1.65445709, + "memory(GiB)": 97.17, + "step": 22400, + "train_speed(iter/s)": 1.63279 + }, + { + "acc": 0.6508224, + "epoch": 0.5683663115169965, + "grad_norm": 4.71875, + "learning_rate": 8.574100320187612e-06, + "loss": 1.64084568, + "memory(GiB)": 97.17, + "step": 22405, + "train_speed(iter/s)": 1.63283 + }, + { + "acc": 0.65611391, + "epoch": 0.5684931506849316, + "grad_norm": 5.8125, + "learning_rate": 8.573366930214807e-06, + "loss": 1.66153507, + "memory(GiB)": 97.17, + "step": 22410, + "train_speed(iter/s)": 1.632872 + }, + { + "acc": 0.63149695, + "epoch": 0.5686199898528665, + "grad_norm": 6.71875, + "learning_rate": 8.572633383069366e-06, + "loss": 1.67252502, + "memory(GiB)": 97.17, + "step": 22415, + "train_speed(iter/s)": 1.632916 + }, + { + "acc": 0.64451799, + "epoch": 0.5687468290208016, + "grad_norm": 12.4375, + "learning_rate": 8.571899678783561e-06, + "loss": 1.63048859, + "memory(GiB)": 97.17, + "step": 22420, + "train_speed(iter/s)": 1.632956 + }, + { + "acc": 0.65323577, + "epoch": 0.5688736681887366, + "grad_norm": 6.3125, + "learning_rate": 8.57116581738966e-06, + "loss": 1.62588692, + "memory(GiB)": 97.17, + "step": 22425, + "train_speed(iter/s)": 1.632997 + }, + { + "acc": 0.66419601, + "epoch": 0.5690005073566717, + "grad_norm": 5.21875, + "learning_rate": 8.570431798919941e-06, + "loss": 1.57818089, + "memory(GiB)": 97.17, + "step": 22430, + "train_speed(iter/s)": 1.633037 + }, + { + "acc": 0.6607265, + "epoch": 0.5691273465246068, + "grad_norm": 6.5625, + "learning_rate": 8.569697623406692e-06, + "loss": 1.56275806, + "memory(GiB)": 97.17, + "step": 22435, + "train_speed(iter/s)": 1.633076 + }, + { + "acc": 0.64556789, + "epoch": 0.5692541856925418, + "grad_norm": 5.09375, + "learning_rate": 8.568963290882204e-06, + "loss": 1.62211933, + "memory(GiB)": 97.17, + "step": 22440, + "train_speed(iter/s)": 1.633116 + }, + { + "acc": 0.66603861, + "epoch": 0.5693810248604769, + "grad_norm": 5.375, + "learning_rate": 8.568228801378775e-06, + "loss": 1.53854589, + "memory(GiB)": 97.17, + "step": 22445, + "train_speed(iter/s)": 1.633155 + }, + { + "acc": 0.65906096, + "epoch": 0.569507864028412, + "grad_norm": 4.96875, + "learning_rate": 8.567494154928713e-06, + "loss": 1.58619699, + "memory(GiB)": 97.17, + "step": 22450, + "train_speed(iter/s)": 1.633195 + }, + { + "acc": 0.63740587, + "epoch": 0.569634703196347, + "grad_norm": 5.125, + "learning_rate": 8.566759351564332e-06, + "loss": 1.61090679, + "memory(GiB)": 97.17, + "step": 22455, + "train_speed(iter/s)": 1.633233 + }, + { + "acc": 0.64091873, + "epoch": 0.5697615423642821, + "grad_norm": 5.40625, + "learning_rate": 8.566024391317947e-06, + "loss": 1.65458221, + "memory(GiB)": 97.17, + "step": 22460, + "train_speed(iter/s)": 1.633272 + }, + { + "acc": 0.65131426, + "epoch": 0.5698883815322171, + "grad_norm": 6.90625, + "learning_rate": 8.565289274221891e-06, + "loss": 1.61053772, + "memory(GiB)": 97.17, + "step": 22465, + "train_speed(iter/s)": 1.633312 + }, + { + "acc": 0.66191001, + "epoch": 0.5700152207001522, + "grad_norm": 4.96875, + "learning_rate": 8.564554000308493e-06, + "loss": 1.5993679, + "memory(GiB)": 97.17, + "step": 22470, + "train_speed(iter/s)": 1.633354 + }, + { + "acc": 0.6503726, + "epoch": 0.5701420598680873, + "grad_norm": 6.15625, + "learning_rate": 8.563818569610096e-06, + "loss": 1.62359886, + "memory(GiB)": 97.17, + "step": 22475, + "train_speed(iter/s)": 1.633396 + }, + { + "acc": 0.64764481, + "epoch": 0.5702688990360223, + "grad_norm": 4.96875, + "learning_rate": 8.563082982159048e-06, + "loss": 1.59813824, + "memory(GiB)": 97.17, + "step": 22480, + "train_speed(iter/s)": 1.633435 + }, + { + "acc": 0.66095142, + "epoch": 0.5703957382039574, + "grad_norm": 6.09375, + "learning_rate": 8.562347237987701e-06, + "loss": 1.62702637, + "memory(GiB)": 97.17, + "step": 22485, + "train_speed(iter/s)": 1.633474 + }, + { + "acc": 0.63881736, + "epoch": 0.5705225773718925, + "grad_norm": 6.0625, + "learning_rate": 8.561611337128418e-06, + "loss": 1.67207108, + "memory(GiB)": 97.17, + "step": 22490, + "train_speed(iter/s)": 1.633511 + }, + { + "acc": 0.6611743, + "epoch": 0.5706494165398275, + "grad_norm": 5.71875, + "learning_rate": 8.560875279613568e-06, + "loss": 1.57725773, + "memory(GiB)": 97.17, + "step": 22495, + "train_speed(iter/s)": 1.633549 + }, + { + "acc": 0.64042716, + "epoch": 0.5707762557077626, + "grad_norm": 5.15625, + "learning_rate": 8.560139065475523e-06, + "loss": 1.66806068, + "memory(GiB)": 97.17, + "step": 22500, + "train_speed(iter/s)": 1.633587 + }, + { + "acc": 0.66801863, + "epoch": 0.5709030948756976, + "grad_norm": 6.40625, + "learning_rate": 8.559402694746671e-06, + "loss": 1.6008194, + "memory(GiB)": 97.17, + "step": 22505, + "train_speed(iter/s)": 1.633629 + }, + { + "acc": 0.66383257, + "epoch": 0.5710299340436327, + "grad_norm": 5.5625, + "learning_rate": 8.558666167459393e-06, + "loss": 1.52449493, + "memory(GiB)": 97.17, + "step": 22510, + "train_speed(iter/s)": 1.633674 + }, + { + "acc": 0.64486103, + "epoch": 0.5711567732115678, + "grad_norm": 5.03125, + "learning_rate": 8.55792948364609e-06, + "loss": 1.73176956, + "memory(GiB)": 97.17, + "step": 22515, + "train_speed(iter/s)": 1.633716 + }, + { + "acc": 0.66222563, + "epoch": 0.5712836123795028, + "grad_norm": 5.96875, + "learning_rate": 8.557192643339164e-06, + "loss": 1.65326118, + "memory(GiB)": 97.17, + "step": 22520, + "train_speed(iter/s)": 1.633754 + }, + { + "acc": 0.66218786, + "epoch": 0.5714104515474379, + "grad_norm": 4.59375, + "learning_rate": 8.556455646571022e-06, + "loss": 1.61443806, + "memory(GiB)": 97.17, + "step": 22525, + "train_speed(iter/s)": 1.633794 + }, + { + "acc": 0.66605749, + "epoch": 0.571537290715373, + "grad_norm": 5.5625, + "learning_rate": 8.555718493374084e-06, + "loss": 1.64509468, + "memory(GiB)": 97.17, + "step": 22530, + "train_speed(iter/s)": 1.633831 + }, + { + "acc": 0.66166363, + "epoch": 0.571664129883308, + "grad_norm": 5.5, + "learning_rate": 8.55498118378077e-06, + "loss": 1.63057213, + "memory(GiB)": 97.17, + "step": 22535, + "train_speed(iter/s)": 1.63387 + }, + { + "acc": 0.6782208, + "epoch": 0.571790969051243, + "grad_norm": 7.65625, + "learning_rate": 8.554243717823512e-06, + "loss": 1.50373659, + "memory(GiB)": 97.17, + "step": 22540, + "train_speed(iter/s)": 1.633912 + }, + { + "acc": 0.65800934, + "epoch": 0.571917808219178, + "grad_norm": 5.65625, + "learning_rate": 8.553506095534747e-06, + "loss": 1.64103966, + "memory(GiB)": 97.17, + "step": 22545, + "train_speed(iter/s)": 1.633952 + }, + { + "acc": 0.65278802, + "epoch": 0.5720446473871131, + "grad_norm": 5.625, + "learning_rate": 8.55276831694692e-06, + "loss": 1.55767307, + "memory(GiB)": 97.17, + "step": 22550, + "train_speed(iter/s)": 1.633994 + }, + { + "acc": 0.6616468, + "epoch": 0.5721714865550482, + "grad_norm": 5.59375, + "learning_rate": 8.552030382092477e-06, + "loss": 1.61699657, + "memory(GiB)": 97.17, + "step": 22555, + "train_speed(iter/s)": 1.634033 + }, + { + "acc": 0.66322451, + "epoch": 0.5722983257229832, + "grad_norm": 5.625, + "learning_rate": 8.551292291003884e-06, + "loss": 1.62340508, + "memory(GiB)": 97.17, + "step": 22560, + "train_speed(iter/s)": 1.634074 + }, + { + "acc": 0.65315456, + "epoch": 0.5724251648909183, + "grad_norm": 8.0625, + "learning_rate": 8.550554043713597e-06, + "loss": 1.63757877, + "memory(GiB)": 97.17, + "step": 22565, + "train_speed(iter/s)": 1.634115 + }, + { + "acc": 0.66580782, + "epoch": 0.5725520040588534, + "grad_norm": 5.53125, + "learning_rate": 8.549815640254092e-06, + "loss": 1.56148205, + "memory(GiB)": 97.17, + "step": 22570, + "train_speed(iter/s)": 1.634157 + }, + { + "acc": 0.65758343, + "epoch": 0.5726788432267884, + "grad_norm": 6.25, + "learning_rate": 8.549077080657846e-06, + "loss": 1.59915657, + "memory(GiB)": 97.17, + "step": 22575, + "train_speed(iter/s)": 1.634199 + }, + { + "acc": 0.63893986, + "epoch": 0.5728056823947235, + "grad_norm": 6.5, + "learning_rate": 8.548338364957345e-06, + "loss": 1.68260727, + "memory(GiB)": 97.17, + "step": 22580, + "train_speed(iter/s)": 1.634238 + }, + { + "acc": 0.64738617, + "epoch": 0.5729325215626585, + "grad_norm": 5.5, + "learning_rate": 8.54759949318508e-06, + "loss": 1.64903946, + "memory(GiB)": 97.17, + "step": 22585, + "train_speed(iter/s)": 1.634278 + }, + { + "acc": 0.65078812, + "epoch": 0.5730593607305936, + "grad_norm": 4.78125, + "learning_rate": 8.546860465373552e-06, + "loss": 1.5880373, + "memory(GiB)": 97.17, + "step": 22590, + "train_speed(iter/s)": 1.634317 + }, + { + "acc": 0.65682545, + "epoch": 0.5731861998985287, + "grad_norm": 5.71875, + "learning_rate": 8.546121281555265e-06, + "loss": 1.57653236, + "memory(GiB)": 97.17, + "step": 22595, + "train_speed(iter/s)": 1.634359 + }, + { + "acc": 0.66010704, + "epoch": 0.5733130390664637, + "grad_norm": 5.40625, + "learning_rate": 8.54538194176273e-06, + "loss": 1.56372452, + "memory(GiB)": 97.17, + "step": 22600, + "train_speed(iter/s)": 1.634396 + }, + { + "acc": 0.6544301, + "epoch": 0.5734398782343988, + "grad_norm": 6.5625, + "learning_rate": 8.544642446028469e-06, + "loss": 1.60253029, + "memory(GiB)": 97.17, + "step": 22605, + "train_speed(iter/s)": 1.634438 + }, + { + "acc": 0.6301075, + "epoch": 0.5735667174023339, + "grad_norm": 8.1875, + "learning_rate": 8.543902794385008e-06, + "loss": 1.73892765, + "memory(GiB)": 97.17, + "step": 22610, + "train_speed(iter/s)": 1.634477 + }, + { + "acc": 0.6411973, + "epoch": 0.5736935565702689, + "grad_norm": 5.71875, + "learning_rate": 8.543162986864879e-06, + "loss": 1.66419106, + "memory(GiB)": 97.17, + "step": 22615, + "train_speed(iter/s)": 1.634515 + }, + { + "acc": 0.67401361, + "epoch": 0.573820395738204, + "grad_norm": 5.5625, + "learning_rate": 8.542423023500623e-06, + "loss": 1.54650631, + "memory(GiB)": 97.17, + "step": 22620, + "train_speed(iter/s)": 1.634553 + }, + { + "acc": 0.64794912, + "epoch": 0.573947234906139, + "grad_norm": 6.21875, + "learning_rate": 8.541682904324786e-06, + "loss": 1.64625969, + "memory(GiB)": 97.17, + "step": 22625, + "train_speed(iter/s)": 1.63459 + }, + { + "acc": 0.64722643, + "epoch": 0.5740740740740741, + "grad_norm": 5.5625, + "learning_rate": 8.540942629369923e-06, + "loss": 1.65565987, + "memory(GiB)": 97.17, + "step": 22630, + "train_speed(iter/s)": 1.634628 + }, + { + "acc": 0.63343868, + "epoch": 0.5742009132420092, + "grad_norm": 5.21875, + "learning_rate": 8.540202198668595e-06, + "loss": 1.63734856, + "memory(GiB)": 97.17, + "step": 22635, + "train_speed(iter/s)": 1.634667 + }, + { + "acc": 0.65869322, + "epoch": 0.5743277524099442, + "grad_norm": 5.28125, + "learning_rate": 8.539461612253368e-06, + "loss": 1.5885767, + "memory(GiB)": 97.17, + "step": 22640, + "train_speed(iter/s)": 1.634706 + }, + { + "acc": 0.65664492, + "epoch": 0.5744545915778793, + "grad_norm": 5.46875, + "learning_rate": 8.538720870156816e-06, + "loss": 1.59492931, + "memory(GiB)": 97.17, + "step": 22645, + "train_speed(iter/s)": 1.634745 + }, + { + "acc": 0.65455604, + "epoch": 0.5745814307458144, + "grad_norm": 5.21875, + "learning_rate": 8.53797997241152e-06, + "loss": 1.57693586, + "memory(GiB)": 97.17, + "step": 22650, + "train_speed(iter/s)": 1.634784 + }, + { + "acc": 0.66212964, + "epoch": 0.5747082699137493, + "grad_norm": 7.03125, + "learning_rate": 8.537238919050071e-06, + "loss": 1.59262638, + "memory(GiB)": 97.17, + "step": 22655, + "train_speed(iter/s)": 1.634826 + }, + { + "acc": 0.6656559, + "epoch": 0.5748351090816844, + "grad_norm": 5.0625, + "learning_rate": 8.53649771010506e-06, + "loss": 1.56936579, + "memory(GiB)": 97.17, + "step": 22660, + "train_speed(iter/s)": 1.634864 + }, + { + "acc": 0.65250397, + "epoch": 0.5749619482496194, + "grad_norm": 5.375, + "learning_rate": 8.535756345609092e-06, + "loss": 1.6102972, + "memory(GiB)": 97.17, + "step": 22665, + "train_speed(iter/s)": 1.634906 + }, + { + "acc": 0.66066313, + "epoch": 0.5750887874175545, + "grad_norm": 5.96875, + "learning_rate": 8.535014825594772e-06, + "loss": 1.59062881, + "memory(GiB)": 97.17, + "step": 22670, + "train_speed(iter/s)": 1.634941 + }, + { + "acc": 0.65874252, + "epoch": 0.5752156265854896, + "grad_norm": 5.03125, + "learning_rate": 8.534273150094718e-06, + "loss": 1.59291534, + "memory(GiB)": 97.17, + "step": 22675, + "train_speed(iter/s)": 1.634978 + }, + { + "acc": 0.64439807, + "epoch": 0.5753424657534246, + "grad_norm": 7.28125, + "learning_rate": 8.533531319141552e-06, + "loss": 1.57150679, + "memory(GiB)": 97.17, + "step": 22680, + "train_speed(iter/s)": 1.635014 + }, + { + "acc": 0.63908682, + "epoch": 0.5754693049213597, + "grad_norm": 6.28125, + "learning_rate": 8.532789332767902e-06, + "loss": 1.69566898, + "memory(GiB)": 97.17, + "step": 22685, + "train_speed(iter/s)": 1.635049 + }, + { + "acc": 0.65716305, + "epoch": 0.5755961440892948, + "grad_norm": 5.625, + "learning_rate": 8.532047191006405e-06, + "loss": 1.58032408, + "memory(GiB)": 97.17, + "step": 22690, + "train_speed(iter/s)": 1.635088 + }, + { + "acc": 0.64233589, + "epoch": 0.5757229832572298, + "grad_norm": 4.375, + "learning_rate": 8.531304893889702e-06, + "loss": 1.63509083, + "memory(GiB)": 97.17, + "step": 22695, + "train_speed(iter/s)": 1.635126 + }, + { + "acc": 0.66026225, + "epoch": 0.5758498224251649, + "grad_norm": 5.5625, + "learning_rate": 8.530562441450445e-06, + "loss": 1.63984337, + "memory(GiB)": 97.17, + "step": 22700, + "train_speed(iter/s)": 1.635163 + }, + { + "acc": 0.65600543, + "epoch": 0.5759766615930999, + "grad_norm": 5.65625, + "learning_rate": 8.529819833721289e-06, + "loss": 1.63429909, + "memory(GiB)": 97.17, + "step": 22705, + "train_speed(iter/s)": 1.635203 + }, + { + "acc": 0.65073228, + "epoch": 0.576103500761035, + "grad_norm": 5.96875, + "learning_rate": 8.529077070734896e-06, + "loss": 1.65823822, + "memory(GiB)": 97.17, + "step": 22710, + "train_speed(iter/s)": 1.635242 + }, + { + "acc": 0.65583858, + "epoch": 0.5762303399289701, + "grad_norm": 6.28125, + "learning_rate": 8.528334152523938e-06, + "loss": 1.6523138, + "memory(GiB)": 97.17, + "step": 22715, + "train_speed(iter/s)": 1.635281 + }, + { + "acc": 0.67079296, + "epoch": 0.5763571790969051, + "grad_norm": 5.75, + "learning_rate": 8.52759107912109e-06, + "loss": 1.5369688, + "memory(GiB)": 97.17, + "step": 22720, + "train_speed(iter/s)": 1.635321 + }, + { + "acc": 0.66434717, + "epoch": 0.5764840182648402, + "grad_norm": 5.4375, + "learning_rate": 8.526847850559037e-06, + "loss": 1.59597836, + "memory(GiB)": 97.17, + "step": 22725, + "train_speed(iter/s)": 1.635359 + }, + { + "acc": 0.65048466, + "epoch": 0.5766108574327753, + "grad_norm": 4.78125, + "learning_rate": 8.526104466870472e-06, + "loss": 1.64152584, + "memory(GiB)": 97.17, + "step": 22730, + "train_speed(iter/s)": 1.635396 + }, + { + "acc": 0.66162724, + "epoch": 0.5767376966007103, + "grad_norm": 5.09375, + "learning_rate": 8.525360928088087e-06, + "loss": 1.57986431, + "memory(GiB)": 97.17, + "step": 22735, + "train_speed(iter/s)": 1.635434 + }, + { + "acc": 0.65751557, + "epoch": 0.5768645357686454, + "grad_norm": 5.5, + "learning_rate": 8.524617234244588e-06, + "loss": 1.59875603, + "memory(GiB)": 97.17, + "step": 22740, + "train_speed(iter/s)": 1.635471 + }, + { + "acc": 0.65876346, + "epoch": 0.5769913749365804, + "grad_norm": 5.71875, + "learning_rate": 8.523873385372687e-06, + "loss": 1.50989676, + "memory(GiB)": 97.17, + "step": 22745, + "train_speed(iter/s)": 1.635509 + }, + { + "acc": 0.64990516, + "epoch": 0.5771182141045155, + "grad_norm": 6.28125, + "learning_rate": 8.523129381505104e-06, + "loss": 1.69517059, + "memory(GiB)": 97.17, + "step": 22750, + "train_speed(iter/s)": 1.635546 + }, + { + "acc": 0.67025118, + "epoch": 0.5772450532724506, + "grad_norm": 5.71875, + "learning_rate": 8.522385222674559e-06, + "loss": 1.55408869, + "memory(GiB)": 97.17, + "step": 22755, + "train_speed(iter/s)": 1.635585 + }, + { + "acc": 0.660672, + "epoch": 0.5773718924403856, + "grad_norm": 5.21875, + "learning_rate": 8.521640908913787e-06, + "loss": 1.54079924, + "memory(GiB)": 97.17, + "step": 22760, + "train_speed(iter/s)": 1.635621 + }, + { + "acc": 0.66065736, + "epoch": 0.5774987316083207, + "grad_norm": 5.125, + "learning_rate": 8.520896440255524e-06, + "loss": 1.63035107, + "memory(GiB)": 97.17, + "step": 22765, + "train_speed(iter/s)": 1.635659 + }, + { + "acc": 0.66402907, + "epoch": 0.5776255707762558, + "grad_norm": 5.96875, + "learning_rate": 8.520151816732517e-06, + "loss": 1.56851654, + "memory(GiB)": 97.17, + "step": 22770, + "train_speed(iter/s)": 1.6357 + }, + { + "acc": 0.63997631, + "epoch": 0.5777524099441907, + "grad_norm": 6.0625, + "learning_rate": 8.519407038377515e-06, + "loss": 1.70682335, + "memory(GiB)": 97.17, + "step": 22775, + "train_speed(iter/s)": 1.635741 + }, + { + "acc": 0.64541235, + "epoch": 0.5778792491121258, + "grad_norm": 5.28125, + "learning_rate": 8.518662105223279e-06, + "loss": 1.66096973, + "memory(GiB)": 97.17, + "step": 22780, + "train_speed(iter/s)": 1.63578 + }, + { + "acc": 0.65397167, + "epoch": 0.5780060882800608, + "grad_norm": 5.6875, + "learning_rate": 8.517917017302574e-06, + "loss": 1.63193169, + "memory(GiB)": 97.17, + "step": 22785, + "train_speed(iter/s)": 1.635819 + }, + { + "acc": 0.65304327, + "epoch": 0.5781329274479959, + "grad_norm": 5.15625, + "learning_rate": 8.517171774648172e-06, + "loss": 1.65042076, + "memory(GiB)": 97.17, + "step": 22790, + "train_speed(iter/s)": 1.635854 + }, + { + "acc": 0.6451889, + "epoch": 0.578259766615931, + "grad_norm": 5.40625, + "learning_rate": 8.516426377292854e-06, + "loss": 1.6351757, + "memory(GiB)": 97.17, + "step": 22795, + "train_speed(iter/s)": 1.635892 + }, + { + "acc": 0.65759945, + "epoch": 0.578386605783866, + "grad_norm": 5.0625, + "learning_rate": 8.515680825269404e-06, + "loss": 1.57706127, + "memory(GiB)": 97.17, + "step": 22800, + "train_speed(iter/s)": 1.635931 + }, + { + "acc": 0.65407743, + "epoch": 0.5785134449518011, + "grad_norm": 5.75, + "learning_rate": 8.514935118610613e-06, + "loss": 1.67828522, + "memory(GiB)": 97.17, + "step": 22805, + "train_speed(iter/s)": 1.635969 + }, + { + "acc": 0.65324202, + "epoch": 0.5786402841197362, + "grad_norm": 5.125, + "learning_rate": 8.514189257349283e-06, + "loss": 1.62905502, + "memory(GiB)": 97.17, + "step": 22810, + "train_speed(iter/s)": 1.636003 + }, + { + "acc": 0.66132698, + "epoch": 0.5787671232876712, + "grad_norm": 5.375, + "learning_rate": 8.51344324151822e-06, + "loss": 1.57477112, + "memory(GiB)": 97.17, + "step": 22815, + "train_speed(iter/s)": 1.636039 + }, + { + "acc": 0.64995775, + "epoch": 0.5788939624556063, + "grad_norm": 6.09375, + "learning_rate": 8.512697071150235e-06, + "loss": 1.61292915, + "memory(GiB)": 97.17, + "step": 22820, + "train_speed(iter/s)": 1.636074 + }, + { + "acc": 0.65169153, + "epoch": 0.5790208016235413, + "grad_norm": 5.875, + "learning_rate": 8.511950746278152e-06, + "loss": 1.49183826, + "memory(GiB)": 97.17, + "step": 22825, + "train_speed(iter/s)": 1.636112 + }, + { + "acc": 0.65585341, + "epoch": 0.5791476407914764, + "grad_norm": 6.75, + "learning_rate": 8.511204266934797e-06, + "loss": 1.62278996, + "memory(GiB)": 97.17, + "step": 22830, + "train_speed(iter/s)": 1.636148 + }, + { + "acc": 0.6411128, + "epoch": 0.5792744799594115, + "grad_norm": 5.65625, + "learning_rate": 8.510457633152998e-06, + "loss": 1.70634956, + "memory(GiB)": 97.17, + "step": 22835, + "train_speed(iter/s)": 1.636185 + }, + { + "acc": 0.64888144, + "epoch": 0.5794013191273465, + "grad_norm": 4.9375, + "learning_rate": 8.509710844965602e-06, + "loss": 1.612883, + "memory(GiB)": 97.17, + "step": 22840, + "train_speed(iter/s)": 1.636222 + }, + { + "acc": 0.64987893, + "epoch": 0.5795281582952816, + "grad_norm": 7.71875, + "learning_rate": 8.508963902405451e-06, + "loss": 1.69131012, + "memory(GiB)": 97.17, + "step": 22845, + "train_speed(iter/s)": 1.636258 + }, + { + "acc": 0.6553628, + "epoch": 0.5796549974632167, + "grad_norm": 6.5, + "learning_rate": 8.508216805505403e-06, + "loss": 1.58818827, + "memory(GiB)": 97.17, + "step": 22850, + "train_speed(iter/s)": 1.636298 + }, + { + "acc": 0.6612669, + "epoch": 0.5797818366311517, + "grad_norm": 4.5625, + "learning_rate": 8.507469554298318e-06, + "loss": 1.63925438, + "memory(GiB)": 97.17, + "step": 22855, + "train_speed(iter/s)": 1.63633 + }, + { + "acc": 0.66252761, + "epoch": 0.5799086757990868, + "grad_norm": 5.625, + "learning_rate": 8.506722148817061e-06, + "loss": 1.58173122, + "memory(GiB)": 97.17, + "step": 22860, + "train_speed(iter/s)": 1.636367 + }, + { + "acc": 0.66712084, + "epoch": 0.5800355149670218, + "grad_norm": 5.40625, + "learning_rate": 8.505974589094505e-06, + "loss": 1.55390739, + "memory(GiB)": 97.17, + "step": 22865, + "train_speed(iter/s)": 1.636403 + }, + { + "acc": 0.66790862, + "epoch": 0.5801623541349569, + "grad_norm": 4.75, + "learning_rate": 8.505226875163537e-06, + "loss": 1.62106781, + "memory(GiB)": 97.17, + "step": 22870, + "train_speed(iter/s)": 1.63644 + }, + { + "acc": 0.65671058, + "epoch": 0.580289193302892, + "grad_norm": 6.375, + "learning_rate": 8.50447900705704e-06, + "loss": 1.59848099, + "memory(GiB)": 97.17, + "step": 22875, + "train_speed(iter/s)": 1.636481 + }, + { + "acc": 0.66197948, + "epoch": 0.580416032470827, + "grad_norm": 4.375, + "learning_rate": 8.503730984807911e-06, + "loss": 1.58144665, + "memory(GiB)": 97.17, + "step": 22880, + "train_speed(iter/s)": 1.636519 + }, + { + "acc": 0.67121081, + "epoch": 0.5805428716387621, + "grad_norm": 5.21875, + "learning_rate": 8.502982808449049e-06, + "loss": 1.55431309, + "memory(GiB)": 97.17, + "step": 22885, + "train_speed(iter/s)": 1.636556 + }, + { + "acc": 0.64571066, + "epoch": 0.5806697108066972, + "grad_norm": 5.0, + "learning_rate": 8.502234478013363e-06, + "loss": 1.67499714, + "memory(GiB)": 97.17, + "step": 22890, + "train_speed(iter/s)": 1.636596 + }, + { + "acc": 0.64868345, + "epoch": 0.5807965499746321, + "grad_norm": 6.9375, + "learning_rate": 8.501485993533769e-06, + "loss": 1.64944916, + "memory(GiB)": 97.17, + "step": 22895, + "train_speed(iter/s)": 1.636634 + }, + { + "acc": 0.65402269, + "epoch": 0.5809233891425672, + "grad_norm": 5.71875, + "learning_rate": 8.500737355043188e-06, + "loss": 1.63367748, + "memory(GiB)": 97.17, + "step": 22900, + "train_speed(iter/s)": 1.636675 + }, + { + "acc": 0.65983505, + "epoch": 0.5810502283105022, + "grad_norm": 5.21875, + "learning_rate": 8.499988562574549e-06, + "loss": 1.6229641, + "memory(GiB)": 97.17, + "step": 22905, + "train_speed(iter/s)": 1.636714 + }, + { + "acc": 0.66664605, + "epoch": 0.5811770674784373, + "grad_norm": 8.1875, + "learning_rate": 8.499239616160787e-06, + "loss": 1.64632587, + "memory(GiB)": 97.17, + "step": 22910, + "train_speed(iter/s)": 1.63675 + }, + { + "acc": 0.66269217, + "epoch": 0.5813039066463724, + "grad_norm": 6.09375, + "learning_rate": 8.498490515834841e-06, + "loss": 1.62370987, + "memory(GiB)": 97.17, + "step": 22915, + "train_speed(iter/s)": 1.636789 + }, + { + "acc": 0.64714632, + "epoch": 0.5814307458143074, + "grad_norm": 4.75, + "learning_rate": 8.497741261629664e-06, + "loss": 1.62714901, + "memory(GiB)": 97.17, + "step": 22920, + "train_speed(iter/s)": 1.636826 + }, + { + "acc": 0.64612112, + "epoch": 0.5815575849822425, + "grad_norm": 5.28125, + "learning_rate": 8.496991853578212e-06, + "loss": 1.62256603, + "memory(GiB)": 97.17, + "step": 22925, + "train_speed(iter/s)": 1.636865 + }, + { + "acc": 0.65187206, + "epoch": 0.5816844241501776, + "grad_norm": 5.375, + "learning_rate": 8.496242291713444e-06, + "loss": 1.62594223, + "memory(GiB)": 97.17, + "step": 22930, + "train_speed(iter/s)": 1.636904 + }, + { + "acc": 0.66642528, + "epoch": 0.5818112633181126, + "grad_norm": 6.6875, + "learning_rate": 8.495492576068329e-06, + "loss": 1.62148819, + "memory(GiB)": 97.17, + "step": 22935, + "train_speed(iter/s)": 1.636943 + }, + { + "acc": 0.65136757, + "epoch": 0.5819381024860477, + "grad_norm": 4.84375, + "learning_rate": 8.494742706675844e-06, + "loss": 1.65615921, + "memory(GiB)": 97.17, + "step": 22940, + "train_speed(iter/s)": 1.636981 + }, + { + "acc": 0.6433547, + "epoch": 0.5820649416539827, + "grad_norm": 5.78125, + "learning_rate": 8.493992683568975e-06, + "loss": 1.70139961, + "memory(GiB)": 97.17, + "step": 22945, + "train_speed(iter/s)": 1.63702 + }, + { + "acc": 0.65435853, + "epoch": 0.5821917808219178, + "grad_norm": 7.84375, + "learning_rate": 8.493242506780705e-06, + "loss": 1.59594765, + "memory(GiB)": 97.17, + "step": 22950, + "train_speed(iter/s)": 1.637059 + }, + { + "acc": 0.64634666, + "epoch": 0.5823186199898529, + "grad_norm": 7.03125, + "learning_rate": 8.492492176344035e-06, + "loss": 1.69558048, + "memory(GiB)": 97.17, + "step": 22955, + "train_speed(iter/s)": 1.637099 + }, + { + "acc": 0.66459703, + "epoch": 0.5824454591577879, + "grad_norm": 4.78125, + "learning_rate": 8.491741692291967e-06, + "loss": 1.59905815, + "memory(GiB)": 97.17, + "step": 22960, + "train_speed(iter/s)": 1.637137 + }, + { + "acc": 0.64039135, + "epoch": 0.582572298325723, + "grad_norm": 6.84375, + "learning_rate": 8.490991054657507e-06, + "loss": 1.68901482, + "memory(GiB)": 97.17, + "step": 22965, + "train_speed(iter/s)": 1.637175 + }, + { + "acc": 0.65616636, + "epoch": 0.5826991374936581, + "grad_norm": 6.21875, + "learning_rate": 8.490240263473677e-06, + "loss": 1.6567852, + "memory(GiB)": 97.17, + "step": 22970, + "train_speed(iter/s)": 1.637213 + }, + { + "acc": 0.64903359, + "epoch": 0.5828259766615931, + "grad_norm": 7.46875, + "learning_rate": 8.489489318773496e-06, + "loss": 1.64451199, + "memory(GiB)": 97.17, + "step": 22975, + "train_speed(iter/s)": 1.637253 + }, + { + "acc": 0.64575744, + "epoch": 0.5829528158295282, + "grad_norm": 5.9375, + "learning_rate": 8.488738220589996e-06, + "loss": 1.68805923, + "memory(GiB)": 97.17, + "step": 22980, + "train_speed(iter/s)": 1.637291 + }, + { + "acc": 0.66106524, + "epoch": 0.5830796549974632, + "grad_norm": 5.46875, + "learning_rate": 8.487986968956212e-06, + "loss": 1.56896286, + "memory(GiB)": 97.17, + "step": 22985, + "train_speed(iter/s)": 1.637329 + }, + { + "acc": 0.65841284, + "epoch": 0.5832064941653983, + "grad_norm": 5.28125, + "learning_rate": 8.487235563905191e-06, + "loss": 1.61763344, + "memory(GiB)": 97.17, + "step": 22990, + "train_speed(iter/s)": 1.637368 + }, + { + "acc": 0.65555654, + "epoch": 0.5833333333333334, + "grad_norm": 5.8125, + "learning_rate": 8.486484005469977e-06, + "loss": 1.60066929, + "memory(GiB)": 97.17, + "step": 22995, + "train_speed(iter/s)": 1.637405 + }, + { + "acc": 0.657195, + "epoch": 0.5834601725012684, + "grad_norm": 5.25, + "learning_rate": 8.485732293683633e-06, + "loss": 1.58927183, + "memory(GiB)": 97.17, + "step": 23000, + "train_speed(iter/s)": 1.637443 + }, + { + "epoch": 0.5834601725012684, + "eval_acc": 0.6443664810162533, + "eval_loss": 1.584184169769287, + "eval_runtime": 58.432, + "eval_samples_per_second": 109.016, + "eval_steps_per_second": 27.262, + "step": 23000 + }, + { + "acc": 0.65083895, + "epoch": 0.5835870116692035, + "grad_norm": 6.75, + "learning_rate": 8.48498042857922e-06, + "loss": 1.6316246, + "memory(GiB)": 97.17, + "step": 23005, + "train_speed(iter/s)": 1.630208 + }, + { + "acc": 0.64590511, + "epoch": 0.5837138508371386, + "grad_norm": 6.09375, + "learning_rate": 8.484228410189807e-06, + "loss": 1.63669891, + "memory(GiB)": 97.17, + "step": 23010, + "train_speed(iter/s)": 1.630245 + }, + { + "acc": 0.63782701, + "epoch": 0.5838406900050735, + "grad_norm": 5.96875, + "learning_rate": 8.483476238548473e-06, + "loss": 1.70055771, + "memory(GiB)": 97.17, + "step": 23015, + "train_speed(iter/s)": 1.630282 + }, + { + "acc": 0.65304079, + "epoch": 0.5839675291730086, + "grad_norm": 4.96875, + "learning_rate": 8.482723913688301e-06, + "loss": 1.66720009, + "memory(GiB)": 97.17, + "step": 23020, + "train_speed(iter/s)": 1.630319 + }, + { + "acc": 0.66204033, + "epoch": 0.5840943683409436, + "grad_norm": 6.34375, + "learning_rate": 8.481971435642382e-06, + "loss": 1.60161018, + "memory(GiB)": 97.17, + "step": 23025, + "train_speed(iter/s)": 1.630357 + }, + { + "acc": 0.66512694, + "epoch": 0.5842212075088787, + "grad_norm": 5.28125, + "learning_rate": 8.481218804443814e-06, + "loss": 1.57576027, + "memory(GiB)": 97.17, + "step": 23030, + "train_speed(iter/s)": 1.630395 + }, + { + "acc": 0.64378653, + "epoch": 0.5843480466768138, + "grad_norm": 6.5, + "learning_rate": 8.480466020125701e-06, + "loss": 1.6231741, + "memory(GiB)": 97.17, + "step": 23035, + "train_speed(iter/s)": 1.630433 + }, + { + "acc": 0.64961295, + "epoch": 0.5844748858447488, + "grad_norm": 4.75, + "learning_rate": 8.479713082721153e-06, + "loss": 1.65742702, + "memory(GiB)": 97.17, + "step": 23040, + "train_speed(iter/s)": 1.630469 + }, + { + "acc": 0.67278495, + "epoch": 0.5846017250126839, + "grad_norm": 6.15625, + "learning_rate": 8.478959992263288e-06, + "loss": 1.54251881, + "memory(GiB)": 97.17, + "step": 23045, + "train_speed(iter/s)": 1.630505 + }, + { + "acc": 0.64318628, + "epoch": 0.584728564180619, + "grad_norm": 5.53125, + "learning_rate": 8.478206748785229e-06, + "loss": 1.67044601, + "memory(GiB)": 97.17, + "step": 23050, + "train_speed(iter/s)": 1.630545 + }, + { + "acc": 0.65506592, + "epoch": 0.584855403348554, + "grad_norm": 5.625, + "learning_rate": 8.477453352320108e-06, + "loss": 1.58791437, + "memory(GiB)": 97.17, + "step": 23055, + "train_speed(iter/s)": 1.630582 + }, + { + "acc": 0.6587451, + "epoch": 0.5849822425164891, + "grad_norm": 5.03125, + "learning_rate": 8.476699802901066e-06, + "loss": 1.60707893, + "memory(GiB)": 97.17, + "step": 23060, + "train_speed(iter/s)": 1.630619 + }, + { + "acc": 0.65943556, + "epoch": 0.5851090816844241, + "grad_norm": 5.6875, + "learning_rate": 8.47594610056124e-06, + "loss": 1.56292238, + "memory(GiB)": 97.17, + "step": 23065, + "train_speed(iter/s)": 1.630658 + }, + { + "acc": 0.64979725, + "epoch": 0.5852359208523592, + "grad_norm": 5.875, + "learning_rate": 8.475192245333787e-06, + "loss": 1.61738892, + "memory(GiB)": 97.17, + "step": 23070, + "train_speed(iter/s)": 1.630696 + }, + { + "acc": 0.65580435, + "epoch": 0.5853627600202943, + "grad_norm": 6.875, + "learning_rate": 8.474438237251864e-06, + "loss": 1.56334438, + "memory(GiB)": 97.17, + "step": 23075, + "train_speed(iter/s)": 1.630732 + }, + { + "acc": 0.66290398, + "epoch": 0.5854895991882293, + "grad_norm": 5.15625, + "learning_rate": 8.473684076348635e-06, + "loss": 1.54914141, + "memory(GiB)": 97.17, + "step": 23080, + "train_speed(iter/s)": 1.630769 + }, + { + "acc": 0.65610676, + "epoch": 0.5856164383561644, + "grad_norm": 6.0, + "learning_rate": 8.472929762657272e-06, + "loss": 1.61477509, + "memory(GiB)": 97.17, + "step": 23085, + "train_speed(iter/s)": 1.630807 + }, + { + "acc": 0.65753269, + "epoch": 0.5857432775240995, + "grad_norm": 5.59375, + "learning_rate": 8.472175296210952e-06, + "loss": 1.58797789, + "memory(GiB)": 97.17, + "step": 23090, + "train_speed(iter/s)": 1.630846 + }, + { + "acc": 0.65164843, + "epoch": 0.5858701166920345, + "grad_norm": 6.0625, + "learning_rate": 8.471420677042858e-06, + "loss": 1.6396883, + "memory(GiB)": 97.17, + "step": 23095, + "train_speed(iter/s)": 1.630884 + }, + { + "acc": 0.66111522, + "epoch": 0.5859969558599696, + "grad_norm": 5.78125, + "learning_rate": 8.470665905186188e-06, + "loss": 1.57690125, + "memory(GiB)": 97.17, + "step": 23100, + "train_speed(iter/s)": 1.630923 + }, + { + "acc": 0.65413284, + "epoch": 0.5861237950279046, + "grad_norm": 5.09375, + "learning_rate": 8.469910980674134e-06, + "loss": 1.65788784, + "memory(GiB)": 97.17, + "step": 23105, + "train_speed(iter/s)": 1.630961 + }, + { + "acc": 0.67559872, + "epoch": 0.5862506341958397, + "grad_norm": 5.34375, + "learning_rate": 8.469155903539903e-06, + "loss": 1.48754435, + "memory(GiB)": 97.17, + "step": 23110, + "train_speed(iter/s)": 1.630998 + }, + { + "acc": 0.64601326, + "epoch": 0.5863774733637748, + "grad_norm": 7.65625, + "learning_rate": 8.468400673816705e-06, + "loss": 1.64300041, + "memory(GiB)": 97.17, + "step": 23115, + "train_speed(iter/s)": 1.631034 + }, + { + "acc": 0.65507307, + "epoch": 0.5865043125317098, + "grad_norm": 6.09375, + "learning_rate": 8.467645291537763e-06, + "loss": 1.67484093, + "memory(GiB)": 97.17, + "step": 23120, + "train_speed(iter/s)": 1.631073 + }, + { + "acc": 0.64951906, + "epoch": 0.5866311516996449, + "grad_norm": 5.53125, + "learning_rate": 8.466889756736298e-06, + "loss": 1.57847137, + "memory(GiB)": 97.17, + "step": 23125, + "train_speed(iter/s)": 1.631114 + }, + { + "acc": 0.64380484, + "epoch": 0.58675799086758, + "grad_norm": 4.96875, + "learning_rate": 8.466134069445544e-06, + "loss": 1.59181137, + "memory(GiB)": 97.17, + "step": 23130, + "train_speed(iter/s)": 1.631152 + }, + { + "acc": 0.65185699, + "epoch": 0.586884830035515, + "grad_norm": 5.21875, + "learning_rate": 8.465378229698737e-06, + "loss": 1.66399231, + "memory(GiB)": 97.17, + "step": 23135, + "train_speed(iter/s)": 1.63119 + }, + { + "acc": 0.64459105, + "epoch": 0.58701166920345, + "grad_norm": 5.59375, + "learning_rate": 8.464622237529123e-06, + "loss": 1.66215572, + "memory(GiB)": 97.17, + "step": 23140, + "train_speed(iter/s)": 1.631228 + }, + { + "acc": 0.66941447, + "epoch": 0.587138508371385, + "grad_norm": 5.9375, + "learning_rate": 8.463866092969958e-06, + "loss": 1.54132729, + "memory(GiB)": 97.17, + "step": 23145, + "train_speed(iter/s)": 1.631266 + }, + { + "acc": 0.66022167, + "epoch": 0.5872653475393201, + "grad_norm": 5.5, + "learning_rate": 8.463109796054495e-06, + "loss": 1.61912155, + "memory(GiB)": 97.17, + "step": 23150, + "train_speed(iter/s)": 1.631303 + }, + { + "acc": 0.65391273, + "epoch": 0.5873921867072552, + "grad_norm": 6.28125, + "learning_rate": 8.462353346815999e-06, + "loss": 1.6016737, + "memory(GiB)": 97.17, + "step": 23155, + "train_speed(iter/s)": 1.631332 + }, + { + "acc": 0.65436144, + "epoch": 0.5875190258751902, + "grad_norm": 5.96875, + "learning_rate": 8.461596745287747e-06, + "loss": 1.5995245, + "memory(GiB)": 97.17, + "step": 23160, + "train_speed(iter/s)": 1.631372 + }, + { + "acc": 0.64279499, + "epoch": 0.5876458650431253, + "grad_norm": 5.9375, + "learning_rate": 8.460839991503016e-06, + "loss": 1.673703, + "memory(GiB)": 97.17, + "step": 23165, + "train_speed(iter/s)": 1.631411 + }, + { + "acc": 0.66852322, + "epoch": 0.5877727042110604, + "grad_norm": 6.6875, + "learning_rate": 8.46008308549509e-06, + "loss": 1.54446135, + "memory(GiB)": 97.17, + "step": 23170, + "train_speed(iter/s)": 1.631449 + }, + { + "acc": 0.65555248, + "epoch": 0.5878995433789954, + "grad_norm": 5.625, + "learning_rate": 8.459326027297261e-06, + "loss": 1.59135494, + "memory(GiB)": 97.17, + "step": 23175, + "train_speed(iter/s)": 1.631488 + }, + { + "acc": 0.65714149, + "epoch": 0.5880263825469305, + "grad_norm": 6.03125, + "learning_rate": 8.45856881694283e-06, + "loss": 1.59860697, + "memory(GiB)": 97.17, + "step": 23180, + "train_speed(iter/s)": 1.631528 + }, + { + "acc": 0.65539322, + "epoch": 0.5881532217148655, + "grad_norm": 5.6875, + "learning_rate": 8.4578114544651e-06, + "loss": 1.58906803, + "memory(GiB)": 97.17, + "step": 23185, + "train_speed(iter/s)": 1.631563 + }, + { + "acc": 0.65719471, + "epoch": 0.5882800608828006, + "grad_norm": 6.53125, + "learning_rate": 8.457053939897385e-06, + "loss": 1.62150211, + "memory(GiB)": 97.17, + "step": 23190, + "train_speed(iter/s)": 1.631602 + }, + { + "acc": 0.66549149, + "epoch": 0.5884069000507357, + "grad_norm": 5.09375, + "learning_rate": 8.456296273273e-06, + "loss": 1.58194494, + "memory(GiB)": 97.17, + "step": 23195, + "train_speed(iter/s)": 1.631641 + }, + { + "acc": 0.65814161, + "epoch": 0.5885337392186707, + "grad_norm": 6.125, + "learning_rate": 8.455538454625276e-06, + "loss": 1.60630798, + "memory(GiB)": 97.17, + "step": 23200, + "train_speed(iter/s)": 1.631681 + }, + { + "acc": 0.65053844, + "epoch": 0.5886605783866058, + "grad_norm": 5.90625, + "learning_rate": 8.454780483987544e-06, + "loss": 1.63077755, + "memory(GiB)": 97.17, + "step": 23205, + "train_speed(iter/s)": 1.631719 + }, + { + "acc": 0.66433411, + "epoch": 0.5887874175545409, + "grad_norm": 8.5, + "learning_rate": 8.45402236139314e-06, + "loss": 1.56801281, + "memory(GiB)": 97.17, + "step": 23210, + "train_speed(iter/s)": 1.63176 + }, + { + "acc": 0.65297689, + "epoch": 0.5889142567224759, + "grad_norm": 5.1875, + "learning_rate": 8.453264086875411e-06, + "loss": 1.6130888, + "memory(GiB)": 97.17, + "step": 23215, + "train_speed(iter/s)": 1.631799 + }, + { + "acc": 0.65417309, + "epoch": 0.589041095890411, + "grad_norm": 6.1875, + "learning_rate": 8.452505660467713e-06, + "loss": 1.53566599, + "memory(GiB)": 97.17, + "step": 23220, + "train_speed(iter/s)": 1.631839 + }, + { + "acc": 0.66245646, + "epoch": 0.589167935058346, + "grad_norm": 6.90625, + "learning_rate": 8.451747082203398e-06, + "loss": 1.62393475, + "memory(GiB)": 97.17, + "step": 23225, + "train_speed(iter/s)": 1.631878 + }, + { + "acc": 0.66943555, + "epoch": 0.5892947742262811, + "grad_norm": 7.625, + "learning_rate": 8.450988352115838e-06, + "loss": 1.55491238, + "memory(GiB)": 97.17, + "step": 23230, + "train_speed(iter/s)": 1.631918 + }, + { + "acc": 0.65429621, + "epoch": 0.5894216133942162, + "grad_norm": 5.25, + "learning_rate": 8.450229470238401e-06, + "loss": 1.58933592, + "memory(GiB)": 97.17, + "step": 23235, + "train_speed(iter/s)": 1.631957 + }, + { + "acc": 0.64573588, + "epoch": 0.5895484525621512, + "grad_norm": 7.75, + "learning_rate": 8.44947043660447e-06, + "loss": 1.62904663, + "memory(GiB)": 97.17, + "step": 23240, + "train_speed(iter/s)": 1.631996 + }, + { + "acc": 0.67295542, + "epoch": 0.5896752917300863, + "grad_norm": 6.125, + "learning_rate": 8.448711251247425e-06, + "loss": 1.56359625, + "memory(GiB)": 97.17, + "step": 23245, + "train_speed(iter/s)": 1.632037 + }, + { + "acc": 0.655369, + "epoch": 0.5898021308980214, + "grad_norm": 6.40625, + "learning_rate": 8.447951914200665e-06, + "loss": 1.62867508, + "memory(GiB)": 97.17, + "step": 23250, + "train_speed(iter/s)": 1.632074 + }, + { + "acc": 0.66799998, + "epoch": 0.5899289700659563, + "grad_norm": 5.5625, + "learning_rate": 8.447192425497583e-06, + "loss": 1.56641235, + "memory(GiB)": 97.17, + "step": 23255, + "train_speed(iter/s)": 1.632112 + }, + { + "acc": 0.6660368, + "epoch": 0.5900558092338914, + "grad_norm": 6.21875, + "learning_rate": 8.44643278517159e-06, + "loss": 1.56193514, + "memory(GiB)": 97.17, + "step": 23260, + "train_speed(iter/s)": 1.63215 + }, + { + "acc": 0.63970351, + "epoch": 0.5901826484018264, + "grad_norm": 5.15625, + "learning_rate": 8.445672993256095e-06, + "loss": 1.71108532, + "memory(GiB)": 97.17, + "step": 23265, + "train_speed(iter/s)": 1.632187 + }, + { + "acc": 0.64268646, + "epoch": 0.5903094875697615, + "grad_norm": 5.9375, + "learning_rate": 8.444913049784517e-06, + "loss": 1.67581463, + "memory(GiB)": 97.17, + "step": 23270, + "train_speed(iter/s)": 1.632226 + }, + { + "acc": 0.67111602, + "epoch": 0.5904363267376966, + "grad_norm": 5.96875, + "learning_rate": 8.444152954790285e-06, + "loss": 1.57265453, + "memory(GiB)": 97.17, + "step": 23275, + "train_speed(iter/s)": 1.632264 + }, + { + "acc": 0.65613003, + "epoch": 0.5905631659056316, + "grad_norm": 5.53125, + "learning_rate": 8.443392708306827e-06, + "loss": 1.63115654, + "memory(GiB)": 97.17, + "step": 23280, + "train_speed(iter/s)": 1.632301 + }, + { + "acc": 0.66380405, + "epoch": 0.5906900050735667, + "grad_norm": 7.21875, + "learning_rate": 8.442632310367585e-06, + "loss": 1.57424507, + "memory(GiB)": 97.17, + "step": 23285, + "train_speed(iter/s)": 1.632342 + }, + { + "acc": 0.65739794, + "epoch": 0.5908168442415018, + "grad_norm": 5.75, + "learning_rate": 8.441871761006001e-06, + "loss": 1.5503912, + "memory(GiB)": 97.17, + "step": 23290, + "train_speed(iter/s)": 1.632381 + }, + { + "acc": 0.6517333, + "epoch": 0.5909436834094368, + "grad_norm": 6.5625, + "learning_rate": 8.441111060255533e-06, + "loss": 1.66126156, + "memory(GiB)": 97.17, + "step": 23295, + "train_speed(iter/s)": 1.63242 + }, + { + "acc": 0.62545977, + "epoch": 0.5910705225773719, + "grad_norm": 6.125, + "learning_rate": 8.440350208149637e-06, + "loss": 1.72088184, + "memory(GiB)": 97.17, + "step": 23300, + "train_speed(iter/s)": 1.632458 + }, + { + "acc": 0.65832124, + "epoch": 0.5911973617453069, + "grad_norm": 7.875, + "learning_rate": 8.43958920472178e-06, + "loss": 1.58804693, + "memory(GiB)": 97.17, + "step": 23305, + "train_speed(iter/s)": 1.632498 + }, + { + "acc": 0.65824728, + "epoch": 0.591324200913242, + "grad_norm": 4.9375, + "learning_rate": 8.43882805000543e-06, + "loss": 1.61714001, + "memory(GiB)": 97.17, + "step": 23310, + "train_speed(iter/s)": 1.632533 + }, + { + "acc": 0.64468489, + "epoch": 0.5914510400811771, + "grad_norm": 5.5625, + "learning_rate": 8.43806674403407e-06, + "loss": 1.67449455, + "memory(GiB)": 97.17, + "step": 23315, + "train_speed(iter/s)": 1.632573 + }, + { + "acc": 0.66782417, + "epoch": 0.5915778792491121, + "grad_norm": 6.1875, + "learning_rate": 8.437305286841187e-06, + "loss": 1.52976942, + "memory(GiB)": 97.17, + "step": 23320, + "train_speed(iter/s)": 1.632613 + }, + { + "acc": 0.67024045, + "epoch": 0.5917047184170472, + "grad_norm": 5.28125, + "learning_rate": 8.436543678460269e-06, + "loss": 1.58486519, + "memory(GiB)": 97.17, + "step": 23325, + "train_speed(iter/s)": 1.632651 + }, + { + "acc": 0.66463103, + "epoch": 0.5918315575849823, + "grad_norm": 7.0, + "learning_rate": 8.435781918924817e-06, + "loss": 1.59040165, + "memory(GiB)": 97.17, + "step": 23330, + "train_speed(iter/s)": 1.632688 + }, + { + "acc": 0.66600232, + "epoch": 0.5919583967529173, + "grad_norm": 5.71875, + "learning_rate": 8.435020008268335e-06, + "loss": 1.57260599, + "memory(GiB)": 97.17, + "step": 23335, + "train_speed(iter/s)": 1.632725 + }, + { + "acc": 0.65437956, + "epoch": 0.5920852359208524, + "grad_norm": 6.75, + "learning_rate": 8.43425794652434e-06, + "loss": 1.60292473, + "memory(GiB)": 97.17, + "step": 23340, + "train_speed(iter/s)": 1.632762 + }, + { + "acc": 0.64356432, + "epoch": 0.5922120750887874, + "grad_norm": 5.125, + "learning_rate": 8.433495733726345e-06, + "loss": 1.665802, + "memory(GiB)": 97.17, + "step": 23345, + "train_speed(iter/s)": 1.632801 + }, + { + "acc": 0.64941578, + "epoch": 0.5923389142567225, + "grad_norm": 4.8125, + "learning_rate": 8.43273336990788e-06, + "loss": 1.66789627, + "memory(GiB)": 97.17, + "step": 23350, + "train_speed(iter/s)": 1.632839 + }, + { + "acc": 0.66172667, + "epoch": 0.5924657534246576, + "grad_norm": 5.5625, + "learning_rate": 8.431970855102475e-06, + "loss": 1.59821272, + "memory(GiB)": 97.17, + "step": 23355, + "train_speed(iter/s)": 1.63288 + }, + { + "acc": 0.66407843, + "epoch": 0.5925925925925926, + "grad_norm": 4.46875, + "learning_rate": 8.43120818934367e-06, + "loss": 1.57593365, + "memory(GiB)": 97.17, + "step": 23360, + "train_speed(iter/s)": 1.632921 + }, + { + "acc": 0.65724597, + "epoch": 0.5927194317605277, + "grad_norm": 5.6875, + "learning_rate": 8.430445372665008e-06, + "loss": 1.64276714, + "memory(GiB)": 97.17, + "step": 23365, + "train_speed(iter/s)": 1.632961 + }, + { + "acc": 0.6491847, + "epoch": 0.5928462709284628, + "grad_norm": 5.40625, + "learning_rate": 8.429682405100042e-06, + "loss": 1.67211685, + "memory(GiB)": 97.17, + "step": 23370, + "train_speed(iter/s)": 1.632999 + }, + { + "acc": 0.66703463, + "epoch": 0.5929731100963977, + "grad_norm": 5.6875, + "learning_rate": 8.428919286682333e-06, + "loss": 1.55302324, + "memory(GiB)": 97.17, + "step": 23375, + "train_speed(iter/s)": 1.633038 + }, + { + "acc": 0.66369238, + "epoch": 0.5930999492643328, + "grad_norm": 5.46875, + "learning_rate": 8.428156017445443e-06, + "loss": 1.57563419, + "memory(GiB)": 97.17, + "step": 23380, + "train_speed(iter/s)": 1.633074 + }, + { + "acc": 0.65099092, + "epoch": 0.5932267884322678, + "grad_norm": 5.875, + "learning_rate": 8.427392597422947e-06, + "loss": 1.62383461, + "memory(GiB)": 97.17, + "step": 23385, + "train_speed(iter/s)": 1.633113 + }, + { + "acc": 0.65516863, + "epoch": 0.5933536276002029, + "grad_norm": 5.25, + "learning_rate": 8.426629026648423e-06, + "loss": 1.63778954, + "memory(GiB)": 97.17, + "step": 23390, + "train_speed(iter/s)": 1.633152 + }, + { + "acc": 0.64695163, + "epoch": 0.593480466768138, + "grad_norm": 6.5, + "learning_rate": 8.425865305155455e-06, + "loss": 1.65957737, + "memory(GiB)": 97.17, + "step": 23395, + "train_speed(iter/s)": 1.633191 + }, + { + "acc": 0.65745635, + "epoch": 0.593607305936073, + "grad_norm": 7.03125, + "learning_rate": 8.425101432977636e-06, + "loss": 1.64048576, + "memory(GiB)": 97.17, + "step": 23400, + "train_speed(iter/s)": 1.63323 + }, + { + "acc": 0.62996144, + "epoch": 0.5937341451040081, + "grad_norm": 5.625, + "learning_rate": 8.424337410148562e-06, + "loss": 1.66695919, + "memory(GiB)": 97.17, + "step": 23405, + "train_speed(iter/s)": 1.633269 + }, + { + "acc": 0.64446592, + "epoch": 0.5938609842719432, + "grad_norm": 5.84375, + "learning_rate": 8.423573236701842e-06, + "loss": 1.65895329, + "memory(GiB)": 97.17, + "step": 23410, + "train_speed(iter/s)": 1.633309 + }, + { + "acc": 0.64963346, + "epoch": 0.5939878234398782, + "grad_norm": 5.5625, + "learning_rate": 8.422808912671086e-06, + "loss": 1.66036854, + "memory(GiB)": 97.17, + "step": 23415, + "train_speed(iter/s)": 1.633351 + }, + { + "acc": 0.66884909, + "epoch": 0.5941146626078133, + "grad_norm": 4.84375, + "learning_rate": 8.422044438089911e-06, + "loss": 1.62320824, + "memory(GiB)": 97.17, + "step": 23420, + "train_speed(iter/s)": 1.633386 + }, + { + "acc": 0.65858469, + "epoch": 0.5942415017757483, + "grad_norm": 7.375, + "learning_rate": 8.421279812991944e-06, + "loss": 1.58954277, + "memory(GiB)": 97.17, + "step": 23425, + "train_speed(iter/s)": 1.633426 + }, + { + "acc": 0.65962644, + "epoch": 0.5943683409436834, + "grad_norm": 5.0625, + "learning_rate": 8.420515037410817e-06, + "loss": 1.59507351, + "memory(GiB)": 97.17, + "step": 23430, + "train_speed(iter/s)": 1.633466 + }, + { + "acc": 0.64386396, + "epoch": 0.5944951801116185, + "grad_norm": 5.1875, + "learning_rate": 8.419750111380166e-06, + "loss": 1.60042381, + "memory(GiB)": 97.17, + "step": 23435, + "train_speed(iter/s)": 1.633504 + }, + { + "acc": 0.65020213, + "epoch": 0.5946220192795535, + "grad_norm": 5.5, + "learning_rate": 8.418985034933637e-06, + "loss": 1.64524841, + "memory(GiB)": 97.17, + "step": 23440, + "train_speed(iter/s)": 1.633543 + }, + { + "acc": 0.66250248, + "epoch": 0.5947488584474886, + "grad_norm": 5.5, + "learning_rate": 8.418219808104882e-06, + "loss": 1.57158031, + "memory(GiB)": 97.17, + "step": 23445, + "train_speed(iter/s)": 1.633582 + }, + { + "acc": 0.66387591, + "epoch": 0.5948756976154237, + "grad_norm": 5.8125, + "learning_rate": 8.417454430927559e-06, + "loss": 1.60943375, + "memory(GiB)": 97.17, + "step": 23450, + "train_speed(iter/s)": 1.633623 + }, + { + "acc": 0.6533473, + "epoch": 0.5950025367833587, + "grad_norm": 5.59375, + "learning_rate": 8.41668890343533e-06, + "loss": 1.66412411, + "memory(GiB)": 97.17, + "step": 23455, + "train_speed(iter/s)": 1.633662 + }, + { + "acc": 0.64161644, + "epoch": 0.5951293759512938, + "grad_norm": 4.90625, + "learning_rate": 8.41592322566187e-06, + "loss": 1.67894611, + "memory(GiB)": 97.17, + "step": 23460, + "train_speed(iter/s)": 1.633699 + }, + { + "acc": 0.65543036, + "epoch": 0.5952562151192288, + "grad_norm": 5.96875, + "learning_rate": 8.415157397640857e-06, + "loss": 1.52906284, + "memory(GiB)": 97.17, + "step": 23465, + "train_speed(iter/s)": 1.633737 + }, + { + "acc": 0.64938507, + "epoch": 0.5953830542871639, + "grad_norm": 5.5, + "learning_rate": 8.414391419405972e-06, + "loss": 1.61706276, + "memory(GiB)": 97.17, + "step": 23470, + "train_speed(iter/s)": 1.633776 + }, + { + "acc": 0.65292044, + "epoch": 0.595509893455099, + "grad_norm": 6.5625, + "learning_rate": 8.413625290990909e-06, + "loss": 1.66275444, + "memory(GiB)": 97.17, + "step": 23475, + "train_speed(iter/s)": 1.633817 + }, + { + "acc": 0.67137198, + "epoch": 0.595636732623034, + "grad_norm": 5.59375, + "learning_rate": 8.412859012429365e-06, + "loss": 1.52991858, + "memory(GiB)": 97.17, + "step": 23480, + "train_speed(iter/s)": 1.633854 + }, + { + "acc": 0.66181145, + "epoch": 0.5957635717909691, + "grad_norm": 5.59375, + "learning_rate": 8.412092583755043e-06, + "loss": 1.57894573, + "memory(GiB)": 97.17, + "step": 23485, + "train_speed(iter/s)": 1.633893 + }, + { + "acc": 0.64901962, + "epoch": 0.5958904109589042, + "grad_norm": 5.4375, + "learning_rate": 8.411326005001658e-06, + "loss": 1.61250839, + "memory(GiB)": 97.17, + "step": 23490, + "train_speed(iter/s)": 1.633934 + }, + { + "acc": 0.64099665, + "epoch": 0.5960172501268391, + "grad_norm": 5.5, + "learning_rate": 8.410559276202922e-06, + "loss": 1.64035416, + "memory(GiB)": 97.17, + "step": 23495, + "train_speed(iter/s)": 1.633971 + }, + { + "acc": 0.64370995, + "epoch": 0.5961440892947742, + "grad_norm": 4.71875, + "learning_rate": 8.409792397392565e-06, + "loss": 1.61438026, + "memory(GiB)": 97.17, + "step": 23500, + "train_speed(iter/s)": 1.634008 + }, + { + "acc": 0.6530561, + "epoch": 0.5962709284627092, + "grad_norm": 6.4375, + "learning_rate": 8.40902536860431e-06, + "loss": 1.59955921, + "memory(GiB)": 97.17, + "step": 23505, + "train_speed(iter/s)": 1.634046 + }, + { + "acc": 0.65695734, + "epoch": 0.5963977676306443, + "grad_norm": 5.5625, + "learning_rate": 8.408258189871904e-06, + "loss": 1.58472672, + "memory(GiB)": 97.17, + "step": 23510, + "train_speed(iter/s)": 1.634084 + }, + { + "acc": 0.66635165, + "epoch": 0.5965246067985794, + "grad_norm": 5.15625, + "learning_rate": 8.407490861229084e-06, + "loss": 1.61341419, + "memory(GiB)": 97.17, + "step": 23515, + "train_speed(iter/s)": 1.634123 + }, + { + "acc": 0.64310079, + "epoch": 0.5966514459665144, + "grad_norm": 6.84375, + "learning_rate": 8.406723382709603e-06, + "loss": 1.66445427, + "memory(GiB)": 97.17, + "step": 23520, + "train_speed(iter/s)": 1.634162 + }, + { + "acc": 0.66659174, + "epoch": 0.5967782851344495, + "grad_norm": 5.3125, + "learning_rate": 8.405955754347216e-06, + "loss": 1.61462631, + "memory(GiB)": 97.17, + "step": 23525, + "train_speed(iter/s)": 1.634199 + }, + { + "acc": 0.6579628, + "epoch": 0.5969051243023846, + "grad_norm": 5.375, + "learning_rate": 8.40518797617569e-06, + "loss": 1.60541725, + "memory(GiB)": 97.17, + "step": 23530, + "train_speed(iter/s)": 1.634237 + }, + { + "acc": 0.67792006, + "epoch": 0.5970319634703196, + "grad_norm": 5.625, + "learning_rate": 8.404420048228794e-06, + "loss": 1.46336565, + "memory(GiB)": 97.17, + "step": 23535, + "train_speed(iter/s)": 1.634272 + }, + { + "acc": 0.6444315, + "epoch": 0.5971588026382547, + "grad_norm": 5.21875, + "learning_rate": 8.403651970540305e-06, + "loss": 1.65791664, + "memory(GiB)": 97.17, + "step": 23540, + "train_speed(iter/s)": 1.63431 + }, + { + "acc": 0.6475204, + "epoch": 0.5972856418061897, + "grad_norm": 5.71875, + "learning_rate": 8.402883743144005e-06, + "loss": 1.5997447, + "memory(GiB)": 97.17, + "step": 23545, + "train_speed(iter/s)": 1.63435 + }, + { + "acc": 0.6533288, + "epoch": 0.5974124809741248, + "grad_norm": 5.65625, + "learning_rate": 8.402115366073686e-06, + "loss": 1.65870399, + "memory(GiB)": 97.17, + "step": 23550, + "train_speed(iter/s)": 1.63439 + }, + { + "acc": 0.64768128, + "epoch": 0.5975393201420599, + "grad_norm": 5.5, + "learning_rate": 8.401346839363143e-06, + "loss": 1.60726089, + "memory(GiB)": 97.17, + "step": 23555, + "train_speed(iter/s)": 1.634427 + }, + { + "acc": 0.65291977, + "epoch": 0.5976661593099949, + "grad_norm": 6.15625, + "learning_rate": 8.40057816304618e-06, + "loss": 1.63162308, + "memory(GiB)": 97.17, + "step": 23560, + "train_speed(iter/s)": 1.634468 + }, + { + "acc": 0.64753909, + "epoch": 0.59779299847793, + "grad_norm": 6.28125, + "learning_rate": 8.399809337156608e-06, + "loss": 1.67615204, + "memory(GiB)": 97.17, + "step": 23565, + "train_speed(iter/s)": 1.634507 + }, + { + "acc": 0.65012894, + "epoch": 0.5979198376458651, + "grad_norm": 5.15625, + "learning_rate": 8.39904036172824e-06, + "loss": 1.61763802, + "memory(GiB)": 97.17, + "step": 23570, + "train_speed(iter/s)": 1.634544 + }, + { + "acc": 0.65089827, + "epoch": 0.5980466768138001, + "grad_norm": 5.3125, + "learning_rate": 8.398271236794904e-06, + "loss": 1.56932459, + "memory(GiB)": 97.17, + "step": 23575, + "train_speed(iter/s)": 1.634582 + }, + { + "acc": 0.64946094, + "epoch": 0.5981735159817352, + "grad_norm": 5.96875, + "learning_rate": 8.397501962390427e-06, + "loss": 1.61197071, + "memory(GiB)": 97.17, + "step": 23580, + "train_speed(iter/s)": 1.63462 + }, + { + "acc": 0.64305458, + "epoch": 0.5983003551496702, + "grad_norm": 5.375, + "learning_rate": 8.396732538548642e-06, + "loss": 1.64893017, + "memory(GiB)": 97.17, + "step": 23585, + "train_speed(iter/s)": 1.634656 + }, + { + "acc": 0.65778427, + "epoch": 0.5984271943176053, + "grad_norm": 5.1875, + "learning_rate": 8.395962965303397e-06, + "loss": 1.68108139, + "memory(GiB)": 97.17, + "step": 23590, + "train_speed(iter/s)": 1.634693 + }, + { + "acc": 0.67590961, + "epoch": 0.5985540334855404, + "grad_norm": 5.0, + "learning_rate": 8.395193242688537e-06, + "loss": 1.58161917, + "memory(GiB)": 97.17, + "step": 23595, + "train_speed(iter/s)": 1.634729 + }, + { + "acc": 0.65937762, + "epoch": 0.5986808726534754, + "grad_norm": 5.65625, + "learning_rate": 8.394423370737922e-06, + "loss": 1.53476868, + "memory(GiB)": 97.17, + "step": 23600, + "train_speed(iter/s)": 1.634769 + }, + { + "acc": 0.66086016, + "epoch": 0.5988077118214105, + "grad_norm": 5.3125, + "learning_rate": 8.393653349485412e-06, + "loss": 1.5682476, + "memory(GiB)": 97.17, + "step": 23605, + "train_speed(iter/s)": 1.634806 + }, + { + "acc": 0.65274711, + "epoch": 0.5989345509893456, + "grad_norm": 6.8125, + "learning_rate": 8.392883178964874e-06, + "loss": 1.64301605, + "memory(GiB)": 97.17, + "step": 23610, + "train_speed(iter/s)": 1.634843 + }, + { + "acc": 0.6575932, + "epoch": 0.5990613901572805, + "grad_norm": 6.75, + "learning_rate": 8.392112859210186e-06, + "loss": 1.59404621, + "memory(GiB)": 97.17, + "step": 23615, + "train_speed(iter/s)": 1.634882 + }, + { + "acc": 0.65138531, + "epoch": 0.5991882293252156, + "grad_norm": 5.53125, + "learning_rate": 8.391342390255232e-06, + "loss": 1.6398407, + "memory(GiB)": 97.17, + "step": 23620, + "train_speed(iter/s)": 1.63492 + }, + { + "acc": 0.65016856, + "epoch": 0.5993150684931506, + "grad_norm": 4.8125, + "learning_rate": 8.390571772133896e-06, + "loss": 1.62908974, + "memory(GiB)": 97.17, + "step": 23625, + "train_speed(iter/s)": 1.634959 + }, + { + "acc": 0.66987467, + "epoch": 0.5994419076610857, + "grad_norm": 5.84375, + "learning_rate": 8.389801004880077e-06, + "loss": 1.63829155, + "memory(GiB)": 97.17, + "step": 23630, + "train_speed(iter/s)": 1.634996 + }, + { + "acc": 0.64644132, + "epoch": 0.5995687468290208, + "grad_norm": 5.40625, + "learning_rate": 8.389030088527675e-06, + "loss": 1.63240185, + "memory(GiB)": 97.17, + "step": 23635, + "train_speed(iter/s)": 1.635035 + }, + { + "acc": 0.64512405, + "epoch": 0.5996955859969558, + "grad_norm": 6.03125, + "learning_rate": 8.388259023110598e-06, + "loss": 1.6434124, + "memory(GiB)": 97.17, + "step": 23640, + "train_speed(iter/s)": 1.635073 + }, + { + "acc": 0.65086021, + "epoch": 0.5998224251648909, + "grad_norm": 6.5, + "learning_rate": 8.387487808662765e-06, + "loss": 1.62117882, + "memory(GiB)": 97.17, + "step": 23645, + "train_speed(iter/s)": 1.635112 + }, + { + "acc": 0.64057388, + "epoch": 0.599949264332826, + "grad_norm": 5.375, + "learning_rate": 8.38671644521809e-06, + "loss": 1.66967812, + "memory(GiB)": 97.17, + "step": 23650, + "train_speed(iter/s)": 1.635149 + }, + { + "acc": 0.65469522, + "epoch": 0.600076103500761, + "grad_norm": 5.21875, + "learning_rate": 8.385944932810508e-06, + "loss": 1.63358097, + "memory(GiB)": 97.17, + "step": 23655, + "train_speed(iter/s)": 1.635187 + }, + { + "acc": 0.6675519, + "epoch": 0.6002029426686961, + "grad_norm": 5.5625, + "learning_rate": 8.385173271473948e-06, + "loss": 1.58354616, + "memory(GiB)": 97.17, + "step": 23660, + "train_speed(iter/s)": 1.635226 + }, + { + "acc": 0.64779515, + "epoch": 0.6003297818366311, + "grad_norm": 5.8125, + "learning_rate": 8.384401461242355e-06, + "loss": 1.67138214, + "memory(GiB)": 97.17, + "step": 23665, + "train_speed(iter/s)": 1.635266 + }, + { + "acc": 0.65686722, + "epoch": 0.6004566210045662, + "grad_norm": 6.40625, + "learning_rate": 8.383629502149678e-06, + "loss": 1.59346199, + "memory(GiB)": 97.17, + "step": 23670, + "train_speed(iter/s)": 1.635304 + }, + { + "acc": 0.65006852, + "epoch": 0.6005834601725013, + "grad_norm": 6.21875, + "learning_rate": 8.382857394229865e-06, + "loss": 1.61142731, + "memory(GiB)": 97.17, + "step": 23675, + "train_speed(iter/s)": 1.635344 + }, + { + "acc": 0.65049267, + "epoch": 0.6007102993404363, + "grad_norm": 5.25, + "learning_rate": 8.382085137516883e-06, + "loss": 1.64968128, + "memory(GiB)": 97.17, + "step": 23680, + "train_speed(iter/s)": 1.63538 + }, + { + "acc": 0.66970272, + "epoch": 0.6008371385083714, + "grad_norm": 5.25, + "learning_rate": 8.381312732044696e-06, + "loss": 1.60367165, + "memory(GiB)": 97.17, + "step": 23685, + "train_speed(iter/s)": 1.635418 + }, + { + "acc": 0.67198534, + "epoch": 0.6009639776763065, + "grad_norm": 6.21875, + "learning_rate": 8.380540177847278e-06, + "loss": 1.55842972, + "memory(GiB)": 97.17, + "step": 23690, + "train_speed(iter/s)": 1.635456 + }, + { + "acc": 0.63949499, + "epoch": 0.6010908168442415, + "grad_norm": 5.53125, + "learning_rate": 8.37976747495861e-06, + "loss": 1.68784389, + "memory(GiB)": 97.17, + "step": 23695, + "train_speed(iter/s)": 1.635492 + }, + { + "acc": 0.65733333, + "epoch": 0.6012176560121766, + "grad_norm": 8.375, + "learning_rate": 8.378994623412679e-06, + "loss": 1.54904194, + "memory(GiB)": 97.17, + "step": 23700, + "train_speed(iter/s)": 1.635533 + }, + { + "acc": 0.63535132, + "epoch": 0.6013444951801116, + "grad_norm": 5.8125, + "learning_rate": 8.378221623243478e-06, + "loss": 1.5994071, + "memory(GiB)": 97.17, + "step": 23705, + "train_speed(iter/s)": 1.635571 + }, + { + "acc": 0.65157075, + "epoch": 0.6014713343480467, + "grad_norm": 5.09375, + "learning_rate": 8.377448474485008e-06, + "loss": 1.57085342, + "memory(GiB)": 97.17, + "step": 23710, + "train_speed(iter/s)": 1.63561 + }, + { + "acc": 0.6623836, + "epoch": 0.6015981735159818, + "grad_norm": 4.90625, + "learning_rate": 8.376675177171273e-06, + "loss": 1.6030447, + "memory(GiB)": 97.17, + "step": 23715, + "train_speed(iter/s)": 1.635647 + }, + { + "acc": 0.66628366, + "epoch": 0.6017250126839168, + "grad_norm": 5.5625, + "learning_rate": 8.375901731336292e-06, + "loss": 1.566998, + "memory(GiB)": 97.17, + "step": 23720, + "train_speed(iter/s)": 1.635685 + }, + { + "acc": 0.65826263, + "epoch": 0.6018518518518519, + "grad_norm": 6.84375, + "learning_rate": 8.375128137014076e-06, + "loss": 1.5863224, + "memory(GiB)": 97.17, + "step": 23725, + "train_speed(iter/s)": 1.635722 + }, + { + "acc": 0.64467607, + "epoch": 0.601978691019787, + "grad_norm": 6.96875, + "learning_rate": 8.374354394238658e-06, + "loss": 1.7122797, + "memory(GiB)": 97.17, + "step": 23730, + "train_speed(iter/s)": 1.63576 + }, + { + "acc": 0.65857096, + "epoch": 0.602105530187722, + "grad_norm": 5.75, + "learning_rate": 8.373580503044068e-06, + "loss": 1.58004427, + "memory(GiB)": 97.17, + "step": 23735, + "train_speed(iter/s)": 1.635794 + }, + { + "acc": 0.66265678, + "epoch": 0.602232369355657, + "grad_norm": 7.15625, + "learning_rate": 8.372806463464347e-06, + "loss": 1.61926918, + "memory(GiB)": 97.17, + "step": 23740, + "train_speed(iter/s)": 1.635828 + }, + { + "acc": 0.67056332, + "epoch": 0.602359208523592, + "grad_norm": 5.8125, + "learning_rate": 8.372032275533538e-06, + "loss": 1.51040268, + "memory(GiB)": 97.17, + "step": 23745, + "train_speed(iter/s)": 1.635866 + }, + { + "acc": 0.6574666, + "epoch": 0.6024860476915271, + "grad_norm": 5.84375, + "learning_rate": 8.371257939285692e-06, + "loss": 1.63043423, + "memory(GiB)": 97.17, + "step": 23750, + "train_speed(iter/s)": 1.635903 + }, + { + "acc": 0.67122121, + "epoch": 0.6026128868594622, + "grad_norm": 6.15625, + "learning_rate": 8.370483454754873e-06, + "loss": 1.61019058, + "memory(GiB)": 97.17, + "step": 23755, + "train_speed(iter/s)": 1.635941 + }, + { + "acc": 0.66039429, + "epoch": 0.6027397260273972, + "grad_norm": 5.1875, + "learning_rate": 8.369708821975144e-06, + "loss": 1.58683586, + "memory(GiB)": 97.17, + "step": 23760, + "train_speed(iter/s)": 1.635978 + }, + { + "acc": 0.67080631, + "epoch": 0.6028665651953323, + "grad_norm": 5.875, + "learning_rate": 8.368934040980576e-06, + "loss": 1.52552214, + "memory(GiB)": 97.17, + "step": 23765, + "train_speed(iter/s)": 1.636014 + }, + { + "acc": 0.66372862, + "epoch": 0.6029934043632674, + "grad_norm": 6.25, + "learning_rate": 8.368159111805246e-06, + "loss": 1.68078499, + "memory(GiB)": 97.17, + "step": 23770, + "train_speed(iter/s)": 1.636049 + }, + { + "acc": 0.66894069, + "epoch": 0.6031202435312024, + "grad_norm": 5.09375, + "learning_rate": 8.367384034483242e-06, + "loss": 1.57246742, + "memory(GiB)": 97.17, + "step": 23775, + "train_speed(iter/s)": 1.636088 + }, + { + "acc": 0.65181723, + "epoch": 0.6032470826991375, + "grad_norm": 5.8125, + "learning_rate": 8.366608809048653e-06, + "loss": 1.65746231, + "memory(GiB)": 97.17, + "step": 23780, + "train_speed(iter/s)": 1.636123 + }, + { + "acc": 0.64440804, + "epoch": 0.6033739218670725, + "grad_norm": 6.75, + "learning_rate": 8.365833435535579e-06, + "loss": 1.61723785, + "memory(GiB)": 97.17, + "step": 23785, + "train_speed(iter/s)": 1.636164 + }, + { + "acc": 0.65853601, + "epoch": 0.6035007610350076, + "grad_norm": 6.09375, + "learning_rate": 8.365057913978123e-06, + "loss": 1.65635109, + "memory(GiB)": 97.17, + "step": 23790, + "train_speed(iter/s)": 1.636199 + }, + { + "acc": 0.6434782, + "epoch": 0.6036276002029427, + "grad_norm": 6.125, + "learning_rate": 8.364282244410394e-06, + "loss": 1.69672985, + "memory(GiB)": 97.17, + "step": 23795, + "train_speed(iter/s)": 1.636237 + }, + { + "acc": 0.65837231, + "epoch": 0.6037544393708777, + "grad_norm": 4.625, + "learning_rate": 8.363506426866513e-06, + "loss": 1.61673279, + "memory(GiB)": 97.17, + "step": 23800, + "train_speed(iter/s)": 1.636275 + }, + { + "acc": 0.65322962, + "epoch": 0.6038812785388128, + "grad_norm": 5.125, + "learning_rate": 8.362730461380602e-06, + "loss": 1.61582108, + "memory(GiB)": 97.17, + "step": 23805, + "train_speed(iter/s)": 1.636314 + }, + { + "acc": 0.65290103, + "epoch": 0.6040081177067479, + "grad_norm": 5.6875, + "learning_rate": 8.361954347986793e-06, + "loss": 1.60475922, + "memory(GiB)": 97.17, + "step": 23810, + "train_speed(iter/s)": 1.636354 + }, + { + "acc": 0.67212706, + "epoch": 0.6041349568746829, + "grad_norm": 4.53125, + "learning_rate": 8.36117808671922e-06, + "loss": 1.50592575, + "memory(GiB)": 97.17, + "step": 23815, + "train_speed(iter/s)": 1.636391 + }, + { + "acc": 0.65316696, + "epoch": 0.604261796042618, + "grad_norm": 5.28125, + "learning_rate": 8.36040167761203e-06, + "loss": 1.5869669, + "memory(GiB)": 97.17, + "step": 23820, + "train_speed(iter/s)": 1.636429 + }, + { + "acc": 0.64897199, + "epoch": 0.604388635210553, + "grad_norm": 6.1875, + "learning_rate": 8.359625120699368e-06, + "loss": 1.63080406, + "memory(GiB)": 97.17, + "step": 23825, + "train_speed(iter/s)": 1.636469 + }, + { + "acc": 0.6570981, + "epoch": 0.6045154743784881, + "grad_norm": 5.84375, + "learning_rate": 8.358848416015397e-06, + "loss": 1.56049919, + "memory(GiB)": 97.17, + "step": 23830, + "train_speed(iter/s)": 1.636506 + }, + { + "acc": 0.63806753, + "epoch": 0.6046423135464232, + "grad_norm": 5.0625, + "learning_rate": 8.358071563594274e-06, + "loss": 1.68667126, + "memory(GiB)": 97.17, + "step": 23835, + "train_speed(iter/s)": 1.636543 + }, + { + "acc": 0.64888859, + "epoch": 0.6047691527143582, + "grad_norm": 5.5, + "learning_rate": 8.357294563470173e-06, + "loss": 1.62895489, + "memory(GiB)": 97.17, + "step": 23840, + "train_speed(iter/s)": 1.636583 + }, + { + "acc": 0.66969862, + "epoch": 0.6048959918822933, + "grad_norm": 5.6875, + "learning_rate": 8.356517415677267e-06, + "loss": 1.54189816, + "memory(GiB)": 97.17, + "step": 23845, + "train_speed(iter/s)": 1.63662 + }, + { + "acc": 0.63496723, + "epoch": 0.6050228310502284, + "grad_norm": 7.65625, + "learning_rate": 8.355740120249739e-06, + "loss": 1.67266369, + "memory(GiB)": 97.17, + "step": 23850, + "train_speed(iter/s)": 1.636658 + }, + { + "acc": 0.6457356, + "epoch": 0.6051496702181633, + "grad_norm": 5.75, + "learning_rate": 8.354962677221779e-06, + "loss": 1.61528854, + "memory(GiB)": 97.17, + "step": 23855, + "train_speed(iter/s)": 1.636698 + }, + { + "acc": 0.64900188, + "epoch": 0.6052765093860984, + "grad_norm": 5.90625, + "learning_rate": 8.35418508662758e-06, + "loss": 1.59869604, + "memory(GiB)": 97.17, + "step": 23860, + "train_speed(iter/s)": 1.636737 + }, + { + "acc": 0.64458017, + "epoch": 0.6054033485540334, + "grad_norm": 7.09375, + "learning_rate": 8.353407348501346e-06, + "loss": 1.65958786, + "memory(GiB)": 97.17, + "step": 23865, + "train_speed(iter/s)": 1.636776 + }, + { + "acc": 0.65521193, + "epoch": 0.6055301877219685, + "grad_norm": 5.53125, + "learning_rate": 8.352629462877286e-06, + "loss": 1.61344986, + "memory(GiB)": 97.17, + "step": 23870, + "train_speed(iter/s)": 1.636817 + }, + { + "acc": 0.64518614, + "epoch": 0.6056570268899036, + "grad_norm": 6.1875, + "learning_rate": 8.351851429789613e-06, + "loss": 1.67500229, + "memory(GiB)": 97.17, + "step": 23875, + "train_speed(iter/s)": 1.636855 + }, + { + "acc": 0.63837938, + "epoch": 0.6057838660578386, + "grad_norm": 5.3125, + "learning_rate": 8.35107324927255e-06, + "loss": 1.62520275, + "memory(GiB)": 97.17, + "step": 23880, + "train_speed(iter/s)": 1.636895 + }, + { + "acc": 0.64832067, + "epoch": 0.6059107052257737, + "grad_norm": 6.40625, + "learning_rate": 8.350294921360323e-06, + "loss": 1.66826439, + "memory(GiB)": 97.17, + "step": 23885, + "train_speed(iter/s)": 1.636936 + }, + { + "acc": 0.64945412, + "epoch": 0.6060375443937088, + "grad_norm": 5.375, + "learning_rate": 8.349516446087168e-06, + "loss": 1.6516819, + "memory(GiB)": 97.17, + "step": 23890, + "train_speed(iter/s)": 1.636973 + }, + { + "acc": 0.63361063, + "epoch": 0.6061643835616438, + "grad_norm": 5.53125, + "learning_rate": 8.348737823487325e-06, + "loss": 1.66778126, + "memory(GiB)": 97.17, + "step": 23895, + "train_speed(iter/s)": 1.637013 + }, + { + "acc": 0.64627576, + "epoch": 0.6062912227295789, + "grad_norm": 5.84375, + "learning_rate": 8.347959053595042e-06, + "loss": 1.65390759, + "memory(GiB)": 97.17, + "step": 23900, + "train_speed(iter/s)": 1.637049 + }, + { + "acc": 0.64018984, + "epoch": 0.6064180618975139, + "grad_norm": 6.5, + "learning_rate": 8.347180136444572e-06, + "loss": 1.6810379, + "memory(GiB)": 97.17, + "step": 23905, + "train_speed(iter/s)": 1.637089 + }, + { + "acc": 0.66054482, + "epoch": 0.606544901065449, + "grad_norm": 5.59375, + "learning_rate": 8.346401072070174e-06, + "loss": 1.60496788, + "memory(GiB)": 97.17, + "step": 23910, + "train_speed(iter/s)": 1.637126 + }, + { + "acc": 0.63156519, + "epoch": 0.6066717402333841, + "grad_norm": 6.6875, + "learning_rate": 8.345621860506119e-06, + "loss": 1.68892822, + "memory(GiB)": 97.17, + "step": 23915, + "train_speed(iter/s)": 1.637164 + }, + { + "acc": 0.66439419, + "epoch": 0.6067985794013191, + "grad_norm": 6.125, + "learning_rate": 8.344842501786675e-06, + "loss": 1.52353735, + "memory(GiB)": 97.17, + "step": 23920, + "train_speed(iter/s)": 1.637203 + }, + { + "acc": 0.64234953, + "epoch": 0.6069254185692542, + "grad_norm": 6.15625, + "learning_rate": 8.344062995946125e-06, + "loss": 1.69694901, + "memory(GiB)": 97.17, + "step": 23925, + "train_speed(iter/s)": 1.637241 + }, + { + "acc": 0.6489996, + "epoch": 0.6070522577371893, + "grad_norm": 5.4375, + "learning_rate": 8.343283343018755e-06, + "loss": 1.62597313, + "memory(GiB)": 97.17, + "step": 23930, + "train_speed(iter/s)": 1.637278 + }, + { + "acc": 0.65009031, + "epoch": 0.6071790969051243, + "grad_norm": 5.71875, + "learning_rate": 8.342503543038855e-06, + "loss": 1.64044991, + "memory(GiB)": 97.17, + "step": 23935, + "train_speed(iter/s)": 1.637313 + }, + { + "acc": 0.64959722, + "epoch": 0.6073059360730594, + "grad_norm": 6.625, + "learning_rate": 8.341723596040728e-06, + "loss": 1.669664, + "memory(GiB)": 97.17, + "step": 23940, + "train_speed(iter/s)": 1.637351 + }, + { + "acc": 0.63595128, + "epoch": 0.6074327752409944, + "grad_norm": 5.15625, + "learning_rate": 8.340943502058675e-06, + "loss": 1.68558273, + "memory(GiB)": 97.17, + "step": 23945, + "train_speed(iter/s)": 1.637389 + }, + { + "acc": 0.6498745, + "epoch": 0.6075596144089295, + "grad_norm": 5.84375, + "learning_rate": 8.340163261127014e-06, + "loss": 1.66121311, + "memory(GiB)": 97.17, + "step": 23950, + "train_speed(iter/s)": 1.637429 + }, + { + "acc": 0.64318385, + "epoch": 0.6076864535768646, + "grad_norm": 5.46875, + "learning_rate": 8.339382873280058e-06, + "loss": 1.65904541, + "memory(GiB)": 97.17, + "step": 23955, + "train_speed(iter/s)": 1.637465 + }, + { + "acc": 0.6516696, + "epoch": 0.6078132927447996, + "grad_norm": 5.0625, + "learning_rate": 8.338602338552136e-06, + "loss": 1.66058064, + "memory(GiB)": 107.26, + "step": 23960, + "train_speed(iter/s)": 1.637499 + }, + { + "acc": 0.65156097, + "epoch": 0.6079401319127347, + "grad_norm": 5.4375, + "learning_rate": 8.337821656977574e-06, + "loss": 1.60544434, + "memory(GiB)": 107.26, + "step": 23965, + "train_speed(iter/s)": 1.637536 + }, + { + "acc": 0.66548567, + "epoch": 0.6080669710806698, + "grad_norm": 5.65625, + "learning_rate": 8.337040828590715e-06, + "loss": 1.588727, + "memory(GiB)": 107.26, + "step": 23970, + "train_speed(iter/s)": 1.637577 + }, + { + "acc": 0.6474226, + "epoch": 0.6081938102486047, + "grad_norm": 6.0, + "learning_rate": 8.336259853425901e-06, + "loss": 1.61458206, + "memory(GiB)": 107.26, + "step": 23975, + "train_speed(iter/s)": 1.637615 + }, + { + "acc": 0.64859209, + "epoch": 0.6083206494165398, + "grad_norm": 5.46875, + "learning_rate": 8.335478731517484e-06, + "loss": 1.60038109, + "memory(GiB)": 107.26, + "step": 23980, + "train_speed(iter/s)": 1.637653 + }, + { + "acc": 0.64348207, + "epoch": 0.6084474885844748, + "grad_norm": 6.15625, + "learning_rate": 8.33469746289982e-06, + "loss": 1.66140556, + "memory(GiB)": 107.26, + "step": 23985, + "train_speed(iter/s)": 1.637689 + }, + { + "acc": 0.63877859, + "epoch": 0.6085743277524099, + "grad_norm": 5.75, + "learning_rate": 8.333916047607274e-06, + "loss": 1.64812298, + "memory(GiB)": 107.26, + "step": 23990, + "train_speed(iter/s)": 1.637728 + }, + { + "acc": 0.66716499, + "epoch": 0.608701166920345, + "grad_norm": 6.5625, + "learning_rate": 8.333134485674214e-06, + "loss": 1.61449852, + "memory(GiB)": 107.26, + "step": 23995, + "train_speed(iter/s)": 1.637765 + }, + { + "acc": 0.64051781, + "epoch": 0.60882800608828, + "grad_norm": 5.53125, + "learning_rate": 8.33235277713502e-06, + "loss": 1.62289371, + "memory(GiB)": 107.26, + "step": 24000, + "train_speed(iter/s)": 1.637803 + }, + { + "epoch": 0.60882800608828, + "eval_acc": 0.6446935097534336, + "eval_loss": 1.5826815366744995, + "eval_runtime": 57.9028, + "eval_samples_per_second": 110.012, + "eval_steps_per_second": 27.512, + "step": 24000 + }, + { + "acc": 0.65905495, + "epoch": 0.6089548452562151, + "grad_norm": 7.4375, + "learning_rate": 8.33157092202407e-06, + "loss": 1.70113144, + "memory(GiB)": 107.26, + "step": 24005, + "train_speed(iter/s)": 1.630917 + }, + { + "acc": 0.65316553, + "epoch": 0.6090816844241502, + "grad_norm": 4.625, + "learning_rate": 8.33078892037576e-06, + "loss": 1.63966274, + "memory(GiB)": 107.26, + "step": 24010, + "train_speed(iter/s)": 1.630954 + }, + { + "acc": 0.65524273, + "epoch": 0.6092085235920852, + "grad_norm": 6.875, + "learning_rate": 8.33000677222448e-06, + "loss": 1.61504612, + "memory(GiB)": 107.26, + "step": 24015, + "train_speed(iter/s)": 1.630996 + }, + { + "acc": 0.65486999, + "epoch": 0.6093353627600203, + "grad_norm": 4.96875, + "learning_rate": 8.329224477604635e-06, + "loss": 1.62060165, + "memory(GiB)": 107.26, + "step": 24020, + "train_speed(iter/s)": 1.631037 + }, + { + "acc": 0.64503832, + "epoch": 0.6094622019279553, + "grad_norm": 5.96875, + "learning_rate": 8.328442036550633e-06, + "loss": 1.64288311, + "memory(GiB)": 107.26, + "step": 24025, + "train_speed(iter/s)": 1.631077 + }, + { + "acc": 0.65088234, + "epoch": 0.6095890410958904, + "grad_norm": 5.65625, + "learning_rate": 8.327659449096892e-06, + "loss": 1.57201939, + "memory(GiB)": 107.26, + "step": 24030, + "train_speed(iter/s)": 1.631116 + }, + { + "acc": 0.67160325, + "epoch": 0.6097158802638255, + "grad_norm": 9.0, + "learning_rate": 8.32687671527783e-06, + "loss": 1.49210892, + "memory(GiB)": 107.26, + "step": 24035, + "train_speed(iter/s)": 1.631159 + }, + { + "acc": 0.6553565, + "epoch": 0.6098427194317605, + "grad_norm": 5.65625, + "learning_rate": 8.326093835127878e-06, + "loss": 1.64779854, + "memory(GiB)": 107.26, + "step": 24040, + "train_speed(iter/s)": 1.6312 + }, + { + "acc": 0.65048723, + "epoch": 0.6099695585996956, + "grad_norm": 7.1875, + "learning_rate": 8.325310808681466e-06, + "loss": 1.63453903, + "memory(GiB)": 107.26, + "step": 24045, + "train_speed(iter/s)": 1.631237 + }, + { + "acc": 0.65614166, + "epoch": 0.6100963977676307, + "grad_norm": 5.8125, + "learning_rate": 8.32452763597304e-06, + "loss": 1.53854465, + "memory(GiB)": 107.26, + "step": 24050, + "train_speed(iter/s)": 1.631274 + }, + { + "acc": 0.64905958, + "epoch": 0.6102232369355657, + "grad_norm": 5.71875, + "learning_rate": 8.323744317037048e-06, + "loss": 1.56913691, + "memory(GiB)": 107.26, + "step": 24055, + "train_speed(iter/s)": 1.631313 + }, + { + "acc": 0.64826059, + "epoch": 0.6103500761035008, + "grad_norm": 4.8125, + "learning_rate": 8.322960851907937e-06, + "loss": 1.6305357, + "memory(GiB)": 107.26, + "step": 24060, + "train_speed(iter/s)": 1.631334 + }, + { + "acc": 0.63951769, + "epoch": 0.6104769152714358, + "grad_norm": 7.0, + "learning_rate": 8.322177240620175e-06, + "loss": 1.65924416, + "memory(GiB)": 107.26, + "step": 24065, + "train_speed(iter/s)": 1.631374 + }, + { + "acc": 0.66433921, + "epoch": 0.6106037544393709, + "grad_norm": 5.71875, + "learning_rate": 8.321393483208224e-06, + "loss": 1.56772337, + "memory(GiB)": 107.26, + "step": 24070, + "train_speed(iter/s)": 1.631407 + }, + { + "acc": 0.66684809, + "epoch": 0.610730593607306, + "grad_norm": 5.28125, + "learning_rate": 8.32060957970656e-06, + "loss": 1.56584702, + "memory(GiB)": 107.26, + "step": 24075, + "train_speed(iter/s)": 1.631446 + }, + { + "acc": 0.64457984, + "epoch": 0.610857432775241, + "grad_norm": 5.09375, + "learning_rate": 8.319825530149661e-06, + "loss": 1.67626209, + "memory(GiB)": 107.26, + "step": 24080, + "train_speed(iter/s)": 1.631483 + }, + { + "acc": 0.65042267, + "epoch": 0.6109842719431761, + "grad_norm": 5.5, + "learning_rate": 8.319041334572012e-06, + "loss": 1.63317566, + "memory(GiB)": 107.26, + "step": 24085, + "train_speed(iter/s)": 1.631521 + }, + { + "acc": 0.64856639, + "epoch": 0.6111111111111112, + "grad_norm": 5.75, + "learning_rate": 8.318256993008108e-06, + "loss": 1.65145302, + "memory(GiB)": 107.26, + "step": 24090, + "train_speed(iter/s)": 1.631561 + }, + { + "acc": 0.63675413, + "epoch": 0.6112379502790461, + "grad_norm": 5.25, + "learning_rate": 8.317472505492446e-06, + "loss": 1.70657921, + "memory(GiB)": 107.26, + "step": 24095, + "train_speed(iter/s)": 1.631602 + }, + { + "acc": 0.66812935, + "epoch": 0.6113647894469812, + "grad_norm": 5.125, + "learning_rate": 8.31668787205953e-06, + "loss": 1.5039525, + "memory(GiB)": 107.26, + "step": 24100, + "train_speed(iter/s)": 1.631641 + }, + { + "acc": 0.66860933, + "epoch": 0.6114916286149162, + "grad_norm": 4.84375, + "learning_rate": 8.315903092743876e-06, + "loss": 1.59197607, + "memory(GiB)": 107.26, + "step": 24105, + "train_speed(iter/s)": 1.631682 + }, + { + "acc": 0.67009511, + "epoch": 0.6116184677828513, + "grad_norm": 5.5, + "learning_rate": 8.315118167579999e-06, + "loss": 1.57855749, + "memory(GiB)": 107.26, + "step": 24110, + "train_speed(iter/s)": 1.631723 + }, + { + "acc": 0.6609683, + "epoch": 0.6117453069507864, + "grad_norm": 6.09375, + "learning_rate": 8.314333096602423e-06, + "loss": 1.56729641, + "memory(GiB)": 107.26, + "step": 24115, + "train_speed(iter/s)": 1.631763 + }, + { + "acc": 0.65835438, + "epoch": 0.6118721461187214, + "grad_norm": 6.0625, + "learning_rate": 8.313547879845682e-06, + "loss": 1.63325996, + "memory(GiB)": 107.26, + "step": 24120, + "train_speed(iter/s)": 1.631805 + }, + { + "acc": 0.64077373, + "epoch": 0.6119989852866565, + "grad_norm": 7.34375, + "learning_rate": 8.312762517344308e-06, + "loss": 1.66759548, + "memory(GiB)": 107.26, + "step": 24125, + "train_speed(iter/s)": 1.631844 + }, + { + "acc": 0.64215155, + "epoch": 0.6121258244545916, + "grad_norm": 7.21875, + "learning_rate": 8.311977009132851e-06, + "loss": 1.64293251, + "memory(GiB)": 107.26, + "step": 24130, + "train_speed(iter/s)": 1.631887 + }, + { + "acc": 0.65572925, + "epoch": 0.6122526636225266, + "grad_norm": 5.40625, + "learning_rate": 8.311191355245858e-06, + "loss": 1.64922485, + "memory(GiB)": 107.26, + "step": 24135, + "train_speed(iter/s)": 1.631928 + }, + { + "acc": 0.66276288, + "epoch": 0.6123795027904617, + "grad_norm": 4.75, + "learning_rate": 8.310405555717884e-06, + "loss": 1.62353764, + "memory(GiB)": 107.26, + "step": 24140, + "train_speed(iter/s)": 1.631968 + }, + { + "acc": 0.6509696, + "epoch": 0.6125063419583967, + "grad_norm": 5.1875, + "learning_rate": 8.309619610583495e-06, + "loss": 1.66010208, + "memory(GiB)": 107.26, + "step": 24145, + "train_speed(iter/s)": 1.632005 + }, + { + "acc": 0.68065972, + "epoch": 0.6126331811263318, + "grad_norm": 5.5, + "learning_rate": 8.30883351987726e-06, + "loss": 1.57454548, + "memory(GiB)": 107.26, + "step": 24150, + "train_speed(iter/s)": 1.632043 + }, + { + "acc": 0.65138063, + "epoch": 0.6127600202942669, + "grad_norm": 8.5625, + "learning_rate": 8.30804728363375e-06, + "loss": 1.63922977, + "memory(GiB)": 107.26, + "step": 24155, + "train_speed(iter/s)": 1.632082 + }, + { + "acc": 0.66233044, + "epoch": 0.6128868594622019, + "grad_norm": 7.03125, + "learning_rate": 8.307260901887556e-06, + "loss": 1.59522343, + "memory(GiB)": 107.26, + "step": 24160, + "train_speed(iter/s)": 1.632122 + }, + { + "acc": 0.65686007, + "epoch": 0.613013698630137, + "grad_norm": 5.125, + "learning_rate": 8.306474374673259e-06, + "loss": 1.54250326, + "memory(GiB)": 107.26, + "step": 24165, + "train_speed(iter/s)": 1.632161 + }, + { + "acc": 0.66284804, + "epoch": 0.6131405377980721, + "grad_norm": 6.75, + "learning_rate": 8.305687702025457e-06, + "loss": 1.59297419, + "memory(GiB)": 107.26, + "step": 24170, + "train_speed(iter/s)": 1.6322 + }, + { + "acc": 0.6563828, + "epoch": 0.6132673769660071, + "grad_norm": 5.84375, + "learning_rate": 8.304900883978753e-06, + "loss": 1.58604946, + "memory(GiB)": 107.26, + "step": 24175, + "train_speed(iter/s)": 1.63224 + }, + { + "acc": 0.6420083, + "epoch": 0.6133942161339422, + "grad_norm": 5.21875, + "learning_rate": 8.304113920567751e-06, + "loss": 1.67757549, + "memory(GiB)": 107.26, + "step": 24180, + "train_speed(iter/s)": 1.632279 + }, + { + "acc": 0.65166378, + "epoch": 0.6135210553018772, + "grad_norm": 5.78125, + "learning_rate": 8.303326811827066e-06, + "loss": 1.61237316, + "memory(GiB)": 107.26, + "step": 24185, + "train_speed(iter/s)": 1.632318 + }, + { + "acc": 0.65409741, + "epoch": 0.6136478944698123, + "grad_norm": 5.28125, + "learning_rate": 8.302539557791322e-06, + "loss": 1.63222599, + "memory(GiB)": 107.26, + "step": 24190, + "train_speed(iter/s)": 1.632356 + }, + { + "acc": 0.63583212, + "epoch": 0.6137747336377474, + "grad_norm": 4.96875, + "learning_rate": 8.301752158495141e-06, + "loss": 1.66746311, + "memory(GiB)": 107.26, + "step": 24195, + "train_speed(iter/s)": 1.632393 + }, + { + "acc": 0.65150318, + "epoch": 0.6139015728056824, + "grad_norm": 5.96875, + "learning_rate": 8.300964613973159e-06, + "loss": 1.65224075, + "memory(GiB)": 107.26, + "step": 24200, + "train_speed(iter/s)": 1.632433 + }, + { + "acc": 0.64884634, + "epoch": 0.6140284119736175, + "grad_norm": 5.375, + "learning_rate": 8.300176924260017e-06, + "loss": 1.60562305, + "memory(GiB)": 107.26, + "step": 24205, + "train_speed(iter/s)": 1.63247 + }, + { + "acc": 0.65094285, + "epoch": 0.6141552511415526, + "grad_norm": 6.25, + "learning_rate": 8.299389089390359e-06, + "loss": 1.59919872, + "memory(GiB)": 107.26, + "step": 24210, + "train_speed(iter/s)": 1.632511 + }, + { + "acc": 0.66140113, + "epoch": 0.6142820903094875, + "grad_norm": 4.875, + "learning_rate": 8.298601109398838e-06, + "loss": 1.57670517, + "memory(GiB)": 107.26, + "step": 24215, + "train_speed(iter/s)": 1.63255 + }, + { + "acc": 0.65009112, + "epoch": 0.6144089294774226, + "grad_norm": 4.46875, + "learning_rate": 8.297812984320113e-06, + "loss": 1.63598061, + "memory(GiB)": 107.26, + "step": 24220, + "train_speed(iter/s)": 1.632588 + }, + { + "acc": 0.64870777, + "epoch": 0.6145357686453576, + "grad_norm": 5.28125, + "learning_rate": 8.297024714188851e-06, + "loss": 1.61085854, + "memory(GiB)": 107.26, + "step": 24225, + "train_speed(iter/s)": 1.632629 + }, + { + "acc": 0.6540554, + "epoch": 0.6146626078132927, + "grad_norm": 6.375, + "learning_rate": 8.296236299039719e-06, + "loss": 1.65814362, + "memory(GiB)": 107.26, + "step": 24230, + "train_speed(iter/s)": 1.63267 + }, + { + "acc": 0.65093565, + "epoch": 0.6147894469812278, + "grad_norm": 5.125, + "learning_rate": 8.295447738907401e-06, + "loss": 1.54922571, + "memory(GiB)": 107.26, + "step": 24235, + "train_speed(iter/s)": 1.632709 + }, + { + "acc": 0.65365977, + "epoch": 0.6149162861491628, + "grad_norm": 5.125, + "learning_rate": 8.294659033826576e-06, + "loss": 1.61292439, + "memory(GiB)": 107.26, + "step": 24240, + "train_speed(iter/s)": 1.632749 + }, + { + "acc": 0.64184256, + "epoch": 0.6150431253170979, + "grad_norm": 6.78125, + "learning_rate": 8.293870183831937e-06, + "loss": 1.64473724, + "memory(GiB)": 107.26, + "step": 24245, + "train_speed(iter/s)": 1.632788 + }, + { + "acc": 0.65789547, + "epoch": 0.615169964485033, + "grad_norm": 6.375, + "learning_rate": 8.293081188958183e-06, + "loss": 1.57331505, + "memory(GiB)": 107.26, + "step": 24250, + "train_speed(iter/s)": 1.632827 + }, + { + "acc": 0.66379347, + "epoch": 0.615296803652968, + "grad_norm": 5.59375, + "learning_rate": 8.292292049240014e-06, + "loss": 1.57434521, + "memory(GiB)": 107.26, + "step": 24255, + "train_speed(iter/s)": 1.632866 + }, + { + "acc": 0.64103479, + "epoch": 0.6154236428209031, + "grad_norm": 5.53125, + "learning_rate": 8.291502764712143e-06, + "loss": 1.66669807, + "memory(GiB)": 107.26, + "step": 24260, + "train_speed(iter/s)": 1.632907 + }, + { + "acc": 0.66064053, + "epoch": 0.6155504819888381, + "grad_norm": 5.4375, + "learning_rate": 8.290713335409284e-06, + "loss": 1.58635864, + "memory(GiB)": 107.26, + "step": 24265, + "train_speed(iter/s)": 1.632946 + }, + { + "acc": 0.6556314, + "epoch": 0.6156773211567732, + "grad_norm": 5.5, + "learning_rate": 8.28992376136616e-06, + "loss": 1.58295794, + "memory(GiB)": 107.26, + "step": 24270, + "train_speed(iter/s)": 1.632984 + }, + { + "acc": 0.64795151, + "epoch": 0.6158041603247083, + "grad_norm": 5.5, + "learning_rate": 8.289134042617502e-06, + "loss": 1.67456646, + "memory(GiB)": 107.26, + "step": 24275, + "train_speed(iter/s)": 1.633024 + }, + { + "acc": 0.65625167, + "epoch": 0.6159309994926433, + "grad_norm": 5.34375, + "learning_rate": 8.288344179198043e-06, + "loss": 1.58869667, + "memory(GiB)": 107.26, + "step": 24280, + "train_speed(iter/s)": 1.633062 + }, + { + "acc": 0.6535018, + "epoch": 0.6160578386605784, + "grad_norm": 5.59375, + "learning_rate": 8.287554171142525e-06, + "loss": 1.65524254, + "memory(GiB)": 107.26, + "step": 24285, + "train_speed(iter/s)": 1.633101 + }, + { + "acc": 0.6364502, + "epoch": 0.6161846778285135, + "grad_norm": 5.15625, + "learning_rate": 8.2867640184857e-06, + "loss": 1.68619289, + "memory(GiB)": 107.26, + "step": 24290, + "train_speed(iter/s)": 1.63314 + }, + { + "acc": 0.63597555, + "epoch": 0.6163115169964485, + "grad_norm": 5.8125, + "learning_rate": 8.285973721262315e-06, + "loss": 1.65202217, + "memory(GiB)": 107.26, + "step": 24295, + "train_speed(iter/s)": 1.63318 + }, + { + "acc": 0.6548295, + "epoch": 0.6164383561643836, + "grad_norm": 5.15625, + "learning_rate": 8.285183279507135e-06, + "loss": 1.59142675, + "memory(GiB)": 107.26, + "step": 24300, + "train_speed(iter/s)": 1.633217 + }, + { + "acc": 0.64779425, + "epoch": 0.6165651953323186, + "grad_norm": 9.375, + "learning_rate": 8.28439269325493e-06, + "loss": 1.60534821, + "memory(GiB)": 107.26, + "step": 24305, + "train_speed(iter/s)": 1.633255 + }, + { + "acc": 0.64788494, + "epoch": 0.6166920345002537, + "grad_norm": 6.125, + "learning_rate": 8.28360196254047e-06, + "loss": 1.68173389, + "memory(GiB)": 107.26, + "step": 24310, + "train_speed(iter/s)": 1.633292 + }, + { + "acc": 0.65048113, + "epoch": 0.6168188736681888, + "grad_norm": 6.28125, + "learning_rate": 8.282811087398535e-06, + "loss": 1.62319794, + "memory(GiB)": 107.26, + "step": 24315, + "train_speed(iter/s)": 1.633334 + }, + { + "acc": 0.6476831, + "epoch": 0.6169457128361238, + "grad_norm": 6.625, + "learning_rate": 8.282020067863911e-06, + "loss": 1.62564278, + "memory(GiB)": 107.26, + "step": 24320, + "train_speed(iter/s)": 1.633373 + }, + { + "acc": 0.67075973, + "epoch": 0.6170725520040589, + "grad_norm": 5.84375, + "learning_rate": 8.281228903971391e-06, + "loss": 1.56481209, + "memory(GiB)": 107.26, + "step": 24325, + "train_speed(iter/s)": 1.633412 + }, + { + "acc": 0.66688604, + "epoch": 0.617199391171994, + "grad_norm": 7.21875, + "learning_rate": 8.280437595755774e-06, + "loss": 1.52157927, + "memory(GiB)": 107.26, + "step": 24330, + "train_speed(iter/s)": 1.633451 + }, + { + "acc": 0.65221734, + "epoch": 0.617326230339929, + "grad_norm": 5.28125, + "learning_rate": 8.279646143251867e-06, + "loss": 1.53978062, + "memory(GiB)": 107.26, + "step": 24335, + "train_speed(iter/s)": 1.63349 + }, + { + "acc": 0.65937424, + "epoch": 0.617453069507864, + "grad_norm": 5.6875, + "learning_rate": 8.278854546494479e-06, + "loss": 1.61242599, + "memory(GiB)": 107.26, + "step": 24340, + "train_speed(iter/s)": 1.633528 + }, + { + "acc": 0.65502834, + "epoch": 0.617579908675799, + "grad_norm": 5.8125, + "learning_rate": 8.27806280551843e-06, + "loss": 1.56261711, + "memory(GiB)": 107.26, + "step": 24345, + "train_speed(iter/s)": 1.633567 + }, + { + "acc": 0.64649534, + "epoch": 0.6177067478437341, + "grad_norm": 5.71875, + "learning_rate": 8.277270920358542e-06, + "loss": 1.60185318, + "memory(GiB)": 107.26, + "step": 24350, + "train_speed(iter/s)": 1.633606 + }, + { + "acc": 0.67093916, + "epoch": 0.6178335870116692, + "grad_norm": 5.90625, + "learning_rate": 8.276478891049649e-06, + "loss": 1.59160185, + "memory(GiB)": 107.26, + "step": 24355, + "train_speed(iter/s)": 1.633647 + }, + { + "acc": 0.66452913, + "epoch": 0.6179604261796042, + "grad_norm": 5.8125, + "learning_rate": 8.275686717626584e-06, + "loss": 1.56573763, + "memory(GiB)": 107.26, + "step": 24360, + "train_speed(iter/s)": 1.633684 + }, + { + "acc": 0.6515511, + "epoch": 0.6180872653475393, + "grad_norm": 4.5625, + "learning_rate": 8.274894400124191e-06, + "loss": 1.60488739, + "memory(GiB)": 107.26, + "step": 24365, + "train_speed(iter/s)": 1.633722 + }, + { + "acc": 0.65088544, + "epoch": 0.6182141045154744, + "grad_norm": 5.375, + "learning_rate": 8.274101938577324e-06, + "loss": 1.64141998, + "memory(GiB)": 107.26, + "step": 24370, + "train_speed(iter/s)": 1.633761 + }, + { + "acc": 0.65597076, + "epoch": 0.6183409436834094, + "grad_norm": 7.0, + "learning_rate": 8.273309333020834e-06, + "loss": 1.56200743, + "memory(GiB)": 107.26, + "step": 24375, + "train_speed(iter/s)": 1.633801 + }, + { + "acc": 0.65608997, + "epoch": 0.6184677828513445, + "grad_norm": 4.96875, + "learning_rate": 8.272516583489587e-06, + "loss": 1.64275074, + "memory(GiB)": 107.26, + "step": 24380, + "train_speed(iter/s)": 1.633841 + }, + { + "acc": 0.6670053, + "epoch": 0.6185946220192795, + "grad_norm": 6.0625, + "learning_rate": 8.271723690018448e-06, + "loss": 1.55916777, + "memory(GiB)": 107.26, + "step": 24385, + "train_speed(iter/s)": 1.633881 + }, + { + "acc": 0.63828945, + "epoch": 0.6187214611872146, + "grad_norm": 5.3125, + "learning_rate": 8.270930652642295e-06, + "loss": 1.61743279, + "memory(GiB)": 107.26, + "step": 24390, + "train_speed(iter/s)": 1.633919 + }, + { + "acc": 0.64620543, + "epoch": 0.6188483003551497, + "grad_norm": 5.53125, + "learning_rate": 8.270137471396007e-06, + "loss": 1.6379776, + "memory(GiB)": 107.26, + "step": 24395, + "train_speed(iter/s)": 1.633957 + }, + { + "acc": 0.6568716, + "epoch": 0.6189751395230847, + "grad_norm": 4.71875, + "learning_rate": 8.269344146314475e-06, + "loss": 1.60519867, + "memory(GiB)": 107.26, + "step": 24400, + "train_speed(iter/s)": 1.633994 + }, + { + "acc": 0.65031061, + "epoch": 0.6191019786910198, + "grad_norm": 5.90625, + "learning_rate": 8.26855067743259e-06, + "loss": 1.68257332, + "memory(GiB)": 107.26, + "step": 24405, + "train_speed(iter/s)": 1.634033 + }, + { + "acc": 0.65250025, + "epoch": 0.6192288178589549, + "grad_norm": 5.625, + "learning_rate": 8.267757064785254e-06, + "loss": 1.59980431, + "memory(GiB)": 107.26, + "step": 24410, + "train_speed(iter/s)": 1.63407 + }, + { + "acc": 0.65542336, + "epoch": 0.6193556570268899, + "grad_norm": 6.0, + "learning_rate": 8.26696330840737e-06, + "loss": 1.61762924, + "memory(GiB)": 107.26, + "step": 24415, + "train_speed(iter/s)": 1.634108 + }, + { + "acc": 0.6726089, + "epoch": 0.619482496194825, + "grad_norm": 5.5625, + "learning_rate": 8.266169408333856e-06, + "loss": 1.57673302, + "memory(GiB)": 107.26, + "step": 24420, + "train_speed(iter/s)": 1.634144 + }, + { + "acc": 0.66403751, + "epoch": 0.61960933536276, + "grad_norm": 5.625, + "learning_rate": 8.265375364599629e-06, + "loss": 1.60204811, + "memory(GiB)": 107.26, + "step": 24425, + "train_speed(iter/s)": 1.634186 + }, + { + "acc": 0.66187143, + "epoch": 0.6197361745306951, + "grad_norm": 6.625, + "learning_rate": 8.264581177239615e-06, + "loss": 1.60062084, + "memory(GiB)": 107.26, + "step": 24430, + "train_speed(iter/s)": 1.634223 + }, + { + "acc": 0.66657109, + "epoch": 0.6198630136986302, + "grad_norm": 5.78125, + "learning_rate": 8.263786846288745e-06, + "loss": 1.5310668, + "memory(GiB)": 107.26, + "step": 24435, + "train_speed(iter/s)": 1.634263 + }, + { + "acc": 0.66046629, + "epoch": 0.6199898528665652, + "grad_norm": 6.71875, + "learning_rate": 8.262992371781956e-06, + "loss": 1.6019474, + "memory(GiB)": 107.26, + "step": 24440, + "train_speed(iter/s)": 1.6343 + }, + { + "acc": 0.63187566, + "epoch": 0.6201166920345003, + "grad_norm": 5.6875, + "learning_rate": 8.262197753754195e-06, + "loss": 1.68738117, + "memory(GiB)": 107.26, + "step": 24445, + "train_speed(iter/s)": 1.634339 + }, + { + "acc": 0.65081878, + "epoch": 0.6202435312024354, + "grad_norm": 5.90625, + "learning_rate": 8.261402992240414e-06, + "loss": 1.67191315, + "memory(GiB)": 107.26, + "step": 24450, + "train_speed(iter/s)": 1.634376 + }, + { + "acc": 0.64992442, + "epoch": 0.6203703703703703, + "grad_norm": 6.375, + "learning_rate": 8.260608087275566e-06, + "loss": 1.65810432, + "memory(GiB)": 107.26, + "step": 24455, + "train_speed(iter/s)": 1.634415 + }, + { + "acc": 0.65987325, + "epoch": 0.6204972095383054, + "grad_norm": 4.875, + "learning_rate": 8.259813038894617e-06, + "loss": 1.60712872, + "memory(GiB)": 107.26, + "step": 24460, + "train_speed(iter/s)": 1.634452 + }, + { + "acc": 0.65861382, + "epoch": 0.6206240487062404, + "grad_norm": 5.59375, + "learning_rate": 8.259017847132538e-06, + "loss": 1.57515965, + "memory(GiB)": 107.26, + "step": 24465, + "train_speed(iter/s)": 1.634488 + }, + { + "acc": 0.6521121, + "epoch": 0.6207508878741755, + "grad_norm": 5.9375, + "learning_rate": 8.258222512024303e-06, + "loss": 1.64870644, + "memory(GiB)": 107.26, + "step": 24470, + "train_speed(iter/s)": 1.634527 + }, + { + "acc": 0.65789003, + "epoch": 0.6208777270421106, + "grad_norm": 5.25, + "learning_rate": 8.257427033604894e-06, + "loss": 1.54153767, + "memory(GiB)": 107.26, + "step": 24475, + "train_speed(iter/s)": 1.634567 + }, + { + "acc": 0.63943577, + "epoch": 0.6210045662100456, + "grad_norm": 6.0, + "learning_rate": 8.256631411909305e-06, + "loss": 1.68126507, + "memory(GiB)": 107.26, + "step": 24480, + "train_speed(iter/s)": 1.634602 + }, + { + "acc": 0.65381846, + "epoch": 0.6211314053779807, + "grad_norm": 5.625, + "learning_rate": 8.25583564697252e-06, + "loss": 1.54730148, + "memory(GiB)": 107.26, + "step": 24485, + "train_speed(iter/s)": 1.634641 + }, + { + "acc": 0.65746574, + "epoch": 0.6212582445459158, + "grad_norm": 5.53125, + "learning_rate": 8.255039738829552e-06, + "loss": 1.62740364, + "memory(GiB)": 107.26, + "step": 24490, + "train_speed(iter/s)": 1.63468 + }, + { + "acc": 0.65461531, + "epoch": 0.6213850837138508, + "grad_norm": 5.78125, + "learning_rate": 8.254243687515402e-06, + "loss": 1.61341038, + "memory(GiB)": 107.26, + "step": 24495, + "train_speed(iter/s)": 1.634716 + }, + { + "acc": 0.65132909, + "epoch": 0.6215119228817859, + "grad_norm": 5.65625, + "learning_rate": 8.253447493065085e-06, + "loss": 1.60907631, + "memory(GiB)": 107.26, + "step": 24500, + "train_speed(iter/s)": 1.634755 + }, + { + "acc": 0.64845624, + "epoch": 0.6216387620497209, + "grad_norm": 5.65625, + "learning_rate": 8.252651155513622e-06, + "loss": 1.64631405, + "memory(GiB)": 107.26, + "step": 24505, + "train_speed(iter/s)": 1.634794 + }, + { + "acc": 0.63941102, + "epoch": 0.621765601217656, + "grad_norm": 5.8125, + "learning_rate": 8.251854674896039e-06, + "loss": 1.70469093, + "memory(GiB)": 107.26, + "step": 24510, + "train_speed(iter/s)": 1.634829 + }, + { + "acc": 0.64546967, + "epoch": 0.6218924403855911, + "grad_norm": 5.59375, + "learning_rate": 8.251058051247368e-06, + "loss": 1.63095169, + "memory(GiB)": 107.26, + "step": 24515, + "train_speed(iter/s)": 1.634867 + }, + { + "acc": 0.6682179, + "epoch": 0.6220192795535261, + "grad_norm": 5.75, + "learning_rate": 8.250261284602651e-06, + "loss": 1.58506727, + "memory(GiB)": 107.26, + "step": 24520, + "train_speed(iter/s)": 1.634907 + }, + { + "acc": 0.66320562, + "epoch": 0.6221461187214612, + "grad_norm": 6.1875, + "learning_rate": 8.249464374996932e-06, + "loss": 1.60664139, + "memory(GiB)": 107.26, + "step": 24525, + "train_speed(iter/s)": 1.634945 + }, + { + "acc": 0.65261164, + "epoch": 0.6222729578893963, + "grad_norm": 5.53125, + "learning_rate": 8.24866732246526e-06, + "loss": 1.66367607, + "memory(GiB)": 107.26, + "step": 24530, + "train_speed(iter/s)": 1.634984 + }, + { + "acc": 0.64880095, + "epoch": 0.6223997970573313, + "grad_norm": 6.65625, + "learning_rate": 8.247870127042695e-06, + "loss": 1.61308403, + "memory(GiB)": 107.26, + "step": 24535, + "train_speed(iter/s)": 1.635021 + }, + { + "acc": 0.64007292, + "epoch": 0.6225266362252664, + "grad_norm": 6.5, + "learning_rate": 8.247072788764302e-06, + "loss": 1.63403111, + "memory(GiB)": 107.26, + "step": 24540, + "train_speed(iter/s)": 1.635059 + }, + { + "acc": 0.64348264, + "epoch": 0.6226534753932014, + "grad_norm": 5.71875, + "learning_rate": 8.246275307665147e-06, + "loss": 1.66267395, + "memory(GiB)": 107.26, + "step": 24545, + "train_speed(iter/s)": 1.635096 + }, + { + "acc": 0.66274967, + "epoch": 0.6227803145611365, + "grad_norm": 6.28125, + "learning_rate": 8.245477683780316e-06, + "loss": 1.54251146, + "memory(GiB)": 107.26, + "step": 24550, + "train_speed(iter/s)": 1.635132 + }, + { + "acc": 0.6593421, + "epoch": 0.6229071537290716, + "grad_norm": 6.0, + "learning_rate": 8.244679917144883e-06, + "loss": 1.61581612, + "memory(GiB)": 107.26, + "step": 24555, + "train_speed(iter/s)": 1.635168 + }, + { + "acc": 0.63921938, + "epoch": 0.6230339928970066, + "grad_norm": 6.4375, + "learning_rate": 8.243882007793941e-06, + "loss": 1.6940382, + "memory(GiB)": 107.26, + "step": 24560, + "train_speed(iter/s)": 1.635205 + }, + { + "acc": 0.65796113, + "epoch": 0.6231608320649417, + "grad_norm": 6.21875, + "learning_rate": 8.243083955762588e-06, + "loss": 1.60132904, + "memory(GiB)": 107.26, + "step": 24565, + "train_speed(iter/s)": 1.635239 + }, + { + "acc": 0.66274385, + "epoch": 0.6232876712328768, + "grad_norm": 6.0625, + "learning_rate": 8.24228576108592e-06, + "loss": 1.60198097, + "memory(GiB)": 107.26, + "step": 24570, + "train_speed(iter/s)": 1.635278 + }, + { + "acc": 0.65700483, + "epoch": 0.6234145104008117, + "grad_norm": 6.875, + "learning_rate": 8.24148742379905e-06, + "loss": 1.54407921, + "memory(GiB)": 107.26, + "step": 24575, + "train_speed(iter/s)": 1.635316 + }, + { + "acc": 0.65609951, + "epoch": 0.6235413495687468, + "grad_norm": 6.0, + "learning_rate": 8.240688943937092e-06, + "loss": 1.60369186, + "memory(GiB)": 107.26, + "step": 24580, + "train_speed(iter/s)": 1.635353 + }, + { + "acc": 0.65588841, + "epoch": 0.6236681887366818, + "grad_norm": 6.125, + "learning_rate": 8.239890321535163e-06, + "loss": 1.61838551, + "memory(GiB)": 107.26, + "step": 24585, + "train_speed(iter/s)": 1.63539 + }, + { + "acc": 0.65276937, + "epoch": 0.6237950279046169, + "grad_norm": 5.78125, + "learning_rate": 8.239091556628395e-06, + "loss": 1.66119251, + "memory(GiB)": 107.26, + "step": 24590, + "train_speed(iter/s)": 1.635429 + }, + { + "acc": 0.65116787, + "epoch": 0.623921867072552, + "grad_norm": 5.6875, + "learning_rate": 8.238292649251918e-06, + "loss": 1.64313469, + "memory(GiB)": 107.26, + "step": 24595, + "train_speed(iter/s)": 1.635466 + }, + { + "acc": 0.67093649, + "epoch": 0.624048706240487, + "grad_norm": 5.375, + "learning_rate": 8.237493599440871e-06, + "loss": 1.60220451, + "memory(GiB)": 107.26, + "step": 24600, + "train_speed(iter/s)": 1.635503 + }, + { + "acc": 0.63234806, + "epoch": 0.6241755454084221, + "grad_norm": 5.6875, + "learning_rate": 8.236694407230402e-06, + "loss": 1.74058857, + "memory(GiB)": 107.26, + "step": 24605, + "train_speed(iter/s)": 1.635539 + }, + { + "acc": 0.66838779, + "epoch": 0.6243023845763572, + "grad_norm": 5.84375, + "learning_rate": 8.235895072655664e-06, + "loss": 1.4857338, + "memory(GiB)": 107.26, + "step": 24610, + "train_speed(iter/s)": 1.635572 + }, + { + "acc": 0.6534894, + "epoch": 0.6244292237442922, + "grad_norm": 6.0, + "learning_rate": 8.235095595751809e-06, + "loss": 1.62717571, + "memory(GiB)": 107.26, + "step": 24615, + "train_speed(iter/s)": 1.635609 + }, + { + "acc": 0.66915188, + "epoch": 0.6245560629122273, + "grad_norm": 5.78125, + "learning_rate": 8.23429597655401e-06, + "loss": 1.58127661, + "memory(GiB)": 107.26, + "step": 24620, + "train_speed(iter/s)": 1.635646 + }, + { + "acc": 0.63129644, + "epoch": 0.6246829020801623, + "grad_norm": 5.875, + "learning_rate": 8.233496215097433e-06, + "loss": 1.67797775, + "memory(GiB)": 107.26, + "step": 24625, + "train_speed(iter/s)": 1.635685 + }, + { + "acc": 0.65792122, + "epoch": 0.6248097412480974, + "grad_norm": 5.46875, + "learning_rate": 8.232696311417256e-06, + "loss": 1.54957199, + "memory(GiB)": 107.26, + "step": 24630, + "train_speed(iter/s)": 1.635723 + }, + { + "acc": 0.64836216, + "epoch": 0.6249365804160325, + "grad_norm": 7.1875, + "learning_rate": 8.231896265548662e-06, + "loss": 1.66277733, + "memory(GiB)": 107.26, + "step": 24635, + "train_speed(iter/s)": 1.635762 + }, + { + "acc": 0.6531055, + "epoch": 0.6250634195839675, + "grad_norm": 5.3125, + "learning_rate": 8.231096077526841e-06, + "loss": 1.64894123, + "memory(GiB)": 107.26, + "step": 24640, + "train_speed(iter/s)": 1.635798 + }, + { + "acc": 0.6701932, + "epoch": 0.6251902587519026, + "grad_norm": 5.59375, + "learning_rate": 8.230295747386988e-06, + "loss": 1.56419325, + "memory(GiB)": 107.26, + "step": 24645, + "train_speed(iter/s)": 1.635836 + }, + { + "acc": 0.66264434, + "epoch": 0.6253170979198377, + "grad_norm": 6.0625, + "learning_rate": 8.229495275164307e-06, + "loss": 1.52827473, + "memory(GiB)": 107.26, + "step": 24650, + "train_speed(iter/s)": 1.635875 + }, + { + "acc": 0.66026759, + "epoch": 0.6254439370877727, + "grad_norm": 6.1875, + "learning_rate": 8.228694660894003e-06, + "loss": 1.65783253, + "memory(GiB)": 107.26, + "step": 24655, + "train_speed(iter/s)": 1.635912 + }, + { + "acc": 0.66281977, + "epoch": 0.6255707762557078, + "grad_norm": 7.9375, + "learning_rate": 8.227893904611295e-06, + "loss": 1.58205214, + "memory(GiB)": 107.26, + "step": 24660, + "train_speed(iter/s)": 1.63595 + }, + { + "acc": 0.66056638, + "epoch": 0.6256976154236428, + "grad_norm": 5.0, + "learning_rate": 8.2270930063514e-06, + "loss": 1.58064556, + "memory(GiB)": 107.26, + "step": 24665, + "train_speed(iter/s)": 1.635988 + }, + { + "acc": 0.65215459, + "epoch": 0.6258244545915779, + "grad_norm": 5.5, + "learning_rate": 8.226291966149549e-06, + "loss": 1.66009369, + "memory(GiB)": 107.26, + "step": 24670, + "train_speed(iter/s)": 1.636026 + }, + { + "acc": 0.65944614, + "epoch": 0.625951293759513, + "grad_norm": 5.40625, + "learning_rate": 8.225490784040971e-06, + "loss": 1.57912216, + "memory(GiB)": 107.26, + "step": 24675, + "train_speed(iter/s)": 1.636064 + }, + { + "acc": 0.65118546, + "epoch": 0.626078132927448, + "grad_norm": 6.34375, + "learning_rate": 8.224689460060908e-06, + "loss": 1.63942261, + "memory(GiB)": 107.26, + "step": 24680, + "train_speed(iter/s)": 1.6361 + }, + { + "acc": 0.66725736, + "epoch": 0.6262049720953831, + "grad_norm": 5.625, + "learning_rate": 8.223887994244604e-06, + "loss": 1.58191662, + "memory(GiB)": 107.26, + "step": 24685, + "train_speed(iter/s)": 1.636136 + }, + { + "acc": 0.66614647, + "epoch": 0.6263318112633182, + "grad_norm": 5.5, + "learning_rate": 8.223086386627314e-06, + "loss": 1.57282143, + "memory(GiB)": 107.26, + "step": 24690, + "train_speed(iter/s)": 1.636175 + }, + { + "acc": 0.6728343, + "epoch": 0.6264586504312532, + "grad_norm": 6.1875, + "learning_rate": 8.222284637244296e-06, + "loss": 1.53513422, + "memory(GiB)": 107.26, + "step": 24695, + "train_speed(iter/s)": 1.636214 + }, + { + "acc": 0.66431308, + "epoch": 0.6265854895991883, + "grad_norm": 4.71875, + "learning_rate": 8.221482746130811e-06, + "loss": 1.55937099, + "memory(GiB)": 107.26, + "step": 24700, + "train_speed(iter/s)": 1.636253 + }, + { + "acc": 0.65554366, + "epoch": 0.6267123287671232, + "grad_norm": 5.6875, + "learning_rate": 8.220680713322131e-06, + "loss": 1.61941605, + "memory(GiB)": 107.26, + "step": 24705, + "train_speed(iter/s)": 1.636289 + }, + { + "acc": 0.65283508, + "epoch": 0.6268391679350583, + "grad_norm": 5.5, + "learning_rate": 8.219878538853537e-06, + "loss": 1.60055885, + "memory(GiB)": 107.26, + "step": 24710, + "train_speed(iter/s)": 1.636323 + }, + { + "acc": 0.6452179, + "epoch": 0.6269660071029934, + "grad_norm": 4.96875, + "learning_rate": 8.219076222760307e-06, + "loss": 1.67959328, + "memory(GiB)": 107.26, + "step": 24715, + "train_speed(iter/s)": 1.636361 + }, + { + "acc": 0.63807435, + "epoch": 0.6270928462709284, + "grad_norm": 4.9375, + "learning_rate": 8.218273765077734e-06, + "loss": 1.62349262, + "memory(GiB)": 107.26, + "step": 24720, + "train_speed(iter/s)": 1.636399 + }, + { + "acc": 0.6541852, + "epoch": 0.6272196854388635, + "grad_norm": 5.375, + "learning_rate": 8.21747116584111e-06, + "loss": 1.6081852, + "memory(GiB)": 107.26, + "step": 24725, + "train_speed(iter/s)": 1.636436 + }, + { + "acc": 0.64146233, + "epoch": 0.6273465246067986, + "grad_norm": 5.0625, + "learning_rate": 8.21666842508574e-06, + "loss": 1.56283884, + "memory(GiB)": 107.26, + "step": 24730, + "train_speed(iter/s)": 1.636473 + }, + { + "acc": 0.64809847, + "epoch": 0.6274733637747336, + "grad_norm": 6.46875, + "learning_rate": 8.215865542846932e-06, + "loss": 1.68420887, + "memory(GiB)": 107.26, + "step": 24735, + "train_speed(iter/s)": 1.636511 + }, + { + "acc": 0.64047184, + "epoch": 0.6276002029426687, + "grad_norm": 7.46875, + "learning_rate": 8.215062519160002e-06, + "loss": 1.63542347, + "memory(GiB)": 107.26, + "step": 24740, + "train_speed(iter/s)": 1.636548 + }, + { + "acc": 0.65750084, + "epoch": 0.6277270421106037, + "grad_norm": 5.9375, + "learning_rate": 8.214259354060263e-06, + "loss": 1.62848167, + "memory(GiB)": 107.26, + "step": 24745, + "train_speed(iter/s)": 1.636587 + }, + { + "acc": 0.65126572, + "epoch": 0.6278538812785388, + "grad_norm": 6.34375, + "learning_rate": 8.21345604758305e-06, + "loss": 1.58686419, + "memory(GiB)": 107.26, + "step": 24750, + "train_speed(iter/s)": 1.636626 + }, + { + "acc": 0.65963607, + "epoch": 0.6279807204464739, + "grad_norm": 4.96875, + "learning_rate": 8.212652599763693e-06, + "loss": 1.5857255, + "memory(GiB)": 107.26, + "step": 24755, + "train_speed(iter/s)": 1.636663 + }, + { + "acc": 0.64530449, + "epoch": 0.6281075596144089, + "grad_norm": 7.46875, + "learning_rate": 8.211849010637532e-06, + "loss": 1.6340374, + "memory(GiB)": 107.26, + "step": 24760, + "train_speed(iter/s)": 1.636703 + }, + { + "acc": 0.6404911, + "epoch": 0.628234398782344, + "grad_norm": 5.8125, + "learning_rate": 8.211045280239908e-06, + "loss": 1.5899806, + "memory(GiB)": 107.26, + "step": 24765, + "train_speed(iter/s)": 1.636739 + }, + { + "acc": 0.65176077, + "epoch": 0.6283612379502791, + "grad_norm": 6.0625, + "learning_rate": 8.210241408606182e-06, + "loss": 1.5757411, + "memory(GiB)": 107.26, + "step": 24770, + "train_speed(iter/s)": 1.636775 + }, + { + "acc": 0.66138029, + "epoch": 0.6284880771182141, + "grad_norm": 6.59375, + "learning_rate": 8.2094373957717e-06, + "loss": 1.62178822, + "memory(GiB)": 107.26, + "step": 24775, + "train_speed(iter/s)": 1.636815 + }, + { + "acc": 0.64928465, + "epoch": 0.6286149162861492, + "grad_norm": 5.6875, + "learning_rate": 8.208633241771836e-06, + "loss": 1.62128124, + "memory(GiB)": 107.26, + "step": 24780, + "train_speed(iter/s)": 1.636852 + }, + { + "acc": 0.65925827, + "epoch": 0.6287417554540842, + "grad_norm": 5.53125, + "learning_rate": 8.207828946641956e-06, + "loss": 1.61395226, + "memory(GiB)": 107.26, + "step": 24785, + "train_speed(iter/s)": 1.636888 + }, + { + "acc": 0.66922345, + "epoch": 0.6288685946220193, + "grad_norm": 4.625, + "learning_rate": 8.207024510417436e-06, + "loss": 1.59678698, + "memory(GiB)": 107.26, + "step": 24790, + "train_speed(iter/s)": 1.636927 + }, + { + "acc": 0.65218124, + "epoch": 0.6289954337899544, + "grad_norm": 7.21875, + "learning_rate": 8.20621993313366e-06, + "loss": 1.65590324, + "memory(GiB)": 107.26, + "step": 24795, + "train_speed(iter/s)": 1.636966 + }, + { + "acc": 0.66004591, + "epoch": 0.6291222729578894, + "grad_norm": 5.46875, + "learning_rate": 8.205415214826018e-06, + "loss": 1.74226646, + "memory(GiB)": 107.26, + "step": 24800, + "train_speed(iter/s)": 1.637001 + }, + { + "acc": 0.6490797, + "epoch": 0.6292491121258245, + "grad_norm": 5.15625, + "learning_rate": 8.204610355529901e-06, + "loss": 1.62131805, + "memory(GiB)": 107.26, + "step": 24805, + "train_speed(iter/s)": 1.637039 + }, + { + "acc": 0.64782681, + "epoch": 0.6293759512937596, + "grad_norm": 5.375, + "learning_rate": 8.203805355280715e-06, + "loss": 1.6390831, + "memory(GiB)": 107.26, + "step": 24810, + "train_speed(iter/s)": 1.637075 + }, + { + "acc": 0.65443325, + "epoch": 0.6295027904616946, + "grad_norm": 5.15625, + "learning_rate": 8.203000214113865e-06, + "loss": 1.64088039, + "memory(GiB)": 107.26, + "step": 24815, + "train_speed(iter/s)": 1.637113 + }, + { + "acc": 0.66053181, + "epoch": 0.6296296296296297, + "grad_norm": 5.3125, + "learning_rate": 8.202194932064767e-06, + "loss": 1.58313808, + "memory(GiB)": 107.26, + "step": 24820, + "train_speed(iter/s)": 1.637148 + }, + { + "acc": 0.65155458, + "epoch": 0.6297564687975646, + "grad_norm": 6.59375, + "learning_rate": 8.201389509168836e-06, + "loss": 1.63907413, + "memory(GiB)": 107.26, + "step": 24825, + "train_speed(iter/s)": 1.637184 + }, + { + "acc": 0.6377501, + "epoch": 0.6298833079654997, + "grad_norm": 6.40625, + "learning_rate": 8.200583945461502e-06, + "loss": 1.64891891, + "memory(GiB)": 107.26, + "step": 24830, + "train_speed(iter/s)": 1.637221 + }, + { + "acc": 0.65754781, + "epoch": 0.6300101471334348, + "grad_norm": 5.96875, + "learning_rate": 8.199778240978197e-06, + "loss": 1.57515907, + "memory(GiB)": 107.26, + "step": 24835, + "train_speed(iter/s)": 1.637257 + }, + { + "acc": 0.63422813, + "epoch": 0.6301369863013698, + "grad_norm": 7.03125, + "learning_rate": 8.19897239575436e-06, + "loss": 1.69267731, + "memory(GiB)": 107.26, + "step": 24840, + "train_speed(iter/s)": 1.637295 + }, + { + "acc": 0.63644643, + "epoch": 0.6302638254693049, + "grad_norm": 4.71875, + "learning_rate": 8.198166409825434e-06, + "loss": 1.68144569, + "memory(GiB)": 107.26, + "step": 24845, + "train_speed(iter/s)": 1.637328 + }, + { + "acc": 0.65338507, + "epoch": 0.63039066463724, + "grad_norm": 6.84375, + "learning_rate": 8.19736028322687e-06, + "loss": 1.60599079, + "memory(GiB)": 107.26, + "step": 24850, + "train_speed(iter/s)": 1.637365 + }, + { + "acc": 0.64865341, + "epoch": 0.630517503805175, + "grad_norm": 5.15625, + "learning_rate": 8.196554015994126e-06, + "loss": 1.64108543, + "memory(GiB)": 107.26, + "step": 24855, + "train_speed(iter/s)": 1.637401 + }, + { + "acc": 0.66276264, + "epoch": 0.6306443429731101, + "grad_norm": 6.15625, + "learning_rate": 8.195747608162665e-06, + "loss": 1.6303894, + "memory(GiB)": 107.26, + "step": 24860, + "train_speed(iter/s)": 1.637439 + }, + { + "acc": 0.65164747, + "epoch": 0.6307711821410451, + "grad_norm": 5.09375, + "learning_rate": 8.194941059767957e-06, + "loss": 1.61740971, + "memory(GiB)": 107.26, + "step": 24865, + "train_speed(iter/s)": 1.637477 + }, + { + "acc": 0.64635897, + "epoch": 0.6308980213089802, + "grad_norm": 5.78125, + "learning_rate": 8.194134370845474e-06, + "loss": 1.60363312, + "memory(GiB)": 107.26, + "step": 24870, + "train_speed(iter/s)": 1.637514 + }, + { + "acc": 0.6533236, + "epoch": 0.6310248604769153, + "grad_norm": 6.5625, + "learning_rate": 8.193327541430703e-06, + "loss": 1.64744797, + "memory(GiB)": 107.26, + "step": 24875, + "train_speed(iter/s)": 1.637552 + }, + { + "acc": 0.65546293, + "epoch": 0.6311516996448503, + "grad_norm": 5.84375, + "learning_rate": 8.192520571559128e-06, + "loss": 1.64636726, + "memory(GiB)": 107.26, + "step": 24880, + "train_speed(iter/s)": 1.637589 + }, + { + "acc": 0.65899286, + "epoch": 0.6312785388127854, + "grad_norm": 5.90625, + "learning_rate": 8.191713461266246e-06, + "loss": 1.60760269, + "memory(GiB)": 107.26, + "step": 24885, + "train_speed(iter/s)": 1.637627 + }, + { + "acc": 0.65114822, + "epoch": 0.6314053779807205, + "grad_norm": 5.65625, + "learning_rate": 8.190906210587555e-06, + "loss": 1.63634949, + "memory(GiB)": 107.26, + "step": 24890, + "train_speed(iter/s)": 1.637663 + }, + { + "acc": 0.65625172, + "epoch": 0.6315322171486555, + "grad_norm": 5.40625, + "learning_rate": 8.190098819558562e-06, + "loss": 1.58358479, + "memory(GiB)": 107.26, + "step": 24895, + "train_speed(iter/s)": 1.637701 + }, + { + "acc": 0.65328989, + "epoch": 0.6316590563165906, + "grad_norm": 4.53125, + "learning_rate": 8.189291288214782e-06, + "loss": 1.60500851, + "memory(GiB)": 107.26, + "step": 24900, + "train_speed(iter/s)": 1.637738 + }, + { + "acc": 0.64872632, + "epoch": 0.6317858954845256, + "grad_norm": 4.46875, + "learning_rate": 8.18848361659173e-06, + "loss": 1.62037506, + "memory(GiB)": 107.26, + "step": 24905, + "train_speed(iter/s)": 1.637774 + }, + { + "acc": 0.67482018, + "epoch": 0.6319127346524607, + "grad_norm": 5.46875, + "learning_rate": 8.187675804724935e-06, + "loss": 1.56209764, + "memory(GiB)": 107.26, + "step": 24910, + "train_speed(iter/s)": 1.63781 + }, + { + "acc": 0.64697728, + "epoch": 0.6320395738203958, + "grad_norm": 5.28125, + "learning_rate": 8.186867852649925e-06, + "loss": 1.62317314, + "memory(GiB)": 107.26, + "step": 24915, + "train_speed(iter/s)": 1.637848 + }, + { + "acc": 0.65313296, + "epoch": 0.6321664129883308, + "grad_norm": 5.59375, + "learning_rate": 8.186059760402238e-06, + "loss": 1.62176342, + "memory(GiB)": 107.26, + "step": 24920, + "train_speed(iter/s)": 1.637884 + }, + { + "acc": 0.66063852, + "epoch": 0.6322932521562659, + "grad_norm": 5.40625, + "learning_rate": 8.185251528017419e-06, + "loss": 1.60959129, + "memory(GiB)": 107.26, + "step": 24925, + "train_speed(iter/s)": 1.637919 + }, + { + "acc": 0.65080404, + "epoch": 0.632420091324201, + "grad_norm": 6.0625, + "learning_rate": 8.184443155531016e-06, + "loss": 1.62780094, + "memory(GiB)": 107.26, + "step": 24930, + "train_speed(iter/s)": 1.637956 + }, + { + "acc": 0.65380893, + "epoch": 0.632546930492136, + "grad_norm": 5.25, + "learning_rate": 8.183634642978586e-06, + "loss": 1.61431084, + "memory(GiB)": 107.26, + "step": 24935, + "train_speed(iter/s)": 1.637992 + }, + { + "acc": 0.65905294, + "epoch": 0.632673769660071, + "grad_norm": 7.25, + "learning_rate": 8.18282599039569e-06, + "loss": 1.62732964, + "memory(GiB)": 107.26, + "step": 24940, + "train_speed(iter/s)": 1.638029 + }, + { + "acc": 0.65347195, + "epoch": 0.632800608828006, + "grad_norm": 6.03125, + "learning_rate": 8.182017197817898e-06, + "loss": 1.6477684, + "memory(GiB)": 107.26, + "step": 24945, + "train_speed(iter/s)": 1.638064 + }, + { + "acc": 0.66291871, + "epoch": 0.6329274479959411, + "grad_norm": 5.5, + "learning_rate": 8.181208265280782e-06, + "loss": 1.59911957, + "memory(GiB)": 107.26, + "step": 24950, + "train_speed(iter/s)": 1.638098 + }, + { + "acc": 0.64377847, + "epoch": 0.6330542871638762, + "grad_norm": 5.71875, + "learning_rate": 8.180399192819923e-06, + "loss": 1.70650063, + "memory(GiB)": 107.26, + "step": 24955, + "train_speed(iter/s)": 1.638134 + }, + { + "acc": 0.65112553, + "epoch": 0.6331811263318112, + "grad_norm": 6.78125, + "learning_rate": 8.17958998047091e-06, + "loss": 1.63111649, + "memory(GiB)": 107.26, + "step": 24960, + "train_speed(iter/s)": 1.638169 + }, + { + "acc": 0.6614501, + "epoch": 0.6333079654997463, + "grad_norm": 6.5625, + "learning_rate": 8.178780628269332e-06, + "loss": 1.56051483, + "memory(GiB)": 107.26, + "step": 24965, + "train_speed(iter/s)": 1.638207 + }, + { + "acc": 0.65216112, + "epoch": 0.6334348046676814, + "grad_norm": 5.125, + "learning_rate": 8.177971136250788e-06, + "loss": 1.67489262, + "memory(GiB)": 107.26, + "step": 24970, + "train_speed(iter/s)": 1.638239 + }, + { + "acc": 0.65246515, + "epoch": 0.6335616438356164, + "grad_norm": 5.28125, + "learning_rate": 8.177161504450887e-06, + "loss": 1.59421234, + "memory(GiB)": 107.26, + "step": 24975, + "train_speed(iter/s)": 1.638276 + }, + { + "acc": 0.64446716, + "epoch": 0.6336884830035515, + "grad_norm": 5.59375, + "learning_rate": 8.176351732905239e-06, + "loss": 1.68438053, + "memory(GiB)": 107.26, + "step": 24980, + "train_speed(iter/s)": 1.638313 + }, + { + "acc": 0.65932193, + "epoch": 0.6338153221714865, + "grad_norm": 6.40625, + "learning_rate": 8.175541821649459e-06, + "loss": 1.60514469, + "memory(GiB)": 107.26, + "step": 24985, + "train_speed(iter/s)": 1.638348 + }, + { + "acc": 0.6764039, + "epoch": 0.6339421613394216, + "grad_norm": 6.09375, + "learning_rate": 8.174731770719173e-06, + "loss": 1.53645611, + "memory(GiB)": 107.26, + "step": 24990, + "train_speed(iter/s)": 1.638382 + }, + { + "acc": 0.65205746, + "epoch": 0.6340690005073567, + "grad_norm": 5.15625, + "learning_rate": 8.173921580150008e-06, + "loss": 1.62166176, + "memory(GiB)": 107.26, + "step": 24995, + "train_speed(iter/s)": 1.638417 + }, + { + "acc": 0.645013, + "epoch": 0.6341958396752917, + "grad_norm": 6.40625, + "learning_rate": 8.173111249977602e-06, + "loss": 1.68191547, + "memory(GiB)": 107.26, + "step": 25000, + "train_speed(iter/s)": 1.638454 + }, + { + "epoch": 0.6341958396752917, + "eval_acc": 0.644863497870972, + "eval_loss": 1.581565022468567, + "eval_runtime": 58.0014, + "eval_samples_per_second": 109.825, + "eval_steps_per_second": 27.465, + "step": 25000 + }, + { + "acc": 0.66443939, + "epoch": 0.6343226788432268, + "grad_norm": 5.71875, + "learning_rate": 8.172300780237596e-06, + "loss": 1.55179424, + "memory(GiB)": 107.26, + "step": 25005, + "train_speed(iter/s)": 1.631833 + }, + { + "acc": 0.65388923, + "epoch": 0.6344495180111619, + "grad_norm": 6.125, + "learning_rate": 8.171490170965639e-06, + "loss": 1.59664783, + "memory(GiB)": 107.26, + "step": 25010, + "train_speed(iter/s)": 1.63187 + }, + { + "acc": 0.66988387, + "epoch": 0.6345763571790969, + "grad_norm": 5.9375, + "learning_rate": 8.170679422197385e-06, + "loss": 1.52830715, + "memory(GiB)": 107.26, + "step": 25015, + "train_speed(iter/s)": 1.631905 + }, + { + "acc": 0.65053511, + "epoch": 0.634703196347032, + "grad_norm": 5.875, + "learning_rate": 8.169868533968493e-06, + "loss": 1.61230907, + "memory(GiB)": 107.26, + "step": 25020, + "train_speed(iter/s)": 1.631943 + }, + { + "acc": 0.64172406, + "epoch": 0.634830035514967, + "grad_norm": 5.0625, + "learning_rate": 8.16905750631463e-06, + "loss": 1.63221207, + "memory(GiB)": 107.26, + "step": 25025, + "train_speed(iter/s)": 1.631979 + }, + { + "acc": 0.66764474, + "epoch": 0.6349568746829021, + "grad_norm": 7.71875, + "learning_rate": 8.168246339271471e-06, + "loss": 1.60535107, + "memory(GiB)": 107.26, + "step": 25030, + "train_speed(iter/s)": 1.632015 + }, + { + "acc": 0.63745842, + "epoch": 0.6350837138508372, + "grad_norm": 5.71875, + "learning_rate": 8.16743503287469e-06, + "loss": 1.63422623, + "memory(GiB)": 107.26, + "step": 25035, + "train_speed(iter/s)": 1.632053 + }, + { + "acc": 0.65079484, + "epoch": 0.6352105530187722, + "grad_norm": 5.8125, + "learning_rate": 8.166623587159978e-06, + "loss": 1.57841473, + "memory(GiB)": 107.26, + "step": 25040, + "train_speed(iter/s)": 1.632087 + }, + { + "acc": 0.65513973, + "epoch": 0.6353373921867073, + "grad_norm": 5.84375, + "learning_rate": 8.16581200216302e-06, + "loss": 1.57435713, + "memory(GiB)": 107.26, + "step": 25045, + "train_speed(iter/s)": 1.632126 + }, + { + "acc": 0.64942665, + "epoch": 0.6354642313546424, + "grad_norm": 4.875, + "learning_rate": 8.165000277919517e-06, + "loss": 1.64163494, + "memory(GiB)": 107.26, + "step": 25050, + "train_speed(iter/s)": 1.632162 + }, + { + "acc": 0.65720091, + "epoch": 0.6355910705225774, + "grad_norm": 6.09375, + "learning_rate": 8.16418841446517e-06, + "loss": 1.59496078, + "memory(GiB)": 107.26, + "step": 25055, + "train_speed(iter/s)": 1.632198 + }, + { + "acc": 0.65523882, + "epoch": 0.6357179096905125, + "grad_norm": 5.28125, + "learning_rate": 8.163376411835691e-06, + "loss": 1.57789049, + "memory(GiB)": 107.26, + "step": 25060, + "train_speed(iter/s)": 1.632237 + }, + { + "acc": 0.67631674, + "epoch": 0.6358447488584474, + "grad_norm": 4.53125, + "learning_rate": 8.162564270066793e-06, + "loss": 1.504039, + "memory(GiB)": 107.26, + "step": 25065, + "train_speed(iter/s)": 1.632275 + }, + { + "acc": 0.65791121, + "epoch": 0.6359715880263825, + "grad_norm": 5.8125, + "learning_rate": 8.1617519891942e-06, + "loss": 1.61463451, + "memory(GiB)": 107.26, + "step": 25070, + "train_speed(iter/s)": 1.632312 + }, + { + "acc": 0.65800676, + "epoch": 0.6360984271943176, + "grad_norm": 5.5, + "learning_rate": 8.160939569253637e-06, + "loss": 1.55168028, + "memory(GiB)": 107.26, + "step": 25075, + "train_speed(iter/s)": 1.632348 + }, + { + "acc": 0.65362477, + "epoch": 0.6362252663622526, + "grad_norm": 5.375, + "learning_rate": 8.160127010280838e-06, + "loss": 1.60622139, + "memory(GiB)": 107.26, + "step": 25080, + "train_speed(iter/s)": 1.632386 + }, + { + "acc": 0.64838934, + "epoch": 0.6363521055301877, + "grad_norm": 7.09375, + "learning_rate": 8.159314312311546e-06, + "loss": 1.66545372, + "memory(GiB)": 107.26, + "step": 25085, + "train_speed(iter/s)": 1.632422 + }, + { + "acc": 0.64707794, + "epoch": 0.6364789446981228, + "grad_norm": 5.375, + "learning_rate": 8.158501475381505e-06, + "loss": 1.66130714, + "memory(GiB)": 107.26, + "step": 25090, + "train_speed(iter/s)": 1.632458 + }, + { + "acc": 0.65416427, + "epoch": 0.6366057838660578, + "grad_norm": 5.375, + "learning_rate": 8.157688499526466e-06, + "loss": 1.55415554, + "memory(GiB)": 107.26, + "step": 25095, + "train_speed(iter/s)": 1.632494 + }, + { + "acc": 0.65607867, + "epoch": 0.6367326230339929, + "grad_norm": 6.0, + "learning_rate": 8.15687538478219e-06, + "loss": 1.61145306, + "memory(GiB)": 107.26, + "step": 25100, + "train_speed(iter/s)": 1.632533 + }, + { + "acc": 0.64754839, + "epoch": 0.6368594622019279, + "grad_norm": 5.9375, + "learning_rate": 8.156062131184439e-06, + "loss": 1.65939617, + "memory(GiB)": 107.26, + "step": 25105, + "train_speed(iter/s)": 1.632571 + }, + { + "acc": 0.65528903, + "epoch": 0.636986301369863, + "grad_norm": 6.1875, + "learning_rate": 8.155248738768986e-06, + "loss": 1.61583443, + "memory(GiB)": 107.26, + "step": 25110, + "train_speed(iter/s)": 1.632608 + }, + { + "acc": 0.66784687, + "epoch": 0.6371131405377981, + "grad_norm": 5.8125, + "learning_rate": 8.154435207571606e-06, + "loss": 1.52366409, + "memory(GiB)": 107.26, + "step": 25115, + "train_speed(iter/s)": 1.632646 + }, + { + "acc": 0.66205354, + "epoch": 0.6372399797057331, + "grad_norm": 8.3125, + "learning_rate": 8.153621537628083e-06, + "loss": 1.58188858, + "memory(GiB)": 107.26, + "step": 25120, + "train_speed(iter/s)": 1.632686 + }, + { + "acc": 0.65227323, + "epoch": 0.6373668188736682, + "grad_norm": 5.96875, + "learning_rate": 8.152807728974203e-06, + "loss": 1.59889174, + "memory(GiB)": 107.26, + "step": 25125, + "train_speed(iter/s)": 1.632723 + }, + { + "acc": 0.6689105, + "epoch": 0.6374936580416033, + "grad_norm": 6.875, + "learning_rate": 8.151993781645765e-06, + "loss": 1.58072891, + "memory(GiB)": 107.26, + "step": 25130, + "train_speed(iter/s)": 1.632761 + }, + { + "acc": 0.63539681, + "epoch": 0.6376204972095383, + "grad_norm": 8.0, + "learning_rate": 8.151179695678565e-06, + "loss": 1.65160522, + "memory(GiB)": 107.26, + "step": 25135, + "train_speed(iter/s)": 1.632801 + }, + { + "acc": 0.67017813, + "epoch": 0.6377473363774734, + "grad_norm": 5.4375, + "learning_rate": 8.150365471108414e-06, + "loss": 1.53730745, + "memory(GiB)": 107.26, + "step": 25140, + "train_speed(iter/s)": 1.63284 + }, + { + "acc": 0.64716587, + "epoch": 0.6378741755454084, + "grad_norm": 4.96875, + "learning_rate": 8.149551107971125e-06, + "loss": 1.67645149, + "memory(GiB)": 107.26, + "step": 25145, + "train_speed(iter/s)": 1.632879 + }, + { + "acc": 0.64729729, + "epoch": 0.6380010147133435, + "grad_norm": 5.8125, + "learning_rate": 8.148736606302517e-06, + "loss": 1.63931675, + "memory(GiB)": 107.26, + "step": 25150, + "train_speed(iter/s)": 1.632919 + }, + { + "acc": 0.6407692, + "epoch": 0.6381278538812786, + "grad_norm": 5.65625, + "learning_rate": 8.147921966138412e-06, + "loss": 1.65959816, + "memory(GiB)": 107.26, + "step": 25155, + "train_speed(iter/s)": 1.632958 + }, + { + "acc": 0.67805433, + "epoch": 0.6382546930492136, + "grad_norm": 4.78125, + "learning_rate": 8.147107187514647e-06, + "loss": 1.52108879, + "memory(GiB)": 107.26, + "step": 25160, + "train_speed(iter/s)": 1.632995 + }, + { + "acc": 0.66085944, + "epoch": 0.6383815322171487, + "grad_norm": 4.625, + "learning_rate": 8.146292270467056e-06, + "loss": 1.50633984, + "memory(GiB)": 107.26, + "step": 25165, + "train_speed(iter/s)": 1.633031 + }, + { + "acc": 0.66203742, + "epoch": 0.6385083713850838, + "grad_norm": 5.65625, + "learning_rate": 8.145477215031486e-06, + "loss": 1.57740507, + "memory(GiB)": 107.26, + "step": 25170, + "train_speed(iter/s)": 1.633069 + }, + { + "acc": 0.65342994, + "epoch": 0.6386352105530188, + "grad_norm": 6.21875, + "learning_rate": 8.144662021243782e-06, + "loss": 1.53255253, + "memory(GiB)": 107.26, + "step": 25175, + "train_speed(iter/s)": 1.633109 + }, + { + "acc": 0.64925594, + "epoch": 0.6387620497209539, + "grad_norm": 5.1875, + "learning_rate": 8.143846689139805e-06, + "loss": 1.65110722, + "memory(GiB)": 107.26, + "step": 25180, + "train_speed(iter/s)": 1.633149 + }, + { + "acc": 0.64615192, + "epoch": 0.6388888888888888, + "grad_norm": 4.8125, + "learning_rate": 8.143031218755411e-06, + "loss": 1.64296265, + "memory(GiB)": 107.26, + "step": 25185, + "train_speed(iter/s)": 1.633188 + }, + { + "acc": 0.63147774, + "epoch": 0.6390157280568239, + "grad_norm": 6.21875, + "learning_rate": 8.142215610126474e-06, + "loss": 1.68469028, + "memory(GiB)": 107.26, + "step": 25190, + "train_speed(iter/s)": 1.633225 + }, + { + "acc": 0.66989598, + "epoch": 0.639142567224759, + "grad_norm": 5.59375, + "learning_rate": 8.141399863288863e-06, + "loss": 1.56986675, + "memory(GiB)": 107.26, + "step": 25195, + "train_speed(iter/s)": 1.633261 + }, + { + "acc": 0.64840164, + "epoch": 0.639269406392694, + "grad_norm": 5.1875, + "learning_rate": 8.140583978278463e-06, + "loss": 1.60602264, + "memory(GiB)": 107.26, + "step": 25200, + "train_speed(iter/s)": 1.633299 + }, + { + "acc": 0.67312622, + "epoch": 0.6393962455606291, + "grad_norm": 5.0625, + "learning_rate": 8.139767955131157e-06, + "loss": 1.41391335, + "memory(GiB)": 107.26, + "step": 25205, + "train_speed(iter/s)": 1.633336 + }, + { + "acc": 0.67137156, + "epoch": 0.6395230847285642, + "grad_norm": 5.28125, + "learning_rate": 8.138951793882838e-06, + "loss": 1.54170761, + "memory(GiB)": 107.26, + "step": 25210, + "train_speed(iter/s)": 1.633374 + }, + { + "acc": 0.67052155, + "epoch": 0.6396499238964992, + "grad_norm": 5.78125, + "learning_rate": 8.138135494569405e-06, + "loss": 1.55081882, + "memory(GiB)": 107.26, + "step": 25215, + "train_speed(iter/s)": 1.633412 + }, + { + "acc": 0.65794039, + "epoch": 0.6397767630644343, + "grad_norm": 5.9375, + "learning_rate": 8.137319057226763e-06, + "loss": 1.54577208, + "memory(GiB)": 107.26, + "step": 25220, + "train_speed(iter/s)": 1.633449 + }, + { + "acc": 0.65920897, + "epoch": 0.6399036022323693, + "grad_norm": 5.09375, + "learning_rate": 8.136502481890821e-06, + "loss": 1.62497635, + "memory(GiB)": 107.26, + "step": 25225, + "train_speed(iter/s)": 1.633486 + }, + { + "acc": 0.66269212, + "epoch": 0.6400304414003044, + "grad_norm": 7.21875, + "learning_rate": 8.135685768597496e-06, + "loss": 1.59908581, + "memory(GiB)": 107.26, + "step": 25230, + "train_speed(iter/s)": 1.633514 + }, + { + "acc": 0.65427208, + "epoch": 0.6401572805682395, + "grad_norm": 5.125, + "learning_rate": 8.134868917382713e-06, + "loss": 1.56183453, + "memory(GiB)": 107.26, + "step": 25235, + "train_speed(iter/s)": 1.633549 + }, + { + "acc": 0.64797058, + "epoch": 0.6402841197361745, + "grad_norm": 5.15625, + "learning_rate": 8.134051928282396e-06, + "loss": 1.61085243, + "memory(GiB)": 107.26, + "step": 25240, + "train_speed(iter/s)": 1.633585 + }, + { + "acc": 0.66116238, + "epoch": 0.6404109589041096, + "grad_norm": 5.21875, + "learning_rate": 8.133234801332484e-06, + "loss": 1.63974171, + "memory(GiB)": 107.26, + "step": 25245, + "train_speed(iter/s)": 1.633622 + }, + { + "acc": 0.66599889, + "epoch": 0.6405377980720447, + "grad_norm": 5.875, + "learning_rate": 8.132417536568918e-06, + "loss": 1.55043354, + "memory(GiB)": 107.26, + "step": 25250, + "train_speed(iter/s)": 1.63366 + }, + { + "acc": 0.6334969, + "epoch": 0.6406646372399797, + "grad_norm": 5.6875, + "learning_rate": 8.131600134027641e-06, + "loss": 1.67072468, + "memory(GiB)": 107.26, + "step": 25255, + "train_speed(iter/s)": 1.633697 + }, + { + "acc": 0.65395894, + "epoch": 0.6407914764079148, + "grad_norm": 5.96875, + "learning_rate": 8.13078259374461e-06, + "loss": 1.62380371, + "memory(GiB)": 107.26, + "step": 25260, + "train_speed(iter/s)": 1.633734 + }, + { + "acc": 0.64973154, + "epoch": 0.6409183155758498, + "grad_norm": 5.9375, + "learning_rate": 8.129964915755781e-06, + "loss": 1.62462616, + "memory(GiB)": 107.26, + "step": 25265, + "train_speed(iter/s)": 1.633772 + }, + { + "acc": 0.66672859, + "epoch": 0.6410451547437849, + "grad_norm": 4.65625, + "learning_rate": 8.129147100097122e-06, + "loss": 1.52427235, + "memory(GiB)": 107.26, + "step": 25270, + "train_speed(iter/s)": 1.633809 + }, + { + "acc": 0.65974579, + "epoch": 0.64117199391172, + "grad_norm": 6.53125, + "learning_rate": 8.128329146804604e-06, + "loss": 1.62272339, + "memory(GiB)": 107.26, + "step": 25275, + "train_speed(iter/s)": 1.633845 + }, + { + "acc": 0.65485783, + "epoch": 0.641298833079655, + "grad_norm": 6.84375, + "learning_rate": 8.127511055914201e-06, + "loss": 1.58440361, + "memory(GiB)": 107.26, + "step": 25280, + "train_speed(iter/s)": 1.633883 + }, + { + "acc": 0.67908206, + "epoch": 0.6414256722475901, + "grad_norm": 6.5, + "learning_rate": 8.1266928274619e-06, + "loss": 1.58362217, + "memory(GiB)": 107.26, + "step": 25285, + "train_speed(iter/s)": 1.633919 + }, + { + "acc": 0.65474596, + "epoch": 0.6415525114155252, + "grad_norm": 5.03125, + "learning_rate": 8.125874461483687e-06, + "loss": 1.58742056, + "memory(GiB)": 107.26, + "step": 25290, + "train_speed(iter/s)": 1.633956 + }, + { + "acc": 0.62787018, + "epoch": 0.6416793505834602, + "grad_norm": 5.6875, + "learning_rate": 8.12505595801556e-06, + "loss": 1.69383907, + "memory(GiB)": 107.26, + "step": 25295, + "train_speed(iter/s)": 1.633992 + }, + { + "acc": 0.64466057, + "epoch": 0.6418061897513953, + "grad_norm": 9.125, + "learning_rate": 8.12423731709352e-06, + "loss": 1.66952839, + "memory(GiB)": 107.26, + "step": 25300, + "train_speed(iter/s)": 1.634029 + }, + { + "acc": 0.6457571, + "epoch": 0.6419330289193302, + "grad_norm": 5.21875, + "learning_rate": 8.123418538753573e-06, + "loss": 1.56150846, + "memory(GiB)": 107.26, + "step": 25305, + "train_speed(iter/s)": 1.63407 + }, + { + "acc": 0.65962567, + "epoch": 0.6420598680872653, + "grad_norm": 6.40625, + "learning_rate": 8.122599623031735e-06, + "loss": 1.57684612, + "memory(GiB)": 107.26, + "step": 25310, + "train_speed(iter/s)": 1.634105 + }, + { + "acc": 0.64726105, + "epoch": 0.6421867072552004, + "grad_norm": 5.09375, + "learning_rate": 8.121780569964024e-06, + "loss": 1.6117382, + "memory(GiB)": 107.26, + "step": 25315, + "train_speed(iter/s)": 1.63414 + }, + { + "acc": 0.66389976, + "epoch": 0.6423135464231354, + "grad_norm": 5.34375, + "learning_rate": 8.120961379586466e-06, + "loss": 1.57469578, + "memory(GiB)": 107.26, + "step": 25320, + "train_speed(iter/s)": 1.634177 + }, + { + "acc": 0.64762573, + "epoch": 0.6424403855910705, + "grad_norm": 6.0, + "learning_rate": 8.120142051935092e-06, + "loss": 1.66447239, + "memory(GiB)": 107.26, + "step": 25325, + "train_speed(iter/s)": 1.634214 + }, + { + "acc": 0.62941637, + "epoch": 0.6425672247590056, + "grad_norm": 5.9375, + "learning_rate": 8.11932258704594e-06, + "loss": 1.73394642, + "memory(GiB)": 107.26, + "step": 25330, + "train_speed(iter/s)": 1.634251 + }, + { + "acc": 0.63256741, + "epoch": 0.6426940639269406, + "grad_norm": 8.6875, + "learning_rate": 8.118502984955053e-06, + "loss": 1.7257637, + "memory(GiB)": 107.26, + "step": 25335, + "train_speed(iter/s)": 1.634288 + }, + { + "acc": 0.65086489, + "epoch": 0.6428209030948757, + "grad_norm": 5.71875, + "learning_rate": 8.117683245698483e-06, + "loss": 1.63473816, + "memory(GiB)": 107.26, + "step": 25340, + "train_speed(iter/s)": 1.634325 + }, + { + "acc": 0.65323277, + "epoch": 0.6429477422628107, + "grad_norm": 5.75, + "learning_rate": 8.116863369312283e-06, + "loss": 1.64795647, + "memory(GiB)": 107.26, + "step": 25345, + "train_speed(iter/s)": 1.634363 + }, + { + "acc": 0.66284151, + "epoch": 0.6430745814307458, + "grad_norm": 5.53125, + "learning_rate": 8.116043355832518e-06, + "loss": 1.62753296, + "memory(GiB)": 107.26, + "step": 25350, + "train_speed(iter/s)": 1.6344 + }, + { + "acc": 0.6454772, + "epoch": 0.6432014205986809, + "grad_norm": 4.5625, + "learning_rate": 8.115223205295253e-06, + "loss": 1.6379509, + "memory(GiB)": 107.26, + "step": 25355, + "train_speed(iter/s)": 1.634437 + }, + { + "acc": 0.65888605, + "epoch": 0.6433282597666159, + "grad_norm": 6.875, + "learning_rate": 8.114402917736563e-06, + "loss": 1.60328274, + "memory(GiB)": 107.26, + "step": 25360, + "train_speed(iter/s)": 1.634474 + }, + { + "acc": 0.65666022, + "epoch": 0.643455098934551, + "grad_norm": 5.90625, + "learning_rate": 8.113582493192529e-06, + "loss": 1.55814066, + "memory(GiB)": 107.26, + "step": 25365, + "train_speed(iter/s)": 1.634511 + }, + { + "acc": 0.66418591, + "epoch": 0.6435819381024861, + "grad_norm": 5.84375, + "learning_rate": 8.112761931699235e-06, + "loss": 1.61468201, + "memory(GiB)": 107.26, + "step": 25370, + "train_speed(iter/s)": 1.634545 + }, + { + "acc": 0.6513351, + "epoch": 0.6437087772704211, + "grad_norm": 5.875, + "learning_rate": 8.111941233292772e-06, + "loss": 1.64838982, + "memory(GiB)": 107.26, + "step": 25375, + "train_speed(iter/s)": 1.634581 + }, + { + "acc": 0.66825876, + "epoch": 0.6438356164383562, + "grad_norm": 6.3125, + "learning_rate": 8.111120398009243e-06, + "loss": 1.56160393, + "memory(GiB)": 107.26, + "step": 25380, + "train_speed(iter/s)": 1.634617 + }, + { + "acc": 0.64440546, + "epoch": 0.6439624556062912, + "grad_norm": 5.40625, + "learning_rate": 8.110299425884745e-06, + "loss": 1.6469408, + "memory(GiB)": 107.26, + "step": 25385, + "train_speed(iter/s)": 1.634654 + }, + { + "acc": 0.64203129, + "epoch": 0.6440892947742263, + "grad_norm": 5.46875, + "learning_rate": 8.109478316955394e-06, + "loss": 1.67655334, + "memory(GiB)": 107.26, + "step": 25390, + "train_speed(iter/s)": 1.634689 + }, + { + "acc": 0.64533777, + "epoch": 0.6442161339421614, + "grad_norm": 5.3125, + "learning_rate": 8.108657071257304e-06, + "loss": 1.63391838, + "memory(GiB)": 107.26, + "step": 25395, + "train_speed(iter/s)": 1.634727 + }, + { + "acc": 0.64848204, + "epoch": 0.6443429731100964, + "grad_norm": 5.875, + "learning_rate": 8.107835688826598e-06, + "loss": 1.65008049, + "memory(GiB)": 107.26, + "step": 25400, + "train_speed(iter/s)": 1.634763 + }, + { + "acc": 0.66970668, + "epoch": 0.6444698122780315, + "grad_norm": 4.9375, + "learning_rate": 8.1070141696994e-06, + "loss": 1.56662645, + "memory(GiB)": 107.26, + "step": 25405, + "train_speed(iter/s)": 1.634799 + }, + { + "acc": 0.65604649, + "epoch": 0.6445966514459666, + "grad_norm": 6.0, + "learning_rate": 8.106192513911849e-06, + "loss": 1.59266396, + "memory(GiB)": 107.26, + "step": 25410, + "train_speed(iter/s)": 1.634835 + }, + { + "acc": 0.65174699, + "epoch": 0.6447234906139016, + "grad_norm": 5.28125, + "learning_rate": 8.105370721500083e-06, + "loss": 1.68827057, + "memory(GiB)": 107.26, + "step": 25415, + "train_speed(iter/s)": 1.634869 + }, + { + "acc": 0.65298123, + "epoch": 0.6448503297818367, + "grad_norm": 5.125, + "learning_rate": 8.104548792500246e-06, + "loss": 1.57413826, + "memory(GiB)": 107.26, + "step": 25420, + "train_speed(iter/s)": 1.634904 + }, + { + "acc": 0.6495698, + "epoch": 0.6449771689497716, + "grad_norm": 5.65625, + "learning_rate": 8.103726726948495e-06, + "loss": 1.64965038, + "memory(GiB)": 107.26, + "step": 25425, + "train_speed(iter/s)": 1.63494 + }, + { + "acc": 0.65652938, + "epoch": 0.6451040081177067, + "grad_norm": 4.8125, + "learning_rate": 8.102904524880985e-06, + "loss": 1.62134857, + "memory(GiB)": 107.26, + "step": 25430, + "train_speed(iter/s)": 1.634976 + }, + { + "acc": 0.64343343, + "epoch": 0.6452308472856418, + "grad_norm": 5.09375, + "learning_rate": 8.10208218633388e-06, + "loss": 1.64101143, + "memory(GiB)": 107.26, + "step": 25435, + "train_speed(iter/s)": 1.635012 + }, + { + "acc": 0.64966459, + "epoch": 0.6453576864535768, + "grad_norm": 11.75, + "learning_rate": 8.10125971134335e-06, + "loss": 1.64917374, + "memory(GiB)": 107.26, + "step": 25440, + "train_speed(iter/s)": 1.635051 + }, + { + "acc": 0.65247822, + "epoch": 0.6454845256215119, + "grad_norm": 5.5, + "learning_rate": 8.100437099945572e-06, + "loss": 1.58667545, + "memory(GiB)": 107.26, + "step": 25445, + "train_speed(iter/s)": 1.635089 + }, + { + "acc": 0.66119399, + "epoch": 0.645611364789447, + "grad_norm": 4.96875, + "learning_rate": 8.099614352176727e-06, + "loss": 1.56738052, + "memory(GiB)": 107.26, + "step": 25450, + "train_speed(iter/s)": 1.635127 + }, + { + "acc": 0.65651379, + "epoch": 0.645738203957382, + "grad_norm": 5.5625, + "learning_rate": 8.098791468073007e-06, + "loss": 1.61565437, + "memory(GiB)": 107.26, + "step": 25455, + "train_speed(iter/s)": 1.635163 + }, + { + "acc": 0.66619954, + "epoch": 0.6458650431253171, + "grad_norm": 6.625, + "learning_rate": 8.097968447670601e-06, + "loss": 1.59684935, + "memory(GiB)": 107.26, + "step": 25460, + "train_speed(iter/s)": 1.6352 + }, + { + "acc": 0.65974846, + "epoch": 0.6459918822932521, + "grad_norm": 6.125, + "learning_rate": 8.09714529100571e-06, + "loss": 1.60825329, + "memory(GiB)": 107.26, + "step": 25465, + "train_speed(iter/s)": 1.635234 + }, + { + "acc": 0.65640121, + "epoch": 0.6461187214611872, + "grad_norm": 6.0, + "learning_rate": 8.096321998114545e-06, + "loss": 1.61770458, + "memory(GiB)": 107.26, + "step": 25470, + "train_speed(iter/s)": 1.63527 + }, + { + "acc": 0.66895566, + "epoch": 0.6462455606291223, + "grad_norm": 6.5625, + "learning_rate": 8.09549856903331e-06, + "loss": 1.53334856, + "memory(GiB)": 107.26, + "step": 25475, + "train_speed(iter/s)": 1.635305 + }, + { + "acc": 0.64003773, + "epoch": 0.6463723997970573, + "grad_norm": 5.125, + "learning_rate": 8.094675003798232e-06, + "loss": 1.65117798, + "memory(GiB)": 107.26, + "step": 25480, + "train_speed(iter/s)": 1.63534 + }, + { + "acc": 0.67290869, + "epoch": 0.6464992389649924, + "grad_norm": 6.8125, + "learning_rate": 8.093851302445528e-06, + "loss": 1.54114876, + "memory(GiB)": 107.26, + "step": 25485, + "train_speed(iter/s)": 1.635376 + }, + { + "acc": 0.65590672, + "epoch": 0.6466260781329275, + "grad_norm": 5.40625, + "learning_rate": 8.093027465011431e-06, + "loss": 1.59310751, + "memory(GiB)": 107.26, + "step": 25490, + "train_speed(iter/s)": 1.635413 + }, + { + "acc": 0.65389652, + "epoch": 0.6467529173008625, + "grad_norm": 5.34375, + "learning_rate": 8.092203491532178e-06, + "loss": 1.57344627, + "memory(GiB)": 107.26, + "step": 25495, + "train_speed(iter/s)": 1.635449 + }, + { + "acc": 0.65665984, + "epoch": 0.6468797564687976, + "grad_norm": 5.40625, + "learning_rate": 8.091379382044009e-06, + "loss": 1.63595314, + "memory(GiB)": 107.26, + "step": 25500, + "train_speed(iter/s)": 1.635487 + }, + { + "acc": 0.66399288, + "epoch": 0.6470065956367326, + "grad_norm": 6.28125, + "learning_rate": 8.090555136583172e-06, + "loss": 1.56749935, + "memory(GiB)": 107.26, + "step": 25505, + "train_speed(iter/s)": 1.635521 + }, + { + "acc": 0.65745196, + "epoch": 0.6471334348046677, + "grad_norm": 5.53125, + "learning_rate": 8.089730755185921e-06, + "loss": 1.58977823, + "memory(GiB)": 107.26, + "step": 25510, + "train_speed(iter/s)": 1.635557 + }, + { + "acc": 0.6484602, + "epoch": 0.6472602739726028, + "grad_norm": 5.6875, + "learning_rate": 8.088906237888517e-06, + "loss": 1.64411335, + "memory(GiB)": 107.26, + "step": 25515, + "train_speed(iter/s)": 1.635591 + }, + { + "acc": 0.64755487, + "epoch": 0.6473871131405378, + "grad_norm": 4.96875, + "learning_rate": 8.088081584727228e-06, + "loss": 1.59233646, + "memory(GiB)": 107.26, + "step": 25520, + "train_speed(iter/s)": 1.635628 + }, + { + "acc": 0.64910173, + "epoch": 0.6475139523084729, + "grad_norm": 6.90625, + "learning_rate": 8.08725679573832e-06, + "loss": 1.60463009, + "memory(GiB)": 107.26, + "step": 25525, + "train_speed(iter/s)": 1.635666 + }, + { + "acc": 0.66866031, + "epoch": 0.647640791476408, + "grad_norm": 5.28125, + "learning_rate": 8.086431870958078e-06, + "loss": 1.58421097, + "memory(GiB)": 107.26, + "step": 25530, + "train_speed(iter/s)": 1.635701 + }, + { + "acc": 0.65110388, + "epoch": 0.647767630644343, + "grad_norm": 4.59375, + "learning_rate": 8.085606810422781e-06, + "loss": 1.62232876, + "memory(GiB)": 107.26, + "step": 25535, + "train_speed(iter/s)": 1.635739 + }, + { + "acc": 0.64505992, + "epoch": 0.647894469812278, + "grad_norm": 5.78125, + "learning_rate": 8.08478161416872e-06, + "loss": 1.68454304, + "memory(GiB)": 107.26, + "step": 25540, + "train_speed(iter/s)": 1.635777 + }, + { + "acc": 0.65496173, + "epoch": 0.648021308980213, + "grad_norm": 5.125, + "learning_rate": 8.083956282232192e-06, + "loss": 1.65139961, + "memory(GiB)": 107.26, + "step": 25545, + "train_speed(iter/s)": 1.635816 + }, + { + "acc": 0.64949837, + "epoch": 0.6481481481481481, + "grad_norm": 6.21875, + "learning_rate": 8.083130814649498e-06, + "loss": 1.56567659, + "memory(GiB)": 107.26, + "step": 25550, + "train_speed(iter/s)": 1.635852 + }, + { + "acc": 0.68030043, + "epoch": 0.6482749873160832, + "grad_norm": 5.96875, + "learning_rate": 8.082305211456943e-06, + "loss": 1.5074996, + "memory(GiB)": 107.26, + "step": 25555, + "train_speed(iter/s)": 1.635889 + }, + { + "acc": 0.65378113, + "epoch": 0.6484018264840182, + "grad_norm": 5.8125, + "learning_rate": 8.081479472690846e-06, + "loss": 1.66485443, + "memory(GiB)": 107.26, + "step": 25560, + "train_speed(iter/s)": 1.635924 + }, + { + "acc": 0.65303106, + "epoch": 0.6485286656519533, + "grad_norm": 4.625, + "learning_rate": 8.080653598387522e-06, + "loss": 1.54809875, + "memory(GiB)": 107.26, + "step": 25565, + "train_speed(iter/s)": 1.63596 + }, + { + "acc": 0.67481833, + "epoch": 0.6486555048198884, + "grad_norm": 6.96875, + "learning_rate": 8.0798275885833e-06, + "loss": 1.55298853, + "memory(GiB)": 107.26, + "step": 25570, + "train_speed(iter/s)": 1.635995 + }, + { + "acc": 0.65893068, + "epoch": 0.6487823439878234, + "grad_norm": 5.5625, + "learning_rate": 8.07900144331451e-06, + "loss": 1.54101124, + "memory(GiB)": 107.26, + "step": 25575, + "train_speed(iter/s)": 1.636034 + }, + { + "acc": 0.65369158, + "epoch": 0.6489091831557585, + "grad_norm": 6.15625, + "learning_rate": 8.07817516261749e-06, + "loss": 1.6125845, + "memory(GiB)": 107.26, + "step": 25580, + "train_speed(iter/s)": 1.63607 + }, + { + "acc": 0.65587826, + "epoch": 0.6490360223236935, + "grad_norm": 5.15625, + "learning_rate": 8.077348746528583e-06, + "loss": 1.57133379, + "memory(GiB)": 107.26, + "step": 25585, + "train_speed(iter/s)": 1.636105 + }, + { + "acc": 0.66055965, + "epoch": 0.6491628614916286, + "grad_norm": 5.0625, + "learning_rate": 8.076522195084139e-06, + "loss": 1.54116821, + "memory(GiB)": 107.26, + "step": 25590, + "train_speed(iter/s)": 1.636139 + }, + { + "acc": 0.66508985, + "epoch": 0.6492897006595637, + "grad_norm": 4.875, + "learning_rate": 8.075695508320512e-06, + "loss": 1.62400665, + "memory(GiB)": 107.26, + "step": 25595, + "train_speed(iter/s)": 1.636173 + }, + { + "acc": 0.64792643, + "epoch": 0.6494165398274987, + "grad_norm": 6.5625, + "learning_rate": 8.074868686274065e-06, + "loss": 1.71825066, + "memory(GiB)": 107.26, + "step": 25600, + "train_speed(iter/s)": 1.636209 + }, + { + "acc": 0.66676998, + "epoch": 0.6495433789954338, + "grad_norm": 7.1875, + "learning_rate": 8.074041728981166e-06, + "loss": 1.55171442, + "memory(GiB)": 107.26, + "step": 25605, + "train_speed(iter/s)": 1.636243 + }, + { + "acc": 0.6556746, + "epoch": 0.6496702181633689, + "grad_norm": 5.1875, + "learning_rate": 8.073214636478186e-06, + "loss": 1.59134932, + "memory(GiB)": 107.26, + "step": 25610, + "train_speed(iter/s)": 1.636279 + }, + { + "acc": 0.659267, + "epoch": 0.6497970573313039, + "grad_norm": 8.1875, + "learning_rate": 8.072387408801506e-06, + "loss": 1.63330154, + "memory(GiB)": 107.26, + "step": 25615, + "train_speed(iter/s)": 1.636315 + }, + { + "acc": 0.64754276, + "epoch": 0.649923896499239, + "grad_norm": 5.125, + "learning_rate": 8.07156004598751e-06, + "loss": 1.61135445, + "memory(GiB)": 107.26, + "step": 25620, + "train_speed(iter/s)": 1.636351 + }, + { + "acc": 0.65480671, + "epoch": 0.650050735667174, + "grad_norm": 5.34375, + "learning_rate": 8.07073254807259e-06, + "loss": 1.59963512, + "memory(GiB)": 107.26, + "step": 25625, + "train_speed(iter/s)": 1.636387 + }, + { + "acc": 0.65050282, + "epoch": 0.6501775748351091, + "grad_norm": 4.96875, + "learning_rate": 8.069904915093144e-06, + "loss": 1.62776031, + "memory(GiB)": 107.26, + "step": 25630, + "train_speed(iter/s)": 1.636425 + }, + { + "acc": 0.6454741, + "epoch": 0.6503044140030442, + "grad_norm": 4.75, + "learning_rate": 8.069077147085571e-06, + "loss": 1.6130579, + "memory(GiB)": 107.26, + "step": 25635, + "train_speed(iter/s)": 1.636459 + }, + { + "acc": 0.66796341, + "epoch": 0.6504312531709792, + "grad_norm": 5.75, + "learning_rate": 8.068249244086283e-06, + "loss": 1.59014292, + "memory(GiB)": 107.26, + "step": 25640, + "train_speed(iter/s)": 1.636495 + }, + { + "acc": 0.64008842, + "epoch": 0.6505580923389143, + "grad_norm": 5.75, + "learning_rate": 8.067421206131696e-06, + "loss": 1.6800972, + "memory(GiB)": 107.26, + "step": 25645, + "train_speed(iter/s)": 1.636533 + }, + { + "acc": 0.6439374, + "epoch": 0.6506849315068494, + "grad_norm": 6.0, + "learning_rate": 8.06659303325823e-06, + "loss": 1.61153755, + "memory(GiB)": 107.26, + "step": 25650, + "train_speed(iter/s)": 1.636568 + }, + { + "acc": 0.65181694, + "epoch": 0.6508117706747844, + "grad_norm": 5.375, + "learning_rate": 8.06576472550231e-06, + "loss": 1.65965424, + "memory(GiB)": 107.26, + "step": 25655, + "train_speed(iter/s)": 1.636605 + }, + { + "acc": 0.65107126, + "epoch": 0.6509386098427195, + "grad_norm": 5.59375, + "learning_rate": 8.064936282900368e-06, + "loss": 1.64304657, + "memory(GiB)": 107.26, + "step": 25660, + "train_speed(iter/s)": 1.636639 + }, + { + "acc": 0.6511466, + "epoch": 0.6510654490106544, + "grad_norm": 5.96875, + "learning_rate": 8.064107705488846e-06, + "loss": 1.56734219, + "memory(GiB)": 107.26, + "step": 25665, + "train_speed(iter/s)": 1.636675 + }, + { + "acc": 0.65623379, + "epoch": 0.6511922881785895, + "grad_norm": 5.9375, + "learning_rate": 8.063278993304188e-06, + "loss": 1.55760822, + "memory(GiB)": 107.26, + "step": 25670, + "train_speed(iter/s)": 1.636709 + }, + { + "acc": 0.64906135, + "epoch": 0.6513191273465246, + "grad_norm": 7.34375, + "learning_rate": 8.06245014638284e-06, + "loss": 1.62841072, + "memory(GiB)": 107.26, + "step": 25675, + "train_speed(iter/s)": 1.636743 + }, + { + "acc": 0.66630692, + "epoch": 0.6514459665144596, + "grad_norm": 6.53125, + "learning_rate": 8.061621164761266e-06, + "loss": 1.53510332, + "memory(GiB)": 107.26, + "step": 25680, + "train_speed(iter/s)": 1.636779 + }, + { + "acc": 0.65046349, + "epoch": 0.6515728056823947, + "grad_norm": 5.4375, + "learning_rate": 8.06079204847592e-06, + "loss": 1.57576828, + "memory(GiB)": 107.26, + "step": 25685, + "train_speed(iter/s)": 1.636813 + }, + { + "acc": 0.65991106, + "epoch": 0.6516996448503298, + "grad_norm": 5.0625, + "learning_rate": 8.059962797563277e-06, + "loss": 1.58262768, + "memory(GiB)": 107.26, + "step": 25690, + "train_speed(iter/s)": 1.636849 + }, + { + "acc": 0.65574341, + "epoch": 0.6518264840182648, + "grad_norm": 4.59375, + "learning_rate": 8.059133412059808e-06, + "loss": 1.56862288, + "memory(GiB)": 107.26, + "step": 25695, + "train_speed(iter/s)": 1.636886 + }, + { + "acc": 0.65753093, + "epoch": 0.6519533231861999, + "grad_norm": 6.0, + "learning_rate": 8.058303892001993e-06, + "loss": 1.59334984, + "memory(GiB)": 107.26, + "step": 25700, + "train_speed(iter/s)": 1.636919 + }, + { + "acc": 0.65194035, + "epoch": 0.6520801623541349, + "grad_norm": 7.125, + "learning_rate": 8.057474237426318e-06, + "loss": 1.59372311, + "memory(GiB)": 107.26, + "step": 25705, + "train_speed(iter/s)": 1.636957 + }, + { + "acc": 0.6390759, + "epoch": 0.65220700152207, + "grad_norm": 5.46875, + "learning_rate": 8.056644448369275e-06, + "loss": 1.68950539, + "memory(GiB)": 107.26, + "step": 25710, + "train_speed(iter/s)": 1.636994 + }, + { + "acc": 0.64411707, + "epoch": 0.6523338406900051, + "grad_norm": 4.875, + "learning_rate": 8.055814524867364e-06, + "loss": 1.66304226, + "memory(GiB)": 107.26, + "step": 25715, + "train_speed(iter/s)": 1.637028 + }, + { + "acc": 0.64946504, + "epoch": 0.6524606798579401, + "grad_norm": 6.28125, + "learning_rate": 8.054984466957085e-06, + "loss": 1.67126904, + "memory(GiB)": 107.26, + "step": 25720, + "train_speed(iter/s)": 1.637063 + }, + { + "acc": 0.66729188, + "epoch": 0.6525875190258752, + "grad_norm": 5.375, + "learning_rate": 8.05415427467495e-06, + "loss": 1.47327223, + "memory(GiB)": 107.26, + "step": 25725, + "train_speed(iter/s)": 1.637101 + }, + { + "acc": 0.65841341, + "epoch": 0.6527143581938103, + "grad_norm": 5.1875, + "learning_rate": 8.053323948057477e-06, + "loss": 1.60801105, + "memory(GiB)": 107.26, + "step": 25730, + "train_speed(iter/s)": 1.637136 + }, + { + "acc": 0.64222345, + "epoch": 0.6528411973617453, + "grad_norm": 5.5625, + "learning_rate": 8.052493487141183e-06, + "loss": 1.64245872, + "memory(GiB)": 107.26, + "step": 25735, + "train_speed(iter/s)": 1.637169 + }, + { + "acc": 0.64860888, + "epoch": 0.6529680365296804, + "grad_norm": 5.09375, + "learning_rate": 8.051662891962594e-06, + "loss": 1.62688332, + "memory(GiB)": 107.26, + "step": 25740, + "train_speed(iter/s)": 1.637206 + }, + { + "acc": 0.65567026, + "epoch": 0.6530948756976154, + "grad_norm": 5.21875, + "learning_rate": 8.05083216255825e-06, + "loss": 1.61858711, + "memory(GiB)": 107.26, + "step": 25745, + "train_speed(iter/s)": 1.63724 + }, + { + "acc": 0.65512767, + "epoch": 0.6532217148655505, + "grad_norm": 5.0625, + "learning_rate": 8.050001298964685e-06, + "loss": 1.63614407, + "memory(GiB)": 107.26, + "step": 25750, + "train_speed(iter/s)": 1.637274 + }, + { + "acc": 0.66472425, + "epoch": 0.6533485540334856, + "grad_norm": 5.8125, + "learning_rate": 8.049170301218445e-06, + "loss": 1.62809258, + "memory(GiB)": 107.26, + "step": 25755, + "train_speed(iter/s)": 1.637309 + }, + { + "acc": 0.66269274, + "epoch": 0.6534753932014206, + "grad_norm": 6.84375, + "learning_rate": 8.048339169356085e-06, + "loss": 1.59124031, + "memory(GiB)": 107.26, + "step": 25760, + "train_speed(iter/s)": 1.637346 + }, + { + "acc": 0.67250175, + "epoch": 0.6536022323693557, + "grad_norm": 5.78125, + "learning_rate": 8.047507903414155e-06, + "loss": 1.52040024, + "memory(GiB)": 107.26, + "step": 25765, + "train_speed(iter/s)": 1.637382 + }, + { + "acc": 0.67127681, + "epoch": 0.6537290715372908, + "grad_norm": 5.25, + "learning_rate": 8.046676503429222e-06, + "loss": 1.54010944, + "memory(GiB)": 107.26, + "step": 25770, + "train_speed(iter/s)": 1.637417 + }, + { + "acc": 0.65577064, + "epoch": 0.6538559107052258, + "grad_norm": 5.9375, + "learning_rate": 8.045844969437855e-06, + "loss": 1.5808691, + "memory(GiB)": 107.26, + "step": 25775, + "train_speed(iter/s)": 1.637452 + }, + { + "acc": 0.64445257, + "epoch": 0.6539827498731609, + "grad_norm": 7.03125, + "learning_rate": 8.045013301476625e-06, + "loss": 1.61710358, + "memory(GiB)": 107.26, + "step": 25780, + "train_speed(iter/s)": 1.637488 + }, + { + "acc": 0.6572897, + "epoch": 0.6541095890410958, + "grad_norm": 4.59375, + "learning_rate": 8.044181499582117e-06, + "loss": 1.61221962, + "memory(GiB)": 107.26, + "step": 25785, + "train_speed(iter/s)": 1.637526 + }, + { + "acc": 0.66239758, + "epoch": 0.6542364282090309, + "grad_norm": 5.59375, + "learning_rate": 8.043349563790917e-06, + "loss": 1.61270027, + "memory(GiB)": 107.26, + "step": 25790, + "train_speed(iter/s)": 1.63756 + }, + { + "acc": 0.65623088, + "epoch": 0.654363267376966, + "grad_norm": 5.28125, + "learning_rate": 8.042517494139612e-06, + "loss": 1.64167175, + "memory(GiB)": 107.26, + "step": 25795, + "train_speed(iter/s)": 1.637596 + }, + { + "acc": 0.65754857, + "epoch": 0.654490106544901, + "grad_norm": 4.90625, + "learning_rate": 8.041685290664806e-06, + "loss": 1.52577028, + "memory(GiB)": 107.26, + "step": 25800, + "train_speed(iter/s)": 1.63763 + }, + { + "acc": 0.64661393, + "epoch": 0.6546169457128361, + "grad_norm": 5.71875, + "learning_rate": 8.0408529534031e-06, + "loss": 1.60060444, + "memory(GiB)": 107.26, + "step": 25805, + "train_speed(iter/s)": 1.637665 + }, + { + "acc": 0.64958339, + "epoch": 0.6547437848807712, + "grad_norm": 4.90625, + "learning_rate": 8.040020482391105e-06, + "loss": 1.58185863, + "memory(GiB)": 107.26, + "step": 25810, + "train_speed(iter/s)": 1.637702 + }, + { + "acc": 0.68604307, + "epoch": 0.6548706240487062, + "grad_norm": 5.625, + "learning_rate": 8.039187877665435e-06, + "loss": 1.49172001, + "memory(GiB)": 107.26, + "step": 25815, + "train_speed(iter/s)": 1.637739 + }, + { + "acc": 0.66448898, + "epoch": 0.6549974632166413, + "grad_norm": 9.0, + "learning_rate": 8.038355139262716e-06, + "loss": 1.56033497, + "memory(GiB)": 107.26, + "step": 25820, + "train_speed(iter/s)": 1.637773 + }, + { + "acc": 0.66828508, + "epoch": 0.6551243023845763, + "grad_norm": 6.0625, + "learning_rate": 8.037522267219571e-06, + "loss": 1.57431402, + "memory(GiB)": 107.26, + "step": 25825, + "train_speed(iter/s)": 1.637806 + }, + { + "acc": 0.64303837, + "epoch": 0.6552511415525114, + "grad_norm": 7.1875, + "learning_rate": 8.036689261572636e-06, + "loss": 1.61256237, + "memory(GiB)": 107.26, + "step": 25830, + "train_speed(iter/s)": 1.637841 + }, + { + "acc": 0.64496369, + "epoch": 0.6553779807204465, + "grad_norm": 6.71875, + "learning_rate": 8.035856122358548e-06, + "loss": 1.66227551, + "memory(GiB)": 107.26, + "step": 25835, + "train_speed(iter/s)": 1.637877 + }, + { + "acc": 0.65122848, + "epoch": 0.6555048198883815, + "grad_norm": 5.21875, + "learning_rate": 8.035022849613954e-06, + "loss": 1.63981819, + "memory(GiB)": 107.26, + "step": 25840, + "train_speed(iter/s)": 1.637914 + }, + { + "acc": 0.65888414, + "epoch": 0.6556316590563166, + "grad_norm": 6.40625, + "learning_rate": 8.034189443375505e-06, + "loss": 1.64588718, + "memory(GiB)": 107.26, + "step": 25845, + "train_speed(iter/s)": 1.63795 + }, + { + "acc": 0.65115261, + "epoch": 0.6557584982242517, + "grad_norm": 5.34375, + "learning_rate": 8.033355903679858e-06, + "loss": 1.5615099, + "memory(GiB)": 107.26, + "step": 25850, + "train_speed(iter/s)": 1.637988 + }, + { + "acc": 0.66259556, + "epoch": 0.6558853373921867, + "grad_norm": 5.90625, + "learning_rate": 8.032522230563676e-06, + "loss": 1.61838951, + "memory(GiB)": 107.26, + "step": 25855, + "train_speed(iter/s)": 1.638023 + }, + { + "acc": 0.65130081, + "epoch": 0.6560121765601218, + "grad_norm": 6.65625, + "learning_rate": 8.031688424063625e-06, + "loss": 1.57806177, + "memory(GiB)": 107.26, + "step": 25860, + "train_speed(iter/s)": 1.638058 + }, + { + "acc": 0.65015593, + "epoch": 0.6561390157280568, + "grad_norm": 5.8125, + "learning_rate": 8.030854484216381e-06, + "loss": 1.62416039, + "memory(GiB)": 107.26, + "step": 25865, + "train_speed(iter/s)": 1.638095 + }, + { + "acc": 0.65527468, + "epoch": 0.6562658548959919, + "grad_norm": 4.78125, + "learning_rate": 8.030020411058627e-06, + "loss": 1.66755371, + "memory(GiB)": 107.26, + "step": 25870, + "train_speed(iter/s)": 1.63813 + }, + { + "acc": 0.66531949, + "epoch": 0.656392694063927, + "grad_norm": 5.875, + "learning_rate": 8.029186204627049e-06, + "loss": 1.57561426, + "memory(GiB)": 107.26, + "step": 25875, + "train_speed(iter/s)": 1.638166 + }, + { + "acc": 0.65105863, + "epoch": 0.656519533231862, + "grad_norm": 7.3125, + "learning_rate": 8.028351864958335e-06, + "loss": 1.59731588, + "memory(GiB)": 107.26, + "step": 25880, + "train_speed(iter/s)": 1.638203 + }, + { + "acc": 0.67144046, + "epoch": 0.6566463723997971, + "grad_norm": 6.40625, + "learning_rate": 8.027517392089185e-06, + "loss": 1.55755978, + "memory(GiB)": 107.26, + "step": 25885, + "train_speed(iter/s)": 1.63824 + }, + { + "acc": 0.65607567, + "epoch": 0.6567732115677322, + "grad_norm": 6.0, + "learning_rate": 8.026682786056304e-06, + "loss": 1.62314148, + "memory(GiB)": 107.26, + "step": 25890, + "train_speed(iter/s)": 1.638278 + }, + { + "acc": 0.63641014, + "epoch": 0.6569000507356672, + "grad_norm": 4.5625, + "learning_rate": 8.025848046896401e-06, + "loss": 1.69427681, + "memory(GiB)": 107.26, + "step": 25895, + "train_speed(iter/s)": 1.638316 + }, + { + "acc": 0.65331173, + "epoch": 0.6570268899036023, + "grad_norm": 5.75, + "learning_rate": 8.02501317464619e-06, + "loss": 1.61411819, + "memory(GiB)": 107.26, + "step": 25900, + "train_speed(iter/s)": 1.638353 + }, + { + "acc": 0.65533915, + "epoch": 0.6571537290715372, + "grad_norm": 7.125, + "learning_rate": 8.024178169342396e-06, + "loss": 1.6569809, + "memory(GiB)": 107.26, + "step": 25905, + "train_speed(iter/s)": 1.638388 + }, + { + "acc": 0.65819635, + "epoch": 0.6572805682394723, + "grad_norm": 5.8125, + "learning_rate": 8.023343031021744e-06, + "loss": 1.6106245, + "memory(GiB)": 107.26, + "step": 25910, + "train_speed(iter/s)": 1.638423 + }, + { + "acc": 0.64963164, + "epoch": 0.6574074074074074, + "grad_norm": 5.0, + "learning_rate": 8.022507759720966e-06, + "loss": 1.5897171, + "memory(GiB)": 107.26, + "step": 25915, + "train_speed(iter/s)": 1.638461 + }, + { + "acc": 0.65574932, + "epoch": 0.6575342465753424, + "grad_norm": 5.6875, + "learning_rate": 8.021672355476802e-06, + "loss": 1.63314705, + "memory(GiB)": 107.26, + "step": 25920, + "train_speed(iter/s)": 1.638497 + }, + { + "acc": 0.6544919, + "epoch": 0.6576610857432775, + "grad_norm": 5.3125, + "learning_rate": 8.020836818325997e-06, + "loss": 1.58537827, + "memory(GiB)": 107.26, + "step": 25925, + "train_speed(iter/s)": 1.638533 + }, + { + "acc": 0.65422497, + "epoch": 0.6577879249112126, + "grad_norm": 7.71875, + "learning_rate": 8.020001148305304e-06, + "loss": 1.59328852, + "memory(GiB)": 107.26, + "step": 25930, + "train_speed(iter/s)": 1.638568 + }, + { + "acc": 0.67387271, + "epoch": 0.6579147640791476, + "grad_norm": 5.9375, + "learning_rate": 8.019165345451475e-06, + "loss": 1.56165943, + "memory(GiB)": 107.26, + "step": 25935, + "train_speed(iter/s)": 1.638602 + }, + { + "acc": 0.66296759, + "epoch": 0.6580416032470827, + "grad_norm": 6.84375, + "learning_rate": 8.018329409801276e-06, + "loss": 1.59441996, + "memory(GiB)": 107.26, + "step": 25940, + "train_speed(iter/s)": 1.63864 + }, + { + "acc": 0.65683632, + "epoch": 0.6581684424150177, + "grad_norm": 8.5625, + "learning_rate": 8.017493341391471e-06, + "loss": 1.58436546, + "memory(GiB)": 107.26, + "step": 25945, + "train_speed(iter/s)": 1.638674 + }, + { + "acc": 0.65543962, + "epoch": 0.6582952815829528, + "grad_norm": 5.28125, + "learning_rate": 8.016657140258839e-06, + "loss": 1.5694334, + "memory(GiB)": 107.26, + "step": 25950, + "train_speed(iter/s)": 1.638709 + }, + { + "acc": 0.65649805, + "epoch": 0.6584221207508879, + "grad_norm": 5.84375, + "learning_rate": 8.015820806440157e-06, + "loss": 1.62765369, + "memory(GiB)": 107.26, + "step": 25955, + "train_speed(iter/s)": 1.638746 + }, + { + "acc": 0.63261914, + "epoch": 0.6585489599188229, + "grad_norm": 5.875, + "learning_rate": 8.014984339972211e-06, + "loss": 1.62459164, + "memory(GiB)": 107.26, + "step": 25960, + "train_speed(iter/s)": 1.638782 + }, + { + "acc": 0.65641041, + "epoch": 0.658675799086758, + "grad_norm": 5.21875, + "learning_rate": 8.014147740891793e-06, + "loss": 1.61278076, + "memory(GiB)": 107.26, + "step": 25965, + "train_speed(iter/s)": 1.638816 + }, + { + "acc": 0.65521011, + "epoch": 0.6588026382546931, + "grad_norm": 7.0, + "learning_rate": 8.0133110092357e-06, + "loss": 1.56442394, + "memory(GiB)": 107.26, + "step": 25970, + "train_speed(iter/s)": 1.638852 + }, + { + "acc": 0.66287298, + "epoch": 0.6589294774226281, + "grad_norm": 5.5625, + "learning_rate": 8.012474145040737e-06, + "loss": 1.66549492, + "memory(GiB)": 107.26, + "step": 25975, + "train_speed(iter/s)": 1.638888 + }, + { + "acc": 0.65622835, + "epoch": 0.6590563165905632, + "grad_norm": 5.71875, + "learning_rate": 8.01163714834371e-06, + "loss": 1.62561264, + "memory(GiB)": 107.26, + "step": 25980, + "train_speed(iter/s)": 1.638921 + }, + { + "acc": 0.65145273, + "epoch": 0.6591831557584982, + "grad_norm": 6.34375, + "learning_rate": 8.010800019181433e-06, + "loss": 1.68249474, + "memory(GiB)": 107.26, + "step": 25985, + "train_speed(iter/s)": 1.638959 + }, + { + "acc": 0.64840899, + "epoch": 0.6593099949264333, + "grad_norm": 5.84375, + "learning_rate": 8.009962757590732e-06, + "loss": 1.6307621, + "memory(GiB)": 107.26, + "step": 25990, + "train_speed(iter/s)": 1.638993 + }, + { + "acc": 0.65632353, + "epoch": 0.6594368340943684, + "grad_norm": 5.09375, + "learning_rate": 8.00912536360843e-06, + "loss": 1.6047554, + "memory(GiB)": 107.26, + "step": 25995, + "train_speed(iter/s)": 1.63903 + }, + { + "acc": 0.65990715, + "epoch": 0.6595636732623034, + "grad_norm": 6.21875, + "learning_rate": 8.008287837271359e-06, + "loss": 1.56774426, + "memory(GiB)": 107.26, + "step": 26000, + "train_speed(iter/s)": 1.639064 + }, + { + "epoch": 0.6595636732623034, + "eval_acc": 0.6449340826175665, + "eval_loss": 1.5801098346710205, + "eval_runtime": 58.8201, + "eval_samples_per_second": 108.296, + "eval_steps_per_second": 27.083, + "step": 26000 + }, + { + "acc": 0.66742239, + "epoch": 0.6596905124302385, + "grad_norm": 5.8125, + "learning_rate": 8.007450178616356e-06, + "loss": 1.5487195, + "memory(GiB)": 107.26, + "step": 26005, + "train_speed(iter/s)": 1.632609 + }, + { + "acc": 0.64972801, + "epoch": 0.6598173515981736, + "grad_norm": 5.46875, + "learning_rate": 8.00661238768027e-06, + "loss": 1.59287224, + "memory(GiB)": 107.26, + "step": 26010, + "train_speed(iter/s)": 1.632642 + }, + { + "acc": 0.65350962, + "epoch": 0.6599441907661086, + "grad_norm": 6.25, + "learning_rate": 8.005774464499947e-06, + "loss": 1.6239933, + "memory(GiB)": 107.26, + "step": 26015, + "train_speed(iter/s)": 1.632676 + }, + { + "acc": 0.63990555, + "epoch": 0.6600710299340437, + "grad_norm": 5.15625, + "learning_rate": 8.004936409112243e-06, + "loss": 1.63208122, + "memory(GiB)": 107.26, + "step": 26020, + "train_speed(iter/s)": 1.632711 + }, + { + "acc": 0.65411482, + "epoch": 0.6601978691019786, + "grad_norm": 5.09375, + "learning_rate": 8.004098221554018e-06, + "loss": 1.61860123, + "memory(GiB)": 107.26, + "step": 26025, + "train_speed(iter/s)": 1.632743 + }, + { + "acc": 0.6635293, + "epoch": 0.6603247082699137, + "grad_norm": 4.5625, + "learning_rate": 8.003259901862143e-06, + "loss": 1.59302425, + "memory(GiB)": 107.26, + "step": 26030, + "train_speed(iter/s)": 1.632776 + }, + { + "acc": 0.64447856, + "epoch": 0.6604515474378488, + "grad_norm": 5.46875, + "learning_rate": 8.002421450073488e-06, + "loss": 1.57079191, + "memory(GiB)": 107.26, + "step": 26035, + "train_speed(iter/s)": 1.63281 + }, + { + "acc": 0.65432482, + "epoch": 0.6605783866057838, + "grad_norm": 6.15625, + "learning_rate": 8.001582866224932e-06, + "loss": 1.56841364, + "memory(GiB)": 107.26, + "step": 26040, + "train_speed(iter/s)": 1.632844 + }, + { + "acc": 0.66357298, + "epoch": 0.6607052257737189, + "grad_norm": 6.21875, + "learning_rate": 8.000744150353362e-06, + "loss": 1.63216171, + "memory(GiB)": 107.26, + "step": 26045, + "train_speed(iter/s)": 1.632876 + }, + { + "acc": 0.65270128, + "epoch": 0.660832064941654, + "grad_norm": 5.46875, + "learning_rate": 7.999905302495667e-06, + "loss": 1.64064026, + "memory(GiB)": 107.26, + "step": 26050, + "train_speed(iter/s)": 1.632909 + }, + { + "acc": 0.66189537, + "epoch": 0.660958904109589, + "grad_norm": 6.03125, + "learning_rate": 7.999066322688743e-06, + "loss": 1.59782085, + "memory(GiB)": 107.26, + "step": 26055, + "train_speed(iter/s)": 1.632944 + }, + { + "acc": 0.6538331, + "epoch": 0.6610857432775241, + "grad_norm": 6.15625, + "learning_rate": 7.998227210969491e-06, + "loss": 1.67116566, + "memory(GiB)": 107.26, + "step": 26060, + "train_speed(iter/s)": 1.632976 + }, + { + "acc": 0.66555872, + "epoch": 0.6612125824454591, + "grad_norm": 5.5, + "learning_rate": 7.997387967374821e-06, + "loss": 1.56799259, + "memory(GiB)": 107.26, + "step": 26065, + "train_speed(iter/s)": 1.63301 + }, + { + "acc": 0.6442678, + "epoch": 0.6613394216133942, + "grad_norm": 6.71875, + "learning_rate": 7.996548591941647e-06, + "loss": 1.68439751, + "memory(GiB)": 107.26, + "step": 26070, + "train_speed(iter/s)": 1.633044 + }, + { + "acc": 0.65063286, + "epoch": 0.6614662607813293, + "grad_norm": 6.75, + "learning_rate": 7.995709084706884e-06, + "loss": 1.59219618, + "memory(GiB)": 107.26, + "step": 26075, + "train_speed(iter/s)": 1.633077 + }, + { + "acc": 0.63879976, + "epoch": 0.6615930999492643, + "grad_norm": 5.5, + "learning_rate": 7.994869445707463e-06, + "loss": 1.64366302, + "memory(GiB)": 107.26, + "step": 26080, + "train_speed(iter/s)": 1.63311 + }, + { + "acc": 0.65073891, + "epoch": 0.6617199391171994, + "grad_norm": 5.4375, + "learning_rate": 7.994029674980313e-06, + "loss": 1.59470654, + "memory(GiB)": 107.26, + "step": 26085, + "train_speed(iter/s)": 1.633143 + }, + { + "acc": 0.66801548, + "epoch": 0.6618467782851345, + "grad_norm": 5.0, + "learning_rate": 7.99318977256237e-06, + "loss": 1.56970501, + "memory(GiB)": 107.26, + "step": 26090, + "train_speed(iter/s)": 1.633177 + }, + { + "acc": 0.64988208, + "epoch": 0.6619736174530695, + "grad_norm": 4.3125, + "learning_rate": 7.992349738490576e-06, + "loss": 1.63021088, + "memory(GiB)": 107.26, + "step": 26095, + "train_speed(iter/s)": 1.633204 + }, + { + "acc": 0.66828499, + "epoch": 0.6621004566210046, + "grad_norm": 6.40625, + "learning_rate": 7.991509572801883e-06, + "loss": 1.58921242, + "memory(GiB)": 107.26, + "step": 26100, + "train_speed(iter/s)": 1.633234 + }, + { + "acc": 0.65110073, + "epoch": 0.6622272957889396, + "grad_norm": 4.90625, + "learning_rate": 7.990669275533241e-06, + "loss": 1.6571209, + "memory(GiB)": 107.26, + "step": 26105, + "train_speed(iter/s)": 1.633266 + }, + { + "acc": 0.64173956, + "epoch": 0.6623541349568747, + "grad_norm": 5.8125, + "learning_rate": 7.989828846721613e-06, + "loss": 1.65434036, + "memory(GiB)": 107.26, + "step": 26110, + "train_speed(iter/s)": 1.633301 + }, + { + "acc": 0.64650497, + "epoch": 0.6624809741248098, + "grad_norm": 5.1875, + "learning_rate": 7.98898828640396e-06, + "loss": 1.60591927, + "memory(GiB)": 107.26, + "step": 26115, + "train_speed(iter/s)": 1.633327 + }, + { + "acc": 0.65101547, + "epoch": 0.6626078132927448, + "grad_norm": 6.03125, + "learning_rate": 7.988147594617262e-06, + "loss": 1.61112423, + "memory(GiB)": 107.26, + "step": 26120, + "train_speed(iter/s)": 1.633361 + }, + { + "acc": 0.65958891, + "epoch": 0.6627346524606799, + "grad_norm": 4.96875, + "learning_rate": 7.987306771398489e-06, + "loss": 1.55805855, + "memory(GiB)": 107.26, + "step": 26125, + "train_speed(iter/s)": 1.633393 + }, + { + "acc": 0.67060671, + "epoch": 0.662861491628615, + "grad_norm": 5.9375, + "learning_rate": 7.986465816784628e-06, + "loss": 1.60093422, + "memory(GiB)": 107.26, + "step": 26130, + "train_speed(iter/s)": 1.633426 + }, + { + "acc": 0.65147324, + "epoch": 0.66298833079655, + "grad_norm": 5.90625, + "learning_rate": 7.985624730812667e-06, + "loss": 1.61662483, + "memory(GiB)": 107.26, + "step": 26135, + "train_speed(iter/s)": 1.633459 + }, + { + "acc": 0.64554853, + "epoch": 0.663115169964485, + "grad_norm": 6.21875, + "learning_rate": 7.984783513519601e-06, + "loss": 1.64859066, + "memory(GiB)": 107.26, + "step": 26140, + "train_speed(iter/s)": 1.633494 + }, + { + "acc": 0.64900913, + "epoch": 0.66324200913242, + "grad_norm": 6.0625, + "learning_rate": 7.98394216494243e-06, + "loss": 1.68912888, + "memory(GiB)": 107.26, + "step": 26145, + "train_speed(iter/s)": 1.633526 + }, + { + "acc": 0.65496206, + "epoch": 0.6633688483003551, + "grad_norm": 5.875, + "learning_rate": 7.983100685118157e-06, + "loss": 1.65027428, + "memory(GiB)": 107.26, + "step": 26150, + "train_speed(iter/s)": 1.633558 + }, + { + "acc": 0.6627738, + "epoch": 0.6634956874682902, + "grad_norm": 5.21875, + "learning_rate": 7.9822590740838e-06, + "loss": 1.53033924, + "memory(GiB)": 107.26, + "step": 26155, + "train_speed(iter/s)": 1.633593 + }, + { + "acc": 0.65780592, + "epoch": 0.6636225266362252, + "grad_norm": 5.21875, + "learning_rate": 7.981417331876373e-06, + "loss": 1.55905523, + "memory(GiB)": 107.26, + "step": 26160, + "train_speed(iter/s)": 1.633624 + }, + { + "acc": 0.65119009, + "epoch": 0.6637493658041603, + "grad_norm": 4.96875, + "learning_rate": 7.980575458532901e-06, + "loss": 1.65087109, + "memory(GiB)": 107.26, + "step": 26165, + "train_speed(iter/s)": 1.633658 + }, + { + "acc": 0.65455079, + "epoch": 0.6638762049720954, + "grad_norm": 5.34375, + "learning_rate": 7.979733454090415e-06, + "loss": 1.60217419, + "memory(GiB)": 107.26, + "step": 26170, + "train_speed(iter/s)": 1.633693 + }, + { + "acc": 0.64061966, + "epoch": 0.6640030441400304, + "grad_norm": 6.375, + "learning_rate": 7.978891318585947e-06, + "loss": 1.59168797, + "memory(GiB)": 107.26, + "step": 26175, + "train_speed(iter/s)": 1.633726 + }, + { + "acc": 0.65740409, + "epoch": 0.6641298833079655, + "grad_norm": 6.3125, + "learning_rate": 7.978049052056537e-06, + "loss": 1.62550316, + "memory(GiB)": 107.26, + "step": 26180, + "train_speed(iter/s)": 1.633761 + }, + { + "acc": 0.6493063, + "epoch": 0.6642567224759005, + "grad_norm": 6.0625, + "learning_rate": 7.977206654539235e-06, + "loss": 1.60285969, + "memory(GiB)": 107.26, + "step": 26185, + "train_speed(iter/s)": 1.633797 + }, + { + "acc": 0.64898381, + "epoch": 0.6643835616438356, + "grad_norm": 5.21875, + "learning_rate": 7.976364126071092e-06, + "loss": 1.65029259, + "memory(GiB)": 107.26, + "step": 26190, + "train_speed(iter/s)": 1.633833 + }, + { + "acc": 0.62990637, + "epoch": 0.6645104008117707, + "grad_norm": 6.03125, + "learning_rate": 7.975521466689166e-06, + "loss": 1.7216938, + "memory(GiB)": 107.26, + "step": 26195, + "train_speed(iter/s)": 1.633869 + }, + { + "acc": 0.64958706, + "epoch": 0.6646372399797057, + "grad_norm": 7.3125, + "learning_rate": 7.974678676430523e-06, + "loss": 1.59830379, + "memory(GiB)": 107.26, + "step": 26200, + "train_speed(iter/s)": 1.633902 + }, + { + "acc": 0.66740875, + "epoch": 0.6647640791476408, + "grad_norm": 5.46875, + "learning_rate": 7.97383575533223e-06, + "loss": 1.53848362, + "memory(GiB)": 107.26, + "step": 26205, + "train_speed(iter/s)": 1.633936 + }, + { + "acc": 0.64491701, + "epoch": 0.6648909183155759, + "grad_norm": 5.4375, + "learning_rate": 7.972992703431362e-06, + "loss": 1.61117287, + "memory(GiB)": 107.26, + "step": 26210, + "train_speed(iter/s)": 1.63397 + }, + { + "acc": 0.65159121, + "epoch": 0.6650177574835109, + "grad_norm": 5.8125, + "learning_rate": 7.972149520765e-06, + "loss": 1.62900295, + "memory(GiB)": 107.26, + "step": 26215, + "train_speed(iter/s)": 1.634006 + }, + { + "acc": 0.64747009, + "epoch": 0.665144596651446, + "grad_norm": 5.84375, + "learning_rate": 7.971306207370236e-06, + "loss": 1.62082176, + "memory(GiB)": 107.26, + "step": 26220, + "train_speed(iter/s)": 1.63404 + }, + { + "acc": 0.65735497, + "epoch": 0.665271435819381, + "grad_norm": 5.4375, + "learning_rate": 7.970462763284157e-06, + "loss": 1.59599295, + "memory(GiB)": 107.26, + "step": 26225, + "train_speed(iter/s)": 1.634074 + }, + { + "acc": 0.65048742, + "epoch": 0.6653982749873161, + "grad_norm": 5.6875, + "learning_rate": 7.969619188543865e-06, + "loss": 1.66062508, + "memory(GiB)": 107.26, + "step": 26230, + "train_speed(iter/s)": 1.63411 + }, + { + "acc": 0.65688505, + "epoch": 0.6655251141552512, + "grad_norm": 6.25, + "learning_rate": 7.968775483186462e-06, + "loss": 1.5941186, + "memory(GiB)": 107.26, + "step": 26235, + "train_speed(iter/s)": 1.634142 + }, + { + "acc": 0.67182746, + "epoch": 0.6656519533231862, + "grad_norm": 5.46875, + "learning_rate": 7.967931647249058e-06, + "loss": 1.54654465, + "memory(GiB)": 107.26, + "step": 26240, + "train_speed(iter/s)": 1.634176 + }, + { + "acc": 0.6422019, + "epoch": 0.6657787924911213, + "grad_norm": 6.40625, + "learning_rate": 7.967087680768768e-06, + "loss": 1.609688, + "memory(GiB)": 107.26, + "step": 26245, + "train_speed(iter/s)": 1.63421 + }, + { + "acc": 0.65624475, + "epoch": 0.6659056316590564, + "grad_norm": 5.8125, + "learning_rate": 7.966243583782718e-06, + "loss": 1.6517086, + "memory(GiB)": 107.26, + "step": 26250, + "train_speed(iter/s)": 1.634245 + }, + { + "acc": 0.64523468, + "epoch": 0.6660324708269914, + "grad_norm": 5.25, + "learning_rate": 7.96539935632803e-06, + "loss": 1.62207298, + "memory(GiB)": 107.26, + "step": 26255, + "train_speed(iter/s)": 1.63428 + }, + { + "acc": 0.65360136, + "epoch": 0.6661593099949265, + "grad_norm": 5.75, + "learning_rate": 7.964554998441839e-06, + "loss": 1.59817991, + "memory(GiB)": 107.26, + "step": 26260, + "train_speed(iter/s)": 1.634315 + }, + { + "acc": 0.66992517, + "epoch": 0.6662861491628614, + "grad_norm": 5.4375, + "learning_rate": 7.963710510161282e-06, + "loss": 1.57187328, + "memory(GiB)": 107.26, + "step": 26265, + "train_speed(iter/s)": 1.634349 + }, + { + "acc": 0.67026062, + "epoch": 0.6664129883307965, + "grad_norm": 5.21875, + "learning_rate": 7.962865891523508e-06, + "loss": 1.55525513, + "memory(GiB)": 107.26, + "step": 26270, + "train_speed(iter/s)": 1.634382 + }, + { + "acc": 0.65087891, + "epoch": 0.6665398274987316, + "grad_norm": 6.0625, + "learning_rate": 7.96202114256566e-06, + "loss": 1.70106716, + "memory(GiB)": 107.26, + "step": 26275, + "train_speed(iter/s)": 1.634417 + }, + { + "acc": 0.65026321, + "epoch": 0.6666666666666666, + "grad_norm": 5.40625, + "learning_rate": 7.961176263324902e-06, + "loss": 1.60736542, + "memory(GiB)": 107.26, + "step": 26280, + "train_speed(iter/s)": 1.63445 + }, + { + "acc": 0.64772739, + "epoch": 0.6667935058346017, + "grad_norm": 5.21875, + "learning_rate": 7.960331253838387e-06, + "loss": 1.59110584, + "memory(GiB)": 107.26, + "step": 26285, + "train_speed(iter/s)": 1.634483 + }, + { + "acc": 0.64581356, + "epoch": 0.6669203450025368, + "grad_norm": 4.75, + "learning_rate": 7.95948611414329e-06, + "loss": 1.58659763, + "memory(GiB)": 107.26, + "step": 26290, + "train_speed(iter/s)": 1.634518 + }, + { + "acc": 0.65254569, + "epoch": 0.6670471841704718, + "grad_norm": 6.21875, + "learning_rate": 7.958640844276776e-06, + "loss": 1.65601578, + "memory(GiB)": 107.26, + "step": 26295, + "train_speed(iter/s)": 1.634553 + }, + { + "acc": 0.64018664, + "epoch": 0.6671740233384069, + "grad_norm": 5.25, + "learning_rate": 7.957795444276033e-06, + "loss": 1.65617447, + "memory(GiB)": 107.26, + "step": 26300, + "train_speed(iter/s)": 1.634587 + }, + { + "acc": 0.65503931, + "epoch": 0.6673008625063419, + "grad_norm": 6.46875, + "learning_rate": 7.956949914178239e-06, + "loss": 1.56485348, + "memory(GiB)": 107.26, + "step": 26305, + "train_speed(iter/s)": 1.634622 + }, + { + "acc": 0.65427999, + "epoch": 0.667427701674277, + "grad_norm": 6.5625, + "learning_rate": 7.956104254020587e-06, + "loss": 1.56067181, + "memory(GiB)": 107.26, + "step": 26310, + "train_speed(iter/s)": 1.634658 + }, + { + "acc": 0.65704389, + "epoch": 0.6675545408422121, + "grad_norm": 5.15625, + "learning_rate": 7.95525846384027e-06, + "loss": 1.59198112, + "memory(GiB)": 107.26, + "step": 26315, + "train_speed(iter/s)": 1.634692 + }, + { + "acc": 0.66022663, + "epoch": 0.6676813800101471, + "grad_norm": 4.5625, + "learning_rate": 7.954412543674493e-06, + "loss": 1.55944347, + "memory(GiB)": 107.26, + "step": 26320, + "train_speed(iter/s)": 1.634727 + }, + { + "acc": 0.65044961, + "epoch": 0.6678082191780822, + "grad_norm": 6.3125, + "learning_rate": 7.95356649356046e-06, + "loss": 1.6676157, + "memory(GiB)": 107.26, + "step": 26325, + "train_speed(iter/s)": 1.634763 + }, + { + "acc": 0.63911214, + "epoch": 0.6679350583460173, + "grad_norm": 6.53125, + "learning_rate": 7.952720313535387e-06, + "loss": 1.67508221, + "memory(GiB)": 107.26, + "step": 26330, + "train_speed(iter/s)": 1.634799 + }, + { + "acc": 0.6503706, + "epoch": 0.6680618975139523, + "grad_norm": 5.59375, + "learning_rate": 7.951874003636492e-06, + "loss": 1.67283287, + "memory(GiB)": 107.26, + "step": 26335, + "train_speed(iter/s)": 1.634834 + }, + { + "acc": 0.66250377, + "epoch": 0.6681887366818874, + "grad_norm": 4.65625, + "learning_rate": 7.951027563901e-06, + "loss": 1.51327457, + "memory(GiB)": 107.26, + "step": 26340, + "train_speed(iter/s)": 1.634868 + }, + { + "acc": 0.64156828, + "epoch": 0.6683155758498224, + "grad_norm": 7.125, + "learning_rate": 7.950180994366138e-06, + "loss": 1.65698223, + "memory(GiB)": 107.26, + "step": 26345, + "train_speed(iter/s)": 1.634904 + }, + { + "acc": 0.67646832, + "epoch": 0.6684424150177575, + "grad_norm": 5.75, + "learning_rate": 7.949334295069147e-06, + "loss": 1.50760193, + "memory(GiB)": 107.26, + "step": 26350, + "train_speed(iter/s)": 1.634939 + }, + { + "acc": 0.64608178, + "epoch": 0.6685692541856926, + "grad_norm": 6.8125, + "learning_rate": 7.948487466047263e-06, + "loss": 1.66705494, + "memory(GiB)": 107.26, + "step": 26355, + "train_speed(iter/s)": 1.634974 + }, + { + "acc": 0.6460598, + "epoch": 0.6686960933536276, + "grad_norm": 7.15625, + "learning_rate": 7.947640507337737e-06, + "loss": 1.59428129, + "memory(GiB)": 107.26, + "step": 26360, + "train_speed(iter/s)": 1.635008 + }, + { + "acc": 0.64024282, + "epoch": 0.6688229325215627, + "grad_norm": 5.5625, + "learning_rate": 7.946793418977821e-06, + "loss": 1.61949196, + "memory(GiB)": 107.26, + "step": 26365, + "train_speed(iter/s)": 1.635041 + }, + { + "acc": 0.63931704, + "epoch": 0.6689497716894978, + "grad_norm": 5.375, + "learning_rate": 7.945946201004775e-06, + "loss": 1.65089874, + "memory(GiB)": 107.26, + "step": 26370, + "train_speed(iter/s)": 1.635073 + }, + { + "acc": 0.64310131, + "epoch": 0.6690766108574328, + "grad_norm": 5.46875, + "learning_rate": 7.945098853455862e-06, + "loss": 1.64254761, + "memory(GiB)": 107.26, + "step": 26375, + "train_speed(iter/s)": 1.635109 + }, + { + "acc": 0.66963959, + "epoch": 0.6692034500253679, + "grad_norm": 4.40625, + "learning_rate": 7.944251376368352e-06, + "loss": 1.54499846, + "memory(GiB)": 107.26, + "step": 26380, + "train_speed(iter/s)": 1.635142 + }, + { + "acc": 0.64572573, + "epoch": 0.6693302891933028, + "grad_norm": 5.59375, + "learning_rate": 7.943403769779523e-06, + "loss": 1.67245369, + "memory(GiB)": 107.26, + "step": 26385, + "train_speed(iter/s)": 1.635175 + }, + { + "acc": 0.65155425, + "epoch": 0.6694571283612379, + "grad_norm": 6.28125, + "learning_rate": 7.942556033726654e-06, + "loss": 1.6684967, + "memory(GiB)": 107.26, + "step": 26390, + "train_speed(iter/s)": 1.635211 + }, + { + "acc": 0.66894259, + "epoch": 0.669583967529173, + "grad_norm": 5.25, + "learning_rate": 7.941708168247033e-06, + "loss": 1.61360855, + "memory(GiB)": 107.26, + "step": 26395, + "train_speed(iter/s)": 1.635246 + }, + { + "acc": 0.66247182, + "epoch": 0.669710806697108, + "grad_norm": 5.53125, + "learning_rate": 7.940860173377952e-06, + "loss": 1.6089695, + "memory(GiB)": 107.26, + "step": 26400, + "train_speed(iter/s)": 1.63528 + }, + { + "acc": 0.65205727, + "epoch": 0.6698376458650431, + "grad_norm": 5.8125, + "learning_rate": 7.940012049156711e-06, + "loss": 1.60652428, + "memory(GiB)": 107.26, + "step": 26405, + "train_speed(iter/s)": 1.635314 + }, + { + "acc": 0.66205568, + "epoch": 0.6699644850329782, + "grad_norm": 5.0625, + "learning_rate": 7.939163795620614e-06, + "loss": 1.60997677, + "memory(GiB)": 107.26, + "step": 26410, + "train_speed(iter/s)": 1.635348 + }, + { + "acc": 0.64853101, + "epoch": 0.6700913242009132, + "grad_norm": 5.71875, + "learning_rate": 7.938315412806971e-06, + "loss": 1.64800568, + "memory(GiB)": 107.26, + "step": 26415, + "train_speed(iter/s)": 1.635381 + }, + { + "acc": 0.67020588, + "epoch": 0.6702181633688483, + "grad_norm": 6.5, + "learning_rate": 7.937466900753098e-06, + "loss": 1.54874592, + "memory(GiB)": 107.26, + "step": 26420, + "train_speed(iter/s)": 1.635416 + }, + { + "acc": 0.64904571, + "epoch": 0.6703450025367833, + "grad_norm": 7.03125, + "learning_rate": 7.936618259496316e-06, + "loss": 1.64735374, + "memory(GiB)": 107.26, + "step": 26425, + "train_speed(iter/s)": 1.635452 + }, + { + "acc": 0.6575202, + "epoch": 0.6704718417047184, + "grad_norm": 5.5625, + "learning_rate": 7.935769489073952e-06, + "loss": 1.58822384, + "memory(GiB)": 107.26, + "step": 26430, + "train_speed(iter/s)": 1.635488 + }, + { + "acc": 0.66376667, + "epoch": 0.6705986808726535, + "grad_norm": 5.3125, + "learning_rate": 7.934920589523336e-06, + "loss": 1.52993689, + "memory(GiB)": 107.26, + "step": 26435, + "train_speed(iter/s)": 1.635521 + }, + { + "acc": 0.64739075, + "epoch": 0.6707255200405885, + "grad_norm": 7.71875, + "learning_rate": 7.934071560881812e-06, + "loss": 1.64702415, + "memory(GiB)": 107.26, + "step": 26440, + "train_speed(iter/s)": 1.635554 + }, + { + "acc": 0.66285353, + "epoch": 0.6708523592085236, + "grad_norm": 5.28125, + "learning_rate": 7.93322240318672e-06, + "loss": 1.57003212, + "memory(GiB)": 107.26, + "step": 26445, + "train_speed(iter/s)": 1.635584 + }, + { + "acc": 0.65634179, + "epoch": 0.6709791983764587, + "grad_norm": 5.0, + "learning_rate": 7.93237311647541e-06, + "loss": 1.65199566, + "memory(GiB)": 107.26, + "step": 26450, + "train_speed(iter/s)": 1.635617 + }, + { + "acc": 0.66775026, + "epoch": 0.6711060375443937, + "grad_norm": 5.5625, + "learning_rate": 7.93152370078524e-06, + "loss": 1.5171627, + "memory(GiB)": 107.26, + "step": 26455, + "train_speed(iter/s)": 1.635652 + }, + { + "acc": 0.65419035, + "epoch": 0.6712328767123288, + "grad_norm": 4.75, + "learning_rate": 7.930674156153569e-06, + "loss": 1.57321243, + "memory(GiB)": 107.26, + "step": 26460, + "train_speed(iter/s)": 1.635684 + }, + { + "acc": 0.65150967, + "epoch": 0.6713597158802638, + "grad_norm": 5.1875, + "learning_rate": 7.929824482617763e-06, + "loss": 1.62609501, + "memory(GiB)": 107.26, + "step": 26465, + "train_speed(iter/s)": 1.635716 + }, + { + "acc": 0.64687262, + "epoch": 0.6714865550481989, + "grad_norm": 6.5625, + "learning_rate": 7.928974680215196e-06, + "loss": 1.69332085, + "memory(GiB)": 107.26, + "step": 26470, + "train_speed(iter/s)": 1.635749 + }, + { + "acc": 0.64114003, + "epoch": 0.671613394216134, + "grad_norm": 5.21875, + "learning_rate": 7.928124748983244e-06, + "loss": 1.65573158, + "memory(GiB)": 107.26, + "step": 26475, + "train_speed(iter/s)": 1.635782 + }, + { + "acc": 0.64556017, + "epoch": 0.671740233384069, + "grad_norm": 6.59375, + "learning_rate": 7.927274688959294e-06, + "loss": 1.67613335, + "memory(GiB)": 107.26, + "step": 26480, + "train_speed(iter/s)": 1.635816 + }, + { + "acc": 0.64693079, + "epoch": 0.6718670725520041, + "grad_norm": 6.5625, + "learning_rate": 7.926424500180734e-06, + "loss": 1.66498871, + "memory(GiB)": 107.26, + "step": 26485, + "train_speed(iter/s)": 1.635848 + }, + { + "acc": 0.68129091, + "epoch": 0.6719939117199392, + "grad_norm": 5.8125, + "learning_rate": 7.92557418268496e-06, + "loss": 1.5222827, + "memory(GiB)": 107.26, + "step": 26490, + "train_speed(iter/s)": 1.63588 + }, + { + "acc": 0.65798445, + "epoch": 0.6721207508878742, + "grad_norm": 5.4375, + "learning_rate": 7.92472373650937e-06, + "loss": 1.58806095, + "memory(GiB)": 107.26, + "step": 26495, + "train_speed(iter/s)": 1.635915 + }, + { + "acc": 0.63552518, + "epoch": 0.6722475900558093, + "grad_norm": 5.1875, + "learning_rate": 7.923873161691373e-06, + "loss": 1.70570908, + "memory(GiB)": 107.26, + "step": 26500, + "train_speed(iter/s)": 1.635948 + }, + { + "acc": 0.66993885, + "epoch": 0.6723744292237442, + "grad_norm": 9.0, + "learning_rate": 7.923022458268379e-06, + "loss": 1.60077896, + "memory(GiB)": 107.26, + "step": 26505, + "train_speed(iter/s)": 1.635979 + }, + { + "acc": 0.66220317, + "epoch": 0.6725012683916793, + "grad_norm": 5.53125, + "learning_rate": 7.922171626277809e-06, + "loss": 1.60844975, + "memory(GiB)": 107.26, + "step": 26510, + "train_speed(iter/s)": 1.636015 + }, + { + "acc": 0.65502162, + "epoch": 0.6726281075596144, + "grad_norm": 4.90625, + "learning_rate": 7.921320665757081e-06, + "loss": 1.61531067, + "memory(GiB)": 107.26, + "step": 26515, + "train_speed(iter/s)": 1.636048 + }, + { + "acc": 0.65748725, + "epoch": 0.6727549467275494, + "grad_norm": 5.71875, + "learning_rate": 7.920469576743631e-06, + "loss": 1.575597, + "memory(GiB)": 107.26, + "step": 26520, + "train_speed(iter/s)": 1.636082 + }, + { + "acc": 0.65715456, + "epoch": 0.6728817858954845, + "grad_norm": 6.03125, + "learning_rate": 7.919618359274888e-06, + "loss": 1.62790604, + "memory(GiB)": 107.26, + "step": 26525, + "train_speed(iter/s)": 1.636118 + }, + { + "acc": 0.65933003, + "epoch": 0.6730086250634196, + "grad_norm": 6.125, + "learning_rate": 7.918767013388295e-06, + "loss": 1.62225952, + "memory(GiB)": 107.26, + "step": 26530, + "train_speed(iter/s)": 1.636149 + }, + { + "acc": 0.64483819, + "epoch": 0.6731354642313546, + "grad_norm": 5.875, + "learning_rate": 7.917915539121297e-06, + "loss": 1.64615746, + "memory(GiB)": 107.26, + "step": 26535, + "train_speed(iter/s)": 1.636185 + }, + { + "acc": 0.6288146, + "epoch": 0.6732623033992897, + "grad_norm": 5.25, + "learning_rate": 7.917063936511347e-06, + "loss": 1.7355484, + "memory(GiB)": 107.26, + "step": 26540, + "train_speed(iter/s)": 1.636215 + }, + { + "acc": 0.64846973, + "epoch": 0.6733891425672247, + "grad_norm": 6.5625, + "learning_rate": 7.9162122055959e-06, + "loss": 1.65554886, + "memory(GiB)": 107.26, + "step": 26545, + "train_speed(iter/s)": 1.636249 + }, + { + "acc": 0.66079869, + "epoch": 0.6735159817351598, + "grad_norm": 6.03125, + "learning_rate": 7.91536034641242e-06, + "loss": 1.60821095, + "memory(GiB)": 107.26, + "step": 26550, + "train_speed(iter/s)": 1.636283 + }, + { + "acc": 0.6459981, + "epoch": 0.6736428209030949, + "grad_norm": 9.5, + "learning_rate": 7.914508358998376e-06, + "loss": 1.67073231, + "memory(GiB)": 107.26, + "step": 26555, + "train_speed(iter/s)": 1.636318 + }, + { + "acc": 0.67174993, + "epoch": 0.6737696600710299, + "grad_norm": 5.65625, + "learning_rate": 7.913656243391243e-06, + "loss": 1.54314861, + "memory(GiB)": 107.26, + "step": 26560, + "train_speed(iter/s)": 1.636354 + }, + { + "acc": 0.64914966, + "epoch": 0.673896499238965, + "grad_norm": 6.125, + "learning_rate": 7.9128039996285e-06, + "loss": 1.5862114, + "memory(GiB)": 107.26, + "step": 26565, + "train_speed(iter/s)": 1.636388 + }, + { + "acc": 0.64251785, + "epoch": 0.6740233384069001, + "grad_norm": 6.21875, + "learning_rate": 7.911951627747633e-06, + "loss": 1.64313965, + "memory(GiB)": 107.26, + "step": 26570, + "train_speed(iter/s)": 1.63642 + }, + { + "acc": 0.64136143, + "epoch": 0.6741501775748351, + "grad_norm": 5.75, + "learning_rate": 7.91109912778613e-06, + "loss": 1.65305634, + "memory(GiB)": 107.26, + "step": 26575, + "train_speed(iter/s)": 1.636455 + }, + { + "acc": 0.65396233, + "epoch": 0.6742770167427702, + "grad_norm": 5.0625, + "learning_rate": 7.910246499781492e-06, + "loss": 1.60790348, + "memory(GiB)": 107.26, + "step": 26580, + "train_speed(iter/s)": 1.636487 + }, + { + "acc": 0.64579554, + "epoch": 0.6744038559107052, + "grad_norm": 5.53125, + "learning_rate": 7.90939374377122e-06, + "loss": 1.68032074, + "memory(GiB)": 107.26, + "step": 26585, + "train_speed(iter/s)": 1.63652 + }, + { + "acc": 0.65164728, + "epoch": 0.6745306950786403, + "grad_norm": 4.90625, + "learning_rate": 7.908540859792821e-06, + "loss": 1.57334747, + "memory(GiB)": 107.26, + "step": 26590, + "train_speed(iter/s)": 1.636554 + }, + { + "acc": 0.65671206, + "epoch": 0.6746575342465754, + "grad_norm": 6.0, + "learning_rate": 7.907687847883809e-06, + "loss": 1.56122694, + "memory(GiB)": 107.26, + "step": 26595, + "train_speed(iter/s)": 1.636588 + }, + { + "acc": 0.65064335, + "epoch": 0.6747843734145104, + "grad_norm": 6.25, + "learning_rate": 7.906834708081703e-06, + "loss": 1.60919304, + "memory(GiB)": 107.26, + "step": 26600, + "train_speed(iter/s)": 1.636624 + }, + { + "acc": 0.65889888, + "epoch": 0.6749112125824455, + "grad_norm": 5.59375, + "learning_rate": 7.90598144042403e-06, + "loss": 1.58028049, + "memory(GiB)": 107.26, + "step": 26605, + "train_speed(iter/s)": 1.636657 + }, + { + "acc": 0.6562705, + "epoch": 0.6750380517503806, + "grad_norm": 6.625, + "learning_rate": 7.905128044948318e-06, + "loss": 1.6336153, + "memory(GiB)": 107.26, + "step": 26610, + "train_speed(iter/s)": 1.636689 + }, + { + "acc": 0.64722996, + "epoch": 0.6751648909183156, + "grad_norm": 5.46875, + "learning_rate": 7.904274521692104e-06, + "loss": 1.65594254, + "memory(GiB)": 107.26, + "step": 26615, + "train_speed(iter/s)": 1.636722 + }, + { + "acc": 0.65281363, + "epoch": 0.6752917300862507, + "grad_norm": 5.21875, + "learning_rate": 7.90342087069293e-06, + "loss": 1.60127831, + "memory(GiB)": 107.26, + "step": 26620, + "train_speed(iter/s)": 1.636757 + }, + { + "acc": 0.65799212, + "epoch": 0.6754185692541856, + "grad_norm": 5.46875, + "learning_rate": 7.902567091988343e-06, + "loss": 1.56904078, + "memory(GiB)": 107.26, + "step": 26625, + "train_speed(iter/s)": 1.636791 + }, + { + "acc": 0.65744662, + "epoch": 0.6755454084221207, + "grad_norm": 6.6875, + "learning_rate": 7.901713185615898e-06, + "loss": 1.60119095, + "memory(GiB)": 107.26, + "step": 26630, + "train_speed(iter/s)": 1.636824 + }, + { + "acc": 0.64757876, + "epoch": 0.6756722475900558, + "grad_norm": 6.125, + "learning_rate": 7.90085915161315e-06, + "loss": 1.61928406, + "memory(GiB)": 107.26, + "step": 26635, + "train_speed(iter/s)": 1.636856 + }, + { + "acc": 0.65160193, + "epoch": 0.6757990867579908, + "grad_norm": 5.125, + "learning_rate": 7.900004990017667e-06, + "loss": 1.63451233, + "memory(GiB)": 107.26, + "step": 26640, + "train_speed(iter/s)": 1.636891 + }, + { + "acc": 0.63665543, + "epoch": 0.6759259259259259, + "grad_norm": 6.3125, + "learning_rate": 7.899150700867014e-06, + "loss": 1.68126888, + "memory(GiB)": 107.26, + "step": 26645, + "train_speed(iter/s)": 1.636925 + }, + { + "acc": 0.65109377, + "epoch": 0.676052765093861, + "grad_norm": 5.9375, + "learning_rate": 7.898296284198772e-06, + "loss": 1.58115616, + "memory(GiB)": 107.26, + "step": 26650, + "train_speed(iter/s)": 1.636957 + }, + { + "acc": 0.66057358, + "epoch": 0.676179604261796, + "grad_norm": 6.625, + "learning_rate": 7.897441740050518e-06, + "loss": 1.64735184, + "memory(GiB)": 107.26, + "step": 26655, + "train_speed(iter/s)": 1.63699 + }, + { + "acc": 0.63221507, + "epoch": 0.6763064434297311, + "grad_norm": 5.34375, + "learning_rate": 7.89658706845984e-06, + "loss": 1.63628731, + "memory(GiB)": 107.26, + "step": 26660, + "train_speed(iter/s)": 1.637023 + }, + { + "acc": 0.64983578, + "epoch": 0.6764332825976661, + "grad_norm": 6.09375, + "learning_rate": 7.89573226946433e-06, + "loss": 1.62094765, + "memory(GiB)": 107.26, + "step": 26665, + "train_speed(iter/s)": 1.637056 + }, + { + "acc": 0.65202594, + "epoch": 0.6765601217656012, + "grad_norm": 6.96875, + "learning_rate": 7.89487734310159e-06, + "loss": 1.62998619, + "memory(GiB)": 107.26, + "step": 26670, + "train_speed(iter/s)": 1.637088 + }, + { + "acc": 0.65368414, + "epoch": 0.6766869609335363, + "grad_norm": 5.09375, + "learning_rate": 7.894022289409216e-06, + "loss": 1.62135048, + "memory(GiB)": 107.26, + "step": 26675, + "train_speed(iter/s)": 1.637121 + }, + { + "acc": 0.63958931, + "epoch": 0.6768138001014713, + "grad_norm": 5.09375, + "learning_rate": 7.893167108424822e-06, + "loss": 1.66207352, + "memory(GiB)": 107.26, + "step": 26680, + "train_speed(iter/s)": 1.637155 + }, + { + "acc": 0.66196418, + "epoch": 0.6769406392694064, + "grad_norm": 5.4375, + "learning_rate": 7.89231180018602e-06, + "loss": 1.53666916, + "memory(GiB)": 107.26, + "step": 26685, + "train_speed(iter/s)": 1.637191 + }, + { + "acc": 0.65888853, + "epoch": 0.6770674784373415, + "grad_norm": 5.59375, + "learning_rate": 7.891456364730434e-06, + "loss": 1.6374464, + "memory(GiB)": 107.26, + "step": 26690, + "train_speed(iter/s)": 1.637224 + }, + { + "acc": 0.6450736, + "epoch": 0.6771943176052765, + "grad_norm": 5.21875, + "learning_rate": 7.890600802095686e-06, + "loss": 1.70170803, + "memory(GiB)": 107.26, + "step": 26695, + "train_speed(iter/s)": 1.637258 + }, + { + "acc": 0.65078168, + "epoch": 0.6773211567732116, + "grad_norm": 6.875, + "learning_rate": 7.889745112319411e-06, + "loss": 1.58687868, + "memory(GiB)": 107.26, + "step": 26700, + "train_speed(iter/s)": 1.637291 + }, + { + "acc": 0.65395117, + "epoch": 0.6774479959411466, + "grad_norm": 7.3125, + "learning_rate": 7.888889295439244e-06, + "loss": 1.59613609, + "memory(GiB)": 107.26, + "step": 26705, + "train_speed(iter/s)": 1.637325 + }, + { + "acc": 0.65182972, + "epoch": 0.6775748351090817, + "grad_norm": 6.4375, + "learning_rate": 7.888033351492827e-06, + "loss": 1.63670216, + "memory(GiB)": 107.26, + "step": 26710, + "train_speed(iter/s)": 1.637357 + }, + { + "acc": 0.64360676, + "epoch": 0.6777016742770168, + "grad_norm": 5.34375, + "learning_rate": 7.887177280517808e-06, + "loss": 1.696245, + "memory(GiB)": 107.26, + "step": 26715, + "train_speed(iter/s)": 1.63739 + }, + { + "acc": 0.65883231, + "epoch": 0.6778285134449518, + "grad_norm": 5.8125, + "learning_rate": 7.886321082551845e-06, + "loss": 1.5388957, + "memory(GiB)": 107.26, + "step": 26720, + "train_speed(iter/s)": 1.637424 + }, + { + "acc": 0.66942749, + "epoch": 0.6779553526128869, + "grad_norm": 6.03125, + "learning_rate": 7.88546475763259e-06, + "loss": 1.55770645, + "memory(GiB)": 107.26, + "step": 26725, + "train_speed(iter/s)": 1.637455 + }, + { + "acc": 0.66165552, + "epoch": 0.678082191780822, + "grad_norm": 5.21875, + "learning_rate": 7.884608305797716e-06, + "loss": 1.5738019, + "memory(GiB)": 107.26, + "step": 26730, + "train_speed(iter/s)": 1.637487 + }, + { + "acc": 0.64709148, + "epoch": 0.678209030948757, + "grad_norm": 4.90625, + "learning_rate": 7.883751727084888e-06, + "loss": 1.66402664, + "memory(GiB)": 107.26, + "step": 26735, + "train_speed(iter/s)": 1.63752 + }, + { + "acc": 0.65054336, + "epoch": 0.678335870116692, + "grad_norm": 5.125, + "learning_rate": 7.882895021531784e-06, + "loss": 1.65783081, + "memory(GiB)": 107.26, + "step": 26740, + "train_speed(iter/s)": 1.637553 + }, + { + "acc": 0.65563211, + "epoch": 0.678462709284627, + "grad_norm": 6.28125, + "learning_rate": 7.882038189176085e-06, + "loss": 1.59870319, + "memory(GiB)": 107.26, + "step": 26745, + "train_speed(iter/s)": 1.637587 + }, + { + "acc": 0.66284204, + "epoch": 0.6785895484525621, + "grad_norm": 5.375, + "learning_rate": 7.881181230055481e-06, + "loss": 1.59951248, + "memory(GiB)": 107.26, + "step": 26750, + "train_speed(iter/s)": 1.637619 + }, + { + "acc": 0.66546164, + "epoch": 0.6787163876204972, + "grad_norm": 12.25, + "learning_rate": 7.880324144207663e-06, + "loss": 1.58989954, + "memory(GiB)": 107.26, + "step": 26755, + "train_speed(iter/s)": 1.637651 + }, + { + "acc": 0.66123896, + "epoch": 0.6788432267884322, + "grad_norm": 6.1875, + "learning_rate": 7.879466931670328e-06, + "loss": 1.61991444, + "memory(GiB)": 107.26, + "step": 26760, + "train_speed(iter/s)": 1.637686 + }, + { + "acc": 0.64596043, + "epoch": 0.6789700659563673, + "grad_norm": 6.59375, + "learning_rate": 7.878609592481182e-06, + "loss": 1.70888786, + "memory(GiB)": 107.26, + "step": 26765, + "train_speed(iter/s)": 1.63772 + }, + { + "acc": 0.66199121, + "epoch": 0.6790969051243024, + "grad_norm": 5.46875, + "learning_rate": 7.877752126677933e-06, + "loss": 1.61715145, + "memory(GiB)": 107.26, + "step": 26770, + "train_speed(iter/s)": 1.637753 + }, + { + "acc": 0.67377157, + "epoch": 0.6792237442922374, + "grad_norm": 5.40625, + "learning_rate": 7.876894534298298e-06, + "loss": 1.5004343, + "memory(GiB)": 107.26, + "step": 26775, + "train_speed(iter/s)": 1.637784 + }, + { + "acc": 0.66394854, + "epoch": 0.6793505834601725, + "grad_norm": 6.03125, + "learning_rate": 7.87603681538e-06, + "loss": 1.54943047, + "memory(GiB)": 107.26, + "step": 26780, + "train_speed(iter/s)": 1.637818 + }, + { + "acc": 0.63446035, + "epoch": 0.6794774226281075, + "grad_norm": 6.125, + "learning_rate": 7.875178969960757e-06, + "loss": 1.72093811, + "memory(GiB)": 107.26, + "step": 26785, + "train_speed(iter/s)": 1.637849 + }, + { + "acc": 0.64778862, + "epoch": 0.6796042617960426, + "grad_norm": 4.8125, + "learning_rate": 7.87432099807831e-06, + "loss": 1.6499033, + "memory(GiB)": 107.26, + "step": 26790, + "train_speed(iter/s)": 1.637881 + }, + { + "acc": 0.65446739, + "epoch": 0.6797311009639777, + "grad_norm": 5.9375, + "learning_rate": 7.87346289977039e-06, + "loss": 1.62626915, + "memory(GiB)": 107.26, + "step": 26795, + "train_speed(iter/s)": 1.637913 + }, + { + "acc": 0.66241598, + "epoch": 0.6798579401319127, + "grad_norm": 5.84375, + "learning_rate": 7.872604675074745e-06, + "loss": 1.59820137, + "memory(GiB)": 107.26, + "step": 26800, + "train_speed(iter/s)": 1.637946 + }, + { + "acc": 0.65282211, + "epoch": 0.6799847792998478, + "grad_norm": 7.34375, + "learning_rate": 7.871746324029119e-06, + "loss": 1.5940114, + "memory(GiB)": 107.26, + "step": 26805, + "train_speed(iter/s)": 1.637981 + }, + { + "acc": 0.65721221, + "epoch": 0.6801116184677829, + "grad_norm": 5.8125, + "learning_rate": 7.87088784667127e-06, + "loss": 1.58220139, + "memory(GiB)": 107.26, + "step": 26810, + "train_speed(iter/s)": 1.638015 + }, + { + "acc": 0.63236394, + "epoch": 0.6802384576357179, + "grad_norm": 5.5625, + "learning_rate": 7.870029243038955e-06, + "loss": 1.70133324, + "memory(GiB)": 107.26, + "step": 26815, + "train_speed(iter/s)": 1.638049 + }, + { + "acc": 0.6395833, + "epoch": 0.680365296803653, + "grad_norm": 5.75, + "learning_rate": 7.869170513169941e-06, + "loss": 1.65109482, + "memory(GiB)": 107.26, + "step": 26820, + "train_speed(iter/s)": 1.638081 + }, + { + "acc": 0.63203564, + "epoch": 0.680492135971588, + "grad_norm": 5.25, + "learning_rate": 7.868311657101996e-06, + "loss": 1.62600861, + "memory(GiB)": 107.26, + "step": 26825, + "train_speed(iter/s)": 1.638115 + }, + { + "acc": 0.64707937, + "epoch": 0.6806189751395231, + "grad_norm": 7.40625, + "learning_rate": 7.8674526748729e-06, + "loss": 1.65282154, + "memory(GiB)": 107.26, + "step": 26830, + "train_speed(iter/s)": 1.638148 + }, + { + "acc": 0.67791576, + "epoch": 0.6807458143074582, + "grad_norm": 7.5, + "learning_rate": 7.866593566520432e-06, + "loss": 1.48686743, + "memory(GiB)": 107.26, + "step": 26835, + "train_speed(iter/s)": 1.638185 + }, + { + "acc": 0.65274925, + "epoch": 0.6808726534753932, + "grad_norm": 5.21875, + "learning_rate": 7.865734332082382e-06, + "loss": 1.68021507, + "memory(GiB)": 107.26, + "step": 26840, + "train_speed(iter/s)": 1.638217 + }, + { + "acc": 0.66506386, + "epoch": 0.6809994926433283, + "grad_norm": 7.03125, + "learning_rate": 7.86487497159654e-06, + "loss": 1.54281139, + "memory(GiB)": 107.26, + "step": 26845, + "train_speed(iter/s)": 1.63825 + }, + { + "acc": 0.65327525, + "epoch": 0.6811263318112634, + "grad_norm": 5.25, + "learning_rate": 7.864015485100706e-06, + "loss": 1.59985142, + "memory(GiB)": 107.26, + "step": 26850, + "train_speed(iter/s)": 1.638284 + }, + { + "acc": 0.65137606, + "epoch": 0.6812531709791984, + "grad_norm": 5.53125, + "learning_rate": 7.863155872632685e-06, + "loss": 1.54490366, + "memory(GiB)": 107.26, + "step": 26855, + "train_speed(iter/s)": 1.638317 + }, + { + "acc": 0.65559554, + "epoch": 0.6813800101471335, + "grad_norm": 6.0625, + "learning_rate": 7.862296134230287e-06, + "loss": 1.57524509, + "memory(GiB)": 107.26, + "step": 26860, + "train_speed(iter/s)": 1.638351 + }, + { + "acc": 0.65322442, + "epoch": 0.6815068493150684, + "grad_norm": 5.96875, + "learning_rate": 7.861436269931322e-06, + "loss": 1.59274559, + "memory(GiB)": 107.26, + "step": 26865, + "train_speed(iter/s)": 1.638383 + }, + { + "acc": 0.6607255, + "epoch": 0.6816336884830035, + "grad_norm": 5.21875, + "learning_rate": 7.860576279773617e-06, + "loss": 1.53485146, + "memory(GiB)": 107.26, + "step": 26870, + "train_speed(iter/s)": 1.638416 + }, + { + "acc": 0.65396767, + "epoch": 0.6817605276509386, + "grad_norm": 6.0, + "learning_rate": 7.859716163794995e-06, + "loss": 1.58389664, + "memory(GiB)": 107.26, + "step": 26875, + "train_speed(iter/s)": 1.638449 + }, + { + "acc": 0.65315032, + "epoch": 0.6818873668188736, + "grad_norm": 4.21875, + "learning_rate": 7.858855922033289e-06, + "loss": 1.62126102, + "memory(GiB)": 107.26, + "step": 26880, + "train_speed(iter/s)": 1.638482 + }, + { + "acc": 0.64716854, + "epoch": 0.6820142059868087, + "grad_norm": 6.28125, + "learning_rate": 7.857995554526334e-06, + "loss": 1.67432861, + "memory(GiB)": 107.26, + "step": 26885, + "train_speed(iter/s)": 1.638515 + }, + { + "acc": 0.64961729, + "epoch": 0.6821410451547438, + "grad_norm": 5.28125, + "learning_rate": 7.857135061311977e-06, + "loss": 1.62102013, + "memory(GiB)": 107.26, + "step": 26890, + "train_speed(iter/s)": 1.638547 + }, + { + "acc": 0.64940009, + "epoch": 0.6822678843226788, + "grad_norm": 5.4375, + "learning_rate": 7.856274442428062e-06, + "loss": 1.65618038, + "memory(GiB)": 107.26, + "step": 26895, + "train_speed(iter/s)": 1.63858 + }, + { + "acc": 0.6480588, + "epoch": 0.6823947234906139, + "grad_norm": 5.8125, + "learning_rate": 7.855413697912446e-06, + "loss": 1.61122971, + "memory(GiB)": 107.26, + "step": 26900, + "train_speed(iter/s)": 1.638615 + }, + { + "acc": 0.65209689, + "epoch": 0.6825215626585489, + "grad_norm": 5.28125, + "learning_rate": 7.854552827802987e-06, + "loss": 1.6268013, + "memory(GiB)": 107.26, + "step": 26905, + "train_speed(iter/s)": 1.638646 + }, + { + "acc": 0.66580715, + "epoch": 0.682648401826484, + "grad_norm": 12.375, + "learning_rate": 7.853691832137547e-06, + "loss": 1.656567, + "memory(GiB)": 107.26, + "step": 26910, + "train_speed(iter/s)": 1.638678 + }, + { + "acc": 0.66611404, + "epoch": 0.6827752409944191, + "grad_norm": 4.9375, + "learning_rate": 7.852830710954003e-06, + "loss": 1.56231728, + "memory(GiB)": 107.26, + "step": 26915, + "train_speed(iter/s)": 1.638711 + }, + { + "acc": 0.65471764, + "epoch": 0.6829020801623541, + "grad_norm": 6.15625, + "learning_rate": 7.851969464290226e-06, + "loss": 1.62660065, + "memory(GiB)": 107.26, + "step": 26920, + "train_speed(iter/s)": 1.638745 + }, + { + "acc": 0.65919824, + "epoch": 0.6830289193302892, + "grad_norm": 5.96875, + "learning_rate": 7.851108092184099e-06, + "loss": 1.58362312, + "memory(GiB)": 107.26, + "step": 26925, + "train_speed(iter/s)": 1.638778 + }, + { + "acc": 0.66248193, + "epoch": 0.6831557584982243, + "grad_norm": 5.21875, + "learning_rate": 7.850246594673508e-06, + "loss": 1.60354748, + "memory(GiB)": 107.26, + "step": 26930, + "train_speed(iter/s)": 1.638813 + }, + { + "acc": 0.65030179, + "epoch": 0.6832825976661593, + "grad_norm": 6.34375, + "learning_rate": 7.849384971796346e-06, + "loss": 1.69164639, + "memory(GiB)": 107.26, + "step": 26935, + "train_speed(iter/s)": 1.638846 + }, + { + "acc": 0.65637693, + "epoch": 0.6834094368340944, + "grad_norm": 5.625, + "learning_rate": 7.848523223590514e-06, + "loss": 1.6101099, + "memory(GiB)": 107.26, + "step": 26940, + "train_speed(iter/s)": 1.638879 + }, + { + "acc": 0.65389519, + "epoch": 0.6835362760020294, + "grad_norm": 6.40625, + "learning_rate": 7.84766135009391e-06, + "loss": 1.67608795, + "memory(GiB)": 107.26, + "step": 26945, + "train_speed(iter/s)": 1.638912 + }, + { + "acc": 0.64838867, + "epoch": 0.6836631151699645, + "grad_norm": 4.75, + "learning_rate": 7.846799351344447e-06, + "loss": 1.63493614, + "memory(GiB)": 107.26, + "step": 26950, + "train_speed(iter/s)": 1.638945 + }, + { + "acc": 0.66400805, + "epoch": 0.6837899543378996, + "grad_norm": 6.71875, + "learning_rate": 7.845937227380038e-06, + "loss": 1.59935093, + "memory(GiB)": 107.26, + "step": 26955, + "train_speed(iter/s)": 1.638978 + }, + { + "acc": 0.6427022, + "epoch": 0.6839167935058346, + "grad_norm": 5.21875, + "learning_rate": 7.845074978238604e-06, + "loss": 1.67486324, + "memory(GiB)": 107.26, + "step": 26960, + "train_speed(iter/s)": 1.639009 + }, + { + "acc": 0.65842757, + "epoch": 0.6840436326737697, + "grad_norm": 5.40625, + "learning_rate": 7.84421260395807e-06, + "loss": 1.61804733, + "memory(GiB)": 107.26, + "step": 26965, + "train_speed(iter/s)": 1.639043 + }, + { + "acc": 0.65147972, + "epoch": 0.6841704718417048, + "grad_norm": 5.75, + "learning_rate": 7.84335010457637e-06, + "loss": 1.62659683, + "memory(GiB)": 107.26, + "step": 26970, + "train_speed(iter/s)": 1.639076 + }, + { + "acc": 0.65960317, + "epoch": 0.6842973110096398, + "grad_norm": 6.34375, + "learning_rate": 7.842487480131435e-06, + "loss": 1.52411785, + "memory(GiB)": 107.26, + "step": 26975, + "train_speed(iter/s)": 1.639109 + }, + { + "acc": 0.65147829, + "epoch": 0.6844241501775749, + "grad_norm": 6.25, + "learning_rate": 7.84162473066121e-06, + "loss": 1.6766367, + "memory(GiB)": 107.26, + "step": 26980, + "train_speed(iter/s)": 1.639141 + }, + { + "acc": 0.66292338, + "epoch": 0.6845509893455098, + "grad_norm": 5.375, + "learning_rate": 7.840761856203642e-06, + "loss": 1.50713816, + "memory(GiB)": 107.26, + "step": 26985, + "train_speed(iter/s)": 1.639172 + }, + { + "acc": 0.64710188, + "epoch": 0.6846778285134449, + "grad_norm": 5.46875, + "learning_rate": 7.839898856796685e-06, + "loss": 1.67985497, + "memory(GiB)": 107.26, + "step": 26990, + "train_speed(iter/s)": 1.639203 + }, + { + "acc": 0.64517975, + "epoch": 0.68480466768138, + "grad_norm": 5.78125, + "learning_rate": 7.839035732478297e-06, + "loss": 1.64958305, + "memory(GiB)": 107.26, + "step": 26995, + "train_speed(iter/s)": 1.639235 + }, + { + "acc": 0.65153446, + "epoch": 0.684931506849315, + "grad_norm": 6.0, + "learning_rate": 7.838172483286441e-06, + "loss": 1.69107857, + "memory(GiB)": 107.26, + "step": 27000, + "train_speed(iter/s)": 1.639269 + }, + { + "epoch": 0.684931506849315, + "eval_acc": 0.6450710754985308, + "eval_loss": 1.5803025960922241, + "eval_runtime": 58.4784, + "eval_samples_per_second": 108.929, + "eval_steps_per_second": 27.241, + "step": 27000 + }, + { + "acc": 0.66906567, + "epoch": 0.6850583460172501, + "grad_norm": 5.0, + "learning_rate": 7.83730910925909e-06, + "loss": 1.55417967, + "memory(GiB)": 107.26, + "step": 27005, + "train_speed(iter/s)": 1.633082 + }, + { + "acc": 0.6517766, + "epoch": 0.6851851851851852, + "grad_norm": 5.21875, + "learning_rate": 7.836445610434215e-06, + "loss": 1.59759665, + "memory(GiB)": 107.26, + "step": 27010, + "train_speed(iter/s)": 1.633116 + }, + { + "acc": 0.65137568, + "epoch": 0.6853120243531202, + "grad_norm": 5.8125, + "learning_rate": 7.835581986849799e-06, + "loss": 1.63237514, + "memory(GiB)": 107.26, + "step": 27015, + "train_speed(iter/s)": 1.633151 + }, + { + "acc": 0.64684496, + "epoch": 0.6854388635210553, + "grad_norm": 5.46875, + "learning_rate": 7.834718238543827e-06, + "loss": 1.60083122, + "memory(GiB)": 107.26, + "step": 27020, + "train_speed(iter/s)": 1.633183 + }, + { + "acc": 0.6389863, + "epoch": 0.6855657026889903, + "grad_norm": 4.84375, + "learning_rate": 7.833854365554289e-06, + "loss": 1.66159573, + "memory(GiB)": 107.26, + "step": 27025, + "train_speed(iter/s)": 1.633217 + }, + { + "acc": 0.64249711, + "epoch": 0.6856925418569254, + "grad_norm": 4.6875, + "learning_rate": 7.832990367919186e-06, + "loss": 1.66134071, + "memory(GiB)": 107.26, + "step": 27030, + "train_speed(iter/s)": 1.63325 + }, + { + "acc": 0.6603487, + "epoch": 0.6858193810248605, + "grad_norm": 6.4375, + "learning_rate": 7.832126245676518e-06, + "loss": 1.5967391, + "memory(GiB)": 107.26, + "step": 27035, + "train_speed(iter/s)": 1.633281 + }, + { + "acc": 0.65505686, + "epoch": 0.6859462201927955, + "grad_norm": 6.71875, + "learning_rate": 7.831261998864293e-06, + "loss": 1.59895248, + "memory(GiB)": 107.26, + "step": 27040, + "train_speed(iter/s)": 1.633315 + }, + { + "acc": 0.65722203, + "epoch": 0.6860730593607306, + "grad_norm": 5.34375, + "learning_rate": 7.830397627520526e-06, + "loss": 1.65112114, + "memory(GiB)": 107.26, + "step": 27045, + "train_speed(iter/s)": 1.633348 + }, + { + "acc": 0.64407406, + "epoch": 0.6861998985286657, + "grad_norm": 5.65625, + "learning_rate": 7.82953313168323e-06, + "loss": 1.63695679, + "memory(GiB)": 107.26, + "step": 27050, + "train_speed(iter/s)": 1.633382 + }, + { + "acc": 0.64654527, + "epoch": 0.6863267376966007, + "grad_norm": 6.09375, + "learning_rate": 7.828668511390439e-06, + "loss": 1.58244209, + "memory(GiB)": 107.26, + "step": 27055, + "train_speed(iter/s)": 1.633414 + }, + { + "acc": 0.64846325, + "epoch": 0.6864535768645358, + "grad_norm": 5.4375, + "learning_rate": 7.827803766680176e-06, + "loss": 1.63761368, + "memory(GiB)": 107.26, + "step": 27060, + "train_speed(iter/s)": 1.633446 + }, + { + "acc": 0.65124149, + "epoch": 0.6865804160324708, + "grad_norm": 6.3125, + "learning_rate": 7.826938897590477e-06, + "loss": 1.63812027, + "memory(GiB)": 107.26, + "step": 27065, + "train_speed(iter/s)": 1.633477 + }, + { + "acc": 0.666012, + "epoch": 0.6867072552004059, + "grad_norm": 5.28125, + "learning_rate": 7.826073904159384e-06, + "loss": 1.53024359, + "memory(GiB)": 107.26, + "step": 27070, + "train_speed(iter/s)": 1.633508 + }, + { + "acc": 0.66201282, + "epoch": 0.686834094368341, + "grad_norm": 5.71875, + "learning_rate": 7.825208786424944e-06, + "loss": 1.61423779, + "memory(GiB)": 107.26, + "step": 27075, + "train_speed(iter/s)": 1.63354 + }, + { + "acc": 0.6322577, + "epoch": 0.686960933536276, + "grad_norm": 8.9375, + "learning_rate": 7.824343544425207e-06, + "loss": 1.70796871, + "memory(GiB)": 107.26, + "step": 27080, + "train_speed(iter/s)": 1.633576 + }, + { + "acc": 0.65276732, + "epoch": 0.6870877727042111, + "grad_norm": 4.59375, + "learning_rate": 7.823478178198234e-06, + "loss": 1.59573755, + "memory(GiB)": 107.26, + "step": 27085, + "train_speed(iter/s)": 1.633607 + }, + { + "acc": 0.66387219, + "epoch": 0.6872146118721462, + "grad_norm": 5.21875, + "learning_rate": 7.822612687782083e-06, + "loss": 1.55407057, + "memory(GiB)": 107.26, + "step": 27090, + "train_speed(iter/s)": 1.633641 + }, + { + "acc": 0.66301851, + "epoch": 0.6873414510400812, + "grad_norm": 5.90625, + "learning_rate": 7.821747073214823e-06, + "loss": 1.56343193, + "memory(GiB)": 107.26, + "step": 27095, + "train_speed(iter/s)": 1.633673 + }, + { + "acc": 0.65670996, + "epoch": 0.6874682902080163, + "grad_norm": 5.03125, + "learning_rate": 7.820881334534529e-06, + "loss": 1.55156994, + "memory(GiB)": 107.26, + "step": 27100, + "train_speed(iter/s)": 1.633706 + }, + { + "acc": 0.66631594, + "epoch": 0.6875951293759512, + "grad_norm": 5.09375, + "learning_rate": 7.820015471779278e-06, + "loss": 1.53754244, + "memory(GiB)": 107.26, + "step": 27105, + "train_speed(iter/s)": 1.633738 + }, + { + "acc": 0.66593151, + "epoch": 0.6877219685438863, + "grad_norm": 5.78125, + "learning_rate": 7.819149484987159e-06, + "loss": 1.56529617, + "memory(GiB)": 107.26, + "step": 27110, + "train_speed(iter/s)": 1.633772 + }, + { + "acc": 0.6684948, + "epoch": 0.6878488077118214, + "grad_norm": 5.0, + "learning_rate": 7.818283374196259e-06, + "loss": 1.5542263, + "memory(GiB)": 107.26, + "step": 27115, + "train_speed(iter/s)": 1.633803 + }, + { + "acc": 0.65718846, + "epoch": 0.6879756468797564, + "grad_norm": 6.03125, + "learning_rate": 7.817417139444671e-06, + "loss": 1.65624275, + "memory(GiB)": 107.26, + "step": 27120, + "train_speed(iter/s)": 1.633837 + }, + { + "acc": 0.63353791, + "epoch": 0.6881024860476915, + "grad_norm": 7.0, + "learning_rate": 7.8165507807705e-06, + "loss": 1.70971642, + "memory(GiB)": 107.26, + "step": 27125, + "train_speed(iter/s)": 1.633872 + }, + { + "acc": 0.64279776, + "epoch": 0.6882293252156266, + "grad_norm": 5.375, + "learning_rate": 7.81568429821185e-06, + "loss": 1.68047504, + "memory(GiB)": 107.26, + "step": 27130, + "train_speed(iter/s)": 1.633907 + }, + { + "acc": 0.65463715, + "epoch": 0.6883561643835616, + "grad_norm": 5.8125, + "learning_rate": 7.814817691806834e-06, + "loss": 1.66218224, + "memory(GiB)": 107.26, + "step": 27135, + "train_speed(iter/s)": 1.633939 + }, + { + "acc": 0.65490894, + "epoch": 0.6884830035514967, + "grad_norm": 5.5, + "learning_rate": 7.813950961593569e-06, + "loss": 1.5835741, + "memory(GiB)": 107.26, + "step": 27140, + "train_speed(iter/s)": 1.633971 + }, + { + "acc": 0.64625416, + "epoch": 0.6886098427194317, + "grad_norm": 4.6875, + "learning_rate": 7.813084107610175e-06, + "loss": 1.61685715, + "memory(GiB)": 107.26, + "step": 27145, + "train_speed(iter/s)": 1.634001 + }, + { + "acc": 0.66330686, + "epoch": 0.6887366818873668, + "grad_norm": 5.125, + "learning_rate": 7.812217129894785e-06, + "loss": 1.4957881, + "memory(GiB)": 107.26, + "step": 27150, + "train_speed(iter/s)": 1.634032 + }, + { + "acc": 0.66132069, + "epoch": 0.6888635210553019, + "grad_norm": 5.28125, + "learning_rate": 7.811350028485531e-06, + "loss": 1.55517197, + "memory(GiB)": 107.26, + "step": 27155, + "train_speed(iter/s)": 1.634066 + }, + { + "acc": 0.65277162, + "epoch": 0.6889903602232369, + "grad_norm": 6.0, + "learning_rate": 7.810482803420549e-06, + "loss": 1.54844151, + "memory(GiB)": 107.26, + "step": 27160, + "train_speed(iter/s)": 1.634098 + }, + { + "acc": 0.65135374, + "epoch": 0.689117199391172, + "grad_norm": 5.8125, + "learning_rate": 7.809615454737984e-06, + "loss": 1.64623528, + "memory(GiB)": 107.26, + "step": 27165, + "train_speed(iter/s)": 1.634129 + }, + { + "acc": 0.64086094, + "epoch": 0.6892440385591071, + "grad_norm": 6.25, + "learning_rate": 7.808747982475991e-06, + "loss": 1.6480648, + "memory(GiB)": 107.26, + "step": 27170, + "train_speed(iter/s)": 1.634164 + }, + { + "acc": 0.64434471, + "epoch": 0.6893708777270421, + "grad_norm": 7.1875, + "learning_rate": 7.807880386672718e-06, + "loss": 1.64645977, + "memory(GiB)": 107.26, + "step": 27175, + "train_speed(iter/s)": 1.634198 + }, + { + "acc": 0.64159942, + "epoch": 0.6894977168949772, + "grad_norm": 6.1875, + "learning_rate": 7.807012667366332e-06, + "loss": 1.67221508, + "memory(GiB)": 107.26, + "step": 27180, + "train_speed(iter/s)": 1.63423 + }, + { + "acc": 0.65396795, + "epoch": 0.6896245560629122, + "grad_norm": 6.90625, + "learning_rate": 7.806144824594994e-06, + "loss": 1.64463844, + "memory(GiB)": 107.26, + "step": 27185, + "train_speed(iter/s)": 1.634261 + }, + { + "acc": 0.64494209, + "epoch": 0.6897513952308473, + "grad_norm": 5.5625, + "learning_rate": 7.805276858396879e-06, + "loss": 1.62344456, + "memory(GiB)": 107.26, + "step": 27190, + "train_speed(iter/s)": 1.634296 + }, + { + "acc": 0.66242733, + "epoch": 0.6898782343987824, + "grad_norm": 5.375, + "learning_rate": 7.804408768810164e-06, + "loss": 1.58216953, + "memory(GiB)": 107.26, + "step": 27195, + "train_speed(iter/s)": 1.634328 + }, + { + "acc": 0.64973612, + "epoch": 0.6900050735667174, + "grad_norm": 5.71875, + "learning_rate": 7.80354055587303e-06, + "loss": 1.68611679, + "memory(GiB)": 107.26, + "step": 27200, + "train_speed(iter/s)": 1.634361 + }, + { + "acc": 0.65159464, + "epoch": 0.6901319127346525, + "grad_norm": 5.59375, + "learning_rate": 7.802672219623665e-06, + "loss": 1.59653902, + "memory(GiB)": 107.26, + "step": 27205, + "train_speed(iter/s)": 1.634391 + }, + { + "acc": 0.66741424, + "epoch": 0.6902587519025876, + "grad_norm": 6.40625, + "learning_rate": 7.801803760100264e-06, + "loss": 1.60990677, + "memory(GiB)": 107.26, + "step": 27210, + "train_speed(iter/s)": 1.634424 + }, + { + "acc": 0.65118651, + "epoch": 0.6903855910705226, + "grad_norm": 7.3125, + "learning_rate": 7.800935177341022e-06, + "loss": 1.58042345, + "memory(GiB)": 107.26, + "step": 27215, + "train_speed(iter/s)": 1.634458 + }, + { + "acc": 0.66853809, + "epoch": 0.6905124302384577, + "grad_norm": 5.1875, + "learning_rate": 7.800066471384149e-06, + "loss": 1.59965324, + "memory(GiB)": 107.26, + "step": 27220, + "train_speed(iter/s)": 1.634489 + }, + { + "acc": 0.66857328, + "epoch": 0.6906392694063926, + "grad_norm": 6.125, + "learning_rate": 7.799197642267848e-06, + "loss": 1.60582314, + "memory(GiB)": 107.26, + "step": 27225, + "train_speed(iter/s)": 1.634523 + }, + { + "acc": 0.65664892, + "epoch": 0.6907661085743277, + "grad_norm": 5.0, + "learning_rate": 7.79832869003034e-06, + "loss": 1.57414627, + "memory(GiB)": 107.26, + "step": 27230, + "train_speed(iter/s)": 1.634556 + }, + { + "acc": 0.66003609, + "epoch": 0.6908929477422628, + "grad_norm": 6.03125, + "learning_rate": 7.797459614709842e-06, + "loss": 1.61059303, + "memory(GiB)": 107.26, + "step": 27235, + "train_speed(iter/s)": 1.63459 + }, + { + "acc": 0.65616622, + "epoch": 0.6910197869101978, + "grad_norm": 6.0, + "learning_rate": 7.796590416344578e-06, + "loss": 1.61198807, + "memory(GiB)": 107.26, + "step": 27240, + "train_speed(iter/s)": 1.634623 + }, + { + "acc": 0.64562421, + "epoch": 0.6911466260781329, + "grad_norm": 5.65625, + "learning_rate": 7.795721094972783e-06, + "loss": 1.68045845, + "memory(GiB)": 107.26, + "step": 27245, + "train_speed(iter/s)": 1.634654 + }, + { + "acc": 0.63632498, + "epoch": 0.691273465246068, + "grad_norm": 6.09375, + "learning_rate": 7.794851650632693e-06, + "loss": 1.70179195, + "memory(GiB)": 107.26, + "step": 27250, + "train_speed(iter/s)": 1.63469 + }, + { + "acc": 0.65938749, + "epoch": 0.691400304414003, + "grad_norm": 4.9375, + "learning_rate": 7.793982083362548e-06, + "loss": 1.50130119, + "memory(GiB)": 107.26, + "step": 27255, + "train_speed(iter/s)": 1.634714 + }, + { + "acc": 0.67015014, + "epoch": 0.6915271435819381, + "grad_norm": 6.65625, + "learning_rate": 7.7931123932006e-06, + "loss": 1.5763669, + "memory(GiB)": 107.26, + "step": 27260, + "train_speed(iter/s)": 1.634746 + }, + { + "acc": 0.64620943, + "epoch": 0.6916539827498731, + "grad_norm": 5.21875, + "learning_rate": 7.792242580185095e-06, + "loss": 1.59284096, + "memory(GiB)": 107.26, + "step": 27265, + "train_speed(iter/s)": 1.634779 + }, + { + "acc": 0.6555275, + "epoch": 0.6917808219178082, + "grad_norm": 7.53125, + "learning_rate": 7.791372644354295e-06, + "loss": 1.55669098, + "memory(GiB)": 107.26, + "step": 27270, + "train_speed(iter/s)": 1.634814 + }, + { + "acc": 0.64665003, + "epoch": 0.6919076610857433, + "grad_norm": 6.3125, + "learning_rate": 7.790502585746464e-06, + "loss": 1.64027348, + "memory(GiB)": 107.26, + "step": 27275, + "train_speed(iter/s)": 1.634846 + }, + { + "acc": 0.66824036, + "epoch": 0.6920345002536783, + "grad_norm": 5.71875, + "learning_rate": 7.789632404399872e-06, + "loss": 1.54462252, + "memory(GiB)": 107.26, + "step": 27280, + "train_speed(iter/s)": 1.634878 + }, + { + "acc": 0.6542613, + "epoch": 0.6921613394216134, + "grad_norm": 7.1875, + "learning_rate": 7.788762100352791e-06, + "loss": 1.6289505, + "memory(GiB)": 107.26, + "step": 27285, + "train_speed(iter/s)": 1.634912 + }, + { + "acc": 0.66637259, + "epoch": 0.6922881785895485, + "grad_norm": 5.75, + "learning_rate": 7.787891673643501e-06, + "loss": 1.5987524, + "memory(GiB)": 107.26, + "step": 27290, + "train_speed(iter/s)": 1.634943 + }, + { + "acc": 0.64926505, + "epoch": 0.6924150177574835, + "grad_norm": 6.65625, + "learning_rate": 7.78702112431029e-06, + "loss": 1.64018211, + "memory(GiB)": 107.26, + "step": 27295, + "train_speed(iter/s)": 1.634974 + }, + { + "acc": 0.65669932, + "epoch": 0.6925418569254186, + "grad_norm": 5.46875, + "learning_rate": 7.786150452391446e-06, + "loss": 1.57897968, + "memory(GiB)": 107.26, + "step": 27300, + "train_speed(iter/s)": 1.635007 + }, + { + "acc": 0.66868744, + "epoch": 0.6926686960933536, + "grad_norm": 5.96875, + "learning_rate": 7.785279657925265e-06, + "loss": 1.56595135, + "memory(GiB)": 107.26, + "step": 27305, + "train_speed(iter/s)": 1.635041 + }, + { + "acc": 0.64119015, + "epoch": 0.6927955352612887, + "grad_norm": 5.75, + "learning_rate": 7.784408740950051e-06, + "loss": 1.64192123, + "memory(GiB)": 107.26, + "step": 27310, + "train_speed(iter/s)": 1.635073 + }, + { + "acc": 0.66927886, + "epoch": 0.6929223744292238, + "grad_norm": 6.0, + "learning_rate": 7.783537701504109e-06, + "loss": 1.58278618, + "memory(GiB)": 107.26, + "step": 27315, + "train_speed(iter/s)": 1.635106 + }, + { + "acc": 0.66143999, + "epoch": 0.6930492135971588, + "grad_norm": 5.25, + "learning_rate": 7.782666539625749e-06, + "loss": 1.65351219, + "memory(GiB)": 107.26, + "step": 27320, + "train_speed(iter/s)": 1.635141 + }, + { + "acc": 0.66218901, + "epoch": 0.6931760527650939, + "grad_norm": 6.21875, + "learning_rate": 7.781795255353293e-06, + "loss": 1.56159458, + "memory(GiB)": 107.26, + "step": 27325, + "train_speed(iter/s)": 1.635173 + }, + { + "acc": 0.66439648, + "epoch": 0.693302891933029, + "grad_norm": 5.9375, + "learning_rate": 7.780923848725061e-06, + "loss": 1.57512035, + "memory(GiB)": 107.26, + "step": 27330, + "train_speed(iter/s)": 1.635208 + }, + { + "acc": 0.64612985, + "epoch": 0.693429731100964, + "grad_norm": 5.1875, + "learning_rate": 7.780052319779382e-06, + "loss": 1.58392906, + "memory(GiB)": 107.26, + "step": 27335, + "train_speed(iter/s)": 1.635239 + }, + { + "acc": 0.64886003, + "epoch": 0.693556570268899, + "grad_norm": 5.90625, + "learning_rate": 7.779180668554591e-06, + "loss": 1.66179276, + "memory(GiB)": 107.26, + "step": 27340, + "train_speed(iter/s)": 1.63527 + }, + { + "acc": 0.66692924, + "epoch": 0.693683409436834, + "grad_norm": 5.71875, + "learning_rate": 7.778308895089024e-06, + "loss": 1.56435852, + "memory(GiB)": 107.26, + "step": 27345, + "train_speed(iter/s)": 1.635303 + }, + { + "acc": 0.66735201, + "epoch": 0.6938102486047691, + "grad_norm": 5.34375, + "learning_rate": 7.77743699942103e-06, + "loss": 1.5706811, + "memory(GiB)": 107.26, + "step": 27350, + "train_speed(iter/s)": 1.635337 + }, + { + "acc": 0.66817093, + "epoch": 0.6939370877727042, + "grad_norm": 6.25, + "learning_rate": 7.776564981588955e-06, + "loss": 1.56316748, + "memory(GiB)": 107.26, + "step": 27355, + "train_speed(iter/s)": 1.635369 + }, + { + "acc": 0.65437531, + "epoch": 0.6940639269406392, + "grad_norm": 6.6875, + "learning_rate": 7.775692841631154e-06, + "loss": 1.69971409, + "memory(GiB)": 107.26, + "step": 27360, + "train_speed(iter/s)": 1.635403 + }, + { + "acc": 0.64340844, + "epoch": 0.6941907661085743, + "grad_norm": 4.34375, + "learning_rate": 7.774820579585993e-06, + "loss": 1.61681747, + "memory(GiB)": 107.26, + "step": 27365, + "train_speed(iter/s)": 1.635437 + }, + { + "acc": 0.65908537, + "epoch": 0.6943176052765094, + "grad_norm": 5.9375, + "learning_rate": 7.773948195491831e-06, + "loss": 1.58944664, + "memory(GiB)": 107.26, + "step": 27370, + "train_speed(iter/s)": 1.635469 + }, + { + "acc": 0.64517727, + "epoch": 0.6944444444444444, + "grad_norm": 5.65625, + "learning_rate": 7.773075689387044e-06, + "loss": 1.67690811, + "memory(GiB)": 107.26, + "step": 27375, + "train_speed(iter/s)": 1.635504 + }, + { + "acc": 0.66697702, + "epoch": 0.6945712836123795, + "grad_norm": 7.5625, + "learning_rate": 7.772203061310008e-06, + "loss": 1.56519604, + "memory(GiB)": 107.26, + "step": 27380, + "train_speed(iter/s)": 1.635535 + }, + { + "acc": 0.648909, + "epoch": 0.6946981227803145, + "grad_norm": 5.875, + "learning_rate": 7.771330311299104e-06, + "loss": 1.70168056, + "memory(GiB)": 107.26, + "step": 27385, + "train_speed(iter/s)": 1.635567 + }, + { + "acc": 0.65259457, + "epoch": 0.6948249619482496, + "grad_norm": 4.65625, + "learning_rate": 7.770457439392719e-06, + "loss": 1.60979671, + "memory(GiB)": 107.26, + "step": 27390, + "train_speed(iter/s)": 1.6356 + }, + { + "acc": 0.6404027, + "epoch": 0.6949518011161847, + "grad_norm": 5.03125, + "learning_rate": 7.769584445629247e-06, + "loss": 1.7007122, + "memory(GiB)": 107.26, + "step": 27395, + "train_speed(iter/s)": 1.635633 + }, + { + "acc": 0.65212255, + "epoch": 0.6950786402841197, + "grad_norm": 6.75, + "learning_rate": 7.768711330047087e-06, + "loss": 1.60189686, + "memory(GiB)": 107.26, + "step": 27400, + "train_speed(iter/s)": 1.635666 + }, + { + "acc": 0.65985851, + "epoch": 0.6952054794520548, + "grad_norm": 5.8125, + "learning_rate": 7.767838092684638e-06, + "loss": 1.62681122, + "memory(GiB)": 107.26, + "step": 27405, + "train_speed(iter/s)": 1.635699 + }, + { + "acc": 0.67035189, + "epoch": 0.6953323186199899, + "grad_norm": 4.34375, + "learning_rate": 7.766964733580316e-06, + "loss": 1.54356546, + "memory(GiB)": 107.26, + "step": 27410, + "train_speed(iter/s)": 1.635729 + }, + { + "acc": 0.66668577, + "epoch": 0.6954591577879249, + "grad_norm": 5.0, + "learning_rate": 7.76609125277253e-06, + "loss": 1.59488697, + "memory(GiB)": 107.26, + "step": 27415, + "train_speed(iter/s)": 1.635762 + }, + { + "acc": 0.6549706, + "epoch": 0.69558599695586, + "grad_norm": 7.9375, + "learning_rate": 7.7652176502997e-06, + "loss": 1.58501558, + "memory(GiB)": 107.26, + "step": 27420, + "train_speed(iter/s)": 1.635793 + }, + { + "acc": 0.66406651, + "epoch": 0.695712836123795, + "grad_norm": 5.25, + "learning_rate": 7.764343926200254e-06, + "loss": 1.53203926, + "memory(GiB)": 107.26, + "step": 27425, + "train_speed(iter/s)": 1.635825 + }, + { + "acc": 0.6538558, + "epoch": 0.6958396752917301, + "grad_norm": 5.375, + "learning_rate": 7.763470080512617e-06, + "loss": 1.60885811, + "memory(GiB)": 107.26, + "step": 27430, + "train_speed(iter/s)": 1.635859 + }, + { + "acc": 0.65442615, + "epoch": 0.6959665144596652, + "grad_norm": 6.5625, + "learning_rate": 7.762596113275229e-06, + "loss": 1.59839926, + "memory(GiB)": 107.26, + "step": 27435, + "train_speed(iter/s)": 1.635892 + }, + { + "acc": 0.65658865, + "epoch": 0.6960933536276002, + "grad_norm": 6.96875, + "learning_rate": 7.761722024526533e-06, + "loss": 1.64569206, + "memory(GiB)": 107.26, + "step": 27440, + "train_speed(iter/s)": 1.635927 + }, + { + "acc": 0.66237354, + "epoch": 0.6962201927955353, + "grad_norm": 5.375, + "learning_rate": 7.760847814304969e-06, + "loss": 1.61346474, + "memory(GiB)": 107.26, + "step": 27445, + "train_speed(iter/s)": 1.635958 + }, + { + "acc": 0.65442677, + "epoch": 0.6963470319634704, + "grad_norm": 8.3125, + "learning_rate": 7.759973482648992e-06, + "loss": 1.59562788, + "memory(GiB)": 107.26, + "step": 27450, + "train_speed(iter/s)": 1.635991 + }, + { + "acc": 0.65584602, + "epoch": 0.6964738711314054, + "grad_norm": 4.8125, + "learning_rate": 7.75909902959706e-06, + "loss": 1.65837135, + "memory(GiB)": 107.26, + "step": 27455, + "train_speed(iter/s)": 1.636025 + }, + { + "acc": 0.64058361, + "epoch": 0.6966007102993405, + "grad_norm": 5.21875, + "learning_rate": 7.758224455187632e-06, + "loss": 1.67555428, + "memory(GiB)": 107.26, + "step": 27460, + "train_speed(iter/s)": 1.636058 + }, + { + "acc": 0.65166955, + "epoch": 0.6967275494672754, + "grad_norm": 6.0625, + "learning_rate": 7.75734975945918e-06, + "loss": 1.67998333, + "memory(GiB)": 107.26, + "step": 27465, + "train_speed(iter/s)": 1.636087 + }, + { + "acc": 0.65085659, + "epoch": 0.6968543886352105, + "grad_norm": 7.15625, + "learning_rate": 7.756474942450174e-06, + "loss": 1.64790001, + "memory(GiB)": 107.26, + "step": 27470, + "train_speed(iter/s)": 1.63612 + }, + { + "acc": 0.66765256, + "epoch": 0.6969812278031456, + "grad_norm": 6.5625, + "learning_rate": 7.755600004199094e-06, + "loss": 1.54629698, + "memory(GiB)": 107.26, + "step": 27475, + "train_speed(iter/s)": 1.636153 + }, + { + "acc": 0.66047487, + "epoch": 0.6971080669710806, + "grad_norm": 6.0, + "learning_rate": 7.754724944744423e-06, + "loss": 1.60178394, + "memory(GiB)": 107.26, + "step": 27480, + "train_speed(iter/s)": 1.636184 + }, + { + "acc": 0.65821381, + "epoch": 0.6972349061390157, + "grad_norm": 6.5625, + "learning_rate": 7.753849764124648e-06, + "loss": 1.66496429, + "memory(GiB)": 107.26, + "step": 27485, + "train_speed(iter/s)": 1.636215 + }, + { + "acc": 0.64803553, + "epoch": 0.6973617453069508, + "grad_norm": 6.03125, + "learning_rate": 7.752974462378268e-06, + "loss": 1.63399925, + "memory(GiB)": 107.26, + "step": 27490, + "train_speed(iter/s)": 1.636247 + }, + { + "acc": 0.65273108, + "epoch": 0.6974885844748858, + "grad_norm": 5.0, + "learning_rate": 7.752099039543778e-06, + "loss": 1.62570057, + "memory(GiB)": 107.26, + "step": 27495, + "train_speed(iter/s)": 1.636279 + }, + { + "acc": 0.65321431, + "epoch": 0.6976154236428209, + "grad_norm": 6.03125, + "learning_rate": 7.751223495659685e-06, + "loss": 1.57066135, + "memory(GiB)": 107.26, + "step": 27500, + "train_speed(iter/s)": 1.636314 + }, + { + "acc": 0.65747986, + "epoch": 0.6977422628107559, + "grad_norm": 5.25, + "learning_rate": 7.7503478307645e-06, + "loss": 1.60095863, + "memory(GiB)": 107.26, + "step": 27505, + "train_speed(iter/s)": 1.636347 + }, + { + "acc": 0.66565981, + "epoch": 0.697869101978691, + "grad_norm": 5.34375, + "learning_rate": 7.74947204489674e-06, + "loss": 1.55961475, + "memory(GiB)": 107.26, + "step": 27510, + "train_speed(iter/s)": 1.636379 + }, + { + "acc": 0.64922576, + "epoch": 0.6979959411466261, + "grad_norm": 6.53125, + "learning_rate": 7.748596138094922e-06, + "loss": 1.66757259, + "memory(GiB)": 107.26, + "step": 27515, + "train_speed(iter/s)": 1.636409 + }, + { + "acc": 0.62690125, + "epoch": 0.6981227803145611, + "grad_norm": 5.84375, + "learning_rate": 7.747720110397573e-06, + "loss": 1.68180904, + "memory(GiB)": 107.26, + "step": 27520, + "train_speed(iter/s)": 1.636439 + }, + { + "acc": 0.65505948, + "epoch": 0.6982496194824962, + "grad_norm": 5.4375, + "learning_rate": 7.746843961843226e-06, + "loss": 1.64896736, + "memory(GiB)": 107.26, + "step": 27525, + "train_speed(iter/s)": 1.63647 + }, + { + "acc": 0.67042408, + "epoch": 0.6983764586504313, + "grad_norm": 6.28125, + "learning_rate": 7.74596769247042e-06, + "loss": 1.5555418, + "memory(GiB)": 107.26, + "step": 27530, + "train_speed(iter/s)": 1.636502 + }, + { + "acc": 0.66162472, + "epoch": 0.6985032978183663, + "grad_norm": 5.875, + "learning_rate": 7.745091302317694e-06, + "loss": 1.58649368, + "memory(GiB)": 107.26, + "step": 27535, + "train_speed(iter/s)": 1.636536 + }, + { + "acc": 0.66233625, + "epoch": 0.6986301369863014, + "grad_norm": 5.78125, + "learning_rate": 7.744214791423597e-06, + "loss": 1.6265234, + "memory(GiB)": 107.26, + "step": 27540, + "train_speed(iter/s)": 1.636569 + }, + { + "acc": 0.63770685, + "epoch": 0.6987569761542364, + "grad_norm": 5.65625, + "learning_rate": 7.74333815982668e-06, + "loss": 1.65318565, + "memory(GiB)": 107.26, + "step": 27545, + "train_speed(iter/s)": 1.636602 + }, + { + "acc": 0.64911213, + "epoch": 0.6988838153221715, + "grad_norm": 5.1875, + "learning_rate": 7.742461407565504e-06, + "loss": 1.58032379, + "memory(GiB)": 107.26, + "step": 27550, + "train_speed(iter/s)": 1.636635 + }, + { + "acc": 0.65556951, + "epoch": 0.6990106544901066, + "grad_norm": 5.03125, + "learning_rate": 7.741584534678632e-06, + "loss": 1.64268456, + "memory(GiB)": 107.26, + "step": 27555, + "train_speed(iter/s)": 1.636668 + }, + { + "acc": 0.66315279, + "epoch": 0.6991374936580416, + "grad_norm": 5.21875, + "learning_rate": 7.74070754120463e-06, + "loss": 1.6171772, + "memory(GiB)": 107.26, + "step": 27560, + "train_speed(iter/s)": 1.636702 + }, + { + "acc": 0.65602617, + "epoch": 0.6992643328259767, + "grad_norm": 5.65625, + "learning_rate": 7.739830427182073e-06, + "loss": 1.64129524, + "memory(GiB)": 107.26, + "step": 27565, + "train_speed(iter/s)": 1.636734 + }, + { + "acc": 0.65914369, + "epoch": 0.6993911719939118, + "grad_norm": 6.375, + "learning_rate": 7.738953192649544e-06, + "loss": 1.60509415, + "memory(GiB)": 107.26, + "step": 27570, + "train_speed(iter/s)": 1.636765 + }, + { + "acc": 0.65117669, + "epoch": 0.6995180111618468, + "grad_norm": 5.34375, + "learning_rate": 7.738075837645625e-06, + "loss": 1.62055225, + "memory(GiB)": 107.26, + "step": 27575, + "train_speed(iter/s)": 1.636799 + }, + { + "acc": 0.66142426, + "epoch": 0.6996448503297819, + "grad_norm": 6.28125, + "learning_rate": 7.737198362208904e-06, + "loss": 1.61629028, + "memory(GiB)": 107.26, + "step": 27580, + "train_speed(iter/s)": 1.636829 + }, + { + "acc": 0.65815783, + "epoch": 0.6997716894977168, + "grad_norm": 7.15625, + "learning_rate": 7.736320766377978e-06, + "loss": 1.6959465, + "memory(GiB)": 107.26, + "step": 27585, + "train_speed(iter/s)": 1.636861 + }, + { + "acc": 0.64031978, + "epoch": 0.6998985286656519, + "grad_norm": 5.5625, + "learning_rate": 7.735443050191452e-06, + "loss": 1.59045753, + "memory(GiB)": 107.26, + "step": 27590, + "train_speed(iter/s)": 1.636893 + }, + { + "acc": 0.64829602, + "epoch": 0.700025367833587, + "grad_norm": 5.5, + "learning_rate": 7.734565213687923e-06, + "loss": 1.58005152, + "memory(GiB)": 107.26, + "step": 27595, + "train_speed(iter/s)": 1.636925 + }, + { + "acc": 0.65510674, + "epoch": 0.700152207001522, + "grad_norm": 4.9375, + "learning_rate": 7.733687256906009e-06, + "loss": 1.64164505, + "memory(GiB)": 107.26, + "step": 27600, + "train_speed(iter/s)": 1.636958 + }, + { + "acc": 0.66025529, + "epoch": 0.7002790461694571, + "grad_norm": 5.15625, + "learning_rate": 7.732809179884324e-06, + "loss": 1.59567947, + "memory(GiB)": 107.26, + "step": 27605, + "train_speed(iter/s)": 1.63699 + }, + { + "acc": 0.6611599, + "epoch": 0.7004058853373922, + "grad_norm": 6.09375, + "learning_rate": 7.73193098266149e-06, + "loss": 1.58921146, + "memory(GiB)": 107.26, + "step": 27610, + "train_speed(iter/s)": 1.637022 + }, + { + "acc": 0.64529257, + "epoch": 0.7005327245053272, + "grad_norm": 6.71875, + "learning_rate": 7.731052665276135e-06, + "loss": 1.67083435, + "memory(GiB)": 107.26, + "step": 27615, + "train_speed(iter/s)": 1.637057 + }, + { + "acc": 0.66823578, + "epoch": 0.7006595636732623, + "grad_norm": 6.75, + "learning_rate": 7.730174227766892e-06, + "loss": 1.57269411, + "memory(GiB)": 107.26, + "step": 27620, + "train_speed(iter/s)": 1.637089 + }, + { + "acc": 0.65937729, + "epoch": 0.7007864028411973, + "grad_norm": 6.28125, + "learning_rate": 7.729295670172394e-06, + "loss": 1.59853535, + "memory(GiB)": 107.26, + "step": 27625, + "train_speed(iter/s)": 1.63712 + }, + { + "acc": 0.66014252, + "epoch": 0.7009132420091324, + "grad_norm": 6.09375, + "learning_rate": 7.728416992531287e-06, + "loss": 1.60750999, + "memory(GiB)": 107.26, + "step": 27630, + "train_speed(iter/s)": 1.637153 + }, + { + "acc": 0.65903234, + "epoch": 0.7010400811770675, + "grad_norm": 6.4375, + "learning_rate": 7.72753819488222e-06, + "loss": 1.64045296, + "memory(GiB)": 107.26, + "step": 27635, + "train_speed(iter/s)": 1.637185 + }, + { + "acc": 0.65994263, + "epoch": 0.7011669203450025, + "grad_norm": 5.3125, + "learning_rate": 7.726659277263848e-06, + "loss": 1.58435211, + "memory(GiB)": 107.26, + "step": 27640, + "train_speed(iter/s)": 1.637217 + }, + { + "acc": 0.65562024, + "epoch": 0.7012937595129376, + "grad_norm": 9.9375, + "learning_rate": 7.725780239714824e-06, + "loss": 1.66056213, + "memory(GiB)": 107.26, + "step": 27645, + "train_speed(iter/s)": 1.637248 + }, + { + "acc": 0.65208654, + "epoch": 0.7014205986808727, + "grad_norm": 6.1875, + "learning_rate": 7.724901082273817e-06, + "loss": 1.57872782, + "memory(GiB)": 107.26, + "step": 27650, + "train_speed(iter/s)": 1.63728 + }, + { + "acc": 0.65807223, + "epoch": 0.7015474378488077, + "grad_norm": 6.09375, + "learning_rate": 7.724021804979493e-06, + "loss": 1.57590771, + "memory(GiB)": 107.26, + "step": 27655, + "train_speed(iter/s)": 1.63731 + }, + { + "acc": 0.65873418, + "epoch": 0.7016742770167428, + "grad_norm": 6.15625, + "learning_rate": 7.723142407870532e-06, + "loss": 1.61198368, + "memory(GiB)": 107.26, + "step": 27660, + "train_speed(iter/s)": 1.637341 + }, + { + "acc": 0.67246671, + "epoch": 0.7018011161846778, + "grad_norm": 5.59375, + "learning_rate": 7.722262890985605e-06, + "loss": 1.60879211, + "memory(GiB)": 107.26, + "step": 27665, + "train_speed(iter/s)": 1.637373 + }, + { + "acc": 0.64914808, + "epoch": 0.7019279553526129, + "grad_norm": 5.5, + "learning_rate": 7.721383254363407e-06, + "loss": 1.65706635, + "memory(GiB)": 107.26, + "step": 27670, + "train_speed(iter/s)": 1.637405 + }, + { + "acc": 0.67284093, + "epoch": 0.702054794520548, + "grad_norm": 6.875, + "learning_rate": 7.72050349804262e-06, + "loss": 1.52197485, + "memory(GiB)": 107.26, + "step": 27675, + "train_speed(iter/s)": 1.637436 + }, + { + "acc": 0.66381187, + "epoch": 0.702181633688483, + "grad_norm": 5.09375, + "learning_rate": 7.719623622061943e-06, + "loss": 1.5586031, + "memory(GiB)": 107.26, + "step": 27680, + "train_speed(iter/s)": 1.637468 + }, + { + "acc": 0.64491067, + "epoch": 0.7023084728564181, + "grad_norm": 5.6875, + "learning_rate": 7.718743626460076e-06, + "loss": 1.66212597, + "memory(GiB)": 107.26, + "step": 27685, + "train_speed(iter/s)": 1.6375 + }, + { + "acc": 0.66848497, + "epoch": 0.7024353120243532, + "grad_norm": 5.28125, + "learning_rate": 7.71786351127573e-06, + "loss": 1.53633022, + "memory(GiB)": 107.26, + "step": 27690, + "train_speed(iter/s)": 1.637532 + }, + { + "acc": 0.64966307, + "epoch": 0.7025621511922882, + "grad_norm": 5.59375, + "learning_rate": 7.71698327654761e-06, + "loss": 1.59696503, + "memory(GiB)": 107.26, + "step": 27695, + "train_speed(iter/s)": 1.637563 + }, + { + "acc": 0.65377159, + "epoch": 0.7026889903602233, + "grad_norm": 6.0625, + "learning_rate": 7.716102922314435e-06, + "loss": 1.60128994, + "memory(GiB)": 107.26, + "step": 27700, + "train_speed(iter/s)": 1.637593 + }, + { + "acc": 0.6383997, + "epoch": 0.7028158295281582, + "grad_norm": 5.59375, + "learning_rate": 7.715222448614926e-06, + "loss": 1.7098875, + "memory(GiB)": 107.26, + "step": 27705, + "train_speed(iter/s)": 1.637624 + }, + { + "acc": 0.65988851, + "epoch": 0.7029426686960933, + "grad_norm": 6.84375, + "learning_rate": 7.714341855487812e-06, + "loss": 1.61131897, + "memory(GiB)": 107.26, + "step": 27710, + "train_speed(iter/s)": 1.637653 + }, + { + "acc": 0.64717531, + "epoch": 0.7030695078640284, + "grad_norm": 5.0625, + "learning_rate": 7.713461142971824e-06, + "loss": 1.5954669, + "memory(GiB)": 107.26, + "step": 27715, + "train_speed(iter/s)": 1.637686 + }, + { + "acc": 0.6708745, + "epoch": 0.7031963470319634, + "grad_norm": 5.9375, + "learning_rate": 7.712580311105701e-06, + "loss": 1.59094448, + "memory(GiB)": 107.26, + "step": 27720, + "train_speed(iter/s)": 1.637716 + }, + { + "acc": 0.65302525, + "epoch": 0.7033231861998985, + "grad_norm": 5.1875, + "learning_rate": 7.711699359928184e-06, + "loss": 1.55217676, + "memory(GiB)": 107.26, + "step": 27725, + "train_speed(iter/s)": 1.637749 + }, + { + "acc": 0.6635447, + "epoch": 0.7034500253678336, + "grad_norm": 6.5625, + "learning_rate": 7.710818289478024e-06, + "loss": 1.58479214, + "memory(GiB)": 107.26, + "step": 27730, + "train_speed(iter/s)": 1.637783 + }, + { + "acc": 0.65795183, + "epoch": 0.7035768645357686, + "grad_norm": 5.625, + "learning_rate": 7.709937099793971e-06, + "loss": 1.6035223, + "memory(GiB)": 107.26, + "step": 27735, + "train_speed(iter/s)": 1.637814 + }, + { + "acc": 0.64138708, + "epoch": 0.7037037037037037, + "grad_norm": 6.34375, + "learning_rate": 7.709055790914787e-06, + "loss": 1.64339561, + "memory(GiB)": 107.26, + "step": 27740, + "train_speed(iter/s)": 1.637848 + }, + { + "acc": 0.65108232, + "epoch": 0.7038305428716387, + "grad_norm": 5.84375, + "learning_rate": 7.708174362879234e-06, + "loss": 1.60807934, + "memory(GiB)": 107.26, + "step": 27745, + "train_speed(iter/s)": 1.637878 + }, + { + "acc": 0.64732895, + "epoch": 0.7039573820395738, + "grad_norm": 5.34375, + "learning_rate": 7.70729281572608e-06, + "loss": 1.66552467, + "memory(GiB)": 107.26, + "step": 27750, + "train_speed(iter/s)": 1.637911 + }, + { + "acc": 0.63947697, + "epoch": 0.7040842212075089, + "grad_norm": 5.375, + "learning_rate": 7.706411149494102e-06, + "loss": 1.6603775, + "memory(GiB)": 107.26, + "step": 27755, + "train_speed(iter/s)": 1.637942 + }, + { + "acc": 0.6579258, + "epoch": 0.7042110603754439, + "grad_norm": 6.03125, + "learning_rate": 7.705529364222079e-06, + "loss": 1.63553219, + "memory(GiB)": 107.26, + "step": 27760, + "train_speed(iter/s)": 1.637975 + }, + { + "acc": 0.65640154, + "epoch": 0.704337899543379, + "grad_norm": 6.84375, + "learning_rate": 7.704647459948793e-06, + "loss": 1.5791626, + "memory(GiB)": 107.26, + "step": 27765, + "train_speed(iter/s)": 1.638007 + }, + { + "acc": 0.65575662, + "epoch": 0.7044647387113141, + "grad_norm": 5.40625, + "learning_rate": 7.703765436713038e-06, + "loss": 1.60827332, + "memory(GiB)": 107.26, + "step": 27770, + "train_speed(iter/s)": 1.638041 + }, + { + "acc": 0.67407074, + "epoch": 0.7045915778792491, + "grad_norm": 5.75, + "learning_rate": 7.702883294553607e-06, + "loss": 1.53509302, + "memory(GiB)": 107.26, + "step": 27775, + "train_speed(iter/s)": 1.638073 + }, + { + "acc": 0.66179962, + "epoch": 0.7047184170471842, + "grad_norm": 5.34375, + "learning_rate": 7.702001033509302e-06, + "loss": 1.53914547, + "memory(GiB)": 107.26, + "step": 27780, + "train_speed(iter/s)": 1.638104 + }, + { + "acc": 0.63739076, + "epoch": 0.7048452562151192, + "grad_norm": 4.84375, + "learning_rate": 7.701118653618927e-06, + "loss": 1.63914642, + "memory(GiB)": 107.26, + "step": 27785, + "train_speed(iter/s)": 1.638136 + }, + { + "acc": 0.64568243, + "epoch": 0.7049720953830543, + "grad_norm": 5.78125, + "learning_rate": 7.700236154921294e-06, + "loss": 1.60901546, + "memory(GiB)": 107.26, + "step": 27790, + "train_speed(iter/s)": 1.638167 + }, + { + "acc": 0.64461689, + "epoch": 0.7050989345509894, + "grad_norm": 5.0625, + "learning_rate": 7.699353537455222e-06, + "loss": 1.63301888, + "memory(GiB)": 107.26, + "step": 27795, + "train_speed(iter/s)": 1.638199 + }, + { + "acc": 0.64643307, + "epoch": 0.7052257737189244, + "grad_norm": 5.15625, + "learning_rate": 7.698470801259526e-06, + "loss": 1.65585098, + "memory(GiB)": 107.26, + "step": 27800, + "train_speed(iter/s)": 1.638232 + }, + { + "acc": 0.66306343, + "epoch": 0.7053526128868595, + "grad_norm": 5.9375, + "learning_rate": 7.697587946373037e-06, + "loss": 1.63212433, + "memory(GiB)": 107.26, + "step": 27805, + "train_speed(iter/s)": 1.638265 + }, + { + "acc": 0.65107331, + "epoch": 0.7054794520547946, + "grad_norm": 5.0, + "learning_rate": 7.696704972834589e-06, + "loss": 1.59155254, + "memory(GiB)": 107.26, + "step": 27810, + "train_speed(iter/s)": 1.638296 + }, + { + "acc": 0.66268091, + "epoch": 0.7056062912227296, + "grad_norm": 5.375, + "learning_rate": 7.695821880683012e-06, + "loss": 1.62391586, + "memory(GiB)": 107.26, + "step": 27815, + "train_speed(iter/s)": 1.638329 + }, + { + "acc": 0.65237837, + "epoch": 0.7057331303906647, + "grad_norm": 6.59375, + "learning_rate": 7.694938669957156e-06, + "loss": 1.58336897, + "memory(GiB)": 107.26, + "step": 27820, + "train_speed(iter/s)": 1.638361 + }, + { + "acc": 0.6549633, + "epoch": 0.7058599695585996, + "grad_norm": 5.875, + "learning_rate": 7.694055340695862e-06, + "loss": 1.59736347, + "memory(GiB)": 107.26, + "step": 27825, + "train_speed(iter/s)": 1.638391 + }, + { + "acc": 0.66499166, + "epoch": 0.7059868087265347, + "grad_norm": 7.625, + "learning_rate": 7.693171892937991e-06, + "loss": 1.58155193, + "memory(GiB)": 107.26, + "step": 27830, + "train_speed(iter/s)": 1.638425 + }, + { + "acc": 0.65267334, + "epoch": 0.7061136478944698, + "grad_norm": 5.28125, + "learning_rate": 7.692288326722393e-06, + "loss": 1.65204754, + "memory(GiB)": 107.26, + "step": 27835, + "train_speed(iter/s)": 1.638457 + }, + { + "acc": 0.65091667, + "epoch": 0.7062404870624048, + "grad_norm": 6.125, + "learning_rate": 7.691404642087933e-06, + "loss": 1.59223194, + "memory(GiB)": 107.26, + "step": 27840, + "train_speed(iter/s)": 1.637969 + }, + { + "acc": 0.6709857, + "epoch": 0.7063673262303399, + "grad_norm": 6.125, + "learning_rate": 7.690520839073484e-06, + "loss": 1.57745066, + "memory(GiB)": 107.26, + "step": 27845, + "train_speed(iter/s)": 1.637998 + }, + { + "acc": 0.65187516, + "epoch": 0.706494165398275, + "grad_norm": 6.15625, + "learning_rate": 7.689636917717913e-06, + "loss": 1.5948163, + "memory(GiB)": 107.26, + "step": 27850, + "train_speed(iter/s)": 1.637483 + }, + { + "acc": 0.66029778, + "epoch": 0.70662100456621, + "grad_norm": 5.28125, + "learning_rate": 7.688752878060103e-06, + "loss": 1.62348709, + "memory(GiB)": 107.26, + "step": 27855, + "train_speed(iter/s)": 1.637514 + }, + { + "acc": 0.65179682, + "epoch": 0.7067478437341451, + "grad_norm": 5.0625, + "learning_rate": 7.687868720138939e-06, + "loss": 1.56717014, + "memory(GiB)": 107.26, + "step": 27860, + "train_speed(iter/s)": 1.637547 + }, + { + "acc": 0.65384226, + "epoch": 0.7068746829020801, + "grad_norm": 6.90625, + "learning_rate": 7.686984443993304e-06, + "loss": 1.6424366, + "memory(GiB)": 107.26, + "step": 27865, + "train_speed(iter/s)": 1.637578 + }, + { + "acc": 0.64773078, + "epoch": 0.7070015220700152, + "grad_norm": 5.40625, + "learning_rate": 7.6861000496621e-06, + "loss": 1.63153687, + "memory(GiB)": 107.26, + "step": 27870, + "train_speed(iter/s)": 1.637609 + }, + { + "acc": 0.65964775, + "epoch": 0.7071283612379503, + "grad_norm": 6.21875, + "learning_rate": 7.685215537184223e-06, + "loss": 1.65231895, + "memory(GiB)": 107.26, + "step": 27875, + "train_speed(iter/s)": 1.637639 + }, + { + "acc": 0.65603256, + "epoch": 0.7072552004058853, + "grad_norm": 6.0, + "learning_rate": 7.684330906598577e-06, + "loss": 1.66472816, + "memory(GiB)": 107.26, + "step": 27880, + "train_speed(iter/s)": 1.63767 + }, + { + "acc": 0.66592264, + "epoch": 0.7073820395738204, + "grad_norm": 5.4375, + "learning_rate": 7.683446157944075e-06, + "loss": 1.57878933, + "memory(GiB)": 107.26, + "step": 27885, + "train_speed(iter/s)": 1.637699 + }, + { + "acc": 0.6455842, + "epoch": 0.7075088787417555, + "grad_norm": 6.0625, + "learning_rate": 7.682561291259628e-06, + "loss": 1.69266052, + "memory(GiB)": 107.26, + "step": 27890, + "train_speed(iter/s)": 1.637729 + }, + { + "acc": 0.65307169, + "epoch": 0.7076357179096905, + "grad_norm": 5.625, + "learning_rate": 7.681676306584159e-06, + "loss": 1.61092491, + "memory(GiB)": 107.26, + "step": 27895, + "train_speed(iter/s)": 1.637759 + }, + { + "acc": 0.65158167, + "epoch": 0.7077625570776256, + "grad_norm": 6.21875, + "learning_rate": 7.680791203956594e-06, + "loss": 1.56169558, + "memory(GiB)": 107.26, + "step": 27900, + "train_speed(iter/s)": 1.637792 + }, + { + "acc": 0.6487205, + "epoch": 0.7078893962455606, + "grad_norm": 6.625, + "learning_rate": 7.679905983415861e-06, + "loss": 1.65474091, + "memory(GiB)": 107.26, + "step": 27905, + "train_speed(iter/s)": 1.637823 + }, + { + "acc": 0.66449122, + "epoch": 0.7080162354134957, + "grad_norm": 5.71875, + "learning_rate": 7.6790206450009e-06, + "loss": 1.61333084, + "memory(GiB)": 107.26, + "step": 27910, + "train_speed(iter/s)": 1.637855 + }, + { + "acc": 0.64561214, + "epoch": 0.7081430745814308, + "grad_norm": 5.21875, + "learning_rate": 7.678135188750648e-06, + "loss": 1.61060162, + "memory(GiB)": 107.26, + "step": 27915, + "train_speed(iter/s)": 1.637886 + }, + { + "acc": 0.65609264, + "epoch": 0.7082699137493658, + "grad_norm": 5.625, + "learning_rate": 7.677249614704057e-06, + "loss": 1.59790611, + "memory(GiB)": 107.26, + "step": 27920, + "train_speed(iter/s)": 1.637917 + }, + { + "acc": 0.66631536, + "epoch": 0.7083967529173009, + "grad_norm": 5.34375, + "learning_rate": 7.676363922900073e-06, + "loss": 1.57596111, + "memory(GiB)": 107.26, + "step": 27925, + "train_speed(iter/s)": 1.637949 + }, + { + "acc": 0.63721027, + "epoch": 0.708523592085236, + "grad_norm": 6.0, + "learning_rate": 7.675478113377653e-06, + "loss": 1.672649, + "memory(GiB)": 107.26, + "step": 27930, + "train_speed(iter/s)": 1.63798 + }, + { + "acc": 0.65707998, + "epoch": 0.708650431253171, + "grad_norm": 6.21875, + "learning_rate": 7.674592186175762e-06, + "loss": 1.58273659, + "memory(GiB)": 107.26, + "step": 27935, + "train_speed(iter/s)": 1.638012 + }, + { + "acc": 0.66473184, + "epoch": 0.708777270421106, + "grad_norm": 4.8125, + "learning_rate": 7.673706141333365e-06, + "loss": 1.50540161, + "memory(GiB)": 107.26, + "step": 27940, + "train_speed(iter/s)": 1.638043 + }, + { + "acc": 0.64969101, + "epoch": 0.708904109589041, + "grad_norm": 5.28125, + "learning_rate": 7.672819978889435e-06, + "loss": 1.63706322, + "memory(GiB)": 107.26, + "step": 27945, + "train_speed(iter/s)": 1.638074 + }, + { + "acc": 0.65129633, + "epoch": 0.7090309487569761, + "grad_norm": 6.4375, + "learning_rate": 7.67193369888295e-06, + "loss": 1.64601135, + "memory(GiB)": 107.26, + "step": 27950, + "train_speed(iter/s)": 1.638106 + }, + { + "acc": 0.66796207, + "epoch": 0.7091577879249112, + "grad_norm": 6.5, + "learning_rate": 7.67104730135289e-06, + "loss": 1.52657471, + "memory(GiB)": 107.26, + "step": 27955, + "train_speed(iter/s)": 1.638139 + }, + { + "acc": 0.66234288, + "epoch": 0.7092846270928462, + "grad_norm": 5.625, + "learning_rate": 7.670160786338246e-06, + "loss": 1.62004318, + "memory(GiB)": 107.26, + "step": 27960, + "train_speed(iter/s)": 1.638171 + }, + { + "acc": 0.65799379, + "epoch": 0.7094114662607813, + "grad_norm": 5.53125, + "learning_rate": 7.669274153878006e-06, + "loss": 1.58321896, + "memory(GiB)": 107.26, + "step": 27965, + "train_speed(iter/s)": 1.638204 + }, + { + "acc": 0.64791522, + "epoch": 0.7095383054287164, + "grad_norm": 5.3125, + "learning_rate": 7.668387404011176e-06, + "loss": 1.65013142, + "memory(GiB)": 107.26, + "step": 27970, + "train_speed(iter/s)": 1.638237 + }, + { + "acc": 0.6491189, + "epoch": 0.7096651445966514, + "grad_norm": 5.90625, + "learning_rate": 7.667500536776748e-06, + "loss": 1.67357464, + "memory(GiB)": 107.26, + "step": 27975, + "train_speed(iter/s)": 1.638268 + }, + { + "acc": 0.65121598, + "epoch": 0.7097919837645865, + "grad_norm": 4.875, + "learning_rate": 7.666613552213742e-06, + "loss": 1.61239452, + "memory(GiB)": 107.26, + "step": 27980, + "train_speed(iter/s)": 1.6383 + }, + { + "acc": 0.66499119, + "epoch": 0.7099188229325215, + "grad_norm": 5.90625, + "learning_rate": 7.665726450361165e-06, + "loss": 1.57723751, + "memory(GiB)": 107.26, + "step": 27985, + "train_speed(iter/s)": 1.63833 + }, + { + "acc": 0.66060958, + "epoch": 0.7100456621004566, + "grad_norm": 4.625, + "learning_rate": 7.664839231258036e-06, + "loss": 1.60058346, + "memory(GiB)": 107.26, + "step": 27990, + "train_speed(iter/s)": 1.638361 + }, + { + "acc": 0.66210179, + "epoch": 0.7101725012683917, + "grad_norm": 6.09375, + "learning_rate": 7.663951894943383e-06, + "loss": 1.6458725, + "memory(GiB)": 107.26, + "step": 27995, + "train_speed(iter/s)": 1.638393 + }, + { + "acc": 0.64717278, + "epoch": 0.7102993404363267, + "grad_norm": 6.5625, + "learning_rate": 7.66306444145623e-06, + "loss": 1.6352869, + "memory(GiB)": 107.26, + "step": 28000, + "train_speed(iter/s)": 1.638427 + }, + { + "epoch": 0.7102993404363267, + "eval_acc": 0.6452535934527427, + "eval_loss": 1.5794938802719116, + "eval_runtime": 58.7015, + "eval_samples_per_second": 108.515, + "eval_steps_per_second": 27.137, + "step": 28000 + }, + { + "acc": 0.64372878, + "epoch": 0.7104261796042618, + "grad_norm": 6.59375, + "learning_rate": 7.662176870835614e-06, + "loss": 1.65335445, + "memory(GiB)": 107.26, + "step": 28005, + "train_speed(iter/s)": 1.632447 + }, + { + "acc": 0.66129799, + "epoch": 0.7105530187721969, + "grad_norm": 5.4375, + "learning_rate": 7.661289183120572e-06, + "loss": 1.58838711, + "memory(GiB)": 107.26, + "step": 28010, + "train_speed(iter/s)": 1.632478 + }, + { + "acc": 0.65851049, + "epoch": 0.7106798579401319, + "grad_norm": 6.0, + "learning_rate": 7.66040137835015e-06, + "loss": 1.63926582, + "memory(GiB)": 107.26, + "step": 28015, + "train_speed(iter/s)": 1.632507 + }, + { + "acc": 0.62963667, + "epoch": 0.710806697108067, + "grad_norm": 5.0, + "learning_rate": 7.659513456563399e-06, + "loss": 1.70481815, + "memory(GiB)": 107.26, + "step": 28020, + "train_speed(iter/s)": 1.632538 + }, + { + "acc": 0.66526031, + "epoch": 0.710933536276002, + "grad_norm": 6.25, + "learning_rate": 7.658625417799372e-06, + "loss": 1.58290081, + "memory(GiB)": 107.26, + "step": 28025, + "train_speed(iter/s)": 1.632569 + }, + { + "acc": 0.65998726, + "epoch": 0.7110603754439371, + "grad_norm": 5.71875, + "learning_rate": 7.657737262097128e-06, + "loss": 1.59762917, + "memory(GiB)": 107.26, + "step": 28030, + "train_speed(iter/s)": 1.6326 + }, + { + "acc": 0.66310759, + "epoch": 0.7111872146118722, + "grad_norm": 5.28125, + "learning_rate": 7.656848989495733e-06, + "loss": 1.58448563, + "memory(GiB)": 107.26, + "step": 28035, + "train_speed(iter/s)": 1.632633 + }, + { + "acc": 0.6720552, + "epoch": 0.7113140537798072, + "grad_norm": 5.75, + "learning_rate": 7.65596060003426e-06, + "loss": 1.62708893, + "memory(GiB)": 107.26, + "step": 28040, + "train_speed(iter/s)": 1.632663 + }, + { + "acc": 0.64487104, + "epoch": 0.7114408929477423, + "grad_norm": 8.125, + "learning_rate": 7.655072093751779e-06, + "loss": 1.7126873, + "memory(GiB)": 107.26, + "step": 28045, + "train_speed(iter/s)": 1.632694 + }, + { + "acc": 0.65357685, + "epoch": 0.7115677321156774, + "grad_norm": 6.5, + "learning_rate": 7.654183470687375e-06, + "loss": 1.61941929, + "memory(GiB)": 107.26, + "step": 28050, + "train_speed(iter/s)": 1.632724 + }, + { + "acc": 0.66873689, + "epoch": 0.7116945712836124, + "grad_norm": 6.53125, + "learning_rate": 7.653294730880131e-06, + "loss": 1.56383419, + "memory(GiB)": 107.26, + "step": 28055, + "train_speed(iter/s)": 1.632757 + }, + { + "acc": 0.66259618, + "epoch": 0.7118214104515475, + "grad_norm": 6.4375, + "learning_rate": 7.65240587436914e-06, + "loss": 1.52600517, + "memory(GiB)": 107.26, + "step": 28060, + "train_speed(iter/s)": 1.63279 + }, + { + "acc": 0.65123034, + "epoch": 0.7119482496194824, + "grad_norm": 5.71875, + "learning_rate": 7.651516901193494e-06, + "loss": 1.68130493, + "memory(GiB)": 107.26, + "step": 28065, + "train_speed(iter/s)": 1.632822 + }, + { + "acc": 0.66621528, + "epoch": 0.7120750887874175, + "grad_norm": 5.59375, + "learning_rate": 7.650627811392298e-06, + "loss": 1.5240799, + "memory(GiB)": 107.26, + "step": 28070, + "train_speed(iter/s)": 1.632854 + }, + { + "acc": 0.63887463, + "epoch": 0.7122019279553526, + "grad_norm": 5.625, + "learning_rate": 7.649738605004658e-06, + "loss": 1.63159866, + "memory(GiB)": 107.26, + "step": 28075, + "train_speed(iter/s)": 1.632885 + }, + { + "acc": 0.66605206, + "epoch": 0.7123287671232876, + "grad_norm": 5.0625, + "learning_rate": 7.648849282069682e-06, + "loss": 1.5066618, + "memory(GiB)": 107.26, + "step": 28080, + "train_speed(iter/s)": 1.632915 + }, + { + "acc": 0.65374641, + "epoch": 0.7124556062912227, + "grad_norm": 5.15625, + "learning_rate": 7.647959842626489e-06, + "loss": 1.63920403, + "memory(GiB)": 107.26, + "step": 28085, + "train_speed(iter/s)": 1.632948 + }, + { + "acc": 0.64618883, + "epoch": 0.7125824454591578, + "grad_norm": 5.28125, + "learning_rate": 7.6470702867142e-06, + "loss": 1.63928795, + "memory(GiB)": 107.26, + "step": 28090, + "train_speed(iter/s)": 1.632978 + }, + { + "acc": 0.6400157, + "epoch": 0.7127092846270928, + "grad_norm": 5.34375, + "learning_rate": 7.646180614371941e-06, + "loss": 1.64554939, + "memory(GiB)": 107.26, + "step": 28095, + "train_speed(iter/s)": 1.633009 + }, + { + "acc": 0.65022769, + "epoch": 0.7128361237950279, + "grad_norm": 6.875, + "learning_rate": 7.645290825638845e-06, + "loss": 1.62553387, + "memory(GiB)": 107.26, + "step": 28100, + "train_speed(iter/s)": 1.633042 + }, + { + "acc": 0.65212383, + "epoch": 0.7129629629629629, + "grad_norm": 7.25, + "learning_rate": 7.644400920554048e-06, + "loss": 1.61919861, + "memory(GiB)": 107.26, + "step": 28105, + "train_speed(iter/s)": 1.633072 + }, + { + "acc": 0.65708828, + "epoch": 0.713089802130898, + "grad_norm": 5.0, + "learning_rate": 7.64351089915669e-06, + "loss": 1.65823097, + "memory(GiB)": 107.26, + "step": 28110, + "train_speed(iter/s)": 1.633105 + }, + { + "acc": 0.68063121, + "epoch": 0.7132166412988331, + "grad_norm": 5.71875, + "learning_rate": 7.642620761485921e-06, + "loss": 1.51957417, + "memory(GiB)": 107.26, + "step": 28115, + "train_speed(iter/s)": 1.633137 + }, + { + "acc": 0.65642333, + "epoch": 0.7133434804667681, + "grad_norm": 5.3125, + "learning_rate": 7.641730507580896e-06, + "loss": 1.59378891, + "memory(GiB)": 107.26, + "step": 28120, + "train_speed(iter/s)": 1.633171 + }, + { + "acc": 0.64085526, + "epoch": 0.7134703196347032, + "grad_norm": 5.25, + "learning_rate": 7.640840137480763e-06, + "loss": 1.67423286, + "memory(GiB)": 107.26, + "step": 28125, + "train_speed(iter/s)": 1.633202 + }, + { + "acc": 0.65447989, + "epoch": 0.7135971588026383, + "grad_norm": 5.6875, + "learning_rate": 7.639949651224697e-06, + "loss": 1.56445322, + "memory(GiB)": 107.26, + "step": 28130, + "train_speed(iter/s)": 1.633233 + }, + { + "acc": 0.64223018, + "epoch": 0.7137239979705733, + "grad_norm": 5.625, + "learning_rate": 7.639059048851853e-06, + "loss": 1.67291393, + "memory(GiB)": 107.26, + "step": 28135, + "train_speed(iter/s)": 1.633258 + }, + { + "acc": 0.65768952, + "epoch": 0.7138508371385084, + "grad_norm": 5.4375, + "learning_rate": 7.638168330401412e-06, + "loss": 1.56363239, + "memory(GiB)": 107.26, + "step": 28140, + "train_speed(iter/s)": 1.633288 + }, + { + "acc": 0.65164928, + "epoch": 0.7139776763064434, + "grad_norm": 7.4375, + "learning_rate": 7.637277495912548e-06, + "loss": 1.58314877, + "memory(GiB)": 107.26, + "step": 28145, + "train_speed(iter/s)": 1.633318 + }, + { + "acc": 0.66595659, + "epoch": 0.7141045154743785, + "grad_norm": 5.1875, + "learning_rate": 7.636386545424447e-06, + "loss": 1.52089405, + "memory(GiB)": 107.26, + "step": 28150, + "train_speed(iter/s)": 1.633349 + }, + { + "acc": 0.64813285, + "epoch": 0.7142313546423136, + "grad_norm": 7.0, + "learning_rate": 7.635495478976294e-06, + "loss": 1.61723919, + "memory(GiB)": 107.26, + "step": 28155, + "train_speed(iter/s)": 1.633382 + }, + { + "acc": 0.64786263, + "epoch": 0.7143581938102486, + "grad_norm": 6.53125, + "learning_rate": 7.634604296607285e-06, + "loss": 1.70086632, + "memory(GiB)": 107.26, + "step": 28160, + "train_speed(iter/s)": 1.633414 + }, + { + "acc": 0.66287107, + "epoch": 0.7144850329781837, + "grad_norm": 5.625, + "learning_rate": 7.633712998356612e-06, + "loss": 1.52047501, + "memory(GiB)": 107.26, + "step": 28165, + "train_speed(iter/s)": 1.633446 + }, + { + "acc": 0.64548182, + "epoch": 0.7146118721461188, + "grad_norm": 6.5625, + "learning_rate": 7.632821584263486e-06, + "loss": 1.66940746, + "memory(GiB)": 107.26, + "step": 28170, + "train_speed(iter/s)": 1.633479 + }, + { + "acc": 0.65837064, + "epoch": 0.7147387113140538, + "grad_norm": 5.4375, + "learning_rate": 7.631930054367112e-06, + "loss": 1.5959218, + "memory(GiB)": 107.26, + "step": 28175, + "train_speed(iter/s)": 1.633511 + }, + { + "acc": 0.65959353, + "epoch": 0.7148655504819889, + "grad_norm": 5.1875, + "learning_rate": 7.631038408706703e-06, + "loss": 1.55693836, + "memory(GiB)": 107.26, + "step": 28180, + "train_speed(iter/s)": 1.633544 + }, + { + "acc": 0.64711881, + "epoch": 0.7149923896499238, + "grad_norm": 6.0625, + "learning_rate": 7.630146647321476e-06, + "loss": 1.60683556, + "memory(GiB)": 107.26, + "step": 28185, + "train_speed(iter/s)": 1.633574 + }, + { + "acc": 0.65769544, + "epoch": 0.7151192288178589, + "grad_norm": 5.0625, + "learning_rate": 7.62925477025066e-06, + "loss": 1.67220135, + "memory(GiB)": 107.26, + "step": 28190, + "train_speed(iter/s)": 1.633606 + }, + { + "acc": 0.64771824, + "epoch": 0.715246067985794, + "grad_norm": 5.53125, + "learning_rate": 7.628362777533479e-06, + "loss": 1.6699955, + "memory(GiB)": 107.26, + "step": 28195, + "train_speed(iter/s)": 1.633638 + }, + { + "acc": 0.64890823, + "epoch": 0.715372907153729, + "grad_norm": 6.0625, + "learning_rate": 7.627470669209169e-06, + "loss": 1.60386066, + "memory(GiB)": 107.26, + "step": 28200, + "train_speed(iter/s)": 1.633671 + }, + { + "acc": 0.66130986, + "epoch": 0.7154997463216641, + "grad_norm": 5.25, + "learning_rate": 7.626578445316968e-06, + "loss": 1.58195038, + "memory(GiB)": 107.26, + "step": 28205, + "train_speed(iter/s)": 1.633704 + }, + { + "acc": 0.65298753, + "epoch": 0.7156265854895992, + "grad_norm": 7.09375, + "learning_rate": 7.62568610589612e-06, + "loss": 1.62716141, + "memory(GiB)": 107.26, + "step": 28210, + "train_speed(iter/s)": 1.633737 + }, + { + "acc": 0.64660292, + "epoch": 0.7157534246575342, + "grad_norm": 7.25, + "learning_rate": 7.624793650985873e-06, + "loss": 1.65130424, + "memory(GiB)": 107.26, + "step": 28215, + "train_speed(iter/s)": 1.633772 + }, + { + "acc": 0.64226384, + "epoch": 0.7158802638254693, + "grad_norm": 5.3125, + "learning_rate": 7.6239010806254835e-06, + "loss": 1.62994423, + "memory(GiB)": 107.26, + "step": 28220, + "train_speed(iter/s)": 1.633803 + }, + { + "acc": 0.64463453, + "epoch": 0.7160071029934043, + "grad_norm": 5.78125, + "learning_rate": 7.6230083948542084e-06, + "loss": 1.65804558, + "memory(GiB)": 107.26, + "step": 28225, + "train_speed(iter/s)": 1.633835 + }, + { + "acc": 0.66637888, + "epoch": 0.7161339421613394, + "grad_norm": 6.125, + "learning_rate": 7.622115593711314e-06, + "loss": 1.53976002, + "memory(GiB)": 107.26, + "step": 28230, + "train_speed(iter/s)": 1.633868 + }, + { + "acc": 0.66092119, + "epoch": 0.7162607813292745, + "grad_norm": 5.15625, + "learning_rate": 7.62122267723607e-06, + "loss": 1.58150387, + "memory(GiB)": 107.26, + "step": 28235, + "train_speed(iter/s)": 1.633901 + }, + { + "acc": 0.66293249, + "epoch": 0.7163876204972095, + "grad_norm": 4.71875, + "learning_rate": 7.620329645467748e-06, + "loss": 1.51172228, + "memory(GiB)": 107.26, + "step": 28240, + "train_speed(iter/s)": 1.633933 + }, + { + "acc": 0.64189219, + "epoch": 0.7165144596651446, + "grad_norm": 5.625, + "learning_rate": 7.61943649844563e-06, + "loss": 1.72075176, + "memory(GiB)": 107.26, + "step": 28245, + "train_speed(iter/s)": 1.633967 + }, + { + "acc": 0.64627934, + "epoch": 0.7166412988330797, + "grad_norm": 5.34375, + "learning_rate": 7.618543236209001e-06, + "loss": 1.66811256, + "memory(GiB)": 107.26, + "step": 28250, + "train_speed(iter/s)": 1.634 + }, + { + "acc": 0.64912987, + "epoch": 0.7167681380010147, + "grad_norm": 5.3125, + "learning_rate": 7.617649858797147e-06, + "loss": 1.6408947, + "memory(GiB)": 107.26, + "step": 28255, + "train_speed(iter/s)": 1.634034 + }, + { + "acc": 0.65791235, + "epoch": 0.7168949771689498, + "grad_norm": 5.4375, + "learning_rate": 7.616756366249367e-06, + "loss": 1.6241497, + "memory(GiB)": 107.26, + "step": 28260, + "train_speed(iter/s)": 1.634066 + }, + { + "acc": 0.64235945, + "epoch": 0.7170218163368848, + "grad_norm": 4.71875, + "learning_rate": 7.6158627586049586e-06, + "loss": 1.68135319, + "memory(GiB)": 107.26, + "step": 28265, + "train_speed(iter/s)": 1.634097 + }, + { + "acc": 0.66701756, + "epoch": 0.7171486555048199, + "grad_norm": 5.21875, + "learning_rate": 7.614969035903228e-06, + "loss": 1.53536587, + "memory(GiB)": 107.26, + "step": 28270, + "train_speed(iter/s)": 1.634129 + }, + { + "acc": 0.6596734, + "epoch": 0.717275494672755, + "grad_norm": 5.6875, + "learning_rate": 7.614075198183482e-06, + "loss": 1.55323257, + "memory(GiB)": 107.26, + "step": 28275, + "train_speed(iter/s)": 1.634161 + }, + { + "acc": 0.65606174, + "epoch": 0.71740233384069, + "grad_norm": 8.0625, + "learning_rate": 7.6131812454850406e-06, + "loss": 1.6242775, + "memory(GiB)": 107.26, + "step": 28280, + "train_speed(iter/s)": 1.634194 + }, + { + "acc": 0.65724993, + "epoch": 0.7175291730086251, + "grad_norm": 5.96875, + "learning_rate": 7.612287177847219e-06, + "loss": 1.62805672, + "memory(GiB)": 107.26, + "step": 28285, + "train_speed(iter/s)": 1.634226 + }, + { + "acc": 0.6599793, + "epoch": 0.7176560121765602, + "grad_norm": 5.125, + "learning_rate": 7.611392995309345e-06, + "loss": 1.61015854, + "memory(GiB)": 107.26, + "step": 28290, + "train_speed(iter/s)": 1.634256 + }, + { + "acc": 0.66009569, + "epoch": 0.7177828513444952, + "grad_norm": 6.28125, + "learning_rate": 7.610498697910748e-06, + "loss": 1.55302458, + "memory(GiB)": 107.26, + "step": 28295, + "train_speed(iter/s)": 1.634284 + }, + { + "acc": 0.65794744, + "epoch": 0.7179096905124303, + "grad_norm": 5.03125, + "learning_rate": 7.609604285690762e-06, + "loss": 1.59754276, + "memory(GiB)": 107.26, + "step": 28300, + "train_speed(iter/s)": 1.634315 + }, + { + "acc": 0.67010798, + "epoch": 0.7180365296803652, + "grad_norm": 5.96875, + "learning_rate": 7.608709758688731e-06, + "loss": 1.56151638, + "memory(GiB)": 107.26, + "step": 28305, + "train_speed(iter/s)": 1.634345 + }, + { + "acc": 0.64606771, + "epoch": 0.7181633688483003, + "grad_norm": 5.5, + "learning_rate": 7.607815116943995e-06, + "loss": 1.55978374, + "memory(GiB)": 107.26, + "step": 28310, + "train_speed(iter/s)": 1.634379 + }, + { + "acc": 0.65833216, + "epoch": 0.7182902080162354, + "grad_norm": 5.25, + "learning_rate": 7.606920360495908e-06, + "loss": 1.54789133, + "memory(GiB)": 107.26, + "step": 28315, + "train_speed(iter/s)": 1.634413 + }, + { + "acc": 0.63946843, + "epoch": 0.7184170471841704, + "grad_norm": 5.1875, + "learning_rate": 7.6060254893838255e-06, + "loss": 1.65668297, + "memory(GiB)": 107.26, + "step": 28320, + "train_speed(iter/s)": 1.634444 + }, + { + "acc": 0.63760405, + "epoch": 0.7185438863521055, + "grad_norm": 5.28125, + "learning_rate": 7.6051305036471065e-06, + "loss": 1.6331768, + "memory(GiB)": 107.26, + "step": 28325, + "train_speed(iter/s)": 1.634473 + }, + { + "acc": 0.66723285, + "epoch": 0.7186707255200406, + "grad_norm": 5.875, + "learning_rate": 7.604235403325117e-06, + "loss": 1.54969244, + "memory(GiB)": 107.26, + "step": 28330, + "train_speed(iter/s)": 1.634502 + }, + { + "acc": 0.65138001, + "epoch": 0.7187975646879756, + "grad_norm": 5.65625, + "learning_rate": 7.603340188457227e-06, + "loss": 1.64608841, + "memory(GiB)": 107.26, + "step": 28335, + "train_speed(iter/s)": 1.634534 + }, + { + "acc": 0.64432344, + "epoch": 0.7189244038559107, + "grad_norm": 5.5625, + "learning_rate": 7.602444859082814e-06, + "loss": 1.63461361, + "memory(GiB)": 107.26, + "step": 28340, + "train_speed(iter/s)": 1.634567 + }, + { + "acc": 0.65408378, + "epoch": 0.7190512430238457, + "grad_norm": 6.4375, + "learning_rate": 7.601549415241254e-06, + "loss": 1.59898453, + "memory(GiB)": 107.26, + "step": 28345, + "train_speed(iter/s)": 1.634601 + }, + { + "acc": 0.65668678, + "epoch": 0.7191780821917808, + "grad_norm": 6.8125, + "learning_rate": 7.6006538569719375e-06, + "loss": 1.57138348, + "memory(GiB)": 107.26, + "step": 28350, + "train_speed(iter/s)": 1.63463 + }, + { + "acc": 0.64599156, + "epoch": 0.7193049213597159, + "grad_norm": 5.6875, + "learning_rate": 7.599758184314252e-06, + "loss": 1.62103386, + "memory(GiB)": 107.26, + "step": 28355, + "train_speed(iter/s)": 1.634659 + }, + { + "acc": 0.65033569, + "epoch": 0.7194317605276509, + "grad_norm": 6.28125, + "learning_rate": 7.598862397307596e-06, + "loss": 1.61566505, + "memory(GiB)": 107.26, + "step": 28360, + "train_speed(iter/s)": 1.63469 + }, + { + "acc": 0.68914576, + "epoch": 0.719558599695586, + "grad_norm": 5.4375, + "learning_rate": 7.597966495991368e-06, + "loss": 1.48199768, + "memory(GiB)": 107.26, + "step": 28365, + "train_speed(iter/s)": 1.634723 + }, + { + "acc": 0.66563497, + "epoch": 0.7196854388635211, + "grad_norm": 5.5, + "learning_rate": 7.597070480404974e-06, + "loss": 1.49406185, + "memory(GiB)": 107.26, + "step": 28370, + "train_speed(iter/s)": 1.634754 + }, + { + "acc": 0.65118771, + "epoch": 0.7198122780314561, + "grad_norm": 4.9375, + "learning_rate": 7.596174350587826e-06, + "loss": 1.6017231, + "memory(GiB)": 107.26, + "step": 28375, + "train_speed(iter/s)": 1.634785 + }, + { + "acc": 0.64284024, + "epoch": 0.7199391171993912, + "grad_norm": 5.21875, + "learning_rate": 7.595278106579339e-06, + "loss": 1.62861099, + "memory(GiB)": 107.26, + "step": 28380, + "train_speed(iter/s)": 1.634815 + }, + { + "acc": 0.66137543, + "epoch": 0.7200659563673262, + "grad_norm": 8.3125, + "learning_rate": 7.594381748418933e-06, + "loss": 1.56671133, + "memory(GiB)": 107.26, + "step": 28385, + "train_speed(iter/s)": 1.634845 + }, + { + "acc": 0.65421228, + "epoch": 0.7201927955352613, + "grad_norm": 5.0625, + "learning_rate": 7.593485276146035e-06, + "loss": 1.57735558, + "memory(GiB)": 107.26, + "step": 28390, + "train_speed(iter/s)": 1.634878 + }, + { + "acc": 0.67311559, + "epoch": 0.7203196347031964, + "grad_norm": 8.4375, + "learning_rate": 7.592588689800077e-06, + "loss": 1.57889423, + "memory(GiB)": 107.26, + "step": 28395, + "train_speed(iter/s)": 1.634909 + }, + { + "acc": 0.64280615, + "epoch": 0.7204464738711314, + "grad_norm": 5.5625, + "learning_rate": 7.591691989420491e-06, + "loss": 1.66243401, + "memory(GiB)": 107.26, + "step": 28400, + "train_speed(iter/s)": 1.634941 + }, + { + "acc": 0.66686802, + "epoch": 0.7205733130390665, + "grad_norm": 5.8125, + "learning_rate": 7.590795175046721e-06, + "loss": 1.56688204, + "memory(GiB)": 107.26, + "step": 28405, + "train_speed(iter/s)": 1.634969 + }, + { + "acc": 0.65321946, + "epoch": 0.7207001522070016, + "grad_norm": 6.125, + "learning_rate": 7.5898982467182125e-06, + "loss": 1.58327923, + "memory(GiB)": 107.26, + "step": 28410, + "train_speed(iter/s)": 1.635 + }, + { + "acc": 0.64529729, + "epoch": 0.7208269913749366, + "grad_norm": 7.9375, + "learning_rate": 7.589001204474416e-06, + "loss": 1.61412697, + "memory(GiB)": 107.26, + "step": 28415, + "train_speed(iter/s)": 1.635028 + }, + { + "acc": 0.63404508, + "epoch": 0.7209538305428717, + "grad_norm": 5.25, + "learning_rate": 7.588104048354787e-06, + "loss": 1.6536087, + "memory(GiB)": 107.26, + "step": 28420, + "train_speed(iter/s)": 1.635061 + }, + { + "acc": 0.65113173, + "epoch": 0.7210806697108066, + "grad_norm": 6.1875, + "learning_rate": 7.587206778398788e-06, + "loss": 1.585532, + "memory(GiB)": 107.26, + "step": 28425, + "train_speed(iter/s)": 1.635094 + }, + { + "acc": 0.66793399, + "epoch": 0.7212075088787417, + "grad_norm": 7.71875, + "learning_rate": 7.586309394645882e-06, + "loss": 1.53728447, + "memory(GiB)": 107.26, + "step": 28430, + "train_speed(iter/s)": 1.635129 + }, + { + "acc": 0.66511078, + "epoch": 0.7213343480466768, + "grad_norm": 6.125, + "learning_rate": 7.585411897135544e-06, + "loss": 1.5370821, + "memory(GiB)": 107.26, + "step": 28435, + "train_speed(iter/s)": 1.63516 + }, + { + "acc": 0.65755339, + "epoch": 0.7214611872146118, + "grad_norm": 6.59375, + "learning_rate": 7.584514285907245e-06, + "loss": 1.52565784, + "memory(GiB)": 107.26, + "step": 28440, + "train_speed(iter/s)": 1.635193 + }, + { + "acc": 0.66906776, + "epoch": 0.7215880263825469, + "grad_norm": 5.78125, + "learning_rate": 7.583616561000471e-06, + "loss": 1.57781048, + "memory(GiB)": 107.26, + "step": 28445, + "train_speed(iter/s)": 1.635225 + }, + { + "acc": 0.63835506, + "epoch": 0.721714865550482, + "grad_norm": 5.0625, + "learning_rate": 7.582718722454705e-06, + "loss": 1.66925316, + "memory(GiB)": 107.26, + "step": 28450, + "train_speed(iter/s)": 1.635258 + }, + { + "acc": 0.65034003, + "epoch": 0.721841704718417, + "grad_norm": 5.5, + "learning_rate": 7.581820770309438e-06, + "loss": 1.65013161, + "memory(GiB)": 107.26, + "step": 28455, + "train_speed(iter/s)": 1.635291 + }, + { + "acc": 0.67539387, + "epoch": 0.7219685438863521, + "grad_norm": 5.71875, + "learning_rate": 7.580922704604168e-06, + "loss": 1.54581757, + "memory(GiB)": 107.26, + "step": 28460, + "train_speed(iter/s)": 1.635323 + }, + { + "acc": 0.64145727, + "epoch": 0.7220953830542871, + "grad_norm": 5.6875, + "learning_rate": 7.5800245253783935e-06, + "loss": 1.61393929, + "memory(GiB)": 107.26, + "step": 28465, + "train_speed(iter/s)": 1.635358 + }, + { + "acc": 0.63770723, + "epoch": 0.7222222222222222, + "grad_norm": 5.65625, + "learning_rate": 7.579126232671621e-06, + "loss": 1.62833481, + "memory(GiB)": 107.26, + "step": 28470, + "train_speed(iter/s)": 1.63539 + }, + { + "acc": 0.66014361, + "epoch": 0.7223490613901573, + "grad_norm": 7.21875, + "learning_rate": 7.578227826523361e-06, + "loss": 1.65746651, + "memory(GiB)": 107.26, + "step": 28475, + "train_speed(iter/s)": 1.635422 + }, + { + "acc": 0.67401967, + "epoch": 0.7224759005580923, + "grad_norm": 5.15625, + "learning_rate": 7.577329306973132e-06, + "loss": 1.55169582, + "memory(GiB)": 107.26, + "step": 28480, + "train_speed(iter/s)": 1.635454 + }, + { + "acc": 0.66695127, + "epoch": 0.7226027397260274, + "grad_norm": 5.71875, + "learning_rate": 7.576430674060452e-06, + "loss": 1.53620176, + "memory(GiB)": 107.26, + "step": 28485, + "train_speed(iter/s)": 1.635487 + }, + { + "acc": 0.64144616, + "epoch": 0.7227295788939625, + "grad_norm": 6.0625, + "learning_rate": 7.575531927824849e-06, + "loss": 1.67516003, + "memory(GiB)": 107.26, + "step": 28490, + "train_speed(iter/s)": 1.63552 + }, + { + "acc": 0.66256618, + "epoch": 0.7228564180618975, + "grad_norm": 5.96875, + "learning_rate": 7.574633068305852e-06, + "loss": 1.64587593, + "memory(GiB)": 107.26, + "step": 28495, + "train_speed(iter/s)": 1.63555 + }, + { + "acc": 0.65183434, + "epoch": 0.7229832572298326, + "grad_norm": 6.4375, + "learning_rate": 7.5737340955429995e-06, + "loss": 1.62679482, + "memory(GiB)": 107.26, + "step": 28500, + "train_speed(iter/s)": 1.635583 + }, + { + "acc": 0.66222229, + "epoch": 0.7231100963977676, + "grad_norm": 5.53125, + "learning_rate": 7.572835009575828e-06, + "loss": 1.53670635, + "memory(GiB)": 107.26, + "step": 28505, + "train_speed(iter/s)": 1.635615 + }, + { + "acc": 0.65891342, + "epoch": 0.7232369355657027, + "grad_norm": 6.6875, + "learning_rate": 7.571935810443886e-06, + "loss": 1.60377464, + "memory(GiB)": 107.26, + "step": 28510, + "train_speed(iter/s)": 1.635648 + }, + { + "acc": 0.6671082, + "epoch": 0.7233637747336378, + "grad_norm": 4.8125, + "learning_rate": 7.571036498186727e-06, + "loss": 1.55178967, + "memory(GiB)": 107.26, + "step": 28515, + "train_speed(iter/s)": 1.635679 + }, + { + "acc": 0.65232339, + "epoch": 0.7234906139015728, + "grad_norm": 6.375, + "learning_rate": 7.570137072843902e-06, + "loss": 1.6531435, + "memory(GiB)": 107.26, + "step": 28520, + "train_speed(iter/s)": 1.635712 + }, + { + "acc": 0.65039473, + "epoch": 0.7236174530695079, + "grad_norm": 5.78125, + "learning_rate": 7.569237534454974e-06, + "loss": 1.63271866, + "memory(GiB)": 107.26, + "step": 28525, + "train_speed(iter/s)": 1.635743 + }, + { + "acc": 0.64541965, + "epoch": 0.723744292237443, + "grad_norm": 5.40625, + "learning_rate": 7.568337883059509e-06, + "loss": 1.65945778, + "memory(GiB)": 107.26, + "step": 28530, + "train_speed(iter/s)": 1.635774 + }, + { + "acc": 0.64750261, + "epoch": 0.723871131405378, + "grad_norm": 5.96875, + "learning_rate": 7.567438118697077e-06, + "loss": 1.63611412, + "memory(GiB)": 107.26, + "step": 28535, + "train_speed(iter/s)": 1.635808 + }, + { + "acc": 0.6540668, + "epoch": 0.723997970573313, + "grad_norm": 6.09375, + "learning_rate": 7.566538241407253e-06, + "loss": 1.5664588, + "memory(GiB)": 107.26, + "step": 28540, + "train_speed(iter/s)": 1.635842 + }, + { + "acc": 0.64244623, + "epoch": 0.724124809741248, + "grad_norm": 5.15625, + "learning_rate": 7.565638251229617e-06, + "loss": 1.6360487, + "memory(GiB)": 107.26, + "step": 28545, + "train_speed(iter/s)": 1.635873 + }, + { + "acc": 0.66653395, + "epoch": 0.7242516489091831, + "grad_norm": 5.28125, + "learning_rate": 7.5647381482037585e-06, + "loss": 1.5413662, + "memory(GiB)": 107.26, + "step": 28550, + "train_speed(iter/s)": 1.635906 + }, + { + "acc": 0.66212358, + "epoch": 0.7243784880771182, + "grad_norm": 4.96875, + "learning_rate": 7.563837932369264e-06, + "loss": 1.62485027, + "memory(GiB)": 107.26, + "step": 28555, + "train_speed(iter/s)": 1.635937 + }, + { + "acc": 0.65409918, + "epoch": 0.7245053272450532, + "grad_norm": 6.34375, + "learning_rate": 7.562937603765732e-06, + "loss": 1.60686321, + "memory(GiB)": 107.26, + "step": 28560, + "train_speed(iter/s)": 1.635967 + }, + { + "acc": 0.65579839, + "epoch": 0.7246321664129883, + "grad_norm": 5.3125, + "learning_rate": 7.562037162432761e-06, + "loss": 1.60123482, + "memory(GiB)": 107.26, + "step": 28565, + "train_speed(iter/s)": 1.635996 + }, + { + "acc": 0.64640474, + "epoch": 0.7247590055809234, + "grad_norm": 5.65625, + "learning_rate": 7.561136608409956e-06, + "loss": 1.60928974, + "memory(GiB)": 107.26, + "step": 28570, + "train_speed(iter/s)": 1.636028 + }, + { + "acc": 0.63884611, + "epoch": 0.7248858447488584, + "grad_norm": 5.375, + "learning_rate": 7.560235941736929e-06, + "loss": 1.70201988, + "memory(GiB)": 107.26, + "step": 28575, + "train_speed(iter/s)": 1.636059 + }, + { + "acc": 0.65022783, + "epoch": 0.7250126839167935, + "grad_norm": 5.6875, + "learning_rate": 7.559335162453294e-06, + "loss": 1.64949226, + "memory(GiB)": 107.26, + "step": 28580, + "train_speed(iter/s)": 1.63609 + }, + { + "acc": 0.66325626, + "epoch": 0.7251395230847285, + "grad_norm": 7.5625, + "learning_rate": 7.558434270598672e-06, + "loss": 1.60072556, + "memory(GiB)": 107.26, + "step": 28585, + "train_speed(iter/s)": 1.636121 + }, + { + "acc": 0.65244141, + "epoch": 0.7252663622526636, + "grad_norm": 5.625, + "learning_rate": 7.5575332662126885e-06, + "loss": 1.60335426, + "memory(GiB)": 107.26, + "step": 28590, + "train_speed(iter/s)": 1.636152 + }, + { + "acc": 0.6560441, + "epoch": 0.7253932014205987, + "grad_norm": 5.5625, + "learning_rate": 7.556632149334975e-06, + "loss": 1.5482873, + "memory(GiB)": 107.26, + "step": 28595, + "train_speed(iter/s)": 1.636186 + }, + { + "acc": 0.65472507, + "epoch": 0.7255200405885337, + "grad_norm": 5.375, + "learning_rate": 7.555730920005163e-06, + "loss": 1.54491882, + "memory(GiB)": 107.26, + "step": 28600, + "train_speed(iter/s)": 1.636217 + }, + { + "acc": 0.62411947, + "epoch": 0.7256468797564688, + "grad_norm": 6.3125, + "learning_rate": 7.554829578262894e-06, + "loss": 1.70723457, + "memory(GiB)": 107.26, + "step": 28605, + "train_speed(iter/s)": 1.636248 + }, + { + "acc": 0.6647521, + "epoch": 0.7257737189244039, + "grad_norm": 7.28125, + "learning_rate": 7.5539281241478155e-06, + "loss": 1.63543358, + "memory(GiB)": 107.26, + "step": 28610, + "train_speed(iter/s)": 1.63628 + }, + { + "acc": 0.66804781, + "epoch": 0.7259005580923389, + "grad_norm": 5.4375, + "learning_rate": 7.5530265576995756e-06, + "loss": 1.55517902, + "memory(GiB)": 107.26, + "step": 28615, + "train_speed(iter/s)": 1.636311 + }, + { + "acc": 0.64884882, + "epoch": 0.726027397260274, + "grad_norm": 5.125, + "learning_rate": 7.552124878957829e-06, + "loss": 1.64047585, + "memory(GiB)": 107.26, + "step": 28620, + "train_speed(iter/s)": 1.636341 + }, + { + "acc": 0.65027685, + "epoch": 0.726154236428209, + "grad_norm": 5.78125, + "learning_rate": 7.551223087962234e-06, + "loss": 1.60495262, + "memory(GiB)": 107.26, + "step": 28625, + "train_speed(iter/s)": 1.63637 + }, + { + "acc": 0.66777925, + "epoch": 0.7262810755961441, + "grad_norm": 4.96875, + "learning_rate": 7.55032118475246e-06, + "loss": 1.53380642, + "memory(GiB)": 107.26, + "step": 28630, + "train_speed(iter/s)": 1.6364 + }, + { + "acc": 0.65362878, + "epoch": 0.7264079147640792, + "grad_norm": 4.5625, + "learning_rate": 7.549419169368171e-06, + "loss": 1.6411747, + "memory(GiB)": 107.26, + "step": 28635, + "train_speed(iter/s)": 1.636431 + }, + { + "acc": 0.63941097, + "epoch": 0.7265347539320142, + "grad_norm": 5.9375, + "learning_rate": 7.548517041849048e-06, + "loss": 1.67578945, + "memory(GiB)": 107.26, + "step": 28640, + "train_speed(iter/s)": 1.636464 + }, + { + "acc": 0.65995092, + "epoch": 0.7266615930999493, + "grad_norm": 5.40625, + "learning_rate": 7.547614802234764e-06, + "loss": 1.58281345, + "memory(GiB)": 107.26, + "step": 28645, + "train_speed(iter/s)": 1.636496 + }, + { + "acc": 0.66975508, + "epoch": 0.7267884322678844, + "grad_norm": 4.78125, + "learning_rate": 7.546712450565008e-06, + "loss": 1.61544456, + "memory(GiB)": 107.26, + "step": 28650, + "train_speed(iter/s)": 1.636526 + }, + { + "acc": 0.66985846, + "epoch": 0.7269152714358194, + "grad_norm": 4.90625, + "learning_rate": 7.545809986879469e-06, + "loss": 1.49771652, + "memory(GiB)": 107.26, + "step": 28655, + "train_speed(iter/s)": 1.636555 + }, + { + "acc": 0.65896926, + "epoch": 0.7270421106037545, + "grad_norm": 6.875, + "learning_rate": 7.5449074112178385e-06, + "loss": 1.60677738, + "memory(GiB)": 107.26, + "step": 28660, + "train_speed(iter/s)": 1.636587 + }, + { + "acc": 0.66010389, + "epoch": 0.7271689497716894, + "grad_norm": 7.125, + "learning_rate": 7.54400472361982e-06, + "loss": 1.69140244, + "memory(GiB)": 107.26, + "step": 28665, + "train_speed(iter/s)": 1.63662 + }, + { + "acc": 0.63945427, + "epoch": 0.7272957889396245, + "grad_norm": 5.53125, + "learning_rate": 7.543101924125115e-06, + "loss": 1.61796341, + "memory(GiB)": 107.26, + "step": 28670, + "train_speed(iter/s)": 1.636649 + }, + { + "acc": 0.65030837, + "epoch": 0.7274226281075596, + "grad_norm": 6.28125, + "learning_rate": 7.542199012773432e-06, + "loss": 1.69965439, + "memory(GiB)": 107.26, + "step": 28675, + "train_speed(iter/s)": 1.636679 + }, + { + "acc": 0.65872974, + "epoch": 0.7275494672754946, + "grad_norm": 6.0, + "learning_rate": 7.541295989604488e-06, + "loss": 1.60749512, + "memory(GiB)": 107.26, + "step": 28680, + "train_speed(iter/s)": 1.636711 + }, + { + "acc": 0.6657393, + "epoch": 0.7276763064434297, + "grad_norm": 6.09375, + "learning_rate": 7.540392854657999e-06, + "loss": 1.54372215, + "memory(GiB)": 107.26, + "step": 28685, + "train_speed(iter/s)": 1.636741 + }, + { + "acc": 0.66332645, + "epoch": 0.7278031456113648, + "grad_norm": 6.9375, + "learning_rate": 7.539489607973691e-06, + "loss": 1.59117022, + "memory(GiB)": 107.26, + "step": 28690, + "train_speed(iter/s)": 1.636775 + }, + { + "acc": 0.65675788, + "epoch": 0.7279299847792998, + "grad_norm": 6.59375, + "learning_rate": 7.5385862495912905e-06, + "loss": 1.66924763, + "memory(GiB)": 107.26, + "step": 28695, + "train_speed(iter/s)": 1.636806 + }, + { + "acc": 0.65276146, + "epoch": 0.7280568239472349, + "grad_norm": 6.78125, + "learning_rate": 7.537682779550537e-06, + "loss": 1.63707638, + "memory(GiB)": 107.26, + "step": 28700, + "train_speed(iter/s)": 1.636836 + }, + { + "acc": 0.6351037, + "epoch": 0.7281836631151699, + "grad_norm": 6.21875, + "learning_rate": 7.536779197891159e-06, + "loss": 1.66475887, + "memory(GiB)": 107.26, + "step": 28705, + "train_speed(iter/s)": 1.636867 + }, + { + "acc": 0.65997095, + "epoch": 0.728310502283105, + "grad_norm": 5.9375, + "learning_rate": 7.535875504652912e-06, + "loss": 1.55873117, + "memory(GiB)": 107.26, + "step": 28710, + "train_speed(iter/s)": 1.6369 + }, + { + "acc": 0.65323343, + "epoch": 0.7284373414510401, + "grad_norm": 6.03125, + "learning_rate": 7.534971699875534e-06, + "loss": 1.64260521, + "memory(GiB)": 107.26, + "step": 28715, + "train_speed(iter/s)": 1.63693 + }, + { + "acc": 0.65670843, + "epoch": 0.7285641806189751, + "grad_norm": 5.46875, + "learning_rate": 7.534067783598784e-06, + "loss": 1.58925095, + "memory(GiB)": 107.26, + "step": 28720, + "train_speed(iter/s)": 1.63696 + }, + { + "acc": 0.63327646, + "epoch": 0.7286910197869102, + "grad_norm": 5.625, + "learning_rate": 7.533163755862419e-06, + "loss": 1.65086842, + "memory(GiB)": 107.26, + "step": 28725, + "train_speed(iter/s)": 1.636987 + }, + { + "acc": 0.65334597, + "epoch": 0.7288178589548453, + "grad_norm": 5.25, + "learning_rate": 7.5322596167062035e-06, + "loss": 1.60484161, + "memory(GiB)": 107.26, + "step": 28730, + "train_speed(iter/s)": 1.637018 + }, + { + "acc": 0.6649353, + "epoch": 0.7289446981227803, + "grad_norm": 5.84375, + "learning_rate": 7.5313553661699035e-06, + "loss": 1.56646852, + "memory(GiB)": 107.26, + "step": 28735, + "train_speed(iter/s)": 1.637049 + }, + { + "acc": 0.6701273, + "epoch": 0.7290715372907154, + "grad_norm": 20.625, + "learning_rate": 7.530451004293292e-06, + "loss": 1.5336462, + "memory(GiB)": 107.26, + "step": 28740, + "train_speed(iter/s)": 1.637081 + }, + { + "acc": 0.64840803, + "epoch": 0.7291983764586504, + "grad_norm": 5.78125, + "learning_rate": 7.5295465311161485e-06, + "loss": 1.58825636, + "memory(GiB)": 107.26, + "step": 28745, + "train_speed(iter/s)": 1.637111 + }, + { + "acc": 0.66824765, + "epoch": 0.7293252156265855, + "grad_norm": 5.375, + "learning_rate": 7.5286419466782546e-06, + "loss": 1.52523804, + "memory(GiB)": 107.26, + "step": 28750, + "train_speed(iter/s)": 1.637144 + }, + { + "acc": 0.6580842, + "epoch": 0.7294520547945206, + "grad_norm": 10.75, + "learning_rate": 7.527737251019399e-06, + "loss": 1.61860924, + "memory(GiB)": 107.26, + "step": 28755, + "train_speed(iter/s)": 1.637176 + }, + { + "acc": 0.63688421, + "epoch": 0.7295788939624556, + "grad_norm": 7.0625, + "learning_rate": 7.526832444179373e-06, + "loss": 1.65304832, + "memory(GiB)": 107.26, + "step": 28760, + "train_speed(iter/s)": 1.637206 + }, + { + "acc": 0.65494881, + "epoch": 0.7297057331303907, + "grad_norm": 6.5625, + "learning_rate": 7.525927526197974e-06, + "loss": 1.61950989, + "memory(GiB)": 107.26, + "step": 28765, + "train_speed(iter/s)": 1.63724 + }, + { + "acc": 0.64320092, + "epoch": 0.7298325722983258, + "grad_norm": 5.03125, + "learning_rate": 7.5250224971150065e-06, + "loss": 1.63848495, + "memory(GiB)": 107.26, + "step": 28770, + "train_speed(iter/s)": 1.63727 + }, + { + "acc": 0.66441946, + "epoch": 0.7299594114662608, + "grad_norm": 4.96875, + "learning_rate": 7.524117356970275e-06, + "loss": 1.58541851, + "memory(GiB)": 107.26, + "step": 28775, + "train_speed(iter/s)": 1.637301 + }, + { + "acc": 0.64897156, + "epoch": 0.7300862506341959, + "grad_norm": 5.75, + "learning_rate": 7.523212105803594e-06, + "loss": 1.697052, + "memory(GiB)": 107.26, + "step": 28780, + "train_speed(iter/s)": 1.637332 + }, + { + "acc": 0.65576429, + "epoch": 0.7302130898021308, + "grad_norm": 6.75, + "learning_rate": 7.522306743654777e-06, + "loss": 1.58193026, + "memory(GiB)": 107.26, + "step": 28785, + "train_speed(iter/s)": 1.637365 + }, + { + "acc": 0.65727882, + "epoch": 0.7303399289700659, + "grad_norm": 5.1875, + "learning_rate": 7.521401270563651e-06, + "loss": 1.62872696, + "memory(GiB)": 107.26, + "step": 28790, + "train_speed(iter/s)": 1.637396 + }, + { + "acc": 0.64559159, + "epoch": 0.730466768138001, + "grad_norm": 5.46875, + "learning_rate": 7.520495686570037e-06, + "loss": 1.6176899, + "memory(GiB)": 107.26, + "step": 28795, + "train_speed(iter/s)": 1.637428 + }, + { + "acc": 0.6717761, + "epoch": 0.730593607305936, + "grad_norm": 6.0, + "learning_rate": 7.5195899917137716e-06, + "loss": 1.55495958, + "memory(GiB)": 107.26, + "step": 28800, + "train_speed(iter/s)": 1.637459 + }, + { + "acc": 0.64469399, + "epoch": 0.7307204464738711, + "grad_norm": 6.625, + "learning_rate": 7.518684186034688e-06, + "loss": 1.5902565, + "memory(GiB)": 107.26, + "step": 28805, + "train_speed(iter/s)": 1.637489 + }, + { + "acc": 0.66833234, + "epoch": 0.7308472856418062, + "grad_norm": 4.4375, + "learning_rate": 7.51777826957263e-06, + "loss": 1.58796854, + "memory(GiB)": 107.26, + "step": 28810, + "train_speed(iter/s)": 1.637521 + }, + { + "acc": 0.65012321, + "epoch": 0.7309741248097412, + "grad_norm": 5.53125, + "learning_rate": 7.516872242367441e-06, + "loss": 1.59206676, + "memory(GiB)": 107.26, + "step": 28815, + "train_speed(iter/s)": 1.63755 + }, + { + "acc": 0.65565753, + "epoch": 0.7311009639776763, + "grad_norm": 5.875, + "learning_rate": 7.5159661044589745e-06, + "loss": 1.64078064, + "memory(GiB)": 107.26, + "step": 28820, + "train_speed(iter/s)": 1.637582 + }, + { + "acc": 0.66105814, + "epoch": 0.7312278031456113, + "grad_norm": 7.4375, + "learning_rate": 7.515059855887087e-06, + "loss": 1.58720894, + "memory(GiB)": 107.26, + "step": 28825, + "train_speed(iter/s)": 1.637614 + }, + { + "acc": 0.65079846, + "epoch": 0.7313546423135464, + "grad_norm": 6.40625, + "learning_rate": 7.514153496691636e-06, + "loss": 1.53527317, + "memory(GiB)": 107.26, + "step": 28830, + "train_speed(iter/s)": 1.637644 + }, + { + "acc": 0.66522713, + "epoch": 0.7314814814814815, + "grad_norm": 5.28125, + "learning_rate": 7.513247026912491e-06, + "loss": 1.50655642, + "memory(GiB)": 107.26, + "step": 28835, + "train_speed(iter/s)": 1.637675 + }, + { + "acc": 0.66022797, + "epoch": 0.7316083206494165, + "grad_norm": 6.40625, + "learning_rate": 7.512340446589521e-06, + "loss": 1.61134663, + "memory(GiB)": 107.26, + "step": 28840, + "train_speed(iter/s)": 1.637707 + }, + { + "acc": 0.66343188, + "epoch": 0.7317351598173516, + "grad_norm": 4.9375, + "learning_rate": 7.5114337557625985e-06, + "loss": 1.56014729, + "memory(GiB)": 107.26, + "step": 28845, + "train_speed(iter/s)": 1.637738 + }, + { + "acc": 0.67496748, + "epoch": 0.7318619989852867, + "grad_norm": 7.0, + "learning_rate": 7.510526954471611e-06, + "loss": 1.50279789, + "memory(GiB)": 107.26, + "step": 28850, + "train_speed(iter/s)": 1.637769 + }, + { + "acc": 0.6495604, + "epoch": 0.7319888381532217, + "grad_norm": 10.3125, + "learning_rate": 7.509620042756436e-06, + "loss": 1.66697559, + "memory(GiB)": 107.26, + "step": 28855, + "train_speed(iter/s)": 1.637801 + }, + { + "acc": 0.64409056, + "epoch": 0.7321156773211568, + "grad_norm": 5.5, + "learning_rate": 7.508713020656968e-06, + "loss": 1.67663002, + "memory(GiB)": 107.26, + "step": 28860, + "train_speed(iter/s)": 1.637834 + }, + { + "acc": 0.64763794, + "epoch": 0.7322425164890918, + "grad_norm": 5.71875, + "learning_rate": 7.5078058882131e-06, + "loss": 1.64753304, + "memory(GiB)": 107.26, + "step": 28865, + "train_speed(iter/s)": 1.637865 + }, + { + "acc": 0.64455328, + "epoch": 0.7323693556570269, + "grad_norm": 5.4375, + "learning_rate": 7.506898645464733e-06, + "loss": 1.64705353, + "memory(GiB)": 107.26, + "step": 28870, + "train_speed(iter/s)": 1.637898 + }, + { + "acc": 0.66780729, + "epoch": 0.732496194824962, + "grad_norm": 6.03125, + "learning_rate": 7.505991292451772e-06, + "loss": 1.53682671, + "memory(GiB)": 107.26, + "step": 28875, + "train_speed(iter/s)": 1.637929 + }, + { + "acc": 0.64855089, + "epoch": 0.732623033992897, + "grad_norm": 4.53125, + "learning_rate": 7.505083829214125e-06, + "loss": 1.6743021, + "memory(GiB)": 107.26, + "step": 28880, + "train_speed(iter/s)": 1.63796 + }, + { + "acc": 0.65300374, + "epoch": 0.7327498731608321, + "grad_norm": 4.84375, + "learning_rate": 7.5041762557917065e-06, + "loss": 1.67922287, + "memory(GiB)": 107.26, + "step": 28885, + "train_speed(iter/s)": 1.63799 + }, + { + "acc": 0.64687438, + "epoch": 0.7328767123287672, + "grad_norm": 5.0625, + "learning_rate": 7.5032685722244355e-06, + "loss": 1.67658386, + "memory(GiB)": 107.26, + "step": 28890, + "train_speed(iter/s)": 1.638022 + }, + { + "acc": 0.64731216, + "epoch": 0.7330035514967022, + "grad_norm": 5.84375, + "learning_rate": 7.502360778552238e-06, + "loss": 1.60681381, + "memory(GiB)": 107.26, + "step": 28895, + "train_speed(iter/s)": 1.638054 + }, + { + "acc": 0.64578061, + "epoch": 0.7331303906646373, + "grad_norm": 5.75, + "learning_rate": 7.5014528748150405e-06, + "loss": 1.64231052, + "memory(GiB)": 107.26, + "step": 28900, + "train_speed(iter/s)": 1.638085 + }, + { + "acc": 0.64699678, + "epoch": 0.7332572298325722, + "grad_norm": 6.71875, + "learning_rate": 7.5005448610527765e-06, + "loss": 1.59507742, + "memory(GiB)": 107.26, + "step": 28905, + "train_speed(iter/s)": 1.638118 + }, + { + "acc": 0.66919632, + "epoch": 0.7333840690005073, + "grad_norm": 5.28125, + "learning_rate": 7.499636737305386e-06, + "loss": 1.52741604, + "memory(GiB)": 107.26, + "step": 28910, + "train_speed(iter/s)": 1.638149 + }, + { + "acc": 0.68215356, + "epoch": 0.7335109081684424, + "grad_norm": 6.0, + "learning_rate": 7.498728503612811e-06, + "loss": 1.53564415, + "memory(GiB)": 107.26, + "step": 28915, + "train_speed(iter/s)": 1.638181 + }, + { + "acc": 0.65912724, + "epoch": 0.7336377473363774, + "grad_norm": 6.34375, + "learning_rate": 7.497820160015002e-06, + "loss": 1.60964584, + "memory(GiB)": 107.26, + "step": 28920, + "train_speed(iter/s)": 1.638213 + }, + { + "acc": 0.63588533, + "epoch": 0.7337645865043125, + "grad_norm": 4.8125, + "learning_rate": 7.496911706551908e-06, + "loss": 1.64821682, + "memory(GiB)": 107.26, + "step": 28925, + "train_speed(iter/s)": 1.638244 + }, + { + "acc": 0.6685626, + "epoch": 0.7338914256722476, + "grad_norm": 5.78125, + "learning_rate": 7.496003143263492e-06, + "loss": 1.57344646, + "memory(GiB)": 107.26, + "step": 28930, + "train_speed(iter/s)": 1.638275 + }, + { + "acc": 0.64720635, + "epoch": 0.7340182648401826, + "grad_norm": 5.34375, + "learning_rate": 7.495094470189712e-06, + "loss": 1.68701668, + "memory(GiB)": 107.26, + "step": 28935, + "train_speed(iter/s)": 1.638304 + }, + { + "acc": 0.63717484, + "epoch": 0.7341451040081177, + "grad_norm": 6.78125, + "learning_rate": 7.4941856873705376e-06, + "loss": 1.75105724, + "memory(GiB)": 107.26, + "step": 28940, + "train_speed(iter/s)": 1.638336 + }, + { + "acc": 0.65002036, + "epoch": 0.7342719431760527, + "grad_norm": 5.21875, + "learning_rate": 7.493276794845941e-06, + "loss": 1.62092533, + "memory(GiB)": 107.26, + "step": 28945, + "train_speed(iter/s)": 1.638368 + }, + { + "acc": 0.6571394, + "epoch": 0.7343987823439878, + "grad_norm": 5.78125, + "learning_rate": 7.4923677926559005e-06, + "loss": 1.54039307, + "memory(GiB)": 107.26, + "step": 28950, + "train_speed(iter/s)": 1.6384 + }, + { + "acc": 0.66165857, + "epoch": 0.7345256215119229, + "grad_norm": 5.53125, + "learning_rate": 7.491458680840396e-06, + "loss": 1.59092617, + "memory(GiB)": 107.26, + "step": 28955, + "train_speed(iter/s)": 1.638433 + }, + { + "acc": 0.65214124, + "epoch": 0.7346524606798579, + "grad_norm": 5.46875, + "learning_rate": 7.490549459439415e-06, + "loss": 1.59545469, + "memory(GiB)": 107.26, + "step": 28960, + "train_speed(iter/s)": 1.638463 + }, + { + "acc": 0.65226622, + "epoch": 0.734779299847793, + "grad_norm": 4.59375, + "learning_rate": 7.48964012849295e-06, + "loss": 1.61961594, + "memory(GiB)": 107.26, + "step": 28965, + "train_speed(iter/s)": 1.638495 + }, + { + "acc": 0.64480858, + "epoch": 0.7349061390157281, + "grad_norm": 5.78125, + "learning_rate": 7.488730688040995e-06, + "loss": 1.61559258, + "memory(GiB)": 107.26, + "step": 28970, + "train_speed(iter/s)": 1.638526 + }, + { + "acc": 0.64775119, + "epoch": 0.7350329781836631, + "grad_norm": 7.96875, + "learning_rate": 7.487821138123554e-06, + "loss": 1.6876297, + "memory(GiB)": 107.26, + "step": 28975, + "train_speed(iter/s)": 1.638557 + }, + { + "acc": 0.68063836, + "epoch": 0.7351598173515982, + "grad_norm": 5.59375, + "learning_rate": 7.486911478780633e-06, + "loss": 1.51250477, + "memory(GiB)": 107.26, + "step": 28980, + "train_speed(iter/s)": 1.638588 + }, + { + "acc": 0.66190429, + "epoch": 0.7352866565195332, + "grad_norm": 5.5, + "learning_rate": 7.4860017100522395e-06, + "loss": 1.58536749, + "memory(GiB)": 107.26, + "step": 28985, + "train_speed(iter/s)": 1.638619 + }, + { + "acc": 0.64551468, + "epoch": 0.7354134956874683, + "grad_norm": 7.0625, + "learning_rate": 7.485091831978394e-06, + "loss": 1.66205559, + "memory(GiB)": 107.26, + "step": 28990, + "train_speed(iter/s)": 1.638651 + }, + { + "acc": 0.66126914, + "epoch": 0.7355403348554034, + "grad_norm": 5.40625, + "learning_rate": 7.484181844599113e-06, + "loss": 1.63732357, + "memory(GiB)": 107.26, + "step": 28995, + "train_speed(iter/s)": 1.638682 + }, + { + "acc": 0.66213698, + "epoch": 0.7356671740233384, + "grad_norm": 6.34375, + "learning_rate": 7.483271747954425e-06, + "loss": 1.556464, + "memory(GiB)": 107.26, + "step": 29000, + "train_speed(iter/s)": 1.638712 + }, + { + "epoch": 0.7356671740233384, + "eval_acc": 0.645303712799437, + "eval_loss": 1.5776091814041138, + "eval_runtime": 58.7302, + "eval_samples_per_second": 108.462, + "eval_steps_per_second": 27.124, + "step": 29000 + }, + { + "acc": 0.65302205, + "epoch": 0.7357940131912735, + "grad_norm": 6.0625, + "learning_rate": 7.482361542084356e-06, + "loss": 1.59565334, + "memory(GiB)": 107.26, + "step": 29005, + "train_speed(iter/s)": 1.632929 + }, + { + "acc": 0.65489759, + "epoch": 0.7359208523592086, + "grad_norm": 4.78125, + "learning_rate": 7.481451227028946e-06, + "loss": 1.55446205, + "memory(GiB)": 107.26, + "step": 29010, + "train_speed(iter/s)": 1.63296 + }, + { + "acc": 0.67156148, + "epoch": 0.7360476915271436, + "grad_norm": 6.375, + "learning_rate": 7.4805408028282316e-06, + "loss": 1.54848623, + "memory(GiB)": 107.26, + "step": 29015, + "train_speed(iter/s)": 1.632984 + }, + { + "acc": 0.66563883, + "epoch": 0.7361745306950787, + "grad_norm": 4.75, + "learning_rate": 7.479630269522257e-06, + "loss": 1.57929354, + "memory(GiB)": 107.26, + "step": 29020, + "train_speed(iter/s)": 1.633013 + }, + { + "acc": 0.64647994, + "epoch": 0.7363013698630136, + "grad_norm": 5.84375, + "learning_rate": 7.478719627151073e-06, + "loss": 1.72253551, + "memory(GiB)": 107.26, + "step": 29025, + "train_speed(iter/s)": 1.63304 + }, + { + "acc": 0.64691334, + "epoch": 0.7364282090309487, + "grad_norm": 6.21875, + "learning_rate": 7.4778088757547325e-06, + "loss": 1.640555, + "memory(GiB)": 107.26, + "step": 29030, + "train_speed(iter/s)": 1.633066 + }, + { + "acc": 0.65285029, + "epoch": 0.7365550481988838, + "grad_norm": 6.8125, + "learning_rate": 7.476898015373296e-06, + "loss": 1.64556732, + "memory(GiB)": 107.26, + "step": 29035, + "train_speed(iter/s)": 1.633094 + }, + { + "acc": 0.66230888, + "epoch": 0.7366818873668188, + "grad_norm": 5.53125, + "learning_rate": 7.4759870460468256e-06, + "loss": 1.55623093, + "memory(GiB)": 107.26, + "step": 29040, + "train_speed(iter/s)": 1.633124 + }, + { + "acc": 0.6492528, + "epoch": 0.7368087265347539, + "grad_norm": 6.5, + "learning_rate": 7.475075967815391e-06, + "loss": 1.57962933, + "memory(GiB)": 107.26, + "step": 29045, + "train_speed(iter/s)": 1.633156 + }, + { + "acc": 0.65094757, + "epoch": 0.736935565702689, + "grad_norm": 5.34375, + "learning_rate": 7.474164780719064e-06, + "loss": 1.57986946, + "memory(GiB)": 107.26, + "step": 29050, + "train_speed(iter/s)": 1.633184 + }, + { + "acc": 0.6516325, + "epoch": 0.737062404870624, + "grad_norm": 7.09375, + "learning_rate": 7.473253484797924e-06, + "loss": 1.62027225, + "memory(GiB)": 107.26, + "step": 29055, + "train_speed(iter/s)": 1.632782 + }, + { + "acc": 0.66782169, + "epoch": 0.7371892440385591, + "grad_norm": 7.5, + "learning_rate": 7.4723420800920545e-06, + "loss": 1.57625542, + "memory(GiB)": 107.26, + "step": 29060, + "train_speed(iter/s)": 1.632815 + }, + { + "acc": 0.6600131, + "epoch": 0.7373160832064941, + "grad_norm": 5.53125, + "learning_rate": 7.47143056664154e-06, + "loss": 1.569203, + "memory(GiB)": 107.26, + "step": 29065, + "train_speed(iter/s)": 1.632844 + }, + { + "acc": 0.65921316, + "epoch": 0.7374429223744292, + "grad_norm": 6.15625, + "learning_rate": 7.470518944486476e-06, + "loss": 1.59611015, + "memory(GiB)": 107.26, + "step": 29070, + "train_speed(iter/s)": 1.632874 + }, + { + "acc": 0.6511754, + "epoch": 0.7375697615423643, + "grad_norm": 6.65625, + "learning_rate": 7.469607213666958e-06, + "loss": 1.63686409, + "memory(GiB)": 107.26, + "step": 29075, + "train_speed(iter/s)": 1.632903 + }, + { + "acc": 0.65454955, + "epoch": 0.7376966007102993, + "grad_norm": 6.0625, + "learning_rate": 7.468695374223092e-06, + "loss": 1.56224289, + "memory(GiB)": 107.26, + "step": 29080, + "train_speed(iter/s)": 1.632931 + }, + { + "acc": 0.65945535, + "epoch": 0.7378234398782344, + "grad_norm": 4.90625, + "learning_rate": 7.4677834261949765e-06, + "loss": 1.59688416, + "memory(GiB)": 107.26, + "step": 29085, + "train_speed(iter/s)": 1.632962 + }, + { + "acc": 0.67590871, + "epoch": 0.7379502790461695, + "grad_norm": 5.6875, + "learning_rate": 7.466871369622731e-06, + "loss": 1.5191824, + "memory(GiB)": 107.26, + "step": 29090, + "train_speed(iter/s)": 1.632995 + }, + { + "acc": 0.64535332, + "epoch": 0.7380771182141045, + "grad_norm": 6.5, + "learning_rate": 7.465959204546469e-06, + "loss": 1.64760017, + "memory(GiB)": 107.26, + "step": 29095, + "train_speed(iter/s)": 1.633024 + }, + { + "acc": 0.64834518, + "epoch": 0.7382039573820396, + "grad_norm": 5.96875, + "learning_rate": 7.465046931006311e-06, + "loss": 1.62377567, + "memory(GiB)": 107.26, + "step": 29100, + "train_speed(iter/s)": 1.633054 + }, + { + "acc": 0.66090288, + "epoch": 0.7383307965499746, + "grad_norm": 5.40625, + "learning_rate": 7.464134549042383e-06, + "loss": 1.57544804, + "memory(GiB)": 107.26, + "step": 29105, + "train_speed(iter/s)": 1.633085 + }, + { + "acc": 0.65764174, + "epoch": 0.7384576357179097, + "grad_norm": 5.125, + "learning_rate": 7.463222058694817e-06, + "loss": 1.634412, + "memory(GiB)": 107.26, + "step": 29110, + "train_speed(iter/s)": 1.633115 + }, + { + "acc": 0.6506928, + "epoch": 0.7385844748858448, + "grad_norm": 4.75, + "learning_rate": 7.462309460003747e-06, + "loss": 1.63622303, + "memory(GiB)": 107.26, + "step": 29115, + "train_speed(iter/s)": 1.633147 + }, + { + "acc": 0.65546932, + "epoch": 0.7387113140537798, + "grad_norm": 6.90625, + "learning_rate": 7.461396753009314e-06, + "loss": 1.61398411, + "memory(GiB)": 107.26, + "step": 29120, + "train_speed(iter/s)": 1.633179 + }, + { + "acc": 0.66466231, + "epoch": 0.7388381532217149, + "grad_norm": 5.125, + "learning_rate": 7.460483937751662e-06, + "loss": 1.56808968, + "memory(GiB)": 107.26, + "step": 29125, + "train_speed(iter/s)": 1.633211 + }, + { + "acc": 0.64946513, + "epoch": 0.73896499238965, + "grad_norm": 5.0, + "learning_rate": 7.45957101427094e-06, + "loss": 1.57433662, + "memory(GiB)": 107.26, + "step": 29130, + "train_speed(iter/s)": 1.633241 + }, + { + "acc": 0.64844632, + "epoch": 0.739091831557585, + "grad_norm": 4.90625, + "learning_rate": 7.458657982607303e-06, + "loss": 1.61761398, + "memory(GiB)": 107.26, + "step": 29135, + "train_speed(iter/s)": 1.633273 + }, + { + "acc": 0.64636641, + "epoch": 0.73921867072552, + "grad_norm": 7.375, + "learning_rate": 7.457744842800913e-06, + "loss": 1.65024185, + "memory(GiB)": 107.26, + "step": 29140, + "train_speed(iter/s)": 1.633302 + }, + { + "acc": 0.63608418, + "epoch": 0.739345509893455, + "grad_norm": 5.625, + "learning_rate": 7.45683159489193e-06, + "loss": 1.72340813, + "memory(GiB)": 107.26, + "step": 29145, + "train_speed(iter/s)": 1.633333 + }, + { + "acc": 0.64535031, + "epoch": 0.7394723490613901, + "grad_norm": 6.0, + "learning_rate": 7.455918238920526e-06, + "loss": 1.60442944, + "memory(GiB)": 107.26, + "step": 29150, + "train_speed(iter/s)": 1.633363 + }, + { + "acc": 0.65186863, + "epoch": 0.7395991882293252, + "grad_norm": 5.59375, + "learning_rate": 7.455004774926873e-06, + "loss": 1.64565983, + "memory(GiB)": 107.26, + "step": 29155, + "train_speed(iter/s)": 1.633391 + }, + { + "acc": 0.65645924, + "epoch": 0.7397260273972602, + "grad_norm": 7.65625, + "learning_rate": 7.454091202951148e-06, + "loss": 1.59208612, + "memory(GiB)": 107.26, + "step": 29160, + "train_speed(iter/s)": 1.633419 + }, + { + "acc": 0.64907517, + "epoch": 0.7398528665651953, + "grad_norm": 6.0, + "learning_rate": 7.453177523033536e-06, + "loss": 1.61463699, + "memory(GiB)": 107.26, + "step": 29165, + "train_speed(iter/s)": 1.633448 + }, + { + "acc": 0.64362574, + "epoch": 0.7399797057331304, + "grad_norm": 6.5, + "learning_rate": 7.452263735214223e-06, + "loss": 1.62020454, + "memory(GiB)": 107.26, + "step": 29170, + "train_speed(iter/s)": 1.633476 + }, + { + "acc": 0.65544157, + "epoch": 0.7401065449010654, + "grad_norm": 6.25, + "learning_rate": 7.451349839533404e-06, + "loss": 1.61742306, + "memory(GiB)": 107.26, + "step": 29175, + "train_speed(iter/s)": 1.633508 + }, + { + "acc": 0.64561186, + "epoch": 0.7402333840690005, + "grad_norm": 5.78125, + "learning_rate": 7.450435836031273e-06, + "loss": 1.57471981, + "memory(GiB)": 107.26, + "step": 29180, + "train_speed(iter/s)": 1.633539 + }, + { + "acc": 0.65780139, + "epoch": 0.7403602232369355, + "grad_norm": 5.59375, + "learning_rate": 7.449521724748034e-06, + "loss": 1.57947617, + "memory(GiB)": 107.26, + "step": 29185, + "train_speed(iter/s)": 1.633568 + }, + { + "acc": 0.65712581, + "epoch": 0.7404870624048706, + "grad_norm": 6.25, + "learning_rate": 7.4486075057238936e-06, + "loss": 1.58286419, + "memory(GiB)": 107.26, + "step": 29190, + "train_speed(iter/s)": 1.633598 + }, + { + "acc": 0.66068292, + "epoch": 0.7406139015728057, + "grad_norm": 6.0625, + "learning_rate": 7.447693178999062e-06, + "loss": 1.62818146, + "memory(GiB)": 107.26, + "step": 29195, + "train_speed(iter/s)": 1.633628 + }, + { + "acc": 0.65259008, + "epoch": 0.7407407407407407, + "grad_norm": 7.0, + "learning_rate": 7.446778744613759e-06, + "loss": 1.63442802, + "memory(GiB)": 107.26, + "step": 29200, + "train_speed(iter/s)": 1.633658 + }, + { + "acc": 0.67077045, + "epoch": 0.7408675799086758, + "grad_norm": 5.53125, + "learning_rate": 7.445864202608198e-06, + "loss": 1.53851881, + "memory(GiB)": 107.26, + "step": 29205, + "train_speed(iter/s)": 1.633688 + }, + { + "acc": 0.6462534, + "epoch": 0.7409944190766109, + "grad_norm": 6.28125, + "learning_rate": 7.444949553022613e-06, + "loss": 1.61572952, + "memory(GiB)": 107.26, + "step": 29210, + "train_speed(iter/s)": 1.633719 + }, + { + "acc": 0.65515347, + "epoch": 0.7411212582445459, + "grad_norm": 6.125, + "learning_rate": 7.444034795897229e-06, + "loss": 1.56886559, + "memory(GiB)": 107.26, + "step": 29215, + "train_speed(iter/s)": 1.63375 + }, + { + "acc": 0.66997108, + "epoch": 0.741248097412481, + "grad_norm": 5.40625, + "learning_rate": 7.443119931272285e-06, + "loss": 1.54643269, + "memory(GiB)": 107.26, + "step": 29220, + "train_speed(iter/s)": 1.633778 + }, + { + "acc": 0.66247816, + "epoch": 0.741374936580416, + "grad_norm": 5.8125, + "learning_rate": 7.442204959188016e-06, + "loss": 1.5858345, + "memory(GiB)": 107.26, + "step": 29225, + "train_speed(iter/s)": 1.633808 + }, + { + "acc": 0.64899673, + "epoch": 0.7415017757483511, + "grad_norm": 5.53125, + "learning_rate": 7.4412898796846724e-06, + "loss": 1.62201366, + "memory(GiB)": 107.26, + "step": 29230, + "train_speed(iter/s)": 1.633839 + }, + { + "acc": 0.66947298, + "epoch": 0.7416286149162862, + "grad_norm": 5.1875, + "learning_rate": 7.440374692802497e-06, + "loss": 1.56868095, + "memory(GiB)": 107.26, + "step": 29235, + "train_speed(iter/s)": 1.633869 + }, + { + "acc": 0.66679707, + "epoch": 0.7417554540842212, + "grad_norm": 6.59375, + "learning_rate": 7.439459398581747e-06, + "loss": 1.56801071, + "memory(GiB)": 107.26, + "step": 29240, + "train_speed(iter/s)": 1.633901 + }, + { + "acc": 0.65908208, + "epoch": 0.7418822932521563, + "grad_norm": 7.71875, + "learning_rate": 7.438543997062684e-06, + "loss": 1.53008347, + "memory(GiB)": 107.26, + "step": 29245, + "train_speed(iter/s)": 1.63393 + }, + { + "acc": 0.63522816, + "epoch": 0.7420091324200914, + "grad_norm": 5.5, + "learning_rate": 7.437628488285568e-06, + "loss": 1.59189148, + "memory(GiB)": 107.26, + "step": 29250, + "train_speed(iter/s)": 1.633961 + }, + { + "acc": 0.65244327, + "epoch": 0.7421359715880264, + "grad_norm": 4.875, + "learning_rate": 7.4367128722906665e-06, + "loss": 1.57065783, + "memory(GiB)": 107.26, + "step": 29255, + "train_speed(iter/s)": 1.633992 + }, + { + "acc": 0.65026116, + "epoch": 0.7422628107559615, + "grad_norm": 5.78125, + "learning_rate": 7.435797149118255e-06, + "loss": 1.65320244, + "memory(GiB)": 107.26, + "step": 29260, + "train_speed(iter/s)": 1.634024 + }, + { + "acc": 0.65269237, + "epoch": 0.7423896499238964, + "grad_norm": 6.25, + "learning_rate": 7.434881318808609e-06, + "loss": 1.64191818, + "memory(GiB)": 107.26, + "step": 29265, + "train_speed(iter/s)": 1.634054 + }, + { + "acc": 0.65367908, + "epoch": 0.7425164890918315, + "grad_norm": 5.3125, + "learning_rate": 7.433965381402013e-06, + "loss": 1.57841244, + "memory(GiB)": 107.26, + "step": 29270, + "train_speed(iter/s)": 1.634085 + }, + { + "acc": 0.65718813, + "epoch": 0.7426433282597666, + "grad_norm": 6.25, + "learning_rate": 7.4330493369387514e-06, + "loss": 1.57849865, + "memory(GiB)": 107.26, + "step": 29275, + "train_speed(iter/s)": 1.634114 + }, + { + "acc": 0.63749423, + "epoch": 0.7427701674277016, + "grad_norm": 5.34375, + "learning_rate": 7.432133185459117e-06, + "loss": 1.61236401, + "memory(GiB)": 107.26, + "step": 29280, + "train_speed(iter/s)": 1.634143 + }, + { + "acc": 0.65103292, + "epoch": 0.7428970065956367, + "grad_norm": 5.53125, + "learning_rate": 7.431216927003406e-06, + "loss": 1.5567421, + "memory(GiB)": 107.26, + "step": 29285, + "train_speed(iter/s)": 1.634173 + }, + { + "acc": 0.65703173, + "epoch": 0.7430238457635718, + "grad_norm": 6.5625, + "learning_rate": 7.430300561611922e-06, + "loss": 1.58064222, + "memory(GiB)": 107.26, + "step": 29290, + "train_speed(iter/s)": 1.634204 + }, + { + "acc": 0.66567316, + "epoch": 0.7431506849315068, + "grad_norm": 5.09375, + "learning_rate": 7.429384089324967e-06, + "loss": 1.55421238, + "memory(GiB)": 107.26, + "step": 29295, + "train_speed(iter/s)": 1.634235 + }, + { + "acc": 0.65256748, + "epoch": 0.7432775240994419, + "grad_norm": 5.875, + "learning_rate": 7.428467510182854e-06, + "loss": 1.61065369, + "memory(GiB)": 107.26, + "step": 29300, + "train_speed(iter/s)": 1.634265 + }, + { + "acc": 0.66901436, + "epoch": 0.7434043632673769, + "grad_norm": 5.75, + "learning_rate": 7.427550824225896e-06, + "loss": 1.52973232, + "memory(GiB)": 107.26, + "step": 29305, + "train_speed(iter/s)": 1.634295 + }, + { + "acc": 0.65624475, + "epoch": 0.743531202435312, + "grad_norm": 6.375, + "learning_rate": 7.426634031494417e-06, + "loss": 1.60223827, + "memory(GiB)": 107.26, + "step": 29310, + "train_speed(iter/s)": 1.634325 + }, + { + "acc": 0.66822181, + "epoch": 0.7436580416032471, + "grad_norm": 6.0625, + "learning_rate": 7.425717132028738e-06, + "loss": 1.54658632, + "memory(GiB)": 107.26, + "step": 29315, + "train_speed(iter/s)": 1.634356 + }, + { + "acc": 0.64067836, + "epoch": 0.7437848807711821, + "grad_norm": 5.1875, + "learning_rate": 7.42480012586919e-06, + "loss": 1.67891979, + "memory(GiB)": 107.26, + "step": 29320, + "train_speed(iter/s)": 1.634387 + }, + { + "acc": 0.65336094, + "epoch": 0.7439117199391172, + "grad_norm": 6.375, + "learning_rate": 7.423883013056106e-06, + "loss": 1.53660908, + "memory(GiB)": 107.26, + "step": 29325, + "train_speed(iter/s)": 1.634417 + }, + { + "acc": 0.65465679, + "epoch": 0.7440385591070523, + "grad_norm": 6.125, + "learning_rate": 7.422965793629825e-06, + "loss": 1.58391066, + "memory(GiB)": 107.26, + "step": 29330, + "train_speed(iter/s)": 1.634448 + }, + { + "acc": 0.65347052, + "epoch": 0.7441653982749873, + "grad_norm": 6.65625, + "learning_rate": 7.422048467630691e-06, + "loss": 1.64281731, + "memory(GiB)": 107.26, + "step": 29335, + "train_speed(iter/s)": 1.634478 + }, + { + "acc": 0.66338549, + "epoch": 0.7442922374429224, + "grad_norm": 5.8125, + "learning_rate": 7.421131035099052e-06, + "loss": 1.5602623, + "memory(GiB)": 107.26, + "step": 29340, + "train_speed(iter/s)": 1.634508 + }, + { + "acc": 0.65835791, + "epoch": 0.7444190766108574, + "grad_norm": 5.40625, + "learning_rate": 7.42021349607526e-06, + "loss": 1.58160629, + "memory(GiB)": 107.26, + "step": 29345, + "train_speed(iter/s)": 1.634537 + }, + { + "acc": 0.67601261, + "epoch": 0.7445459157787925, + "grad_norm": 4.6875, + "learning_rate": 7.419295850599673e-06, + "loss": 1.48615532, + "memory(GiB)": 107.26, + "step": 29350, + "train_speed(iter/s)": 1.634567 + }, + { + "acc": 0.65748472, + "epoch": 0.7446727549467276, + "grad_norm": 6.3125, + "learning_rate": 7.418378098712653e-06, + "loss": 1.61027641, + "memory(GiB)": 107.26, + "step": 29355, + "train_speed(iter/s)": 1.634597 + }, + { + "acc": 0.64197178, + "epoch": 0.7447995941146626, + "grad_norm": 5.1875, + "learning_rate": 7.417460240454568e-06, + "loss": 1.74327393, + "memory(GiB)": 107.26, + "step": 29360, + "train_speed(iter/s)": 1.634626 + }, + { + "acc": 0.66285787, + "epoch": 0.7449264332825977, + "grad_norm": 5.75, + "learning_rate": 7.4165422758657865e-06, + "loss": 1.55088511, + "memory(GiB)": 107.26, + "step": 29365, + "train_speed(iter/s)": 1.634655 + }, + { + "acc": 0.66351409, + "epoch": 0.7450532724505328, + "grad_norm": 5.9375, + "learning_rate": 7.415624204986689e-06, + "loss": 1.55484562, + "memory(GiB)": 107.26, + "step": 29370, + "train_speed(iter/s)": 1.634685 + }, + { + "acc": 0.65254173, + "epoch": 0.7451801116184678, + "grad_norm": 5.15625, + "learning_rate": 7.4147060278576525e-06, + "loss": 1.61883831, + "memory(GiB)": 107.26, + "step": 29375, + "train_speed(iter/s)": 1.634715 + }, + { + "acc": 0.65611467, + "epoch": 0.7453069507864029, + "grad_norm": 6.09375, + "learning_rate": 7.413787744519064e-06, + "loss": 1.57823658, + "memory(GiB)": 107.26, + "step": 29380, + "train_speed(iter/s)": 1.634745 + }, + { + "acc": 0.64709883, + "epoch": 0.7454337899543378, + "grad_norm": 5.46875, + "learning_rate": 7.412869355011314e-06, + "loss": 1.59464245, + "memory(GiB)": 107.26, + "step": 29385, + "train_speed(iter/s)": 1.634774 + }, + { + "acc": 0.64943509, + "epoch": 0.7455606291222729, + "grad_norm": 5.71875, + "learning_rate": 7.411950859374797e-06, + "loss": 1.66105232, + "memory(GiB)": 107.26, + "step": 29390, + "train_speed(iter/s)": 1.634801 + }, + { + "acc": 0.66782761, + "epoch": 0.745687468290208, + "grad_norm": 6.34375, + "learning_rate": 7.411032257649913e-06, + "loss": 1.58989353, + "memory(GiB)": 107.26, + "step": 29395, + "train_speed(iter/s)": 1.634832 + }, + { + "acc": 0.66166725, + "epoch": 0.745814307458143, + "grad_norm": 5.59375, + "learning_rate": 7.410113549877065e-06, + "loss": 1.57638054, + "memory(GiB)": 107.26, + "step": 29400, + "train_speed(iter/s)": 1.634863 + }, + { + "acc": 0.65150476, + "epoch": 0.7459411466260781, + "grad_norm": 4.875, + "learning_rate": 7.409194736096663e-06, + "loss": 1.62246895, + "memory(GiB)": 107.26, + "step": 29405, + "train_speed(iter/s)": 1.634893 + }, + { + "acc": 0.66954308, + "epoch": 0.7460679857940132, + "grad_norm": 5.84375, + "learning_rate": 7.408275816349121e-06, + "loss": 1.58283529, + "memory(GiB)": 107.26, + "step": 29410, + "train_speed(iter/s)": 1.634923 + }, + { + "acc": 0.65509677, + "epoch": 0.7461948249619482, + "grad_norm": 5.15625, + "learning_rate": 7.4073567906748555e-06, + "loss": 1.6054512, + "memory(GiB)": 107.26, + "step": 29415, + "train_speed(iter/s)": 1.634954 + }, + { + "acc": 0.65568814, + "epoch": 0.7463216641298833, + "grad_norm": 5.34375, + "learning_rate": 7.406437659114291e-06, + "loss": 1.57774544, + "memory(GiB)": 107.26, + "step": 29420, + "train_speed(iter/s)": 1.634985 + }, + { + "acc": 0.65638022, + "epoch": 0.7464485032978183, + "grad_norm": 5.59375, + "learning_rate": 7.405518421707854e-06, + "loss": 1.63415298, + "memory(GiB)": 107.26, + "step": 29425, + "train_speed(iter/s)": 1.635015 + }, + { + "acc": 0.65783281, + "epoch": 0.7465753424657534, + "grad_norm": 5.875, + "learning_rate": 7.404599078495977e-06, + "loss": 1.57890148, + "memory(GiB)": 107.26, + "step": 29430, + "train_speed(iter/s)": 1.635047 + }, + { + "acc": 0.64815598, + "epoch": 0.7467021816336885, + "grad_norm": 4.90625, + "learning_rate": 7.403679629519096e-06, + "loss": 1.62392502, + "memory(GiB)": 107.26, + "step": 29435, + "train_speed(iter/s)": 1.635076 + }, + { + "acc": 0.64220324, + "epoch": 0.7468290208016235, + "grad_norm": 5.34375, + "learning_rate": 7.402760074817654e-06, + "loss": 1.72198009, + "memory(GiB)": 107.26, + "step": 29440, + "train_speed(iter/s)": 1.635108 + }, + { + "acc": 0.68102574, + "epoch": 0.7469558599695586, + "grad_norm": 5.65625, + "learning_rate": 7.4018404144320955e-06, + "loss": 1.56236458, + "memory(GiB)": 107.26, + "step": 29445, + "train_speed(iter/s)": 1.635138 + }, + { + "acc": 0.66697998, + "epoch": 0.7470826991374937, + "grad_norm": 6.0, + "learning_rate": 7.4009206484028735e-06, + "loss": 1.61778507, + "memory(GiB)": 107.26, + "step": 29450, + "train_speed(iter/s)": 1.635168 + }, + { + "acc": 0.65659909, + "epoch": 0.7472095383054287, + "grad_norm": 5.59375, + "learning_rate": 7.400000776770441e-06, + "loss": 1.56847897, + "memory(GiB)": 107.26, + "step": 29455, + "train_speed(iter/s)": 1.635198 + }, + { + "acc": 0.64287806, + "epoch": 0.7473363774733638, + "grad_norm": 6.375, + "learning_rate": 7.39908079957526e-06, + "loss": 1.62988739, + "memory(GiB)": 107.26, + "step": 29460, + "train_speed(iter/s)": 1.635227 + }, + { + "acc": 0.65236793, + "epoch": 0.7474632166412988, + "grad_norm": 8.4375, + "learning_rate": 7.398160716857794e-06, + "loss": 1.61483688, + "memory(GiB)": 107.26, + "step": 29465, + "train_speed(iter/s)": 1.635255 + }, + { + "acc": 0.66173334, + "epoch": 0.7475900558092339, + "grad_norm": 5.40625, + "learning_rate": 7.397240528658513e-06, + "loss": 1.56382313, + "memory(GiB)": 107.26, + "step": 29470, + "train_speed(iter/s)": 1.635285 + }, + { + "acc": 0.66082401, + "epoch": 0.747716894977169, + "grad_norm": 5.78125, + "learning_rate": 7.39632023501789e-06, + "loss": 1.59990225, + "memory(GiB)": 107.26, + "step": 29475, + "train_speed(iter/s)": 1.635314 + }, + { + "acc": 0.64387903, + "epoch": 0.747843734145104, + "grad_norm": 5.5, + "learning_rate": 7.3953998359764036e-06, + "loss": 1.66633511, + "memory(GiB)": 107.26, + "step": 29480, + "train_speed(iter/s)": 1.635343 + }, + { + "acc": 0.65288548, + "epoch": 0.7479705733130391, + "grad_norm": 6.28125, + "learning_rate": 7.394479331574539e-06, + "loss": 1.62439461, + "memory(GiB)": 107.26, + "step": 29485, + "train_speed(iter/s)": 1.635372 + }, + { + "acc": 0.64740705, + "epoch": 0.7480974124809742, + "grad_norm": 5.21875, + "learning_rate": 7.393558721852783e-06, + "loss": 1.67519226, + "memory(GiB)": 107.26, + "step": 29490, + "train_speed(iter/s)": 1.635402 + }, + { + "acc": 0.63736835, + "epoch": 0.7482242516489092, + "grad_norm": 6.25, + "learning_rate": 7.392638006851627e-06, + "loss": 1.64093533, + "memory(GiB)": 107.26, + "step": 29495, + "train_speed(iter/s)": 1.635433 + }, + { + "acc": 0.66596212, + "epoch": 0.7483510908168443, + "grad_norm": 6.59375, + "learning_rate": 7.391717186611569e-06, + "loss": 1.57376862, + "memory(GiB)": 107.26, + "step": 29500, + "train_speed(iter/s)": 1.635464 + }, + { + "acc": 0.66979008, + "epoch": 0.7484779299847792, + "grad_norm": 6.46875, + "learning_rate": 7.39079626117311e-06, + "loss": 1.55795383, + "memory(GiB)": 107.26, + "step": 29505, + "train_speed(iter/s)": 1.635493 + }, + { + "acc": 0.64057879, + "epoch": 0.7486047691527143, + "grad_norm": 6.34375, + "learning_rate": 7.3898752305767595e-06, + "loss": 1.64928398, + "memory(GiB)": 107.26, + "step": 29510, + "train_speed(iter/s)": 1.635523 + }, + { + "acc": 0.64141364, + "epoch": 0.7487316083206494, + "grad_norm": 5.6875, + "learning_rate": 7.3889540948630245e-06, + "loss": 1.62794495, + "memory(GiB)": 107.26, + "step": 29515, + "train_speed(iter/s)": 1.635553 + }, + { + "acc": 0.65392351, + "epoch": 0.7488584474885844, + "grad_norm": 7.1875, + "learning_rate": 7.388032854072424e-06, + "loss": 1.58461208, + "memory(GiB)": 107.26, + "step": 29520, + "train_speed(iter/s)": 1.635583 + }, + { + "acc": 0.64792252, + "epoch": 0.7489852866565195, + "grad_norm": 6.09375, + "learning_rate": 7.387111508245476e-06, + "loss": 1.63074455, + "memory(GiB)": 107.26, + "step": 29525, + "train_speed(iter/s)": 1.635616 + }, + { + "acc": 0.66881089, + "epoch": 0.7491121258244546, + "grad_norm": 5.96875, + "learning_rate": 7.386190057422706e-06, + "loss": 1.49496088, + "memory(GiB)": 107.26, + "step": 29530, + "train_speed(iter/s)": 1.635648 + }, + { + "acc": 0.64808435, + "epoch": 0.7492389649923896, + "grad_norm": 5.1875, + "learning_rate": 7.385268501644645e-06, + "loss": 1.57918186, + "memory(GiB)": 107.26, + "step": 29535, + "train_speed(iter/s)": 1.635676 + }, + { + "acc": 0.65629177, + "epoch": 0.7493658041603247, + "grad_norm": 5.28125, + "learning_rate": 7.384346840951824e-06, + "loss": 1.61358967, + "memory(GiB)": 107.26, + "step": 29540, + "train_speed(iter/s)": 1.635706 + }, + { + "acc": 0.66148424, + "epoch": 0.7494926433282597, + "grad_norm": 5.875, + "learning_rate": 7.383425075384785e-06, + "loss": 1.54641685, + "memory(GiB)": 107.26, + "step": 29545, + "train_speed(iter/s)": 1.635735 + }, + { + "acc": 0.65984583, + "epoch": 0.7496194824961948, + "grad_norm": 5.5625, + "learning_rate": 7.382503204984069e-06, + "loss": 1.57003193, + "memory(GiB)": 107.26, + "step": 29550, + "train_speed(iter/s)": 1.635765 + }, + { + "acc": 0.64785705, + "epoch": 0.7497463216641299, + "grad_norm": 5.71875, + "learning_rate": 7.381581229790226e-06, + "loss": 1.56552334, + "memory(GiB)": 107.26, + "step": 29555, + "train_speed(iter/s)": 1.635798 + }, + { + "acc": 0.66115417, + "epoch": 0.7498731608320649, + "grad_norm": 5.75, + "learning_rate": 7.380659149843806e-06, + "loss": 1.55136147, + "memory(GiB)": 107.26, + "step": 29560, + "train_speed(iter/s)": 1.635829 + }, + { + "acc": 0.65330138, + "epoch": 0.75, + "grad_norm": 5.34375, + "learning_rate": 7.379736965185369e-06, + "loss": 1.60111237, + "memory(GiB)": 107.26, + "step": 29565, + "train_speed(iter/s)": 1.635857 + }, + { + "acc": 0.63867292, + "epoch": 0.7501268391679351, + "grad_norm": 4.65625, + "learning_rate": 7.378814675855475e-06, + "loss": 1.67757683, + "memory(GiB)": 107.26, + "step": 29570, + "train_speed(iter/s)": 1.635885 + }, + { + "acc": 0.66674709, + "epoch": 0.7502536783358701, + "grad_norm": 6.625, + "learning_rate": 7.37789228189469e-06, + "loss": 1.58155918, + "memory(GiB)": 107.26, + "step": 29575, + "train_speed(iter/s)": 1.635915 + }, + { + "acc": 0.65364809, + "epoch": 0.7503805175038052, + "grad_norm": 4.75, + "learning_rate": 7.376969783343588e-06, + "loss": 1.59277039, + "memory(GiB)": 107.26, + "step": 29580, + "train_speed(iter/s)": 1.635945 + }, + { + "acc": 0.66334772, + "epoch": 0.7505073566717403, + "grad_norm": 5.625, + "learning_rate": 7.37604718024274e-06, + "loss": 1.5441288, + "memory(GiB)": 107.26, + "step": 29585, + "train_speed(iter/s)": 1.635976 + }, + { + "acc": 0.66174784, + "epoch": 0.7506341958396753, + "grad_norm": 6.46875, + "learning_rate": 7.375124472632732e-06, + "loss": 1.6192318, + "memory(GiB)": 107.26, + "step": 29590, + "train_speed(iter/s)": 1.636007 + }, + { + "acc": 0.64309864, + "epoch": 0.7507610350076104, + "grad_norm": 5.0625, + "learning_rate": 7.374201660554142e-06, + "loss": 1.66559067, + "memory(GiB)": 107.26, + "step": 29595, + "train_speed(iter/s)": 1.636039 + }, + { + "acc": 0.65074983, + "epoch": 0.7508878741755454, + "grad_norm": 6.5, + "learning_rate": 7.373278744047565e-06, + "loss": 1.62208939, + "memory(GiB)": 107.26, + "step": 29600, + "train_speed(iter/s)": 1.63607 + }, + { + "acc": 0.64657602, + "epoch": 0.7510147133434805, + "grad_norm": 7.09375, + "learning_rate": 7.372355723153593e-06, + "loss": 1.67632256, + "memory(GiB)": 107.26, + "step": 29605, + "train_speed(iter/s)": 1.6361 + }, + { + "acc": 0.64342175, + "epoch": 0.7511415525114156, + "grad_norm": 6.65625, + "learning_rate": 7.371432597912824e-06, + "loss": 1.62645016, + "memory(GiB)": 107.26, + "step": 29610, + "train_speed(iter/s)": 1.63613 + }, + { + "acc": 0.66082139, + "epoch": 0.7512683916793506, + "grad_norm": 5.96875, + "learning_rate": 7.3705093683658616e-06, + "loss": 1.62099457, + "memory(GiB)": 107.26, + "step": 29615, + "train_speed(iter/s)": 1.636162 + }, + { + "acc": 0.65664968, + "epoch": 0.7513952308472857, + "grad_norm": 5.84375, + "learning_rate": 7.369586034553313e-06, + "loss": 1.61133804, + "memory(GiB)": 107.26, + "step": 29620, + "train_speed(iter/s)": 1.636195 + }, + { + "acc": 0.64428134, + "epoch": 0.7515220700152208, + "grad_norm": 6.4375, + "learning_rate": 7.368662596515792e-06, + "loss": 1.69118843, + "memory(GiB)": 107.26, + "step": 29625, + "train_speed(iter/s)": 1.636225 + }, + { + "acc": 0.65153189, + "epoch": 0.7516489091831557, + "grad_norm": 6.625, + "learning_rate": 7.367739054293914e-06, + "loss": 1.64189262, + "memory(GiB)": 107.26, + "step": 29630, + "train_speed(iter/s)": 1.636256 + }, + { + "acc": 0.6485333, + "epoch": 0.7517757483510908, + "grad_norm": 5.375, + "learning_rate": 7.366815407928302e-06, + "loss": 1.62105923, + "memory(GiB)": 107.26, + "step": 29635, + "train_speed(iter/s)": 1.636285 + }, + { + "acc": 0.65996079, + "epoch": 0.7519025875190258, + "grad_norm": 5.3125, + "learning_rate": 7.365891657459582e-06, + "loss": 1.59371748, + "memory(GiB)": 107.26, + "step": 29640, + "train_speed(iter/s)": 1.636314 + }, + { + "acc": 0.64206958, + "epoch": 0.7520294266869609, + "grad_norm": 5.6875, + "learning_rate": 7.3649678029283825e-06, + "loss": 1.65756855, + "memory(GiB)": 107.26, + "step": 29645, + "train_speed(iter/s)": 1.636344 + }, + { + "acc": 0.64436665, + "epoch": 0.752156265854896, + "grad_norm": 6.34375, + "learning_rate": 7.364043844375342e-06, + "loss": 1.65251083, + "memory(GiB)": 107.26, + "step": 29650, + "train_speed(iter/s)": 1.636373 + }, + { + "acc": 0.64730539, + "epoch": 0.752283105022831, + "grad_norm": 5.4375, + "learning_rate": 7.363119781841095e-06, + "loss": 1.65012016, + "memory(GiB)": 107.26, + "step": 29655, + "train_speed(iter/s)": 1.636403 + }, + { + "acc": 0.64375935, + "epoch": 0.7524099441907661, + "grad_norm": 5.78125, + "learning_rate": 7.362195615366293e-06, + "loss": 1.60773811, + "memory(GiB)": 107.26, + "step": 29660, + "train_speed(iter/s)": 1.636432 + }, + { + "acc": 0.67046642, + "epoch": 0.7525367833587012, + "grad_norm": 6.625, + "learning_rate": 7.361271344991579e-06, + "loss": 1.5541666, + "memory(GiB)": 107.26, + "step": 29665, + "train_speed(iter/s)": 1.636462 + }, + { + "acc": 0.64899292, + "epoch": 0.7526636225266362, + "grad_norm": 4.96875, + "learning_rate": 7.36034697075761e-06, + "loss": 1.62179127, + "memory(GiB)": 107.26, + "step": 29670, + "train_speed(iter/s)": 1.636491 + }, + { + "acc": 0.65920362, + "epoch": 0.7527904616945713, + "grad_norm": 6.875, + "learning_rate": 7.359422492705043e-06, + "loss": 1.59457026, + "memory(GiB)": 107.26, + "step": 29675, + "train_speed(iter/s)": 1.63652 + }, + { + "acc": 0.6616468, + "epoch": 0.7529173008625063, + "grad_norm": 7.1875, + "learning_rate": 7.3584979108745405e-06, + "loss": 1.56224499, + "memory(GiB)": 107.26, + "step": 29680, + "train_speed(iter/s)": 1.63655 + }, + { + "acc": 0.66282234, + "epoch": 0.7530441400304414, + "grad_norm": 5.625, + "learning_rate": 7.357573225306771e-06, + "loss": 1.61290379, + "memory(GiB)": 107.26, + "step": 29685, + "train_speed(iter/s)": 1.636579 + }, + { + "acc": 0.65243807, + "epoch": 0.7531709791983765, + "grad_norm": 6.59375, + "learning_rate": 7.356648436042404e-06, + "loss": 1.63054199, + "memory(GiB)": 107.26, + "step": 29690, + "train_speed(iter/s)": 1.636609 + }, + { + "acc": 0.64831009, + "epoch": 0.7532978183663115, + "grad_norm": 6.46875, + "learning_rate": 7.355723543122118e-06, + "loss": 1.65382385, + "memory(GiB)": 107.26, + "step": 29695, + "train_speed(iter/s)": 1.636639 + }, + { + "acc": 0.67311249, + "epoch": 0.7534246575342466, + "grad_norm": 4.90625, + "learning_rate": 7.354798546586592e-06, + "loss": 1.5371871, + "memory(GiB)": 107.26, + "step": 29700, + "train_speed(iter/s)": 1.63667 + }, + { + "acc": 0.66026783, + "epoch": 0.7535514967021817, + "grad_norm": 7.03125, + "learning_rate": 7.353873446476512e-06, + "loss": 1.57704239, + "memory(GiB)": 107.26, + "step": 29705, + "train_speed(iter/s)": 1.636698 + }, + { + "acc": 0.65831127, + "epoch": 0.7536783358701167, + "grad_norm": 5.9375, + "learning_rate": 7.3529482428325705e-06, + "loss": 1.65265522, + "memory(GiB)": 107.26, + "step": 29710, + "train_speed(iter/s)": 1.636728 + }, + { + "acc": 0.65506353, + "epoch": 0.7538051750380518, + "grad_norm": 6.09375, + "learning_rate": 7.35202293569546e-06, + "loss": 1.62467575, + "memory(GiB)": 107.26, + "step": 29715, + "train_speed(iter/s)": 1.636759 + }, + { + "acc": 0.65700579, + "epoch": 0.7539320142059868, + "grad_norm": 5.375, + "learning_rate": 7.351097525105878e-06, + "loss": 1.59740696, + "memory(GiB)": 107.26, + "step": 29720, + "train_speed(iter/s)": 1.636784 + }, + { + "acc": 0.65579414, + "epoch": 0.7540588533739219, + "grad_norm": 4.90625, + "learning_rate": 7.35017201110453e-06, + "loss": 1.55782127, + "memory(GiB)": 107.26, + "step": 29725, + "train_speed(iter/s)": 1.636814 + }, + { + "acc": 0.64052753, + "epoch": 0.754185692541857, + "grad_norm": 6.5, + "learning_rate": 7.349246393732126e-06, + "loss": 1.65632668, + "memory(GiB)": 107.26, + "step": 29730, + "train_speed(iter/s)": 1.636841 + }, + { + "acc": 0.65518627, + "epoch": 0.754312531709792, + "grad_norm": 7.59375, + "learning_rate": 7.3483206730293755e-06, + "loss": 1.61497421, + "memory(GiB)": 107.26, + "step": 29735, + "train_speed(iter/s)": 1.63687 + }, + { + "acc": 0.66390858, + "epoch": 0.7544393708777271, + "grad_norm": 5.46875, + "learning_rate": 7.347394849036998e-06, + "loss": 1.59423027, + "memory(GiB)": 107.26, + "step": 29740, + "train_speed(iter/s)": 1.6369 + }, + { + "acc": 0.66273823, + "epoch": 0.7545662100456622, + "grad_norm": 5.53125, + "learning_rate": 7.346468921795714e-06, + "loss": 1.53641481, + "memory(GiB)": 107.26, + "step": 29745, + "train_speed(iter/s)": 1.636932 + }, + { + "acc": 0.65079851, + "epoch": 0.7546930492135971, + "grad_norm": 5.15625, + "learning_rate": 7.345542891346251e-06, + "loss": 1.6226469, + "memory(GiB)": 107.26, + "step": 29750, + "train_speed(iter/s)": 1.636961 + }, + { + "acc": 0.65375557, + "epoch": 0.7548198883815322, + "grad_norm": 5.46875, + "learning_rate": 7.344616757729341e-06, + "loss": 1.60310822, + "memory(GiB)": 107.26, + "step": 29755, + "train_speed(iter/s)": 1.636991 + }, + { + "acc": 0.66145716, + "epoch": 0.7549467275494672, + "grad_norm": 5.65625, + "learning_rate": 7.343690520985716e-06, + "loss": 1.61966534, + "memory(GiB)": 107.26, + "step": 29760, + "train_speed(iter/s)": 1.637021 + }, + { + "acc": 0.64625902, + "epoch": 0.7550735667174023, + "grad_norm": 5.5, + "learning_rate": 7.342764181156119e-06, + "loss": 1.59873657, + "memory(GiB)": 107.26, + "step": 29765, + "train_speed(iter/s)": 1.63705 + }, + { + "acc": 0.65413418, + "epoch": 0.7552004058853374, + "grad_norm": 5.03125, + "learning_rate": 7.341837738281293e-06, + "loss": 1.59818268, + "memory(GiB)": 107.26, + "step": 29770, + "train_speed(iter/s)": 1.63708 + }, + { + "acc": 0.66173258, + "epoch": 0.7553272450532724, + "grad_norm": 8.25, + "learning_rate": 7.3409111924019885e-06, + "loss": 1.62648888, + "memory(GiB)": 107.26, + "step": 29775, + "train_speed(iter/s)": 1.637109 + }, + { + "acc": 0.65616388, + "epoch": 0.7554540842212075, + "grad_norm": 6.375, + "learning_rate": 7.3399845435589574e-06, + "loss": 1.63140087, + "memory(GiB)": 107.26, + "step": 29780, + "train_speed(iter/s)": 1.63714 + }, + { + "acc": 0.6688632, + "epoch": 0.7555809233891426, + "grad_norm": 5.40625, + "learning_rate": 7.33905779179296e-06, + "loss": 1.5481226, + "memory(GiB)": 107.26, + "step": 29785, + "train_speed(iter/s)": 1.637172 + }, + { + "acc": 0.65429974, + "epoch": 0.7557077625570776, + "grad_norm": 5.125, + "learning_rate": 7.338130937144756e-06, + "loss": 1.62104034, + "memory(GiB)": 107.26, + "step": 29790, + "train_speed(iter/s)": 1.6372 + }, + { + "acc": 0.67015171, + "epoch": 0.7558346017250127, + "grad_norm": 5.71875, + "learning_rate": 7.3372039796551156e-06, + "loss": 1.5546936, + "memory(GiB)": 107.26, + "step": 29795, + "train_speed(iter/s)": 1.637229 + }, + { + "acc": 0.65639257, + "epoch": 0.7559614408929477, + "grad_norm": 5.21875, + "learning_rate": 7.33627691936481e-06, + "loss": 1.59825134, + "memory(GiB)": 107.26, + "step": 29800, + "train_speed(iter/s)": 1.637261 + }, + { + "acc": 0.68607998, + "epoch": 0.7560882800608828, + "grad_norm": 4.53125, + "learning_rate": 7.335349756314614e-06, + "loss": 1.48566208, + "memory(GiB)": 107.26, + "step": 29805, + "train_speed(iter/s)": 1.637289 + }, + { + "acc": 0.64377146, + "epoch": 0.7562151192288179, + "grad_norm": 5.0625, + "learning_rate": 7.33442249054531e-06, + "loss": 1.66704712, + "memory(GiB)": 107.26, + "step": 29810, + "train_speed(iter/s)": 1.63732 + }, + { + "acc": 0.64667444, + "epoch": 0.7563419583967529, + "grad_norm": 5.0625, + "learning_rate": 7.33349512209768e-06, + "loss": 1.62487621, + "memory(GiB)": 107.26, + "step": 29815, + "train_speed(iter/s)": 1.637351 + }, + { + "acc": 0.64919128, + "epoch": 0.756468797564688, + "grad_norm": 5.78125, + "learning_rate": 7.332567651012518e-06, + "loss": 1.62173386, + "memory(GiB)": 107.26, + "step": 29820, + "train_speed(iter/s)": 1.63738 + }, + { + "acc": 0.66280093, + "epoch": 0.7565956367326231, + "grad_norm": 5.4375, + "learning_rate": 7.331640077330616e-06, + "loss": 1.59632034, + "memory(GiB)": 107.26, + "step": 29825, + "train_speed(iter/s)": 1.637411 + }, + { + "acc": 0.66249547, + "epoch": 0.7567224759005581, + "grad_norm": 6.28125, + "learning_rate": 7.330712401092773e-06, + "loss": 1.64506664, + "memory(GiB)": 107.26, + "step": 29830, + "train_speed(iter/s)": 1.637441 + }, + { + "acc": 0.65253234, + "epoch": 0.7568493150684932, + "grad_norm": 5.46875, + "learning_rate": 7.329784622339794e-06, + "loss": 1.59413738, + "memory(GiB)": 107.26, + "step": 29835, + "train_speed(iter/s)": 1.637472 + }, + { + "acc": 0.64974833, + "epoch": 0.7569761542364282, + "grad_norm": 6.28125, + "learning_rate": 7.328856741112484e-06, + "loss": 1.61772957, + "memory(GiB)": 107.26, + "step": 29840, + "train_speed(iter/s)": 1.637502 + }, + { + "acc": 0.64665971, + "epoch": 0.7571029934043633, + "grad_norm": 6.46875, + "learning_rate": 7.327928757451659e-06, + "loss": 1.61196213, + "memory(GiB)": 107.26, + "step": 29845, + "train_speed(iter/s)": 1.637533 + }, + { + "acc": 0.67542439, + "epoch": 0.7572298325722984, + "grad_norm": 4.96875, + "learning_rate": 7.3270006713981325e-06, + "loss": 1.54231443, + "memory(GiB)": 107.26, + "step": 29850, + "train_speed(iter/s)": 1.637563 + }, + { + "acc": 0.65324078, + "epoch": 0.7573566717402334, + "grad_norm": 5.3125, + "learning_rate": 7.326072482992728e-06, + "loss": 1.65557213, + "memory(GiB)": 107.26, + "step": 29855, + "train_speed(iter/s)": 1.637593 + }, + { + "acc": 0.64249907, + "epoch": 0.7574835109081685, + "grad_norm": 6.40625, + "learning_rate": 7.325144192276269e-06, + "loss": 1.67062187, + "memory(GiB)": 107.26, + "step": 29860, + "train_speed(iter/s)": 1.637623 + }, + { + "acc": 0.65190935, + "epoch": 0.7576103500761036, + "grad_norm": 6.21875, + "learning_rate": 7.324215799289588e-06, + "loss": 1.66738281, + "memory(GiB)": 107.26, + "step": 29865, + "train_speed(iter/s)": 1.637655 + }, + { + "acc": 0.66650739, + "epoch": 0.7577371892440385, + "grad_norm": 7.59375, + "learning_rate": 7.3232873040735194e-06, + "loss": 1.56114635, + "memory(GiB)": 107.26, + "step": 29870, + "train_speed(iter/s)": 1.637685 + }, + { + "acc": 0.65164852, + "epoch": 0.7578640284119736, + "grad_norm": 5.46875, + "learning_rate": 7.322358706668901e-06, + "loss": 1.64391556, + "memory(GiB)": 107.26, + "step": 29875, + "train_speed(iter/s)": 1.637715 + }, + { + "acc": 0.65894613, + "epoch": 0.7579908675799086, + "grad_norm": 4.9375, + "learning_rate": 7.321430007116582e-06, + "loss": 1.60768604, + "memory(GiB)": 107.26, + "step": 29880, + "train_speed(iter/s)": 1.637744 + }, + { + "acc": 0.63830967, + "epoch": 0.7581177067478437, + "grad_norm": 6.46875, + "learning_rate": 7.320501205457403e-06, + "loss": 1.67277222, + "memory(GiB)": 107.26, + "step": 29885, + "train_speed(iter/s)": 1.637773 + }, + { + "acc": 0.65324621, + "epoch": 0.7582445459157788, + "grad_norm": 6.65625, + "learning_rate": 7.319572301732224e-06, + "loss": 1.59981041, + "memory(GiB)": 107.26, + "step": 29890, + "train_speed(iter/s)": 1.637805 + }, + { + "acc": 0.64784412, + "epoch": 0.7583713850837138, + "grad_norm": 4.59375, + "learning_rate": 7.3186432959818956e-06, + "loss": 1.62317047, + "memory(GiB)": 107.26, + "step": 29895, + "train_speed(iter/s)": 1.637368 + }, + { + "acc": 0.66519856, + "epoch": 0.7584982242516489, + "grad_norm": 5.75, + "learning_rate": 7.317714188247285e-06, + "loss": 1.56719046, + "memory(GiB)": 107.26, + "step": 29900, + "train_speed(iter/s)": 1.637398 + }, + { + "acc": 0.64895034, + "epoch": 0.758625063419584, + "grad_norm": 5.53125, + "learning_rate": 7.316784978569256e-06, + "loss": 1.61393585, + "memory(GiB)": 107.26, + "step": 29905, + "train_speed(iter/s)": 1.637429 + }, + { + "acc": 0.6667388, + "epoch": 0.758751902587519, + "grad_norm": 5.5, + "learning_rate": 7.31585566698868e-06, + "loss": 1.49736147, + "memory(GiB)": 107.26, + "step": 29910, + "train_speed(iter/s)": 1.637457 + }, + { + "acc": 0.66874523, + "epoch": 0.7588787417554541, + "grad_norm": 6.15625, + "learning_rate": 7.314926253546433e-06, + "loss": 1.50971413, + "memory(GiB)": 107.26, + "step": 29915, + "train_speed(iter/s)": 1.637488 + }, + { + "acc": 0.65503187, + "epoch": 0.7590055809233891, + "grad_norm": 5.40625, + "learning_rate": 7.313996738283393e-06, + "loss": 1.61468105, + "memory(GiB)": 107.26, + "step": 29920, + "train_speed(iter/s)": 1.637518 + }, + { + "acc": 0.65964861, + "epoch": 0.7591324200913242, + "grad_norm": 6.5, + "learning_rate": 7.3130671212404455e-06, + "loss": 1.63434219, + "memory(GiB)": 107.26, + "step": 29925, + "train_speed(iter/s)": 1.63755 + }, + { + "acc": 0.66120968, + "epoch": 0.7592592592592593, + "grad_norm": 5.0, + "learning_rate": 7.312137402458479e-06, + "loss": 1.59386234, + "memory(GiB)": 107.26, + "step": 29930, + "train_speed(iter/s)": 1.637581 + }, + { + "acc": 0.65374956, + "epoch": 0.7593860984271943, + "grad_norm": 5.03125, + "learning_rate": 7.3112075819783864e-06, + "loss": 1.61699638, + "memory(GiB)": 107.26, + "step": 29935, + "train_speed(iter/s)": 1.637611 + }, + { + "acc": 0.63833857, + "epoch": 0.7595129375951294, + "grad_norm": 7.96875, + "learning_rate": 7.310277659841066e-06, + "loss": 1.71940956, + "memory(GiB)": 107.26, + "step": 29940, + "train_speed(iter/s)": 1.63764 + }, + { + "acc": 0.63947573, + "epoch": 0.7596397767630645, + "grad_norm": 5.6875, + "learning_rate": 7.309347636087418e-06, + "loss": 1.63878136, + "memory(GiB)": 107.26, + "step": 29945, + "train_speed(iter/s)": 1.637669 + }, + { + "acc": 0.63901157, + "epoch": 0.7597666159309995, + "grad_norm": 4.59375, + "learning_rate": 7.308417510758353e-06, + "loss": 1.66148605, + "memory(GiB)": 107.26, + "step": 29950, + "train_speed(iter/s)": 1.637699 + }, + { + "acc": 0.63761992, + "epoch": 0.7598934550989346, + "grad_norm": 5.21875, + "learning_rate": 7.307487283894777e-06, + "loss": 1.69525356, + "memory(GiB)": 107.26, + "step": 29955, + "train_speed(iter/s)": 1.637728 + }, + { + "acc": 0.67261648, + "epoch": 0.7600202942668696, + "grad_norm": 6.0625, + "learning_rate": 7.30655695553761e-06, + "loss": 1.51520844, + "memory(GiB)": 107.26, + "step": 29960, + "train_speed(iter/s)": 1.63776 + }, + { + "acc": 0.66503682, + "epoch": 0.7601471334348047, + "grad_norm": 6.375, + "learning_rate": 7.305626525727769e-06, + "loss": 1.58030739, + "memory(GiB)": 107.26, + "step": 29965, + "train_speed(iter/s)": 1.637791 + }, + { + "acc": 0.65106478, + "epoch": 0.7602739726027398, + "grad_norm": 4.9375, + "learning_rate": 7.30469599450618e-06, + "loss": 1.70409756, + "memory(GiB)": 107.26, + "step": 29970, + "train_speed(iter/s)": 1.637822 + }, + { + "acc": 0.65443373, + "epoch": 0.7604008117706748, + "grad_norm": 4.5, + "learning_rate": 7.30376536191377e-06, + "loss": 1.58743601, + "memory(GiB)": 107.26, + "step": 29975, + "train_speed(iter/s)": 1.63785 + }, + { + "acc": 0.65636206, + "epoch": 0.7605276509386099, + "grad_norm": 4.71875, + "learning_rate": 7.302834627991477e-06, + "loss": 1.61941223, + "memory(GiB)": 107.26, + "step": 29980, + "train_speed(iter/s)": 1.637877 + }, + { + "acc": 0.65274434, + "epoch": 0.760654490106545, + "grad_norm": 5.125, + "learning_rate": 7.301903792780233e-06, + "loss": 1.6437767, + "memory(GiB)": 107.26, + "step": 29985, + "train_speed(iter/s)": 1.637908 + }, + { + "acc": 0.66308994, + "epoch": 0.76078132927448, + "grad_norm": 5.125, + "learning_rate": 7.300972856320984e-06, + "loss": 1.54835691, + "memory(GiB)": 107.26, + "step": 29990, + "train_speed(iter/s)": 1.637938 + }, + { + "acc": 0.64939518, + "epoch": 0.760908168442415, + "grad_norm": 6.4375, + "learning_rate": 7.3000418186546754e-06, + "loss": 1.61641197, + "memory(GiB)": 107.26, + "step": 29995, + "train_speed(iter/s)": 1.637969 + }, + { + "acc": 0.67032819, + "epoch": 0.76103500761035, + "grad_norm": 4.875, + "learning_rate": 7.299110679822258e-06, + "loss": 1.51219215, + "memory(GiB)": 107.26, + "step": 30000, + "train_speed(iter/s)": 1.637998 + }, + { + "epoch": 0.76103500761035, + "eval_acc": 0.6454574121292995, + "eval_loss": 1.5773452520370483, + "eval_runtime": 58.4813, + "eval_samples_per_second": 108.924, + "eval_steps_per_second": 27.239, + "step": 30000 + }, + { + "acc": 0.64851847, + "epoch": 0.7611618467782851, + "grad_norm": 5.59375, + "learning_rate": 7.298179439864689e-06, + "loss": 1.64854012, + "memory(GiB)": 107.26, + "step": 30005, + "train_speed(iter/s)": 1.632436 + }, + { + "acc": 0.65962582, + "epoch": 0.7612886859462202, + "grad_norm": 5.15625, + "learning_rate": 7.297248098822926e-06, + "loss": 1.59453669, + "memory(GiB)": 107.26, + "step": 30010, + "train_speed(iter/s)": 1.632465 + }, + { + "acc": 0.65277605, + "epoch": 0.7614155251141552, + "grad_norm": 4.78125, + "learning_rate": 7.296316656737936e-06, + "loss": 1.59894285, + "memory(GiB)": 107.26, + "step": 30015, + "train_speed(iter/s)": 1.632497 + }, + { + "acc": 0.67766733, + "epoch": 0.7615423642820903, + "grad_norm": 5.25, + "learning_rate": 7.295385113650689e-06, + "loss": 1.56710539, + "memory(GiB)": 107.26, + "step": 30020, + "train_speed(iter/s)": 1.632526 + }, + { + "acc": 0.65055079, + "epoch": 0.7616692034500254, + "grad_norm": 5.65625, + "learning_rate": 7.294453469602154e-06, + "loss": 1.5896925, + "memory(GiB)": 107.26, + "step": 30025, + "train_speed(iter/s)": 1.632557 + }, + { + "acc": 0.67665253, + "epoch": 0.7617960426179604, + "grad_norm": 4.96875, + "learning_rate": 7.293521724633313e-06, + "loss": 1.54910517, + "memory(GiB)": 107.26, + "step": 30030, + "train_speed(iter/s)": 1.632589 + }, + { + "acc": 0.6501286, + "epoch": 0.7619228817858955, + "grad_norm": 5.34375, + "learning_rate": 7.2925898787851455e-06, + "loss": 1.60441208, + "memory(GiB)": 107.26, + "step": 30035, + "train_speed(iter/s)": 1.632618 + }, + { + "acc": 0.65851078, + "epoch": 0.7620497209538305, + "grad_norm": 5.4375, + "learning_rate": 7.2916579320986415e-06, + "loss": 1.57115822, + "memory(GiB)": 107.26, + "step": 30040, + "train_speed(iter/s)": 1.632648 + }, + { + "acc": 0.66216612, + "epoch": 0.7621765601217656, + "grad_norm": 5.9375, + "learning_rate": 7.290725884614787e-06, + "loss": 1.62082214, + "memory(GiB)": 107.26, + "step": 30045, + "train_speed(iter/s)": 1.632679 + }, + { + "acc": 0.65391493, + "epoch": 0.7623033992897007, + "grad_norm": 5.0, + "learning_rate": 7.2897937363745844e-06, + "loss": 1.59441214, + "memory(GiB)": 107.26, + "step": 30050, + "train_speed(iter/s)": 1.632709 + }, + { + "acc": 0.64813786, + "epoch": 0.7624302384576357, + "grad_norm": 6.53125, + "learning_rate": 7.2888614874190276e-06, + "loss": 1.64497299, + "memory(GiB)": 107.26, + "step": 30055, + "train_speed(iter/s)": 1.632739 + }, + { + "acc": 0.65853043, + "epoch": 0.7625570776255708, + "grad_norm": 5.96875, + "learning_rate": 7.287929137789124e-06, + "loss": 1.60991402, + "memory(GiB)": 107.26, + "step": 30060, + "train_speed(iter/s)": 1.632767 + }, + { + "acc": 0.66545115, + "epoch": 0.7626839167935059, + "grad_norm": 6.1875, + "learning_rate": 7.286996687525882e-06, + "loss": 1.59634819, + "memory(GiB)": 107.26, + "step": 30065, + "train_speed(iter/s)": 1.632797 + }, + { + "acc": 0.64672623, + "epoch": 0.7628107559614409, + "grad_norm": 5.0, + "learning_rate": 7.2860641366703155e-06, + "loss": 1.61122017, + "memory(GiB)": 107.26, + "step": 30070, + "train_speed(iter/s)": 1.632828 + }, + { + "acc": 0.64658146, + "epoch": 0.762937595129376, + "grad_norm": 5.53125, + "learning_rate": 7.285131485263441e-06, + "loss": 1.63771935, + "memory(GiB)": 107.26, + "step": 30075, + "train_speed(iter/s)": 1.632857 + }, + { + "acc": 0.65267591, + "epoch": 0.763064434297311, + "grad_norm": 5.3125, + "learning_rate": 7.2841987333462815e-06, + "loss": 1.68563614, + "memory(GiB)": 107.26, + "step": 30080, + "train_speed(iter/s)": 1.632887 + }, + { + "acc": 0.65449533, + "epoch": 0.7631912734652461, + "grad_norm": 6.0625, + "learning_rate": 7.283265880959863e-06, + "loss": 1.59625053, + "memory(GiB)": 107.26, + "step": 30085, + "train_speed(iter/s)": 1.632915 + }, + { + "acc": 0.65416627, + "epoch": 0.7633181126331812, + "grad_norm": 5.03125, + "learning_rate": 7.282332928145219e-06, + "loss": 1.60183277, + "memory(GiB)": 107.26, + "step": 30090, + "train_speed(iter/s)": 1.632944 + }, + { + "acc": 0.65568857, + "epoch": 0.7634449518011162, + "grad_norm": 5.4375, + "learning_rate": 7.281399874943381e-06, + "loss": 1.55371704, + "memory(GiB)": 107.26, + "step": 30095, + "train_speed(iter/s)": 1.632973 + }, + { + "acc": 0.65219216, + "epoch": 0.7635717909690513, + "grad_norm": 6.03125, + "learning_rate": 7.280466721395393e-06, + "loss": 1.67741356, + "memory(GiB)": 107.26, + "step": 30100, + "train_speed(iter/s)": 1.633002 + }, + { + "acc": 0.64180088, + "epoch": 0.7636986301369864, + "grad_norm": 5.1875, + "learning_rate": 7.279533467542295e-06, + "loss": 1.63484802, + "memory(GiB)": 107.26, + "step": 30105, + "train_speed(iter/s)": 1.633032 + }, + { + "acc": 0.64799442, + "epoch": 0.7638254693049213, + "grad_norm": 5.78125, + "learning_rate": 7.2786001134251385e-06, + "loss": 1.65406303, + "memory(GiB)": 107.26, + "step": 30110, + "train_speed(iter/s)": 1.633065 + }, + { + "acc": 0.6596735, + "epoch": 0.7639523084728564, + "grad_norm": 6.25, + "learning_rate": 7.2776666590849744e-06, + "loss": 1.58586216, + "memory(GiB)": 107.26, + "step": 30115, + "train_speed(iter/s)": 1.633096 + }, + { + "acc": 0.66249547, + "epoch": 0.7640791476407914, + "grad_norm": 5.0, + "learning_rate": 7.276733104562863e-06, + "loss": 1.5884119, + "memory(GiB)": 107.26, + "step": 30120, + "train_speed(iter/s)": 1.633124 + }, + { + "acc": 0.65575652, + "epoch": 0.7642059868087265, + "grad_norm": 4.9375, + "learning_rate": 7.275799449899865e-06, + "loss": 1.62027416, + "memory(GiB)": 107.26, + "step": 30125, + "train_speed(iter/s)": 1.633154 + }, + { + "acc": 0.65884333, + "epoch": 0.7643328259766616, + "grad_norm": 5.8125, + "learning_rate": 7.274865695137046e-06, + "loss": 1.63180504, + "memory(GiB)": 107.26, + "step": 30130, + "train_speed(iter/s)": 1.633184 + }, + { + "acc": 0.65675788, + "epoch": 0.7644596651445966, + "grad_norm": 5.71875, + "learning_rate": 7.273931840315477e-06, + "loss": 1.59313021, + "memory(GiB)": 107.26, + "step": 30135, + "train_speed(iter/s)": 1.633214 + }, + { + "acc": 0.66940193, + "epoch": 0.7645865043125317, + "grad_norm": 4.96875, + "learning_rate": 7.272997885476234e-06, + "loss": 1.57314262, + "memory(GiB)": 107.26, + "step": 30140, + "train_speed(iter/s)": 1.633244 + }, + { + "acc": 0.64023342, + "epoch": 0.7647133434804668, + "grad_norm": 5.09375, + "learning_rate": 7.272063830660395e-06, + "loss": 1.68861885, + "memory(GiB)": 107.26, + "step": 30145, + "train_speed(iter/s)": 1.633273 + }, + { + "acc": 0.65716252, + "epoch": 0.7648401826484018, + "grad_norm": 5.28125, + "learning_rate": 7.271129675909046e-06, + "loss": 1.59184484, + "memory(GiB)": 107.26, + "step": 30150, + "train_speed(iter/s)": 1.633304 + }, + { + "acc": 0.66924739, + "epoch": 0.7649670218163369, + "grad_norm": 6.21875, + "learning_rate": 7.270195421263271e-06, + "loss": 1.53166142, + "memory(GiB)": 107.26, + "step": 30155, + "train_speed(iter/s)": 1.633328 + }, + { + "acc": 0.65546036, + "epoch": 0.7650938609842719, + "grad_norm": 6.1875, + "learning_rate": 7.269261066764169e-06, + "loss": 1.61620693, + "memory(GiB)": 107.26, + "step": 30160, + "train_speed(iter/s)": 1.633357 + }, + { + "acc": 0.63966408, + "epoch": 0.765220700152207, + "grad_norm": 6.875, + "learning_rate": 7.268326612452832e-06, + "loss": 1.6111599, + "memory(GiB)": 107.26, + "step": 30165, + "train_speed(iter/s)": 1.633392 + }, + { + "acc": 0.66058455, + "epoch": 0.7653475393201421, + "grad_norm": 7.09375, + "learning_rate": 7.267392058370364e-06, + "loss": 1.57169638, + "memory(GiB)": 107.26, + "step": 30170, + "train_speed(iter/s)": 1.633422 + }, + { + "acc": 0.65681047, + "epoch": 0.7654743784880771, + "grad_norm": 5.59375, + "learning_rate": 7.2664574045578685e-06, + "loss": 1.61545181, + "memory(GiB)": 107.26, + "step": 30175, + "train_speed(iter/s)": 1.633452 + }, + { + "acc": 0.65944204, + "epoch": 0.7656012176560122, + "grad_norm": 5.28125, + "learning_rate": 7.26552265105646e-06, + "loss": 1.62280788, + "memory(GiB)": 107.26, + "step": 30180, + "train_speed(iter/s)": 1.633482 + }, + { + "acc": 0.64229665, + "epoch": 0.7657280568239473, + "grad_norm": 5.125, + "learning_rate": 7.264587797907248e-06, + "loss": 1.60261612, + "memory(GiB)": 107.26, + "step": 30185, + "train_speed(iter/s)": 1.633512 + }, + { + "acc": 0.65486569, + "epoch": 0.7658548959918823, + "grad_norm": 5.09375, + "learning_rate": 7.263652845151354e-06, + "loss": 1.56436367, + "memory(GiB)": 107.26, + "step": 30190, + "train_speed(iter/s)": 1.633542 + }, + { + "acc": 0.65718236, + "epoch": 0.7659817351598174, + "grad_norm": 6.0, + "learning_rate": 7.262717792829903e-06, + "loss": 1.58778667, + "memory(GiB)": 107.26, + "step": 30195, + "train_speed(iter/s)": 1.633573 + }, + { + "acc": 0.660671, + "epoch": 0.7661085743277524, + "grad_norm": 7.125, + "learning_rate": 7.261782640984021e-06, + "loss": 1.57647133, + "memory(GiB)": 107.26, + "step": 30200, + "train_speed(iter/s)": 1.633603 + }, + { + "acc": 0.63908391, + "epoch": 0.7662354134956875, + "grad_norm": 5.875, + "learning_rate": 7.26084738965484e-06, + "loss": 1.64984779, + "memory(GiB)": 107.26, + "step": 30205, + "train_speed(iter/s)": 1.633633 + }, + { + "acc": 0.65461035, + "epoch": 0.7663622526636226, + "grad_norm": 6.34375, + "learning_rate": 7.2599120388834964e-06, + "loss": 1.65573215, + "memory(GiB)": 107.26, + "step": 30210, + "train_speed(iter/s)": 1.633663 + }, + { + "acc": 0.66593251, + "epoch": 0.7664890918315576, + "grad_norm": 5.84375, + "learning_rate": 7.258976588711133e-06, + "loss": 1.59546089, + "memory(GiB)": 107.26, + "step": 30215, + "train_speed(iter/s)": 1.633695 + }, + { + "acc": 0.66262951, + "epoch": 0.7666159309994927, + "grad_norm": 6.28125, + "learning_rate": 7.258041039178891e-06, + "loss": 1.56573458, + "memory(GiB)": 107.26, + "step": 30220, + "train_speed(iter/s)": 1.633725 + }, + { + "acc": 0.64453897, + "epoch": 0.7667427701674278, + "grad_norm": 6.71875, + "learning_rate": 7.257105390327925e-06, + "loss": 1.64220371, + "memory(GiB)": 107.26, + "step": 30225, + "train_speed(iter/s)": 1.633754 + }, + { + "acc": 0.65888615, + "epoch": 0.7668696093353627, + "grad_norm": 5.46875, + "learning_rate": 7.256169642199386e-06, + "loss": 1.56288233, + "memory(GiB)": 107.26, + "step": 30230, + "train_speed(iter/s)": 1.633784 + }, + { + "acc": 0.649687, + "epoch": 0.7669964485032978, + "grad_norm": 5.625, + "learning_rate": 7.255233794834432e-06, + "loss": 1.61188564, + "memory(GiB)": 107.26, + "step": 30235, + "train_speed(iter/s)": 1.633815 + }, + { + "acc": 0.67573872, + "epoch": 0.7671232876712328, + "grad_norm": 6.0625, + "learning_rate": 7.254297848274229e-06, + "loss": 1.52309999, + "memory(GiB)": 107.26, + "step": 30240, + "train_speed(iter/s)": 1.633845 + }, + { + "acc": 0.6575633, + "epoch": 0.7672501268391679, + "grad_norm": 6.90625, + "learning_rate": 7.25336180255994e-06, + "loss": 1.60211353, + "memory(GiB)": 107.26, + "step": 30245, + "train_speed(iter/s)": 1.633875 + }, + { + "acc": 0.65879192, + "epoch": 0.767376966007103, + "grad_norm": 4.875, + "learning_rate": 7.25242565773274e-06, + "loss": 1.66904621, + "memory(GiB)": 107.26, + "step": 30250, + "train_speed(iter/s)": 1.633902 + }, + { + "acc": 0.65146317, + "epoch": 0.767503805175038, + "grad_norm": 5.09375, + "learning_rate": 7.251489413833801e-06, + "loss": 1.62308445, + "memory(GiB)": 107.26, + "step": 30255, + "train_speed(iter/s)": 1.633932 + }, + { + "acc": 0.65045757, + "epoch": 0.7676306443429731, + "grad_norm": 5.875, + "learning_rate": 7.250553070904307e-06, + "loss": 1.65252571, + "memory(GiB)": 107.26, + "step": 30260, + "train_speed(iter/s)": 1.633962 + }, + { + "acc": 0.66111755, + "epoch": 0.7677574835109082, + "grad_norm": 5.9375, + "learning_rate": 7.2496166289854404e-06, + "loss": 1.57442474, + "memory(GiB)": 107.26, + "step": 30265, + "train_speed(iter/s)": 1.633992 + }, + { + "acc": 0.65822687, + "epoch": 0.7678843226788432, + "grad_norm": 7.4375, + "learning_rate": 7.24868008811839e-06, + "loss": 1.60991058, + "memory(GiB)": 107.26, + "step": 30270, + "train_speed(iter/s)": 1.634024 + }, + { + "acc": 0.66016645, + "epoch": 0.7680111618467783, + "grad_norm": 5.78125, + "learning_rate": 7.247743448344351e-06, + "loss": 1.61061821, + "memory(GiB)": 107.26, + "step": 30275, + "train_speed(iter/s)": 1.634055 + }, + { + "acc": 0.64137654, + "epoch": 0.7681380010147133, + "grad_norm": 4.96875, + "learning_rate": 7.246806709704519e-06, + "loss": 1.6832943, + "memory(GiB)": 107.26, + "step": 30280, + "train_speed(iter/s)": 1.634083 + }, + { + "acc": 0.64787626, + "epoch": 0.7682648401826484, + "grad_norm": 5.1875, + "learning_rate": 7.245869872240098e-06, + "loss": 1.6844635, + "memory(GiB)": 107.26, + "step": 30285, + "train_speed(iter/s)": 1.63411 + }, + { + "acc": 0.66646914, + "epoch": 0.7683916793505835, + "grad_norm": 5.625, + "learning_rate": 7.244932935992292e-06, + "loss": 1.63848801, + "memory(GiB)": 107.26, + "step": 30290, + "train_speed(iter/s)": 1.63414 + }, + { + "acc": 0.65764675, + "epoch": 0.7685185185185185, + "grad_norm": 7.0, + "learning_rate": 7.243995901002312e-06, + "loss": 1.66301079, + "memory(GiB)": 107.26, + "step": 30295, + "train_speed(iter/s)": 1.63417 + }, + { + "acc": 0.65270033, + "epoch": 0.7686453576864536, + "grad_norm": 5.84375, + "learning_rate": 7.243058767311374e-06, + "loss": 1.60568981, + "memory(GiB)": 107.26, + "step": 30300, + "train_speed(iter/s)": 1.634199 + }, + { + "acc": 0.67055054, + "epoch": 0.7687721968543887, + "grad_norm": 6.78125, + "learning_rate": 7.2421215349606955e-06, + "loss": 1.54382925, + "memory(GiB)": 107.26, + "step": 30305, + "train_speed(iter/s)": 1.63423 + }, + { + "acc": 0.66608658, + "epoch": 0.7688990360223237, + "grad_norm": 5.9375, + "learning_rate": 7.241184203991505e-06, + "loss": 1.51537476, + "memory(GiB)": 107.26, + "step": 30310, + "train_speed(iter/s)": 1.634258 + }, + { + "acc": 0.64886823, + "epoch": 0.7690258751902588, + "grad_norm": 5.9375, + "learning_rate": 7.240246774445024e-06, + "loss": 1.59635525, + "memory(GiB)": 107.26, + "step": 30315, + "train_speed(iter/s)": 1.634288 + }, + { + "acc": 0.65674829, + "epoch": 0.7691527143581938, + "grad_norm": 5.90625, + "learning_rate": 7.23930924636249e-06, + "loss": 1.62847023, + "memory(GiB)": 107.26, + "step": 30320, + "train_speed(iter/s)": 1.634316 + }, + { + "acc": 0.64854236, + "epoch": 0.7692795535261289, + "grad_norm": 5.8125, + "learning_rate": 7.238371619785134e-06, + "loss": 1.58781557, + "memory(GiB)": 107.26, + "step": 30325, + "train_speed(iter/s)": 1.634346 + }, + { + "acc": 0.65082769, + "epoch": 0.769406392694064, + "grad_norm": 5.46875, + "learning_rate": 7.237433894754205e-06, + "loss": 1.62256737, + "memory(GiB)": 107.26, + "step": 30330, + "train_speed(iter/s)": 1.634373 + }, + { + "acc": 0.65163741, + "epoch": 0.769533231861999, + "grad_norm": 5.8125, + "learning_rate": 7.23649607131094e-06, + "loss": 1.64911957, + "memory(GiB)": 107.26, + "step": 30335, + "train_speed(iter/s)": 1.634402 + }, + { + "acc": 0.65260997, + "epoch": 0.7696600710299341, + "grad_norm": 5.5, + "learning_rate": 7.235558149496595e-06, + "loss": 1.59176273, + "memory(GiB)": 107.26, + "step": 30340, + "train_speed(iter/s)": 1.634433 + }, + { + "acc": 0.66915121, + "epoch": 0.7697869101978692, + "grad_norm": 5.8125, + "learning_rate": 7.23462012935242e-06, + "loss": 1.58822021, + "memory(GiB)": 107.26, + "step": 30345, + "train_speed(iter/s)": 1.634462 + }, + { + "acc": 0.64483595, + "epoch": 0.7699137493658041, + "grad_norm": 5.5625, + "learning_rate": 7.233682010919676e-06, + "loss": 1.62909527, + "memory(GiB)": 107.26, + "step": 30350, + "train_speed(iter/s)": 1.634492 + }, + { + "acc": 0.66873941, + "epoch": 0.7700405885337392, + "grad_norm": 4.9375, + "learning_rate": 7.2327437942396236e-06, + "loss": 1.5787117, + "memory(GiB)": 107.26, + "step": 30355, + "train_speed(iter/s)": 1.634521 + }, + { + "acc": 0.66961937, + "epoch": 0.7701674277016742, + "grad_norm": 6.0625, + "learning_rate": 7.231805479353532e-06, + "loss": 1.54168282, + "memory(GiB)": 107.26, + "step": 30360, + "train_speed(iter/s)": 1.634548 + }, + { + "acc": 0.66384664, + "epoch": 0.7702942668696093, + "grad_norm": 5.0625, + "learning_rate": 7.2308670663026705e-06, + "loss": 1.639851, + "memory(GiB)": 107.26, + "step": 30365, + "train_speed(iter/s)": 1.634579 + }, + { + "acc": 0.64510469, + "epoch": 0.7704211060375444, + "grad_norm": 4.84375, + "learning_rate": 7.229928555128315e-06, + "loss": 1.65079098, + "memory(GiB)": 107.26, + "step": 30370, + "train_speed(iter/s)": 1.634608 + }, + { + "acc": 0.66312037, + "epoch": 0.7705479452054794, + "grad_norm": 6.625, + "learning_rate": 7.228989945871745e-06, + "loss": 1.63608627, + "memory(GiB)": 107.26, + "step": 30375, + "train_speed(iter/s)": 1.634636 + }, + { + "acc": 0.65444736, + "epoch": 0.7706747843734145, + "grad_norm": 5.09375, + "learning_rate": 7.2280512385742475e-06, + "loss": 1.5800992, + "memory(GiB)": 107.26, + "step": 30380, + "train_speed(iter/s)": 1.634665 + }, + { + "acc": 0.65825605, + "epoch": 0.7708016235413496, + "grad_norm": 6.34375, + "learning_rate": 7.227112433277107e-06, + "loss": 1.58305798, + "memory(GiB)": 107.26, + "step": 30385, + "train_speed(iter/s)": 1.634694 + }, + { + "acc": 0.65663486, + "epoch": 0.7709284627092846, + "grad_norm": 6.59375, + "learning_rate": 7.2261735300216195e-06, + "loss": 1.63428841, + "memory(GiB)": 107.26, + "step": 30390, + "train_speed(iter/s)": 1.634721 + }, + { + "acc": 0.65226974, + "epoch": 0.7710553018772197, + "grad_norm": 6.75, + "learning_rate": 7.22523452884908e-06, + "loss": 1.65994682, + "memory(GiB)": 107.26, + "step": 30395, + "train_speed(iter/s)": 1.634751 + }, + { + "acc": 0.63917208, + "epoch": 0.7711821410451547, + "grad_norm": 6.0625, + "learning_rate": 7.224295429800792e-06, + "loss": 1.68284969, + "memory(GiB)": 107.26, + "step": 30400, + "train_speed(iter/s)": 1.634782 + }, + { + "acc": 0.65070257, + "epoch": 0.7713089802130898, + "grad_norm": 5.46875, + "learning_rate": 7.22335623291806e-06, + "loss": 1.64408379, + "memory(GiB)": 107.26, + "step": 30405, + "train_speed(iter/s)": 1.634812 + }, + { + "acc": 0.65407734, + "epoch": 0.7714358193810249, + "grad_norm": 5.15625, + "learning_rate": 7.222416938242194e-06, + "loss": 1.64246502, + "memory(GiB)": 107.26, + "step": 30410, + "train_speed(iter/s)": 1.634843 + }, + { + "acc": 0.67182989, + "epoch": 0.7715626585489599, + "grad_norm": 6.40625, + "learning_rate": 7.221477545814509e-06, + "loss": 1.4954545, + "memory(GiB)": 107.26, + "step": 30415, + "train_speed(iter/s)": 1.634872 + }, + { + "acc": 0.65843563, + "epoch": 0.771689497716895, + "grad_norm": 5.0, + "learning_rate": 7.220538055676323e-06, + "loss": 1.60254974, + "memory(GiB)": 107.26, + "step": 30420, + "train_speed(iter/s)": 1.634901 + }, + { + "acc": 0.64874716, + "epoch": 0.7718163368848301, + "grad_norm": 9.5625, + "learning_rate": 7.21959846786896e-06, + "loss": 1.61789932, + "memory(GiB)": 107.26, + "step": 30425, + "train_speed(iter/s)": 1.634928 + }, + { + "acc": 0.6658041, + "epoch": 0.7719431760527651, + "grad_norm": 6.84375, + "learning_rate": 7.218658782433746e-06, + "loss": 1.57872143, + "memory(GiB)": 107.26, + "step": 30430, + "train_speed(iter/s)": 1.634957 + }, + { + "acc": 0.66571145, + "epoch": 0.7720700152207002, + "grad_norm": 7.6875, + "learning_rate": 7.217718999412013e-06, + "loss": 1.54681292, + "memory(GiB)": 107.26, + "step": 30435, + "train_speed(iter/s)": 1.634987 + }, + { + "acc": 0.65551558, + "epoch": 0.7721968543886352, + "grad_norm": 9.9375, + "learning_rate": 7.216779118845097e-06, + "loss": 1.61225777, + "memory(GiB)": 107.26, + "step": 30440, + "train_speed(iter/s)": 1.635016 + }, + { + "acc": 0.64738917, + "epoch": 0.7723236935565703, + "grad_norm": 6.21875, + "learning_rate": 7.215839140774339e-06, + "loss": 1.60115471, + "memory(GiB)": 107.26, + "step": 30445, + "train_speed(iter/s)": 1.635046 + }, + { + "acc": 0.65567551, + "epoch": 0.7724505327245054, + "grad_norm": 5.375, + "learning_rate": 7.214899065241082e-06, + "loss": 1.63994865, + "memory(GiB)": 107.26, + "step": 30450, + "train_speed(iter/s)": 1.635074 + }, + { + "acc": 0.66446095, + "epoch": 0.7725773718924404, + "grad_norm": 6.6875, + "learning_rate": 7.213958892286674e-06, + "loss": 1.59551678, + "memory(GiB)": 107.26, + "step": 30455, + "train_speed(iter/s)": 1.635103 + }, + { + "acc": 0.65487013, + "epoch": 0.7727042110603755, + "grad_norm": 5.40625, + "learning_rate": 7.213018621952472e-06, + "loss": 1.57060051, + "memory(GiB)": 107.26, + "step": 30460, + "train_speed(iter/s)": 1.635134 + }, + { + "acc": 0.66386309, + "epoch": 0.7728310502283106, + "grad_norm": 6.09375, + "learning_rate": 7.212078254279828e-06, + "loss": 1.54142752, + "memory(GiB)": 107.26, + "step": 30465, + "train_speed(iter/s)": 1.635164 + }, + { + "acc": 0.64772062, + "epoch": 0.7729578893962455, + "grad_norm": 5.25, + "learning_rate": 7.211137789310109e-06, + "loss": 1.66402931, + "memory(GiB)": 107.26, + "step": 30470, + "train_speed(iter/s)": 1.635193 + }, + { + "acc": 0.67215691, + "epoch": 0.7730847285641806, + "grad_norm": 5.34375, + "learning_rate": 7.2101972270846756e-06, + "loss": 1.52053585, + "memory(GiB)": 107.26, + "step": 30475, + "train_speed(iter/s)": 1.635222 + }, + { + "acc": 0.67320266, + "epoch": 0.7732115677321156, + "grad_norm": 5.03125, + "learning_rate": 7.2092565676449e-06, + "loss": 1.49859238, + "memory(GiB)": 107.26, + "step": 30480, + "train_speed(iter/s)": 1.635251 + }, + { + "acc": 0.67555428, + "epoch": 0.7733384069000507, + "grad_norm": 6.5625, + "learning_rate": 7.208315811032158e-06, + "loss": 1.48835354, + "memory(GiB)": 107.26, + "step": 30485, + "train_speed(iter/s)": 1.63528 + }, + { + "acc": 0.66074233, + "epoch": 0.7734652460679858, + "grad_norm": 6.3125, + "learning_rate": 7.207374957287828e-06, + "loss": 1.58826437, + "memory(GiB)": 107.26, + "step": 30490, + "train_speed(iter/s)": 1.635312 + }, + { + "acc": 0.65030069, + "epoch": 0.7735920852359208, + "grad_norm": 5.84375, + "learning_rate": 7.2064340064532914e-06, + "loss": 1.62204819, + "memory(GiB)": 107.26, + "step": 30495, + "train_speed(iter/s)": 1.63534 + }, + { + "acc": 0.62768269, + "epoch": 0.7737189244038559, + "grad_norm": 5.84375, + "learning_rate": 7.205492958569936e-06, + "loss": 1.67540817, + "memory(GiB)": 107.26, + "step": 30500, + "train_speed(iter/s)": 1.635371 + }, + { + "acc": 0.66002293, + "epoch": 0.773845763571791, + "grad_norm": 5.3125, + "learning_rate": 7.204551813679154e-06, + "loss": 1.6727169, + "memory(GiB)": 107.26, + "step": 30505, + "train_speed(iter/s)": 1.6354 + }, + { + "acc": 0.64538474, + "epoch": 0.773972602739726, + "grad_norm": 5.96875, + "learning_rate": 7.2036105718223405e-06, + "loss": 1.65025215, + "memory(GiB)": 107.26, + "step": 30510, + "train_speed(iter/s)": 1.63543 + }, + { + "acc": 0.6518836, + "epoch": 0.7740994419076611, + "grad_norm": 5.875, + "learning_rate": 7.202669233040896e-06, + "loss": 1.63001099, + "memory(GiB)": 107.26, + "step": 30515, + "train_speed(iter/s)": 1.635461 + }, + { + "acc": 0.65905566, + "epoch": 0.7742262810755961, + "grad_norm": 6.21875, + "learning_rate": 7.201727797376223e-06, + "loss": 1.65753632, + "memory(GiB)": 107.26, + "step": 30520, + "train_speed(iter/s)": 1.635488 + }, + { + "acc": 0.6614409, + "epoch": 0.7743531202435312, + "grad_norm": 5.4375, + "learning_rate": 7.200786264869732e-06, + "loss": 1.61937027, + "memory(GiB)": 107.26, + "step": 30525, + "train_speed(iter/s)": 1.635517 + }, + { + "acc": 0.64703631, + "epoch": 0.7744799594114663, + "grad_norm": 6.75, + "learning_rate": 7.199844635562836e-06, + "loss": 1.6106411, + "memory(GiB)": 107.26, + "step": 30530, + "train_speed(iter/s)": 1.635545 + }, + { + "acc": 0.64952555, + "epoch": 0.7746067985794013, + "grad_norm": 4.90625, + "learning_rate": 7.19890290949695e-06, + "loss": 1.64010735, + "memory(GiB)": 107.26, + "step": 30535, + "train_speed(iter/s)": 1.635574 + }, + { + "acc": 0.63504782, + "epoch": 0.7747336377473364, + "grad_norm": 5.6875, + "learning_rate": 7.197961086713498e-06, + "loss": 1.64006424, + "memory(GiB)": 107.26, + "step": 30540, + "train_speed(iter/s)": 1.635604 + }, + { + "acc": 0.64450493, + "epoch": 0.7748604769152715, + "grad_norm": 6.90625, + "learning_rate": 7.197019167253904e-06, + "loss": 1.61944942, + "memory(GiB)": 107.26, + "step": 30545, + "train_speed(iter/s)": 1.635636 + }, + { + "acc": 0.66522994, + "epoch": 0.7749873160832065, + "grad_norm": 4.25, + "learning_rate": 7.196077151159597e-06, + "loss": 1.60584488, + "memory(GiB)": 107.26, + "step": 30550, + "train_speed(iter/s)": 1.635665 + }, + { + "acc": 0.66381049, + "epoch": 0.7751141552511416, + "grad_norm": 5.75, + "learning_rate": 7.195135038472013e-06, + "loss": 1.5891283, + "memory(GiB)": 107.26, + "step": 30555, + "train_speed(iter/s)": 1.635697 + }, + { + "acc": 0.68021297, + "epoch": 0.7752409944190766, + "grad_norm": 5.3125, + "learning_rate": 7.194192829232589e-06, + "loss": 1.58056335, + "memory(GiB)": 107.26, + "step": 30560, + "train_speed(iter/s)": 1.635725 + }, + { + "acc": 0.65287666, + "epoch": 0.7753678335870117, + "grad_norm": 6.875, + "learning_rate": 7.1932505234827686e-06, + "loss": 1.60172901, + "memory(GiB)": 107.26, + "step": 30565, + "train_speed(iter/s)": 1.635756 + }, + { + "acc": 0.65607567, + "epoch": 0.7754946727549468, + "grad_norm": 5.8125, + "learning_rate": 7.192308121263998e-06, + "loss": 1.65015259, + "memory(GiB)": 107.26, + "step": 30570, + "train_speed(iter/s)": 1.635786 + }, + { + "acc": 0.6658083, + "epoch": 0.7756215119228818, + "grad_norm": 5.90625, + "learning_rate": 7.191365622617728e-06, + "loss": 1.59011879, + "memory(GiB)": 107.26, + "step": 30575, + "train_speed(iter/s)": 1.635816 + }, + { + "acc": 0.64922142, + "epoch": 0.7757483510908169, + "grad_norm": 5.09375, + "learning_rate": 7.190423027585414e-06, + "loss": 1.65585232, + "memory(GiB)": 107.26, + "step": 30580, + "train_speed(iter/s)": 1.635846 + }, + { + "acc": 0.65652857, + "epoch": 0.775875190258752, + "grad_norm": 8.125, + "learning_rate": 7.189480336208516e-06, + "loss": 1.60404053, + "memory(GiB)": 107.26, + "step": 30585, + "train_speed(iter/s)": 1.635878 + }, + { + "acc": 0.65228233, + "epoch": 0.776002029426687, + "grad_norm": 6.5625, + "learning_rate": 7.188537548528498e-06, + "loss": 1.59489765, + "memory(GiB)": 107.26, + "step": 30590, + "train_speed(iter/s)": 1.635908 + }, + { + "acc": 0.64943886, + "epoch": 0.776128868594622, + "grad_norm": 5.53125, + "learning_rate": 7.187594664586826e-06, + "loss": 1.61623878, + "memory(GiB)": 107.26, + "step": 30595, + "train_speed(iter/s)": 1.635936 + }, + { + "acc": 0.65434179, + "epoch": 0.776255707762557, + "grad_norm": 5.46875, + "learning_rate": 7.186651684424975e-06, + "loss": 1.66895905, + "memory(GiB)": 107.26, + "step": 30600, + "train_speed(iter/s)": 1.635966 + }, + { + "acc": 0.64830589, + "epoch": 0.7763825469304921, + "grad_norm": 7.90625, + "learning_rate": 7.185708608084418e-06, + "loss": 1.6182003, + "memory(GiB)": 107.26, + "step": 30605, + "train_speed(iter/s)": 1.635995 + }, + { + "acc": 0.66292844, + "epoch": 0.7765093860984272, + "grad_norm": 4.96875, + "learning_rate": 7.184765435606642e-06, + "loss": 1.57399521, + "memory(GiB)": 107.26, + "step": 30610, + "train_speed(iter/s)": 1.636024 + }, + { + "acc": 0.66935806, + "epoch": 0.7766362252663622, + "grad_norm": 6.21875, + "learning_rate": 7.183822167033124e-06, + "loss": 1.61123848, + "memory(GiB)": 107.26, + "step": 30615, + "train_speed(iter/s)": 1.636053 + }, + { + "acc": 0.65486174, + "epoch": 0.7767630644342973, + "grad_norm": 6.03125, + "learning_rate": 7.18287880240536e-06, + "loss": 1.60520649, + "memory(GiB)": 107.26, + "step": 30620, + "train_speed(iter/s)": 1.636082 + }, + { + "acc": 0.67707996, + "epoch": 0.7768899036022324, + "grad_norm": 6.96875, + "learning_rate": 7.1819353417648386e-06, + "loss": 1.55840244, + "memory(GiB)": 107.26, + "step": 30625, + "train_speed(iter/s)": 1.636113 + }, + { + "acc": 0.65891223, + "epoch": 0.7770167427701674, + "grad_norm": 6.0, + "learning_rate": 7.180991785153059e-06, + "loss": 1.62628822, + "memory(GiB)": 107.26, + "step": 30630, + "train_speed(iter/s)": 1.636141 + }, + { + "acc": 0.65944872, + "epoch": 0.7771435819381025, + "grad_norm": 5.46875, + "learning_rate": 7.180048132611524e-06, + "loss": 1.5419241, + "memory(GiB)": 107.26, + "step": 30635, + "train_speed(iter/s)": 1.636171 + }, + { + "acc": 0.65319638, + "epoch": 0.7772704211060375, + "grad_norm": 10.25, + "learning_rate": 7.17910438418174e-06, + "loss": 1.58357925, + "memory(GiB)": 107.26, + "step": 30640, + "train_speed(iter/s)": 1.636199 + }, + { + "acc": 0.64738016, + "epoch": 0.7773972602739726, + "grad_norm": 5.875, + "learning_rate": 7.178160539905214e-06, + "loss": 1.65631466, + "memory(GiB)": 107.26, + "step": 30645, + "train_speed(iter/s)": 1.63623 + }, + { + "acc": 0.66691637, + "epoch": 0.7775240994419077, + "grad_norm": 6.1875, + "learning_rate": 7.1772165998234645e-06, + "loss": 1.55498533, + "memory(GiB)": 107.26, + "step": 30650, + "train_speed(iter/s)": 1.636258 + }, + { + "acc": 0.64904814, + "epoch": 0.7776509386098427, + "grad_norm": 4.875, + "learning_rate": 7.176272563978007e-06, + "loss": 1.64443932, + "memory(GiB)": 107.26, + "step": 30655, + "train_speed(iter/s)": 1.636285 + }, + { + "acc": 0.65022783, + "epoch": 0.7777777777777778, + "grad_norm": 6.125, + "learning_rate": 7.175328432410367e-06, + "loss": 1.61195831, + "memory(GiB)": 107.26, + "step": 30660, + "train_speed(iter/s)": 1.636316 + }, + { + "acc": 0.64395561, + "epoch": 0.7779046169457129, + "grad_norm": 6.3125, + "learning_rate": 7.17438420516207e-06, + "loss": 1.66042862, + "memory(GiB)": 107.26, + "step": 30665, + "train_speed(iter/s)": 1.636344 + }, + { + "acc": 0.6630724, + "epoch": 0.7780314561136479, + "grad_norm": 5.84375, + "learning_rate": 7.173439882274647e-06, + "loss": 1.53771601, + "memory(GiB)": 107.26, + "step": 30670, + "train_speed(iter/s)": 1.636373 + }, + { + "acc": 0.65414743, + "epoch": 0.778158295281583, + "grad_norm": 5.6875, + "learning_rate": 7.172495463789635e-06, + "loss": 1.63615723, + "memory(GiB)": 107.26, + "step": 30675, + "train_speed(iter/s)": 1.636402 + }, + { + "acc": 0.65784302, + "epoch": 0.778285134449518, + "grad_norm": 5.6875, + "learning_rate": 7.171550949748574e-06, + "loss": 1.61946259, + "memory(GiB)": 107.26, + "step": 30680, + "train_speed(iter/s)": 1.636431 + }, + { + "acc": 0.65488625, + "epoch": 0.7784119736174531, + "grad_norm": 5.0, + "learning_rate": 7.170606340193003e-06, + "loss": 1.59765997, + "memory(GiB)": 107.26, + "step": 30685, + "train_speed(iter/s)": 1.636461 + }, + { + "acc": 0.65165424, + "epoch": 0.7785388127853882, + "grad_norm": 5.84375, + "learning_rate": 7.1696616351644786e-06, + "loss": 1.67578697, + "memory(GiB)": 107.26, + "step": 30690, + "train_speed(iter/s)": 1.636489 + }, + { + "acc": 0.6690865, + "epoch": 0.7786656519533232, + "grad_norm": 5.875, + "learning_rate": 7.168716834704546e-06, + "loss": 1.60980949, + "memory(GiB)": 107.26, + "step": 30695, + "train_speed(iter/s)": 1.63652 + }, + { + "acc": 0.65484409, + "epoch": 0.7787924911212583, + "grad_norm": 5.34375, + "learning_rate": 7.167771938854766e-06, + "loss": 1.62836246, + "memory(GiB)": 107.26, + "step": 30700, + "train_speed(iter/s)": 1.636551 + }, + { + "acc": 0.65498524, + "epoch": 0.7789193302891934, + "grad_norm": 7.9375, + "learning_rate": 7.166826947656696e-06, + "loss": 1.69484024, + "memory(GiB)": 107.26, + "step": 30705, + "train_speed(iter/s)": 1.636582 + }, + { + "acc": 0.65259657, + "epoch": 0.7790461694571283, + "grad_norm": 4.75, + "learning_rate": 7.165881861151904e-06, + "loss": 1.60934887, + "memory(GiB)": 107.26, + "step": 30710, + "train_speed(iter/s)": 1.63661 + }, + { + "acc": 0.6601182, + "epoch": 0.7791730086250634, + "grad_norm": 6.09375, + "learning_rate": 7.164936679381957e-06, + "loss": 1.59868069, + "memory(GiB)": 107.26, + "step": 30715, + "train_speed(iter/s)": 1.636638 + }, + { + "acc": 0.65729599, + "epoch": 0.7792998477929984, + "grad_norm": 5.53125, + "learning_rate": 7.16399140238843e-06, + "loss": 1.65911503, + "memory(GiB)": 107.26, + "step": 30720, + "train_speed(iter/s)": 1.636666 + }, + { + "acc": 0.65079694, + "epoch": 0.7794266869609335, + "grad_norm": 4.9375, + "learning_rate": 7.163046030212899e-06, + "loss": 1.62213116, + "memory(GiB)": 107.26, + "step": 30725, + "train_speed(iter/s)": 1.636696 + }, + { + "acc": 0.66206989, + "epoch": 0.7795535261288686, + "grad_norm": 5.53125, + "learning_rate": 7.1621005628969475e-06, + "loss": 1.5307085, + "memory(GiB)": 107.26, + "step": 30730, + "train_speed(iter/s)": 1.636726 + }, + { + "acc": 0.64623404, + "epoch": 0.7796803652968036, + "grad_norm": 6.34375, + "learning_rate": 7.161155000482159e-06, + "loss": 1.6867939, + "memory(GiB)": 107.26, + "step": 30735, + "train_speed(iter/s)": 1.636755 + }, + { + "acc": 0.63943319, + "epoch": 0.7798072044647387, + "grad_norm": 5.53125, + "learning_rate": 7.160209343010125e-06, + "loss": 1.63440323, + "memory(GiB)": 107.26, + "step": 30740, + "train_speed(iter/s)": 1.636783 + }, + { + "acc": 0.65245943, + "epoch": 0.7799340436326738, + "grad_norm": 5.4375, + "learning_rate": 7.1592635905224386e-06, + "loss": 1.67462692, + "memory(GiB)": 107.26, + "step": 30745, + "train_speed(iter/s)": 1.636812 + }, + { + "acc": 0.64611311, + "epoch": 0.7800608828006088, + "grad_norm": 6.34375, + "learning_rate": 7.1583177430606995e-06, + "loss": 1.67396774, + "memory(GiB)": 107.26, + "step": 30750, + "train_speed(iter/s)": 1.63684 + }, + { + "acc": 0.64364958, + "epoch": 0.7801877219685439, + "grad_norm": 5.6875, + "learning_rate": 7.1573718006665095e-06, + "loss": 1.6576107, + "memory(GiB)": 107.26, + "step": 30755, + "train_speed(iter/s)": 1.636868 + }, + { + "acc": 0.64075098, + "epoch": 0.7803145611364789, + "grad_norm": 7.03125, + "learning_rate": 7.156425763381477e-06, + "loss": 1.66395149, + "memory(GiB)": 107.26, + "step": 30760, + "train_speed(iter/s)": 1.636899 + }, + { + "acc": 0.66395569, + "epoch": 0.780441400304414, + "grad_norm": 5.6875, + "learning_rate": 7.155479631247211e-06, + "loss": 1.57881393, + "memory(GiB)": 107.26, + "step": 30765, + "train_speed(iter/s)": 1.636928 + }, + { + "acc": 0.64132605, + "epoch": 0.7805682394723491, + "grad_norm": 5.28125, + "learning_rate": 7.154533404305327e-06, + "loss": 1.65135784, + "memory(GiB)": 107.26, + "step": 30770, + "train_speed(iter/s)": 1.636956 + }, + { + "acc": 0.66526337, + "epoch": 0.7806950786402841, + "grad_norm": 5.875, + "learning_rate": 7.153587082597445e-06, + "loss": 1.47362204, + "memory(GiB)": 107.26, + "step": 30775, + "train_speed(iter/s)": 1.636983 + }, + { + "acc": 0.66942844, + "epoch": 0.7808219178082192, + "grad_norm": 5.3125, + "learning_rate": 7.152640666165187e-06, + "loss": 1.50902376, + "memory(GiB)": 107.26, + "step": 30780, + "train_speed(iter/s)": 1.637014 + }, + { + "acc": 0.66340733, + "epoch": 0.7809487569761543, + "grad_norm": 5.4375, + "learning_rate": 7.151694155050184e-06, + "loss": 1.55822525, + "memory(GiB)": 107.26, + "step": 30785, + "train_speed(iter/s)": 1.637042 + }, + { + "acc": 0.65109696, + "epoch": 0.7810755961440893, + "grad_norm": 5.0625, + "learning_rate": 7.150747549294064e-06, + "loss": 1.66960526, + "memory(GiB)": 107.26, + "step": 30790, + "train_speed(iter/s)": 1.637071 + }, + { + "acc": 0.65702529, + "epoch": 0.7812024353120244, + "grad_norm": 6.625, + "learning_rate": 7.149800848938464e-06, + "loss": 1.57980947, + "memory(GiB)": 107.26, + "step": 30795, + "train_speed(iter/s)": 1.6371 + }, + { + "acc": 0.63773665, + "epoch": 0.7813292744799594, + "grad_norm": 5.90625, + "learning_rate": 7.1488540540250254e-06, + "loss": 1.65127792, + "memory(GiB)": 107.26, + "step": 30800, + "train_speed(iter/s)": 1.637129 + }, + { + "acc": 0.65122681, + "epoch": 0.7814561136478945, + "grad_norm": 5.0625, + "learning_rate": 7.14790716459539e-06, + "loss": 1.60646725, + "memory(GiB)": 107.26, + "step": 30805, + "train_speed(iter/s)": 1.637156 + }, + { + "acc": 0.66043463, + "epoch": 0.7815829528158296, + "grad_norm": 5.15625, + "learning_rate": 7.146960180691209e-06, + "loss": 1.55198698, + "memory(GiB)": 107.26, + "step": 30810, + "train_speed(iter/s)": 1.637185 + }, + { + "acc": 0.64816837, + "epoch": 0.7817097919837646, + "grad_norm": 5.84375, + "learning_rate": 7.146013102354133e-06, + "loss": 1.60648384, + "memory(GiB)": 107.26, + "step": 30815, + "train_speed(iter/s)": 1.637215 + }, + { + "acc": 0.66276803, + "epoch": 0.7818366311516997, + "grad_norm": 5.5, + "learning_rate": 7.145065929625821e-06, + "loss": 1.54901552, + "memory(GiB)": 107.26, + "step": 30820, + "train_speed(iter/s)": 1.637245 + }, + { + "acc": 0.65730925, + "epoch": 0.7819634703196348, + "grad_norm": 5.4375, + "learning_rate": 7.1441186625479304e-06, + "loss": 1.63273067, + "memory(GiB)": 107.26, + "step": 30825, + "train_speed(iter/s)": 1.637272 + }, + { + "acc": 0.65262175, + "epoch": 0.7820903094875697, + "grad_norm": 7.25, + "learning_rate": 7.143171301162131e-06, + "loss": 1.64197006, + "memory(GiB)": 107.26, + "step": 30830, + "train_speed(iter/s)": 1.637302 + }, + { + "acc": 0.63268476, + "epoch": 0.7822171486555048, + "grad_norm": 6.09375, + "learning_rate": 7.142223845510086e-06, + "loss": 1.71115379, + "memory(GiB)": 107.26, + "step": 30835, + "train_speed(iter/s)": 1.637333 + }, + { + "acc": 0.64756174, + "epoch": 0.7823439878234398, + "grad_norm": 6.3125, + "learning_rate": 7.1412762956334746e-06, + "loss": 1.63337746, + "memory(GiB)": 107.26, + "step": 30840, + "train_speed(iter/s)": 1.637363 + }, + { + "acc": 0.65478659, + "epoch": 0.7824708269913749, + "grad_norm": 5.03125, + "learning_rate": 7.140328651573969e-06, + "loss": 1.58976927, + "memory(GiB)": 107.26, + "step": 30845, + "train_speed(iter/s)": 1.637391 + }, + { + "acc": 0.65226812, + "epoch": 0.78259766615931, + "grad_norm": 5.71875, + "learning_rate": 7.139380913373255e-06, + "loss": 1.61686172, + "memory(GiB)": 107.26, + "step": 30850, + "train_speed(iter/s)": 1.63742 + }, + { + "acc": 0.650634, + "epoch": 0.782724505327245, + "grad_norm": 5.03125, + "learning_rate": 7.138433081073017e-06, + "loss": 1.64412384, + "memory(GiB)": 107.26, + "step": 30855, + "train_speed(iter/s)": 1.637449 + }, + { + "acc": 0.66604409, + "epoch": 0.7828513444951801, + "grad_norm": 5.53125, + "learning_rate": 7.137485154714945e-06, + "loss": 1.5839673, + "memory(GiB)": 107.26, + "step": 30860, + "train_speed(iter/s)": 1.637478 + }, + { + "acc": 0.66617074, + "epoch": 0.7829781836631152, + "grad_norm": 5.59375, + "learning_rate": 7.1365371343407304e-06, + "loss": 1.59314184, + "memory(GiB)": 107.26, + "step": 30865, + "train_speed(iter/s)": 1.637506 + }, + { + "acc": 0.67707481, + "epoch": 0.7831050228310502, + "grad_norm": 6.125, + "learning_rate": 7.135589019992076e-06, + "loss": 1.59035444, + "memory(GiB)": 107.26, + "step": 30870, + "train_speed(iter/s)": 1.637535 + }, + { + "acc": 0.66602597, + "epoch": 0.7832318619989853, + "grad_norm": 6.1875, + "learning_rate": 7.134640811710681e-06, + "loss": 1.58947344, + "memory(GiB)": 107.26, + "step": 30875, + "train_speed(iter/s)": 1.637563 + }, + { + "acc": 0.64053383, + "epoch": 0.7833587011669203, + "grad_norm": 6.4375, + "learning_rate": 7.133692509538253e-06, + "loss": 1.60733871, + "memory(GiB)": 107.26, + "step": 30880, + "train_speed(iter/s)": 1.637591 + }, + { + "acc": 0.65207176, + "epoch": 0.7834855403348554, + "grad_norm": 5.65625, + "learning_rate": 7.132744113516502e-06, + "loss": 1.69174347, + "memory(GiB)": 107.26, + "step": 30885, + "train_speed(iter/s)": 1.637621 + }, + { + "acc": 0.67224426, + "epoch": 0.7836123795027905, + "grad_norm": 5.59375, + "learning_rate": 7.1317956236871436e-06, + "loss": 1.58318987, + "memory(GiB)": 107.26, + "step": 30890, + "train_speed(iter/s)": 1.63765 + }, + { + "acc": 0.66608834, + "epoch": 0.7837392186707255, + "grad_norm": 5.59375, + "learning_rate": 7.130847040091893e-06, + "loss": 1.58331203, + "memory(GiB)": 107.26, + "step": 30895, + "train_speed(iter/s)": 1.637678 + }, + { + "acc": 0.65211349, + "epoch": 0.7838660578386606, + "grad_norm": 5.53125, + "learning_rate": 7.1298983627724795e-06, + "loss": 1.6122673, + "memory(GiB)": 107.26, + "step": 30900, + "train_speed(iter/s)": 1.637704 + }, + { + "acc": 0.63837581, + "epoch": 0.7839928970065957, + "grad_norm": 6.5625, + "learning_rate": 7.128949591770624e-06, + "loss": 1.68804188, + "memory(GiB)": 107.26, + "step": 30905, + "train_speed(iter/s)": 1.637736 + }, + { + "acc": 0.66685181, + "epoch": 0.7841197361745307, + "grad_norm": 6.34375, + "learning_rate": 7.128000727128063e-06, + "loss": 1.63648643, + "memory(GiB)": 107.26, + "step": 30910, + "train_speed(iter/s)": 1.637765 + }, + { + "acc": 0.64562726, + "epoch": 0.7842465753424658, + "grad_norm": 6.09375, + "learning_rate": 7.127051768886527e-06, + "loss": 1.63818398, + "memory(GiB)": 107.26, + "step": 30915, + "train_speed(iter/s)": 1.637794 + }, + { + "acc": 0.67620625, + "epoch": 0.7843734145104008, + "grad_norm": 6.59375, + "learning_rate": 7.126102717087758e-06, + "loss": 1.55689049, + "memory(GiB)": 107.26, + "step": 30920, + "train_speed(iter/s)": 1.637821 + }, + { + "acc": 0.64565697, + "epoch": 0.7845002536783359, + "grad_norm": 5.09375, + "learning_rate": 7.1251535717735e-06, + "loss": 1.65126534, + "memory(GiB)": 107.26, + "step": 30925, + "train_speed(iter/s)": 1.63785 + }, + { + "acc": 0.64443326, + "epoch": 0.784627092846271, + "grad_norm": 7.03125, + "learning_rate": 7.1242043329854995e-06, + "loss": 1.69032688, + "memory(GiB)": 107.26, + "step": 30930, + "train_speed(iter/s)": 1.637878 + }, + { + "acc": 0.65155067, + "epoch": 0.784753932014206, + "grad_norm": 5.875, + "learning_rate": 7.123255000765508e-06, + "loss": 1.59071312, + "memory(GiB)": 107.26, + "step": 30935, + "train_speed(iter/s)": 1.637908 + }, + { + "acc": 0.65654821, + "epoch": 0.7848807711821411, + "grad_norm": 6.46875, + "learning_rate": 7.122305575155283e-06, + "loss": 1.60332165, + "memory(GiB)": 107.26, + "step": 30940, + "train_speed(iter/s)": 1.637938 + }, + { + "acc": 0.65992761, + "epoch": 0.7850076103500762, + "grad_norm": 5.5625, + "learning_rate": 7.121356056196582e-06, + "loss": 1.63548241, + "memory(GiB)": 107.26, + "step": 30945, + "train_speed(iter/s)": 1.637966 + }, + { + "acc": 0.67780595, + "epoch": 0.7851344495180111, + "grad_norm": 5.0625, + "learning_rate": 7.1204064439311715e-06, + "loss": 1.56389694, + "memory(GiB)": 107.26, + "step": 30950, + "train_speed(iter/s)": 1.637994 + }, + { + "acc": 0.66311407, + "epoch": 0.7852612886859462, + "grad_norm": 6.375, + "learning_rate": 7.119456738400818e-06, + "loss": 1.56350679, + "memory(GiB)": 107.26, + "step": 30955, + "train_speed(iter/s)": 1.638023 + }, + { + "acc": 0.66002016, + "epoch": 0.7853881278538812, + "grad_norm": 5.34375, + "learning_rate": 7.118506939647295e-06, + "loss": 1.49937019, + "memory(GiB)": 107.26, + "step": 30960, + "train_speed(iter/s)": 1.638052 + }, + { + "acc": 0.65607243, + "epoch": 0.7855149670218163, + "grad_norm": 5.90625, + "learning_rate": 7.1175570477123776e-06, + "loss": 1.62767658, + "memory(GiB)": 107.26, + "step": 30965, + "train_speed(iter/s)": 1.638079 + }, + { + "acc": 0.65050111, + "epoch": 0.7856418061897514, + "grad_norm": 5.9375, + "learning_rate": 7.116607062637848e-06, + "loss": 1.62889214, + "memory(GiB)": 107.26, + "step": 30970, + "train_speed(iter/s)": 1.638106 + }, + { + "acc": 0.65895529, + "epoch": 0.7857686453576864, + "grad_norm": 6.40625, + "learning_rate": 7.115656984465489e-06, + "loss": 1.57907181, + "memory(GiB)": 107.26, + "step": 30975, + "train_speed(iter/s)": 1.638132 + }, + { + "acc": 0.64549918, + "epoch": 0.7858954845256215, + "grad_norm": 6.03125, + "learning_rate": 7.114706813237091e-06, + "loss": 1.68856697, + "memory(GiB)": 107.26, + "step": 30980, + "train_speed(iter/s)": 1.63816 + }, + { + "acc": 0.65708437, + "epoch": 0.7860223236935566, + "grad_norm": 4.8125, + "learning_rate": 7.1137565489944445e-06, + "loss": 1.62159348, + "memory(GiB)": 107.26, + "step": 30985, + "train_speed(iter/s)": 1.638188 + }, + { + "acc": 0.66555815, + "epoch": 0.7861491628614916, + "grad_norm": 5.40625, + "learning_rate": 7.112806191779349e-06, + "loss": 1.56197338, + "memory(GiB)": 107.26, + "step": 30990, + "train_speed(iter/s)": 1.638215 + }, + { + "acc": 0.64561439, + "epoch": 0.7862760020294267, + "grad_norm": 5.21875, + "learning_rate": 7.111855741633603e-06, + "loss": 1.64821491, + "memory(GiB)": 107.26, + "step": 30995, + "train_speed(iter/s)": 1.638242 + }, + { + "acc": 0.65975032, + "epoch": 0.7864028411973617, + "grad_norm": 6.90625, + "learning_rate": 7.1109051985990145e-06, + "loss": 1.62765789, + "memory(GiB)": 107.26, + "step": 31000, + "train_speed(iter/s)": 1.638271 + }, + { + "epoch": 0.7864028411973617, + "eval_acc": 0.6455726866266965, + "eval_loss": 1.5767943859100342, + "eval_runtime": 58.2768, + "eval_samples_per_second": 109.306, + "eval_steps_per_second": 27.335, + "step": 31000 + }, + { + "acc": 0.64757233, + "epoch": 0.7865296803652968, + "grad_norm": 4.875, + "learning_rate": 7.109954562717389e-06, + "loss": 1.63938522, + "memory(GiB)": 107.26, + "step": 31005, + "train_speed(iter/s)": 1.632909 + }, + { + "acc": 0.66574173, + "epoch": 0.7866565195332319, + "grad_norm": 9.3125, + "learning_rate": 7.109003834030543e-06, + "loss": 1.57466583, + "memory(GiB)": 107.26, + "step": 31010, + "train_speed(iter/s)": 1.632937 + }, + { + "acc": 0.65865822, + "epoch": 0.7867833587011669, + "grad_norm": 7.0, + "learning_rate": 7.108053012580291e-06, + "loss": 1.58323221, + "memory(GiB)": 107.26, + "step": 31015, + "train_speed(iter/s)": 1.632965 + }, + { + "acc": 0.65308952, + "epoch": 0.786910197869102, + "grad_norm": 5.1875, + "learning_rate": 7.107102098408457e-06, + "loss": 1.57040215, + "memory(GiB)": 107.26, + "step": 31020, + "train_speed(iter/s)": 1.632993 + }, + { + "acc": 0.65615973, + "epoch": 0.7870370370370371, + "grad_norm": 5.46875, + "learning_rate": 7.106151091556865e-06, + "loss": 1.6058672, + "memory(GiB)": 107.26, + "step": 31025, + "train_speed(iter/s)": 1.633022 + }, + { + "acc": 0.65740523, + "epoch": 0.7871638762049721, + "grad_norm": 5.28125, + "learning_rate": 7.105199992067344e-06, + "loss": 1.60298538, + "memory(GiB)": 107.26, + "step": 31030, + "train_speed(iter/s)": 1.633052 + }, + { + "acc": 0.65304718, + "epoch": 0.7872907153729072, + "grad_norm": 5.6875, + "learning_rate": 7.1042487999817275e-06, + "loss": 1.59884739, + "memory(GiB)": 107.26, + "step": 31035, + "train_speed(iter/s)": 1.633075 + }, + { + "acc": 0.65780239, + "epoch": 0.7874175545408422, + "grad_norm": 6.6875, + "learning_rate": 7.103297515341857e-06, + "loss": 1.59267845, + "memory(GiB)": 107.26, + "step": 31040, + "train_speed(iter/s)": 1.633104 + }, + { + "acc": 0.65838223, + "epoch": 0.7875443937087773, + "grad_norm": 6.90625, + "learning_rate": 7.1023461381895685e-06, + "loss": 1.57966022, + "memory(GiB)": 107.26, + "step": 31045, + "train_speed(iter/s)": 1.633132 + }, + { + "acc": 0.65857053, + "epoch": 0.7876712328767124, + "grad_norm": 5.5, + "learning_rate": 7.1013946685667125e-06, + "loss": 1.62191448, + "memory(GiB)": 107.26, + "step": 31050, + "train_speed(iter/s)": 1.633162 + }, + { + "acc": 0.66186371, + "epoch": 0.7877980720446474, + "grad_norm": 6.28125, + "learning_rate": 7.100443106515135e-06, + "loss": 1.5466877, + "memory(GiB)": 107.26, + "step": 31055, + "train_speed(iter/s)": 1.633189 + }, + { + "acc": 0.67042389, + "epoch": 0.7879249112125825, + "grad_norm": 6.71875, + "learning_rate": 7.099491452076693e-06, + "loss": 1.57204342, + "memory(GiB)": 107.26, + "step": 31060, + "train_speed(iter/s)": 1.633219 + }, + { + "acc": 0.65227017, + "epoch": 0.7880517503805176, + "grad_norm": 8.5625, + "learning_rate": 7.098539705293242e-06, + "loss": 1.62192554, + "memory(GiB)": 107.26, + "step": 31065, + "train_speed(iter/s)": 1.633249 + }, + { + "acc": 0.64737072, + "epoch": 0.7881785895484525, + "grad_norm": 6.8125, + "learning_rate": 7.097587866206647e-06, + "loss": 1.60865879, + "memory(GiB)": 107.26, + "step": 31070, + "train_speed(iter/s)": 1.633279 + }, + { + "acc": 0.66815062, + "epoch": 0.7883054287163876, + "grad_norm": 6.59375, + "learning_rate": 7.096635934858772e-06, + "loss": 1.54231491, + "memory(GiB)": 107.26, + "step": 31075, + "train_speed(iter/s)": 1.633309 + }, + { + "acc": 0.65892611, + "epoch": 0.7884322678843226, + "grad_norm": 5.84375, + "learning_rate": 7.095683911291488e-06, + "loss": 1.62408829, + "memory(GiB)": 107.26, + "step": 31080, + "train_speed(iter/s)": 1.633338 + }, + { + "acc": 0.65147743, + "epoch": 0.7885591070522577, + "grad_norm": 6.65625, + "learning_rate": 7.0947317955466686e-06, + "loss": 1.70138359, + "memory(GiB)": 107.26, + "step": 31085, + "train_speed(iter/s)": 1.633368 + }, + { + "acc": 0.65220976, + "epoch": 0.7886859462201928, + "grad_norm": 5.5625, + "learning_rate": 7.093779587666193e-06, + "loss": 1.62732544, + "memory(GiB)": 107.26, + "step": 31090, + "train_speed(iter/s)": 1.633398 + }, + { + "acc": 0.6654089, + "epoch": 0.7888127853881278, + "grad_norm": 5.34375, + "learning_rate": 7.092827287691943e-06, + "loss": 1.55268145, + "memory(GiB)": 107.26, + "step": 31095, + "train_speed(iter/s)": 1.633425 + }, + { + "acc": 0.66052661, + "epoch": 0.7889396245560629, + "grad_norm": 5.59375, + "learning_rate": 7.091874895665806e-06, + "loss": 1.58308468, + "memory(GiB)": 107.26, + "step": 31100, + "train_speed(iter/s)": 1.633454 + }, + { + "acc": 0.65798273, + "epoch": 0.789066463723998, + "grad_norm": 7.34375, + "learning_rate": 7.09092241162967e-06, + "loss": 1.58995972, + "memory(GiB)": 107.26, + "step": 31105, + "train_speed(iter/s)": 1.633483 + }, + { + "acc": 0.66508694, + "epoch": 0.789193302891933, + "grad_norm": 5.6875, + "learning_rate": 7.089969835625432e-06, + "loss": 1.60591736, + "memory(GiB)": 107.26, + "step": 31110, + "train_speed(iter/s)": 1.633512 + }, + { + "acc": 0.64632173, + "epoch": 0.7893201420598681, + "grad_norm": 5.8125, + "learning_rate": 7.089017167694988e-06, + "loss": 1.65225315, + "memory(GiB)": 107.26, + "step": 31115, + "train_speed(iter/s)": 1.633542 + }, + { + "acc": 0.64636788, + "epoch": 0.7894469812278031, + "grad_norm": 6.125, + "learning_rate": 7.088064407880244e-06, + "loss": 1.68166046, + "memory(GiB)": 107.26, + "step": 31120, + "train_speed(iter/s)": 1.633571 + }, + { + "acc": 0.65687761, + "epoch": 0.7895738203957382, + "grad_norm": 5.6875, + "learning_rate": 7.087111556223103e-06, + "loss": 1.60457573, + "memory(GiB)": 107.26, + "step": 31125, + "train_speed(iter/s)": 1.6336 + }, + { + "acc": 0.65077486, + "epoch": 0.7897006595636733, + "grad_norm": 6.15625, + "learning_rate": 7.08615861276548e-06, + "loss": 1.65730438, + "memory(GiB)": 107.26, + "step": 31130, + "train_speed(iter/s)": 1.63363 + }, + { + "acc": 0.64588904, + "epoch": 0.7898274987316083, + "grad_norm": 5.6875, + "learning_rate": 7.085205577549285e-06, + "loss": 1.61276989, + "memory(GiB)": 107.26, + "step": 31135, + "train_speed(iter/s)": 1.633658 + }, + { + "acc": 0.6609386, + "epoch": 0.7899543378995434, + "grad_norm": 4.875, + "learning_rate": 7.08425245061644e-06, + "loss": 1.57487993, + "memory(GiB)": 107.26, + "step": 31140, + "train_speed(iter/s)": 1.633686 + }, + { + "acc": 0.64660015, + "epoch": 0.7900811770674785, + "grad_norm": 5.40625, + "learning_rate": 7.083299232008867e-06, + "loss": 1.64494247, + "memory(GiB)": 107.26, + "step": 31145, + "train_speed(iter/s)": 1.633716 + }, + { + "acc": 0.65915112, + "epoch": 0.7902080162354135, + "grad_norm": 5.71875, + "learning_rate": 7.082345921768492e-06, + "loss": 1.55005655, + "memory(GiB)": 107.26, + "step": 31150, + "train_speed(iter/s)": 1.633745 + }, + { + "acc": 0.65915904, + "epoch": 0.7903348554033486, + "grad_norm": 5.875, + "learning_rate": 7.0813925199372455e-06, + "loss": 1.60656395, + "memory(GiB)": 107.26, + "step": 31155, + "train_speed(iter/s)": 1.633774 + }, + { + "acc": 0.63018637, + "epoch": 0.7904616945712836, + "grad_norm": 5.1875, + "learning_rate": 7.080439026557065e-06, + "loss": 1.61498108, + "memory(GiB)": 107.26, + "step": 31160, + "train_speed(iter/s)": 1.633801 + }, + { + "acc": 0.67653117, + "epoch": 0.7905885337392187, + "grad_norm": 5.875, + "learning_rate": 7.079485441669887e-06, + "loss": 1.4972065, + "memory(GiB)": 107.26, + "step": 31165, + "train_speed(iter/s)": 1.633829 + }, + { + "acc": 0.63481498, + "epoch": 0.7907153729071538, + "grad_norm": 6.03125, + "learning_rate": 7.0785317653176534e-06, + "loss": 1.63518505, + "memory(GiB)": 107.26, + "step": 31170, + "train_speed(iter/s)": 1.633857 + }, + { + "acc": 0.63802643, + "epoch": 0.7908422120750888, + "grad_norm": 6.0, + "learning_rate": 7.077577997542316e-06, + "loss": 1.67583656, + "memory(GiB)": 107.26, + "step": 31175, + "train_speed(iter/s)": 1.633885 + }, + { + "acc": 0.64503856, + "epoch": 0.7909690512430239, + "grad_norm": 6.09375, + "learning_rate": 7.0766241383858195e-06, + "loss": 1.7330164, + "memory(GiB)": 107.26, + "step": 31180, + "train_speed(iter/s)": 1.633913 + }, + { + "acc": 0.65054131, + "epoch": 0.791095890410959, + "grad_norm": 5.625, + "learning_rate": 7.075670187890123e-06, + "loss": 1.61693344, + "memory(GiB)": 107.26, + "step": 31185, + "train_speed(iter/s)": 1.633942 + }, + { + "acc": 0.66031713, + "epoch": 0.791222729578894, + "grad_norm": 5.5, + "learning_rate": 7.0747161460971845e-06, + "loss": 1.59147406, + "memory(GiB)": 107.26, + "step": 31190, + "train_speed(iter/s)": 1.633973 + }, + { + "acc": 0.64606352, + "epoch": 0.791349568746829, + "grad_norm": 6.25, + "learning_rate": 7.073762013048966e-06, + "loss": 1.67986145, + "memory(GiB)": 107.26, + "step": 31195, + "train_speed(iter/s)": 1.634002 + }, + { + "acc": 0.65435033, + "epoch": 0.791476407914764, + "grad_norm": 5.40625, + "learning_rate": 7.072807788787437e-06, + "loss": 1.58141699, + "memory(GiB)": 107.26, + "step": 31200, + "train_speed(iter/s)": 1.634032 + }, + { + "acc": 0.65349216, + "epoch": 0.7916032470826991, + "grad_norm": 6.5625, + "learning_rate": 7.071853473354566e-06, + "loss": 1.60701485, + "memory(GiB)": 107.26, + "step": 31205, + "train_speed(iter/s)": 1.63406 + }, + { + "acc": 0.66085987, + "epoch": 0.7917300862506342, + "grad_norm": 5.0, + "learning_rate": 7.070899066792329e-06, + "loss": 1.58331842, + "memory(GiB)": 107.26, + "step": 31210, + "train_speed(iter/s)": 1.634088 + }, + { + "acc": 0.66153445, + "epoch": 0.7918569254185692, + "grad_norm": 5.75, + "learning_rate": 7.069944569142706e-06, + "loss": 1.57464418, + "memory(GiB)": 107.26, + "step": 31215, + "train_speed(iter/s)": 1.634118 + }, + { + "acc": 0.6492681, + "epoch": 0.7919837645865043, + "grad_norm": 8.4375, + "learning_rate": 7.068989980447679e-06, + "loss": 1.66913986, + "memory(GiB)": 107.26, + "step": 31220, + "train_speed(iter/s)": 1.634145 + }, + { + "acc": 0.65748329, + "epoch": 0.7921106037544394, + "grad_norm": 6.25, + "learning_rate": 7.068035300749237e-06, + "loss": 1.60141544, + "memory(GiB)": 107.26, + "step": 31225, + "train_speed(iter/s)": 1.634176 + }, + { + "acc": 0.64853754, + "epoch": 0.7922374429223744, + "grad_norm": 4.96875, + "learning_rate": 7.067080530089366e-06, + "loss": 1.68210335, + "memory(GiB)": 107.26, + "step": 31230, + "train_speed(iter/s)": 1.634206 + }, + { + "acc": 0.64236946, + "epoch": 0.7923642820903095, + "grad_norm": 5.75, + "learning_rate": 7.066125668510067e-06, + "loss": 1.66154804, + "memory(GiB)": 107.26, + "step": 31235, + "train_speed(iter/s)": 1.634235 + }, + { + "acc": 0.64561367, + "epoch": 0.7924911212582445, + "grad_norm": 5.5625, + "learning_rate": 7.065170716053336e-06, + "loss": 1.63204155, + "memory(GiB)": 107.26, + "step": 31240, + "train_speed(iter/s)": 1.634264 + }, + { + "acc": 0.64884477, + "epoch": 0.7926179604261796, + "grad_norm": 6.46875, + "learning_rate": 7.064215672761175e-06, + "loss": 1.63680115, + "memory(GiB)": 107.26, + "step": 31245, + "train_speed(iter/s)": 1.634288 + }, + { + "acc": 0.66306729, + "epoch": 0.7927447995941147, + "grad_norm": 5.0625, + "learning_rate": 7.063260538675594e-06, + "loss": 1.56321697, + "memory(GiB)": 107.26, + "step": 31250, + "train_speed(iter/s)": 1.634316 + }, + { + "acc": 0.65950441, + "epoch": 0.7928716387620497, + "grad_norm": 7.0, + "learning_rate": 7.062305313838601e-06, + "loss": 1.5964489, + "memory(GiB)": 107.26, + "step": 31255, + "train_speed(iter/s)": 1.634342 + }, + { + "acc": 0.65043149, + "epoch": 0.7929984779299848, + "grad_norm": 6.03125, + "learning_rate": 7.061349998292215e-06, + "loss": 1.68801231, + "memory(GiB)": 107.26, + "step": 31260, + "train_speed(iter/s)": 1.63437 + }, + { + "acc": 0.65283613, + "epoch": 0.7931253170979199, + "grad_norm": 5.84375, + "learning_rate": 7.060394592078452e-06, + "loss": 1.61269951, + "memory(GiB)": 107.26, + "step": 31265, + "train_speed(iter/s)": 1.634398 + }, + { + "acc": 0.6575778, + "epoch": 0.7932521562658549, + "grad_norm": 5.78125, + "learning_rate": 7.0594390952393365e-06, + "loss": 1.61107903, + "memory(GiB)": 107.26, + "step": 31270, + "train_speed(iter/s)": 1.634426 + }, + { + "acc": 0.64295101, + "epoch": 0.79337899543379, + "grad_norm": 5.53125, + "learning_rate": 7.058483507816894e-06, + "loss": 1.64365768, + "memory(GiB)": 107.26, + "step": 31275, + "train_speed(iter/s)": 1.634456 + }, + { + "acc": 0.66307268, + "epoch": 0.793505834601725, + "grad_norm": 4.71875, + "learning_rate": 7.057527829853157e-06, + "loss": 1.5413166, + "memory(GiB)": 107.26, + "step": 31280, + "train_speed(iter/s)": 1.634486 + }, + { + "acc": 0.64996595, + "epoch": 0.7936326737696601, + "grad_norm": 7.5, + "learning_rate": 7.056572061390159e-06, + "loss": 1.64518414, + "memory(GiB)": 107.26, + "step": 31285, + "train_speed(iter/s)": 1.634515 + }, + { + "acc": 0.65264473, + "epoch": 0.7937595129375952, + "grad_norm": 4.90625, + "learning_rate": 7.055616202469939e-06, + "loss": 1.64386635, + "memory(GiB)": 107.26, + "step": 31290, + "train_speed(iter/s)": 1.634544 + }, + { + "acc": 0.65310621, + "epoch": 0.7938863521055302, + "grad_norm": 6.5625, + "learning_rate": 7.054660253134543e-06, + "loss": 1.60843353, + "memory(GiB)": 107.26, + "step": 31295, + "train_speed(iter/s)": 1.634572 + }, + { + "acc": 0.6521873, + "epoch": 0.7940131912734653, + "grad_norm": 5.59375, + "learning_rate": 7.053704213426015e-06, + "loss": 1.69099503, + "memory(GiB)": 107.26, + "step": 31300, + "train_speed(iter/s)": 1.634603 + }, + { + "acc": 0.65218287, + "epoch": 0.7941400304414004, + "grad_norm": 5.53125, + "learning_rate": 7.052748083386406e-06, + "loss": 1.65677357, + "memory(GiB)": 107.26, + "step": 31305, + "train_speed(iter/s)": 1.634632 + }, + { + "acc": 0.6685442, + "epoch": 0.7942668696093353, + "grad_norm": 6.625, + "learning_rate": 7.051791863057772e-06, + "loss": 1.59142437, + "memory(GiB)": 107.26, + "step": 31310, + "train_speed(iter/s)": 1.63466 + }, + { + "acc": 0.64603367, + "epoch": 0.7943937087772704, + "grad_norm": 6.59375, + "learning_rate": 7.050835552482171e-06, + "loss": 1.61281013, + "memory(GiB)": 107.26, + "step": 31315, + "train_speed(iter/s)": 1.634687 + }, + { + "acc": 0.66085606, + "epoch": 0.7945205479452054, + "grad_norm": 7.75, + "learning_rate": 7.049879151701666e-06, + "loss": 1.60690403, + "memory(GiB)": 107.26, + "step": 31320, + "train_speed(iter/s)": 1.634717 + }, + { + "acc": 0.64276886, + "epoch": 0.7946473871131405, + "grad_norm": 6.03125, + "learning_rate": 7.048922660758324e-06, + "loss": 1.60473785, + "memory(GiB)": 107.26, + "step": 31325, + "train_speed(iter/s)": 1.634746 + }, + { + "acc": 0.66217289, + "epoch": 0.7947742262810756, + "grad_norm": 5.71875, + "learning_rate": 7.047966079694215e-06, + "loss": 1.54961376, + "memory(GiB)": 107.26, + "step": 31330, + "train_speed(iter/s)": 1.634774 + }, + { + "acc": 0.6608645, + "epoch": 0.7949010654490106, + "grad_norm": 7.96875, + "learning_rate": 7.047009408551414e-06, + "loss": 1.55356884, + "memory(GiB)": 107.26, + "step": 31335, + "train_speed(iter/s)": 1.634803 + }, + { + "acc": 0.64311752, + "epoch": 0.7950279046169457, + "grad_norm": 5.5625, + "learning_rate": 7.046052647372002e-06, + "loss": 1.63000526, + "memory(GiB)": 107.26, + "step": 31340, + "train_speed(iter/s)": 1.63483 + }, + { + "acc": 0.65527148, + "epoch": 0.7951547437848808, + "grad_norm": 6.4375, + "learning_rate": 7.045095796198057e-06, + "loss": 1.63339863, + "memory(GiB)": 107.26, + "step": 31345, + "train_speed(iter/s)": 1.634858 + }, + { + "acc": 0.64365945, + "epoch": 0.7952815829528158, + "grad_norm": 5.625, + "learning_rate": 7.044138855071671e-06, + "loss": 1.75067177, + "memory(GiB)": 107.26, + "step": 31350, + "train_speed(iter/s)": 1.634888 + }, + { + "acc": 0.65477886, + "epoch": 0.7954084221207509, + "grad_norm": 6.1875, + "learning_rate": 7.043181824034929e-06, + "loss": 1.61393414, + "memory(GiB)": 107.26, + "step": 31355, + "train_speed(iter/s)": 1.634917 + }, + { + "acc": 0.66993504, + "epoch": 0.7955352612886859, + "grad_norm": 7.1875, + "learning_rate": 7.042224703129929e-06, + "loss": 1.56680183, + "memory(GiB)": 107.26, + "step": 31360, + "train_speed(iter/s)": 1.634948 + }, + { + "acc": 0.65087652, + "epoch": 0.795662100456621, + "grad_norm": 5.96875, + "learning_rate": 7.0412674923987705e-06, + "loss": 1.66979389, + "memory(GiB)": 107.26, + "step": 31365, + "train_speed(iter/s)": 1.634978 + }, + { + "acc": 0.66384845, + "epoch": 0.7957889396245561, + "grad_norm": 7.34375, + "learning_rate": 7.040310191883552e-06, + "loss": 1.55617056, + "memory(GiB)": 107.26, + "step": 31370, + "train_speed(iter/s)": 1.635008 + }, + { + "acc": 0.65551004, + "epoch": 0.7959157787924911, + "grad_norm": 5.40625, + "learning_rate": 7.039352801626383e-06, + "loss": 1.59915304, + "memory(GiB)": 107.26, + "step": 31375, + "train_speed(iter/s)": 1.63504 + }, + { + "acc": 0.67593594, + "epoch": 0.7960426179604262, + "grad_norm": 6.78125, + "learning_rate": 7.0383953216693725e-06, + "loss": 1.45270824, + "memory(GiB)": 107.26, + "step": 31380, + "train_speed(iter/s)": 1.635069 + }, + { + "acc": 0.66488671, + "epoch": 0.7961694571283613, + "grad_norm": 6.03125, + "learning_rate": 7.037437752054635e-06, + "loss": 1.60349197, + "memory(GiB)": 107.26, + "step": 31385, + "train_speed(iter/s)": 1.635098 + }, + { + "acc": 0.6614655, + "epoch": 0.7962962962962963, + "grad_norm": 6.40625, + "learning_rate": 7.036480092824288e-06, + "loss": 1.58103199, + "memory(GiB)": 107.26, + "step": 31390, + "train_speed(iter/s)": 1.635128 + }, + { + "acc": 0.65328074, + "epoch": 0.7964231354642314, + "grad_norm": 5.875, + "learning_rate": 7.035522344020455e-06, + "loss": 1.61199131, + "memory(GiB)": 107.26, + "step": 31395, + "train_speed(iter/s)": 1.635158 + }, + { + "acc": 0.64309921, + "epoch": 0.7965499746321664, + "grad_norm": 6.03125, + "learning_rate": 7.034564505685262e-06, + "loss": 1.59272738, + "memory(GiB)": 107.26, + "step": 31400, + "train_speed(iter/s)": 1.635187 + }, + { + "acc": 0.64977236, + "epoch": 0.7966768138001015, + "grad_norm": 6.3125, + "learning_rate": 7.0336065778608365e-06, + "loss": 1.60957565, + "memory(GiB)": 107.26, + "step": 31405, + "train_speed(iter/s)": 1.635217 + }, + { + "acc": 0.66490154, + "epoch": 0.7968036529680366, + "grad_norm": 4.6875, + "learning_rate": 7.032648560589316e-06, + "loss": 1.54726028, + "memory(GiB)": 107.26, + "step": 31410, + "train_speed(iter/s)": 1.635248 + }, + { + "acc": 0.66051846, + "epoch": 0.7969304921359716, + "grad_norm": 6.0, + "learning_rate": 7.031690453912835e-06, + "loss": 1.5356205, + "memory(GiB)": 107.26, + "step": 31415, + "train_speed(iter/s)": 1.635277 + }, + { + "acc": 0.65544033, + "epoch": 0.7970573313039067, + "grad_norm": 6.75, + "learning_rate": 7.030732257873539e-06, + "loss": 1.63505249, + "memory(GiB)": 107.26, + "step": 31420, + "train_speed(iter/s)": 1.635306 + }, + { + "acc": 0.65876904, + "epoch": 0.7971841704718418, + "grad_norm": 5.46875, + "learning_rate": 7.02977397251357e-06, + "loss": 1.59826202, + "memory(GiB)": 107.26, + "step": 31425, + "train_speed(iter/s)": 1.635336 + }, + { + "acc": 0.67138009, + "epoch": 0.7973110096397767, + "grad_norm": 4.78125, + "learning_rate": 7.028815597875081e-06, + "loss": 1.52501717, + "memory(GiB)": 107.26, + "step": 31430, + "train_speed(iter/s)": 1.635365 + }, + { + "acc": 0.63913412, + "epoch": 0.7974378488077118, + "grad_norm": 5.21875, + "learning_rate": 7.027857134000223e-06, + "loss": 1.69021683, + "memory(GiB)": 107.26, + "step": 31435, + "train_speed(iter/s)": 1.635395 + }, + { + "acc": 0.6709847, + "epoch": 0.7975646879756468, + "grad_norm": 6.75, + "learning_rate": 7.026898580931154e-06, + "loss": 1.6213829, + "memory(GiB)": 107.26, + "step": 31440, + "train_speed(iter/s)": 1.635424 + }, + { + "acc": 0.67155638, + "epoch": 0.7976915271435819, + "grad_norm": 5.65625, + "learning_rate": 7.025939938710037e-06, + "loss": 1.56242714, + "memory(GiB)": 107.26, + "step": 31445, + "train_speed(iter/s)": 1.635454 + }, + { + "acc": 0.63182421, + "epoch": 0.797818366311517, + "grad_norm": 5.5625, + "learning_rate": 7.024981207379036e-06, + "loss": 1.67539463, + "memory(GiB)": 107.26, + "step": 31450, + "train_speed(iter/s)": 1.635481 + }, + { + "acc": 0.63707418, + "epoch": 0.797945205479452, + "grad_norm": 6.90625, + "learning_rate": 7.02402238698032e-06, + "loss": 1.68257847, + "memory(GiB)": 107.26, + "step": 31455, + "train_speed(iter/s)": 1.635509 + }, + { + "acc": 0.64785337, + "epoch": 0.7980720446473871, + "grad_norm": 6.75, + "learning_rate": 7.023063477556064e-06, + "loss": 1.65203876, + "memory(GiB)": 107.26, + "step": 31460, + "train_speed(iter/s)": 1.635537 + }, + { + "acc": 0.65921106, + "epoch": 0.7981988838153222, + "grad_norm": 5.25, + "learning_rate": 7.0221044791484424e-06, + "loss": 1.54721508, + "memory(GiB)": 107.26, + "step": 31465, + "train_speed(iter/s)": 1.635566 + }, + { + "acc": 0.64905682, + "epoch": 0.7983257229832572, + "grad_norm": 5.15625, + "learning_rate": 7.021145391799639e-06, + "loss": 1.59451771, + "memory(GiB)": 107.26, + "step": 31470, + "train_speed(iter/s)": 1.635596 + }, + { + "acc": 0.65162015, + "epoch": 0.7984525621511923, + "grad_norm": 5.125, + "learning_rate": 7.020186215551837e-06, + "loss": 1.61367798, + "memory(GiB)": 107.26, + "step": 31475, + "train_speed(iter/s)": 1.635623 + }, + { + "acc": 0.65757213, + "epoch": 0.7985794013191273, + "grad_norm": 5.78125, + "learning_rate": 7.019226950447227e-06, + "loss": 1.59809198, + "memory(GiB)": 107.26, + "step": 31480, + "train_speed(iter/s)": 1.635652 + }, + { + "acc": 0.64026928, + "epoch": 0.7987062404870624, + "grad_norm": 5.84375, + "learning_rate": 7.018267596527998e-06, + "loss": 1.66018524, + "memory(GiB)": 107.26, + "step": 31485, + "train_speed(iter/s)": 1.635682 + }, + { + "acc": 0.67351179, + "epoch": 0.7988330796549975, + "grad_norm": 6.21875, + "learning_rate": 7.017308153836352e-06, + "loss": 1.53560753, + "memory(GiB)": 107.26, + "step": 31490, + "train_speed(iter/s)": 1.635713 + }, + { + "acc": 0.65102959, + "epoch": 0.7989599188229325, + "grad_norm": 5.0625, + "learning_rate": 7.016348622414484e-06, + "loss": 1.64828358, + "memory(GiB)": 107.26, + "step": 31495, + "train_speed(iter/s)": 1.635742 + }, + { + "acc": 0.66073585, + "epoch": 0.7990867579908676, + "grad_norm": 5.21875, + "learning_rate": 7.015389002304604e-06, + "loss": 1.60693893, + "memory(GiB)": 107.26, + "step": 31500, + "train_speed(iter/s)": 1.635771 + }, + { + "acc": 0.67532949, + "epoch": 0.7992135971588027, + "grad_norm": 5.65625, + "learning_rate": 7.014429293548916e-06, + "loss": 1.50706377, + "memory(GiB)": 107.26, + "step": 31505, + "train_speed(iter/s)": 1.635801 + }, + { + "acc": 0.6562469, + "epoch": 0.7993404363267377, + "grad_norm": 5.96875, + "learning_rate": 7.013469496189633e-06, + "loss": 1.62239532, + "memory(GiB)": 107.26, + "step": 31510, + "train_speed(iter/s)": 1.635829 + }, + { + "acc": 0.66055245, + "epoch": 0.7994672754946728, + "grad_norm": 5.6875, + "learning_rate": 7.012509610268974e-06, + "loss": 1.49975996, + "memory(GiB)": 107.26, + "step": 31515, + "train_speed(iter/s)": 1.635859 + }, + { + "acc": 0.65195799, + "epoch": 0.7995941146626078, + "grad_norm": 6.34375, + "learning_rate": 7.011549635829156e-06, + "loss": 1.60614052, + "memory(GiB)": 107.26, + "step": 31520, + "train_speed(iter/s)": 1.635889 + }, + { + "acc": 0.6493, + "epoch": 0.7997209538305429, + "grad_norm": 5.90625, + "learning_rate": 7.010589572912404e-06, + "loss": 1.59612722, + "memory(GiB)": 107.26, + "step": 31525, + "train_speed(iter/s)": 1.635919 + }, + { + "acc": 0.64656768, + "epoch": 0.799847792998478, + "grad_norm": 5.21875, + "learning_rate": 7.009629421560946e-06, + "loss": 1.63737621, + "memory(GiB)": 107.26, + "step": 31530, + "train_speed(iter/s)": 1.635947 + }, + { + "acc": 0.65564647, + "epoch": 0.799974632166413, + "grad_norm": 5.59375, + "learning_rate": 7.008669181817015e-06, + "loss": 1.63288383, + "memory(GiB)": 107.26, + "step": 31535, + "train_speed(iter/s)": 1.635976 + }, + { + "acc": 0.65118885, + "epoch": 0.8001014713343481, + "grad_norm": 5.34375, + "learning_rate": 7.007708853722844e-06, + "loss": 1.59557819, + "memory(GiB)": 107.26, + "step": 31540, + "train_speed(iter/s)": 1.636005 + }, + { + "acc": 0.64227943, + "epoch": 0.8002283105022832, + "grad_norm": 5.9375, + "learning_rate": 7.006748437320674e-06, + "loss": 1.74523048, + "memory(GiB)": 107.26, + "step": 31545, + "train_speed(iter/s)": 1.636035 + }, + { + "acc": 0.65036449, + "epoch": 0.8003551496702181, + "grad_norm": 5.0625, + "learning_rate": 7.005787932652749e-06, + "loss": 1.57965479, + "memory(GiB)": 107.26, + "step": 31550, + "train_speed(iter/s)": 1.636064 + }, + { + "acc": 0.64340105, + "epoch": 0.8004819888381532, + "grad_norm": 5.5625, + "learning_rate": 7.0048273397613145e-06, + "loss": 1.69293709, + "memory(GiB)": 107.26, + "step": 31555, + "train_speed(iter/s)": 1.636092 + }, + { + "acc": 0.65830016, + "epoch": 0.8006088280060882, + "grad_norm": 5.65625, + "learning_rate": 7.003866658688624e-06, + "loss": 1.59785042, + "memory(GiB)": 107.26, + "step": 31560, + "train_speed(iter/s)": 1.636121 + }, + { + "acc": 0.65423365, + "epoch": 0.8007356671740233, + "grad_norm": 6.09375, + "learning_rate": 7.0029058894769295e-06, + "loss": 1.65084858, + "memory(GiB)": 107.26, + "step": 31565, + "train_speed(iter/s)": 1.636151 + }, + { + "acc": 0.64913125, + "epoch": 0.8008625063419584, + "grad_norm": 5.5, + "learning_rate": 7.001945032168493e-06, + "loss": 1.6263134, + "memory(GiB)": 107.26, + "step": 31570, + "train_speed(iter/s)": 1.636181 + }, + { + "acc": 0.65347681, + "epoch": 0.8009893455098934, + "grad_norm": 6.0, + "learning_rate": 7.000984086805575e-06, + "loss": 1.58074932, + "memory(GiB)": 107.26, + "step": 31575, + "train_speed(iter/s)": 1.63621 + }, + { + "acc": 0.65131097, + "epoch": 0.8011161846778285, + "grad_norm": 5.59375, + "learning_rate": 7.000023053430444e-06, + "loss": 1.60889568, + "memory(GiB)": 107.26, + "step": 31580, + "train_speed(iter/s)": 1.636238 + }, + { + "acc": 0.64893665, + "epoch": 0.8012430238457636, + "grad_norm": 7.0, + "learning_rate": 6.999061932085369e-06, + "loss": 1.6766613, + "memory(GiB)": 107.26, + "step": 31585, + "train_speed(iter/s)": 1.636266 + }, + { + "acc": 0.65664902, + "epoch": 0.8013698630136986, + "grad_norm": 5.40625, + "learning_rate": 6.9981007228126255e-06, + "loss": 1.54973431, + "memory(GiB)": 107.26, + "step": 31590, + "train_speed(iter/s)": 1.636295 + }, + { + "acc": 0.63851833, + "epoch": 0.8014967021816337, + "grad_norm": 5.90625, + "learning_rate": 6.997139425654491e-06, + "loss": 1.65576363, + "memory(GiB)": 107.26, + "step": 31595, + "train_speed(iter/s)": 1.636323 + }, + { + "acc": 0.67258167, + "epoch": 0.8016235413495687, + "grad_norm": 6.59375, + "learning_rate": 6.996178040653248e-06, + "loss": 1.50720177, + "memory(GiB)": 107.26, + "step": 31600, + "train_speed(iter/s)": 1.63635 + }, + { + "acc": 0.67264385, + "epoch": 0.8017503805175038, + "grad_norm": 5.875, + "learning_rate": 6.995216567851183e-06, + "loss": 1.55890961, + "memory(GiB)": 107.26, + "step": 31605, + "train_speed(iter/s)": 1.636377 + }, + { + "acc": 0.63763704, + "epoch": 0.8018772196854389, + "grad_norm": 6.6875, + "learning_rate": 6.994255007290585e-06, + "loss": 1.64323063, + "memory(GiB)": 107.26, + "step": 31610, + "train_speed(iter/s)": 1.636404 + }, + { + "acc": 0.66405573, + "epoch": 0.8020040588533739, + "grad_norm": 5.28125, + "learning_rate": 6.993293359013747e-06, + "loss": 1.6096077, + "memory(GiB)": 107.26, + "step": 31615, + "train_speed(iter/s)": 1.636432 + }, + { + "acc": 0.63168311, + "epoch": 0.802130898021309, + "grad_norm": 5.1875, + "learning_rate": 6.992331623062969e-06, + "loss": 1.67222633, + "memory(GiB)": 107.26, + "step": 31620, + "train_speed(iter/s)": 1.636458 + }, + { + "acc": 0.6563798, + "epoch": 0.8022577371892441, + "grad_norm": 5.5625, + "learning_rate": 6.9913697994805505e-06, + "loss": 1.61397095, + "memory(GiB)": 107.26, + "step": 31625, + "train_speed(iter/s)": 1.636486 + }, + { + "acc": 0.65471926, + "epoch": 0.8023845763571791, + "grad_norm": 6.625, + "learning_rate": 6.990407888308799e-06, + "loss": 1.57690697, + "memory(GiB)": 107.26, + "step": 31630, + "train_speed(iter/s)": 1.636512 + }, + { + "acc": 0.6477561, + "epoch": 0.8025114155251142, + "grad_norm": 4.78125, + "learning_rate": 6.98944588959002e-06, + "loss": 1.6375412, + "memory(GiB)": 107.26, + "step": 31635, + "train_speed(iter/s)": 1.636539 + }, + { + "acc": 0.6483861, + "epoch": 0.8026382546930492, + "grad_norm": 6.25, + "learning_rate": 6.9884838033665305e-06, + "loss": 1.5951931, + "memory(GiB)": 107.26, + "step": 31640, + "train_speed(iter/s)": 1.636566 + }, + { + "acc": 0.63996024, + "epoch": 0.8027650938609843, + "grad_norm": 5.0, + "learning_rate": 6.987521629680643e-06, + "loss": 1.66841087, + "memory(GiB)": 107.26, + "step": 31645, + "train_speed(iter/s)": 1.636594 + }, + { + "acc": 0.64413018, + "epoch": 0.8028919330289194, + "grad_norm": 5.21875, + "learning_rate": 6.9865593685746815e-06, + "loss": 1.6708643, + "memory(GiB)": 107.26, + "step": 31650, + "train_speed(iter/s)": 1.63662 + }, + { + "acc": 0.65133648, + "epoch": 0.8030187721968544, + "grad_norm": 5.71875, + "learning_rate": 6.98559702009097e-06, + "loss": 1.70497379, + "memory(GiB)": 107.26, + "step": 31655, + "train_speed(iter/s)": 1.636648 + }, + { + "acc": 0.65791612, + "epoch": 0.8031456113647895, + "grad_norm": 7.625, + "learning_rate": 6.984634584271836e-06, + "loss": 1.64649849, + "memory(GiB)": 107.26, + "step": 31660, + "train_speed(iter/s)": 1.636677 + }, + { + "acc": 0.65219336, + "epoch": 0.8032724505327246, + "grad_norm": 5.40625, + "learning_rate": 6.983672061159612e-06, + "loss": 1.60834274, + "memory(GiB)": 107.26, + "step": 31665, + "train_speed(iter/s)": 1.636706 + }, + { + "acc": 0.65112314, + "epoch": 0.8033992897006595, + "grad_norm": 6.40625, + "learning_rate": 6.982709450796636e-06, + "loss": 1.61001701, + "memory(GiB)": 107.26, + "step": 31670, + "train_speed(iter/s)": 1.636734 + }, + { + "acc": 0.63708506, + "epoch": 0.8035261288685946, + "grad_norm": 5.84375, + "learning_rate": 6.981746753225245e-06, + "loss": 1.70016632, + "memory(GiB)": 107.26, + "step": 31675, + "train_speed(iter/s)": 1.636761 + }, + { + "acc": 0.65520887, + "epoch": 0.8036529680365296, + "grad_norm": 6.53125, + "learning_rate": 6.980783968487783e-06, + "loss": 1.54497299, + "memory(GiB)": 107.26, + "step": 31680, + "train_speed(iter/s)": 1.636788 + }, + { + "acc": 0.63458281, + "epoch": 0.8037798072044647, + "grad_norm": 10.6875, + "learning_rate": 6.9798210966266e-06, + "loss": 1.68705482, + "memory(GiB)": 107.26, + "step": 31685, + "train_speed(iter/s)": 1.636814 + }, + { + "acc": 0.64320369, + "epoch": 0.8039066463723998, + "grad_norm": 5.1875, + "learning_rate": 6.9788581376840455e-06, + "loss": 1.60026417, + "memory(GiB)": 107.26, + "step": 31690, + "train_speed(iter/s)": 1.63684 + }, + { + "acc": 0.65845203, + "epoch": 0.8040334855403348, + "grad_norm": 6.15625, + "learning_rate": 6.977895091702474e-06, + "loss": 1.5848032, + "memory(GiB)": 107.26, + "step": 31695, + "train_speed(iter/s)": 1.636868 + }, + { + "acc": 0.66474333, + "epoch": 0.8041603247082699, + "grad_norm": 5.40625, + "learning_rate": 6.976931958724248e-06, + "loss": 1.53534956, + "memory(GiB)": 107.26, + "step": 31700, + "train_speed(iter/s)": 1.636895 + }, + { + "acc": 0.64599733, + "epoch": 0.804287163876205, + "grad_norm": 5.53125, + "learning_rate": 6.975968738791726e-06, + "loss": 1.63694839, + "memory(GiB)": 107.26, + "step": 31705, + "train_speed(iter/s)": 1.636924 + }, + { + "acc": 0.64153309, + "epoch": 0.80441400304414, + "grad_norm": 5.78125, + "learning_rate": 6.9750054319472785e-06, + "loss": 1.70781059, + "memory(GiB)": 107.26, + "step": 31710, + "train_speed(iter/s)": 1.63695 + }, + { + "acc": 0.66971951, + "epoch": 0.8045408422120751, + "grad_norm": 5.4375, + "learning_rate": 6.974042038233272e-06, + "loss": 1.59744244, + "memory(GiB)": 107.26, + "step": 31715, + "train_speed(iter/s)": 1.636978 + }, + { + "acc": 0.63684053, + "epoch": 0.8046676813800101, + "grad_norm": 5.34375, + "learning_rate": 6.9730785576920855e-06, + "loss": 1.73447933, + "memory(GiB)": 107.26, + "step": 31720, + "train_speed(iter/s)": 1.637005 + }, + { + "acc": 0.66701121, + "epoch": 0.8047945205479452, + "grad_norm": 6.625, + "learning_rate": 6.972114990366094e-06, + "loss": 1.55795202, + "memory(GiB)": 107.26, + "step": 31725, + "train_speed(iter/s)": 1.637032 + }, + { + "acc": 0.62969141, + "epoch": 0.8049213597158803, + "grad_norm": 6.71875, + "learning_rate": 6.97115133629768e-06, + "loss": 1.59132013, + "memory(GiB)": 107.26, + "step": 31730, + "train_speed(iter/s)": 1.63706 + }, + { + "acc": 0.66779776, + "epoch": 0.8050481988838153, + "grad_norm": 5.25, + "learning_rate": 6.970187595529229e-06, + "loss": 1.5646533, + "memory(GiB)": 107.26, + "step": 31735, + "train_speed(iter/s)": 1.637086 + }, + { + "acc": 0.65147762, + "epoch": 0.8051750380517504, + "grad_norm": 5.78125, + "learning_rate": 6.969223768103133e-06, + "loss": 1.6891964, + "memory(GiB)": 107.26, + "step": 31740, + "train_speed(iter/s)": 1.637113 + }, + { + "acc": 0.6507627, + "epoch": 0.8053018772196855, + "grad_norm": 5.28125, + "learning_rate": 6.968259854061783e-06, + "loss": 1.59935303, + "memory(GiB)": 107.26, + "step": 31745, + "train_speed(iter/s)": 1.637139 + }, + { + "acc": 0.65812893, + "epoch": 0.8054287163876205, + "grad_norm": 5.25, + "learning_rate": 6.967295853447578e-06, + "loss": 1.60760384, + "memory(GiB)": 107.26, + "step": 31750, + "train_speed(iter/s)": 1.637166 + }, + { + "acc": 0.65970874, + "epoch": 0.8055555555555556, + "grad_norm": 5.90625, + "learning_rate": 6.966331766302916e-06, + "loss": 1.63058414, + "memory(GiB)": 107.26, + "step": 31755, + "train_speed(iter/s)": 1.637195 + }, + { + "acc": 0.65536618, + "epoch": 0.8056823947234906, + "grad_norm": 5.3125, + "learning_rate": 6.965367592670206e-06, + "loss": 1.56879215, + "memory(GiB)": 107.26, + "step": 31760, + "train_speed(iter/s)": 1.637224 + }, + { + "acc": 0.66208749, + "epoch": 0.8058092338914257, + "grad_norm": 5.9375, + "learning_rate": 6.964403332591854e-06, + "loss": 1.68444309, + "memory(GiB)": 107.26, + "step": 31765, + "train_speed(iter/s)": 1.637251 + }, + { + "acc": 0.65442572, + "epoch": 0.8059360730593608, + "grad_norm": 5.90625, + "learning_rate": 6.963438986110272e-06, + "loss": 1.60106144, + "memory(GiB)": 107.26, + "step": 31770, + "train_speed(iter/s)": 1.637279 + }, + { + "acc": 0.67014709, + "epoch": 0.8060629122272958, + "grad_norm": 6.09375, + "learning_rate": 6.962474553267877e-06, + "loss": 1.60699959, + "memory(GiB)": 107.26, + "step": 31775, + "train_speed(iter/s)": 1.637304 + }, + { + "acc": 0.63917484, + "epoch": 0.8061897513952309, + "grad_norm": 5.53125, + "learning_rate": 6.96151003410709e-06, + "loss": 1.67974262, + "memory(GiB)": 107.26, + "step": 31780, + "train_speed(iter/s)": 1.637333 + }, + { + "acc": 0.6571743, + "epoch": 0.806316590563166, + "grad_norm": 5.0, + "learning_rate": 6.960545428670333e-06, + "loss": 1.64713554, + "memory(GiB)": 107.26, + "step": 31785, + "train_speed(iter/s)": 1.637359 + }, + { + "acc": 0.66791286, + "epoch": 0.806443429731101, + "grad_norm": 8.1875, + "learning_rate": 6.959580737000038e-06, + "loss": 1.57795753, + "memory(GiB)": 107.26, + "step": 31790, + "train_speed(iter/s)": 1.637388 + }, + { + "acc": 0.66564775, + "epoch": 0.806570268899036, + "grad_norm": 5.09375, + "learning_rate": 6.95861595913863e-06, + "loss": 1.52560682, + "memory(GiB)": 107.26, + "step": 31795, + "train_speed(iter/s)": 1.637415 + }, + { + "acc": 0.6771359, + "epoch": 0.806697108066971, + "grad_norm": 4.78125, + "learning_rate": 6.95765109512855e-06, + "loss": 1.50610638, + "memory(GiB)": 107.26, + "step": 31800, + "train_speed(iter/s)": 1.637442 + }, + { + "acc": 0.65043592, + "epoch": 0.8068239472349061, + "grad_norm": 5.78125, + "learning_rate": 6.956686145012233e-06, + "loss": 1.58894205, + "memory(GiB)": 107.26, + "step": 31805, + "train_speed(iter/s)": 1.637471 + }, + { + "acc": 0.64786196, + "epoch": 0.8069507864028412, + "grad_norm": 6.875, + "learning_rate": 6.955721108832124e-06, + "loss": 1.61288109, + "memory(GiB)": 107.26, + "step": 31810, + "train_speed(iter/s)": 1.637498 + }, + { + "acc": 0.65786214, + "epoch": 0.8070776255707762, + "grad_norm": 5.96875, + "learning_rate": 6.9547559866306695e-06, + "loss": 1.56943264, + "memory(GiB)": 107.26, + "step": 31815, + "train_speed(iter/s)": 1.637525 + }, + { + "acc": 0.65914631, + "epoch": 0.8072044647387113, + "grad_norm": 6.375, + "learning_rate": 6.953790778450318e-06, + "loss": 1.60140972, + "memory(GiB)": 107.26, + "step": 31820, + "train_speed(iter/s)": 1.637551 + }, + { + "acc": 0.65495863, + "epoch": 0.8073313039066464, + "grad_norm": 6.53125, + "learning_rate": 6.9528254843335254e-06, + "loss": 1.60486603, + "memory(GiB)": 107.26, + "step": 31825, + "train_speed(iter/s)": 1.63758 + }, + { + "acc": 0.66042252, + "epoch": 0.8074581430745814, + "grad_norm": 5.625, + "learning_rate": 6.95186010432275e-06, + "loss": 1.54157944, + "memory(GiB)": 107.26, + "step": 31830, + "train_speed(iter/s)": 1.637607 + }, + { + "acc": 0.65015907, + "epoch": 0.8075849822425165, + "grad_norm": 5.5, + "learning_rate": 6.950894638460452e-06, + "loss": 1.63135757, + "memory(GiB)": 107.26, + "step": 31835, + "train_speed(iter/s)": 1.637632 + }, + { + "acc": 0.65165834, + "epoch": 0.8077118214104515, + "grad_norm": 5.25, + "learning_rate": 6.949929086789098e-06, + "loss": 1.62540207, + "memory(GiB)": 107.26, + "step": 31840, + "train_speed(iter/s)": 1.637659 + }, + { + "acc": 0.64335504, + "epoch": 0.8078386605783866, + "grad_norm": 5.6875, + "learning_rate": 6.948963449351156e-06, + "loss": 1.68301506, + "memory(GiB)": 107.26, + "step": 31845, + "train_speed(iter/s)": 1.637687 + }, + { + "acc": 0.64293494, + "epoch": 0.8079654997463217, + "grad_norm": 6.125, + "learning_rate": 6.947997726189102e-06, + "loss": 1.6709095, + "memory(GiB)": 107.26, + "step": 31850, + "train_speed(iter/s)": 1.637713 + }, + { + "acc": 0.65304346, + "epoch": 0.8080923389142567, + "grad_norm": 7.875, + "learning_rate": 6.947031917345409e-06, + "loss": 1.65402298, + "memory(GiB)": 107.26, + "step": 31855, + "train_speed(iter/s)": 1.63774 + }, + { + "acc": 0.64841862, + "epoch": 0.8082191780821918, + "grad_norm": 6.0625, + "learning_rate": 6.946066022862561e-06, + "loss": 1.65863914, + "memory(GiB)": 107.26, + "step": 31860, + "train_speed(iter/s)": 1.637767 + }, + { + "acc": 0.65058589, + "epoch": 0.8083460172501269, + "grad_norm": 5.84375, + "learning_rate": 6.945100042783039e-06, + "loss": 1.64971886, + "memory(GiB)": 107.26, + "step": 31865, + "train_speed(iter/s)": 1.637791 + }, + { + "acc": 0.64818745, + "epoch": 0.8084728564180619, + "grad_norm": 5.6875, + "learning_rate": 6.9441339771493345e-06, + "loss": 1.56851358, + "memory(GiB)": 107.26, + "step": 31870, + "train_speed(iter/s)": 1.637819 + }, + { + "acc": 0.66528578, + "epoch": 0.808599695585997, + "grad_norm": 5.5, + "learning_rate": 6.943167826003937e-06, + "loss": 1.52801495, + "memory(GiB)": 107.26, + "step": 31875, + "train_speed(iter/s)": 1.637847 + }, + { + "acc": 0.66449409, + "epoch": 0.808726534753932, + "grad_norm": 5.09375, + "learning_rate": 6.942201589389344e-06, + "loss": 1.590306, + "memory(GiB)": 107.26, + "step": 31880, + "train_speed(iter/s)": 1.637873 + }, + { + "acc": 0.66747532, + "epoch": 0.8088533739218671, + "grad_norm": 4.8125, + "learning_rate": 6.9412352673480525e-06, + "loss": 1.57964287, + "memory(GiB)": 107.26, + "step": 31885, + "train_speed(iter/s)": 1.637898 + }, + { + "acc": 0.63754005, + "epoch": 0.8089802130898022, + "grad_norm": 4.78125, + "learning_rate": 6.940268859922566e-06, + "loss": 1.66158218, + "memory(GiB)": 107.26, + "step": 31890, + "train_speed(iter/s)": 1.637923 + }, + { + "acc": 0.6645937, + "epoch": 0.8091070522577372, + "grad_norm": 5.71875, + "learning_rate": 6.939302367155394e-06, + "loss": 1.58089075, + "memory(GiB)": 107.26, + "step": 31895, + "train_speed(iter/s)": 1.63795 + }, + { + "acc": 0.63663797, + "epoch": 0.8092338914256723, + "grad_norm": 6.78125, + "learning_rate": 6.9383357890890454e-06, + "loss": 1.75767899, + "memory(GiB)": 107.26, + "step": 31900, + "train_speed(iter/s)": 1.637979 + }, + { + "acc": 0.6385026, + "epoch": 0.8093607305936074, + "grad_norm": 6.40625, + "learning_rate": 6.937369125766033e-06, + "loss": 1.65668526, + "memory(GiB)": 107.26, + "step": 31905, + "train_speed(iter/s)": 1.638007 + }, + { + "acc": 0.64706106, + "epoch": 0.8094875697615423, + "grad_norm": 6.90625, + "learning_rate": 6.936402377228879e-06, + "loss": 1.66105251, + "memory(GiB)": 107.26, + "step": 31910, + "train_speed(iter/s)": 1.638034 + }, + { + "acc": 0.65470076, + "epoch": 0.8096144089294774, + "grad_norm": 5.1875, + "learning_rate": 6.9354355435201015e-06, + "loss": 1.54109869, + "memory(GiB)": 107.26, + "step": 31915, + "train_speed(iter/s)": 1.638061 + }, + { + "acc": 0.64888988, + "epoch": 0.8097412480974124, + "grad_norm": 5.78125, + "learning_rate": 6.934468624682229e-06, + "loss": 1.64780807, + "memory(GiB)": 107.26, + "step": 31920, + "train_speed(iter/s)": 1.638087 + }, + { + "acc": 0.65892801, + "epoch": 0.8098680872653475, + "grad_norm": 6.5, + "learning_rate": 6.933501620757789e-06, + "loss": 1.56067877, + "memory(GiB)": 107.26, + "step": 31925, + "train_speed(iter/s)": 1.638114 + }, + { + "acc": 0.66509089, + "epoch": 0.8099949264332826, + "grad_norm": 5.53125, + "learning_rate": 6.932534531789317e-06, + "loss": 1.547925, + "memory(GiB)": 107.26, + "step": 31930, + "train_speed(iter/s)": 1.638137 + }, + { + "acc": 0.65340157, + "epoch": 0.8101217656012176, + "grad_norm": 5.46875, + "learning_rate": 6.931567357819344e-06, + "loss": 1.59515181, + "memory(GiB)": 107.26, + "step": 31935, + "train_speed(iter/s)": 1.638164 + }, + { + "acc": 0.63720765, + "epoch": 0.8102486047691527, + "grad_norm": 5.1875, + "learning_rate": 6.930600098890419e-06, + "loss": 1.61746025, + "memory(GiB)": 107.26, + "step": 31940, + "train_speed(iter/s)": 1.638189 + }, + { + "acc": 0.64260411, + "epoch": 0.8103754439370878, + "grad_norm": 5.28125, + "learning_rate": 6.929632755045079e-06, + "loss": 1.63631001, + "memory(GiB)": 107.26, + "step": 31945, + "train_speed(iter/s)": 1.638216 + }, + { + "acc": 0.66141033, + "epoch": 0.8105022831050228, + "grad_norm": 8.3125, + "learning_rate": 6.9286653263258765e-06, + "loss": 1.63917561, + "memory(GiB)": 107.26, + "step": 31950, + "train_speed(iter/s)": 1.638245 + }, + { + "acc": 0.63730392, + "epoch": 0.8106291222729579, + "grad_norm": 6.03125, + "learning_rate": 6.927697812775363e-06, + "loss": 1.68924656, + "memory(GiB)": 107.26, + "step": 31955, + "train_speed(iter/s)": 1.638271 + }, + { + "acc": 0.64088702, + "epoch": 0.8107559614408929, + "grad_norm": 6.8125, + "learning_rate": 6.926730214436091e-06, + "loss": 1.65115223, + "memory(GiB)": 107.26, + "step": 31960, + "train_speed(iter/s)": 1.638296 + }, + { + "acc": 0.66395721, + "epoch": 0.810882800608828, + "grad_norm": 6.96875, + "learning_rate": 6.925762531350624e-06, + "loss": 1.58305988, + "memory(GiB)": 107.26, + "step": 31965, + "train_speed(iter/s)": 1.638323 + }, + { + "acc": 0.64590797, + "epoch": 0.8110096397767631, + "grad_norm": 7.375, + "learning_rate": 6.924794763561522e-06, + "loss": 1.65210724, + "memory(GiB)": 107.26, + "step": 31970, + "train_speed(iter/s)": 1.638351 + }, + { + "acc": 0.65053396, + "epoch": 0.8111364789446981, + "grad_norm": 5.65625, + "learning_rate": 6.923826911111353e-06, + "loss": 1.60181446, + "memory(GiB)": 107.26, + "step": 31975, + "train_speed(iter/s)": 1.638378 + }, + { + "acc": 0.65695963, + "epoch": 0.8112633181126332, + "grad_norm": 4.875, + "learning_rate": 6.922858974042688e-06, + "loss": 1.58870478, + "memory(GiB)": 107.26, + "step": 31980, + "train_speed(iter/s)": 1.638404 + }, + { + "acc": 0.66785922, + "epoch": 0.8113901572805683, + "grad_norm": 6.21875, + "learning_rate": 6.921890952398098e-06, + "loss": 1.56708317, + "memory(GiB)": 107.26, + "step": 31985, + "train_speed(iter/s)": 1.63843 + }, + { + "acc": 0.65374861, + "epoch": 0.8115169964485033, + "grad_norm": 7.03125, + "learning_rate": 6.920922846220166e-06, + "loss": 1.67089214, + "memory(GiB)": 107.26, + "step": 31990, + "train_speed(iter/s)": 1.638458 + }, + { + "acc": 0.6325027, + "epoch": 0.8116438356164384, + "grad_norm": 6.46875, + "learning_rate": 6.919954655551469e-06, + "loss": 1.59934502, + "memory(GiB)": 107.26, + "step": 31995, + "train_speed(iter/s)": 1.638487 + }, + { + "acc": 0.6746294, + "epoch": 0.8117706747843734, + "grad_norm": 5.21875, + "learning_rate": 6.918986380434594e-06, + "loss": 1.5499197, + "memory(GiB)": 107.26, + "step": 32000, + "train_speed(iter/s)": 1.638513 + }, + { + "epoch": 0.8117706747843734, + "eval_acc": 0.6457560399033532, + "eval_loss": 1.57610285282135, + "eval_runtime": 58.6414, + "eval_samples_per_second": 108.626, + "eval_steps_per_second": 27.165, + "step": 32000 + }, + { + "acc": 0.66393976, + "epoch": 0.8118975139523085, + "grad_norm": 6.96875, + "learning_rate": 6.918018020912132e-06, + "loss": 1.64369621, + "memory(GiB)": 107.26, + "step": 32005, + "train_speed(iter/s)": 1.633282 + }, + { + "acc": 0.66803837, + "epoch": 0.8120243531202436, + "grad_norm": 6.53125, + "learning_rate": 6.917049577026673e-06, + "loss": 1.57339373, + "memory(GiB)": 107.26, + "step": 32010, + "train_speed(iter/s)": 1.633308 + }, + { + "acc": 0.65935593, + "epoch": 0.8121511922881786, + "grad_norm": 4.75, + "learning_rate": 6.916081048820815e-06, + "loss": 1.56297636, + "memory(GiB)": 107.26, + "step": 32015, + "train_speed(iter/s)": 1.633336 + }, + { + "acc": 0.6554276, + "epoch": 0.8122780314561137, + "grad_norm": 5.3125, + "learning_rate": 6.915112436337157e-06, + "loss": 1.62615929, + "memory(GiB)": 107.26, + "step": 32020, + "train_speed(iter/s)": 1.63336 + }, + { + "acc": 0.64893494, + "epoch": 0.8124048706240488, + "grad_norm": 5.5625, + "learning_rate": 6.914143739618305e-06, + "loss": 1.65617294, + "memory(GiB)": 107.26, + "step": 32025, + "train_speed(iter/s)": 1.633389 + }, + { + "acc": 0.68204007, + "epoch": 0.8125317097919837, + "grad_norm": 6.3125, + "learning_rate": 6.913174958706865e-06, + "loss": 1.52482357, + "memory(GiB)": 107.26, + "step": 32030, + "train_speed(iter/s)": 1.633414 + }, + { + "acc": 0.65520697, + "epoch": 0.8126585489599188, + "grad_norm": 5.25, + "learning_rate": 6.912206093645448e-06, + "loss": 1.63280754, + "memory(GiB)": 107.26, + "step": 32035, + "train_speed(iter/s)": 1.633443 + }, + { + "acc": 0.66827612, + "epoch": 0.8127853881278538, + "grad_norm": 7.96875, + "learning_rate": 6.91123714447667e-06, + "loss": 1.58072968, + "memory(GiB)": 107.26, + "step": 32040, + "train_speed(iter/s)": 1.633473 + }, + { + "acc": 0.64861479, + "epoch": 0.8129122272957889, + "grad_norm": 5.1875, + "learning_rate": 6.910268111243149e-06, + "loss": 1.58176289, + "memory(GiB)": 107.26, + "step": 32045, + "train_speed(iter/s)": 1.6335 + }, + { + "acc": 0.63910809, + "epoch": 0.813039066463724, + "grad_norm": 6.0625, + "learning_rate": 6.909298993987508e-06, + "loss": 1.72066059, + "memory(GiB)": 107.26, + "step": 32050, + "train_speed(iter/s)": 1.633527 + }, + { + "acc": 0.65280857, + "epoch": 0.813165905631659, + "grad_norm": 5.34375, + "learning_rate": 6.908329792752373e-06, + "loss": 1.58054399, + "memory(GiB)": 107.26, + "step": 32055, + "train_speed(iter/s)": 1.633554 + }, + { + "acc": 0.65664806, + "epoch": 0.8132927447995941, + "grad_norm": 4.96875, + "learning_rate": 6.907360507580374e-06, + "loss": 1.62364864, + "memory(GiB)": 107.26, + "step": 32060, + "train_speed(iter/s)": 1.63358 + }, + { + "acc": 0.64661937, + "epoch": 0.8134195839675292, + "grad_norm": 5.1875, + "learning_rate": 6.9063911385141425e-06, + "loss": 1.61508522, + "memory(GiB)": 107.26, + "step": 32065, + "train_speed(iter/s)": 1.633608 + }, + { + "acc": 0.64370565, + "epoch": 0.8135464231354642, + "grad_norm": 5.53125, + "learning_rate": 6.9054216855963194e-06, + "loss": 1.64386616, + "memory(GiB)": 107.26, + "step": 32070, + "train_speed(iter/s)": 1.633636 + }, + { + "acc": 0.64428039, + "epoch": 0.8136732623033993, + "grad_norm": 6.5, + "learning_rate": 6.904452148869541e-06, + "loss": 1.63389359, + "memory(GiB)": 107.26, + "step": 32075, + "train_speed(iter/s)": 1.633662 + }, + { + "acc": 0.64763231, + "epoch": 0.8138001014713343, + "grad_norm": 5.1875, + "learning_rate": 6.903482528376457e-06, + "loss": 1.59145336, + "memory(GiB)": 107.26, + "step": 32080, + "train_speed(iter/s)": 1.633689 + }, + { + "acc": 0.65249796, + "epoch": 0.8139269406392694, + "grad_norm": 6.03125, + "learning_rate": 6.902512824159711e-06, + "loss": 1.67030869, + "memory(GiB)": 107.26, + "step": 32085, + "train_speed(iter/s)": 1.633716 + }, + { + "acc": 0.65229883, + "epoch": 0.8140537798072045, + "grad_norm": 5.5625, + "learning_rate": 6.901543036261957e-06, + "loss": 1.6234724, + "memory(GiB)": 107.26, + "step": 32090, + "train_speed(iter/s)": 1.633743 + }, + { + "acc": 0.64428339, + "epoch": 0.8141806189751395, + "grad_norm": 7.96875, + "learning_rate": 6.900573164725852e-06, + "loss": 1.67717514, + "memory(GiB)": 107.26, + "step": 32095, + "train_speed(iter/s)": 1.633771 + }, + { + "acc": 0.66641817, + "epoch": 0.8143074581430746, + "grad_norm": 5.96875, + "learning_rate": 6.899603209594052e-06, + "loss": 1.56629438, + "memory(GiB)": 107.26, + "step": 32100, + "train_speed(iter/s)": 1.633795 + }, + { + "acc": 0.66047859, + "epoch": 0.8144342973110097, + "grad_norm": 5.09375, + "learning_rate": 6.898633170909224e-06, + "loss": 1.59941158, + "memory(GiB)": 107.26, + "step": 32105, + "train_speed(iter/s)": 1.633822 + }, + { + "acc": 0.65399694, + "epoch": 0.8145611364789447, + "grad_norm": 6.0625, + "learning_rate": 6.897663048714031e-06, + "loss": 1.57719822, + "memory(GiB)": 107.26, + "step": 32110, + "train_speed(iter/s)": 1.633848 + }, + { + "acc": 0.66222644, + "epoch": 0.8146879756468798, + "grad_norm": 5.28125, + "learning_rate": 6.896692843051145e-06, + "loss": 1.60690384, + "memory(GiB)": 107.26, + "step": 32115, + "train_speed(iter/s)": 1.633876 + }, + { + "acc": 0.65599937, + "epoch": 0.8148148148148148, + "grad_norm": 5.9375, + "learning_rate": 6.895722553963239e-06, + "loss": 1.60285091, + "memory(GiB)": 107.26, + "step": 32120, + "train_speed(iter/s)": 1.633903 + }, + { + "acc": 0.65920811, + "epoch": 0.8149416539827499, + "grad_norm": 6.5625, + "learning_rate": 6.8947521814929915e-06, + "loss": 1.65305328, + "memory(GiB)": 107.26, + "step": 32125, + "train_speed(iter/s)": 1.633931 + }, + { + "acc": 0.63986645, + "epoch": 0.815068493150685, + "grad_norm": 7.09375, + "learning_rate": 6.8937817256830834e-06, + "loss": 1.73617954, + "memory(GiB)": 107.26, + "step": 32130, + "train_speed(iter/s)": 1.633959 + }, + { + "acc": 0.65625567, + "epoch": 0.81519533231862, + "grad_norm": 6.6875, + "learning_rate": 6.892811186576199e-06, + "loss": 1.62067204, + "memory(GiB)": 107.26, + "step": 32135, + "train_speed(iter/s)": 1.633986 + }, + { + "acc": 0.65273118, + "epoch": 0.8153221714865551, + "grad_norm": 5.65625, + "learning_rate": 6.8918405642150295e-06, + "loss": 1.61611557, + "memory(GiB)": 107.26, + "step": 32140, + "train_speed(iter/s)": 1.634013 + }, + { + "acc": 0.65577984, + "epoch": 0.8154490106544902, + "grad_norm": 5.0625, + "learning_rate": 6.890869858642264e-06, + "loss": 1.60321445, + "memory(GiB)": 107.26, + "step": 32145, + "train_speed(iter/s)": 1.63404 + }, + { + "acc": 0.65248051, + "epoch": 0.8155758498224251, + "grad_norm": 5.375, + "learning_rate": 6.889899069900603e-06, + "loss": 1.64560661, + "memory(GiB)": 107.26, + "step": 32150, + "train_speed(iter/s)": 1.634067 + }, + { + "acc": 0.65291662, + "epoch": 0.8157026889903602, + "grad_norm": 6.03125, + "learning_rate": 6.888928198032741e-06, + "loss": 1.5980773, + "memory(GiB)": 107.26, + "step": 32155, + "train_speed(iter/s)": 1.634096 + }, + { + "acc": 0.6604311, + "epoch": 0.8158295281582952, + "grad_norm": 6.78125, + "learning_rate": 6.887957243081384e-06, + "loss": 1.59901848, + "memory(GiB)": 107.26, + "step": 32160, + "train_speed(iter/s)": 1.634125 + }, + { + "acc": 0.65565186, + "epoch": 0.8159563673262303, + "grad_norm": 5.65625, + "learning_rate": 6.886986205089237e-06, + "loss": 1.59496717, + "memory(GiB)": 107.26, + "step": 32165, + "train_speed(iter/s)": 1.634152 + }, + { + "acc": 0.65986338, + "epoch": 0.8160832064941654, + "grad_norm": 6.21875, + "learning_rate": 6.886015084099011e-06, + "loss": 1.6038208, + "memory(GiB)": 107.26, + "step": 32170, + "train_speed(iter/s)": 1.634179 + }, + { + "acc": 0.65892019, + "epoch": 0.8162100456621004, + "grad_norm": 5.78125, + "learning_rate": 6.885043880153424e-06, + "loss": 1.6073143, + "memory(GiB)": 107.26, + "step": 32175, + "train_speed(iter/s)": 1.634203 + }, + { + "acc": 0.66693459, + "epoch": 0.8163368848300355, + "grad_norm": 6.0625, + "learning_rate": 6.88407259329519e-06, + "loss": 1.52714939, + "memory(GiB)": 107.26, + "step": 32180, + "train_speed(iter/s)": 1.634232 + }, + { + "acc": 0.66009459, + "epoch": 0.8164637239979706, + "grad_norm": 5.5625, + "learning_rate": 6.883101223567031e-06, + "loss": 1.61822433, + "memory(GiB)": 107.26, + "step": 32185, + "train_speed(iter/s)": 1.634261 + }, + { + "acc": 0.6580308, + "epoch": 0.8165905631659056, + "grad_norm": 4.875, + "learning_rate": 6.882129771011674e-06, + "loss": 1.54892139, + "memory(GiB)": 107.26, + "step": 32190, + "train_speed(iter/s)": 1.634286 + }, + { + "acc": 0.66543446, + "epoch": 0.8167174023338407, + "grad_norm": 6.0625, + "learning_rate": 6.881158235671845e-06, + "loss": 1.5171236, + "memory(GiB)": 107.26, + "step": 32195, + "train_speed(iter/s)": 1.634312 + }, + { + "acc": 0.65951328, + "epoch": 0.8168442415017757, + "grad_norm": 5.875, + "learning_rate": 6.8801866175902785e-06, + "loss": 1.57330246, + "memory(GiB)": 107.26, + "step": 32200, + "train_speed(iter/s)": 1.634338 + }, + { + "acc": 0.64907103, + "epoch": 0.8169710806697108, + "grad_norm": 5.34375, + "learning_rate": 6.87921491680971e-06, + "loss": 1.65742016, + "memory(GiB)": 107.26, + "step": 32205, + "train_speed(iter/s)": 1.634364 + }, + { + "acc": 0.66012859, + "epoch": 0.8170979198376459, + "grad_norm": 5.78125, + "learning_rate": 6.878243133372882e-06, + "loss": 1.5649374, + "memory(GiB)": 107.26, + "step": 32210, + "train_speed(iter/s)": 1.634392 + }, + { + "acc": 0.65850859, + "epoch": 0.8172247590055809, + "grad_norm": 5.78125, + "learning_rate": 6.877271267322532e-06, + "loss": 1.62557621, + "memory(GiB)": 107.26, + "step": 32215, + "train_speed(iter/s)": 1.63442 + }, + { + "acc": 0.63739662, + "epoch": 0.817351598173516, + "grad_norm": 4.59375, + "learning_rate": 6.876299318701412e-06, + "loss": 1.66764774, + "memory(GiB)": 107.26, + "step": 32220, + "train_speed(iter/s)": 1.634447 + }, + { + "acc": 0.65172548, + "epoch": 0.8174784373414511, + "grad_norm": 5.8125, + "learning_rate": 6.875327287552269e-06, + "loss": 1.61338234, + "memory(GiB)": 107.26, + "step": 32225, + "train_speed(iter/s)": 1.634475 + }, + { + "acc": 0.6541935, + "epoch": 0.8176052765093861, + "grad_norm": 6.0625, + "learning_rate": 6.8743551739178615e-06, + "loss": 1.64148178, + "memory(GiB)": 107.26, + "step": 32230, + "train_speed(iter/s)": 1.634502 + }, + { + "acc": 0.64344125, + "epoch": 0.8177321156773212, + "grad_norm": 5.34375, + "learning_rate": 6.8733829778409425e-06, + "loss": 1.64630795, + "memory(GiB)": 107.26, + "step": 32235, + "train_speed(iter/s)": 1.634531 + }, + { + "acc": 0.66356535, + "epoch": 0.8178589548452562, + "grad_norm": 5.75, + "learning_rate": 6.872410699364278e-06, + "loss": 1.46752462, + "memory(GiB)": 107.26, + "step": 32240, + "train_speed(iter/s)": 1.634559 + }, + { + "acc": 0.65554724, + "epoch": 0.8179857940131913, + "grad_norm": 5.1875, + "learning_rate": 6.8714383385306305e-06, + "loss": 1.62728748, + "memory(GiB)": 107.26, + "step": 32245, + "train_speed(iter/s)": 1.634586 + }, + { + "acc": 0.66039906, + "epoch": 0.8181126331811264, + "grad_norm": 5.1875, + "learning_rate": 6.870465895382769e-06, + "loss": 1.54872265, + "memory(GiB)": 107.26, + "step": 32250, + "train_speed(iter/s)": 1.634614 + }, + { + "acc": 0.67040577, + "epoch": 0.8182394723490614, + "grad_norm": 5.53125, + "learning_rate": 6.869493369963468e-06, + "loss": 1.56728411, + "memory(GiB)": 107.26, + "step": 32255, + "train_speed(iter/s)": 1.634642 + }, + { + "acc": 0.66036167, + "epoch": 0.8183663115169965, + "grad_norm": 6.71875, + "learning_rate": 6.8685207623155e-06, + "loss": 1.61439857, + "memory(GiB)": 107.26, + "step": 32260, + "train_speed(iter/s)": 1.63467 + }, + { + "acc": 0.64325314, + "epoch": 0.8184931506849316, + "grad_norm": 5.1875, + "learning_rate": 6.867548072481649e-06, + "loss": 1.67407379, + "memory(GiB)": 107.26, + "step": 32265, + "train_speed(iter/s)": 1.634699 + }, + { + "acc": 0.63347797, + "epoch": 0.8186199898528665, + "grad_norm": 5.34375, + "learning_rate": 6.866575300504695e-06, + "loss": 1.61705627, + "memory(GiB)": 107.26, + "step": 32270, + "train_speed(iter/s)": 1.634723 + }, + { + "acc": 0.65691743, + "epoch": 0.8187468290208016, + "grad_norm": 7.5, + "learning_rate": 6.865602446427424e-06, + "loss": 1.62348709, + "memory(GiB)": 107.26, + "step": 32275, + "train_speed(iter/s)": 1.634752 + }, + { + "acc": 0.65422549, + "epoch": 0.8188736681887366, + "grad_norm": 6.8125, + "learning_rate": 6.864629510292629e-06, + "loss": 1.62963963, + "memory(GiB)": 107.26, + "step": 32280, + "train_speed(iter/s)": 1.634779 + }, + { + "acc": 0.66408701, + "epoch": 0.8190005073566717, + "grad_norm": 4.84375, + "learning_rate": 6.863656492143103e-06, + "loss": 1.56829777, + "memory(GiB)": 107.26, + "step": 32285, + "train_speed(iter/s)": 1.634803 + }, + { + "acc": 0.64758854, + "epoch": 0.8191273465246068, + "grad_norm": 8.1875, + "learning_rate": 6.862683392021644e-06, + "loss": 1.59552746, + "memory(GiB)": 107.26, + "step": 32290, + "train_speed(iter/s)": 1.634831 + }, + { + "acc": 0.66735768, + "epoch": 0.8192541856925418, + "grad_norm": 6.0, + "learning_rate": 6.861710209971052e-06, + "loss": 1.47986202, + "memory(GiB)": 107.26, + "step": 32295, + "train_speed(iter/s)": 1.634857 + }, + { + "acc": 0.64073963, + "epoch": 0.8193810248604769, + "grad_norm": 5.1875, + "learning_rate": 6.860736946034136e-06, + "loss": 1.66423645, + "memory(GiB)": 107.26, + "step": 32300, + "train_speed(iter/s)": 1.634884 + }, + { + "acc": 0.6388639, + "epoch": 0.819507864028412, + "grad_norm": 6.09375, + "learning_rate": 6.859763600253698e-06, + "loss": 1.72613449, + "memory(GiB)": 107.26, + "step": 32305, + "train_speed(iter/s)": 1.634911 + }, + { + "acc": 0.65564804, + "epoch": 0.819634703196347, + "grad_norm": 5.0, + "learning_rate": 6.858790172672556e-06, + "loss": 1.62072411, + "memory(GiB)": 107.26, + "step": 32310, + "train_speed(iter/s)": 1.634938 + }, + { + "acc": 0.66547508, + "epoch": 0.8197615423642821, + "grad_norm": 5.5625, + "learning_rate": 6.857816663333523e-06, + "loss": 1.57943888, + "memory(GiB)": 107.26, + "step": 32315, + "train_speed(iter/s)": 1.634967 + }, + { + "acc": 0.66568441, + "epoch": 0.8198883815322171, + "grad_norm": 7.625, + "learning_rate": 6.856843072279418e-06, + "loss": 1.53723431, + "memory(GiB)": 107.26, + "step": 32320, + "train_speed(iter/s)": 1.634995 + }, + { + "acc": 0.65385957, + "epoch": 0.8200152207001522, + "grad_norm": 5.15625, + "learning_rate": 6.855869399553065e-06, + "loss": 1.6625124, + "memory(GiB)": 107.26, + "step": 32325, + "train_speed(iter/s)": 1.635023 + }, + { + "acc": 0.65744534, + "epoch": 0.8201420598680873, + "grad_norm": 7.125, + "learning_rate": 6.85489564519729e-06, + "loss": 1.59037447, + "memory(GiB)": 107.26, + "step": 32330, + "train_speed(iter/s)": 1.635049 + }, + { + "acc": 0.66599569, + "epoch": 0.8202688990360223, + "grad_norm": 5.28125, + "learning_rate": 6.853921809254922e-06, + "loss": 1.59165907, + "memory(GiB)": 107.26, + "step": 32335, + "train_speed(iter/s)": 1.635076 + }, + { + "acc": 0.63831067, + "epoch": 0.8203957382039574, + "grad_norm": 6.96875, + "learning_rate": 6.852947891768796e-06, + "loss": 1.60572338, + "memory(GiB)": 107.26, + "step": 32340, + "train_speed(iter/s)": 1.635103 + }, + { + "acc": 0.67591271, + "epoch": 0.8205225773718925, + "grad_norm": 6.0625, + "learning_rate": 6.851973892781749e-06, + "loss": 1.50954609, + "memory(GiB)": 107.26, + "step": 32345, + "train_speed(iter/s)": 1.63513 + }, + { + "acc": 0.65335698, + "epoch": 0.8206494165398275, + "grad_norm": 5.84375, + "learning_rate": 6.850999812336623e-06, + "loss": 1.60876675, + "memory(GiB)": 107.26, + "step": 32350, + "train_speed(iter/s)": 1.635153 + }, + { + "acc": 0.64838223, + "epoch": 0.8207762557077626, + "grad_norm": 7.25, + "learning_rate": 6.850025650476259e-06, + "loss": 1.62331543, + "memory(GiB)": 107.26, + "step": 32355, + "train_speed(iter/s)": 1.63518 + }, + { + "acc": 0.6412725, + "epoch": 0.8209030948756976, + "grad_norm": 5.0, + "learning_rate": 6.849051407243509e-06, + "loss": 1.62683983, + "memory(GiB)": 107.26, + "step": 32360, + "train_speed(iter/s)": 1.635206 + }, + { + "acc": 0.64832191, + "epoch": 0.8210299340436327, + "grad_norm": 6.1875, + "learning_rate": 6.8480770826812205e-06, + "loss": 1.61713371, + "memory(GiB)": 107.26, + "step": 32365, + "train_speed(iter/s)": 1.635232 + }, + { + "acc": 0.64966068, + "epoch": 0.8211567732115678, + "grad_norm": 5.53125, + "learning_rate": 6.847102676832253e-06, + "loss": 1.64985809, + "memory(GiB)": 107.26, + "step": 32370, + "train_speed(iter/s)": 1.635258 + }, + { + "acc": 0.6488471, + "epoch": 0.8212836123795028, + "grad_norm": 6.09375, + "learning_rate": 6.8461281897394615e-06, + "loss": 1.62956848, + "memory(GiB)": 107.26, + "step": 32375, + "train_speed(iter/s)": 1.635287 + }, + { + "acc": 0.64167128, + "epoch": 0.8214104515474379, + "grad_norm": 5.90625, + "learning_rate": 6.845153621445711e-06, + "loss": 1.66543274, + "memory(GiB)": 107.26, + "step": 32380, + "train_speed(iter/s)": 1.635315 + }, + { + "acc": 0.65637875, + "epoch": 0.821537290715373, + "grad_norm": 5.6875, + "learning_rate": 6.844178971993866e-06, + "loss": 1.66299438, + "memory(GiB)": 107.26, + "step": 32385, + "train_speed(iter/s)": 1.635342 + }, + { + "acc": 0.66767187, + "epoch": 0.821664129883308, + "grad_norm": 6.1875, + "learning_rate": 6.843204241426797e-06, + "loss": 1.58016481, + "memory(GiB)": 107.26, + "step": 32390, + "train_speed(iter/s)": 1.635371 + }, + { + "acc": 0.6691226, + "epoch": 0.821790969051243, + "grad_norm": 4.78125, + "learning_rate": 6.842229429787375e-06, + "loss": 1.61401806, + "memory(GiB)": 107.26, + "step": 32395, + "train_speed(iter/s)": 1.635398 + }, + { + "acc": 0.65529542, + "epoch": 0.821917808219178, + "grad_norm": 5.21875, + "learning_rate": 6.841254537118477e-06, + "loss": 1.53234911, + "memory(GiB)": 107.26, + "step": 32400, + "train_speed(iter/s)": 1.635425 + }, + { + "acc": 0.64349184, + "epoch": 0.8220446473871131, + "grad_norm": 5.78125, + "learning_rate": 6.840279563462985e-06, + "loss": 1.69066753, + "memory(GiB)": 107.26, + "step": 32405, + "train_speed(iter/s)": 1.635455 + }, + { + "acc": 0.65027962, + "epoch": 0.8221714865550482, + "grad_norm": 5.6875, + "learning_rate": 6.839304508863781e-06, + "loss": 1.61943359, + "memory(GiB)": 107.26, + "step": 32410, + "train_speed(iter/s)": 1.635483 + }, + { + "acc": 0.66247911, + "epoch": 0.8222983257229832, + "grad_norm": 5.03125, + "learning_rate": 6.838329373363753e-06, + "loss": 1.63647079, + "memory(GiB)": 107.26, + "step": 32415, + "train_speed(iter/s)": 1.635511 + }, + { + "acc": 0.65687923, + "epoch": 0.8224251648909183, + "grad_norm": 5.15625, + "learning_rate": 6.8373541570057924e-06, + "loss": 1.61744423, + "memory(GiB)": 107.26, + "step": 32420, + "train_speed(iter/s)": 1.635539 + }, + { + "acc": 0.66247163, + "epoch": 0.8225520040588534, + "grad_norm": 6.15625, + "learning_rate": 6.836378859832791e-06, + "loss": 1.61967697, + "memory(GiB)": 107.26, + "step": 32425, + "train_speed(iter/s)": 1.635568 + }, + { + "acc": 0.64993491, + "epoch": 0.8226788432267884, + "grad_norm": 5.21875, + "learning_rate": 6.83540348188765e-06, + "loss": 1.60473518, + "memory(GiB)": 107.26, + "step": 32430, + "train_speed(iter/s)": 1.635597 + }, + { + "acc": 0.66220131, + "epoch": 0.8228056823947235, + "grad_norm": 7.15625, + "learning_rate": 6.834428023213268e-06, + "loss": 1.5645031, + "memory(GiB)": 107.26, + "step": 32435, + "train_speed(iter/s)": 1.635624 + }, + { + "acc": 0.64771023, + "epoch": 0.8229325215626585, + "grad_norm": 4.6875, + "learning_rate": 6.833452483852554e-06, + "loss": 1.5846118, + "memory(GiB)": 107.26, + "step": 32440, + "train_speed(iter/s)": 1.635654 + }, + { + "acc": 0.65924816, + "epoch": 0.8230593607305936, + "grad_norm": 7.84375, + "learning_rate": 6.832476863848411e-06, + "loss": 1.59638138, + "memory(GiB)": 107.26, + "step": 32445, + "train_speed(iter/s)": 1.63568 + }, + { + "acc": 0.64527903, + "epoch": 0.8231861998985287, + "grad_norm": 6.09375, + "learning_rate": 6.831501163243756e-06, + "loss": 1.67870884, + "memory(GiB)": 107.26, + "step": 32450, + "train_speed(iter/s)": 1.635706 + }, + { + "acc": 0.64573102, + "epoch": 0.8233130390664637, + "grad_norm": 8.0, + "learning_rate": 6.830525382081501e-06, + "loss": 1.62880192, + "memory(GiB)": 107.26, + "step": 32455, + "train_speed(iter/s)": 1.635733 + }, + { + "acc": 0.65216861, + "epoch": 0.8234398782343988, + "grad_norm": 4.96875, + "learning_rate": 6.829549520404568e-06, + "loss": 1.57188931, + "memory(GiB)": 107.26, + "step": 32460, + "train_speed(iter/s)": 1.635762 + }, + { + "acc": 0.66088529, + "epoch": 0.8235667174023339, + "grad_norm": 5.40625, + "learning_rate": 6.828573578255879e-06, + "loss": 1.55594711, + "memory(GiB)": 107.26, + "step": 32465, + "train_speed(iter/s)": 1.635787 + }, + { + "acc": 0.65349712, + "epoch": 0.8236935565702689, + "grad_norm": 5.25, + "learning_rate": 6.82759755567836e-06, + "loss": 1.59866133, + "memory(GiB)": 107.26, + "step": 32470, + "train_speed(iter/s)": 1.635816 + }, + { + "acc": 0.6498455, + "epoch": 0.823820395738204, + "grad_norm": 5.65625, + "learning_rate": 6.826621452714941e-06, + "loss": 1.66149826, + "memory(GiB)": 107.26, + "step": 32475, + "train_speed(iter/s)": 1.635845 + }, + { + "acc": 0.64923029, + "epoch": 0.823947234906139, + "grad_norm": 5.03125, + "learning_rate": 6.825645269408556e-06, + "loss": 1.62181969, + "memory(GiB)": 107.26, + "step": 32480, + "train_speed(iter/s)": 1.635872 + }, + { + "acc": 0.6503171, + "epoch": 0.8240740740740741, + "grad_norm": 6.125, + "learning_rate": 6.82466900580214e-06, + "loss": 1.58356457, + "memory(GiB)": 107.26, + "step": 32485, + "train_speed(iter/s)": 1.635899 + }, + { + "acc": 0.64487762, + "epoch": 0.8242009132420092, + "grad_norm": 5.46875, + "learning_rate": 6.823692661938634e-06, + "loss": 1.66829224, + "memory(GiB)": 107.26, + "step": 32490, + "train_speed(iter/s)": 1.635927 + }, + { + "acc": 0.65993452, + "epoch": 0.8243277524099442, + "grad_norm": 6.25, + "learning_rate": 6.822716237860984e-06, + "loss": 1.62499199, + "memory(GiB)": 107.26, + "step": 32495, + "train_speed(iter/s)": 1.635954 + }, + { + "acc": 0.66408768, + "epoch": 0.8244545915778793, + "grad_norm": 5.09375, + "learning_rate": 6.821739733612135e-06, + "loss": 1.56554947, + "memory(GiB)": 107.26, + "step": 32500, + "train_speed(iter/s)": 1.635983 + }, + { + "acc": 0.65803909, + "epoch": 0.8245814307458144, + "grad_norm": 6.21875, + "learning_rate": 6.820763149235039e-06, + "loss": 1.68456879, + "memory(GiB)": 107.26, + "step": 32505, + "train_speed(iter/s)": 1.636011 + }, + { + "acc": 0.65737128, + "epoch": 0.8247082699137493, + "grad_norm": 5.4375, + "learning_rate": 6.819786484772652e-06, + "loss": 1.58501978, + "memory(GiB)": 107.26, + "step": 32510, + "train_speed(iter/s)": 1.636039 + }, + { + "acc": 0.62374258, + "epoch": 0.8248351090816844, + "grad_norm": 5.53125, + "learning_rate": 6.8188097402679275e-06, + "loss": 1.72287846, + "memory(GiB)": 107.26, + "step": 32515, + "train_speed(iter/s)": 1.636065 + }, + { + "acc": 0.6654952, + "epoch": 0.8249619482496194, + "grad_norm": 4.9375, + "learning_rate": 6.817832915763833e-06, + "loss": 1.54363441, + "memory(GiB)": 107.26, + "step": 32520, + "train_speed(iter/s)": 1.636092 + }, + { + "acc": 0.64710436, + "epoch": 0.8250887874175545, + "grad_norm": 5.53125, + "learning_rate": 6.81685601130333e-06, + "loss": 1.63108559, + "memory(GiB)": 107.26, + "step": 32525, + "train_speed(iter/s)": 1.63612 + }, + { + "acc": 0.66746187, + "epoch": 0.8252156265854896, + "grad_norm": 6.71875, + "learning_rate": 6.8158790269293885e-06, + "loss": 1.55511694, + "memory(GiB)": 107.26, + "step": 32530, + "train_speed(iter/s)": 1.636149 + }, + { + "acc": 0.666292, + "epoch": 0.8253424657534246, + "grad_norm": 6.5, + "learning_rate": 6.8149019626849785e-06, + "loss": 1.53077421, + "memory(GiB)": 107.26, + "step": 32535, + "train_speed(iter/s)": 1.636177 + }, + { + "acc": 0.65224347, + "epoch": 0.8254693049213597, + "grad_norm": 5.28125, + "learning_rate": 6.813924818613079e-06, + "loss": 1.62263145, + "memory(GiB)": 107.26, + "step": 32540, + "train_speed(iter/s)": 1.636203 + }, + { + "acc": 0.66190138, + "epoch": 0.8255961440892948, + "grad_norm": 6.1875, + "learning_rate": 6.812947594756667e-06, + "loss": 1.62164688, + "memory(GiB)": 107.26, + "step": 32545, + "train_speed(iter/s)": 1.636231 + }, + { + "acc": 0.67309875, + "epoch": 0.8257229832572298, + "grad_norm": 5.59375, + "learning_rate": 6.811970291158725e-06, + "loss": 1.58454685, + "memory(GiB)": 107.26, + "step": 32550, + "train_speed(iter/s)": 1.636261 + }, + { + "acc": 0.65828438, + "epoch": 0.8258498224251649, + "grad_norm": 5.53125, + "learning_rate": 6.810992907862239e-06, + "loss": 1.60759621, + "memory(GiB)": 107.26, + "step": 32555, + "train_speed(iter/s)": 1.636287 + }, + { + "acc": 0.63931465, + "epoch": 0.8259766615930999, + "grad_norm": 6.5625, + "learning_rate": 6.810015444910202e-06, + "loss": 1.60354195, + "memory(GiB)": 107.26, + "step": 32560, + "train_speed(iter/s)": 1.636315 + }, + { + "acc": 0.66305399, + "epoch": 0.826103500761035, + "grad_norm": 6.3125, + "learning_rate": 6.809037902345603e-06, + "loss": 1.65512371, + "memory(GiB)": 107.26, + "step": 32565, + "train_speed(iter/s)": 1.636346 + }, + { + "acc": 0.65062537, + "epoch": 0.8262303399289701, + "grad_norm": 6.40625, + "learning_rate": 6.808060280211439e-06, + "loss": 1.59509916, + "memory(GiB)": 107.26, + "step": 32570, + "train_speed(iter/s)": 1.636374 + }, + { + "acc": 0.6655386, + "epoch": 0.8263571790969051, + "grad_norm": 5.65625, + "learning_rate": 6.807082578550713e-06, + "loss": 1.58064823, + "memory(GiB)": 107.26, + "step": 32575, + "train_speed(iter/s)": 1.636403 + }, + { + "acc": 0.6549489, + "epoch": 0.8264840182648402, + "grad_norm": 5.96875, + "learning_rate": 6.806104797406428e-06, + "loss": 1.60469551, + "memory(GiB)": 107.26, + "step": 32580, + "train_speed(iter/s)": 1.63643 + }, + { + "acc": 0.65230603, + "epoch": 0.8266108574327753, + "grad_norm": 5.90625, + "learning_rate": 6.805126936821588e-06, + "loss": 1.62425346, + "memory(GiB)": 107.26, + "step": 32585, + "train_speed(iter/s)": 1.636458 + }, + { + "acc": 0.65537229, + "epoch": 0.8267376966007103, + "grad_norm": 5.5625, + "learning_rate": 6.804148996839208e-06, + "loss": 1.61069221, + "memory(GiB)": 107.26, + "step": 32590, + "train_speed(iter/s)": 1.636486 + }, + { + "acc": 0.65936303, + "epoch": 0.8268645357686454, + "grad_norm": 5.59375, + "learning_rate": 6.803170977502298e-06, + "loss": 1.62829285, + "memory(GiB)": 107.26, + "step": 32595, + "train_speed(iter/s)": 1.636512 + }, + { + "acc": 0.67077489, + "epoch": 0.8269913749365804, + "grad_norm": 9.0, + "learning_rate": 6.802192878853879e-06, + "loss": 1.60500183, + "memory(GiB)": 107.26, + "step": 32600, + "train_speed(iter/s)": 1.636539 + }, + { + "acc": 0.65993624, + "epoch": 0.8271182141045155, + "grad_norm": 6.0, + "learning_rate": 6.801214700936972e-06, + "loss": 1.5348999, + "memory(GiB)": 107.26, + "step": 32605, + "train_speed(iter/s)": 1.636567 + }, + { + "acc": 0.65883837, + "epoch": 0.8272450532724506, + "grad_norm": 5.46875, + "learning_rate": 6.8002364437946e-06, + "loss": 1.58786488, + "memory(GiB)": 107.26, + "step": 32610, + "train_speed(iter/s)": 1.636594 + }, + { + "acc": 0.65503912, + "epoch": 0.8273718924403856, + "grad_norm": 6.0, + "learning_rate": 6.799258107469792e-06, + "loss": 1.64665565, + "memory(GiB)": 107.26, + "step": 32615, + "train_speed(iter/s)": 1.636622 + }, + { + "acc": 0.65461464, + "epoch": 0.8274987316083207, + "grad_norm": 4.40625, + "learning_rate": 6.798279692005578e-06, + "loss": 1.56424026, + "memory(GiB)": 107.26, + "step": 32620, + "train_speed(iter/s)": 1.636649 + }, + { + "acc": 0.66185098, + "epoch": 0.8276255707762558, + "grad_norm": 6.125, + "learning_rate": 6.7973011974449965e-06, + "loss": 1.55325537, + "memory(GiB)": 107.26, + "step": 32625, + "train_speed(iter/s)": 1.636678 + }, + { + "acc": 0.66082478, + "epoch": 0.8277524099441907, + "grad_norm": 8.5, + "learning_rate": 6.796322623831082e-06, + "loss": 1.55895348, + "memory(GiB)": 107.26, + "step": 32630, + "train_speed(iter/s)": 1.636708 + }, + { + "acc": 0.65573144, + "epoch": 0.8278792491121258, + "grad_norm": 6.75, + "learning_rate": 6.795343971206879e-06, + "loss": 1.55839272, + "memory(GiB)": 107.26, + "step": 32635, + "train_speed(iter/s)": 1.636738 + }, + { + "acc": 0.65737138, + "epoch": 0.8280060882800608, + "grad_norm": 11.75, + "learning_rate": 6.794365239615433e-06, + "loss": 1.57717695, + "memory(GiB)": 107.26, + "step": 32640, + "train_speed(iter/s)": 1.636765 + }, + { + "acc": 0.65624828, + "epoch": 0.8281329274479959, + "grad_norm": 4.75, + "learning_rate": 6.793386429099792e-06, + "loss": 1.60059948, + "memory(GiB)": 107.26, + "step": 32645, + "train_speed(iter/s)": 1.636792 + }, + { + "acc": 0.66085892, + "epoch": 0.828259766615931, + "grad_norm": 5.0, + "learning_rate": 6.79240753970301e-06, + "loss": 1.5855855, + "memory(GiB)": 107.26, + "step": 32650, + "train_speed(iter/s)": 1.636819 + }, + { + "acc": 0.64934101, + "epoch": 0.828386605783866, + "grad_norm": 5.125, + "learning_rate": 6.791428571468139e-06, + "loss": 1.64846573, + "memory(GiB)": 107.26, + "step": 32655, + "train_speed(iter/s)": 1.636847 + }, + { + "acc": 0.64298506, + "epoch": 0.8285134449518011, + "grad_norm": 6.6875, + "learning_rate": 6.7904495244382454e-06, + "loss": 1.65990829, + "memory(GiB)": 107.26, + "step": 32660, + "train_speed(iter/s)": 1.636875 + }, + { + "acc": 0.6483386, + "epoch": 0.8286402841197362, + "grad_norm": 5.5, + "learning_rate": 6.789470398656385e-06, + "loss": 1.67112026, + "memory(GiB)": 107.26, + "step": 32665, + "train_speed(iter/s)": 1.636903 + }, + { + "acc": 0.66024408, + "epoch": 0.8287671232876712, + "grad_norm": 5.84375, + "learning_rate": 6.788491194165629e-06, + "loss": 1.5963726, + "memory(GiB)": 107.26, + "step": 32670, + "train_speed(iter/s)": 1.636931 + }, + { + "acc": 0.67239919, + "epoch": 0.8288939624556063, + "grad_norm": 5.65625, + "learning_rate": 6.787511911009044e-06, + "loss": 1.51363029, + "memory(GiB)": 107.26, + "step": 32675, + "train_speed(iter/s)": 1.636959 + }, + { + "acc": 0.64193835, + "epoch": 0.8290208016235413, + "grad_norm": 5.34375, + "learning_rate": 6.786532549229704e-06, + "loss": 1.63559685, + "memory(GiB)": 107.26, + "step": 32680, + "train_speed(iter/s)": 1.636986 + }, + { + "acc": 0.66636362, + "epoch": 0.8291476407914764, + "grad_norm": 5.25, + "learning_rate": 6.785553108870686e-06, + "loss": 1.54751329, + "memory(GiB)": 107.26, + "step": 32685, + "train_speed(iter/s)": 1.637013 + }, + { + "acc": 0.64006414, + "epoch": 0.8292744799594115, + "grad_norm": 5.375, + "learning_rate": 6.784573589975072e-06, + "loss": 1.646315, + "memory(GiB)": 107.26, + "step": 32690, + "train_speed(iter/s)": 1.637041 + }, + { + "acc": 0.63784962, + "epoch": 0.8294013191273465, + "grad_norm": 6.34375, + "learning_rate": 6.783593992585943e-06, + "loss": 1.67931118, + "memory(GiB)": 107.26, + "step": 32695, + "train_speed(iter/s)": 1.637069 + }, + { + "acc": 0.65170984, + "epoch": 0.8295281582952816, + "grad_norm": 5.53125, + "learning_rate": 6.7826143167463876e-06, + "loss": 1.64780731, + "memory(GiB)": 107.26, + "step": 32700, + "train_speed(iter/s)": 1.637096 + }, + { + "acc": 0.65854573, + "epoch": 0.8296549974632167, + "grad_norm": 5.8125, + "learning_rate": 6.781634562499495e-06, + "loss": 1.64535923, + "memory(GiB)": 107.26, + "step": 32705, + "train_speed(iter/s)": 1.637121 + }, + { + "acc": 0.66159353, + "epoch": 0.8297818366311517, + "grad_norm": 6.03125, + "learning_rate": 6.780654729888361e-06, + "loss": 1.60204048, + "memory(GiB)": 107.26, + "step": 32710, + "train_speed(iter/s)": 1.637149 + }, + { + "acc": 0.65879107, + "epoch": 0.8299086757990868, + "grad_norm": 10.6875, + "learning_rate": 6.779674818956081e-06, + "loss": 1.62156944, + "memory(GiB)": 107.26, + "step": 32715, + "train_speed(iter/s)": 1.637177 + }, + { + "acc": 0.63922291, + "epoch": 0.8300355149670218, + "grad_norm": 6.71875, + "learning_rate": 6.778694829745756e-06, + "loss": 1.632094, + "memory(GiB)": 107.26, + "step": 32720, + "train_speed(iter/s)": 1.637204 + }, + { + "acc": 0.63975558, + "epoch": 0.8301623541349569, + "grad_norm": 5.28125, + "learning_rate": 6.777714762300492e-06, + "loss": 1.60052319, + "memory(GiB)": 107.26, + "step": 32725, + "train_speed(iter/s)": 1.637229 + }, + { + "acc": 0.66300774, + "epoch": 0.830289193302892, + "grad_norm": 5.5, + "learning_rate": 6.776734616663397e-06, + "loss": 1.56456165, + "memory(GiB)": 107.26, + "step": 32730, + "train_speed(iter/s)": 1.637256 + }, + { + "acc": 0.67374697, + "epoch": 0.830416032470827, + "grad_norm": 5.46875, + "learning_rate": 6.77575439287758e-06, + "loss": 1.47687073, + "memory(GiB)": 107.26, + "step": 32735, + "train_speed(iter/s)": 1.637282 + }, + { + "acc": 0.65440354, + "epoch": 0.8305428716387621, + "grad_norm": 5.875, + "learning_rate": 6.774774090986157e-06, + "loss": 1.6119854, + "memory(GiB)": 107.26, + "step": 32740, + "train_speed(iter/s)": 1.637309 + }, + { + "acc": 0.67413797, + "epoch": 0.8306697108066972, + "grad_norm": 5.21875, + "learning_rate": 6.773793711032244e-06, + "loss": 1.58809319, + "memory(GiB)": 107.26, + "step": 32745, + "train_speed(iter/s)": 1.637336 + }, + { + "acc": 0.6478025, + "epoch": 0.8307965499746321, + "grad_norm": 5.90625, + "learning_rate": 6.772813253058965e-06, + "loss": 1.63967094, + "memory(GiB)": 107.26, + "step": 32750, + "train_speed(iter/s)": 1.637362 + }, + { + "acc": 0.65996633, + "epoch": 0.8309233891425672, + "grad_norm": 6.78125, + "learning_rate": 6.771832717109444e-06, + "loss": 1.58591442, + "memory(GiB)": 107.26, + "step": 32755, + "train_speed(iter/s)": 1.63739 + }, + { + "acc": 0.65029159, + "epoch": 0.8310502283105022, + "grad_norm": 4.96875, + "learning_rate": 6.77085210322681e-06, + "loss": 1.64019451, + "memory(GiB)": 107.26, + "step": 32760, + "train_speed(iter/s)": 1.637418 + }, + { + "acc": 0.64071603, + "epoch": 0.8311770674784373, + "grad_norm": 5.5, + "learning_rate": 6.769871411454195e-06, + "loss": 1.63422089, + "memory(GiB)": 107.26, + "step": 32765, + "train_speed(iter/s)": 1.637445 + }, + { + "acc": 0.6322793, + "epoch": 0.8313039066463724, + "grad_norm": 5.4375, + "learning_rate": 6.768890641834732e-06, + "loss": 1.68215828, + "memory(GiB)": 107.26, + "step": 32770, + "train_speed(iter/s)": 1.637473 + }, + { + "acc": 0.64893618, + "epoch": 0.8314307458143074, + "grad_norm": 4.6875, + "learning_rate": 6.767909794411562e-06, + "loss": 1.61356106, + "memory(GiB)": 107.26, + "step": 32775, + "train_speed(iter/s)": 1.637499 + }, + { + "acc": 0.66691628, + "epoch": 0.8315575849822425, + "grad_norm": 8.0625, + "learning_rate": 6.7669288692278256e-06, + "loss": 1.6025034, + "memory(GiB)": 107.26, + "step": 32780, + "train_speed(iter/s)": 1.637526 + }, + { + "acc": 0.65464559, + "epoch": 0.8316844241501776, + "grad_norm": 5.96875, + "learning_rate": 6.76594786632667e-06, + "loss": 1.53782272, + "memory(GiB)": 107.26, + "step": 32785, + "train_speed(iter/s)": 1.637552 + }, + { + "acc": 0.65324879, + "epoch": 0.8318112633181126, + "grad_norm": 5.65625, + "learning_rate": 6.764966785751242e-06, + "loss": 1.6249649, + "memory(GiB)": 107.26, + "step": 32790, + "train_speed(iter/s)": 1.637579 + }, + { + "acc": 0.65566382, + "epoch": 0.8319381024860477, + "grad_norm": 5.15625, + "learning_rate": 6.763985627544693e-06, + "loss": 1.62783623, + "memory(GiB)": 107.26, + "step": 32795, + "train_speed(iter/s)": 1.637605 + }, + { + "acc": 0.64118962, + "epoch": 0.8320649416539827, + "grad_norm": 5.3125, + "learning_rate": 6.763004391750183e-06, + "loss": 1.63354149, + "memory(GiB)": 107.26, + "step": 32800, + "train_speed(iter/s)": 1.637632 + }, + { + "acc": 0.65647268, + "epoch": 0.8321917808219178, + "grad_norm": 7.28125, + "learning_rate": 6.762023078410867e-06, + "loss": 1.62805367, + "memory(GiB)": 107.26, + "step": 32805, + "train_speed(iter/s)": 1.637659 + }, + { + "acc": 0.65218029, + "epoch": 0.8323186199898529, + "grad_norm": 5.0625, + "learning_rate": 6.7610416875699095e-06, + "loss": 1.58924828, + "memory(GiB)": 107.26, + "step": 32810, + "train_speed(iter/s)": 1.637686 + }, + { + "acc": 0.6623064, + "epoch": 0.8324454591577879, + "grad_norm": 5.625, + "learning_rate": 6.760060219270476e-06, + "loss": 1.57786417, + "memory(GiB)": 107.26, + "step": 32815, + "train_speed(iter/s)": 1.637713 + }, + { + "acc": 0.66748066, + "epoch": 0.832572298325723, + "grad_norm": 5.53125, + "learning_rate": 6.759078673555736e-06, + "loss": 1.5939127, + "memory(GiB)": 107.26, + "step": 32820, + "train_speed(iter/s)": 1.637739 + }, + { + "acc": 0.65092163, + "epoch": 0.8326991374936581, + "grad_norm": 6.21875, + "learning_rate": 6.758097050468862e-06, + "loss": 1.61537991, + "memory(GiB)": 107.26, + "step": 32825, + "train_speed(iter/s)": 1.637767 + }, + { + "acc": 0.66326427, + "epoch": 0.8328259766615931, + "grad_norm": 5.625, + "learning_rate": 6.757115350053032e-06, + "loss": 1.63619499, + "memory(GiB)": 107.26, + "step": 32830, + "train_speed(iter/s)": 1.637793 + }, + { + "acc": 0.64975243, + "epoch": 0.8329528158295282, + "grad_norm": 4.875, + "learning_rate": 6.756133572351422e-06, + "loss": 1.60693474, + "memory(GiB)": 107.26, + "step": 32835, + "train_speed(iter/s)": 1.63782 + }, + { + "acc": 0.65787697, + "epoch": 0.8330796549974632, + "grad_norm": 5.75, + "learning_rate": 6.755151717407218e-06, + "loss": 1.574506, + "memory(GiB)": 107.26, + "step": 32840, + "train_speed(iter/s)": 1.637847 + }, + { + "acc": 0.64539862, + "epoch": 0.8332064941653983, + "grad_norm": 4.9375, + "learning_rate": 6.754169785263605e-06, + "loss": 1.67978916, + "memory(GiB)": 107.26, + "step": 32845, + "train_speed(iter/s)": 1.637875 + }, + { + "acc": 0.65806422, + "epoch": 0.8333333333333334, + "grad_norm": 6.34375, + "learning_rate": 6.753187775963773e-06, + "loss": 1.61088943, + "memory(GiB)": 107.26, + "step": 32850, + "train_speed(iter/s)": 1.637902 + }, + { + "acc": 0.66494823, + "epoch": 0.8334601725012684, + "grad_norm": 5.9375, + "learning_rate": 6.752205689550915e-06, + "loss": 1.53929348, + "memory(GiB)": 107.26, + "step": 32855, + "train_speed(iter/s)": 1.63793 + }, + { + "acc": 0.65732417, + "epoch": 0.8335870116692035, + "grad_norm": 5.15625, + "learning_rate": 6.751223526068228e-06, + "loss": 1.609342, + "memory(GiB)": 107.26, + "step": 32860, + "train_speed(iter/s)": 1.637957 + }, + { + "acc": 0.64417095, + "epoch": 0.8337138508371386, + "grad_norm": 5.28125, + "learning_rate": 6.75024128555891e-06, + "loss": 1.6455349, + "memory(GiB)": 107.26, + "step": 32865, + "train_speed(iter/s)": 1.637986 + }, + { + "acc": 0.65553541, + "epoch": 0.8338406900050735, + "grad_norm": 5.09375, + "learning_rate": 6.7492589680661695e-06, + "loss": 1.61569824, + "memory(GiB)": 107.26, + "step": 32870, + "train_speed(iter/s)": 1.638013 + }, + { + "acc": 0.64900298, + "epoch": 0.8339675291730086, + "grad_norm": 5.9375, + "learning_rate": 6.748276573633207e-06, + "loss": 1.66464977, + "memory(GiB)": 107.26, + "step": 32875, + "train_speed(iter/s)": 1.638041 + }, + { + "acc": 0.65366497, + "epoch": 0.8340943683409436, + "grad_norm": 5.34375, + "learning_rate": 6.747294102303237e-06, + "loss": 1.6072998, + "memory(GiB)": 107.26, + "step": 32880, + "train_speed(iter/s)": 1.638069 + }, + { + "acc": 0.66518197, + "epoch": 0.8342212075088787, + "grad_norm": 5.625, + "learning_rate": 6.746311554119469e-06, + "loss": 1.56165924, + "memory(GiB)": 107.26, + "step": 32885, + "train_speed(iter/s)": 1.638097 + }, + { + "acc": 0.66784906, + "epoch": 0.8343480466768138, + "grad_norm": 6.03125, + "learning_rate": 6.745328929125125e-06, + "loss": 1.54831572, + "memory(GiB)": 107.26, + "step": 32890, + "train_speed(iter/s)": 1.638124 + }, + { + "acc": 0.65774364, + "epoch": 0.8344748858447488, + "grad_norm": 6.21875, + "learning_rate": 6.7443462273634195e-06, + "loss": 1.54018421, + "memory(GiB)": 107.26, + "step": 32895, + "train_speed(iter/s)": 1.63815 + }, + { + "acc": 0.65409651, + "epoch": 0.8346017250126839, + "grad_norm": 6.09375, + "learning_rate": 6.74336344887758e-06, + "loss": 1.56877642, + "memory(GiB)": 107.26, + "step": 32900, + "train_speed(iter/s)": 1.638178 + }, + { + "acc": 0.65434484, + "epoch": 0.834728564180619, + "grad_norm": 5.90625, + "learning_rate": 6.742380593710834e-06, + "loss": 1.59168415, + "memory(GiB)": 107.26, + "step": 32905, + "train_speed(iter/s)": 1.638205 + }, + { + "acc": 0.64572687, + "epoch": 0.834855403348554, + "grad_norm": 5.96875, + "learning_rate": 6.7413976619064085e-06, + "loss": 1.64755211, + "memory(GiB)": 107.26, + "step": 32910, + "train_speed(iter/s)": 1.638232 + }, + { + "acc": 0.65875878, + "epoch": 0.8349822425164891, + "grad_norm": 5.71875, + "learning_rate": 6.74041465350754e-06, + "loss": 1.5391428, + "memory(GiB)": 107.26, + "step": 32915, + "train_speed(iter/s)": 1.638258 + }, + { + "acc": 0.66857843, + "epoch": 0.8351090816844241, + "grad_norm": 5.375, + "learning_rate": 6.739431568557464e-06, + "loss": 1.49730883, + "memory(GiB)": 107.26, + "step": 32920, + "train_speed(iter/s)": 1.638284 + }, + { + "acc": 0.6353159, + "epoch": 0.8352359208523592, + "grad_norm": 5.1875, + "learning_rate": 6.738448407099423e-06, + "loss": 1.69268703, + "memory(GiB)": 107.26, + "step": 32925, + "train_speed(iter/s)": 1.638312 + }, + { + "acc": 0.65779152, + "epoch": 0.8353627600202943, + "grad_norm": 6.1875, + "learning_rate": 6.737465169176658e-06, + "loss": 1.5484416, + "memory(GiB)": 107.26, + "step": 32930, + "train_speed(iter/s)": 1.638339 + }, + { + "acc": 0.65046997, + "epoch": 0.8354895991882293, + "grad_norm": 8.5625, + "learning_rate": 6.736481854832418e-06, + "loss": 1.63785591, + "memory(GiB)": 107.26, + "step": 32935, + "train_speed(iter/s)": 1.638365 + }, + { + "acc": 0.65530529, + "epoch": 0.8356164383561644, + "grad_norm": 6.5625, + "learning_rate": 6.735498464109953e-06, + "loss": 1.5823575, + "memory(GiB)": 107.26, + "step": 32940, + "train_speed(iter/s)": 1.638391 + }, + { + "acc": 0.65970244, + "epoch": 0.8357432775240995, + "grad_norm": 6.5625, + "learning_rate": 6.734514997052517e-06, + "loss": 1.54790878, + "memory(GiB)": 107.26, + "step": 32945, + "train_speed(iter/s)": 1.638418 + }, + { + "acc": 0.66891809, + "epoch": 0.8358701166920345, + "grad_norm": 8.25, + "learning_rate": 6.733531453703368e-06, + "loss": 1.60948639, + "memory(GiB)": 107.26, + "step": 32950, + "train_speed(iter/s)": 1.638444 + }, + { + "acc": 0.65748167, + "epoch": 0.8359969558599696, + "grad_norm": 7.125, + "learning_rate": 6.732547834105765e-06, + "loss": 1.53375082, + "memory(GiB)": 107.26, + "step": 32955, + "train_speed(iter/s)": 1.638471 + }, + { + "acc": 0.66838403, + "epoch": 0.8361237950279046, + "grad_norm": 5.96875, + "learning_rate": 6.731564138302975e-06, + "loss": 1.58501377, + "memory(GiB)": 107.26, + "step": 32960, + "train_speed(iter/s)": 1.638498 + }, + { + "acc": 0.66609545, + "epoch": 0.8362506341958397, + "grad_norm": 5.0, + "learning_rate": 6.730580366338261e-06, + "loss": 1.60643272, + "memory(GiB)": 107.26, + "step": 32965, + "train_speed(iter/s)": 1.638526 + }, + { + "acc": 0.66789684, + "epoch": 0.8363774733637748, + "grad_norm": 5.09375, + "learning_rate": 6.729596518254897e-06, + "loss": 1.55170212, + "memory(GiB)": 107.26, + "step": 32970, + "train_speed(iter/s)": 1.63855 + }, + { + "acc": 0.64494123, + "epoch": 0.8365043125317098, + "grad_norm": 7.1875, + "learning_rate": 6.728612594096155e-06, + "loss": 1.61461601, + "memory(GiB)": 107.26, + "step": 32975, + "train_speed(iter/s)": 1.638578 + }, + { + "acc": 0.63801928, + "epoch": 0.8366311516996449, + "grad_norm": 5.0, + "learning_rate": 6.727628593905315e-06, + "loss": 1.653965, + "memory(GiB)": 107.26, + "step": 32980, + "train_speed(iter/s)": 1.638604 + }, + { + "acc": 0.66122475, + "epoch": 0.83675799086758, + "grad_norm": 6.21875, + "learning_rate": 6.726644517725655e-06, + "loss": 1.60932159, + "memory(GiB)": 107.26, + "step": 32985, + "train_speed(iter/s)": 1.638631 + }, + { + "acc": 0.63274193, + "epoch": 0.836884830035515, + "grad_norm": 6.59375, + "learning_rate": 6.725660365600462e-06, + "loss": 1.68729858, + "memory(GiB)": 107.26, + "step": 32990, + "train_speed(iter/s)": 1.638658 + }, + { + "acc": 0.63724275, + "epoch": 0.83701166920345, + "grad_norm": 6.21875, + "learning_rate": 6.724676137573021e-06, + "loss": 1.64082508, + "memory(GiB)": 107.26, + "step": 32995, + "train_speed(iter/s)": 1.638684 + }, + { + "acc": 0.63562245, + "epoch": 0.837138508371385, + "grad_norm": 8.3125, + "learning_rate": 6.723691833686622e-06, + "loss": 1.63438835, + "memory(GiB)": 107.26, + "step": 33000, + "train_speed(iter/s)": 1.638711 + }, + { + "epoch": 0.837138508371385, + "eval_acc": 0.6457886174787045, + "eval_loss": 1.575567364692688, + "eval_runtime": 58.089, + "eval_samples_per_second": 109.659, + "eval_steps_per_second": 27.423, + "step": 33000 + }, + { + "acc": 0.65270977, + "epoch": 0.8372653475393201, + "grad_norm": 7.25, + "learning_rate": 6.722707453984561e-06, + "loss": 1.63201828, + "memory(GiB)": 107.26, + "step": 33005, + "train_speed(iter/s)": 1.633681 + }, + { + "acc": 0.67310781, + "epoch": 0.8373921867072552, + "grad_norm": 6.90625, + "learning_rate": 6.721722998510135e-06, + "loss": 1.63963547, + "memory(GiB)": 107.26, + "step": 33010, + "train_speed(iter/s)": 1.633708 + }, + { + "acc": 0.6590044, + "epoch": 0.8375190258751902, + "grad_norm": 5.1875, + "learning_rate": 6.720738467306644e-06, + "loss": 1.58788128, + "memory(GiB)": 107.26, + "step": 33015, + "train_speed(iter/s)": 1.633734 + }, + { + "acc": 0.66271214, + "epoch": 0.8376458650431253, + "grad_norm": 5.1875, + "learning_rate": 6.719753860417394e-06, + "loss": 1.63545761, + "memory(GiB)": 107.26, + "step": 33020, + "train_speed(iter/s)": 1.633759 + }, + { + "acc": 0.64273181, + "epoch": 0.8377727042110604, + "grad_norm": 6.71875, + "learning_rate": 6.718769177885689e-06, + "loss": 1.64766083, + "memory(GiB)": 107.26, + "step": 33025, + "train_speed(iter/s)": 1.633785 + }, + { + "acc": 0.67503633, + "epoch": 0.8378995433789954, + "grad_norm": 5.34375, + "learning_rate": 6.717784419754845e-06, + "loss": 1.50163164, + "memory(GiB)": 107.26, + "step": 33030, + "train_speed(iter/s)": 1.633812 + }, + { + "acc": 0.66373549, + "epoch": 0.8380263825469305, + "grad_norm": 5.78125, + "learning_rate": 6.71679958606817e-06, + "loss": 1.56779099, + "memory(GiB)": 107.26, + "step": 33035, + "train_speed(iter/s)": 1.63384 + }, + { + "acc": 0.63731375, + "epoch": 0.8381532217148655, + "grad_norm": 5.90625, + "learning_rate": 6.715814676868985e-06, + "loss": 1.66424999, + "memory(GiB)": 107.26, + "step": 33040, + "train_speed(iter/s)": 1.633867 + }, + { + "acc": 0.64177208, + "epoch": 0.8382800608828006, + "grad_norm": 5.09375, + "learning_rate": 6.714829692200611e-06, + "loss": 1.68111534, + "memory(GiB)": 107.26, + "step": 33045, + "train_speed(iter/s)": 1.633894 + }, + { + "acc": 0.64890814, + "epoch": 0.8384069000507357, + "grad_norm": 7.6875, + "learning_rate": 6.71384463210637e-06, + "loss": 1.67067566, + "memory(GiB)": 107.26, + "step": 33050, + "train_speed(iter/s)": 1.633919 + }, + { + "acc": 0.65133052, + "epoch": 0.8385337392186707, + "grad_norm": 5.25, + "learning_rate": 6.7128594966295904e-06, + "loss": 1.58246441, + "memory(GiB)": 107.26, + "step": 33055, + "train_speed(iter/s)": 1.63394 + }, + { + "acc": 0.66042347, + "epoch": 0.8386605783866058, + "grad_norm": 5.84375, + "learning_rate": 6.711874285813602e-06, + "loss": 1.56692257, + "memory(GiB)": 107.26, + "step": 33060, + "train_speed(iter/s)": 1.633965 + }, + { + "acc": 0.66584649, + "epoch": 0.8387874175545409, + "grad_norm": 4.96875, + "learning_rate": 6.710888999701741e-06, + "loss": 1.60463047, + "memory(GiB)": 107.26, + "step": 33065, + "train_speed(iter/s)": 1.633991 + }, + { + "acc": 0.66672449, + "epoch": 0.8389142567224759, + "grad_norm": 5.8125, + "learning_rate": 6.7099036383373425e-06, + "loss": 1.53181839, + "memory(GiB)": 107.26, + "step": 33070, + "train_speed(iter/s)": 1.634018 + }, + { + "acc": 0.64782906, + "epoch": 0.839041095890411, + "grad_norm": 5.8125, + "learning_rate": 6.708918201763748e-06, + "loss": 1.63721275, + "memory(GiB)": 107.26, + "step": 33075, + "train_speed(iter/s)": 1.634044 + }, + { + "acc": 0.6564826, + "epoch": 0.839167935058346, + "grad_norm": 5.09375, + "learning_rate": 6.707932690024302e-06, + "loss": 1.63065109, + "memory(GiB)": 107.26, + "step": 33080, + "train_speed(iter/s)": 1.634071 + }, + { + "acc": 0.64856415, + "epoch": 0.8392947742262811, + "grad_norm": 6.28125, + "learning_rate": 6.706947103162348e-06, + "loss": 1.57642059, + "memory(GiB)": 107.26, + "step": 33085, + "train_speed(iter/s)": 1.634099 + }, + { + "acc": 0.65623808, + "epoch": 0.8394216133942162, + "grad_norm": 6.09375, + "learning_rate": 6.7059614412212425e-06, + "loss": 1.64540348, + "memory(GiB)": 107.26, + "step": 33090, + "train_speed(iter/s)": 1.634127 + }, + { + "acc": 0.65889034, + "epoch": 0.8395484525621512, + "grad_norm": 5.3125, + "learning_rate": 6.704975704244334e-06, + "loss": 1.60552216, + "memory(GiB)": 107.26, + "step": 33095, + "train_speed(iter/s)": 1.634154 + }, + { + "acc": 0.65190573, + "epoch": 0.8396752917300863, + "grad_norm": 7.25, + "learning_rate": 6.703989892274985e-06, + "loss": 1.65951271, + "memory(GiB)": 107.26, + "step": 33100, + "train_speed(iter/s)": 1.634182 + }, + { + "acc": 0.65876555, + "epoch": 0.8398021308980214, + "grad_norm": 8.0625, + "learning_rate": 6.703004005356549e-06, + "loss": 1.55150394, + "memory(GiB)": 107.26, + "step": 33105, + "train_speed(iter/s)": 1.634209 + }, + { + "acc": 0.63346443, + "epoch": 0.8399289700659563, + "grad_norm": 7.78125, + "learning_rate": 6.7020180435323965e-06, + "loss": 1.65833244, + "memory(GiB)": 107.26, + "step": 33110, + "train_speed(iter/s)": 1.634239 + }, + { + "acc": 0.64021816, + "epoch": 0.8400558092338914, + "grad_norm": 5.9375, + "learning_rate": 6.701032006845889e-06, + "loss": 1.61048431, + "memory(GiB)": 107.26, + "step": 33115, + "train_speed(iter/s)": 1.634266 + }, + { + "acc": 0.66582642, + "epoch": 0.8401826484018264, + "grad_norm": 5.875, + "learning_rate": 6.700045895340401e-06, + "loss": 1.53527222, + "memory(GiB)": 107.26, + "step": 33120, + "train_speed(iter/s)": 1.634295 + }, + { + "acc": 0.65648694, + "epoch": 0.8403094875697615, + "grad_norm": 7.90625, + "learning_rate": 6.699059709059304e-06, + "loss": 1.64785976, + "memory(GiB)": 107.26, + "step": 33125, + "train_speed(iter/s)": 1.634324 + }, + { + "acc": 0.67328687, + "epoch": 0.8404363267376966, + "grad_norm": 5.34375, + "learning_rate": 6.698073448045975e-06, + "loss": 1.58423519, + "memory(GiB)": 107.26, + "step": 33130, + "train_speed(iter/s)": 1.634351 + }, + { + "acc": 0.6677012, + "epoch": 0.8405631659056316, + "grad_norm": 5.21875, + "learning_rate": 6.697087112343795e-06, + "loss": 1.5326704, + "memory(GiB)": 107.26, + "step": 33135, + "train_speed(iter/s)": 1.634379 + }, + { + "acc": 0.64167943, + "epoch": 0.8406900050735667, + "grad_norm": 5.9375, + "learning_rate": 6.696100701996146e-06, + "loss": 1.68892384, + "memory(GiB)": 107.26, + "step": 33140, + "train_speed(iter/s)": 1.634408 + }, + { + "acc": 0.66409349, + "epoch": 0.8408168442415018, + "grad_norm": 6.25, + "learning_rate": 6.6951142170464164e-06, + "loss": 1.55952644, + "memory(GiB)": 107.26, + "step": 33145, + "train_speed(iter/s)": 1.634436 + }, + { + "acc": 0.65505791, + "epoch": 0.8409436834094368, + "grad_norm": 5.65625, + "learning_rate": 6.694127657537995e-06, + "loss": 1.6306942, + "memory(GiB)": 107.26, + "step": 33150, + "train_speed(iter/s)": 1.634465 + }, + { + "acc": 0.64726124, + "epoch": 0.8410705225773719, + "grad_norm": 5.5, + "learning_rate": 6.693141023514276e-06, + "loss": 1.60471764, + "memory(GiB)": 107.26, + "step": 33155, + "train_speed(iter/s)": 1.634493 + }, + { + "acc": 0.64906836, + "epoch": 0.8411973617453069, + "grad_norm": 5.53125, + "learning_rate": 6.6921543150186555e-06, + "loss": 1.63876686, + "memory(GiB)": 107.26, + "step": 33160, + "train_speed(iter/s)": 1.63452 + }, + { + "acc": 0.66530533, + "epoch": 0.841324200913242, + "grad_norm": 6.8125, + "learning_rate": 6.691167532094531e-06, + "loss": 1.58795719, + "memory(GiB)": 107.26, + "step": 33165, + "train_speed(iter/s)": 1.634548 + }, + { + "acc": 0.66380329, + "epoch": 0.8414510400811771, + "grad_norm": 6.96875, + "learning_rate": 6.690180674785311e-06, + "loss": 1.66640186, + "memory(GiB)": 107.26, + "step": 33170, + "train_speed(iter/s)": 1.634577 + }, + { + "acc": 0.64290233, + "epoch": 0.8415778792491121, + "grad_norm": 5.4375, + "learning_rate": 6.689193743134397e-06, + "loss": 1.64339123, + "memory(GiB)": 107.26, + "step": 33175, + "train_speed(iter/s)": 1.634607 + }, + { + "acc": 0.65074568, + "epoch": 0.8417047184170472, + "grad_norm": 6.65625, + "learning_rate": 6.688206737185201e-06, + "loss": 1.61374569, + "memory(GiB)": 107.26, + "step": 33180, + "train_speed(iter/s)": 1.634636 + }, + { + "acc": 0.64724813, + "epoch": 0.8418315575849823, + "grad_norm": 6.21875, + "learning_rate": 6.687219656981135e-06, + "loss": 1.59716864, + "memory(GiB)": 107.26, + "step": 33185, + "train_speed(iter/s)": 1.634662 + }, + { + "acc": 0.67918143, + "epoch": 0.8419583967529173, + "grad_norm": 5.75, + "learning_rate": 6.686232502565616e-06, + "loss": 1.57895031, + "memory(GiB)": 107.26, + "step": 33190, + "train_speed(iter/s)": 1.634691 + }, + { + "acc": 0.65584588, + "epoch": 0.8420852359208524, + "grad_norm": 5.5625, + "learning_rate": 6.685245273982063e-06, + "loss": 1.64173012, + "memory(GiB)": 107.26, + "step": 33195, + "train_speed(iter/s)": 1.63472 + }, + { + "acc": 0.63902512, + "epoch": 0.8422120750887874, + "grad_norm": 5.875, + "learning_rate": 6.684257971273899e-06, + "loss": 1.72377224, + "memory(GiB)": 107.26, + "step": 33200, + "train_speed(iter/s)": 1.634748 + }, + { + "acc": 0.66817207, + "epoch": 0.8423389142567225, + "grad_norm": 5.1875, + "learning_rate": 6.68327059448455e-06, + "loss": 1.57995291, + "memory(GiB)": 107.26, + "step": 33205, + "train_speed(iter/s)": 1.634776 + }, + { + "acc": 0.66396513, + "epoch": 0.8424657534246576, + "grad_norm": 7.1875, + "learning_rate": 6.682283143657444e-06, + "loss": 1.54377804, + "memory(GiB)": 107.26, + "step": 33210, + "train_speed(iter/s)": 1.634806 + }, + { + "acc": 0.65219955, + "epoch": 0.8425925925925926, + "grad_norm": 7.78125, + "learning_rate": 6.681295618836015e-06, + "loss": 1.67003822, + "memory(GiB)": 107.26, + "step": 33215, + "train_speed(iter/s)": 1.634836 + }, + { + "acc": 0.64577007, + "epoch": 0.8427194317605277, + "grad_norm": 5.28125, + "learning_rate": 6.680308020063699e-06, + "loss": 1.61047211, + "memory(GiB)": 107.26, + "step": 33220, + "train_speed(iter/s)": 1.634864 + }, + { + "acc": 0.65421352, + "epoch": 0.8428462709284628, + "grad_norm": 5.9375, + "learning_rate": 6.679320347383933e-06, + "loss": 1.63300457, + "memory(GiB)": 107.26, + "step": 33225, + "train_speed(iter/s)": 1.634891 + }, + { + "acc": 0.6567976, + "epoch": 0.8429731100963977, + "grad_norm": 7.1875, + "learning_rate": 6.678332600840161e-06, + "loss": 1.63503456, + "memory(GiB)": 107.26, + "step": 33230, + "train_speed(iter/s)": 1.634919 + }, + { + "acc": 0.67380571, + "epoch": 0.8430999492643328, + "grad_norm": 5.625, + "learning_rate": 6.677344780475827e-06, + "loss": 1.53659, + "memory(GiB)": 107.26, + "step": 33235, + "train_speed(iter/s)": 1.634949 + }, + { + "acc": 0.66985335, + "epoch": 0.8432267884322678, + "grad_norm": 5.09375, + "learning_rate": 6.676356886334383e-06, + "loss": 1.56984911, + "memory(GiB)": 107.26, + "step": 33240, + "train_speed(iter/s)": 1.634976 + }, + { + "acc": 0.6458457, + "epoch": 0.8433536276002029, + "grad_norm": 5.90625, + "learning_rate": 6.675368918459276e-06, + "loss": 1.65674934, + "memory(GiB)": 107.26, + "step": 33245, + "train_speed(iter/s)": 1.635005 + }, + { + "acc": 0.67723303, + "epoch": 0.843480466768138, + "grad_norm": 5.09375, + "learning_rate": 6.674380876893967e-06, + "loss": 1.52234802, + "memory(GiB)": 107.26, + "step": 33250, + "train_speed(iter/s)": 1.635035 + }, + { + "acc": 0.65861788, + "epoch": 0.843607305936073, + "grad_norm": 7.34375, + "learning_rate": 6.673392761681908e-06, + "loss": 1.56752672, + "memory(GiB)": 107.26, + "step": 33255, + "train_speed(iter/s)": 1.635063 + }, + { + "acc": 0.66966267, + "epoch": 0.8437341451040081, + "grad_norm": 5.34375, + "learning_rate": 6.672404572866566e-06, + "loss": 1.55277596, + "memory(GiB)": 107.26, + "step": 33260, + "train_speed(iter/s)": 1.635091 + }, + { + "acc": 0.66005096, + "epoch": 0.8438609842719432, + "grad_norm": 5.28125, + "learning_rate": 6.671416310491406e-06, + "loss": 1.53709698, + "memory(GiB)": 107.26, + "step": 33265, + "train_speed(iter/s)": 1.635115 + }, + { + "acc": 0.64196749, + "epoch": 0.8439878234398782, + "grad_norm": 5.96875, + "learning_rate": 6.670427974599891e-06, + "loss": 1.69622765, + "memory(GiB)": 107.26, + "step": 33270, + "train_speed(iter/s)": 1.635143 + }, + { + "acc": 0.65786552, + "epoch": 0.8441146626078133, + "grad_norm": 7.8125, + "learning_rate": 6.669439565235498e-06, + "loss": 1.64209785, + "memory(GiB)": 107.26, + "step": 33275, + "train_speed(iter/s)": 1.635171 + }, + { + "acc": 0.6553998, + "epoch": 0.8442415017757483, + "grad_norm": 5.9375, + "learning_rate": 6.668451082441698e-06, + "loss": 1.61554947, + "memory(GiB)": 107.26, + "step": 33280, + "train_speed(iter/s)": 1.6352 + }, + { + "acc": 0.63697472, + "epoch": 0.8443683409436834, + "grad_norm": 5.875, + "learning_rate": 6.667462526261972e-06, + "loss": 1.66026955, + "memory(GiB)": 107.26, + "step": 33285, + "train_speed(iter/s)": 1.635228 + }, + { + "acc": 0.66530867, + "epoch": 0.8444951801116185, + "grad_norm": 4.59375, + "learning_rate": 6.666473896739798e-06, + "loss": 1.57641735, + "memory(GiB)": 107.26, + "step": 33290, + "train_speed(iter/s)": 1.635256 + }, + { + "acc": 0.65954456, + "epoch": 0.8446220192795535, + "grad_norm": 6.09375, + "learning_rate": 6.665485193918663e-06, + "loss": 1.59219036, + "memory(GiB)": 107.26, + "step": 33295, + "train_speed(iter/s)": 1.635284 + }, + { + "acc": 0.64143925, + "epoch": 0.8447488584474886, + "grad_norm": 6.0625, + "learning_rate": 6.664496417842053e-06, + "loss": 1.61944141, + "memory(GiB)": 107.26, + "step": 33300, + "train_speed(iter/s)": 1.635314 + }, + { + "acc": 0.6585248, + "epoch": 0.8448756976154237, + "grad_norm": 4.6875, + "learning_rate": 6.6635075685534566e-06, + "loss": 1.56756182, + "memory(GiB)": 107.26, + "step": 33305, + "train_speed(iter/s)": 1.635342 + }, + { + "acc": 0.64922209, + "epoch": 0.8450025367833587, + "grad_norm": 7.25, + "learning_rate": 6.662518646096374e-06, + "loss": 1.63727036, + "memory(GiB)": 107.26, + "step": 33310, + "train_speed(iter/s)": 1.635371 + }, + { + "acc": 0.65412908, + "epoch": 0.8451293759512938, + "grad_norm": 4.8125, + "learning_rate": 6.661529650514296e-06, + "loss": 1.61124172, + "memory(GiB)": 107.26, + "step": 33315, + "train_speed(iter/s)": 1.635397 + }, + { + "acc": 0.64485292, + "epoch": 0.8452562151192288, + "grad_norm": 5.40625, + "learning_rate": 6.6605405818507274e-06, + "loss": 1.64717197, + "memory(GiB)": 107.26, + "step": 33320, + "train_speed(iter/s)": 1.635425 + }, + { + "acc": 0.64504681, + "epoch": 0.8453830542871639, + "grad_norm": 6.0, + "learning_rate": 6.659551440149169e-06, + "loss": 1.59644623, + "memory(GiB)": 107.26, + "step": 33325, + "train_speed(iter/s)": 1.635451 + }, + { + "acc": 0.66047468, + "epoch": 0.845509893455099, + "grad_norm": 6.125, + "learning_rate": 6.65856222545313e-06, + "loss": 1.58540964, + "memory(GiB)": 107.26, + "step": 33330, + "train_speed(iter/s)": 1.635478 + }, + { + "acc": 0.65453253, + "epoch": 0.845636732623034, + "grad_norm": 4.65625, + "learning_rate": 6.657572937806118e-06, + "loss": 1.6203083, + "memory(GiB)": 107.26, + "step": 33335, + "train_speed(iter/s)": 1.635504 + }, + { + "acc": 0.66870689, + "epoch": 0.8457635717909691, + "grad_norm": 5.0, + "learning_rate": 6.656583577251649e-06, + "loss": 1.50935307, + "memory(GiB)": 107.26, + "step": 33340, + "train_speed(iter/s)": 1.63553 + }, + { + "acc": 0.63879795, + "epoch": 0.8458904109589042, + "grad_norm": 4.9375, + "learning_rate": 6.655594143833237e-06, + "loss": 1.66118393, + "memory(GiB)": 107.26, + "step": 33345, + "train_speed(iter/s)": 1.635559 + }, + { + "acc": 0.65120659, + "epoch": 0.8460172501268391, + "grad_norm": 5.65625, + "learning_rate": 6.654604637594404e-06, + "loss": 1.60661087, + "memory(GiB)": 107.26, + "step": 33350, + "train_speed(iter/s)": 1.635587 + }, + { + "acc": 0.64487476, + "epoch": 0.8461440892947742, + "grad_norm": 5.75, + "learning_rate": 6.653615058578672e-06, + "loss": 1.6541954, + "memory(GiB)": 107.26, + "step": 33355, + "train_speed(iter/s)": 1.635615 + }, + { + "acc": 0.67058682, + "epoch": 0.8462709284627092, + "grad_norm": 5.59375, + "learning_rate": 6.652625406829566e-06, + "loss": 1.56716003, + "memory(GiB)": 107.26, + "step": 33360, + "train_speed(iter/s)": 1.635644 + }, + { + "acc": 0.65444999, + "epoch": 0.8463977676306443, + "grad_norm": 7.125, + "learning_rate": 6.651635682390616e-06, + "loss": 1.56987514, + "memory(GiB)": 107.26, + "step": 33365, + "train_speed(iter/s)": 1.63567 + }, + { + "acc": 0.66086345, + "epoch": 0.8465246067985794, + "grad_norm": 6.03125, + "learning_rate": 6.650645885305356e-06, + "loss": 1.57297497, + "memory(GiB)": 107.26, + "step": 33370, + "train_speed(iter/s)": 1.635697 + }, + { + "acc": 0.64320002, + "epoch": 0.8466514459665144, + "grad_norm": 8.625, + "learning_rate": 6.649656015617319e-06, + "loss": 1.68561897, + "memory(GiB)": 107.26, + "step": 33375, + "train_speed(iter/s)": 1.635724 + }, + { + "acc": 0.65072069, + "epoch": 0.8467782851344495, + "grad_norm": 5.03125, + "learning_rate": 6.648666073370046e-06, + "loss": 1.58533573, + "memory(GiB)": 107.26, + "step": 33380, + "train_speed(iter/s)": 1.635749 + }, + { + "acc": 0.65497293, + "epoch": 0.8469051243023846, + "grad_norm": 5.25, + "learning_rate": 6.647676058607076e-06, + "loss": 1.62276821, + "memory(GiB)": 107.26, + "step": 33385, + "train_speed(iter/s)": 1.635775 + }, + { + "acc": 0.65208297, + "epoch": 0.8470319634703196, + "grad_norm": 5.84375, + "learning_rate": 6.64668597137196e-06, + "loss": 1.6559433, + "memory(GiB)": 107.26, + "step": 33390, + "train_speed(iter/s)": 1.635803 + }, + { + "acc": 0.66810999, + "epoch": 0.8471588026382547, + "grad_norm": 6.625, + "learning_rate": 6.645695811708241e-06, + "loss": 1.59134617, + "memory(GiB)": 107.26, + "step": 33395, + "train_speed(iter/s)": 1.635829 + }, + { + "acc": 0.66600256, + "epoch": 0.8472856418061897, + "grad_norm": 5.25, + "learning_rate": 6.644705579659474e-06, + "loss": 1.60210209, + "memory(GiB)": 107.26, + "step": 33400, + "train_speed(iter/s)": 1.635857 + }, + { + "acc": 0.65742593, + "epoch": 0.8474124809741248, + "grad_norm": 6.53125, + "learning_rate": 6.643715275269212e-06, + "loss": 1.62654228, + "memory(GiB)": 107.26, + "step": 33405, + "train_speed(iter/s)": 1.635884 + }, + { + "acc": 0.65765524, + "epoch": 0.8475393201420599, + "grad_norm": 6.3125, + "learning_rate": 6.642724898581013e-06, + "loss": 1.62223473, + "memory(GiB)": 107.26, + "step": 33410, + "train_speed(iter/s)": 1.635912 + }, + { + "acc": 0.66433859, + "epoch": 0.8476661593099949, + "grad_norm": 5.90625, + "learning_rate": 6.6417344496384394e-06, + "loss": 1.57372742, + "memory(GiB)": 107.26, + "step": 33415, + "train_speed(iter/s)": 1.635938 + }, + { + "acc": 0.62334013, + "epoch": 0.84779299847793, + "grad_norm": 4.53125, + "learning_rate": 6.640743928485054e-06, + "loss": 1.68893623, + "memory(GiB)": 107.26, + "step": 33420, + "train_speed(iter/s)": 1.635964 + }, + { + "acc": 0.65648451, + "epoch": 0.8479198376458651, + "grad_norm": 8.1875, + "learning_rate": 6.639753335164426e-06, + "loss": 1.66761837, + "memory(GiB)": 107.26, + "step": 33425, + "train_speed(iter/s)": 1.635992 + }, + { + "acc": 0.65545902, + "epoch": 0.8480466768138001, + "grad_norm": 6.1875, + "learning_rate": 6.638762669720126e-06, + "loss": 1.61317215, + "memory(GiB)": 107.26, + "step": 33430, + "train_speed(iter/s)": 1.636019 + }, + { + "acc": 0.67150221, + "epoch": 0.8481735159817352, + "grad_norm": 6.34375, + "learning_rate": 6.637771932195726e-06, + "loss": 1.53342361, + "memory(GiB)": 107.26, + "step": 33435, + "train_speed(iter/s)": 1.636047 + }, + { + "acc": 0.67308769, + "epoch": 0.8483003551496702, + "grad_norm": 5.21875, + "learning_rate": 6.636781122634804e-06, + "loss": 1.57959232, + "memory(GiB)": 107.26, + "step": 33440, + "train_speed(iter/s)": 1.636074 + }, + { + "acc": 0.67521672, + "epoch": 0.8484271943176053, + "grad_norm": 4.90625, + "learning_rate": 6.635790241080941e-06, + "loss": 1.44894133, + "memory(GiB)": 107.26, + "step": 33445, + "train_speed(iter/s)": 1.636102 + }, + { + "acc": 0.65384202, + "epoch": 0.8485540334855404, + "grad_norm": 5.875, + "learning_rate": 6.634799287577721e-06, + "loss": 1.59544468, + "memory(GiB)": 107.26, + "step": 33450, + "train_speed(iter/s)": 1.636128 + }, + { + "acc": 0.66716232, + "epoch": 0.8486808726534754, + "grad_norm": 6.25, + "learning_rate": 6.6338082621687286e-06, + "loss": 1.53411179, + "memory(GiB)": 107.26, + "step": 33455, + "train_speed(iter/s)": 1.636156 + }, + { + "acc": 0.64660311, + "epoch": 0.8488077118214105, + "grad_norm": 8.75, + "learning_rate": 6.6328171648975545e-06, + "loss": 1.68349113, + "memory(GiB)": 107.26, + "step": 33460, + "train_speed(iter/s)": 1.636183 + }, + { + "acc": 0.66802964, + "epoch": 0.8489345509893456, + "grad_norm": 6.25, + "learning_rate": 6.63182599580779e-06, + "loss": 1.53537884, + "memory(GiB)": 107.26, + "step": 33465, + "train_speed(iter/s)": 1.63621 + }, + { + "acc": 0.66363921, + "epoch": 0.8490613901572805, + "grad_norm": 5.21875, + "learning_rate": 6.630834754943036e-06, + "loss": 1.58770275, + "memory(GiB)": 107.26, + "step": 33470, + "train_speed(iter/s)": 1.636238 + }, + { + "acc": 0.65645103, + "epoch": 0.8491882293252156, + "grad_norm": 6.5, + "learning_rate": 6.629843442346886e-06, + "loss": 1.6131958, + "memory(GiB)": 107.26, + "step": 33475, + "train_speed(iter/s)": 1.636266 + }, + { + "acc": 0.66149659, + "epoch": 0.8493150684931506, + "grad_norm": 6.59375, + "learning_rate": 6.628852058062944e-06, + "loss": 1.6268137, + "memory(GiB)": 107.26, + "step": 33480, + "train_speed(iter/s)": 1.636292 + }, + { + "acc": 0.63187261, + "epoch": 0.8494419076610857, + "grad_norm": 5.59375, + "learning_rate": 6.627860602134818e-06, + "loss": 1.74686012, + "memory(GiB)": 107.26, + "step": 33485, + "train_speed(iter/s)": 1.636318 + }, + { + "acc": 0.64392219, + "epoch": 0.8495687468290208, + "grad_norm": 4.90625, + "learning_rate": 6.626869074606113e-06, + "loss": 1.64276581, + "memory(GiB)": 107.26, + "step": 33490, + "train_speed(iter/s)": 1.636346 + }, + { + "acc": 0.63419681, + "epoch": 0.8496955859969558, + "grad_norm": 5.40625, + "learning_rate": 6.625877475520445e-06, + "loss": 1.67645416, + "memory(GiB)": 107.26, + "step": 33495, + "train_speed(iter/s)": 1.636373 + }, + { + "acc": 0.66426578, + "epoch": 0.8498224251648909, + "grad_norm": 6.09375, + "learning_rate": 6.624885804921425e-06, + "loss": 1.49502258, + "memory(GiB)": 107.26, + "step": 33500, + "train_speed(iter/s)": 1.636399 + }, + { + "acc": 0.65908689, + "epoch": 0.849949264332826, + "grad_norm": 6.90625, + "learning_rate": 6.623894062852673e-06, + "loss": 1.55287924, + "memory(GiB)": 107.26, + "step": 33505, + "train_speed(iter/s)": 1.636424 + }, + { + "acc": 0.67805767, + "epoch": 0.850076103500761, + "grad_norm": 4.96875, + "learning_rate": 6.62290224935781e-06, + "loss": 1.53019915, + "memory(GiB)": 107.26, + "step": 33510, + "train_speed(iter/s)": 1.636451 + }, + { + "acc": 0.65787125, + "epoch": 0.8502029426686961, + "grad_norm": 4.625, + "learning_rate": 6.621910364480461e-06, + "loss": 1.57635317, + "memory(GiB)": 107.26, + "step": 33515, + "train_speed(iter/s)": 1.636478 + }, + { + "acc": 0.65534821, + "epoch": 0.8503297818366311, + "grad_norm": 5.8125, + "learning_rate": 6.620918408264252e-06, + "loss": 1.66582336, + "memory(GiB)": 107.26, + "step": 33520, + "train_speed(iter/s)": 1.636505 + }, + { + "acc": 0.64076109, + "epoch": 0.8504566210045662, + "grad_norm": 5.28125, + "learning_rate": 6.6199263807528136e-06, + "loss": 1.69195633, + "memory(GiB)": 107.26, + "step": 33525, + "train_speed(iter/s)": 1.636531 + }, + { + "acc": 0.65869346, + "epoch": 0.8505834601725013, + "grad_norm": 5.90625, + "learning_rate": 6.618934281989783e-06, + "loss": 1.64132214, + "memory(GiB)": 107.26, + "step": 33530, + "train_speed(iter/s)": 1.63656 + }, + { + "acc": 0.65224018, + "epoch": 0.8507102993404363, + "grad_norm": 5.6875, + "learning_rate": 6.6179421120187915e-06, + "loss": 1.56616573, + "memory(GiB)": 107.26, + "step": 33535, + "train_speed(iter/s)": 1.636586 + }, + { + "acc": 0.67532496, + "epoch": 0.8508371385083714, + "grad_norm": 5.90625, + "learning_rate": 6.616949870883486e-06, + "loss": 1.51041489, + "memory(GiB)": 107.26, + "step": 33540, + "train_speed(iter/s)": 1.636614 + }, + { + "acc": 0.64911766, + "epoch": 0.8509639776763065, + "grad_norm": 6.21875, + "learning_rate": 6.615957558627503e-06, + "loss": 1.66333809, + "memory(GiB)": 107.26, + "step": 33545, + "train_speed(iter/s)": 1.63664 + }, + { + "acc": 0.65912933, + "epoch": 0.8510908168442415, + "grad_norm": 5.8125, + "learning_rate": 6.6149651752944945e-06, + "loss": 1.57878094, + "memory(GiB)": 107.26, + "step": 33550, + "train_speed(iter/s)": 1.636668 + }, + { + "acc": 0.64721432, + "epoch": 0.8512176560121766, + "grad_norm": 6.0, + "learning_rate": 6.613972720928105e-06, + "loss": 1.61661415, + "memory(GiB)": 107.26, + "step": 33555, + "train_speed(iter/s)": 1.636696 + }, + { + "acc": 0.65262809, + "epoch": 0.8513444951801116, + "grad_norm": 6.4375, + "learning_rate": 6.61298019557199e-06, + "loss": 1.57162457, + "memory(GiB)": 107.26, + "step": 33560, + "train_speed(iter/s)": 1.636723 + }, + { + "acc": 0.67586737, + "epoch": 0.8514713343480467, + "grad_norm": 4.90625, + "learning_rate": 6.6119875992698045e-06, + "loss": 1.52930984, + "memory(GiB)": 107.26, + "step": 33565, + "train_speed(iter/s)": 1.636749 + }, + { + "acc": 0.66140804, + "epoch": 0.8515981735159818, + "grad_norm": 5.8125, + "learning_rate": 6.610994932065207e-06, + "loss": 1.5717617, + "memory(GiB)": 107.26, + "step": 33570, + "train_speed(iter/s)": 1.636776 + }, + { + "acc": 0.6553977, + "epoch": 0.8517250126839168, + "grad_norm": 5.65625, + "learning_rate": 6.610002194001861e-06, + "loss": 1.62787571, + "memory(GiB)": 107.26, + "step": 33575, + "train_speed(iter/s)": 1.636802 + }, + { + "acc": 0.66403141, + "epoch": 0.8518518518518519, + "grad_norm": 6.6875, + "learning_rate": 6.609009385123429e-06, + "loss": 1.52281885, + "memory(GiB)": 107.26, + "step": 33580, + "train_speed(iter/s)": 1.636829 + }, + { + "acc": 0.66344795, + "epoch": 0.851978691019787, + "grad_norm": 6.0625, + "learning_rate": 6.608016505473582e-06, + "loss": 1.57456589, + "memory(GiB)": 107.26, + "step": 33585, + "train_speed(iter/s)": 1.636856 + }, + { + "acc": 0.65338573, + "epoch": 0.852105530187722, + "grad_norm": 6.21875, + "learning_rate": 6.60702355509599e-06, + "loss": 1.58700981, + "memory(GiB)": 107.26, + "step": 33590, + "train_speed(iter/s)": 1.636882 + }, + { + "acc": 0.65575476, + "epoch": 0.852232369355657, + "grad_norm": 5.28125, + "learning_rate": 6.606030534034326e-06, + "loss": 1.61918526, + "memory(GiB)": 107.26, + "step": 33595, + "train_speed(iter/s)": 1.636907 + }, + { + "acc": 0.64912591, + "epoch": 0.852359208523592, + "grad_norm": 5.8125, + "learning_rate": 6.6050374423322685e-06, + "loss": 1.67333641, + "memory(GiB)": 107.26, + "step": 33600, + "train_speed(iter/s)": 1.636934 + }, + { + "acc": 0.63515944, + "epoch": 0.8524860476915271, + "grad_norm": 6.15625, + "learning_rate": 6.604044280033498e-06, + "loss": 1.67019806, + "memory(GiB)": 107.26, + "step": 33605, + "train_speed(iter/s)": 1.636962 + }, + { + "acc": 0.6477654, + "epoch": 0.8526128868594622, + "grad_norm": 6.8125, + "learning_rate": 6.6030510471817e-06, + "loss": 1.66130009, + "memory(GiB)": 107.26, + "step": 33610, + "train_speed(iter/s)": 1.636991 + }, + { + "acc": 0.6302866, + "epoch": 0.8527397260273972, + "grad_norm": 5.09375, + "learning_rate": 6.602057743820558e-06, + "loss": 1.69997482, + "memory(GiB)": 107.26, + "step": 33615, + "train_speed(iter/s)": 1.637018 + }, + { + "acc": 0.66915846, + "epoch": 0.8528665651953323, + "grad_norm": 8.5625, + "learning_rate": 6.601064369993766e-06, + "loss": 1.52528439, + "memory(GiB)": 107.26, + "step": 33620, + "train_speed(iter/s)": 1.637045 + }, + { + "acc": 0.64886618, + "epoch": 0.8529934043632674, + "grad_norm": 5.4375, + "learning_rate": 6.600070925745012e-06, + "loss": 1.59449921, + "memory(GiB)": 107.26, + "step": 33625, + "train_speed(iter/s)": 1.63707 + }, + { + "acc": 0.65051441, + "epoch": 0.8531202435312024, + "grad_norm": 5.65625, + "learning_rate": 6.599077411117998e-06, + "loss": 1.63744965, + "memory(GiB)": 107.26, + "step": 33630, + "train_speed(iter/s)": 1.637098 + }, + { + "acc": 0.66494141, + "epoch": 0.8532470826991375, + "grad_norm": 6.1875, + "learning_rate": 6.598083826156418e-06, + "loss": 1.53433905, + "memory(GiB)": 107.26, + "step": 33635, + "train_speed(iter/s)": 1.637126 + }, + { + "acc": 0.66096401, + "epoch": 0.8533739218670725, + "grad_norm": 5.375, + "learning_rate": 6.597090170903977e-06, + "loss": 1.62079887, + "memory(GiB)": 107.26, + "step": 33640, + "train_speed(iter/s)": 1.637155 + }, + { + "acc": 0.65777836, + "epoch": 0.8535007610350076, + "grad_norm": 8.0, + "learning_rate": 6.596096445404381e-06, + "loss": 1.61259346, + "memory(GiB)": 107.26, + "step": 33645, + "train_speed(iter/s)": 1.637182 + }, + { + "acc": 0.65028338, + "epoch": 0.8536276002029427, + "grad_norm": 5.625, + "learning_rate": 6.595102649701336e-06, + "loss": 1.60424461, + "memory(GiB)": 107.26, + "step": 33650, + "train_speed(iter/s)": 1.637209 + }, + { + "acc": 0.65557952, + "epoch": 0.8537544393708777, + "grad_norm": 5.78125, + "learning_rate": 6.5941087838385545e-06, + "loss": 1.60531502, + "memory(GiB)": 107.26, + "step": 33655, + "train_speed(iter/s)": 1.637235 + }, + { + "acc": 0.66563034, + "epoch": 0.8538812785388128, + "grad_norm": 5.5, + "learning_rate": 6.593114847859752e-06, + "loss": 1.55379219, + "memory(GiB)": 107.26, + "step": 33660, + "train_speed(iter/s)": 1.637262 + }, + { + "acc": 0.65028367, + "epoch": 0.8540081177067479, + "grad_norm": 7.4375, + "learning_rate": 6.592120841808646e-06, + "loss": 1.65940437, + "memory(GiB)": 107.26, + "step": 33665, + "train_speed(iter/s)": 1.637291 + }, + { + "acc": 0.66156449, + "epoch": 0.8541349568746829, + "grad_norm": 5.15625, + "learning_rate": 6.5911267657289564e-06, + "loss": 1.57866669, + "memory(GiB)": 107.26, + "step": 33670, + "train_speed(iter/s)": 1.637315 + }, + { + "acc": 0.65063314, + "epoch": 0.854261796042618, + "grad_norm": 5.25, + "learning_rate": 6.590132619664408e-06, + "loss": 1.61251144, + "memory(GiB)": 107.26, + "step": 33675, + "train_speed(iter/s)": 1.63734 + }, + { + "acc": 0.66867943, + "epoch": 0.854388635210553, + "grad_norm": 5.03125, + "learning_rate": 6.589138403658728e-06, + "loss": 1.56566391, + "memory(GiB)": 107.26, + "step": 33680, + "train_speed(iter/s)": 1.637367 + }, + { + "acc": 0.64111919, + "epoch": 0.8545154743784881, + "grad_norm": 5.46875, + "learning_rate": 6.588144117755645e-06, + "loss": 1.66057682, + "memory(GiB)": 107.26, + "step": 33685, + "train_speed(iter/s)": 1.637393 + }, + { + "acc": 0.65913076, + "epoch": 0.8546423135464232, + "grad_norm": 9.625, + "learning_rate": 6.5871497619988945e-06, + "loss": 1.58504915, + "memory(GiB)": 107.26, + "step": 33690, + "train_speed(iter/s)": 1.637418 + }, + { + "acc": 0.64105988, + "epoch": 0.8547691527143582, + "grad_norm": 5.25, + "learning_rate": 6.586155336432211e-06, + "loss": 1.65872593, + "memory(GiB)": 107.26, + "step": 33695, + "train_speed(iter/s)": 1.637443 + }, + { + "acc": 0.66763258, + "epoch": 0.8548959918822933, + "grad_norm": 5.375, + "learning_rate": 6.585160841099333e-06, + "loss": 1.53311768, + "memory(GiB)": 107.26, + "step": 33700, + "train_speed(iter/s)": 1.637471 + }, + { + "acc": 0.6632225, + "epoch": 0.8550228310502284, + "grad_norm": 5.71875, + "learning_rate": 6.584166276044005e-06, + "loss": 1.58094616, + "memory(GiB)": 107.26, + "step": 33705, + "train_speed(iter/s)": 1.637497 + }, + { + "acc": 0.64657121, + "epoch": 0.8551496702181633, + "grad_norm": 4.53125, + "learning_rate": 6.583171641309971e-06, + "loss": 1.6748003, + "memory(GiB)": 107.26, + "step": 33710, + "train_speed(iter/s)": 1.637523 + }, + { + "acc": 0.65904293, + "epoch": 0.8552765093860984, + "grad_norm": 6.6875, + "learning_rate": 6.58217693694098e-06, + "loss": 1.60534363, + "memory(GiB)": 107.26, + "step": 33715, + "train_speed(iter/s)": 1.637548 + }, + { + "acc": 0.6524539, + "epoch": 0.8554033485540334, + "grad_norm": 7.15625, + "learning_rate": 6.581182162980784e-06, + "loss": 1.5884263, + "memory(GiB)": 107.26, + "step": 33720, + "train_speed(iter/s)": 1.637573 + }, + { + "acc": 0.64203291, + "epoch": 0.8555301877219685, + "grad_norm": 5.0, + "learning_rate": 6.580187319473137e-06, + "loss": 1.65365448, + "memory(GiB)": 107.26, + "step": 33725, + "train_speed(iter/s)": 1.6376 + }, + { + "acc": 0.63745112, + "epoch": 0.8556570268899036, + "grad_norm": 5.25, + "learning_rate": 6.579192406461796e-06, + "loss": 1.66923809, + "memory(GiB)": 107.26, + "step": 33730, + "train_speed(iter/s)": 1.637626 + }, + { + "acc": 0.6508481, + "epoch": 0.8557838660578386, + "grad_norm": 6.625, + "learning_rate": 6.5781974239905225e-06, + "loss": 1.66174297, + "memory(GiB)": 107.26, + "step": 33735, + "train_speed(iter/s)": 1.637651 + }, + { + "acc": 0.63802643, + "epoch": 0.8559107052257737, + "grad_norm": 7.75, + "learning_rate": 6.57720237210308e-06, + "loss": 1.65425167, + "memory(GiB)": 107.26, + "step": 33740, + "train_speed(iter/s)": 1.637676 + }, + { + "acc": 0.64974241, + "epoch": 0.8560375443937088, + "grad_norm": 6.03125, + "learning_rate": 6.576207250843235e-06, + "loss": 1.58731642, + "memory(GiB)": 107.26, + "step": 33745, + "train_speed(iter/s)": 1.637703 + }, + { + "acc": 0.67265229, + "epoch": 0.8561643835616438, + "grad_norm": 5.1875, + "learning_rate": 6.575212060254759e-06, + "loss": 1.56792126, + "memory(GiB)": 107.26, + "step": 33750, + "train_speed(iter/s)": 1.63773 + }, + { + "acc": 0.66145802, + "epoch": 0.8562912227295789, + "grad_norm": 5.1875, + "learning_rate": 6.574216800381424e-06, + "loss": 1.65183334, + "memory(GiB)": 107.26, + "step": 33755, + "train_speed(iter/s)": 1.637754 + }, + { + "acc": 0.66072688, + "epoch": 0.8564180618975139, + "grad_norm": 6.21875, + "learning_rate": 6.573221471267005e-06, + "loss": 1.56241426, + "memory(GiB)": 107.26, + "step": 33760, + "train_speed(iter/s)": 1.637779 + }, + { + "acc": 0.65756068, + "epoch": 0.856544901065449, + "grad_norm": 4.9375, + "learning_rate": 6.572226072955281e-06, + "loss": 1.59077196, + "memory(GiB)": 107.26, + "step": 33765, + "train_speed(iter/s)": 1.637803 + }, + { + "acc": 0.67466726, + "epoch": 0.8566717402333841, + "grad_norm": 6.3125, + "learning_rate": 6.571230605490036e-06, + "loss": 1.52986774, + "memory(GiB)": 107.26, + "step": 33770, + "train_speed(iter/s)": 1.637831 + }, + { + "acc": 0.67146111, + "epoch": 0.8567985794013191, + "grad_norm": 5.84375, + "learning_rate": 6.570235068915053e-06, + "loss": 1.5328475, + "memory(GiB)": 107.26, + "step": 33775, + "train_speed(iter/s)": 1.637855 + }, + { + "acc": 0.65428061, + "epoch": 0.8569254185692542, + "grad_norm": 5.46875, + "learning_rate": 6.569239463274122e-06, + "loss": 1.65695744, + "memory(GiB)": 107.26, + "step": 33780, + "train_speed(iter/s)": 1.637881 + }, + { + "acc": 0.64167366, + "epoch": 0.8570522577371893, + "grad_norm": 4.8125, + "learning_rate": 6.568243788611033e-06, + "loss": 1.58303642, + "memory(GiB)": 107.26, + "step": 33785, + "train_speed(iter/s)": 1.637907 + }, + { + "acc": 0.65935373, + "epoch": 0.8571790969051243, + "grad_norm": 5.125, + "learning_rate": 6.56724804496958e-06, + "loss": 1.59171257, + "memory(GiB)": 107.26, + "step": 33790, + "train_speed(iter/s)": 1.637933 + }, + { + "acc": 0.6521513, + "epoch": 0.8573059360730594, + "grad_norm": 4.8125, + "learning_rate": 6.566252232393561e-06, + "loss": 1.62133522, + "memory(GiB)": 107.26, + "step": 33795, + "train_speed(iter/s)": 1.637961 + }, + { + "acc": 0.66416121, + "epoch": 0.8574327752409944, + "grad_norm": 5.46875, + "learning_rate": 6.565256350926777e-06, + "loss": 1.59280167, + "memory(GiB)": 107.26, + "step": 33800, + "train_speed(iter/s)": 1.637986 + }, + { + "acc": 0.66706371, + "epoch": 0.8575596144089295, + "grad_norm": 4.6875, + "learning_rate": 6.5642604006130286e-06, + "loss": 1.53438721, + "memory(GiB)": 107.26, + "step": 33805, + "train_speed(iter/s)": 1.638012 + }, + { + "acc": 0.65187678, + "epoch": 0.8576864535768646, + "grad_norm": 4.875, + "learning_rate": 6.563264381496124e-06, + "loss": 1.69195251, + "memory(GiB)": 107.26, + "step": 33810, + "train_speed(iter/s)": 1.638037 + }, + { + "acc": 0.65640607, + "epoch": 0.8578132927447996, + "grad_norm": 11.0625, + "learning_rate": 6.562268293619872e-06, + "loss": 1.6875267, + "memory(GiB)": 107.26, + "step": 33815, + "train_speed(iter/s)": 1.638064 + }, + { + "acc": 0.65623112, + "epoch": 0.8579401319127347, + "grad_norm": 7.96875, + "learning_rate": 6.561272137028089e-06, + "loss": 1.61683464, + "memory(GiB)": 107.26, + "step": 33820, + "train_speed(iter/s)": 1.63809 + }, + { + "acc": 0.64545979, + "epoch": 0.8580669710806698, + "grad_norm": 5.46875, + "learning_rate": 6.560275911764582e-06, + "loss": 1.58695049, + "memory(GiB)": 107.26, + "step": 33825, + "train_speed(iter/s)": 1.638116 + }, + { + "acc": 0.66298537, + "epoch": 0.8581938102486047, + "grad_norm": 6.96875, + "learning_rate": 6.5592796178731776e-06, + "loss": 1.58266106, + "memory(GiB)": 107.26, + "step": 33830, + "train_speed(iter/s)": 1.638142 + }, + { + "acc": 0.63866143, + "epoch": 0.8583206494165398, + "grad_norm": 5.71875, + "learning_rate": 6.5582832553976924e-06, + "loss": 1.60564728, + "memory(GiB)": 107.26, + "step": 33835, + "train_speed(iter/s)": 1.638167 + }, + { + "acc": 0.6707767, + "epoch": 0.8584474885844748, + "grad_norm": 4.78125, + "learning_rate": 6.557286824381955e-06, + "loss": 1.55806322, + "memory(GiB)": 107.26, + "step": 33840, + "train_speed(iter/s)": 1.638193 + }, + { + "acc": 0.65880113, + "epoch": 0.8585743277524099, + "grad_norm": 7.21875, + "learning_rate": 6.556290324869786e-06, + "loss": 1.60772247, + "memory(GiB)": 107.26, + "step": 33845, + "train_speed(iter/s)": 1.638219 + }, + { + "acc": 0.66736221, + "epoch": 0.858701166920345, + "grad_norm": 5.78125, + "learning_rate": 6.555293756905024e-06, + "loss": 1.58186092, + "memory(GiB)": 107.26, + "step": 33850, + "train_speed(iter/s)": 1.638246 + }, + { + "acc": 0.66110554, + "epoch": 0.85882800608828, + "grad_norm": 5.78125, + "learning_rate": 6.554297120531497e-06, + "loss": 1.51177158, + "memory(GiB)": 107.26, + "step": 33855, + "train_speed(iter/s)": 1.638272 + }, + { + "acc": 0.64942493, + "epoch": 0.8589548452562151, + "grad_norm": 4.78125, + "learning_rate": 6.553300415793042e-06, + "loss": 1.63577919, + "memory(GiB)": 107.26, + "step": 33860, + "train_speed(iter/s)": 1.638299 + }, + { + "acc": 0.64790277, + "epoch": 0.8590816844241502, + "grad_norm": 5.40625, + "learning_rate": 6.552303642733502e-06, + "loss": 1.61803627, + "memory(GiB)": 107.26, + "step": 33865, + "train_speed(iter/s)": 1.638325 + }, + { + "acc": 0.6589128, + "epoch": 0.8592085235920852, + "grad_norm": 5.25, + "learning_rate": 6.551306801396715e-06, + "loss": 1.61670704, + "memory(GiB)": 107.26, + "step": 33870, + "train_speed(iter/s)": 1.638351 + }, + { + "acc": 0.65412321, + "epoch": 0.8593353627600203, + "grad_norm": 5.625, + "learning_rate": 6.550309891826531e-06, + "loss": 1.55272522, + "memory(GiB)": 107.26, + "step": 33875, + "train_speed(iter/s)": 1.638378 + }, + { + "acc": 0.6536561, + "epoch": 0.8594622019279553, + "grad_norm": 6.0, + "learning_rate": 6.5493129140667955e-06, + "loss": 1.57131338, + "memory(GiB)": 107.26, + "step": 33880, + "train_speed(iter/s)": 1.638403 + }, + { + "acc": 0.64003358, + "epoch": 0.8595890410958904, + "grad_norm": 6.5625, + "learning_rate": 6.54831586816136e-06, + "loss": 1.64107819, + "memory(GiB)": 107.26, + "step": 33885, + "train_speed(iter/s)": 1.638432 + }, + { + "acc": 0.66077843, + "epoch": 0.8597158802638255, + "grad_norm": 5.53125, + "learning_rate": 6.54731875415408e-06, + "loss": 1.59336863, + "memory(GiB)": 107.26, + "step": 33890, + "train_speed(iter/s)": 1.638458 + }, + { + "acc": 0.66765656, + "epoch": 0.8598427194317605, + "grad_norm": 4.75, + "learning_rate": 6.546321572088814e-06, + "loss": 1.55714464, + "memory(GiB)": 107.26, + "step": 33895, + "train_speed(iter/s)": 1.638485 + }, + { + "acc": 0.65034971, + "epoch": 0.8599695585996956, + "grad_norm": 5.5625, + "learning_rate": 6.545324322009421e-06, + "loss": 1.63762589, + "memory(GiB)": 107.26, + "step": 33900, + "train_speed(iter/s)": 1.638512 + }, + { + "acc": 0.6678153, + "epoch": 0.8600963977676307, + "grad_norm": 6.03125, + "learning_rate": 6.544327003959765e-06, + "loss": 1.56251316, + "memory(GiB)": 107.26, + "step": 33905, + "train_speed(iter/s)": 1.638539 + }, + { + "acc": 0.66459317, + "epoch": 0.8602232369355657, + "grad_norm": 5.5625, + "learning_rate": 6.543329617983713e-06, + "loss": 1.58747234, + "memory(GiB)": 107.26, + "step": 33910, + "train_speed(iter/s)": 1.638565 + }, + { + "acc": 0.66931686, + "epoch": 0.8603500761035008, + "grad_norm": 5.65625, + "learning_rate": 6.5423321641251316e-06, + "loss": 1.61592941, + "memory(GiB)": 107.26, + "step": 33915, + "train_speed(iter/s)": 1.638593 + }, + { + "acc": 0.64662681, + "epoch": 0.8604769152714358, + "grad_norm": 6.28125, + "learning_rate": 6.541334642427898e-06, + "loss": 1.63952312, + "memory(GiB)": 107.26, + "step": 33920, + "train_speed(iter/s)": 1.638619 + }, + { + "acc": 0.65676203, + "epoch": 0.8606037544393709, + "grad_norm": 4.8125, + "learning_rate": 6.540337052935884e-06, + "loss": 1.58259211, + "memory(GiB)": 107.26, + "step": 33925, + "train_speed(iter/s)": 1.638645 + }, + { + "acc": 0.65153465, + "epoch": 0.860730593607306, + "grad_norm": 5.75, + "learning_rate": 6.53933939569297e-06, + "loss": 1.62081699, + "memory(GiB)": 107.26, + "step": 33930, + "train_speed(iter/s)": 1.638673 + }, + { + "acc": 0.65377936, + "epoch": 0.860857432775241, + "grad_norm": 5.5625, + "learning_rate": 6.538341670743037e-06, + "loss": 1.61512146, + "memory(GiB)": 107.26, + "step": 33935, + "train_speed(iter/s)": 1.638698 + }, + { + "acc": 0.66160526, + "epoch": 0.8609842719431761, + "grad_norm": 6.0625, + "learning_rate": 6.537343878129969e-06, + "loss": 1.57845831, + "memory(GiB)": 107.26, + "step": 33940, + "train_speed(iter/s)": 1.638725 + }, + { + "acc": 0.64678173, + "epoch": 0.8611111111111112, + "grad_norm": 6.46875, + "learning_rate": 6.5363460178976524e-06, + "loss": 1.64689846, + "memory(GiB)": 107.26, + "step": 33945, + "train_speed(iter/s)": 1.63875 + }, + { + "acc": 0.64583454, + "epoch": 0.8612379502790461, + "grad_norm": 6.1875, + "learning_rate": 6.53534809008998e-06, + "loss": 1.63763504, + "memory(GiB)": 107.26, + "step": 33950, + "train_speed(iter/s)": 1.638776 + }, + { + "acc": 0.655334, + "epoch": 0.8613647894469812, + "grad_norm": 4.84375, + "learning_rate": 6.534350094750843e-06, + "loss": 1.62189484, + "memory(GiB)": 107.26, + "step": 33955, + "train_speed(iter/s)": 1.6388 + }, + { + "acc": 0.64833665, + "epoch": 0.8614916286149162, + "grad_norm": 5.71875, + "learning_rate": 6.5333520319241385e-06, + "loss": 1.62575493, + "memory(GiB)": 107.26, + "step": 33960, + "train_speed(iter/s)": 1.638826 + }, + { + "acc": 0.63960791, + "epoch": 0.8616184677828513, + "grad_norm": 6.25, + "learning_rate": 6.532353901653765e-06, + "loss": 1.66806908, + "memory(GiB)": 107.26, + "step": 33965, + "train_speed(iter/s)": 1.638854 + }, + { + "acc": 0.64926233, + "epoch": 0.8617453069507864, + "grad_norm": 5.9375, + "learning_rate": 6.531355703983627e-06, + "loss": 1.66905327, + "memory(GiB)": 107.26, + "step": 33970, + "train_speed(iter/s)": 1.638879 + }, + { + "acc": 0.65363922, + "epoch": 0.8618721461187214, + "grad_norm": 5.84375, + "learning_rate": 6.530357438957626e-06, + "loss": 1.65205574, + "memory(GiB)": 107.26, + "step": 33975, + "train_speed(iter/s)": 1.638906 + }, + { + "acc": 0.65834751, + "epoch": 0.8619989852866565, + "grad_norm": 6.125, + "learning_rate": 6.529359106619675e-06, + "loss": 1.5987874, + "memory(GiB)": 107.26, + "step": 33980, + "train_speed(iter/s)": 1.638933 + }, + { + "acc": 0.64411855, + "epoch": 0.8621258244545916, + "grad_norm": 5.0, + "learning_rate": 6.528360707013681e-06, + "loss": 1.62621117, + "memory(GiB)": 107.26, + "step": 33985, + "train_speed(iter/s)": 1.638958 + }, + { + "acc": 0.6748682, + "epoch": 0.8622526636225266, + "grad_norm": 7.09375, + "learning_rate": 6.52736224018356e-06, + "loss": 1.64302311, + "memory(GiB)": 107.26, + "step": 33990, + "train_speed(iter/s)": 1.638985 + }, + { + "acc": 0.65918121, + "epoch": 0.8623795027904617, + "grad_norm": 4.90625, + "learning_rate": 6.526363706173227e-06, + "loss": 1.6388485, + "memory(GiB)": 107.26, + "step": 33995, + "train_speed(iter/s)": 1.639009 + }, + { + "acc": 0.67118726, + "epoch": 0.8625063419583967, + "grad_norm": 5.71875, + "learning_rate": 6.525365105026605e-06, + "loss": 1.47325811, + "memory(GiB)": 107.26, + "step": 34000, + "train_speed(iter/s)": 1.639035 + }, + { + "epoch": 0.8625063419583967, + "eval_acc": 0.6457969707031536, + "eval_loss": 1.5752389430999756, + "eval_runtime": 58.1348, + "eval_samples_per_second": 109.573, + "eval_steps_per_second": 27.402, + "step": 34000 + }, + { + "acc": 0.6792944, + "epoch": 0.8626331811263318, + "grad_norm": 6.03125, + "learning_rate": 6.524366436787615e-06, + "loss": 1.48697662, + "memory(GiB)": 107.26, + "step": 34005, + "train_speed(iter/s)": 1.634143 + }, + { + "acc": 0.65556364, + "epoch": 0.8627600202942669, + "grad_norm": 5.25, + "learning_rate": 6.523367701500183e-06, + "loss": 1.60516357, + "memory(GiB)": 107.26, + "step": 34010, + "train_speed(iter/s)": 1.634165 + }, + { + "acc": 0.65507517, + "epoch": 0.8628868594622019, + "grad_norm": 5.03125, + "learning_rate": 6.5223688992082375e-06, + "loss": 1.56654625, + "memory(GiB)": 107.26, + "step": 34015, + "train_speed(iter/s)": 1.634189 + }, + { + "acc": 0.64477711, + "epoch": 0.863013698630137, + "grad_norm": 6.46875, + "learning_rate": 6.521370029955713e-06, + "loss": 1.65110779, + "memory(GiB)": 107.26, + "step": 34020, + "train_speed(iter/s)": 1.634213 + }, + { + "acc": 0.64231291, + "epoch": 0.8631405377980721, + "grad_norm": 5.125, + "learning_rate": 6.520371093786541e-06, + "loss": 1.69971352, + "memory(GiB)": 107.26, + "step": 34025, + "train_speed(iter/s)": 1.634237 + }, + { + "acc": 0.67796497, + "epoch": 0.8632673769660071, + "grad_norm": 6.71875, + "learning_rate": 6.51937209074466e-06, + "loss": 1.56233158, + "memory(GiB)": 107.26, + "step": 34030, + "train_speed(iter/s)": 1.63426 + }, + { + "acc": 0.65515461, + "epoch": 0.8633942161339422, + "grad_norm": 5.59375, + "learning_rate": 6.51837302087401e-06, + "loss": 1.5657732, + "memory(GiB)": 107.26, + "step": 34035, + "train_speed(iter/s)": 1.634287 + }, + { + "acc": 0.64916468, + "epoch": 0.8635210553018772, + "grad_norm": 7.0625, + "learning_rate": 6.517373884218539e-06, + "loss": 1.67203236, + "memory(GiB)": 107.26, + "step": 34040, + "train_speed(iter/s)": 1.634311 + }, + { + "acc": 0.64695168, + "epoch": 0.8636478944698123, + "grad_norm": 6.1875, + "learning_rate": 6.5163746808221865e-06, + "loss": 1.65379772, + "memory(GiB)": 107.26, + "step": 34045, + "train_speed(iter/s)": 1.634336 + }, + { + "acc": 0.63686333, + "epoch": 0.8637747336377474, + "grad_norm": 4.6875, + "learning_rate": 6.515375410728907e-06, + "loss": 1.58830204, + "memory(GiB)": 107.26, + "step": 34050, + "train_speed(iter/s)": 1.634363 + }, + { + "acc": 0.65573053, + "epoch": 0.8639015728056824, + "grad_norm": 4.65625, + "learning_rate": 6.51437607398265e-06, + "loss": 1.62066689, + "memory(GiB)": 107.26, + "step": 34055, + "train_speed(iter/s)": 1.634388 + }, + { + "acc": 0.67124023, + "epoch": 0.8640284119736175, + "grad_norm": 6.875, + "learning_rate": 6.513376670627374e-06, + "loss": 1.54675541, + "memory(GiB)": 107.26, + "step": 34060, + "train_speed(iter/s)": 1.634412 + }, + { + "acc": 0.64677529, + "epoch": 0.8641552511415526, + "grad_norm": 8.125, + "learning_rate": 6.512377200707033e-06, + "loss": 1.640695, + "memory(GiB)": 107.26, + "step": 34065, + "train_speed(iter/s)": 1.634439 + }, + { + "acc": 0.6600338, + "epoch": 0.8642820903094875, + "grad_norm": 6.59375, + "learning_rate": 6.511377664265591e-06, + "loss": 1.56802282, + "memory(GiB)": 107.26, + "step": 34070, + "train_speed(iter/s)": 1.634466 + }, + { + "acc": 0.65844598, + "epoch": 0.8644089294774226, + "grad_norm": 5.4375, + "learning_rate": 6.510378061347013e-06, + "loss": 1.60723152, + "memory(GiB)": 107.26, + "step": 34075, + "train_speed(iter/s)": 1.634491 + }, + { + "acc": 0.65013862, + "epoch": 0.8645357686453576, + "grad_norm": 7.875, + "learning_rate": 6.509378391995264e-06, + "loss": 1.6730938, + "memory(GiB)": 107.26, + "step": 34080, + "train_speed(iter/s)": 1.634516 + }, + { + "acc": 0.65103164, + "epoch": 0.8646626078132927, + "grad_norm": 4.71875, + "learning_rate": 6.508378656254314e-06, + "loss": 1.69251461, + "memory(GiB)": 107.26, + "step": 34085, + "train_speed(iter/s)": 1.634541 + }, + { + "acc": 0.66417913, + "epoch": 0.8647894469812278, + "grad_norm": 5.4375, + "learning_rate": 6.507378854168136e-06, + "loss": 1.55832195, + "memory(GiB)": 107.26, + "step": 34090, + "train_speed(iter/s)": 1.634566 + }, + { + "acc": 0.64776597, + "epoch": 0.8649162861491628, + "grad_norm": 6.09375, + "learning_rate": 6.506378985780707e-06, + "loss": 1.60819626, + "memory(GiB)": 107.26, + "step": 34095, + "train_speed(iter/s)": 1.63459 + }, + { + "acc": 0.66627274, + "epoch": 0.8650431253170979, + "grad_norm": 5.46875, + "learning_rate": 6.505379051136004e-06, + "loss": 1.57564068, + "memory(GiB)": 107.26, + "step": 34100, + "train_speed(iter/s)": 1.634617 + }, + { + "acc": 0.66107988, + "epoch": 0.865169964485033, + "grad_norm": 5.5, + "learning_rate": 6.504379050278009e-06, + "loss": 1.58241034, + "memory(GiB)": 107.26, + "step": 34105, + "train_speed(iter/s)": 1.634641 + }, + { + "acc": 0.64033117, + "epoch": 0.865296803652968, + "grad_norm": 6.21875, + "learning_rate": 6.503378983250707e-06, + "loss": 1.72999649, + "memory(GiB)": 107.26, + "step": 34110, + "train_speed(iter/s)": 1.634668 + }, + { + "acc": 0.65916958, + "epoch": 0.8654236428209031, + "grad_norm": 9.0, + "learning_rate": 6.5023788500980855e-06, + "loss": 1.59703093, + "memory(GiB)": 107.26, + "step": 34115, + "train_speed(iter/s)": 1.634694 + }, + { + "acc": 0.64947586, + "epoch": 0.8655504819888381, + "grad_norm": 5.3125, + "learning_rate": 6.501378650864135e-06, + "loss": 1.64080048, + "memory(GiB)": 107.26, + "step": 34120, + "train_speed(iter/s)": 1.634719 + }, + { + "acc": 0.65534706, + "epoch": 0.8656773211567732, + "grad_norm": 5.46875, + "learning_rate": 6.500378385592847e-06, + "loss": 1.60876083, + "memory(GiB)": 107.26, + "step": 34125, + "train_speed(iter/s)": 1.634746 + }, + { + "acc": 0.66416488, + "epoch": 0.8658041603247083, + "grad_norm": 7.0, + "learning_rate": 6.49937805432822e-06, + "loss": 1.58953152, + "memory(GiB)": 107.26, + "step": 34130, + "train_speed(iter/s)": 1.634772 + }, + { + "acc": 0.66077566, + "epoch": 0.8659309994926433, + "grad_norm": 6.03125, + "learning_rate": 6.498377657114251e-06, + "loss": 1.55426388, + "memory(GiB)": 107.26, + "step": 34135, + "train_speed(iter/s)": 1.634798 + }, + { + "acc": 0.66569037, + "epoch": 0.8660578386605784, + "grad_norm": 5.59375, + "learning_rate": 6.497377193994944e-06, + "loss": 1.5657814, + "memory(GiB)": 107.26, + "step": 34140, + "train_speed(iter/s)": 1.634823 + }, + { + "acc": 0.66869922, + "epoch": 0.8661846778285135, + "grad_norm": 5.625, + "learning_rate": 6.496376665014301e-06, + "loss": 1.5821269, + "memory(GiB)": 107.26, + "step": 34145, + "train_speed(iter/s)": 1.634849 + }, + { + "acc": 0.64816437, + "epoch": 0.8663115169964485, + "grad_norm": 4.71875, + "learning_rate": 6.4953760702163325e-06, + "loss": 1.65185356, + "memory(GiB)": 107.26, + "step": 34150, + "train_speed(iter/s)": 1.634874 + }, + { + "acc": 0.65632424, + "epoch": 0.8664383561643836, + "grad_norm": 4.625, + "learning_rate": 6.494375409645049e-06, + "loss": 1.57895212, + "memory(GiB)": 107.26, + "step": 34155, + "train_speed(iter/s)": 1.634899 + }, + { + "acc": 0.65427155, + "epoch": 0.8665651953323186, + "grad_norm": 5.71875, + "learning_rate": 6.493374683344462e-06, + "loss": 1.59411678, + "memory(GiB)": 107.26, + "step": 34160, + "train_speed(iter/s)": 1.634921 + }, + { + "acc": 0.66368279, + "epoch": 0.8666920345002537, + "grad_norm": 7.53125, + "learning_rate": 6.492373891358589e-06, + "loss": 1.54134436, + "memory(GiB)": 107.26, + "step": 34165, + "train_speed(iter/s)": 1.634948 + }, + { + "acc": 0.65509548, + "epoch": 0.8668188736681888, + "grad_norm": 7.0625, + "learning_rate": 6.4913730337314495e-06, + "loss": 1.60821648, + "memory(GiB)": 107.26, + "step": 34170, + "train_speed(iter/s)": 1.634973 + }, + { + "acc": 0.6473949, + "epoch": 0.8669457128361238, + "grad_norm": 4.6875, + "learning_rate": 6.490372110507066e-06, + "loss": 1.65505047, + "memory(GiB)": 107.26, + "step": 34175, + "train_speed(iter/s)": 1.634998 + }, + { + "acc": 0.66014204, + "epoch": 0.8670725520040589, + "grad_norm": 5.46875, + "learning_rate": 6.489371121729462e-06, + "loss": 1.57256222, + "memory(GiB)": 107.26, + "step": 34180, + "train_speed(iter/s)": 1.635025 + }, + { + "acc": 0.65014925, + "epoch": 0.867199391171994, + "grad_norm": 5.96875, + "learning_rate": 6.4883700674426666e-06, + "loss": 1.57869263, + "memory(GiB)": 107.26, + "step": 34185, + "train_speed(iter/s)": 1.635051 + }, + { + "acc": 0.65479174, + "epoch": 0.867326230339929, + "grad_norm": 5.3125, + "learning_rate": 6.4873689476907105e-06, + "loss": 1.59951153, + "memory(GiB)": 107.26, + "step": 34190, + "train_speed(iter/s)": 1.635075 + }, + { + "acc": 0.65862598, + "epoch": 0.867453069507864, + "grad_norm": 7.09375, + "learning_rate": 6.486367762517628e-06, + "loss": 1.62387295, + "memory(GiB)": 107.26, + "step": 34195, + "train_speed(iter/s)": 1.635094 + }, + { + "acc": 0.64710178, + "epoch": 0.867579908675799, + "grad_norm": 5.1875, + "learning_rate": 6.4853665119674556e-06, + "loss": 1.6497982, + "memory(GiB)": 107.26, + "step": 34200, + "train_speed(iter/s)": 1.63512 + }, + { + "acc": 0.64454632, + "epoch": 0.8677067478437341, + "grad_norm": 4.65625, + "learning_rate": 6.484365196084231e-06, + "loss": 1.58651199, + "memory(GiB)": 107.26, + "step": 34205, + "train_speed(iter/s)": 1.635146 + }, + { + "acc": 0.65895886, + "epoch": 0.8678335870116692, + "grad_norm": 4.90625, + "learning_rate": 6.4833638149119985e-06, + "loss": 1.62382412, + "memory(GiB)": 107.26, + "step": 34210, + "train_speed(iter/s)": 1.63517 + }, + { + "acc": 0.65482121, + "epoch": 0.8679604261796042, + "grad_norm": 6.34375, + "learning_rate": 6.4823623684948034e-06, + "loss": 1.62744713, + "memory(GiB)": 107.26, + "step": 34215, + "train_speed(iter/s)": 1.635196 + }, + { + "acc": 0.66440773, + "epoch": 0.8680872653475393, + "grad_norm": 5.59375, + "learning_rate": 6.4813608568766924e-06, + "loss": 1.64382381, + "memory(GiB)": 107.26, + "step": 34220, + "train_speed(iter/s)": 1.635221 + }, + { + "acc": 0.65206661, + "epoch": 0.8682141045154744, + "grad_norm": 5.84375, + "learning_rate": 6.480359280101717e-06, + "loss": 1.59996347, + "memory(GiB)": 107.26, + "step": 34225, + "train_speed(iter/s)": 1.635247 + }, + { + "acc": 0.65976686, + "epoch": 0.8683409436834094, + "grad_norm": 5.71875, + "learning_rate": 6.479357638213931e-06, + "loss": 1.54963942, + "memory(GiB)": 107.26, + "step": 34230, + "train_speed(iter/s)": 1.635271 + }, + { + "acc": 0.65868626, + "epoch": 0.8684677828513445, + "grad_norm": 6.0625, + "learning_rate": 6.478355931257392e-06, + "loss": 1.59762096, + "memory(GiB)": 107.26, + "step": 34235, + "train_speed(iter/s)": 1.635296 + }, + { + "acc": 0.66919794, + "epoch": 0.8685946220192795, + "grad_norm": 5.75, + "learning_rate": 6.477354159276158e-06, + "loss": 1.55229073, + "memory(GiB)": 107.26, + "step": 34240, + "train_speed(iter/s)": 1.635323 + }, + { + "acc": 0.64872255, + "epoch": 0.8687214611872146, + "grad_norm": 5.46875, + "learning_rate": 6.476352322314292e-06, + "loss": 1.61540604, + "memory(GiB)": 107.26, + "step": 34245, + "train_speed(iter/s)": 1.63535 + }, + { + "acc": 0.64695129, + "epoch": 0.8688483003551497, + "grad_norm": 6.0, + "learning_rate": 6.47535042041586e-06, + "loss": 1.66697693, + "memory(GiB)": 107.26, + "step": 34250, + "train_speed(iter/s)": 1.635376 + }, + { + "acc": 0.63026171, + "epoch": 0.8689751395230847, + "grad_norm": 5.21875, + "learning_rate": 6.474348453624929e-06, + "loss": 1.70388355, + "memory(GiB)": 107.26, + "step": 34255, + "train_speed(iter/s)": 1.635403 + }, + { + "acc": 0.6526309, + "epoch": 0.8691019786910198, + "grad_norm": 7.0, + "learning_rate": 6.473346421985571e-06, + "loss": 1.63642807, + "memory(GiB)": 107.26, + "step": 34260, + "train_speed(iter/s)": 1.635427 + }, + { + "acc": 0.6500998, + "epoch": 0.8692288178589549, + "grad_norm": 6.21875, + "learning_rate": 6.472344325541859e-06, + "loss": 1.65020237, + "memory(GiB)": 107.26, + "step": 34265, + "train_speed(iter/s)": 1.635453 + }, + { + "acc": 0.64811931, + "epoch": 0.8693556570268899, + "grad_norm": 5.1875, + "learning_rate": 6.4713421643378715e-06, + "loss": 1.65410061, + "memory(GiB)": 107.26, + "step": 34270, + "train_speed(iter/s)": 1.635479 + }, + { + "acc": 0.64889007, + "epoch": 0.869482496194825, + "grad_norm": 5.90625, + "learning_rate": 6.470339938417685e-06, + "loss": 1.57758446, + "memory(GiB)": 107.26, + "step": 34275, + "train_speed(iter/s)": 1.635505 + }, + { + "acc": 0.67047734, + "epoch": 0.86960933536276, + "grad_norm": 5.875, + "learning_rate": 6.469337647825384e-06, + "loss": 1.56917114, + "memory(GiB)": 107.26, + "step": 34280, + "train_speed(iter/s)": 1.63553 + }, + { + "acc": 0.64510756, + "epoch": 0.8697361745306951, + "grad_norm": 9.625, + "learning_rate": 6.468335292605053e-06, + "loss": 1.61944275, + "memory(GiB)": 107.26, + "step": 34285, + "train_speed(iter/s)": 1.635556 + }, + { + "acc": 0.67003632, + "epoch": 0.8698630136986302, + "grad_norm": 5.78125, + "learning_rate": 6.467332872800779e-06, + "loss": 1.51040487, + "memory(GiB)": 107.26, + "step": 34290, + "train_speed(iter/s)": 1.635582 + }, + { + "acc": 0.65759735, + "epoch": 0.8699898528665652, + "grad_norm": 5.375, + "learning_rate": 6.466330388456655e-06, + "loss": 1.61880112, + "memory(GiB)": 107.26, + "step": 34295, + "train_speed(iter/s)": 1.635608 + }, + { + "acc": 0.64413924, + "epoch": 0.8701166920345003, + "grad_norm": 7.28125, + "learning_rate": 6.465327839616774e-06, + "loss": 1.654953, + "memory(GiB)": 107.26, + "step": 34300, + "train_speed(iter/s)": 1.635632 + }, + { + "acc": 0.66525888, + "epoch": 0.8702435312024354, + "grad_norm": 6.09375, + "learning_rate": 6.464325226325232e-06, + "loss": 1.59572163, + "memory(GiB)": 107.26, + "step": 34305, + "train_speed(iter/s)": 1.635656 + }, + { + "acc": 0.64500017, + "epoch": 0.8703703703703703, + "grad_norm": 7.15625, + "learning_rate": 6.46332254862613e-06, + "loss": 1.67218018, + "memory(GiB)": 107.26, + "step": 34310, + "train_speed(iter/s)": 1.635682 + }, + { + "acc": 0.65909228, + "epoch": 0.8704972095383054, + "grad_norm": 4.65625, + "learning_rate": 6.462319806563568e-06, + "loss": 1.60085297, + "memory(GiB)": 107.26, + "step": 34315, + "train_speed(iter/s)": 1.635705 + }, + { + "acc": 0.64792891, + "epoch": 0.8706240487062404, + "grad_norm": 5.75, + "learning_rate": 6.461317000181653e-06, + "loss": 1.5514185, + "memory(GiB)": 107.26, + "step": 34320, + "train_speed(iter/s)": 1.635731 + }, + { + "acc": 0.65251832, + "epoch": 0.8707508878741755, + "grad_norm": 6.4375, + "learning_rate": 6.460314129524491e-06, + "loss": 1.65665169, + "memory(GiB)": 107.26, + "step": 34325, + "train_speed(iter/s)": 1.635755 + }, + { + "acc": 0.65071449, + "epoch": 0.8708777270421106, + "grad_norm": 5.0625, + "learning_rate": 6.4593111946361945e-06, + "loss": 1.60932198, + "memory(GiB)": 107.26, + "step": 34330, + "train_speed(iter/s)": 1.635781 + }, + { + "acc": 0.64631996, + "epoch": 0.8710045662100456, + "grad_norm": 5.28125, + "learning_rate": 6.458308195560874e-06, + "loss": 1.6017498, + "memory(GiB)": 107.26, + "step": 34335, + "train_speed(iter/s)": 1.635808 + }, + { + "acc": 0.64003296, + "epoch": 0.8711314053779807, + "grad_norm": 6.875, + "learning_rate": 6.4573051323426515e-06, + "loss": 1.66052971, + "memory(GiB)": 107.26, + "step": 34340, + "train_speed(iter/s)": 1.635833 + }, + { + "acc": 0.65305729, + "epoch": 0.8712582445459158, + "grad_norm": 6.875, + "learning_rate": 6.456302005025641e-06, + "loss": 1.66724777, + "memory(GiB)": 107.26, + "step": 34345, + "train_speed(iter/s)": 1.635858 + }, + { + "acc": 0.6336185, + "epoch": 0.8713850837138508, + "grad_norm": 6.65625, + "learning_rate": 6.4552988136539675e-06, + "loss": 1.67614632, + "memory(GiB)": 107.26, + "step": 34350, + "train_speed(iter/s)": 1.635884 + }, + { + "acc": 0.66357269, + "epoch": 0.8715119228817859, + "grad_norm": 6.28125, + "learning_rate": 6.454295558271752e-06, + "loss": 1.56703577, + "memory(GiB)": 107.26, + "step": 34355, + "train_speed(iter/s)": 1.635908 + }, + { + "acc": 0.65190077, + "epoch": 0.8716387620497209, + "grad_norm": 5.375, + "learning_rate": 6.4532922389231275e-06, + "loss": 1.57342186, + "memory(GiB)": 107.26, + "step": 34360, + "train_speed(iter/s)": 1.635933 + }, + { + "acc": 0.66487818, + "epoch": 0.871765601217656, + "grad_norm": 5.40625, + "learning_rate": 6.452288855652222e-06, + "loss": 1.56768627, + "memory(GiB)": 107.26, + "step": 34365, + "train_speed(iter/s)": 1.63596 + }, + { + "acc": 0.65949831, + "epoch": 0.8718924403855911, + "grad_norm": 4.90625, + "learning_rate": 6.451285408503167e-06, + "loss": 1.56575298, + "memory(GiB)": 107.26, + "step": 34370, + "train_speed(iter/s)": 1.635984 + }, + { + "acc": 0.64705176, + "epoch": 0.8720192795535261, + "grad_norm": 6.0, + "learning_rate": 6.450281897520102e-06, + "loss": 1.64136944, + "memory(GiB)": 107.26, + "step": 34375, + "train_speed(iter/s)": 1.636008 + }, + { + "acc": 0.66802015, + "epoch": 0.8721461187214612, + "grad_norm": 5.5625, + "learning_rate": 6.449278322747164e-06, + "loss": 1.59115458, + "memory(GiB)": 107.26, + "step": 34380, + "train_speed(iter/s)": 1.636032 + }, + { + "acc": 0.6672204, + "epoch": 0.8722729578893963, + "grad_norm": 5.78125, + "learning_rate": 6.448274684228494e-06, + "loss": 1.6125803, + "memory(GiB)": 107.26, + "step": 34385, + "train_speed(iter/s)": 1.636057 + }, + { + "acc": 0.67955103, + "epoch": 0.8723997970573313, + "grad_norm": 6.5, + "learning_rate": 6.447270982008237e-06, + "loss": 1.52753544, + "memory(GiB)": 107.26, + "step": 34390, + "train_speed(iter/s)": 1.636081 + }, + { + "acc": 0.65122099, + "epoch": 0.8725266362252664, + "grad_norm": 6.0, + "learning_rate": 6.446267216130541e-06, + "loss": 1.63484802, + "memory(GiB)": 107.26, + "step": 34395, + "train_speed(iter/s)": 1.636106 + }, + { + "acc": 0.65745192, + "epoch": 0.8726534753932014, + "grad_norm": 5.125, + "learning_rate": 6.4452633866395555e-06, + "loss": 1.59543476, + "memory(GiB)": 107.26, + "step": 34400, + "train_speed(iter/s)": 1.636133 + }, + { + "acc": 0.66621742, + "epoch": 0.8727803145611365, + "grad_norm": 4.875, + "learning_rate": 6.444259493579433e-06, + "loss": 1.58463697, + "memory(GiB)": 107.26, + "step": 34405, + "train_speed(iter/s)": 1.63616 + }, + { + "acc": 0.66239519, + "epoch": 0.8729071537290716, + "grad_norm": 5.875, + "learning_rate": 6.443255536994331e-06, + "loss": 1.58281994, + "memory(GiB)": 107.26, + "step": 34410, + "train_speed(iter/s)": 1.636186 + }, + { + "acc": 0.66718121, + "epoch": 0.8730339928970066, + "grad_norm": 5.75, + "learning_rate": 6.442251516928406e-06, + "loss": 1.61974564, + "memory(GiB)": 107.26, + "step": 34415, + "train_speed(iter/s)": 1.636213 + }, + { + "acc": 0.67059326, + "epoch": 0.8731608320649417, + "grad_norm": 4.5625, + "learning_rate": 6.441247433425821e-06, + "loss": 1.55733433, + "memory(GiB)": 107.26, + "step": 34420, + "train_speed(iter/s)": 1.636239 + }, + { + "acc": 0.64994831, + "epoch": 0.8732876712328768, + "grad_norm": 5.5625, + "learning_rate": 6.4402432865307384e-06, + "loss": 1.64555511, + "memory(GiB)": 107.26, + "step": 34425, + "train_speed(iter/s)": 1.636265 + }, + { + "acc": 0.65168667, + "epoch": 0.8734145104008117, + "grad_norm": 5.3125, + "learning_rate": 6.439239076287327e-06, + "loss": 1.6625782, + "memory(GiB)": 107.26, + "step": 34430, + "train_speed(iter/s)": 1.636292 + }, + { + "acc": 0.65405111, + "epoch": 0.8735413495687468, + "grad_norm": 6.40625, + "learning_rate": 6.438234802739753e-06, + "loss": 1.60159836, + "memory(GiB)": 107.26, + "step": 34435, + "train_speed(iter/s)": 1.636317 + }, + { + "acc": 0.66004095, + "epoch": 0.8736681887366818, + "grad_norm": 6.53125, + "learning_rate": 6.4372304659321935e-06, + "loss": 1.6271347, + "memory(GiB)": 107.26, + "step": 34440, + "train_speed(iter/s)": 1.636341 + }, + { + "acc": 0.65722857, + "epoch": 0.8737950279046169, + "grad_norm": 5.71875, + "learning_rate": 6.43622606590882e-06, + "loss": 1.6335743, + "memory(GiB)": 107.26, + "step": 34445, + "train_speed(iter/s)": 1.636369 + }, + { + "acc": 0.64141674, + "epoch": 0.873921867072552, + "grad_norm": 5.59375, + "learning_rate": 6.4352216027138125e-06, + "loss": 1.58018131, + "memory(GiB)": 107.26, + "step": 34450, + "train_speed(iter/s)": 1.636395 + }, + { + "acc": 0.64842167, + "epoch": 0.874048706240487, + "grad_norm": 6.375, + "learning_rate": 6.434217076391351e-06, + "loss": 1.61761398, + "memory(GiB)": 107.26, + "step": 34455, + "train_speed(iter/s)": 1.636421 + }, + { + "acc": 0.6538404, + "epoch": 0.8741755454084221, + "grad_norm": 5.40625, + "learning_rate": 6.433212486985618e-06, + "loss": 1.60644569, + "memory(GiB)": 107.26, + "step": 34460, + "train_speed(iter/s)": 1.636447 + }, + { + "acc": 0.65105915, + "epoch": 0.8743023845763572, + "grad_norm": 4.90625, + "learning_rate": 6.432207834540802e-06, + "loss": 1.61576042, + "memory(GiB)": 107.26, + "step": 34465, + "train_speed(iter/s)": 1.636473 + }, + { + "acc": 0.66844931, + "epoch": 0.8744292237442922, + "grad_norm": 4.875, + "learning_rate": 6.431203119101093e-06, + "loss": 1.58085232, + "memory(GiB)": 107.26, + "step": 34470, + "train_speed(iter/s)": 1.636497 + }, + { + "acc": 0.64565792, + "epoch": 0.8745560629122273, + "grad_norm": 5.78125, + "learning_rate": 6.430198340710677e-06, + "loss": 1.59852791, + "memory(GiB)": 107.26, + "step": 34475, + "train_speed(iter/s)": 1.636523 + }, + { + "acc": 0.66981173, + "epoch": 0.8746829020801623, + "grad_norm": 5.90625, + "learning_rate": 6.4291934994137566e-06, + "loss": 1.56081276, + "memory(GiB)": 107.26, + "step": 34480, + "train_speed(iter/s)": 1.636549 + }, + { + "acc": 0.66475592, + "epoch": 0.8748097412480974, + "grad_norm": 6.65625, + "learning_rate": 6.428188595254521e-06, + "loss": 1.58975334, + "memory(GiB)": 107.26, + "step": 34485, + "train_speed(iter/s)": 1.636575 + }, + { + "acc": 0.67063007, + "epoch": 0.8749365804160325, + "grad_norm": 7.03125, + "learning_rate": 6.427183628277178e-06, + "loss": 1.61651955, + "memory(GiB)": 107.26, + "step": 34490, + "train_speed(iter/s)": 1.636602 + }, + { + "acc": 0.6530767, + "epoch": 0.8750634195839675, + "grad_norm": 5.21875, + "learning_rate": 6.426178598525925e-06, + "loss": 1.65785427, + "memory(GiB)": 107.26, + "step": 34495, + "train_speed(iter/s)": 1.636628 + }, + { + "acc": 0.65333533, + "epoch": 0.8751902587519026, + "grad_norm": 4.8125, + "learning_rate": 6.4251735060449725e-06, + "loss": 1.66282806, + "memory(GiB)": 107.26, + "step": 34500, + "train_speed(iter/s)": 1.636656 + }, + { + "acc": 0.64274106, + "epoch": 0.8753170979198377, + "grad_norm": 5.34375, + "learning_rate": 6.424168350878524e-06, + "loss": 1.62472115, + "memory(GiB)": 107.26, + "step": 34505, + "train_speed(iter/s)": 1.636681 + }, + { + "acc": 0.66486292, + "epoch": 0.8754439370877727, + "grad_norm": 6.875, + "learning_rate": 6.423163133070792e-06, + "loss": 1.53483925, + "memory(GiB)": 107.26, + "step": 34510, + "train_speed(iter/s)": 1.636708 + }, + { + "acc": 0.66005802, + "epoch": 0.8755707762557078, + "grad_norm": 5.75, + "learning_rate": 6.422157852665993e-06, + "loss": 1.66071949, + "memory(GiB)": 107.26, + "step": 34515, + "train_speed(iter/s)": 1.636734 + }, + { + "acc": 0.65810995, + "epoch": 0.8756976154236428, + "grad_norm": 5.59375, + "learning_rate": 6.421152509708342e-06, + "loss": 1.53953047, + "memory(GiB)": 107.26, + "step": 34520, + "train_speed(iter/s)": 1.636759 + }, + { + "acc": 0.6590312, + "epoch": 0.8758244545915779, + "grad_norm": 6.625, + "learning_rate": 6.4201471042420595e-06, + "loss": 1.578829, + "memory(GiB)": 107.26, + "step": 34525, + "train_speed(iter/s)": 1.636786 + }, + { + "acc": 0.65072832, + "epoch": 0.875951293759513, + "grad_norm": 5.125, + "learning_rate": 6.419141636311366e-06, + "loss": 1.56273546, + "memory(GiB)": 107.26, + "step": 34530, + "train_speed(iter/s)": 1.636812 + }, + { + "acc": 0.65237608, + "epoch": 0.876078132927448, + "grad_norm": 5.90625, + "learning_rate": 6.4181361059604875e-06, + "loss": 1.5652914, + "memory(GiB)": 107.26, + "step": 34535, + "train_speed(iter/s)": 1.636838 + }, + { + "acc": 0.64299164, + "epoch": 0.8762049720953831, + "grad_norm": 5.59375, + "learning_rate": 6.4171305132336515e-06, + "loss": 1.64306889, + "memory(GiB)": 107.26, + "step": 34540, + "train_speed(iter/s)": 1.636864 + }, + { + "acc": 0.65462313, + "epoch": 0.8763318112633182, + "grad_norm": 5.0, + "learning_rate": 6.416124858175088e-06, + "loss": 1.57467413, + "memory(GiB)": 107.26, + "step": 34545, + "train_speed(iter/s)": 1.636891 + }, + { + "acc": 0.6615304, + "epoch": 0.8764586504312532, + "grad_norm": 5.59375, + "learning_rate": 6.415119140829031e-06, + "loss": 1.6883213, + "memory(GiB)": 107.26, + "step": 34550, + "train_speed(iter/s)": 1.636919 + }, + { + "acc": 0.6573936, + "epoch": 0.8765854895991883, + "grad_norm": 5.59375, + "learning_rate": 6.414113361239715e-06, + "loss": 1.5804142, + "memory(GiB)": 107.26, + "step": 34555, + "train_speed(iter/s)": 1.636944 + }, + { + "acc": 0.66936827, + "epoch": 0.8767123287671232, + "grad_norm": 5.6875, + "learning_rate": 6.4131075194513825e-06, + "loss": 1.5039279, + "memory(GiB)": 107.26, + "step": 34560, + "train_speed(iter/s)": 1.636969 + }, + { + "acc": 0.67082405, + "epoch": 0.8768391679350583, + "grad_norm": 4.90625, + "learning_rate": 6.41210161550827e-06, + "loss": 1.40923691, + "memory(GiB)": 107.26, + "step": 34565, + "train_speed(iter/s)": 1.636995 + }, + { + "acc": 0.65074606, + "epoch": 0.8769660071029934, + "grad_norm": 5.03125, + "learning_rate": 6.411095649454626e-06, + "loss": 1.58685188, + "memory(GiB)": 107.26, + "step": 34570, + "train_speed(iter/s)": 1.637021 + }, + { + "acc": 0.65319657, + "epoch": 0.8770928462709284, + "grad_norm": 6.15625, + "learning_rate": 6.410089621334693e-06, + "loss": 1.63718605, + "memory(GiB)": 107.26, + "step": 34575, + "train_speed(iter/s)": 1.637047 + }, + { + "acc": 0.65627975, + "epoch": 0.8772196854388635, + "grad_norm": 5.15625, + "learning_rate": 6.4090835311927236e-06, + "loss": 1.55061054, + "memory(GiB)": 107.26, + "step": 34580, + "train_speed(iter/s)": 1.637071 + }, + { + "acc": 0.67677021, + "epoch": 0.8773465246067986, + "grad_norm": 5.28125, + "learning_rate": 6.40807737907297e-06, + "loss": 1.58040161, + "memory(GiB)": 107.26, + "step": 34585, + "train_speed(iter/s)": 1.637097 + }, + { + "acc": 0.65182686, + "epoch": 0.8774733637747336, + "grad_norm": 7.15625, + "learning_rate": 6.407071165019686e-06, + "loss": 1.64803467, + "memory(GiB)": 107.26, + "step": 34590, + "train_speed(iter/s)": 1.637121 + }, + { + "acc": 0.66373453, + "epoch": 0.8776002029426687, + "grad_norm": 5.125, + "learning_rate": 6.40606488907713e-06, + "loss": 1.58467455, + "memory(GiB)": 107.26, + "step": 34595, + "train_speed(iter/s)": 1.637148 + }, + { + "acc": 0.64267673, + "epoch": 0.8777270421106037, + "grad_norm": 5.03125, + "learning_rate": 6.4050585512895624e-06, + "loss": 1.63575554, + "memory(GiB)": 107.26, + "step": 34600, + "train_speed(iter/s)": 1.637174 + }, + { + "acc": 0.66226158, + "epoch": 0.8778538812785388, + "grad_norm": 5.4375, + "learning_rate": 6.4040521517012475e-06, + "loss": 1.55161991, + "memory(GiB)": 107.26, + "step": 34605, + "train_speed(iter/s)": 1.6372 + }, + { + "acc": 0.64370594, + "epoch": 0.8779807204464739, + "grad_norm": 6.25, + "learning_rate": 6.40304569035645e-06, + "loss": 1.62557526, + "memory(GiB)": 107.26, + "step": 34610, + "train_speed(iter/s)": 1.637226 + }, + { + "acc": 0.65586767, + "epoch": 0.8781075596144089, + "grad_norm": 5.6875, + "learning_rate": 6.402039167299439e-06, + "loss": 1.61429729, + "memory(GiB)": 107.26, + "step": 34615, + "train_speed(iter/s)": 1.637254 + }, + { + "acc": 0.64672518, + "epoch": 0.878234398782344, + "grad_norm": 5.53125, + "learning_rate": 6.401032582574485e-06, + "loss": 1.62737198, + "memory(GiB)": 107.26, + "step": 34620, + "train_speed(iter/s)": 1.63728 + }, + { + "acc": 0.65626516, + "epoch": 0.8783612379502791, + "grad_norm": 6.96875, + "learning_rate": 6.400025936225862e-06, + "loss": 1.58581991, + "memory(GiB)": 107.26, + "step": 34625, + "train_speed(iter/s)": 1.637306 + }, + { + "acc": 0.64161911, + "epoch": 0.8784880771182141, + "grad_norm": 5.21875, + "learning_rate": 6.399019228297851e-06, + "loss": 1.65909843, + "memory(GiB)": 107.26, + "step": 34630, + "train_speed(iter/s)": 1.637332 + }, + { + "acc": 0.66053019, + "epoch": 0.8786149162861492, + "grad_norm": 8.0, + "learning_rate": 6.398012458834724e-06, + "loss": 1.57179422, + "memory(GiB)": 107.26, + "step": 34635, + "train_speed(iter/s)": 1.637357 + }, + { + "acc": 0.64921684, + "epoch": 0.8787417554540842, + "grad_norm": 4.96875, + "learning_rate": 6.397005627880771e-06, + "loss": 1.5992384, + "memory(GiB)": 107.26, + "step": 34640, + "train_speed(iter/s)": 1.637382 + }, + { + "acc": 0.65615377, + "epoch": 0.8788685946220193, + "grad_norm": 6.5, + "learning_rate": 6.395998735480271e-06, + "loss": 1.64360905, + "memory(GiB)": 107.26, + "step": 34645, + "train_speed(iter/s)": 1.637408 + }, + { + "acc": 0.65786705, + "epoch": 0.8789954337899544, + "grad_norm": 5.0625, + "learning_rate": 6.394991781677516e-06, + "loss": 1.53924789, + "memory(GiB)": 107.26, + "step": 34650, + "train_speed(iter/s)": 1.637431 + }, + { + "acc": 0.64682426, + "epoch": 0.8791222729578894, + "grad_norm": 5.6875, + "learning_rate": 6.393984766516792e-06, + "loss": 1.64797173, + "memory(GiB)": 107.26, + "step": 34655, + "train_speed(iter/s)": 1.637457 + }, + { + "acc": 0.64263763, + "epoch": 0.8792491121258245, + "grad_norm": 5.8125, + "learning_rate": 6.392977690042395e-06, + "loss": 1.73442726, + "memory(GiB)": 107.26, + "step": 34660, + "train_speed(iter/s)": 1.637484 + }, + { + "acc": 0.65896811, + "epoch": 0.8793759512937596, + "grad_norm": 6.84375, + "learning_rate": 6.3919705522986205e-06, + "loss": 1.59436302, + "memory(GiB)": 107.26, + "step": 34665, + "train_speed(iter/s)": 1.637508 + }, + { + "acc": 0.6781909, + "epoch": 0.8795027904616946, + "grad_norm": 5.40625, + "learning_rate": 6.390963353329767e-06, + "loss": 1.49957829, + "memory(GiB)": 107.26, + "step": 34670, + "train_speed(iter/s)": 1.637534 + }, + { + "acc": 0.63663464, + "epoch": 0.8796296296296297, + "grad_norm": 5.125, + "learning_rate": 6.389956093180134e-06, + "loss": 1.66632843, + "memory(GiB)": 107.26, + "step": 34675, + "train_speed(iter/s)": 1.637559 + }, + { + "acc": 0.66338162, + "epoch": 0.8797564687975646, + "grad_norm": 5.34375, + "learning_rate": 6.388948771894025e-06, + "loss": 1.58040829, + "memory(GiB)": 107.26, + "step": 34680, + "train_speed(iter/s)": 1.637585 + }, + { + "acc": 0.66679783, + "epoch": 0.8798833079654997, + "grad_norm": 5.15625, + "learning_rate": 6.38794138951575e-06, + "loss": 1.58340797, + "memory(GiB)": 107.26, + "step": 34685, + "train_speed(iter/s)": 1.637611 + }, + { + "acc": 0.66828232, + "epoch": 0.8800101471334348, + "grad_norm": 6.21875, + "learning_rate": 6.386933946089615e-06, + "loss": 1.58165216, + "memory(GiB)": 107.26, + "step": 34690, + "train_speed(iter/s)": 1.637637 + }, + { + "acc": 0.66285686, + "epoch": 0.8801369863013698, + "grad_norm": 6.46875, + "learning_rate": 6.385926441659933e-06, + "loss": 1.61361046, + "memory(GiB)": 107.26, + "step": 34695, + "train_speed(iter/s)": 1.637663 + }, + { + "acc": 0.65112848, + "epoch": 0.8802638254693049, + "grad_norm": 6.59375, + "learning_rate": 6.38491887627102e-06, + "loss": 1.65015106, + "memory(GiB)": 107.26, + "step": 34700, + "train_speed(iter/s)": 1.637687 + }, + { + "acc": 0.65056801, + "epoch": 0.88039066463724, + "grad_norm": 6.65625, + "learning_rate": 6.383911249967188e-06, + "loss": 1.61523705, + "memory(GiB)": 107.26, + "step": 34705, + "train_speed(iter/s)": 1.637712 + }, + { + "acc": 0.65150981, + "epoch": 0.880517503805175, + "grad_norm": 5.34375, + "learning_rate": 6.382903562792764e-06, + "loss": 1.6159996, + "memory(GiB)": 107.26, + "step": 34710, + "train_speed(iter/s)": 1.637736 + }, + { + "acc": 0.63804379, + "epoch": 0.8806443429731101, + "grad_norm": 6.5625, + "learning_rate": 6.381895814792065e-06, + "loss": 1.66233826, + "memory(GiB)": 107.26, + "step": 34715, + "train_speed(iter/s)": 1.637762 + }, + { + "acc": 0.65599394, + "epoch": 0.8807711821410451, + "grad_norm": 5.5, + "learning_rate": 6.38088800600942e-06, + "loss": 1.60345058, + "memory(GiB)": 107.26, + "step": 34720, + "train_speed(iter/s)": 1.637789 + }, + { + "acc": 0.65587921, + "epoch": 0.8808980213089802, + "grad_norm": 5.46875, + "learning_rate": 6.3798801364891535e-06, + "loss": 1.63268394, + "memory(GiB)": 107.26, + "step": 34725, + "train_speed(iter/s)": 1.637815 + }, + { + "acc": 0.64889307, + "epoch": 0.8810248604769153, + "grad_norm": 6.09375, + "learning_rate": 6.378872206275599e-06, + "loss": 1.61623306, + "memory(GiB)": 107.26, + "step": 34730, + "train_speed(iter/s)": 1.637841 + }, + { + "acc": 0.6586792, + "epoch": 0.8811516996448503, + "grad_norm": 5.6875, + "learning_rate": 6.377864215413088e-06, + "loss": 1.55771284, + "memory(GiB)": 107.26, + "step": 34735, + "train_speed(iter/s)": 1.637865 + }, + { + "acc": 0.66447821, + "epoch": 0.8812785388127854, + "grad_norm": 6.84375, + "learning_rate": 6.376856163945957e-06, + "loss": 1.63500576, + "memory(GiB)": 107.26, + "step": 34740, + "train_speed(iter/s)": 1.63789 + }, + { + "acc": 0.65884371, + "epoch": 0.8814053779807205, + "grad_norm": 6.53125, + "learning_rate": 6.375848051918546e-06, + "loss": 1.59303045, + "memory(GiB)": 107.26, + "step": 34745, + "train_speed(iter/s)": 1.637915 + }, + { + "acc": 0.65938687, + "epoch": 0.8815322171486555, + "grad_norm": 5.21875, + "learning_rate": 6.374839879375194e-06, + "loss": 1.56345606, + "memory(GiB)": 107.26, + "step": 34750, + "train_speed(iter/s)": 1.637938 + }, + { + "acc": 0.64880505, + "epoch": 0.8816590563165906, + "grad_norm": 5.0625, + "learning_rate": 6.373831646360245e-06, + "loss": 1.5778017, + "memory(GiB)": 107.26, + "step": 34755, + "train_speed(iter/s)": 1.637965 + }, + { + "acc": 0.65841331, + "epoch": 0.8817858954845256, + "grad_norm": 5.9375, + "learning_rate": 6.372823352918048e-06, + "loss": 1.59769955, + "memory(GiB)": 107.26, + "step": 34760, + "train_speed(iter/s)": 1.637992 + }, + { + "acc": 0.65031161, + "epoch": 0.8819127346524607, + "grad_norm": 5.59375, + "learning_rate": 6.371814999092951e-06, + "loss": 1.60031586, + "memory(GiB)": 107.26, + "step": 34765, + "train_speed(iter/s)": 1.638018 + }, + { + "acc": 0.65365033, + "epoch": 0.8820395738203958, + "grad_norm": 6.53125, + "learning_rate": 6.370806584929305e-06, + "loss": 1.64977074, + "memory(GiB)": 107.26, + "step": 34770, + "train_speed(iter/s)": 1.638045 + }, + { + "acc": 0.65816007, + "epoch": 0.8821664129883308, + "grad_norm": 4.5625, + "learning_rate": 6.369798110471463e-06, + "loss": 1.59827347, + "memory(GiB)": 107.26, + "step": 34775, + "train_speed(iter/s)": 1.638069 + }, + { + "acc": 0.65530052, + "epoch": 0.8822932521562659, + "grad_norm": 6.90625, + "learning_rate": 6.368789575763787e-06, + "loss": 1.56892929, + "memory(GiB)": 107.26, + "step": 34780, + "train_speed(iter/s)": 1.638094 + }, + { + "acc": 0.65165534, + "epoch": 0.882420091324201, + "grad_norm": 6.3125, + "learning_rate": 6.367780980850633e-06, + "loss": 1.57746811, + "memory(GiB)": 107.26, + "step": 34785, + "train_speed(iter/s)": 1.638119 + }, + { + "acc": 0.65830736, + "epoch": 0.882546930492136, + "grad_norm": 6.59375, + "learning_rate": 6.366772325776367e-06, + "loss": 1.65471096, + "memory(GiB)": 107.26, + "step": 34790, + "train_speed(iter/s)": 1.638144 + }, + { + "acc": 0.65235801, + "epoch": 0.882673769660071, + "grad_norm": 5.9375, + "learning_rate": 6.365763610585349e-06, + "loss": 1.5687458, + "memory(GiB)": 107.26, + "step": 34795, + "train_speed(iter/s)": 1.638169 + }, + { + "acc": 0.67540703, + "epoch": 0.882800608828006, + "grad_norm": 6.25, + "learning_rate": 6.3647548353219515e-06, + "loss": 1.57441368, + "memory(GiB)": 107.26, + "step": 34800, + "train_speed(iter/s)": 1.638196 + }, + { + "acc": 0.66891437, + "epoch": 0.8829274479959411, + "grad_norm": 6.625, + "learning_rate": 6.363746000030543e-06, + "loss": 1.63154259, + "memory(GiB)": 107.26, + "step": 34805, + "train_speed(iter/s)": 1.638221 + }, + { + "acc": 0.65212326, + "epoch": 0.8830542871638762, + "grad_norm": 6.53125, + "learning_rate": 6.362737104755497e-06, + "loss": 1.62655983, + "memory(GiB)": 107.26, + "step": 34810, + "train_speed(iter/s)": 1.638248 + }, + { + "acc": 0.66751871, + "epoch": 0.8831811263318112, + "grad_norm": 6.28125, + "learning_rate": 6.361728149541188e-06, + "loss": 1.59664602, + "memory(GiB)": 107.26, + "step": 34815, + "train_speed(iter/s)": 1.638271 + }, + { + "acc": 0.6539156, + "epoch": 0.8833079654997463, + "grad_norm": 5.59375, + "learning_rate": 6.360719134431995e-06, + "loss": 1.6347332, + "memory(GiB)": 107.26, + "step": 34820, + "train_speed(iter/s)": 1.638298 + }, + { + "acc": 0.67178602, + "epoch": 0.8834348046676814, + "grad_norm": 7.25, + "learning_rate": 6.359710059472299e-06, + "loss": 1.59146099, + "memory(GiB)": 107.26, + "step": 34825, + "train_speed(iter/s)": 1.638324 + }, + { + "acc": 0.65588737, + "epoch": 0.8835616438356164, + "grad_norm": 5.90625, + "learning_rate": 6.358700924706486e-06, + "loss": 1.61237297, + "memory(GiB)": 107.26, + "step": 34830, + "train_speed(iter/s)": 1.638349 + }, + { + "acc": 0.65609603, + "epoch": 0.8836884830035515, + "grad_norm": 5.9375, + "learning_rate": 6.357691730178939e-06, + "loss": 1.61122551, + "memory(GiB)": 107.26, + "step": 34835, + "train_speed(iter/s)": 1.638374 + }, + { + "acc": 0.64841795, + "epoch": 0.8838153221714865, + "grad_norm": 5.1875, + "learning_rate": 6.356682475934048e-06, + "loss": 1.67296028, + "memory(GiB)": 107.26, + "step": 34840, + "train_speed(iter/s)": 1.6384 + }, + { + "acc": 0.65464005, + "epoch": 0.8839421613394216, + "grad_norm": 6.375, + "learning_rate": 6.3556731620162036e-06, + "loss": 1.58997316, + "memory(GiB)": 107.26, + "step": 34845, + "train_speed(iter/s)": 1.638424 + }, + { + "acc": 0.66694059, + "epoch": 0.8840690005073567, + "grad_norm": 7.03125, + "learning_rate": 6.354663788469803e-06, + "loss": 1.59241447, + "memory(GiB)": 107.26, + "step": 34850, + "train_speed(iter/s)": 1.638448 + }, + { + "acc": 0.65776868, + "epoch": 0.8841958396752917, + "grad_norm": 6.25, + "learning_rate": 6.353654355339238e-06, + "loss": 1.63055115, + "memory(GiB)": 107.26, + "step": 34855, + "train_speed(iter/s)": 1.638473 + }, + { + "acc": 0.65780544, + "epoch": 0.8843226788432268, + "grad_norm": 6.09375, + "learning_rate": 6.352644862668914e-06, + "loss": 1.58074188, + "memory(GiB)": 107.26, + "step": 34860, + "train_speed(iter/s)": 1.638496 + }, + { + "acc": 0.6692008, + "epoch": 0.8844495180111619, + "grad_norm": 5.5625, + "learning_rate": 6.351635310503228e-06, + "loss": 1.58006744, + "memory(GiB)": 107.26, + "step": 34865, + "train_speed(iter/s)": 1.63852 + }, + { + "acc": 0.66118078, + "epoch": 0.8845763571790969, + "grad_norm": 6.875, + "learning_rate": 6.3506256988865865e-06, + "loss": 1.5850481, + "memory(GiB)": 107.26, + "step": 34870, + "train_speed(iter/s)": 1.638548 + }, + { + "acc": 0.67588987, + "epoch": 0.884703196347032, + "grad_norm": 5.15625, + "learning_rate": 6.349616027863397e-06, + "loss": 1.51588545, + "memory(GiB)": 107.26, + "step": 34875, + "train_speed(iter/s)": 1.638571 + }, + { + "acc": 0.65689621, + "epoch": 0.884830035514967, + "grad_norm": 5.15625, + "learning_rate": 6.34860629747807e-06, + "loss": 1.60547981, + "memory(GiB)": 107.26, + "step": 34880, + "train_speed(iter/s)": 1.638597 + }, + { + "acc": 0.64912934, + "epoch": 0.8849568746829021, + "grad_norm": 7.53125, + "learning_rate": 6.347596507775016e-06, + "loss": 1.60005016, + "memory(GiB)": 107.26, + "step": 34885, + "train_speed(iter/s)": 1.638622 + }, + { + "acc": 0.65871048, + "epoch": 0.8850837138508372, + "grad_norm": 5.09375, + "learning_rate": 6.3465866587986505e-06, + "loss": 1.51602583, + "memory(GiB)": 107.26, + "step": 34890, + "train_speed(iter/s)": 1.638645 + }, + { + "acc": 0.66990614, + "epoch": 0.8852105530187722, + "grad_norm": 5.0625, + "learning_rate": 6.345576750593392e-06, + "loss": 1.55591345, + "memory(GiB)": 107.26, + "step": 34895, + "train_speed(iter/s)": 1.638669 + }, + { + "acc": 0.64289408, + "epoch": 0.8853373921867073, + "grad_norm": 7.125, + "learning_rate": 6.34456678320366e-06, + "loss": 1.67922363, + "memory(GiB)": 107.26, + "step": 34900, + "train_speed(iter/s)": 1.638695 + }, + { + "acc": 0.64900303, + "epoch": 0.8854642313546424, + "grad_norm": 4.875, + "learning_rate": 6.343556756673879e-06, + "loss": 1.64148579, + "memory(GiB)": 107.26, + "step": 34905, + "train_speed(iter/s)": 1.63872 + }, + { + "acc": 0.65437651, + "epoch": 0.8855910705225774, + "grad_norm": 4.75, + "learning_rate": 6.3425466710484726e-06, + "loss": 1.59104872, + "memory(GiB)": 107.26, + "step": 34910, + "train_speed(iter/s)": 1.638746 + }, + { + "acc": 0.64971294, + "epoch": 0.8857179096905125, + "grad_norm": 5.8125, + "learning_rate": 6.3415365263718686e-06, + "loss": 1.6510519, + "memory(GiB)": 107.26, + "step": 34915, + "train_speed(iter/s)": 1.638768 + }, + { + "acc": 0.65020671, + "epoch": 0.8858447488584474, + "grad_norm": 5.3125, + "learning_rate": 6.340526322688501e-06, + "loss": 1.58353453, + "memory(GiB)": 107.26, + "step": 34920, + "train_speed(iter/s)": 1.638793 + }, + { + "acc": 0.66052647, + "epoch": 0.8859715880263825, + "grad_norm": 6.6875, + "learning_rate": 6.339516060042798e-06, + "loss": 1.62916718, + "memory(GiB)": 107.26, + "step": 34925, + "train_speed(iter/s)": 1.638817 + }, + { + "acc": 0.64058785, + "epoch": 0.8860984271943176, + "grad_norm": 5.625, + "learning_rate": 6.3385057384792e-06, + "loss": 1.64945335, + "memory(GiB)": 107.26, + "step": 34930, + "train_speed(iter/s)": 1.638842 + }, + { + "acc": 0.64525433, + "epoch": 0.8862252663622526, + "grad_norm": 6.6875, + "learning_rate": 6.337495358042143e-06, + "loss": 1.6395565, + "memory(GiB)": 107.26, + "step": 34935, + "train_speed(iter/s)": 1.638868 + }, + { + "acc": 0.65324221, + "epoch": 0.8863521055301877, + "grad_norm": 5.4375, + "learning_rate": 6.336484918776069e-06, + "loss": 1.63604259, + "memory(GiB)": 107.26, + "step": 34940, + "train_speed(iter/s)": 1.638894 + }, + { + "acc": 0.66482482, + "epoch": 0.8864789446981228, + "grad_norm": 5.0625, + "learning_rate": 6.335474420725421e-06, + "loss": 1.59167957, + "memory(GiB)": 107.26, + "step": 34945, + "train_speed(iter/s)": 1.63892 + }, + { + "acc": 0.64228411, + "epoch": 0.8866057838660578, + "grad_norm": 5.40625, + "learning_rate": 6.334463863934646e-06, + "loss": 1.64272976, + "memory(GiB)": 107.26, + "step": 34950, + "train_speed(iter/s)": 1.638946 + }, + { + "acc": 0.65288992, + "epoch": 0.8867326230339929, + "grad_norm": 5.09375, + "learning_rate": 6.333453248448192e-06, + "loss": 1.5728159, + "memory(GiB)": 107.26, + "step": 34955, + "train_speed(iter/s)": 1.63897 + }, + { + "acc": 0.63415837, + "epoch": 0.8868594622019279, + "grad_norm": 5.59375, + "learning_rate": 6.33244257431051e-06, + "loss": 1.76514473, + "memory(GiB)": 107.26, + "step": 34960, + "train_speed(iter/s)": 1.638997 + }, + { + "acc": 0.65475917, + "epoch": 0.886986301369863, + "grad_norm": 5.21875, + "learning_rate": 6.331431841566056e-06, + "loss": 1.63356133, + "memory(GiB)": 107.26, + "step": 34965, + "train_speed(iter/s)": 1.639023 + }, + { + "acc": 0.65832453, + "epoch": 0.8871131405377981, + "grad_norm": 6.09375, + "learning_rate": 6.330421050259283e-06, + "loss": 1.61892357, + "memory(GiB)": 107.26, + "step": 34970, + "train_speed(iter/s)": 1.63905 + }, + { + "acc": 0.65398064, + "epoch": 0.8872399797057331, + "grad_norm": 6.53125, + "learning_rate": 6.329410200434655e-06, + "loss": 1.54905472, + "memory(GiB)": 107.26, + "step": 34975, + "train_speed(iter/s)": 1.639074 + }, + { + "acc": 0.65093765, + "epoch": 0.8873668188736682, + "grad_norm": 5.65625, + "learning_rate": 6.328399292136629e-06, + "loss": 1.66408138, + "memory(GiB)": 107.26, + "step": 34980, + "train_speed(iter/s)": 1.639098 + }, + { + "acc": 0.65008602, + "epoch": 0.8874936580416033, + "grad_norm": 6.15625, + "learning_rate": 6.327388325409672e-06, + "loss": 1.64595585, + "memory(GiB)": 107.26, + "step": 34985, + "train_speed(iter/s)": 1.639124 + }, + { + "acc": 0.63527822, + "epoch": 0.8876204972095383, + "grad_norm": 5.09375, + "learning_rate": 6.326377300298251e-06, + "loss": 1.63354034, + "memory(GiB)": 107.26, + "step": 34990, + "train_speed(iter/s)": 1.639149 + }, + { + "acc": 0.66591129, + "epoch": 0.8877473363774734, + "grad_norm": 5.5625, + "learning_rate": 6.325366216846832e-06, + "loss": 1.58777866, + "memory(GiB)": 107.26, + "step": 34995, + "train_speed(iter/s)": 1.639175 + }, + { + "acc": 0.66433449, + "epoch": 0.8878741755454084, + "grad_norm": 6.59375, + "learning_rate": 6.324355075099893e-06, + "loss": 1.65761032, + "memory(GiB)": 107.26, + "step": 35000, + "train_speed(iter/s)": 1.6392 + }, + { + "epoch": 0.8878741755454084, + "eval_acc": 0.6458274599723925, + "eval_loss": 1.5753097534179688, + "eval_runtime": 57.8911, + "eval_samples_per_second": 110.034, + "eval_steps_per_second": 27.517, + "step": 35000 + }, + { + "acc": 0.65280848, + "epoch": 0.8880010147133435, + "grad_norm": 5.28125, + "learning_rate": 6.3233438751019016e-06, + "loss": 1.63273716, + "memory(GiB)": 107.26, + "step": 35005, + "train_speed(iter/s)": 1.634472 + }, + { + "acc": 0.66387553, + "epoch": 0.8881278538812786, + "grad_norm": 7.71875, + "learning_rate": 6.322332616897341e-06, + "loss": 1.57154799, + "memory(GiB)": 107.26, + "step": 35010, + "train_speed(iter/s)": 1.634498 + }, + { + "acc": 0.66874261, + "epoch": 0.8882546930492136, + "grad_norm": 7.3125, + "learning_rate": 6.321321300530685e-06, + "loss": 1.46993999, + "memory(GiB)": 107.26, + "step": 35015, + "train_speed(iter/s)": 1.634523 + }, + { + "acc": 0.66598988, + "epoch": 0.8883815322171487, + "grad_norm": 6.5625, + "learning_rate": 6.320309926046421e-06, + "loss": 1.58416843, + "memory(GiB)": 107.26, + "step": 35020, + "train_speed(iter/s)": 1.63455 + }, + { + "acc": 0.63891273, + "epoch": 0.8885083713850838, + "grad_norm": 6.71875, + "learning_rate": 6.319298493489032e-06, + "loss": 1.6747057, + "memory(GiB)": 107.26, + "step": 35025, + "train_speed(iter/s)": 1.634576 + }, + { + "acc": 0.65055408, + "epoch": 0.8886352105530188, + "grad_norm": 6.5625, + "learning_rate": 6.318287002903004e-06, + "loss": 1.58125954, + "memory(GiB)": 107.26, + "step": 35030, + "train_speed(iter/s)": 1.634601 + }, + { + "acc": 0.63083124, + "epoch": 0.8887620497209539, + "grad_norm": 6.0, + "learning_rate": 6.317275454332829e-06, + "loss": 1.64551086, + "memory(GiB)": 107.26, + "step": 35035, + "train_speed(iter/s)": 1.634627 + }, + { + "acc": 0.64888906, + "epoch": 0.8888888888888888, + "grad_norm": 9.6875, + "learning_rate": 6.3162638478229965e-06, + "loss": 1.64853363, + "memory(GiB)": 107.26, + "step": 35040, + "train_speed(iter/s)": 1.634653 + }, + { + "acc": 0.65330877, + "epoch": 0.8890157280568239, + "grad_norm": 6.125, + "learning_rate": 6.315252183418005e-06, + "loss": 1.60603943, + "memory(GiB)": 107.26, + "step": 35045, + "train_speed(iter/s)": 1.634678 + }, + { + "acc": 0.64801445, + "epoch": 0.889142567224759, + "grad_norm": 5.125, + "learning_rate": 6.31424046116235e-06, + "loss": 1.71050892, + "memory(GiB)": 107.26, + "step": 35050, + "train_speed(iter/s)": 1.634707 + }, + { + "acc": 0.65057969, + "epoch": 0.889269406392694, + "grad_norm": 5.21875, + "learning_rate": 6.313228681100532e-06, + "loss": 1.62970982, + "memory(GiB)": 107.26, + "step": 35055, + "train_speed(iter/s)": 1.634732 + }, + { + "acc": 0.6642458, + "epoch": 0.8893962455606291, + "grad_norm": 5.625, + "learning_rate": 6.312216843277052e-06, + "loss": 1.56750851, + "memory(GiB)": 107.26, + "step": 35060, + "train_speed(iter/s)": 1.634758 + }, + { + "acc": 0.620889, + "epoch": 0.8895230847285642, + "grad_norm": 5.78125, + "learning_rate": 6.3112049477364165e-06, + "loss": 1.71920853, + "memory(GiB)": 107.26, + "step": 35065, + "train_speed(iter/s)": 1.634784 + }, + { + "acc": 0.65542688, + "epoch": 0.8896499238964992, + "grad_norm": 6.625, + "learning_rate": 6.310192994523137e-06, + "loss": 1.64014091, + "memory(GiB)": 107.26, + "step": 35070, + "train_speed(iter/s)": 1.634809 + }, + { + "acc": 0.64823637, + "epoch": 0.8897767630644343, + "grad_norm": 5.5625, + "learning_rate": 6.309180983681716e-06, + "loss": 1.61887169, + "memory(GiB)": 107.26, + "step": 35075, + "train_speed(iter/s)": 1.634835 + }, + { + "acc": 0.64675283, + "epoch": 0.8899036022323693, + "grad_norm": 5.375, + "learning_rate": 6.308168915256671e-06, + "loss": 1.67565269, + "memory(GiB)": 107.26, + "step": 35080, + "train_speed(iter/s)": 1.634858 + }, + { + "acc": 0.6456954, + "epoch": 0.8900304414003044, + "grad_norm": 6.28125, + "learning_rate": 6.307156789292518e-06, + "loss": 1.65382309, + "memory(GiB)": 107.26, + "step": 35085, + "train_speed(iter/s)": 1.634886 + }, + { + "acc": 0.66452851, + "epoch": 0.8901572805682395, + "grad_norm": 5.625, + "learning_rate": 6.306144605833773e-06, + "loss": 1.57362566, + "memory(GiB)": 107.26, + "step": 35090, + "train_speed(iter/s)": 1.634915 + }, + { + "acc": 0.63521528, + "epoch": 0.8902841197361745, + "grad_norm": 6.03125, + "learning_rate": 6.305132364924955e-06, + "loss": 1.62159691, + "memory(GiB)": 107.26, + "step": 35095, + "train_speed(iter/s)": 1.634941 + }, + { + "acc": 0.64541359, + "epoch": 0.8904109589041096, + "grad_norm": 6.53125, + "learning_rate": 6.3041200666105905e-06, + "loss": 1.59821301, + "memory(GiB)": 107.26, + "step": 35100, + "train_speed(iter/s)": 1.634968 + }, + { + "acc": 0.65029306, + "epoch": 0.8905377980720447, + "grad_norm": 5.21875, + "learning_rate": 6.303107710935202e-06, + "loss": 1.62801361, + "memory(GiB)": 107.26, + "step": 35105, + "train_speed(iter/s)": 1.634994 + }, + { + "acc": 0.63913221, + "epoch": 0.8906646372399797, + "grad_norm": 6.28125, + "learning_rate": 6.302095297943319e-06, + "loss": 1.58933678, + "memory(GiB)": 107.26, + "step": 35110, + "train_speed(iter/s)": 1.63502 + }, + { + "acc": 0.6435895, + "epoch": 0.8907914764079148, + "grad_norm": 5.0, + "learning_rate": 6.301082827679472e-06, + "loss": 1.62738838, + "memory(GiB)": 107.26, + "step": 35115, + "train_speed(iter/s)": 1.635047 + }, + { + "acc": 0.6804801, + "epoch": 0.8909183155758498, + "grad_norm": 7.75, + "learning_rate": 6.300070300188192e-06, + "loss": 1.55457878, + "memory(GiB)": 107.26, + "step": 35120, + "train_speed(iter/s)": 1.635074 + }, + { + "acc": 0.65929065, + "epoch": 0.8910451547437849, + "grad_norm": 7.40625, + "learning_rate": 6.2990577155140164e-06, + "loss": 1.62332287, + "memory(GiB)": 107.26, + "step": 35125, + "train_speed(iter/s)": 1.635102 + }, + { + "acc": 0.65902109, + "epoch": 0.89117199391172, + "grad_norm": 6.03125, + "learning_rate": 6.298045073701483e-06, + "loss": 1.57639027, + "memory(GiB)": 107.26, + "step": 35130, + "train_speed(iter/s)": 1.635128 + }, + { + "acc": 0.65303311, + "epoch": 0.891298833079655, + "grad_norm": 5.25, + "learning_rate": 6.29703237479513e-06, + "loss": 1.59200039, + "memory(GiB)": 107.26, + "step": 35135, + "train_speed(iter/s)": 1.635152 + }, + { + "acc": 0.64996223, + "epoch": 0.8914256722475901, + "grad_norm": 5.875, + "learning_rate": 6.296019618839505e-06, + "loss": 1.64976425, + "memory(GiB)": 107.26, + "step": 35140, + "train_speed(iter/s)": 1.635177 + }, + { + "acc": 0.66934071, + "epoch": 0.8915525114155252, + "grad_norm": 5.03125, + "learning_rate": 6.295006805879149e-06, + "loss": 1.54726419, + "memory(GiB)": 107.26, + "step": 35145, + "train_speed(iter/s)": 1.635203 + }, + { + "acc": 0.65224061, + "epoch": 0.8916793505834602, + "grad_norm": 5.5, + "learning_rate": 6.293993935958613e-06, + "loss": 1.56719608, + "memory(GiB)": 107.26, + "step": 35150, + "train_speed(iter/s)": 1.635228 + }, + { + "acc": 0.65799284, + "epoch": 0.8918061897513953, + "grad_norm": 5.09375, + "learning_rate": 6.292981009122445e-06, + "loss": 1.5790575, + "memory(GiB)": 107.26, + "step": 35155, + "train_speed(iter/s)": 1.635254 + }, + { + "acc": 0.6496316, + "epoch": 0.8919330289193302, + "grad_norm": 6.4375, + "learning_rate": 6.291968025415202e-06, + "loss": 1.64907455, + "memory(GiB)": 107.26, + "step": 35160, + "train_speed(iter/s)": 1.635278 + }, + { + "acc": 0.65701466, + "epoch": 0.8920598680872653, + "grad_norm": 5.96875, + "learning_rate": 6.290954984881434e-06, + "loss": 1.59845505, + "memory(GiB)": 107.26, + "step": 35165, + "train_speed(iter/s)": 1.635302 + }, + { + "acc": 0.65795808, + "epoch": 0.8921867072552004, + "grad_norm": 6.21875, + "learning_rate": 6.289941887565703e-06, + "loss": 1.63652306, + "memory(GiB)": 107.26, + "step": 35170, + "train_speed(iter/s)": 1.635329 + }, + { + "acc": 0.67381649, + "epoch": 0.8923135464231354, + "grad_norm": 5.90625, + "learning_rate": 6.288928733512569e-06, + "loss": 1.5697506, + "memory(GiB)": 107.26, + "step": 35175, + "train_speed(iter/s)": 1.635356 + }, + { + "acc": 0.66548557, + "epoch": 0.8924403855910705, + "grad_norm": 6.15625, + "learning_rate": 6.287915522766596e-06, + "loss": 1.52901688, + "memory(GiB)": 107.26, + "step": 35180, + "train_speed(iter/s)": 1.635383 + }, + { + "acc": 0.65016232, + "epoch": 0.8925672247590056, + "grad_norm": 6.5, + "learning_rate": 6.2869022553723465e-06, + "loss": 1.59212151, + "memory(GiB)": 107.26, + "step": 35185, + "train_speed(iter/s)": 1.63541 + }, + { + "acc": 0.66490364, + "epoch": 0.8926940639269406, + "grad_norm": 5.1875, + "learning_rate": 6.285888931374391e-06, + "loss": 1.57499943, + "memory(GiB)": 107.26, + "step": 35190, + "train_speed(iter/s)": 1.635439 + }, + { + "acc": 0.6520072, + "epoch": 0.8928209030948757, + "grad_norm": 6.125, + "learning_rate": 6.284875550817299e-06, + "loss": 1.56602497, + "memory(GiB)": 107.26, + "step": 35195, + "train_speed(iter/s)": 1.635466 + }, + { + "acc": 0.6499917, + "epoch": 0.8929477422628107, + "grad_norm": 5.59375, + "learning_rate": 6.2838621137456425e-06, + "loss": 1.60754051, + "memory(GiB)": 107.26, + "step": 35200, + "train_speed(iter/s)": 1.635492 + }, + { + "acc": 0.65284986, + "epoch": 0.8930745814307458, + "grad_norm": 4.4375, + "learning_rate": 6.282848620203999e-06, + "loss": 1.58412743, + "memory(GiB)": 107.26, + "step": 35205, + "train_speed(iter/s)": 1.635519 + }, + { + "acc": 0.65429516, + "epoch": 0.8932014205986809, + "grad_norm": 5.59375, + "learning_rate": 6.2818350702369466e-06, + "loss": 1.60597458, + "memory(GiB)": 107.26, + "step": 35210, + "train_speed(iter/s)": 1.635546 + }, + { + "acc": 0.66010389, + "epoch": 0.8933282597666159, + "grad_norm": 7.8125, + "learning_rate": 6.280821463889063e-06, + "loss": 1.61679363, + "memory(GiB)": 107.26, + "step": 35215, + "train_speed(iter/s)": 1.635572 + }, + { + "acc": 0.66756291, + "epoch": 0.893455098934551, + "grad_norm": 5.53125, + "learning_rate": 6.279807801204936e-06, + "loss": 1.58928738, + "memory(GiB)": 107.26, + "step": 35220, + "train_speed(iter/s)": 1.635597 + }, + { + "acc": 0.67244115, + "epoch": 0.8935819381024861, + "grad_norm": 5.46875, + "learning_rate": 6.278794082229145e-06, + "loss": 1.58225079, + "memory(GiB)": 107.26, + "step": 35225, + "train_speed(iter/s)": 1.635622 + }, + { + "acc": 0.66417904, + "epoch": 0.8937087772704211, + "grad_norm": 5.9375, + "learning_rate": 6.2777803070062825e-06, + "loss": 1.60984669, + "memory(GiB)": 107.26, + "step": 35230, + "train_speed(iter/s)": 1.635647 + }, + { + "acc": 0.65566659, + "epoch": 0.8938356164383562, + "grad_norm": 5.65625, + "learning_rate": 6.276766475580935e-06, + "loss": 1.57599201, + "memory(GiB)": 107.26, + "step": 35235, + "train_speed(iter/s)": 1.635673 + }, + { + "acc": 0.63814325, + "epoch": 0.8939624556062912, + "grad_norm": 5.90625, + "learning_rate": 6.2757525879977e-06, + "loss": 1.70150547, + "memory(GiB)": 107.26, + "step": 35240, + "train_speed(iter/s)": 1.6357 + }, + { + "acc": 0.65425034, + "epoch": 0.8940892947742263, + "grad_norm": 6.25, + "learning_rate": 6.27473864430117e-06, + "loss": 1.6459486, + "memory(GiB)": 107.26, + "step": 35245, + "train_speed(iter/s)": 1.635726 + }, + { + "acc": 0.63694177, + "epoch": 0.8942161339421614, + "grad_norm": 5.71875, + "learning_rate": 6.273724644535942e-06, + "loss": 1.65102577, + "memory(GiB)": 107.26, + "step": 35250, + "train_speed(iter/s)": 1.635754 + }, + { + "acc": 0.65684929, + "epoch": 0.8943429731100964, + "grad_norm": 5.125, + "learning_rate": 6.272710588746619e-06, + "loss": 1.56203547, + "memory(GiB)": 107.26, + "step": 35255, + "train_speed(iter/s)": 1.635779 + }, + { + "acc": 0.64609299, + "epoch": 0.8944698122780315, + "grad_norm": 5.15625, + "learning_rate": 6.271696476977801e-06, + "loss": 1.65582352, + "memory(GiB)": 107.26, + "step": 35260, + "train_speed(iter/s)": 1.635806 + }, + { + "acc": 0.64698038, + "epoch": 0.8945966514459666, + "grad_norm": 5.65625, + "learning_rate": 6.270682309274094e-06, + "loss": 1.56543884, + "memory(GiB)": 107.26, + "step": 35265, + "train_speed(iter/s)": 1.635833 + }, + { + "acc": 0.65742726, + "epoch": 0.8947234906139016, + "grad_norm": 5.6875, + "learning_rate": 6.269668085680106e-06, + "loss": 1.58538094, + "memory(GiB)": 107.26, + "step": 35270, + "train_speed(iter/s)": 1.63586 + }, + { + "acc": 0.64529839, + "epoch": 0.8948503297818367, + "grad_norm": 6.8125, + "learning_rate": 6.268653806240448e-06, + "loss": 1.66950684, + "memory(GiB)": 107.26, + "step": 35275, + "train_speed(iter/s)": 1.635888 + }, + { + "acc": 0.65966301, + "epoch": 0.8949771689497716, + "grad_norm": 5.40625, + "learning_rate": 6.26763947099973e-06, + "loss": 1.58397522, + "memory(GiB)": 107.26, + "step": 35280, + "train_speed(iter/s)": 1.635914 + }, + { + "acc": 0.64616308, + "epoch": 0.8951040081177067, + "grad_norm": 6.46875, + "learning_rate": 6.266625080002569e-06, + "loss": 1.70310516, + "memory(GiB)": 107.26, + "step": 35285, + "train_speed(iter/s)": 1.635941 + }, + { + "acc": 0.64480777, + "epoch": 0.8952308472856418, + "grad_norm": 5.25, + "learning_rate": 6.265610633293582e-06, + "loss": 1.62725906, + "memory(GiB)": 107.26, + "step": 35290, + "train_speed(iter/s)": 1.635968 + }, + { + "acc": 0.64881401, + "epoch": 0.8953576864535768, + "grad_norm": 6.375, + "learning_rate": 6.264596130917389e-06, + "loss": 1.57150612, + "memory(GiB)": 107.26, + "step": 35295, + "train_speed(iter/s)": 1.635994 + }, + { + "acc": 0.67461615, + "epoch": 0.8954845256215119, + "grad_norm": 6.375, + "learning_rate": 6.2635815729186124e-06, + "loss": 1.5082366, + "memory(GiB)": 107.26, + "step": 35300, + "train_speed(iter/s)": 1.63602 + }, + { + "acc": 0.65389862, + "epoch": 0.895611364789447, + "grad_norm": 6.375, + "learning_rate": 6.2625669593418744e-06, + "loss": 1.60202541, + "memory(GiB)": 107.26, + "step": 35305, + "train_speed(iter/s)": 1.636048 + }, + { + "acc": 0.6394526, + "epoch": 0.895738203957382, + "grad_norm": 6.8125, + "learning_rate": 6.261552290231807e-06, + "loss": 1.61742516, + "memory(GiB)": 107.26, + "step": 35310, + "train_speed(iter/s)": 1.636075 + }, + { + "acc": 0.66084189, + "epoch": 0.8958650431253171, + "grad_norm": 5.0625, + "learning_rate": 6.260537565633037e-06, + "loss": 1.54237661, + "memory(GiB)": 107.26, + "step": 35315, + "train_speed(iter/s)": 1.636101 + }, + { + "acc": 0.65795479, + "epoch": 0.8959918822932521, + "grad_norm": 5.6875, + "learning_rate": 6.259522785590197e-06, + "loss": 1.61592808, + "memory(GiB)": 107.26, + "step": 35320, + "train_speed(iter/s)": 1.636128 + }, + { + "acc": 0.63821888, + "epoch": 0.8961187214611872, + "grad_norm": 7.5, + "learning_rate": 6.2585079501479205e-06, + "loss": 1.67298851, + "memory(GiB)": 107.26, + "step": 35325, + "train_speed(iter/s)": 1.636156 + }, + { + "acc": 0.65477619, + "epoch": 0.8962455606291223, + "grad_norm": 5.8125, + "learning_rate": 6.257493059350848e-06, + "loss": 1.57751865, + "memory(GiB)": 107.26, + "step": 35330, + "train_speed(iter/s)": 1.636184 + }, + { + "acc": 0.65645504, + "epoch": 0.8963723997970573, + "grad_norm": 4.6875, + "learning_rate": 6.256478113243613e-06, + "loss": 1.62918262, + "memory(GiB)": 107.26, + "step": 35335, + "train_speed(iter/s)": 1.636211 + }, + { + "acc": 0.65731502, + "epoch": 0.8964992389649924, + "grad_norm": 6.21875, + "learning_rate": 6.255463111870864e-06, + "loss": 1.5873189, + "memory(GiB)": 107.26, + "step": 35340, + "train_speed(iter/s)": 1.636237 + }, + { + "acc": 0.65349741, + "epoch": 0.8966260781329275, + "grad_norm": 6.3125, + "learning_rate": 6.25444805527724e-06, + "loss": 1.61542244, + "memory(GiB)": 107.26, + "step": 35345, + "train_speed(iter/s)": 1.636264 + }, + { + "acc": 0.64976134, + "epoch": 0.8967529173008625, + "grad_norm": 6.0625, + "learning_rate": 6.253432943507391e-06, + "loss": 1.68300591, + "memory(GiB)": 107.26, + "step": 35350, + "train_speed(iter/s)": 1.636292 + }, + { + "acc": 0.65394454, + "epoch": 0.8968797564687976, + "grad_norm": 6.125, + "learning_rate": 6.252417776605964e-06, + "loss": 1.56053839, + "memory(GiB)": 107.26, + "step": 35355, + "train_speed(iter/s)": 1.636318 + }, + { + "acc": 0.66816502, + "epoch": 0.8970065956367326, + "grad_norm": 5.625, + "learning_rate": 6.251402554617613e-06, + "loss": 1.53104286, + "memory(GiB)": 107.26, + "step": 35360, + "train_speed(iter/s)": 1.636344 + }, + { + "acc": 0.63301992, + "epoch": 0.8971334348046677, + "grad_norm": 7.71875, + "learning_rate": 6.2503872775869886e-06, + "loss": 1.72977295, + "memory(GiB)": 107.26, + "step": 35365, + "train_speed(iter/s)": 1.636371 + }, + { + "acc": 0.67164698, + "epoch": 0.8972602739726028, + "grad_norm": 5.40625, + "learning_rate": 6.249371945558751e-06, + "loss": 1.55892744, + "memory(GiB)": 107.26, + "step": 35370, + "train_speed(iter/s)": 1.636397 + }, + { + "acc": 0.66241236, + "epoch": 0.8973871131405378, + "grad_norm": 5.5, + "learning_rate": 6.248356558577555e-06, + "loss": 1.57639856, + "memory(GiB)": 107.26, + "step": 35375, + "train_speed(iter/s)": 1.636423 + }, + { + "acc": 0.66527195, + "epoch": 0.8975139523084729, + "grad_norm": 7.5, + "learning_rate": 6.247341116688067e-06, + "loss": 1.61928024, + "memory(GiB)": 107.26, + "step": 35380, + "train_speed(iter/s)": 1.63645 + }, + { + "acc": 0.65958743, + "epoch": 0.897640791476408, + "grad_norm": 5.46875, + "learning_rate": 6.246325619934945e-06, + "loss": 1.56521568, + "memory(GiB)": 107.26, + "step": 35385, + "train_speed(iter/s)": 1.636476 + }, + { + "acc": 0.65656252, + "epoch": 0.897767630644343, + "grad_norm": 6.71875, + "learning_rate": 6.245310068362859e-06, + "loss": 1.57629642, + "memory(GiB)": 107.26, + "step": 35390, + "train_speed(iter/s)": 1.636505 + }, + { + "acc": 0.65723815, + "epoch": 0.897894469812278, + "grad_norm": 5.46875, + "learning_rate": 6.244294462016476e-06, + "loss": 1.62229996, + "memory(GiB)": 107.26, + "step": 35395, + "train_speed(iter/s)": 1.636533 + }, + { + "acc": 0.67403202, + "epoch": 0.898021308980213, + "grad_norm": 6.8125, + "learning_rate": 6.243278800940468e-06, + "loss": 1.52510862, + "memory(GiB)": 107.26, + "step": 35400, + "train_speed(iter/s)": 1.636561 + }, + { + "acc": 0.66067467, + "epoch": 0.8981481481481481, + "grad_norm": 6.5, + "learning_rate": 6.242263085179506e-06, + "loss": 1.60486584, + "memory(GiB)": 107.26, + "step": 35405, + "train_speed(iter/s)": 1.636589 + }, + { + "acc": 0.6519989, + "epoch": 0.8982749873160832, + "grad_norm": 5.0, + "learning_rate": 6.241247314778269e-06, + "loss": 1.60743599, + "memory(GiB)": 107.26, + "step": 35410, + "train_speed(iter/s)": 1.636616 + }, + { + "acc": 0.64827452, + "epoch": 0.8984018264840182, + "grad_norm": 4.84375, + "learning_rate": 6.240231489781432e-06, + "loss": 1.67846203, + "memory(GiB)": 107.26, + "step": 35415, + "train_speed(iter/s)": 1.636644 + }, + { + "acc": 0.64432755, + "epoch": 0.8985286656519533, + "grad_norm": 5.96875, + "learning_rate": 6.239215610233678e-06, + "loss": 1.60612183, + "memory(GiB)": 107.26, + "step": 35420, + "train_speed(iter/s)": 1.636673 + }, + { + "acc": 0.66481633, + "epoch": 0.8986555048198884, + "grad_norm": 6.71875, + "learning_rate": 6.238199676179688e-06, + "loss": 1.5848711, + "memory(GiB)": 107.26, + "step": 35425, + "train_speed(iter/s)": 1.6367 + }, + { + "acc": 0.66628904, + "epoch": 0.8987823439878234, + "grad_norm": 8.0625, + "learning_rate": 6.2371836876641475e-06, + "loss": 1.56425457, + "memory(GiB)": 107.26, + "step": 35430, + "train_speed(iter/s)": 1.636726 + }, + { + "acc": 0.65049715, + "epoch": 0.8989091831557585, + "grad_norm": 6.46875, + "learning_rate": 6.236167644731745e-06, + "loss": 1.62554779, + "memory(GiB)": 107.26, + "step": 35435, + "train_speed(iter/s)": 1.636753 + }, + { + "acc": 0.66369848, + "epoch": 0.8990360223236935, + "grad_norm": 5.6875, + "learning_rate": 6.235151547427172e-06, + "loss": 1.58174276, + "memory(GiB)": 107.26, + "step": 35440, + "train_speed(iter/s)": 1.636781 + }, + { + "acc": 0.65990949, + "epoch": 0.8991628614916286, + "grad_norm": 5.40625, + "learning_rate": 6.2341353957951165e-06, + "loss": 1.62984047, + "memory(GiB)": 107.26, + "step": 35445, + "train_speed(iter/s)": 1.636808 + }, + { + "acc": 0.63817658, + "epoch": 0.8992897006595637, + "grad_norm": 7.40625, + "learning_rate": 6.233119189880279e-06, + "loss": 1.64208374, + "memory(GiB)": 107.26, + "step": 35450, + "train_speed(iter/s)": 1.636837 + }, + { + "acc": 0.64803524, + "epoch": 0.8994165398274987, + "grad_norm": 5.40625, + "learning_rate": 6.232102929727353e-06, + "loss": 1.63871555, + "memory(GiB)": 107.26, + "step": 35455, + "train_speed(iter/s)": 1.636864 + }, + { + "acc": 0.64808092, + "epoch": 0.8995433789954338, + "grad_norm": 4.90625, + "learning_rate": 6.231086615381039e-06, + "loss": 1.67996273, + "memory(GiB)": 107.26, + "step": 35460, + "train_speed(iter/s)": 1.636892 + }, + { + "acc": 0.66335268, + "epoch": 0.8996702181633689, + "grad_norm": 6.40625, + "learning_rate": 6.2300702468860385e-06, + "loss": 1.60317669, + "memory(GiB)": 107.26, + "step": 35465, + "train_speed(iter/s)": 1.636919 + }, + { + "acc": 0.6618619, + "epoch": 0.8997970573313039, + "grad_norm": 6.0625, + "learning_rate": 6.229053824287058e-06, + "loss": 1.56038418, + "memory(GiB)": 107.26, + "step": 35470, + "train_speed(iter/s)": 1.636947 + }, + { + "acc": 0.64557924, + "epoch": 0.899923896499239, + "grad_norm": 6.125, + "learning_rate": 6.228037347628803e-06, + "loss": 1.62468853, + "memory(GiB)": 107.26, + "step": 35475, + "train_speed(iter/s)": 1.636975 + }, + { + "acc": 0.66901484, + "epoch": 0.900050735667174, + "grad_norm": 4.8125, + "learning_rate": 6.227020816955982e-06, + "loss": 1.51534071, + "memory(GiB)": 107.26, + "step": 35480, + "train_speed(iter/s)": 1.637002 + }, + { + "acc": 0.6630837, + "epoch": 0.9001775748351091, + "grad_norm": 5.1875, + "learning_rate": 6.226004232313308e-06, + "loss": 1.58505068, + "memory(GiB)": 107.26, + "step": 35485, + "train_speed(iter/s)": 1.63703 + }, + { + "acc": 0.64404302, + "epoch": 0.9003044140030442, + "grad_norm": 5.5625, + "learning_rate": 6.224987593745493e-06, + "loss": 1.62261753, + "memory(GiB)": 107.26, + "step": 35490, + "train_speed(iter/s)": 1.637058 + }, + { + "acc": 0.64504671, + "epoch": 0.9004312531709792, + "grad_norm": 6.53125, + "learning_rate": 6.223970901297255e-06, + "loss": 1.58707047, + "memory(GiB)": 107.26, + "step": 35495, + "train_speed(iter/s)": 1.637086 + }, + { + "acc": 0.65822105, + "epoch": 0.9005580923389143, + "grad_norm": 6.28125, + "learning_rate": 6.222954155013312e-06, + "loss": 1.60274563, + "memory(GiB)": 107.26, + "step": 35500, + "train_speed(iter/s)": 1.637114 + }, + { + "acc": 0.66814632, + "epoch": 0.9006849315068494, + "grad_norm": 5.625, + "learning_rate": 6.221937354938386e-06, + "loss": 1.56985779, + "memory(GiB)": 107.26, + "step": 35505, + "train_speed(iter/s)": 1.637142 + }, + { + "acc": 0.65876322, + "epoch": 0.9008117706747844, + "grad_norm": 5.4375, + "learning_rate": 6.2209205011171995e-06, + "loss": 1.61329308, + "memory(GiB)": 107.26, + "step": 35510, + "train_speed(iter/s)": 1.637168 + }, + { + "acc": 0.63096466, + "epoch": 0.9009386098427195, + "grad_norm": 5.59375, + "learning_rate": 6.219903593594476e-06, + "loss": 1.6204073, + "memory(GiB)": 107.26, + "step": 35515, + "train_speed(iter/s)": 1.637196 + }, + { + "acc": 0.66494236, + "epoch": 0.9010654490106544, + "grad_norm": 6.625, + "learning_rate": 6.218886632414949e-06, + "loss": 1.55846291, + "memory(GiB)": 107.26, + "step": 35520, + "train_speed(iter/s)": 1.637223 + }, + { + "acc": 0.65294962, + "epoch": 0.9011922881785895, + "grad_norm": 5.40625, + "learning_rate": 6.217869617623343e-06, + "loss": 1.60607605, + "memory(GiB)": 107.26, + "step": 35525, + "train_speed(iter/s)": 1.63725 + }, + { + "acc": 0.67328677, + "epoch": 0.9013191273465246, + "grad_norm": 5.53125, + "learning_rate": 6.216852549264396e-06, + "loss": 1.58206215, + "memory(GiB)": 107.26, + "step": 35530, + "train_speed(iter/s)": 1.637278 + }, + { + "acc": 0.66554885, + "epoch": 0.9014459665144596, + "grad_norm": 5.9375, + "learning_rate": 6.215835427382842e-06, + "loss": 1.56776257, + "memory(GiB)": 107.26, + "step": 35535, + "train_speed(iter/s)": 1.637305 + }, + { + "acc": 0.65261765, + "epoch": 0.9015728056823947, + "grad_norm": 5.40625, + "learning_rate": 6.214818252023415e-06, + "loss": 1.60929527, + "memory(GiB)": 107.26, + "step": 35540, + "train_speed(iter/s)": 1.637333 + }, + { + "acc": 0.66164408, + "epoch": 0.9016996448503298, + "grad_norm": 5.03125, + "learning_rate": 6.2138010232308585e-06, + "loss": 1.53942204, + "memory(GiB)": 107.26, + "step": 35545, + "train_speed(iter/s)": 1.63736 + }, + { + "acc": 0.67017527, + "epoch": 0.9018264840182648, + "grad_norm": 6.28125, + "learning_rate": 6.212783741049915e-06, + "loss": 1.55215244, + "memory(GiB)": 107.26, + "step": 35550, + "train_speed(iter/s)": 1.637387 + }, + { + "acc": 0.6511302, + "epoch": 0.9019533231861999, + "grad_norm": 4.6875, + "learning_rate": 6.211766405525326e-06, + "loss": 1.57251072, + "memory(GiB)": 107.26, + "step": 35555, + "train_speed(iter/s)": 1.637413 + }, + { + "acc": 0.64634433, + "epoch": 0.9020801623541349, + "grad_norm": 6.25, + "learning_rate": 6.210749016701842e-06, + "loss": 1.64773903, + "memory(GiB)": 107.26, + "step": 35560, + "train_speed(iter/s)": 1.637441 + }, + { + "acc": 0.64436336, + "epoch": 0.90220700152207, + "grad_norm": 6.3125, + "learning_rate": 6.2097315746242095e-06, + "loss": 1.65151157, + "memory(GiB)": 107.26, + "step": 35565, + "train_speed(iter/s)": 1.637466 + }, + { + "acc": 0.63314352, + "epoch": 0.9023338406900051, + "grad_norm": 5.4375, + "learning_rate": 6.208714079337181e-06, + "loss": 1.69086132, + "memory(GiB)": 107.26, + "step": 35570, + "train_speed(iter/s)": 1.637491 + }, + { + "acc": 0.64559937, + "epoch": 0.9024606798579401, + "grad_norm": 5.03125, + "learning_rate": 6.207696530885511e-06, + "loss": 1.61990757, + "memory(GiB)": 107.26, + "step": 35575, + "train_speed(iter/s)": 1.63752 + }, + { + "acc": 0.65296516, + "epoch": 0.9025875190258752, + "grad_norm": 6.28125, + "learning_rate": 6.2066789293139565e-06, + "loss": 1.6086504, + "memory(GiB)": 107.26, + "step": 35580, + "train_speed(iter/s)": 1.637546 + }, + { + "acc": 0.63693833, + "epoch": 0.9027143581938103, + "grad_norm": 5.5, + "learning_rate": 6.2056612746672736e-06, + "loss": 1.67460136, + "memory(GiB)": 107.26, + "step": 35585, + "train_speed(iter/s)": 1.637572 + }, + { + "acc": 0.65801229, + "epoch": 0.9028411973617453, + "grad_norm": 5.46875, + "learning_rate": 6.204643566990227e-06, + "loss": 1.62313728, + "memory(GiB)": 107.26, + "step": 35590, + "train_speed(iter/s)": 1.6376 + }, + { + "acc": 0.65059948, + "epoch": 0.9029680365296804, + "grad_norm": 5.84375, + "learning_rate": 6.2036258063275764e-06, + "loss": 1.57591248, + "memory(GiB)": 107.26, + "step": 35595, + "train_speed(iter/s)": 1.637627 + }, + { + "acc": 0.65701418, + "epoch": 0.9030948756976154, + "grad_norm": 6.84375, + "learning_rate": 6.20260799272409e-06, + "loss": 1.61003704, + "memory(GiB)": 107.26, + "step": 35600, + "train_speed(iter/s)": 1.637654 + }, + { + "acc": 0.66342473, + "epoch": 0.9032217148655505, + "grad_norm": 5.0625, + "learning_rate": 6.201590126224534e-06, + "loss": 1.60203552, + "memory(GiB)": 107.26, + "step": 35605, + "train_speed(iter/s)": 1.63768 + }, + { + "acc": 0.66524825, + "epoch": 0.9033485540334856, + "grad_norm": 4.875, + "learning_rate": 6.20057220687368e-06, + "loss": 1.51242161, + "memory(GiB)": 107.26, + "step": 35610, + "train_speed(iter/s)": 1.637707 + }, + { + "acc": 0.65484581, + "epoch": 0.9034753932014206, + "grad_norm": 5.1875, + "learning_rate": 6.199554234716301e-06, + "loss": 1.58962402, + "memory(GiB)": 107.26, + "step": 35615, + "train_speed(iter/s)": 1.637734 + }, + { + "acc": 0.65649681, + "epoch": 0.9036022323693557, + "grad_norm": 6.0, + "learning_rate": 6.19853620979717e-06, + "loss": 1.56722755, + "memory(GiB)": 107.26, + "step": 35620, + "train_speed(iter/s)": 1.637762 + }, + { + "acc": 0.65528669, + "epoch": 0.9037290715372908, + "grad_norm": 5.40625, + "learning_rate": 6.1975181321610655e-06, + "loss": 1.58421974, + "memory(GiB)": 107.26, + "step": 35625, + "train_speed(iter/s)": 1.637788 + }, + { + "acc": 0.64572115, + "epoch": 0.9038559107052258, + "grad_norm": 6.40625, + "learning_rate": 6.1965000018527676e-06, + "loss": 1.64173851, + "memory(GiB)": 107.26, + "step": 35630, + "train_speed(iter/s)": 1.637816 + }, + { + "acc": 0.64437065, + "epoch": 0.9039827498731609, + "grad_norm": 5.25, + "learning_rate": 6.195481818917057e-06, + "loss": 1.62821884, + "memory(GiB)": 107.26, + "step": 35635, + "train_speed(iter/s)": 1.637843 + }, + { + "acc": 0.65158839, + "epoch": 0.9041095890410958, + "grad_norm": 5.59375, + "learning_rate": 6.194463583398719e-06, + "loss": 1.61533279, + "memory(GiB)": 107.26, + "step": 35640, + "train_speed(iter/s)": 1.63787 + }, + { + "acc": 0.65070362, + "epoch": 0.9042364282090309, + "grad_norm": 5.875, + "learning_rate": 6.193445295342538e-06, + "loss": 1.59141836, + "memory(GiB)": 107.26, + "step": 35645, + "train_speed(iter/s)": 1.637897 + }, + { + "acc": 0.63709927, + "epoch": 0.904363267376966, + "grad_norm": 4.90625, + "learning_rate": 6.192426954793308e-06, + "loss": 1.63582268, + "memory(GiB)": 107.26, + "step": 35650, + "train_speed(iter/s)": 1.637925 + }, + { + "acc": 0.65651522, + "epoch": 0.904490106544901, + "grad_norm": 5.75, + "learning_rate": 6.1914085617958135e-06, + "loss": 1.61126938, + "memory(GiB)": 107.26, + "step": 35655, + "train_speed(iter/s)": 1.637952 + }, + { + "acc": 0.66297903, + "epoch": 0.9046169457128361, + "grad_norm": 5.25, + "learning_rate": 6.190390116394853e-06, + "loss": 1.59218988, + "memory(GiB)": 107.26, + "step": 35660, + "train_speed(iter/s)": 1.637976 + }, + { + "acc": 0.63934641, + "epoch": 0.9047437848807712, + "grad_norm": 9.25, + "learning_rate": 6.189371618635219e-06, + "loss": 1.64230671, + "memory(GiB)": 107.26, + "step": 35665, + "train_speed(iter/s)": 1.638004 + }, + { + "acc": 0.64188948, + "epoch": 0.9048706240487062, + "grad_norm": 5.03125, + "learning_rate": 6.188353068561714e-06, + "loss": 1.60472813, + "memory(GiB)": 107.26, + "step": 35670, + "train_speed(iter/s)": 1.63803 + }, + { + "acc": 0.65712948, + "epoch": 0.9049974632166413, + "grad_norm": 6.0, + "learning_rate": 6.187334466219133e-06, + "loss": 1.55190039, + "memory(GiB)": 107.26, + "step": 35675, + "train_speed(iter/s)": 1.638059 + }, + { + "acc": 0.65382495, + "epoch": 0.9051243023845763, + "grad_norm": 6.46875, + "learning_rate": 6.18631581165228e-06, + "loss": 1.55043144, + "memory(GiB)": 107.26, + "step": 35680, + "train_speed(iter/s)": 1.638086 + }, + { + "acc": 0.64673357, + "epoch": 0.9052511415525114, + "grad_norm": 6.0625, + "learning_rate": 6.185297104905963e-06, + "loss": 1.62627296, + "memory(GiB)": 107.26, + "step": 35685, + "train_speed(iter/s)": 1.638111 + }, + { + "acc": 0.64564052, + "epoch": 0.9053779807204465, + "grad_norm": 6.1875, + "learning_rate": 6.184278346024988e-06, + "loss": 1.64137344, + "memory(GiB)": 107.26, + "step": 35690, + "train_speed(iter/s)": 1.638141 + }, + { + "acc": 0.6500391, + "epoch": 0.9055048198883815, + "grad_norm": 4.90625, + "learning_rate": 6.183259535054163e-06, + "loss": 1.57553253, + "memory(GiB)": 107.26, + "step": 35695, + "train_speed(iter/s)": 1.638168 + }, + { + "acc": 0.63960915, + "epoch": 0.9056316590563166, + "grad_norm": 5.4375, + "learning_rate": 6.1822406720383e-06, + "loss": 1.60085945, + "memory(GiB)": 107.26, + "step": 35700, + "train_speed(iter/s)": 1.638196 + }, + { + "acc": 0.65586462, + "epoch": 0.9057584982242517, + "grad_norm": 4.96875, + "learning_rate": 6.181221757022215e-06, + "loss": 1.60654297, + "memory(GiB)": 107.26, + "step": 35705, + "train_speed(iter/s)": 1.638221 + }, + { + "acc": 0.65102749, + "epoch": 0.9058853373921867, + "grad_norm": 6.3125, + "learning_rate": 6.180202790050724e-06, + "loss": 1.59761124, + "memory(GiB)": 107.26, + "step": 35710, + "train_speed(iter/s)": 1.638249 + }, + { + "acc": 0.65822906, + "epoch": 0.9060121765601218, + "grad_norm": 5.375, + "learning_rate": 6.179183771168643e-06, + "loss": 1.6333868, + "memory(GiB)": 107.26, + "step": 35715, + "train_speed(iter/s)": 1.638275 + }, + { + "acc": 0.65344114, + "epoch": 0.9061390157280568, + "grad_norm": 5.5625, + "learning_rate": 6.1781647004207965e-06, + "loss": 1.64196205, + "memory(GiB)": 107.26, + "step": 35720, + "train_speed(iter/s)": 1.638302 + }, + { + "acc": 0.64389219, + "epoch": 0.9062658548959919, + "grad_norm": 6.6875, + "learning_rate": 6.177145577852005e-06, + "loss": 1.684412, + "memory(GiB)": 107.26, + "step": 35725, + "train_speed(iter/s)": 1.638328 + }, + { + "acc": 0.64120736, + "epoch": 0.906392694063927, + "grad_norm": 6.0625, + "learning_rate": 6.176126403507097e-06, + "loss": 1.64522171, + "memory(GiB)": 107.26, + "step": 35730, + "train_speed(iter/s)": 1.638357 + }, + { + "acc": 0.66335068, + "epoch": 0.906519533231862, + "grad_norm": 5.46875, + "learning_rate": 6.175107177430897e-06, + "loss": 1.563762, + "memory(GiB)": 107.26, + "step": 35735, + "train_speed(iter/s)": 1.638386 + }, + { + "acc": 0.64939804, + "epoch": 0.9066463723997971, + "grad_norm": 5.625, + "learning_rate": 6.17408789966824e-06, + "loss": 1.57620907, + "memory(GiB)": 107.26, + "step": 35740, + "train_speed(iter/s)": 1.638412 + }, + { + "acc": 0.64524412, + "epoch": 0.9067732115677322, + "grad_norm": 5.25, + "learning_rate": 6.173068570263951e-06, + "loss": 1.61273613, + "memory(GiB)": 107.26, + "step": 35745, + "train_speed(iter/s)": 1.638438 + }, + { + "acc": 0.65329552, + "epoch": 0.9069000507356672, + "grad_norm": 5.9375, + "learning_rate": 6.172049189262872e-06, + "loss": 1.62465801, + "memory(GiB)": 107.26, + "step": 35750, + "train_speed(iter/s)": 1.638466 + }, + { + "acc": 0.66614685, + "epoch": 0.9070268899036023, + "grad_norm": 5.125, + "learning_rate": 6.1710297567098354e-06, + "loss": 1.59923954, + "memory(GiB)": 107.26, + "step": 35755, + "train_speed(iter/s)": 1.638493 + }, + { + "acc": 0.65149946, + "epoch": 0.9071537290715372, + "grad_norm": 7.21875, + "learning_rate": 6.170010272649682e-06, + "loss": 1.63270988, + "memory(GiB)": 107.26, + "step": 35760, + "train_speed(iter/s)": 1.63852 + }, + { + "acc": 0.64007444, + "epoch": 0.9072805682394723, + "grad_norm": 11.9375, + "learning_rate": 6.168990737127254e-06, + "loss": 1.65108566, + "memory(GiB)": 107.26, + "step": 35765, + "train_speed(iter/s)": 1.638545 + }, + { + "acc": 0.65903435, + "epoch": 0.9074074074074074, + "grad_norm": 5.5625, + "learning_rate": 6.167971150187394e-06, + "loss": 1.587397, + "memory(GiB)": 107.26, + "step": 35770, + "train_speed(iter/s)": 1.638572 + }, + { + "acc": 0.63616629, + "epoch": 0.9075342465753424, + "grad_norm": 6.0625, + "learning_rate": 6.166951511874948e-06, + "loss": 1.66819973, + "memory(GiB)": 107.26, + "step": 35775, + "train_speed(iter/s)": 1.638599 + }, + { + "acc": 0.6569768, + "epoch": 0.9076610857432775, + "grad_norm": 6.6875, + "learning_rate": 6.165931822234764e-06, + "loss": 1.58296509, + "memory(GiB)": 107.26, + "step": 35780, + "train_speed(iter/s)": 1.638627 + }, + { + "acc": 0.65801291, + "epoch": 0.9077879249112126, + "grad_norm": 7.15625, + "learning_rate": 6.164912081311694e-06, + "loss": 1.65813446, + "memory(GiB)": 107.26, + "step": 35785, + "train_speed(iter/s)": 1.638654 + }, + { + "acc": 0.65638576, + "epoch": 0.9079147640791476, + "grad_norm": 5.84375, + "learning_rate": 6.163892289150588e-06, + "loss": 1.56630287, + "memory(GiB)": 107.26, + "step": 35790, + "train_speed(iter/s)": 1.638681 + }, + { + "acc": 0.66190662, + "epoch": 0.9080416032470827, + "grad_norm": 6.25, + "learning_rate": 6.162872445796303e-06, + "loss": 1.57662067, + "memory(GiB)": 107.26, + "step": 35795, + "train_speed(iter/s)": 1.638708 + }, + { + "acc": 0.64458818, + "epoch": 0.9081684424150177, + "grad_norm": 5.03125, + "learning_rate": 6.161852551293697e-06, + "loss": 1.6509428, + "memory(GiB)": 107.26, + "step": 35800, + "train_speed(iter/s)": 1.638735 + }, + { + "acc": 0.65495453, + "epoch": 0.9082952815829528, + "grad_norm": 6.90625, + "learning_rate": 6.160832605687628e-06, + "loss": 1.63957863, + "memory(GiB)": 107.26, + "step": 35805, + "train_speed(iter/s)": 1.638761 + }, + { + "acc": 0.65895343, + "epoch": 0.9084221207508879, + "grad_norm": 6.0625, + "learning_rate": 6.159812609022961e-06, + "loss": 1.59441814, + "memory(GiB)": 107.26, + "step": 35810, + "train_speed(iter/s)": 1.638788 + }, + { + "acc": 0.64820228, + "epoch": 0.9085489599188229, + "grad_norm": 4.46875, + "learning_rate": 6.158792561344553e-06, + "loss": 1.64654045, + "memory(GiB)": 107.26, + "step": 35815, + "train_speed(iter/s)": 1.638814 + }, + { + "acc": 0.65361223, + "epoch": 0.908675799086758, + "grad_norm": 6.90625, + "learning_rate": 6.157772462697277e-06, + "loss": 1.63154316, + "memory(GiB)": 107.26, + "step": 35820, + "train_speed(iter/s)": 1.638841 + }, + { + "acc": 0.65724802, + "epoch": 0.9088026382546931, + "grad_norm": 6.375, + "learning_rate": 6.156752313125998e-06, + "loss": 1.6546814, + "memory(GiB)": 107.26, + "step": 35825, + "train_speed(iter/s)": 1.638868 + }, + { + "acc": 0.6573822, + "epoch": 0.9089294774226281, + "grad_norm": 6.40625, + "learning_rate": 6.155732112675587e-06, + "loss": 1.57991428, + "memory(GiB)": 107.26, + "step": 35830, + "train_speed(iter/s)": 1.638894 + }, + { + "acc": 0.66967187, + "epoch": 0.9090563165905632, + "grad_norm": 6.1875, + "learning_rate": 6.154711861390919e-06, + "loss": 1.52042522, + "memory(GiB)": 107.26, + "step": 35835, + "train_speed(iter/s)": 1.63892 + }, + { + "acc": 0.66354117, + "epoch": 0.9091831557584982, + "grad_norm": 4.84375, + "learning_rate": 6.153691559316868e-06, + "loss": 1.60629234, + "memory(GiB)": 107.26, + "step": 35840, + "train_speed(iter/s)": 1.638946 + }, + { + "acc": 0.6384511, + "epoch": 0.9093099949264333, + "grad_norm": 6.25, + "learning_rate": 6.152671206498311e-06, + "loss": 1.70595627, + "memory(GiB)": 107.26, + "step": 35845, + "train_speed(iter/s)": 1.638972 + }, + { + "acc": 0.66828499, + "epoch": 0.9094368340943684, + "grad_norm": 5.5, + "learning_rate": 6.151650802980128e-06, + "loss": 1.55587339, + "memory(GiB)": 107.26, + "step": 35850, + "train_speed(iter/s)": 1.638996 + }, + { + "acc": 0.64764853, + "epoch": 0.9095636732623034, + "grad_norm": 5.84375, + "learning_rate": 6.150630348807201e-06, + "loss": 1.66778984, + "memory(GiB)": 107.26, + "step": 35855, + "train_speed(iter/s)": 1.639023 + }, + { + "acc": 0.63880033, + "epoch": 0.9096905124302385, + "grad_norm": 7.78125, + "learning_rate": 6.149609844024413e-06, + "loss": 1.6870472, + "memory(GiB)": 107.26, + "step": 35860, + "train_speed(iter/s)": 1.639052 + }, + { + "acc": 0.6494606, + "epoch": 0.9098173515981736, + "grad_norm": 5.03125, + "learning_rate": 6.148589288676652e-06, + "loss": 1.67009392, + "memory(GiB)": 107.26, + "step": 35865, + "train_speed(iter/s)": 1.639079 + }, + { + "acc": 0.65264816, + "epoch": 0.9099441907661086, + "grad_norm": 7.09375, + "learning_rate": 6.147568682808808e-06, + "loss": 1.60761185, + "memory(GiB)": 107.26, + "step": 35870, + "train_speed(iter/s)": 1.639107 + }, + { + "acc": 0.66067986, + "epoch": 0.9100710299340437, + "grad_norm": 6.09375, + "learning_rate": 6.146548026465766e-06, + "loss": 1.63269272, + "memory(GiB)": 107.26, + "step": 35875, + "train_speed(iter/s)": 1.639135 + }, + { + "acc": 0.66022673, + "epoch": 0.9101978691019786, + "grad_norm": 6.53125, + "learning_rate": 6.145527319692427e-06, + "loss": 1.54779644, + "memory(GiB)": 107.26, + "step": 35880, + "train_speed(iter/s)": 1.639162 + }, + { + "acc": 0.65475273, + "epoch": 0.9103247082699137, + "grad_norm": 5.84375, + "learning_rate": 6.144506562533678e-06, + "loss": 1.5719347, + "memory(GiB)": 107.26, + "step": 35885, + "train_speed(iter/s)": 1.63919 + }, + { + "acc": 0.65353661, + "epoch": 0.9104515474378488, + "grad_norm": 6.625, + "learning_rate": 6.143485755034425e-06, + "loss": 1.58170137, + "memory(GiB)": 107.26, + "step": 35890, + "train_speed(iter/s)": 1.639217 + }, + { + "acc": 0.65148792, + "epoch": 0.9105783866057838, + "grad_norm": 6.15625, + "learning_rate": 6.14246489723956e-06, + "loss": 1.58532286, + "memory(GiB)": 107.26, + "step": 35895, + "train_speed(iter/s)": 1.639244 + }, + { + "acc": 0.65632925, + "epoch": 0.9107052257737189, + "grad_norm": 7.25, + "learning_rate": 6.141443989193988e-06, + "loss": 1.61256351, + "memory(GiB)": 107.26, + "step": 35900, + "train_speed(iter/s)": 1.639274 + }, + { + "acc": 0.63929205, + "epoch": 0.910832064941654, + "grad_norm": 6.25, + "learning_rate": 6.140423030942615e-06, + "loss": 1.72920818, + "memory(GiB)": 107.26, + "step": 35905, + "train_speed(iter/s)": 1.6393 + }, + { + "acc": 0.65873332, + "epoch": 0.910958904109589, + "grad_norm": 5.25, + "learning_rate": 6.139402022530344e-06, + "loss": 1.62159214, + "memory(GiB)": 107.26, + "step": 35910, + "train_speed(iter/s)": 1.639329 + }, + { + "acc": 0.64691191, + "epoch": 0.9110857432775241, + "grad_norm": 5.84375, + "learning_rate": 6.138380964002087e-06, + "loss": 1.62927647, + "memory(GiB)": 107.26, + "step": 35915, + "train_speed(iter/s)": 1.639356 + }, + { + "acc": 0.67530079, + "epoch": 0.9112125824454591, + "grad_norm": 9.375, + "learning_rate": 6.13735985540275e-06, + "loss": 1.54399252, + "memory(GiB)": 107.26, + "step": 35920, + "train_speed(iter/s)": 1.639384 + }, + { + "acc": 0.65794725, + "epoch": 0.9113394216133942, + "grad_norm": 7.09375, + "learning_rate": 6.13633869677725e-06, + "loss": 1.55421505, + "memory(GiB)": 117.38, + "step": 35925, + "train_speed(iter/s)": 1.639409 + }, + { + "acc": 0.64901609, + "epoch": 0.9114662607813293, + "grad_norm": 5.3125, + "learning_rate": 6.1353174881705e-06, + "loss": 1.63148918, + "memory(GiB)": 117.38, + "step": 35930, + "train_speed(iter/s)": 1.639434 + }, + { + "acc": 0.64009352, + "epoch": 0.9115930999492643, + "grad_norm": 5.78125, + "learning_rate": 6.134296229627419e-06, + "loss": 1.69694366, + "memory(GiB)": 117.38, + "step": 35935, + "train_speed(iter/s)": 1.639461 + }, + { + "acc": 0.65991778, + "epoch": 0.9117199391171994, + "grad_norm": 5.5, + "learning_rate": 6.1332749211929255e-06, + "loss": 1.59715118, + "memory(GiB)": 117.38, + "step": 35940, + "train_speed(iter/s)": 1.639488 + }, + { + "acc": 0.67639484, + "epoch": 0.9118467782851345, + "grad_norm": 6.09375, + "learning_rate": 6.132253562911941e-06, + "loss": 1.56201153, + "memory(GiB)": 117.38, + "step": 35945, + "train_speed(iter/s)": 1.639514 + }, + { + "acc": 0.66358671, + "epoch": 0.9119736174530695, + "grad_norm": 5.6875, + "learning_rate": 6.1312321548293895e-06, + "loss": 1.55208702, + "memory(GiB)": 117.38, + "step": 35950, + "train_speed(iter/s)": 1.639542 + }, + { + "acc": 0.65372281, + "epoch": 0.9121004566210046, + "grad_norm": 5.375, + "learning_rate": 6.130210696990197e-06, + "loss": 1.68709469, + "memory(GiB)": 117.38, + "step": 35955, + "train_speed(iter/s)": 1.639569 + }, + { + "acc": 0.65379219, + "epoch": 0.9122272957889396, + "grad_norm": 6.15625, + "learning_rate": 6.129189189439293e-06, + "loss": 1.64993248, + "memory(GiB)": 117.38, + "step": 35960, + "train_speed(iter/s)": 1.639598 + }, + { + "acc": 0.65376654, + "epoch": 0.9123541349568747, + "grad_norm": 7.96875, + "learning_rate": 6.128167632221605e-06, + "loss": 1.6426651, + "memory(GiB)": 117.38, + "step": 35965, + "train_speed(iter/s)": 1.639626 + }, + { + "acc": 0.65782461, + "epoch": 0.9124809741248098, + "grad_norm": 5.375, + "learning_rate": 6.127146025382069e-06, + "loss": 1.55912247, + "memory(GiB)": 117.38, + "step": 35970, + "train_speed(iter/s)": 1.639654 + }, + { + "acc": 0.64602838, + "epoch": 0.9126078132927448, + "grad_norm": 5.8125, + "learning_rate": 6.126124368965619e-06, + "loss": 1.65744972, + "memory(GiB)": 117.38, + "step": 35975, + "train_speed(iter/s)": 1.639683 + }, + { + "acc": 0.63015456, + "epoch": 0.9127346524606799, + "grad_norm": 5.46875, + "learning_rate": 6.125102663017191e-06, + "loss": 1.72497082, + "memory(GiB)": 117.38, + "step": 35980, + "train_speed(iter/s)": 1.63971 + }, + { + "acc": 0.66336951, + "epoch": 0.912861491628615, + "grad_norm": 5.03125, + "learning_rate": 6.124080907581724e-06, + "loss": 1.57774143, + "memory(GiB)": 117.38, + "step": 35985, + "train_speed(iter/s)": 1.639736 + }, + { + "acc": 0.66320405, + "epoch": 0.91298833079655, + "grad_norm": 5.15625, + "learning_rate": 6.1230591027041605e-06, + "loss": 1.58957319, + "memory(GiB)": 117.38, + "step": 35990, + "train_speed(iter/s)": 1.639762 + }, + { + "acc": 0.6647799, + "epoch": 0.913115169964485, + "grad_norm": 5.1875, + "learning_rate": 6.1220372484294444e-06, + "loss": 1.56535702, + "memory(GiB)": 117.38, + "step": 35995, + "train_speed(iter/s)": 1.63979 + }, + { + "acc": 0.65342131, + "epoch": 0.91324200913242, + "grad_norm": 5.90625, + "learning_rate": 6.12101534480252e-06, + "loss": 1.62986031, + "memory(GiB)": 117.38, + "step": 36000, + "train_speed(iter/s)": 1.639816 + }, + { + "epoch": 0.91324200913242, + "eval_acc": 0.6460513263876272, + "eval_loss": 1.5751882791519165, + "eval_runtime": 58.2682, + "eval_samples_per_second": 109.322, + "eval_steps_per_second": 27.339, + "step": 36000 + }, + { + "acc": 0.65351839, + "epoch": 0.9133688483003551, + "grad_norm": 6.34375, + "learning_rate": 6.119993391868335e-06, + "loss": 1.60084076, + "memory(GiB)": 117.38, + "step": 36005, + "train_speed(iter/s)": 1.635184 + }, + { + "acc": 0.67257266, + "epoch": 0.9134956874682902, + "grad_norm": 5.125, + "learning_rate": 6.118971389671842e-06, + "loss": 1.58251028, + "memory(GiB)": 117.38, + "step": 36010, + "train_speed(iter/s)": 1.635209 + }, + { + "acc": 0.64846458, + "epoch": 0.9136225266362252, + "grad_norm": 4.78125, + "learning_rate": 6.117949338257989e-06, + "loss": 1.63050919, + "memory(GiB)": 117.38, + "step": 36015, + "train_speed(iter/s)": 1.635233 + }, + { + "acc": 0.66973658, + "epoch": 0.9137493658041603, + "grad_norm": 5.96875, + "learning_rate": 6.116927237671735e-06, + "loss": 1.56487503, + "memory(GiB)": 117.38, + "step": 36020, + "train_speed(iter/s)": 1.635256 + }, + { + "acc": 0.65058022, + "epoch": 0.9138762049720954, + "grad_norm": 6.625, + "learning_rate": 6.115905087958032e-06, + "loss": 1.59680977, + "memory(GiB)": 117.38, + "step": 36025, + "train_speed(iter/s)": 1.63528 + }, + { + "acc": 0.65396318, + "epoch": 0.9140030441400304, + "grad_norm": 6.4375, + "learning_rate": 6.114882889161844e-06, + "loss": 1.64201965, + "memory(GiB)": 117.38, + "step": 36030, + "train_speed(iter/s)": 1.635304 + }, + { + "acc": 0.66737761, + "epoch": 0.9141298833079655, + "grad_norm": 6.15625, + "learning_rate": 6.113860641328127e-06, + "loss": 1.60068932, + "memory(GiB)": 117.38, + "step": 36035, + "train_speed(iter/s)": 1.635328 + }, + { + "acc": 0.65820255, + "epoch": 0.9142567224759005, + "grad_norm": 5.84375, + "learning_rate": 6.112838344501846e-06, + "loss": 1.55943222, + "memory(GiB)": 117.38, + "step": 36040, + "train_speed(iter/s)": 1.635352 + }, + { + "acc": 0.64755988, + "epoch": 0.9143835616438356, + "grad_norm": 5.5625, + "learning_rate": 6.111815998727966e-06, + "loss": 1.65948486, + "memory(GiB)": 117.38, + "step": 36045, + "train_speed(iter/s)": 1.635377 + }, + { + "acc": 0.6622467, + "epoch": 0.9145104008117707, + "grad_norm": 6.875, + "learning_rate": 6.110793604051455e-06, + "loss": 1.56997089, + "memory(GiB)": 117.38, + "step": 36050, + "train_speed(iter/s)": 1.635403 + }, + { + "acc": 0.66383572, + "epoch": 0.9146372399797057, + "grad_norm": 6.28125, + "learning_rate": 6.109771160517283e-06, + "loss": 1.60202141, + "memory(GiB)": 117.38, + "step": 36055, + "train_speed(iter/s)": 1.635426 + }, + { + "acc": 0.65538373, + "epoch": 0.9147640791476408, + "grad_norm": 6.4375, + "learning_rate": 6.108748668170419e-06, + "loss": 1.62757683, + "memory(GiB)": 117.38, + "step": 36060, + "train_speed(iter/s)": 1.63545 + }, + { + "acc": 0.6619308, + "epoch": 0.9148909183155759, + "grad_norm": 5.03125, + "learning_rate": 6.1077261270558385e-06, + "loss": 1.59994726, + "memory(GiB)": 117.38, + "step": 36065, + "train_speed(iter/s)": 1.635475 + }, + { + "acc": 0.65563016, + "epoch": 0.9150177574835109, + "grad_norm": 4.8125, + "learning_rate": 6.106703537218518e-06, + "loss": 1.63084145, + "memory(GiB)": 117.38, + "step": 36070, + "train_speed(iter/s)": 1.6355 + }, + { + "acc": 0.64853458, + "epoch": 0.915144596651446, + "grad_norm": 5.21875, + "learning_rate": 6.105680898703434e-06, + "loss": 1.62785873, + "memory(GiB)": 117.38, + "step": 36075, + "train_speed(iter/s)": 1.635525 + }, + { + "acc": 0.64009867, + "epoch": 0.915271435819381, + "grad_norm": 4.8125, + "learning_rate": 6.104658211555568e-06, + "loss": 1.68101349, + "memory(GiB)": 117.38, + "step": 36080, + "train_speed(iter/s)": 1.635549 + }, + { + "acc": 0.65485687, + "epoch": 0.9153982749873161, + "grad_norm": 6.6875, + "learning_rate": 6.103635475819902e-06, + "loss": 1.63143883, + "memory(GiB)": 117.38, + "step": 36085, + "train_speed(iter/s)": 1.635575 + }, + { + "acc": 0.65299535, + "epoch": 0.9155251141552512, + "grad_norm": 5.03125, + "learning_rate": 6.102612691541422e-06, + "loss": 1.61650009, + "memory(GiB)": 117.38, + "step": 36090, + "train_speed(iter/s)": 1.635603 + }, + { + "acc": 0.63208127, + "epoch": 0.9156519533231862, + "grad_norm": 5.4375, + "learning_rate": 6.10158985876511e-06, + "loss": 1.58753653, + "memory(GiB)": 117.38, + "step": 36095, + "train_speed(iter/s)": 1.63563 + }, + { + "acc": 0.65565453, + "epoch": 0.9157787924911213, + "grad_norm": 5.875, + "learning_rate": 6.10056697753596e-06, + "loss": 1.58026237, + "memory(GiB)": 117.38, + "step": 36100, + "train_speed(iter/s)": 1.635655 + }, + { + "acc": 0.65064707, + "epoch": 0.9159056316590564, + "grad_norm": 4.96875, + "learning_rate": 6.0995440478989595e-06, + "loss": 1.60773468, + "memory(GiB)": 117.38, + "step": 36105, + "train_speed(iter/s)": 1.63568 + }, + { + "acc": 0.67643147, + "epoch": 0.9160324708269914, + "grad_norm": 7.59375, + "learning_rate": 6.098521069899104e-06, + "loss": 1.49202375, + "memory(GiB)": 117.38, + "step": 36110, + "train_speed(iter/s)": 1.635705 + }, + { + "acc": 0.65322771, + "epoch": 0.9161593099949265, + "grad_norm": 7.0625, + "learning_rate": 6.097498043581385e-06, + "loss": 1.65457554, + "memory(GiB)": 117.38, + "step": 36115, + "train_speed(iter/s)": 1.635732 + }, + { + "acc": 0.66424999, + "epoch": 0.9162861491628614, + "grad_norm": 6.40625, + "learning_rate": 6.096474968990804e-06, + "loss": 1.5746685, + "memory(GiB)": 117.38, + "step": 36120, + "train_speed(iter/s)": 1.635757 + }, + { + "acc": 0.66097641, + "epoch": 0.9164129883307965, + "grad_norm": 4.90625, + "learning_rate": 6.095451846172358e-06, + "loss": 1.59456024, + "memory(GiB)": 117.38, + "step": 36125, + "train_speed(iter/s)": 1.635781 + }, + { + "acc": 0.64653678, + "epoch": 0.9165398274987316, + "grad_norm": 6.4375, + "learning_rate": 6.094428675171049e-06, + "loss": 1.64743042, + "memory(GiB)": 117.38, + "step": 36130, + "train_speed(iter/s)": 1.635806 + }, + { + "acc": 0.65295224, + "epoch": 0.9166666666666666, + "grad_norm": 6.34375, + "learning_rate": 6.09340545603188e-06, + "loss": 1.63160591, + "memory(GiB)": 117.38, + "step": 36135, + "train_speed(iter/s)": 1.635832 + }, + { + "acc": 0.67040863, + "epoch": 0.9167935058346017, + "grad_norm": 5.8125, + "learning_rate": 6.092382188799858e-06, + "loss": 1.58281822, + "memory(GiB)": 117.38, + "step": 36140, + "train_speed(iter/s)": 1.635856 + }, + { + "acc": 0.66300859, + "epoch": 0.9169203450025368, + "grad_norm": 5.625, + "learning_rate": 6.09135887351999e-06, + "loss": 1.59765091, + "memory(GiB)": 117.38, + "step": 36145, + "train_speed(iter/s)": 1.635881 + }, + { + "acc": 0.65275917, + "epoch": 0.9170471841704718, + "grad_norm": 8.4375, + "learning_rate": 6.090335510237286e-06, + "loss": 1.61115723, + "memory(GiB)": 117.38, + "step": 36150, + "train_speed(iter/s)": 1.635906 + }, + { + "acc": 0.66676817, + "epoch": 0.9171740233384069, + "grad_norm": 5.9375, + "learning_rate": 6.089312098996758e-06, + "loss": 1.56021061, + "memory(GiB)": 117.38, + "step": 36155, + "train_speed(iter/s)": 1.63593 + }, + { + "acc": 0.65913329, + "epoch": 0.9173008625063419, + "grad_norm": 7.5, + "learning_rate": 6.088288639843422e-06, + "loss": 1.62115898, + "memory(GiB)": 117.38, + "step": 36160, + "train_speed(iter/s)": 1.635956 + }, + { + "acc": 0.6623313, + "epoch": 0.917427701674277, + "grad_norm": 6.3125, + "learning_rate": 6.08726513282229e-06, + "loss": 1.58611641, + "memory(GiB)": 117.38, + "step": 36165, + "train_speed(iter/s)": 1.635979 + }, + { + "acc": 0.65777364, + "epoch": 0.9175545408422121, + "grad_norm": 5.46875, + "learning_rate": 6.0862415779783855e-06, + "loss": 1.61070251, + "memory(GiB)": 117.38, + "step": 36170, + "train_speed(iter/s)": 1.636002 + }, + { + "acc": 0.64643722, + "epoch": 0.9176813800101471, + "grad_norm": 7.0, + "learning_rate": 6.085217975356726e-06, + "loss": 1.64098854, + "memory(GiB)": 117.38, + "step": 36175, + "train_speed(iter/s)": 1.636025 + }, + { + "acc": 0.64263363, + "epoch": 0.9178082191780822, + "grad_norm": 4.71875, + "learning_rate": 6.084194325002335e-06, + "loss": 1.68528385, + "memory(GiB)": 117.38, + "step": 36180, + "train_speed(iter/s)": 1.636048 + }, + { + "acc": 0.66855478, + "epoch": 0.9179350583460173, + "grad_norm": 5.875, + "learning_rate": 6.083170626960237e-06, + "loss": 1.56983757, + "memory(GiB)": 117.38, + "step": 36185, + "train_speed(iter/s)": 1.63607 + }, + { + "acc": 0.66414213, + "epoch": 0.9180618975139523, + "grad_norm": 4.65625, + "learning_rate": 6.082146881275458e-06, + "loss": 1.59255886, + "memory(GiB)": 117.38, + "step": 36190, + "train_speed(iter/s)": 1.636096 + }, + { + "acc": 0.65383101, + "epoch": 0.9181887366818874, + "grad_norm": 5.46875, + "learning_rate": 6.081123087993028e-06, + "loss": 1.59816837, + "memory(GiB)": 117.38, + "step": 36195, + "train_speed(iter/s)": 1.636122 + }, + { + "acc": 0.638269, + "epoch": 0.9183155758498224, + "grad_norm": 6.28125, + "learning_rate": 6.0800992471579775e-06, + "loss": 1.63397884, + "memory(GiB)": 117.38, + "step": 36200, + "train_speed(iter/s)": 1.636147 + }, + { + "acc": 0.66062183, + "epoch": 0.9184424150177575, + "grad_norm": 5.71875, + "learning_rate": 6.079075358815341e-06, + "loss": 1.56029272, + "memory(GiB)": 117.38, + "step": 36205, + "train_speed(iter/s)": 1.636173 + }, + { + "acc": 0.64855075, + "epoch": 0.9185692541856926, + "grad_norm": 5.6875, + "learning_rate": 6.078051423010152e-06, + "loss": 1.63212471, + "memory(GiB)": 117.38, + "step": 36210, + "train_speed(iter/s)": 1.636198 + }, + { + "acc": 0.64342275, + "epoch": 0.9186960933536276, + "grad_norm": 6.09375, + "learning_rate": 6.077027439787448e-06, + "loss": 1.68921242, + "memory(GiB)": 117.38, + "step": 36215, + "train_speed(iter/s)": 1.636222 + }, + { + "acc": 0.65709906, + "epoch": 0.9188229325215627, + "grad_norm": 6.15625, + "learning_rate": 6.076003409192268e-06, + "loss": 1.62773457, + "memory(GiB)": 117.38, + "step": 36220, + "train_speed(iter/s)": 1.636242 + }, + { + "acc": 0.65981035, + "epoch": 0.9189497716894978, + "grad_norm": 5.625, + "learning_rate": 6.074979331269656e-06, + "loss": 1.59904537, + "memory(GiB)": 117.38, + "step": 36225, + "train_speed(iter/s)": 1.636269 + }, + { + "acc": 0.65500669, + "epoch": 0.9190766108574328, + "grad_norm": 5.28125, + "learning_rate": 6.0739552060646525e-06, + "loss": 1.67204399, + "memory(GiB)": 117.38, + "step": 36230, + "train_speed(iter/s)": 1.636296 + }, + { + "acc": 0.66228189, + "epoch": 0.9192034500253679, + "grad_norm": 6.46875, + "learning_rate": 6.0729310336223025e-06, + "loss": 1.58430262, + "memory(GiB)": 117.38, + "step": 36235, + "train_speed(iter/s)": 1.636322 + }, + { + "acc": 0.66498661, + "epoch": 0.9193302891933028, + "grad_norm": 5.3125, + "learning_rate": 6.071906813987658e-06, + "loss": 1.58328104, + "memory(GiB)": 117.38, + "step": 36240, + "train_speed(iter/s)": 1.636347 + }, + { + "acc": 0.66475973, + "epoch": 0.9194571283612379, + "grad_norm": 6.375, + "learning_rate": 6.070882547205764e-06, + "loss": 1.61242313, + "memory(GiB)": 117.38, + "step": 36245, + "train_speed(iter/s)": 1.636371 + }, + { + "acc": 0.67056246, + "epoch": 0.919583967529173, + "grad_norm": 5.875, + "learning_rate": 6.069858233321677e-06, + "loss": 1.52447662, + "memory(GiB)": 117.38, + "step": 36250, + "train_speed(iter/s)": 1.636395 + }, + { + "acc": 0.67167501, + "epoch": 0.919710806697108, + "grad_norm": 5.90625, + "learning_rate": 6.068833872380445e-06, + "loss": 1.54489784, + "memory(GiB)": 117.38, + "step": 36255, + "train_speed(iter/s)": 1.636419 + }, + { + "acc": 0.63946757, + "epoch": 0.9198376458650431, + "grad_norm": 6.21875, + "learning_rate": 6.067809464427129e-06, + "loss": 1.66090279, + "memory(GiB)": 117.38, + "step": 36260, + "train_speed(iter/s)": 1.636444 + }, + { + "acc": 0.66337266, + "epoch": 0.9199644850329782, + "grad_norm": 5.75, + "learning_rate": 6.066785009506786e-06, + "loss": 1.578654, + "memory(GiB)": 117.38, + "step": 36265, + "train_speed(iter/s)": 1.63647 + }, + { + "acc": 0.65782366, + "epoch": 0.9200913242009132, + "grad_norm": 5.75, + "learning_rate": 6.065760507664474e-06, + "loss": 1.60326195, + "memory(GiB)": 117.38, + "step": 36270, + "train_speed(iter/s)": 1.636496 + }, + { + "acc": 0.6633647, + "epoch": 0.9202181633688483, + "grad_norm": 7.0625, + "learning_rate": 6.064735958945258e-06, + "loss": 1.60193977, + "memory(GiB)": 117.38, + "step": 36275, + "train_speed(iter/s)": 1.636522 + }, + { + "acc": 0.66284847, + "epoch": 0.9203450025367833, + "grad_norm": 8.0, + "learning_rate": 6.0637113633942006e-06, + "loss": 1.65656433, + "memory(GiB)": 117.38, + "step": 36280, + "train_speed(iter/s)": 1.636546 + }, + { + "acc": 0.64009738, + "epoch": 0.9204718417047184, + "grad_norm": 5.3125, + "learning_rate": 6.0626867210563675e-06, + "loss": 1.71321259, + "memory(GiB)": 117.38, + "step": 36285, + "train_speed(iter/s)": 1.63657 + }, + { + "acc": 0.67031155, + "epoch": 0.9205986808726535, + "grad_norm": 4.9375, + "learning_rate": 6.061662031976828e-06, + "loss": 1.55949631, + "memory(GiB)": 117.38, + "step": 36290, + "train_speed(iter/s)": 1.636594 + }, + { + "acc": 0.65030584, + "epoch": 0.9207255200405885, + "grad_norm": 5.5, + "learning_rate": 6.0606372962006534e-06, + "loss": 1.56111631, + "memory(GiB)": 117.38, + "step": 36295, + "train_speed(iter/s)": 1.636619 + }, + { + "acc": 0.64955354, + "epoch": 0.9208523592085236, + "grad_norm": 5.1875, + "learning_rate": 6.0596125137729145e-06, + "loss": 1.61581497, + "memory(GiB)": 117.38, + "step": 36300, + "train_speed(iter/s)": 1.636642 + }, + { + "acc": 0.64847479, + "epoch": 0.9209791983764587, + "grad_norm": 6.0, + "learning_rate": 6.058587684738685e-06, + "loss": 1.66633797, + "memory(GiB)": 117.38, + "step": 36305, + "train_speed(iter/s)": 1.636668 + }, + { + "acc": 0.66139259, + "epoch": 0.9211060375443937, + "grad_norm": 6.625, + "learning_rate": 6.057562809143045e-06, + "loss": 1.56020889, + "memory(GiB)": 117.38, + "step": 36310, + "train_speed(iter/s)": 1.636691 + }, + { + "acc": 0.65074482, + "epoch": 0.9212328767123288, + "grad_norm": 5.53125, + "learning_rate": 6.056537887031069e-06, + "loss": 1.61470642, + "memory(GiB)": 117.38, + "step": 36315, + "train_speed(iter/s)": 1.636341 + }, + { + "acc": 0.66339464, + "epoch": 0.9213597158802638, + "grad_norm": 5.03125, + "learning_rate": 6.055512918447841e-06, + "loss": 1.60316505, + "memory(GiB)": 117.38, + "step": 36320, + "train_speed(iter/s)": 1.636367 + }, + { + "acc": 0.64431586, + "epoch": 0.9214865550481989, + "grad_norm": 5.03125, + "learning_rate": 6.054487903438442e-06, + "loss": 1.6006237, + "memory(GiB)": 117.38, + "step": 36325, + "train_speed(iter/s)": 1.636392 + }, + { + "acc": 0.65243015, + "epoch": 0.921613394216134, + "grad_norm": 5.875, + "learning_rate": 6.0534628420479576e-06, + "loss": 1.63444786, + "memory(GiB)": 117.38, + "step": 36330, + "train_speed(iter/s)": 1.636418 + }, + { + "acc": 0.6644886, + "epoch": 0.921740233384069, + "grad_norm": 5.4375, + "learning_rate": 6.0524377343214724e-06, + "loss": 1.62916622, + "memory(GiB)": 117.38, + "step": 36335, + "train_speed(iter/s)": 1.636443 + }, + { + "acc": 0.64734612, + "epoch": 0.9218670725520041, + "grad_norm": 5.875, + "learning_rate": 6.051412580304079e-06, + "loss": 1.63898258, + "memory(GiB)": 117.38, + "step": 36340, + "train_speed(iter/s)": 1.63647 + }, + { + "acc": 0.64840808, + "epoch": 0.9219939117199392, + "grad_norm": 4.71875, + "learning_rate": 6.050387380040864e-06, + "loss": 1.64395428, + "memory(GiB)": 117.38, + "step": 36345, + "train_speed(iter/s)": 1.636496 + }, + { + "acc": 0.63976455, + "epoch": 0.9221207508878742, + "grad_norm": 5.375, + "learning_rate": 6.049362133576924e-06, + "loss": 1.65737228, + "memory(GiB)": 117.38, + "step": 36350, + "train_speed(iter/s)": 1.63652 + }, + { + "acc": 0.6784482, + "epoch": 0.9222475900558093, + "grad_norm": 6.15625, + "learning_rate": 6.048336840957351e-06, + "loss": 1.56237516, + "memory(GiB)": 117.38, + "step": 36355, + "train_speed(iter/s)": 1.636545 + }, + { + "acc": 0.66122484, + "epoch": 0.9223744292237442, + "grad_norm": 5.15625, + "learning_rate": 6.047311502227245e-06, + "loss": 1.60077171, + "memory(GiB)": 117.38, + "step": 36360, + "train_speed(iter/s)": 1.636571 + }, + { + "acc": 0.65276766, + "epoch": 0.9225012683916793, + "grad_norm": 5.8125, + "learning_rate": 6.046286117431703e-06, + "loss": 1.61099243, + "memory(GiB)": 117.38, + "step": 36365, + "train_speed(iter/s)": 1.636594 + }, + { + "acc": 0.65844078, + "epoch": 0.9226281075596144, + "grad_norm": 4.40625, + "learning_rate": 6.0452606866158246e-06, + "loss": 1.64127617, + "memory(GiB)": 117.38, + "step": 36370, + "train_speed(iter/s)": 1.636618 + }, + { + "acc": 0.65731187, + "epoch": 0.9227549467275494, + "grad_norm": 5.90625, + "learning_rate": 6.044235209824716e-06, + "loss": 1.55045872, + "memory(GiB)": 117.38, + "step": 36375, + "train_speed(iter/s)": 1.636644 + }, + { + "acc": 0.64707279, + "epoch": 0.9228817858954845, + "grad_norm": 6.3125, + "learning_rate": 6.04320968710348e-06, + "loss": 1.70206375, + "memory(GiB)": 117.38, + "step": 36380, + "train_speed(iter/s)": 1.636668 + }, + { + "acc": 0.63317547, + "epoch": 0.9230086250634196, + "grad_norm": 6.90625, + "learning_rate": 6.042184118497223e-06, + "loss": 1.67799072, + "memory(GiB)": 117.38, + "step": 36385, + "train_speed(iter/s)": 1.636694 + }, + { + "acc": 0.67488623, + "epoch": 0.9231354642313546, + "grad_norm": 6.3125, + "learning_rate": 6.0411585040510576e-06, + "loss": 1.57867241, + "memory(GiB)": 117.38, + "step": 36390, + "train_speed(iter/s)": 1.636719 + }, + { + "acc": 0.66463165, + "epoch": 0.9232623033992897, + "grad_norm": 5.375, + "learning_rate": 6.040132843810091e-06, + "loss": 1.56813745, + "memory(GiB)": 117.38, + "step": 36395, + "train_speed(iter/s)": 1.636745 + }, + { + "acc": 0.68164845, + "epoch": 0.9233891425672247, + "grad_norm": 6.28125, + "learning_rate": 6.03910713781944e-06, + "loss": 1.48613977, + "memory(GiB)": 117.38, + "step": 36400, + "train_speed(iter/s)": 1.636771 + }, + { + "acc": 0.64825621, + "epoch": 0.9235159817351598, + "grad_norm": 6.25, + "learning_rate": 6.038081386124216e-06, + "loss": 1.60890808, + "memory(GiB)": 117.38, + "step": 36405, + "train_speed(iter/s)": 1.636797 + }, + { + "acc": 0.65801048, + "epoch": 0.9236428209030949, + "grad_norm": 5.84375, + "learning_rate": 6.037055588769539e-06, + "loss": 1.58543015, + "memory(GiB)": 117.38, + "step": 36410, + "train_speed(iter/s)": 1.636821 + }, + { + "acc": 0.66115756, + "epoch": 0.9237696600710299, + "grad_norm": 5.15625, + "learning_rate": 6.036029745800527e-06, + "loss": 1.56123562, + "memory(GiB)": 117.38, + "step": 36415, + "train_speed(iter/s)": 1.636846 + }, + { + "acc": 0.65744977, + "epoch": 0.923896499238965, + "grad_norm": 5.40625, + "learning_rate": 6.0350038572623e-06, + "loss": 1.66076927, + "memory(GiB)": 117.38, + "step": 36420, + "train_speed(iter/s)": 1.63687 + }, + { + "acc": 0.64745369, + "epoch": 0.9240233384069001, + "grad_norm": 5.15625, + "learning_rate": 6.033977923199984e-06, + "loss": 1.60215511, + "memory(GiB)": 117.38, + "step": 36425, + "train_speed(iter/s)": 1.636896 + }, + { + "acc": 0.67307682, + "epoch": 0.9241501775748351, + "grad_norm": 6.375, + "learning_rate": 6.032951943658702e-06, + "loss": 1.51451397, + "memory(GiB)": 117.38, + "step": 36430, + "train_speed(iter/s)": 1.636919 + }, + { + "acc": 0.65064034, + "epoch": 0.9242770167427702, + "grad_norm": 6.0, + "learning_rate": 6.031925918683582e-06, + "loss": 1.62221336, + "memory(GiB)": 117.38, + "step": 36435, + "train_speed(iter/s)": 1.636944 + }, + { + "acc": 0.63989716, + "epoch": 0.9244038559107052, + "grad_norm": 6.4375, + "learning_rate": 6.030899848319754e-06, + "loss": 1.66184444, + "memory(GiB)": 117.38, + "step": 36440, + "train_speed(iter/s)": 1.636969 + }, + { + "acc": 0.64444189, + "epoch": 0.9245306950786403, + "grad_norm": 5.25, + "learning_rate": 6.029873732612346e-06, + "loss": 1.66274128, + "memory(GiB)": 117.38, + "step": 36445, + "train_speed(iter/s)": 1.636994 + }, + { + "acc": 0.6538352, + "epoch": 0.9246575342465754, + "grad_norm": 6.25, + "learning_rate": 6.028847571606493e-06, + "loss": 1.65692596, + "memory(GiB)": 117.38, + "step": 36450, + "train_speed(iter/s)": 1.637017 + }, + { + "acc": 0.64574213, + "epoch": 0.9247843734145104, + "grad_norm": 5.25, + "learning_rate": 6.0278213653473305e-06, + "loss": 1.60543823, + "memory(GiB)": 117.38, + "step": 36455, + "train_speed(iter/s)": 1.637042 + }, + { + "acc": 0.65711288, + "epoch": 0.9249112125824455, + "grad_norm": 5.21875, + "learning_rate": 6.026795113879998e-06, + "loss": 1.62518845, + "memory(GiB)": 117.38, + "step": 36460, + "train_speed(iter/s)": 1.637068 + }, + { + "acc": 0.65391207, + "epoch": 0.9250380517503806, + "grad_norm": 5.75, + "learning_rate": 6.025768817249629e-06, + "loss": 1.63711586, + "memory(GiB)": 117.38, + "step": 36465, + "train_speed(iter/s)": 1.637094 + }, + { + "acc": 0.62530565, + "epoch": 0.9251648909183156, + "grad_norm": 6.15625, + "learning_rate": 6.024742475501369e-06, + "loss": 1.71547623, + "memory(GiB)": 117.38, + "step": 36470, + "train_speed(iter/s)": 1.63712 + }, + { + "acc": 0.65347185, + "epoch": 0.9252917300862507, + "grad_norm": 5.625, + "learning_rate": 6.023716088680359e-06, + "loss": 1.5999218, + "memory(GiB)": 117.38, + "step": 36475, + "train_speed(iter/s)": 1.637146 + }, + { + "acc": 0.64557905, + "epoch": 0.9254185692541856, + "grad_norm": 6.0625, + "learning_rate": 6.022689656831746e-06, + "loss": 1.54885921, + "memory(GiB)": 117.38, + "step": 36480, + "train_speed(iter/s)": 1.637172 + }, + { + "acc": 0.65377927, + "epoch": 0.9255454084221207, + "grad_norm": 8.0625, + "learning_rate": 6.021663180000675e-06, + "loss": 1.58136654, + "memory(GiB)": 117.38, + "step": 36485, + "train_speed(iter/s)": 1.637197 + }, + { + "acc": 0.65053911, + "epoch": 0.9256722475900558, + "grad_norm": 5.75, + "learning_rate": 6.020636658232297e-06, + "loss": 1.61182327, + "memory(GiB)": 117.38, + "step": 36490, + "train_speed(iter/s)": 1.637222 + }, + { + "acc": 0.64730105, + "epoch": 0.9257990867579908, + "grad_norm": 6.125, + "learning_rate": 6.019610091571762e-06, + "loss": 1.64903641, + "memory(GiB)": 117.38, + "step": 36495, + "train_speed(iter/s)": 1.637246 + }, + { + "acc": 0.64974604, + "epoch": 0.9259259259259259, + "grad_norm": 7.375, + "learning_rate": 6.018583480064222e-06, + "loss": 1.69215298, + "memory(GiB)": 117.38, + "step": 36500, + "train_speed(iter/s)": 1.637271 + }, + { + "acc": 0.65762601, + "epoch": 0.926052765093861, + "grad_norm": 5.34375, + "learning_rate": 6.017556823754833e-06, + "loss": 1.50422382, + "memory(GiB)": 117.38, + "step": 36505, + "train_speed(iter/s)": 1.637295 + }, + { + "acc": 0.66133471, + "epoch": 0.926179604261796, + "grad_norm": 7.375, + "learning_rate": 6.016530122688753e-06, + "loss": 1.61290493, + "memory(GiB)": 117.38, + "step": 36510, + "train_speed(iter/s)": 1.63732 + }, + { + "acc": 0.66842136, + "epoch": 0.9263064434297311, + "grad_norm": 6.09375, + "learning_rate": 6.015503376911138e-06, + "loss": 1.58520184, + "memory(GiB)": 117.38, + "step": 36515, + "train_speed(iter/s)": 1.637346 + }, + { + "acc": 0.64907427, + "epoch": 0.9264332825976661, + "grad_norm": 4.9375, + "learning_rate": 6.0144765864671515e-06, + "loss": 1.61483612, + "memory(GiB)": 117.38, + "step": 36520, + "train_speed(iter/s)": 1.637372 + }, + { + "acc": 0.6322504, + "epoch": 0.9265601217656012, + "grad_norm": 5.4375, + "learning_rate": 6.013449751401954e-06, + "loss": 1.64845676, + "memory(GiB)": 117.38, + "step": 36525, + "train_speed(iter/s)": 1.637398 + }, + { + "acc": 0.65881457, + "epoch": 0.9266869609335363, + "grad_norm": 6.15625, + "learning_rate": 6.012422871760715e-06, + "loss": 1.59565029, + "memory(GiB)": 117.38, + "step": 36530, + "train_speed(iter/s)": 1.637423 + }, + { + "acc": 0.64303846, + "epoch": 0.9268138001014713, + "grad_norm": 5.25, + "learning_rate": 6.011395947588594e-06, + "loss": 1.64643135, + "memory(GiB)": 117.38, + "step": 36535, + "train_speed(iter/s)": 1.637449 + }, + { + "acc": 0.65159397, + "epoch": 0.9269406392694064, + "grad_norm": 5.46875, + "learning_rate": 6.010368978930767e-06, + "loss": 1.60015945, + "memory(GiB)": 117.38, + "step": 36540, + "train_speed(iter/s)": 1.637474 + }, + { + "acc": 0.65719876, + "epoch": 0.9270674784373415, + "grad_norm": 5.9375, + "learning_rate": 6.0093419658323995e-06, + "loss": 1.65373898, + "memory(GiB)": 117.38, + "step": 36545, + "train_speed(iter/s)": 1.637499 + }, + { + "acc": 0.64759283, + "epoch": 0.9271943176052765, + "grad_norm": 6.0, + "learning_rate": 6.0083149083386675e-06, + "loss": 1.69489746, + "memory(GiB)": 117.38, + "step": 36550, + "train_speed(iter/s)": 1.637525 + }, + { + "acc": 0.64462337, + "epoch": 0.9273211567732116, + "grad_norm": 5.78125, + "learning_rate": 6.007287806494742e-06, + "loss": 1.64526329, + "memory(GiB)": 117.38, + "step": 36555, + "train_speed(iter/s)": 1.63755 + }, + { + "acc": 0.66012745, + "epoch": 0.9274479959411466, + "grad_norm": 5.28125, + "learning_rate": 6.006260660345802e-06, + "loss": 1.59140701, + "memory(GiB)": 117.38, + "step": 36560, + "train_speed(iter/s)": 1.637575 + }, + { + "acc": 0.65189114, + "epoch": 0.9275748351090817, + "grad_norm": 5.3125, + "learning_rate": 6.005233469937027e-06, + "loss": 1.61465282, + "memory(GiB)": 117.38, + "step": 36565, + "train_speed(iter/s)": 1.637601 + }, + { + "acc": 0.66557894, + "epoch": 0.9277016742770168, + "grad_norm": 6.34375, + "learning_rate": 6.004206235313594e-06, + "loss": 1.55650749, + "memory(GiB)": 117.38, + "step": 36570, + "train_speed(iter/s)": 1.637627 + }, + { + "acc": 0.65521727, + "epoch": 0.9278285134449518, + "grad_norm": 5.21875, + "learning_rate": 6.003178956520688e-06, + "loss": 1.59921608, + "memory(GiB)": 117.38, + "step": 36575, + "train_speed(iter/s)": 1.637653 + }, + { + "acc": 0.65742292, + "epoch": 0.9279553526128869, + "grad_norm": 4.4375, + "learning_rate": 6.002151633603493e-06, + "loss": 1.62825947, + "memory(GiB)": 117.38, + "step": 36580, + "train_speed(iter/s)": 1.637679 + }, + { + "acc": 0.64909029, + "epoch": 0.928082191780822, + "grad_norm": 5.03125, + "learning_rate": 6.0011242666071945e-06, + "loss": 1.62185135, + "memory(GiB)": 117.38, + "step": 36585, + "train_speed(iter/s)": 1.637703 + }, + { + "acc": 0.65099921, + "epoch": 0.928209030948757, + "grad_norm": 5.3125, + "learning_rate": 6.000096855576982e-06, + "loss": 1.55952425, + "memory(GiB)": 117.38, + "step": 36590, + "train_speed(iter/s)": 1.637727 + }, + { + "acc": 0.66165724, + "epoch": 0.928335870116692, + "grad_norm": 5.34375, + "learning_rate": 5.999069400558044e-06, + "loss": 1.62240829, + "memory(GiB)": 117.38, + "step": 36595, + "train_speed(iter/s)": 1.637749 + }, + { + "acc": 0.65915008, + "epoch": 0.928462709284627, + "grad_norm": 6.09375, + "learning_rate": 5.998041901595573e-06, + "loss": 1.58293343, + "memory(GiB)": 117.38, + "step": 36600, + "train_speed(iter/s)": 1.637775 + }, + { + "acc": 0.63523045, + "epoch": 0.9285895484525621, + "grad_norm": 5.78125, + "learning_rate": 5.997014358734763e-06, + "loss": 1.6437191, + "memory(GiB)": 117.38, + "step": 36605, + "train_speed(iter/s)": 1.637799 + }, + { + "acc": 0.65464315, + "epoch": 0.9287163876204972, + "grad_norm": 5.9375, + "learning_rate": 5.995986772020811e-06, + "loss": 1.54708357, + "memory(GiB)": 117.38, + "step": 36610, + "train_speed(iter/s)": 1.637825 + }, + { + "acc": 0.64718585, + "epoch": 0.9288432267884322, + "grad_norm": 5.28125, + "learning_rate": 5.994959141498913e-06, + "loss": 1.61540813, + "memory(GiB)": 117.38, + "step": 36615, + "train_speed(iter/s)": 1.637849 + }, + { + "acc": 0.66159501, + "epoch": 0.9289700659563673, + "grad_norm": 4.6875, + "learning_rate": 5.993931467214272e-06, + "loss": 1.6199379, + "memory(GiB)": 117.38, + "step": 36620, + "train_speed(iter/s)": 1.637874 + }, + { + "acc": 0.64900413, + "epoch": 0.9290969051243024, + "grad_norm": 5.59375, + "learning_rate": 5.992903749212084e-06, + "loss": 1.6228775, + "memory(GiB)": 117.38, + "step": 36625, + "train_speed(iter/s)": 1.637899 + }, + { + "acc": 0.64893112, + "epoch": 0.9292237442922374, + "grad_norm": 6.25, + "learning_rate": 5.991875987537559e-06, + "loss": 1.6396759, + "memory(GiB)": 117.38, + "step": 36630, + "train_speed(iter/s)": 1.637924 + }, + { + "acc": 0.65748696, + "epoch": 0.9293505834601725, + "grad_norm": 6.84375, + "learning_rate": 5.990848182235898e-06, + "loss": 1.63052235, + "memory(GiB)": 117.38, + "step": 36635, + "train_speed(iter/s)": 1.637949 + }, + { + "acc": 0.64518785, + "epoch": 0.9294774226281075, + "grad_norm": 7.0, + "learning_rate": 5.98982033335231e-06, + "loss": 1.67615356, + "memory(GiB)": 117.38, + "step": 36640, + "train_speed(iter/s)": 1.637974 + }, + { + "acc": 0.66198421, + "epoch": 0.9296042617960426, + "grad_norm": 5.59375, + "learning_rate": 5.988792440932006e-06, + "loss": 1.51006746, + "memory(GiB)": 117.38, + "step": 36645, + "train_speed(iter/s)": 1.637997 + }, + { + "acc": 0.66837864, + "epoch": 0.9297311009639777, + "grad_norm": 5.40625, + "learning_rate": 5.987764505020195e-06, + "loss": 1.51374893, + "memory(GiB)": 117.38, + "step": 36650, + "train_speed(iter/s)": 1.638023 + }, + { + "acc": 0.64073043, + "epoch": 0.9298579401319127, + "grad_norm": 5.75, + "learning_rate": 5.986736525662091e-06, + "loss": 1.62323952, + "memory(GiB)": 117.38, + "step": 36655, + "train_speed(iter/s)": 1.638048 + }, + { + "acc": 0.65298071, + "epoch": 0.9299847792998478, + "grad_norm": 4.71875, + "learning_rate": 5.985708502902909e-06, + "loss": 1.56594706, + "memory(GiB)": 117.38, + "step": 36660, + "train_speed(iter/s)": 1.638072 + }, + { + "acc": 0.65557222, + "epoch": 0.9301116184677829, + "grad_norm": 5.65625, + "learning_rate": 5.984680436787867e-06, + "loss": 1.62632198, + "memory(GiB)": 117.38, + "step": 36665, + "train_speed(iter/s)": 1.638096 + }, + { + "acc": 0.65136414, + "epoch": 0.9302384576357179, + "grad_norm": 5.28125, + "learning_rate": 5.983652327362182e-06, + "loss": 1.60573158, + "memory(GiB)": 117.38, + "step": 36670, + "train_speed(iter/s)": 1.638119 + }, + { + "acc": 0.65772572, + "epoch": 0.930365296803653, + "grad_norm": 5.09375, + "learning_rate": 5.982624174671077e-06, + "loss": 1.64197807, + "memory(GiB)": 117.38, + "step": 36675, + "train_speed(iter/s)": 1.638142 + }, + { + "acc": 0.65992022, + "epoch": 0.930492135971588, + "grad_norm": 6.78125, + "learning_rate": 5.981595978759773e-06, + "loss": 1.58844309, + "memory(GiB)": 117.38, + "step": 36680, + "train_speed(iter/s)": 1.638167 + }, + { + "acc": 0.65233703, + "epoch": 0.9306189751395231, + "grad_norm": 4.9375, + "learning_rate": 5.980567739673495e-06, + "loss": 1.64766006, + "memory(GiB)": 117.38, + "step": 36685, + "train_speed(iter/s)": 1.638192 + }, + { + "acc": 0.65973969, + "epoch": 0.9307458143074582, + "grad_norm": 5.375, + "learning_rate": 5.979539457457472e-06, + "loss": 1.60913048, + "memory(GiB)": 117.38, + "step": 36690, + "train_speed(iter/s)": 1.638218 + }, + { + "acc": 0.65812969, + "epoch": 0.9308726534753932, + "grad_norm": 6.34375, + "learning_rate": 5.978511132156928e-06, + "loss": 1.63085709, + "memory(GiB)": 117.38, + "step": 36695, + "train_speed(iter/s)": 1.638241 + }, + { + "acc": 0.64890075, + "epoch": 0.9309994926433283, + "grad_norm": 6.28125, + "learning_rate": 5.9774827638170965e-06, + "loss": 1.65883102, + "memory(GiB)": 117.38, + "step": 36700, + "train_speed(iter/s)": 1.638267 + }, + { + "acc": 0.66292391, + "epoch": 0.9311263318112634, + "grad_norm": 5.125, + "learning_rate": 5.9764543524832085e-06, + "loss": 1.55409737, + "memory(GiB)": 117.38, + "step": 36705, + "train_speed(iter/s)": 1.638292 + }, + { + "acc": 0.66930556, + "epoch": 0.9312531709791984, + "grad_norm": 6.21875, + "learning_rate": 5.975425898200499e-06, + "loss": 1.59013996, + "memory(GiB)": 117.38, + "step": 36710, + "train_speed(iter/s)": 1.638316 + }, + { + "acc": 0.65103798, + "epoch": 0.9313800101471335, + "grad_norm": 6.53125, + "learning_rate": 5.974397401014202e-06, + "loss": 1.66796513, + "memory(GiB)": 117.38, + "step": 36715, + "train_speed(iter/s)": 1.63834 + }, + { + "acc": 0.64828854, + "epoch": 0.9315068493150684, + "grad_norm": 5.5, + "learning_rate": 5.973368860969559e-06, + "loss": 1.62634964, + "memory(GiB)": 117.38, + "step": 36720, + "train_speed(iter/s)": 1.638364 + }, + { + "acc": 0.65654998, + "epoch": 0.9316336884830035, + "grad_norm": 6.625, + "learning_rate": 5.972340278111808e-06, + "loss": 1.62117386, + "memory(GiB)": 117.38, + "step": 36725, + "train_speed(iter/s)": 1.638388 + }, + { + "acc": 0.66287837, + "epoch": 0.9317605276509386, + "grad_norm": 5.1875, + "learning_rate": 5.9713116524861895e-06, + "loss": 1.51388626, + "memory(GiB)": 117.38, + "step": 36730, + "train_speed(iter/s)": 1.63841 + }, + { + "acc": 0.65718532, + "epoch": 0.9318873668188736, + "grad_norm": 6.65625, + "learning_rate": 5.970282984137947e-06, + "loss": 1.66140022, + "memory(GiB)": 117.38, + "step": 36735, + "train_speed(iter/s)": 1.638434 + }, + { + "acc": 0.67708817, + "epoch": 0.9320142059868087, + "grad_norm": 5.15625, + "learning_rate": 5.969254273112328e-06, + "loss": 1.53770084, + "memory(GiB)": 117.38, + "step": 36740, + "train_speed(iter/s)": 1.638459 + }, + { + "acc": 0.65507145, + "epoch": 0.9321410451547438, + "grad_norm": 7.03125, + "learning_rate": 5.968225519454577e-06, + "loss": 1.63341446, + "memory(GiB)": 117.38, + "step": 36745, + "train_speed(iter/s)": 1.638481 + }, + { + "acc": 0.66716251, + "epoch": 0.9322678843226788, + "grad_norm": 6.0, + "learning_rate": 5.967196723209947e-06, + "loss": 1.57138758, + "memory(GiB)": 117.38, + "step": 36750, + "train_speed(iter/s)": 1.638504 + }, + { + "acc": 0.64545074, + "epoch": 0.9323947234906139, + "grad_norm": 6.40625, + "learning_rate": 5.966167884423686e-06, + "loss": 1.69396286, + "memory(GiB)": 117.38, + "step": 36755, + "train_speed(iter/s)": 1.638528 + }, + { + "acc": 0.67997723, + "epoch": 0.9325215626585489, + "grad_norm": 6.78125, + "learning_rate": 5.965139003141048e-06, + "loss": 1.50527534, + "memory(GiB)": 117.38, + "step": 36760, + "train_speed(iter/s)": 1.638553 + }, + { + "acc": 0.65336704, + "epoch": 0.932648401826484, + "grad_norm": 5.5, + "learning_rate": 5.964110079407287e-06, + "loss": 1.58715115, + "memory(GiB)": 117.38, + "step": 36765, + "train_speed(iter/s)": 1.638579 + }, + { + "acc": 0.64763618, + "epoch": 0.9327752409944191, + "grad_norm": 5.90625, + "learning_rate": 5.9630811132676625e-06, + "loss": 1.66229744, + "memory(GiB)": 117.38, + "step": 36770, + "train_speed(iter/s)": 1.638603 + }, + { + "acc": 0.65673161, + "epoch": 0.9329020801623541, + "grad_norm": 5.5, + "learning_rate": 5.962052104767427e-06, + "loss": 1.64112015, + "memory(GiB)": 117.38, + "step": 36775, + "train_speed(iter/s)": 1.638627 + }, + { + "acc": 0.66401796, + "epoch": 0.9330289193302892, + "grad_norm": 6.375, + "learning_rate": 5.961023053951848e-06, + "loss": 1.59465237, + "memory(GiB)": 117.38, + "step": 36780, + "train_speed(iter/s)": 1.63865 + }, + { + "acc": 0.66067505, + "epoch": 0.9331557584982243, + "grad_norm": 5.96875, + "learning_rate": 5.9599939608661825e-06, + "loss": 1.65322723, + "memory(GiB)": 117.38, + "step": 36785, + "train_speed(iter/s)": 1.638672 + }, + { + "acc": 0.64260502, + "epoch": 0.9332825976661593, + "grad_norm": 5.4375, + "learning_rate": 5.9589648255556975e-06, + "loss": 1.64575615, + "memory(GiB)": 117.38, + "step": 36790, + "train_speed(iter/s)": 1.638697 + }, + { + "acc": 0.65490389, + "epoch": 0.9334094368340944, + "grad_norm": 6.15625, + "learning_rate": 5.957935648065658e-06, + "loss": 1.58748703, + "memory(GiB)": 117.38, + "step": 36795, + "train_speed(iter/s)": 1.638721 + }, + { + "acc": 0.65766916, + "epoch": 0.9335362760020294, + "grad_norm": 6.875, + "learning_rate": 5.956906428441331e-06, + "loss": 1.58518772, + "memory(GiB)": 117.38, + "step": 36800, + "train_speed(iter/s)": 1.638746 + }, + { + "acc": 0.65439596, + "epoch": 0.9336631151699645, + "grad_norm": 5.1875, + "learning_rate": 5.955877166727988e-06, + "loss": 1.62941055, + "memory(GiB)": 117.38, + "step": 36805, + "train_speed(iter/s)": 1.63877 + }, + { + "acc": 0.65952988, + "epoch": 0.9337899543378996, + "grad_norm": 6.3125, + "learning_rate": 5.954847862970898e-06, + "loss": 1.60132389, + "memory(GiB)": 117.38, + "step": 36810, + "train_speed(iter/s)": 1.638795 + }, + { + "acc": 0.64738297, + "epoch": 0.9339167935058346, + "grad_norm": 4.46875, + "learning_rate": 5.953818517215338e-06, + "loss": 1.66351566, + "memory(GiB)": 117.38, + "step": 36815, + "train_speed(iter/s)": 1.638818 + }, + { + "acc": 0.66023378, + "epoch": 0.9340436326737697, + "grad_norm": 5.375, + "learning_rate": 5.95278912950658e-06, + "loss": 1.60740948, + "memory(GiB)": 117.38, + "step": 36820, + "train_speed(iter/s)": 1.638841 + }, + { + "acc": 0.67299509, + "epoch": 0.9341704718417048, + "grad_norm": 5.09375, + "learning_rate": 5.9517596998899e-06, + "loss": 1.51019211, + "memory(GiB)": 117.38, + "step": 36825, + "train_speed(iter/s)": 1.638867 + }, + { + "acc": 0.66861639, + "epoch": 0.9342973110096398, + "grad_norm": 5.8125, + "learning_rate": 5.9507302284105836e-06, + "loss": 1.55800476, + "memory(GiB)": 117.38, + "step": 36830, + "train_speed(iter/s)": 1.638889 + }, + { + "acc": 0.66935649, + "epoch": 0.9344241501775749, + "grad_norm": 5.5, + "learning_rate": 5.949700715113904e-06, + "loss": 1.59582157, + "memory(GiB)": 117.38, + "step": 36835, + "train_speed(iter/s)": 1.638913 + }, + { + "acc": 0.6585845, + "epoch": 0.9345509893455098, + "grad_norm": 6.4375, + "learning_rate": 5.9486711600451484e-06, + "loss": 1.60249119, + "memory(GiB)": 117.38, + "step": 36840, + "train_speed(iter/s)": 1.638937 + }, + { + "acc": 0.66531453, + "epoch": 0.9346778285134449, + "grad_norm": 5.75, + "learning_rate": 5.9476415632495974e-06, + "loss": 1.59580631, + "memory(GiB)": 117.38, + "step": 36845, + "train_speed(iter/s)": 1.638961 + }, + { + "acc": 0.64385386, + "epoch": 0.93480466768138, + "grad_norm": 5.46875, + "learning_rate": 5.946611924772542e-06, + "loss": 1.61241417, + "memory(GiB)": 117.38, + "step": 36850, + "train_speed(iter/s)": 1.638984 + }, + { + "acc": 0.66709561, + "epoch": 0.934931506849315, + "grad_norm": 5.21875, + "learning_rate": 5.945582244659267e-06, + "loss": 1.55254078, + "memory(GiB)": 117.38, + "step": 36855, + "train_speed(iter/s)": 1.639009 + }, + { + "acc": 0.65680184, + "epoch": 0.9350583460172501, + "grad_norm": 5.90625, + "learning_rate": 5.944552522955063e-06, + "loss": 1.63569489, + "memory(GiB)": 117.38, + "step": 36860, + "train_speed(iter/s)": 1.639033 + }, + { + "acc": 0.67392912, + "epoch": 0.9351851851851852, + "grad_norm": 4.4375, + "learning_rate": 5.943522759705221e-06, + "loss": 1.58377724, + "memory(GiB)": 117.38, + "step": 36865, + "train_speed(iter/s)": 1.639057 + }, + { + "acc": 0.64179478, + "epoch": 0.9353120243531202, + "grad_norm": 5.0, + "learning_rate": 5.942492954955037e-06, + "loss": 1.67848854, + "memory(GiB)": 117.38, + "step": 36870, + "train_speed(iter/s)": 1.639082 + }, + { + "acc": 0.63717403, + "epoch": 0.9354388635210553, + "grad_norm": 5.84375, + "learning_rate": 5.941463108749804e-06, + "loss": 1.67264671, + "memory(GiB)": 117.38, + "step": 36875, + "train_speed(iter/s)": 1.639106 + }, + { + "acc": 0.65555096, + "epoch": 0.9355657026889903, + "grad_norm": 6.65625, + "learning_rate": 5.940433221134821e-06, + "loss": 1.61928978, + "memory(GiB)": 117.38, + "step": 36880, + "train_speed(iter/s)": 1.639132 + }, + { + "acc": 0.66600709, + "epoch": 0.9356925418569254, + "grad_norm": 5.71875, + "learning_rate": 5.9394032921553856e-06, + "loss": 1.61757965, + "memory(GiB)": 117.38, + "step": 36885, + "train_speed(iter/s)": 1.638737 + }, + { + "acc": 0.6494102, + "epoch": 0.9358193810248605, + "grad_norm": 6.625, + "learning_rate": 5.9383733218568e-06, + "loss": 1.65413971, + "memory(GiB)": 117.38, + "step": 36890, + "train_speed(iter/s)": 1.638763 + }, + { + "acc": 0.65740604, + "epoch": 0.9359462201927955, + "grad_norm": 5.28125, + "learning_rate": 5.937343310284365e-06, + "loss": 1.63521404, + "memory(GiB)": 117.38, + "step": 36895, + "train_speed(iter/s)": 1.638365 + }, + { + "acc": 0.65558767, + "epoch": 0.9360730593607306, + "grad_norm": 5.03125, + "learning_rate": 5.936313257483387e-06, + "loss": 1.59242296, + "memory(GiB)": 117.38, + "step": 36900, + "train_speed(iter/s)": 1.638391 + }, + { + "acc": 0.64981451, + "epoch": 0.9361998985286657, + "grad_norm": 5.1875, + "learning_rate": 5.935283163499171e-06, + "loss": 1.64229584, + "memory(GiB)": 117.38, + "step": 36905, + "train_speed(iter/s)": 1.638416 + }, + { + "acc": 0.63950448, + "epoch": 0.9363267376966007, + "grad_norm": 5.25, + "learning_rate": 5.9342530283770274e-06, + "loss": 1.66219273, + "memory(GiB)": 117.38, + "step": 36910, + "train_speed(iter/s)": 1.638438 + }, + { + "acc": 0.66492348, + "epoch": 0.9364535768645358, + "grad_norm": 4.53125, + "learning_rate": 5.9332228521622615e-06, + "loss": 1.51675892, + "memory(GiB)": 117.38, + "step": 36915, + "train_speed(iter/s)": 1.638038 + }, + { + "acc": 0.66963034, + "epoch": 0.9365804160324708, + "grad_norm": 5.6875, + "learning_rate": 5.93219263490019e-06, + "loss": 1.614711, + "memory(GiB)": 117.38, + "step": 36920, + "train_speed(iter/s)": 1.638063 + }, + { + "acc": 0.66557255, + "epoch": 0.9367072552004059, + "grad_norm": 5.75, + "learning_rate": 5.931162376636123e-06, + "loss": 1.6623661, + "memory(GiB)": 117.38, + "step": 36925, + "train_speed(iter/s)": 1.638087 + }, + { + "acc": 0.6570353, + "epoch": 0.936834094368341, + "grad_norm": 6.5, + "learning_rate": 5.93013207741538e-06, + "loss": 1.56609812, + "memory(GiB)": 117.38, + "step": 36930, + "train_speed(iter/s)": 1.63811 + }, + { + "acc": 0.65826535, + "epoch": 0.936960933536276, + "grad_norm": 5.90625, + "learning_rate": 5.929101737283274e-06, + "loss": 1.56675377, + "memory(GiB)": 117.38, + "step": 36935, + "train_speed(iter/s)": 1.638135 + }, + { + "acc": 0.64306622, + "epoch": 0.9370877727042111, + "grad_norm": 6.0625, + "learning_rate": 5.928071356285126e-06, + "loss": 1.60418987, + "memory(GiB)": 117.38, + "step": 36940, + "train_speed(iter/s)": 1.638159 + }, + { + "acc": 0.66855669, + "epoch": 0.9372146118721462, + "grad_norm": 5.4375, + "learning_rate": 5.927040934466255e-06, + "loss": 1.55486479, + "memory(GiB)": 117.38, + "step": 36945, + "train_speed(iter/s)": 1.638185 + }, + { + "acc": 0.67566752, + "epoch": 0.9373414510400812, + "grad_norm": 5.1875, + "learning_rate": 5.926010471871986e-06, + "loss": 1.55081129, + "memory(GiB)": 117.38, + "step": 36950, + "train_speed(iter/s)": 1.638209 + }, + { + "acc": 0.65869427, + "epoch": 0.9374682902080163, + "grad_norm": 4.875, + "learning_rate": 5.924979968547642e-06, + "loss": 1.5476099, + "memory(GiB)": 117.38, + "step": 36955, + "train_speed(iter/s)": 1.638234 + }, + { + "acc": 0.63511362, + "epoch": 0.9375951293759512, + "grad_norm": 6.53125, + "learning_rate": 5.9239494245385485e-06, + "loss": 1.71486073, + "memory(GiB)": 117.38, + "step": 36960, + "train_speed(iter/s)": 1.638258 + }, + { + "acc": 0.65801692, + "epoch": 0.9377219685438863, + "grad_norm": 5.59375, + "learning_rate": 5.9229188398900325e-06, + "loss": 1.62753143, + "memory(GiB)": 117.38, + "step": 36965, + "train_speed(iter/s)": 1.638282 + }, + { + "acc": 0.6414794, + "epoch": 0.9378488077118214, + "grad_norm": 5.84375, + "learning_rate": 5.921888214647429e-06, + "loss": 1.66198082, + "memory(GiB)": 117.38, + "step": 36970, + "train_speed(iter/s)": 1.638306 + }, + { + "acc": 0.65282521, + "epoch": 0.9379756468797564, + "grad_norm": 5.4375, + "learning_rate": 5.920857548856064e-06, + "loss": 1.57444878, + "memory(GiB)": 117.38, + "step": 36975, + "train_speed(iter/s)": 1.63833 + }, + { + "acc": 0.64693775, + "epoch": 0.9381024860476915, + "grad_norm": 7.78125, + "learning_rate": 5.919826842561274e-06, + "loss": 1.61087914, + "memory(GiB)": 117.38, + "step": 36980, + "train_speed(iter/s)": 1.638356 + }, + { + "acc": 0.65350242, + "epoch": 0.9382293252156266, + "grad_norm": 5.6875, + "learning_rate": 5.91879609580839e-06, + "loss": 1.64501724, + "memory(GiB)": 117.38, + "step": 36985, + "train_speed(iter/s)": 1.63838 + }, + { + "acc": 0.64666762, + "epoch": 0.9383561643835616, + "grad_norm": 7.78125, + "learning_rate": 5.917765308642754e-06, + "loss": 1.66936913, + "memory(GiB)": 117.38, + "step": 36990, + "train_speed(iter/s)": 1.638403 + }, + { + "acc": 0.65307064, + "epoch": 0.9384830035514967, + "grad_norm": 8.25, + "learning_rate": 5.9167344811097014e-06, + "loss": 1.66945553, + "memory(GiB)": 117.38, + "step": 36995, + "train_speed(iter/s)": 1.638427 + }, + { + "acc": 0.66428313, + "epoch": 0.9386098427194317, + "grad_norm": 6.40625, + "learning_rate": 5.9157036132545735e-06, + "loss": 1.53685236, + "memory(GiB)": 117.38, + "step": 37000, + "train_speed(iter/s)": 1.638451 + }, + { + "epoch": 0.9386098427194317, + "eval_acc": 0.6460141545388289, + "eval_loss": 1.5747417211532593, + "eval_runtime": 58.4241, + "eval_samples_per_second": 109.03, + "eval_steps_per_second": 27.266, + "step": 37000 + }, + { + "acc": 0.67087278, + "epoch": 0.9387366818873668, + "grad_norm": 7.1875, + "learning_rate": 5.914672705122713e-06, + "loss": 1.61551094, + "memory(GiB)": 117.38, + "step": 37005, + "train_speed(iter/s)": 1.633941 + }, + { + "acc": 0.65566669, + "epoch": 0.9388635210553019, + "grad_norm": 6.625, + "learning_rate": 5.9136417567594615e-06, + "loss": 1.61629143, + "memory(GiB)": 117.38, + "step": 37010, + "train_speed(iter/s)": 1.633966 + }, + { + "acc": 0.65871449, + "epoch": 0.9389903602232369, + "grad_norm": 6.0625, + "learning_rate": 5.9126107682101675e-06, + "loss": 1.65129185, + "memory(GiB)": 117.38, + "step": 37015, + "train_speed(iter/s)": 1.63399 + }, + { + "acc": 0.63790622, + "epoch": 0.939117199391172, + "grad_norm": 4.1875, + "learning_rate": 5.911579739520178e-06, + "loss": 1.61802025, + "memory(GiB)": 117.38, + "step": 37020, + "train_speed(iter/s)": 1.634013 + }, + { + "acc": 0.64074287, + "epoch": 0.9392440385591071, + "grad_norm": 6.125, + "learning_rate": 5.91054867073484e-06, + "loss": 1.66344681, + "memory(GiB)": 117.38, + "step": 37025, + "train_speed(iter/s)": 1.634037 + }, + { + "acc": 0.64235535, + "epoch": 0.9393708777270421, + "grad_norm": 6.3125, + "learning_rate": 5.909517561899508e-06, + "loss": 1.6569458, + "memory(GiB)": 117.38, + "step": 37030, + "train_speed(iter/s)": 1.634062 + }, + { + "acc": 0.62768097, + "epoch": 0.9394977168949772, + "grad_norm": 5.78125, + "learning_rate": 5.908486413059532e-06, + "loss": 1.65637264, + "memory(GiB)": 117.38, + "step": 37035, + "train_speed(iter/s)": 1.634088 + }, + { + "acc": 0.65163279, + "epoch": 0.9396245560629122, + "grad_norm": 4.5, + "learning_rate": 5.907455224260268e-06, + "loss": 1.6116024, + "memory(GiB)": 117.38, + "step": 37040, + "train_speed(iter/s)": 1.634111 + }, + { + "acc": 0.64798603, + "epoch": 0.9397513952308473, + "grad_norm": 6.46875, + "learning_rate": 5.9064239955470704e-06, + "loss": 1.64316425, + "memory(GiB)": 117.38, + "step": 37045, + "train_speed(iter/s)": 1.634136 + }, + { + "acc": 0.63081484, + "epoch": 0.9398782343987824, + "grad_norm": 5.90625, + "learning_rate": 5.9053927269653e-06, + "loss": 1.71127014, + "memory(GiB)": 117.38, + "step": 37050, + "train_speed(iter/s)": 1.634158 + }, + { + "acc": 0.66582217, + "epoch": 0.9400050735667174, + "grad_norm": 5.5, + "learning_rate": 5.904361418560314e-06, + "loss": 1.59358034, + "memory(GiB)": 117.38, + "step": 37055, + "train_speed(iter/s)": 1.634182 + }, + { + "acc": 0.6632164, + "epoch": 0.9401319127346525, + "grad_norm": 6.84375, + "learning_rate": 5.903330070377477e-06, + "loss": 1.62826271, + "memory(GiB)": 117.38, + "step": 37060, + "train_speed(iter/s)": 1.634206 + }, + { + "acc": 0.66595354, + "epoch": 0.9402587519025876, + "grad_norm": 6.21875, + "learning_rate": 5.902298682462147e-06, + "loss": 1.5278923, + "memory(GiB)": 117.38, + "step": 37065, + "train_speed(iter/s)": 1.634229 + }, + { + "acc": 0.66164331, + "epoch": 0.9403855910705226, + "grad_norm": 6.6875, + "learning_rate": 5.901267254859695e-06, + "loss": 1.6072525, + "memory(GiB)": 117.38, + "step": 37070, + "train_speed(iter/s)": 1.634253 + }, + { + "acc": 0.65205784, + "epoch": 0.9405124302384577, + "grad_norm": 4.875, + "learning_rate": 5.900235787615485e-06, + "loss": 1.66901894, + "memory(GiB)": 117.38, + "step": 37075, + "train_speed(iter/s)": 1.634276 + }, + { + "acc": 0.65549827, + "epoch": 0.9406392694063926, + "grad_norm": 6.375, + "learning_rate": 5.8992042807748866e-06, + "loss": 1.58327837, + "memory(GiB)": 117.38, + "step": 37080, + "train_speed(iter/s)": 1.634301 + }, + { + "acc": 0.65217376, + "epoch": 0.9407661085743277, + "grad_norm": 5.75, + "learning_rate": 5.898172734383267e-06, + "loss": 1.64416962, + "memory(GiB)": 117.38, + "step": 37085, + "train_speed(iter/s)": 1.634325 + }, + { + "acc": 0.64489574, + "epoch": 0.9408929477422628, + "grad_norm": 6.21875, + "learning_rate": 5.897141148486003e-06, + "loss": 1.67693863, + "memory(GiB)": 117.38, + "step": 37090, + "train_speed(iter/s)": 1.634349 + }, + { + "acc": 0.66178365, + "epoch": 0.9410197869101978, + "grad_norm": 5.96875, + "learning_rate": 5.8961095231284645e-06, + "loss": 1.56035357, + "memory(GiB)": 117.38, + "step": 37095, + "train_speed(iter/s)": 1.634373 + }, + { + "acc": 0.64687643, + "epoch": 0.9411466260781329, + "grad_norm": 5.21875, + "learning_rate": 5.895077858356029e-06, + "loss": 1.62201366, + "memory(GiB)": 117.38, + "step": 37100, + "train_speed(iter/s)": 1.634394 + }, + { + "acc": 0.6479135, + "epoch": 0.941273465246068, + "grad_norm": 5.9375, + "learning_rate": 5.8940461542140725e-06, + "loss": 1.60642376, + "memory(GiB)": 117.38, + "step": 37105, + "train_speed(iter/s)": 1.634416 + }, + { + "acc": 0.66524668, + "epoch": 0.941400304414003, + "grad_norm": 6.3125, + "learning_rate": 5.893014410747975e-06, + "loss": 1.5475399, + "memory(GiB)": 117.38, + "step": 37110, + "train_speed(iter/s)": 1.634441 + }, + { + "acc": 0.66550741, + "epoch": 0.9415271435819381, + "grad_norm": 5.53125, + "learning_rate": 5.891982628003114e-06, + "loss": 1.58927517, + "memory(GiB)": 117.38, + "step": 37115, + "train_speed(iter/s)": 1.634466 + }, + { + "acc": 0.64765778, + "epoch": 0.9416539827498731, + "grad_norm": 6.9375, + "learning_rate": 5.890950806024879e-06, + "loss": 1.59349022, + "memory(GiB)": 117.38, + "step": 37120, + "train_speed(iter/s)": 1.634491 + }, + { + "acc": 0.65533237, + "epoch": 0.9417808219178082, + "grad_norm": 5.84375, + "learning_rate": 5.889918944858647e-06, + "loss": 1.62578106, + "memory(GiB)": 117.38, + "step": 37125, + "train_speed(iter/s)": 1.634516 + }, + { + "acc": 0.66858497, + "epoch": 0.9419076610857433, + "grad_norm": 5.90625, + "learning_rate": 5.888887044549808e-06, + "loss": 1.54972477, + "memory(GiB)": 117.38, + "step": 37130, + "train_speed(iter/s)": 1.63454 + }, + { + "acc": 0.64566202, + "epoch": 0.9420345002536783, + "grad_norm": 5.59375, + "learning_rate": 5.887855105143746e-06, + "loss": 1.61215782, + "memory(GiB)": 117.38, + "step": 37135, + "train_speed(iter/s)": 1.634564 + }, + { + "acc": 0.65946627, + "epoch": 0.9421613394216134, + "grad_norm": 4.96875, + "learning_rate": 5.886823126685855e-06, + "loss": 1.56578712, + "memory(GiB)": 117.38, + "step": 37140, + "train_speed(iter/s)": 1.634589 + }, + { + "acc": 0.65658712, + "epoch": 0.9422881785895485, + "grad_norm": 6.1875, + "learning_rate": 5.8857911092215214e-06, + "loss": 1.6559248, + "memory(GiB)": 117.38, + "step": 37145, + "train_speed(iter/s)": 1.634614 + }, + { + "acc": 0.65072589, + "epoch": 0.9424150177574835, + "grad_norm": 5.21875, + "learning_rate": 5.884759052796142e-06, + "loss": 1.66819572, + "memory(GiB)": 117.38, + "step": 37150, + "train_speed(iter/s)": 1.634639 + }, + { + "acc": 0.65706954, + "epoch": 0.9425418569254186, + "grad_norm": 5.28125, + "learning_rate": 5.883726957455108e-06, + "loss": 1.60351677, + "memory(GiB)": 117.38, + "step": 37155, + "train_speed(iter/s)": 1.634663 + }, + { + "acc": 0.64331613, + "epoch": 0.9426686960933536, + "grad_norm": 5.25, + "learning_rate": 5.8826948232438176e-06, + "loss": 1.64139805, + "memory(GiB)": 117.38, + "step": 37160, + "train_speed(iter/s)": 1.634688 + }, + { + "acc": 0.66055131, + "epoch": 0.9427955352612887, + "grad_norm": 4.90625, + "learning_rate": 5.881662650207667e-06, + "loss": 1.5450551, + "memory(GiB)": 117.38, + "step": 37165, + "train_speed(iter/s)": 1.634712 + }, + { + "acc": 0.67571325, + "epoch": 0.9429223744292238, + "grad_norm": 6.40625, + "learning_rate": 5.880630438392057e-06, + "loss": 1.56790161, + "memory(GiB)": 117.38, + "step": 37170, + "train_speed(iter/s)": 1.634737 + }, + { + "acc": 0.66040192, + "epoch": 0.9430492135971588, + "grad_norm": 5.65625, + "learning_rate": 5.879598187842389e-06, + "loss": 1.55055294, + "memory(GiB)": 117.38, + "step": 37175, + "train_speed(iter/s)": 1.634762 + }, + { + "acc": 0.6687181, + "epoch": 0.9431760527650939, + "grad_norm": 9.8125, + "learning_rate": 5.878565898604066e-06, + "loss": 1.60814495, + "memory(GiB)": 117.38, + "step": 37180, + "train_speed(iter/s)": 1.634786 + }, + { + "acc": 0.66111364, + "epoch": 0.943302891933029, + "grad_norm": 5.34375, + "learning_rate": 5.87753357072249e-06, + "loss": 1.54269657, + "memory(GiB)": 117.38, + "step": 37185, + "train_speed(iter/s)": 1.634809 + }, + { + "acc": 0.66401286, + "epoch": 0.943429731100964, + "grad_norm": 6.6875, + "learning_rate": 5.876501204243072e-06, + "loss": 1.59320698, + "memory(GiB)": 117.38, + "step": 37190, + "train_speed(iter/s)": 1.634834 + }, + { + "acc": 0.66640902, + "epoch": 0.943556570268899, + "grad_norm": 8.25, + "learning_rate": 5.875468799211217e-06, + "loss": 1.55528622, + "memory(GiB)": 117.38, + "step": 37195, + "train_speed(iter/s)": 1.634859 + }, + { + "acc": 0.65090442, + "epoch": 0.943683409436834, + "grad_norm": 4.6875, + "learning_rate": 5.874436355672337e-06, + "loss": 1.55439949, + "memory(GiB)": 117.38, + "step": 37200, + "train_speed(iter/s)": 1.634883 + }, + { + "acc": 0.65309072, + "epoch": 0.9438102486047691, + "grad_norm": 7.78125, + "learning_rate": 5.873403873671839e-06, + "loss": 1.62836189, + "memory(GiB)": 117.38, + "step": 37205, + "train_speed(iter/s)": 1.634907 + }, + { + "acc": 0.65641594, + "epoch": 0.9439370877727042, + "grad_norm": 6.09375, + "learning_rate": 5.872371353255142e-06, + "loss": 1.61465721, + "memory(GiB)": 117.38, + "step": 37210, + "train_speed(iter/s)": 1.634933 + }, + { + "acc": 0.67033558, + "epoch": 0.9440639269406392, + "grad_norm": 6.15625, + "learning_rate": 5.871338794467656e-06, + "loss": 1.58546095, + "memory(GiB)": 117.38, + "step": 37215, + "train_speed(iter/s)": 1.634958 + }, + { + "acc": 0.65380259, + "epoch": 0.9441907661085743, + "grad_norm": 5.4375, + "learning_rate": 5.8703061973548e-06, + "loss": 1.57519588, + "memory(GiB)": 117.38, + "step": 37220, + "train_speed(iter/s)": 1.634982 + }, + { + "acc": 0.6597271, + "epoch": 0.9443176052765094, + "grad_norm": 4.3125, + "learning_rate": 5.869273561961992e-06, + "loss": 1.56505375, + "memory(GiB)": 117.38, + "step": 37225, + "train_speed(iter/s)": 1.635005 + }, + { + "acc": 0.67193351, + "epoch": 0.9444444444444444, + "grad_norm": 5.75, + "learning_rate": 5.8682408883346535e-06, + "loss": 1.59879799, + "memory(GiB)": 117.38, + "step": 37230, + "train_speed(iter/s)": 1.635029 + }, + { + "acc": 0.66186018, + "epoch": 0.9445712836123795, + "grad_norm": 5.90625, + "learning_rate": 5.867208176518202e-06, + "loss": 1.58376503, + "memory(GiB)": 117.38, + "step": 37235, + "train_speed(iter/s)": 1.635053 + }, + { + "acc": 0.6639473, + "epoch": 0.9446981227803145, + "grad_norm": 8.0625, + "learning_rate": 5.866175426558064e-06, + "loss": 1.56097908, + "memory(GiB)": 117.38, + "step": 37240, + "train_speed(iter/s)": 1.635076 + }, + { + "acc": 0.65162611, + "epoch": 0.9448249619482496, + "grad_norm": 6.0, + "learning_rate": 5.865142638499664e-06, + "loss": 1.66612511, + "memory(GiB)": 117.38, + "step": 37245, + "train_speed(iter/s)": 1.6351 + }, + { + "acc": 0.63316226, + "epoch": 0.9449518011161847, + "grad_norm": 5.78125, + "learning_rate": 5.864109812388426e-06, + "loss": 1.74475288, + "memory(GiB)": 117.38, + "step": 37250, + "train_speed(iter/s)": 1.635125 + }, + { + "acc": 0.65003152, + "epoch": 0.9450786402841197, + "grad_norm": 5.03125, + "learning_rate": 5.863076948269782e-06, + "loss": 1.58734035, + "memory(GiB)": 117.38, + "step": 37255, + "train_speed(iter/s)": 1.635148 + }, + { + "acc": 0.66514273, + "epoch": 0.9452054794520548, + "grad_norm": 5.125, + "learning_rate": 5.862044046189162e-06, + "loss": 1.62495632, + "memory(GiB)": 117.38, + "step": 37260, + "train_speed(iter/s)": 1.635173 + }, + { + "acc": 0.66927495, + "epoch": 0.9453323186199899, + "grad_norm": 6.25, + "learning_rate": 5.8610111061919924e-06, + "loss": 1.59755268, + "memory(GiB)": 117.38, + "step": 37265, + "train_speed(iter/s)": 1.635197 + }, + { + "acc": 0.64952631, + "epoch": 0.9454591577879249, + "grad_norm": 6.0625, + "learning_rate": 5.859978128323713e-06, + "loss": 1.63681412, + "memory(GiB)": 117.38, + "step": 37270, + "train_speed(iter/s)": 1.635222 + }, + { + "acc": 0.65714588, + "epoch": 0.94558599695586, + "grad_norm": 5.75, + "learning_rate": 5.858945112629755e-06, + "loss": 1.5896163, + "memory(GiB)": 117.38, + "step": 37275, + "train_speed(iter/s)": 1.635245 + }, + { + "acc": 0.66810346, + "epoch": 0.945712836123795, + "grad_norm": 5.25, + "learning_rate": 5.857912059155557e-06, + "loss": 1.58438749, + "memory(GiB)": 117.38, + "step": 37280, + "train_speed(iter/s)": 1.635269 + }, + { + "acc": 0.66742859, + "epoch": 0.9458396752917301, + "grad_norm": 5.625, + "learning_rate": 5.856878967946555e-06, + "loss": 1.5479702, + "memory(GiB)": 117.38, + "step": 37285, + "train_speed(iter/s)": 1.635294 + }, + { + "acc": 0.64885755, + "epoch": 0.9459665144596652, + "grad_norm": 6.21875, + "learning_rate": 5.855845839048191e-06, + "loss": 1.63455544, + "memory(GiB)": 117.38, + "step": 37290, + "train_speed(iter/s)": 1.635319 + }, + { + "acc": 0.65881367, + "epoch": 0.9460933536276002, + "grad_norm": 5.625, + "learning_rate": 5.854812672505906e-06, + "loss": 1.67388725, + "memory(GiB)": 117.38, + "step": 37295, + "train_speed(iter/s)": 1.635342 + }, + { + "acc": 0.65646925, + "epoch": 0.9462201927955353, + "grad_norm": 6.0625, + "learning_rate": 5.853779468365144e-06, + "loss": 1.55733929, + "memory(GiB)": 117.38, + "step": 37300, + "train_speed(iter/s)": 1.635367 + }, + { + "acc": 0.64865289, + "epoch": 0.9463470319634704, + "grad_norm": 5.59375, + "learning_rate": 5.852746226671348e-06, + "loss": 1.66837769, + "memory(GiB)": 117.38, + "step": 37305, + "train_speed(iter/s)": 1.635392 + }, + { + "acc": 0.64205294, + "epoch": 0.9464738711314054, + "grad_norm": 5.34375, + "learning_rate": 5.851712947469966e-06, + "loss": 1.61525612, + "memory(GiB)": 117.38, + "step": 37310, + "train_speed(iter/s)": 1.635417 + }, + { + "acc": 0.64573021, + "epoch": 0.9466007102993405, + "grad_norm": 5.53125, + "learning_rate": 5.850679630806446e-06, + "loss": 1.64777679, + "memory(GiB)": 117.38, + "step": 37315, + "train_speed(iter/s)": 1.635441 + }, + { + "acc": 0.65656967, + "epoch": 0.9467275494672754, + "grad_norm": 5.0625, + "learning_rate": 5.849646276726237e-06, + "loss": 1.62326584, + "memory(GiB)": 117.38, + "step": 37320, + "train_speed(iter/s)": 1.635464 + }, + { + "acc": 0.65276423, + "epoch": 0.9468543886352105, + "grad_norm": 6.1875, + "learning_rate": 5.848612885274792e-06, + "loss": 1.64401741, + "memory(GiB)": 117.38, + "step": 37325, + "train_speed(iter/s)": 1.635487 + }, + { + "acc": 0.64332066, + "epoch": 0.9469812278031456, + "grad_norm": 5.46875, + "learning_rate": 5.847579456497564e-06, + "loss": 1.67527542, + "memory(GiB)": 117.38, + "step": 37330, + "train_speed(iter/s)": 1.635511 + }, + { + "acc": 0.65072522, + "epoch": 0.9471080669710806, + "grad_norm": 5.53125, + "learning_rate": 5.8465459904400065e-06, + "loss": 1.5624176, + "memory(GiB)": 117.38, + "step": 37335, + "train_speed(iter/s)": 1.635535 + }, + { + "acc": 0.66717434, + "epoch": 0.9472349061390157, + "grad_norm": 6.15625, + "learning_rate": 5.845512487147579e-06, + "loss": 1.611129, + "memory(GiB)": 117.38, + "step": 37340, + "train_speed(iter/s)": 1.635557 + }, + { + "acc": 0.65510526, + "epoch": 0.9473617453069508, + "grad_norm": 6.34375, + "learning_rate": 5.844478946665733e-06, + "loss": 1.60988865, + "memory(GiB)": 117.38, + "step": 37345, + "train_speed(iter/s)": 1.635582 + }, + { + "acc": 0.65682077, + "epoch": 0.9474885844748858, + "grad_norm": 5.03125, + "learning_rate": 5.843445369039937e-06, + "loss": 1.6545742, + "memory(GiB)": 117.38, + "step": 37350, + "train_speed(iter/s)": 1.635605 + }, + { + "acc": 0.67353334, + "epoch": 0.9476154236428209, + "grad_norm": 7.1875, + "learning_rate": 5.842411754315645e-06, + "loss": 1.51080999, + "memory(GiB)": 117.38, + "step": 37355, + "train_speed(iter/s)": 1.635628 + }, + { + "acc": 0.651086, + "epoch": 0.9477422628107559, + "grad_norm": 5.25, + "learning_rate": 5.841378102538324e-06, + "loss": 1.65048065, + "memory(GiB)": 117.38, + "step": 37360, + "train_speed(iter/s)": 1.635651 + }, + { + "acc": 0.6626194, + "epoch": 0.947869101978691, + "grad_norm": 6.3125, + "learning_rate": 5.840344413753438e-06, + "loss": 1.55746279, + "memory(GiB)": 117.38, + "step": 37365, + "train_speed(iter/s)": 1.635674 + }, + { + "acc": 0.64949627, + "epoch": 0.9479959411466261, + "grad_norm": 5.03125, + "learning_rate": 5.8393106880064535e-06, + "loss": 1.60622749, + "memory(GiB)": 117.38, + "step": 37370, + "train_speed(iter/s)": 1.635699 + }, + { + "acc": 0.65106373, + "epoch": 0.9481227803145611, + "grad_norm": 5.8125, + "learning_rate": 5.838276925342836e-06, + "loss": 1.58245268, + "memory(GiB)": 117.38, + "step": 37375, + "train_speed(iter/s)": 1.635721 + }, + { + "acc": 0.66040907, + "epoch": 0.9482496194824962, + "grad_norm": 5.78125, + "learning_rate": 5.837243125808058e-06, + "loss": 1.57163219, + "memory(GiB)": 117.38, + "step": 37380, + "train_speed(iter/s)": 1.635743 + }, + { + "acc": 0.64261847, + "epoch": 0.9483764586504313, + "grad_norm": 6.75, + "learning_rate": 5.8362092894475886e-06, + "loss": 1.69431362, + "memory(GiB)": 117.38, + "step": 37385, + "train_speed(iter/s)": 1.635767 + }, + { + "acc": 0.64906178, + "epoch": 0.9485032978183663, + "grad_norm": 5.21875, + "learning_rate": 5.835175416306901e-06, + "loss": 1.6429142, + "memory(GiB)": 117.38, + "step": 37390, + "train_speed(iter/s)": 1.635791 + }, + { + "acc": 0.65585403, + "epoch": 0.9486301369863014, + "grad_norm": 6.21875, + "learning_rate": 5.83414150643147e-06, + "loss": 1.55377531, + "memory(GiB)": 117.38, + "step": 37395, + "train_speed(iter/s)": 1.635814 + }, + { + "acc": 0.63683319, + "epoch": 0.9487569761542364, + "grad_norm": 6.5, + "learning_rate": 5.833107559866772e-06, + "loss": 1.55389633, + "memory(GiB)": 117.38, + "step": 37400, + "train_speed(iter/s)": 1.635836 + }, + { + "acc": 0.65886316, + "epoch": 0.9488838153221715, + "grad_norm": 6.4375, + "learning_rate": 5.832073576658282e-06, + "loss": 1.60462189, + "memory(GiB)": 117.38, + "step": 37405, + "train_speed(iter/s)": 1.63586 + }, + { + "acc": 0.66132979, + "epoch": 0.9490106544901066, + "grad_norm": 6.25, + "learning_rate": 5.831039556851485e-06, + "loss": 1.57812099, + "memory(GiB)": 117.38, + "step": 37410, + "train_speed(iter/s)": 1.635882 + }, + { + "acc": 0.64156971, + "epoch": 0.9491374936580416, + "grad_norm": 5.71875, + "learning_rate": 5.8300055004918535e-06, + "loss": 1.60821972, + "memory(GiB)": 117.38, + "step": 37415, + "train_speed(iter/s)": 1.635903 + }, + { + "acc": 0.64402447, + "epoch": 0.9492643328259767, + "grad_norm": 6.25, + "learning_rate": 5.828971407624877e-06, + "loss": 1.65598946, + "memory(GiB)": 117.38, + "step": 37420, + "train_speed(iter/s)": 1.635926 + }, + { + "acc": 0.65985446, + "epoch": 0.9493911719939118, + "grad_norm": 5.0625, + "learning_rate": 5.827937278296037e-06, + "loss": 1.59278069, + "memory(GiB)": 117.38, + "step": 37425, + "train_speed(iter/s)": 1.635949 + }, + { + "acc": 0.67123766, + "epoch": 0.9495180111618468, + "grad_norm": 6.34375, + "learning_rate": 5.826903112550819e-06, + "loss": 1.58408718, + "memory(GiB)": 117.38, + "step": 37430, + "train_speed(iter/s)": 1.635973 + }, + { + "acc": 0.65343204, + "epoch": 0.9496448503297819, + "grad_norm": 6.03125, + "learning_rate": 5.825868910434708e-06, + "loss": 1.58439655, + "memory(GiB)": 117.38, + "step": 37435, + "train_speed(iter/s)": 1.635997 + }, + { + "acc": 0.67348738, + "epoch": 0.9497716894977168, + "grad_norm": 5.0625, + "learning_rate": 5.824834671993197e-06, + "loss": 1.54148617, + "memory(GiB)": 117.38, + "step": 37440, + "train_speed(iter/s)": 1.63602 + }, + { + "acc": 0.64992294, + "epoch": 0.9498985286656519, + "grad_norm": 6.1875, + "learning_rate": 5.823800397271774e-06, + "loss": 1.6309782, + "memory(GiB)": 117.38, + "step": 37445, + "train_speed(iter/s)": 1.636043 + }, + { + "acc": 0.65153122, + "epoch": 0.950025367833587, + "grad_norm": 9.0, + "learning_rate": 5.822766086315932e-06, + "loss": 1.62790108, + "memory(GiB)": 117.38, + "step": 37450, + "train_speed(iter/s)": 1.636066 + }, + { + "acc": 0.65514021, + "epoch": 0.950152207001522, + "grad_norm": 5.28125, + "learning_rate": 5.821731739171164e-06, + "loss": 1.62272606, + "memory(GiB)": 117.38, + "step": 37455, + "train_speed(iter/s)": 1.63609 + }, + { + "acc": 0.66500363, + "epoch": 0.9502790461694571, + "grad_norm": 5.71875, + "learning_rate": 5.820697355882965e-06, + "loss": 1.65374069, + "memory(GiB)": 117.38, + "step": 37460, + "train_speed(iter/s)": 1.636114 + }, + { + "acc": 0.65905337, + "epoch": 0.9504058853373922, + "grad_norm": 6.46875, + "learning_rate": 5.819662936496833e-06, + "loss": 1.62111397, + "memory(GiB)": 117.38, + "step": 37465, + "train_speed(iter/s)": 1.636136 + }, + { + "acc": 0.6510251, + "epoch": 0.9505327245053272, + "grad_norm": 4.8125, + "learning_rate": 5.818628481058265e-06, + "loss": 1.53246517, + "memory(GiB)": 117.38, + "step": 37470, + "train_speed(iter/s)": 1.63616 + }, + { + "acc": 0.6442173, + "epoch": 0.9506595636732623, + "grad_norm": 5.65625, + "learning_rate": 5.81759398961276e-06, + "loss": 1.62716179, + "memory(GiB)": 117.38, + "step": 37475, + "train_speed(iter/s)": 1.636183 + }, + { + "acc": 0.65647416, + "epoch": 0.9507864028411973, + "grad_norm": 5.28125, + "learning_rate": 5.816559462205824e-06, + "loss": 1.57571383, + "memory(GiB)": 117.38, + "step": 37480, + "train_speed(iter/s)": 1.636206 + }, + { + "acc": 0.65683165, + "epoch": 0.9509132420091324, + "grad_norm": 6.1875, + "learning_rate": 5.815524898882954e-06, + "loss": 1.58756084, + "memory(GiB)": 117.38, + "step": 37485, + "train_speed(iter/s)": 1.63623 + }, + { + "acc": 0.67337599, + "epoch": 0.9510400811770675, + "grad_norm": 6.40625, + "learning_rate": 5.8144902996896615e-06, + "loss": 1.58814936, + "memory(GiB)": 117.38, + "step": 37490, + "train_speed(iter/s)": 1.636254 + }, + { + "acc": 0.6454134, + "epoch": 0.9511669203450025, + "grad_norm": 5.0625, + "learning_rate": 5.813455664671446e-06, + "loss": 1.65201683, + "memory(GiB)": 117.38, + "step": 37495, + "train_speed(iter/s)": 1.636278 + }, + { + "acc": 0.67092962, + "epoch": 0.9512937595129376, + "grad_norm": 6.03125, + "learning_rate": 5.812420993873819e-06, + "loss": 1.4710536, + "memory(GiB)": 117.38, + "step": 37500, + "train_speed(iter/s)": 1.6363 + }, + { + "acc": 0.6575067, + "epoch": 0.9514205986808727, + "grad_norm": 5.90625, + "learning_rate": 5.81138628734229e-06, + "loss": 1.59180889, + "memory(GiB)": 117.38, + "step": 37505, + "train_speed(iter/s)": 1.636324 + }, + { + "acc": 0.65847254, + "epoch": 0.9515474378488077, + "grad_norm": 5.8125, + "learning_rate": 5.81035154512237e-06, + "loss": 1.56397066, + "memory(GiB)": 117.38, + "step": 37510, + "train_speed(iter/s)": 1.636348 + }, + { + "acc": 0.64296298, + "epoch": 0.9516742770167428, + "grad_norm": 5.90625, + "learning_rate": 5.809316767259571e-06, + "loss": 1.65934486, + "memory(GiB)": 117.38, + "step": 37515, + "train_speed(iter/s)": 1.636371 + }, + { + "acc": 0.64423733, + "epoch": 0.9518011161846778, + "grad_norm": 5.5, + "learning_rate": 5.808281953799408e-06, + "loss": 1.62972374, + "memory(GiB)": 117.38, + "step": 37520, + "train_speed(iter/s)": 1.636395 + }, + { + "acc": 0.63471594, + "epoch": 0.9519279553526129, + "grad_norm": 6.5625, + "learning_rate": 5.807247104787395e-06, + "loss": 1.63079109, + "memory(GiB)": 117.38, + "step": 37525, + "train_speed(iter/s)": 1.636419 + }, + { + "acc": 0.66951866, + "epoch": 0.952054794520548, + "grad_norm": 6.0, + "learning_rate": 5.806212220269049e-06, + "loss": 1.59792833, + "memory(GiB)": 117.38, + "step": 37530, + "train_speed(iter/s)": 1.636444 + }, + { + "acc": 0.6424468, + "epoch": 0.952181633688483, + "grad_norm": 5.59375, + "learning_rate": 5.805177300289891e-06, + "loss": 1.69618607, + "memory(GiB)": 117.38, + "step": 37535, + "train_speed(iter/s)": 1.636466 + }, + { + "acc": 0.64744864, + "epoch": 0.9523084728564181, + "grad_norm": 5.53125, + "learning_rate": 5.804142344895441e-06, + "loss": 1.60979137, + "memory(GiB)": 117.38, + "step": 37540, + "train_speed(iter/s)": 1.63649 + }, + { + "acc": 0.66371355, + "epoch": 0.9524353120243532, + "grad_norm": 6.53125, + "learning_rate": 5.803107354131221e-06, + "loss": 1.56906843, + "memory(GiB)": 117.38, + "step": 37545, + "train_speed(iter/s)": 1.636512 + }, + { + "acc": 0.64825001, + "epoch": 0.9525621511922882, + "grad_norm": 5.96875, + "learning_rate": 5.802072328042753e-06, + "loss": 1.64823627, + "memory(GiB)": 117.38, + "step": 37550, + "train_speed(iter/s)": 1.636534 + }, + { + "acc": 0.64479704, + "epoch": 0.9526889903602233, + "grad_norm": 5.03125, + "learning_rate": 5.8010372666755625e-06, + "loss": 1.63361969, + "memory(GiB)": 117.38, + "step": 37555, + "train_speed(iter/s)": 1.636557 + }, + { + "acc": 0.65289497, + "epoch": 0.9528158295281582, + "grad_norm": 6.4375, + "learning_rate": 5.800002170075179e-06, + "loss": 1.65221481, + "memory(GiB)": 117.38, + "step": 37560, + "train_speed(iter/s)": 1.636579 + }, + { + "acc": 0.68047056, + "epoch": 0.9529426686960933, + "grad_norm": 5.4375, + "learning_rate": 5.798967038287125e-06, + "loss": 1.53734779, + "memory(GiB)": 117.38, + "step": 37565, + "train_speed(iter/s)": 1.636602 + }, + { + "acc": 0.66665168, + "epoch": 0.9530695078640284, + "grad_norm": 7.21875, + "learning_rate": 5.797931871356936e-06, + "loss": 1.54502392, + "memory(GiB)": 117.38, + "step": 37570, + "train_speed(iter/s)": 1.636626 + }, + { + "acc": 0.65586324, + "epoch": 0.9531963470319634, + "grad_norm": 5.5, + "learning_rate": 5.796896669330139e-06, + "loss": 1.60042305, + "memory(GiB)": 117.38, + "step": 37575, + "train_speed(iter/s)": 1.636649 + }, + { + "acc": 0.64218402, + "epoch": 0.9533231861998985, + "grad_norm": 7.0, + "learning_rate": 5.79586143225227e-06, + "loss": 1.69684486, + "memory(GiB)": 117.38, + "step": 37580, + "train_speed(iter/s)": 1.636672 + }, + { + "acc": 0.64952283, + "epoch": 0.9534500253678336, + "grad_norm": 6.21875, + "learning_rate": 5.79482616016886e-06, + "loss": 1.66387482, + "memory(GiB)": 117.38, + "step": 37585, + "train_speed(iter/s)": 1.636696 + }, + { + "acc": 0.64950857, + "epoch": 0.9535768645357686, + "grad_norm": 7.1875, + "learning_rate": 5.793790853125449e-06, + "loss": 1.63628597, + "memory(GiB)": 117.38, + "step": 37590, + "train_speed(iter/s)": 1.636719 + }, + { + "acc": 0.65667815, + "epoch": 0.9537037037037037, + "grad_norm": 6.4375, + "learning_rate": 5.792755511167572e-06, + "loss": 1.64018726, + "memory(GiB)": 117.38, + "step": 37595, + "train_speed(iter/s)": 1.636742 + }, + { + "acc": 0.64910975, + "epoch": 0.9538305428716387, + "grad_norm": 6.0, + "learning_rate": 5.7917201343407685e-06, + "loss": 1.61706772, + "memory(GiB)": 117.38, + "step": 37600, + "train_speed(iter/s)": 1.636766 + }, + { + "acc": 0.6598978, + "epoch": 0.9539573820395738, + "grad_norm": 5.875, + "learning_rate": 5.790684722690577e-06, + "loss": 1.66795273, + "memory(GiB)": 117.38, + "step": 37605, + "train_speed(iter/s)": 1.636789 + }, + { + "acc": 0.65098424, + "epoch": 0.9540842212075089, + "grad_norm": 5.5625, + "learning_rate": 5.789649276262542e-06, + "loss": 1.64607563, + "memory(GiB)": 117.38, + "step": 37610, + "train_speed(iter/s)": 1.636809 + }, + { + "acc": 0.65861449, + "epoch": 0.9542110603754439, + "grad_norm": 6.65625, + "learning_rate": 5.788613795102207e-06, + "loss": 1.59759712, + "memory(GiB)": 117.38, + "step": 37615, + "train_speed(iter/s)": 1.636833 + }, + { + "acc": 0.65541153, + "epoch": 0.954337899543379, + "grad_norm": 5.1875, + "learning_rate": 5.787578279255116e-06, + "loss": 1.5532217, + "memory(GiB)": 117.38, + "step": 37620, + "train_speed(iter/s)": 1.636856 + }, + { + "acc": 0.66289849, + "epoch": 0.9544647387113141, + "grad_norm": 5.15625, + "learning_rate": 5.786542728766815e-06, + "loss": 1.57978144, + "memory(GiB)": 117.38, + "step": 37625, + "train_speed(iter/s)": 1.63688 + }, + { + "acc": 0.65254831, + "epoch": 0.9545915778792491, + "grad_norm": 9.125, + "learning_rate": 5.785507143682856e-06, + "loss": 1.59954548, + "memory(GiB)": 117.38, + "step": 37630, + "train_speed(iter/s)": 1.636903 + }, + { + "acc": 0.63040018, + "epoch": 0.9547184170471842, + "grad_norm": 5.53125, + "learning_rate": 5.784471524048782e-06, + "loss": 1.6585844, + "memory(GiB)": 117.38, + "step": 37635, + "train_speed(iter/s)": 1.636926 + }, + { + "acc": 0.66040239, + "epoch": 0.9548452562151192, + "grad_norm": 6.375, + "learning_rate": 5.783435869910151e-06, + "loss": 1.60758381, + "memory(GiB)": 117.38, + "step": 37640, + "train_speed(iter/s)": 1.636949 + }, + { + "acc": 0.65827379, + "epoch": 0.9549720953830543, + "grad_norm": 4.71875, + "learning_rate": 5.782400181312511e-06, + "loss": 1.63642273, + "memory(GiB)": 117.38, + "step": 37645, + "train_speed(iter/s)": 1.636973 + }, + { + "acc": 0.65990334, + "epoch": 0.9550989345509894, + "grad_norm": 5.28125, + "learning_rate": 5.781364458301419e-06, + "loss": 1.56688986, + "memory(GiB)": 117.38, + "step": 37650, + "train_speed(iter/s)": 1.636995 + }, + { + "acc": 0.66363926, + "epoch": 0.9552257737189244, + "grad_norm": 5.09375, + "learning_rate": 5.780328700922427e-06, + "loss": 1.60596752, + "memory(GiB)": 117.38, + "step": 37655, + "train_speed(iter/s)": 1.637019 + }, + { + "acc": 0.67590065, + "epoch": 0.9553526128868595, + "grad_norm": 5.25, + "learning_rate": 5.779292909221097e-06, + "loss": 1.58627148, + "memory(GiB)": 117.38, + "step": 37660, + "train_speed(iter/s)": 1.637042 + }, + { + "acc": 0.66148157, + "epoch": 0.9554794520547946, + "grad_norm": 5.78125, + "learning_rate": 5.778257083242986e-06, + "loss": 1.57995892, + "memory(GiB)": 117.38, + "step": 37665, + "train_speed(iter/s)": 1.637066 + }, + { + "acc": 0.65000353, + "epoch": 0.9556062912227296, + "grad_norm": 5.0, + "learning_rate": 5.777221223033653e-06, + "loss": 1.62859459, + "memory(GiB)": 117.38, + "step": 37670, + "train_speed(iter/s)": 1.637089 + }, + { + "acc": 0.64907913, + "epoch": 0.9557331303906647, + "grad_norm": 4.6875, + "learning_rate": 5.77618532863866e-06, + "loss": 1.6252327, + "memory(GiB)": 117.38, + "step": 37675, + "train_speed(iter/s)": 1.637114 + }, + { + "acc": 0.67598896, + "epoch": 0.9558599695585996, + "grad_norm": 7.53125, + "learning_rate": 5.775149400103572e-06, + "loss": 1.49997368, + "memory(GiB)": 117.38, + "step": 37680, + "train_speed(iter/s)": 1.637137 + }, + { + "acc": 0.65692668, + "epoch": 0.9559868087265347, + "grad_norm": 5.6875, + "learning_rate": 5.774113437473953e-06, + "loss": 1.5859129, + "memory(GiB)": 117.38, + "step": 37685, + "train_speed(iter/s)": 1.637161 + }, + { + "acc": 0.64873314, + "epoch": 0.9561136478944698, + "grad_norm": 4.78125, + "learning_rate": 5.7730774407953675e-06, + "loss": 1.56788731, + "memory(GiB)": 117.38, + "step": 37690, + "train_speed(iter/s)": 1.637185 + }, + { + "acc": 0.659622, + "epoch": 0.9562404870624048, + "grad_norm": 6.5, + "learning_rate": 5.772041410113384e-06, + "loss": 1.57235374, + "memory(GiB)": 117.38, + "step": 37695, + "train_speed(iter/s)": 1.637209 + }, + { + "acc": 0.64488158, + "epoch": 0.9563673262303399, + "grad_norm": 5.25, + "learning_rate": 5.771005345473575e-06, + "loss": 1.61084862, + "memory(GiB)": 117.38, + "step": 37700, + "train_speed(iter/s)": 1.637235 + }, + { + "acc": 0.65941591, + "epoch": 0.956494165398275, + "grad_norm": 4.96875, + "learning_rate": 5.769969246921505e-06, + "loss": 1.55018835, + "memory(GiB)": 117.38, + "step": 37705, + "train_speed(iter/s)": 1.63726 + }, + { + "acc": 0.64470463, + "epoch": 0.95662100456621, + "grad_norm": 5.1875, + "learning_rate": 5.768933114502753e-06, + "loss": 1.63372116, + "memory(GiB)": 117.38, + "step": 37710, + "train_speed(iter/s)": 1.637286 + }, + { + "acc": 0.66603556, + "epoch": 0.9567478437341451, + "grad_norm": 6.125, + "learning_rate": 5.7678969482628875e-06, + "loss": 1.55573626, + "memory(GiB)": 117.38, + "step": 37715, + "train_speed(iter/s)": 1.637312 + }, + { + "acc": 0.65465174, + "epoch": 0.9568746829020801, + "grad_norm": 8.375, + "learning_rate": 5.766860748247488e-06, + "loss": 1.55616789, + "memory(GiB)": 117.38, + "step": 37720, + "train_speed(iter/s)": 1.637337 + }, + { + "acc": 0.64066839, + "epoch": 0.9570015220700152, + "grad_norm": 5.09375, + "learning_rate": 5.765824514502126e-06, + "loss": 1.64387627, + "memory(GiB)": 117.38, + "step": 37725, + "train_speed(iter/s)": 1.637362 + }, + { + "acc": 0.67350359, + "epoch": 0.9571283612379503, + "grad_norm": 9.25, + "learning_rate": 5.7647882470723846e-06, + "loss": 1.58552437, + "memory(GiB)": 117.38, + "step": 37730, + "train_speed(iter/s)": 1.637388 + }, + { + "acc": 0.67741766, + "epoch": 0.9572552004058853, + "grad_norm": 8.5, + "learning_rate": 5.763751946003842e-06, + "loss": 1.53572369, + "memory(GiB)": 117.38, + "step": 37735, + "train_speed(iter/s)": 1.637414 + }, + { + "acc": 0.64968634, + "epoch": 0.9573820395738204, + "grad_norm": 5.28125, + "learning_rate": 5.7627156113420775e-06, + "loss": 1.55772839, + "memory(GiB)": 117.38, + "step": 37740, + "train_speed(iter/s)": 1.637438 + }, + { + "acc": 0.64974623, + "epoch": 0.9575088787417555, + "grad_norm": 6.34375, + "learning_rate": 5.761679243132677e-06, + "loss": 1.5931015, + "memory(GiB)": 117.38, + "step": 37745, + "train_speed(iter/s)": 1.637462 + }, + { + "acc": 0.64106536, + "epoch": 0.9576357179096905, + "grad_norm": 5.53125, + "learning_rate": 5.760642841421222e-06, + "loss": 1.60904064, + "memory(GiB)": 117.38, + "step": 37750, + "train_speed(iter/s)": 1.637486 + }, + { + "acc": 0.66763115, + "epoch": 0.9577625570776256, + "grad_norm": 6.5, + "learning_rate": 5.759606406253299e-06, + "loss": 1.55258694, + "memory(GiB)": 117.38, + "step": 37755, + "train_speed(iter/s)": 1.63751 + }, + { + "acc": 0.65933352, + "epoch": 0.9578893962455606, + "grad_norm": 5.03125, + "learning_rate": 5.758569937674494e-06, + "loss": 1.54921799, + "memory(GiB)": 117.38, + "step": 37760, + "train_speed(iter/s)": 1.637534 + }, + { + "acc": 0.66198144, + "epoch": 0.9580162354134957, + "grad_norm": 6.09375, + "learning_rate": 5.7575334357303954e-06, + "loss": 1.54654217, + "memory(GiB)": 117.38, + "step": 37765, + "train_speed(iter/s)": 1.637557 + }, + { + "acc": 0.65941048, + "epoch": 0.9581430745814308, + "grad_norm": 5.53125, + "learning_rate": 5.756496900466596e-06, + "loss": 1.58932486, + "memory(GiB)": 117.38, + "step": 37770, + "train_speed(iter/s)": 1.637583 + }, + { + "acc": 0.66474457, + "epoch": 0.9582699137493658, + "grad_norm": 5.28125, + "learning_rate": 5.755460331928684e-06, + "loss": 1.5787775, + "memory(GiB)": 117.38, + "step": 37775, + "train_speed(iter/s)": 1.637609 + }, + { + "acc": 0.64596987, + "epoch": 0.9583967529173009, + "grad_norm": 6.125, + "learning_rate": 5.754423730162257e-06, + "loss": 1.63899269, + "memory(GiB)": 117.38, + "step": 37780, + "train_speed(iter/s)": 1.637633 + }, + { + "acc": 0.65063429, + "epoch": 0.958523592085236, + "grad_norm": 5.40625, + "learning_rate": 5.753387095212901e-06, + "loss": 1.61835041, + "memory(GiB)": 117.38, + "step": 37785, + "train_speed(iter/s)": 1.637657 + }, + { + "acc": 0.65633698, + "epoch": 0.958650431253171, + "grad_norm": 7.28125, + "learning_rate": 5.752350427126221e-06, + "loss": 1.63100376, + "memory(GiB)": 117.38, + "step": 37790, + "train_speed(iter/s)": 1.637682 + }, + { + "acc": 0.65996699, + "epoch": 0.958777270421106, + "grad_norm": 5.25, + "learning_rate": 5.751313725947808e-06, + "loss": 1.54564714, + "memory(GiB)": 117.38, + "step": 37795, + "train_speed(iter/s)": 1.637706 + }, + { + "acc": 0.6700829, + "epoch": 0.958904109589041, + "grad_norm": 7.28125, + "learning_rate": 5.7502769917232635e-06, + "loss": 1.57003651, + "memory(GiB)": 117.38, + "step": 37800, + "train_speed(iter/s)": 1.637732 + }, + { + "acc": 0.64893789, + "epoch": 0.9590309487569761, + "grad_norm": 5.0625, + "learning_rate": 5.7492402244981885e-06, + "loss": 1.65374413, + "memory(GiB)": 117.38, + "step": 37805, + "train_speed(iter/s)": 1.637755 + }, + { + "acc": 0.65842857, + "epoch": 0.9591577879249112, + "grad_norm": 6.0, + "learning_rate": 5.748203424318182e-06, + "loss": 1.60634327, + "memory(GiB)": 117.38, + "step": 37810, + "train_speed(iter/s)": 1.63778 + }, + { + "acc": 0.67613182, + "epoch": 0.9592846270928462, + "grad_norm": 5.6875, + "learning_rate": 5.747166591228849e-06, + "loss": 1.54881172, + "memory(GiB)": 117.38, + "step": 37815, + "train_speed(iter/s)": 1.637806 + }, + { + "acc": 0.66187201, + "epoch": 0.9594114662607813, + "grad_norm": 5.71875, + "learning_rate": 5.746129725275793e-06, + "loss": 1.6008316, + "memory(GiB)": 117.38, + "step": 37820, + "train_speed(iter/s)": 1.637832 + }, + { + "acc": 0.65212364, + "epoch": 0.9595383054287164, + "grad_norm": 5.5, + "learning_rate": 5.74509282650462e-06, + "loss": 1.63671932, + "memory(GiB)": 117.38, + "step": 37825, + "train_speed(iter/s)": 1.637857 + }, + { + "acc": 0.64497232, + "epoch": 0.9596651445966514, + "grad_norm": 7.53125, + "learning_rate": 5.744055894960938e-06, + "loss": 1.6811491, + "memory(GiB)": 117.38, + "step": 37830, + "train_speed(iter/s)": 1.637881 + }, + { + "acc": 0.64194746, + "epoch": 0.9597919837645865, + "grad_norm": 6.03125, + "learning_rate": 5.743018930690357e-06, + "loss": 1.70499802, + "memory(GiB)": 117.38, + "step": 37835, + "train_speed(iter/s)": 1.637908 + }, + { + "acc": 0.65335422, + "epoch": 0.9599188229325215, + "grad_norm": 9.0, + "learning_rate": 5.7419819337384855e-06, + "loss": 1.60643768, + "memory(GiB)": 117.38, + "step": 37840, + "train_speed(iter/s)": 1.637934 + }, + { + "acc": 0.64465008, + "epoch": 0.9600456621004566, + "grad_norm": 5.9375, + "learning_rate": 5.740944904150934e-06, + "loss": 1.61140461, + "memory(GiB)": 117.38, + "step": 37845, + "train_speed(iter/s)": 1.637961 + }, + { + "acc": 0.65831637, + "epoch": 0.9601725012683917, + "grad_norm": 5.59375, + "learning_rate": 5.739907841973321e-06, + "loss": 1.56716118, + "memory(GiB)": 117.38, + "step": 37850, + "train_speed(iter/s)": 1.637985 + }, + { + "acc": 0.6521193, + "epoch": 0.9602993404363267, + "grad_norm": 6.125, + "learning_rate": 5.738870747251255e-06, + "loss": 1.63041363, + "memory(GiB)": 117.38, + "step": 37855, + "train_speed(iter/s)": 1.63801 + }, + { + "acc": 0.64439826, + "epoch": 0.9604261796042618, + "grad_norm": 5.71875, + "learning_rate": 5.737833620030357e-06, + "loss": 1.61178532, + "memory(GiB)": 117.38, + "step": 37860, + "train_speed(iter/s)": 1.638036 + }, + { + "acc": 0.68125067, + "epoch": 0.9605530187721969, + "grad_norm": 5.90625, + "learning_rate": 5.7367964603562385e-06, + "loss": 1.50592403, + "memory(GiB)": 117.38, + "step": 37865, + "train_speed(iter/s)": 1.638062 + }, + { + "acc": 0.67131348, + "epoch": 0.9606798579401319, + "grad_norm": 5.3125, + "learning_rate": 5.7357592682745245e-06, + "loss": 1.59699163, + "memory(GiB)": 117.38, + "step": 37870, + "train_speed(iter/s)": 1.638086 + }, + { + "acc": 0.67061272, + "epoch": 0.960806697108067, + "grad_norm": 6.3125, + "learning_rate": 5.734722043830833e-06, + "loss": 1.50260658, + "memory(GiB)": 117.38, + "step": 37875, + "train_speed(iter/s)": 1.63811 + }, + { + "acc": 0.65280647, + "epoch": 0.960933536276002, + "grad_norm": 6.28125, + "learning_rate": 5.7336847870707855e-06, + "loss": 1.64194069, + "memory(GiB)": 117.38, + "step": 37880, + "train_speed(iter/s)": 1.638137 + }, + { + "acc": 0.65304947, + "epoch": 0.9610603754439371, + "grad_norm": 5.5625, + "learning_rate": 5.732647498040006e-06, + "loss": 1.61398125, + "memory(GiB)": 117.38, + "step": 37885, + "train_speed(iter/s)": 1.638162 + }, + { + "acc": 0.64910769, + "epoch": 0.9611872146118722, + "grad_norm": 9.1875, + "learning_rate": 5.731610176784118e-06, + "loss": 1.6200182, + "memory(GiB)": 117.38, + "step": 37890, + "train_speed(iter/s)": 1.638188 + }, + { + "acc": 0.65103445, + "epoch": 0.9613140537798072, + "grad_norm": 5.40625, + "learning_rate": 5.730572823348748e-06, + "loss": 1.57979717, + "memory(GiB)": 117.38, + "step": 37895, + "train_speed(iter/s)": 1.638213 + }, + { + "acc": 0.65187325, + "epoch": 0.9614408929477423, + "grad_norm": 5.875, + "learning_rate": 5.729535437779523e-06, + "loss": 1.60123997, + "memory(GiB)": 117.38, + "step": 37900, + "train_speed(iter/s)": 1.638236 + }, + { + "acc": 0.63695345, + "epoch": 0.9615677321156774, + "grad_norm": 5.34375, + "learning_rate": 5.728498020122073e-06, + "loss": 1.65709114, + "memory(GiB)": 117.38, + "step": 37905, + "train_speed(iter/s)": 1.638261 + }, + { + "acc": 0.65966816, + "epoch": 0.9616945712836124, + "grad_norm": 5.9375, + "learning_rate": 5.727460570422028e-06, + "loss": 1.56923771, + "memory(GiB)": 117.38, + "step": 37910, + "train_speed(iter/s)": 1.638285 + }, + { + "acc": 0.67273316, + "epoch": 0.9618214104515475, + "grad_norm": 5.90625, + "learning_rate": 5.726423088725017e-06, + "loss": 1.58475723, + "memory(GiB)": 117.38, + "step": 37915, + "train_speed(iter/s)": 1.638308 + }, + { + "acc": 0.64787149, + "epoch": 0.9619482496194824, + "grad_norm": 4.75, + "learning_rate": 5.725385575076677e-06, + "loss": 1.59393101, + "memory(GiB)": 117.38, + "step": 37920, + "train_speed(iter/s)": 1.638334 + }, + { + "acc": 0.63942108, + "epoch": 0.9620750887874175, + "grad_norm": 6.09375, + "learning_rate": 5.7243480295226405e-06, + "loss": 1.64283791, + "memory(GiB)": 117.38, + "step": 37925, + "train_speed(iter/s)": 1.638359 + }, + { + "acc": 0.65549374, + "epoch": 0.9622019279553526, + "grad_norm": 5.46875, + "learning_rate": 5.723310452108545e-06, + "loss": 1.6001955, + "memory(GiB)": 117.38, + "step": 37930, + "train_speed(iter/s)": 1.638385 + }, + { + "acc": 0.63950868, + "epoch": 0.9623287671232876, + "grad_norm": 5.0625, + "learning_rate": 5.722272842880023e-06, + "loss": 1.70025711, + "memory(GiB)": 117.38, + "step": 37935, + "train_speed(iter/s)": 1.638408 + }, + { + "acc": 0.65101233, + "epoch": 0.9624556062912227, + "grad_norm": 5.375, + "learning_rate": 5.7212352018827215e-06, + "loss": 1.60794716, + "memory(GiB)": 117.38, + "step": 37940, + "train_speed(iter/s)": 1.63843 + }, + { + "acc": 0.66388993, + "epoch": 0.9625824454591578, + "grad_norm": 5.25, + "learning_rate": 5.720197529162272e-06, + "loss": 1.58975258, + "memory(GiB)": 117.38, + "step": 37945, + "train_speed(iter/s)": 1.638456 + }, + { + "acc": 0.66025572, + "epoch": 0.9627092846270928, + "grad_norm": 4.65625, + "learning_rate": 5.719159824764321e-06, + "loss": 1.53744736, + "memory(GiB)": 117.38, + "step": 37950, + "train_speed(iter/s)": 1.638479 + }, + { + "acc": 0.66164737, + "epoch": 0.9628361237950279, + "grad_norm": 5.65625, + "learning_rate": 5.71812208873451e-06, + "loss": 1.64038773, + "memory(GiB)": 117.38, + "step": 37955, + "train_speed(iter/s)": 1.638503 + }, + { + "acc": 0.65875645, + "epoch": 0.9629629629629629, + "grad_norm": 5.625, + "learning_rate": 5.717084321118482e-06, + "loss": 1.55650082, + "memory(GiB)": 117.38, + "step": 37960, + "train_speed(iter/s)": 1.638527 + }, + { + "acc": 0.6570097, + "epoch": 0.963089802130898, + "grad_norm": 6.46875, + "learning_rate": 5.716046521961887e-06, + "loss": 1.66541557, + "memory(GiB)": 117.38, + "step": 37965, + "train_speed(iter/s)": 1.638551 + }, + { + "acc": 0.66001363, + "epoch": 0.9632166412988331, + "grad_norm": 6.6875, + "learning_rate": 5.715008691310366e-06, + "loss": 1.61850758, + "memory(GiB)": 117.38, + "step": 37970, + "train_speed(iter/s)": 1.638576 + }, + { + "acc": 0.64226384, + "epoch": 0.9633434804667681, + "grad_norm": 7.15625, + "learning_rate": 5.713970829209573e-06, + "loss": 1.69995575, + "memory(GiB)": 117.38, + "step": 37975, + "train_speed(iter/s)": 1.6386 + }, + { + "acc": 0.65770597, + "epoch": 0.9634703196347032, + "grad_norm": 5.21875, + "learning_rate": 5.712932935705153e-06, + "loss": 1.64444675, + "memory(GiB)": 117.38, + "step": 37980, + "train_speed(iter/s)": 1.638624 + }, + { + "acc": 0.63616838, + "epoch": 0.9635971588026383, + "grad_norm": 5.5, + "learning_rate": 5.711895010842762e-06, + "loss": 1.69516068, + "memory(GiB)": 117.38, + "step": 37985, + "train_speed(iter/s)": 1.638648 + }, + { + "acc": 0.66105471, + "epoch": 0.9637239979705733, + "grad_norm": 6.1875, + "learning_rate": 5.710857054668048e-06, + "loss": 1.60404091, + "memory(GiB)": 117.38, + "step": 37990, + "train_speed(iter/s)": 1.638673 + }, + { + "acc": 0.66310992, + "epoch": 0.9638508371385084, + "grad_norm": 5.875, + "learning_rate": 5.7098190672266675e-06, + "loss": 1.60103741, + "memory(GiB)": 117.38, + "step": 37995, + "train_speed(iter/s)": 1.638697 + }, + { + "acc": 0.65080233, + "epoch": 0.9639776763064434, + "grad_norm": 5.8125, + "learning_rate": 5.708781048564276e-06, + "loss": 1.59080944, + "memory(GiB)": 117.38, + "step": 38000, + "train_speed(iter/s)": 1.638721 + }, + { + "epoch": 0.9639776763064434, + "eval_acc": 0.6460947631547623, + "eval_loss": 1.5742748975753784, + "eval_runtime": 58.1661, + "eval_samples_per_second": 109.514, + "eval_steps_per_second": 27.387, + "step": 38000 + }, + { + "acc": 0.66002145, + "epoch": 0.9641045154743785, + "grad_norm": 8.875, + "learning_rate": 5.707742998726527e-06, + "loss": 1.56301279, + "memory(GiB)": 117.38, + "step": 38005, + "train_speed(iter/s)": 1.634345 + }, + { + "acc": 0.6494401, + "epoch": 0.9642313546423136, + "grad_norm": 6.4375, + "learning_rate": 5.706704917759085e-06, + "loss": 1.55074511, + "memory(GiB)": 117.38, + "step": 38010, + "train_speed(iter/s)": 1.634372 + }, + { + "acc": 0.67064743, + "epoch": 0.9643581938102486, + "grad_norm": 6.21875, + "learning_rate": 5.705666805707603e-06, + "loss": 1.55384264, + "memory(GiB)": 117.38, + "step": 38015, + "train_speed(iter/s)": 1.634398 + }, + { + "acc": 0.6467577, + "epoch": 0.9644850329781837, + "grad_norm": 4.8125, + "learning_rate": 5.704628662617744e-06, + "loss": 1.57891865, + "memory(GiB)": 117.38, + "step": 38020, + "train_speed(iter/s)": 1.634424 + }, + { + "acc": 0.66551604, + "epoch": 0.9646118721461188, + "grad_norm": 5.84375, + "learning_rate": 5.703590488535171e-06, + "loss": 1.59015503, + "memory(GiB)": 117.38, + "step": 38025, + "train_speed(iter/s)": 1.634449 + }, + { + "acc": 0.66058064, + "epoch": 0.9647387113140538, + "grad_norm": 6.03125, + "learning_rate": 5.702552283505548e-06, + "loss": 1.60008774, + "memory(GiB)": 117.38, + "step": 38030, + "train_speed(iter/s)": 1.634475 + }, + { + "acc": 0.66553555, + "epoch": 0.9648655504819889, + "grad_norm": 5.46875, + "learning_rate": 5.7015140475745376e-06, + "loss": 1.55555792, + "memory(GiB)": 117.38, + "step": 38035, + "train_speed(iter/s)": 1.634501 + }, + { + "acc": 0.66134729, + "epoch": 0.9649923896499238, + "grad_norm": 5.84375, + "learning_rate": 5.700475780787809e-06, + "loss": 1.55473156, + "memory(GiB)": 117.38, + "step": 38040, + "train_speed(iter/s)": 1.634528 + }, + { + "acc": 0.65012608, + "epoch": 0.9651192288178589, + "grad_norm": 5.6875, + "learning_rate": 5.699437483191027e-06, + "loss": 1.55532284, + "memory(GiB)": 117.38, + "step": 38045, + "train_speed(iter/s)": 1.634553 + }, + { + "acc": 0.6550499, + "epoch": 0.965246067985794, + "grad_norm": 5.34375, + "learning_rate": 5.6983991548298615e-06, + "loss": 1.60887337, + "memory(GiB)": 117.38, + "step": 38050, + "train_speed(iter/s)": 1.634578 + }, + { + "acc": 0.66220093, + "epoch": 0.965372907153729, + "grad_norm": 7.0, + "learning_rate": 5.697360795749983e-06, + "loss": 1.5825057, + "memory(GiB)": 117.38, + "step": 38055, + "train_speed(iter/s)": 1.634603 + }, + { + "acc": 0.65022602, + "epoch": 0.9654997463216641, + "grad_norm": 5.25, + "learning_rate": 5.696322405997064e-06, + "loss": 1.64629097, + "memory(GiB)": 117.38, + "step": 38060, + "train_speed(iter/s)": 1.634629 + }, + { + "acc": 0.6586452, + "epoch": 0.9656265854895992, + "grad_norm": 8.0625, + "learning_rate": 5.695283985616775e-06, + "loss": 1.64529858, + "memory(GiB)": 117.38, + "step": 38065, + "train_speed(iter/s)": 1.634655 + }, + { + "acc": 0.64954076, + "epoch": 0.9657534246575342, + "grad_norm": 4.9375, + "learning_rate": 5.694245534654795e-06, + "loss": 1.64456291, + "memory(GiB)": 117.38, + "step": 38070, + "train_speed(iter/s)": 1.634681 + }, + { + "acc": 0.65863709, + "epoch": 0.9658802638254693, + "grad_norm": 6.46875, + "learning_rate": 5.693207053156794e-06, + "loss": 1.46597719, + "memory(GiB)": 117.38, + "step": 38075, + "train_speed(iter/s)": 1.634707 + }, + { + "acc": 0.65898852, + "epoch": 0.9660071029934043, + "grad_norm": 4.5625, + "learning_rate": 5.692168541168455e-06, + "loss": 1.59460192, + "memory(GiB)": 117.38, + "step": 38080, + "train_speed(iter/s)": 1.63473 + }, + { + "acc": 0.66738076, + "epoch": 0.9661339421613394, + "grad_norm": 5.125, + "learning_rate": 5.691129998735449e-06, + "loss": 1.50150843, + "memory(GiB)": 117.38, + "step": 38085, + "train_speed(iter/s)": 1.634756 + }, + { + "acc": 0.65731931, + "epoch": 0.9662607813292745, + "grad_norm": 5.9375, + "learning_rate": 5.690091425903464e-06, + "loss": 1.66143837, + "memory(GiB)": 117.38, + "step": 38090, + "train_speed(iter/s)": 1.634779 + }, + { + "acc": 0.65085773, + "epoch": 0.9663876204972095, + "grad_norm": 6.375, + "learning_rate": 5.689052822718175e-06, + "loss": 1.63089962, + "memory(GiB)": 117.38, + "step": 38095, + "train_speed(iter/s)": 1.634806 + }, + { + "acc": 0.66536517, + "epoch": 0.9665144596651446, + "grad_norm": 6.40625, + "learning_rate": 5.688014189225266e-06, + "loss": 1.52673798, + "memory(GiB)": 117.38, + "step": 38100, + "train_speed(iter/s)": 1.634832 + }, + { + "acc": 0.65155015, + "epoch": 0.9666412988330797, + "grad_norm": 6.0, + "learning_rate": 5.686975525470423e-06, + "loss": 1.63932495, + "memory(GiB)": 117.38, + "step": 38105, + "train_speed(iter/s)": 1.634857 + }, + { + "acc": 0.66317453, + "epoch": 0.9667681380010147, + "grad_norm": 6.5625, + "learning_rate": 5.685936831499328e-06, + "loss": 1.63441505, + "memory(GiB)": 117.38, + "step": 38110, + "train_speed(iter/s)": 1.634881 + }, + { + "acc": 0.65226994, + "epoch": 0.9668949771689498, + "grad_norm": 5.71875, + "learning_rate": 5.684898107357669e-06, + "loss": 1.62173424, + "memory(GiB)": 117.38, + "step": 38115, + "train_speed(iter/s)": 1.634904 + }, + { + "acc": 0.64400449, + "epoch": 0.9670218163368848, + "grad_norm": 6.5, + "learning_rate": 5.683859353091133e-06, + "loss": 1.68070927, + "memory(GiB)": 117.38, + "step": 38120, + "train_speed(iter/s)": 1.634931 + }, + { + "acc": 0.64637203, + "epoch": 0.9671486555048199, + "grad_norm": 5.53125, + "learning_rate": 5.6828205687454094e-06, + "loss": 1.62900791, + "memory(GiB)": 117.38, + "step": 38125, + "train_speed(iter/s)": 1.634957 + }, + { + "acc": 0.64605885, + "epoch": 0.967275494672755, + "grad_norm": 5.3125, + "learning_rate": 5.68178175436619e-06, + "loss": 1.67542953, + "memory(GiB)": 117.38, + "step": 38130, + "train_speed(iter/s)": 1.634983 + }, + { + "acc": 0.65680084, + "epoch": 0.96740233384069, + "grad_norm": 5.5, + "learning_rate": 5.680742909999163e-06, + "loss": 1.62871933, + "memory(GiB)": 117.38, + "step": 38135, + "train_speed(iter/s)": 1.635009 + }, + { + "acc": 0.63650732, + "epoch": 0.9675291730086251, + "grad_norm": 6.6875, + "learning_rate": 5.679704035690026e-06, + "loss": 1.73096752, + "memory(GiB)": 117.38, + "step": 38140, + "train_speed(iter/s)": 1.635035 + }, + { + "acc": 0.65398383, + "epoch": 0.9676560121765602, + "grad_norm": 5.09375, + "learning_rate": 5.6786651314844675e-06, + "loss": 1.53638954, + "memory(GiB)": 117.38, + "step": 38145, + "train_speed(iter/s)": 1.635061 + }, + { + "acc": 0.64991064, + "epoch": 0.9677828513444952, + "grad_norm": 5.90625, + "learning_rate": 5.67762619742819e-06, + "loss": 1.62337036, + "memory(GiB)": 117.38, + "step": 38150, + "train_speed(iter/s)": 1.635087 + }, + { + "acc": 0.64448156, + "epoch": 0.9679096905124303, + "grad_norm": 6.15625, + "learning_rate": 5.676587233566885e-06, + "loss": 1.60961685, + "memory(GiB)": 117.38, + "step": 38155, + "train_speed(iter/s)": 1.635112 + }, + { + "acc": 0.65426359, + "epoch": 0.9680365296803652, + "grad_norm": 5.15625, + "learning_rate": 5.675548239946254e-06, + "loss": 1.59950018, + "memory(GiB)": 117.38, + "step": 38160, + "train_speed(iter/s)": 1.63514 + }, + { + "acc": 0.6396699, + "epoch": 0.9681633688483003, + "grad_norm": 6.25, + "learning_rate": 5.674509216611993e-06, + "loss": 1.68408718, + "memory(GiB)": 117.38, + "step": 38165, + "train_speed(iter/s)": 1.635165 + }, + { + "acc": 0.65698185, + "epoch": 0.9682902080162354, + "grad_norm": 5.625, + "learning_rate": 5.673470163609806e-06, + "loss": 1.57145653, + "memory(GiB)": 117.38, + "step": 38170, + "train_speed(iter/s)": 1.635191 + }, + { + "acc": 0.67125373, + "epoch": 0.9684170471841704, + "grad_norm": 5.125, + "learning_rate": 5.672431080985395e-06, + "loss": 1.51870546, + "memory(GiB)": 117.38, + "step": 38175, + "train_speed(iter/s)": 1.635216 + }, + { + "acc": 0.65947361, + "epoch": 0.9685438863521055, + "grad_norm": 6.5, + "learning_rate": 5.671391968784464e-06, + "loss": 1.58719416, + "memory(GiB)": 117.38, + "step": 38180, + "train_speed(iter/s)": 1.635242 + }, + { + "acc": 0.66260357, + "epoch": 0.9686707255200406, + "grad_norm": 6.09375, + "learning_rate": 5.670352827052715e-06, + "loss": 1.59185772, + "memory(GiB)": 117.38, + "step": 38185, + "train_speed(iter/s)": 1.635268 + }, + { + "acc": 0.66881456, + "epoch": 0.9687975646879756, + "grad_norm": 5.625, + "learning_rate": 5.6693136558358565e-06, + "loss": 1.62469482, + "memory(GiB)": 117.38, + "step": 38190, + "train_speed(iter/s)": 1.635294 + }, + { + "acc": 0.64528675, + "epoch": 0.9689244038559107, + "grad_norm": 5.53125, + "learning_rate": 5.668274455179595e-06, + "loss": 1.64711704, + "memory(GiB)": 117.38, + "step": 38195, + "train_speed(iter/s)": 1.635317 + }, + { + "acc": 0.63186703, + "epoch": 0.9690512430238457, + "grad_norm": 6.28125, + "learning_rate": 5.667235225129639e-06, + "loss": 1.67505989, + "memory(GiB)": 117.38, + "step": 38200, + "train_speed(iter/s)": 1.635342 + }, + { + "acc": 0.63843536, + "epoch": 0.9691780821917808, + "grad_norm": 6.96875, + "learning_rate": 5.6661959657317e-06, + "loss": 1.61682205, + "memory(GiB)": 117.38, + "step": 38205, + "train_speed(iter/s)": 1.635366 + }, + { + "acc": 0.67373304, + "epoch": 0.9693049213597159, + "grad_norm": 7.15625, + "learning_rate": 5.665156677031487e-06, + "loss": 1.55660601, + "memory(GiB)": 117.38, + "step": 38210, + "train_speed(iter/s)": 1.635393 + }, + { + "acc": 0.65169334, + "epoch": 0.9694317605276509, + "grad_norm": 6.84375, + "learning_rate": 5.664117359074712e-06, + "loss": 1.63847847, + "memory(GiB)": 117.38, + "step": 38215, + "train_speed(iter/s)": 1.63542 + }, + { + "acc": 0.64914036, + "epoch": 0.969558599695586, + "grad_norm": 6.1875, + "learning_rate": 5.6630780119070935e-06, + "loss": 1.67669601, + "memory(GiB)": 117.38, + "step": 38220, + "train_speed(iter/s)": 1.635446 + }, + { + "acc": 0.6539856, + "epoch": 0.9696854388635211, + "grad_norm": 5.59375, + "learning_rate": 5.6620386355743415e-06, + "loss": 1.63298473, + "memory(GiB)": 117.38, + "step": 38225, + "train_speed(iter/s)": 1.635471 + }, + { + "acc": 0.65535269, + "epoch": 0.9698122780314561, + "grad_norm": 5.9375, + "learning_rate": 5.660999230122177e-06, + "loss": 1.58836575, + "memory(GiB)": 117.38, + "step": 38230, + "train_speed(iter/s)": 1.635496 + }, + { + "acc": 0.65222006, + "epoch": 0.9699391171993912, + "grad_norm": 5.78125, + "learning_rate": 5.659959795596313e-06, + "loss": 1.59134541, + "memory(GiB)": 117.38, + "step": 38235, + "train_speed(iter/s)": 1.635517 + }, + { + "acc": 0.64766917, + "epoch": 0.9700659563673262, + "grad_norm": 6.40625, + "learning_rate": 5.65892033204247e-06, + "loss": 1.64647732, + "memory(GiB)": 117.38, + "step": 38240, + "train_speed(iter/s)": 1.635542 + }, + { + "acc": 0.67077255, + "epoch": 0.9701927955352613, + "grad_norm": 5.125, + "learning_rate": 5.657880839506371e-06, + "loss": 1.54590244, + "memory(GiB)": 117.38, + "step": 38245, + "train_speed(iter/s)": 1.635567 + }, + { + "acc": 0.65406132, + "epoch": 0.9703196347031964, + "grad_norm": 7.625, + "learning_rate": 5.656841318033735e-06, + "loss": 1.58277416, + "memory(GiB)": 117.38, + "step": 38250, + "train_speed(iter/s)": 1.635593 + }, + { + "acc": 0.65494175, + "epoch": 0.9704464738711314, + "grad_norm": 7.8125, + "learning_rate": 5.6558017676702846e-06, + "loss": 1.58465919, + "memory(GiB)": 117.38, + "step": 38255, + "train_speed(iter/s)": 1.635617 + }, + { + "acc": 0.65172529, + "epoch": 0.9705733130390665, + "grad_norm": 5.59375, + "learning_rate": 5.654762188461744e-06, + "loss": 1.64318771, + "memory(GiB)": 117.38, + "step": 38260, + "train_speed(iter/s)": 1.635643 + }, + { + "acc": 0.6476625, + "epoch": 0.9707001522070016, + "grad_norm": 5.90625, + "learning_rate": 5.653722580453841e-06, + "loss": 1.65056038, + "memory(GiB)": 117.38, + "step": 38265, + "train_speed(iter/s)": 1.635669 + }, + { + "acc": 0.64132814, + "epoch": 0.9708269913749366, + "grad_norm": 5.59375, + "learning_rate": 5.652682943692299e-06, + "loss": 1.6684557, + "memory(GiB)": 117.38, + "step": 38270, + "train_speed(iter/s)": 1.635695 + }, + { + "acc": 0.66135368, + "epoch": 0.9709538305428717, + "grad_norm": 7.3125, + "learning_rate": 5.651643278222847e-06, + "loss": 1.60470467, + "memory(GiB)": 117.38, + "step": 38275, + "train_speed(iter/s)": 1.635723 + }, + { + "acc": 0.67604713, + "epoch": 0.9710806697108066, + "grad_norm": 6.21875, + "learning_rate": 5.6506035840912145e-06, + "loss": 1.55927334, + "memory(GiB)": 117.38, + "step": 38280, + "train_speed(iter/s)": 1.635748 + }, + { + "acc": 0.6676239, + "epoch": 0.9712075088787417, + "grad_norm": 6.0625, + "learning_rate": 5.649563861343131e-06, + "loss": 1.59370117, + "memory(GiB)": 117.38, + "step": 38285, + "train_speed(iter/s)": 1.635773 + }, + { + "acc": 0.63996372, + "epoch": 0.9713343480466768, + "grad_norm": 6.96875, + "learning_rate": 5.648524110024331e-06, + "loss": 1.62207546, + "memory(GiB)": 117.38, + "step": 38290, + "train_speed(iter/s)": 1.635798 + }, + { + "acc": 0.66518064, + "epoch": 0.9714611872146118, + "grad_norm": 5.71875, + "learning_rate": 5.647484330180542e-06, + "loss": 1.60813046, + "memory(GiB)": 117.38, + "step": 38295, + "train_speed(iter/s)": 1.635822 + }, + { + "acc": 0.6587152, + "epoch": 0.9715880263825469, + "grad_norm": 7.53125, + "learning_rate": 5.646444521857504e-06, + "loss": 1.64002476, + "memory(GiB)": 117.38, + "step": 38300, + "train_speed(iter/s)": 1.635847 + }, + { + "acc": 0.67270718, + "epoch": 0.971714865550482, + "grad_norm": 5.8125, + "learning_rate": 5.645404685100948e-06, + "loss": 1.53261986, + "memory(GiB)": 117.38, + "step": 38305, + "train_speed(iter/s)": 1.635873 + }, + { + "acc": 0.65908842, + "epoch": 0.971841704718417, + "grad_norm": 5.65625, + "learning_rate": 5.644364819956613e-06, + "loss": 1.62424812, + "memory(GiB)": 117.38, + "step": 38310, + "train_speed(iter/s)": 1.635898 + }, + { + "acc": 0.65067244, + "epoch": 0.9719685438863521, + "grad_norm": 6.125, + "learning_rate": 5.643324926470236e-06, + "loss": 1.6156662, + "memory(GiB)": 117.38, + "step": 38315, + "train_speed(iter/s)": 1.635923 + }, + { + "acc": 0.6559042, + "epoch": 0.9720953830542871, + "grad_norm": 7.21875, + "learning_rate": 5.642285004687557e-06, + "loss": 1.60668678, + "memory(GiB)": 117.38, + "step": 38320, + "train_speed(iter/s)": 1.635949 + }, + { + "acc": 0.66721106, + "epoch": 0.9722222222222222, + "grad_norm": 6.0, + "learning_rate": 5.6412450546543165e-06, + "loss": 1.58803902, + "memory(GiB)": 117.38, + "step": 38325, + "train_speed(iter/s)": 1.635973 + }, + { + "acc": 0.64878178, + "epoch": 0.9723490613901573, + "grad_norm": 5.90625, + "learning_rate": 5.640205076416254e-06, + "loss": 1.71433201, + "memory(GiB)": 117.38, + "step": 38330, + "train_speed(iter/s)": 1.635998 + }, + { + "acc": 0.65180159, + "epoch": 0.9724759005580923, + "grad_norm": 7.3125, + "learning_rate": 5.639165070019116e-06, + "loss": 1.62308025, + "memory(GiB)": 117.38, + "step": 38335, + "train_speed(iter/s)": 1.636021 + }, + { + "acc": 0.65096407, + "epoch": 0.9726027397260274, + "grad_norm": 6.53125, + "learning_rate": 5.638125035508642e-06, + "loss": 1.60337181, + "memory(GiB)": 117.38, + "step": 38340, + "train_speed(iter/s)": 1.636045 + }, + { + "acc": 0.66166754, + "epoch": 0.9727295788939625, + "grad_norm": 7.625, + "learning_rate": 5.6370849729305825e-06, + "loss": 1.60065327, + "memory(GiB)": 117.38, + "step": 38345, + "train_speed(iter/s)": 1.636068 + }, + { + "acc": 0.63902817, + "epoch": 0.9728564180618975, + "grad_norm": 5.4375, + "learning_rate": 5.63604488233068e-06, + "loss": 1.66976547, + "memory(GiB)": 117.38, + "step": 38350, + "train_speed(iter/s)": 1.636092 + }, + { + "acc": 0.65062933, + "epoch": 0.9729832572298326, + "grad_norm": 6.09375, + "learning_rate": 5.635004763754683e-06, + "loss": 1.58455353, + "memory(GiB)": 117.38, + "step": 38355, + "train_speed(iter/s)": 1.636115 + }, + { + "acc": 0.64914169, + "epoch": 0.9731100963977676, + "grad_norm": 6.375, + "learning_rate": 5.633964617248345e-06, + "loss": 1.62896042, + "memory(GiB)": 117.38, + "step": 38360, + "train_speed(iter/s)": 1.636138 + }, + { + "acc": 0.66763501, + "epoch": 0.9732369355657027, + "grad_norm": 5.3125, + "learning_rate": 5.6329244428574085e-06, + "loss": 1.5472496, + "memory(GiB)": 117.38, + "step": 38365, + "train_speed(iter/s)": 1.636163 + }, + { + "acc": 0.65448613, + "epoch": 0.9733637747336378, + "grad_norm": 5.5625, + "learning_rate": 5.631884240627632e-06, + "loss": 1.62341328, + "memory(GiB)": 117.38, + "step": 38370, + "train_speed(iter/s)": 1.636187 + }, + { + "acc": 0.6505312, + "epoch": 0.9734906139015728, + "grad_norm": 4.75, + "learning_rate": 5.6308440106047634e-06, + "loss": 1.65536308, + "memory(GiB)": 117.38, + "step": 38375, + "train_speed(iter/s)": 1.636211 + }, + { + "acc": 0.66176796, + "epoch": 0.9736174530695079, + "grad_norm": 5.875, + "learning_rate": 5.62980375283456e-06, + "loss": 1.59309978, + "memory(GiB)": 117.38, + "step": 38380, + "train_speed(iter/s)": 1.636234 + }, + { + "acc": 0.65792465, + "epoch": 0.973744292237443, + "grad_norm": 5.8125, + "learning_rate": 5.628763467362775e-06, + "loss": 1.60247307, + "memory(GiB)": 117.38, + "step": 38385, + "train_speed(iter/s)": 1.636256 + }, + { + "acc": 0.64849577, + "epoch": 0.973871131405378, + "grad_norm": 5.375, + "learning_rate": 5.627723154235165e-06, + "loss": 1.61164494, + "memory(GiB)": 117.38, + "step": 38390, + "train_speed(iter/s)": 1.63628 + }, + { + "acc": 0.6562932, + "epoch": 0.973997970573313, + "grad_norm": 5.9375, + "learning_rate": 5.62668281349749e-06, + "loss": 1.66902771, + "memory(GiB)": 117.38, + "step": 38395, + "train_speed(iter/s)": 1.636304 + }, + { + "acc": 0.64309597, + "epoch": 0.974124809741248, + "grad_norm": 6.03125, + "learning_rate": 5.625642445195505e-06, + "loss": 1.69052143, + "memory(GiB)": 117.38, + "step": 38400, + "train_speed(iter/s)": 1.636327 + }, + { + "acc": 0.6652966, + "epoch": 0.9742516489091831, + "grad_norm": 6.09375, + "learning_rate": 5.6246020493749735e-06, + "loss": 1.59762754, + "memory(GiB)": 117.38, + "step": 38405, + "train_speed(iter/s)": 1.63635 + }, + { + "acc": 0.65778608, + "epoch": 0.9743784880771182, + "grad_norm": 6.375, + "learning_rate": 5.623561626081654e-06, + "loss": 1.62377644, + "memory(GiB)": 117.38, + "step": 38410, + "train_speed(iter/s)": 1.636373 + }, + { + "acc": 0.65515709, + "epoch": 0.9745053272450532, + "grad_norm": 7.375, + "learning_rate": 5.622521175361311e-06, + "loss": 1.57583199, + "memory(GiB)": 117.38, + "step": 38415, + "train_speed(iter/s)": 1.636396 + }, + { + "acc": 0.66052942, + "epoch": 0.9746321664129883, + "grad_norm": 5.71875, + "learning_rate": 5.621480697259707e-06, + "loss": 1.68279095, + "memory(GiB)": 117.38, + "step": 38420, + "train_speed(iter/s)": 1.636421 + }, + { + "acc": 0.64012756, + "epoch": 0.9747590055809234, + "grad_norm": 5.5, + "learning_rate": 5.620440191822607e-06, + "loss": 1.6234848, + "memory(GiB)": 117.38, + "step": 38425, + "train_speed(iter/s)": 1.636445 + }, + { + "acc": 0.63879981, + "epoch": 0.9748858447488584, + "grad_norm": 5.15625, + "learning_rate": 5.619399659095778e-06, + "loss": 1.63880997, + "memory(GiB)": 117.38, + "step": 38430, + "train_speed(iter/s)": 1.636469 + }, + { + "acc": 0.65664611, + "epoch": 0.9750126839167935, + "grad_norm": 6.375, + "learning_rate": 5.618359099124985e-06, + "loss": 1.62167263, + "memory(GiB)": 117.38, + "step": 38435, + "train_speed(iter/s)": 1.636494 + }, + { + "acc": 0.6515183, + "epoch": 0.9751395230847285, + "grad_norm": 5.09375, + "learning_rate": 5.617318511956001e-06, + "loss": 1.65804596, + "memory(GiB)": 117.38, + "step": 38440, + "train_speed(iter/s)": 1.636518 + }, + { + "acc": 0.6528573, + "epoch": 0.9752663622526636, + "grad_norm": 6.1875, + "learning_rate": 5.61627789763459e-06, + "loss": 1.64230652, + "memory(GiB)": 117.38, + "step": 38445, + "train_speed(iter/s)": 1.636541 + }, + { + "acc": 0.65744667, + "epoch": 0.9753932014205987, + "grad_norm": 5.53125, + "learning_rate": 5.6152372562065275e-06, + "loss": 1.56383057, + "memory(GiB)": 117.38, + "step": 38450, + "train_speed(iter/s)": 1.636566 + }, + { + "acc": 0.64883323, + "epoch": 0.9755200405885337, + "grad_norm": 5.6875, + "learning_rate": 5.614196587717581e-06, + "loss": 1.62848892, + "memory(GiB)": 117.38, + "step": 38455, + "train_speed(iter/s)": 1.636591 + }, + { + "acc": 0.66938925, + "epoch": 0.9756468797564688, + "grad_norm": 4.75, + "learning_rate": 5.613155892213529e-06, + "loss": 1.55475492, + "memory(GiB)": 117.38, + "step": 38460, + "train_speed(iter/s)": 1.636615 + }, + { + "acc": 0.67106466, + "epoch": 0.9757737189244039, + "grad_norm": 5.1875, + "learning_rate": 5.612115169740142e-06, + "loss": 1.55789709, + "memory(GiB)": 117.38, + "step": 38465, + "train_speed(iter/s)": 1.636639 + }, + { + "acc": 0.66363754, + "epoch": 0.9759005580923389, + "grad_norm": 5.96875, + "learning_rate": 5.611074420343197e-06, + "loss": 1.58945103, + "memory(GiB)": 117.38, + "step": 38470, + "train_speed(iter/s)": 1.636664 + }, + { + "acc": 0.66767759, + "epoch": 0.976027397260274, + "grad_norm": 5.40625, + "learning_rate": 5.610033644068471e-06, + "loss": 1.5798296, + "memory(GiB)": 117.38, + "step": 38475, + "train_speed(iter/s)": 1.63669 + }, + { + "acc": 0.66720638, + "epoch": 0.976154236428209, + "grad_norm": 4.78125, + "learning_rate": 5.608992840961742e-06, + "loss": 1.55297451, + "memory(GiB)": 117.38, + "step": 38480, + "train_speed(iter/s)": 1.636714 + }, + { + "acc": 0.65064545, + "epoch": 0.9762810755961441, + "grad_norm": 5.78125, + "learning_rate": 5.6079520110687876e-06, + "loss": 1.59366951, + "memory(GiB)": 117.38, + "step": 38485, + "train_speed(iter/s)": 1.63674 + }, + { + "acc": 0.66323409, + "epoch": 0.9764079147640792, + "grad_norm": 5.375, + "learning_rate": 5.606911154435392e-06, + "loss": 1.56340561, + "memory(GiB)": 117.38, + "step": 38490, + "train_speed(iter/s)": 1.636765 + }, + { + "acc": 0.65641155, + "epoch": 0.9765347539320142, + "grad_norm": 7.4375, + "learning_rate": 5.605870271107332e-06, + "loss": 1.61000671, + "memory(GiB)": 117.38, + "step": 38495, + "train_speed(iter/s)": 1.636789 + }, + { + "acc": 0.65363274, + "epoch": 0.9766615930999493, + "grad_norm": 5.625, + "learning_rate": 5.6048293611303925e-06, + "loss": 1.61809254, + "memory(GiB)": 117.38, + "step": 38500, + "train_speed(iter/s)": 1.636813 + }, + { + "acc": 0.6769455, + "epoch": 0.9767884322678844, + "grad_norm": 4.75, + "learning_rate": 5.603788424550357e-06, + "loss": 1.57775764, + "memory(GiB)": 117.38, + "step": 38505, + "train_speed(iter/s)": 1.636837 + }, + { + "acc": 0.65813751, + "epoch": 0.9769152714358194, + "grad_norm": 4.8125, + "learning_rate": 5.602747461413014e-06, + "loss": 1.56889706, + "memory(GiB)": 117.38, + "step": 38510, + "train_speed(iter/s)": 1.63686 + }, + { + "acc": 0.66425519, + "epoch": 0.9770421106037545, + "grad_norm": 7.8125, + "learning_rate": 5.6017064717641435e-06, + "loss": 1.53046474, + "memory(GiB)": 117.38, + "step": 38515, + "train_speed(iter/s)": 1.636884 + }, + { + "acc": 0.65539885, + "epoch": 0.9771689497716894, + "grad_norm": 5.15625, + "learning_rate": 5.600665455649538e-06, + "loss": 1.609202, + "memory(GiB)": 117.38, + "step": 38520, + "train_speed(iter/s)": 1.636909 + }, + { + "acc": 0.65283327, + "epoch": 0.9772957889396245, + "grad_norm": 5.84375, + "learning_rate": 5.599624413114981e-06, + "loss": 1.60686741, + "memory(GiB)": 117.38, + "step": 38525, + "train_speed(iter/s)": 1.636932 + }, + { + "acc": 0.657969, + "epoch": 0.9774226281075596, + "grad_norm": 5.28125, + "learning_rate": 5.5985833442062676e-06, + "loss": 1.59414825, + "memory(GiB)": 117.38, + "step": 38530, + "train_speed(iter/s)": 1.636957 + }, + { + "acc": 0.66860781, + "epoch": 0.9775494672754946, + "grad_norm": 5.34375, + "learning_rate": 5.597542248969185e-06, + "loss": 1.51761341, + "memory(GiB)": 117.38, + "step": 38535, + "train_speed(iter/s)": 1.636982 + }, + { + "acc": 0.64384117, + "epoch": 0.9776763064434297, + "grad_norm": 7.28125, + "learning_rate": 5.596501127449527e-06, + "loss": 1.62396908, + "memory(GiB)": 117.38, + "step": 38540, + "train_speed(iter/s)": 1.637006 + }, + { + "acc": 0.65910268, + "epoch": 0.9778031456113648, + "grad_norm": 5.46875, + "learning_rate": 5.595459979693086e-06, + "loss": 1.57738466, + "memory(GiB)": 117.38, + "step": 38545, + "train_speed(iter/s)": 1.63703 + }, + { + "acc": 0.6607317, + "epoch": 0.9779299847792998, + "grad_norm": 5.875, + "learning_rate": 5.594418805745657e-06, + "loss": 1.58972359, + "memory(GiB)": 117.38, + "step": 38550, + "train_speed(iter/s)": 1.637055 + }, + { + "acc": 0.6520052, + "epoch": 0.9780568239472349, + "grad_norm": 6.1875, + "learning_rate": 5.593377605653035e-06, + "loss": 1.58321934, + "memory(GiB)": 117.38, + "step": 38555, + "train_speed(iter/s)": 1.63708 + }, + { + "acc": 0.6600708, + "epoch": 0.9781836631151699, + "grad_norm": 5.625, + "learning_rate": 5.592336379461018e-06, + "loss": 1.57247629, + "memory(GiB)": 117.38, + "step": 38560, + "train_speed(iter/s)": 1.637106 + }, + { + "acc": 0.64563799, + "epoch": 0.978310502283105, + "grad_norm": 7.46875, + "learning_rate": 5.5912951272154004e-06, + "loss": 1.55649872, + "memory(GiB)": 117.38, + "step": 38565, + "train_speed(iter/s)": 1.63713 + }, + { + "acc": 0.65249586, + "epoch": 0.9784373414510401, + "grad_norm": 5.0, + "learning_rate": 5.590253848961984e-06, + "loss": 1.65953255, + "memory(GiB)": 117.38, + "step": 38570, + "train_speed(iter/s)": 1.637154 + }, + { + "acc": 0.64530449, + "epoch": 0.9785641806189751, + "grad_norm": 5.03125, + "learning_rate": 5.589212544746566e-06, + "loss": 1.63760529, + "memory(GiB)": 117.38, + "step": 38575, + "train_speed(iter/s)": 1.637178 + }, + { + "acc": 0.6618928, + "epoch": 0.9786910197869102, + "grad_norm": 7.09375, + "learning_rate": 5.588171214614953e-06, + "loss": 1.63993301, + "memory(GiB)": 117.38, + "step": 38580, + "train_speed(iter/s)": 1.637202 + }, + { + "acc": 0.64911199, + "epoch": 0.9788178589548453, + "grad_norm": 5.28125, + "learning_rate": 5.587129858612941e-06, + "loss": 1.60417976, + "memory(GiB)": 117.38, + "step": 38585, + "train_speed(iter/s)": 1.637226 + }, + { + "acc": 0.6474401, + "epoch": 0.9789446981227803, + "grad_norm": 4.65625, + "learning_rate": 5.586088476786339e-06, + "loss": 1.62720528, + "memory(GiB)": 117.38, + "step": 38590, + "train_speed(iter/s)": 1.637249 + }, + { + "acc": 0.64482594, + "epoch": 0.9790715372907154, + "grad_norm": 5.4375, + "learning_rate": 5.585047069180947e-06, + "loss": 1.67413311, + "memory(GiB)": 117.38, + "step": 38595, + "train_speed(iter/s)": 1.637274 + }, + { + "acc": 0.65761099, + "epoch": 0.9791983764586504, + "grad_norm": 6.375, + "learning_rate": 5.5840056358425755e-06, + "loss": 1.55436783, + "memory(GiB)": 117.38, + "step": 38600, + "train_speed(iter/s)": 1.637298 + }, + { + "acc": 0.6373436, + "epoch": 0.9793252156265855, + "grad_norm": 5.71875, + "learning_rate": 5.582964176817025e-06, + "loss": 1.6881958, + "memory(GiB)": 117.38, + "step": 38605, + "train_speed(iter/s)": 1.637321 + }, + { + "acc": 0.66941342, + "epoch": 0.9794520547945206, + "grad_norm": 7.0, + "learning_rate": 5.58192269215011e-06, + "loss": 1.53719635, + "memory(GiB)": 117.38, + "step": 38610, + "train_speed(iter/s)": 1.637344 + }, + { + "acc": 0.66720614, + "epoch": 0.9795788939624556, + "grad_norm": 5.84375, + "learning_rate": 5.580881181887636e-06, + "loss": 1.60927563, + "memory(GiB)": 117.38, + "step": 38615, + "train_speed(iter/s)": 1.637369 + }, + { + "acc": 0.65170488, + "epoch": 0.9797057331303907, + "grad_norm": 4.84375, + "learning_rate": 5.579839646075414e-06, + "loss": 1.60060921, + "memory(GiB)": 117.38, + "step": 38620, + "train_speed(iter/s)": 1.637392 + }, + { + "acc": 0.64748268, + "epoch": 0.9798325722983258, + "grad_norm": 4.5, + "learning_rate": 5.578798084759257e-06, + "loss": 1.65848389, + "memory(GiB)": 117.38, + "step": 38625, + "train_speed(iter/s)": 1.637413 + }, + { + "acc": 0.66331711, + "epoch": 0.9799594114662608, + "grad_norm": 6.53125, + "learning_rate": 5.577756497984975e-06, + "loss": 1.54005833, + "memory(GiB)": 117.38, + "step": 38630, + "train_speed(iter/s)": 1.637436 + }, + { + "acc": 0.65858107, + "epoch": 0.9800862506341959, + "grad_norm": 5.28125, + "learning_rate": 5.576714885798382e-06, + "loss": 1.53903885, + "memory(GiB)": 117.38, + "step": 38635, + "train_speed(iter/s)": 1.63746 + }, + { + "acc": 0.66127539, + "epoch": 0.9802130898021308, + "grad_norm": 6.625, + "learning_rate": 5.575673248245295e-06, + "loss": 1.57987604, + "memory(GiB)": 117.38, + "step": 38640, + "train_speed(iter/s)": 1.637484 + }, + { + "acc": 0.64577646, + "epoch": 0.9803399289700659, + "grad_norm": 8.375, + "learning_rate": 5.574631585371527e-06, + "loss": 1.71133423, + "memory(GiB)": 117.38, + "step": 38645, + "train_speed(iter/s)": 1.637508 + }, + { + "acc": 0.6480269, + "epoch": 0.980466768138001, + "grad_norm": 6.09375, + "learning_rate": 5.573589897222897e-06, + "loss": 1.59905272, + "memory(GiB)": 117.38, + "step": 38650, + "train_speed(iter/s)": 1.637531 + }, + { + "acc": 0.66004782, + "epoch": 0.980593607305936, + "grad_norm": 5.0, + "learning_rate": 5.572548183845222e-06, + "loss": 1.54437485, + "memory(GiB)": 117.38, + "step": 38655, + "train_speed(iter/s)": 1.637556 + }, + { + "acc": 0.65010457, + "epoch": 0.9807204464738711, + "grad_norm": 4.90625, + "learning_rate": 5.571506445284322e-06, + "loss": 1.61843472, + "memory(GiB)": 117.38, + "step": 38660, + "train_speed(iter/s)": 1.637578 + }, + { + "acc": 0.6674324, + "epoch": 0.9808472856418062, + "grad_norm": 4.5, + "learning_rate": 5.570464681586017e-06, + "loss": 1.57486477, + "memory(GiB)": 117.38, + "step": 38665, + "train_speed(iter/s)": 1.6376 + }, + { + "acc": 0.64909735, + "epoch": 0.9809741248097412, + "grad_norm": 5.34375, + "learning_rate": 5.569422892796129e-06, + "loss": 1.59526539, + "memory(GiB)": 117.38, + "step": 38670, + "train_speed(iter/s)": 1.637623 + }, + { + "acc": 0.63954821, + "epoch": 0.9811009639776763, + "grad_norm": 5.71875, + "learning_rate": 5.568381078960479e-06, + "loss": 1.75403938, + "memory(GiB)": 117.38, + "step": 38675, + "train_speed(iter/s)": 1.637647 + }, + { + "acc": 0.65852995, + "epoch": 0.9812278031456113, + "grad_norm": 6.9375, + "learning_rate": 5.567339240124892e-06, + "loss": 1.53508778, + "memory(GiB)": 117.38, + "step": 38680, + "train_speed(iter/s)": 1.637671 + }, + { + "acc": 0.65534935, + "epoch": 0.9813546423135464, + "grad_norm": 5.40625, + "learning_rate": 5.5662973763351915e-06, + "loss": 1.64453754, + "memory(GiB)": 117.38, + "step": 38685, + "train_speed(iter/s)": 1.637693 + }, + { + "acc": 0.6488111, + "epoch": 0.9814814814814815, + "grad_norm": 6.625, + "learning_rate": 5.565255487637204e-06, + "loss": 1.62392483, + "memory(GiB)": 117.38, + "step": 38690, + "train_speed(iter/s)": 1.637718 + }, + { + "acc": 0.65491405, + "epoch": 0.9816083206494165, + "grad_norm": 6.03125, + "learning_rate": 5.564213574076757e-06, + "loss": 1.58939095, + "memory(GiB)": 117.38, + "step": 38695, + "train_speed(iter/s)": 1.637741 + }, + { + "acc": 0.64531393, + "epoch": 0.9817351598173516, + "grad_norm": 6.53125, + "learning_rate": 5.563171635699678e-06, + "loss": 1.65395947, + "memory(GiB)": 117.38, + "step": 38700, + "train_speed(iter/s)": 1.637765 + }, + { + "acc": 0.67010503, + "epoch": 0.9818619989852867, + "grad_norm": 6.09375, + "learning_rate": 5.562129672551796e-06, + "loss": 1.54790859, + "memory(GiB)": 117.38, + "step": 38705, + "train_speed(iter/s)": 1.637789 + }, + { + "acc": 0.64942441, + "epoch": 0.9819888381532217, + "grad_norm": 5.71875, + "learning_rate": 5.561087684678941e-06, + "loss": 1.61868114, + "memory(GiB)": 117.38, + "step": 38710, + "train_speed(iter/s)": 1.637812 + }, + { + "acc": 0.64999676, + "epoch": 0.9821156773211568, + "grad_norm": 5.3125, + "learning_rate": 5.560045672126945e-06, + "loss": 1.62361717, + "memory(GiB)": 117.38, + "step": 38715, + "train_speed(iter/s)": 1.637835 + }, + { + "acc": 0.65503788, + "epoch": 0.9822425164890918, + "grad_norm": 5.875, + "learning_rate": 5.55900363494164e-06, + "loss": 1.61794205, + "memory(GiB)": 117.38, + "step": 38720, + "train_speed(iter/s)": 1.637858 + }, + { + "acc": 0.66599765, + "epoch": 0.9823693556570269, + "grad_norm": 5.96875, + "learning_rate": 5.557961573168857e-06, + "loss": 1.63279839, + "memory(GiB)": 117.38, + "step": 38725, + "train_speed(iter/s)": 1.637882 + }, + { + "acc": 0.66272669, + "epoch": 0.982496194824962, + "grad_norm": 6.125, + "learning_rate": 5.5569194868544376e-06, + "loss": 1.58562908, + "memory(GiB)": 117.38, + "step": 38730, + "train_speed(iter/s)": 1.637906 + }, + { + "acc": 0.65216403, + "epoch": 0.982623033992897, + "grad_norm": 5.96875, + "learning_rate": 5.555877376044209e-06, + "loss": 1.57129736, + "memory(GiB)": 117.38, + "step": 38735, + "train_speed(iter/s)": 1.637929 + }, + { + "acc": 0.67389421, + "epoch": 0.9827498731608321, + "grad_norm": 5.6875, + "learning_rate": 5.554835240784013e-06, + "loss": 1.59081326, + "memory(GiB)": 117.38, + "step": 38740, + "train_speed(iter/s)": 1.637952 + }, + { + "acc": 0.65340695, + "epoch": 0.9828767123287672, + "grad_norm": 5.8125, + "learning_rate": 5.553793081119685e-06, + "loss": 1.62839966, + "memory(GiB)": 117.38, + "step": 38745, + "train_speed(iter/s)": 1.637973 + }, + { + "acc": 0.65438223, + "epoch": 0.9830035514967022, + "grad_norm": 4.78125, + "learning_rate": 5.552750897097065e-06, + "loss": 1.61699829, + "memory(GiB)": 117.38, + "step": 38750, + "train_speed(iter/s)": 1.637996 + }, + { + "acc": 0.64969902, + "epoch": 0.9831303906646373, + "grad_norm": 6.75, + "learning_rate": 5.551708688761993e-06, + "loss": 1.61930351, + "memory(GiB)": 117.38, + "step": 38755, + "train_speed(iter/s)": 1.63802 + }, + { + "acc": 0.65622864, + "epoch": 0.9832572298325722, + "grad_norm": 5.6875, + "learning_rate": 5.550666456160311e-06, + "loss": 1.59590492, + "memory(GiB)": 117.38, + "step": 38760, + "train_speed(iter/s)": 1.638043 + }, + { + "acc": 0.65961337, + "epoch": 0.9833840690005073, + "grad_norm": 5.03125, + "learning_rate": 5.549624199337857e-06, + "loss": 1.60344238, + "memory(GiB)": 117.38, + "step": 38765, + "train_speed(iter/s)": 1.638068 + }, + { + "acc": 0.65280104, + "epoch": 0.9835109081684424, + "grad_norm": 6.9375, + "learning_rate": 5.548581918340479e-06, + "loss": 1.63526535, + "memory(GiB)": 117.38, + "step": 38770, + "train_speed(iter/s)": 1.638092 + }, + { + "acc": 0.66507111, + "epoch": 0.9836377473363774, + "grad_norm": 5.9375, + "learning_rate": 5.547539613214019e-06, + "loss": 1.56163425, + "memory(GiB)": 117.38, + "step": 38775, + "train_speed(iter/s)": 1.638115 + }, + { + "acc": 0.65190334, + "epoch": 0.9837645865043125, + "grad_norm": 6.8125, + "learning_rate": 5.546497284004321e-06, + "loss": 1.62859459, + "memory(GiB)": 117.38, + "step": 38780, + "train_speed(iter/s)": 1.638141 + }, + { + "acc": 0.66125822, + "epoch": 0.9838914256722476, + "grad_norm": 5.90625, + "learning_rate": 5.545454930757233e-06, + "loss": 1.60100193, + "memory(GiB)": 117.38, + "step": 38785, + "train_speed(iter/s)": 1.638164 + }, + { + "acc": 0.67107859, + "epoch": 0.9840182648401826, + "grad_norm": 5.125, + "learning_rate": 5.544412553518602e-06, + "loss": 1.52109604, + "memory(GiB)": 117.38, + "step": 38790, + "train_speed(iter/s)": 1.638188 + }, + { + "acc": 0.65286479, + "epoch": 0.9841451040081177, + "grad_norm": 4.71875, + "learning_rate": 5.543370152334275e-06, + "loss": 1.57824802, + "memory(GiB)": 117.38, + "step": 38795, + "train_speed(iter/s)": 1.638212 + }, + { + "acc": 0.65725117, + "epoch": 0.9842719431760527, + "grad_norm": 4.9375, + "learning_rate": 5.542327727250105e-06, + "loss": 1.59497948, + "memory(GiB)": 117.38, + "step": 38800, + "train_speed(iter/s)": 1.638233 + }, + { + "acc": 0.64453201, + "epoch": 0.9843987823439878, + "grad_norm": 6.09375, + "learning_rate": 5.5412852783119385e-06, + "loss": 1.62583885, + "memory(GiB)": 117.38, + "step": 38805, + "train_speed(iter/s)": 1.638255 + }, + { + "acc": 0.65223112, + "epoch": 0.9845256215119229, + "grad_norm": 4.78125, + "learning_rate": 5.54024280556563e-06, + "loss": 1.58456421, + "memory(GiB)": 117.38, + "step": 38810, + "train_speed(iter/s)": 1.638279 + }, + { + "acc": 0.64567976, + "epoch": 0.9846524606798579, + "grad_norm": 8.625, + "learning_rate": 5.53920030905703e-06, + "loss": 1.67348175, + "memory(GiB)": 117.38, + "step": 38815, + "train_speed(iter/s)": 1.638303 + }, + { + "acc": 0.65117941, + "epoch": 0.984779299847793, + "grad_norm": 5.15625, + "learning_rate": 5.538157788831993e-06, + "loss": 1.57780151, + "memory(GiB)": 117.38, + "step": 38820, + "train_speed(iter/s)": 1.638328 + }, + { + "acc": 0.65188837, + "epoch": 0.9849061390157281, + "grad_norm": 9.9375, + "learning_rate": 5.537115244936374e-06, + "loss": 1.61107025, + "memory(GiB)": 117.38, + "step": 38825, + "train_speed(iter/s)": 1.638351 + }, + { + "acc": 0.64141674, + "epoch": 0.9850329781836631, + "grad_norm": 6.6875, + "learning_rate": 5.536072677416029e-06, + "loss": 1.68645287, + "memory(GiB)": 117.38, + "step": 38830, + "train_speed(iter/s)": 1.638374 + }, + { + "acc": 0.66752872, + "epoch": 0.9851598173515982, + "grad_norm": 5.375, + "learning_rate": 5.535030086316814e-06, + "loss": 1.56166592, + "memory(GiB)": 117.38, + "step": 38835, + "train_speed(iter/s)": 1.638397 + }, + { + "acc": 0.66933012, + "epoch": 0.9852866565195332, + "grad_norm": 4.84375, + "learning_rate": 5.533987471684586e-06, + "loss": 1.5428009, + "memory(GiB)": 117.38, + "step": 38840, + "train_speed(iter/s)": 1.638418 + }, + { + "acc": 0.65214624, + "epoch": 0.9854134956874683, + "grad_norm": 5.03125, + "learning_rate": 5.532944833565207e-06, + "loss": 1.58116951, + "memory(GiB)": 117.38, + "step": 38845, + "train_speed(iter/s)": 1.63844 + }, + { + "acc": 0.64970388, + "epoch": 0.9855403348554034, + "grad_norm": 5.40625, + "learning_rate": 5.531902172004533e-06, + "loss": 1.61301765, + "memory(GiB)": 117.38, + "step": 38850, + "train_speed(iter/s)": 1.638463 + }, + { + "acc": 0.65473394, + "epoch": 0.9856671740233384, + "grad_norm": 5.625, + "learning_rate": 5.530859487048427e-06, + "loss": 1.62891655, + "memory(GiB)": 117.38, + "step": 38855, + "train_speed(iter/s)": 1.638487 + }, + { + "acc": 0.63526421, + "epoch": 0.9857940131912735, + "grad_norm": 4.75, + "learning_rate": 5.529816778742752e-06, + "loss": 1.70370007, + "memory(GiB)": 117.38, + "step": 38860, + "train_speed(iter/s)": 1.638508 + }, + { + "acc": 0.68229003, + "epoch": 0.9859208523592086, + "grad_norm": 6.0, + "learning_rate": 5.528774047133369e-06, + "loss": 1.50216904, + "memory(GiB)": 117.38, + "step": 38865, + "train_speed(iter/s)": 1.638532 + }, + { + "acc": 0.65571666, + "epoch": 0.9860476915271436, + "grad_norm": 5.0625, + "learning_rate": 5.527731292266142e-06, + "loss": 1.60538349, + "memory(GiB)": 117.38, + "step": 38870, + "train_speed(iter/s)": 1.638556 + }, + { + "acc": 0.66178646, + "epoch": 0.9861745306950787, + "grad_norm": 6.09375, + "learning_rate": 5.5266885141869355e-06, + "loss": 1.60124855, + "memory(GiB)": 117.38, + "step": 38875, + "train_speed(iter/s)": 1.63858 + }, + { + "acc": 0.65636034, + "epoch": 0.9863013698630136, + "grad_norm": 6.5, + "learning_rate": 5.5256457129416185e-06, + "loss": 1.63822937, + "memory(GiB)": 117.38, + "step": 38880, + "train_speed(iter/s)": 1.638605 + }, + { + "acc": 0.67386751, + "epoch": 0.9864282090309487, + "grad_norm": 5.5625, + "learning_rate": 5.524602888576055e-06, + "loss": 1.57072525, + "memory(GiB)": 117.38, + "step": 38885, + "train_speed(iter/s)": 1.638628 + }, + { + "acc": 0.63869648, + "epoch": 0.9865550481988838, + "grad_norm": 5.4375, + "learning_rate": 5.523560041136116e-06, + "loss": 1.65355835, + "memory(GiB)": 117.38, + "step": 38890, + "train_speed(iter/s)": 1.638651 + }, + { + "acc": 0.63306437, + "epoch": 0.9866818873668188, + "grad_norm": 5.4375, + "learning_rate": 5.522517170667667e-06, + "loss": 1.72260876, + "memory(GiB)": 117.38, + "step": 38895, + "train_speed(iter/s)": 1.638676 + }, + { + "acc": 0.66508541, + "epoch": 0.9868087265347539, + "grad_norm": 6.6875, + "learning_rate": 5.5214742772165806e-06, + "loss": 1.55520439, + "memory(GiB)": 117.38, + "step": 38900, + "train_speed(iter/s)": 1.6387 + }, + { + "acc": 0.63914924, + "epoch": 0.986935565702689, + "grad_norm": 6.84375, + "learning_rate": 5.520431360828728e-06, + "loss": 1.68224697, + "memory(GiB)": 117.38, + "step": 38905, + "train_speed(iter/s)": 1.638723 + }, + { + "acc": 0.64293098, + "epoch": 0.987062404870624, + "grad_norm": 6.1875, + "learning_rate": 5.51938842154998e-06, + "loss": 1.65292435, + "memory(GiB)": 117.38, + "step": 38910, + "train_speed(iter/s)": 1.638746 + }, + { + "acc": 0.65842671, + "epoch": 0.9871892440385591, + "grad_norm": 8.0, + "learning_rate": 5.51834545942621e-06, + "loss": 1.57294483, + "memory(GiB)": 117.38, + "step": 38915, + "train_speed(iter/s)": 1.638769 + }, + { + "acc": 0.65121107, + "epoch": 0.9873160832064941, + "grad_norm": 7.28125, + "learning_rate": 5.5173024745032925e-06, + "loss": 1.57889328, + "memory(GiB)": 117.38, + "step": 38920, + "train_speed(iter/s)": 1.638794 + }, + { + "acc": 0.65043373, + "epoch": 0.9874429223744292, + "grad_norm": 6.15625, + "learning_rate": 5.516259466827103e-06, + "loss": 1.64338379, + "memory(GiB)": 117.38, + "step": 38925, + "train_speed(iter/s)": 1.638817 + }, + { + "acc": 0.65469093, + "epoch": 0.9875697615423643, + "grad_norm": 6.375, + "learning_rate": 5.515216436443517e-06, + "loss": 1.58907661, + "memory(GiB)": 117.38, + "step": 38930, + "train_speed(iter/s)": 1.638841 + }, + { + "acc": 0.65526352, + "epoch": 0.9876966007102993, + "grad_norm": 7.15625, + "learning_rate": 5.514173383398412e-06, + "loss": 1.66379585, + "memory(GiB)": 117.38, + "step": 38935, + "train_speed(iter/s)": 1.638865 + }, + { + "acc": 0.65677443, + "epoch": 0.9878234398782344, + "grad_norm": 5.8125, + "learning_rate": 5.513130307737666e-06, + "loss": 1.62588825, + "memory(GiB)": 117.38, + "step": 38940, + "train_speed(iter/s)": 1.638888 + }, + { + "acc": 0.65021114, + "epoch": 0.9879502790461695, + "grad_norm": 4.8125, + "learning_rate": 5.512087209507157e-06, + "loss": 1.65093613, + "memory(GiB)": 117.38, + "step": 38945, + "train_speed(iter/s)": 1.63891 + }, + { + "acc": 0.66078768, + "epoch": 0.9880771182141045, + "grad_norm": 5.125, + "learning_rate": 5.5110440887527684e-06, + "loss": 1.52087193, + "memory(GiB)": 117.38, + "step": 38950, + "train_speed(iter/s)": 1.638935 + }, + { + "acc": 0.66037054, + "epoch": 0.9882039573820396, + "grad_norm": 4.5625, + "learning_rate": 5.510000945520377e-06, + "loss": 1.57690353, + "memory(GiB)": 117.38, + "step": 38955, + "train_speed(iter/s)": 1.638957 + }, + { + "acc": 0.65718465, + "epoch": 0.9883307965499746, + "grad_norm": 5.84375, + "learning_rate": 5.508957779855869e-06, + "loss": 1.61658516, + "memory(GiB)": 117.38, + "step": 38960, + "train_speed(iter/s)": 1.63898 + }, + { + "acc": 0.64686213, + "epoch": 0.9884576357179097, + "grad_norm": 6.15625, + "learning_rate": 5.507914591805124e-06, + "loss": 1.61408405, + "memory(GiB)": 117.38, + "step": 38965, + "train_speed(iter/s)": 1.639003 + }, + { + "acc": 0.65653453, + "epoch": 0.9885844748858448, + "grad_norm": 5.3125, + "learning_rate": 5.506871381414027e-06, + "loss": 1.57099218, + "memory(GiB)": 117.38, + "step": 38970, + "train_speed(iter/s)": 1.639027 + }, + { + "acc": 0.65301905, + "epoch": 0.9887113140537798, + "grad_norm": 6.625, + "learning_rate": 5.505828148728465e-06, + "loss": 1.63272476, + "memory(GiB)": 117.38, + "step": 38975, + "train_speed(iter/s)": 1.639051 + }, + { + "acc": 0.66440511, + "epoch": 0.9888381532217149, + "grad_norm": 5.71875, + "learning_rate": 5.5047848937943225e-06, + "loss": 1.59567413, + "memory(GiB)": 117.38, + "step": 38980, + "train_speed(iter/s)": 1.639075 + }, + { + "acc": 0.6642467, + "epoch": 0.98896499238965, + "grad_norm": 5.65625, + "learning_rate": 5.503741616657486e-06, + "loss": 1.57433796, + "memory(GiB)": 117.38, + "step": 38985, + "train_speed(iter/s)": 1.639098 + }, + { + "acc": 0.65406647, + "epoch": 0.989091831557585, + "grad_norm": 8.125, + "learning_rate": 5.502698317363846e-06, + "loss": 1.66263123, + "memory(GiB)": 117.38, + "step": 38990, + "train_speed(iter/s)": 1.63912 + }, + { + "acc": 0.64882717, + "epoch": 0.98921867072552, + "grad_norm": 6.21875, + "learning_rate": 5.501654995959288e-06, + "loss": 1.65302734, + "memory(GiB)": 117.38, + "step": 38995, + "train_speed(iter/s)": 1.639144 + }, + { + "acc": 0.64091539, + "epoch": 0.989345509893455, + "grad_norm": 6.5, + "learning_rate": 5.5006116524897034e-06, + "loss": 1.63139229, + "memory(GiB)": 117.38, + "step": 39000, + "train_speed(iter/s)": 1.639167 + }, + { + "epoch": 0.989345509893455, + "eval_acc": 0.6460910042037602, + "eval_loss": 1.574126958847046, + "eval_runtime": 58.4403, + "eval_samples_per_second": 109.0, + "eval_steps_per_second": 27.259, + "step": 39000 + }, + { + "acc": 0.65300798, + "epoch": 0.9894723490613901, + "grad_norm": 6.40625, + "learning_rate": 5.499568287000984e-06, + "loss": 1.63923283, + "memory(GiB)": 117.38, + "step": 39005, + "train_speed(iter/s)": 1.634883 + }, + { + "acc": 0.66818128, + "epoch": 0.9895991882293252, + "grad_norm": 5.71875, + "learning_rate": 5.49852489953902e-06, + "loss": 1.6062746, + "memory(GiB)": 117.38, + "step": 39010, + "train_speed(iter/s)": 1.634905 + }, + { + "acc": 0.66122351, + "epoch": 0.9897260273972602, + "grad_norm": 6.6875, + "learning_rate": 5.497481490149705e-06, + "loss": 1.57987423, + "memory(GiB)": 117.38, + "step": 39015, + "train_speed(iter/s)": 1.634928 + }, + { + "acc": 0.65389156, + "epoch": 0.9898528665651953, + "grad_norm": 5.4375, + "learning_rate": 5.496438058878936e-06, + "loss": 1.62811966, + "memory(GiB)": 117.38, + "step": 39020, + "train_speed(iter/s)": 1.63495 + }, + { + "acc": 0.65526714, + "epoch": 0.9899797057331304, + "grad_norm": 5.40625, + "learning_rate": 5.4953946057726005e-06, + "loss": 1.63807335, + "memory(GiB)": 117.38, + "step": 39025, + "train_speed(iter/s)": 1.634974 + }, + { + "acc": 0.66559639, + "epoch": 0.9901065449010654, + "grad_norm": 5.1875, + "learning_rate": 5.494351130876602e-06, + "loss": 1.56376524, + "memory(GiB)": 117.38, + "step": 39030, + "train_speed(iter/s)": 1.634997 + }, + { + "acc": 0.67267427, + "epoch": 0.9902333840690005, + "grad_norm": 5.34375, + "learning_rate": 5.493307634236831e-06, + "loss": 1.56079607, + "memory(GiB)": 117.38, + "step": 39035, + "train_speed(iter/s)": 1.635021 + }, + { + "acc": 0.64773064, + "epoch": 0.9903602232369355, + "grad_norm": 5.59375, + "learning_rate": 5.492264115899189e-06, + "loss": 1.62359657, + "memory(GiB)": 117.38, + "step": 39040, + "train_speed(iter/s)": 1.635043 + }, + { + "acc": 0.65202203, + "epoch": 0.9904870624048706, + "grad_norm": 12.625, + "learning_rate": 5.491220575909573e-06, + "loss": 1.62284679, + "memory(GiB)": 117.38, + "step": 39045, + "train_speed(iter/s)": 1.635066 + }, + { + "acc": 0.65474219, + "epoch": 0.9906139015728057, + "grad_norm": 4.71875, + "learning_rate": 5.4901770143138835e-06, + "loss": 1.64360008, + "memory(GiB)": 117.38, + "step": 39050, + "train_speed(iter/s)": 1.63509 + }, + { + "acc": 0.65606775, + "epoch": 0.9907407407407407, + "grad_norm": 4.75, + "learning_rate": 5.48913343115802e-06, + "loss": 1.58373165, + "memory(GiB)": 117.38, + "step": 39055, + "train_speed(iter/s)": 1.635112 + }, + { + "acc": 0.64923234, + "epoch": 0.9908675799086758, + "grad_norm": 4.84375, + "learning_rate": 5.488089826487884e-06, + "loss": 1.58500471, + "memory(GiB)": 117.38, + "step": 39060, + "train_speed(iter/s)": 1.635135 + }, + { + "acc": 0.65942621, + "epoch": 0.9909944190766109, + "grad_norm": 4.9375, + "learning_rate": 5.48704620034938e-06, + "loss": 1.57913656, + "memory(GiB)": 117.38, + "step": 39065, + "train_speed(iter/s)": 1.635147 + }, + { + "acc": 0.67092609, + "epoch": 0.9911212582445459, + "grad_norm": 7.4375, + "learning_rate": 5.486002552788408e-06, + "loss": 1.60568542, + "memory(GiB)": 117.38, + "step": 39070, + "train_speed(iter/s)": 1.63517 + }, + { + "acc": 0.650278, + "epoch": 0.991248097412481, + "grad_norm": 6.0, + "learning_rate": 5.4849588838508734e-06, + "loss": 1.59628487, + "memory(GiB)": 117.38, + "step": 39075, + "train_speed(iter/s)": 1.635193 + }, + { + "acc": 0.6701498, + "epoch": 0.991374936580416, + "grad_norm": 6.0, + "learning_rate": 5.483915193582684e-06, + "loss": 1.57542067, + "memory(GiB)": 117.38, + "step": 39080, + "train_speed(iter/s)": 1.635216 + }, + { + "acc": 0.6399384, + "epoch": 0.9915017757483511, + "grad_norm": 5.21875, + "learning_rate": 5.482871482029742e-06, + "loss": 1.57752762, + "memory(GiB)": 117.38, + "step": 39085, + "train_speed(iter/s)": 1.635238 + }, + { + "acc": 0.66096268, + "epoch": 0.9916286149162862, + "grad_norm": 5.09375, + "learning_rate": 5.4818277492379565e-06, + "loss": 1.61035233, + "memory(GiB)": 117.38, + "step": 39090, + "train_speed(iter/s)": 1.635261 + }, + { + "acc": 0.66812725, + "epoch": 0.9917554540842212, + "grad_norm": 6.59375, + "learning_rate": 5.480783995253236e-06, + "loss": 1.58545609, + "memory(GiB)": 117.38, + "step": 39095, + "train_speed(iter/s)": 1.635284 + }, + { + "acc": 0.66740046, + "epoch": 0.9918822932521563, + "grad_norm": 6.5, + "learning_rate": 5.47974022012149e-06, + "loss": 1.58404446, + "memory(GiB)": 117.38, + "step": 39100, + "train_speed(iter/s)": 1.635306 + }, + { + "acc": 0.66551247, + "epoch": 0.9920091324200914, + "grad_norm": 6.65625, + "learning_rate": 5.478696423888624e-06, + "loss": 1.55421324, + "memory(GiB)": 117.38, + "step": 39105, + "train_speed(iter/s)": 1.635329 + }, + { + "acc": 0.65430903, + "epoch": 0.9921359715880264, + "grad_norm": 6.40625, + "learning_rate": 5.477652606600555e-06, + "loss": 1.5904953, + "memory(GiB)": 117.38, + "step": 39110, + "train_speed(iter/s)": 1.635353 + }, + { + "acc": 0.66596785, + "epoch": 0.9922628107559615, + "grad_norm": 6.0, + "learning_rate": 5.47660876830319e-06, + "loss": 1.52695141, + "memory(GiB)": 117.38, + "step": 39115, + "train_speed(iter/s)": 1.635374 + }, + { + "acc": 0.66659899, + "epoch": 0.9923896499238964, + "grad_norm": 6.3125, + "learning_rate": 5.475564909042444e-06, + "loss": 1.5241662, + "memory(GiB)": 117.38, + "step": 39120, + "train_speed(iter/s)": 1.635393 + }, + { + "acc": 0.65699735, + "epoch": 0.9925164890918315, + "grad_norm": 6.21875, + "learning_rate": 5.4745210288642306e-06, + "loss": 1.59540148, + "memory(GiB)": 117.38, + "step": 39125, + "train_speed(iter/s)": 1.635416 + }, + { + "acc": 0.63995919, + "epoch": 0.9926433282597666, + "grad_norm": 5.125, + "learning_rate": 5.473477127814464e-06, + "loss": 1.59047737, + "memory(GiB)": 117.38, + "step": 39130, + "train_speed(iter/s)": 1.635439 + }, + { + "acc": 0.63322458, + "epoch": 0.9927701674277016, + "grad_norm": 6.375, + "learning_rate": 5.472433205939058e-06, + "loss": 1.72567482, + "memory(GiB)": 117.38, + "step": 39135, + "train_speed(iter/s)": 1.635463 + }, + { + "acc": 0.67278895, + "epoch": 0.9928970065956367, + "grad_norm": 6.46875, + "learning_rate": 5.471389263283932e-06, + "loss": 1.55444756, + "memory(GiB)": 117.38, + "step": 39140, + "train_speed(iter/s)": 1.635486 + }, + { + "acc": 0.67533879, + "epoch": 0.9930238457635718, + "grad_norm": 6.53125, + "learning_rate": 5.4703452998950005e-06, + "loss": 1.52089148, + "memory(GiB)": 117.38, + "step": 39145, + "train_speed(iter/s)": 1.635507 + }, + { + "acc": 0.65287189, + "epoch": 0.9931506849315068, + "grad_norm": 5.40625, + "learning_rate": 5.469301315818183e-06, + "loss": 1.63992882, + "memory(GiB)": 117.38, + "step": 39150, + "train_speed(iter/s)": 1.635529 + }, + { + "acc": 0.65749435, + "epoch": 0.9932775240994419, + "grad_norm": 5.09375, + "learning_rate": 5.468257311099399e-06, + "loss": 1.49845467, + "memory(GiB)": 117.38, + "step": 39155, + "train_speed(iter/s)": 1.635552 + }, + { + "acc": 0.67615004, + "epoch": 0.9934043632673769, + "grad_norm": 7.03125, + "learning_rate": 5.467213285784567e-06, + "loss": 1.54736443, + "memory(GiB)": 117.38, + "step": 39160, + "train_speed(iter/s)": 1.635576 + }, + { + "acc": 0.65445733, + "epoch": 0.993531202435312, + "grad_norm": 5.8125, + "learning_rate": 5.466169239919608e-06, + "loss": 1.67408276, + "memory(GiB)": 117.38, + "step": 39165, + "train_speed(iter/s)": 1.6356 + }, + { + "acc": 0.65126104, + "epoch": 0.9936580416032471, + "grad_norm": 6.125, + "learning_rate": 5.465125173550446e-06, + "loss": 1.65996876, + "memory(GiB)": 117.38, + "step": 39170, + "train_speed(iter/s)": 1.635623 + }, + { + "acc": 0.65474653, + "epoch": 0.9937848807711821, + "grad_norm": 6.375, + "learning_rate": 5.464081086723001e-06, + "loss": 1.64905415, + "memory(GiB)": 117.38, + "step": 39175, + "train_speed(iter/s)": 1.635647 + }, + { + "acc": 0.64159436, + "epoch": 0.9939117199391172, + "grad_norm": 5.90625, + "learning_rate": 5.4630369794832006e-06, + "loss": 1.66727028, + "memory(GiB)": 117.38, + "step": 39180, + "train_speed(iter/s)": 1.635669 + }, + { + "acc": 0.66229115, + "epoch": 0.9940385591070523, + "grad_norm": 5.9375, + "learning_rate": 5.461992851876963e-06, + "loss": 1.6150219, + "memory(GiB)": 117.38, + "step": 39185, + "train_speed(iter/s)": 1.63569 + }, + { + "acc": 0.65428801, + "epoch": 0.9941653982749873, + "grad_norm": 8.4375, + "learning_rate": 5.460948703950218e-06, + "loss": 1.68495903, + "memory(GiB)": 117.38, + "step": 39190, + "train_speed(iter/s)": 1.635712 + }, + { + "acc": 0.65292454, + "epoch": 0.9942922374429224, + "grad_norm": 6.375, + "learning_rate": 5.459904535748892e-06, + "loss": 1.64405212, + "memory(GiB)": 117.38, + "step": 39195, + "train_speed(iter/s)": 1.635736 + }, + { + "acc": 0.68249865, + "epoch": 0.9944190766108574, + "grad_norm": 5.90625, + "learning_rate": 5.458860347318912e-06, + "loss": 1.50441771, + "memory(GiB)": 117.38, + "step": 39200, + "train_speed(iter/s)": 1.635759 + }, + { + "acc": 0.65553164, + "epoch": 0.9945459157787925, + "grad_norm": 6.15625, + "learning_rate": 5.457816138706203e-06, + "loss": 1.65315666, + "memory(GiB)": 117.38, + "step": 39205, + "train_speed(iter/s)": 1.635783 + }, + { + "acc": 0.66222887, + "epoch": 0.9946727549467276, + "grad_norm": 8.8125, + "learning_rate": 5.456771909956697e-06, + "loss": 1.60525055, + "memory(GiB)": 117.38, + "step": 39210, + "train_speed(iter/s)": 1.635806 + }, + { + "acc": 0.6699338, + "epoch": 0.9947995941146626, + "grad_norm": 6.90625, + "learning_rate": 5.455727661116324e-06, + "loss": 1.56743908, + "memory(GiB)": 117.38, + "step": 39215, + "train_speed(iter/s)": 1.635831 + }, + { + "acc": 0.66158605, + "epoch": 0.9949264332825977, + "grad_norm": 15.1875, + "learning_rate": 5.454683392231014e-06, + "loss": 1.63190041, + "memory(GiB)": 117.38, + "step": 39220, + "train_speed(iter/s)": 1.635854 + }, + { + "acc": 0.66914868, + "epoch": 0.9950532724505328, + "grad_norm": 5.5, + "learning_rate": 5.453639103346697e-06, + "loss": 1.55317698, + "memory(GiB)": 117.38, + "step": 39225, + "train_speed(iter/s)": 1.635877 + }, + { + "acc": 0.6589571, + "epoch": 0.9951801116184678, + "grad_norm": 7.375, + "learning_rate": 5.452594794509307e-06, + "loss": 1.53998489, + "memory(GiB)": 117.38, + "step": 39230, + "train_speed(iter/s)": 1.6359 + }, + { + "acc": 0.66344757, + "epoch": 0.9953069507864029, + "grad_norm": 5.625, + "learning_rate": 5.4515504657647765e-06, + "loss": 1.62008419, + "memory(GiB)": 117.38, + "step": 39235, + "train_speed(iter/s)": 1.635923 + }, + { + "acc": 0.65448818, + "epoch": 0.9954337899543378, + "grad_norm": 5.53125, + "learning_rate": 5.450506117159044e-06, + "loss": 1.68340836, + "memory(GiB)": 117.38, + "step": 39240, + "train_speed(iter/s)": 1.635946 + }, + { + "acc": 0.65918422, + "epoch": 0.9955606291222729, + "grad_norm": 7.1875, + "learning_rate": 5.449461748738037e-06, + "loss": 1.62783661, + "memory(GiB)": 117.38, + "step": 39245, + "train_speed(iter/s)": 1.635969 + }, + { + "acc": 0.66453824, + "epoch": 0.995687468290208, + "grad_norm": 6.09375, + "learning_rate": 5.448417360547699e-06, + "loss": 1.50588951, + "memory(GiB)": 117.38, + "step": 39250, + "train_speed(iter/s)": 1.635991 + }, + { + "acc": 0.66352844, + "epoch": 0.995814307458143, + "grad_norm": 5.28125, + "learning_rate": 5.44737295263396e-06, + "loss": 1.53560629, + "memory(GiB)": 117.38, + "step": 39255, + "train_speed(iter/s)": 1.636014 + }, + { + "acc": 0.64515104, + "epoch": 0.9959411466260781, + "grad_norm": 5.84375, + "learning_rate": 5.446328525042764e-06, + "loss": 1.67365685, + "memory(GiB)": 117.38, + "step": 39260, + "train_speed(iter/s)": 1.636036 + }, + { + "acc": 0.65106459, + "epoch": 0.9960679857940132, + "grad_norm": 6.40625, + "learning_rate": 5.4452840778200456e-06, + "loss": 1.6498579, + "memory(GiB)": 117.38, + "step": 39265, + "train_speed(iter/s)": 1.63606 + }, + { + "acc": 0.65146074, + "epoch": 0.9961948249619482, + "grad_norm": 6.59375, + "learning_rate": 5.444239611011746e-06, + "loss": 1.59827147, + "memory(GiB)": 117.38, + "step": 39270, + "train_speed(iter/s)": 1.636083 + }, + { + "acc": 0.65056009, + "epoch": 0.9963216641298833, + "grad_norm": 4.875, + "learning_rate": 5.443195124663804e-06, + "loss": 1.65174198, + "memory(GiB)": 117.38, + "step": 39275, + "train_speed(iter/s)": 1.636106 + }, + { + "acc": 0.65349693, + "epoch": 0.9964485032978183, + "grad_norm": 6.8125, + "learning_rate": 5.442150618822162e-06, + "loss": 1.61036835, + "memory(GiB)": 117.38, + "step": 39280, + "train_speed(iter/s)": 1.63613 + }, + { + "acc": 0.66394897, + "epoch": 0.9965753424657534, + "grad_norm": 5.15625, + "learning_rate": 5.441106093532762e-06, + "loss": 1.64434776, + "memory(GiB)": 117.38, + "step": 39285, + "train_speed(iter/s)": 1.636152 + }, + { + "acc": 0.66421161, + "epoch": 0.9967021816336885, + "grad_norm": 4.71875, + "learning_rate": 5.440061548841546e-06, + "loss": 1.57276306, + "memory(GiB)": 117.38, + "step": 39290, + "train_speed(iter/s)": 1.636177 + }, + { + "acc": 0.66076994, + "epoch": 0.9968290208016235, + "grad_norm": 6.90625, + "learning_rate": 5.43901698479446e-06, + "loss": 1.60429821, + "memory(GiB)": 117.38, + "step": 39295, + "train_speed(iter/s)": 1.636201 + }, + { + "acc": 0.63934999, + "epoch": 0.9969558599695586, + "grad_norm": 5.75, + "learning_rate": 5.4379724014374455e-06, + "loss": 1.65393124, + "memory(GiB)": 117.38, + "step": 39300, + "train_speed(iter/s)": 1.636227 + }, + { + "acc": 0.64348688, + "epoch": 0.9970826991374937, + "grad_norm": 4.90625, + "learning_rate": 5.436927798816448e-06, + "loss": 1.63104057, + "memory(GiB)": 117.38, + "step": 39305, + "train_speed(iter/s)": 1.63625 + }, + { + "acc": 0.67017736, + "epoch": 0.9972095383054287, + "grad_norm": 11.5, + "learning_rate": 5.4358831769774174e-06, + "loss": 1.54743223, + "memory(GiB)": 117.38, + "step": 39310, + "train_speed(iter/s)": 1.636273 + }, + { + "acc": 0.65907726, + "epoch": 0.9973363774733638, + "grad_norm": 5.15625, + "learning_rate": 5.434838535966298e-06, + "loss": 1.65316467, + "memory(GiB)": 117.38, + "step": 39315, + "train_speed(iter/s)": 1.636295 + }, + { + "acc": 0.64373393, + "epoch": 0.9974632166412988, + "grad_norm": 6.34375, + "learning_rate": 5.43379387582904e-06, + "loss": 1.61678085, + "memory(GiB)": 117.38, + "step": 39320, + "train_speed(iter/s)": 1.636321 + }, + { + "acc": 0.65938292, + "epoch": 0.9975900558092339, + "grad_norm": 5.3125, + "learning_rate": 5.432749196611587e-06, + "loss": 1.5760704, + "memory(GiB)": 117.38, + "step": 39325, + "train_speed(iter/s)": 1.636344 + }, + { + "acc": 0.65438557, + "epoch": 0.997716894977169, + "grad_norm": 5.71875, + "learning_rate": 5.431704498359896e-06, + "loss": 1.58403225, + "memory(GiB)": 117.38, + "step": 39330, + "train_speed(iter/s)": 1.636369 + }, + { + "acc": 0.66184835, + "epoch": 0.997843734145104, + "grad_norm": 6.78125, + "learning_rate": 5.43065978111991e-06, + "loss": 1.60483894, + "memory(GiB)": 117.38, + "step": 39335, + "train_speed(iter/s)": 1.636391 + }, + { + "acc": 0.65278807, + "epoch": 0.9979705733130391, + "grad_norm": 7.15625, + "learning_rate": 5.429615044937586e-06, + "loss": 1.57684555, + "memory(GiB)": 117.38, + "step": 39340, + "train_speed(iter/s)": 1.636416 + }, + { + "acc": 0.65417552, + "epoch": 0.9980974124809742, + "grad_norm": 5.5625, + "learning_rate": 5.4285702898588754e-06, + "loss": 1.6590395, + "memory(GiB)": 117.38, + "step": 39345, + "train_speed(iter/s)": 1.636438 + }, + { + "acc": 0.65630341, + "epoch": 0.9982242516489092, + "grad_norm": 5.1875, + "learning_rate": 5.427525515929729e-06, + "loss": 1.5860218, + "memory(GiB)": 117.38, + "step": 39350, + "train_speed(iter/s)": 1.636463 + }, + { + "acc": 0.67345562, + "epoch": 0.9983510908168443, + "grad_norm": 6.625, + "learning_rate": 5.426480723196102e-06, + "loss": 1.49641132, + "memory(GiB)": 117.38, + "step": 39355, + "train_speed(iter/s)": 1.636487 + }, + { + "acc": 0.6565259, + "epoch": 0.9984779299847792, + "grad_norm": 5.5625, + "learning_rate": 5.425435911703948e-06, + "loss": 1.60635338, + "memory(GiB)": 117.38, + "step": 39360, + "train_speed(iter/s)": 1.636509 + }, + { + "acc": 0.64514771, + "epoch": 0.9986047691527143, + "grad_norm": 6.125, + "learning_rate": 5.424391081499223e-06, + "loss": 1.61983109, + "memory(GiB)": 117.38, + "step": 39365, + "train_speed(iter/s)": 1.636532 + }, + { + "acc": 0.65590596, + "epoch": 0.9987316083206494, + "grad_norm": 5.0, + "learning_rate": 5.423346232627884e-06, + "loss": 1.60533066, + "memory(GiB)": 117.38, + "step": 39370, + "train_speed(iter/s)": 1.636556 + }, + { + "acc": 0.64388051, + "epoch": 0.9988584474885844, + "grad_norm": 6.71875, + "learning_rate": 5.422301365135887e-06, + "loss": 1.66110821, + "memory(GiB)": 117.38, + "step": 39375, + "train_speed(iter/s)": 1.636579 + }, + { + "acc": 0.63931942, + "epoch": 0.9989852866565195, + "grad_norm": 6.21875, + "learning_rate": 5.421256479069191e-06, + "loss": 1.62837257, + "memory(GiB)": 117.38, + "step": 39380, + "train_speed(iter/s)": 1.636604 + }, + { + "acc": 0.66496649, + "epoch": 0.9991121258244546, + "grad_norm": 6.5, + "learning_rate": 5.420211574473754e-06, + "loss": 1.54188709, + "memory(GiB)": 117.38, + "step": 39385, + "train_speed(iter/s)": 1.636629 + }, + { + "acc": 0.65972557, + "epoch": 0.9992389649923896, + "grad_norm": 5.5, + "learning_rate": 5.419166651395536e-06, + "loss": 1.57559299, + "memory(GiB)": 117.38, + "step": 39390, + "train_speed(iter/s)": 1.636652 + }, + { + "acc": 0.66274433, + "epoch": 0.9993658041603247, + "grad_norm": 6.34375, + "learning_rate": 5.418121709880497e-06, + "loss": 1.57407169, + "memory(GiB)": 117.38, + "step": 39395, + "train_speed(iter/s)": 1.636676 + }, + { + "acc": 0.65294142, + "epoch": 0.9994926433282597, + "grad_norm": 6.375, + "learning_rate": 5.4170767499746e-06, + "loss": 1.59775505, + "memory(GiB)": 117.38, + "step": 39400, + "train_speed(iter/s)": 1.636699 + }, + { + "acc": 0.65430641, + "epoch": 0.9996194824961948, + "grad_norm": 6.34375, + "learning_rate": 5.416031771723803e-06, + "loss": 1.63736858, + "memory(GiB)": 117.38, + "step": 39405, + "train_speed(iter/s)": 1.636723 + }, + { + "acc": 0.65762796, + "epoch": 0.9997463216641299, + "grad_norm": 7.1875, + "learning_rate": 5.414986775174073e-06, + "loss": 1.57327948, + "memory(GiB)": 117.38, + "step": 39410, + "train_speed(iter/s)": 1.636748 + }, + { + "acc": 0.66443744, + "epoch": 0.9998731608320649, + "grad_norm": 5.0625, + "learning_rate": 5.41394176037137e-06, + "loss": 1.57263412, + "memory(GiB)": 117.38, + "step": 39415, + "train_speed(iter/s)": 1.636771 + }, + { + "acc": 0.64962502, + "epoch": 1.0, + "grad_norm": 6.375, + "learning_rate": 5.412896727361663e-06, + "loss": 1.66451626, + "memory(GiB)": 117.38, + "step": 39420, + "train_speed(iter/s)": 1.636743 + }, + { + "acc": 0.66950126, + "epoch": 1.000126839167935, + "grad_norm": 5.90625, + "learning_rate": 5.411851676190912e-06, + "loss": 1.56461639, + "memory(GiB)": 117.38, + "step": 39425, + "train_speed(iter/s)": 1.636739 + }, + { + "acc": 0.64897671, + "epoch": 1.0002536783358702, + "grad_norm": 5.4375, + "learning_rate": 5.4108066069050864e-06, + "loss": 1.63193035, + "memory(GiB)": 117.38, + "step": 39430, + "train_speed(iter/s)": 1.636762 + }, + { + "acc": 0.62981844, + "epoch": 1.0003805175038052, + "grad_norm": 5.34375, + "learning_rate": 5.409761519550153e-06, + "loss": 1.65145054, + "memory(GiB)": 117.38, + "step": 39435, + "train_speed(iter/s)": 1.636783 + }, + { + "acc": 0.6696702, + "epoch": 1.0005073566717402, + "grad_norm": 6.125, + "learning_rate": 5.408716414172077e-06, + "loss": 1.55201426, + "memory(GiB)": 117.38, + "step": 39440, + "train_speed(iter/s)": 1.636804 + }, + { + "acc": 0.66762285, + "epoch": 1.0006341958396754, + "grad_norm": 7.0625, + "learning_rate": 5.407671290816829e-06, + "loss": 1.60498619, + "memory(GiB)": 117.38, + "step": 39445, + "train_speed(iter/s)": 1.636826 + }, + { + "acc": 0.63307271, + "epoch": 1.0007610350076104, + "grad_norm": 5.84375, + "learning_rate": 5.406626149530378e-06, + "loss": 1.72459278, + "memory(GiB)": 117.38, + "step": 39450, + "train_speed(iter/s)": 1.636847 + }, + { + "acc": 0.67057791, + "epoch": 1.0008878741755454, + "grad_norm": 4.96875, + "learning_rate": 5.405580990358692e-06, + "loss": 1.51603107, + "memory(GiB)": 117.38, + "step": 39455, + "train_speed(iter/s)": 1.636868 + }, + { + "acc": 0.66682401, + "epoch": 1.0010147133434804, + "grad_norm": 7.0625, + "learning_rate": 5.404535813347746e-06, + "loss": 1.57820301, + "memory(GiB)": 117.38, + "step": 39460, + "train_speed(iter/s)": 1.636891 + }, + { + "acc": 0.66099606, + "epoch": 1.0011415525114156, + "grad_norm": 8.1875, + "learning_rate": 5.403490618543505e-06, + "loss": 1.63314533, + "memory(GiB)": 117.38, + "step": 39465, + "train_speed(iter/s)": 1.636913 + }, + { + "acc": 0.67097068, + "epoch": 1.0012683916793506, + "grad_norm": 5.96875, + "learning_rate": 5.40244540599195e-06, + "loss": 1.56456203, + "memory(GiB)": 117.38, + "step": 39470, + "train_speed(iter/s)": 1.636936 + }, + { + "acc": 0.66049309, + "epoch": 1.0013952308472855, + "grad_norm": 5.0, + "learning_rate": 5.401400175739045e-06, + "loss": 1.63894768, + "memory(GiB)": 117.38, + "step": 39475, + "train_speed(iter/s)": 1.636959 + }, + { + "acc": 0.64392538, + "epoch": 1.0015220700152208, + "grad_norm": 6.8125, + "learning_rate": 5.400354927830769e-06, + "loss": 1.64915848, + "memory(GiB)": 117.38, + "step": 39480, + "train_speed(iter/s)": 1.636985 + }, + { + "acc": 0.66596642, + "epoch": 1.0016489091831557, + "grad_norm": 6.46875, + "learning_rate": 5.399309662313097e-06, + "loss": 1.56436806, + "memory(GiB)": 117.38, + "step": 39485, + "train_speed(iter/s)": 1.637008 + }, + { + "acc": 0.64307508, + "epoch": 1.0017757483510907, + "grad_norm": 5.6875, + "learning_rate": 5.3982643792320024e-06, + "loss": 1.61942768, + "memory(GiB)": 117.38, + "step": 39490, + "train_speed(iter/s)": 1.63703 + }, + { + "acc": 0.65330992, + "epoch": 1.001902587519026, + "grad_norm": 5.09375, + "learning_rate": 5.397219078633462e-06, + "loss": 1.59372082, + "memory(GiB)": 117.38, + "step": 39495, + "train_speed(iter/s)": 1.637052 + }, + { + "acc": 0.65937791, + "epoch": 1.002029426686961, + "grad_norm": 6.6875, + "learning_rate": 5.3961737605634546e-06, + "loss": 1.61160278, + "memory(GiB)": 117.38, + "step": 39500, + "train_speed(iter/s)": 1.637075 + }, + { + "acc": 0.66893864, + "epoch": 1.002156265854896, + "grad_norm": 5.90625, + "learning_rate": 5.395128425067954e-06, + "loss": 1.60031319, + "memory(GiB)": 117.38, + "step": 39505, + "train_speed(iter/s)": 1.637098 + }, + { + "acc": 0.68094864, + "epoch": 1.0022831050228311, + "grad_norm": 5.0625, + "learning_rate": 5.394083072192944e-06, + "loss": 1.51652832, + "memory(GiB)": 117.38, + "step": 39510, + "train_speed(iter/s)": 1.637121 + }, + { + "acc": 0.67578306, + "epoch": 1.0024099441907661, + "grad_norm": 6.3125, + "learning_rate": 5.393037701984399e-06, + "loss": 1.51201839, + "memory(GiB)": 117.38, + "step": 39515, + "train_speed(iter/s)": 1.637143 + }, + { + "acc": 0.64685206, + "epoch": 1.0025367833587011, + "grad_norm": 5.21875, + "learning_rate": 5.391992314488303e-06, + "loss": 1.58944092, + "memory(GiB)": 117.38, + "step": 39520, + "train_speed(iter/s)": 1.637166 + }, + { + "acc": 0.64986658, + "epoch": 1.0026636225266363, + "grad_norm": 5.0625, + "learning_rate": 5.3909469097506314e-06, + "loss": 1.59459209, + "memory(GiB)": 117.38, + "step": 39525, + "train_speed(iter/s)": 1.637188 + }, + { + "acc": 0.63995323, + "epoch": 1.0027904616945713, + "grad_norm": 5.96875, + "learning_rate": 5.389901487817373e-06, + "loss": 1.62200928, + "memory(GiB)": 117.38, + "step": 39530, + "train_speed(iter/s)": 1.637211 + }, + { + "acc": 0.6651732, + "epoch": 1.0029173008625063, + "grad_norm": 6.25, + "learning_rate": 5.388856048734505e-06, + "loss": 1.60574818, + "memory(GiB)": 117.38, + "step": 39535, + "train_speed(iter/s)": 1.637235 + }, + { + "acc": 0.6459837, + "epoch": 1.0030441400304415, + "grad_norm": 5.5625, + "learning_rate": 5.3878105925480115e-06, + "loss": 1.70576916, + "memory(GiB)": 117.38, + "step": 39540, + "train_speed(iter/s)": 1.637258 + }, + { + "acc": 0.66164217, + "epoch": 1.0031709791983765, + "grad_norm": 5.03125, + "learning_rate": 5.3867651193038765e-06, + "loss": 1.60276623, + "memory(GiB)": 117.38, + "step": 39545, + "train_speed(iter/s)": 1.63728 + }, + { + "acc": 0.65812364, + "epoch": 1.0032978183663115, + "grad_norm": 5.625, + "learning_rate": 5.385719629048086e-06, + "loss": 1.5767518, + "memory(GiB)": 117.38, + "step": 39550, + "train_speed(iter/s)": 1.637302 + }, + { + "acc": 0.64759388, + "epoch": 1.0034246575342465, + "grad_norm": 4.96875, + "learning_rate": 5.384674121826622e-06, + "loss": 1.64803886, + "memory(GiB)": 117.38, + "step": 39555, + "train_speed(iter/s)": 1.637324 + }, + { + "acc": 0.6585988, + "epoch": 1.0035514967021817, + "grad_norm": 6.28125, + "learning_rate": 5.383628597685474e-06, + "loss": 1.58061104, + "memory(GiB)": 117.38, + "step": 39560, + "train_speed(iter/s)": 1.637348 + }, + { + "acc": 0.67884254, + "epoch": 1.0036783358701167, + "grad_norm": 5.75, + "learning_rate": 5.382583056670627e-06, + "loss": 1.52536678, + "memory(GiB)": 117.38, + "step": 39565, + "train_speed(iter/s)": 1.63737 + }, + { + "acc": 0.65091953, + "epoch": 1.0038051750380517, + "grad_norm": 6.6875, + "learning_rate": 5.38153749882807e-06, + "loss": 1.63830948, + "memory(GiB)": 117.38, + "step": 39570, + "train_speed(iter/s)": 1.637394 + }, + { + "acc": 0.65617323, + "epoch": 1.0039320142059869, + "grad_norm": 6.15625, + "learning_rate": 5.38049192420379e-06, + "loss": 1.58975906, + "memory(GiB)": 117.38, + "step": 39575, + "train_speed(iter/s)": 1.637416 + }, + { + "acc": 0.65104179, + "epoch": 1.0040588533739219, + "grad_norm": 4.53125, + "learning_rate": 5.3794463328437766e-06, + "loss": 1.60249252, + "memory(GiB)": 117.38, + "step": 39580, + "train_speed(iter/s)": 1.637439 + }, + { + "acc": 0.66326785, + "epoch": 1.0041856925418569, + "grad_norm": 6.6875, + "learning_rate": 5.3784007247940185e-06, + "loss": 1.52351313, + "memory(GiB)": 117.38, + "step": 39585, + "train_speed(iter/s)": 1.637459 + }, + { + "acc": 0.65451446, + "epoch": 1.004312531709792, + "grad_norm": 9.0625, + "learning_rate": 5.377355100100508e-06, + "loss": 1.53261662, + "memory(GiB)": 117.38, + "step": 39590, + "train_speed(iter/s)": 1.63748 + }, + { + "acc": 0.66253719, + "epoch": 1.004439370877727, + "grad_norm": 6.625, + "learning_rate": 5.376309458809235e-06, + "loss": 1.5595974, + "memory(GiB)": 117.38, + "step": 39595, + "train_speed(iter/s)": 1.637501 + }, + { + "acc": 0.64328623, + "epoch": 1.004566210045662, + "grad_norm": 6.03125, + "learning_rate": 5.375263800966192e-06, + "loss": 1.62471447, + "memory(GiB)": 117.38, + "step": 39600, + "train_speed(iter/s)": 1.637521 + }, + { + "acc": 0.64843526, + "epoch": 1.0046930492135973, + "grad_norm": 7.34375, + "learning_rate": 5.374218126617371e-06, + "loss": 1.57931347, + "memory(GiB)": 117.38, + "step": 39605, + "train_speed(iter/s)": 1.63754 + }, + { + "acc": 0.6540575, + "epoch": 1.0048198883815322, + "grad_norm": 5.21875, + "learning_rate": 5.373172435808768e-06, + "loss": 1.62933846, + "memory(GiB)": 117.38, + "step": 39610, + "train_speed(iter/s)": 1.637562 + }, + { + "acc": 0.65167828, + "epoch": 1.0049467275494672, + "grad_norm": 6.625, + "learning_rate": 5.372126728586372e-06, + "loss": 1.64481163, + "memory(GiB)": 117.38, + "step": 39615, + "train_speed(iter/s)": 1.637583 + }, + { + "acc": 0.6512394, + "epoch": 1.0050735667174022, + "grad_norm": 5.375, + "learning_rate": 5.371081004996184e-06, + "loss": 1.57376308, + "memory(GiB)": 117.38, + "step": 39620, + "train_speed(iter/s)": 1.637602 + }, + { + "acc": 0.64877143, + "epoch": 1.0052004058853374, + "grad_norm": 5.34375, + "learning_rate": 5.370035265084195e-06, + "loss": 1.59633064, + "memory(GiB)": 117.38, + "step": 39625, + "train_speed(iter/s)": 1.637623 + }, + { + "acc": 0.65403261, + "epoch": 1.0053272450532724, + "grad_norm": 6.71875, + "learning_rate": 5.3689895088964025e-06, + "loss": 1.63066387, + "memory(GiB)": 117.38, + "step": 39630, + "train_speed(iter/s)": 1.637644 + }, + { + "acc": 0.64735985, + "epoch": 1.0054540842212074, + "grad_norm": 6.0, + "learning_rate": 5.367943736478806e-06, + "loss": 1.62418671, + "memory(GiB)": 117.38, + "step": 39635, + "train_speed(iter/s)": 1.637664 + }, + { + "acc": 0.64825654, + "epoch": 1.0055809233891426, + "grad_norm": 5.875, + "learning_rate": 5.3668979478774e-06, + "loss": 1.67106895, + "memory(GiB)": 117.38, + "step": 39640, + "train_speed(iter/s)": 1.637685 + }, + { + "acc": 0.66372509, + "epoch": 1.0057077625570776, + "grad_norm": 5.71875, + "learning_rate": 5.3658521431381836e-06, + "loss": 1.56859446, + "memory(GiB)": 117.38, + "step": 39645, + "train_speed(iter/s)": 1.637708 + }, + { + "acc": 0.65510969, + "epoch": 1.0058346017250126, + "grad_norm": 5.875, + "learning_rate": 5.364806322307158e-06, + "loss": 1.58570452, + "memory(GiB)": 117.38, + "step": 39650, + "train_speed(iter/s)": 1.637731 + }, + { + "acc": 0.64684038, + "epoch": 1.0059614408929478, + "grad_norm": 5.9375, + "learning_rate": 5.363760485430321e-06, + "loss": 1.6566288, + "memory(GiB)": 117.38, + "step": 39655, + "train_speed(iter/s)": 1.63775 + }, + { + "acc": 0.66637049, + "epoch": 1.0060882800608828, + "grad_norm": 5.8125, + "learning_rate": 5.3627146325536725e-06, + "loss": 1.62865295, + "memory(GiB)": 117.38, + "step": 39660, + "train_speed(iter/s)": 1.637772 + }, + { + "acc": 0.67953014, + "epoch": 1.0062151192288178, + "grad_norm": 5.96875, + "learning_rate": 5.361668763723216e-06, + "loss": 1.46089296, + "memory(GiB)": 117.38, + "step": 39665, + "train_speed(iter/s)": 1.637793 + }, + { + "acc": 0.65385904, + "epoch": 1.006341958396753, + "grad_norm": 5.46875, + "learning_rate": 5.360622878984954e-06, + "loss": 1.57221041, + "memory(GiB)": 117.38, + "step": 39670, + "train_speed(iter/s)": 1.637814 + }, + { + "acc": 0.65011883, + "epoch": 1.006468797564688, + "grad_norm": 5.625, + "learning_rate": 5.359576978384885e-06, + "loss": 1.62987633, + "memory(GiB)": 117.38, + "step": 39675, + "train_speed(iter/s)": 1.637836 + }, + { + "acc": 0.66177073, + "epoch": 1.006595636732623, + "grad_norm": 5.375, + "learning_rate": 5.358531061969018e-06, + "loss": 1.56385326, + "memory(GiB)": 117.38, + "step": 39680, + "train_speed(iter/s)": 1.637858 + }, + { + "acc": 0.65169616, + "epoch": 1.0067224759005582, + "grad_norm": 6.1875, + "learning_rate": 5.357485129783351e-06, + "loss": 1.67378426, + "memory(GiB)": 117.38, + "step": 39685, + "train_speed(iter/s)": 1.637879 + }, + { + "acc": 0.65342159, + "epoch": 1.0068493150684932, + "grad_norm": 5.9375, + "learning_rate": 5.356439181873895e-06, + "loss": 1.60303345, + "memory(GiB)": 117.38, + "step": 39690, + "train_speed(iter/s)": 1.637901 + }, + { + "acc": 0.66733513, + "epoch": 1.0069761542364282, + "grad_norm": 5.625, + "learning_rate": 5.35539321828665e-06, + "loss": 1.52794981, + "memory(GiB)": 117.38, + "step": 39695, + "train_speed(iter/s)": 1.637922 + }, + { + "acc": 0.65667996, + "epoch": 1.0071029934043634, + "grad_norm": 5.1875, + "learning_rate": 5.354347239067625e-06, + "loss": 1.57421608, + "memory(GiB)": 117.38, + "step": 39700, + "train_speed(iter/s)": 1.637942 + }, + { + "acc": 0.66278677, + "epoch": 1.0072298325722984, + "grad_norm": 6.0, + "learning_rate": 5.3533012442628275e-06, + "loss": 1.52947922, + "memory(GiB)": 117.38, + "step": 39705, + "train_speed(iter/s)": 1.637964 + }, + { + "acc": 0.66822367, + "epoch": 1.0073566717402334, + "grad_norm": 8.125, + "learning_rate": 5.3522552339182635e-06, + "loss": 1.51784086, + "memory(GiB)": 117.38, + "step": 39710, + "train_speed(iter/s)": 1.637985 + }, + { + "acc": 0.65919166, + "epoch": 1.0074835109081683, + "grad_norm": 5.875, + "learning_rate": 5.351209208079941e-06, + "loss": 1.62209034, + "memory(GiB)": 117.38, + "step": 39715, + "train_speed(iter/s)": 1.638008 + }, + { + "acc": 0.65563951, + "epoch": 1.0076103500761036, + "grad_norm": 6.4375, + "learning_rate": 5.35016316679387e-06, + "loss": 1.60887737, + "memory(GiB)": 117.38, + "step": 39720, + "train_speed(iter/s)": 1.63803 + }, + { + "acc": 0.66711435, + "epoch": 1.0077371892440385, + "grad_norm": 7.0, + "learning_rate": 5.349117110106059e-06, + "loss": 1.49012108, + "memory(GiB)": 117.38, + "step": 39725, + "train_speed(iter/s)": 1.638054 + }, + { + "acc": 0.65878925, + "epoch": 1.0078640284119735, + "grad_norm": 6.375, + "learning_rate": 5.34807103806252e-06, + "loss": 1.59575462, + "memory(GiB)": 117.38, + "step": 39730, + "train_speed(iter/s)": 1.638076 + }, + { + "acc": 0.64732299, + "epoch": 1.0079908675799087, + "grad_norm": 5.71875, + "learning_rate": 5.347024950709262e-06, + "loss": 1.60237503, + "memory(GiB)": 117.38, + "step": 39735, + "train_speed(iter/s)": 1.638098 + }, + { + "acc": 0.64321766, + "epoch": 1.0081177067478437, + "grad_norm": 5.53125, + "learning_rate": 5.345978848092297e-06, + "loss": 1.66265335, + "memory(GiB)": 117.38, + "step": 39740, + "train_speed(iter/s)": 1.638121 + }, + { + "acc": 0.65173054, + "epoch": 1.0082445459157787, + "grad_norm": 7.625, + "learning_rate": 5.344932730257637e-06, + "loss": 1.58735352, + "memory(GiB)": 117.38, + "step": 39745, + "train_speed(iter/s)": 1.638143 + }, + { + "acc": 0.6556314, + "epoch": 1.008371385083714, + "grad_norm": 6.875, + "learning_rate": 5.343886597251298e-06, + "loss": 1.60826073, + "memory(GiB)": 117.38, + "step": 39750, + "train_speed(iter/s)": 1.638165 + }, + { + "acc": 0.66044836, + "epoch": 1.008498224251649, + "grad_norm": 5.375, + "learning_rate": 5.342840449119287e-06, + "loss": 1.64020424, + "memory(GiB)": 117.38, + "step": 39755, + "train_speed(iter/s)": 1.638187 + }, + { + "acc": 0.65782261, + "epoch": 1.008625063419584, + "grad_norm": 6.03125, + "learning_rate": 5.341794285907627e-06, + "loss": 1.60111179, + "memory(GiB)": 117.38, + "step": 39760, + "train_speed(iter/s)": 1.638209 + }, + { + "acc": 0.64857674, + "epoch": 1.0087519025875191, + "grad_norm": 5.84375, + "learning_rate": 5.340748107662324e-06, + "loss": 1.66032734, + "memory(GiB)": 117.38, + "step": 39765, + "train_speed(iter/s)": 1.638232 + }, + { + "acc": 0.65215044, + "epoch": 1.0088787417554541, + "grad_norm": 5.46875, + "learning_rate": 5.339701914429402e-06, + "loss": 1.62041969, + "memory(GiB)": 117.38, + "step": 39770, + "train_speed(iter/s)": 1.638253 + }, + { + "acc": 0.647154, + "epoch": 1.009005580923389, + "grad_norm": 5.96875, + "learning_rate": 5.338655706254871e-06, + "loss": 1.72091618, + "memory(GiB)": 117.38, + "step": 39775, + "train_speed(iter/s)": 1.638276 + }, + { + "acc": 0.6473125, + "epoch": 1.009132420091324, + "grad_norm": 4.625, + "learning_rate": 5.33760948318475e-06, + "loss": 1.61755104, + "memory(GiB)": 117.38, + "step": 39780, + "train_speed(iter/s)": 1.638299 + }, + { + "acc": 0.6669251, + "epoch": 1.0092592592592593, + "grad_norm": 6.09375, + "learning_rate": 5.336563245265056e-06, + "loss": 1.63113251, + "memory(GiB)": 117.38, + "step": 39785, + "train_speed(iter/s)": 1.63832 + }, + { + "acc": 0.64419541, + "epoch": 1.0093860984271943, + "grad_norm": 6.75, + "learning_rate": 5.3355169925418095e-06, + "loss": 1.69696808, + "memory(GiB)": 117.38, + "step": 39790, + "train_speed(iter/s)": 1.638342 + }, + { + "acc": 0.65545344, + "epoch": 1.0095129375951293, + "grad_norm": 5.1875, + "learning_rate": 5.334470725061027e-06, + "loss": 1.64371738, + "memory(GiB)": 117.38, + "step": 39795, + "train_speed(iter/s)": 1.638364 + }, + { + "acc": 0.65667014, + "epoch": 1.0096397767630645, + "grad_norm": 5.625, + "learning_rate": 5.333424442868729e-06, + "loss": 1.63895397, + "memory(GiB)": 117.38, + "step": 39800, + "train_speed(iter/s)": 1.638386 + }, + { + "acc": 0.65159626, + "epoch": 1.0097666159309995, + "grad_norm": 6.5625, + "learning_rate": 5.3323781460109345e-06, + "loss": 1.65605927, + "memory(GiB)": 117.38, + "step": 39805, + "train_speed(iter/s)": 1.638407 + }, + { + "acc": 0.65893574, + "epoch": 1.0098934550989345, + "grad_norm": 5.9375, + "learning_rate": 5.3313318345336665e-06, + "loss": 1.59443979, + "memory(GiB)": 117.38, + "step": 39810, + "train_speed(iter/s)": 1.63843 + }, + { + "acc": 0.66861, + "epoch": 1.0100202942668697, + "grad_norm": 6.1875, + "learning_rate": 5.330285508482944e-06, + "loss": 1.58127804, + "memory(GiB)": 117.38, + "step": 39815, + "train_speed(iter/s)": 1.638453 + }, + { + "acc": 0.6593791, + "epoch": 1.0101471334348047, + "grad_norm": 9.125, + "learning_rate": 5.3292391679047905e-06, + "loss": 1.58990421, + "memory(GiB)": 117.38, + "step": 39820, + "train_speed(iter/s)": 1.638476 + }, + { + "acc": 0.65348225, + "epoch": 1.0102739726027397, + "grad_norm": 5.09375, + "learning_rate": 5.328192812845228e-06, + "loss": 1.64288521, + "memory(GiB)": 117.38, + "step": 39825, + "train_speed(iter/s)": 1.638498 + }, + { + "acc": 0.65901661, + "epoch": 1.0104008117706749, + "grad_norm": 6.5625, + "learning_rate": 5.3271464433502805e-06, + "loss": 1.57470474, + "memory(GiB)": 117.38, + "step": 39830, + "train_speed(iter/s)": 1.63852 + }, + { + "acc": 0.65531597, + "epoch": 1.0105276509386099, + "grad_norm": 6.75, + "learning_rate": 5.3261000594659715e-06, + "loss": 1.65476475, + "memory(GiB)": 117.38, + "step": 39835, + "train_speed(iter/s)": 1.638542 + }, + { + "acc": 0.63626199, + "epoch": 1.0106544901065448, + "grad_norm": 5.59375, + "learning_rate": 5.3250536612383275e-06, + "loss": 1.70565567, + "memory(GiB)": 117.38, + "step": 39840, + "train_speed(iter/s)": 1.638563 + }, + { + "acc": 0.67136898, + "epoch": 1.01078132927448, + "grad_norm": 4.625, + "learning_rate": 5.32400724871337e-06, + "loss": 1.52749996, + "memory(GiB)": 117.38, + "step": 39845, + "train_speed(iter/s)": 1.638584 + }, + { + "acc": 0.66342649, + "epoch": 1.010908168442415, + "grad_norm": 5.375, + "learning_rate": 5.322960821937129e-06, + "loss": 1.57162724, + "memory(GiB)": 117.38, + "step": 39850, + "train_speed(iter/s)": 1.638608 + }, + { + "acc": 0.64148188, + "epoch": 1.01103500761035, + "grad_norm": 6.40625, + "learning_rate": 5.321914380955628e-06, + "loss": 1.65033703, + "memory(GiB)": 117.38, + "step": 39855, + "train_speed(iter/s)": 1.638631 + }, + { + "acc": 0.65975623, + "epoch": 1.0111618467782852, + "grad_norm": 5.75, + "learning_rate": 5.320867925814896e-06, + "loss": 1.60131836, + "memory(GiB)": 117.38, + "step": 39860, + "train_speed(iter/s)": 1.638652 + }, + { + "acc": 0.65837464, + "epoch": 1.0112886859462202, + "grad_norm": 5.5625, + "learning_rate": 5.31982145656096e-06, + "loss": 1.6029356, + "memory(GiB)": 117.38, + "step": 39865, + "train_speed(iter/s)": 1.638673 + }, + { + "acc": 0.65174217, + "epoch": 1.0114155251141552, + "grad_norm": 5.25, + "learning_rate": 5.318774973239849e-06, + "loss": 1.55748167, + "memory(GiB)": 117.38, + "step": 39870, + "train_speed(iter/s)": 1.638694 + }, + { + "acc": 0.65062418, + "epoch": 1.0115423642820902, + "grad_norm": 5.5, + "learning_rate": 5.31772847589759e-06, + "loss": 1.62412872, + "memory(GiB)": 117.38, + "step": 39875, + "train_speed(iter/s)": 1.638716 + }, + { + "acc": 0.66752129, + "epoch": 1.0116692034500254, + "grad_norm": 6.25, + "learning_rate": 5.316681964580215e-06, + "loss": 1.56336918, + "memory(GiB)": 117.38, + "step": 39880, + "train_speed(iter/s)": 1.638739 + }, + { + "acc": 0.66314564, + "epoch": 1.0117960426179604, + "grad_norm": 6.0625, + "learning_rate": 5.315635439333753e-06, + "loss": 1.59444866, + "memory(GiB)": 117.38, + "step": 39885, + "train_speed(iter/s)": 1.63876 + }, + { + "acc": 0.64957018, + "epoch": 1.0119228817858954, + "grad_norm": 5.65625, + "learning_rate": 5.314588900204235e-06, + "loss": 1.61485176, + "memory(GiB)": 117.38, + "step": 39890, + "train_speed(iter/s)": 1.638781 + }, + { + "acc": 0.65460482, + "epoch": 1.0120497209538306, + "grad_norm": 6.46875, + "learning_rate": 5.313542347237692e-06, + "loss": 1.65141449, + "memory(GiB)": 117.38, + "step": 39895, + "train_speed(iter/s)": 1.638803 + }, + { + "acc": 0.63795977, + "epoch": 1.0121765601217656, + "grad_norm": 6.59375, + "learning_rate": 5.312495780480159e-06, + "loss": 1.63843136, + "memory(GiB)": 117.38, + "step": 39900, + "train_speed(iter/s)": 1.638825 + }, + { + "acc": 0.65736651, + "epoch": 1.0123033992897006, + "grad_norm": 5.40625, + "learning_rate": 5.311449199977664e-06, + "loss": 1.59072075, + "memory(GiB)": 117.38, + "step": 39905, + "train_speed(iter/s)": 1.638846 + }, + { + "acc": 0.66679993, + "epoch": 1.0124302384576358, + "grad_norm": 6.4375, + "learning_rate": 5.310402605776245e-06, + "loss": 1.56476583, + "memory(GiB)": 117.38, + "step": 39910, + "train_speed(iter/s)": 1.638867 + }, + { + "acc": 0.66197143, + "epoch": 1.0125570776255708, + "grad_norm": 5.25, + "learning_rate": 5.309355997921931e-06, + "loss": 1.57712107, + "memory(GiB)": 117.38, + "step": 39915, + "train_speed(iter/s)": 1.638888 + }, + { + "acc": 0.67339616, + "epoch": 1.0126839167935058, + "grad_norm": 5.59375, + "learning_rate": 5.308309376460761e-06, + "loss": 1.59358978, + "memory(GiB)": 117.38, + "step": 39920, + "train_speed(iter/s)": 1.638909 + }, + { + "acc": 0.66828194, + "epoch": 1.012810755961441, + "grad_norm": 4.9375, + "learning_rate": 5.307262741438767e-06, + "loss": 1.53220406, + "memory(GiB)": 117.38, + "step": 39925, + "train_speed(iter/s)": 1.63893 + }, + { + "acc": 0.65965991, + "epoch": 1.012937595129376, + "grad_norm": 6.03125, + "learning_rate": 5.3062160929019855e-06, + "loss": 1.6212965, + "memory(GiB)": 117.38, + "step": 39930, + "train_speed(iter/s)": 1.638953 + }, + { + "acc": 0.67123747, + "epoch": 1.013064434297311, + "grad_norm": 5.25, + "learning_rate": 5.305169430896454e-06, + "loss": 1.56313429, + "memory(GiB)": 117.38, + "step": 39935, + "train_speed(iter/s)": 1.638975 + }, + { + "acc": 0.66302409, + "epoch": 1.013191273465246, + "grad_norm": 4.875, + "learning_rate": 5.304122755468209e-06, + "loss": 1.55104485, + "memory(GiB)": 117.38, + "step": 39940, + "train_speed(iter/s)": 1.638996 + }, + { + "acc": 0.65456266, + "epoch": 1.0133181126331812, + "grad_norm": 6.53125, + "learning_rate": 5.303076066663286e-06, + "loss": 1.5185524, + "memory(GiB)": 117.38, + "step": 39945, + "train_speed(iter/s)": 1.639018 + }, + { + "acc": 0.64203253, + "epoch": 1.0134449518011162, + "grad_norm": 5.5, + "learning_rate": 5.302029364527726e-06, + "loss": 1.644701, + "memory(GiB)": 117.38, + "step": 39950, + "train_speed(iter/s)": 1.639039 + }, + { + "acc": 0.65256262, + "epoch": 1.0135717909690511, + "grad_norm": 6.46875, + "learning_rate": 5.3009826491075645e-06, + "loss": 1.6785635, + "memory(GiB)": 117.38, + "step": 39955, + "train_speed(iter/s)": 1.63906 + }, + { + "acc": 0.6583322, + "epoch": 1.0136986301369864, + "grad_norm": 8.125, + "learning_rate": 5.299935920448843e-06, + "loss": 1.53910131, + "memory(GiB)": 117.38, + "step": 39960, + "train_speed(iter/s)": 1.639079 + }, + { + "acc": 0.66376405, + "epoch": 1.0138254693049213, + "grad_norm": 5.3125, + "learning_rate": 5.298889178597599e-06, + "loss": 1.56935625, + "memory(GiB)": 117.38, + "step": 39965, + "train_speed(iter/s)": 1.639099 + }, + { + "acc": 0.65047426, + "epoch": 1.0139523084728563, + "grad_norm": 6.71875, + "learning_rate": 5.297842423599877e-06, + "loss": 1.59526472, + "memory(GiB)": 117.38, + "step": 39970, + "train_speed(iter/s)": 1.63912 + }, + { + "acc": 0.66500549, + "epoch": 1.0140791476407915, + "grad_norm": 6.65625, + "learning_rate": 5.296795655501714e-06, + "loss": 1.62907639, + "memory(GiB)": 117.38, + "step": 39975, + "train_speed(iter/s)": 1.639142 + }, + { + "acc": 0.65319786, + "epoch": 1.0142059868087265, + "grad_norm": 5.6875, + "learning_rate": 5.295748874349155e-06, + "loss": 1.63263054, + "memory(GiB)": 117.38, + "step": 39980, + "train_speed(iter/s)": 1.639162 + }, + { + "acc": 0.65103827, + "epoch": 1.0143328259766615, + "grad_norm": 5.84375, + "learning_rate": 5.294702080188236e-06, + "loss": 1.56789045, + "memory(GiB)": 117.38, + "step": 39985, + "train_speed(iter/s)": 1.639182 + }, + { + "acc": 0.64973259, + "epoch": 1.0144596651445967, + "grad_norm": 5.4375, + "learning_rate": 5.293655273065008e-06, + "loss": 1.61104507, + "memory(GiB)": 117.38, + "step": 39990, + "train_speed(iter/s)": 1.639203 + }, + { + "acc": 0.66724234, + "epoch": 1.0145865043125317, + "grad_norm": 5.28125, + "learning_rate": 5.2926084530255076e-06, + "loss": 1.59788103, + "memory(GiB)": 117.38, + "step": 39995, + "train_speed(iter/s)": 1.639225 + }, + { + "acc": 0.6543807, + "epoch": 1.0147133434804667, + "grad_norm": 6.15625, + "learning_rate": 5.291561620115781e-06, + "loss": 1.59785461, + "memory(GiB)": 117.38, + "step": 40000, + "train_speed(iter/s)": 1.639248 + }, + { + "epoch": 1.0147133434804667, + "eval_acc": 0.6462142142643837, + "eval_loss": 1.5740785598754883, + "eval_runtime": 57.9965, + "eval_samples_per_second": 109.834, + "eval_steps_per_second": 27.467, + "step": 40000 + }, + { + "acc": 0.65649223, + "epoch": 1.014840182648402, + "grad_norm": 6.40625, + "learning_rate": 5.290514774381874e-06, + "loss": 1.6269558, + "memory(GiB)": 117.38, + "step": 40005, + "train_speed(iter/s)": 1.635093 + }, + { + "acc": 0.65570307, + "epoch": 1.014967021816337, + "grad_norm": 6.15625, + "learning_rate": 5.289467915869829e-06, + "loss": 1.60519314, + "memory(GiB)": 117.38, + "step": 40010, + "train_speed(iter/s)": 1.635113 + }, + { + "acc": 0.66156893, + "epoch": 1.015093860984272, + "grad_norm": 6.5, + "learning_rate": 5.288421044625694e-06, + "loss": 1.59348755, + "memory(GiB)": 117.38, + "step": 40015, + "train_speed(iter/s)": 1.635135 + }, + { + "acc": 0.65349388, + "epoch": 1.0152207001522071, + "grad_norm": 5.3125, + "learning_rate": 5.287374160695513e-06, + "loss": 1.61469593, + "memory(GiB)": 117.38, + "step": 40020, + "train_speed(iter/s)": 1.635157 + }, + { + "acc": 0.64751792, + "epoch": 1.015347539320142, + "grad_norm": 5.46875, + "learning_rate": 5.286327264125332e-06, + "loss": 1.67454109, + "memory(GiB)": 117.38, + "step": 40025, + "train_speed(iter/s)": 1.63518 + }, + { + "acc": 0.64518757, + "epoch": 1.015474378488077, + "grad_norm": 6.28125, + "learning_rate": 5.285280354961202e-06, + "loss": 1.67606239, + "memory(GiB)": 117.38, + "step": 40030, + "train_speed(iter/s)": 1.635203 + }, + { + "acc": 0.65832996, + "epoch": 1.015601217656012, + "grad_norm": 5.59375, + "learning_rate": 5.284233433249167e-06, + "loss": 1.52517776, + "memory(GiB)": 117.38, + "step": 40035, + "train_speed(iter/s)": 1.635225 + }, + { + "acc": 0.65377994, + "epoch": 1.0157280568239473, + "grad_norm": 6.71875, + "learning_rate": 5.283186499035276e-06, + "loss": 1.5628643, + "memory(GiB)": 117.38, + "step": 40040, + "train_speed(iter/s)": 1.635247 + }, + { + "acc": 0.66414504, + "epoch": 1.0158548959918823, + "grad_norm": 5.5625, + "learning_rate": 5.2821395523655795e-06, + "loss": 1.53937492, + "memory(GiB)": 117.38, + "step": 40045, + "train_speed(iter/s)": 1.635268 + }, + { + "acc": 0.66452026, + "epoch": 1.0159817351598173, + "grad_norm": 5.53125, + "learning_rate": 5.281092593286127e-06, + "loss": 1.55305614, + "memory(GiB)": 117.38, + "step": 40050, + "train_speed(iter/s)": 1.635287 + }, + { + "acc": 0.6489089, + "epoch": 1.0161085743277525, + "grad_norm": 5.625, + "learning_rate": 5.280045621842964e-06, + "loss": 1.58909225, + "memory(GiB)": 117.38, + "step": 40055, + "train_speed(iter/s)": 1.63531 + }, + { + "acc": 0.63675885, + "epoch": 1.0162354134956875, + "grad_norm": 5.03125, + "learning_rate": 5.278998638082148e-06, + "loss": 1.62103424, + "memory(GiB)": 117.38, + "step": 40060, + "train_speed(iter/s)": 1.635331 + }, + { + "acc": 0.65525999, + "epoch": 1.0163622526636225, + "grad_norm": 5.03125, + "learning_rate": 5.277951642049722e-06, + "loss": 1.60936203, + "memory(GiB)": 117.38, + "step": 40065, + "train_speed(iter/s)": 1.635352 + }, + { + "acc": 0.66408048, + "epoch": 1.0164890918315577, + "grad_norm": 4.71875, + "learning_rate": 5.276904633791745e-06, + "loss": 1.56101933, + "memory(GiB)": 117.38, + "step": 40070, + "train_speed(iter/s)": 1.635375 + }, + { + "acc": 0.65598583, + "epoch": 1.0166159309994927, + "grad_norm": 5.875, + "learning_rate": 5.275857613354265e-06, + "loss": 1.55445404, + "memory(GiB)": 117.38, + "step": 40075, + "train_speed(iter/s)": 1.635396 + }, + { + "acc": 0.66829014, + "epoch": 1.0167427701674276, + "grad_norm": 4.90625, + "learning_rate": 5.274810580783335e-06, + "loss": 1.58074207, + "memory(GiB)": 117.38, + "step": 40080, + "train_speed(iter/s)": 1.635419 + }, + { + "acc": 0.67111392, + "epoch": 1.0168696093353629, + "grad_norm": 5.71875, + "learning_rate": 5.2737635361250094e-06, + "loss": 1.56327419, + "memory(GiB)": 117.38, + "step": 40085, + "train_speed(iter/s)": 1.63544 + }, + { + "acc": 0.65572786, + "epoch": 1.0169964485032978, + "grad_norm": 5.9375, + "learning_rate": 5.2727164794253415e-06, + "loss": 1.60113869, + "memory(GiB)": 117.38, + "step": 40090, + "train_speed(iter/s)": 1.635463 + }, + { + "acc": 0.64674392, + "epoch": 1.0171232876712328, + "grad_norm": 5.84375, + "learning_rate": 5.271669410730384e-06, + "loss": 1.60128365, + "memory(GiB)": 117.38, + "step": 40095, + "train_speed(iter/s)": 1.635484 + }, + { + "acc": 0.67051244, + "epoch": 1.0172501268391678, + "grad_norm": 5.78125, + "learning_rate": 5.270622330086194e-06, + "loss": 1.59918051, + "memory(GiB)": 117.38, + "step": 40100, + "train_speed(iter/s)": 1.635507 + }, + { + "acc": 0.63950601, + "epoch": 1.017376966007103, + "grad_norm": 5.15625, + "learning_rate": 5.269575237538827e-06, + "loss": 1.59302998, + "memory(GiB)": 117.38, + "step": 40105, + "train_speed(iter/s)": 1.635529 + }, + { + "acc": 0.64420271, + "epoch": 1.017503805175038, + "grad_norm": 6.0, + "learning_rate": 5.268528133134335e-06, + "loss": 1.65467072, + "memory(GiB)": 117.38, + "step": 40110, + "train_speed(iter/s)": 1.635549 + }, + { + "acc": 0.6618732, + "epoch": 1.017630644342973, + "grad_norm": 5.625, + "learning_rate": 5.267481016918776e-06, + "loss": 1.55507221, + "memory(GiB)": 117.38, + "step": 40115, + "train_speed(iter/s)": 1.63557 + }, + { + "acc": 0.65956802, + "epoch": 1.0177574835109082, + "grad_norm": 6.28125, + "learning_rate": 5.266433888938212e-06, + "loss": 1.60486946, + "memory(GiB)": 117.38, + "step": 40120, + "train_speed(iter/s)": 1.635592 + }, + { + "acc": 0.66995301, + "epoch": 1.0178843226788432, + "grad_norm": 5.28125, + "learning_rate": 5.265386749238691e-06, + "loss": 1.54338589, + "memory(GiB)": 117.38, + "step": 40125, + "train_speed(iter/s)": 1.635614 + }, + { + "acc": 0.64883652, + "epoch": 1.0180111618467782, + "grad_norm": 6.375, + "learning_rate": 5.26433959786628e-06, + "loss": 1.62892799, + "memory(GiB)": 117.38, + "step": 40130, + "train_speed(iter/s)": 1.635635 + }, + { + "acc": 0.64977026, + "epoch": 1.0181380010147134, + "grad_norm": 5.9375, + "learning_rate": 5.263292434867031e-06, + "loss": 1.59526377, + "memory(GiB)": 117.38, + "step": 40135, + "train_speed(iter/s)": 1.635657 + }, + { + "acc": 0.67420692, + "epoch": 1.0182648401826484, + "grad_norm": 5.125, + "learning_rate": 5.262245260287006e-06, + "loss": 1.48768463, + "memory(GiB)": 117.38, + "step": 40140, + "train_speed(iter/s)": 1.635678 + }, + { + "acc": 0.64456005, + "epoch": 1.0183916793505834, + "grad_norm": 8.25, + "learning_rate": 5.261198074172262e-06, + "loss": 1.69737701, + "memory(GiB)": 117.38, + "step": 40145, + "train_speed(iter/s)": 1.635702 + }, + { + "acc": 0.65967212, + "epoch": 1.0185185185185186, + "grad_norm": 5.625, + "learning_rate": 5.260150876568862e-06, + "loss": 1.61888218, + "memory(GiB)": 117.38, + "step": 40150, + "train_speed(iter/s)": 1.635726 + }, + { + "acc": 0.65892339, + "epoch": 1.0186453576864536, + "grad_norm": 4.71875, + "learning_rate": 5.259103667522866e-06, + "loss": 1.5869688, + "memory(GiB)": 117.38, + "step": 40155, + "train_speed(iter/s)": 1.635748 + }, + { + "acc": 0.65756025, + "epoch": 1.0187721968543886, + "grad_norm": 5.09375, + "learning_rate": 5.258056447080333e-06, + "loss": 1.64498558, + "memory(GiB)": 117.38, + "step": 40160, + "train_speed(iter/s)": 1.635771 + }, + { + "acc": 0.65321121, + "epoch": 1.0188990360223238, + "grad_norm": 5.53125, + "learning_rate": 5.257009215287325e-06, + "loss": 1.65939293, + "memory(GiB)": 117.38, + "step": 40165, + "train_speed(iter/s)": 1.635793 + }, + { + "acc": 0.65982928, + "epoch": 1.0190258751902588, + "grad_norm": 5.375, + "learning_rate": 5.255961972189905e-06, + "loss": 1.6098177, + "memory(GiB)": 117.38, + "step": 40170, + "train_speed(iter/s)": 1.635816 + }, + { + "acc": 0.64726415, + "epoch": 1.0191527143581938, + "grad_norm": 5.75, + "learning_rate": 5.254914717834133e-06, + "loss": 1.62165527, + "memory(GiB)": 117.38, + "step": 40175, + "train_speed(iter/s)": 1.635838 + }, + { + "acc": 0.65048866, + "epoch": 1.019279553526129, + "grad_norm": 5.03125, + "learning_rate": 5.253867452266075e-06, + "loss": 1.58091764, + "memory(GiB)": 117.38, + "step": 40180, + "train_speed(iter/s)": 1.635861 + }, + { + "acc": 0.67215166, + "epoch": 1.019406392694064, + "grad_norm": 9.0625, + "learning_rate": 5.252820175531792e-06, + "loss": 1.57077713, + "memory(GiB)": 117.38, + "step": 40185, + "train_speed(iter/s)": 1.635884 + }, + { + "acc": 0.64890275, + "epoch": 1.019533231861999, + "grad_norm": 7.46875, + "learning_rate": 5.25177288767735e-06, + "loss": 1.62759724, + "memory(GiB)": 117.38, + "step": 40190, + "train_speed(iter/s)": 1.635908 + }, + { + "acc": 0.66316366, + "epoch": 1.019660071029934, + "grad_norm": 5.96875, + "learning_rate": 5.250725588748811e-06, + "loss": 1.58214417, + "memory(GiB)": 117.38, + "step": 40195, + "train_speed(iter/s)": 1.63593 + }, + { + "acc": 0.6483737, + "epoch": 1.0197869101978692, + "grad_norm": 5.53125, + "learning_rate": 5.249678278792243e-06, + "loss": 1.68145676, + "memory(GiB)": 117.38, + "step": 40200, + "train_speed(iter/s)": 1.635951 + }, + { + "acc": 0.67354927, + "epoch": 1.0199137493658041, + "grad_norm": 5.625, + "learning_rate": 5.248630957853708e-06, + "loss": 1.54078636, + "memory(GiB)": 117.38, + "step": 40205, + "train_speed(iter/s)": 1.635975 + }, + { + "acc": 0.65906215, + "epoch": 1.0200405885337391, + "grad_norm": 5.46875, + "learning_rate": 5.247583625979276e-06, + "loss": 1.54860744, + "memory(GiB)": 117.38, + "step": 40210, + "train_speed(iter/s)": 1.635995 + }, + { + "acc": 0.64174495, + "epoch": 1.0201674277016743, + "grad_norm": 6.71875, + "learning_rate": 5.246536283215007e-06, + "loss": 1.58928385, + "memory(GiB)": 117.38, + "step": 40215, + "train_speed(iter/s)": 1.636018 + }, + { + "acc": 0.65218077, + "epoch": 1.0202942668696093, + "grad_norm": 5.5, + "learning_rate": 5.245488929606974e-06, + "loss": 1.62645531, + "memory(GiB)": 117.38, + "step": 40220, + "train_speed(iter/s)": 1.636039 + }, + { + "acc": 0.65695028, + "epoch": 1.0204211060375443, + "grad_norm": 4.75, + "learning_rate": 5.244441565201241e-06, + "loss": 1.61856422, + "memory(GiB)": 117.38, + "step": 40225, + "train_speed(iter/s)": 1.63606 + }, + { + "acc": 0.65484724, + "epoch": 1.0205479452054795, + "grad_norm": 5.8125, + "learning_rate": 5.243394190043877e-06, + "loss": 1.51860943, + "memory(GiB)": 117.38, + "step": 40230, + "train_speed(iter/s)": 1.636082 + }, + { + "acc": 0.66069336, + "epoch": 1.0206747843734145, + "grad_norm": 5.59375, + "learning_rate": 5.242346804180949e-06, + "loss": 1.58330793, + "memory(GiB)": 117.38, + "step": 40235, + "train_speed(iter/s)": 1.636104 + }, + { + "acc": 0.65965705, + "epoch": 1.0208016235413495, + "grad_norm": 5.375, + "learning_rate": 5.241299407658528e-06, + "loss": 1.61549454, + "memory(GiB)": 117.38, + "step": 40240, + "train_speed(iter/s)": 1.636126 + }, + { + "acc": 0.65619869, + "epoch": 1.0209284627092847, + "grad_norm": 5.28125, + "learning_rate": 5.240252000522681e-06, + "loss": 1.56270351, + "memory(GiB)": 117.38, + "step": 40245, + "train_speed(iter/s)": 1.636148 + }, + { + "acc": 0.64507804, + "epoch": 1.0210553018772197, + "grad_norm": 6.375, + "learning_rate": 5.239204582819479e-06, + "loss": 1.66113567, + "memory(GiB)": 117.38, + "step": 40250, + "train_speed(iter/s)": 1.63617 + }, + { + "acc": 0.65118189, + "epoch": 1.0211821410451547, + "grad_norm": 7.71875, + "learning_rate": 5.238157154594989e-06, + "loss": 1.60753536, + "memory(GiB)": 117.38, + "step": 40255, + "train_speed(iter/s)": 1.636193 + }, + { + "acc": 0.67268829, + "epoch": 1.0213089802130897, + "grad_norm": 6.34375, + "learning_rate": 5.237109715895287e-06, + "loss": 1.47926054, + "memory(GiB)": 117.38, + "step": 40260, + "train_speed(iter/s)": 1.636211 + }, + { + "acc": 0.66148472, + "epoch": 1.021435819381025, + "grad_norm": 5.875, + "learning_rate": 5.2360622667664385e-06, + "loss": 1.57624626, + "memory(GiB)": 117.38, + "step": 40265, + "train_speed(iter/s)": 1.636233 + }, + { + "acc": 0.66665912, + "epoch": 1.02156265854896, + "grad_norm": 6.8125, + "learning_rate": 5.235014807254521e-06, + "loss": 1.56256876, + "memory(GiB)": 117.38, + "step": 40270, + "train_speed(iter/s)": 1.636254 + }, + { + "acc": 0.64318552, + "epoch": 1.0216894977168949, + "grad_norm": 7.40625, + "learning_rate": 5.233967337405599e-06, + "loss": 1.61275978, + "memory(GiB)": 117.38, + "step": 40275, + "train_speed(iter/s)": 1.636277 + }, + { + "acc": 0.66389303, + "epoch": 1.02181633688483, + "grad_norm": 5.0625, + "learning_rate": 5.232919857265752e-06, + "loss": 1.61152496, + "memory(GiB)": 117.38, + "step": 40280, + "train_speed(iter/s)": 1.636298 + }, + { + "acc": 0.65124974, + "epoch": 1.021943176052765, + "grad_norm": 5.3125, + "learning_rate": 5.231872366881048e-06, + "loss": 1.61572285, + "memory(GiB)": 117.38, + "step": 40285, + "train_speed(iter/s)": 1.636321 + }, + { + "acc": 0.66670675, + "epoch": 1.0220700152207, + "grad_norm": 5.03125, + "learning_rate": 5.230824866297563e-06, + "loss": 1.5959487, + "memory(GiB)": 117.38, + "step": 40290, + "train_speed(iter/s)": 1.636341 + }, + { + "acc": 0.65785418, + "epoch": 1.0221968543886353, + "grad_norm": 5.78125, + "learning_rate": 5.229777355561368e-06, + "loss": 1.58440151, + "memory(GiB)": 117.38, + "step": 40295, + "train_speed(iter/s)": 1.636363 + }, + { + "acc": 0.65784349, + "epoch": 1.0223236935565703, + "grad_norm": 7.0, + "learning_rate": 5.2287298347185415e-06, + "loss": 1.60356426, + "memory(GiB)": 117.38, + "step": 40300, + "train_speed(iter/s)": 1.636384 + }, + { + "acc": 0.63558426, + "epoch": 1.0224505327245053, + "grad_norm": 5.8125, + "learning_rate": 5.227682303815155e-06, + "loss": 1.63212929, + "memory(GiB)": 117.38, + "step": 40305, + "train_speed(iter/s)": 1.636405 + }, + { + "acc": 0.64989662, + "epoch": 1.0225773718924405, + "grad_norm": 5.3125, + "learning_rate": 5.226634762897284e-06, + "loss": 1.63894234, + "memory(GiB)": 117.38, + "step": 40310, + "train_speed(iter/s)": 1.636428 + }, + { + "acc": 0.65535831, + "epoch": 1.0227042110603755, + "grad_norm": 5.9375, + "learning_rate": 5.225587212011004e-06, + "loss": 1.54392014, + "memory(GiB)": 117.38, + "step": 40315, + "train_speed(iter/s)": 1.63645 + }, + { + "acc": 0.65255675, + "epoch": 1.0228310502283104, + "grad_norm": 5.59375, + "learning_rate": 5.224539651202391e-06, + "loss": 1.56442575, + "memory(GiB)": 117.38, + "step": 40320, + "train_speed(iter/s)": 1.636473 + }, + { + "acc": 0.65225158, + "epoch": 1.0229578893962457, + "grad_norm": 5.9375, + "learning_rate": 5.223492080517523e-06, + "loss": 1.61982002, + "memory(GiB)": 117.38, + "step": 40325, + "train_speed(iter/s)": 1.636494 + }, + { + "acc": 0.65033979, + "epoch": 1.0230847285641806, + "grad_norm": 6.90625, + "learning_rate": 5.2224445000024744e-06, + "loss": 1.65942116, + "memory(GiB)": 117.38, + "step": 40330, + "train_speed(iter/s)": 1.636516 + }, + { + "acc": 0.65091619, + "epoch": 1.0232115677321156, + "grad_norm": 5.78125, + "learning_rate": 5.221396909703322e-06, + "loss": 1.58307533, + "memory(GiB)": 117.38, + "step": 40335, + "train_speed(iter/s)": 1.636539 + }, + { + "acc": 0.66293764, + "epoch": 1.0233384069000508, + "grad_norm": 5.90625, + "learning_rate": 5.220349309666148e-06, + "loss": 1.53305664, + "memory(GiB)": 117.38, + "step": 40340, + "train_speed(iter/s)": 1.636562 + }, + { + "acc": 0.65347309, + "epoch": 1.0234652460679858, + "grad_norm": 6.78125, + "learning_rate": 5.2193016999370265e-06, + "loss": 1.61133499, + "memory(GiB)": 117.38, + "step": 40345, + "train_speed(iter/s)": 1.636584 + }, + { + "acc": 0.65706906, + "epoch": 1.0235920852359208, + "grad_norm": 5.28125, + "learning_rate": 5.218254080562038e-06, + "loss": 1.55974197, + "memory(GiB)": 117.38, + "step": 40350, + "train_speed(iter/s)": 1.636607 + }, + { + "acc": 0.67610836, + "epoch": 1.0237189244038558, + "grad_norm": 7.5, + "learning_rate": 5.2172064515872585e-06, + "loss": 1.49498997, + "memory(GiB)": 117.38, + "step": 40355, + "train_speed(iter/s)": 1.63663 + }, + { + "acc": 0.66202326, + "epoch": 1.023845763571791, + "grad_norm": 6.65625, + "learning_rate": 5.21615881305877e-06, + "loss": 1.62622356, + "memory(GiB)": 117.38, + "step": 40360, + "train_speed(iter/s)": 1.636652 + }, + { + "acc": 0.65148506, + "epoch": 1.023972602739726, + "grad_norm": 5.71875, + "learning_rate": 5.215111165022653e-06, + "loss": 1.57873535, + "memory(GiB)": 117.38, + "step": 40365, + "train_speed(iter/s)": 1.636674 + }, + { + "acc": 0.67259207, + "epoch": 1.024099441907661, + "grad_norm": 5.375, + "learning_rate": 5.2140635075249856e-06, + "loss": 1.5061265, + "memory(GiB)": 117.38, + "step": 40370, + "train_speed(iter/s)": 1.636695 + }, + { + "acc": 0.6600646, + "epoch": 1.0242262810755962, + "grad_norm": 5.6875, + "learning_rate": 5.213015840611851e-06, + "loss": 1.5737422, + "memory(GiB)": 117.38, + "step": 40375, + "train_speed(iter/s)": 1.636717 + }, + { + "acc": 0.64788723, + "epoch": 1.0243531202435312, + "grad_norm": 6.75, + "learning_rate": 5.211968164329328e-06, + "loss": 1.64896126, + "memory(GiB)": 117.38, + "step": 40380, + "train_speed(iter/s)": 1.636738 + }, + { + "acc": 0.66225376, + "epoch": 1.0244799594114662, + "grad_norm": 5.59375, + "learning_rate": 5.210920478723497e-06, + "loss": 1.61479454, + "memory(GiB)": 117.38, + "step": 40385, + "train_speed(iter/s)": 1.63676 + }, + { + "acc": 0.64224291, + "epoch": 1.0246067985794014, + "grad_norm": 5.59375, + "learning_rate": 5.209872783840443e-06, + "loss": 1.6535284, + "memory(GiB)": 117.38, + "step": 40390, + "train_speed(iter/s)": 1.636781 + }, + { + "acc": 0.65905256, + "epoch": 1.0247336377473364, + "grad_norm": 5.03125, + "learning_rate": 5.208825079726248e-06, + "loss": 1.60687561, + "memory(GiB)": 117.38, + "step": 40395, + "train_speed(iter/s)": 1.636802 + }, + { + "acc": 0.66541691, + "epoch": 1.0248604769152714, + "grad_norm": 5.71875, + "learning_rate": 5.207777366426992e-06, + "loss": 1.588976, + "memory(GiB)": 117.38, + "step": 40400, + "train_speed(iter/s)": 1.636822 + }, + { + "acc": 0.65608292, + "epoch": 1.0249873160832066, + "grad_norm": 5.5, + "learning_rate": 5.206729643988759e-06, + "loss": 1.58177643, + "memory(GiB)": 117.38, + "step": 40405, + "train_speed(iter/s)": 1.636845 + }, + { + "acc": 0.66607685, + "epoch": 1.0251141552511416, + "grad_norm": 5.875, + "learning_rate": 5.205681912457635e-06, + "loss": 1.57152004, + "memory(GiB)": 117.38, + "step": 40410, + "train_speed(iter/s)": 1.636866 + }, + { + "acc": 0.63905668, + "epoch": 1.0252409944190766, + "grad_norm": 7.34375, + "learning_rate": 5.204634171879701e-06, + "loss": 1.69107399, + "memory(GiB)": 117.38, + "step": 40415, + "train_speed(iter/s)": 1.636886 + }, + { + "acc": 0.65537505, + "epoch": 1.0253678335870116, + "grad_norm": 4.71875, + "learning_rate": 5.2035864223010445e-06, + "loss": 1.51729584, + "memory(GiB)": 117.38, + "step": 40420, + "train_speed(iter/s)": 1.636907 + }, + { + "acc": 0.65592699, + "epoch": 1.0254946727549468, + "grad_norm": 6.90625, + "learning_rate": 5.202538663767746e-06, + "loss": 1.6054409, + "memory(GiB)": 117.38, + "step": 40425, + "train_speed(iter/s)": 1.636927 + }, + { + "acc": 0.65538116, + "epoch": 1.0256215119228818, + "grad_norm": 5.96875, + "learning_rate": 5.201490896325895e-06, + "loss": 1.63512344, + "memory(GiB)": 117.38, + "step": 40430, + "train_speed(iter/s)": 1.636949 + }, + { + "acc": 0.66591787, + "epoch": 1.0257483510908167, + "grad_norm": 5.78125, + "learning_rate": 5.200443120021572e-06, + "loss": 1.56082277, + "memory(GiB)": 117.38, + "step": 40435, + "train_speed(iter/s)": 1.636971 + }, + { + "acc": 0.66506791, + "epoch": 1.025875190258752, + "grad_norm": 5.8125, + "learning_rate": 5.199395334900868e-06, + "loss": 1.53434143, + "memory(GiB)": 117.38, + "step": 40440, + "train_speed(iter/s)": 1.636993 + }, + { + "acc": 0.65561771, + "epoch": 1.026002029426687, + "grad_norm": 6.53125, + "learning_rate": 5.198347541009866e-06, + "loss": 1.60469818, + "memory(GiB)": 117.38, + "step": 40445, + "train_speed(iter/s)": 1.637015 + }, + { + "acc": 0.65198641, + "epoch": 1.026128868594622, + "grad_norm": 6.125, + "learning_rate": 5.197299738394654e-06, + "loss": 1.64730339, + "memory(GiB)": 117.38, + "step": 40450, + "train_speed(iter/s)": 1.637036 + }, + { + "acc": 0.65640302, + "epoch": 1.0262557077625571, + "grad_norm": 5.25, + "learning_rate": 5.196251927101318e-06, + "loss": 1.61294098, + "memory(GiB)": 117.38, + "step": 40455, + "train_speed(iter/s)": 1.637057 + }, + { + "acc": 0.66402311, + "epoch": 1.0263825469304921, + "grad_norm": 5.96875, + "learning_rate": 5.195204107175946e-06, + "loss": 1.50397654, + "memory(GiB)": 117.38, + "step": 40460, + "train_speed(iter/s)": 1.637079 + }, + { + "acc": 0.66228719, + "epoch": 1.0265093860984271, + "grad_norm": 6.53125, + "learning_rate": 5.194156278664627e-06, + "loss": 1.62087059, + "memory(GiB)": 117.38, + "step": 40465, + "train_speed(iter/s)": 1.637101 + }, + { + "acc": 0.65959306, + "epoch": 1.0266362252663623, + "grad_norm": 4.96875, + "learning_rate": 5.1931084416134466e-06, + "loss": 1.57213478, + "memory(GiB)": 117.38, + "step": 40470, + "train_speed(iter/s)": 1.637122 + }, + { + "acc": 0.65139093, + "epoch": 1.0267630644342973, + "grad_norm": 4.8125, + "learning_rate": 5.192060596068496e-06, + "loss": 1.62109947, + "memory(GiB)": 117.38, + "step": 40475, + "train_speed(iter/s)": 1.637144 + }, + { + "acc": 0.65920916, + "epoch": 1.0268899036022323, + "grad_norm": 5.375, + "learning_rate": 5.191012742075863e-06, + "loss": 1.5731863, + "memory(GiB)": 117.38, + "step": 40480, + "train_speed(iter/s)": 1.637165 + }, + { + "acc": 0.66571569, + "epoch": 1.0270167427701675, + "grad_norm": 7.53125, + "learning_rate": 5.189964879681635e-06, + "loss": 1.59552307, + "memory(GiB)": 117.38, + "step": 40485, + "train_speed(iter/s)": 1.637186 + }, + { + "acc": 0.64958324, + "epoch": 1.0271435819381025, + "grad_norm": 6.0, + "learning_rate": 5.188917008931905e-06, + "loss": 1.62662163, + "memory(GiB)": 117.38, + "step": 40490, + "train_speed(iter/s)": 1.637207 + }, + { + "acc": 0.6562439, + "epoch": 1.0272704211060375, + "grad_norm": 7.21875, + "learning_rate": 5.18786912987276e-06, + "loss": 1.58037453, + "memory(GiB)": 117.38, + "step": 40495, + "train_speed(iter/s)": 1.63723 + }, + { + "acc": 0.66030588, + "epoch": 1.0273972602739727, + "grad_norm": 5.5, + "learning_rate": 5.186821242550294e-06, + "loss": 1.56659536, + "memory(GiB)": 117.38, + "step": 40500, + "train_speed(iter/s)": 1.637251 + }, + { + "acc": 0.64224744, + "epoch": 1.0275240994419077, + "grad_norm": 6.0625, + "learning_rate": 5.185773347010594e-06, + "loss": 1.64190331, + "memory(GiB)": 117.38, + "step": 40505, + "train_speed(iter/s)": 1.637273 + }, + { + "acc": 0.65694728, + "epoch": 1.0276509386098427, + "grad_norm": 5.53125, + "learning_rate": 5.184725443299753e-06, + "loss": 1.59824219, + "memory(GiB)": 117.38, + "step": 40510, + "train_speed(iter/s)": 1.637295 + }, + { + "acc": 0.66392894, + "epoch": 1.0277777777777777, + "grad_norm": 5.8125, + "learning_rate": 5.183677531463863e-06, + "loss": 1.56978006, + "memory(GiB)": 117.38, + "step": 40515, + "train_speed(iter/s)": 1.637317 + }, + { + "acc": 0.64516563, + "epoch": 1.027904616945713, + "grad_norm": 6.15625, + "learning_rate": 5.182629611549015e-06, + "loss": 1.65863686, + "memory(GiB)": 117.38, + "step": 40520, + "train_speed(iter/s)": 1.637339 + }, + { + "acc": 0.65516596, + "epoch": 1.0280314561136479, + "grad_norm": 6.0625, + "learning_rate": 5.181581683601301e-06, + "loss": 1.5598835, + "memory(GiB)": 117.38, + "step": 40525, + "train_speed(iter/s)": 1.63736 + }, + { + "acc": 0.65747061, + "epoch": 1.0281582952815829, + "grad_norm": 6.34375, + "learning_rate": 5.1805337476668135e-06, + "loss": 1.69331379, + "memory(GiB)": 117.38, + "step": 40530, + "train_speed(iter/s)": 1.637382 + }, + { + "acc": 0.66326451, + "epoch": 1.028285134449518, + "grad_norm": 4.375, + "learning_rate": 5.179485803791646e-06, + "loss": 1.62275791, + "memory(GiB)": 117.38, + "step": 40535, + "train_speed(iter/s)": 1.637403 + }, + { + "acc": 0.65492725, + "epoch": 1.028411973617453, + "grad_norm": 5.78125, + "learning_rate": 5.178437852021892e-06, + "loss": 1.67811279, + "memory(GiB)": 117.38, + "step": 40540, + "train_speed(iter/s)": 1.637425 + }, + { + "acc": 0.66354737, + "epoch": 1.028538812785388, + "grad_norm": 6.75, + "learning_rate": 5.177389892403645e-06, + "loss": 1.5604188, + "memory(GiB)": 117.38, + "step": 40545, + "train_speed(iter/s)": 1.637447 + }, + { + "acc": 0.65004511, + "epoch": 1.0286656519533233, + "grad_norm": 5.53125, + "learning_rate": 5.176341924982997e-06, + "loss": 1.58811197, + "memory(GiB)": 117.38, + "step": 40550, + "train_speed(iter/s)": 1.637468 + }, + { + "acc": 0.6679503, + "epoch": 1.0287924911212583, + "grad_norm": 6.59375, + "learning_rate": 5.1752939498060435e-06, + "loss": 1.56891251, + "memory(GiB)": 117.38, + "step": 40555, + "train_speed(iter/s)": 1.637491 + }, + { + "acc": 0.64970713, + "epoch": 1.0289193302891932, + "grad_norm": 5.59375, + "learning_rate": 5.174245966918883e-06, + "loss": 1.69450188, + "memory(GiB)": 117.38, + "step": 40560, + "train_speed(iter/s)": 1.637513 + }, + { + "acc": 0.63921604, + "epoch": 1.0290461694571285, + "grad_norm": 5.6875, + "learning_rate": 5.173197976367603e-06, + "loss": 1.6202755, + "memory(GiB)": 117.38, + "step": 40565, + "train_speed(iter/s)": 1.637536 + }, + { + "acc": 0.6622056, + "epoch": 1.0291730086250634, + "grad_norm": 5.53125, + "learning_rate": 5.1721499781983055e-06, + "loss": 1.54897089, + "memory(GiB)": 117.38, + "step": 40570, + "train_speed(iter/s)": 1.637557 + }, + { + "acc": 0.64063449, + "epoch": 1.0292998477929984, + "grad_norm": 4.96875, + "learning_rate": 5.171101972457081e-06, + "loss": 1.59721107, + "memory(GiB)": 117.38, + "step": 40575, + "train_speed(iter/s)": 1.637579 + }, + { + "acc": 0.65204802, + "epoch": 1.0294266869609334, + "grad_norm": 5.625, + "learning_rate": 5.170053959190029e-06, + "loss": 1.59807692, + "memory(GiB)": 117.38, + "step": 40580, + "train_speed(iter/s)": 1.6376 + }, + { + "acc": 0.65249763, + "epoch": 1.0295535261288686, + "grad_norm": 6.65625, + "learning_rate": 5.169005938443245e-06, + "loss": 1.61302605, + "memory(GiB)": 117.38, + "step": 40585, + "train_speed(iter/s)": 1.637622 + }, + { + "acc": 0.66014142, + "epoch": 1.0296803652968036, + "grad_norm": 5.46875, + "learning_rate": 5.1679579102628245e-06, + "loss": 1.59491444, + "memory(GiB)": 117.38, + "step": 40590, + "train_speed(iter/s)": 1.637644 + }, + { + "acc": 0.6484828, + "epoch": 1.0298072044647386, + "grad_norm": 5.28125, + "learning_rate": 5.166909874694866e-06, + "loss": 1.59603996, + "memory(GiB)": 117.38, + "step": 40595, + "train_speed(iter/s)": 1.637666 + }, + { + "acc": 0.64561977, + "epoch": 1.0299340436326738, + "grad_norm": 7.4375, + "learning_rate": 5.165861831785465e-06, + "loss": 1.69799347, + "memory(GiB)": 117.38, + "step": 40600, + "train_speed(iter/s)": 1.637688 + }, + { + "acc": 0.64031649, + "epoch": 1.0300608828006088, + "grad_norm": 5.75, + "learning_rate": 5.164813781580721e-06, + "loss": 1.62498169, + "memory(GiB)": 117.38, + "step": 40605, + "train_speed(iter/s)": 1.637709 + }, + { + "acc": 0.65559216, + "epoch": 1.0301877219685438, + "grad_norm": 5.125, + "learning_rate": 5.16376572412673e-06, + "loss": 1.57135181, + "memory(GiB)": 117.38, + "step": 40610, + "train_speed(iter/s)": 1.63773 + }, + { + "acc": 0.66612349, + "epoch": 1.030314561136479, + "grad_norm": 5.6875, + "learning_rate": 5.162717659469593e-06, + "loss": 1.6295536, + "memory(GiB)": 117.38, + "step": 40615, + "train_speed(iter/s)": 1.637752 + }, + { + "acc": 0.67055264, + "epoch": 1.030441400304414, + "grad_norm": 5.46875, + "learning_rate": 5.161669587655406e-06, + "loss": 1.52363548, + "memory(GiB)": 117.38, + "step": 40620, + "train_speed(iter/s)": 1.637772 + }, + { + "acc": 0.6556602, + "epoch": 1.030568239472349, + "grad_norm": 7.34375, + "learning_rate": 5.160621508730267e-06, + "loss": 1.56594753, + "memory(GiB)": 117.38, + "step": 40625, + "train_speed(iter/s)": 1.637794 + }, + { + "acc": 0.65218773, + "epoch": 1.0306950786402842, + "grad_norm": 5.65625, + "learning_rate": 5.15957342274028e-06, + "loss": 1.6275959, + "memory(GiB)": 117.38, + "step": 40630, + "train_speed(iter/s)": 1.637816 + }, + { + "acc": 0.64686565, + "epoch": 1.0308219178082192, + "grad_norm": 8.125, + "learning_rate": 5.158525329731539e-06, + "loss": 1.62003994, + "memory(GiB)": 117.38, + "step": 40635, + "train_speed(iter/s)": 1.637837 + }, + { + "acc": 0.65470314, + "epoch": 1.0309487569761542, + "grad_norm": 6.5625, + "learning_rate": 5.157477229750149e-06, + "loss": 1.67855587, + "memory(GiB)": 117.38, + "step": 40640, + "train_speed(iter/s)": 1.63786 + }, + { + "acc": 0.66197357, + "epoch": 1.0310755961440894, + "grad_norm": 5.46875, + "learning_rate": 5.156429122842204e-06, + "loss": 1.60478783, + "memory(GiB)": 117.38, + "step": 40645, + "train_speed(iter/s)": 1.637881 + }, + { + "acc": 0.66182189, + "epoch": 1.0312024353120244, + "grad_norm": 6.28125, + "learning_rate": 5.15538100905381e-06, + "loss": 1.57081213, + "memory(GiB)": 117.38, + "step": 40650, + "train_speed(iter/s)": 1.637901 + }, + { + "acc": 0.656461, + "epoch": 1.0313292744799594, + "grad_norm": 4.8125, + "learning_rate": 5.154332888431064e-06, + "loss": 1.58842239, + "memory(GiB)": 117.38, + "step": 40655, + "train_speed(iter/s)": 1.637923 + }, + { + "acc": 0.65025449, + "epoch": 1.0314561136478946, + "grad_norm": 6.15625, + "learning_rate": 5.15328476102007e-06, + "loss": 1.61803246, + "memory(GiB)": 117.38, + "step": 40660, + "train_speed(iter/s)": 1.637945 + }, + { + "acc": 0.6445549, + "epoch": 1.0315829528158296, + "grad_norm": 5.34375, + "learning_rate": 5.1522366268669264e-06, + "loss": 1.62090549, + "memory(GiB)": 117.38, + "step": 40665, + "train_speed(iter/s)": 1.637966 + }, + { + "acc": 0.65344067, + "epoch": 1.0317097919837646, + "grad_norm": 6.25, + "learning_rate": 5.1511884860177376e-06, + "loss": 1.59310169, + "memory(GiB)": 117.38, + "step": 40670, + "train_speed(iter/s)": 1.637987 + }, + { + "acc": 0.65493965, + "epoch": 1.0318366311516995, + "grad_norm": 5.96875, + "learning_rate": 5.150140338518603e-06, + "loss": 1.6139164, + "memory(GiB)": 117.38, + "step": 40675, + "train_speed(iter/s)": 1.638008 + }, + { + "acc": 0.66272364, + "epoch": 1.0319634703196348, + "grad_norm": 5.09375, + "learning_rate": 5.149092184415627e-06, + "loss": 1.52428246, + "memory(GiB)": 117.38, + "step": 40680, + "train_speed(iter/s)": 1.63803 + }, + { + "acc": 0.67664566, + "epoch": 1.0320903094875697, + "grad_norm": 5.15625, + "learning_rate": 5.148044023754911e-06, + "loss": 1.56870518, + "memory(GiB)": 117.38, + "step": 40685, + "train_speed(iter/s)": 1.638052 + }, + { + "acc": 0.64395576, + "epoch": 1.0322171486555047, + "grad_norm": 4.78125, + "learning_rate": 5.146995856582557e-06, + "loss": 1.62173195, + "memory(GiB)": 117.38, + "step": 40690, + "train_speed(iter/s)": 1.638074 + }, + { + "acc": 0.65144897, + "epoch": 1.03234398782344, + "grad_norm": 5.46875, + "learning_rate": 5.14594768294467e-06, + "loss": 1.65414658, + "memory(GiB)": 117.38, + "step": 40695, + "train_speed(iter/s)": 1.638097 + }, + { + "acc": 0.6609879, + "epoch": 1.032470826991375, + "grad_norm": 6.09375, + "learning_rate": 5.1448995028873515e-06, + "loss": 1.60159454, + "memory(GiB)": 117.38, + "step": 40700, + "train_speed(iter/s)": 1.638117 + }, + { + "acc": 0.65242009, + "epoch": 1.03259766615931, + "grad_norm": 5.5625, + "learning_rate": 5.143851316456706e-06, + "loss": 1.57185392, + "memory(GiB)": 117.38, + "step": 40705, + "train_speed(iter/s)": 1.638139 + }, + { + "acc": 0.64776373, + "epoch": 1.0327245053272451, + "grad_norm": 6.84375, + "learning_rate": 5.142803123698838e-06, + "loss": 1.64793663, + "memory(GiB)": 117.38, + "step": 40710, + "train_speed(iter/s)": 1.63816 + }, + { + "acc": 0.66486349, + "epoch": 1.0328513444951801, + "grad_norm": 5.625, + "learning_rate": 5.14175492465985e-06, + "loss": 1.52684345, + "memory(GiB)": 117.38, + "step": 40715, + "train_speed(iter/s)": 1.638182 + }, + { + "acc": 0.65848589, + "epoch": 1.0329781836631151, + "grad_norm": 5.8125, + "learning_rate": 5.14070671938585e-06, + "loss": 1.62021332, + "memory(GiB)": 117.38, + "step": 40720, + "train_speed(iter/s)": 1.638203 + }, + { + "acc": 0.65813723, + "epoch": 1.0331050228310503, + "grad_norm": 5.59375, + "learning_rate": 5.139658507922937e-06, + "loss": 1.58810902, + "memory(GiB)": 117.38, + "step": 40725, + "train_speed(iter/s)": 1.638225 + }, + { + "acc": 0.6496624, + "epoch": 1.0332318619989853, + "grad_norm": 6.84375, + "learning_rate": 5.138610290317221e-06, + "loss": 1.6423027, + "memory(GiB)": 117.38, + "step": 40730, + "train_speed(iter/s)": 1.638247 + }, + { + "acc": 0.6481904, + "epoch": 1.0333587011669203, + "grad_norm": 5.8125, + "learning_rate": 5.137562066614805e-06, + "loss": 1.61454353, + "memory(GiB)": 117.38, + "step": 40735, + "train_speed(iter/s)": 1.638269 + }, + { + "acc": 0.66562257, + "epoch": 1.0334855403348553, + "grad_norm": 8.0, + "learning_rate": 5.136513836861795e-06, + "loss": 1.53041515, + "memory(GiB)": 117.38, + "step": 40740, + "train_speed(iter/s)": 1.638293 + }, + { + "acc": 0.66315899, + "epoch": 1.0336123795027905, + "grad_norm": 5.09375, + "learning_rate": 5.135465601104298e-06, + "loss": 1.65367966, + "memory(GiB)": 117.38, + "step": 40745, + "train_speed(iter/s)": 1.638314 + }, + { + "acc": 0.68181005, + "epoch": 1.0337392186707255, + "grad_norm": 5.1875, + "learning_rate": 5.134417359388418e-06, + "loss": 1.53159122, + "memory(GiB)": 117.38, + "step": 40750, + "train_speed(iter/s)": 1.638336 + }, + { + "acc": 0.65063896, + "epoch": 1.0338660578386605, + "grad_norm": 6.0625, + "learning_rate": 5.133369111760264e-06, + "loss": 1.60261383, + "memory(GiB)": 117.38, + "step": 40755, + "train_speed(iter/s)": 1.638358 + }, + { + "acc": 0.66811352, + "epoch": 1.0339928970065957, + "grad_norm": 7.40625, + "learning_rate": 5.132320858265939e-06, + "loss": 1.56639633, + "memory(GiB)": 117.38, + "step": 40760, + "train_speed(iter/s)": 1.638381 + }, + { + "acc": 0.66822629, + "epoch": 1.0341197361745307, + "grad_norm": 7.40625, + "learning_rate": 5.131272598951554e-06, + "loss": 1.58580513, + "memory(GiB)": 117.38, + "step": 40765, + "train_speed(iter/s)": 1.638401 + }, + { + "acc": 0.66615558, + "epoch": 1.0342465753424657, + "grad_norm": 4.625, + "learning_rate": 5.130224333863212e-06, + "loss": 1.49721498, + "memory(GiB)": 117.38, + "step": 40770, + "train_speed(iter/s)": 1.638422 + }, + { + "acc": 0.65318537, + "epoch": 1.0343734145104009, + "grad_norm": 6.125, + "learning_rate": 5.129176063047022e-06, + "loss": 1.60644112, + "memory(GiB)": 117.38, + "step": 40775, + "train_speed(iter/s)": 1.638445 + }, + { + "acc": 0.64278841, + "epoch": 1.0345002536783359, + "grad_norm": 6.09375, + "learning_rate": 5.128127786549094e-06, + "loss": 1.67408009, + "memory(GiB)": 117.38, + "step": 40780, + "train_speed(iter/s)": 1.638467 + }, + { + "acc": 0.65311394, + "epoch": 1.0346270928462709, + "grad_norm": 5.40625, + "learning_rate": 5.127079504415532e-06, + "loss": 1.6130373, + "memory(GiB)": 117.38, + "step": 40785, + "train_speed(iter/s)": 1.638487 + }, + { + "acc": 0.67491827, + "epoch": 1.034753932014206, + "grad_norm": 6.53125, + "learning_rate": 5.126031216692449e-06, + "loss": 1.52196407, + "memory(GiB)": 117.38, + "step": 40790, + "train_speed(iter/s)": 1.638508 + }, + { + "acc": 0.65585279, + "epoch": 1.034880771182141, + "grad_norm": 4.71875, + "learning_rate": 5.124982923425947e-06, + "loss": 1.58482971, + "memory(GiB)": 117.38, + "step": 40795, + "train_speed(iter/s)": 1.638527 + }, + { + "acc": 0.65393801, + "epoch": 1.035007610350076, + "grad_norm": 4.875, + "learning_rate": 5.123934624662139e-06, + "loss": 1.61756001, + "memory(GiB)": 117.38, + "step": 40800, + "train_speed(iter/s)": 1.638548 + }, + { + "acc": 0.638975, + "epoch": 1.0351344495180113, + "grad_norm": 7.75, + "learning_rate": 5.1228863204471335e-06, + "loss": 1.65982056, + "memory(GiB)": 117.38, + "step": 40805, + "train_speed(iter/s)": 1.63857 + }, + { + "acc": 0.66074066, + "epoch": 1.0352612886859462, + "grad_norm": 4.5, + "learning_rate": 5.121838010827039e-06, + "loss": 1.56035404, + "memory(GiB)": 117.38, + "step": 40810, + "train_speed(iter/s)": 1.638591 + }, + { + "acc": 0.6509923, + "epoch": 1.0353881278538812, + "grad_norm": 5.4375, + "learning_rate": 5.120789695847965e-06, + "loss": 1.61882, + "memory(GiB)": 117.38, + "step": 40815, + "train_speed(iter/s)": 1.638613 + }, + { + "acc": 0.65156822, + "epoch": 1.0355149670218164, + "grad_norm": 5.875, + "learning_rate": 5.119741375556021e-06, + "loss": 1.62566681, + "memory(GiB)": 117.38, + "step": 40820, + "train_speed(iter/s)": 1.638635 + }, + { + "acc": 0.68064942, + "epoch": 1.0356418061897514, + "grad_norm": 6.25, + "learning_rate": 5.118693049997316e-06, + "loss": 1.51892948, + "memory(GiB)": 117.38, + "step": 40825, + "train_speed(iter/s)": 1.638657 + }, + { + "acc": 0.64971361, + "epoch": 1.0357686453576864, + "grad_norm": 6.4375, + "learning_rate": 5.117644719217961e-06, + "loss": 1.613204, + "memory(GiB)": 117.38, + "step": 40830, + "train_speed(iter/s)": 1.638679 + }, + { + "acc": 0.65265303, + "epoch": 1.0358954845256214, + "grad_norm": 6.09375, + "learning_rate": 5.116596383264066e-06, + "loss": 1.59452076, + "memory(GiB)": 117.38, + "step": 40835, + "train_speed(iter/s)": 1.6387 + }, + { + "acc": 0.64392495, + "epoch": 1.0360223236935566, + "grad_norm": 9.125, + "learning_rate": 5.115548042181742e-06, + "loss": 1.65839653, + "memory(GiB)": 117.38, + "step": 40840, + "train_speed(iter/s)": 1.638721 + }, + { + "acc": 0.66803961, + "epoch": 1.0361491628614916, + "grad_norm": 4.875, + "learning_rate": 5.114499696017098e-06, + "loss": 1.50407085, + "memory(GiB)": 117.38, + "step": 40845, + "train_speed(iter/s)": 1.638742 + }, + { + "acc": 0.65188293, + "epoch": 1.0362760020294266, + "grad_norm": 5.21875, + "learning_rate": 5.1134513448162475e-06, + "loss": 1.66241341, + "memory(GiB)": 117.38, + "step": 40850, + "train_speed(iter/s)": 1.638764 + }, + { + "acc": 0.6425087, + "epoch": 1.0364028411973618, + "grad_norm": 5.3125, + "learning_rate": 5.112402988625299e-06, + "loss": 1.62109718, + "memory(GiB)": 117.38, + "step": 40855, + "train_speed(iter/s)": 1.638786 + }, + { + "acc": 0.66601357, + "epoch": 1.0365296803652968, + "grad_norm": 6.21875, + "learning_rate": 5.111354627490367e-06, + "loss": 1.54996109, + "memory(GiB)": 117.38, + "step": 40860, + "train_speed(iter/s)": 1.638806 + }, + { + "acc": 0.64003944, + "epoch": 1.0366565195332318, + "grad_norm": 5.46875, + "learning_rate": 5.110306261457559e-06, + "loss": 1.66370449, + "memory(GiB)": 117.38, + "step": 40865, + "train_speed(iter/s)": 1.638829 + }, + { + "acc": 0.65367227, + "epoch": 1.036783358701167, + "grad_norm": 5.03125, + "learning_rate": 5.109257890572991e-06, + "loss": 1.66479588, + "memory(GiB)": 117.38, + "step": 40870, + "train_speed(iter/s)": 1.638851 + }, + { + "acc": 0.65728073, + "epoch": 1.036910197869102, + "grad_norm": 6.71875, + "learning_rate": 5.108209514882772e-06, + "loss": 1.58111229, + "memory(GiB)": 117.38, + "step": 40875, + "train_speed(iter/s)": 1.638873 + }, + { + "acc": 0.64618697, + "epoch": 1.037037037037037, + "grad_norm": 6.59375, + "learning_rate": 5.107161134433017e-06, + "loss": 1.64139786, + "memory(GiB)": 117.38, + "step": 40880, + "train_speed(iter/s)": 1.638893 + }, + { + "acc": 0.65989714, + "epoch": 1.0371638762049722, + "grad_norm": 6.84375, + "learning_rate": 5.106112749269835e-06, + "loss": 1.56313238, + "memory(GiB)": 117.38, + "step": 40885, + "train_speed(iter/s)": 1.638915 + }, + { + "acc": 0.65228424, + "epoch": 1.0372907153729072, + "grad_norm": 10.0, + "learning_rate": 5.105064359439341e-06, + "loss": 1.64385548, + "memory(GiB)": 117.38, + "step": 40890, + "train_speed(iter/s)": 1.638936 + }, + { + "acc": 0.6694046, + "epoch": 1.0374175545408422, + "grad_norm": 5.9375, + "learning_rate": 5.1040159649876485e-06, + "loss": 1.50057182, + "memory(GiB)": 117.38, + "step": 40895, + "train_speed(iter/s)": 1.638957 + }, + { + "acc": 0.64544067, + "epoch": 1.0375443937087772, + "grad_norm": 6.5625, + "learning_rate": 5.102967565960868e-06, + "loss": 1.65781517, + "memory(GiB)": 117.38, + "step": 40900, + "train_speed(iter/s)": 1.638977 + }, + { + "acc": 0.65514412, + "epoch": 1.0376712328767124, + "grad_norm": 5.59375, + "learning_rate": 5.101919162405116e-06, + "loss": 1.61020088, + "memory(GiB)": 117.38, + "step": 40905, + "train_speed(iter/s)": 1.638997 + }, + { + "acc": 0.66149073, + "epoch": 1.0377980720446474, + "grad_norm": 5.6875, + "learning_rate": 5.100870754366503e-06, + "loss": 1.56553469, + "memory(GiB)": 117.38, + "step": 40910, + "train_speed(iter/s)": 1.639019 + }, + { + "acc": 0.67709308, + "epoch": 1.0379249112125823, + "grad_norm": 6.6875, + "learning_rate": 5.099822341891144e-06, + "loss": 1.50459242, + "memory(GiB)": 117.38, + "step": 40915, + "train_speed(iter/s)": 1.639041 + }, + { + "acc": 0.64800062, + "epoch": 1.0380517503805176, + "grad_norm": 5.875, + "learning_rate": 5.098773925025152e-06, + "loss": 1.61434307, + "memory(GiB)": 117.38, + "step": 40920, + "train_speed(iter/s)": 1.639063 + }, + { + "acc": 0.63802557, + "epoch": 1.0381785895484525, + "grad_norm": 5.84375, + "learning_rate": 5.097725503814643e-06, + "loss": 1.68307896, + "memory(GiB)": 117.38, + "step": 40925, + "train_speed(iter/s)": 1.639084 + }, + { + "acc": 0.65785446, + "epoch": 1.0383054287163875, + "grad_norm": 4.90625, + "learning_rate": 5.09667707830573e-06, + "loss": 1.56549129, + "memory(GiB)": 117.38, + "step": 40930, + "train_speed(iter/s)": 1.639107 + }, + { + "acc": 0.67706718, + "epoch": 1.0384322678843227, + "grad_norm": 6.3125, + "learning_rate": 5.095628648544526e-06, + "loss": 1.47671614, + "memory(GiB)": 117.38, + "step": 40935, + "train_speed(iter/s)": 1.639128 + }, + { + "acc": 0.65696793, + "epoch": 1.0385591070522577, + "grad_norm": 5.65625, + "learning_rate": 5.0945802145771495e-06, + "loss": 1.56331949, + "memory(GiB)": 117.38, + "step": 40940, + "train_speed(iter/s)": 1.63915 + }, + { + "acc": 0.66067467, + "epoch": 1.0386859462201927, + "grad_norm": 5.5, + "learning_rate": 5.093531776449711e-06, + "loss": 1.54960203, + "memory(GiB)": 117.38, + "step": 40945, + "train_speed(iter/s)": 1.63917 + }, + { + "acc": 0.65931311, + "epoch": 1.038812785388128, + "grad_norm": 5.875, + "learning_rate": 5.092483334208327e-06, + "loss": 1.60602036, + "memory(GiB)": 117.38, + "step": 40950, + "train_speed(iter/s)": 1.639191 + }, + { + "acc": 0.64333758, + "epoch": 1.038939624556063, + "grad_norm": 6.34375, + "learning_rate": 5.091434887899114e-06, + "loss": 1.63052254, + "memory(GiB)": 117.38, + "step": 40955, + "train_speed(iter/s)": 1.639212 + }, + { + "acc": 0.65728874, + "epoch": 1.039066463723998, + "grad_norm": 6.03125, + "learning_rate": 5.0903864375681866e-06, + "loss": 1.6105011, + "memory(GiB)": 117.38, + "step": 40960, + "train_speed(iter/s)": 1.639233 + }, + { + "acc": 0.66116171, + "epoch": 1.0391933028919331, + "grad_norm": 5.46875, + "learning_rate": 5.0893379832616594e-06, + "loss": 1.5755868, + "memory(GiB)": 117.38, + "step": 40965, + "train_speed(iter/s)": 1.639252 + }, + { + "acc": 0.66144433, + "epoch": 1.0393201420598681, + "grad_norm": 5.875, + "learning_rate": 5.08828952502565e-06, + "loss": 1.57303867, + "memory(GiB)": 117.38, + "step": 40970, + "train_speed(iter/s)": 1.639274 + }, + { + "acc": 0.65965929, + "epoch": 1.039446981227803, + "grad_norm": 5.03125, + "learning_rate": 5.087241062906272e-06, + "loss": 1.60068188, + "memory(GiB)": 117.38, + "step": 40975, + "train_speed(iter/s)": 1.639295 + }, + { + "acc": 0.63852367, + "epoch": 1.0395738203957383, + "grad_norm": 4.96875, + "learning_rate": 5.086192596949643e-06, + "loss": 1.64146194, + "memory(GiB)": 117.38, + "step": 40980, + "train_speed(iter/s)": 1.639316 + }, + { + "acc": 0.66799808, + "epoch": 1.0397006595636733, + "grad_norm": 5.53125, + "learning_rate": 5.085144127201879e-06, + "loss": 1.55656929, + "memory(GiB)": 117.38, + "step": 40985, + "train_speed(iter/s)": 1.639338 + }, + { + "acc": 0.66920452, + "epoch": 1.0398274987316083, + "grad_norm": 6.84375, + "learning_rate": 5.084095653709096e-06, + "loss": 1.48647404, + "memory(GiB)": 117.38, + "step": 40990, + "train_speed(iter/s)": 1.639358 + }, + { + "acc": 0.676647, + "epoch": 1.0399543378995433, + "grad_norm": 5.8125, + "learning_rate": 5.0830471765174096e-06, + "loss": 1.5691246, + "memory(GiB)": 117.38, + "step": 40995, + "train_speed(iter/s)": 1.639379 + }, + { + "acc": 0.65418453, + "epoch": 1.0400811770674785, + "grad_norm": 6.59375, + "learning_rate": 5.0819986956729395e-06, + "loss": 1.62014542, + "memory(GiB)": 117.38, + "step": 41000, + "train_speed(iter/s)": 1.639401 + }, + { + "epoch": 1.0400811770674785, + "eval_acc": 0.646207949346047, + "eval_loss": 1.5738683938980103, + "eval_runtime": 58.4919, + "eval_samples_per_second": 108.904, + "eval_steps_per_second": 27.235, + "step": 41000 + }, + { + "acc": 0.67275405, + "epoch": 1.0402080162354135, + "grad_norm": 7.375, + "learning_rate": 5.080950211221799e-06, + "loss": 1.5470417, + "memory(GiB)": 117.38, + "step": 41005, + "train_speed(iter/s)": 1.635319 + }, + { + "acc": 0.65025458, + "epoch": 1.0403348554033485, + "grad_norm": 5.90625, + "learning_rate": 5.079901723210109e-06, + "loss": 1.574189, + "memory(GiB)": 117.38, + "step": 41010, + "train_speed(iter/s)": 1.635339 + }, + { + "acc": 0.65324278, + "epoch": 1.0404616945712837, + "grad_norm": 5.875, + "learning_rate": 5.078853231683981e-06, + "loss": 1.58726282, + "memory(GiB)": 117.38, + "step": 41015, + "train_speed(iter/s)": 1.635358 + }, + { + "acc": 0.65567102, + "epoch": 1.0405885337392187, + "grad_norm": 5.59375, + "learning_rate": 5.077804736689539e-06, + "loss": 1.64108505, + "memory(GiB)": 117.38, + "step": 41020, + "train_speed(iter/s)": 1.635379 + }, + { + "acc": 0.66853514, + "epoch": 1.0407153729071537, + "grad_norm": 6.15625, + "learning_rate": 5.0767562382728955e-06, + "loss": 1.54305401, + "memory(GiB)": 117.38, + "step": 41025, + "train_speed(iter/s)": 1.635399 + }, + { + "acc": 0.64062362, + "epoch": 1.0408422120750889, + "grad_norm": 6.84375, + "learning_rate": 5.075707736480171e-06, + "loss": 1.70396004, + "memory(GiB)": 117.38, + "step": 41030, + "train_speed(iter/s)": 1.635419 + }, + { + "acc": 0.65617466, + "epoch": 1.0409690512430239, + "grad_norm": 5.75, + "learning_rate": 5.074659231357482e-06, + "loss": 1.58250599, + "memory(GiB)": 117.38, + "step": 41035, + "train_speed(iter/s)": 1.635438 + }, + { + "acc": 0.65760169, + "epoch": 1.0410958904109588, + "grad_norm": 4.5625, + "learning_rate": 5.073610722950947e-06, + "loss": 1.60189304, + "memory(GiB)": 117.38, + "step": 41040, + "train_speed(iter/s)": 1.635458 + }, + { + "acc": 0.64942598, + "epoch": 1.041222729578894, + "grad_norm": 7.3125, + "learning_rate": 5.072562211306683e-06, + "loss": 1.58427477, + "memory(GiB)": 117.38, + "step": 41045, + "train_speed(iter/s)": 1.635477 + }, + { + "acc": 0.64224863, + "epoch": 1.041349568746829, + "grad_norm": 6.4375, + "learning_rate": 5.071513696470809e-06, + "loss": 1.65665932, + "memory(GiB)": 117.38, + "step": 41050, + "train_speed(iter/s)": 1.635495 + }, + { + "acc": 0.66928444, + "epoch": 1.041476407914764, + "grad_norm": 7.15625, + "learning_rate": 5.070465178489443e-06, + "loss": 1.58906279, + "memory(GiB)": 117.38, + "step": 41055, + "train_speed(iter/s)": 1.635515 + }, + { + "acc": 0.66286411, + "epoch": 1.041603247082699, + "grad_norm": 5.375, + "learning_rate": 5.069416657408704e-06, + "loss": 1.57125578, + "memory(GiB)": 117.38, + "step": 41060, + "train_speed(iter/s)": 1.635533 + }, + { + "acc": 0.65803609, + "epoch": 1.0417300862506342, + "grad_norm": 4.78125, + "learning_rate": 5.0683681332747105e-06, + "loss": 1.64628754, + "memory(GiB)": 117.38, + "step": 41065, + "train_speed(iter/s)": 1.635552 + }, + { + "acc": 0.658601, + "epoch": 1.0418569254185692, + "grad_norm": 5.0625, + "learning_rate": 5.067319606133583e-06, + "loss": 1.63151436, + "memory(GiB)": 117.38, + "step": 41070, + "train_speed(iter/s)": 1.635572 + }, + { + "acc": 0.67268991, + "epoch": 1.0419837645865042, + "grad_norm": 6.625, + "learning_rate": 5.066271076031436e-06, + "loss": 1.55479927, + "memory(GiB)": 117.38, + "step": 41075, + "train_speed(iter/s)": 1.635592 + }, + { + "acc": 0.65339289, + "epoch": 1.0421106037544394, + "grad_norm": 6.125, + "learning_rate": 5.065222543014394e-06, + "loss": 1.62807522, + "memory(GiB)": 117.38, + "step": 41080, + "train_speed(iter/s)": 1.63561 + }, + { + "acc": 0.652844, + "epoch": 1.0422374429223744, + "grad_norm": 7.78125, + "learning_rate": 5.06417400712857e-06, + "loss": 1.62595863, + "memory(GiB)": 117.38, + "step": 41085, + "train_speed(iter/s)": 1.63563 + }, + { + "acc": 0.66335192, + "epoch": 1.0423642820903094, + "grad_norm": 5.40625, + "learning_rate": 5.0631254684200906e-06, + "loss": 1.53417234, + "memory(GiB)": 117.38, + "step": 41090, + "train_speed(iter/s)": 1.63565 + }, + { + "acc": 0.66178837, + "epoch": 1.0424911212582446, + "grad_norm": 5.75, + "learning_rate": 5.062076926935068e-06, + "loss": 1.59790726, + "memory(GiB)": 117.38, + "step": 41095, + "train_speed(iter/s)": 1.635672 + }, + { + "acc": 0.66018434, + "epoch": 1.0426179604261796, + "grad_norm": 6.15625, + "learning_rate": 5.061028382719626e-06, + "loss": 1.54263077, + "memory(GiB)": 117.38, + "step": 41100, + "train_speed(iter/s)": 1.635693 + }, + { + "acc": 0.66236649, + "epoch": 1.0427447995941146, + "grad_norm": 5.875, + "learning_rate": 5.0599798358198835e-06, + "loss": 1.57164869, + "memory(GiB)": 117.38, + "step": 41105, + "train_speed(iter/s)": 1.635714 + }, + { + "acc": 0.65376139, + "epoch": 1.0428716387620498, + "grad_norm": 5.625, + "learning_rate": 5.0589312862819605e-06, + "loss": 1.60471878, + "memory(GiB)": 117.38, + "step": 41110, + "train_speed(iter/s)": 1.635734 + }, + { + "acc": 0.66896567, + "epoch": 1.0429984779299848, + "grad_norm": 5.28125, + "learning_rate": 5.057882734151977e-06, + "loss": 1.52460117, + "memory(GiB)": 117.38, + "step": 41115, + "train_speed(iter/s)": 1.635754 + }, + { + "acc": 0.65420294, + "epoch": 1.0431253170979198, + "grad_norm": 5.625, + "learning_rate": 5.05683417947605e-06, + "loss": 1.60587082, + "memory(GiB)": 117.38, + "step": 41120, + "train_speed(iter/s)": 1.635775 + }, + { + "acc": 0.65285158, + "epoch": 1.043252156265855, + "grad_norm": 6.28125, + "learning_rate": 5.055785622300303e-06, + "loss": 1.61026115, + "memory(GiB)": 117.38, + "step": 41125, + "train_speed(iter/s)": 1.635796 + }, + { + "acc": 0.65437546, + "epoch": 1.04337899543379, + "grad_norm": 4.90625, + "learning_rate": 5.054737062670857e-06, + "loss": 1.62572021, + "memory(GiB)": 117.38, + "step": 41130, + "train_speed(iter/s)": 1.635816 + }, + { + "acc": 0.65306063, + "epoch": 1.043505834601725, + "grad_norm": 5.53125, + "learning_rate": 5.053688500633828e-06, + "loss": 1.57150707, + "memory(GiB)": 117.38, + "step": 41135, + "train_speed(iter/s)": 1.635836 + }, + { + "acc": 0.66121545, + "epoch": 1.0436326737696602, + "grad_norm": 6.28125, + "learning_rate": 5.052639936235341e-06, + "loss": 1.55277061, + "memory(GiB)": 117.38, + "step": 41140, + "train_speed(iter/s)": 1.63585 + }, + { + "acc": 0.66214256, + "epoch": 1.0437595129375952, + "grad_norm": 7.0625, + "learning_rate": 5.051591369521513e-06, + "loss": 1.6219347, + "memory(GiB)": 117.38, + "step": 41145, + "train_speed(iter/s)": 1.635871 + }, + { + "acc": 0.64794827, + "epoch": 1.0438863521055302, + "grad_norm": 5.96875, + "learning_rate": 5.050542800538469e-06, + "loss": 1.58073626, + "memory(GiB)": 117.38, + "step": 41150, + "train_speed(iter/s)": 1.635893 + }, + { + "acc": 0.66172285, + "epoch": 1.0440131912734651, + "grad_norm": 5.71875, + "learning_rate": 5.049494229332324e-06, + "loss": 1.63676605, + "memory(GiB)": 117.38, + "step": 41155, + "train_speed(iter/s)": 1.635915 + }, + { + "acc": 0.66545863, + "epoch": 1.0441400304414004, + "grad_norm": 5.21875, + "learning_rate": 5.048445655949204e-06, + "loss": 1.60956802, + "memory(GiB)": 117.38, + "step": 41160, + "train_speed(iter/s)": 1.635935 + }, + { + "acc": 0.64485331, + "epoch": 1.0442668696093353, + "grad_norm": 5.78125, + "learning_rate": 5.047397080435225e-06, + "loss": 1.69102707, + "memory(GiB)": 117.38, + "step": 41165, + "train_speed(iter/s)": 1.635956 + }, + { + "acc": 0.65639114, + "epoch": 1.0443937087772703, + "grad_norm": 5.46875, + "learning_rate": 5.046348502836512e-06, + "loss": 1.65284081, + "memory(GiB)": 117.38, + "step": 41170, + "train_speed(iter/s)": 1.635978 + }, + { + "acc": 0.64845624, + "epoch": 1.0445205479452055, + "grad_norm": 5.1875, + "learning_rate": 5.045299923199186e-06, + "loss": 1.61099606, + "memory(GiB)": 117.38, + "step": 41175, + "train_speed(iter/s)": 1.635998 + }, + { + "acc": 0.65766449, + "epoch": 1.0446473871131405, + "grad_norm": 5.0, + "learning_rate": 5.044251341569366e-06, + "loss": 1.57406473, + "memory(GiB)": 117.38, + "step": 41180, + "train_speed(iter/s)": 1.636019 + }, + { + "acc": 0.63691139, + "epoch": 1.0447742262810755, + "grad_norm": 7.0625, + "learning_rate": 5.043202757993175e-06, + "loss": 1.70735168, + "memory(GiB)": 117.38, + "step": 41185, + "train_speed(iter/s)": 1.636038 + }, + { + "acc": 0.64051337, + "epoch": 1.0449010654490107, + "grad_norm": 7.75, + "learning_rate": 5.042154172516734e-06, + "loss": 1.65134926, + "memory(GiB)": 117.38, + "step": 41190, + "train_speed(iter/s)": 1.636059 + }, + { + "acc": 0.66136446, + "epoch": 1.0450279046169457, + "grad_norm": 5.5, + "learning_rate": 5.041105585186164e-06, + "loss": 1.59882412, + "memory(GiB)": 117.38, + "step": 41195, + "train_speed(iter/s)": 1.636079 + }, + { + "acc": 0.67160959, + "epoch": 1.0451547437848807, + "grad_norm": 4.9375, + "learning_rate": 5.040056996047587e-06, + "loss": 1.6220026, + "memory(GiB)": 117.38, + "step": 41200, + "train_speed(iter/s)": 1.6361 + }, + { + "acc": 0.66530619, + "epoch": 1.045281582952816, + "grad_norm": 5.59375, + "learning_rate": 5.039008405147125e-06, + "loss": 1.54878445, + "memory(GiB)": 117.38, + "step": 41205, + "train_speed(iter/s)": 1.63612 + }, + { + "acc": 0.64306927, + "epoch": 1.045408422120751, + "grad_norm": 6.53125, + "learning_rate": 5.0379598125308984e-06, + "loss": 1.64910774, + "memory(GiB)": 117.38, + "step": 41210, + "train_speed(iter/s)": 1.636141 + }, + { + "acc": 0.64517002, + "epoch": 1.045535261288686, + "grad_norm": 6.3125, + "learning_rate": 5.036911218245029e-06, + "loss": 1.68365669, + "memory(GiB)": 117.38, + "step": 41215, + "train_speed(iter/s)": 1.636162 + }, + { + "acc": 0.65690842, + "epoch": 1.045662100456621, + "grad_norm": 6.03125, + "learning_rate": 5.035862622335641e-06, + "loss": 1.59201756, + "memory(GiB)": 117.38, + "step": 41220, + "train_speed(iter/s)": 1.636181 + }, + { + "acc": 0.66846271, + "epoch": 1.045788939624556, + "grad_norm": 5.34375, + "learning_rate": 5.034814024848853e-06, + "loss": 1.55792236, + "memory(GiB)": 117.38, + "step": 41225, + "train_speed(iter/s)": 1.636202 + }, + { + "acc": 0.66630754, + "epoch": 1.045915778792491, + "grad_norm": 5.1875, + "learning_rate": 5.033765425830791e-06, + "loss": 1.5664361, + "memory(GiB)": 117.38, + "step": 41230, + "train_speed(iter/s)": 1.636223 + }, + { + "acc": 0.66141944, + "epoch": 1.046042617960426, + "grad_norm": 7.3125, + "learning_rate": 5.032716825327573e-06, + "loss": 1.5978425, + "memory(GiB)": 117.38, + "step": 41235, + "train_speed(iter/s)": 1.636243 + }, + { + "acc": 0.65883985, + "epoch": 1.0461694571283613, + "grad_norm": 6.375, + "learning_rate": 5.031668223385323e-06, + "loss": 1.58060741, + "memory(GiB)": 117.38, + "step": 41240, + "train_speed(iter/s)": 1.636265 + }, + { + "acc": 0.64730096, + "epoch": 1.0462962962962963, + "grad_norm": 6.03125, + "learning_rate": 5.030619620050163e-06, + "loss": 1.61114693, + "memory(GiB)": 117.38, + "step": 41245, + "train_speed(iter/s)": 1.636287 + }, + { + "acc": 0.6748704, + "epoch": 1.0464231354642313, + "grad_norm": 5.96875, + "learning_rate": 5.029571015368217e-06, + "loss": 1.47259388, + "memory(GiB)": 117.38, + "step": 41250, + "train_speed(iter/s)": 1.636307 + }, + { + "acc": 0.64501467, + "epoch": 1.0465499746321665, + "grad_norm": 5.40625, + "learning_rate": 5.028522409385605e-06, + "loss": 1.59153805, + "memory(GiB)": 117.38, + "step": 41255, + "train_speed(iter/s)": 1.636329 + }, + { + "acc": 0.66175165, + "epoch": 1.0466768138001015, + "grad_norm": 5.03125, + "learning_rate": 5.0274738021484495e-06, + "loss": 1.52612638, + "memory(GiB)": 117.38, + "step": 41260, + "train_speed(iter/s)": 1.636351 + }, + { + "acc": 0.64561868, + "epoch": 1.0468036529680365, + "grad_norm": 5.96875, + "learning_rate": 5.026425193702874e-06, + "loss": 1.63842659, + "memory(GiB)": 117.38, + "step": 41265, + "train_speed(iter/s)": 1.636372 + }, + { + "acc": 0.68379211, + "epoch": 1.0469304921359717, + "grad_norm": 5.53125, + "learning_rate": 5.025376584095001e-06, + "loss": 1.4420042, + "memory(GiB)": 117.38, + "step": 41270, + "train_speed(iter/s)": 1.636395 + }, + { + "acc": 0.66751194, + "epoch": 1.0470573313039067, + "grad_norm": 6.375, + "learning_rate": 5.024327973370951e-06, + "loss": 1.48990412, + "memory(GiB)": 117.38, + "step": 41275, + "train_speed(iter/s)": 1.636417 + }, + { + "acc": 0.64170256, + "epoch": 1.0471841704718416, + "grad_norm": 6.78125, + "learning_rate": 5.02327936157685e-06, + "loss": 1.65868225, + "memory(GiB)": 117.38, + "step": 41280, + "train_speed(iter/s)": 1.636439 + }, + { + "acc": 0.65187325, + "epoch": 1.0473110096397769, + "grad_norm": 5.78125, + "learning_rate": 5.022230748758816e-06, + "loss": 1.55477715, + "memory(GiB)": 117.38, + "step": 41285, + "train_speed(iter/s)": 1.636458 + }, + { + "acc": 0.66740675, + "epoch": 1.0474378488077118, + "grad_norm": 6.53125, + "learning_rate": 5.021182134962978e-06, + "loss": 1.59187527, + "memory(GiB)": 117.38, + "step": 41290, + "train_speed(iter/s)": 1.636478 + }, + { + "acc": 0.66480932, + "epoch": 1.0475646879756468, + "grad_norm": 4.65625, + "learning_rate": 5.020133520235453e-06, + "loss": 1.59848013, + "memory(GiB)": 117.38, + "step": 41295, + "train_speed(iter/s)": 1.636498 + }, + { + "acc": 0.64622912, + "epoch": 1.047691527143582, + "grad_norm": 5.28125, + "learning_rate": 5.019084904622367e-06, + "loss": 1.6313488, + "memory(GiB)": 117.38, + "step": 41300, + "train_speed(iter/s)": 1.636518 + }, + { + "acc": 0.65072818, + "epoch": 1.047818366311517, + "grad_norm": 4.53125, + "learning_rate": 5.01803628816984e-06, + "loss": 1.59510641, + "memory(GiB)": 117.38, + "step": 41305, + "train_speed(iter/s)": 1.636539 + }, + { + "acc": 0.65696106, + "epoch": 1.047945205479452, + "grad_norm": 6.625, + "learning_rate": 5.016987670923998e-06, + "loss": 1.53928385, + "memory(GiB)": 117.38, + "step": 41310, + "train_speed(iter/s)": 1.63656 + }, + { + "acc": 0.66856894, + "epoch": 1.048072044647387, + "grad_norm": 5.90625, + "learning_rate": 5.0159390529309615e-06, + "loss": 1.48899651, + "memory(GiB)": 117.38, + "step": 41315, + "train_speed(iter/s)": 1.636582 + }, + { + "acc": 0.65610499, + "epoch": 1.0481988838153222, + "grad_norm": 6.6875, + "learning_rate": 5.014890434236854e-06, + "loss": 1.56075859, + "memory(GiB)": 117.38, + "step": 41320, + "train_speed(iter/s)": 1.636602 + }, + { + "acc": 0.63999534, + "epoch": 1.0483257229832572, + "grad_norm": 7.21875, + "learning_rate": 5.0138418148878e-06, + "loss": 1.68905296, + "memory(GiB)": 117.38, + "step": 41325, + "train_speed(iter/s)": 1.636622 + }, + { + "acc": 0.66330452, + "epoch": 1.0484525621511922, + "grad_norm": 6.96875, + "learning_rate": 5.01279319492992e-06, + "loss": 1.60673504, + "memory(GiB)": 117.38, + "step": 41330, + "train_speed(iter/s)": 1.636644 + }, + { + "acc": 0.64284172, + "epoch": 1.0485794013191274, + "grad_norm": 5.40625, + "learning_rate": 5.01174457440934e-06, + "loss": 1.60006599, + "memory(GiB)": 117.38, + "step": 41335, + "train_speed(iter/s)": 1.636665 + }, + { + "acc": 0.67079792, + "epoch": 1.0487062404870624, + "grad_norm": 5.09375, + "learning_rate": 5.010695953372179e-06, + "loss": 1.55845718, + "memory(GiB)": 117.38, + "step": 41340, + "train_speed(iter/s)": 1.636686 + }, + { + "acc": 0.66199512, + "epoch": 1.0488330796549974, + "grad_norm": 5.8125, + "learning_rate": 5.009647331864563e-06, + "loss": 1.55456257, + "memory(GiB)": 117.38, + "step": 41345, + "train_speed(iter/s)": 1.636707 + }, + { + "acc": 0.66360178, + "epoch": 1.0489599188229326, + "grad_norm": 5.875, + "learning_rate": 5.008598709932615e-06, + "loss": 1.60426788, + "memory(GiB)": 117.38, + "step": 41350, + "train_speed(iter/s)": 1.636728 + }, + { + "acc": 0.65038514, + "epoch": 1.0490867579908676, + "grad_norm": 5.53125, + "learning_rate": 5.007550087622456e-06, + "loss": 1.64997749, + "memory(GiB)": 117.38, + "step": 41355, + "train_speed(iter/s)": 1.636748 + }, + { + "acc": 0.66603189, + "epoch": 1.0492135971588026, + "grad_norm": 6.90625, + "learning_rate": 5.0065014649802124e-06, + "loss": 1.55911579, + "memory(GiB)": 117.38, + "step": 41360, + "train_speed(iter/s)": 1.636771 + }, + { + "acc": 0.65281935, + "epoch": 1.0493404363267378, + "grad_norm": 5.1875, + "learning_rate": 5.005452842052003e-06, + "loss": 1.62591591, + "memory(GiB)": 117.38, + "step": 41365, + "train_speed(iter/s)": 1.636791 + }, + { + "acc": 0.65610533, + "epoch": 1.0494672754946728, + "grad_norm": 5.59375, + "learning_rate": 5.004404218883955e-06, + "loss": 1.61340828, + "memory(GiB)": 117.38, + "step": 41370, + "train_speed(iter/s)": 1.636812 + }, + { + "acc": 0.67170458, + "epoch": 1.0495941146626078, + "grad_norm": 6.375, + "learning_rate": 5.0033555955221875e-06, + "loss": 1.58470221, + "memory(GiB)": 117.38, + "step": 41375, + "train_speed(iter/s)": 1.636833 + }, + { + "acc": 0.66129222, + "epoch": 1.0497209538305428, + "grad_norm": 6.0, + "learning_rate": 5.002306972012829e-06, + "loss": 1.58265219, + "memory(GiB)": 117.38, + "step": 41380, + "train_speed(iter/s)": 1.636853 + }, + { + "acc": 0.65790405, + "epoch": 1.049847792998478, + "grad_norm": 4.875, + "learning_rate": 5.001258348401998e-06, + "loss": 1.59193287, + "memory(GiB)": 117.38, + "step": 41385, + "train_speed(iter/s)": 1.636873 + }, + { + "acc": 0.66113191, + "epoch": 1.049974632166413, + "grad_norm": 5.78125, + "learning_rate": 5.000209724735819e-06, + "loss": 1.61167622, + "memory(GiB)": 117.38, + "step": 41390, + "train_speed(iter/s)": 1.636893 + }, + { + "acc": 0.65508175, + "epoch": 1.050101471334348, + "grad_norm": 6.09375, + "learning_rate": 4.999161101060416e-06, + "loss": 1.62819176, + "memory(GiB)": 117.38, + "step": 41395, + "train_speed(iter/s)": 1.636915 + }, + { + "acc": 0.64078255, + "epoch": 1.0502283105022832, + "grad_norm": 6.78125, + "learning_rate": 4.99811247742191e-06, + "loss": 1.67368488, + "memory(GiB)": 117.38, + "step": 41400, + "train_speed(iter/s)": 1.636936 + }, + { + "acc": 0.65263281, + "epoch": 1.0503551496702181, + "grad_norm": 6.25, + "learning_rate": 4.9970638538664275e-06, + "loss": 1.61442719, + "memory(GiB)": 117.38, + "step": 41405, + "train_speed(iter/s)": 1.636955 + }, + { + "acc": 0.65208826, + "epoch": 1.0504819888381531, + "grad_norm": 4.84375, + "learning_rate": 4.996015230440091e-06, + "loss": 1.56453342, + "memory(GiB)": 117.38, + "step": 41410, + "train_speed(iter/s)": 1.636975 + }, + { + "acc": 0.66709089, + "epoch": 1.0506088280060883, + "grad_norm": 5.75, + "learning_rate": 4.99496660718902e-06, + "loss": 1.5060358, + "memory(GiB)": 117.38, + "step": 41415, + "train_speed(iter/s)": 1.636996 + }, + { + "acc": 0.65774541, + "epoch": 1.0507356671740233, + "grad_norm": 5.0625, + "learning_rate": 4.99391798415934e-06, + "loss": 1.5928319, + "memory(GiB)": 117.38, + "step": 41420, + "train_speed(iter/s)": 1.637016 + }, + { + "acc": 0.65854659, + "epoch": 1.0508625063419583, + "grad_norm": 6.0625, + "learning_rate": 4.992869361397175e-06, + "loss": 1.5839098, + "memory(GiB)": 117.38, + "step": 41425, + "train_speed(iter/s)": 1.637038 + }, + { + "acc": 0.65281224, + "epoch": 1.0509893455098935, + "grad_norm": 5.4375, + "learning_rate": 4.991820738948649e-06, + "loss": 1.5649519, + "memory(GiB)": 117.38, + "step": 41430, + "train_speed(iter/s)": 1.637058 + }, + { + "acc": 0.64837828, + "epoch": 1.0511161846778285, + "grad_norm": 6.65625, + "learning_rate": 4.9907721168598805e-06, + "loss": 1.6430418, + "memory(GiB)": 117.38, + "step": 41435, + "train_speed(iter/s)": 1.637078 + }, + { + "acc": 0.67129793, + "epoch": 1.0512430238457635, + "grad_norm": 5.0625, + "learning_rate": 4.989723495176997e-06, + "loss": 1.54342794, + "memory(GiB)": 117.38, + "step": 41440, + "train_speed(iter/s)": 1.637097 + }, + { + "acc": 0.67158484, + "epoch": 1.0513698630136987, + "grad_norm": 5.34375, + "learning_rate": 4.988674873946118e-06, + "loss": 1.53408966, + "memory(GiB)": 117.38, + "step": 41445, + "train_speed(iter/s)": 1.637119 + }, + { + "acc": 0.66758113, + "epoch": 1.0514967021816337, + "grad_norm": 6.875, + "learning_rate": 4.987626253213373e-06, + "loss": 1.57656155, + "memory(GiB)": 117.38, + "step": 41450, + "train_speed(iter/s)": 1.637138 + }, + { + "acc": 0.66543889, + "epoch": 1.0516235413495687, + "grad_norm": 5.90625, + "learning_rate": 4.986577633024877e-06, + "loss": 1.59522905, + "memory(GiB)": 117.38, + "step": 41455, + "train_speed(iter/s)": 1.637158 + }, + { + "acc": 0.64872437, + "epoch": 1.051750380517504, + "grad_norm": 5.96875, + "learning_rate": 4.985529013426758e-06, + "loss": 1.61923275, + "memory(GiB)": 117.38, + "step": 41460, + "train_speed(iter/s)": 1.637179 + }, + { + "acc": 0.65970845, + "epoch": 1.051877219685439, + "grad_norm": 5.5, + "learning_rate": 4.984480394465136e-06, + "loss": 1.54783516, + "memory(GiB)": 117.38, + "step": 41465, + "train_speed(iter/s)": 1.637197 + }, + { + "acc": 0.65071931, + "epoch": 1.052004058853374, + "grad_norm": 4.5, + "learning_rate": 4.9834317761861385e-06, + "loss": 1.59526424, + "memory(GiB)": 117.38, + "step": 41470, + "train_speed(iter/s)": 1.637218 + }, + { + "acc": 0.64619188, + "epoch": 1.0521308980213089, + "grad_norm": 6.125, + "learning_rate": 4.982383158635884e-06, + "loss": 1.61843891, + "memory(GiB)": 117.38, + "step": 41475, + "train_speed(iter/s)": 1.637238 + }, + { + "acc": 0.65058489, + "epoch": 1.052257737189244, + "grad_norm": 6.0, + "learning_rate": 4.981334541860496e-06, + "loss": 1.62189579, + "memory(GiB)": 117.38, + "step": 41480, + "train_speed(iter/s)": 1.637258 + }, + { + "acc": 0.65288968, + "epoch": 1.052384576357179, + "grad_norm": 6.09375, + "learning_rate": 4.980285925906098e-06, + "loss": 1.64552898, + "memory(GiB)": 117.38, + "step": 41485, + "train_speed(iter/s)": 1.637278 + }, + { + "acc": 0.64913392, + "epoch": 1.052511415525114, + "grad_norm": 6.09375, + "learning_rate": 4.9792373108188155e-06, + "loss": 1.63377304, + "memory(GiB)": 117.38, + "step": 41490, + "train_speed(iter/s)": 1.6373 + }, + { + "acc": 0.66187911, + "epoch": 1.0526382546930493, + "grad_norm": 7.625, + "learning_rate": 4.978188696644767e-06, + "loss": 1.55747643, + "memory(GiB)": 117.38, + "step": 41495, + "train_speed(iter/s)": 1.637319 + }, + { + "acc": 0.6660954, + "epoch": 1.0527650938609843, + "grad_norm": 5.15625, + "learning_rate": 4.977140083430075e-06, + "loss": 1.54084549, + "memory(GiB)": 117.38, + "step": 41500, + "train_speed(iter/s)": 1.637339 + }, + { + "acc": 0.66575518, + "epoch": 1.0528919330289193, + "grad_norm": 4.46875, + "learning_rate": 4.976091471220867e-06, + "loss": 1.5681531, + "memory(GiB)": 117.38, + "step": 41505, + "train_speed(iter/s)": 1.637359 + }, + { + "acc": 0.65210896, + "epoch": 1.0530187721968545, + "grad_norm": 4.84375, + "learning_rate": 4.975042860063263e-06, + "loss": 1.61182365, + "memory(GiB)": 117.38, + "step": 41510, + "train_speed(iter/s)": 1.63738 + }, + { + "acc": 0.6524231, + "epoch": 1.0531456113647895, + "grad_norm": 5.4375, + "learning_rate": 4.973994250003384e-06, + "loss": 1.63421822, + "memory(GiB)": 117.38, + "step": 41515, + "train_speed(iter/s)": 1.6374 + }, + { + "acc": 0.64087076, + "epoch": 1.0532724505327244, + "grad_norm": 5.59375, + "learning_rate": 4.972945641087355e-06, + "loss": 1.69444199, + "memory(GiB)": 117.38, + "step": 41520, + "train_speed(iter/s)": 1.63742 + }, + { + "acc": 0.65947309, + "epoch": 1.0533992897006597, + "grad_norm": 5.21875, + "learning_rate": 4.9718970333612955e-06, + "loss": 1.5772234, + "memory(GiB)": 117.38, + "step": 41525, + "train_speed(iter/s)": 1.63744 + }, + { + "acc": 0.65320401, + "epoch": 1.0535261288685946, + "grad_norm": 6.0625, + "learning_rate": 4.970848426871333e-06, + "loss": 1.58386173, + "memory(GiB)": 117.38, + "step": 41530, + "train_speed(iter/s)": 1.637461 + }, + { + "acc": 0.64789886, + "epoch": 1.0536529680365296, + "grad_norm": 5.375, + "learning_rate": 4.9697998216635854e-06, + "loss": 1.64066391, + "memory(GiB)": 117.38, + "step": 41535, + "train_speed(iter/s)": 1.63748 + }, + { + "acc": 0.65942602, + "epoch": 1.0537798072044646, + "grad_norm": 5.0, + "learning_rate": 4.9687512177841765e-06, + "loss": 1.55954399, + "memory(GiB)": 117.38, + "step": 41540, + "train_speed(iter/s)": 1.637501 + }, + { + "acc": 0.64906888, + "epoch": 1.0539066463723998, + "grad_norm": 5.28125, + "learning_rate": 4.967702615279227e-06, + "loss": 1.62640915, + "memory(GiB)": 117.38, + "step": 41545, + "train_speed(iter/s)": 1.637519 + }, + { + "acc": 0.64940128, + "epoch": 1.0540334855403348, + "grad_norm": 5.5, + "learning_rate": 4.966654014194863e-06, + "loss": 1.65984116, + "memory(GiB)": 117.38, + "step": 41550, + "train_speed(iter/s)": 1.637539 + }, + { + "acc": 0.66228552, + "epoch": 1.0541603247082698, + "grad_norm": 5.0, + "learning_rate": 4.965605414577204e-06, + "loss": 1.50570974, + "memory(GiB)": 117.38, + "step": 41555, + "train_speed(iter/s)": 1.637557 + }, + { + "acc": 0.67069626, + "epoch": 1.054287163876205, + "grad_norm": 5.78125, + "learning_rate": 4.964556816472371e-06, + "loss": 1.5725421, + "memory(GiB)": 117.38, + "step": 41560, + "train_speed(iter/s)": 1.637577 + }, + { + "acc": 0.6603281, + "epoch": 1.05441400304414, + "grad_norm": 4.6875, + "learning_rate": 4.9635082199264874e-06, + "loss": 1.62031517, + "memory(GiB)": 117.38, + "step": 41565, + "train_speed(iter/s)": 1.637596 + }, + { + "acc": 0.66995239, + "epoch": 1.054540842212075, + "grad_norm": 5.3125, + "learning_rate": 4.962459624985677e-06, + "loss": 1.58806696, + "memory(GiB)": 117.38, + "step": 41570, + "train_speed(iter/s)": 1.637616 + }, + { + "acc": 0.65389962, + "epoch": 1.0546676813800102, + "grad_norm": 6.0625, + "learning_rate": 4.961411031696059e-06, + "loss": 1.63474693, + "memory(GiB)": 117.38, + "step": 41575, + "train_speed(iter/s)": 1.637635 + }, + { + "acc": 0.66805277, + "epoch": 1.0547945205479452, + "grad_norm": 6.53125, + "learning_rate": 4.960362440103756e-06, + "loss": 1.60070915, + "memory(GiB)": 117.38, + "step": 41580, + "train_speed(iter/s)": 1.637656 + }, + { + "acc": 0.67776632, + "epoch": 1.0549213597158802, + "grad_norm": 6.84375, + "learning_rate": 4.95931385025489e-06, + "loss": 1.51767006, + "memory(GiB)": 117.38, + "step": 41585, + "train_speed(iter/s)": 1.637677 + }, + { + "acc": 0.66229258, + "epoch": 1.0550481988838154, + "grad_norm": 5.375, + "learning_rate": 4.958265262195584e-06, + "loss": 1.58374205, + "memory(GiB)": 117.38, + "step": 41590, + "train_speed(iter/s)": 1.637698 + }, + { + "acc": 0.67434092, + "epoch": 1.0551750380517504, + "grad_norm": 5.84375, + "learning_rate": 4.957216675971955e-06, + "loss": 1.57769642, + "memory(GiB)": 117.38, + "step": 41595, + "train_speed(iter/s)": 1.637718 + }, + { + "acc": 0.65956807, + "epoch": 1.0553018772196854, + "grad_norm": 6.25, + "learning_rate": 4.9561680916301295e-06, + "loss": 1.65626869, + "memory(GiB)": 117.38, + "step": 41600, + "train_speed(iter/s)": 1.637737 + }, + { + "acc": 0.65970173, + "epoch": 1.0554287163876206, + "grad_norm": 5.90625, + "learning_rate": 4.955119509216226e-06, + "loss": 1.56891727, + "memory(GiB)": 117.38, + "step": 41605, + "train_speed(iter/s)": 1.637755 + }, + { + "acc": 0.66244111, + "epoch": 1.0555555555555556, + "grad_norm": 5.5625, + "learning_rate": 4.9540709287763685e-06, + "loss": 1.6115097, + "memory(GiB)": 117.38, + "step": 41610, + "train_speed(iter/s)": 1.637776 + }, + { + "acc": 0.65042858, + "epoch": 1.0556823947234906, + "grad_norm": 7.46875, + "learning_rate": 4.953022350356676e-06, + "loss": 1.6223484, + "memory(GiB)": 117.38, + "step": 41615, + "train_speed(iter/s)": 1.637797 + }, + { + "acc": 0.65889292, + "epoch": 1.0558092338914258, + "grad_norm": 7.25, + "learning_rate": 4.951973774003269e-06, + "loss": 1.61314983, + "memory(GiB)": 117.38, + "step": 41620, + "train_speed(iter/s)": 1.637818 + }, + { + "acc": 0.65645108, + "epoch": 1.0559360730593608, + "grad_norm": 6.4375, + "learning_rate": 4.950925199762271e-06, + "loss": 1.64923573, + "memory(GiB)": 117.38, + "step": 41625, + "train_speed(iter/s)": 1.637838 + }, + { + "acc": 0.65752707, + "epoch": 1.0560629122272958, + "grad_norm": 6.1875, + "learning_rate": 4.949876627679803e-06, + "loss": 1.61506424, + "memory(GiB)": 117.38, + "step": 41630, + "train_speed(iter/s)": 1.637857 + }, + { + "acc": 0.66689143, + "epoch": 1.0561897513952307, + "grad_norm": 5.75, + "learning_rate": 4.948828057801983e-06, + "loss": 1.57217751, + "memory(GiB)": 117.38, + "step": 41635, + "train_speed(iter/s)": 1.637877 + }, + { + "acc": 0.64063044, + "epoch": 1.056316590563166, + "grad_norm": 6.34375, + "learning_rate": 4.947779490174933e-06, + "loss": 1.64298973, + "memory(GiB)": 117.38, + "step": 41640, + "train_speed(iter/s)": 1.637898 + }, + { + "acc": 0.65305147, + "epoch": 1.056443429731101, + "grad_norm": 6.25, + "learning_rate": 4.946730924844775e-06, + "loss": 1.6481638, + "memory(GiB)": 117.38, + "step": 41645, + "train_speed(iter/s)": 1.637917 + }, + { + "acc": 0.64655704, + "epoch": 1.056570268899036, + "grad_norm": 5.71875, + "learning_rate": 4.945682361857631e-06, + "loss": 1.61778393, + "memory(GiB)": 117.38, + "step": 41650, + "train_speed(iter/s)": 1.637937 + }, + { + "acc": 0.66422472, + "epoch": 1.0566971080669711, + "grad_norm": 5.71875, + "learning_rate": 4.944633801259615e-06, + "loss": 1.58619061, + "memory(GiB)": 117.38, + "step": 41655, + "train_speed(iter/s)": 1.637954 + }, + { + "acc": 0.63881536, + "epoch": 1.0568239472349061, + "grad_norm": 7.65625, + "learning_rate": 4.943585243096854e-06, + "loss": 1.67109737, + "memory(GiB)": 117.38, + "step": 41660, + "train_speed(iter/s)": 1.637975 + }, + { + "acc": 0.66965723, + "epoch": 1.0569507864028411, + "grad_norm": 5.34375, + "learning_rate": 4.942536687415465e-06, + "loss": 1.53907242, + "memory(GiB)": 117.38, + "step": 41665, + "train_speed(iter/s)": 1.637996 + }, + { + "acc": 0.63948097, + "epoch": 1.0570776255707763, + "grad_norm": 6.0625, + "learning_rate": 4.941488134261571e-06, + "loss": 1.69211578, + "memory(GiB)": 117.38, + "step": 41670, + "train_speed(iter/s)": 1.638016 + }, + { + "acc": 0.66704359, + "epoch": 1.0572044647387113, + "grad_norm": 6.21875, + "learning_rate": 4.940439583681288e-06, + "loss": 1.47416096, + "memory(GiB)": 117.38, + "step": 41675, + "train_speed(iter/s)": 1.638034 + }, + { + "acc": 0.65645337, + "epoch": 1.0573313039066463, + "grad_norm": 5.71875, + "learning_rate": 4.939391035720739e-06, + "loss": 1.54929352, + "memory(GiB)": 117.38, + "step": 41680, + "train_speed(iter/s)": 1.638053 + }, + { + "acc": 0.67585878, + "epoch": 1.0574581430745815, + "grad_norm": 5.78125, + "learning_rate": 4.938342490426041e-06, + "loss": 1.5336607, + "memory(GiB)": 117.38, + "step": 41685, + "train_speed(iter/s)": 1.638071 + }, + { + "acc": 0.65449138, + "epoch": 1.0575849822425165, + "grad_norm": 6.46875, + "learning_rate": 4.937293947843318e-06, + "loss": 1.60138302, + "memory(GiB)": 117.38, + "step": 41690, + "train_speed(iter/s)": 1.638091 + }, + { + "acc": 0.663904, + "epoch": 1.0577118214104515, + "grad_norm": 4.71875, + "learning_rate": 4.936245408018687e-06, + "loss": 1.5568924, + "memory(GiB)": 117.38, + "step": 41695, + "train_speed(iter/s)": 1.63811 + }, + { + "acc": 0.65379052, + "epoch": 1.0578386605783865, + "grad_norm": 5.84375, + "learning_rate": 4.935196870998265e-06, + "loss": 1.64195786, + "memory(GiB)": 117.38, + "step": 41700, + "train_speed(iter/s)": 1.638129 + }, + { + "acc": 0.654774, + "epoch": 1.0579654997463217, + "grad_norm": 15.6875, + "learning_rate": 4.934148336828176e-06, + "loss": 1.6594841, + "memory(GiB)": 117.38, + "step": 41705, + "train_speed(iter/s)": 1.638149 + }, + { + "acc": 0.63969841, + "epoch": 1.0580923389142567, + "grad_norm": 6.8125, + "learning_rate": 4.933099805554538e-06, + "loss": 1.7236372, + "memory(GiB)": 117.38, + "step": 41710, + "train_speed(iter/s)": 1.63817 + }, + { + "acc": 0.65071011, + "epoch": 1.0582191780821917, + "grad_norm": 5.4375, + "learning_rate": 4.932051277223468e-06, + "loss": 1.65656586, + "memory(GiB)": 117.38, + "step": 41715, + "train_speed(iter/s)": 1.638188 + }, + { + "acc": 0.66019106, + "epoch": 1.058346017250127, + "grad_norm": 6.3125, + "learning_rate": 4.931002751881086e-06, + "loss": 1.60512905, + "memory(GiB)": 117.38, + "step": 41720, + "train_speed(iter/s)": 1.638206 + }, + { + "acc": 0.65448594, + "epoch": 1.0584728564180619, + "grad_norm": 9.125, + "learning_rate": 4.929954229573512e-06, + "loss": 1.60945301, + "memory(GiB)": 117.38, + "step": 41725, + "train_speed(iter/s)": 1.638227 + }, + { + "acc": 0.65172033, + "epoch": 1.0585996955859969, + "grad_norm": 5.28125, + "learning_rate": 4.9289057103468635e-06, + "loss": 1.57258883, + "memory(GiB)": 117.38, + "step": 41730, + "train_speed(iter/s)": 1.638246 + }, + { + "acc": 0.65867891, + "epoch": 1.058726534753932, + "grad_norm": 5.125, + "learning_rate": 4.927857194247258e-06, + "loss": 1.56267366, + "memory(GiB)": 117.38, + "step": 41735, + "train_speed(iter/s)": 1.638265 + }, + { + "acc": 0.63669672, + "epoch": 1.058853373921867, + "grad_norm": 5.34375, + "learning_rate": 4.926808681320816e-06, + "loss": 1.64934692, + "memory(GiB)": 117.38, + "step": 41740, + "train_speed(iter/s)": 1.638286 + }, + { + "acc": 0.63982091, + "epoch": 1.058980213089802, + "grad_norm": 7.4375, + "learning_rate": 4.925760171613654e-06, + "loss": 1.65690918, + "memory(GiB)": 117.38, + "step": 41745, + "train_speed(iter/s)": 1.638307 + }, + { + "acc": 0.64528608, + "epoch": 1.0591070522577373, + "grad_norm": 5.90625, + "learning_rate": 4.9247116651718925e-06, + "loss": 1.64378071, + "memory(GiB)": 117.38, + "step": 41750, + "train_speed(iter/s)": 1.638325 + }, + { + "acc": 0.65477071, + "epoch": 1.0592338914256723, + "grad_norm": 7.9375, + "learning_rate": 4.9236631620416486e-06, + "loss": 1.66486778, + "memory(GiB)": 117.38, + "step": 41755, + "train_speed(iter/s)": 1.638344 + }, + { + "acc": 0.65830355, + "epoch": 1.0593607305936072, + "grad_norm": 5.09375, + "learning_rate": 4.922614662269038e-06, + "loss": 1.61162148, + "memory(GiB)": 117.38, + "step": 41760, + "train_speed(iter/s)": 1.638363 + }, + { + "acc": 0.66571188, + "epoch": 1.0594875697615425, + "grad_norm": 5.0625, + "learning_rate": 4.9215661659001805e-06, + "loss": 1.54778767, + "memory(GiB)": 117.38, + "step": 41765, + "train_speed(iter/s)": 1.638382 + }, + { + "acc": 0.68263235, + "epoch": 1.0596144089294774, + "grad_norm": 6.28125, + "learning_rate": 4.920517672981195e-06, + "loss": 1.49134645, + "memory(GiB)": 117.38, + "step": 41770, + "train_speed(iter/s)": 1.638401 + }, + { + "acc": 0.65683346, + "epoch": 1.0597412480974124, + "grad_norm": 5.65625, + "learning_rate": 4.919469183558195e-06, + "loss": 1.60307827, + "memory(GiB)": 117.38, + "step": 41775, + "train_speed(iter/s)": 1.63842 + }, + { + "acc": 0.66316805, + "epoch": 1.0598680872653476, + "grad_norm": 5.28125, + "learning_rate": 4.9184206976773e-06, + "loss": 1.57670088, + "memory(GiB)": 117.38, + "step": 41780, + "train_speed(iter/s)": 1.638439 + }, + { + "acc": 0.65350456, + "epoch": 1.0599949264332826, + "grad_norm": 6.5, + "learning_rate": 4.917372215384627e-06, + "loss": 1.66546021, + "memory(GiB)": 117.38, + "step": 41785, + "train_speed(iter/s)": 1.638458 + }, + { + "acc": 0.65337143, + "epoch": 1.0601217656012176, + "grad_norm": 6.28125, + "learning_rate": 4.916323736726295e-06, + "loss": 1.57240086, + "memory(GiB)": 117.38, + "step": 41790, + "train_speed(iter/s)": 1.638478 + }, + { + "acc": 0.65314188, + "epoch": 1.0602486047691526, + "grad_norm": 5.40625, + "learning_rate": 4.9152752617484156e-06, + "loss": 1.60025139, + "memory(GiB)": 117.38, + "step": 41795, + "train_speed(iter/s)": 1.638498 + }, + { + "acc": 0.64789534, + "epoch": 1.0603754439370878, + "grad_norm": 5.6875, + "learning_rate": 4.91422679049711e-06, + "loss": 1.62481232, + "memory(GiB)": 117.38, + "step": 41800, + "train_speed(iter/s)": 1.638517 + }, + { + "acc": 0.66380148, + "epoch": 1.0605022831050228, + "grad_norm": 5.59375, + "learning_rate": 4.913178323018493e-06, + "loss": 1.54954538, + "memory(GiB)": 117.38, + "step": 41805, + "train_speed(iter/s)": 1.638536 + }, + { + "acc": 0.65828438, + "epoch": 1.0606291222729578, + "grad_norm": 7.71875, + "learning_rate": 4.912129859358682e-06, + "loss": 1.6011261, + "memory(GiB)": 117.38, + "step": 41810, + "train_speed(iter/s)": 1.638555 + }, + { + "acc": 0.66674891, + "epoch": 1.060755961440893, + "grad_norm": 6.25, + "learning_rate": 4.9110813995637905e-06, + "loss": 1.55645895, + "memory(GiB)": 117.38, + "step": 41815, + "train_speed(iter/s)": 1.638574 + }, + { + "acc": 0.65907483, + "epoch": 1.060882800608828, + "grad_norm": 5.09375, + "learning_rate": 4.910032943679936e-06, + "loss": 1.56672668, + "memory(GiB)": 117.38, + "step": 41820, + "train_speed(iter/s)": 1.638593 + }, + { + "acc": 0.66251602, + "epoch": 1.061009639776763, + "grad_norm": 6.125, + "learning_rate": 4.908984491753234e-06, + "loss": 1.56733284, + "memory(GiB)": 117.38, + "step": 41825, + "train_speed(iter/s)": 1.638613 + }, + { + "acc": 0.6519701, + "epoch": 1.0611364789446982, + "grad_norm": 6.125, + "learning_rate": 4.907936043829802e-06, + "loss": 1.61102676, + "memory(GiB)": 117.38, + "step": 41830, + "train_speed(iter/s)": 1.638632 + }, + { + "acc": 0.65537128, + "epoch": 1.0612633181126332, + "grad_norm": 5.40625, + "learning_rate": 4.906887599955754e-06, + "loss": 1.59698248, + "memory(GiB)": 117.38, + "step": 41835, + "train_speed(iter/s)": 1.638649 + }, + { + "acc": 0.66442413, + "epoch": 1.0613901572805682, + "grad_norm": 6.0625, + "learning_rate": 4.905839160177203e-06, + "loss": 1.52137699, + "memory(GiB)": 117.38, + "step": 41840, + "train_speed(iter/s)": 1.638669 + }, + { + "acc": 0.65538521, + "epoch": 1.0615169964485034, + "grad_norm": 6.34375, + "learning_rate": 4.904790724540267e-06, + "loss": 1.57740974, + "memory(GiB)": 117.38, + "step": 41845, + "train_speed(iter/s)": 1.638687 + }, + { + "acc": 0.66363306, + "epoch": 1.0616438356164384, + "grad_norm": 5.9375, + "learning_rate": 4.903742293091061e-06, + "loss": 1.56996784, + "memory(GiB)": 117.38, + "step": 41850, + "train_speed(iter/s)": 1.638707 + }, + { + "acc": 0.65466814, + "epoch": 1.0617706747843734, + "grad_norm": 5.4375, + "learning_rate": 4.902693865875698e-06, + "loss": 1.63040752, + "memory(GiB)": 117.38, + "step": 41855, + "train_speed(iter/s)": 1.638727 + }, + { + "acc": 0.65475426, + "epoch": 1.0618975139523084, + "grad_norm": 6.46875, + "learning_rate": 4.901645442940293e-06, + "loss": 1.60180779, + "memory(GiB)": 117.38, + "step": 41860, + "train_speed(iter/s)": 1.638746 + }, + { + "acc": 0.66409035, + "epoch": 1.0620243531202436, + "grad_norm": 5.59375, + "learning_rate": 4.900597024330961e-06, + "loss": 1.58837566, + "memory(GiB)": 117.38, + "step": 41865, + "train_speed(iter/s)": 1.638764 + }, + { + "acc": 0.65501347, + "epoch": 1.0621511922881786, + "grad_norm": 6.5625, + "learning_rate": 4.899548610093816e-06, + "loss": 1.63757057, + "memory(GiB)": 117.38, + "step": 41870, + "train_speed(iter/s)": 1.638784 + }, + { + "acc": 0.66580958, + "epoch": 1.0622780314561135, + "grad_norm": 8.5, + "learning_rate": 4.89850020027497e-06, + "loss": 1.56432114, + "memory(GiB)": 117.38, + "step": 41875, + "train_speed(iter/s)": 1.638803 + }, + { + "acc": 0.66607304, + "epoch": 1.0624048706240488, + "grad_norm": 5.8125, + "learning_rate": 4.89745179492054e-06, + "loss": 1.56290388, + "memory(GiB)": 117.38, + "step": 41880, + "train_speed(iter/s)": 1.63882 + }, + { + "acc": 0.65710521, + "epoch": 1.0625317097919837, + "grad_norm": 6.625, + "learning_rate": 4.896403394076636e-06, + "loss": 1.61378155, + "memory(GiB)": 117.38, + "step": 41885, + "train_speed(iter/s)": 1.638839 + }, + { + "acc": 0.65485053, + "epoch": 1.0626585489599187, + "grad_norm": 9.0625, + "learning_rate": 4.895354997789377e-06, + "loss": 1.65956841, + "memory(GiB)": 117.38, + "step": 41890, + "train_speed(iter/s)": 1.638858 + }, + { + "acc": 0.66044173, + "epoch": 1.062785388127854, + "grad_norm": 5.5, + "learning_rate": 4.894306606104869e-06, + "loss": 1.59045801, + "memory(GiB)": 117.38, + "step": 41895, + "train_speed(iter/s)": 1.638877 + }, + { + "acc": 0.66803007, + "epoch": 1.062912227295789, + "grad_norm": 5.0, + "learning_rate": 4.893258219069229e-06, + "loss": 1.57747107, + "memory(GiB)": 117.38, + "step": 41900, + "train_speed(iter/s)": 1.638895 + }, + { + "acc": 0.66822958, + "epoch": 1.063039066463724, + "grad_norm": 5.6875, + "learning_rate": 4.892209836728569e-06, + "loss": 1.5444109, + "memory(GiB)": 117.38, + "step": 41905, + "train_speed(iter/s)": 1.638913 + }, + { + "acc": 0.65718756, + "epoch": 1.0631659056316591, + "grad_norm": 5.34375, + "learning_rate": 4.891161459129003e-06, + "loss": 1.61944962, + "memory(GiB)": 117.38, + "step": 41910, + "train_speed(iter/s)": 1.638932 + }, + { + "acc": 0.66516099, + "epoch": 1.0632927447995941, + "grad_norm": 6.78125, + "learning_rate": 4.890113086316641e-06, + "loss": 1.58834476, + "memory(GiB)": 117.38, + "step": 41915, + "train_speed(iter/s)": 1.63895 + }, + { + "acc": 0.65930672, + "epoch": 1.0634195839675291, + "grad_norm": 5.03125, + "learning_rate": 4.889064718337595e-06, + "loss": 1.58916779, + "memory(GiB)": 117.38, + "step": 41920, + "train_speed(iter/s)": 1.638971 + }, + { + "acc": 0.66039019, + "epoch": 1.0635464231354643, + "grad_norm": 4.78125, + "learning_rate": 4.888016355237979e-06, + "loss": 1.58848038, + "memory(GiB)": 117.38, + "step": 41925, + "train_speed(iter/s)": 1.63899 + }, + { + "acc": 0.64981003, + "epoch": 1.0636732623033993, + "grad_norm": 12.3125, + "learning_rate": 4.886967997063905e-06, + "loss": 1.58340187, + "memory(GiB)": 117.38, + "step": 41930, + "train_speed(iter/s)": 1.63901 + }, + { + "acc": 0.65391531, + "epoch": 1.0638001014713343, + "grad_norm": 6.34375, + "learning_rate": 4.885919643861482e-06, + "loss": 1.52405014, + "memory(GiB)": 117.38, + "step": 41935, + "train_speed(iter/s)": 1.639031 + }, + { + "acc": 0.67146859, + "epoch": 1.0639269406392695, + "grad_norm": 7.78125, + "learning_rate": 4.884871295676821e-06, + "loss": 1.48230019, + "memory(GiB)": 117.38, + "step": 41940, + "train_speed(iter/s)": 1.639049 + }, + { + "acc": 0.66181154, + "epoch": 1.0640537798072045, + "grad_norm": 6.1875, + "learning_rate": 4.883822952556036e-06, + "loss": 1.57460527, + "memory(GiB)": 117.38, + "step": 41945, + "train_speed(iter/s)": 1.639069 + }, + { + "acc": 0.65440249, + "epoch": 1.0641806189751395, + "grad_norm": 6.0625, + "learning_rate": 4.882774614545237e-06, + "loss": 1.54522963, + "memory(GiB)": 117.38, + "step": 41950, + "train_speed(iter/s)": 1.639088 + }, + { + "acc": 0.65802555, + "epoch": 1.0643074581430745, + "grad_norm": 6.28125, + "learning_rate": 4.881726281690531e-06, + "loss": 1.620014, + "memory(GiB)": 117.38, + "step": 41955, + "train_speed(iter/s)": 1.639107 + }, + { + "acc": 0.64371748, + "epoch": 1.0644342973110097, + "grad_norm": 6.84375, + "learning_rate": 4.8806779540380335e-06, + "loss": 1.65082741, + "memory(GiB)": 117.38, + "step": 41960, + "train_speed(iter/s)": 1.639128 + }, + { + "acc": 0.67126255, + "epoch": 1.0645611364789447, + "grad_norm": 4.875, + "learning_rate": 4.879629631633851e-06, + "loss": 1.58161755, + "memory(GiB)": 117.38, + "step": 41965, + "train_speed(iter/s)": 1.639148 + }, + { + "acc": 0.64680166, + "epoch": 1.0646879756468797, + "grad_norm": 7.4375, + "learning_rate": 4.8785813145240965e-06, + "loss": 1.61356754, + "memory(GiB)": 117.38, + "step": 41970, + "train_speed(iter/s)": 1.639168 + }, + { + "acc": 0.65687032, + "epoch": 1.0648148148148149, + "grad_norm": 5.625, + "learning_rate": 4.877533002754877e-06, + "loss": 1.58165016, + "memory(GiB)": 117.38, + "step": 41975, + "train_speed(iter/s)": 1.639189 + }, + { + "acc": 0.63934741, + "epoch": 1.0649416539827499, + "grad_norm": 5.03125, + "learning_rate": 4.8764846963723025e-06, + "loss": 1.66624241, + "memory(GiB)": 117.38, + "step": 41980, + "train_speed(iter/s)": 1.63921 + }, + { + "acc": 0.65329185, + "epoch": 1.0650684931506849, + "grad_norm": 6.8125, + "learning_rate": 4.875436395422481e-06, + "loss": 1.60341434, + "memory(GiB)": 117.38, + "step": 41985, + "train_speed(iter/s)": 1.639229 + }, + { + "acc": 0.65343442, + "epoch": 1.06519533231862, + "grad_norm": 5.375, + "learning_rate": 4.874388099951527e-06, + "loss": 1.57683716, + "memory(GiB)": 117.38, + "step": 41990, + "train_speed(iter/s)": 1.639249 + }, + { + "acc": 0.68604231, + "epoch": 1.065322171486555, + "grad_norm": 5.09375, + "learning_rate": 4.873339810005543e-06, + "loss": 1.5390049, + "memory(GiB)": 117.38, + "step": 41995, + "train_speed(iter/s)": 1.639268 + }, + { + "acc": 0.64881458, + "epoch": 1.06544901065449, + "grad_norm": 6.25, + "learning_rate": 4.872291525630638e-06, + "loss": 1.67170715, + "memory(GiB)": 117.38, + "step": 42000, + "train_speed(iter/s)": 1.639288 + }, + { + "epoch": 1.06544901065449, + "eval_acc": 0.6461908252359264, + "eval_loss": 1.5735667943954468, + "eval_runtime": 58.6735, + "eval_samples_per_second": 108.567, + "eval_steps_per_second": 27.15, + "step": 42000 + }, + { + "acc": 0.66267004, + "epoch": 1.0655758498224253, + "grad_norm": 6.15625, + "learning_rate": 4.871243246872923e-06, + "loss": 1.5742548, + "memory(GiB)": 117.38, + "step": 42005, + "train_speed(iter/s)": 1.635292 + }, + { + "acc": 0.65942197, + "epoch": 1.0657026889903602, + "grad_norm": 5.75, + "learning_rate": 4.870194973778506e-06, + "loss": 1.58884411, + "memory(GiB)": 117.38, + "step": 42010, + "train_speed(iter/s)": 1.635312 + }, + { + "acc": 0.64643865, + "epoch": 1.0658295281582952, + "grad_norm": 5.59375, + "learning_rate": 4.869146706393493e-06, + "loss": 1.65027313, + "memory(GiB)": 117.38, + "step": 42015, + "train_speed(iter/s)": 1.635334 + }, + { + "acc": 0.65906415, + "epoch": 1.0659563673262302, + "grad_norm": 6.25, + "learning_rate": 4.868098444763991e-06, + "loss": 1.54234753, + "memory(GiB)": 117.38, + "step": 42020, + "train_speed(iter/s)": 1.635351 + }, + { + "acc": 0.65209513, + "epoch": 1.0660832064941654, + "grad_norm": 8.375, + "learning_rate": 4.86705018893611e-06, + "loss": 1.59075565, + "memory(GiB)": 117.38, + "step": 42025, + "train_speed(iter/s)": 1.635374 + }, + { + "acc": 0.65900273, + "epoch": 1.0662100456621004, + "grad_norm": 5.09375, + "learning_rate": 4.866001938955955e-06, + "loss": 1.57277384, + "memory(GiB)": 117.38, + "step": 42030, + "train_speed(iter/s)": 1.635395 + }, + { + "acc": 0.6565896, + "epoch": 1.0663368848300354, + "grad_norm": 4.8125, + "learning_rate": 4.864953694869632e-06, + "loss": 1.57699404, + "memory(GiB)": 117.38, + "step": 42035, + "train_speed(iter/s)": 1.635417 + }, + { + "acc": 0.66661692, + "epoch": 1.0664637239979706, + "grad_norm": 8.5625, + "learning_rate": 4.863905456723249e-06, + "loss": 1.61874542, + "memory(GiB)": 117.38, + "step": 42040, + "train_speed(iter/s)": 1.635439 + }, + { + "acc": 0.6373539, + "epoch": 1.0665905631659056, + "grad_norm": 5.84375, + "learning_rate": 4.8628572245629105e-06, + "loss": 1.68152905, + "memory(GiB)": 117.38, + "step": 42045, + "train_speed(iter/s)": 1.63546 + }, + { + "acc": 0.64584494, + "epoch": 1.0667174023338406, + "grad_norm": 6.46875, + "learning_rate": 4.861808998434726e-06, + "loss": 1.60825424, + "memory(GiB)": 117.38, + "step": 42050, + "train_speed(iter/s)": 1.635481 + }, + { + "acc": 0.66562757, + "epoch": 1.0668442415017758, + "grad_norm": 6.8125, + "learning_rate": 4.860760778384797e-06, + "loss": 1.51508074, + "memory(GiB)": 117.38, + "step": 42055, + "train_speed(iter/s)": 1.635504 + }, + { + "acc": 0.65530353, + "epoch": 1.0669710806697108, + "grad_norm": 5.4375, + "learning_rate": 4.85971256445923e-06, + "loss": 1.54585171, + "memory(GiB)": 117.38, + "step": 42060, + "train_speed(iter/s)": 1.635524 + }, + { + "acc": 0.65522218, + "epoch": 1.0670979198376458, + "grad_norm": 6.65625, + "learning_rate": 4.858664356704131e-06, + "loss": 1.62526665, + "memory(GiB)": 117.38, + "step": 42065, + "train_speed(iter/s)": 1.635545 + }, + { + "acc": 0.68565059, + "epoch": 1.067224759005581, + "grad_norm": 6.71875, + "learning_rate": 4.857616155165606e-06, + "loss": 1.43904161, + "memory(GiB)": 117.38, + "step": 42070, + "train_speed(iter/s)": 1.635567 + }, + { + "acc": 0.631318, + "epoch": 1.067351598173516, + "grad_norm": 5.03125, + "learning_rate": 4.856567959889758e-06, + "loss": 1.66722412, + "memory(GiB)": 117.38, + "step": 42075, + "train_speed(iter/s)": 1.635589 + }, + { + "acc": 0.65941119, + "epoch": 1.067478437341451, + "grad_norm": 5.90625, + "learning_rate": 4.855519770922691e-06, + "loss": 1.62293243, + "memory(GiB)": 117.38, + "step": 42080, + "train_speed(iter/s)": 1.63561 + }, + { + "acc": 0.64963002, + "epoch": 1.0676052765093862, + "grad_norm": 6.625, + "learning_rate": 4.8544715883105084e-06, + "loss": 1.59394016, + "memory(GiB)": 117.38, + "step": 42085, + "train_speed(iter/s)": 1.635632 + }, + { + "acc": 0.65316563, + "epoch": 1.0677321156773212, + "grad_norm": 6.3125, + "learning_rate": 4.853423412099318e-06, + "loss": 1.61296577, + "memory(GiB)": 117.38, + "step": 42090, + "train_speed(iter/s)": 1.635654 + }, + { + "acc": 0.64957895, + "epoch": 1.0678589548452562, + "grad_norm": 5.71875, + "learning_rate": 4.852375242335217e-06, + "loss": 1.64058018, + "memory(GiB)": 117.38, + "step": 42095, + "train_speed(iter/s)": 1.635676 + }, + { + "acc": 0.65379176, + "epoch": 1.0679857940131914, + "grad_norm": 5.28125, + "learning_rate": 4.851327079064314e-06, + "loss": 1.59891739, + "memory(GiB)": 117.38, + "step": 42100, + "train_speed(iter/s)": 1.635696 + }, + { + "acc": 0.6463707, + "epoch": 1.0681126331811264, + "grad_norm": 6.03125, + "learning_rate": 4.850278922332708e-06, + "loss": 1.70048218, + "memory(GiB)": 117.38, + "step": 42105, + "train_speed(iter/s)": 1.635718 + }, + { + "acc": 0.65905781, + "epoch": 1.0682394723490614, + "grad_norm": 6.46875, + "learning_rate": 4.849230772186508e-06, + "loss": 1.61065731, + "memory(GiB)": 117.38, + "step": 42110, + "train_speed(iter/s)": 1.635738 + }, + { + "acc": 0.66508641, + "epoch": 1.0683663115169963, + "grad_norm": 5.25, + "learning_rate": 4.848182628671806e-06, + "loss": 1.54283333, + "memory(GiB)": 117.38, + "step": 42115, + "train_speed(iter/s)": 1.635759 + }, + { + "acc": 0.67158318, + "epoch": 1.0684931506849316, + "grad_norm": 5.03125, + "learning_rate": 4.847134491834713e-06, + "loss": 1.54667645, + "memory(GiB)": 117.38, + "step": 42120, + "train_speed(iter/s)": 1.63578 + }, + { + "acc": 0.66108823, + "epoch": 1.0686199898528665, + "grad_norm": 5.1875, + "learning_rate": 4.846086361721326e-06, + "loss": 1.54157867, + "memory(GiB)": 117.38, + "step": 42125, + "train_speed(iter/s)": 1.635803 + }, + { + "acc": 0.64167342, + "epoch": 1.0687468290208015, + "grad_norm": 6.46875, + "learning_rate": 4.84503823837775e-06, + "loss": 1.61747932, + "memory(GiB)": 117.38, + "step": 42130, + "train_speed(iter/s)": 1.635826 + }, + { + "acc": 0.6583787, + "epoch": 1.0688736681887367, + "grad_norm": 5.15625, + "learning_rate": 4.843990121850083e-06, + "loss": 1.55810575, + "memory(GiB)": 117.38, + "step": 42135, + "train_speed(iter/s)": 1.635848 + }, + { + "acc": 0.64197702, + "epoch": 1.0690005073566717, + "grad_norm": 5.875, + "learning_rate": 4.842942012184426e-06, + "loss": 1.62249298, + "memory(GiB)": 117.38, + "step": 42140, + "train_speed(iter/s)": 1.63587 + }, + { + "acc": 0.64114494, + "epoch": 1.0691273465246067, + "grad_norm": 5.0, + "learning_rate": 4.841893909426881e-06, + "loss": 1.64206657, + "memory(GiB)": 117.38, + "step": 42145, + "train_speed(iter/s)": 1.635891 + }, + { + "acc": 0.65284929, + "epoch": 1.069254185692542, + "grad_norm": 7.625, + "learning_rate": 4.84084581362355e-06, + "loss": 1.65577507, + "memory(GiB)": 117.38, + "step": 42150, + "train_speed(iter/s)": 1.635912 + }, + { + "acc": 0.65538263, + "epoch": 1.069381024860477, + "grad_norm": 5.375, + "learning_rate": 4.839797724820529e-06, + "loss": 1.61465034, + "memory(GiB)": 117.38, + "step": 42155, + "train_speed(iter/s)": 1.635932 + }, + { + "acc": 0.6463922, + "epoch": 1.069507864028412, + "grad_norm": 6.84375, + "learning_rate": 4.838749643063918e-06, + "loss": 1.62764587, + "memory(GiB)": 117.38, + "step": 42160, + "train_speed(iter/s)": 1.635954 + }, + { + "acc": 0.65110922, + "epoch": 1.0696347031963471, + "grad_norm": 5.46875, + "learning_rate": 4.837701568399819e-06, + "loss": 1.62532349, + "memory(GiB)": 117.38, + "step": 42165, + "train_speed(iter/s)": 1.635975 + }, + { + "acc": 0.64151073, + "epoch": 1.0697615423642821, + "grad_norm": 4.6875, + "learning_rate": 4.836653500874331e-06, + "loss": 1.6430233, + "memory(GiB)": 117.38, + "step": 42170, + "train_speed(iter/s)": 1.635997 + }, + { + "acc": 0.6517695, + "epoch": 1.069888381532217, + "grad_norm": 6.25, + "learning_rate": 4.835605440533549e-06, + "loss": 1.64731331, + "memory(GiB)": 117.38, + "step": 42175, + "train_speed(iter/s)": 1.636018 + }, + { + "acc": 0.66772804, + "epoch": 1.070015220700152, + "grad_norm": 6.625, + "learning_rate": 4.834557387423575e-06, + "loss": 1.50288582, + "memory(GiB)": 117.38, + "step": 42180, + "train_speed(iter/s)": 1.63604 + }, + { + "acc": 0.66155567, + "epoch": 1.0701420598680873, + "grad_norm": 5.1875, + "learning_rate": 4.833509341590503e-06, + "loss": 1.61410942, + "memory(GiB)": 117.38, + "step": 42185, + "train_speed(iter/s)": 1.636061 + }, + { + "acc": 0.66356831, + "epoch": 1.0702688990360223, + "grad_norm": 5.59375, + "learning_rate": 4.8324613030804374e-06, + "loss": 1.53381405, + "memory(GiB)": 117.38, + "step": 42190, + "train_speed(iter/s)": 1.636081 + }, + { + "acc": 0.64900713, + "epoch": 1.0703957382039573, + "grad_norm": 6.03125, + "learning_rate": 4.83141327193947e-06, + "loss": 1.64698257, + "memory(GiB)": 117.38, + "step": 42195, + "train_speed(iter/s)": 1.636102 + }, + { + "acc": 0.66689458, + "epoch": 1.0705225773718925, + "grad_norm": 5.71875, + "learning_rate": 4.8303652482137e-06, + "loss": 1.55340824, + "memory(GiB)": 117.38, + "step": 42200, + "train_speed(iter/s)": 1.636123 + }, + { + "acc": 0.63156323, + "epoch": 1.0706494165398275, + "grad_norm": 6.90625, + "learning_rate": 4.829317231949222e-06, + "loss": 1.68912773, + "memory(GiB)": 117.38, + "step": 42205, + "train_speed(iter/s)": 1.636145 + }, + { + "acc": 0.66935859, + "epoch": 1.0707762557077625, + "grad_norm": 5.875, + "learning_rate": 4.828269223192137e-06, + "loss": 1.58948174, + "memory(GiB)": 117.38, + "step": 42210, + "train_speed(iter/s)": 1.636165 + }, + { + "acc": 0.65661216, + "epoch": 1.0709030948756977, + "grad_norm": 6.0, + "learning_rate": 4.827221221988537e-06, + "loss": 1.56759233, + "memory(GiB)": 117.38, + "step": 42215, + "train_speed(iter/s)": 1.636185 + }, + { + "acc": 0.6571384, + "epoch": 1.0710299340436327, + "grad_norm": 5.28125, + "learning_rate": 4.826173228384518e-06, + "loss": 1.56036091, + "memory(GiB)": 117.38, + "step": 42220, + "train_speed(iter/s)": 1.636207 + }, + { + "acc": 0.66064982, + "epoch": 1.0711567732115677, + "grad_norm": 5.3125, + "learning_rate": 4.8251252424261775e-06, + "loss": 1.61549644, + "memory(GiB)": 117.38, + "step": 42225, + "train_speed(iter/s)": 1.636228 + }, + { + "acc": 0.67536812, + "epoch": 1.0712836123795029, + "grad_norm": 4.9375, + "learning_rate": 4.8240772641596105e-06, + "loss": 1.49482794, + "memory(GiB)": 117.38, + "step": 42230, + "train_speed(iter/s)": 1.636248 + }, + { + "acc": 0.66572533, + "epoch": 1.0714104515474379, + "grad_norm": 5.625, + "learning_rate": 4.82302929363091e-06, + "loss": 1.60782547, + "memory(GiB)": 117.38, + "step": 42235, + "train_speed(iter/s)": 1.63627 + }, + { + "acc": 0.65329785, + "epoch": 1.0715372907153728, + "grad_norm": 6.0, + "learning_rate": 4.8219813308861705e-06, + "loss": 1.6175312, + "memory(GiB)": 117.38, + "step": 42240, + "train_speed(iter/s)": 1.63629 + }, + { + "acc": 0.6694169, + "epoch": 1.071664129883308, + "grad_norm": 5.6875, + "learning_rate": 4.820933375971487e-06, + "loss": 1.53876801, + "memory(GiB)": 117.38, + "step": 42245, + "train_speed(iter/s)": 1.636311 + }, + { + "acc": 0.64812908, + "epoch": 1.071790969051243, + "grad_norm": 6.90625, + "learning_rate": 4.819885428932955e-06, + "loss": 1.61719284, + "memory(GiB)": 117.38, + "step": 42250, + "train_speed(iter/s)": 1.636333 + }, + { + "acc": 0.6650456, + "epoch": 1.071917808219178, + "grad_norm": 5.46875, + "learning_rate": 4.818837489816664e-06, + "loss": 1.55047035, + "memory(GiB)": 117.38, + "step": 42255, + "train_speed(iter/s)": 1.636354 + }, + { + "acc": 0.64859395, + "epoch": 1.0720446473871132, + "grad_norm": 5.25, + "learning_rate": 4.81778955866871e-06, + "loss": 1.64661102, + "memory(GiB)": 117.38, + "step": 42260, + "train_speed(iter/s)": 1.636375 + }, + { + "acc": 0.67052026, + "epoch": 1.0721714865550482, + "grad_norm": 5.40625, + "learning_rate": 4.816741635535183e-06, + "loss": 1.56664429, + "memory(GiB)": 117.38, + "step": 42265, + "train_speed(iter/s)": 1.636396 + }, + { + "acc": 0.6433629, + "epoch": 1.0722983257229832, + "grad_norm": 5.6875, + "learning_rate": 4.81569372046218e-06, + "loss": 1.68916283, + "memory(GiB)": 117.38, + "step": 42270, + "train_speed(iter/s)": 1.636416 + }, + { + "acc": 0.64638591, + "epoch": 1.0724251648909182, + "grad_norm": 6.375, + "learning_rate": 4.814645813495788e-06, + "loss": 1.65827789, + "memory(GiB)": 117.38, + "step": 42275, + "train_speed(iter/s)": 1.636436 + }, + { + "acc": 0.65866117, + "epoch": 1.0725520040588534, + "grad_norm": 4.5625, + "learning_rate": 4.8135979146821e-06, + "loss": 1.62065964, + "memory(GiB)": 117.38, + "step": 42280, + "train_speed(iter/s)": 1.636457 + }, + { + "acc": 0.64541955, + "epoch": 1.0726788432267884, + "grad_norm": 5.96875, + "learning_rate": 4.81255002406721e-06, + "loss": 1.61616898, + "memory(GiB)": 117.38, + "step": 42285, + "train_speed(iter/s)": 1.636478 + }, + { + "acc": 0.67276602, + "epoch": 1.0728056823947234, + "grad_norm": 6.84375, + "learning_rate": 4.811502141697206e-06, + "loss": 1.51470623, + "memory(GiB)": 117.38, + "step": 42290, + "train_speed(iter/s)": 1.636499 + }, + { + "acc": 0.65773182, + "epoch": 1.0729325215626586, + "grad_norm": 4.8125, + "learning_rate": 4.81045426761818e-06, + "loss": 1.58835955, + "memory(GiB)": 117.38, + "step": 42295, + "train_speed(iter/s)": 1.63652 + }, + { + "acc": 0.6545176, + "epoch": 1.0730593607305936, + "grad_norm": 4.8125, + "learning_rate": 4.80940640187622e-06, + "loss": 1.60705605, + "memory(GiB)": 117.38, + "step": 42300, + "train_speed(iter/s)": 1.63654 + }, + { + "acc": 0.66162596, + "epoch": 1.0731861998985286, + "grad_norm": 6.0, + "learning_rate": 4.808358544517418e-06, + "loss": 1.52777958, + "memory(GiB)": 117.38, + "step": 42305, + "train_speed(iter/s)": 1.636561 + }, + { + "acc": 0.66089416, + "epoch": 1.0733130390664638, + "grad_norm": 6.21875, + "learning_rate": 4.807310695587865e-06, + "loss": 1.52624264, + "memory(GiB)": 117.38, + "step": 42310, + "train_speed(iter/s)": 1.636582 + }, + { + "acc": 0.64437509, + "epoch": 1.0734398782343988, + "grad_norm": 5.46875, + "learning_rate": 4.8062628551336445e-06, + "loss": 1.6733593, + "memory(GiB)": 117.38, + "step": 42315, + "train_speed(iter/s)": 1.636602 + }, + { + "acc": 0.65218344, + "epoch": 1.0735667174023338, + "grad_norm": 5.5625, + "learning_rate": 4.80521502320085e-06, + "loss": 1.6339716, + "memory(GiB)": 117.38, + "step": 42320, + "train_speed(iter/s)": 1.636622 + }, + { + "acc": 0.65528421, + "epoch": 1.073693556570269, + "grad_norm": 6.5625, + "learning_rate": 4.804167199835567e-06, + "loss": 1.59885778, + "memory(GiB)": 117.38, + "step": 42325, + "train_speed(iter/s)": 1.636642 + }, + { + "acc": 0.66165695, + "epoch": 1.073820395738204, + "grad_norm": 5.5625, + "learning_rate": 4.8031193850838894e-06, + "loss": 1.54507418, + "memory(GiB)": 117.38, + "step": 42330, + "train_speed(iter/s)": 1.636662 + }, + { + "acc": 0.6511714, + "epoch": 1.073947234906139, + "grad_norm": 5.34375, + "learning_rate": 4.802071578991896e-06, + "loss": 1.6204174, + "memory(GiB)": 117.38, + "step": 42335, + "train_speed(iter/s)": 1.636684 + }, + { + "acc": 0.65364795, + "epoch": 1.074074074074074, + "grad_norm": 5.09375, + "learning_rate": 4.801023781605679e-06, + "loss": 1.62786865, + "memory(GiB)": 117.38, + "step": 42340, + "train_speed(iter/s)": 1.636705 + }, + { + "acc": 0.65816154, + "epoch": 1.0742009132420092, + "grad_norm": 4.8125, + "learning_rate": 4.799975992971325e-06, + "loss": 1.59796553, + "memory(GiB)": 117.38, + "step": 42345, + "train_speed(iter/s)": 1.636725 + }, + { + "acc": 0.6649786, + "epoch": 1.0743277524099442, + "grad_norm": 5.46875, + "learning_rate": 4.798928213134921e-06, + "loss": 1.54111347, + "memory(GiB)": 117.38, + "step": 42350, + "train_speed(iter/s)": 1.636747 + }, + { + "acc": 0.66917233, + "epoch": 1.0744545915778791, + "grad_norm": 6.0, + "learning_rate": 4.797880442142551e-06, + "loss": 1.61523819, + "memory(GiB)": 117.38, + "step": 42355, + "train_speed(iter/s)": 1.636767 + }, + { + "acc": 0.65927925, + "epoch": 1.0745814307458144, + "grad_norm": 5.71875, + "learning_rate": 4.7968326800403e-06, + "loss": 1.57280722, + "memory(GiB)": 117.38, + "step": 42360, + "train_speed(iter/s)": 1.636788 + }, + { + "acc": 0.64824514, + "epoch": 1.0747082699137493, + "grad_norm": 7.15625, + "learning_rate": 4.795784926874255e-06, + "loss": 1.64902573, + "memory(GiB)": 117.38, + "step": 42365, + "train_speed(iter/s)": 1.63681 + }, + { + "acc": 0.66020918, + "epoch": 1.0748351090816843, + "grad_norm": 7.25, + "learning_rate": 4.794737182690503e-06, + "loss": 1.59338675, + "memory(GiB)": 117.38, + "step": 42370, + "train_speed(iter/s)": 1.636832 + }, + { + "acc": 0.63986688, + "epoch": 1.0749619482496195, + "grad_norm": 5.03125, + "learning_rate": 4.793689447535126e-06, + "loss": 1.58832169, + "memory(GiB)": 117.38, + "step": 42375, + "train_speed(iter/s)": 1.636852 + }, + { + "acc": 0.67833238, + "epoch": 1.0750887874175545, + "grad_norm": 6.34375, + "learning_rate": 4.792641721454206e-06, + "loss": 1.57167778, + "memory(GiB)": 117.38, + "step": 42380, + "train_speed(iter/s)": 1.636873 + }, + { + "acc": 0.66357117, + "epoch": 1.0752156265854895, + "grad_norm": 5.40625, + "learning_rate": 4.79159400449383e-06, + "loss": 1.56718082, + "memory(GiB)": 117.38, + "step": 42385, + "train_speed(iter/s)": 1.636895 + }, + { + "acc": 0.66061506, + "epoch": 1.0753424657534247, + "grad_norm": 5.59375, + "learning_rate": 4.7905462967000816e-06, + "loss": 1.62065353, + "memory(GiB)": 117.38, + "step": 42390, + "train_speed(iter/s)": 1.636915 + }, + { + "acc": 0.65036502, + "epoch": 1.0754693049213597, + "grad_norm": 6.21875, + "learning_rate": 4.789498598119039e-06, + "loss": 1.63716011, + "memory(GiB)": 117.38, + "step": 42395, + "train_speed(iter/s)": 1.636937 + }, + { + "acc": 0.65736675, + "epoch": 1.0755961440892947, + "grad_norm": 5.46875, + "learning_rate": 4.78845090879679e-06, + "loss": 1.58783159, + "memory(GiB)": 117.38, + "step": 42400, + "train_speed(iter/s)": 1.636958 + }, + { + "acc": 0.65636415, + "epoch": 1.07572298325723, + "grad_norm": 6.5, + "learning_rate": 4.787403228779413e-06, + "loss": 1.58273706, + "memory(GiB)": 117.38, + "step": 42405, + "train_speed(iter/s)": 1.636979 + }, + { + "acc": 0.65250158, + "epoch": 1.075849822425165, + "grad_norm": 7.5, + "learning_rate": 4.786355558112994e-06, + "loss": 1.65883331, + "memory(GiB)": 117.38, + "step": 42410, + "train_speed(iter/s)": 1.637 + }, + { + "acc": 0.67009602, + "epoch": 1.0759766615931, + "grad_norm": 5.3125, + "learning_rate": 4.78530789684361e-06, + "loss": 1.54517412, + "memory(GiB)": 117.38, + "step": 42415, + "train_speed(iter/s)": 1.637022 + }, + { + "acc": 0.64447303, + "epoch": 1.0761035007610351, + "grad_norm": 4.4375, + "learning_rate": 4.784260245017343e-06, + "loss": 1.65884037, + "memory(GiB)": 117.38, + "step": 42420, + "train_speed(iter/s)": 1.637043 + }, + { + "acc": 0.66058936, + "epoch": 1.07623033992897, + "grad_norm": 5.46875, + "learning_rate": 4.7832126026802725e-06, + "loss": 1.60292816, + "memory(GiB)": 117.38, + "step": 42425, + "train_speed(iter/s)": 1.637064 + }, + { + "acc": 0.66112666, + "epoch": 1.076357179096905, + "grad_norm": 6.28125, + "learning_rate": 4.782164969878482e-06, + "loss": 1.5541626, + "memory(GiB)": 117.38, + "step": 42430, + "train_speed(iter/s)": 1.637086 + }, + { + "acc": 0.63486061, + "epoch": 1.07648401826484, + "grad_norm": 4.9375, + "learning_rate": 4.781117346658047e-06, + "loss": 1.62688007, + "memory(GiB)": 117.38, + "step": 42435, + "train_speed(iter/s)": 1.637106 + }, + { + "acc": 0.64601574, + "epoch": 1.0766108574327753, + "grad_norm": 5.6875, + "learning_rate": 4.780069733065048e-06, + "loss": 1.60367508, + "memory(GiB)": 117.38, + "step": 42440, + "train_speed(iter/s)": 1.637127 + }, + { + "acc": 0.6611485, + "epoch": 1.0767376966007103, + "grad_norm": 8.9375, + "learning_rate": 4.779022129145566e-06, + "loss": 1.55333118, + "memory(GiB)": 117.38, + "step": 42445, + "train_speed(iter/s)": 1.637149 + }, + { + "acc": 0.65445738, + "epoch": 1.0768645357686453, + "grad_norm": 5.59375, + "learning_rate": 4.777974534945677e-06, + "loss": 1.66348419, + "memory(GiB)": 117.38, + "step": 42450, + "train_speed(iter/s)": 1.63717 + }, + { + "acc": 0.65219812, + "epoch": 1.0769913749365805, + "grad_norm": 4.65625, + "learning_rate": 4.776926950511457e-06, + "loss": 1.65615845, + "memory(GiB)": 117.38, + "step": 42455, + "train_speed(iter/s)": 1.637191 + }, + { + "acc": 0.63933501, + "epoch": 1.0771182141045155, + "grad_norm": 5.5, + "learning_rate": 4.775879375888986e-06, + "loss": 1.68618317, + "memory(GiB)": 117.38, + "step": 42460, + "train_speed(iter/s)": 1.637211 + }, + { + "acc": 0.66758986, + "epoch": 1.0772450532724505, + "grad_norm": 5.34375, + "learning_rate": 4.774831811124343e-06, + "loss": 1.58380451, + "memory(GiB)": 117.38, + "step": 42465, + "train_speed(iter/s)": 1.63723 + }, + { + "acc": 0.65434456, + "epoch": 1.0773718924403857, + "grad_norm": 5.375, + "learning_rate": 4.773784256263601e-06, + "loss": 1.59232121, + "memory(GiB)": 117.38, + "step": 42470, + "train_speed(iter/s)": 1.637252 + }, + { + "acc": 0.64999151, + "epoch": 1.0774987316083207, + "grad_norm": 5.34375, + "learning_rate": 4.7727367113528374e-06, + "loss": 1.61950951, + "memory(GiB)": 117.38, + "step": 42475, + "train_speed(iter/s)": 1.637272 + }, + { + "acc": 0.6488997, + "epoch": 1.0776255707762556, + "grad_norm": 5.375, + "learning_rate": 4.771689176438128e-06, + "loss": 1.64509106, + "memory(GiB)": 117.38, + "step": 42480, + "train_speed(iter/s)": 1.637294 + }, + { + "acc": 0.65637975, + "epoch": 1.0777524099441909, + "grad_norm": 6.96875, + "learning_rate": 4.770641651565546e-06, + "loss": 1.53319883, + "memory(GiB)": 117.38, + "step": 42485, + "train_speed(iter/s)": 1.637315 + }, + { + "acc": 0.65938635, + "epoch": 1.0778792491121258, + "grad_norm": 5.6875, + "learning_rate": 4.769594136781172e-06, + "loss": 1.56976175, + "memory(GiB)": 117.38, + "step": 42490, + "train_speed(iter/s)": 1.637337 + }, + { + "acc": 0.66910419, + "epoch": 1.0780060882800608, + "grad_norm": 6.65625, + "learning_rate": 4.768546632131074e-06, + "loss": 1.58377504, + "memory(GiB)": 117.38, + "step": 42495, + "train_speed(iter/s)": 1.637359 + }, + { + "acc": 0.65934429, + "epoch": 1.0781329274479958, + "grad_norm": 8.3125, + "learning_rate": 4.767499137661328e-06, + "loss": 1.65002556, + "memory(GiB)": 117.38, + "step": 42500, + "train_speed(iter/s)": 1.637379 + }, + { + "acc": 0.67151184, + "epoch": 1.078259766615931, + "grad_norm": 4.59375, + "learning_rate": 4.76645165341801e-06, + "loss": 1.50512838, + "memory(GiB)": 117.38, + "step": 42505, + "train_speed(iter/s)": 1.637399 + }, + { + "acc": 0.64283657, + "epoch": 1.078386605783866, + "grad_norm": 5.84375, + "learning_rate": 4.76540417944719e-06, + "loss": 1.59125729, + "memory(GiB)": 117.38, + "step": 42510, + "train_speed(iter/s)": 1.637419 + }, + { + "acc": 0.66571417, + "epoch": 1.078513444951801, + "grad_norm": 4.71875, + "learning_rate": 4.764356715794942e-06, + "loss": 1.60510559, + "memory(GiB)": 117.38, + "step": 42515, + "train_speed(iter/s)": 1.63744 + }, + { + "acc": 0.6593575, + "epoch": 1.0786402841197362, + "grad_norm": 7.96875, + "learning_rate": 4.763309262507336e-06, + "loss": 1.55107212, + "memory(GiB)": 117.38, + "step": 42520, + "train_speed(iter/s)": 1.63746 + }, + { + "acc": 0.65673194, + "epoch": 1.0787671232876712, + "grad_norm": 6.875, + "learning_rate": 4.762261819630447e-06, + "loss": 1.6110527, + "memory(GiB)": 117.38, + "step": 42525, + "train_speed(iter/s)": 1.637481 + }, + { + "acc": 0.66404743, + "epoch": 1.0788939624556062, + "grad_norm": 5.75, + "learning_rate": 4.761214387210345e-06, + "loss": 1.5447691, + "memory(GiB)": 117.38, + "step": 42530, + "train_speed(iter/s)": 1.637503 + }, + { + "acc": 0.67320657, + "epoch": 1.0790208016235414, + "grad_norm": 5.375, + "learning_rate": 4.760166965293099e-06, + "loss": 1.58602543, + "memory(GiB)": 117.38, + "step": 42535, + "train_speed(iter/s)": 1.637524 + }, + { + "acc": 0.65891619, + "epoch": 1.0791476407914764, + "grad_norm": 6.59375, + "learning_rate": 4.759119553924781e-06, + "loss": 1.59398317, + "memory(GiB)": 117.38, + "step": 42540, + "train_speed(iter/s)": 1.637544 + }, + { + "acc": 0.65041962, + "epoch": 1.0792744799594114, + "grad_norm": 5.53125, + "learning_rate": 4.758072153151461e-06, + "loss": 1.56685324, + "memory(GiB)": 117.38, + "step": 42545, + "train_speed(iter/s)": 1.637565 + }, + { + "acc": 0.65652957, + "epoch": 1.0794013191273466, + "grad_norm": 5.65625, + "learning_rate": 4.757024763019209e-06, + "loss": 1.54740772, + "memory(GiB)": 117.38, + "step": 42550, + "train_speed(iter/s)": 1.637585 + }, + { + "acc": 0.65493364, + "epoch": 1.0795281582952816, + "grad_norm": 5.09375, + "learning_rate": 4.755977383574091e-06, + "loss": 1.58426991, + "memory(GiB)": 117.38, + "step": 42555, + "train_speed(iter/s)": 1.637607 + }, + { + "acc": 0.66231947, + "epoch": 1.0796549974632166, + "grad_norm": 5.6875, + "learning_rate": 4.754930014862177e-06, + "loss": 1.60242119, + "memory(GiB)": 117.38, + "step": 42560, + "train_speed(iter/s)": 1.637628 + }, + { + "acc": 0.66401873, + "epoch": 1.0797818366311518, + "grad_norm": 6.3125, + "learning_rate": 4.753882656929535e-06, + "loss": 1.64540787, + "memory(GiB)": 117.38, + "step": 42565, + "train_speed(iter/s)": 1.637649 + }, + { + "acc": 0.6570085, + "epoch": 1.0799086757990868, + "grad_norm": 5.25, + "learning_rate": 4.752835309822234e-06, + "loss": 1.60429153, + "memory(GiB)": 117.38, + "step": 42570, + "train_speed(iter/s)": 1.63767 + }, + { + "acc": 0.66406903, + "epoch": 1.0800355149670218, + "grad_norm": 5.96875, + "learning_rate": 4.7517879735863385e-06, + "loss": 1.57594776, + "memory(GiB)": 117.38, + "step": 42575, + "train_speed(iter/s)": 1.637691 + }, + { + "acc": 0.64655027, + "epoch": 1.080162354134957, + "grad_norm": 4.84375, + "learning_rate": 4.750740648267916e-06, + "loss": 1.62976341, + "memory(GiB)": 117.38, + "step": 42580, + "train_speed(iter/s)": 1.637712 + }, + { + "acc": 0.6545217, + "epoch": 1.080289193302892, + "grad_norm": 4.875, + "learning_rate": 4.749693333913033e-06, + "loss": 1.60688286, + "memory(GiB)": 117.38, + "step": 42585, + "train_speed(iter/s)": 1.637734 + }, + { + "acc": 0.65543194, + "epoch": 1.080416032470827, + "grad_norm": 5.84375, + "learning_rate": 4.748646030567755e-06, + "loss": 1.61598892, + "memory(GiB)": 117.38, + "step": 42590, + "train_speed(iter/s)": 1.637755 + }, + { + "acc": 0.66029997, + "epoch": 1.080542871638762, + "grad_norm": 6.3125, + "learning_rate": 4.747598738278147e-06, + "loss": 1.61568012, + "memory(GiB)": 117.38, + "step": 42595, + "train_speed(iter/s)": 1.637778 + }, + { + "acc": 0.6595809, + "epoch": 1.0806697108066972, + "grad_norm": 4.9375, + "learning_rate": 4.746551457090272e-06, + "loss": 1.61032467, + "memory(GiB)": 117.38, + "step": 42600, + "train_speed(iter/s)": 1.637799 + }, + { + "acc": 0.65252342, + "epoch": 1.0807965499746321, + "grad_norm": 5.25, + "learning_rate": 4.745504187050197e-06, + "loss": 1.65675735, + "memory(GiB)": 117.38, + "step": 42605, + "train_speed(iter/s)": 1.637819 + }, + { + "acc": 0.64243746, + "epoch": 1.0809233891425671, + "grad_norm": 5.5625, + "learning_rate": 4.744456928203985e-06, + "loss": 1.63453903, + "memory(GiB)": 117.38, + "step": 42610, + "train_speed(iter/s)": 1.637839 + }, + { + "acc": 0.65615311, + "epoch": 1.0810502283105023, + "grad_norm": 6.75, + "learning_rate": 4.743409680597695e-06, + "loss": 1.58209953, + "memory(GiB)": 117.38, + "step": 42615, + "train_speed(iter/s)": 1.637861 + }, + { + "acc": 0.64392343, + "epoch": 1.0811770674784373, + "grad_norm": 5.8125, + "learning_rate": 4.742362444277394e-06, + "loss": 1.64784203, + "memory(GiB)": 117.38, + "step": 42620, + "train_speed(iter/s)": 1.637884 + }, + { + "acc": 0.65613842, + "epoch": 1.0813039066463723, + "grad_norm": 5.28125, + "learning_rate": 4.741315219289142e-06, + "loss": 1.63932877, + "memory(GiB)": 117.38, + "step": 42625, + "train_speed(iter/s)": 1.637905 + }, + { + "acc": 0.6664278, + "epoch": 1.0814307458143075, + "grad_norm": 6.15625, + "learning_rate": 4.740268005679005e-06, + "loss": 1.55967646, + "memory(GiB)": 117.38, + "step": 42630, + "train_speed(iter/s)": 1.637926 + }, + { + "acc": 0.66025348, + "epoch": 1.0815575849822425, + "grad_norm": 7.03125, + "learning_rate": 4.739220803493039e-06, + "loss": 1.55208797, + "memory(GiB)": 117.38, + "step": 42635, + "train_speed(iter/s)": 1.637946 + }, + { + "acc": 0.65910797, + "epoch": 1.0816844241501775, + "grad_norm": 6.65625, + "learning_rate": 4.738173612777306e-06, + "loss": 1.65165787, + "memory(GiB)": 117.38, + "step": 42640, + "train_speed(iter/s)": 1.63797 + }, + { + "acc": 0.64050398, + "epoch": 1.0818112633181127, + "grad_norm": 5.3125, + "learning_rate": 4.737126433577866e-06, + "loss": 1.69772148, + "memory(GiB)": 117.38, + "step": 42645, + "train_speed(iter/s)": 1.637991 + }, + { + "acc": 0.6448998, + "epoch": 1.0819381024860477, + "grad_norm": 5.9375, + "learning_rate": 4.736079265940781e-06, + "loss": 1.64091969, + "memory(GiB)": 117.38, + "step": 42650, + "train_speed(iter/s)": 1.638012 + }, + { + "acc": 0.64359446, + "epoch": 1.0820649416539827, + "grad_norm": 6.46875, + "learning_rate": 4.735032109912107e-06, + "loss": 1.6278429, + "memory(GiB)": 117.38, + "step": 42655, + "train_speed(iter/s)": 1.638034 + }, + { + "acc": 0.64896083, + "epoch": 1.0821917808219177, + "grad_norm": 5.53125, + "learning_rate": 4.733984965537903e-06, + "loss": 1.59420853, + "memory(GiB)": 117.38, + "step": 42660, + "train_speed(iter/s)": 1.638055 + }, + { + "acc": 0.67471514, + "epoch": 1.082318619989853, + "grad_norm": 6.5625, + "learning_rate": 4.732937832864229e-06, + "loss": 1.52894554, + "memory(GiB)": 117.38, + "step": 42665, + "train_speed(iter/s)": 1.638076 + }, + { + "acc": 0.64166245, + "epoch": 1.082445459157788, + "grad_norm": 6.34375, + "learning_rate": 4.731890711937141e-06, + "loss": 1.64043427, + "memory(GiB)": 117.38, + "step": 42670, + "train_speed(iter/s)": 1.638098 + }, + { + "acc": 0.65850821, + "epoch": 1.0825722983257229, + "grad_norm": 7.15625, + "learning_rate": 4.730843602802696e-06, + "loss": 1.57808895, + "memory(GiB)": 117.38, + "step": 42675, + "train_speed(iter/s)": 1.638119 + }, + { + "acc": 0.66012197, + "epoch": 1.082699137493658, + "grad_norm": 7.0625, + "learning_rate": 4.729796505506951e-06, + "loss": 1.57560768, + "memory(GiB)": 117.38, + "step": 42680, + "train_speed(iter/s)": 1.638141 + }, + { + "acc": 0.6368959, + "epoch": 1.082825976661593, + "grad_norm": 5.0, + "learning_rate": 4.728749420095964e-06, + "loss": 1.65686798, + "memory(GiB)": 117.38, + "step": 42685, + "train_speed(iter/s)": 1.638162 + }, + { + "acc": 0.66116824, + "epoch": 1.082952815829528, + "grad_norm": 5.71875, + "learning_rate": 4.727702346615788e-06, + "loss": 1.65334988, + "memory(GiB)": 117.38, + "step": 42690, + "train_speed(iter/s)": 1.638184 + }, + { + "acc": 0.66066723, + "epoch": 1.0830796549974633, + "grad_norm": 4.625, + "learning_rate": 4.726655285112477e-06, + "loss": 1.56014986, + "memory(GiB)": 117.38, + "step": 42695, + "train_speed(iter/s)": 1.638204 + }, + { + "acc": 0.64992914, + "epoch": 1.0832064941653983, + "grad_norm": 6.21875, + "learning_rate": 4.725608235632088e-06, + "loss": 1.62344704, + "memory(GiB)": 117.38, + "step": 42700, + "train_speed(iter/s)": 1.638225 + }, + { + "acc": 0.65785198, + "epoch": 1.0833333333333333, + "grad_norm": 6.25, + "learning_rate": 4.724561198220672e-06, + "loss": 1.60169411, + "memory(GiB)": 117.38, + "step": 42705, + "train_speed(iter/s)": 1.638245 + }, + { + "acc": 0.6505753, + "epoch": 1.0834601725012685, + "grad_norm": 5.21875, + "learning_rate": 4.723514172924287e-06, + "loss": 1.62297802, + "memory(GiB)": 117.38, + "step": 42710, + "train_speed(iter/s)": 1.638266 + }, + { + "acc": 0.64711261, + "epoch": 1.0835870116692035, + "grad_norm": 5.0625, + "learning_rate": 4.7224671597889825e-06, + "loss": 1.62159233, + "memory(GiB)": 117.38, + "step": 42715, + "train_speed(iter/s)": 1.638286 + }, + { + "acc": 0.64322829, + "epoch": 1.0837138508371384, + "grad_norm": 6.84375, + "learning_rate": 4.72142015886081e-06, + "loss": 1.66040306, + "memory(GiB)": 117.38, + "step": 42720, + "train_speed(iter/s)": 1.638307 + }, + { + "acc": 0.67230005, + "epoch": 1.0838406900050737, + "grad_norm": 5.46875, + "learning_rate": 4.720373170185823e-06, + "loss": 1.54251385, + "memory(GiB)": 117.38, + "step": 42725, + "train_speed(iter/s)": 1.638327 + }, + { + "acc": 0.66142106, + "epoch": 1.0839675291730086, + "grad_norm": 6.53125, + "learning_rate": 4.719326193810075e-06, + "loss": 1.64745331, + "memory(GiB)": 117.38, + "step": 42730, + "train_speed(iter/s)": 1.638349 + }, + { + "acc": 0.64883218, + "epoch": 1.0840943683409436, + "grad_norm": 7.40625, + "learning_rate": 4.718279229779612e-06, + "loss": 1.65219688, + "memory(GiB)": 117.38, + "step": 42735, + "train_speed(iter/s)": 1.638371 + }, + { + "acc": 0.6469192, + "epoch": 1.0842212075088788, + "grad_norm": 5.5, + "learning_rate": 4.717232278140485e-06, + "loss": 1.67322044, + "memory(GiB)": 117.38, + "step": 42740, + "train_speed(iter/s)": 1.638391 + }, + { + "acc": 0.67113018, + "epoch": 1.0843480466768138, + "grad_norm": 5.21875, + "learning_rate": 4.716185338938746e-06, + "loss": 1.53260422, + "memory(GiB)": 117.38, + "step": 42745, + "train_speed(iter/s)": 1.638411 + }, + { + "acc": 0.64632063, + "epoch": 1.0844748858447488, + "grad_norm": 6.65625, + "learning_rate": 4.7151384122204445e-06, + "loss": 1.57729111, + "memory(GiB)": 117.38, + "step": 42750, + "train_speed(iter/s)": 1.638433 + }, + { + "acc": 0.64745231, + "epoch": 1.0846017250126838, + "grad_norm": 6.15625, + "learning_rate": 4.7140914980316254e-06, + "loss": 1.58687439, + "memory(GiB)": 117.38, + "step": 42755, + "train_speed(iter/s)": 1.638454 + }, + { + "acc": 0.65624466, + "epoch": 1.084728564180619, + "grad_norm": 6.59375, + "learning_rate": 4.713044596418339e-06, + "loss": 1.63883438, + "memory(GiB)": 117.38, + "step": 42760, + "train_speed(iter/s)": 1.638475 + }, + { + "acc": 0.65851412, + "epoch": 1.084855403348554, + "grad_norm": 6.03125, + "learning_rate": 4.711997707426632e-06, + "loss": 1.60370827, + "memory(GiB)": 117.38, + "step": 42765, + "train_speed(iter/s)": 1.638495 + }, + { + "acc": 0.65490026, + "epoch": 1.084982242516489, + "grad_norm": 5.78125, + "learning_rate": 4.710950831102555e-06, + "loss": 1.60263519, + "memory(GiB)": 117.38, + "step": 42770, + "train_speed(iter/s)": 1.638516 + }, + { + "acc": 0.65141826, + "epoch": 1.0851090816844242, + "grad_norm": 6.25, + "learning_rate": 4.709903967492147e-06, + "loss": 1.57416229, + "memory(GiB)": 117.38, + "step": 42775, + "train_speed(iter/s)": 1.638538 + }, + { + "acc": 0.64909539, + "epoch": 1.0852359208523592, + "grad_norm": 5.75, + "learning_rate": 4.7088571166414595e-06, + "loss": 1.59110508, + "memory(GiB)": 117.38, + "step": 42780, + "train_speed(iter/s)": 1.638558 + }, + { + "acc": 0.6568511, + "epoch": 1.0853627600202942, + "grad_norm": 6.15625, + "learning_rate": 4.707810278596534e-06, + "loss": 1.62210732, + "memory(GiB)": 117.38, + "step": 42785, + "train_speed(iter/s)": 1.638579 + }, + { + "acc": 0.6650569, + "epoch": 1.0854895991882294, + "grad_norm": 6.875, + "learning_rate": 4.7067634534034205e-06, + "loss": 1.60056992, + "memory(GiB)": 117.38, + "step": 42790, + "train_speed(iter/s)": 1.638599 + }, + { + "acc": 0.65203667, + "epoch": 1.0856164383561644, + "grad_norm": 6.65625, + "learning_rate": 4.705716641108157e-06, + "loss": 1.59725084, + "memory(GiB)": 117.38, + "step": 42795, + "train_speed(iter/s)": 1.638621 + }, + { + "acc": 0.65436087, + "epoch": 1.0857432775240994, + "grad_norm": 6.46875, + "learning_rate": 4.7046698417567894e-06, + "loss": 1.63516769, + "memory(GiB)": 117.38, + "step": 42800, + "train_speed(iter/s)": 1.638641 + }, + { + "acc": 0.65082111, + "epoch": 1.0858701166920346, + "grad_norm": 5.875, + "learning_rate": 4.7036230553953616e-06, + "loss": 1.61057377, + "memory(GiB)": 117.38, + "step": 42805, + "train_speed(iter/s)": 1.638662 + }, + { + "acc": 0.66845093, + "epoch": 1.0859969558599696, + "grad_norm": 6.34375, + "learning_rate": 4.702576282069916e-06, + "loss": 1.53074169, + "memory(GiB)": 117.38, + "step": 42810, + "train_speed(iter/s)": 1.638681 + }, + { + "acc": 0.64389658, + "epoch": 1.0861237950279046, + "grad_norm": 6.0, + "learning_rate": 4.701529521826492e-06, + "loss": 1.6562664, + "memory(GiB)": 117.38, + "step": 42815, + "train_speed(iter/s)": 1.638703 + }, + { + "acc": 0.66528811, + "epoch": 1.0862506341958396, + "grad_norm": 5.3125, + "learning_rate": 4.700482774711131e-06, + "loss": 1.60115089, + "memory(GiB)": 117.38, + "step": 42820, + "train_speed(iter/s)": 1.638725 + }, + { + "acc": 0.66938047, + "epoch": 1.0863774733637748, + "grad_norm": 6.25, + "learning_rate": 4.699436040769877e-06, + "loss": 1.49395771, + "memory(GiB)": 117.38, + "step": 42825, + "train_speed(iter/s)": 1.638745 + }, + { + "acc": 0.66367378, + "epoch": 1.0865043125317098, + "grad_norm": 6.15625, + "learning_rate": 4.698389320048768e-06, + "loss": 1.59599857, + "memory(GiB)": 117.38, + "step": 42830, + "train_speed(iter/s)": 1.638766 + }, + { + "acc": 0.63971758, + "epoch": 1.0866311516996447, + "grad_norm": 5.40625, + "learning_rate": 4.697342612593841e-06, + "loss": 1.63671455, + "memory(GiB)": 117.38, + "step": 42835, + "train_speed(iter/s)": 1.638786 + }, + { + "acc": 0.65135756, + "epoch": 1.08675799086758, + "grad_norm": 5.125, + "learning_rate": 4.696295918451139e-06, + "loss": 1.56955032, + "memory(GiB)": 117.38, + "step": 42840, + "train_speed(iter/s)": 1.638806 + }, + { + "acc": 0.66414266, + "epoch": 1.086884830035515, + "grad_norm": 5.25, + "learning_rate": 4.695249237666697e-06, + "loss": 1.56630802, + "memory(GiB)": 117.38, + "step": 42845, + "train_speed(iter/s)": 1.638828 + }, + { + "acc": 0.67779994, + "epoch": 1.08701166920345, + "grad_norm": 6.09375, + "learning_rate": 4.694202570286556e-06, + "loss": 1.53982239, + "memory(GiB)": 117.38, + "step": 42850, + "train_speed(iter/s)": 1.638848 + }, + { + "acc": 0.65204387, + "epoch": 1.0871385083713851, + "grad_norm": 5.75, + "learning_rate": 4.693155916356751e-06, + "loss": 1.65156517, + "memory(GiB)": 117.38, + "step": 42855, + "train_speed(iter/s)": 1.638869 + }, + { + "acc": 0.63524466, + "epoch": 1.0872653475393201, + "grad_norm": 5.71875, + "learning_rate": 4.692109275923318e-06, + "loss": 1.68566284, + "memory(GiB)": 117.38, + "step": 42860, + "train_speed(iter/s)": 1.638889 + }, + { + "acc": 0.65680737, + "epoch": 1.0873921867072551, + "grad_norm": 7.625, + "learning_rate": 4.6910626490322925e-06, + "loss": 1.65174942, + "memory(GiB)": 117.38, + "step": 42865, + "train_speed(iter/s)": 1.63891 + }, + { + "acc": 0.65737972, + "epoch": 1.0875190258751903, + "grad_norm": 7.375, + "learning_rate": 4.690016035729714e-06, + "loss": 1.58031292, + "memory(GiB)": 117.38, + "step": 42870, + "train_speed(iter/s)": 1.638931 + }, + { + "acc": 0.6521378, + "epoch": 1.0876458650431253, + "grad_norm": 7.28125, + "learning_rate": 4.688969436061612e-06, + "loss": 1.63118896, + "memory(GiB)": 117.38, + "step": 42875, + "train_speed(iter/s)": 1.638953 + }, + { + "acc": 0.66498389, + "epoch": 1.0877727042110603, + "grad_norm": 6.6875, + "learning_rate": 4.687922850074022e-06, + "loss": 1.56488295, + "memory(GiB)": 117.38, + "step": 42880, + "train_speed(iter/s)": 1.638975 + }, + { + "acc": 0.66012826, + "epoch": 1.0878995433789955, + "grad_norm": 5.34375, + "learning_rate": 4.686876277812981e-06, + "loss": 1.66019077, + "memory(GiB)": 117.38, + "step": 42885, + "train_speed(iter/s)": 1.638996 + }, + { + "acc": 0.64414916, + "epoch": 1.0880263825469305, + "grad_norm": 5.65625, + "learning_rate": 4.685829719324519e-06, + "loss": 1.6694931, + "memory(GiB)": 117.38, + "step": 42890, + "train_speed(iter/s)": 1.639017 + }, + { + "acc": 0.64602871, + "epoch": 1.0881532217148655, + "grad_norm": 5.84375, + "learning_rate": 4.6847831746546664e-06, + "loss": 1.68436737, + "memory(GiB)": 117.38, + "step": 42895, + "train_speed(iter/s)": 1.639039 + }, + { + "acc": 0.65834413, + "epoch": 1.0882800608828007, + "grad_norm": 9.0625, + "learning_rate": 4.683736643849459e-06, + "loss": 1.58339367, + "memory(GiB)": 117.38, + "step": 42900, + "train_speed(iter/s)": 1.639061 + }, + { + "acc": 0.65892487, + "epoch": 1.0884069000507357, + "grad_norm": 5.8125, + "learning_rate": 4.6826901269549255e-06, + "loss": 1.56407623, + "memory(GiB)": 117.38, + "step": 42905, + "train_speed(iter/s)": 1.639082 + }, + { + "acc": 0.64861221, + "epoch": 1.0885337392186707, + "grad_norm": 7.21875, + "learning_rate": 4.681643624017097e-06, + "loss": 1.61880608, + "memory(GiB)": 117.38, + "step": 42910, + "train_speed(iter/s)": 1.639104 + }, + { + "acc": 0.67106977, + "epoch": 1.0886605783866057, + "grad_norm": 6.0, + "learning_rate": 4.680597135082002e-06, + "loss": 1.51148872, + "memory(GiB)": 117.38, + "step": 42915, + "train_speed(iter/s)": 1.639126 + }, + { + "acc": 0.67631531, + "epoch": 1.088787417554541, + "grad_norm": 6.96875, + "learning_rate": 4.679550660195673e-06, + "loss": 1.54404764, + "memory(GiB)": 117.38, + "step": 42920, + "train_speed(iter/s)": 1.639147 + }, + { + "acc": 0.65046248, + "epoch": 1.0889142567224759, + "grad_norm": 8.0, + "learning_rate": 4.6785041994041345e-06, + "loss": 1.62483177, + "memory(GiB)": 117.38, + "step": 42925, + "train_speed(iter/s)": 1.63917 + }, + { + "acc": 0.64308729, + "epoch": 1.0890410958904109, + "grad_norm": 5.59375, + "learning_rate": 4.6774577527534195e-06, + "loss": 1.64539547, + "memory(GiB)": 117.38, + "step": 42930, + "train_speed(iter/s)": 1.639192 + }, + { + "acc": 0.66263704, + "epoch": 1.089167935058346, + "grad_norm": 5.03125, + "learning_rate": 4.676411320289551e-06, + "loss": 1.61424751, + "memory(GiB)": 117.38, + "step": 42935, + "train_speed(iter/s)": 1.639214 + }, + { + "acc": 0.65977211, + "epoch": 1.089294774226281, + "grad_norm": 7.28125, + "learning_rate": 4.675364902058556e-06, + "loss": 1.596766, + "memory(GiB)": 117.38, + "step": 42940, + "train_speed(iter/s)": 1.639236 + }, + { + "acc": 0.68072262, + "epoch": 1.089421613394216, + "grad_norm": 6.125, + "learning_rate": 4.674318498106464e-06, + "loss": 1.58252411, + "memory(GiB)": 117.38, + "step": 42945, + "train_speed(iter/s)": 1.639258 + }, + { + "acc": 0.66876469, + "epoch": 1.0895484525621513, + "grad_norm": 7.21875, + "learning_rate": 4.6732721084792985e-06, + "loss": 1.57067003, + "memory(GiB)": 117.38, + "step": 42950, + "train_speed(iter/s)": 1.63928 + }, + { + "acc": 0.64119525, + "epoch": 1.0896752917300863, + "grad_norm": 6.6875, + "learning_rate": 4.672225733223084e-06, + "loss": 1.67180595, + "memory(GiB)": 117.38, + "step": 42955, + "train_speed(iter/s)": 1.639302 + }, + { + "acc": 0.66922359, + "epoch": 1.0898021308980212, + "grad_norm": 6.28125, + "learning_rate": 4.671179372383844e-06, + "loss": 1.56107616, + "memory(GiB)": 117.38, + "step": 42960, + "train_speed(iter/s)": 1.639323 + }, + { + "acc": 0.65076776, + "epoch": 1.0899289700659565, + "grad_norm": 5.78125, + "learning_rate": 4.670133026007604e-06, + "loss": 1.66118851, + "memory(GiB)": 117.38, + "step": 42965, + "train_speed(iter/s)": 1.639344 + }, + { + "acc": 0.65551682, + "epoch": 1.0900558092338914, + "grad_norm": 6.46875, + "learning_rate": 4.669086694140388e-06, + "loss": 1.64479065, + "memory(GiB)": 117.38, + "step": 42970, + "train_speed(iter/s)": 1.639364 + }, + { + "acc": 0.65045419, + "epoch": 1.0901826484018264, + "grad_norm": 4.90625, + "learning_rate": 4.668040376828214e-06, + "loss": 1.57832069, + "memory(GiB)": 117.38, + "step": 42975, + "train_speed(iter/s)": 1.639384 + }, + { + "acc": 0.67459941, + "epoch": 1.0903094875697614, + "grad_norm": 5.625, + "learning_rate": 4.666994074117108e-06, + "loss": 1.59497089, + "memory(GiB)": 117.38, + "step": 42980, + "train_speed(iter/s)": 1.639405 + }, + { + "acc": 0.64645042, + "epoch": 1.0904363267376966, + "grad_norm": 5.53125, + "learning_rate": 4.665947786053088e-06, + "loss": 1.64929523, + "memory(GiB)": 117.38, + "step": 42985, + "train_speed(iter/s)": 1.639424 + }, + { + "acc": 0.65418391, + "epoch": 1.0905631659056316, + "grad_norm": 5.59375, + "learning_rate": 4.664901512682179e-06, + "loss": 1.60165024, + "memory(GiB)": 117.38, + "step": 42990, + "train_speed(iter/s)": 1.639446 + }, + { + "acc": 0.6610569, + "epoch": 1.0906900050735666, + "grad_norm": 7.46875, + "learning_rate": 4.663855254050394e-06, + "loss": 1.5978941, + "memory(GiB)": 117.38, + "step": 42995, + "train_speed(iter/s)": 1.639466 + }, + { + "acc": 0.64211078, + "epoch": 1.0908168442415018, + "grad_norm": 5.84375, + "learning_rate": 4.662809010203757e-06, + "loss": 1.6563961, + "memory(GiB)": 117.38, + "step": 43000, + "train_speed(iter/s)": 1.639487 + }, + { + "epoch": 1.0908168442415018, + "eval_acc": 0.6461507297585709, + "eval_loss": 1.5734246969223022, + "eval_runtime": 58.6033, + "eval_samples_per_second": 108.697, + "eval_steps_per_second": 27.183, + "step": 43000 + }, + { + "acc": 0.6504776, + "epoch": 1.0909436834094368, + "grad_norm": 5.75, + "learning_rate": 4.661762781188284e-06, + "loss": 1.60215588, + "memory(GiB)": 117.38, + "step": 43005, + "train_speed(iter/s)": 1.635586 + }, + { + "acc": 0.65855298, + "epoch": 1.0910705225773718, + "grad_norm": 4.9375, + "learning_rate": 4.660716567049997e-06, + "loss": 1.62584953, + "memory(GiB)": 117.38, + "step": 43010, + "train_speed(iter/s)": 1.635607 + }, + { + "acc": 0.65604277, + "epoch": 1.091197361745307, + "grad_norm": 6.78125, + "learning_rate": 4.659670367834908e-06, + "loss": 1.61244316, + "memory(GiB)": 117.38, + "step": 43015, + "train_speed(iter/s)": 1.635627 + }, + { + "acc": 0.65257726, + "epoch": 1.091324200913242, + "grad_norm": 6.625, + "learning_rate": 4.658624183589035e-06, + "loss": 1.61347485, + "memory(GiB)": 117.38, + "step": 43020, + "train_speed(iter/s)": 1.635649 + }, + { + "acc": 0.66370211, + "epoch": 1.091451040081177, + "grad_norm": 4.96875, + "learning_rate": 4.657578014358395e-06, + "loss": 1.58295498, + "memory(GiB)": 117.38, + "step": 43025, + "train_speed(iter/s)": 1.63567 + }, + { + "acc": 0.64700222, + "epoch": 1.0915778792491122, + "grad_norm": 5.3125, + "learning_rate": 4.656531860189005e-06, + "loss": 1.60024452, + "memory(GiB)": 117.38, + "step": 43030, + "train_speed(iter/s)": 1.635691 + }, + { + "acc": 0.65322824, + "epoch": 1.0917047184170472, + "grad_norm": 5.625, + "learning_rate": 4.655485721126875e-06, + "loss": 1.61180763, + "memory(GiB)": 117.38, + "step": 43035, + "train_speed(iter/s)": 1.635713 + }, + { + "acc": 0.64594069, + "epoch": 1.0918315575849822, + "grad_norm": 8.5625, + "learning_rate": 4.6544395972180214e-06, + "loss": 1.66605377, + "memory(GiB)": 117.38, + "step": 43040, + "train_speed(iter/s)": 1.635734 + }, + { + "acc": 0.64300685, + "epoch": 1.0919583967529174, + "grad_norm": 5.4375, + "learning_rate": 4.653393488508457e-06, + "loss": 1.63638954, + "memory(GiB)": 117.38, + "step": 43045, + "train_speed(iter/s)": 1.635755 + }, + { + "acc": 0.64903917, + "epoch": 1.0920852359208524, + "grad_norm": 6.3125, + "learning_rate": 4.652347395044197e-06, + "loss": 1.59003773, + "memory(GiB)": 117.38, + "step": 43050, + "train_speed(iter/s)": 1.635776 + }, + { + "acc": 0.65993395, + "epoch": 1.0922120750887874, + "grad_norm": 4.90625, + "learning_rate": 4.651301316871247e-06, + "loss": 1.61316166, + "memory(GiB)": 117.38, + "step": 43055, + "train_speed(iter/s)": 1.635796 + }, + { + "acc": 0.6553215, + "epoch": 1.0923389142567226, + "grad_norm": 5.4375, + "learning_rate": 4.6502552540356235e-06, + "loss": 1.58458881, + "memory(GiB)": 117.38, + "step": 43060, + "train_speed(iter/s)": 1.635818 + }, + { + "acc": 0.66177025, + "epoch": 1.0924657534246576, + "grad_norm": 5.78125, + "learning_rate": 4.649209206583335e-06, + "loss": 1.6367878, + "memory(GiB)": 117.38, + "step": 43065, + "train_speed(iter/s)": 1.635839 + }, + { + "acc": 0.6725316, + "epoch": 1.0925925925925926, + "grad_norm": 5.34375, + "learning_rate": 4.648163174560393e-06, + "loss": 1.55871553, + "memory(GiB)": 117.38, + "step": 43070, + "train_speed(iter/s)": 1.635861 + }, + { + "acc": 0.66052008, + "epoch": 1.0927194317605275, + "grad_norm": 5.375, + "learning_rate": 4.647117158012804e-06, + "loss": 1.58466578, + "memory(GiB)": 117.38, + "step": 43075, + "train_speed(iter/s)": 1.635881 + }, + { + "acc": 0.65092411, + "epoch": 1.0928462709284628, + "grad_norm": 5.625, + "learning_rate": 4.646071156986579e-06, + "loss": 1.55119801, + "memory(GiB)": 117.38, + "step": 43080, + "train_speed(iter/s)": 1.635903 + }, + { + "acc": 0.6483139, + "epoch": 1.0929731100963977, + "grad_norm": 5.96875, + "learning_rate": 4.645025171527723e-06, + "loss": 1.64346695, + "memory(GiB)": 117.38, + "step": 43085, + "train_speed(iter/s)": 1.635924 + }, + { + "acc": 0.64516344, + "epoch": 1.0930999492643327, + "grad_norm": 5.46875, + "learning_rate": 4.643979201682247e-06, + "loss": 1.62893448, + "memory(GiB)": 117.38, + "step": 43090, + "train_speed(iter/s)": 1.635945 + }, + { + "acc": 0.64851823, + "epoch": 1.093226788432268, + "grad_norm": 7.28125, + "learning_rate": 4.642933247496155e-06, + "loss": 1.65581131, + "memory(GiB)": 117.38, + "step": 43095, + "train_speed(iter/s)": 1.635966 + }, + { + "acc": 0.65348835, + "epoch": 1.093353627600203, + "grad_norm": 6.375, + "learning_rate": 4.641887309015451e-06, + "loss": 1.63301296, + "memory(GiB)": 117.38, + "step": 43100, + "train_speed(iter/s)": 1.635986 + }, + { + "acc": 0.66231995, + "epoch": 1.093480466768138, + "grad_norm": 4.875, + "learning_rate": 4.640841386286143e-06, + "loss": 1.52981148, + "memory(GiB)": 117.38, + "step": 43105, + "train_speed(iter/s)": 1.636006 + }, + { + "acc": 0.65333753, + "epoch": 1.0936073059360731, + "grad_norm": 5.875, + "learning_rate": 4.639795479354236e-06, + "loss": 1.65804787, + "memory(GiB)": 117.38, + "step": 43110, + "train_speed(iter/s)": 1.636028 + }, + { + "acc": 0.66842785, + "epoch": 1.0937341451040081, + "grad_norm": 8.5, + "learning_rate": 4.6387495882657295e-06, + "loss": 1.6123127, + "memory(GiB)": 117.38, + "step": 43115, + "train_speed(iter/s)": 1.636049 + }, + { + "acc": 0.65867653, + "epoch": 1.0938609842719431, + "grad_norm": 5.5, + "learning_rate": 4.63770371306663e-06, + "loss": 1.60763702, + "memory(GiB)": 117.38, + "step": 43120, + "train_speed(iter/s)": 1.63607 + }, + { + "acc": 0.65375547, + "epoch": 1.0939878234398783, + "grad_norm": 5.90625, + "learning_rate": 4.636657853802939e-06, + "loss": 1.53959503, + "memory(GiB)": 117.38, + "step": 43125, + "train_speed(iter/s)": 1.636091 + }, + { + "acc": 0.67584362, + "epoch": 1.0941146626078133, + "grad_norm": 6.0625, + "learning_rate": 4.635612010520659e-06, + "loss": 1.62089291, + "memory(GiB)": 117.38, + "step": 43130, + "train_speed(iter/s)": 1.636113 + }, + { + "acc": 0.65151534, + "epoch": 1.0942415017757483, + "grad_norm": 5.9375, + "learning_rate": 4.6345661832657866e-06, + "loss": 1.62925224, + "memory(GiB)": 117.38, + "step": 43135, + "train_speed(iter/s)": 1.636134 + }, + { + "acc": 0.66400709, + "epoch": 1.0943683409436833, + "grad_norm": 5.375, + "learning_rate": 4.633520372084327e-06, + "loss": 1.55168896, + "memory(GiB)": 117.38, + "step": 43140, + "train_speed(iter/s)": 1.636155 + }, + { + "acc": 0.64783411, + "epoch": 1.0944951801116185, + "grad_norm": 5.625, + "learning_rate": 4.632474577022276e-06, + "loss": 1.66214561, + "memory(GiB)": 117.38, + "step": 43145, + "train_speed(iter/s)": 1.636177 + }, + { + "acc": 0.66868038, + "epoch": 1.0946220192795535, + "grad_norm": 5.46875, + "learning_rate": 4.631428798125637e-06, + "loss": 1.58520069, + "memory(GiB)": 117.38, + "step": 43150, + "train_speed(iter/s)": 1.636197 + }, + { + "acc": 0.65542531, + "epoch": 1.0947488584474885, + "grad_norm": 4.75, + "learning_rate": 4.630383035440403e-06, + "loss": 1.60485611, + "memory(GiB)": 117.38, + "step": 43155, + "train_speed(iter/s)": 1.636218 + }, + { + "acc": 0.66994524, + "epoch": 1.0948756976154237, + "grad_norm": 5.40625, + "learning_rate": 4.6293372890125724e-06, + "loss": 1.59670334, + "memory(GiB)": 117.38, + "step": 43160, + "train_speed(iter/s)": 1.636235 + }, + { + "acc": 0.65548763, + "epoch": 1.0950025367833587, + "grad_norm": 4.78125, + "learning_rate": 4.628291558888144e-06, + "loss": 1.63418484, + "memory(GiB)": 117.38, + "step": 43165, + "train_speed(iter/s)": 1.636255 + }, + { + "acc": 0.66280289, + "epoch": 1.0951293759512937, + "grad_norm": 5.9375, + "learning_rate": 4.627245845113113e-06, + "loss": 1.59679852, + "memory(GiB)": 117.38, + "step": 43170, + "train_speed(iter/s)": 1.636277 + }, + { + "acc": 0.65032034, + "epoch": 1.0952562151192289, + "grad_norm": 6.21875, + "learning_rate": 4.626200147733474e-06, + "loss": 1.59159489, + "memory(GiB)": 117.38, + "step": 43175, + "train_speed(iter/s)": 1.636298 + }, + { + "acc": 0.6477489, + "epoch": 1.0953830542871639, + "grad_norm": 5.625, + "learning_rate": 4.62515446679522e-06, + "loss": 1.66620884, + "memory(GiB)": 117.38, + "step": 43180, + "train_speed(iter/s)": 1.636318 + }, + { + "acc": 0.67817831, + "epoch": 1.0955098934550989, + "grad_norm": 5.375, + "learning_rate": 4.624108802344347e-06, + "loss": 1.49521027, + "memory(GiB)": 117.38, + "step": 43185, + "train_speed(iter/s)": 1.63634 + }, + { + "acc": 0.66555071, + "epoch": 1.095636732623034, + "grad_norm": 6.9375, + "learning_rate": 4.623063154426848e-06, + "loss": 1.56619835, + "memory(GiB)": 117.38, + "step": 43190, + "train_speed(iter/s)": 1.636361 + }, + { + "acc": 0.65065045, + "epoch": 1.095763571790969, + "grad_norm": 5.78125, + "learning_rate": 4.622017523088712e-06, + "loss": 1.59552631, + "memory(GiB)": 117.38, + "step": 43195, + "train_speed(iter/s)": 1.636383 + }, + { + "acc": 0.65362248, + "epoch": 1.095890410958904, + "grad_norm": 6.0, + "learning_rate": 4.620971908375934e-06, + "loss": 1.62507133, + "memory(GiB)": 117.38, + "step": 43200, + "train_speed(iter/s)": 1.636404 + }, + { + "acc": 0.6748086, + "epoch": 1.0960172501268393, + "grad_norm": 6.9375, + "learning_rate": 4.619926310334503e-06, + "loss": 1.53247128, + "memory(GiB)": 117.38, + "step": 43205, + "train_speed(iter/s)": 1.636425 + }, + { + "acc": 0.64898806, + "epoch": 1.0961440892947742, + "grad_norm": 6.125, + "learning_rate": 4.618880729010413e-06, + "loss": 1.59856415, + "memory(GiB)": 117.38, + "step": 43210, + "train_speed(iter/s)": 1.636448 + }, + { + "acc": 0.66861658, + "epoch": 1.0962709284627092, + "grad_norm": 5.59375, + "learning_rate": 4.617835164449647e-06, + "loss": 1.52230234, + "memory(GiB)": 117.38, + "step": 43215, + "train_speed(iter/s)": 1.636468 + }, + { + "acc": 0.66404819, + "epoch": 1.0963977676306444, + "grad_norm": 7.21875, + "learning_rate": 4.616789616698197e-06, + "loss": 1.57905912, + "memory(GiB)": 117.38, + "step": 43220, + "train_speed(iter/s)": 1.63649 + }, + { + "acc": 0.63807912, + "epoch": 1.0965246067985794, + "grad_norm": 7.59375, + "learning_rate": 4.61574408580205e-06, + "loss": 1.66577148, + "memory(GiB)": 117.38, + "step": 43225, + "train_speed(iter/s)": 1.63651 + }, + { + "acc": 0.66725111, + "epoch": 1.0966514459665144, + "grad_norm": 5.78125, + "learning_rate": 4.614698571807196e-06, + "loss": 1.54518785, + "memory(GiB)": 117.38, + "step": 43230, + "train_speed(iter/s)": 1.636531 + }, + { + "acc": 0.65628242, + "epoch": 1.0967782851344494, + "grad_norm": 5.71875, + "learning_rate": 4.6136530747596185e-06, + "loss": 1.59053173, + "memory(GiB)": 117.38, + "step": 43235, + "train_speed(iter/s)": 1.636554 + }, + { + "acc": 0.65458994, + "epoch": 1.0969051243023846, + "grad_norm": 5.46875, + "learning_rate": 4.612607594705301e-06, + "loss": 1.65995922, + "memory(GiB)": 117.38, + "step": 43240, + "train_speed(iter/s)": 1.636575 + }, + { + "acc": 0.6680191, + "epoch": 1.0970319634703196, + "grad_norm": 5.96875, + "learning_rate": 4.611562131690234e-06, + "loss": 1.56793299, + "memory(GiB)": 117.38, + "step": 43245, + "train_speed(iter/s)": 1.636596 + }, + { + "acc": 0.65579796, + "epoch": 1.0971588026382546, + "grad_norm": 5.78125, + "learning_rate": 4.610516685760399e-06, + "loss": 1.57751884, + "memory(GiB)": 117.38, + "step": 43250, + "train_speed(iter/s)": 1.636617 + }, + { + "acc": 0.66340847, + "epoch": 1.0972856418061898, + "grad_norm": 5.5625, + "learning_rate": 4.6094712569617775e-06, + "loss": 1.50516148, + "memory(GiB)": 117.38, + "step": 43255, + "train_speed(iter/s)": 1.636637 + }, + { + "acc": 0.64909182, + "epoch": 1.0974124809741248, + "grad_norm": 5.28125, + "learning_rate": 4.608425845340353e-06, + "loss": 1.63684902, + "memory(GiB)": 117.38, + "step": 43260, + "train_speed(iter/s)": 1.636659 + }, + { + "acc": 0.63719807, + "epoch": 1.0975393201420598, + "grad_norm": 6.09375, + "learning_rate": 4.607380450942109e-06, + "loss": 1.66330795, + "memory(GiB)": 117.38, + "step": 43265, + "train_speed(iter/s)": 1.636682 + }, + { + "acc": 0.65189552, + "epoch": 1.097666159309995, + "grad_norm": 6.15625, + "learning_rate": 4.606335073813028e-06, + "loss": 1.64268112, + "memory(GiB)": 117.38, + "step": 43270, + "train_speed(iter/s)": 1.636704 + }, + { + "acc": 0.64464512, + "epoch": 1.09779299847793, + "grad_norm": 5.0625, + "learning_rate": 4.605289713999085e-06, + "loss": 1.6694376, + "memory(GiB)": 117.38, + "step": 43275, + "train_speed(iter/s)": 1.636726 + }, + { + "acc": 0.65150423, + "epoch": 1.097919837645865, + "grad_norm": 4.625, + "learning_rate": 4.604244371546263e-06, + "loss": 1.60193024, + "memory(GiB)": 117.38, + "step": 43280, + "train_speed(iter/s)": 1.636747 + }, + { + "acc": 0.64329195, + "epoch": 1.0980466768138002, + "grad_norm": 5.34375, + "learning_rate": 4.603199046500539e-06, + "loss": 1.6610939, + "memory(GiB)": 117.38, + "step": 43285, + "train_speed(iter/s)": 1.636769 + }, + { + "acc": 0.65891066, + "epoch": 1.0981735159817352, + "grad_norm": 5.75, + "learning_rate": 4.602153738907896e-06, + "loss": 1.62164211, + "memory(GiB)": 117.38, + "step": 43290, + "train_speed(iter/s)": 1.63679 + }, + { + "acc": 0.66268911, + "epoch": 1.0983003551496702, + "grad_norm": 5.75, + "learning_rate": 4.601108448814306e-06, + "loss": 1.60490112, + "memory(GiB)": 117.38, + "step": 43295, + "train_speed(iter/s)": 1.636812 + }, + { + "acc": 0.64971757, + "epoch": 1.0984271943176052, + "grad_norm": 4.8125, + "learning_rate": 4.600063176265749e-06, + "loss": 1.5322319, + "memory(GiB)": 117.38, + "step": 43300, + "train_speed(iter/s)": 1.636833 + }, + { + "acc": 0.66501102, + "epoch": 1.0985540334855404, + "grad_norm": 5.5625, + "learning_rate": 4.599017921308196e-06, + "loss": 1.57546043, + "memory(GiB)": 117.38, + "step": 43305, + "train_speed(iter/s)": 1.636854 + }, + { + "acc": 0.64381585, + "epoch": 1.0986808726534754, + "grad_norm": 5.9375, + "learning_rate": 4.5979726839876285e-06, + "loss": 1.61664104, + "memory(GiB)": 117.38, + "step": 43310, + "train_speed(iter/s)": 1.636875 + }, + { + "acc": 0.64273753, + "epoch": 1.0988077118214103, + "grad_norm": 5.5, + "learning_rate": 4.596927464350015e-06, + "loss": 1.63682766, + "memory(GiB)": 117.38, + "step": 43315, + "train_speed(iter/s)": 1.636897 + }, + { + "acc": 0.64000797, + "epoch": 1.0989345509893456, + "grad_norm": 5.28125, + "learning_rate": 4.595882262441331e-06, + "loss": 1.61038322, + "memory(GiB)": 117.38, + "step": 43320, + "train_speed(iter/s)": 1.636588 + }, + { + "acc": 0.66831989, + "epoch": 1.0990613901572805, + "grad_norm": 4.46875, + "learning_rate": 4.5948370783075505e-06, + "loss": 1.53623734, + "memory(GiB)": 117.38, + "step": 43325, + "train_speed(iter/s)": 1.636609 + }, + { + "acc": 0.65933323, + "epoch": 1.0991882293252155, + "grad_norm": 6.21875, + "learning_rate": 4.5937919119946445e-06, + "loss": 1.630583, + "memory(GiB)": 117.38, + "step": 43330, + "train_speed(iter/s)": 1.63663 + }, + { + "acc": 0.6712594, + "epoch": 1.0993150684931507, + "grad_norm": 6.5, + "learning_rate": 4.592746763548582e-06, + "loss": 1.55620079, + "memory(GiB)": 117.38, + "step": 43335, + "train_speed(iter/s)": 1.636651 + }, + { + "acc": 0.65974803, + "epoch": 1.0994419076610857, + "grad_norm": 6.15625, + "learning_rate": 4.591701633015336e-06, + "loss": 1.61999149, + "memory(GiB)": 117.38, + "step": 43340, + "train_speed(iter/s)": 1.636672 + }, + { + "acc": 0.66459017, + "epoch": 1.0995687468290207, + "grad_norm": 5.0, + "learning_rate": 4.590656520440876e-06, + "loss": 1.62464294, + "memory(GiB)": 117.38, + "step": 43345, + "train_speed(iter/s)": 1.636695 + }, + { + "acc": 0.67729406, + "epoch": 1.099695585996956, + "grad_norm": 5.5625, + "learning_rate": 4.58961142587117e-06, + "loss": 1.51495514, + "memory(GiB)": 117.38, + "step": 43350, + "train_speed(iter/s)": 1.636716 + }, + { + "acc": 0.64763308, + "epoch": 1.099822425164891, + "grad_norm": 5.65625, + "learning_rate": 4.588566349352185e-06, + "loss": 1.55075903, + "memory(GiB)": 117.38, + "step": 43355, + "train_speed(iter/s)": 1.636736 + }, + { + "acc": 0.65509777, + "epoch": 1.099949264332826, + "grad_norm": 7.375, + "learning_rate": 4.5875212909298885e-06, + "loss": 1.56889591, + "memory(GiB)": 117.38, + "step": 43360, + "train_speed(iter/s)": 1.636758 + }, + { + "acc": 0.66368484, + "epoch": 1.1000761035007611, + "grad_norm": 5.34375, + "learning_rate": 4.586476250650246e-06, + "loss": 1.55608835, + "memory(GiB)": 117.38, + "step": 43365, + "train_speed(iter/s)": 1.63678 + }, + { + "acc": 0.6542634, + "epoch": 1.1002029426686961, + "grad_norm": 5.03125, + "learning_rate": 4.585431228559228e-06, + "loss": 1.60081081, + "memory(GiB)": 117.38, + "step": 43370, + "train_speed(iter/s)": 1.6368 + }, + { + "acc": 0.65408893, + "epoch": 1.100329781836631, + "grad_norm": 5.40625, + "learning_rate": 4.584386224702792e-06, + "loss": 1.55857944, + "memory(GiB)": 117.38, + "step": 43375, + "train_speed(iter/s)": 1.636822 + }, + { + "acc": 0.6609746, + "epoch": 1.1004566210045663, + "grad_norm": 6.53125, + "learning_rate": 4.583341239126906e-06, + "loss": 1.60434227, + "memory(GiB)": 117.38, + "step": 43380, + "train_speed(iter/s)": 1.636845 + }, + { + "acc": 0.65415144, + "epoch": 1.1005834601725013, + "grad_norm": 6.375, + "learning_rate": 4.582296271877534e-06, + "loss": 1.63812332, + "memory(GiB)": 117.38, + "step": 43385, + "train_speed(iter/s)": 1.636867 + }, + { + "acc": 0.65546341, + "epoch": 1.1007102993404363, + "grad_norm": 6.15625, + "learning_rate": 4.581251323000636e-06, + "loss": 1.62109108, + "memory(GiB)": 117.38, + "step": 43390, + "train_speed(iter/s)": 1.636889 + }, + { + "acc": 0.6368413, + "epoch": 1.1008371385083713, + "grad_norm": 5.21875, + "learning_rate": 4.580206392542175e-06, + "loss": 1.65205193, + "memory(GiB)": 117.38, + "step": 43395, + "train_speed(iter/s)": 1.636908 + }, + { + "acc": 0.66631503, + "epoch": 1.1009639776763065, + "grad_norm": 5.46875, + "learning_rate": 4.579161480548109e-06, + "loss": 1.59916687, + "memory(GiB)": 117.38, + "step": 43400, + "train_speed(iter/s)": 1.63693 + }, + { + "acc": 0.65419798, + "epoch": 1.1010908168442415, + "grad_norm": 4.90625, + "learning_rate": 4.578116587064402e-06, + "loss": 1.63244247, + "memory(GiB)": 117.38, + "step": 43405, + "train_speed(iter/s)": 1.636951 + }, + { + "acc": 0.66090374, + "epoch": 1.1012176560121765, + "grad_norm": 9.3125, + "learning_rate": 4.577071712137012e-06, + "loss": 1.61473446, + "memory(GiB)": 117.38, + "step": 43410, + "train_speed(iter/s)": 1.636972 + }, + { + "acc": 0.65870848, + "epoch": 1.1013444951801117, + "grad_norm": 5.9375, + "learning_rate": 4.576026855811893e-06, + "loss": 1.61457329, + "memory(GiB)": 117.38, + "step": 43415, + "train_speed(iter/s)": 1.636994 + }, + { + "acc": 0.66705761, + "epoch": 1.1014713343480467, + "grad_norm": 9.875, + "learning_rate": 4.5749820181350095e-06, + "loss": 1.58334522, + "memory(GiB)": 117.38, + "step": 43420, + "train_speed(iter/s)": 1.637015 + }, + { + "acc": 0.64460411, + "epoch": 1.1015981735159817, + "grad_norm": 5.1875, + "learning_rate": 4.57393719915231e-06, + "loss": 1.61328239, + "memory(GiB)": 117.38, + "step": 43425, + "train_speed(iter/s)": 1.637036 + }, + { + "acc": 0.65223541, + "epoch": 1.1017250126839169, + "grad_norm": 5.46875, + "learning_rate": 4.5728923989097604e-06, + "loss": 1.57075558, + "memory(GiB)": 117.38, + "step": 43430, + "train_speed(iter/s)": 1.637057 + }, + { + "acc": 0.65929437, + "epoch": 1.1018518518518519, + "grad_norm": 5.5625, + "learning_rate": 4.571847617453306e-06, + "loss": 1.53379965, + "memory(GiB)": 117.38, + "step": 43435, + "train_speed(iter/s)": 1.637078 + }, + { + "acc": 0.65777717, + "epoch": 1.1019786910197868, + "grad_norm": 6.09375, + "learning_rate": 4.570802854828906e-06, + "loss": 1.62695885, + "memory(GiB)": 117.38, + "step": 43440, + "train_speed(iter/s)": 1.6371 + }, + { + "acc": 0.66201434, + "epoch": 1.102105530187722, + "grad_norm": 5.8125, + "learning_rate": 4.569758111082512e-06, + "loss": 1.55977707, + "memory(GiB)": 117.38, + "step": 43445, + "train_speed(iter/s)": 1.637121 + }, + { + "acc": 0.66806774, + "epoch": 1.102232369355657, + "grad_norm": 6.53125, + "learning_rate": 4.568713386260078e-06, + "loss": 1.51805706, + "memory(GiB)": 117.38, + "step": 43450, + "train_speed(iter/s)": 1.637143 + }, + { + "acc": 0.67697258, + "epoch": 1.102359208523592, + "grad_norm": 6.8125, + "learning_rate": 4.567668680407555e-06, + "loss": 1.55169106, + "memory(GiB)": 117.38, + "step": 43455, + "train_speed(iter/s)": 1.637165 + }, + { + "acc": 0.64767594, + "epoch": 1.102486047691527, + "grad_norm": 6.53125, + "learning_rate": 4.566623993570893e-06, + "loss": 1.64335442, + "memory(GiB)": 117.38, + "step": 43460, + "train_speed(iter/s)": 1.637187 + }, + { + "acc": 0.63292551, + "epoch": 1.1026128868594622, + "grad_norm": 5.71875, + "learning_rate": 4.565579325796043e-06, + "loss": 1.69831314, + "memory(GiB)": 117.38, + "step": 43465, + "train_speed(iter/s)": 1.637209 + }, + { + "acc": 0.66303062, + "epoch": 1.1027397260273972, + "grad_norm": 5.1875, + "learning_rate": 4.564534677128954e-06, + "loss": 1.5953743, + "memory(GiB)": 117.38, + "step": 43470, + "train_speed(iter/s)": 1.637232 + }, + { + "acc": 0.65839834, + "epoch": 1.1028665651953322, + "grad_norm": 4.84375, + "learning_rate": 4.563490047615574e-06, + "loss": 1.6028698, + "memory(GiB)": 117.38, + "step": 43475, + "train_speed(iter/s)": 1.637253 + }, + { + "acc": 0.65589886, + "epoch": 1.1029934043632674, + "grad_norm": 6.1875, + "learning_rate": 4.56244543730185e-06, + "loss": 1.56749649, + "memory(GiB)": 117.38, + "step": 43480, + "train_speed(iter/s)": 1.637274 + }, + { + "acc": 0.65689411, + "epoch": 1.1031202435312024, + "grad_norm": 5.1875, + "learning_rate": 4.561400846233729e-06, + "loss": 1.54747581, + "memory(GiB)": 117.38, + "step": 43485, + "train_speed(iter/s)": 1.637295 + }, + { + "acc": 0.65407953, + "epoch": 1.1032470826991374, + "grad_norm": 6.03125, + "learning_rate": 4.56035627445716e-06, + "loss": 1.55124359, + "memory(GiB)": 117.38, + "step": 43490, + "train_speed(iter/s)": 1.637316 + }, + { + "acc": 0.64014273, + "epoch": 1.1033739218670726, + "grad_norm": 6.375, + "learning_rate": 4.55931172201808e-06, + "loss": 1.70528984, + "memory(GiB)": 117.38, + "step": 43495, + "train_speed(iter/s)": 1.637337 + }, + { + "acc": 0.64971676, + "epoch": 1.1035007610350076, + "grad_norm": 5.78125, + "learning_rate": 4.558267188962441e-06, + "loss": 1.56202087, + "memory(GiB)": 117.38, + "step": 43500, + "train_speed(iter/s)": 1.637358 + }, + { + "acc": 0.65679598, + "epoch": 1.1036276002029426, + "grad_norm": 4.875, + "learning_rate": 4.557222675336182e-06, + "loss": 1.63533001, + "memory(GiB)": 117.38, + "step": 43505, + "train_speed(iter/s)": 1.637379 + }, + { + "acc": 0.65194511, + "epoch": 1.1037544393708778, + "grad_norm": 6.15625, + "learning_rate": 4.556178181185249e-06, + "loss": 1.57605553, + "memory(GiB)": 117.38, + "step": 43510, + "train_speed(iter/s)": 1.637399 + }, + { + "acc": 0.64925585, + "epoch": 1.1038812785388128, + "grad_norm": 5.53125, + "learning_rate": 4.555133706555579e-06, + "loss": 1.67549248, + "memory(GiB)": 117.38, + "step": 43515, + "train_speed(iter/s)": 1.637421 + }, + { + "acc": 0.64526606, + "epoch": 1.1040081177067478, + "grad_norm": 6.0, + "learning_rate": 4.554089251493115e-06, + "loss": 1.64196129, + "memory(GiB)": 117.38, + "step": 43520, + "train_speed(iter/s)": 1.637443 + }, + { + "acc": 0.66295462, + "epoch": 1.104134956874683, + "grad_norm": 6.25, + "learning_rate": 4.553044816043796e-06, + "loss": 1.55494499, + "memory(GiB)": 117.38, + "step": 43525, + "train_speed(iter/s)": 1.637464 + }, + { + "acc": 0.65765047, + "epoch": 1.104261796042618, + "grad_norm": 6.75, + "learning_rate": 4.552000400253563e-06, + "loss": 1.59547186, + "memory(GiB)": 117.38, + "step": 43530, + "train_speed(iter/s)": 1.637486 + }, + { + "acc": 0.66380029, + "epoch": 1.104388635210553, + "grad_norm": 6.15625, + "learning_rate": 4.550956004168352e-06, + "loss": 1.61445999, + "memory(GiB)": 117.38, + "step": 43535, + "train_speed(iter/s)": 1.637506 + }, + { + "acc": 0.65298853, + "epoch": 1.1045154743784882, + "grad_norm": 4.78125, + "learning_rate": 4.5499116278341e-06, + "loss": 1.64223251, + "memory(GiB)": 117.38, + "step": 43540, + "train_speed(iter/s)": 1.637525 + }, + { + "acc": 0.65271883, + "epoch": 1.1046423135464232, + "grad_norm": 8.125, + "learning_rate": 4.548867271296745e-06, + "loss": 1.63382072, + "memory(GiB)": 117.38, + "step": 43545, + "train_speed(iter/s)": 1.637547 + }, + { + "acc": 0.67285204, + "epoch": 1.1047691527143582, + "grad_norm": 5.71875, + "learning_rate": 4.547822934602222e-06, + "loss": 1.54750843, + "memory(GiB)": 117.38, + "step": 43550, + "train_speed(iter/s)": 1.637567 + }, + { + "acc": 0.6586154, + "epoch": 1.1048959918822931, + "grad_norm": 6.4375, + "learning_rate": 4.5467786177964635e-06, + "loss": 1.59689074, + "memory(GiB)": 117.38, + "step": 43555, + "train_speed(iter/s)": 1.637587 + }, + { + "acc": 0.65430512, + "epoch": 1.1050228310502284, + "grad_norm": 5.53125, + "learning_rate": 4.545734320925406e-06, + "loss": 1.61495438, + "memory(GiB)": 117.38, + "step": 43560, + "train_speed(iter/s)": 1.637608 + }, + { + "acc": 0.66548595, + "epoch": 1.1051496702181633, + "grad_norm": 6.4375, + "learning_rate": 4.544690044034981e-06, + "loss": 1.60419312, + "memory(GiB)": 117.38, + "step": 43565, + "train_speed(iter/s)": 1.637629 + }, + { + "acc": 0.65906448, + "epoch": 1.1052765093860983, + "grad_norm": 5.3125, + "learning_rate": 4.543645787171122e-06, + "loss": 1.59676704, + "memory(GiB)": 117.38, + "step": 43570, + "train_speed(iter/s)": 1.637649 + }, + { + "acc": 0.64783249, + "epoch": 1.1054033485540335, + "grad_norm": 5.40625, + "learning_rate": 4.5426015503797565e-06, + "loss": 1.64254856, + "memory(GiB)": 117.38, + "step": 43575, + "train_speed(iter/s)": 1.637669 + }, + { + "acc": 0.65406318, + "epoch": 1.1055301877219685, + "grad_norm": 5.96875, + "learning_rate": 4.5415573337068185e-06, + "loss": 1.62269363, + "memory(GiB)": 117.38, + "step": 43580, + "train_speed(iter/s)": 1.63769 + }, + { + "acc": 0.65632648, + "epoch": 1.1056570268899035, + "grad_norm": 6.34375, + "learning_rate": 4.540513137198233e-06, + "loss": 1.63387146, + "memory(GiB)": 117.38, + "step": 43585, + "train_speed(iter/s)": 1.63771 + }, + { + "acc": 0.67137933, + "epoch": 1.1057838660578387, + "grad_norm": 5.75, + "learning_rate": 4.539468960899936e-06, + "loss": 1.53223467, + "memory(GiB)": 117.38, + "step": 43590, + "train_speed(iter/s)": 1.637731 + }, + { + "acc": 0.64252062, + "epoch": 1.1059107052257737, + "grad_norm": 7.1875, + "learning_rate": 4.538424804857847e-06, + "loss": 1.6505209, + "memory(GiB)": 117.38, + "step": 43595, + "train_speed(iter/s)": 1.63775 + }, + { + "acc": 0.65709219, + "epoch": 1.1060375443937087, + "grad_norm": 5.375, + "learning_rate": 4.537380669117896e-06, + "loss": 1.62953682, + "memory(GiB)": 117.38, + "step": 43600, + "train_speed(iter/s)": 1.637771 + }, + { + "acc": 0.65931711, + "epoch": 1.106164383561644, + "grad_norm": 7.5625, + "learning_rate": 4.536336553726008e-06, + "loss": 1.56825047, + "memory(GiB)": 117.38, + "step": 43605, + "train_speed(iter/s)": 1.637791 + }, + { + "acc": 0.66197777, + "epoch": 1.106291222729579, + "grad_norm": 5.03125, + "learning_rate": 4.535292458728112e-06, + "loss": 1.59539576, + "memory(GiB)": 117.38, + "step": 43610, + "train_speed(iter/s)": 1.637812 + }, + { + "acc": 0.64459934, + "epoch": 1.106418061897514, + "grad_norm": 6.21875, + "learning_rate": 4.534248384170126e-06, + "loss": 1.61581249, + "memory(GiB)": 117.38, + "step": 43615, + "train_speed(iter/s)": 1.637832 + }, + { + "acc": 0.65564461, + "epoch": 1.106544901065449, + "grad_norm": 6.0, + "learning_rate": 4.533204330097974e-06, + "loss": 1.6060112, + "memory(GiB)": 117.38, + "step": 43620, + "train_speed(iter/s)": 1.637852 + }, + { + "acc": 0.66526031, + "epoch": 1.106671740233384, + "grad_norm": 4.59375, + "learning_rate": 4.532160296557581e-06, + "loss": 1.60432014, + "memory(GiB)": 117.38, + "step": 43625, + "train_speed(iter/s)": 1.637873 + }, + { + "acc": 0.67201915, + "epoch": 1.106798579401319, + "grad_norm": 8.125, + "learning_rate": 4.531116283594868e-06, + "loss": 1.53097973, + "memory(GiB)": 117.38, + "step": 43630, + "train_speed(iter/s)": 1.637895 + }, + { + "acc": 0.67831602, + "epoch": 1.106925418569254, + "grad_norm": 6.21875, + "learning_rate": 4.530072291255753e-06, + "loss": 1.55634899, + "memory(GiB)": 117.38, + "step": 43635, + "train_speed(iter/s)": 1.637915 + }, + { + "acc": 0.6624465, + "epoch": 1.1070522577371893, + "grad_norm": 5.5625, + "learning_rate": 4.529028319586157e-06, + "loss": 1.59423561, + "memory(GiB)": 117.38, + "step": 43640, + "train_speed(iter/s)": 1.637936 + }, + { + "acc": 0.64880672, + "epoch": 1.1071790969051243, + "grad_norm": 6.34375, + "learning_rate": 4.527984368631997e-06, + "loss": 1.69873047, + "memory(GiB)": 117.38, + "step": 43645, + "train_speed(iter/s)": 1.637956 + }, + { + "acc": 0.65363159, + "epoch": 1.1073059360730593, + "grad_norm": 5.5625, + "learning_rate": 4.526940438439196e-06, + "loss": 1.5803566, + "memory(GiB)": 117.38, + "step": 43650, + "train_speed(iter/s)": 1.637976 + }, + { + "acc": 0.66382847, + "epoch": 1.1074327752409945, + "grad_norm": 7.03125, + "learning_rate": 4.525896529053662e-06, + "loss": 1.61647263, + "memory(GiB)": 117.38, + "step": 43655, + "train_speed(iter/s)": 1.637996 + }, + { + "acc": 0.65083418, + "epoch": 1.1075596144089295, + "grad_norm": 5.28125, + "learning_rate": 4.524852640521318e-06, + "loss": 1.57888641, + "memory(GiB)": 117.38, + "step": 43660, + "train_speed(iter/s)": 1.638017 + }, + { + "acc": 0.65918074, + "epoch": 1.1076864535768645, + "grad_norm": 5.40625, + "learning_rate": 4.523808772888073e-06, + "loss": 1.53898945, + "memory(GiB)": 117.38, + "step": 43665, + "train_speed(iter/s)": 1.638038 + }, + { + "acc": 0.65505533, + "epoch": 1.1078132927447997, + "grad_norm": 4.90625, + "learning_rate": 4.522764926199848e-06, + "loss": 1.61178856, + "memory(GiB)": 117.38, + "step": 43670, + "train_speed(iter/s)": 1.638058 + }, + { + "acc": 0.64763803, + "epoch": 1.1079401319127347, + "grad_norm": 6.46875, + "learning_rate": 4.5217211005025516e-06, + "loss": 1.59003477, + "memory(GiB)": 117.38, + "step": 43675, + "train_speed(iter/s)": 1.638079 + }, + { + "acc": 0.65308132, + "epoch": 1.1080669710806696, + "grad_norm": 7.125, + "learning_rate": 4.520677295842095e-06, + "loss": 1.55530643, + "memory(GiB)": 117.38, + "step": 43680, + "train_speed(iter/s)": 1.6381 + }, + { + "acc": 0.6806448, + "epoch": 1.1081938102486049, + "grad_norm": 6.5625, + "learning_rate": 4.5196335122643915e-06, + "loss": 1.46125841, + "memory(GiB)": 117.38, + "step": 43685, + "train_speed(iter/s)": 1.63812 + }, + { + "acc": 0.65832281, + "epoch": 1.1083206494165398, + "grad_norm": 5.9375, + "learning_rate": 4.518589749815352e-06, + "loss": 1.54655991, + "memory(GiB)": 117.38, + "step": 43690, + "train_speed(iter/s)": 1.638142 + }, + { + "acc": 0.65218477, + "epoch": 1.1084474885844748, + "grad_norm": 5.8125, + "learning_rate": 4.517546008540884e-06, + "loss": 1.59491978, + "memory(GiB)": 117.38, + "step": 43695, + "train_speed(iter/s)": 1.638163 + }, + { + "acc": 0.65494366, + "epoch": 1.10857432775241, + "grad_norm": 5.375, + "learning_rate": 4.5165022884868946e-06, + "loss": 1.5815609, + "memory(GiB)": 117.38, + "step": 43700, + "train_speed(iter/s)": 1.638184 + }, + { + "acc": 0.63820348, + "epoch": 1.108701166920345, + "grad_norm": 5.90625, + "learning_rate": 4.515458589699295e-06, + "loss": 1.6875267, + "memory(GiB)": 117.38, + "step": 43705, + "train_speed(iter/s)": 1.638205 + }, + { + "acc": 0.66521559, + "epoch": 1.10882800608828, + "grad_norm": 6.75, + "learning_rate": 4.514414912223991e-06, + "loss": 1.58573027, + "memory(GiB)": 117.38, + "step": 43710, + "train_speed(iter/s)": 1.638226 + }, + { + "acc": 0.66810308, + "epoch": 1.108954845256215, + "grad_norm": 6.0, + "learning_rate": 4.513371256106885e-06, + "loss": 1.52283573, + "memory(GiB)": 117.38, + "step": 43715, + "train_speed(iter/s)": 1.638249 + }, + { + "acc": 0.64753456, + "epoch": 1.1090816844241502, + "grad_norm": 6.125, + "learning_rate": 4.512327621393885e-06, + "loss": 1.61174202, + "memory(GiB)": 117.38, + "step": 43720, + "train_speed(iter/s)": 1.638269 + }, + { + "acc": 0.6785759, + "epoch": 1.1092085235920852, + "grad_norm": 5.96875, + "learning_rate": 4.511284008130892e-06, + "loss": 1.5328721, + "memory(GiB)": 117.38, + "step": 43725, + "train_speed(iter/s)": 1.638291 + }, + { + "acc": 0.65266247, + "epoch": 1.1093353627600202, + "grad_norm": 5.375, + "learning_rate": 4.510240416363813e-06, + "loss": 1.59236279, + "memory(GiB)": 117.38, + "step": 43730, + "train_speed(iter/s)": 1.638313 + }, + { + "acc": 0.64590249, + "epoch": 1.1094622019279554, + "grad_norm": 5.15625, + "learning_rate": 4.5091968461385455e-06, + "loss": 1.66553326, + "memory(GiB)": 117.38, + "step": 43735, + "train_speed(iter/s)": 1.638334 + }, + { + "acc": 0.65689449, + "epoch": 1.1095890410958904, + "grad_norm": 5.375, + "learning_rate": 4.508153297500993e-06, + "loss": 1.58050213, + "memory(GiB)": 117.38, + "step": 43740, + "train_speed(iter/s)": 1.638356 + }, + { + "acc": 0.65717506, + "epoch": 1.1097158802638254, + "grad_norm": 5.5625, + "learning_rate": 4.507109770497052e-06, + "loss": 1.57905378, + "memory(GiB)": 117.38, + "step": 43745, + "train_speed(iter/s)": 1.638377 + }, + { + "acc": 0.66607375, + "epoch": 1.1098427194317606, + "grad_norm": 5.46875, + "learning_rate": 4.506066265172626e-06, + "loss": 1.54343758, + "memory(GiB)": 117.38, + "step": 43750, + "train_speed(iter/s)": 1.6384 + }, + { + "acc": 0.62807961, + "epoch": 1.1099695585996956, + "grad_norm": 5.78125, + "learning_rate": 4.505022781573611e-06, + "loss": 1.67012405, + "memory(GiB)": 117.38, + "step": 43755, + "train_speed(iter/s)": 1.638421 + }, + { + "acc": 0.66743011, + "epoch": 1.1100963977676306, + "grad_norm": 5.5, + "learning_rate": 4.503979319745902e-06, + "loss": 1.61075878, + "memory(GiB)": 117.38, + "step": 43760, + "train_speed(iter/s)": 1.638442 + }, + { + "acc": 0.65517569, + "epoch": 1.1102232369355658, + "grad_norm": 5.4375, + "learning_rate": 4.502935879735398e-06, + "loss": 1.5934227, + "memory(GiB)": 117.38, + "step": 43765, + "train_speed(iter/s)": 1.638463 + }, + { + "acc": 0.65846057, + "epoch": 1.1103500761035008, + "grad_norm": 5.15625, + "learning_rate": 4.5018924615879956e-06, + "loss": 1.60735435, + "memory(GiB)": 117.38, + "step": 43770, + "train_speed(iter/s)": 1.638484 + }, + { + "acc": 0.66134844, + "epoch": 1.1104769152714358, + "grad_norm": 5.15625, + "learning_rate": 4.500849065349584e-06, + "loss": 1.57719936, + "memory(GiB)": 117.38, + "step": 43775, + "train_speed(iter/s)": 1.638504 + }, + { + "acc": 0.64201336, + "epoch": 1.1106037544393708, + "grad_norm": 6.53125, + "learning_rate": 4.499805691066059e-06, + "loss": 1.5882637, + "memory(GiB)": 117.38, + "step": 43780, + "train_speed(iter/s)": 1.638525 + }, + { + "acc": 0.65040393, + "epoch": 1.110730593607306, + "grad_norm": 7.4375, + "learning_rate": 4.498762338783314e-06, + "loss": 1.61698227, + "memory(GiB)": 117.38, + "step": 43785, + "train_speed(iter/s)": 1.638546 + }, + { + "acc": 0.67040863, + "epoch": 1.110857432775241, + "grad_norm": 6.15625, + "learning_rate": 4.49771900854724e-06, + "loss": 1.55735893, + "memory(GiB)": 117.38, + "step": 43790, + "train_speed(iter/s)": 1.638567 + }, + { + "acc": 0.64874072, + "epoch": 1.110984271943176, + "grad_norm": 7.40625, + "learning_rate": 4.496675700403724e-06, + "loss": 1.68709431, + "memory(GiB)": 117.38, + "step": 43795, + "train_speed(iter/s)": 1.638588 + }, + { + "acc": 0.65509071, + "epoch": 1.1111111111111112, + "grad_norm": 5.78125, + "learning_rate": 4.495632414398659e-06, + "loss": 1.66659393, + "memory(GiB)": 117.38, + "step": 43800, + "train_speed(iter/s)": 1.638611 + }, + { + "acc": 0.64641938, + "epoch": 1.1112379502790461, + "grad_norm": 5.59375, + "learning_rate": 4.494589150577932e-06, + "loss": 1.70870705, + "memory(GiB)": 117.38, + "step": 43805, + "train_speed(iter/s)": 1.638632 + }, + { + "acc": 0.66988783, + "epoch": 1.1113647894469811, + "grad_norm": 6.125, + "learning_rate": 4.493545908987432e-06, + "loss": 1.6110878, + "memory(GiB)": 117.38, + "step": 43810, + "train_speed(iter/s)": 1.638654 + }, + { + "acc": 0.63878813, + "epoch": 1.1114916286149163, + "grad_norm": 5.03125, + "learning_rate": 4.492502689673044e-06, + "loss": 1.69132462, + "memory(GiB)": 117.38, + "step": 43815, + "train_speed(iter/s)": 1.638675 + }, + { + "acc": 0.67498226, + "epoch": 1.1116184677828513, + "grad_norm": 6.03125, + "learning_rate": 4.491459492680651e-06, + "loss": 1.57180214, + "memory(GiB)": 117.38, + "step": 43820, + "train_speed(iter/s)": 1.638696 + }, + { + "acc": 0.66567869, + "epoch": 1.1117453069507863, + "grad_norm": 5.90625, + "learning_rate": 4.4904163180561425e-06, + "loss": 1.52184563, + "memory(GiB)": 117.38, + "step": 43825, + "train_speed(iter/s)": 1.638718 + }, + { + "acc": 0.6372015, + "epoch": 1.1118721461187215, + "grad_norm": 5.375, + "learning_rate": 4.4893731658453996e-06, + "loss": 1.60513268, + "memory(GiB)": 117.38, + "step": 43830, + "train_speed(iter/s)": 1.63874 + }, + { + "acc": 0.66937075, + "epoch": 1.1119989852866565, + "grad_norm": 5.375, + "learning_rate": 4.4883300360943035e-06, + "loss": 1.5709199, + "memory(GiB)": 117.38, + "step": 43835, + "train_speed(iter/s)": 1.638762 + }, + { + "acc": 0.64969788, + "epoch": 1.1121258244545915, + "grad_norm": 5.84375, + "learning_rate": 4.4872869288487366e-06, + "loss": 1.60509033, + "memory(GiB)": 117.38, + "step": 43840, + "train_speed(iter/s)": 1.638782 + }, + { + "acc": 0.6528121, + "epoch": 1.1122526636225267, + "grad_norm": 5.0, + "learning_rate": 4.48624384415458e-06, + "loss": 1.53557501, + "memory(GiB)": 117.38, + "step": 43845, + "train_speed(iter/s)": 1.638803 + }, + { + "acc": 0.65376334, + "epoch": 1.1123795027904617, + "grad_norm": 5.96875, + "learning_rate": 4.485200782057715e-06, + "loss": 1.64941502, + "memory(GiB)": 117.38, + "step": 43850, + "train_speed(iter/s)": 1.638474 + }, + { + "acc": 0.64608474, + "epoch": 1.1125063419583967, + "grad_norm": 6.125, + "learning_rate": 4.4841577426040145e-06, + "loss": 1.62714024, + "memory(GiB)": 117.38, + "step": 43855, + "train_speed(iter/s)": 1.638494 + }, + { + "acc": 0.66031342, + "epoch": 1.112633181126332, + "grad_norm": 5.375, + "learning_rate": 4.483114725839361e-06, + "loss": 1.5913106, + "memory(GiB)": 117.38, + "step": 43860, + "train_speed(iter/s)": 1.638516 + }, + { + "acc": 0.6516191, + "epoch": 1.112760020294267, + "grad_norm": 6.3125, + "learning_rate": 4.482071731809629e-06, + "loss": 1.65043297, + "memory(GiB)": 117.38, + "step": 43865, + "train_speed(iter/s)": 1.638537 + }, + { + "acc": 0.64568548, + "epoch": 1.112886859462202, + "grad_norm": 6.28125, + "learning_rate": 4.481028760560697e-06, + "loss": 1.66282692, + "memory(GiB)": 117.38, + "step": 43870, + "train_speed(iter/s)": 1.638557 + }, + { + "acc": 0.66215296, + "epoch": 1.1130136986301369, + "grad_norm": 5.90625, + "learning_rate": 4.479985812138435e-06, + "loss": 1.60832958, + "memory(GiB)": 117.38, + "step": 43875, + "train_speed(iter/s)": 1.638579 + }, + { + "acc": 0.65164309, + "epoch": 1.113140537798072, + "grad_norm": 6.0625, + "learning_rate": 4.478942886588719e-06, + "loss": 1.58625851, + "memory(GiB)": 117.38, + "step": 43880, + "train_speed(iter/s)": 1.6386 + }, + { + "acc": 0.65018668, + "epoch": 1.113267376966007, + "grad_norm": 8.8125, + "learning_rate": 4.47789998395742e-06, + "loss": 1.58408499, + "memory(GiB)": 117.38, + "step": 43885, + "train_speed(iter/s)": 1.638621 + }, + { + "acc": 0.66350298, + "epoch": 1.113394216133942, + "grad_norm": 7.3125, + "learning_rate": 4.476857104290413e-06, + "loss": 1.63399525, + "memory(GiB)": 117.38, + "step": 43890, + "train_speed(iter/s)": 1.638642 + }, + { + "acc": 0.67013855, + "epoch": 1.1135210553018773, + "grad_norm": 6.5, + "learning_rate": 4.4758142476335655e-06, + "loss": 1.59673882, + "memory(GiB)": 117.38, + "step": 43895, + "train_speed(iter/s)": 1.638662 + }, + { + "acc": 0.66996784, + "epoch": 1.1136478944698123, + "grad_norm": 4.6875, + "learning_rate": 4.474771414032747e-06, + "loss": 1.64837151, + "memory(GiB)": 117.38, + "step": 43900, + "train_speed(iter/s)": 1.638683 + }, + { + "acc": 0.65942335, + "epoch": 1.1137747336377473, + "grad_norm": 6.3125, + "learning_rate": 4.473728603533827e-06, + "loss": 1.60491714, + "memory(GiB)": 117.38, + "step": 43905, + "train_speed(iter/s)": 1.638364 + }, + { + "acc": 0.64758348, + "epoch": 1.1139015728056825, + "grad_norm": 6.75, + "learning_rate": 4.472685816182674e-06, + "loss": 1.62763023, + "memory(GiB)": 117.38, + "step": 43910, + "train_speed(iter/s)": 1.638385 + }, + { + "acc": 0.65678248, + "epoch": 1.1140284119736175, + "grad_norm": 7.21875, + "learning_rate": 4.471643052025152e-06, + "loss": 1.64759941, + "memory(GiB)": 117.38, + "step": 43915, + "train_speed(iter/s)": 1.638407 + }, + { + "acc": 0.64520226, + "epoch": 1.1141552511415524, + "grad_norm": 5.78125, + "learning_rate": 4.470600311107127e-06, + "loss": 1.67286606, + "memory(GiB)": 117.38, + "step": 43920, + "train_speed(iter/s)": 1.638428 + }, + { + "acc": 0.6421525, + "epoch": 1.1142820903094877, + "grad_norm": 5.6875, + "learning_rate": 4.469557593474464e-06, + "loss": 1.57976723, + "memory(GiB)": 117.38, + "step": 43925, + "train_speed(iter/s)": 1.63845 + }, + { + "acc": 0.64846869, + "epoch": 1.1144089294774226, + "grad_norm": 6.0, + "learning_rate": 4.468514899173027e-06, + "loss": 1.6131958, + "memory(GiB)": 117.38, + "step": 43930, + "train_speed(iter/s)": 1.638471 + }, + { + "acc": 0.66371565, + "epoch": 1.1145357686453576, + "grad_norm": 5.875, + "learning_rate": 4.4674722282486775e-06, + "loss": 1.56924372, + "memory(GiB)": 117.38, + "step": 43935, + "train_speed(iter/s)": 1.638491 + }, + { + "acc": 0.66313229, + "epoch": 1.1146626078132926, + "grad_norm": 5.625, + "learning_rate": 4.4664295807472765e-06, + "loss": 1.56594429, + "memory(GiB)": 117.38, + "step": 43940, + "train_speed(iter/s)": 1.638512 + }, + { + "acc": 0.65459232, + "epoch": 1.1147894469812278, + "grad_norm": 7.0, + "learning_rate": 4.465386956714684e-06, + "loss": 1.60682793, + "memory(GiB)": 117.38, + "step": 43945, + "train_speed(iter/s)": 1.638532 + }, + { + "acc": 0.65105801, + "epoch": 1.1149162861491628, + "grad_norm": 5.9375, + "learning_rate": 4.4643443561967625e-06, + "loss": 1.59797335, + "memory(GiB)": 117.38, + "step": 43950, + "train_speed(iter/s)": 1.638553 + }, + { + "acc": 0.66231003, + "epoch": 1.1150431253170978, + "grad_norm": 6.40625, + "learning_rate": 4.463301779239366e-06, + "loss": 1.50973549, + "memory(GiB)": 117.38, + "step": 43955, + "train_speed(iter/s)": 1.638572 + }, + { + "acc": 0.67661629, + "epoch": 1.115169964485033, + "grad_norm": 5.40625, + "learning_rate": 4.462259225888354e-06, + "loss": 1.53829308, + "memory(GiB)": 117.38, + "step": 43960, + "train_speed(iter/s)": 1.638594 + }, + { + "acc": 0.64776192, + "epoch": 1.115296803652968, + "grad_norm": 5.6875, + "learning_rate": 4.4612166961895805e-06, + "loss": 1.6595377, + "memory(GiB)": 117.38, + "step": 43965, + "train_speed(iter/s)": 1.638615 + }, + { + "acc": 0.65194893, + "epoch": 1.115423642820903, + "grad_norm": 5.0625, + "learning_rate": 4.460174190188905e-06, + "loss": 1.59635143, + "memory(GiB)": 117.38, + "step": 43970, + "train_speed(iter/s)": 1.638635 + }, + { + "acc": 0.64870939, + "epoch": 1.1155504819888382, + "grad_norm": 6.0, + "learning_rate": 4.459131707932177e-06, + "loss": 1.6761734, + "memory(GiB)": 117.38, + "step": 43975, + "train_speed(iter/s)": 1.638656 + }, + { + "acc": 0.64989128, + "epoch": 1.1156773211567732, + "grad_norm": 5.59375, + "learning_rate": 4.458089249465251e-06, + "loss": 1.64217873, + "memory(GiB)": 117.38, + "step": 43980, + "train_speed(iter/s)": 1.638676 + }, + { + "acc": 0.64520297, + "epoch": 1.1158041603247082, + "grad_norm": 4.75, + "learning_rate": 4.45704681483398e-06, + "loss": 1.64760551, + "memory(GiB)": 117.38, + "step": 43985, + "train_speed(iter/s)": 1.638696 + }, + { + "acc": 0.6482317, + "epoch": 1.1159309994926434, + "grad_norm": 5.375, + "learning_rate": 4.456004404084215e-06, + "loss": 1.62359486, + "memory(GiB)": 117.38, + "step": 43990, + "train_speed(iter/s)": 1.638718 + }, + { + "acc": 0.64194298, + "epoch": 1.1160578386605784, + "grad_norm": 6.96875, + "learning_rate": 4.454962017261803e-06, + "loss": 1.67395, + "memory(GiB)": 117.38, + "step": 43995, + "train_speed(iter/s)": 1.638739 + }, + { + "acc": 0.64670744, + "epoch": 1.1161846778285134, + "grad_norm": 6.5, + "learning_rate": 4.453919654412596e-06, + "loss": 1.60647163, + "memory(GiB)": 117.38, + "step": 44000, + "train_speed(iter/s)": 1.638761 + }, + { + "epoch": 1.1161846778285134, + "eval_acc": 0.646246791839735, + "eval_loss": 1.573931336402893, + "eval_runtime": 58.639, + "eval_samples_per_second": 108.631, + "eval_steps_per_second": 27.166, + "step": 44000 + }, + { + "acc": 0.66043444, + "epoch": 1.1163115169964486, + "grad_norm": 5.53125, + "learning_rate": 4.45287731558244e-06, + "loss": 1.55361366, + "memory(GiB)": 117.38, + "step": 44005, + "train_speed(iter/s)": 1.634948 + }, + { + "acc": 0.6583395, + "epoch": 1.1164383561643836, + "grad_norm": 4.6875, + "learning_rate": 4.451835000817185e-06, + "loss": 1.56284523, + "memory(GiB)": 117.38, + "step": 44010, + "train_speed(iter/s)": 1.634968 + }, + { + "acc": 0.66295152, + "epoch": 1.1165651953323186, + "grad_norm": 6.5625, + "learning_rate": 4.450792710162672e-06, + "loss": 1.62032051, + "memory(GiB)": 117.38, + "step": 44015, + "train_speed(iter/s)": 1.634986 + }, + { + "acc": 0.65016289, + "epoch": 1.1166920345002538, + "grad_norm": 5.875, + "learning_rate": 4.449750443664747e-06, + "loss": 1.58196907, + "memory(GiB)": 117.38, + "step": 44020, + "train_speed(iter/s)": 1.635006 + }, + { + "acc": 0.66230268, + "epoch": 1.1168188736681888, + "grad_norm": 4.9375, + "learning_rate": 4.448708201369254e-06, + "loss": 1.57816582, + "memory(GiB)": 117.38, + "step": 44025, + "train_speed(iter/s)": 1.635026 + }, + { + "acc": 0.65436254, + "epoch": 1.1169457128361238, + "grad_norm": 5.65625, + "learning_rate": 4.4476659833220374e-06, + "loss": 1.59933891, + "memory(GiB)": 117.38, + "step": 44030, + "train_speed(iter/s)": 1.635045 + }, + { + "acc": 0.66842809, + "epoch": 1.1170725520040587, + "grad_norm": 5.8125, + "learning_rate": 4.4466237895689365e-06, + "loss": 1.56417561, + "memory(GiB)": 117.38, + "step": 44035, + "train_speed(iter/s)": 1.635063 + }, + { + "acc": 0.65718164, + "epoch": 1.117199391171994, + "grad_norm": 6.75, + "learning_rate": 4.44558162015579e-06, + "loss": 1.59644241, + "memory(GiB)": 117.38, + "step": 44040, + "train_speed(iter/s)": 1.635076 + }, + { + "acc": 0.65321913, + "epoch": 1.117326230339929, + "grad_norm": 6.34375, + "learning_rate": 4.444539475128441e-06, + "loss": 1.63150883, + "memory(GiB)": 117.38, + "step": 44045, + "train_speed(iter/s)": 1.635094 + }, + { + "acc": 0.65699592, + "epoch": 1.117453069507864, + "grad_norm": 5.4375, + "learning_rate": 4.443497354532726e-06, + "loss": 1.62423592, + "memory(GiB)": 117.38, + "step": 44050, + "train_speed(iter/s)": 1.635114 + }, + { + "acc": 0.65642576, + "epoch": 1.1175799086757991, + "grad_norm": 6.0, + "learning_rate": 4.442455258414482e-06, + "loss": 1.53582563, + "memory(GiB)": 117.38, + "step": 44055, + "train_speed(iter/s)": 1.635135 + }, + { + "acc": 0.64688973, + "epoch": 1.1177067478437341, + "grad_norm": 5.75, + "learning_rate": 4.441413186819543e-06, + "loss": 1.59591398, + "memory(GiB)": 117.38, + "step": 44060, + "train_speed(iter/s)": 1.635153 + }, + { + "acc": 0.65907316, + "epoch": 1.1178335870116691, + "grad_norm": 6.78125, + "learning_rate": 4.440371139793747e-06, + "loss": 1.54488697, + "memory(GiB)": 117.38, + "step": 44065, + "train_speed(iter/s)": 1.635173 + }, + { + "acc": 0.65992684, + "epoch": 1.1179604261796043, + "grad_norm": 5.375, + "learning_rate": 4.43932911738293e-06, + "loss": 1.59141226, + "memory(GiB)": 117.38, + "step": 44070, + "train_speed(iter/s)": 1.635192 + }, + { + "acc": 0.64759164, + "epoch": 1.1180872653475393, + "grad_norm": 7.09375, + "learning_rate": 4.438287119632917e-06, + "loss": 1.61874142, + "memory(GiB)": 117.38, + "step": 44075, + "train_speed(iter/s)": 1.635213 + }, + { + "acc": 0.65413737, + "epoch": 1.1182141045154743, + "grad_norm": 9.5, + "learning_rate": 4.4372451465895465e-06, + "loss": 1.59834528, + "memory(GiB)": 117.38, + "step": 44080, + "train_speed(iter/s)": 1.635233 + }, + { + "acc": 0.64904704, + "epoch": 1.1183409436834095, + "grad_norm": 5.625, + "learning_rate": 4.436203198298645e-06, + "loss": 1.63180313, + "memory(GiB)": 117.38, + "step": 44085, + "train_speed(iter/s)": 1.635254 + }, + { + "acc": 0.64984627, + "epoch": 1.1184677828513445, + "grad_norm": 4.96875, + "learning_rate": 4.435161274806049e-06, + "loss": 1.58911619, + "memory(GiB)": 117.38, + "step": 44090, + "train_speed(iter/s)": 1.635274 + }, + { + "acc": 0.64327478, + "epoch": 1.1185946220192795, + "grad_norm": 5.25, + "learning_rate": 4.4341193761575765e-06, + "loss": 1.64164276, + "memory(GiB)": 117.38, + "step": 44095, + "train_speed(iter/s)": 1.635294 + }, + { + "acc": 0.67385597, + "epoch": 1.1187214611872145, + "grad_norm": 6.375, + "learning_rate": 4.433077502399063e-06, + "loss": 1.61192093, + "memory(GiB)": 117.38, + "step": 44100, + "train_speed(iter/s)": 1.635314 + }, + { + "acc": 0.6531714, + "epoch": 1.1188483003551497, + "grad_norm": 5.75, + "learning_rate": 4.43203565357633e-06, + "loss": 1.54096785, + "memory(GiB)": 117.38, + "step": 44105, + "train_speed(iter/s)": 1.635334 + }, + { + "acc": 0.65698619, + "epoch": 1.1189751395230847, + "grad_norm": 6.46875, + "learning_rate": 4.430993829735208e-06, + "loss": 1.57295332, + "memory(GiB)": 117.38, + "step": 44110, + "train_speed(iter/s)": 1.635354 + }, + { + "acc": 0.66614261, + "epoch": 1.1191019786910197, + "grad_norm": 5.21875, + "learning_rate": 4.429952030921516e-06, + "loss": 1.51668005, + "memory(GiB)": 117.38, + "step": 44115, + "train_speed(iter/s)": 1.635375 + }, + { + "acc": 0.67182083, + "epoch": 1.119228817858955, + "grad_norm": 7.125, + "learning_rate": 4.428910257181077e-06, + "loss": 1.61970291, + "memory(GiB)": 117.38, + "step": 44120, + "train_speed(iter/s)": 1.635396 + }, + { + "acc": 0.66382823, + "epoch": 1.1193556570268899, + "grad_norm": 7.25, + "learning_rate": 4.427868508559717e-06, + "loss": 1.55633621, + "memory(GiB)": 117.38, + "step": 44125, + "train_speed(iter/s)": 1.635416 + }, + { + "acc": 0.67406707, + "epoch": 1.1194824961948249, + "grad_norm": 6.34375, + "learning_rate": 4.426826785103256e-06, + "loss": 1.51318779, + "memory(GiB)": 117.38, + "step": 44130, + "train_speed(iter/s)": 1.635435 + }, + { + "acc": 0.65523081, + "epoch": 1.11960933536276, + "grad_norm": 7.8125, + "learning_rate": 4.425785086857509e-06, + "loss": 1.59485798, + "memory(GiB)": 117.38, + "step": 44135, + "train_speed(iter/s)": 1.635456 + }, + { + "acc": 0.65386038, + "epoch": 1.119736174530695, + "grad_norm": 6.09375, + "learning_rate": 4.424743413868298e-06, + "loss": 1.62073269, + "memory(GiB)": 117.38, + "step": 44140, + "train_speed(iter/s)": 1.635476 + }, + { + "acc": 0.66281242, + "epoch": 1.11986301369863, + "grad_norm": 6.40625, + "learning_rate": 4.42370176618144e-06, + "loss": 1.55912018, + "memory(GiB)": 117.38, + "step": 44145, + "train_speed(iter/s)": 1.635496 + }, + { + "acc": 0.63929787, + "epoch": 1.1199898528665653, + "grad_norm": 5.25, + "learning_rate": 4.422660143842753e-06, + "loss": 1.60670071, + "memory(GiB)": 117.38, + "step": 44150, + "train_speed(iter/s)": 1.635517 + }, + { + "acc": 0.65457773, + "epoch": 1.1201166920345003, + "grad_norm": 9.0625, + "learning_rate": 4.421618546898048e-06, + "loss": 1.65196648, + "memory(GiB)": 117.38, + "step": 44155, + "train_speed(iter/s)": 1.635535 + }, + { + "acc": 0.65805984, + "epoch": 1.1202435312024352, + "grad_norm": 6.59375, + "learning_rate": 4.420576975393143e-06, + "loss": 1.50358963, + "memory(GiB)": 117.38, + "step": 44160, + "train_speed(iter/s)": 1.635555 + }, + { + "acc": 0.66869898, + "epoch": 1.1203703703703705, + "grad_norm": 5.40625, + "learning_rate": 4.4195354293738484e-06, + "loss": 1.52080555, + "memory(GiB)": 117.38, + "step": 44165, + "train_speed(iter/s)": 1.635576 + }, + { + "acc": 0.6514441, + "epoch": 1.1204972095383054, + "grad_norm": 6.0, + "learning_rate": 4.418493908885979e-06, + "loss": 1.5425602, + "memory(GiB)": 117.38, + "step": 44170, + "train_speed(iter/s)": 1.635596 + }, + { + "acc": 0.65485201, + "epoch": 1.1206240487062404, + "grad_norm": 5.03125, + "learning_rate": 4.417452413975343e-06, + "loss": 1.5841114, + "memory(GiB)": 117.38, + "step": 44175, + "train_speed(iter/s)": 1.635617 + }, + { + "acc": 0.66331625, + "epoch": 1.1207508878741756, + "grad_norm": 5.0, + "learning_rate": 4.4164109446877514e-06, + "loss": 1.58154135, + "memory(GiB)": 117.38, + "step": 44180, + "train_speed(iter/s)": 1.635638 + }, + { + "acc": 0.66471424, + "epoch": 1.1208777270421106, + "grad_norm": 4.8125, + "learning_rate": 4.41536950106901e-06, + "loss": 1.5538332, + "memory(GiB)": 117.38, + "step": 44185, + "train_speed(iter/s)": 1.635658 + }, + { + "acc": 0.66929636, + "epoch": 1.1210045662100456, + "grad_norm": 6.21875, + "learning_rate": 4.414328083164931e-06, + "loss": 1.56172695, + "memory(GiB)": 117.38, + "step": 44190, + "train_speed(iter/s)": 1.635677 + }, + { + "acc": 0.64653416, + "epoch": 1.1211314053779806, + "grad_norm": 5.46875, + "learning_rate": 4.4132866910213154e-06, + "loss": 1.65548344, + "memory(GiB)": 117.38, + "step": 44195, + "train_speed(iter/s)": 1.635698 + }, + { + "acc": 0.66073055, + "epoch": 1.1212582445459158, + "grad_norm": 5.5, + "learning_rate": 4.41224532468397e-06, + "loss": 1.49708891, + "memory(GiB)": 117.38, + "step": 44200, + "train_speed(iter/s)": 1.635718 + }, + { + "acc": 0.65340195, + "epoch": 1.1213850837138508, + "grad_norm": 5.6875, + "learning_rate": 4.411203984198701e-06, + "loss": 1.61638412, + "memory(GiB)": 117.38, + "step": 44205, + "train_speed(iter/s)": 1.635737 + }, + { + "acc": 0.64651895, + "epoch": 1.1215119228817858, + "grad_norm": 4.5625, + "learning_rate": 4.41016266961131e-06, + "loss": 1.59872532, + "memory(GiB)": 117.38, + "step": 44210, + "train_speed(iter/s)": 1.635757 + }, + { + "acc": 0.66024466, + "epoch": 1.121638762049721, + "grad_norm": 8.1875, + "learning_rate": 4.409121380967597e-06, + "loss": 1.63750057, + "memory(GiB)": 117.38, + "step": 44215, + "train_speed(iter/s)": 1.635776 + }, + { + "acc": 0.66965275, + "epoch": 1.121765601217656, + "grad_norm": 9.25, + "learning_rate": 4.408080118313364e-06, + "loss": 1.5953516, + "memory(GiB)": 117.38, + "step": 44220, + "train_speed(iter/s)": 1.635797 + }, + { + "acc": 0.65244808, + "epoch": 1.121892440385591, + "grad_norm": 6.53125, + "learning_rate": 4.40703888169441e-06, + "loss": 1.63399849, + "memory(GiB)": 117.38, + "step": 44225, + "train_speed(iter/s)": 1.635819 + }, + { + "acc": 0.66125412, + "epoch": 1.1220192795535262, + "grad_norm": 9.25, + "learning_rate": 4.4059976711565355e-06, + "loss": 1.54146194, + "memory(GiB)": 117.38, + "step": 44230, + "train_speed(iter/s)": 1.63584 + }, + { + "acc": 0.65405045, + "epoch": 1.1221461187214612, + "grad_norm": 6.125, + "learning_rate": 4.404956486745532e-06, + "loss": 1.56526117, + "memory(GiB)": 117.38, + "step": 44235, + "train_speed(iter/s)": 1.63586 + }, + { + "acc": 0.65093994, + "epoch": 1.1222729578893962, + "grad_norm": 7.0, + "learning_rate": 4.403915328507201e-06, + "loss": 1.63668766, + "memory(GiB)": 117.38, + "step": 44240, + "train_speed(iter/s)": 1.635882 + }, + { + "acc": 0.65194454, + "epoch": 1.1223997970573314, + "grad_norm": 5.28125, + "learning_rate": 4.4028741964873334e-06, + "loss": 1.5779974, + "memory(GiB)": 117.38, + "step": 44245, + "train_speed(iter/s)": 1.635902 + }, + { + "acc": 0.64303694, + "epoch": 1.1225266362252664, + "grad_norm": 7.625, + "learning_rate": 4.4018330907317275e-06, + "loss": 1.61261368, + "memory(GiB)": 117.38, + "step": 44250, + "train_speed(iter/s)": 1.635922 + }, + { + "acc": 0.64048562, + "epoch": 1.1226534753932014, + "grad_norm": 5.96875, + "learning_rate": 4.400792011286171e-06, + "loss": 1.72979774, + "memory(GiB)": 117.38, + "step": 44255, + "train_speed(iter/s)": 1.635942 + }, + { + "acc": 0.66565065, + "epoch": 1.1227803145611364, + "grad_norm": 5.65625, + "learning_rate": 4.3997509581964566e-06, + "loss": 1.55208635, + "memory(GiB)": 117.38, + "step": 44260, + "train_speed(iter/s)": 1.635961 + }, + { + "acc": 0.65540981, + "epoch": 1.1229071537290716, + "grad_norm": 5.75, + "learning_rate": 4.398709931508376e-06, + "loss": 1.54899044, + "memory(GiB)": 117.38, + "step": 44265, + "train_speed(iter/s)": 1.635982 + }, + { + "acc": 0.65252419, + "epoch": 1.1230339928970066, + "grad_norm": 6.9375, + "learning_rate": 4.397668931267718e-06, + "loss": 1.55044785, + "memory(GiB)": 117.38, + "step": 44270, + "train_speed(iter/s)": 1.636001 + }, + { + "acc": 0.65328288, + "epoch": 1.1231608320649416, + "grad_norm": 5.40625, + "learning_rate": 4.396627957520269e-06, + "loss": 1.63892097, + "memory(GiB)": 117.38, + "step": 44275, + "train_speed(iter/s)": 1.636021 + }, + { + "acc": 0.63981328, + "epoch": 1.1232876712328768, + "grad_norm": 5.90625, + "learning_rate": 4.395587010311815e-06, + "loss": 1.63370705, + "memory(GiB)": 117.38, + "step": 44280, + "train_speed(iter/s)": 1.636041 + }, + { + "acc": 0.66041307, + "epoch": 1.1234145104008117, + "grad_norm": 6.28125, + "learning_rate": 4.394546089688143e-06, + "loss": 1.58444443, + "memory(GiB)": 117.38, + "step": 44285, + "train_speed(iter/s)": 1.63606 + }, + { + "acc": 0.65751848, + "epoch": 1.1235413495687467, + "grad_norm": 7.0, + "learning_rate": 4.3935051956950395e-06, + "loss": 1.6225771, + "memory(GiB)": 117.38, + "step": 44290, + "train_speed(iter/s)": 1.636082 + }, + { + "acc": 0.6475522, + "epoch": 1.123668188736682, + "grad_norm": 5.46875, + "learning_rate": 4.3924643283782824e-06, + "loss": 1.62086678, + "memory(GiB)": 117.38, + "step": 44295, + "train_speed(iter/s)": 1.636103 + }, + { + "acc": 0.66249762, + "epoch": 1.123795027904617, + "grad_norm": 7.4375, + "learning_rate": 4.391423487783657e-06, + "loss": 1.59110508, + "memory(GiB)": 117.38, + "step": 44300, + "train_speed(iter/s)": 1.636124 + }, + { + "acc": 0.66288376, + "epoch": 1.123921867072552, + "grad_norm": 6.09375, + "learning_rate": 4.3903826739569444e-06, + "loss": 1.58983974, + "memory(GiB)": 117.38, + "step": 44305, + "train_speed(iter/s)": 1.636144 + }, + { + "acc": 0.66546173, + "epoch": 1.1240487062404871, + "grad_norm": 6.34375, + "learning_rate": 4.389341886943926e-06, + "loss": 1.56044073, + "memory(GiB)": 117.38, + "step": 44310, + "train_speed(iter/s)": 1.636162 + }, + { + "acc": 0.64793382, + "epoch": 1.1241755454084221, + "grad_norm": 5.90625, + "learning_rate": 4.388301126790374e-06, + "loss": 1.63326569, + "memory(GiB)": 117.38, + "step": 44315, + "train_speed(iter/s)": 1.636183 + }, + { + "acc": 0.66180568, + "epoch": 1.1243023845763571, + "grad_norm": 5.96875, + "learning_rate": 4.387260393542071e-06, + "loss": 1.60197639, + "memory(GiB)": 117.38, + "step": 44320, + "train_speed(iter/s)": 1.636202 + }, + { + "acc": 0.64380836, + "epoch": 1.1244292237442923, + "grad_norm": 5.75, + "learning_rate": 4.38621968724479e-06, + "loss": 1.6886879, + "memory(GiB)": 117.38, + "step": 44325, + "train_speed(iter/s)": 1.636222 + }, + { + "acc": 0.65574608, + "epoch": 1.1245560629122273, + "grad_norm": 6.0, + "learning_rate": 4.385179007944311e-06, + "loss": 1.59371214, + "memory(GiB)": 117.38, + "step": 44330, + "train_speed(iter/s)": 1.636241 + }, + { + "acc": 0.65164137, + "epoch": 1.1246829020801623, + "grad_norm": 8.0625, + "learning_rate": 4.384138355686402e-06, + "loss": 1.65738449, + "memory(GiB)": 117.38, + "step": 44335, + "train_speed(iter/s)": 1.636261 + }, + { + "acc": 0.64402332, + "epoch": 1.1248097412480975, + "grad_norm": 5.84375, + "learning_rate": 4.383097730516837e-06, + "loss": 1.64249401, + "memory(GiB)": 117.38, + "step": 44340, + "train_speed(iter/s)": 1.636282 + }, + { + "acc": 0.67568874, + "epoch": 1.1249365804160325, + "grad_norm": 6.34375, + "learning_rate": 4.382057132481389e-06, + "loss": 1.56319075, + "memory(GiB)": 117.38, + "step": 44345, + "train_speed(iter/s)": 1.636302 + }, + { + "acc": 0.65463829, + "epoch": 1.1250634195839675, + "grad_norm": 6.59375, + "learning_rate": 4.381016561625829e-06, + "loss": 1.54930124, + "memory(GiB)": 117.38, + "step": 44350, + "train_speed(iter/s)": 1.636321 + }, + { + "acc": 0.6481185, + "epoch": 1.1251902587519025, + "grad_norm": 6.1875, + "learning_rate": 4.379976017995922e-06, + "loss": 1.68179588, + "memory(GiB)": 117.38, + "step": 44355, + "train_speed(iter/s)": 1.636341 + }, + { + "acc": 0.66424837, + "epoch": 1.1253170979198377, + "grad_norm": 8.125, + "learning_rate": 4.378935501637438e-06, + "loss": 1.62036667, + "memory(GiB)": 117.38, + "step": 44360, + "train_speed(iter/s)": 1.636362 + }, + { + "acc": 0.65817924, + "epoch": 1.1254439370877727, + "grad_norm": 6.75, + "learning_rate": 4.377895012596144e-06, + "loss": 1.5880394, + "memory(GiB)": 117.38, + "step": 44365, + "train_speed(iter/s)": 1.636382 + }, + { + "acc": 0.64536557, + "epoch": 1.1255707762557077, + "grad_norm": 5.53125, + "learning_rate": 4.376854550917805e-06, + "loss": 1.65076275, + "memory(GiB)": 117.38, + "step": 44370, + "train_speed(iter/s)": 1.636402 + }, + { + "acc": 0.65267496, + "epoch": 1.1256976154236429, + "grad_norm": 5.375, + "learning_rate": 4.375814116648184e-06, + "loss": 1.64900436, + "memory(GiB)": 117.38, + "step": 44375, + "train_speed(iter/s)": 1.636422 + }, + { + "acc": 0.65974469, + "epoch": 1.1258244545915779, + "grad_norm": 4.90625, + "learning_rate": 4.374773709833045e-06, + "loss": 1.58544083, + "memory(GiB)": 117.38, + "step": 44380, + "train_speed(iter/s)": 1.636442 + }, + { + "acc": 0.66633081, + "epoch": 1.1259512937595129, + "grad_norm": 7.15625, + "learning_rate": 4.37373333051815e-06, + "loss": 1.56389456, + "memory(GiB)": 117.38, + "step": 44385, + "train_speed(iter/s)": 1.636462 + }, + { + "acc": 0.63738928, + "epoch": 1.126078132927448, + "grad_norm": 5.90625, + "learning_rate": 4.37269297874926e-06, + "loss": 1.67483444, + "memory(GiB)": 117.38, + "step": 44390, + "train_speed(iter/s)": 1.636483 + }, + { + "acc": 0.64450359, + "epoch": 1.126204972095383, + "grad_norm": 5.5625, + "learning_rate": 4.371652654572134e-06, + "loss": 1.68096046, + "memory(GiB)": 117.38, + "step": 44395, + "train_speed(iter/s)": 1.636503 + }, + { + "acc": 0.65925589, + "epoch": 1.126331811263318, + "grad_norm": 6.46875, + "learning_rate": 4.370612358032529e-06, + "loss": 1.54828854, + "memory(GiB)": 117.38, + "step": 44400, + "train_speed(iter/s)": 1.636523 + }, + { + "acc": 0.66103125, + "epoch": 1.1264586504312533, + "grad_norm": 6.6875, + "learning_rate": 4.369572089176201e-06, + "loss": 1.54554634, + "memory(GiB)": 117.38, + "step": 44405, + "train_speed(iter/s)": 1.636543 + }, + { + "acc": 0.64993038, + "epoch": 1.1265854895991883, + "grad_norm": 6.40625, + "learning_rate": 4.3685318480489095e-06, + "loss": 1.60171547, + "memory(GiB)": 117.38, + "step": 44410, + "train_speed(iter/s)": 1.636563 + }, + { + "acc": 0.66563597, + "epoch": 1.1267123287671232, + "grad_norm": 6.1875, + "learning_rate": 4.367491634696405e-06, + "loss": 1.55331097, + "memory(GiB)": 117.38, + "step": 44415, + "train_speed(iter/s)": 1.636583 + }, + { + "acc": 0.64933901, + "epoch": 1.1268391679350582, + "grad_norm": 6.75, + "learning_rate": 4.366451449164442e-06, + "loss": 1.58267803, + "memory(GiB)": 117.38, + "step": 44420, + "train_speed(iter/s)": 1.636603 + }, + { + "acc": 0.65524464, + "epoch": 1.1269660071029934, + "grad_norm": 5.28125, + "learning_rate": 4.365411291498774e-06, + "loss": 1.61520157, + "memory(GiB)": 117.38, + "step": 44425, + "train_speed(iter/s)": 1.636622 + }, + { + "acc": 0.64920053, + "epoch": 1.1270928462709284, + "grad_norm": 5.78125, + "learning_rate": 4.364371161745151e-06, + "loss": 1.63367004, + "memory(GiB)": 117.38, + "step": 44430, + "train_speed(iter/s)": 1.636643 + }, + { + "acc": 0.64144926, + "epoch": 1.1272196854388636, + "grad_norm": 5.5, + "learning_rate": 4.363331059949321e-06, + "loss": 1.61619301, + "memory(GiB)": 117.38, + "step": 44435, + "train_speed(iter/s)": 1.636663 + }, + { + "acc": 0.66180468, + "epoch": 1.1273465246067986, + "grad_norm": 5.3125, + "learning_rate": 4.362290986157034e-06, + "loss": 1.58090649, + "memory(GiB)": 117.38, + "step": 44440, + "train_speed(iter/s)": 1.636683 + }, + { + "acc": 0.65206394, + "epoch": 1.1274733637747336, + "grad_norm": 6.0625, + "learning_rate": 4.361250940414036e-06, + "loss": 1.61410484, + "memory(GiB)": 117.38, + "step": 44445, + "train_speed(iter/s)": 1.636701 + }, + { + "acc": 0.65910387, + "epoch": 1.1276002029426686, + "grad_norm": 6.71875, + "learning_rate": 4.360210922766076e-06, + "loss": 1.59003572, + "memory(GiB)": 117.38, + "step": 44450, + "train_speed(iter/s)": 1.636722 + }, + { + "acc": 0.63463645, + "epoch": 1.1277270421106038, + "grad_norm": 5.75, + "learning_rate": 4.359170933258893e-06, + "loss": 1.68063622, + "memory(GiB)": 117.38, + "step": 44455, + "train_speed(iter/s)": 1.636741 + }, + { + "acc": 0.66906643, + "epoch": 1.1278538812785388, + "grad_norm": 5.59375, + "learning_rate": 4.358130971938235e-06, + "loss": 1.56312609, + "memory(GiB)": 117.38, + "step": 44460, + "train_speed(iter/s)": 1.636762 + }, + { + "acc": 0.64765439, + "epoch": 1.1279807204464738, + "grad_norm": 5.34375, + "learning_rate": 4.357091038849841e-06, + "loss": 1.64965096, + "memory(GiB)": 117.38, + "step": 44465, + "train_speed(iter/s)": 1.63678 + }, + { + "acc": 0.67054591, + "epoch": 1.128107559614409, + "grad_norm": 5.6875, + "learning_rate": 4.356051134039455e-06, + "loss": 1.56810436, + "memory(GiB)": 117.38, + "step": 44470, + "train_speed(iter/s)": 1.6368 + }, + { + "acc": 0.66476593, + "epoch": 1.128234398782344, + "grad_norm": 5.0, + "learning_rate": 4.3550112575528155e-06, + "loss": 1.56342506, + "memory(GiB)": 117.38, + "step": 44475, + "train_speed(iter/s)": 1.636819 + }, + { + "acc": 0.66813889, + "epoch": 1.128361237950279, + "grad_norm": 5.375, + "learning_rate": 4.353971409435659e-06, + "loss": 1.48909473, + "memory(GiB)": 117.38, + "step": 44480, + "train_speed(iter/s)": 1.636839 + }, + { + "acc": 0.6668561, + "epoch": 1.1284880771182142, + "grad_norm": 6.53125, + "learning_rate": 4.352931589733725e-06, + "loss": 1.5471199, + "memory(GiB)": 117.38, + "step": 44485, + "train_speed(iter/s)": 1.636857 + }, + { + "acc": 0.6454442, + "epoch": 1.1286149162861492, + "grad_norm": 7.65625, + "learning_rate": 4.35189179849275e-06, + "loss": 1.64262047, + "memory(GiB)": 117.38, + "step": 44490, + "train_speed(iter/s)": 1.636878 + }, + { + "acc": 0.65689402, + "epoch": 1.1287417554540842, + "grad_norm": 6.15625, + "learning_rate": 4.350852035758466e-06, + "loss": 1.69725113, + "memory(GiB)": 117.38, + "step": 44495, + "train_speed(iter/s)": 1.636899 + }, + { + "acc": 0.64023209, + "epoch": 1.1288685946220194, + "grad_norm": 4.8125, + "learning_rate": 4.3498123015766066e-06, + "loss": 1.61459503, + "memory(GiB)": 117.38, + "step": 44500, + "train_speed(iter/s)": 1.636919 + }, + { + "acc": 0.66754217, + "epoch": 1.1289954337899544, + "grad_norm": 7.4375, + "learning_rate": 4.348772595992906e-06, + "loss": 1.58678741, + "memory(GiB)": 117.38, + "step": 44505, + "train_speed(iter/s)": 1.636939 + }, + { + "acc": 0.64727921, + "epoch": 1.1291222729578894, + "grad_norm": 5.96875, + "learning_rate": 4.347732919053096e-06, + "loss": 1.58723469, + "memory(GiB)": 117.38, + "step": 44510, + "train_speed(iter/s)": 1.636959 + }, + { + "acc": 0.6448493, + "epoch": 1.1292491121258244, + "grad_norm": 5.4375, + "learning_rate": 4.346693270802902e-06, + "loss": 1.62992878, + "memory(GiB)": 117.38, + "step": 44515, + "train_speed(iter/s)": 1.63698 + }, + { + "acc": 0.65734653, + "epoch": 1.1293759512937596, + "grad_norm": 5.40625, + "learning_rate": 4.345653651288055e-06, + "loss": 1.5425087, + "memory(GiB)": 117.38, + "step": 44520, + "train_speed(iter/s)": 1.637 + }, + { + "acc": 0.64768391, + "epoch": 1.1295027904616946, + "grad_norm": 7.375, + "learning_rate": 4.344614060554281e-06, + "loss": 1.58433056, + "memory(GiB)": 117.38, + "step": 44525, + "train_speed(iter/s)": 1.637021 + }, + { + "acc": 0.6630897, + "epoch": 1.1296296296296295, + "grad_norm": 5.78125, + "learning_rate": 4.343574498647311e-06, + "loss": 1.59602089, + "memory(GiB)": 117.38, + "step": 44530, + "train_speed(iter/s)": 1.637041 + }, + { + "acc": 0.66031818, + "epoch": 1.1297564687975648, + "grad_norm": 5.21875, + "learning_rate": 4.342534965612861e-06, + "loss": 1.55400171, + "memory(GiB)": 117.38, + "step": 44535, + "train_speed(iter/s)": 1.637061 + }, + { + "acc": 0.65776958, + "epoch": 1.1298833079654997, + "grad_norm": 5.84375, + "learning_rate": 4.34149546149666e-06, + "loss": 1.57825499, + "memory(GiB)": 117.38, + "step": 44540, + "train_speed(iter/s)": 1.63708 + }, + { + "acc": 0.65936079, + "epoch": 1.1300101471334347, + "grad_norm": 5.4375, + "learning_rate": 4.340455986344428e-06, + "loss": 1.6558918, + "memory(GiB)": 117.38, + "step": 44545, + "train_speed(iter/s)": 1.6371 + }, + { + "acc": 0.65897188, + "epoch": 1.13013698630137, + "grad_norm": 5.8125, + "learning_rate": 4.3394165402018875e-06, + "loss": 1.61213932, + "memory(GiB)": 117.38, + "step": 44550, + "train_speed(iter/s)": 1.63712 + }, + { + "acc": 0.65728865, + "epoch": 1.130263825469305, + "grad_norm": 5.625, + "learning_rate": 4.338377123114757e-06, + "loss": 1.59036427, + "memory(GiB)": 117.38, + "step": 44555, + "train_speed(iter/s)": 1.637139 + }, + { + "acc": 0.64370222, + "epoch": 1.13039066463724, + "grad_norm": 5.15625, + "learning_rate": 4.337337735128752e-06, + "loss": 1.64670715, + "memory(GiB)": 117.38, + "step": 44560, + "train_speed(iter/s)": 1.637159 + }, + { + "acc": 0.65626535, + "epoch": 1.1305175038051751, + "grad_norm": 4.46875, + "learning_rate": 4.336298376289594e-06, + "loss": 1.60992241, + "memory(GiB)": 117.38, + "step": 44565, + "train_speed(iter/s)": 1.637178 + }, + { + "acc": 0.64314036, + "epoch": 1.1306443429731101, + "grad_norm": 9.1875, + "learning_rate": 4.335259046642998e-06, + "loss": 1.67879658, + "memory(GiB)": 117.38, + "step": 44570, + "train_speed(iter/s)": 1.637197 + }, + { + "acc": 0.65109367, + "epoch": 1.130771182141045, + "grad_norm": 4.96875, + "learning_rate": 4.334219746234675e-06, + "loss": 1.60179787, + "memory(GiB)": 117.38, + "step": 44575, + "train_speed(iter/s)": 1.637217 + }, + { + "acc": 0.64804864, + "epoch": 1.13089802130898, + "grad_norm": 5.78125, + "learning_rate": 4.3331804751103395e-06, + "loss": 1.61155281, + "memory(GiB)": 117.38, + "step": 44580, + "train_speed(iter/s)": 1.637236 + }, + { + "acc": 0.65168433, + "epoch": 1.1310248604769153, + "grad_norm": 7.875, + "learning_rate": 4.332141233315705e-06, + "loss": 1.61855316, + "memory(GiB)": 117.38, + "step": 44585, + "train_speed(iter/s)": 1.637255 + }, + { + "acc": 0.66715488, + "epoch": 1.1311516996448503, + "grad_norm": 5.75, + "learning_rate": 4.331102020896482e-06, + "loss": 1.59727602, + "memory(GiB)": 117.38, + "step": 44590, + "train_speed(iter/s)": 1.637276 + }, + { + "acc": 0.66982784, + "epoch": 1.1312785388127855, + "grad_norm": 6.4375, + "learning_rate": 4.330062837898376e-06, + "loss": 1.52059727, + "memory(GiB)": 117.38, + "step": 44595, + "train_speed(iter/s)": 1.637296 + }, + { + "acc": 0.65167823, + "epoch": 1.1314053779807205, + "grad_norm": 5.90625, + "learning_rate": 4.3290236843670985e-06, + "loss": 1.61874866, + "memory(GiB)": 117.38, + "step": 44600, + "train_speed(iter/s)": 1.637317 + }, + { + "acc": 0.65669365, + "epoch": 1.1315322171486555, + "grad_norm": 5.6875, + "learning_rate": 4.327984560348354e-06, + "loss": 1.62331429, + "memory(GiB)": 117.38, + "step": 44605, + "train_speed(iter/s)": 1.637337 + }, + { + "acc": 0.6701004, + "epoch": 1.1316590563165905, + "grad_norm": 5.28125, + "learning_rate": 4.3269454658878516e-06, + "loss": 1.57906294, + "memory(GiB)": 117.38, + "step": 44610, + "train_speed(iter/s)": 1.637357 + }, + { + "acc": 0.66720662, + "epoch": 1.1317858954845257, + "grad_norm": 5.09375, + "learning_rate": 4.325906401031291e-06, + "loss": 1.60992165, + "memory(GiB)": 117.38, + "step": 44615, + "train_speed(iter/s)": 1.637377 + }, + { + "acc": 0.65735626, + "epoch": 1.1319127346524607, + "grad_norm": 6.09375, + "learning_rate": 4.324867365824376e-06, + "loss": 1.5744648, + "memory(GiB)": 117.38, + "step": 44620, + "train_speed(iter/s)": 1.637397 + }, + { + "acc": 0.6614768, + "epoch": 1.1320395738203957, + "grad_norm": 6.65625, + "learning_rate": 4.323828360312809e-06, + "loss": 1.59246025, + "memory(GiB)": 117.38, + "step": 44625, + "train_speed(iter/s)": 1.637418 + }, + { + "acc": 0.65896254, + "epoch": 1.1321664129883309, + "grad_norm": 5.6875, + "learning_rate": 4.32278938454229e-06, + "loss": 1.55731621, + "memory(GiB)": 117.38, + "step": 44630, + "train_speed(iter/s)": 1.637437 + }, + { + "acc": 0.66510549, + "epoch": 1.1322932521562659, + "grad_norm": 7.6875, + "learning_rate": 4.321750438558517e-06, + "loss": 1.52425518, + "memory(GiB)": 117.38, + "step": 44635, + "train_speed(iter/s)": 1.637457 + }, + { + "acc": 0.65005569, + "epoch": 1.1324200913242009, + "grad_norm": 5.40625, + "learning_rate": 4.3207115224071874e-06, + "loss": 1.5399312, + "memory(GiB)": 117.38, + "step": 44640, + "train_speed(iter/s)": 1.637477 + }, + { + "acc": 0.64005647, + "epoch": 1.132546930492136, + "grad_norm": 5.5, + "learning_rate": 4.319672636133998e-06, + "loss": 1.67645168, + "memory(GiB)": 117.38, + "step": 44645, + "train_speed(iter/s)": 1.637498 + }, + { + "acc": 0.66499214, + "epoch": 1.132673769660071, + "grad_norm": 7.59375, + "learning_rate": 4.318633779784646e-06, + "loss": 1.62051659, + "memory(GiB)": 117.38, + "step": 44650, + "train_speed(iter/s)": 1.637518 + }, + { + "acc": 0.65457191, + "epoch": 1.132800608828006, + "grad_norm": 5.65625, + "learning_rate": 4.317594953404818e-06, + "loss": 1.64559517, + "memory(GiB)": 117.38, + "step": 44655, + "train_speed(iter/s)": 1.637538 + }, + { + "acc": 0.65417347, + "epoch": 1.1329274479959413, + "grad_norm": 4.78125, + "learning_rate": 4.316556157040213e-06, + "loss": 1.57564392, + "memory(GiB)": 117.38, + "step": 44660, + "train_speed(iter/s)": 1.637558 + }, + { + "acc": 0.63608503, + "epoch": 1.1330542871638762, + "grad_norm": 5.59375, + "learning_rate": 4.315517390736519e-06, + "loss": 1.66183567, + "memory(GiB)": 117.38, + "step": 44665, + "train_speed(iter/s)": 1.637578 + }, + { + "acc": 0.64407768, + "epoch": 1.1331811263318112, + "grad_norm": 4.4375, + "learning_rate": 4.314478654539429e-06, + "loss": 1.59842501, + "memory(GiB)": 117.38, + "step": 44670, + "train_speed(iter/s)": 1.637597 + }, + { + "acc": 0.65451493, + "epoch": 1.1333079654997462, + "grad_norm": 5.34375, + "learning_rate": 4.313439948494625e-06, + "loss": 1.62349586, + "memory(GiB)": 117.38, + "step": 44675, + "train_speed(iter/s)": 1.637616 + }, + { + "acc": 0.65971632, + "epoch": 1.1334348046676814, + "grad_norm": 7.34375, + "learning_rate": 4.312401272647799e-06, + "loss": 1.56838341, + "memory(GiB)": 117.38, + "step": 44680, + "train_speed(iter/s)": 1.637633 + }, + { + "acc": 0.65503941, + "epoch": 1.1335616438356164, + "grad_norm": 5.59375, + "learning_rate": 4.311362627044633e-06, + "loss": 1.60436687, + "memory(GiB)": 117.38, + "step": 44685, + "train_speed(iter/s)": 1.637652 + }, + { + "acc": 0.65877519, + "epoch": 1.1336884830035514, + "grad_norm": 5.9375, + "learning_rate": 4.310324011730816e-06, + "loss": 1.61761551, + "memory(GiB)": 117.38, + "step": 44690, + "train_speed(iter/s)": 1.637671 + }, + { + "acc": 0.65009527, + "epoch": 1.1338153221714866, + "grad_norm": 5.65625, + "learning_rate": 4.309285426752027e-06, + "loss": 1.65259628, + "memory(GiB)": 117.38, + "step": 44695, + "train_speed(iter/s)": 1.63769 + }, + { + "acc": 0.65982046, + "epoch": 1.1339421613394216, + "grad_norm": 5.625, + "learning_rate": 4.308246872153947e-06, + "loss": 1.61216545, + "memory(GiB)": 117.38, + "step": 44700, + "train_speed(iter/s)": 1.63771 + }, + { + "acc": 0.66697783, + "epoch": 1.1340690005073566, + "grad_norm": 5.1875, + "learning_rate": 4.307208347982259e-06, + "loss": 1.48208141, + "memory(GiB)": 117.38, + "step": 44705, + "train_speed(iter/s)": 1.63773 + }, + { + "acc": 0.66580992, + "epoch": 1.1341958396752918, + "grad_norm": 6.09375, + "learning_rate": 4.306169854282643e-06, + "loss": 1.53322449, + "memory(GiB)": 117.38, + "step": 44710, + "train_speed(iter/s)": 1.637751 + }, + { + "acc": 0.66776094, + "epoch": 1.1343226788432268, + "grad_norm": 7.25, + "learning_rate": 4.305131391100773e-06, + "loss": 1.5979948, + "memory(GiB)": 117.38, + "step": 44715, + "train_speed(iter/s)": 1.63777 + }, + { + "acc": 0.67849522, + "epoch": 1.1344495180111618, + "grad_norm": 5.25, + "learning_rate": 4.304092958482325e-06, + "loss": 1.5401968, + "memory(GiB)": 117.38, + "step": 44720, + "train_speed(iter/s)": 1.63779 + }, + { + "acc": 0.65658293, + "epoch": 1.134576357179097, + "grad_norm": 7.53125, + "learning_rate": 4.303054556472978e-06, + "loss": 1.59548969, + "memory(GiB)": 117.38, + "step": 44725, + "train_speed(iter/s)": 1.63781 + }, + { + "acc": 0.65887961, + "epoch": 1.134703196347032, + "grad_norm": 5.53125, + "learning_rate": 4.3020161851184036e-06, + "loss": 1.60931931, + "memory(GiB)": 117.38, + "step": 44730, + "train_speed(iter/s)": 1.637828 + }, + { + "acc": 0.64138188, + "epoch": 1.134830035514967, + "grad_norm": 7.34375, + "learning_rate": 4.300977844464273e-06, + "loss": 1.6142828, + "memory(GiB)": 117.38, + "step": 44735, + "train_speed(iter/s)": 1.637846 + }, + { + "acc": 0.6602459, + "epoch": 1.134956874682902, + "grad_norm": 6.125, + "learning_rate": 4.2999395345562564e-06, + "loss": 1.55534592, + "memory(GiB)": 117.38, + "step": 44740, + "train_speed(iter/s)": 1.637866 + }, + { + "acc": 0.64283967, + "epoch": 1.1350837138508372, + "grad_norm": 9.125, + "learning_rate": 4.298901255440025e-06, + "loss": 1.62984886, + "memory(GiB)": 117.38, + "step": 44745, + "train_speed(iter/s)": 1.637886 + }, + { + "acc": 0.64952207, + "epoch": 1.1352105530187722, + "grad_norm": 8.25, + "learning_rate": 4.297863007161249e-06, + "loss": 1.62485161, + "memory(GiB)": 117.38, + "step": 44750, + "train_speed(iter/s)": 1.637906 + }, + { + "acc": 0.65533204, + "epoch": 1.1353373921867074, + "grad_norm": 4.875, + "learning_rate": 4.29682478976559e-06, + "loss": 1.54883928, + "memory(GiB)": 117.38, + "step": 44755, + "train_speed(iter/s)": 1.637925 + }, + { + "acc": 0.65837083, + "epoch": 1.1354642313546424, + "grad_norm": 6.15625, + "learning_rate": 4.295786603298717e-06, + "loss": 1.56862326, + "memory(GiB)": 117.38, + "step": 44760, + "train_speed(iter/s)": 1.637944 + }, + { + "acc": 0.65649004, + "epoch": 1.1355910705225774, + "grad_norm": 4.96875, + "learning_rate": 4.294748447806293e-06, + "loss": 1.58333549, + "memory(GiB)": 117.38, + "step": 44765, + "train_speed(iter/s)": 1.637964 + }, + { + "acc": 0.66233821, + "epoch": 1.1357179096905123, + "grad_norm": 6.59375, + "learning_rate": 4.293710323333983e-06, + "loss": 1.59596853, + "memory(GiB)": 117.38, + "step": 44770, + "train_speed(iter/s)": 1.637982 + }, + { + "acc": 0.65775008, + "epoch": 1.1358447488584476, + "grad_norm": 5.875, + "learning_rate": 4.292672229927445e-06, + "loss": 1.57593956, + "memory(GiB)": 117.38, + "step": 44775, + "train_speed(iter/s)": 1.638 + }, + { + "acc": 0.67099819, + "epoch": 1.1359715880263825, + "grad_norm": 5.8125, + "learning_rate": 4.2916341676323386e-06, + "loss": 1.56701775, + "memory(GiB)": 117.38, + "step": 44780, + "train_speed(iter/s)": 1.63802 + }, + { + "acc": 0.66356764, + "epoch": 1.1360984271943175, + "grad_norm": 6.03125, + "learning_rate": 4.290596136494326e-06, + "loss": 1.54503708, + "memory(GiB)": 117.38, + "step": 44785, + "train_speed(iter/s)": 1.63804 + }, + { + "acc": 0.64491863, + "epoch": 1.1362252663622527, + "grad_norm": 7.15625, + "learning_rate": 4.289558136559063e-06, + "loss": 1.67876015, + "memory(GiB)": 117.38, + "step": 44790, + "train_speed(iter/s)": 1.63806 + }, + { + "acc": 0.65235085, + "epoch": 1.1363521055301877, + "grad_norm": 5.5625, + "learning_rate": 4.288520167872203e-06, + "loss": 1.58397017, + "memory(GiB)": 117.38, + "step": 44795, + "train_speed(iter/s)": 1.638079 + }, + { + "acc": 0.64608803, + "epoch": 1.1364789446981227, + "grad_norm": 5.21875, + "learning_rate": 4.287482230479404e-06, + "loss": 1.70430908, + "memory(GiB)": 117.38, + "step": 44800, + "train_speed(iter/s)": 1.6381 + }, + { + "acc": 0.64419556, + "epoch": 1.136605783866058, + "grad_norm": 6.0625, + "learning_rate": 4.286444324426318e-06, + "loss": 1.63320675, + "memory(GiB)": 117.38, + "step": 44805, + "train_speed(iter/s)": 1.638118 + }, + { + "acc": 0.65612288, + "epoch": 1.136732623033993, + "grad_norm": 5.125, + "learning_rate": 4.2854064497585964e-06, + "loss": 1.58696957, + "memory(GiB)": 117.38, + "step": 44810, + "train_speed(iter/s)": 1.638136 + }, + { + "acc": 0.65463748, + "epoch": 1.136859462201928, + "grad_norm": 5.5625, + "learning_rate": 4.284368606521888e-06, + "loss": 1.64917889, + "memory(GiB)": 117.38, + "step": 44815, + "train_speed(iter/s)": 1.638155 + }, + { + "acc": 0.68417792, + "epoch": 1.1369863013698631, + "grad_norm": 4.96875, + "learning_rate": 4.283330794761845e-06, + "loss": 1.57826042, + "memory(GiB)": 117.38, + "step": 44820, + "train_speed(iter/s)": 1.638175 + }, + { + "acc": 0.65448837, + "epoch": 1.137113140537798, + "grad_norm": 5.875, + "learning_rate": 4.282293014524112e-06, + "loss": 1.58738117, + "memory(GiB)": 117.38, + "step": 44825, + "train_speed(iter/s)": 1.638193 + }, + { + "acc": 0.65662594, + "epoch": 1.137239979705733, + "grad_norm": 5.375, + "learning_rate": 4.281255265854338e-06, + "loss": 1.59228792, + "memory(GiB)": 117.38, + "step": 44830, + "train_speed(iter/s)": 1.638213 + }, + { + "acc": 0.66493349, + "epoch": 1.137366818873668, + "grad_norm": 5.1875, + "learning_rate": 4.280217548798166e-06, + "loss": 1.54698172, + "memory(GiB)": 117.38, + "step": 44835, + "train_speed(iter/s)": 1.638233 + }, + { + "acc": 0.6514761, + "epoch": 1.1374936580416033, + "grad_norm": 6.9375, + "learning_rate": 4.279179863401239e-06, + "loss": 1.6118803, + "memory(GiB)": 117.38, + "step": 44840, + "train_speed(iter/s)": 1.638253 + }, + { + "acc": 0.67122731, + "epoch": 1.1376204972095383, + "grad_norm": 6.75, + "learning_rate": 4.278142209709199e-06, + "loss": 1.53611288, + "memory(GiB)": 117.38, + "step": 44845, + "train_speed(iter/s)": 1.638271 + }, + { + "acc": 0.65977073, + "epoch": 1.1377473363774733, + "grad_norm": 6.34375, + "learning_rate": 4.277104587767691e-06, + "loss": 1.62168236, + "memory(GiB)": 117.38, + "step": 44850, + "train_speed(iter/s)": 1.63829 + }, + { + "acc": 0.64796233, + "epoch": 1.1378741755454085, + "grad_norm": 7.8125, + "learning_rate": 4.276066997622348e-06, + "loss": 1.62023373, + "memory(GiB)": 117.38, + "step": 44855, + "train_speed(iter/s)": 1.638311 + }, + { + "acc": 0.67169561, + "epoch": 1.1380010147133435, + "grad_norm": 6.8125, + "learning_rate": 4.27502943931881e-06, + "loss": 1.54391232, + "memory(GiB)": 117.38, + "step": 44860, + "train_speed(iter/s)": 1.638332 + }, + { + "acc": 0.64612837, + "epoch": 1.1381278538812785, + "grad_norm": 5.21875, + "learning_rate": 4.273991912902716e-06, + "loss": 1.63414402, + "memory(GiB)": 117.38, + "step": 44865, + "train_speed(iter/s)": 1.638353 + }, + { + "acc": 0.64558144, + "epoch": 1.1382546930492137, + "grad_norm": 5.8125, + "learning_rate": 4.272954418419699e-06, + "loss": 1.60734348, + "memory(GiB)": 117.38, + "step": 44870, + "train_speed(iter/s)": 1.638373 + }, + { + "acc": 0.66381435, + "epoch": 1.1383815322171487, + "grad_norm": 5.53125, + "learning_rate": 4.2719169559153905e-06, + "loss": 1.53794365, + "memory(GiB)": 117.38, + "step": 44875, + "train_speed(iter/s)": 1.638394 + }, + { + "acc": 0.67569208, + "epoch": 1.1385083713850837, + "grad_norm": 5.1875, + "learning_rate": 4.270879525435426e-06, + "loss": 1.56903572, + "memory(GiB)": 117.38, + "step": 44880, + "train_speed(iter/s)": 1.638414 + }, + { + "acc": 0.66070075, + "epoch": 1.1386352105530189, + "grad_norm": 5.15625, + "learning_rate": 4.269842127025435e-06, + "loss": 1.59961758, + "memory(GiB)": 117.38, + "step": 44885, + "train_speed(iter/s)": 1.638433 + }, + { + "acc": 0.65325918, + "epoch": 1.1387620497209539, + "grad_norm": 5.84375, + "learning_rate": 4.2688047607310504e-06, + "loss": 1.6062355, + "memory(GiB)": 117.38, + "step": 44890, + "train_speed(iter/s)": 1.638453 + }, + { + "acc": 0.64345627, + "epoch": 1.1388888888888888, + "grad_norm": 5.625, + "learning_rate": 4.267767426597893e-06, + "loss": 1.65838737, + "memory(GiB)": 117.38, + "step": 44895, + "train_speed(iter/s)": 1.638473 + }, + { + "acc": 0.65645738, + "epoch": 1.1390157280568238, + "grad_norm": 4.90625, + "learning_rate": 4.266730124671594e-06, + "loss": 1.58498802, + "memory(GiB)": 117.38, + "step": 44900, + "train_speed(iter/s)": 1.638492 + }, + { + "acc": 0.66112752, + "epoch": 1.139142567224759, + "grad_norm": 5.875, + "learning_rate": 4.265692854997778e-06, + "loss": 1.57835989, + "memory(GiB)": 117.38, + "step": 44905, + "train_speed(iter/s)": 1.638511 + }, + { + "acc": 0.66965404, + "epoch": 1.139269406392694, + "grad_norm": 5.25, + "learning_rate": 4.2646556176220714e-06, + "loss": 1.54745722, + "memory(GiB)": 117.38, + "step": 44910, + "train_speed(iter/s)": 1.638532 + }, + { + "acc": 0.66337614, + "epoch": 1.1393962455606292, + "grad_norm": 5.3125, + "learning_rate": 4.263618412590092e-06, + "loss": 1.54854355, + "memory(GiB)": 117.38, + "step": 44915, + "train_speed(iter/s)": 1.63855 + }, + { + "acc": 0.64248281, + "epoch": 1.1395230847285642, + "grad_norm": 5.4375, + "learning_rate": 4.2625812399474604e-06, + "loss": 1.64810219, + "memory(GiB)": 117.38, + "step": 44920, + "train_speed(iter/s)": 1.63857 + }, + { + "acc": 0.66312189, + "epoch": 1.1396499238964992, + "grad_norm": 5.21875, + "learning_rate": 4.2615440997398e-06, + "loss": 1.6174427, + "memory(GiB)": 117.38, + "step": 44925, + "train_speed(iter/s)": 1.638589 + }, + { + "acc": 0.66680937, + "epoch": 1.1397767630644342, + "grad_norm": 6.125, + "learning_rate": 4.2605069920127284e-06, + "loss": 1.52570429, + "memory(GiB)": 117.38, + "step": 44930, + "train_speed(iter/s)": 1.638609 + }, + { + "acc": 0.66472435, + "epoch": 1.1399036022323694, + "grad_norm": 6.09375, + "learning_rate": 4.25946991681186e-06, + "loss": 1.56732063, + "memory(GiB)": 117.38, + "step": 44935, + "train_speed(iter/s)": 1.638629 + }, + { + "acc": 0.65112996, + "epoch": 1.1400304414003044, + "grad_norm": 5.4375, + "learning_rate": 4.258432874182809e-06, + "loss": 1.61382751, + "memory(GiB)": 117.38, + "step": 44940, + "train_speed(iter/s)": 1.638648 + }, + { + "acc": 0.65448222, + "epoch": 1.1401572805682394, + "grad_norm": 5.53125, + "learning_rate": 4.2573958641711925e-06, + "loss": 1.63212433, + "memory(GiB)": 117.38, + "step": 44945, + "train_speed(iter/s)": 1.638668 + }, + { + "acc": 0.65999322, + "epoch": 1.1402841197361746, + "grad_norm": 4.71875, + "learning_rate": 4.256358886822622e-06, + "loss": 1.52996912, + "memory(GiB)": 117.38, + "step": 44950, + "train_speed(iter/s)": 1.638687 + }, + { + "acc": 0.66640196, + "epoch": 1.1404109589041096, + "grad_norm": 6.53125, + "learning_rate": 4.255321942182707e-06, + "loss": 1.55350838, + "memory(GiB)": 117.38, + "step": 44955, + "train_speed(iter/s)": 1.638707 + }, + { + "acc": 0.65074863, + "epoch": 1.1405377980720446, + "grad_norm": 5.53125, + "learning_rate": 4.254285030297058e-06, + "loss": 1.64202518, + "memory(GiB)": 117.38, + "step": 44960, + "train_speed(iter/s)": 1.638725 + }, + { + "acc": 0.66520491, + "epoch": 1.1406646372399798, + "grad_norm": 5.0, + "learning_rate": 4.2532481512112814e-06, + "loss": 1.63261967, + "memory(GiB)": 117.38, + "step": 44965, + "train_speed(iter/s)": 1.638744 + }, + { + "acc": 0.64974203, + "epoch": 1.1407914764079148, + "grad_norm": 8.8125, + "learning_rate": 4.252211304970988e-06, + "loss": 1.61925392, + "memory(GiB)": 117.38, + "step": 44970, + "train_speed(iter/s)": 1.638765 + }, + { + "acc": 0.65617852, + "epoch": 1.1409183155758498, + "grad_norm": 6.625, + "learning_rate": 4.251174491621778e-06, + "loss": 1.59221592, + "memory(GiB)": 117.38, + "step": 44975, + "train_speed(iter/s)": 1.638784 + }, + { + "acc": 0.65519772, + "epoch": 1.141045154743785, + "grad_norm": 5.15625, + "learning_rate": 4.250137711209258e-06, + "loss": 1.64379768, + "memory(GiB)": 117.38, + "step": 44980, + "train_speed(iter/s)": 1.638804 + }, + { + "acc": 0.65432749, + "epoch": 1.14117199391172, + "grad_norm": 5.875, + "learning_rate": 4.249100963779028e-06, + "loss": 1.69992599, + "memory(GiB)": 117.38, + "step": 44985, + "train_speed(iter/s)": 1.638824 + }, + { + "acc": 0.66140928, + "epoch": 1.141298833079655, + "grad_norm": 6.75, + "learning_rate": 4.248064249376692e-06, + "loss": 1.58639002, + "memory(GiB)": 117.38, + "step": 44990, + "train_speed(iter/s)": 1.638842 + }, + { + "acc": 0.66274366, + "epoch": 1.14142567224759, + "grad_norm": 5.34375, + "learning_rate": 4.2470275680478466e-06, + "loss": 1.58178024, + "memory(GiB)": 117.38, + "step": 44995, + "train_speed(iter/s)": 1.638861 + }, + { + "acc": 0.65980358, + "epoch": 1.1415525114155252, + "grad_norm": 6.53125, + "learning_rate": 4.2459909198380886e-06, + "loss": 1.5451066, + "memory(GiB)": 117.38, + "step": 45000, + "train_speed(iter/s)": 1.63888 + }, + { + "epoch": 1.1415525114155252, + "eval_acc": 0.6462634982886332, + "eval_loss": 1.5736206769943237, + "eval_runtime": 58.592, + "eval_samples_per_second": 108.718, + "eval_steps_per_second": 27.188, + "step": 45000 + }, + { + "acc": 0.67257562, + "epoch": 1.1416793505834602, + "grad_norm": 5.5625, + "learning_rate": 4.244954304793019e-06, + "loss": 1.54084663, + "memory(GiB)": 117.38, + "step": 45005, + "train_speed(iter/s)": 1.635155 + }, + { + "acc": 0.65244265, + "epoch": 1.1418061897513951, + "grad_norm": 5.625, + "learning_rate": 4.2439177229582304e-06, + "loss": 1.62374191, + "memory(GiB)": 117.38, + "step": 45010, + "train_speed(iter/s)": 1.635174 + }, + { + "acc": 0.66174579, + "epoch": 1.1419330289193304, + "grad_norm": 5.59375, + "learning_rate": 4.242881174379313e-06, + "loss": 1.59536924, + "memory(GiB)": 117.38, + "step": 45015, + "train_speed(iter/s)": 1.635192 + }, + { + "acc": 0.64588933, + "epoch": 1.1420598680872653, + "grad_norm": 6.0, + "learning_rate": 4.241844659101865e-06, + "loss": 1.60734749, + "memory(GiB)": 117.38, + "step": 45020, + "train_speed(iter/s)": 1.635212 + }, + { + "acc": 0.64297466, + "epoch": 1.1421867072552003, + "grad_norm": 4.90625, + "learning_rate": 4.240808177171472e-06, + "loss": 1.70465965, + "memory(GiB)": 117.38, + "step": 45025, + "train_speed(iter/s)": 1.635231 + }, + { + "acc": 0.65627313, + "epoch": 1.1423135464231355, + "grad_norm": 5.5625, + "learning_rate": 4.239771728633727e-06, + "loss": 1.56444607, + "memory(GiB)": 117.38, + "step": 45030, + "train_speed(iter/s)": 1.635251 + }, + { + "acc": 0.62234554, + "epoch": 1.1424403855910705, + "grad_norm": 5.0, + "learning_rate": 4.238735313534213e-06, + "loss": 1.70080528, + "memory(GiB)": 117.38, + "step": 45035, + "train_speed(iter/s)": 1.63527 + }, + { + "acc": 0.65512896, + "epoch": 1.1425672247590055, + "grad_norm": 6.25, + "learning_rate": 4.23769893191852e-06, + "loss": 1.64819946, + "memory(GiB)": 117.38, + "step": 45040, + "train_speed(iter/s)": 1.63529 + }, + { + "acc": 0.66054111, + "epoch": 1.1426940639269407, + "grad_norm": 4.96875, + "learning_rate": 4.236662583832229e-06, + "loss": 1.52770672, + "memory(GiB)": 117.38, + "step": 45045, + "train_speed(iter/s)": 1.635309 + }, + { + "acc": 0.64748535, + "epoch": 1.1428209030948757, + "grad_norm": 5.5, + "learning_rate": 4.23562626932093e-06, + "loss": 1.73409805, + "memory(GiB)": 117.38, + "step": 45050, + "train_speed(iter/s)": 1.635329 + }, + { + "acc": 0.64391332, + "epoch": 1.1429477422628107, + "grad_norm": 5.5625, + "learning_rate": 4.234589988430198e-06, + "loss": 1.63327446, + "memory(GiB)": 117.38, + "step": 45055, + "train_speed(iter/s)": 1.635349 + }, + { + "acc": 0.65088005, + "epoch": 1.1430745814307457, + "grad_norm": 5.5625, + "learning_rate": 4.233553741205615e-06, + "loss": 1.64944038, + "memory(GiB)": 117.38, + "step": 45060, + "train_speed(iter/s)": 1.635367 + }, + { + "acc": 0.64193854, + "epoch": 1.143201420598681, + "grad_norm": 6.0, + "learning_rate": 4.2325175276927614e-06, + "loss": 1.59619846, + "memory(GiB)": 117.38, + "step": 45065, + "train_speed(iter/s)": 1.635388 + }, + { + "acc": 0.6534915, + "epoch": 1.143328259766616, + "grad_norm": 6.0625, + "learning_rate": 4.231481347937214e-06, + "loss": 1.56089849, + "memory(GiB)": 117.38, + "step": 45070, + "train_speed(iter/s)": 1.635407 + }, + { + "acc": 0.65843577, + "epoch": 1.143455098934551, + "grad_norm": 5.53125, + "learning_rate": 4.230445201984547e-06, + "loss": 1.54402351, + "memory(GiB)": 117.38, + "step": 45075, + "train_speed(iter/s)": 1.635428 + }, + { + "acc": 0.65084324, + "epoch": 1.143581938102486, + "grad_norm": 5.03125, + "learning_rate": 4.229409089880336e-06, + "loss": 1.61615219, + "memory(GiB)": 117.38, + "step": 45080, + "train_speed(iter/s)": 1.635449 + }, + { + "acc": 0.64929686, + "epoch": 1.143708777270421, + "grad_norm": 6.15625, + "learning_rate": 4.2283730116701535e-06, + "loss": 1.64138031, + "memory(GiB)": 117.38, + "step": 45085, + "train_speed(iter/s)": 1.635469 + }, + { + "acc": 0.66016579, + "epoch": 1.143835616438356, + "grad_norm": 6.25, + "learning_rate": 4.227336967399573e-06, + "loss": 1.61218834, + "memory(GiB)": 117.38, + "step": 45090, + "train_speed(iter/s)": 1.63549 + }, + { + "acc": 0.66132369, + "epoch": 1.1439624556062913, + "grad_norm": 5.46875, + "learning_rate": 4.2263009571141585e-06, + "loss": 1.5713686, + "memory(GiB)": 117.38, + "step": 45095, + "train_speed(iter/s)": 1.63551 + }, + { + "acc": 0.66328506, + "epoch": 1.1440892947742263, + "grad_norm": 5.21875, + "learning_rate": 4.225264980859485e-06, + "loss": 1.57846813, + "memory(GiB)": 117.38, + "step": 45100, + "train_speed(iter/s)": 1.635531 + }, + { + "acc": 0.64998884, + "epoch": 1.1442161339421613, + "grad_norm": 5.8125, + "learning_rate": 4.224229038681115e-06, + "loss": 1.66005554, + "memory(GiB)": 117.38, + "step": 45105, + "train_speed(iter/s)": 1.635552 + }, + { + "acc": 0.64826536, + "epoch": 1.1443429731100965, + "grad_norm": 5.3125, + "learning_rate": 4.223193130624619e-06, + "loss": 1.65065155, + "memory(GiB)": 117.38, + "step": 45110, + "train_speed(iter/s)": 1.635571 + }, + { + "acc": 0.67079296, + "epoch": 1.1444698122780315, + "grad_norm": 5.4375, + "learning_rate": 4.222157256735553e-06, + "loss": 1.50241985, + "memory(GiB)": 117.38, + "step": 45115, + "train_speed(iter/s)": 1.635589 + }, + { + "acc": 0.66567268, + "epoch": 1.1445966514459665, + "grad_norm": 6.1875, + "learning_rate": 4.2211214170594865e-06, + "loss": 1.59095192, + "memory(GiB)": 117.38, + "step": 45120, + "train_speed(iter/s)": 1.63561 + }, + { + "acc": 0.65066247, + "epoch": 1.1447234906139017, + "grad_norm": 8.1875, + "learning_rate": 4.220085611641976e-06, + "loss": 1.6233942, + "memory(GiB)": 117.38, + "step": 45125, + "train_speed(iter/s)": 1.635629 + }, + { + "acc": 0.66051474, + "epoch": 1.1448503297818367, + "grad_norm": 5.125, + "learning_rate": 4.2190498405285826e-06, + "loss": 1.64969177, + "memory(GiB)": 117.38, + "step": 45130, + "train_speed(iter/s)": 1.635648 + }, + { + "acc": 0.66293893, + "epoch": 1.1449771689497716, + "grad_norm": 5.75, + "learning_rate": 4.218014103764865e-06, + "loss": 1.57484598, + "memory(GiB)": 117.38, + "step": 45135, + "train_speed(iter/s)": 1.635668 + }, + { + "acc": 0.63383932, + "epoch": 1.1451040081177069, + "grad_norm": 5.53125, + "learning_rate": 4.216978401396376e-06, + "loss": 1.65815392, + "memory(GiB)": 117.38, + "step": 45140, + "train_speed(iter/s)": 1.635687 + }, + { + "acc": 0.64647589, + "epoch": 1.1452308472856418, + "grad_norm": 6.3125, + "learning_rate": 4.215942733468675e-06, + "loss": 1.63710175, + "memory(GiB)": 117.38, + "step": 45145, + "train_speed(iter/s)": 1.635709 + }, + { + "acc": 0.67577429, + "epoch": 1.1453576864535768, + "grad_norm": 6.09375, + "learning_rate": 4.2149071000273134e-06, + "loss": 1.49446564, + "memory(GiB)": 117.38, + "step": 45150, + "train_speed(iter/s)": 1.635729 + }, + { + "acc": 0.65749898, + "epoch": 1.1454845256215118, + "grad_norm": 7.15625, + "learning_rate": 4.213871501117842e-06, + "loss": 1.54127226, + "memory(GiB)": 117.38, + "step": 45155, + "train_speed(iter/s)": 1.635749 + }, + { + "acc": 0.64806232, + "epoch": 1.145611364789447, + "grad_norm": 5.625, + "learning_rate": 4.212835936785811e-06, + "loss": 1.69376068, + "memory(GiB)": 117.38, + "step": 45160, + "train_speed(iter/s)": 1.635769 + }, + { + "acc": 0.671523, + "epoch": 1.145738203957382, + "grad_norm": 6.03125, + "learning_rate": 4.21180040707677e-06, + "loss": 1.51248293, + "memory(GiB)": 117.38, + "step": 45165, + "train_speed(iter/s)": 1.63579 + }, + { + "acc": 0.6596693, + "epoch": 1.145865043125317, + "grad_norm": 5.4375, + "learning_rate": 4.2107649120362684e-06, + "loss": 1.64692001, + "memory(GiB)": 117.38, + "step": 45170, + "train_speed(iter/s)": 1.63581 + }, + { + "acc": 0.64200983, + "epoch": 1.1459918822932522, + "grad_norm": 4.71875, + "learning_rate": 4.2097294517098465e-06, + "loss": 1.65302429, + "memory(GiB)": 117.38, + "step": 45175, + "train_speed(iter/s)": 1.635829 + }, + { + "acc": 0.65853386, + "epoch": 1.1461187214611872, + "grad_norm": 7.09375, + "learning_rate": 4.208694026143054e-06, + "loss": 1.66464615, + "memory(GiB)": 117.38, + "step": 45180, + "train_speed(iter/s)": 1.635843 + }, + { + "acc": 0.66068316, + "epoch": 1.1462455606291222, + "grad_norm": 6.46875, + "learning_rate": 4.2076586353814295e-06, + "loss": 1.68183842, + "memory(GiB)": 117.38, + "step": 45185, + "train_speed(iter/s)": 1.635862 + }, + { + "acc": 0.66107907, + "epoch": 1.1463723997970574, + "grad_norm": 5.25, + "learning_rate": 4.2066232794705174e-06, + "loss": 1.58206558, + "memory(GiB)": 117.38, + "step": 45190, + "train_speed(iter/s)": 1.635881 + }, + { + "acc": 0.64624529, + "epoch": 1.1464992389649924, + "grad_norm": 5.65625, + "learning_rate": 4.205587958455854e-06, + "loss": 1.60086403, + "memory(GiB)": 117.38, + "step": 45195, + "train_speed(iter/s)": 1.635902 + }, + { + "acc": 0.65240498, + "epoch": 1.1466260781329274, + "grad_norm": 5.34375, + "learning_rate": 4.204552672382981e-06, + "loss": 1.65291519, + "memory(GiB)": 117.38, + "step": 45200, + "train_speed(iter/s)": 1.635922 + }, + { + "acc": 0.65476499, + "epoch": 1.1467529173008626, + "grad_norm": 5.03125, + "learning_rate": 4.203517421297431e-06, + "loss": 1.5894145, + "memory(GiB)": 117.38, + "step": 45205, + "train_speed(iter/s)": 1.635943 + }, + { + "acc": 0.64782419, + "epoch": 1.1468797564687976, + "grad_norm": 5.40625, + "learning_rate": 4.202482205244742e-06, + "loss": 1.57690372, + "memory(GiB)": 117.38, + "step": 45210, + "train_speed(iter/s)": 1.635963 + }, + { + "acc": 0.67050691, + "epoch": 1.1470065956367326, + "grad_norm": 6.78125, + "learning_rate": 4.201447024270446e-06, + "loss": 1.55274944, + "memory(GiB)": 117.38, + "step": 45215, + "train_speed(iter/s)": 1.635984 + }, + { + "acc": 0.65082684, + "epoch": 1.1471334348046676, + "grad_norm": 5.375, + "learning_rate": 4.200411878420074e-06, + "loss": 1.65268574, + "memory(GiB)": 117.38, + "step": 45220, + "train_speed(iter/s)": 1.636004 + }, + { + "acc": 0.64993443, + "epoch": 1.1472602739726028, + "grad_norm": 6.875, + "learning_rate": 4.199376767739158e-06, + "loss": 1.58830147, + "memory(GiB)": 117.38, + "step": 45225, + "train_speed(iter/s)": 1.636024 + }, + { + "acc": 0.67012959, + "epoch": 1.1473871131405378, + "grad_norm": 6.3125, + "learning_rate": 4.1983416922732276e-06, + "loss": 1.56773262, + "memory(GiB)": 117.38, + "step": 45230, + "train_speed(iter/s)": 1.636043 + }, + { + "acc": 0.65392261, + "epoch": 1.147513952308473, + "grad_norm": 5.25, + "learning_rate": 4.197306652067807e-06, + "loss": 1.54658403, + "memory(GiB)": 117.38, + "step": 45235, + "train_speed(iter/s)": 1.636063 + }, + { + "acc": 0.66988049, + "epoch": 1.147640791476408, + "grad_norm": 6.25, + "learning_rate": 4.196271647168425e-06, + "loss": 1.52350998, + "memory(GiB)": 117.38, + "step": 45240, + "train_speed(iter/s)": 1.636083 + }, + { + "acc": 0.63933067, + "epoch": 1.147767630644343, + "grad_norm": 5.71875, + "learning_rate": 4.195236677620604e-06, + "loss": 1.66917629, + "memory(GiB)": 117.38, + "step": 45245, + "train_speed(iter/s)": 1.636104 + }, + { + "acc": 0.67018023, + "epoch": 1.147894469812278, + "grad_norm": 6.90625, + "learning_rate": 4.1942017434698675e-06, + "loss": 1.46641054, + "memory(GiB)": 117.38, + "step": 45250, + "train_speed(iter/s)": 1.636124 + }, + { + "acc": 0.65379639, + "epoch": 1.1480213089802132, + "grad_norm": 4.96875, + "learning_rate": 4.1931668447617346e-06, + "loss": 1.57654285, + "memory(GiB)": 117.38, + "step": 45255, + "train_speed(iter/s)": 1.636143 + }, + { + "acc": 0.66897969, + "epoch": 1.1481481481481481, + "grad_norm": 6.0, + "learning_rate": 4.192131981541727e-06, + "loss": 1.64674473, + "memory(GiB)": 117.38, + "step": 45260, + "train_speed(iter/s)": 1.636163 + }, + { + "acc": 0.65962706, + "epoch": 1.1482749873160831, + "grad_norm": 6.84375, + "learning_rate": 4.19109715385536e-06, + "loss": 1.58185654, + "memory(GiB)": 117.38, + "step": 45265, + "train_speed(iter/s)": 1.636183 + }, + { + "acc": 0.63951559, + "epoch": 1.1484018264840183, + "grad_norm": 5.25, + "learning_rate": 4.190062361748154e-06, + "loss": 1.61705322, + "memory(GiB)": 117.38, + "step": 45270, + "train_speed(iter/s)": 1.636203 + }, + { + "acc": 0.65071244, + "epoch": 1.1485286656519533, + "grad_norm": 7.34375, + "learning_rate": 4.189027605265621e-06, + "loss": 1.6297657, + "memory(GiB)": 117.38, + "step": 45275, + "train_speed(iter/s)": 1.636223 + }, + { + "acc": 0.65401368, + "epoch": 1.1486555048198883, + "grad_norm": 6.84375, + "learning_rate": 4.187992884453273e-06, + "loss": 1.61136131, + "memory(GiB)": 117.38, + "step": 45280, + "train_speed(iter/s)": 1.636243 + }, + { + "acc": 0.65580549, + "epoch": 1.1487823439878235, + "grad_norm": 5.125, + "learning_rate": 4.186958199356624e-06, + "loss": 1.61955166, + "memory(GiB)": 117.38, + "step": 45285, + "train_speed(iter/s)": 1.636264 + }, + { + "acc": 0.64785986, + "epoch": 1.1489091831557585, + "grad_norm": 7.46875, + "learning_rate": 4.185923550021185e-06, + "loss": 1.63833656, + "memory(GiB)": 117.38, + "step": 45290, + "train_speed(iter/s)": 1.636284 + }, + { + "acc": 0.65836873, + "epoch": 1.1490360223236935, + "grad_norm": 4.75, + "learning_rate": 4.1848889364924625e-06, + "loss": 1.58048573, + "memory(GiB)": 117.38, + "step": 45295, + "train_speed(iter/s)": 1.636303 + }, + { + "acc": 0.65235901, + "epoch": 1.1491628614916287, + "grad_norm": 6.34375, + "learning_rate": 4.183854358815962e-06, + "loss": 1.65781555, + "memory(GiB)": 117.38, + "step": 45300, + "train_speed(iter/s)": 1.636322 + }, + { + "acc": 0.6562737, + "epoch": 1.1492897006595637, + "grad_norm": 6.59375, + "learning_rate": 4.182819817037192e-06, + "loss": 1.58850946, + "memory(GiB)": 117.38, + "step": 45305, + "train_speed(iter/s)": 1.636343 + }, + { + "acc": 0.65125065, + "epoch": 1.1494165398274987, + "grad_norm": 5.71875, + "learning_rate": 4.181785311201655e-06, + "loss": 1.62749519, + "memory(GiB)": 117.38, + "step": 45310, + "train_speed(iter/s)": 1.636364 + }, + { + "acc": 0.65248361, + "epoch": 1.1495433789954337, + "grad_norm": 5.4375, + "learning_rate": 4.1807508413548515e-06, + "loss": 1.64515991, + "memory(GiB)": 117.38, + "step": 45315, + "train_speed(iter/s)": 1.636383 + }, + { + "acc": 0.65627723, + "epoch": 1.149670218163369, + "grad_norm": 6.6875, + "learning_rate": 4.179716407542285e-06, + "loss": 1.56477146, + "memory(GiB)": 117.38, + "step": 45320, + "train_speed(iter/s)": 1.636403 + }, + { + "acc": 0.64873562, + "epoch": 1.1497970573313039, + "grad_norm": 5.625, + "learning_rate": 4.178682009809452e-06, + "loss": 1.64755135, + "memory(GiB)": 117.38, + "step": 45325, + "train_speed(iter/s)": 1.636425 + }, + { + "acc": 0.65606441, + "epoch": 1.1499238964992389, + "grad_norm": 4.75, + "learning_rate": 4.177647648201854e-06, + "loss": 1.59035797, + "memory(GiB)": 117.38, + "step": 45330, + "train_speed(iter/s)": 1.636446 + }, + { + "acc": 0.64957151, + "epoch": 1.150050735667174, + "grad_norm": 5.4375, + "learning_rate": 4.1766133227649815e-06, + "loss": 1.58410587, + "memory(GiB)": 117.38, + "step": 45335, + "train_speed(iter/s)": 1.636466 + }, + { + "acc": 0.66110668, + "epoch": 1.150177574835109, + "grad_norm": 5.53125, + "learning_rate": 4.175579033544332e-06, + "loss": 1.55576954, + "memory(GiB)": 117.38, + "step": 45340, + "train_speed(iter/s)": 1.636487 + }, + { + "acc": 0.64153652, + "epoch": 1.150304414003044, + "grad_norm": 5.78125, + "learning_rate": 4.174544780585395e-06, + "loss": 1.61314507, + "memory(GiB)": 117.38, + "step": 45345, + "train_speed(iter/s)": 1.636508 + }, + { + "acc": 0.64415364, + "epoch": 1.1504312531709793, + "grad_norm": 10.3125, + "learning_rate": 4.1735105639336686e-06, + "loss": 1.71888123, + "memory(GiB)": 117.38, + "step": 45350, + "train_speed(iter/s)": 1.636528 + }, + { + "acc": 0.6536171, + "epoch": 1.1505580923389143, + "grad_norm": 5.28125, + "learning_rate": 4.172476383634635e-06, + "loss": 1.64055214, + "memory(GiB)": 117.38, + "step": 45355, + "train_speed(iter/s)": 1.636548 + }, + { + "acc": 0.65930958, + "epoch": 1.1506849315068493, + "grad_norm": 6.03125, + "learning_rate": 4.171442239733783e-06, + "loss": 1.67506371, + "memory(GiB)": 117.38, + "step": 45360, + "train_speed(iter/s)": 1.636567 + }, + { + "acc": 0.64076943, + "epoch": 1.1508117706747845, + "grad_norm": 5.625, + "learning_rate": 4.170408132276603e-06, + "loss": 1.68986702, + "memory(GiB)": 117.38, + "step": 45365, + "train_speed(iter/s)": 1.636586 + }, + { + "acc": 0.64943233, + "epoch": 1.1509386098427195, + "grad_norm": 7.53125, + "learning_rate": 4.1693740613085776e-06, + "loss": 1.6353241, + "memory(GiB)": 117.38, + "step": 45370, + "train_speed(iter/s)": 1.636606 + }, + { + "acc": 0.67001419, + "epoch": 1.1510654490106544, + "grad_norm": 5.59375, + "learning_rate": 4.168340026875188e-06, + "loss": 1.55925875, + "memory(GiB)": 117.38, + "step": 45375, + "train_speed(iter/s)": 1.636628 + }, + { + "acc": 0.65950322, + "epoch": 1.1511922881785894, + "grad_norm": 4.9375, + "learning_rate": 4.167306029021917e-06, + "loss": 1.59508457, + "memory(GiB)": 117.38, + "step": 45380, + "train_speed(iter/s)": 1.636649 + }, + { + "acc": 0.65052991, + "epoch": 1.1513191273465246, + "grad_norm": 6.375, + "learning_rate": 4.166272067794246e-06, + "loss": 1.6451601, + "memory(GiB)": 117.38, + "step": 45385, + "train_speed(iter/s)": 1.636669 + }, + { + "acc": 0.64905534, + "epoch": 1.1514459665144596, + "grad_norm": 4.96875, + "learning_rate": 4.165238143237651e-06, + "loss": 1.64910812, + "memory(GiB)": 117.38, + "step": 45390, + "train_speed(iter/s)": 1.63669 + }, + { + "acc": 0.64131794, + "epoch": 1.1515728056823948, + "grad_norm": 5.125, + "learning_rate": 4.164204255397608e-06, + "loss": 1.61529942, + "memory(GiB)": 117.38, + "step": 45395, + "train_speed(iter/s)": 1.63671 + }, + { + "acc": 0.64460678, + "epoch": 1.1516996448503298, + "grad_norm": 4.75, + "learning_rate": 4.163170404319596e-06, + "loss": 1.61353378, + "memory(GiB)": 117.38, + "step": 45400, + "train_speed(iter/s)": 1.636731 + }, + { + "acc": 0.66338181, + "epoch": 1.1518264840182648, + "grad_norm": 5.375, + "learning_rate": 4.1621365900490825e-06, + "loss": 1.61284351, + "memory(GiB)": 117.38, + "step": 45405, + "train_speed(iter/s)": 1.636752 + }, + { + "acc": 0.66150312, + "epoch": 1.1519533231861998, + "grad_norm": 6.4375, + "learning_rate": 4.1611028126315455e-06, + "loss": 1.56076374, + "memory(GiB)": 117.38, + "step": 45410, + "train_speed(iter/s)": 1.636772 + }, + { + "acc": 0.6500288, + "epoch": 1.152080162354135, + "grad_norm": 5.5625, + "learning_rate": 4.160069072112451e-06, + "loss": 1.59573555, + "memory(GiB)": 117.38, + "step": 45415, + "train_speed(iter/s)": 1.636792 + }, + { + "acc": 0.66158094, + "epoch": 1.15220700152207, + "grad_norm": 4.78125, + "learning_rate": 4.1590353685372695e-06, + "loss": 1.53414373, + "memory(GiB)": 117.38, + "step": 45420, + "train_speed(iter/s)": 1.636813 + }, + { + "acc": 0.66660585, + "epoch": 1.152333840690005, + "grad_norm": 6.5, + "learning_rate": 4.158001701951465e-06, + "loss": 1.52418327, + "memory(GiB)": 117.38, + "step": 45425, + "train_speed(iter/s)": 1.636832 + }, + { + "acc": 0.64957085, + "epoch": 1.1524606798579402, + "grad_norm": 5.9375, + "learning_rate": 4.156968072400508e-06, + "loss": 1.62925873, + "memory(GiB)": 117.38, + "step": 45430, + "train_speed(iter/s)": 1.636853 + }, + { + "acc": 0.65010386, + "epoch": 1.1525875190258752, + "grad_norm": 6.40625, + "learning_rate": 4.155934479929858e-06, + "loss": 1.62804775, + "memory(GiB)": 117.38, + "step": 45435, + "train_speed(iter/s)": 1.636873 + }, + { + "acc": 0.68676844, + "epoch": 1.1527143581938102, + "grad_norm": 9.25, + "learning_rate": 4.154900924584976e-06, + "loss": 1.46937981, + "memory(GiB)": 117.38, + "step": 45440, + "train_speed(iter/s)": 1.636893 + }, + { + "acc": 0.64122305, + "epoch": 1.1528411973617454, + "grad_norm": 5.3125, + "learning_rate": 4.153867406411327e-06, + "loss": 1.68359184, + "memory(GiB)": 117.38, + "step": 45445, + "train_speed(iter/s)": 1.636914 + }, + { + "acc": 0.66693935, + "epoch": 1.1529680365296804, + "grad_norm": 6.53125, + "learning_rate": 4.152833925454367e-06, + "loss": 1.53952808, + "memory(GiB)": 117.38, + "step": 45450, + "train_speed(iter/s)": 1.636934 + }, + { + "acc": 0.66180687, + "epoch": 1.1530948756976154, + "grad_norm": 5.21875, + "learning_rate": 4.1518004817595515e-06, + "loss": 1.54953289, + "memory(GiB)": 117.38, + "step": 45455, + "train_speed(iter/s)": 1.636952 + }, + { + "acc": 0.63008494, + "epoch": 1.1532217148655506, + "grad_norm": 7.21875, + "learning_rate": 4.150767075372338e-06, + "loss": 1.70316315, + "memory(GiB)": 117.38, + "step": 45460, + "train_speed(iter/s)": 1.636973 + }, + { + "acc": 0.65845613, + "epoch": 1.1533485540334856, + "grad_norm": 6.125, + "learning_rate": 4.149733706338182e-06, + "loss": 1.55905695, + "memory(GiB)": 117.38, + "step": 45465, + "train_speed(iter/s)": 1.636993 + }, + { + "acc": 0.65770588, + "epoch": 1.1534753932014206, + "grad_norm": 5.875, + "learning_rate": 4.148700374702533e-06, + "loss": 1.58927193, + "memory(GiB)": 117.38, + "step": 45470, + "train_speed(iter/s)": 1.637011 + }, + { + "acc": 0.66058807, + "epoch": 1.1536022323693556, + "grad_norm": 5.0, + "learning_rate": 4.147667080510841e-06, + "loss": 1.58783236, + "memory(GiB)": 117.38, + "step": 45475, + "train_speed(iter/s)": 1.637031 + }, + { + "acc": 0.6663022, + "epoch": 1.1537290715372908, + "grad_norm": 5.25, + "learning_rate": 4.146633823808557e-06, + "loss": 1.54864883, + "memory(GiB)": 117.38, + "step": 45480, + "train_speed(iter/s)": 1.637051 + }, + { + "acc": 0.64306774, + "epoch": 1.1538559107052258, + "grad_norm": 5.875, + "learning_rate": 4.145600604641127e-06, + "loss": 1.62109756, + "memory(GiB)": 117.38, + "step": 45485, + "train_speed(iter/s)": 1.637071 + }, + { + "acc": 0.66459589, + "epoch": 1.1539827498731607, + "grad_norm": 6.375, + "learning_rate": 4.1445674230539985e-06, + "loss": 1.58660212, + "memory(GiB)": 117.38, + "step": 45490, + "train_speed(iter/s)": 1.63709 + }, + { + "acc": 0.6673192, + "epoch": 1.154109589041096, + "grad_norm": 6.25, + "learning_rate": 4.143534279092613e-06, + "loss": 1.58046036, + "memory(GiB)": 117.38, + "step": 45495, + "train_speed(iter/s)": 1.63711 + }, + { + "acc": 0.65245991, + "epoch": 1.154236428209031, + "grad_norm": 6.09375, + "learning_rate": 4.142501172802412e-06, + "loss": 1.68155785, + "memory(GiB)": 117.38, + "step": 45500, + "train_speed(iter/s)": 1.637129 + }, + { + "acc": 0.64704008, + "epoch": 1.154363267376966, + "grad_norm": 5.9375, + "learning_rate": 4.14146810422884e-06, + "loss": 1.65030556, + "memory(GiB)": 117.38, + "step": 45505, + "train_speed(iter/s)": 1.637148 + }, + { + "acc": 0.65180693, + "epoch": 1.1544901065449011, + "grad_norm": 7.0, + "learning_rate": 4.140435073417335e-06, + "loss": 1.57812862, + "memory(GiB)": 117.38, + "step": 45510, + "train_speed(iter/s)": 1.637168 + }, + { + "acc": 0.65421929, + "epoch": 1.1546169457128361, + "grad_norm": 5.125, + "learning_rate": 4.139402080413331e-06, + "loss": 1.63333874, + "memory(GiB)": 117.38, + "step": 45515, + "train_speed(iter/s)": 1.637187 + }, + { + "acc": 0.66482153, + "epoch": 1.1547437848807711, + "grad_norm": 5.5, + "learning_rate": 4.138369125262266e-06, + "loss": 1.60764771, + "memory(GiB)": 117.38, + "step": 45520, + "train_speed(iter/s)": 1.637208 + }, + { + "acc": 0.65694504, + "epoch": 1.1548706240487063, + "grad_norm": 5.53125, + "learning_rate": 4.137336208009574e-06, + "loss": 1.59771414, + "memory(GiB)": 117.38, + "step": 45525, + "train_speed(iter/s)": 1.637227 + }, + { + "acc": 0.64919019, + "epoch": 1.1549974632166413, + "grad_norm": 5.59375, + "learning_rate": 4.136303328700688e-06, + "loss": 1.64983845, + "memory(GiB)": 117.38, + "step": 45530, + "train_speed(iter/s)": 1.637248 + }, + { + "acc": 0.63951387, + "epoch": 1.1551243023845763, + "grad_norm": 5.78125, + "learning_rate": 4.135270487381037e-06, + "loss": 1.66697044, + "memory(GiB)": 117.38, + "step": 45535, + "train_speed(iter/s)": 1.637269 + }, + { + "acc": 0.66064291, + "epoch": 1.1552511415525113, + "grad_norm": 5.15625, + "learning_rate": 4.13423768409605e-06, + "loss": 1.56094532, + "memory(GiB)": 117.38, + "step": 45540, + "train_speed(iter/s)": 1.63729 + }, + { + "acc": 0.6514194, + "epoch": 1.1553779807204465, + "grad_norm": 5.21875, + "learning_rate": 4.133204918891155e-06, + "loss": 1.63216496, + "memory(GiB)": 117.38, + "step": 45545, + "train_speed(iter/s)": 1.63731 + }, + { + "acc": 0.65630569, + "epoch": 1.1555048198883815, + "grad_norm": 6.21875, + "learning_rate": 4.132172191811781e-06, + "loss": 1.63021584, + "memory(GiB)": 117.38, + "step": 45550, + "train_speed(iter/s)": 1.63733 + }, + { + "acc": 0.65527201, + "epoch": 1.1556316590563167, + "grad_norm": 5.21875, + "learning_rate": 4.131139502903345e-06, + "loss": 1.61451988, + "memory(GiB)": 117.38, + "step": 45555, + "train_speed(iter/s)": 1.63735 + }, + { + "acc": 0.64712048, + "epoch": 1.1557584982242517, + "grad_norm": 5.125, + "learning_rate": 4.130106852211273e-06, + "loss": 1.64628601, + "memory(GiB)": 117.38, + "step": 45560, + "train_speed(iter/s)": 1.63737 + }, + { + "acc": 0.65945153, + "epoch": 1.1558853373921867, + "grad_norm": 5.6875, + "learning_rate": 4.129074239780986e-06, + "loss": 1.55537014, + "memory(GiB)": 117.38, + "step": 45565, + "train_speed(iter/s)": 1.63739 + }, + { + "acc": 0.67395611, + "epoch": 1.1560121765601217, + "grad_norm": 5.9375, + "learning_rate": 4.128041665657903e-06, + "loss": 1.58240671, + "memory(GiB)": 117.38, + "step": 45570, + "train_speed(iter/s)": 1.63741 + }, + { + "acc": 0.66048293, + "epoch": 1.1561390157280569, + "grad_norm": 5.5, + "learning_rate": 4.127009129887441e-06, + "loss": 1.58171101, + "memory(GiB)": 117.38, + "step": 45575, + "train_speed(iter/s)": 1.637431 + }, + { + "acc": 0.6499372, + "epoch": 1.1562658548959919, + "grad_norm": 5.65625, + "learning_rate": 4.125976632515013e-06, + "loss": 1.60724125, + "memory(GiB)": 117.38, + "step": 45580, + "train_speed(iter/s)": 1.637452 + }, + { + "acc": 0.65003896, + "epoch": 1.1563926940639269, + "grad_norm": 5.34375, + "learning_rate": 4.124944173586036e-06, + "loss": 1.62497673, + "memory(GiB)": 117.38, + "step": 45585, + "train_speed(iter/s)": 1.637471 + }, + { + "acc": 0.64358754, + "epoch": 1.156519533231862, + "grad_norm": 6.09375, + "learning_rate": 4.123911753145922e-06, + "loss": 1.64446964, + "memory(GiB)": 117.38, + "step": 45590, + "train_speed(iter/s)": 1.637492 + }, + { + "acc": 0.66175175, + "epoch": 1.156646372399797, + "grad_norm": 7.25, + "learning_rate": 4.12287937124008e-06, + "loss": 1.61964569, + "memory(GiB)": 117.38, + "step": 45595, + "train_speed(iter/s)": 1.637513 + }, + { + "acc": 0.65238795, + "epoch": 1.156773211567732, + "grad_norm": 6.78125, + "learning_rate": 4.121847027913918e-06, + "loss": 1.67598457, + "memory(GiB)": 117.38, + "step": 45600, + "train_speed(iter/s)": 1.637535 + }, + { + "acc": 0.66461244, + "epoch": 1.1569000507356673, + "grad_norm": 6.09375, + "learning_rate": 4.1208147232128456e-06, + "loss": 1.58861532, + "memory(GiB)": 117.38, + "step": 45605, + "train_speed(iter/s)": 1.637555 + }, + { + "acc": 0.64696674, + "epoch": 1.1570268899036023, + "grad_norm": 5.71875, + "learning_rate": 4.119782457182267e-06, + "loss": 1.65146809, + "memory(GiB)": 117.38, + "step": 45610, + "train_speed(iter/s)": 1.637575 + }, + { + "acc": 0.6603157, + "epoch": 1.1571537290715372, + "grad_norm": 5.625, + "learning_rate": 4.118750229867585e-06, + "loss": 1.59788628, + "memory(GiB)": 117.38, + "step": 45615, + "train_speed(iter/s)": 1.637596 + }, + { + "acc": 0.64708014, + "epoch": 1.1572805682394725, + "grad_norm": 6.125, + "learning_rate": 4.117718041314204e-06, + "loss": 1.63051414, + "memory(GiB)": 117.38, + "step": 45620, + "train_speed(iter/s)": 1.637617 + }, + { + "acc": 0.65227494, + "epoch": 1.1574074074074074, + "grad_norm": 5.28125, + "learning_rate": 4.11668589156752e-06, + "loss": 1.6279808, + "memory(GiB)": 117.38, + "step": 45625, + "train_speed(iter/s)": 1.637638 + }, + { + "acc": 0.67124414, + "epoch": 1.1575342465753424, + "grad_norm": 5.6875, + "learning_rate": 4.115653780672937e-06, + "loss": 1.53081388, + "memory(GiB)": 117.38, + "step": 45630, + "train_speed(iter/s)": 1.637658 + }, + { + "acc": 0.65506029, + "epoch": 1.1576610857432774, + "grad_norm": 6.09375, + "learning_rate": 4.1146217086758475e-06, + "loss": 1.60845337, + "memory(GiB)": 117.38, + "step": 45635, + "train_speed(iter/s)": 1.637678 + }, + { + "acc": 0.67593899, + "epoch": 1.1577879249112126, + "grad_norm": 6.03125, + "learning_rate": 4.113589675621649e-06, + "loss": 1.50124655, + "memory(GiB)": 117.38, + "step": 45640, + "train_speed(iter/s)": 1.637698 + }, + { + "acc": 0.66740408, + "epoch": 1.1579147640791476, + "grad_norm": 4.8125, + "learning_rate": 4.112557681555733e-06, + "loss": 1.5421032, + "memory(GiB)": 117.38, + "step": 45645, + "train_speed(iter/s)": 1.637717 + }, + { + "acc": 0.65874996, + "epoch": 1.1580416032470826, + "grad_norm": 5.75, + "learning_rate": 4.111525726523494e-06, + "loss": 1.6162529, + "memory(GiB)": 117.38, + "step": 45650, + "train_speed(iter/s)": 1.637738 + }, + { + "acc": 0.66456437, + "epoch": 1.1581684424150178, + "grad_norm": 4.71875, + "learning_rate": 4.110493810570319e-06, + "loss": 1.50876808, + "memory(GiB)": 117.38, + "step": 45655, + "train_speed(iter/s)": 1.637758 + }, + { + "acc": 0.6542655, + "epoch": 1.1582952815829528, + "grad_norm": 6.59375, + "learning_rate": 4.109461933741598e-06, + "loss": 1.63389492, + "memory(GiB)": 117.38, + "step": 45660, + "train_speed(iter/s)": 1.637778 + }, + { + "acc": 0.64341602, + "epoch": 1.1584221207508878, + "grad_norm": 7.0625, + "learning_rate": 4.108430096082716e-06, + "loss": 1.66366501, + "memory(GiB)": 117.38, + "step": 45665, + "train_speed(iter/s)": 1.637799 + }, + { + "acc": 0.65244751, + "epoch": 1.158548959918823, + "grad_norm": 5.3125, + "learning_rate": 4.107398297639062e-06, + "loss": 1.63552132, + "memory(GiB)": 117.38, + "step": 45670, + "train_speed(iter/s)": 1.637818 + }, + { + "acc": 0.64832549, + "epoch": 1.158675799086758, + "grad_norm": 5.875, + "learning_rate": 4.106366538456013e-06, + "loss": 1.64582176, + "memory(GiB)": 117.38, + "step": 45675, + "train_speed(iter/s)": 1.63784 + }, + { + "acc": 0.67131143, + "epoch": 1.158802638254693, + "grad_norm": 5.375, + "learning_rate": 4.105334818578954e-06, + "loss": 1.53551464, + "memory(GiB)": 117.38, + "step": 45680, + "train_speed(iter/s)": 1.637861 + }, + { + "acc": 0.65744576, + "epoch": 1.1589294774226282, + "grad_norm": 5.4375, + "learning_rate": 4.104303138053265e-06, + "loss": 1.63645992, + "memory(GiB)": 117.38, + "step": 45685, + "train_speed(iter/s)": 1.637882 + }, + { + "acc": 0.67203264, + "epoch": 1.1590563165905632, + "grad_norm": 7.0, + "learning_rate": 4.103271496924323e-06, + "loss": 1.55208626, + "memory(GiB)": 117.38, + "step": 45690, + "train_speed(iter/s)": 1.637902 + }, + { + "acc": 0.67174592, + "epoch": 1.1591831557584982, + "grad_norm": 8.25, + "learning_rate": 4.102239895237503e-06, + "loss": 1.63320484, + "memory(GiB)": 117.38, + "step": 45695, + "train_speed(iter/s)": 1.637923 + }, + { + "acc": 0.67263622, + "epoch": 1.1593099949264332, + "grad_norm": 5.59375, + "learning_rate": 4.101208333038181e-06, + "loss": 1.54915209, + "memory(GiB)": 117.38, + "step": 45700, + "train_speed(iter/s)": 1.637943 + }, + { + "acc": 0.65530787, + "epoch": 1.1594368340943684, + "grad_norm": 6.78125, + "learning_rate": 4.1001768103717285e-06, + "loss": 1.62030373, + "memory(GiB)": 117.38, + "step": 45705, + "train_speed(iter/s)": 1.637962 + }, + { + "acc": 0.64375172, + "epoch": 1.1595636732623034, + "grad_norm": 6.21875, + "learning_rate": 4.09914532728352e-06, + "loss": 1.54258194, + "memory(GiB)": 117.38, + "step": 45710, + "train_speed(iter/s)": 1.637982 + }, + { + "acc": 0.64644041, + "epoch": 1.1596905124302386, + "grad_norm": 6.125, + "learning_rate": 4.09811388381892e-06, + "loss": 1.65020676, + "memory(GiB)": 117.38, + "step": 45715, + "train_speed(iter/s)": 1.638001 + }, + { + "acc": 0.66290736, + "epoch": 1.1598173515981736, + "grad_norm": 6.78125, + "learning_rate": 4.097082480023298e-06, + "loss": 1.62647362, + "memory(GiB)": 117.38, + "step": 45720, + "train_speed(iter/s)": 1.638021 + }, + { + "acc": 0.63748798, + "epoch": 1.1599441907661086, + "grad_norm": 5.46875, + "learning_rate": 4.09605111594202e-06, + "loss": 1.67567177, + "memory(GiB)": 117.38, + "step": 45725, + "train_speed(iter/s)": 1.638041 + }, + { + "acc": 0.65290003, + "epoch": 1.1600710299340435, + "grad_norm": 5.84375, + "learning_rate": 4.095019791620451e-06, + "loss": 1.61052761, + "memory(GiB)": 117.38, + "step": 45730, + "train_speed(iter/s)": 1.638061 + }, + { + "acc": 0.65383158, + "epoch": 1.1601978691019788, + "grad_norm": 5.1875, + "learning_rate": 4.093988507103951e-06, + "loss": 1.56698151, + "memory(GiB)": 117.38, + "step": 45735, + "train_speed(iter/s)": 1.63808 + }, + { + "acc": 0.66335864, + "epoch": 1.1603247082699137, + "grad_norm": 6.21875, + "learning_rate": 4.09295726243788e-06, + "loss": 1.54548235, + "memory(GiB)": 117.38, + "step": 45740, + "train_speed(iter/s)": 1.6381 + }, + { + "acc": 0.67101846, + "epoch": 1.1604515474378487, + "grad_norm": 7.34375, + "learning_rate": 4.091926057667601e-06, + "loss": 1.60606804, + "memory(GiB)": 117.38, + "step": 45745, + "train_speed(iter/s)": 1.638121 + }, + { + "acc": 0.64621401, + "epoch": 1.160578386605784, + "grad_norm": 6.34375, + "learning_rate": 4.0908948928384675e-06, + "loss": 1.5893981, + "memory(GiB)": 117.38, + "step": 45750, + "train_speed(iter/s)": 1.63814 + }, + { + "acc": 0.6651525, + "epoch": 1.160705225773719, + "grad_norm": 6.75, + "learning_rate": 4.089863767995835e-06, + "loss": 1.60778904, + "memory(GiB)": 117.38, + "step": 45755, + "train_speed(iter/s)": 1.638162 + }, + { + "acc": 0.64678073, + "epoch": 1.160832064941654, + "grad_norm": 5.8125, + "learning_rate": 4.088832683185057e-06, + "loss": 1.62014313, + "memory(GiB)": 117.38, + "step": 45760, + "train_speed(iter/s)": 1.638181 + }, + { + "acc": 0.63979187, + "epoch": 1.1609589041095891, + "grad_norm": 6.28125, + "learning_rate": 4.087801638451485e-06, + "loss": 1.65764751, + "memory(GiB)": 117.38, + "step": 45765, + "train_speed(iter/s)": 1.6382 + }, + { + "acc": 0.65871515, + "epoch": 1.1610857432775241, + "grad_norm": 5.15625, + "learning_rate": 4.086770633840472e-06, + "loss": 1.60395947, + "memory(GiB)": 117.38, + "step": 45770, + "train_speed(iter/s)": 1.638218 + }, + { + "acc": 0.65753632, + "epoch": 1.161212582445459, + "grad_norm": 6.53125, + "learning_rate": 4.085739669397362e-06, + "loss": 1.57268314, + "memory(GiB)": 117.38, + "step": 45775, + "train_speed(iter/s)": 1.638239 + }, + { + "acc": 0.6571804, + "epoch": 1.1613394216133943, + "grad_norm": 4.5625, + "learning_rate": 4.084708745167504e-06, + "loss": 1.54842768, + "memory(GiB)": 117.38, + "step": 45780, + "train_speed(iter/s)": 1.63826 + }, + { + "acc": 0.64140844, + "epoch": 1.1614662607813293, + "grad_norm": 5.71875, + "learning_rate": 4.08367786119624e-06, + "loss": 1.71731091, + "memory(GiB)": 117.38, + "step": 45785, + "train_speed(iter/s)": 1.638279 + }, + { + "acc": 0.66937094, + "epoch": 1.1615930999492643, + "grad_norm": 6.4375, + "learning_rate": 4.082647017528918e-06, + "loss": 1.58800392, + "memory(GiB)": 117.38, + "step": 45790, + "train_speed(iter/s)": 1.638299 + }, + { + "acc": 0.65077233, + "epoch": 1.1617199391171993, + "grad_norm": 6.96875, + "learning_rate": 4.081616214210874e-06, + "loss": 1.62099571, + "memory(GiB)": 117.38, + "step": 45795, + "train_speed(iter/s)": 1.638319 + }, + { + "acc": 0.66162701, + "epoch": 1.1618467782851345, + "grad_norm": 5.0, + "learning_rate": 4.0805854512874485e-06, + "loss": 1.59572086, + "memory(GiB)": 117.38, + "step": 45800, + "train_speed(iter/s)": 1.638338 + }, + { + "acc": 0.66195498, + "epoch": 1.1619736174530695, + "grad_norm": 5.15625, + "learning_rate": 4.079554728803981e-06, + "loss": 1.57355719, + "memory(GiB)": 117.38, + "step": 45805, + "train_speed(iter/s)": 1.638357 + }, + { + "acc": 0.66594782, + "epoch": 1.1621004566210045, + "grad_norm": 6.9375, + "learning_rate": 4.078524046805806e-06, + "loss": 1.59419041, + "memory(GiB)": 117.38, + "step": 45810, + "train_speed(iter/s)": 1.638377 + }, + { + "acc": 0.6529388, + "epoch": 1.1622272957889397, + "grad_norm": 6.53125, + "learning_rate": 4.0774934053382576e-06, + "loss": 1.5821641, + "memory(GiB)": 117.38, + "step": 45815, + "train_speed(iter/s)": 1.638397 + }, + { + "acc": 0.67012205, + "epoch": 1.1623541349568747, + "grad_norm": 5.875, + "learning_rate": 4.076462804446667e-06, + "loss": 1.59089537, + "memory(GiB)": 117.38, + "step": 45820, + "train_speed(iter/s)": 1.638415 + }, + { + "acc": 0.65776105, + "epoch": 1.1624809741248097, + "grad_norm": 6.96875, + "learning_rate": 4.0754322441763654e-06, + "loss": 1.63938808, + "memory(GiB)": 117.38, + "step": 45825, + "train_speed(iter/s)": 1.638435 + }, + { + "acc": 0.64684472, + "epoch": 1.1626078132927449, + "grad_norm": 5.0625, + "learning_rate": 4.0744017245726834e-06, + "loss": 1.64120007, + "memory(GiB)": 117.38, + "step": 45830, + "train_speed(iter/s)": 1.638454 + }, + { + "acc": 0.6506197, + "epoch": 1.1627346524606799, + "grad_norm": 5.8125, + "learning_rate": 4.073371245680944e-06, + "loss": 1.63957024, + "memory(GiB)": 117.38, + "step": 45835, + "train_speed(iter/s)": 1.638472 + }, + { + "acc": 0.64056778, + "epoch": 1.1628614916286149, + "grad_norm": 5.34375, + "learning_rate": 4.0723408075464754e-06, + "loss": 1.69090843, + "memory(GiB)": 117.38, + "step": 45840, + "train_speed(iter/s)": 1.638493 + }, + { + "acc": 0.64457269, + "epoch": 1.16298833079655, + "grad_norm": 5.75, + "learning_rate": 4.071310410214598e-06, + "loss": 1.65721245, + "memory(GiB)": 117.38, + "step": 45845, + "train_speed(iter/s)": 1.638511 + }, + { + "acc": 0.65111094, + "epoch": 1.163115169964485, + "grad_norm": 4.8125, + "learning_rate": 4.070280053730639e-06, + "loss": 1.69260597, + "memory(GiB)": 117.38, + "step": 45850, + "train_speed(iter/s)": 1.638531 + }, + { + "acc": 0.65409937, + "epoch": 1.16324200913242, + "grad_norm": 5.8125, + "learning_rate": 4.069249738139911e-06, + "loss": 1.62338486, + "memory(GiB)": 117.38, + "step": 45855, + "train_speed(iter/s)": 1.638551 + }, + { + "acc": 0.65706215, + "epoch": 1.163368848300355, + "grad_norm": 6.96875, + "learning_rate": 4.068219463487736e-06, + "loss": 1.60820999, + "memory(GiB)": 117.38, + "step": 45860, + "train_speed(iter/s)": 1.638568 + }, + { + "acc": 0.66592746, + "epoch": 1.1634956874682902, + "grad_norm": 5.5, + "learning_rate": 4.0671892298194286e-06, + "loss": 1.5888483, + "memory(GiB)": 117.38, + "step": 45865, + "train_speed(iter/s)": 1.638587 + }, + { + "acc": 0.66092119, + "epoch": 1.1636225266362252, + "grad_norm": 5.625, + "learning_rate": 4.066159037180304e-06, + "loss": 1.57736731, + "memory(GiB)": 117.38, + "step": 45870, + "train_speed(iter/s)": 1.638605 + }, + { + "acc": 0.65945692, + "epoch": 1.1637493658041604, + "grad_norm": 5.625, + "learning_rate": 4.065128885615674e-06, + "loss": 1.61729317, + "memory(GiB)": 117.38, + "step": 45875, + "train_speed(iter/s)": 1.638624 + }, + { + "acc": 0.62857614, + "epoch": 1.1638762049720954, + "grad_norm": 5.46875, + "learning_rate": 4.064098775170849e-06, + "loss": 1.64748878, + "memory(GiB)": 117.38, + "step": 45880, + "train_speed(iter/s)": 1.638643 + }, + { + "acc": 0.65355468, + "epoch": 1.1640030441400304, + "grad_norm": 5.84375, + "learning_rate": 4.063068705891139e-06, + "loss": 1.63424339, + "memory(GiB)": 117.38, + "step": 45885, + "train_speed(iter/s)": 1.638663 + }, + { + "acc": 0.65396433, + "epoch": 1.1641298833079654, + "grad_norm": 4.90625, + "learning_rate": 4.062038677821852e-06, + "loss": 1.63054276, + "memory(GiB)": 117.38, + "step": 45890, + "train_speed(iter/s)": 1.638681 + }, + { + "acc": 0.66293154, + "epoch": 1.1642567224759006, + "grad_norm": 5.65625, + "learning_rate": 4.061008691008289e-06, + "loss": 1.59148769, + "memory(GiB)": 117.38, + "step": 45895, + "train_speed(iter/s)": 1.6387 + }, + { + "acc": 0.66535788, + "epoch": 1.1643835616438356, + "grad_norm": 5.40625, + "learning_rate": 4.059978745495757e-06, + "loss": 1.56888647, + "memory(GiB)": 117.38, + "step": 45900, + "train_speed(iter/s)": 1.638719 + }, + { + "acc": 0.66330585, + "epoch": 1.1645104008117706, + "grad_norm": 4.875, + "learning_rate": 4.058948841329557e-06, + "loss": 1.59199753, + "memory(GiB)": 117.38, + "step": 45905, + "train_speed(iter/s)": 1.638739 + }, + { + "acc": 0.64743738, + "epoch": 1.1646372399797058, + "grad_norm": 5.625, + "learning_rate": 4.057918978554989e-06, + "loss": 1.6908392, + "memory(GiB)": 117.38, + "step": 45910, + "train_speed(iter/s)": 1.638758 + }, + { + "acc": 0.66670647, + "epoch": 1.1647640791476408, + "grad_norm": 5.8125, + "learning_rate": 4.056889157217348e-06, + "loss": 1.55901222, + "memory(GiB)": 117.38, + "step": 45915, + "train_speed(iter/s)": 1.638424 + }, + { + "acc": 0.63864088, + "epoch": 1.1648909183155758, + "grad_norm": 6.125, + "learning_rate": 4.0558593773619346e-06, + "loss": 1.68672218, + "memory(GiB)": 117.38, + "step": 45920, + "train_speed(iter/s)": 1.638443 + }, + { + "acc": 0.65399733, + "epoch": 1.165017757483511, + "grad_norm": 6.03125, + "learning_rate": 4.05482963903404e-06, + "loss": 1.62069855, + "memory(GiB)": 117.38, + "step": 45925, + "train_speed(iter/s)": 1.638463 + }, + { + "acc": 0.64970732, + "epoch": 1.165144596651446, + "grad_norm": 5.1875, + "learning_rate": 4.05379994227896e-06, + "loss": 1.67670097, + "memory(GiB)": 117.38, + "step": 45930, + "train_speed(iter/s)": 1.638482 + }, + { + "acc": 0.63536215, + "epoch": 1.165271435819381, + "grad_norm": 7.125, + "learning_rate": 4.052770287141981e-06, + "loss": 1.65553684, + "memory(GiB)": 117.38, + "step": 45935, + "train_speed(iter/s)": 1.6385 + }, + { + "acc": 0.6613471, + "epoch": 1.1653982749873162, + "grad_norm": 6.34375, + "learning_rate": 4.051740673668393e-06, + "loss": 1.517132, + "memory(GiB)": 117.38, + "step": 45940, + "train_speed(iter/s)": 1.638519 + }, + { + "acc": 0.65200567, + "epoch": 1.1655251141552512, + "grad_norm": 6.1875, + "learning_rate": 4.0507111019034855e-06, + "loss": 1.6038599, + "memory(GiB)": 117.38, + "step": 45945, + "train_speed(iter/s)": 1.638538 + }, + { + "acc": 0.65460243, + "epoch": 1.1656519533231862, + "grad_norm": 6.09375, + "learning_rate": 4.049681571892543e-06, + "loss": 1.59833679, + "memory(GiB)": 117.38, + "step": 45950, + "train_speed(iter/s)": 1.638557 + }, + { + "acc": 0.6562912, + "epoch": 1.1657787924911212, + "grad_norm": 5.25, + "learning_rate": 4.048652083680847e-06, + "loss": 1.59626503, + "memory(GiB)": 117.38, + "step": 45955, + "train_speed(iter/s)": 1.638576 + }, + { + "acc": 0.64160509, + "epoch": 1.1659056316590564, + "grad_norm": 5.15625, + "learning_rate": 4.047622637313678e-06, + "loss": 1.72627773, + "memory(GiB)": 117.38, + "step": 45960, + "train_speed(iter/s)": 1.638595 + }, + { + "acc": 0.66135969, + "epoch": 1.1660324708269914, + "grad_norm": 5.25, + "learning_rate": 4.046593232836319e-06, + "loss": 1.54990978, + "memory(GiB)": 117.38, + "step": 45965, + "train_speed(iter/s)": 1.638614 + }, + { + "acc": 0.65821109, + "epoch": 1.1661593099949263, + "grad_norm": 5.6875, + "learning_rate": 4.045563870294047e-06, + "loss": 1.62085037, + "memory(GiB)": 117.38, + "step": 45970, + "train_speed(iter/s)": 1.638636 + }, + { + "acc": 0.67386074, + "epoch": 1.1662861491628616, + "grad_norm": 6.0625, + "learning_rate": 4.044534549732135e-06, + "loss": 1.5024826, + "memory(GiB)": 117.38, + "step": 45975, + "train_speed(iter/s)": 1.638655 + }, + { + "acc": 0.67889004, + "epoch": 1.1664129883307965, + "grad_norm": 5.59375, + "learning_rate": 4.043505271195861e-06, + "loss": 1.51033525, + "memory(GiB)": 117.38, + "step": 45980, + "train_speed(iter/s)": 1.638675 + }, + { + "acc": 0.65484538, + "epoch": 1.1665398274987315, + "grad_norm": 6.78125, + "learning_rate": 4.042476034730494e-06, + "loss": 1.66983604, + "memory(GiB)": 117.38, + "step": 45985, + "train_speed(iter/s)": 1.638694 + }, + { + "acc": 0.64533615, + "epoch": 1.1666666666666667, + "grad_norm": 5.9375, + "learning_rate": 4.041446840381309e-06, + "loss": 1.65745106, + "memory(GiB)": 117.38, + "step": 45990, + "train_speed(iter/s)": 1.638715 + }, + { + "acc": 0.66766167, + "epoch": 1.1667935058346017, + "grad_norm": 5.5625, + "learning_rate": 4.040417688193569e-06, + "loss": 1.57248611, + "memory(GiB)": 117.38, + "step": 45995, + "train_speed(iter/s)": 1.638734 + }, + { + "acc": 0.65643172, + "epoch": 1.1669203450025367, + "grad_norm": 5.5, + "learning_rate": 4.039388578212545e-06, + "loss": 1.56962347, + "memory(GiB)": 117.38, + "step": 46000, + "train_speed(iter/s)": 1.638753 + }, + { + "epoch": 1.1669203450025367, + "eval_acc": 0.6461841426563671, + "eval_loss": 1.5733494758605957, + "eval_runtime": 58.3367, + "eval_samples_per_second": 109.194, + "eval_steps_per_second": 27.307, + "step": 46000 + }, + { + "acc": 0.65896254, + "epoch": 1.167047184170472, + "grad_norm": 6.125, + "learning_rate": 4.0383595104834975e-06, + "loss": 1.58292599, + "memory(GiB)": 117.38, + "step": 46005, + "train_speed(iter/s)": 1.635124 + }, + { + "acc": 0.65144, + "epoch": 1.167174023338407, + "grad_norm": 8.0625, + "learning_rate": 4.037330485051695e-06, + "loss": 1.62439899, + "memory(GiB)": 117.38, + "step": 46010, + "train_speed(iter/s)": 1.635143 + }, + { + "acc": 0.64499083, + "epoch": 1.167300862506342, + "grad_norm": 4.78125, + "learning_rate": 4.0363015019623955e-06, + "loss": 1.63023815, + "memory(GiB)": 117.38, + "step": 46015, + "train_speed(iter/s)": 1.635163 + }, + { + "acc": 0.66844106, + "epoch": 1.167427701674277, + "grad_norm": 5.0, + "learning_rate": 4.0352725612608565e-06, + "loss": 1.57476463, + "memory(GiB)": 117.38, + "step": 46020, + "train_speed(iter/s)": 1.635182 + }, + { + "acc": 0.65385628, + "epoch": 1.167554540842212, + "grad_norm": 6.53125, + "learning_rate": 4.0342436629923385e-06, + "loss": 1.64859009, + "memory(GiB)": 117.38, + "step": 46025, + "train_speed(iter/s)": 1.635201 + }, + { + "acc": 0.66201344, + "epoch": 1.167681380010147, + "grad_norm": 5.75, + "learning_rate": 4.033214807202098e-06, + "loss": 1.5683795, + "memory(GiB)": 117.38, + "step": 46030, + "train_speed(iter/s)": 1.63522 + }, + { + "acc": 0.66079459, + "epoch": 1.1678082191780823, + "grad_norm": 5.9375, + "learning_rate": 4.032185993935385e-06, + "loss": 1.59754753, + "memory(GiB)": 117.38, + "step": 46035, + "train_speed(iter/s)": 1.63524 + }, + { + "acc": 0.66224594, + "epoch": 1.1679350583460173, + "grad_norm": 6.1875, + "learning_rate": 4.031157223237452e-06, + "loss": 1.54156399, + "memory(GiB)": 117.38, + "step": 46040, + "train_speed(iter/s)": 1.635259 + }, + { + "acc": 0.6721055, + "epoch": 1.1680618975139523, + "grad_norm": 6.1875, + "learning_rate": 4.0301284951535504e-06, + "loss": 1.54112396, + "memory(GiB)": 117.38, + "step": 46045, + "train_speed(iter/s)": 1.635278 + }, + { + "acc": 0.6689188, + "epoch": 1.1681887366818873, + "grad_norm": 5.1875, + "learning_rate": 4.029099809728929e-06, + "loss": 1.60583134, + "memory(GiB)": 117.38, + "step": 46050, + "train_speed(iter/s)": 1.635298 + }, + { + "acc": 0.65159554, + "epoch": 1.1683155758498225, + "grad_norm": 5.84375, + "learning_rate": 4.028071167008831e-06, + "loss": 1.63261814, + "memory(GiB)": 117.38, + "step": 46055, + "train_speed(iter/s)": 1.635317 + }, + { + "acc": 0.66183271, + "epoch": 1.1684424150177575, + "grad_norm": 6.0625, + "learning_rate": 4.027042567038503e-06, + "loss": 1.55288048, + "memory(GiB)": 117.38, + "step": 46060, + "train_speed(iter/s)": 1.635333 + }, + { + "acc": 0.65917106, + "epoch": 1.1685692541856925, + "grad_norm": 4.8125, + "learning_rate": 4.026014009863186e-06, + "loss": 1.61998405, + "memory(GiB)": 117.38, + "step": 46065, + "train_speed(iter/s)": 1.635353 + }, + { + "acc": 0.65444598, + "epoch": 1.1686960933536277, + "grad_norm": 5.75, + "learning_rate": 4.024985495528124e-06, + "loss": 1.61353416, + "memory(GiB)": 117.38, + "step": 46070, + "train_speed(iter/s)": 1.635371 + }, + { + "acc": 0.65920663, + "epoch": 1.1688229325215627, + "grad_norm": 6.03125, + "learning_rate": 4.023957024078552e-06, + "loss": 1.60706062, + "memory(GiB)": 117.38, + "step": 46075, + "train_speed(iter/s)": 1.63539 + }, + { + "acc": 0.6608573, + "epoch": 1.1689497716894977, + "grad_norm": 6.625, + "learning_rate": 4.022928595559707e-06, + "loss": 1.56209316, + "memory(GiB)": 117.38, + "step": 46080, + "train_speed(iter/s)": 1.635409 + }, + { + "acc": 0.65752068, + "epoch": 1.1690766108574329, + "grad_norm": 5.5625, + "learning_rate": 4.021900210016824e-06, + "loss": 1.61006203, + "memory(GiB)": 117.38, + "step": 46085, + "train_speed(iter/s)": 1.635427 + }, + { + "acc": 0.66767907, + "epoch": 1.1692034500253679, + "grad_norm": 6.1875, + "learning_rate": 4.020871867495139e-06, + "loss": 1.54134426, + "memory(GiB)": 117.38, + "step": 46090, + "train_speed(iter/s)": 1.635447 + }, + { + "acc": 0.66648788, + "epoch": 1.1693302891933028, + "grad_norm": 5.90625, + "learning_rate": 4.01984356803988e-06, + "loss": 1.5776721, + "memory(GiB)": 117.38, + "step": 46095, + "train_speed(iter/s)": 1.635466 + }, + { + "acc": 0.66540952, + "epoch": 1.169457128361238, + "grad_norm": 5.03125, + "learning_rate": 4.018815311696274e-06, + "loss": 1.60082035, + "memory(GiB)": 117.38, + "step": 46100, + "train_speed(iter/s)": 1.635485 + }, + { + "acc": 0.6458425, + "epoch": 1.169583967529173, + "grad_norm": 6.4375, + "learning_rate": 4.017787098509555e-06, + "loss": 1.62922802, + "memory(GiB)": 117.38, + "step": 46105, + "train_speed(iter/s)": 1.635504 + }, + { + "acc": 0.65588112, + "epoch": 1.169710806697108, + "grad_norm": 6.84375, + "learning_rate": 4.016758928524944e-06, + "loss": 1.58211861, + "memory(GiB)": 117.38, + "step": 46110, + "train_speed(iter/s)": 1.635523 + }, + { + "acc": 0.66940403, + "epoch": 1.169837645865043, + "grad_norm": 5.78125, + "learning_rate": 4.015730801787663e-06, + "loss": 1.51750841, + "memory(GiB)": 117.38, + "step": 46115, + "train_speed(iter/s)": 1.635543 + }, + { + "acc": 0.64090481, + "epoch": 1.1699644850329782, + "grad_norm": 5.46875, + "learning_rate": 4.014702718342938e-06, + "loss": 1.66367912, + "memory(GiB)": 117.38, + "step": 46120, + "train_speed(iter/s)": 1.635563 + }, + { + "acc": 0.64416966, + "epoch": 1.1700913242009132, + "grad_norm": 5.28125, + "learning_rate": 4.013674678235985e-06, + "loss": 1.63240318, + "memory(GiB)": 117.38, + "step": 46125, + "train_speed(iter/s)": 1.635581 + }, + { + "acc": 0.65053287, + "epoch": 1.1702181633688482, + "grad_norm": 5.40625, + "learning_rate": 4.012646681512026e-06, + "loss": 1.58833714, + "memory(GiB)": 117.38, + "step": 46130, + "train_speed(iter/s)": 1.6356 + }, + { + "acc": 0.65251641, + "epoch": 1.1703450025367834, + "grad_norm": 5.84375, + "learning_rate": 4.011618728216271e-06, + "loss": 1.64492264, + "memory(GiB)": 117.38, + "step": 46135, + "train_speed(iter/s)": 1.635621 + }, + { + "acc": 0.65192852, + "epoch": 1.1704718417047184, + "grad_norm": 5.96875, + "learning_rate": 4.010590818393938e-06, + "loss": 1.63998642, + "memory(GiB)": 117.38, + "step": 46140, + "train_speed(iter/s)": 1.635641 + }, + { + "acc": 0.67863503, + "epoch": 1.1705986808726534, + "grad_norm": 6.4375, + "learning_rate": 4.009562952090238e-06, + "loss": 1.55675259, + "memory(GiB)": 117.38, + "step": 46145, + "train_speed(iter/s)": 1.635661 + }, + { + "acc": 0.65378156, + "epoch": 1.1707255200405886, + "grad_norm": 8.875, + "learning_rate": 4.0085351293503825e-06, + "loss": 1.57858629, + "memory(GiB)": 117.38, + "step": 46150, + "train_speed(iter/s)": 1.635682 + }, + { + "acc": 0.65080643, + "epoch": 1.1708523592085236, + "grad_norm": 4.40625, + "learning_rate": 4.007507350219578e-06, + "loss": 1.68782082, + "memory(GiB)": 117.38, + "step": 46155, + "train_speed(iter/s)": 1.635701 + }, + { + "acc": 0.66592493, + "epoch": 1.1709791983764586, + "grad_norm": 5.5625, + "learning_rate": 4.0064796147430305e-06, + "loss": 1.51166668, + "memory(GiB)": 117.38, + "step": 46160, + "train_speed(iter/s)": 1.635721 + }, + { + "acc": 0.66495256, + "epoch": 1.1711060375443938, + "grad_norm": 5.625, + "learning_rate": 4.005451922965946e-06, + "loss": 1.61604385, + "memory(GiB)": 117.38, + "step": 46165, + "train_speed(iter/s)": 1.635741 + }, + { + "acc": 0.66606326, + "epoch": 1.1712328767123288, + "grad_norm": 5.125, + "learning_rate": 4.0044242749335285e-06, + "loss": 1.55738316, + "memory(GiB)": 117.38, + "step": 46170, + "train_speed(iter/s)": 1.63576 + }, + { + "acc": 0.64758453, + "epoch": 1.1713597158802638, + "grad_norm": 5.96875, + "learning_rate": 4.0033966706909735e-06, + "loss": 1.69288578, + "memory(GiB)": 117.38, + "step": 46175, + "train_speed(iter/s)": 1.635779 + }, + { + "acc": 0.65085211, + "epoch": 1.1714865550481988, + "grad_norm": 5.46875, + "learning_rate": 4.002369110283482e-06, + "loss": 1.58347263, + "memory(GiB)": 117.38, + "step": 46180, + "train_speed(iter/s)": 1.635798 + }, + { + "acc": 0.63970423, + "epoch": 1.171613394216134, + "grad_norm": 5.4375, + "learning_rate": 4.001341593756253e-06, + "loss": 1.64356747, + "memory(GiB)": 117.38, + "step": 46185, + "train_speed(iter/s)": 1.635818 + }, + { + "acc": 0.66085186, + "epoch": 1.171740233384069, + "grad_norm": 7.4375, + "learning_rate": 4.00031412115448e-06, + "loss": 1.63768768, + "memory(GiB)": 117.38, + "step": 46190, + "train_speed(iter/s)": 1.635837 + }, + { + "acc": 0.65669694, + "epoch": 1.1718670725520042, + "grad_norm": 6.09375, + "learning_rate": 3.999286692523352e-06, + "loss": 1.57838173, + "memory(GiB)": 117.38, + "step": 46195, + "train_speed(iter/s)": 1.635859 + }, + { + "acc": 0.67000427, + "epoch": 1.1719939117199392, + "grad_norm": 5.21875, + "learning_rate": 3.998259307908065e-06, + "loss": 1.53306589, + "memory(GiB)": 117.38, + "step": 46200, + "train_speed(iter/s)": 1.635878 + }, + { + "acc": 0.64608984, + "epoch": 1.1721207508878742, + "grad_norm": 6.78125, + "learning_rate": 3.997231967353806e-06, + "loss": 1.62242565, + "memory(GiB)": 117.38, + "step": 46205, + "train_speed(iter/s)": 1.635899 + }, + { + "acc": 0.65822487, + "epoch": 1.1722475900558091, + "grad_norm": 5.21875, + "learning_rate": 3.996204670905765e-06, + "loss": 1.58023291, + "memory(GiB)": 117.38, + "step": 46210, + "train_speed(iter/s)": 1.635918 + }, + { + "acc": 0.66076365, + "epoch": 1.1723744292237444, + "grad_norm": 5.40625, + "learning_rate": 3.9951774186091195e-06, + "loss": 1.58831854, + "memory(GiB)": 117.38, + "step": 46215, + "train_speed(iter/s)": 1.635938 + }, + { + "acc": 0.67891111, + "epoch": 1.1725012683916793, + "grad_norm": 6.59375, + "learning_rate": 3.9941502105090594e-06, + "loss": 1.52629194, + "memory(GiB)": 117.38, + "step": 46220, + "train_speed(iter/s)": 1.635957 + }, + { + "acc": 0.67207718, + "epoch": 1.1726281075596143, + "grad_norm": 5.65625, + "learning_rate": 3.9931230466507634e-06, + "loss": 1.52454329, + "memory(GiB)": 117.38, + "step": 46225, + "train_speed(iter/s)": 1.635976 + }, + { + "acc": 0.65900068, + "epoch": 1.1727549467275495, + "grad_norm": 5.3125, + "learning_rate": 3.992095927079412e-06, + "loss": 1.63008919, + "memory(GiB)": 117.38, + "step": 46230, + "train_speed(iter/s)": 1.635995 + }, + { + "acc": 0.67170353, + "epoch": 1.1728817858954845, + "grad_norm": 6.96875, + "learning_rate": 3.991068851840182e-06, + "loss": 1.52715607, + "memory(GiB)": 117.38, + "step": 46235, + "train_speed(iter/s)": 1.636015 + }, + { + "acc": 0.6656848, + "epoch": 1.1730086250634195, + "grad_norm": 6.125, + "learning_rate": 3.990041820978246e-06, + "loss": 1.53950558, + "memory(GiB)": 117.38, + "step": 46240, + "train_speed(iter/s)": 1.636034 + }, + { + "acc": 0.66400881, + "epoch": 1.1731354642313547, + "grad_norm": 6.28125, + "learning_rate": 3.989014834538782e-06, + "loss": 1.54228439, + "memory(GiB)": 117.38, + "step": 46245, + "train_speed(iter/s)": 1.636054 + }, + { + "acc": 0.67212768, + "epoch": 1.1732623033992897, + "grad_norm": 6.9375, + "learning_rate": 3.987987892566959e-06, + "loss": 1.55905561, + "memory(GiB)": 117.38, + "step": 46250, + "train_speed(iter/s)": 1.636074 + }, + { + "acc": 0.67626667, + "epoch": 1.1733891425672247, + "grad_norm": 4.90625, + "learning_rate": 3.986960995107948e-06, + "loss": 1.54889126, + "memory(GiB)": 117.38, + "step": 46255, + "train_speed(iter/s)": 1.636094 + }, + { + "acc": 0.66708112, + "epoch": 1.17351598173516, + "grad_norm": 5.1875, + "learning_rate": 3.985934142206912e-06, + "loss": 1.58366413, + "memory(GiB)": 117.38, + "step": 46260, + "train_speed(iter/s)": 1.636113 + }, + { + "acc": 0.66411142, + "epoch": 1.173642820903095, + "grad_norm": 5.375, + "learning_rate": 3.984907333909022e-06, + "loss": 1.60409508, + "memory(GiB)": 117.38, + "step": 46265, + "train_speed(iter/s)": 1.636132 + }, + { + "acc": 0.64951048, + "epoch": 1.17376966007103, + "grad_norm": 6.28125, + "learning_rate": 3.983880570259441e-06, + "loss": 1.62533855, + "memory(GiB)": 117.38, + "step": 46270, + "train_speed(iter/s)": 1.63615 + }, + { + "acc": 0.65605001, + "epoch": 1.1738964992389649, + "grad_norm": 5.8125, + "learning_rate": 3.982853851303327e-06, + "loss": 1.57857924, + "memory(GiB)": 117.38, + "step": 46275, + "train_speed(iter/s)": 1.63617 + }, + { + "acc": 0.64148264, + "epoch": 1.1740233384069, + "grad_norm": 6.6875, + "learning_rate": 3.981827177085842e-06, + "loss": 1.67251167, + "memory(GiB)": 117.38, + "step": 46280, + "train_speed(iter/s)": 1.63619 + }, + { + "acc": 0.64799423, + "epoch": 1.174150177574835, + "grad_norm": 6.375, + "learning_rate": 3.980800547652143e-06, + "loss": 1.61811638, + "memory(GiB)": 117.38, + "step": 46285, + "train_speed(iter/s)": 1.636211 + }, + { + "acc": 0.63844075, + "epoch": 1.17427701674277, + "grad_norm": 5.21875, + "learning_rate": 3.979773963047388e-06, + "loss": 1.72852268, + "memory(GiB)": 117.38, + "step": 46290, + "train_speed(iter/s)": 1.636231 + }, + { + "acc": 0.66433301, + "epoch": 1.1744038559107053, + "grad_norm": 5.75, + "learning_rate": 3.978747423316729e-06, + "loss": 1.55439405, + "memory(GiB)": 117.38, + "step": 46295, + "train_speed(iter/s)": 1.636249 + }, + { + "acc": 0.64813147, + "epoch": 1.1745306950786403, + "grad_norm": 5.34375, + "learning_rate": 3.977720928505317e-06, + "loss": 1.66334076, + "memory(GiB)": 117.38, + "step": 46300, + "train_speed(iter/s)": 1.636269 + }, + { + "acc": 0.6505229, + "epoch": 1.1746575342465753, + "grad_norm": 7.0, + "learning_rate": 3.976694478658301e-06, + "loss": 1.66106415, + "memory(GiB)": 117.38, + "step": 46305, + "train_speed(iter/s)": 1.636288 + }, + { + "acc": 0.66378841, + "epoch": 1.1747843734145105, + "grad_norm": 6.15625, + "learning_rate": 3.975668073820834e-06, + "loss": 1.56796207, + "memory(GiB)": 117.38, + "step": 46310, + "train_speed(iter/s)": 1.636307 + }, + { + "acc": 0.68842468, + "epoch": 1.1749112125824455, + "grad_norm": 6.71875, + "learning_rate": 3.9746417140380576e-06, + "loss": 1.49618874, + "memory(GiB)": 117.38, + "step": 46315, + "train_speed(iter/s)": 1.636329 + }, + { + "acc": 0.65510254, + "epoch": 1.1750380517503805, + "grad_norm": 7.625, + "learning_rate": 3.973615399355114e-06, + "loss": 1.62154598, + "memory(GiB)": 117.38, + "step": 46320, + "train_speed(iter/s)": 1.636347 + }, + { + "acc": 0.65530357, + "epoch": 1.1751648909183157, + "grad_norm": 5.09375, + "learning_rate": 3.97258912981715e-06, + "loss": 1.63688202, + "memory(GiB)": 117.38, + "step": 46325, + "train_speed(iter/s)": 1.636366 + }, + { + "acc": 0.66143141, + "epoch": 1.1752917300862507, + "grad_norm": 6.8125, + "learning_rate": 3.9715629054693035e-06, + "loss": 1.55610132, + "memory(GiB)": 117.38, + "step": 46330, + "train_speed(iter/s)": 1.636385 + }, + { + "acc": 0.66292305, + "epoch": 1.1754185692541856, + "grad_norm": 5.6875, + "learning_rate": 3.970536726356711e-06, + "loss": 1.59044476, + "memory(GiB)": 117.38, + "step": 46335, + "train_speed(iter/s)": 1.636404 + }, + { + "acc": 0.64639225, + "epoch": 1.1755454084221206, + "grad_norm": 5.4375, + "learning_rate": 3.969510592524509e-06, + "loss": 1.62415352, + "memory(GiB)": 117.38, + "step": 46340, + "train_speed(iter/s)": 1.636423 + }, + { + "acc": 0.65371771, + "epoch": 1.1756722475900558, + "grad_norm": 5.03125, + "learning_rate": 3.968484504017833e-06, + "loss": 1.64485283, + "memory(GiB)": 117.38, + "step": 46345, + "train_speed(iter/s)": 1.636441 + }, + { + "acc": 0.67239647, + "epoch": 1.1757990867579908, + "grad_norm": 7.46875, + "learning_rate": 3.967458460881815e-06, + "loss": 1.52457695, + "memory(GiB)": 117.38, + "step": 46350, + "train_speed(iter/s)": 1.636461 + }, + { + "acc": 0.67113972, + "epoch": 1.175925925925926, + "grad_norm": 5.625, + "learning_rate": 3.96643246316158e-06, + "loss": 1.54952898, + "memory(GiB)": 117.38, + "step": 46355, + "train_speed(iter/s)": 1.636479 + }, + { + "acc": 0.66188698, + "epoch": 1.176052765093861, + "grad_norm": 4.84375, + "learning_rate": 3.965406510902263e-06, + "loss": 1.53429222, + "memory(GiB)": 117.38, + "step": 46360, + "train_speed(iter/s)": 1.636497 + }, + { + "acc": 0.67016797, + "epoch": 1.176179604261796, + "grad_norm": 6.71875, + "learning_rate": 3.9643806041489855e-06, + "loss": 1.55963669, + "memory(GiB)": 117.38, + "step": 46365, + "train_speed(iter/s)": 1.636517 + }, + { + "acc": 0.65091772, + "epoch": 1.176306443429731, + "grad_norm": 6.0, + "learning_rate": 3.963354742946874e-06, + "loss": 1.60377045, + "memory(GiB)": 117.38, + "step": 46370, + "train_speed(iter/s)": 1.636535 + }, + { + "acc": 0.65209856, + "epoch": 1.1764332825976662, + "grad_norm": 4.5, + "learning_rate": 3.962328927341048e-06, + "loss": 1.6182106, + "memory(GiB)": 117.38, + "step": 46375, + "train_speed(iter/s)": 1.636554 + }, + { + "acc": 0.67671337, + "epoch": 1.1765601217656012, + "grad_norm": 6.84375, + "learning_rate": 3.961303157376628e-06, + "loss": 1.534589, + "memory(GiB)": 117.38, + "step": 46380, + "train_speed(iter/s)": 1.636574 + }, + { + "acc": 0.64846845, + "epoch": 1.1766869609335362, + "grad_norm": 6.53125, + "learning_rate": 3.960277433098734e-06, + "loss": 1.57788858, + "memory(GiB)": 117.38, + "step": 46385, + "train_speed(iter/s)": 1.636595 + }, + { + "acc": 0.65173607, + "epoch": 1.1768138001014714, + "grad_norm": 6.5, + "learning_rate": 3.959251754552481e-06, + "loss": 1.65551491, + "memory(GiB)": 117.38, + "step": 46390, + "train_speed(iter/s)": 1.636615 + }, + { + "acc": 0.64662571, + "epoch": 1.1769406392694064, + "grad_norm": 5.0, + "learning_rate": 3.958226121782982e-06, + "loss": 1.61326485, + "memory(GiB)": 117.38, + "step": 46395, + "train_speed(iter/s)": 1.636634 + }, + { + "acc": 0.65729303, + "epoch": 1.1770674784373414, + "grad_norm": 5.78125, + "learning_rate": 3.9572005348353486e-06, + "loss": 1.58313999, + "memory(GiB)": 117.38, + "step": 46400, + "train_speed(iter/s)": 1.636653 + }, + { + "acc": 0.64366126, + "epoch": 1.1771943176052766, + "grad_norm": 7.5625, + "learning_rate": 3.956174993754691e-06, + "loss": 1.63947544, + "memory(GiB)": 117.38, + "step": 46405, + "train_speed(iter/s)": 1.636673 + }, + { + "acc": 0.66744175, + "epoch": 1.1773211567732116, + "grad_norm": 4.875, + "learning_rate": 3.955149498586119e-06, + "loss": 1.49518757, + "memory(GiB)": 117.38, + "step": 46410, + "train_speed(iter/s)": 1.636693 + }, + { + "acc": 0.64974432, + "epoch": 1.1774479959411466, + "grad_norm": 7.15625, + "learning_rate": 3.954124049374736e-06, + "loss": 1.62570076, + "memory(GiB)": 117.38, + "step": 46415, + "train_speed(iter/s)": 1.636713 + }, + { + "acc": 0.66807938, + "epoch": 1.1775748351090818, + "grad_norm": 4.90625, + "learning_rate": 3.9530986461656465e-06, + "loss": 1.53927374, + "memory(GiB)": 117.38, + "step": 46420, + "train_speed(iter/s)": 1.636731 + }, + { + "acc": 0.65424414, + "epoch": 1.1777016742770168, + "grad_norm": 6.125, + "learning_rate": 3.952073289003953e-06, + "loss": 1.57309361, + "memory(GiB)": 117.38, + "step": 46425, + "train_speed(iter/s)": 1.63675 + }, + { + "acc": 0.67555165, + "epoch": 1.1778285134449518, + "grad_norm": 6.0625, + "learning_rate": 3.9510479779347566e-06, + "loss": 1.49967442, + "memory(GiB)": 117.38, + "step": 46430, + "train_speed(iter/s)": 1.636769 + }, + { + "acc": 0.65240335, + "epoch": 1.1779553526128868, + "grad_norm": 5.875, + "learning_rate": 3.950022713003151e-06, + "loss": 1.59517345, + "memory(GiB)": 117.38, + "step": 46435, + "train_speed(iter/s)": 1.636788 + }, + { + "acc": 0.65068264, + "epoch": 1.178082191780822, + "grad_norm": 5.53125, + "learning_rate": 3.9489974942542355e-06, + "loss": 1.6094698, + "memory(GiB)": 117.38, + "step": 46440, + "train_speed(iter/s)": 1.636807 + }, + { + "acc": 0.65861225, + "epoch": 1.178209030948757, + "grad_norm": 5.84375, + "learning_rate": 3.947972321733101e-06, + "loss": 1.57066135, + "memory(GiB)": 117.38, + "step": 46445, + "train_speed(iter/s)": 1.636827 + }, + { + "acc": 0.65753689, + "epoch": 1.178335870116692, + "grad_norm": 5.28125, + "learning_rate": 3.946947195484843e-06, + "loss": 1.60609932, + "memory(GiB)": 117.38, + "step": 46450, + "train_speed(iter/s)": 1.636847 + }, + { + "acc": 0.65107908, + "epoch": 1.1784627092846272, + "grad_norm": 5.8125, + "learning_rate": 3.945922115554548e-06, + "loss": 1.63570251, + "memory(GiB)": 117.38, + "step": 46455, + "train_speed(iter/s)": 1.636866 + }, + { + "acc": 0.66598992, + "epoch": 1.1785895484525621, + "grad_norm": 4.9375, + "learning_rate": 3.944897081987303e-06, + "loss": 1.6481945, + "memory(GiB)": 117.38, + "step": 46460, + "train_speed(iter/s)": 1.636886 + }, + { + "acc": 0.65200796, + "epoch": 1.1787163876204971, + "grad_norm": 8.25, + "learning_rate": 3.943872094828197e-06, + "loss": 1.6242588, + "memory(GiB)": 117.38, + "step": 46465, + "train_speed(iter/s)": 1.636907 + }, + { + "acc": 0.6718935, + "epoch": 1.1788432267884323, + "grad_norm": 5.125, + "learning_rate": 3.942847154122312e-06, + "loss": 1.51998882, + "memory(GiB)": 117.38, + "step": 46470, + "train_speed(iter/s)": 1.636927 + }, + { + "acc": 0.6716465, + "epoch": 1.1789700659563673, + "grad_norm": 6.40625, + "learning_rate": 3.941822259914728e-06, + "loss": 1.51986094, + "memory(GiB)": 117.38, + "step": 46475, + "train_speed(iter/s)": 1.636947 + }, + { + "acc": 0.66168108, + "epoch": 1.1790969051243023, + "grad_norm": 4.71875, + "learning_rate": 3.940797412250524e-06, + "loss": 1.58920403, + "memory(GiB)": 117.38, + "step": 46480, + "train_speed(iter/s)": 1.636966 + }, + { + "acc": 0.65981693, + "epoch": 1.1792237442922375, + "grad_norm": 4.625, + "learning_rate": 3.93977261117478e-06, + "loss": 1.60927353, + "memory(GiB)": 117.38, + "step": 46485, + "train_speed(iter/s)": 1.636985 + }, + { + "acc": 0.65412836, + "epoch": 1.1793505834601725, + "grad_norm": 5.0625, + "learning_rate": 3.938747856732572e-06, + "loss": 1.59747219, + "memory(GiB)": 117.38, + "step": 46490, + "train_speed(iter/s)": 1.637004 + }, + { + "acc": 0.67093582, + "epoch": 1.1794774226281075, + "grad_norm": 5.84375, + "learning_rate": 3.9377231489689685e-06, + "loss": 1.62085819, + "memory(GiB)": 117.38, + "step": 46495, + "train_speed(iter/s)": 1.637023 + }, + { + "acc": 0.65070896, + "epoch": 1.1796042617960425, + "grad_norm": 5.46875, + "learning_rate": 3.936698487929045e-06, + "loss": 1.61765633, + "memory(GiB)": 117.38, + "step": 46500, + "train_speed(iter/s)": 1.637043 + }, + { + "acc": 0.66881008, + "epoch": 1.1797311009639777, + "grad_norm": 5.03125, + "learning_rate": 3.935673873657868e-06, + "loss": 1.54812698, + "memory(GiB)": 117.38, + "step": 46505, + "train_speed(iter/s)": 1.637062 + }, + { + "acc": 0.65189404, + "epoch": 1.1798579401319127, + "grad_norm": 6.78125, + "learning_rate": 3.934649306200508e-06, + "loss": 1.65766869, + "memory(GiB)": 117.38, + "step": 46510, + "train_speed(iter/s)": 1.637081 + }, + { + "acc": 0.65781188, + "epoch": 1.179984779299848, + "grad_norm": 5.5625, + "learning_rate": 3.933624785602027e-06, + "loss": 1.60242882, + "memory(GiB)": 117.38, + "step": 46515, + "train_speed(iter/s)": 1.637099 + }, + { + "acc": 0.64470778, + "epoch": 1.180111618467783, + "grad_norm": 5.15625, + "learning_rate": 3.932600311907489e-06, + "loss": 1.61195717, + "memory(GiB)": 117.38, + "step": 46520, + "train_speed(iter/s)": 1.63712 + }, + { + "acc": 0.6503222, + "epoch": 1.1802384576357179, + "grad_norm": 4.84375, + "learning_rate": 3.931575885161955e-06, + "loss": 1.6452364, + "memory(GiB)": 117.38, + "step": 46525, + "train_speed(iter/s)": 1.637139 + }, + { + "acc": 0.64573269, + "epoch": 1.1803652968036529, + "grad_norm": 5.4375, + "learning_rate": 3.930551505410484e-06, + "loss": 1.60496044, + "memory(GiB)": 117.38, + "step": 46530, + "train_speed(iter/s)": 1.637157 + }, + { + "acc": 0.66018629, + "epoch": 1.180492135971588, + "grad_norm": 5.59375, + "learning_rate": 3.929527172698132e-06, + "loss": 1.60194473, + "memory(GiB)": 117.38, + "step": 46535, + "train_speed(iter/s)": 1.637176 + }, + { + "acc": 0.64571295, + "epoch": 1.180618975139523, + "grad_norm": 4.9375, + "learning_rate": 3.928502887069954e-06, + "loss": 1.60859489, + "memory(GiB)": 117.38, + "step": 46540, + "train_speed(iter/s)": 1.637194 + }, + { + "acc": 0.67574348, + "epoch": 1.180745814307458, + "grad_norm": 6.46875, + "learning_rate": 3.927478648571003e-06, + "loss": 1.48915281, + "memory(GiB)": 117.38, + "step": 46545, + "train_speed(iter/s)": 1.637211 + }, + { + "acc": 0.65071945, + "epoch": 1.1808726534753933, + "grad_norm": 6.71875, + "learning_rate": 3.926454457246331e-06, + "loss": 1.58496666, + "memory(GiB)": 117.38, + "step": 46550, + "train_speed(iter/s)": 1.637229 + }, + { + "acc": 0.64780989, + "epoch": 1.1809994926433283, + "grad_norm": 6.5625, + "learning_rate": 3.9254303131409834e-06, + "loss": 1.63332024, + "memory(GiB)": 117.38, + "step": 46555, + "train_speed(iter/s)": 1.637247 + }, + { + "acc": 0.65483541, + "epoch": 1.1811263318112633, + "grad_norm": 6.65625, + "learning_rate": 3.924406216300009e-06, + "loss": 1.54679193, + "memory(GiB)": 117.38, + "step": 46560, + "train_speed(iter/s)": 1.637266 + }, + { + "acc": 0.66200409, + "epoch": 1.1812531709791985, + "grad_norm": 5.53125, + "learning_rate": 3.92338216676845e-06, + "loss": 1.59225531, + "memory(GiB)": 117.38, + "step": 46565, + "train_speed(iter/s)": 1.637284 + }, + { + "acc": 0.64841104, + "epoch": 1.1813800101471335, + "grad_norm": 7.0625, + "learning_rate": 3.922358164591353e-06, + "loss": 1.65743294, + "memory(GiB)": 117.38, + "step": 46570, + "train_speed(iter/s)": 1.637303 + }, + { + "acc": 0.65236654, + "epoch": 1.1815068493150684, + "grad_norm": 5.59375, + "learning_rate": 3.921334209813752e-06, + "loss": 1.64063339, + "memory(GiB)": 117.38, + "step": 46575, + "train_speed(iter/s)": 1.637321 + }, + { + "acc": 0.65048494, + "epoch": 1.1816336884830037, + "grad_norm": 5.40625, + "learning_rate": 3.92031030248069e-06, + "loss": 1.63058529, + "memory(GiB)": 117.38, + "step": 46580, + "train_speed(iter/s)": 1.637339 + }, + { + "acc": 0.63938637, + "epoch": 1.1817605276509386, + "grad_norm": 5.34375, + "learning_rate": 3.9192864426372e-06, + "loss": 1.66378632, + "memory(GiB)": 117.38, + "step": 46585, + "train_speed(iter/s)": 1.637358 + }, + { + "acc": 0.67071295, + "epoch": 1.1818873668188736, + "grad_norm": 5.53125, + "learning_rate": 3.918262630328319e-06, + "loss": 1.5374176, + "memory(GiB)": 117.38, + "step": 46590, + "train_speed(iter/s)": 1.637376 + }, + { + "acc": 0.66215916, + "epoch": 1.1820142059868086, + "grad_norm": 4.875, + "learning_rate": 3.917238865599077e-06, + "loss": 1.59837723, + "memory(GiB)": 117.38, + "step": 46595, + "train_speed(iter/s)": 1.637394 + }, + { + "acc": 0.68290043, + "epoch": 1.1821410451547438, + "grad_norm": 5.34375, + "learning_rate": 3.916215148494502e-06, + "loss": 1.49271126, + "memory(GiB)": 117.38, + "step": 46600, + "train_speed(iter/s)": 1.637412 + }, + { + "acc": 0.63864908, + "epoch": 1.1822678843226788, + "grad_norm": 6.625, + "learning_rate": 3.9151914790596255e-06, + "loss": 1.66548157, + "memory(GiB)": 117.38, + "step": 46605, + "train_speed(iter/s)": 1.63743 + }, + { + "acc": 0.66377449, + "epoch": 1.1823947234906138, + "grad_norm": 6.71875, + "learning_rate": 3.914167857339472e-06, + "loss": 1.57629261, + "memory(GiB)": 117.38, + "step": 46610, + "train_speed(iter/s)": 1.637449 + }, + { + "acc": 0.64592457, + "epoch": 1.182521562658549, + "grad_norm": 5.8125, + "learning_rate": 3.913144283379061e-06, + "loss": 1.63500137, + "memory(GiB)": 117.38, + "step": 46615, + "train_speed(iter/s)": 1.637469 + }, + { + "acc": 0.6622323, + "epoch": 1.182648401826484, + "grad_norm": 6.40625, + "learning_rate": 3.912120757223418e-06, + "loss": 1.67293892, + "memory(GiB)": 117.38, + "step": 46620, + "train_speed(iter/s)": 1.637487 + }, + { + "acc": 0.667243, + "epoch": 1.182775240994419, + "grad_norm": 6.625, + "learning_rate": 3.911097278917561e-06, + "loss": 1.59625711, + "memory(GiB)": 117.38, + "step": 46625, + "train_speed(iter/s)": 1.637506 + }, + { + "acc": 0.65368738, + "epoch": 1.1829020801623542, + "grad_norm": 6.1875, + "learning_rate": 3.91007384850651e-06, + "loss": 1.57545414, + "memory(GiB)": 117.38, + "step": 46630, + "train_speed(iter/s)": 1.637523 + }, + { + "acc": 0.66435733, + "epoch": 1.1830289193302892, + "grad_norm": 5.28125, + "learning_rate": 3.909050466035274e-06, + "loss": 1.52634087, + "memory(GiB)": 117.38, + "step": 46635, + "train_speed(iter/s)": 1.637542 + }, + { + "acc": 0.65399513, + "epoch": 1.1831557584982242, + "grad_norm": 6.375, + "learning_rate": 3.90802713154887e-06, + "loss": 1.61824646, + "memory(GiB)": 117.38, + "step": 46640, + "train_speed(iter/s)": 1.63756 + }, + { + "acc": 0.67156687, + "epoch": 1.1832825976661594, + "grad_norm": 5.59375, + "learning_rate": 3.9070038450923074e-06, + "loss": 1.50965052, + "memory(GiB)": 117.38, + "step": 46645, + "train_speed(iter/s)": 1.637578 + }, + { + "acc": 0.64963045, + "epoch": 1.1834094368340944, + "grad_norm": 6.46875, + "learning_rate": 3.9059806067105985e-06, + "loss": 1.6243679, + "memory(GiB)": 117.38, + "step": 46650, + "train_speed(iter/s)": 1.637597 + }, + { + "acc": 0.65542622, + "epoch": 1.1835362760020294, + "grad_norm": 6.5625, + "learning_rate": 3.904957416448744e-06, + "loss": 1.64733887, + "memory(GiB)": 117.38, + "step": 46655, + "train_speed(iter/s)": 1.637615 + }, + { + "acc": 0.66609335, + "epoch": 1.1836631151699644, + "grad_norm": 6.21875, + "learning_rate": 3.903934274351753e-06, + "loss": 1.58386116, + "memory(GiB)": 117.38, + "step": 46660, + "train_speed(iter/s)": 1.637633 + }, + { + "acc": 0.66952004, + "epoch": 1.1837899543378996, + "grad_norm": 6.09375, + "learning_rate": 3.9029111804646245e-06, + "loss": 1.5711689, + "memory(GiB)": 117.38, + "step": 46665, + "train_speed(iter/s)": 1.637651 + }, + { + "acc": 0.65237675, + "epoch": 1.1839167935058346, + "grad_norm": 6.0, + "learning_rate": 3.9018881348323626e-06, + "loss": 1.60015182, + "memory(GiB)": 117.38, + "step": 46670, + "train_speed(iter/s)": 1.637671 + }, + { + "acc": 0.66778841, + "epoch": 1.1840436326737698, + "grad_norm": 4.71875, + "learning_rate": 3.9008651374999615e-06, + "loss": 1.56850548, + "memory(GiB)": 117.38, + "step": 46675, + "train_speed(iter/s)": 1.637689 + }, + { + "acc": 0.65507216, + "epoch": 1.1841704718417048, + "grad_norm": 5.28125, + "learning_rate": 3.899842188512419e-06, + "loss": 1.59441872, + "memory(GiB)": 117.38, + "step": 46680, + "train_speed(iter/s)": 1.637706 + }, + { + "acc": 0.65663738, + "epoch": 1.1842973110096398, + "grad_norm": 4.75, + "learning_rate": 3.898819287914729e-06, + "loss": 1.56393642, + "memory(GiB)": 117.38, + "step": 46685, + "train_speed(iter/s)": 1.637725 + }, + { + "acc": 0.65431476, + "epoch": 1.1844241501775747, + "grad_norm": 6.0625, + "learning_rate": 3.897796435751885e-06, + "loss": 1.6180912, + "memory(GiB)": 117.38, + "step": 46690, + "train_speed(iter/s)": 1.637743 + }, + { + "acc": 0.6413754, + "epoch": 1.18455098934551, + "grad_norm": 6.375, + "learning_rate": 3.896773632068873e-06, + "loss": 1.65171318, + "memory(GiB)": 117.38, + "step": 46695, + "train_speed(iter/s)": 1.637761 + }, + { + "acc": 0.65373116, + "epoch": 1.184677828513445, + "grad_norm": 5.71875, + "learning_rate": 3.8957508769106825e-06, + "loss": 1.61337585, + "memory(GiB)": 117.38, + "step": 46700, + "train_speed(iter/s)": 1.63778 + }, + { + "acc": 0.66132746, + "epoch": 1.18480466768138, + "grad_norm": 4.8125, + "learning_rate": 3.894728170322298e-06, + "loss": 1.61730728, + "memory(GiB)": 117.38, + "step": 46705, + "train_speed(iter/s)": 1.637798 + }, + { + "acc": 0.67446327, + "epoch": 1.1849315068493151, + "grad_norm": 5.65625, + "learning_rate": 3.893705512348705e-06, + "loss": 1.5165102, + "memory(GiB)": 117.38, + "step": 46710, + "train_speed(iter/s)": 1.637817 + }, + { + "acc": 0.6756988, + "epoch": 1.1850583460172501, + "grad_norm": 4.6875, + "learning_rate": 3.89268290303488e-06, + "loss": 1.5932478, + "memory(GiB)": 117.38, + "step": 46715, + "train_speed(iter/s)": 1.637833 + }, + { + "acc": 0.66301041, + "epoch": 1.1851851851851851, + "grad_norm": 5.125, + "learning_rate": 3.891660342425807e-06, + "loss": 1.5150445, + "memory(GiB)": 117.38, + "step": 46720, + "train_speed(iter/s)": 1.637851 + }, + { + "acc": 0.67786813, + "epoch": 1.1853120243531203, + "grad_norm": 6.75, + "learning_rate": 3.890637830566459e-06, + "loss": 1.50318928, + "memory(GiB)": 117.38, + "step": 46725, + "train_speed(iter/s)": 1.63787 + }, + { + "acc": 0.65086222, + "epoch": 1.1854388635210553, + "grad_norm": 7.375, + "learning_rate": 3.889615367501815e-06, + "loss": 1.64343834, + "memory(GiB)": 117.38, + "step": 46730, + "train_speed(iter/s)": 1.637887 + }, + { + "acc": 0.66089849, + "epoch": 1.1855657026889903, + "grad_norm": 4.84375, + "learning_rate": 3.888592953276842e-06, + "loss": 1.59469709, + "memory(GiB)": 117.38, + "step": 46735, + "train_speed(iter/s)": 1.637904 + }, + { + "acc": 0.67004609, + "epoch": 1.1856925418569255, + "grad_norm": 5.0, + "learning_rate": 3.8875705879365135e-06, + "loss": 1.50744762, + "memory(GiB)": 117.38, + "step": 46740, + "train_speed(iter/s)": 1.637922 + }, + { + "acc": 0.65864906, + "epoch": 1.1858193810248605, + "grad_norm": 6.15625, + "learning_rate": 3.886548271525797e-06, + "loss": 1.55310087, + "memory(GiB)": 117.38, + "step": 46745, + "train_speed(iter/s)": 1.637941 + }, + { + "acc": 0.66627016, + "epoch": 1.1859462201927955, + "grad_norm": 6.78125, + "learning_rate": 3.88552600408966e-06, + "loss": 1.53203316, + "memory(GiB)": 117.38, + "step": 46750, + "train_speed(iter/s)": 1.63796 + }, + { + "acc": 0.65460253, + "epoch": 1.1860730593607305, + "grad_norm": 6.53125, + "learning_rate": 3.8845037856730646e-06, + "loss": 1.66312294, + "memory(GiB)": 117.38, + "step": 46755, + "train_speed(iter/s)": 1.637979 + }, + { + "acc": 0.6571754, + "epoch": 1.1861998985286657, + "grad_norm": 5.71875, + "learning_rate": 3.883481616320972e-06, + "loss": 1.65006809, + "memory(GiB)": 117.38, + "step": 46760, + "train_speed(iter/s)": 1.637998 + }, + { + "acc": 0.66273699, + "epoch": 1.1863267376966007, + "grad_norm": 6.0625, + "learning_rate": 3.882459496078343e-06, + "loss": 1.60198097, + "memory(GiB)": 117.38, + "step": 46765, + "train_speed(iter/s)": 1.638015 + }, + { + "acc": 0.66143765, + "epoch": 1.1864535768645357, + "grad_norm": 7.71875, + "learning_rate": 3.881437424990137e-06, + "loss": 1.5615799, + "memory(GiB)": 117.38, + "step": 46770, + "train_speed(iter/s)": 1.638033 + }, + { + "acc": 0.65304499, + "epoch": 1.1865804160324709, + "grad_norm": 5.96875, + "learning_rate": 3.880415403101304e-06, + "loss": 1.60328407, + "memory(GiB)": 117.38, + "step": 46775, + "train_speed(iter/s)": 1.638052 + }, + { + "acc": 0.67337389, + "epoch": 1.1867072552004059, + "grad_norm": 5.96875, + "learning_rate": 3.879393430456801e-06, + "loss": 1.50395489, + "memory(GiB)": 117.38, + "step": 46780, + "train_speed(iter/s)": 1.638067 + }, + { + "acc": 0.65592957, + "epoch": 1.1868340943683409, + "grad_norm": 5.28125, + "learning_rate": 3.87837150710158e-06, + "loss": 1.63608093, + "memory(GiB)": 117.38, + "step": 46785, + "train_speed(iter/s)": 1.638085 + }, + { + "acc": 0.66808338, + "epoch": 1.186960933536276, + "grad_norm": 6.46875, + "learning_rate": 3.877349633080587e-06, + "loss": 1.50783844, + "memory(GiB)": 117.38, + "step": 46790, + "train_speed(iter/s)": 1.638104 + }, + { + "acc": 0.66317434, + "epoch": 1.187087772704211, + "grad_norm": 5.84375, + "learning_rate": 3.876327808438767e-06, + "loss": 1.61880226, + "memory(GiB)": 117.38, + "step": 46795, + "train_speed(iter/s)": 1.638121 + }, + { + "acc": 0.66592922, + "epoch": 1.187214611872146, + "grad_norm": 5.625, + "learning_rate": 3.875306033221069e-06, + "loss": 1.53188953, + "memory(GiB)": 117.38, + "step": 46800, + "train_speed(iter/s)": 1.638141 + }, + { + "acc": 0.64452128, + "epoch": 1.1873414510400813, + "grad_norm": 6.0625, + "learning_rate": 3.874284307472432e-06, + "loss": 1.66028233, + "memory(GiB)": 117.38, + "step": 46805, + "train_speed(iter/s)": 1.638158 + }, + { + "acc": 0.6567986, + "epoch": 1.1874682902080163, + "grad_norm": 5.40625, + "learning_rate": 3.873262631237799e-06, + "loss": 1.63864193, + "memory(GiB)": 117.38, + "step": 46810, + "train_speed(iter/s)": 1.638176 + }, + { + "acc": 0.64220586, + "epoch": 1.1875951293759512, + "grad_norm": 5.6875, + "learning_rate": 3.872241004562105e-06, + "loss": 1.65563698, + "memory(GiB)": 117.38, + "step": 46815, + "train_speed(iter/s)": 1.638195 + }, + { + "acc": 0.67266512, + "epoch": 1.1877219685438862, + "grad_norm": 7.75, + "learning_rate": 3.871219427490285e-06, + "loss": 1.54418936, + "memory(GiB)": 117.38, + "step": 46820, + "train_speed(iter/s)": 1.638215 + }, + { + "acc": 0.64564209, + "epoch": 1.1878488077118214, + "grad_norm": 6.75, + "learning_rate": 3.870197900067276e-06, + "loss": 1.64141026, + "memory(GiB)": 117.38, + "step": 46825, + "train_speed(iter/s)": 1.638234 + }, + { + "acc": 0.65323009, + "epoch": 1.1879756468797564, + "grad_norm": 5.71875, + "learning_rate": 3.869176422338009e-06, + "loss": 1.5795701, + "memory(GiB)": 117.38, + "step": 46830, + "train_speed(iter/s)": 1.638253 + }, + { + "acc": 0.6664484, + "epoch": 1.1881024860476916, + "grad_norm": 7.84375, + "learning_rate": 3.868154994347409e-06, + "loss": 1.59463205, + "memory(GiB)": 117.38, + "step": 46835, + "train_speed(iter/s)": 1.638271 + }, + { + "acc": 0.67412996, + "epoch": 1.1882293252156266, + "grad_norm": 6.875, + "learning_rate": 3.867133616140406e-06, + "loss": 1.53573093, + "memory(GiB)": 117.38, + "step": 46840, + "train_speed(iter/s)": 1.63829 + }, + { + "acc": 0.66909733, + "epoch": 1.1883561643835616, + "grad_norm": 6.1875, + "learning_rate": 3.866112287761926e-06, + "loss": 1.63225441, + "memory(GiB)": 117.38, + "step": 46845, + "train_speed(iter/s)": 1.638307 + }, + { + "acc": 0.6454906, + "epoch": 1.1884830035514966, + "grad_norm": 6.65625, + "learning_rate": 3.86509100925689e-06, + "loss": 1.62626801, + "memory(GiB)": 117.38, + "step": 46850, + "train_speed(iter/s)": 1.638326 + }, + { + "acc": 0.65258489, + "epoch": 1.1886098427194318, + "grad_norm": 5.28125, + "learning_rate": 3.8640697806702166e-06, + "loss": 1.58213806, + "memory(GiB)": 117.38, + "step": 46855, + "train_speed(iter/s)": 1.638344 + }, + { + "acc": 0.65415382, + "epoch": 1.1887366818873668, + "grad_norm": 6.34375, + "learning_rate": 3.8630486020468265e-06, + "loss": 1.64464989, + "memory(GiB)": 117.38, + "step": 46860, + "train_speed(iter/s)": 1.638362 + }, + { + "acc": 0.65757532, + "epoch": 1.1888635210553018, + "grad_norm": 5.875, + "learning_rate": 3.862027473431634e-06, + "loss": 1.59540005, + "memory(GiB)": 117.38, + "step": 46865, + "train_speed(iter/s)": 1.638381 + }, + { + "acc": 0.66706028, + "epoch": 1.188990360223237, + "grad_norm": 7.4375, + "learning_rate": 3.861006394869558e-06, + "loss": 1.55304089, + "memory(GiB)": 117.38, + "step": 46870, + "train_speed(iter/s)": 1.638401 + }, + { + "acc": 0.64790449, + "epoch": 1.189117199391172, + "grad_norm": 5.21875, + "learning_rate": 3.859985366405502e-06, + "loss": 1.63214626, + "memory(GiB)": 117.38, + "step": 46875, + "train_speed(iter/s)": 1.63842 + }, + { + "acc": 0.65056038, + "epoch": 1.189244038559107, + "grad_norm": 4.6875, + "learning_rate": 3.85896438808438e-06, + "loss": 1.59073887, + "memory(GiB)": 117.38, + "step": 46880, + "train_speed(iter/s)": 1.638439 + }, + { + "acc": 0.64938898, + "epoch": 1.1893708777270422, + "grad_norm": 6.03125, + "learning_rate": 3.857943459951099e-06, + "loss": 1.60665169, + "memory(GiB)": 117.38, + "step": 46885, + "train_speed(iter/s)": 1.638456 + }, + { + "acc": 0.65199142, + "epoch": 1.1894977168949772, + "grad_norm": 6.28125, + "learning_rate": 3.856922582050565e-06, + "loss": 1.63446999, + "memory(GiB)": 117.38, + "step": 46890, + "train_speed(iter/s)": 1.638475 + }, + { + "acc": 0.67521443, + "epoch": 1.1896245560629122, + "grad_norm": 5.8125, + "learning_rate": 3.855901754427678e-06, + "loss": 1.48979893, + "memory(GiB)": 117.38, + "step": 46895, + "train_speed(iter/s)": 1.638492 + }, + { + "acc": 0.63665633, + "epoch": 1.1897513952308474, + "grad_norm": 6.09375, + "learning_rate": 3.854880977127339e-06, + "loss": 1.68439789, + "memory(GiB)": 117.38, + "step": 46900, + "train_speed(iter/s)": 1.63851 + }, + { + "acc": 0.65221219, + "epoch": 1.1898782343987824, + "grad_norm": 5.84375, + "learning_rate": 3.8538602501944475e-06, + "loss": 1.55919418, + "memory(GiB)": 117.38, + "step": 46905, + "train_speed(iter/s)": 1.638528 + }, + { + "acc": 0.65973887, + "epoch": 1.1900050735667174, + "grad_norm": 7.0625, + "learning_rate": 3.852839573673902e-06, + "loss": 1.58765039, + "memory(GiB)": 117.38, + "step": 46910, + "train_speed(iter/s)": 1.638546 + }, + { + "acc": 0.66354303, + "epoch": 1.1901319127346524, + "grad_norm": 5.46875, + "learning_rate": 3.851818947610591e-06, + "loss": 1.55327473, + "memory(GiB)": 117.38, + "step": 46915, + "train_speed(iter/s)": 1.638565 + }, + { + "acc": 0.65805817, + "epoch": 1.1902587519025876, + "grad_norm": 6.0625, + "learning_rate": 3.850798372049409e-06, + "loss": 1.57910995, + "memory(GiB)": 117.38, + "step": 46920, + "train_speed(iter/s)": 1.638583 + }, + { + "acc": 0.66380262, + "epoch": 1.1903855910705226, + "grad_norm": 5.21875, + "learning_rate": 3.849777847035246e-06, + "loss": 1.55484943, + "memory(GiB)": 117.38, + "step": 46925, + "train_speed(iter/s)": 1.638602 + }, + { + "acc": 0.67047691, + "epoch": 1.1905124302384575, + "grad_norm": 6.0, + "learning_rate": 3.84875737261299e-06, + "loss": 1.49261589, + "memory(GiB)": 117.38, + "step": 46930, + "train_speed(iter/s)": 1.638621 + }, + { + "acc": 0.65380487, + "epoch": 1.1906392694063928, + "grad_norm": 8.3125, + "learning_rate": 3.847736948827523e-06, + "loss": 1.64001846, + "memory(GiB)": 117.38, + "step": 46935, + "train_speed(iter/s)": 1.638638 + }, + { + "acc": 0.66836205, + "epoch": 1.1907661085743277, + "grad_norm": 6.375, + "learning_rate": 3.846716575723729e-06, + "loss": 1.54684496, + "memory(GiB)": 117.38, + "step": 46940, + "train_speed(iter/s)": 1.638657 + }, + { + "acc": 0.65215092, + "epoch": 1.1908929477422627, + "grad_norm": 4.78125, + "learning_rate": 3.845696253346489e-06, + "loss": 1.63874702, + "memory(GiB)": 117.38, + "step": 46945, + "train_speed(iter/s)": 1.638676 + }, + { + "acc": 0.66589842, + "epoch": 1.191019786910198, + "grad_norm": 5.59375, + "learning_rate": 3.8446759817406835e-06, + "loss": 1.58005333, + "memory(GiB)": 117.38, + "step": 46950, + "train_speed(iter/s)": 1.638692 + }, + { + "acc": 0.66307163, + "epoch": 1.191146626078133, + "grad_norm": 5.03125, + "learning_rate": 3.8436557609511856e-06, + "loss": 1.61401005, + "memory(GiB)": 117.38, + "step": 46955, + "train_speed(iter/s)": 1.638711 + }, + { + "acc": 0.67856674, + "epoch": 1.191273465246068, + "grad_norm": 5.625, + "learning_rate": 3.842635591022869e-06, + "loss": 1.45879087, + "memory(GiB)": 117.38, + "step": 46960, + "train_speed(iter/s)": 1.638729 + }, + { + "acc": 0.6582922, + "epoch": 1.1914003044140031, + "grad_norm": 7.0, + "learning_rate": 3.8416154720006065e-06, + "loss": 1.60393963, + "memory(GiB)": 117.38, + "step": 46965, + "train_speed(iter/s)": 1.638747 + }, + { + "acc": 0.66006708, + "epoch": 1.1915271435819381, + "grad_norm": 4.8125, + "learning_rate": 3.840595403929269e-06, + "loss": 1.58464184, + "memory(GiB)": 117.38, + "step": 46970, + "train_speed(iter/s)": 1.638765 + }, + { + "acc": 0.642693, + "epoch": 1.191653982749873, + "grad_norm": 5.9375, + "learning_rate": 3.839575386853721e-06, + "loss": 1.57218618, + "memory(GiB)": 117.38, + "step": 46975, + "train_speed(iter/s)": 1.638784 + }, + { + "acc": 0.65564809, + "epoch": 1.191780821917808, + "grad_norm": 5.21875, + "learning_rate": 3.838555420818827e-06, + "loss": 1.55778847, + "memory(GiB)": 117.38, + "step": 46980, + "train_speed(iter/s)": 1.638802 + }, + { + "acc": 0.66052308, + "epoch": 1.1919076610857433, + "grad_norm": 5.84375, + "learning_rate": 3.837535505869453e-06, + "loss": 1.58639746, + "memory(GiB)": 117.38, + "step": 46985, + "train_speed(iter/s)": 1.638821 + }, + { + "acc": 0.65801759, + "epoch": 1.1920345002536783, + "grad_norm": 5.9375, + "learning_rate": 3.836515642050458e-06, + "loss": 1.56650105, + "memory(GiB)": 117.38, + "step": 46990, + "train_speed(iter/s)": 1.638838 + }, + { + "acc": 0.63576598, + "epoch": 1.1921613394216135, + "grad_norm": 5.09375, + "learning_rate": 3.835495829406698e-06, + "loss": 1.61660118, + "memory(GiB)": 117.38, + "step": 46995, + "train_speed(iter/s)": 1.638857 + }, + { + "acc": 0.66022162, + "epoch": 1.1922881785895485, + "grad_norm": 6.125, + "learning_rate": 3.834476067983031e-06, + "loss": 1.60717754, + "memory(GiB)": 117.38, + "step": 47000, + "train_speed(iter/s)": 1.638875 + }, + { + "epoch": 1.1922881785895485, + "eval_acc": 0.6462300853908369, + "eval_loss": 1.5734953880310059, + "eval_runtime": 58.9324, + "eval_samples_per_second": 108.09, + "eval_steps_per_second": 27.031, + "step": 47000 + }, + { + "acc": 0.65335493, + "epoch": 1.1924150177574835, + "grad_norm": 5.59375, + "learning_rate": 3.83345635782431e-06, + "loss": 1.66574478, + "memory(GiB)": 117.38, + "step": 47005, + "train_speed(iter/s)": 1.635285 + }, + { + "acc": 0.6601388, + "epoch": 1.1925418569254185, + "grad_norm": 6.71875, + "learning_rate": 3.832436698975388e-06, + "loss": 1.61326675, + "memory(GiB)": 117.38, + "step": 47010, + "train_speed(iter/s)": 1.635303 + }, + { + "acc": 0.67149887, + "epoch": 1.1926686960933537, + "grad_norm": 5.75, + "learning_rate": 3.831417091481111e-06, + "loss": 1.53228226, + "memory(GiB)": 117.38, + "step": 47015, + "train_speed(iter/s)": 1.635322 + }, + { + "acc": 0.67479167, + "epoch": 1.1927955352612887, + "grad_norm": 5.3125, + "learning_rate": 3.830397535386328e-06, + "loss": 1.56198921, + "memory(GiB)": 117.38, + "step": 47020, + "train_speed(iter/s)": 1.635339 + }, + { + "acc": 0.65695648, + "epoch": 1.1929223744292237, + "grad_norm": 6.34375, + "learning_rate": 3.829378030735883e-06, + "loss": 1.61739349, + "memory(GiB)": 117.38, + "step": 47025, + "train_speed(iter/s)": 1.635357 + }, + { + "acc": 0.66081076, + "epoch": 1.1930492135971589, + "grad_norm": 5.125, + "learning_rate": 3.82835857757462e-06, + "loss": 1.59440498, + "memory(GiB)": 117.38, + "step": 47030, + "train_speed(iter/s)": 1.635375 + }, + { + "acc": 0.65916214, + "epoch": 1.1931760527650939, + "grad_norm": 4.59375, + "learning_rate": 3.827339175947378e-06, + "loss": 1.620998, + "memory(GiB)": 117.38, + "step": 47035, + "train_speed(iter/s)": 1.635393 + }, + { + "acc": 0.65532503, + "epoch": 1.1933028919330289, + "grad_norm": 6.0625, + "learning_rate": 3.826319825898992e-06, + "loss": 1.55248051, + "memory(GiB)": 117.38, + "step": 47040, + "train_speed(iter/s)": 1.63541 + }, + { + "acc": 0.66407051, + "epoch": 1.193429731100964, + "grad_norm": 5.90625, + "learning_rate": 3.825300527474302e-06, + "loss": 1.58592749, + "memory(GiB)": 117.38, + "step": 47045, + "train_speed(iter/s)": 1.635428 + }, + { + "acc": 0.65257359, + "epoch": 1.193556570268899, + "grad_norm": 6.40625, + "learning_rate": 3.824281280718141e-06, + "loss": 1.65087433, + "memory(GiB)": 117.38, + "step": 47050, + "train_speed(iter/s)": 1.635445 + }, + { + "acc": 0.65596094, + "epoch": 1.193683409436834, + "grad_norm": 7.65625, + "learning_rate": 3.823262085675337e-06, + "loss": 1.59080009, + "memory(GiB)": 117.38, + "step": 47055, + "train_speed(iter/s)": 1.635463 + }, + { + "acc": 0.64660559, + "epoch": 1.1938102486047693, + "grad_norm": 6.0, + "learning_rate": 3.822242942390718e-06, + "loss": 1.64604568, + "memory(GiB)": 117.38, + "step": 47060, + "train_speed(iter/s)": 1.635481 + }, + { + "acc": 0.66204052, + "epoch": 1.1939370877727042, + "grad_norm": 5.65625, + "learning_rate": 3.821223850909115e-06, + "loss": 1.60249252, + "memory(GiB)": 117.38, + "step": 47065, + "train_speed(iter/s)": 1.635498 + }, + { + "acc": 0.65209904, + "epoch": 1.1940639269406392, + "grad_norm": 6.125, + "learning_rate": 3.820204811275351e-06, + "loss": 1.63516426, + "memory(GiB)": 117.38, + "step": 47070, + "train_speed(iter/s)": 1.635517 + }, + { + "acc": 0.66477499, + "epoch": 1.1941907661085742, + "grad_norm": 5.1875, + "learning_rate": 3.8191858235342446e-06, + "loss": 1.61611938, + "memory(GiB)": 117.38, + "step": 47075, + "train_speed(iter/s)": 1.635535 + }, + { + "acc": 0.64243298, + "epoch": 1.1943176052765094, + "grad_norm": 7.09375, + "learning_rate": 3.818166887730618e-06, + "loss": 1.63458061, + "memory(GiB)": 117.38, + "step": 47080, + "train_speed(iter/s)": 1.635554 + }, + { + "acc": 0.64643278, + "epoch": 1.1944444444444444, + "grad_norm": 5.28125, + "learning_rate": 3.817148003909288e-06, + "loss": 1.67639561, + "memory(GiB)": 117.38, + "step": 47085, + "train_speed(iter/s)": 1.635573 + }, + { + "acc": 0.65950713, + "epoch": 1.1945712836123794, + "grad_norm": 5.21875, + "learning_rate": 3.816129172115073e-06, + "loss": 1.58476839, + "memory(GiB)": 117.38, + "step": 47090, + "train_speed(iter/s)": 1.635593 + }, + { + "acc": 0.64445076, + "epoch": 1.1946981227803146, + "grad_norm": 6.3125, + "learning_rate": 3.815110392392778e-06, + "loss": 1.62359428, + "memory(GiB)": 117.38, + "step": 47095, + "train_speed(iter/s)": 1.635611 + }, + { + "acc": 0.6504828, + "epoch": 1.1948249619482496, + "grad_norm": 5.40625, + "learning_rate": 3.8140916647872204e-06, + "loss": 1.59726362, + "memory(GiB)": 117.38, + "step": 47100, + "train_speed(iter/s)": 1.63563 + }, + { + "acc": 0.64806466, + "epoch": 1.1949518011161846, + "grad_norm": 6.84375, + "learning_rate": 3.813072989343205e-06, + "loss": 1.63519783, + "memory(GiB)": 117.38, + "step": 47105, + "train_speed(iter/s)": 1.635648 + }, + { + "acc": 0.65763292, + "epoch": 1.1950786402841198, + "grad_norm": 5.28125, + "learning_rate": 3.812054366105541e-06, + "loss": 1.59549179, + "memory(GiB)": 117.38, + "step": 47110, + "train_speed(iter/s)": 1.635667 + }, + { + "acc": 0.66659985, + "epoch": 1.1952054794520548, + "grad_norm": 5.6875, + "learning_rate": 3.8110357951190284e-06, + "loss": 1.49302597, + "memory(GiB)": 117.38, + "step": 47115, + "train_speed(iter/s)": 1.635686 + }, + { + "acc": 0.62299747, + "epoch": 1.1953323186199898, + "grad_norm": 4.96875, + "learning_rate": 3.8100172764284694e-06, + "loss": 1.68569717, + "memory(GiB)": 117.38, + "step": 47120, + "train_speed(iter/s)": 1.635704 + }, + { + "acc": 0.67369828, + "epoch": 1.195459157787925, + "grad_norm": 5.5, + "learning_rate": 3.8089988100786635e-06, + "loss": 1.56372166, + "memory(GiB)": 117.38, + "step": 47125, + "train_speed(iter/s)": 1.635722 + }, + { + "acc": 0.65630674, + "epoch": 1.19558599695586, + "grad_norm": 6.875, + "learning_rate": 3.807980396114409e-06, + "loss": 1.55270224, + "memory(GiB)": 117.38, + "step": 47130, + "train_speed(iter/s)": 1.635742 + }, + { + "acc": 0.64401217, + "epoch": 1.195712836123795, + "grad_norm": 5.4375, + "learning_rate": 3.8069620345804974e-06, + "loss": 1.60807705, + "memory(GiB)": 117.38, + "step": 47135, + "train_speed(iter/s)": 1.63576 + }, + { + "acc": 0.65233974, + "epoch": 1.19583967529173, + "grad_norm": 5.46875, + "learning_rate": 3.8059437255217214e-06, + "loss": 1.65920963, + "memory(GiB)": 117.38, + "step": 47140, + "train_speed(iter/s)": 1.635779 + }, + { + "acc": 0.66710558, + "epoch": 1.1959665144596652, + "grad_norm": 5.5625, + "learning_rate": 3.8049254689828723e-06, + "loss": 1.55732212, + "memory(GiB)": 117.38, + "step": 47145, + "train_speed(iter/s)": 1.635798 + }, + { + "acc": 0.65564604, + "epoch": 1.1960933536276002, + "grad_norm": 6.4375, + "learning_rate": 3.8039072650087377e-06, + "loss": 1.58524265, + "memory(GiB)": 117.38, + "step": 47150, + "train_speed(iter/s)": 1.635817 + }, + { + "acc": 0.65488806, + "epoch": 1.1962201927955354, + "grad_norm": 4.96875, + "learning_rate": 3.8028891136440994e-06, + "loss": 1.55358696, + "memory(GiB)": 117.38, + "step": 47155, + "train_speed(iter/s)": 1.635837 + }, + { + "acc": 0.64907103, + "epoch": 1.1963470319634704, + "grad_norm": 4.90625, + "learning_rate": 3.801871014933744e-06, + "loss": 1.58658066, + "memory(GiB)": 117.38, + "step": 47160, + "train_speed(iter/s)": 1.635855 + }, + { + "acc": 0.64994564, + "epoch": 1.1964738711314054, + "grad_norm": 6.21875, + "learning_rate": 3.8008529689224493e-06, + "loss": 1.60585403, + "memory(GiB)": 117.38, + "step": 47165, + "train_speed(iter/s)": 1.635873 + }, + { + "acc": 0.64978976, + "epoch": 1.1966007102993403, + "grad_norm": 6.28125, + "learning_rate": 3.7998349756549974e-06, + "loss": 1.60660477, + "memory(GiB)": 117.38, + "step": 47170, + "train_speed(iter/s)": 1.635892 + }, + { + "acc": 0.63557205, + "epoch": 1.1967275494672756, + "grad_norm": 5.5625, + "learning_rate": 3.79881703517616e-06, + "loss": 1.74159088, + "memory(GiB)": 117.38, + "step": 47175, + "train_speed(iter/s)": 1.63591 + }, + { + "acc": 0.66288357, + "epoch": 1.1968543886352105, + "grad_norm": 6.65625, + "learning_rate": 3.797799147530713e-06, + "loss": 1.61801758, + "memory(GiB)": 117.38, + "step": 47180, + "train_speed(iter/s)": 1.635929 + }, + { + "acc": 0.65557961, + "epoch": 1.1969812278031455, + "grad_norm": 6.125, + "learning_rate": 3.796781312763425e-06, + "loss": 1.5821619, + "memory(GiB)": 117.38, + "step": 47185, + "train_speed(iter/s)": 1.635948 + }, + { + "acc": 0.65856256, + "epoch": 1.1971080669710807, + "grad_norm": 5.5, + "learning_rate": 3.79576353091907e-06, + "loss": 1.50847769, + "memory(GiB)": 117.38, + "step": 47190, + "train_speed(iter/s)": 1.635966 + }, + { + "acc": 0.67521152, + "epoch": 1.1972349061390157, + "grad_norm": 6.21875, + "learning_rate": 3.7947458020424094e-06, + "loss": 1.56782303, + "memory(GiB)": 117.38, + "step": 47195, + "train_speed(iter/s)": 1.635986 + }, + { + "acc": 0.65726056, + "epoch": 1.1973617453069507, + "grad_norm": 4.875, + "learning_rate": 3.793728126178209e-06, + "loss": 1.603899, + "memory(GiB)": 117.38, + "step": 47200, + "train_speed(iter/s)": 1.636001 + }, + { + "acc": 0.66021533, + "epoch": 1.197488584474886, + "grad_norm": 6.0, + "learning_rate": 3.792710503371232e-06, + "loss": 1.52472906, + "memory(GiB)": 117.38, + "step": 47205, + "train_speed(iter/s)": 1.636019 + }, + { + "acc": 0.65039697, + "epoch": 1.197615423642821, + "grad_norm": 6.03125, + "learning_rate": 3.7916929336662386e-06, + "loss": 1.72632256, + "memory(GiB)": 117.38, + "step": 47210, + "train_speed(iter/s)": 1.636038 + }, + { + "acc": 0.65964088, + "epoch": 1.197742262810756, + "grad_norm": 5.53125, + "learning_rate": 3.790675417107982e-06, + "loss": 1.57490911, + "memory(GiB)": 117.38, + "step": 47215, + "train_speed(iter/s)": 1.636055 + }, + { + "acc": 0.65741148, + "epoch": 1.1978691019786911, + "grad_norm": 6.125, + "learning_rate": 3.7896579537412213e-06, + "loss": 1.56715117, + "memory(GiB)": 117.38, + "step": 47220, + "train_speed(iter/s)": 1.636075 + }, + { + "acc": 0.63056674, + "epoch": 1.197995941146626, + "grad_norm": 5.375, + "learning_rate": 3.7886405436107076e-06, + "loss": 1.69390068, + "memory(GiB)": 117.38, + "step": 47225, + "train_speed(iter/s)": 1.636095 + }, + { + "acc": 0.64632359, + "epoch": 1.198122780314561, + "grad_norm": 5.875, + "learning_rate": 3.7876231867611917e-06, + "loss": 1.61246586, + "memory(GiB)": 117.38, + "step": 47230, + "train_speed(iter/s)": 1.636112 + }, + { + "acc": 0.64396238, + "epoch": 1.198249619482496, + "grad_norm": 6.46875, + "learning_rate": 3.7866058832374197e-06, + "loss": 1.65640335, + "memory(GiB)": 117.38, + "step": 47235, + "train_speed(iter/s)": 1.63613 + }, + { + "acc": 0.65456672, + "epoch": 1.1983764586504313, + "grad_norm": 5.65625, + "learning_rate": 3.7855886330841383e-06, + "loss": 1.64422455, + "memory(GiB)": 117.38, + "step": 47240, + "train_speed(iter/s)": 1.636149 + }, + { + "acc": 0.65773706, + "epoch": 1.1985032978183663, + "grad_norm": 6.40625, + "learning_rate": 3.7845714363460908e-06, + "loss": 1.60695267, + "memory(GiB)": 117.38, + "step": 47245, + "train_speed(iter/s)": 1.636167 + }, + { + "acc": 0.66238337, + "epoch": 1.1986301369863013, + "grad_norm": 5.125, + "learning_rate": 3.78355429306802e-06, + "loss": 1.60711021, + "memory(GiB)": 117.38, + "step": 47250, + "train_speed(iter/s)": 1.636184 + }, + { + "acc": 0.66299458, + "epoch": 1.1987569761542365, + "grad_norm": 5.875, + "learning_rate": 3.7825372032946605e-06, + "loss": 1.55074139, + "memory(GiB)": 117.38, + "step": 47255, + "train_speed(iter/s)": 1.636202 + }, + { + "acc": 0.6659749, + "epoch": 1.1988838153221715, + "grad_norm": 5.5625, + "learning_rate": 3.7815201670707502e-06, + "loss": 1.58210211, + "memory(GiB)": 117.38, + "step": 47260, + "train_speed(iter/s)": 1.636221 + }, + { + "acc": 0.64726868, + "epoch": 1.1990106544901065, + "grad_norm": 6.21875, + "learning_rate": 3.7805031844410235e-06, + "loss": 1.67704773, + "memory(GiB)": 117.38, + "step": 47265, + "train_speed(iter/s)": 1.636239 + }, + { + "acc": 0.66102352, + "epoch": 1.1991374936580417, + "grad_norm": 5.21875, + "learning_rate": 3.7794862554502126e-06, + "loss": 1.55312347, + "memory(GiB)": 117.38, + "step": 47270, + "train_speed(iter/s)": 1.636256 + }, + { + "acc": 0.65674496, + "epoch": 1.1992643328259767, + "grad_norm": 6.59375, + "learning_rate": 3.778469380143045e-06, + "loss": 1.56913891, + "memory(GiB)": 117.38, + "step": 47275, + "train_speed(iter/s)": 1.636275 + }, + { + "acc": 0.659342, + "epoch": 1.1993911719939117, + "grad_norm": 5.40625, + "learning_rate": 3.777452558564246e-06, + "loss": 1.56910448, + "memory(GiB)": 117.38, + "step": 47280, + "train_speed(iter/s)": 1.636293 + }, + { + "acc": 0.66164751, + "epoch": 1.1995180111618469, + "grad_norm": 6.125, + "learning_rate": 3.776435790758543e-06, + "loss": 1.5565259, + "memory(GiB)": 117.38, + "step": 47285, + "train_speed(iter/s)": 1.636313 + }, + { + "acc": 0.66742363, + "epoch": 1.1996448503297819, + "grad_norm": 8.125, + "learning_rate": 3.7754190767706577e-06, + "loss": 1.49027596, + "memory(GiB)": 117.38, + "step": 47290, + "train_speed(iter/s)": 1.636332 + }, + { + "acc": 0.65464725, + "epoch": 1.1997716894977168, + "grad_norm": 6.5625, + "learning_rate": 3.774402416645307e-06, + "loss": 1.62291069, + "memory(GiB)": 117.38, + "step": 47295, + "train_speed(iter/s)": 1.63635 + }, + { + "acc": 0.65059195, + "epoch": 1.1998985286656518, + "grad_norm": 5.53125, + "learning_rate": 3.77338581042721e-06, + "loss": 1.61271954, + "memory(GiB)": 117.38, + "step": 47300, + "train_speed(iter/s)": 1.636368 + }, + { + "acc": 0.66629171, + "epoch": 1.200025367833587, + "grad_norm": 6.625, + "learning_rate": 3.7723692581610817e-06, + "loss": 1.62539082, + "memory(GiB)": 117.38, + "step": 47305, + "train_speed(iter/s)": 1.636388 + }, + { + "acc": 0.66174989, + "epoch": 1.200152207001522, + "grad_norm": 6.40625, + "learning_rate": 3.771352759891637e-06, + "loss": 1.67625656, + "memory(GiB)": 117.38, + "step": 47310, + "train_speed(iter/s)": 1.636406 + }, + { + "acc": 0.6528501, + "epoch": 1.2002790461694572, + "grad_norm": 5.71875, + "learning_rate": 3.7703363156635807e-06, + "loss": 1.67193146, + "memory(GiB)": 117.38, + "step": 47315, + "train_speed(iter/s)": 1.636425 + }, + { + "acc": 0.66145048, + "epoch": 1.2004058853373922, + "grad_norm": 5.625, + "learning_rate": 3.769319925521624e-06, + "loss": 1.60974789, + "memory(GiB)": 117.38, + "step": 47320, + "train_speed(iter/s)": 1.636445 + }, + { + "acc": 0.66777153, + "epoch": 1.2005327245053272, + "grad_norm": 6.34375, + "learning_rate": 3.76830358951047e-06, + "loss": 1.59220238, + "memory(GiB)": 117.38, + "step": 47325, + "train_speed(iter/s)": 1.636464 + }, + { + "acc": 0.65019341, + "epoch": 1.2006595636732622, + "grad_norm": 6.59375, + "learning_rate": 3.767287307674826e-06, + "loss": 1.54215317, + "memory(GiB)": 117.38, + "step": 47330, + "train_speed(iter/s)": 1.636483 + }, + { + "acc": 0.65536499, + "epoch": 1.2007864028411974, + "grad_norm": 6.0625, + "learning_rate": 3.766271080059389e-06, + "loss": 1.59739056, + "memory(GiB)": 117.38, + "step": 47335, + "train_speed(iter/s)": 1.636501 + }, + { + "acc": 0.65930529, + "epoch": 1.2009132420091324, + "grad_norm": 6.875, + "learning_rate": 3.7652549067088568e-06, + "loss": 1.56609592, + "memory(GiB)": 117.38, + "step": 47340, + "train_speed(iter/s)": 1.63652 + }, + { + "acc": 0.66127963, + "epoch": 1.2010400811770674, + "grad_norm": 7.21875, + "learning_rate": 3.7642387876679275e-06, + "loss": 1.62591248, + "memory(GiB)": 117.38, + "step": 47345, + "train_speed(iter/s)": 1.636538 + }, + { + "acc": 0.65140915, + "epoch": 1.2011669203450026, + "grad_norm": 4.40625, + "learning_rate": 3.7632227229812947e-06, + "loss": 1.60852928, + "memory(GiB)": 117.38, + "step": 47350, + "train_speed(iter/s)": 1.636555 + }, + { + "acc": 0.65252433, + "epoch": 1.2012937595129376, + "grad_norm": 6.65625, + "learning_rate": 3.7622067126936475e-06, + "loss": 1.63069954, + "memory(GiB)": 117.38, + "step": 47355, + "train_speed(iter/s)": 1.636573 + }, + { + "acc": 0.65450258, + "epoch": 1.2014205986808726, + "grad_norm": 5.53125, + "learning_rate": 3.761190756849674e-06, + "loss": 1.57487478, + "memory(GiB)": 117.38, + "step": 47360, + "train_speed(iter/s)": 1.636592 + }, + { + "acc": 0.65323658, + "epoch": 1.2015474378488078, + "grad_norm": 5.96875, + "learning_rate": 3.7601748554940633e-06, + "loss": 1.60449963, + "memory(GiB)": 117.38, + "step": 47365, + "train_speed(iter/s)": 1.636612 + }, + { + "acc": 0.65836024, + "epoch": 1.2016742770167428, + "grad_norm": 5.25, + "learning_rate": 3.7591590086714984e-06, + "loss": 1.52106028, + "memory(GiB)": 117.38, + "step": 47370, + "train_speed(iter/s)": 1.63663 + }, + { + "acc": 0.64723535, + "epoch": 1.2018011161846778, + "grad_norm": 7.78125, + "learning_rate": 3.7581432164266587e-06, + "loss": 1.57000971, + "memory(GiB)": 117.38, + "step": 47375, + "train_speed(iter/s)": 1.636649 + }, + { + "acc": 0.65823426, + "epoch": 1.201927955352613, + "grad_norm": 5.6875, + "learning_rate": 3.7571274788042255e-06, + "loss": 1.58905058, + "memory(GiB)": 117.38, + "step": 47380, + "train_speed(iter/s)": 1.636667 + }, + { + "acc": 0.65346446, + "epoch": 1.202054794520548, + "grad_norm": 5.75, + "learning_rate": 3.756111795848874e-06, + "loss": 1.62489872, + "memory(GiB)": 117.38, + "step": 47385, + "train_speed(iter/s)": 1.636687 + }, + { + "acc": 0.65487256, + "epoch": 1.202181633688483, + "grad_norm": 5.78125, + "learning_rate": 3.755096167605281e-06, + "loss": 1.64551773, + "memory(GiB)": 117.38, + "step": 47390, + "train_speed(iter/s)": 1.636706 + }, + { + "acc": 0.67012811, + "epoch": 1.202308472856418, + "grad_norm": 5.40625, + "learning_rate": 3.7540805941181165e-06, + "loss": 1.53682079, + "memory(GiB)": 117.38, + "step": 47395, + "train_speed(iter/s)": 1.636726 + }, + { + "acc": 0.65593185, + "epoch": 1.2024353120243532, + "grad_norm": 5.28125, + "learning_rate": 3.7530650754320492e-06, + "loss": 1.66485558, + "memory(GiB)": 117.38, + "step": 47400, + "train_speed(iter/s)": 1.636745 + }, + { + "acc": 0.68244362, + "epoch": 1.2025621511922882, + "grad_norm": 5.0, + "learning_rate": 3.752049611591746e-06, + "loss": 1.49088202, + "memory(GiB)": 117.38, + "step": 47405, + "train_speed(iter/s)": 1.636763 + }, + { + "acc": 0.65459929, + "epoch": 1.2026889903602231, + "grad_norm": 5.90625, + "learning_rate": 3.7510342026418756e-06, + "loss": 1.63167744, + "memory(GiB)": 117.38, + "step": 47410, + "train_speed(iter/s)": 1.636782 + }, + { + "acc": 0.65682025, + "epoch": 1.2028158295281584, + "grad_norm": 6.28125, + "learning_rate": 3.7500188486270948e-06, + "loss": 1.58512192, + "memory(GiB)": 117.38, + "step": 47415, + "train_speed(iter/s)": 1.636801 + }, + { + "acc": 0.64577866, + "epoch": 1.2029426686960933, + "grad_norm": 6.15625, + "learning_rate": 3.7490035495920664e-06, + "loss": 1.66989307, + "memory(GiB)": 117.38, + "step": 47420, + "train_speed(iter/s)": 1.63682 + }, + { + "acc": 0.66750875, + "epoch": 1.2030695078640283, + "grad_norm": 5.09375, + "learning_rate": 3.747988305581447e-06, + "loss": 1.54458151, + "memory(GiB)": 117.38, + "step": 47425, + "train_speed(iter/s)": 1.636839 + }, + { + "acc": 0.65450706, + "epoch": 1.2031963470319635, + "grad_norm": 5.71875, + "learning_rate": 3.7469731166398933e-06, + "loss": 1.62785263, + "memory(GiB)": 117.38, + "step": 47430, + "train_speed(iter/s)": 1.636857 + }, + { + "acc": 0.64825401, + "epoch": 1.2033231861998985, + "grad_norm": 5.34375, + "learning_rate": 3.745957982812054e-06, + "loss": 1.63887672, + "memory(GiB)": 117.38, + "step": 47435, + "train_speed(iter/s)": 1.636876 + }, + { + "acc": 0.64772463, + "epoch": 1.2034500253678335, + "grad_norm": 5.40625, + "learning_rate": 3.744942904142582e-06, + "loss": 1.60876656, + "memory(GiB)": 117.38, + "step": 47440, + "train_speed(iter/s)": 1.636894 + }, + { + "acc": 0.67257252, + "epoch": 1.2035768645357687, + "grad_norm": 4.90625, + "learning_rate": 3.743927880676125e-06, + "loss": 1.53730631, + "memory(GiB)": 117.38, + "step": 47445, + "train_speed(iter/s)": 1.636912 + }, + { + "acc": 0.67098913, + "epoch": 1.2037037037037037, + "grad_norm": 4.75, + "learning_rate": 3.742912912457329e-06, + "loss": 1.5400835, + "memory(GiB)": 117.38, + "step": 47450, + "train_speed(iter/s)": 1.63693 + }, + { + "acc": 0.65312138, + "epoch": 1.2038305428716387, + "grad_norm": 5.6875, + "learning_rate": 3.7418979995308336e-06, + "loss": 1.59181395, + "memory(GiB)": 117.38, + "step": 47455, + "train_speed(iter/s)": 1.636949 + }, + { + "acc": 0.65536499, + "epoch": 1.2039573820395737, + "grad_norm": 5.21875, + "learning_rate": 3.740883141941282e-06, + "loss": 1.549055, + "memory(GiB)": 117.38, + "step": 47460, + "train_speed(iter/s)": 1.636966 + }, + { + "acc": 0.66491232, + "epoch": 1.204084221207509, + "grad_norm": 5.6875, + "learning_rate": 3.7398683397333103e-06, + "loss": 1.61014748, + "memory(GiB)": 117.38, + "step": 47465, + "train_speed(iter/s)": 1.636985 + }, + { + "acc": 0.6578342, + "epoch": 1.204211060375444, + "grad_norm": 4.78125, + "learning_rate": 3.7388535929515573e-06, + "loss": 1.55983791, + "memory(GiB)": 117.38, + "step": 47470, + "train_speed(iter/s)": 1.637004 + }, + { + "acc": 0.66928706, + "epoch": 1.204337899543379, + "grad_norm": 6.125, + "learning_rate": 3.737838901640653e-06, + "loss": 1.5119709, + "memory(GiB)": 117.38, + "step": 47475, + "train_speed(iter/s)": 1.637022 + }, + { + "acc": 0.66631021, + "epoch": 1.204464738711314, + "grad_norm": 7.96875, + "learning_rate": 3.736824265845228e-06, + "loss": 1.57373447, + "memory(GiB)": 117.38, + "step": 47480, + "train_speed(iter/s)": 1.63704 + }, + { + "acc": 0.66325426, + "epoch": 1.204591577879249, + "grad_norm": 5.40625, + "learning_rate": 3.7358096856099118e-06, + "loss": 1.57546215, + "memory(GiB)": 117.38, + "step": 47485, + "train_speed(iter/s)": 1.63706 + }, + { + "acc": 0.66052427, + "epoch": 1.204718417047184, + "grad_norm": 5.8125, + "learning_rate": 3.7347951609793315e-06, + "loss": 1.55634375, + "memory(GiB)": 117.38, + "step": 47490, + "train_speed(iter/s)": 1.637079 + }, + { + "acc": 0.64717827, + "epoch": 1.2048452562151193, + "grad_norm": 6.375, + "learning_rate": 3.7337806919981077e-06, + "loss": 1.599263, + "memory(GiB)": 117.38, + "step": 47495, + "train_speed(iter/s)": 1.637097 + }, + { + "acc": 0.66443281, + "epoch": 1.2049720953830543, + "grad_norm": 5.84375, + "learning_rate": 3.732766278710861e-06, + "loss": 1.55201912, + "memory(GiB)": 117.38, + "step": 47500, + "train_speed(iter/s)": 1.637114 + }, + { + "acc": 0.66117811, + "epoch": 1.2050989345509893, + "grad_norm": 5.28125, + "learning_rate": 3.7317519211622123e-06, + "loss": 1.58700237, + "memory(GiB)": 117.38, + "step": 47505, + "train_speed(iter/s)": 1.637133 + }, + { + "acc": 0.65153933, + "epoch": 1.2052257737189245, + "grad_norm": 5.8125, + "learning_rate": 3.7307376193967772e-06, + "loss": 1.63909302, + "memory(GiB)": 117.38, + "step": 47510, + "train_speed(iter/s)": 1.637152 + }, + { + "acc": 0.65395942, + "epoch": 1.2053526128868595, + "grad_norm": 5.90625, + "learning_rate": 3.7297233734591664e-06, + "loss": 1.59631739, + "memory(GiB)": 117.38, + "step": 47515, + "train_speed(iter/s)": 1.637169 + }, + { + "acc": 0.65194464, + "epoch": 1.2054794520547945, + "grad_norm": 6.71875, + "learning_rate": 3.7287091833939948e-06, + "loss": 1.65482063, + "memory(GiB)": 117.38, + "step": 47520, + "train_speed(iter/s)": 1.637186 + }, + { + "acc": 0.67703719, + "epoch": 1.2056062912227297, + "grad_norm": 5.28125, + "learning_rate": 3.7276950492458675e-06, + "loss": 1.51242867, + "memory(GiB)": 117.38, + "step": 47525, + "train_speed(iter/s)": 1.637205 + }, + { + "acc": 0.6592165, + "epoch": 1.2057331303906647, + "grad_norm": 5.34375, + "learning_rate": 3.7266809710593956e-06, + "loss": 1.58366737, + "memory(GiB)": 117.38, + "step": 47530, + "train_speed(iter/s)": 1.637224 + }, + { + "acc": 0.66053681, + "epoch": 1.2058599695585996, + "grad_norm": 6.78125, + "learning_rate": 3.7256669488791763e-06, + "loss": 1.57286053, + "memory(GiB)": 117.38, + "step": 47535, + "train_speed(iter/s)": 1.637243 + }, + { + "acc": 0.66028633, + "epoch": 1.2059868087265349, + "grad_norm": 5.84375, + "learning_rate": 3.7246529827498156e-06, + "loss": 1.51440296, + "memory(GiB)": 117.38, + "step": 47540, + "train_speed(iter/s)": 1.637261 + }, + { + "acc": 0.65863075, + "epoch": 1.2061136478944698, + "grad_norm": 5.4375, + "learning_rate": 3.7236390727159094e-06, + "loss": 1.58927555, + "memory(GiB)": 117.38, + "step": 47545, + "train_speed(iter/s)": 1.63728 + }, + { + "acc": 0.66223278, + "epoch": 1.2062404870624048, + "grad_norm": 5.0625, + "learning_rate": 3.7226252188220573e-06, + "loss": 1.53088074, + "memory(GiB)": 117.38, + "step": 47550, + "train_speed(iter/s)": 1.637299 + }, + { + "acc": 0.662673, + "epoch": 1.2063673262303398, + "grad_norm": 6.28125, + "learning_rate": 3.7216114211128505e-06, + "loss": 1.57977867, + "memory(GiB)": 117.38, + "step": 47555, + "train_speed(iter/s)": 1.637318 + }, + { + "acc": 0.6630456, + "epoch": 1.206494165398275, + "grad_norm": 6.375, + "learning_rate": 3.720597679632879e-06, + "loss": 1.56296463, + "memory(GiB)": 117.38, + "step": 47560, + "train_speed(iter/s)": 1.637336 + }, + { + "acc": 0.65326242, + "epoch": 1.20662100456621, + "grad_norm": 7.53125, + "learning_rate": 3.7195839944267357e-06, + "loss": 1.68292923, + "memory(GiB)": 117.38, + "step": 47565, + "train_speed(iter/s)": 1.637355 + }, + { + "acc": 0.66865773, + "epoch": 1.206747843734145, + "grad_norm": 4.9375, + "learning_rate": 3.718570365539006e-06, + "loss": 1.51249352, + "memory(GiB)": 117.38, + "step": 47570, + "train_speed(iter/s)": 1.637373 + }, + { + "acc": 0.66530428, + "epoch": 1.2068746829020802, + "grad_norm": 6.28125, + "learning_rate": 3.717556793014271e-06, + "loss": 1.60455418, + "memory(GiB)": 117.38, + "step": 47575, + "train_speed(iter/s)": 1.637391 + }, + { + "acc": 0.65385714, + "epoch": 1.2070015220700152, + "grad_norm": 5.3125, + "learning_rate": 3.716543276897113e-06, + "loss": 1.65254364, + "memory(GiB)": 117.38, + "step": 47580, + "train_speed(iter/s)": 1.63741 + }, + { + "acc": 0.6525938, + "epoch": 1.2071283612379502, + "grad_norm": 5.03125, + "learning_rate": 3.715529817232114e-06, + "loss": 1.58671665, + "memory(GiB)": 117.38, + "step": 47585, + "train_speed(iter/s)": 1.637428 + }, + { + "acc": 0.65385737, + "epoch": 1.2072552004058854, + "grad_norm": 6.5625, + "learning_rate": 3.7145164140638483e-06, + "loss": 1.61153679, + "memory(GiB)": 117.38, + "step": 47590, + "train_speed(iter/s)": 1.637447 + }, + { + "acc": 0.64803929, + "epoch": 1.2073820395738204, + "grad_norm": 5.5, + "learning_rate": 3.713503067436889e-06, + "loss": 1.62950382, + "memory(GiB)": 117.38, + "step": 47595, + "train_speed(iter/s)": 1.637465 + }, + { + "acc": 0.65665474, + "epoch": 1.2075088787417554, + "grad_norm": 6.8125, + "learning_rate": 3.7124897773958084e-06, + "loss": 1.57136297, + "memory(GiB)": 117.38, + "step": 47600, + "train_speed(iter/s)": 1.637484 + }, + { + "acc": 0.65311942, + "epoch": 1.2076357179096906, + "grad_norm": 6.15625, + "learning_rate": 3.7114765439851752e-06, + "loss": 1.69280968, + "memory(GiB)": 117.38, + "step": 47605, + "train_speed(iter/s)": 1.637502 + }, + { + "acc": 0.66585712, + "epoch": 1.2077625570776256, + "grad_norm": 5.03125, + "learning_rate": 3.7104633672495584e-06, + "loss": 1.54819841, + "memory(GiB)": 117.38, + "step": 47610, + "train_speed(iter/s)": 1.637521 + }, + { + "acc": 0.67353563, + "epoch": 1.2078893962455606, + "grad_norm": 7.875, + "learning_rate": 3.709450247233519e-06, + "loss": 1.48895893, + "memory(GiB)": 117.38, + "step": 47615, + "train_speed(iter/s)": 1.63754 + }, + { + "acc": 0.63848372, + "epoch": 1.2080162354134956, + "grad_norm": 10.4375, + "learning_rate": 3.7084371839816204e-06, + "loss": 1.6762125, + "memory(GiB)": 117.38, + "step": 47620, + "train_speed(iter/s)": 1.637559 + }, + { + "acc": 0.67311344, + "epoch": 1.2081430745814308, + "grad_norm": 4.5625, + "learning_rate": 3.707424177538419e-06, + "loss": 1.5390255, + "memory(GiB)": 117.38, + "step": 47625, + "train_speed(iter/s)": 1.637577 + }, + { + "acc": 0.66044602, + "epoch": 1.2082699137493658, + "grad_norm": 5.625, + "learning_rate": 3.7064112279484753e-06, + "loss": 1.55703392, + "memory(GiB)": 117.38, + "step": 47630, + "train_speed(iter/s)": 1.637596 + }, + { + "acc": 0.66800637, + "epoch": 1.208396752917301, + "grad_norm": 6.5625, + "learning_rate": 3.7053983352563407e-06, + "loss": 1.58711338, + "memory(GiB)": 117.38, + "step": 47635, + "train_speed(iter/s)": 1.637614 + }, + { + "acc": 0.66317296, + "epoch": 1.208523592085236, + "grad_norm": 4.84375, + "learning_rate": 3.704385499506565e-06, + "loss": 1.54421234, + "memory(GiB)": 117.38, + "step": 47640, + "train_speed(iter/s)": 1.637632 + }, + { + "acc": 0.65253234, + "epoch": 1.208650431253171, + "grad_norm": 5.28125, + "learning_rate": 3.703372720743702e-06, + "loss": 1.59565544, + "memory(GiB)": 117.38, + "step": 47645, + "train_speed(iter/s)": 1.63765 + }, + { + "acc": 0.6506361, + "epoch": 1.208777270421106, + "grad_norm": 6.3125, + "learning_rate": 3.7023599990122966e-06, + "loss": 1.59613762, + "memory(GiB)": 117.38, + "step": 47650, + "train_speed(iter/s)": 1.637669 + }, + { + "acc": 0.65268602, + "epoch": 1.2089041095890412, + "grad_norm": 5.8125, + "learning_rate": 3.7013473343568897e-06, + "loss": 1.5973506, + "memory(GiB)": 117.38, + "step": 47655, + "train_speed(iter/s)": 1.637688 + }, + { + "acc": 0.66887655, + "epoch": 1.2090309487569761, + "grad_norm": 6.15625, + "learning_rate": 3.700334726822026e-06, + "loss": 1.55827675, + "memory(GiB)": 117.38, + "step": 47660, + "train_speed(iter/s)": 1.637707 + }, + { + "acc": 0.67833157, + "epoch": 1.2091577879249111, + "grad_norm": 6.15625, + "learning_rate": 3.6993221764522435e-06, + "loss": 1.5140213, + "memory(GiB)": 117.38, + "step": 47665, + "train_speed(iter/s)": 1.637725 + }, + { + "acc": 0.66088901, + "epoch": 1.2092846270928463, + "grad_norm": 5.15625, + "learning_rate": 3.6983096832920806e-06, + "loss": 1.59403515, + "memory(GiB)": 117.38, + "step": 47670, + "train_speed(iter/s)": 1.637743 + }, + { + "acc": 0.67114058, + "epoch": 1.2094114662607813, + "grad_norm": 5.6875, + "learning_rate": 3.697297247386066e-06, + "loss": 1.60326481, + "memory(GiB)": 117.38, + "step": 47675, + "train_speed(iter/s)": 1.637762 + }, + { + "acc": 0.65843749, + "epoch": 1.2095383054287163, + "grad_norm": 6.46875, + "learning_rate": 3.6962848687787365e-06, + "loss": 1.68851223, + "memory(GiB)": 117.38, + "step": 47680, + "train_speed(iter/s)": 1.63778 + }, + { + "acc": 0.65391116, + "epoch": 1.2096651445966515, + "grad_norm": 5.1875, + "learning_rate": 3.6952725475146183e-06, + "loss": 1.57427654, + "memory(GiB)": 117.38, + "step": 47685, + "train_speed(iter/s)": 1.637796 + }, + { + "acc": 0.65936494, + "epoch": 1.2097919837645865, + "grad_norm": 7.90625, + "learning_rate": 3.69426028363824e-06, + "loss": 1.56655807, + "memory(GiB)": 117.38, + "step": 47690, + "train_speed(iter/s)": 1.637813 + }, + { + "acc": 0.66478758, + "epoch": 1.2099188229325215, + "grad_norm": 6.65625, + "learning_rate": 3.6932480771941237e-06, + "loss": 1.54837332, + "memory(GiB)": 117.38, + "step": 47695, + "train_speed(iter/s)": 1.637829 + }, + { + "acc": 0.66540852, + "epoch": 1.2100456621004567, + "grad_norm": 6.0625, + "learning_rate": 3.6922359282267904e-06, + "loss": 1.55895596, + "memory(GiB)": 117.38, + "step": 47700, + "train_speed(iter/s)": 1.637847 + }, + { + "acc": 0.66476021, + "epoch": 1.2101725012683917, + "grad_norm": 5.375, + "learning_rate": 3.6912238367807606e-06, + "loss": 1.59085398, + "memory(GiB)": 117.38, + "step": 47705, + "train_speed(iter/s)": 1.637865 + }, + { + "acc": 0.63971148, + "epoch": 1.2102993404363267, + "grad_norm": 7.84375, + "learning_rate": 3.6902118029005507e-06, + "loss": 1.64715214, + "memory(GiB)": 117.38, + "step": 47710, + "train_speed(iter/s)": 1.637884 + }, + { + "acc": 0.65537806, + "epoch": 1.2104261796042617, + "grad_norm": 5.9375, + "learning_rate": 3.6891998266306717e-06, + "loss": 1.62349224, + "memory(GiB)": 117.38, + "step": 47715, + "train_speed(iter/s)": 1.637904 + }, + { + "acc": 0.65911365, + "epoch": 1.210553018772197, + "grad_norm": 5.8125, + "learning_rate": 3.688187908015636e-06, + "loss": 1.58713236, + "memory(GiB)": 117.38, + "step": 47720, + "train_speed(iter/s)": 1.637922 + }, + { + "acc": 0.66470337, + "epoch": 1.2106798579401319, + "grad_norm": 7.15625, + "learning_rate": 3.6871760470999546e-06, + "loss": 1.62640915, + "memory(GiB)": 117.38, + "step": 47725, + "train_speed(iter/s)": 1.63794 + }, + { + "acc": 0.65899296, + "epoch": 1.2108066971080669, + "grad_norm": 5.4375, + "learning_rate": 3.6861642439281325e-06, + "loss": 1.56760435, + "memory(GiB)": 117.38, + "step": 47730, + "train_speed(iter/s)": 1.637959 + }, + { + "acc": 0.64711294, + "epoch": 1.210933536276002, + "grad_norm": 5.40625, + "learning_rate": 3.6851524985446707e-06, + "loss": 1.63981018, + "memory(GiB)": 117.38, + "step": 47735, + "train_speed(iter/s)": 1.637978 + }, + { + "acc": 0.65686073, + "epoch": 1.211060375443937, + "grad_norm": 6.21875, + "learning_rate": 3.6841408109940737e-06, + "loss": 1.57695665, + "memory(GiB)": 117.38, + "step": 47740, + "train_speed(iter/s)": 1.637997 + }, + { + "acc": 0.65039511, + "epoch": 1.211187214611872, + "grad_norm": 7.03125, + "learning_rate": 3.6831291813208377e-06, + "loss": 1.6109127, + "memory(GiB)": 117.38, + "step": 47745, + "train_speed(iter/s)": 1.638015 + }, + { + "acc": 0.64319525, + "epoch": 1.2113140537798073, + "grad_norm": 6.21875, + "learning_rate": 3.682117609569462e-06, + "loss": 1.66742916, + "memory(GiB)": 117.38, + "step": 47750, + "train_speed(iter/s)": 1.638033 + }, + { + "acc": 0.64441476, + "epoch": 1.2114408929477423, + "grad_norm": 5.03125, + "learning_rate": 3.681106095784436e-06, + "loss": 1.65064392, + "memory(GiB)": 117.38, + "step": 47755, + "train_speed(iter/s)": 1.638051 + }, + { + "acc": 0.68024392, + "epoch": 1.2115677321156773, + "grad_norm": 4.40625, + "learning_rate": 3.6800946400102522e-06, + "loss": 1.47348938, + "memory(GiB)": 117.38, + "step": 47760, + "train_speed(iter/s)": 1.638069 + }, + { + "acc": 0.65860376, + "epoch": 1.2116945712836125, + "grad_norm": 5.8125, + "learning_rate": 3.6790832422913984e-06, + "loss": 1.60526657, + "memory(GiB)": 117.38, + "step": 47765, + "train_speed(iter/s)": 1.638087 + }, + { + "acc": 0.66641121, + "epoch": 1.2118214104515475, + "grad_norm": 8.0625, + "learning_rate": 3.6780719026723632e-06, + "loss": 1.58406124, + "memory(GiB)": 117.38, + "step": 47770, + "train_speed(iter/s)": 1.638107 + }, + { + "acc": 0.64353762, + "epoch": 1.2119482496194824, + "grad_norm": 6.125, + "learning_rate": 3.677060621197627e-06, + "loss": 1.69302654, + "memory(GiB)": 117.38, + "step": 47775, + "train_speed(iter/s)": 1.638126 + }, + { + "acc": 0.65377769, + "epoch": 1.2120750887874174, + "grad_norm": 4.90625, + "learning_rate": 3.6760493979116696e-06, + "loss": 1.61885071, + "memory(GiB)": 117.38, + "step": 47780, + "train_speed(iter/s)": 1.638144 + }, + { + "acc": 0.66489196, + "epoch": 1.2122019279553526, + "grad_norm": 5.1875, + "learning_rate": 3.6750382328589725e-06, + "loss": 1.58644619, + "memory(GiB)": 117.38, + "step": 47785, + "train_speed(iter/s)": 1.638163 + }, + { + "acc": 0.64299464, + "epoch": 1.2123287671232876, + "grad_norm": 7.625, + "learning_rate": 3.67402712608401e-06, + "loss": 1.65487499, + "memory(GiB)": 117.38, + "step": 47790, + "train_speed(iter/s)": 1.638183 + }, + { + "acc": 0.66387773, + "epoch": 1.2124556062912228, + "grad_norm": 5.9375, + "learning_rate": 3.673016077631253e-06, + "loss": 1.62087059, + "memory(GiB)": 117.38, + "step": 47795, + "train_speed(iter/s)": 1.638201 + }, + { + "acc": 0.65175467, + "epoch": 1.2125824454591578, + "grad_norm": 6.1875, + "learning_rate": 3.672005087545173e-06, + "loss": 1.59164972, + "memory(GiB)": 117.38, + "step": 47800, + "train_speed(iter/s)": 1.63822 + }, + { + "acc": 0.66121831, + "epoch": 1.2127092846270928, + "grad_norm": 5.0625, + "learning_rate": 3.6709941558702393e-06, + "loss": 1.59850273, + "memory(GiB)": 117.38, + "step": 47805, + "train_speed(iter/s)": 1.638239 + }, + { + "acc": 0.65541821, + "epoch": 1.2128361237950278, + "grad_norm": 5.6875, + "learning_rate": 3.6699832826509174e-06, + "loss": 1.62301617, + "memory(GiB)": 117.38, + "step": 47810, + "train_speed(iter/s)": 1.638257 + }, + { + "acc": 0.64708948, + "epoch": 1.212962962962963, + "grad_norm": 5.65625, + "learning_rate": 3.6689724679316665e-06, + "loss": 1.67039871, + "memory(GiB)": 117.38, + "step": 47815, + "train_speed(iter/s)": 1.638276 + }, + { + "acc": 0.65100551, + "epoch": 1.213089802130898, + "grad_norm": 6.15625, + "learning_rate": 3.66796171175695e-06, + "loss": 1.62899761, + "memory(GiB)": 117.38, + "step": 47820, + "train_speed(iter/s)": 1.638295 + }, + { + "acc": 0.6604032, + "epoch": 1.213216641298833, + "grad_norm": 5.5625, + "learning_rate": 3.666951014171224e-06, + "loss": 1.58865461, + "memory(GiB)": 117.38, + "step": 47825, + "train_speed(iter/s)": 1.638315 + }, + { + "acc": 0.6468338, + "epoch": 1.2133434804667682, + "grad_norm": 4.90625, + "learning_rate": 3.6659403752189453e-06, + "loss": 1.58457661, + "memory(GiB)": 117.38, + "step": 47830, + "train_speed(iter/s)": 1.638334 + }, + { + "acc": 0.658566, + "epoch": 1.2134703196347032, + "grad_norm": 5.5, + "learning_rate": 3.664929794944565e-06, + "loss": 1.5639081, + "memory(GiB)": 117.38, + "step": 47835, + "train_speed(iter/s)": 1.638352 + }, + { + "acc": 0.67037497, + "epoch": 1.2135971588026382, + "grad_norm": 5.78125, + "learning_rate": 3.663919273392532e-06, + "loss": 1.53871708, + "memory(GiB)": 117.38, + "step": 47840, + "train_speed(iter/s)": 1.638371 + }, + { + "acc": 0.68368177, + "epoch": 1.2137239979705734, + "grad_norm": 4.96875, + "learning_rate": 3.662908810607294e-06, + "loss": 1.47151546, + "memory(GiB)": 117.38, + "step": 47845, + "train_speed(iter/s)": 1.638389 + }, + { + "acc": 0.66479325, + "epoch": 1.2138508371385084, + "grad_norm": 5.375, + "learning_rate": 3.6618984066332986e-06, + "loss": 1.63972626, + "memory(GiB)": 117.38, + "step": 47850, + "train_speed(iter/s)": 1.638408 + }, + { + "acc": 0.66394463, + "epoch": 1.2139776763064434, + "grad_norm": 6.6875, + "learning_rate": 3.660888061514984e-06, + "loss": 1.58739681, + "memory(GiB)": 117.38, + "step": 47855, + "train_speed(iter/s)": 1.638426 + }, + { + "acc": 0.66627588, + "epoch": 1.2141045154743786, + "grad_norm": 6.6875, + "learning_rate": 3.6598777752967896e-06, + "loss": 1.62743893, + "memory(GiB)": 117.38, + "step": 47860, + "train_speed(iter/s)": 1.638444 + }, + { + "acc": 0.66282902, + "epoch": 1.2142313546423136, + "grad_norm": 6.4375, + "learning_rate": 3.658867548023156e-06, + "loss": 1.61625137, + "memory(GiB)": 117.38, + "step": 47865, + "train_speed(iter/s)": 1.638462 + }, + { + "acc": 0.6609745, + "epoch": 1.2143581938102486, + "grad_norm": 6.9375, + "learning_rate": 3.657857379738515e-06, + "loss": 1.62713032, + "memory(GiB)": 117.38, + "step": 47870, + "train_speed(iter/s)": 1.638479 + }, + { + "acc": 0.65567636, + "epoch": 1.2144850329781836, + "grad_norm": 6.3125, + "learning_rate": 3.656847270487298e-06, + "loss": 1.6482605, + "memory(GiB)": 117.38, + "step": 47875, + "train_speed(iter/s)": 1.638499 + }, + { + "acc": 0.649895, + "epoch": 1.2146118721461188, + "grad_norm": 5.40625, + "learning_rate": 3.655837220313936e-06, + "loss": 1.58997946, + "memory(GiB)": 117.38, + "step": 47880, + "train_speed(iter/s)": 1.638517 + }, + { + "acc": 0.64477386, + "epoch": 1.2147387113140538, + "grad_norm": 4.3125, + "learning_rate": 3.654827229262852e-06, + "loss": 1.64937248, + "memory(GiB)": 117.38, + "step": 47885, + "train_speed(iter/s)": 1.638536 + }, + { + "acc": 0.66857071, + "epoch": 1.2148655504819887, + "grad_norm": 5.8125, + "learning_rate": 3.653817297378476e-06, + "loss": 1.53666782, + "memory(GiB)": 117.38, + "step": 47890, + "train_speed(iter/s)": 1.638554 + }, + { + "acc": 0.66990514, + "epoch": 1.214992389649924, + "grad_norm": 6.375, + "learning_rate": 3.6528074247052225e-06, + "loss": 1.52699547, + "memory(GiB)": 117.38, + "step": 47895, + "train_speed(iter/s)": 1.638573 + }, + { + "acc": 0.66492877, + "epoch": 1.215119228817859, + "grad_norm": 4.53125, + "learning_rate": 3.651797611287514e-06, + "loss": 1.52720156, + "memory(GiB)": 117.38, + "step": 47900, + "train_speed(iter/s)": 1.638592 + }, + { + "acc": 0.68067183, + "epoch": 1.215246067985794, + "grad_norm": 7.40625, + "learning_rate": 3.6507878571697646e-06, + "loss": 1.54174871, + "memory(GiB)": 117.38, + "step": 47905, + "train_speed(iter/s)": 1.63861 + }, + { + "acc": 0.65826068, + "epoch": 1.2153729071537291, + "grad_norm": 5.59375, + "learning_rate": 3.6497781623963915e-06, + "loss": 1.61141224, + "memory(GiB)": 117.38, + "step": 47910, + "train_speed(iter/s)": 1.638628 + }, + { + "acc": 0.65827856, + "epoch": 1.2154997463216641, + "grad_norm": 5.0625, + "learning_rate": 3.648768527011802e-06, + "loss": 1.59890509, + "memory(GiB)": 117.38, + "step": 47915, + "train_speed(iter/s)": 1.638645 + }, + { + "acc": 0.6614789, + "epoch": 1.2156265854895991, + "grad_norm": 5.21875, + "learning_rate": 3.6477589510604044e-06, + "loss": 1.6151392, + "memory(GiB)": 117.38, + "step": 47920, + "train_speed(iter/s)": 1.638664 + }, + { + "acc": 0.64883261, + "epoch": 1.2157534246575343, + "grad_norm": 4.9375, + "learning_rate": 3.646749434586607e-06, + "loss": 1.60067787, + "memory(GiB)": 117.38, + "step": 47925, + "train_speed(iter/s)": 1.638682 + }, + { + "acc": 0.66785889, + "epoch": 1.2158802638254693, + "grad_norm": 7.75, + "learning_rate": 3.645739977634811e-06, + "loss": 1.56778088, + "memory(GiB)": 117.38, + "step": 47930, + "train_speed(iter/s)": 1.638701 + }, + { + "acc": 0.661728, + "epoch": 1.2160071029934043, + "grad_norm": 5.53125, + "learning_rate": 3.6447305802494177e-06, + "loss": 1.57314167, + "memory(GiB)": 117.38, + "step": 47935, + "train_speed(iter/s)": 1.638719 + }, + { + "acc": 0.66300807, + "epoch": 1.2161339421613393, + "grad_norm": 6.4375, + "learning_rate": 3.6437212424748227e-06, + "loss": 1.64865913, + "memory(GiB)": 117.38, + "step": 47940, + "train_speed(iter/s)": 1.638739 + }, + { + "acc": 0.65332799, + "epoch": 1.2162607813292745, + "grad_norm": 6.9375, + "learning_rate": 3.642711964355423e-06, + "loss": 1.62769623, + "memory(GiB)": 117.38, + "step": 47945, + "train_speed(iter/s)": 1.638757 + }, + { + "acc": 0.64575434, + "epoch": 1.2163876204972095, + "grad_norm": 7.25, + "learning_rate": 3.6417027459356134e-06, + "loss": 1.63560429, + "memory(GiB)": 117.38, + "step": 47950, + "train_speed(iter/s)": 1.638775 + }, + { + "acc": 0.63751421, + "epoch": 1.2165144596651447, + "grad_norm": 5.71875, + "learning_rate": 3.640693587259778e-06, + "loss": 1.7098753, + "memory(GiB)": 117.38, + "step": 47955, + "train_speed(iter/s)": 1.638794 + }, + { + "acc": 0.64397469, + "epoch": 1.2166412988330797, + "grad_norm": 4.78125, + "learning_rate": 3.6396844883723092e-06, + "loss": 1.67164021, + "memory(GiB)": 117.38, + "step": 47960, + "train_speed(iter/s)": 1.638812 + }, + { + "acc": 0.66225581, + "epoch": 1.2167681380010147, + "grad_norm": 5.5, + "learning_rate": 3.6386754493175893e-06, + "loss": 1.62941971, + "memory(GiB)": 117.38, + "step": 47965, + "train_speed(iter/s)": 1.638831 + }, + { + "acc": 0.65536351, + "epoch": 1.2168949771689497, + "grad_norm": 4.53125, + "learning_rate": 3.637666470140003e-06, + "loss": 1.58650637, + "memory(GiB)": 117.38, + "step": 47970, + "train_speed(iter/s)": 1.638849 + }, + { + "acc": 0.66715546, + "epoch": 1.2170218163368849, + "grad_norm": 5.5625, + "learning_rate": 3.6366575508839265e-06, + "loss": 1.57593946, + "memory(GiB)": 117.38, + "step": 47975, + "train_speed(iter/s)": 1.638867 + }, + { + "acc": 0.65075779, + "epoch": 1.2171486555048199, + "grad_norm": 6.4375, + "learning_rate": 3.635648691593737e-06, + "loss": 1.67411995, + "memory(GiB)": 117.38, + "step": 47980, + "train_speed(iter/s)": 1.638884 + }, + { + "acc": 0.65510664, + "epoch": 1.2172754946727549, + "grad_norm": 5.34375, + "learning_rate": 3.6346398923138094e-06, + "loss": 1.61411781, + "memory(GiB)": 117.38, + "step": 47985, + "train_speed(iter/s)": 1.638903 + }, + { + "acc": 0.65895376, + "epoch": 1.21740233384069, + "grad_norm": 5.09375, + "learning_rate": 3.633631153088517e-06, + "loss": 1.56976662, + "memory(GiB)": 117.38, + "step": 47990, + "train_speed(iter/s)": 1.638921 + }, + { + "acc": 0.6507803, + "epoch": 1.217529173008625, + "grad_norm": 6.28125, + "learning_rate": 3.6326224739622255e-06, + "loss": 1.67968559, + "memory(GiB)": 117.38, + "step": 47995, + "train_speed(iter/s)": 1.638938 + }, + { + "acc": 0.65087376, + "epoch": 1.21765601217656, + "grad_norm": 6.96875, + "learning_rate": 3.6316138549793024e-06, + "loss": 1.6004715, + "memory(GiB)": 117.38, + "step": 48000, + "train_speed(iter/s)": 1.638956 + }, + { + "epoch": 1.21765601217656, + "eval_acc": 0.6462547274029616, + "eval_loss": 1.5733972787857056, + "eval_runtime": 58.7028, + "eval_samples_per_second": 108.513, + "eval_steps_per_second": 27.137, + "step": 48000 + }, + { + "acc": 0.65838628, + "epoch": 1.2177828513444953, + "grad_norm": 5.9375, + "learning_rate": 3.630605296184111e-06, + "loss": 1.53696594, + "memory(GiB)": 117.38, + "step": 48005, + "train_speed(iter/s)": 1.635455 + }, + { + "acc": 0.63744974, + "epoch": 1.2179096905124303, + "grad_norm": 5.75, + "learning_rate": 3.6295967976210146e-06, + "loss": 1.6900425, + "memory(GiB)": 117.38, + "step": 48010, + "train_speed(iter/s)": 1.635473 + }, + { + "acc": 0.63860259, + "epoch": 1.2180365296803652, + "grad_norm": 5.96875, + "learning_rate": 3.6285883593343685e-06, + "loss": 1.67360191, + "memory(GiB)": 117.38, + "step": 48015, + "train_speed(iter/s)": 1.635489 + }, + { + "acc": 0.65640388, + "epoch": 1.2181633688483005, + "grad_norm": 6.28125, + "learning_rate": 3.6275799813685274e-06, + "loss": 1.60907784, + "memory(GiB)": 117.38, + "step": 48020, + "train_speed(iter/s)": 1.635507 + }, + { + "acc": 0.65977535, + "epoch": 1.2182902080162354, + "grad_norm": 5.5625, + "learning_rate": 3.6265716637678484e-06, + "loss": 1.6038435, + "memory(GiB)": 117.38, + "step": 48025, + "train_speed(iter/s)": 1.635525 + }, + { + "acc": 0.66015639, + "epoch": 1.2184170471841704, + "grad_norm": 6.84375, + "learning_rate": 3.62556340657668e-06, + "loss": 1.61447563, + "memory(GiB)": 117.38, + "step": 48030, + "train_speed(iter/s)": 1.635542 + }, + { + "acc": 0.66132317, + "epoch": 1.2185438863521054, + "grad_norm": 5.875, + "learning_rate": 3.6245552098393665e-06, + "loss": 1.60795059, + "memory(GiB)": 117.38, + "step": 48035, + "train_speed(iter/s)": 1.635559 + }, + { + "acc": 0.65779886, + "epoch": 1.2186707255200406, + "grad_norm": 5.34375, + "learning_rate": 3.6235470736002576e-06, + "loss": 1.58638983, + "memory(GiB)": 117.38, + "step": 48040, + "train_speed(iter/s)": 1.635577 + }, + { + "acc": 0.65934896, + "epoch": 1.2187975646879756, + "grad_norm": 5.09375, + "learning_rate": 3.622538997903693e-06, + "loss": 1.6043148, + "memory(GiB)": 117.38, + "step": 48045, + "train_speed(iter/s)": 1.635594 + }, + { + "acc": 0.64994345, + "epoch": 1.2189244038559106, + "grad_norm": 6.65625, + "learning_rate": 3.621530982794015e-06, + "loss": 1.63655586, + "memory(GiB)": 117.38, + "step": 48050, + "train_speed(iter/s)": 1.635612 + }, + { + "acc": 0.65441327, + "epoch": 1.2190512430238458, + "grad_norm": 6.125, + "learning_rate": 3.620523028315558e-06, + "loss": 1.64425621, + "memory(GiB)": 117.38, + "step": 48055, + "train_speed(iter/s)": 1.635629 + }, + { + "acc": 0.6488965, + "epoch": 1.2191780821917808, + "grad_norm": 6.125, + "learning_rate": 3.6195151345126556e-06, + "loss": 1.6972702, + "memory(GiB)": 117.38, + "step": 48060, + "train_speed(iter/s)": 1.635646 + }, + { + "acc": 0.66290216, + "epoch": 1.2193049213597158, + "grad_norm": 5.0625, + "learning_rate": 3.6185073014296425e-06, + "loss": 1.58812561, + "memory(GiB)": 117.38, + "step": 48065, + "train_speed(iter/s)": 1.635664 + }, + { + "acc": 0.65696516, + "epoch": 1.219431760527651, + "grad_norm": 6.25, + "learning_rate": 3.6174995291108474e-06, + "loss": 1.54748907, + "memory(GiB)": 117.38, + "step": 48070, + "train_speed(iter/s)": 1.635683 + }, + { + "acc": 0.64347086, + "epoch": 1.219558599695586, + "grad_norm": 6.15625, + "learning_rate": 3.6164918176005937e-06, + "loss": 1.65549469, + "memory(GiB)": 117.38, + "step": 48075, + "train_speed(iter/s)": 1.635699 + }, + { + "acc": 0.6439455, + "epoch": 1.219685438863521, + "grad_norm": 6.03125, + "learning_rate": 3.6154841669432062e-06, + "loss": 1.60577316, + "memory(GiB)": 117.38, + "step": 48080, + "train_speed(iter/s)": 1.635715 + }, + { + "acc": 0.66491609, + "epoch": 1.2198122780314562, + "grad_norm": 5.53125, + "learning_rate": 3.614476577183007e-06, + "loss": 1.56016846, + "memory(GiB)": 117.38, + "step": 48085, + "train_speed(iter/s)": 1.635729 + }, + { + "acc": 0.66486816, + "epoch": 1.2199391171993912, + "grad_norm": 4.9375, + "learning_rate": 3.6134690483643154e-06, + "loss": 1.55486898, + "memory(GiB)": 117.38, + "step": 48090, + "train_speed(iter/s)": 1.635747 + }, + { + "acc": 0.65270376, + "epoch": 1.2200659563673262, + "grad_norm": 6.09375, + "learning_rate": 3.6124615805314434e-06, + "loss": 1.62458038, + "memory(GiB)": 117.38, + "step": 48095, + "train_speed(iter/s)": 1.635764 + }, + { + "acc": 0.644172, + "epoch": 1.2201927955352612, + "grad_norm": 5.5, + "learning_rate": 3.611454173728707e-06, + "loss": 1.62348328, + "memory(GiB)": 117.38, + "step": 48100, + "train_speed(iter/s)": 1.635782 + }, + { + "acc": 0.66178865, + "epoch": 1.2203196347031964, + "grad_norm": 5.1875, + "learning_rate": 3.610446828000414e-06, + "loss": 1.57568903, + "memory(GiB)": 117.38, + "step": 48105, + "train_speed(iter/s)": 1.635799 + }, + { + "acc": 0.64918442, + "epoch": 1.2204464738711314, + "grad_norm": 5.375, + "learning_rate": 3.609439543390877e-06, + "loss": 1.60787201, + "memory(GiB)": 117.38, + "step": 48110, + "train_speed(iter/s)": 1.635818 + }, + { + "acc": 0.65388942, + "epoch": 1.2205733130390666, + "grad_norm": 6.59375, + "learning_rate": 3.608432319944394e-06, + "loss": 1.62270794, + "memory(GiB)": 117.38, + "step": 48115, + "train_speed(iter/s)": 1.635835 + }, + { + "acc": 0.65328693, + "epoch": 1.2207001522070016, + "grad_norm": 6.78125, + "learning_rate": 3.607425157705271e-06, + "loss": 1.56901016, + "memory(GiB)": 117.38, + "step": 48120, + "train_speed(iter/s)": 1.635852 + }, + { + "acc": 0.65149579, + "epoch": 1.2208269913749366, + "grad_norm": 5.03125, + "learning_rate": 3.6064180567178064e-06, + "loss": 1.61642399, + "memory(GiB)": 117.38, + "step": 48125, + "train_speed(iter/s)": 1.635872 + }, + { + "acc": 0.66643205, + "epoch": 1.2209538305428715, + "grad_norm": 5.28125, + "learning_rate": 3.6054110170263002e-06, + "loss": 1.57058754, + "memory(GiB)": 117.38, + "step": 48130, + "train_speed(iter/s)": 1.635889 + }, + { + "acc": 0.65270786, + "epoch": 1.2210806697108068, + "grad_norm": 7.875, + "learning_rate": 3.6044040386750423e-06, + "loss": 1.59366932, + "memory(GiB)": 117.38, + "step": 48135, + "train_speed(iter/s)": 1.635909 + }, + { + "acc": 0.67044859, + "epoch": 1.2212075088787417, + "grad_norm": 6.28125, + "learning_rate": 3.6033971217083242e-06, + "loss": 1.55101824, + "memory(GiB)": 117.38, + "step": 48140, + "train_speed(iter/s)": 1.635926 + }, + { + "acc": 0.66404285, + "epoch": 1.2213343480466767, + "grad_norm": 5.0625, + "learning_rate": 3.602390266170438e-06, + "loss": 1.55217323, + "memory(GiB)": 117.38, + "step": 48145, + "train_speed(iter/s)": 1.635945 + }, + { + "acc": 0.65991001, + "epoch": 1.221461187214612, + "grad_norm": 6.96875, + "learning_rate": 3.6013834721056683e-06, + "loss": 1.5990694, + "memory(GiB)": 117.38, + "step": 48150, + "train_speed(iter/s)": 1.635963 + }, + { + "acc": 0.66775904, + "epoch": 1.221588026382547, + "grad_norm": 6.21875, + "learning_rate": 3.6003767395582967e-06, + "loss": 1.45775194, + "memory(GiB)": 117.38, + "step": 48155, + "train_speed(iter/s)": 1.635981 + }, + { + "acc": 0.66359086, + "epoch": 1.221714865550482, + "grad_norm": 6.5, + "learning_rate": 3.599370068572604e-06, + "loss": 1.5262516, + "memory(GiB)": 117.38, + "step": 48160, + "train_speed(iter/s)": 1.636 + }, + { + "acc": 0.66000738, + "epoch": 1.2218417047184171, + "grad_norm": 5.9375, + "learning_rate": 3.5983634591928705e-06, + "loss": 1.58534451, + "memory(GiB)": 117.38, + "step": 48165, + "train_speed(iter/s)": 1.636018 + }, + { + "acc": 0.66850915, + "epoch": 1.2219685438863521, + "grad_norm": 5.625, + "learning_rate": 3.5973569114633704e-06, + "loss": 1.59331226, + "memory(GiB)": 117.38, + "step": 48170, + "train_speed(iter/s)": 1.636036 + }, + { + "acc": 0.66196666, + "epoch": 1.222095383054287, + "grad_norm": 6.15625, + "learning_rate": 3.5963504254283743e-06, + "loss": 1.63140011, + "memory(GiB)": 117.38, + "step": 48175, + "train_speed(iter/s)": 1.636054 + }, + { + "acc": 0.65586276, + "epoch": 1.2222222222222223, + "grad_norm": 5.125, + "learning_rate": 3.595344001132154e-06, + "loss": 1.54338589, + "memory(GiB)": 117.38, + "step": 48180, + "train_speed(iter/s)": 1.636075 + }, + { + "acc": 0.65847549, + "epoch": 1.2223490613901573, + "grad_norm": 5.0625, + "learning_rate": 3.5943376386189744e-06, + "loss": 1.60298615, + "memory(GiB)": 117.38, + "step": 48185, + "train_speed(iter/s)": 1.636093 + }, + { + "acc": 0.6617394, + "epoch": 1.2224759005580923, + "grad_norm": 6.25, + "learning_rate": 3.5933313379331047e-06, + "loss": 1.611586, + "memory(GiB)": 117.38, + "step": 48190, + "train_speed(iter/s)": 1.63611 + }, + { + "acc": 0.64570665, + "epoch": 1.2226027397260273, + "grad_norm": 5.28125, + "learning_rate": 3.5923250991188e-06, + "loss": 1.61911278, + "memory(GiB)": 117.38, + "step": 48195, + "train_speed(iter/s)": 1.636128 + }, + { + "acc": 0.64999189, + "epoch": 1.2227295788939625, + "grad_norm": 6.1875, + "learning_rate": 3.591318922220324e-06, + "loss": 1.6365818, + "memory(GiB)": 117.38, + "step": 48200, + "train_speed(iter/s)": 1.636147 + }, + { + "acc": 0.65993419, + "epoch": 1.2228564180618975, + "grad_norm": 6.6875, + "learning_rate": 3.5903128072819287e-06, + "loss": 1.61732864, + "memory(GiB)": 117.38, + "step": 48205, + "train_speed(iter/s)": 1.636166 + }, + { + "acc": 0.64813704, + "epoch": 1.2229832572298325, + "grad_norm": 6.09375, + "learning_rate": 3.5893067543478733e-06, + "loss": 1.64195747, + "memory(GiB)": 117.38, + "step": 48210, + "train_speed(iter/s)": 1.636185 + }, + { + "acc": 0.6413188, + "epoch": 1.2231100963977677, + "grad_norm": 5.75, + "learning_rate": 3.5883007634624033e-06, + "loss": 1.66897259, + "memory(GiB)": 117.38, + "step": 48215, + "train_speed(iter/s)": 1.636203 + }, + { + "acc": 0.64631791, + "epoch": 1.2232369355657027, + "grad_norm": 5.3125, + "learning_rate": 3.5872948346697676e-06, + "loss": 1.57792816, + "memory(GiB)": 117.38, + "step": 48220, + "train_speed(iter/s)": 1.636223 + }, + { + "acc": 0.68499188, + "epoch": 1.2233637747336377, + "grad_norm": 6.09375, + "learning_rate": 3.5862889680142133e-06, + "loss": 1.53002768, + "memory(GiB)": 117.38, + "step": 48225, + "train_speed(iter/s)": 1.636242 + }, + { + "acc": 0.65768089, + "epoch": 1.2234906139015729, + "grad_norm": 5.78125, + "learning_rate": 3.5852831635399833e-06, + "loss": 1.60810699, + "memory(GiB)": 117.38, + "step": 48230, + "train_speed(iter/s)": 1.63626 + }, + { + "acc": 0.65890818, + "epoch": 1.2236174530695079, + "grad_norm": 6.15625, + "learning_rate": 3.5842774212913144e-06, + "loss": 1.621632, + "memory(GiB)": 117.38, + "step": 48235, + "train_speed(iter/s)": 1.636279 + }, + { + "acc": 0.65425215, + "epoch": 1.2237442922374429, + "grad_norm": 5.75, + "learning_rate": 3.583271741312445e-06, + "loss": 1.61629219, + "memory(GiB)": 117.38, + "step": 48240, + "train_speed(iter/s)": 1.636299 + }, + { + "acc": 0.65213137, + "epoch": 1.223871131405378, + "grad_norm": 7.5625, + "learning_rate": 3.58226612364761e-06, + "loss": 1.59334278, + "memory(GiB)": 117.38, + "step": 48245, + "train_speed(iter/s)": 1.63632 + }, + { + "acc": 0.66750031, + "epoch": 1.223997970573313, + "grad_norm": 5.46875, + "learning_rate": 3.581260568341042e-06, + "loss": 1.56542473, + "memory(GiB)": 117.38, + "step": 48250, + "train_speed(iter/s)": 1.636339 + }, + { + "acc": 0.64325552, + "epoch": 1.224124809741248, + "grad_norm": 5.9375, + "learning_rate": 3.580255075436967e-06, + "loss": 1.67417564, + "memory(GiB)": 117.38, + "step": 48255, + "train_speed(iter/s)": 1.636358 + }, + { + "acc": 0.65055532, + "epoch": 1.224251648909183, + "grad_norm": 5.09375, + "learning_rate": 3.5792496449796127e-06, + "loss": 1.59499588, + "memory(GiB)": 117.38, + "step": 48260, + "train_speed(iter/s)": 1.636376 + }, + { + "acc": 0.67153797, + "epoch": 1.2243784880771182, + "grad_norm": 5.125, + "learning_rate": 3.578244277013201e-06, + "loss": 1.55264473, + "memory(GiB)": 117.38, + "step": 48265, + "train_speed(iter/s)": 1.636395 + }, + { + "acc": 0.66648693, + "epoch": 1.2245053272450532, + "grad_norm": 4.65625, + "learning_rate": 3.5772389715819568e-06, + "loss": 1.60946083, + "memory(GiB)": 117.38, + "step": 48270, + "train_speed(iter/s)": 1.636414 + }, + { + "acc": 0.66712675, + "epoch": 1.2246321664129884, + "grad_norm": 5.875, + "learning_rate": 3.5762337287300925e-06, + "loss": 1.55492907, + "memory(GiB)": 117.38, + "step": 48275, + "train_speed(iter/s)": 1.636433 + }, + { + "acc": 0.65120153, + "epoch": 1.2247590055809234, + "grad_norm": 5.21875, + "learning_rate": 3.575228548501825e-06, + "loss": 1.61043358, + "memory(GiB)": 117.38, + "step": 48280, + "train_speed(iter/s)": 1.636452 + }, + { + "acc": 0.64543929, + "epoch": 1.2248858447488584, + "grad_norm": 5.90625, + "learning_rate": 3.574223430941368e-06, + "loss": 1.62257843, + "memory(GiB)": 117.38, + "step": 48285, + "train_speed(iter/s)": 1.63647 + }, + { + "acc": 0.64128962, + "epoch": 1.2250126839167934, + "grad_norm": 6.875, + "learning_rate": 3.573218376092932e-06, + "loss": 1.70697937, + "memory(GiB)": 117.38, + "step": 48290, + "train_speed(iter/s)": 1.636489 + }, + { + "acc": 0.65640936, + "epoch": 1.2251395230847286, + "grad_norm": 6.5625, + "learning_rate": 3.5722133840007197e-06, + "loss": 1.61730499, + "memory(GiB)": 117.38, + "step": 48295, + "train_speed(iter/s)": 1.636508 + }, + { + "acc": 0.6418808, + "epoch": 1.2252663622526636, + "grad_norm": 7.1875, + "learning_rate": 3.5712084547089367e-06, + "loss": 1.6134119, + "memory(GiB)": 117.38, + "step": 48300, + "train_speed(iter/s)": 1.636527 + }, + { + "acc": 0.65859146, + "epoch": 1.2253932014205986, + "grad_norm": 6.5, + "learning_rate": 3.5702035882617857e-06, + "loss": 1.60098457, + "memory(GiB)": 117.38, + "step": 48305, + "train_speed(iter/s)": 1.636544 + }, + { + "acc": 0.64533696, + "epoch": 1.2255200405885338, + "grad_norm": 6.0, + "learning_rate": 3.5691987847034667e-06, + "loss": 1.65100422, + "memory(GiB)": 117.38, + "step": 48310, + "train_speed(iter/s)": 1.636562 + }, + { + "acc": 0.65092273, + "epoch": 1.2256468797564688, + "grad_norm": 6.34375, + "learning_rate": 3.5681940440781705e-06, + "loss": 1.64381409, + "memory(GiB)": 117.38, + "step": 48315, + "train_speed(iter/s)": 1.636579 + }, + { + "acc": 0.67316704, + "epoch": 1.2257737189244038, + "grad_norm": 5.65625, + "learning_rate": 3.5671893664300934e-06, + "loss": 1.55052252, + "memory(GiB)": 117.38, + "step": 48320, + "train_speed(iter/s)": 1.636597 + }, + { + "acc": 0.66980958, + "epoch": 1.225900558092339, + "grad_norm": 5.1875, + "learning_rate": 3.5661847518034244e-06, + "loss": 1.49008713, + "memory(GiB)": 117.38, + "step": 48325, + "train_speed(iter/s)": 1.636615 + }, + { + "acc": 0.63822718, + "epoch": 1.226027397260274, + "grad_norm": 5.6875, + "learning_rate": 3.5651802002423543e-06, + "loss": 1.62146816, + "memory(GiB)": 117.38, + "step": 48330, + "train_speed(iter/s)": 1.636632 + }, + { + "acc": 0.66818008, + "epoch": 1.226154236428209, + "grad_norm": 6.34375, + "learning_rate": 3.5641757117910625e-06, + "loss": 1.57961998, + "memory(GiB)": 117.38, + "step": 48335, + "train_speed(iter/s)": 1.636649 + }, + { + "acc": 0.64077725, + "epoch": 1.2262810755961442, + "grad_norm": 6.125, + "learning_rate": 3.563171286493734e-06, + "loss": 1.67039032, + "memory(GiB)": 117.38, + "step": 48340, + "train_speed(iter/s)": 1.636667 + }, + { + "acc": 0.63302851, + "epoch": 1.2264079147640792, + "grad_norm": 6.0, + "learning_rate": 3.5621669243945457e-06, + "loss": 1.63067474, + "memory(GiB)": 117.38, + "step": 48345, + "train_speed(iter/s)": 1.636684 + }, + { + "acc": 0.65148969, + "epoch": 1.2265347539320142, + "grad_norm": 5.15625, + "learning_rate": 3.5611626255376785e-06, + "loss": 1.60756302, + "memory(GiB)": 117.38, + "step": 48350, + "train_speed(iter/s)": 1.636703 + }, + { + "acc": 0.67346864, + "epoch": 1.2266615930999492, + "grad_norm": 5.65625, + "learning_rate": 3.560158389967302e-06, + "loss": 1.59825792, + "memory(GiB)": 117.38, + "step": 48355, + "train_speed(iter/s)": 1.636722 + }, + { + "acc": 0.65535822, + "epoch": 1.2267884322678844, + "grad_norm": 7.96875, + "learning_rate": 3.559154217727586e-06, + "loss": 1.67674046, + "memory(GiB)": 117.38, + "step": 48360, + "train_speed(iter/s)": 1.63674 + }, + { + "acc": 0.65066137, + "epoch": 1.2269152714358194, + "grad_norm": 6.5625, + "learning_rate": 3.5581501088627026e-06, + "loss": 1.69373207, + "memory(GiB)": 117.38, + "step": 48365, + "train_speed(iter/s)": 1.636759 + }, + { + "acc": 0.66161833, + "epoch": 1.2270421106037543, + "grad_norm": 5.84375, + "learning_rate": 3.557146063416815e-06, + "loss": 1.57575531, + "memory(GiB)": 117.38, + "step": 48370, + "train_speed(iter/s)": 1.636777 + }, + { + "acc": 0.65113878, + "epoch": 1.2271689497716896, + "grad_norm": 5.625, + "learning_rate": 3.5561420814340843e-06, + "loss": 1.63597908, + "memory(GiB)": 117.38, + "step": 48375, + "train_speed(iter/s)": 1.636796 + }, + { + "acc": 0.64098854, + "epoch": 1.2272957889396245, + "grad_norm": 6.03125, + "learning_rate": 3.555138162958671e-06, + "loss": 1.66200447, + "memory(GiB)": 117.38, + "step": 48380, + "train_speed(iter/s)": 1.636814 + }, + { + "acc": 0.66435156, + "epoch": 1.2274226281075595, + "grad_norm": 6.34375, + "learning_rate": 3.5541343080347325e-06, + "loss": 1.54340277, + "memory(GiB)": 117.38, + "step": 48385, + "train_speed(iter/s)": 1.636832 + }, + { + "acc": 0.65611553, + "epoch": 1.2275494672754947, + "grad_norm": 5.4375, + "learning_rate": 3.5531305167064234e-06, + "loss": 1.60675163, + "memory(GiB)": 117.38, + "step": 48390, + "train_speed(iter/s)": 1.636848 + }, + { + "acc": 0.66836038, + "epoch": 1.2276763064434297, + "grad_norm": 5.6875, + "learning_rate": 3.5521267890178922e-06, + "loss": 1.54638166, + "memory(GiB)": 117.38, + "step": 48395, + "train_speed(iter/s)": 1.636866 + }, + { + "acc": 0.65401192, + "epoch": 1.2278031456113647, + "grad_norm": 5.0625, + "learning_rate": 3.5511231250132905e-06, + "loss": 1.58119812, + "memory(GiB)": 117.38, + "step": 48400, + "train_speed(iter/s)": 1.636883 + }, + { + "acc": 0.65706849, + "epoch": 1.2279299847793, + "grad_norm": 4.84375, + "learning_rate": 3.550119524736761e-06, + "loss": 1.657444, + "memory(GiB)": 117.38, + "step": 48405, + "train_speed(iter/s)": 1.636901 + }, + { + "acc": 0.64591465, + "epoch": 1.228056823947235, + "grad_norm": 6.84375, + "learning_rate": 3.5491159882324513e-06, + "loss": 1.6000946, + "memory(GiB)": 117.38, + "step": 48410, + "train_speed(iter/s)": 1.636918 + }, + { + "acc": 0.65541887, + "epoch": 1.22818366311517, + "grad_norm": 5.875, + "learning_rate": 3.548112515544495e-06, + "loss": 1.59203568, + "memory(GiB)": 117.38, + "step": 48415, + "train_speed(iter/s)": 1.636935 + }, + { + "acc": 0.65563235, + "epoch": 1.228310502283105, + "grad_norm": 7.28125, + "learning_rate": 3.547109106717034e-06, + "loss": 1.58837109, + "memory(GiB)": 117.38, + "step": 48420, + "train_speed(iter/s)": 1.636952 + }, + { + "acc": 0.65881538, + "epoch": 1.22843734145104, + "grad_norm": 5.78125, + "learning_rate": 3.546105761794199e-06, + "loss": 1.62506142, + "memory(GiB)": 117.38, + "step": 48425, + "train_speed(iter/s)": 1.63697 + }, + { + "acc": 0.65422049, + "epoch": 1.228564180618975, + "grad_norm": 5.6875, + "learning_rate": 3.5451024808201268e-06, + "loss": 1.60268974, + "memory(GiB)": 117.38, + "step": 48430, + "train_speed(iter/s)": 1.636989 + }, + { + "acc": 0.63837881, + "epoch": 1.2286910197869103, + "grad_norm": 5.46875, + "learning_rate": 3.5440992638389417e-06, + "loss": 1.66930008, + "memory(GiB)": 117.38, + "step": 48435, + "train_speed(iter/s)": 1.637006 + }, + { + "acc": 0.66433296, + "epoch": 1.2288178589548453, + "grad_norm": 6.625, + "learning_rate": 3.5430961108947705e-06, + "loss": 1.59345856, + "memory(GiB)": 117.38, + "step": 48440, + "train_speed(iter/s)": 1.637023 + }, + { + "acc": 0.64219632, + "epoch": 1.2289446981227803, + "grad_norm": 5.15625, + "learning_rate": 3.5420930220317373e-06, + "loss": 1.60974751, + "memory(GiB)": 117.38, + "step": 48445, + "train_speed(iter/s)": 1.63704 + }, + { + "acc": 0.66603765, + "epoch": 1.2290715372907153, + "grad_norm": 5.65625, + "learning_rate": 3.541089997293964e-06, + "loss": 1.5459074, + "memory(GiB)": 117.38, + "step": 48450, + "train_speed(iter/s)": 1.637058 + }, + { + "acc": 0.6596693, + "epoch": 1.2291983764586505, + "grad_norm": 6.75, + "learning_rate": 3.5400870367255635e-06, + "loss": 1.54205732, + "memory(GiB)": 117.38, + "step": 48455, + "train_speed(iter/s)": 1.637076 + }, + { + "acc": 0.64692497, + "epoch": 1.2293252156265855, + "grad_norm": 6.96875, + "learning_rate": 3.539084140370654e-06, + "loss": 1.58313284, + "memory(GiB)": 117.38, + "step": 48460, + "train_speed(iter/s)": 1.637095 + }, + { + "acc": 0.65525618, + "epoch": 1.2294520547945205, + "grad_norm": 5.5625, + "learning_rate": 3.538081308273347e-06, + "loss": 1.62422295, + "memory(GiB)": 117.38, + "step": 48465, + "train_speed(iter/s)": 1.637112 + }, + { + "acc": 0.65754433, + "epoch": 1.2295788939624557, + "grad_norm": 6.46875, + "learning_rate": 3.537078540477752e-06, + "loss": 1.58634987, + "memory(GiB)": 117.38, + "step": 48470, + "train_speed(iter/s)": 1.63713 + }, + { + "acc": 0.64122953, + "epoch": 1.2297057331303907, + "grad_norm": 5.75, + "learning_rate": 3.5360758370279722e-06, + "loss": 1.64821491, + "memory(GiB)": 117.38, + "step": 48475, + "train_speed(iter/s)": 1.637147 + }, + { + "acc": 0.66488471, + "epoch": 1.2298325722983257, + "grad_norm": 5.65625, + "learning_rate": 3.535073197968114e-06, + "loss": 1.54251547, + "memory(GiB)": 117.38, + "step": 48480, + "train_speed(iter/s)": 1.637165 + }, + { + "acc": 0.66227217, + "epoch": 1.2299594114662609, + "grad_norm": 5.9375, + "learning_rate": 3.5340706233422763e-06, + "loss": 1.6419178, + "memory(GiB)": 117.38, + "step": 48485, + "train_speed(iter/s)": 1.637183 + }, + { + "acc": 0.65090356, + "epoch": 1.2300862506341959, + "grad_norm": 6.1875, + "learning_rate": 3.5330681131945588e-06, + "loss": 1.60829468, + "memory(GiB)": 117.38, + "step": 48490, + "train_speed(iter/s)": 1.637201 + }, + { + "acc": 0.65389252, + "epoch": 1.2302130898021308, + "grad_norm": 5.21875, + "learning_rate": 3.5320656675690546e-06, + "loss": 1.55919609, + "memory(GiB)": 117.38, + "step": 48495, + "train_speed(iter/s)": 1.637218 + }, + { + "acc": 0.65006638, + "epoch": 1.230339928970066, + "grad_norm": 5.875, + "learning_rate": 3.531063286509855e-06, + "loss": 1.56216888, + "memory(GiB)": 117.38, + "step": 48500, + "train_speed(iter/s)": 1.637235 + }, + { + "acc": 0.66443434, + "epoch": 1.230466768138001, + "grad_norm": 6.40625, + "learning_rate": 3.530060970061051e-06, + "loss": 1.60653458, + "memory(GiB)": 117.38, + "step": 48505, + "train_speed(iter/s)": 1.637254 + }, + { + "acc": 0.65085635, + "epoch": 1.230593607305936, + "grad_norm": 5.625, + "learning_rate": 3.52905871826673e-06, + "loss": 1.65162544, + "memory(GiB)": 117.38, + "step": 48510, + "train_speed(iter/s)": 1.637271 + }, + { + "acc": 0.65777621, + "epoch": 1.230720446473871, + "grad_norm": 5.5625, + "learning_rate": 3.5280565311709725e-06, + "loss": 1.56496115, + "memory(GiB)": 117.38, + "step": 48515, + "train_speed(iter/s)": 1.637289 + }, + { + "acc": 0.64512844, + "epoch": 1.2308472856418062, + "grad_norm": 6.53125, + "learning_rate": 3.5270544088178597e-06, + "loss": 1.64786587, + "memory(GiB)": 117.38, + "step": 48520, + "train_speed(iter/s)": 1.637307 + }, + { + "acc": 0.65158119, + "epoch": 1.2309741248097412, + "grad_norm": 5.3125, + "learning_rate": 3.526052351251471e-06, + "loss": 1.62798271, + "memory(GiB)": 117.38, + "step": 48525, + "train_speed(iter/s)": 1.637326 + }, + { + "acc": 0.65848885, + "epoch": 1.2311009639776762, + "grad_norm": 6.375, + "learning_rate": 3.5250503585158825e-06, + "loss": 1.61647625, + "memory(GiB)": 117.38, + "step": 48530, + "train_speed(iter/s)": 1.637343 + }, + { + "acc": 0.65481181, + "epoch": 1.2312278031456114, + "grad_norm": 5.15625, + "learning_rate": 3.5240484306551615e-06, + "loss": 1.5835804, + "memory(GiB)": 117.38, + "step": 48535, + "train_speed(iter/s)": 1.637361 + }, + { + "acc": 0.65276566, + "epoch": 1.2313546423135464, + "grad_norm": 6.71875, + "learning_rate": 3.5230465677133813e-06, + "loss": 1.65755291, + "memory(GiB)": 117.38, + "step": 48540, + "train_speed(iter/s)": 1.637379 + }, + { + "acc": 0.64752941, + "epoch": 1.2314814814814814, + "grad_norm": 5.84375, + "learning_rate": 3.5220447697346063e-06, + "loss": 1.65055313, + "memory(GiB)": 117.38, + "step": 48545, + "train_speed(iter/s)": 1.637397 + }, + { + "acc": 0.65033035, + "epoch": 1.2316083206494166, + "grad_norm": 8.25, + "learning_rate": 3.521043036762903e-06, + "loss": 1.67698288, + "memory(GiB)": 117.38, + "step": 48550, + "train_speed(iter/s)": 1.637414 + }, + { + "acc": 0.65268202, + "epoch": 1.2317351598173516, + "grad_norm": 5.125, + "learning_rate": 3.5200413688423284e-06, + "loss": 1.59862814, + "memory(GiB)": 117.38, + "step": 48555, + "train_speed(iter/s)": 1.637432 + }, + { + "acc": 0.6540966, + "epoch": 1.2318619989852866, + "grad_norm": 6.8125, + "learning_rate": 3.519039766016943e-06, + "loss": 1.6391943, + "memory(GiB)": 117.38, + "step": 48560, + "train_speed(iter/s)": 1.637449 + }, + { + "acc": 0.64522371, + "epoch": 1.2319888381532218, + "grad_norm": 5.78125, + "learning_rate": 3.5180382283307983e-06, + "loss": 1.70728416, + "memory(GiB)": 117.38, + "step": 48565, + "train_speed(iter/s)": 1.637468 + }, + { + "acc": 0.66855621, + "epoch": 1.2321156773211568, + "grad_norm": 4.90625, + "learning_rate": 3.517036755827952e-06, + "loss": 1.56782322, + "memory(GiB)": 117.38, + "step": 48570, + "train_speed(iter/s)": 1.637486 + }, + { + "acc": 0.66778879, + "epoch": 1.2322425164890918, + "grad_norm": 5.1875, + "learning_rate": 3.516035348552449e-06, + "loss": 1.58797293, + "memory(GiB)": 117.38, + "step": 48575, + "train_speed(iter/s)": 1.637504 + }, + { + "acc": 0.65921392, + "epoch": 1.2323693556570268, + "grad_norm": 5.84375, + "learning_rate": 3.515034006548335e-06, + "loss": 1.5883091, + "memory(GiB)": 117.38, + "step": 48580, + "train_speed(iter/s)": 1.637523 + }, + { + "acc": 0.66598668, + "epoch": 1.232496194824962, + "grad_norm": 6.46875, + "learning_rate": 3.5140327298596565e-06, + "loss": 1.5754281, + "memory(GiB)": 117.38, + "step": 48585, + "train_speed(iter/s)": 1.637541 + }, + { + "acc": 0.65871267, + "epoch": 1.232623033992897, + "grad_norm": 4.875, + "learning_rate": 3.5130315185304547e-06, + "loss": 1.56855316, + "memory(GiB)": 117.38, + "step": 48590, + "train_speed(iter/s)": 1.637559 + }, + { + "acc": 0.65208883, + "epoch": 1.2327498731608322, + "grad_norm": 5.53125, + "learning_rate": 3.5120303726047642e-06, + "loss": 1.61041756, + "memory(GiB)": 117.38, + "step": 48595, + "train_speed(iter/s)": 1.637577 + }, + { + "acc": 0.64740219, + "epoch": 1.2328767123287672, + "grad_norm": 5.28125, + "learning_rate": 3.51102929212662e-06, + "loss": 1.69913807, + "memory(GiB)": 117.38, + "step": 48600, + "train_speed(iter/s)": 1.637595 + }, + { + "acc": 0.64866199, + "epoch": 1.2330035514967022, + "grad_norm": 5.375, + "learning_rate": 3.5100282771400563e-06, + "loss": 1.66856041, + "memory(GiB)": 117.38, + "step": 48605, + "train_speed(iter/s)": 1.637614 + }, + { + "acc": 0.64932766, + "epoch": 1.2331303906646371, + "grad_norm": 4.96875, + "learning_rate": 3.5090273276891023e-06, + "loss": 1.60012932, + "memory(GiB)": 117.38, + "step": 48610, + "train_speed(iter/s)": 1.637632 + }, + { + "acc": 0.65619082, + "epoch": 1.2332572298325724, + "grad_norm": 5.5625, + "learning_rate": 3.5080264438177815e-06, + "loss": 1.58124876, + "memory(GiB)": 117.38, + "step": 48615, + "train_speed(iter/s)": 1.637649 + }, + { + "acc": 0.64793406, + "epoch": 1.2333840690005073, + "grad_norm": 5.0625, + "learning_rate": 3.50702562557012e-06, + "loss": 1.67176228, + "memory(GiB)": 117.38, + "step": 48620, + "train_speed(iter/s)": 1.637667 + }, + { + "acc": 0.65264406, + "epoch": 1.2335109081684423, + "grad_norm": 4.6875, + "learning_rate": 3.506024872990135e-06, + "loss": 1.61403351, + "memory(GiB)": 117.38, + "step": 48625, + "train_speed(iter/s)": 1.637685 + }, + { + "acc": 0.65634069, + "epoch": 1.2336377473363775, + "grad_norm": 5.875, + "learning_rate": 3.5050241861218493e-06, + "loss": 1.56550303, + "memory(GiB)": 117.38, + "step": 48630, + "train_speed(iter/s)": 1.637703 + }, + { + "acc": 0.65947065, + "epoch": 1.2337645865043125, + "grad_norm": 5.65625, + "learning_rate": 3.5040235650092725e-06, + "loss": 1.57389193, + "memory(GiB)": 117.38, + "step": 48635, + "train_speed(iter/s)": 1.637721 + }, + { + "acc": 0.68081012, + "epoch": 1.2338914256722475, + "grad_norm": 6.6875, + "learning_rate": 3.503023009696419e-06, + "loss": 1.46642466, + "memory(GiB)": 117.38, + "step": 48640, + "train_speed(iter/s)": 1.637739 + }, + { + "acc": 0.65444117, + "epoch": 1.2340182648401827, + "grad_norm": 5.75, + "learning_rate": 3.5020225202272963e-06, + "loss": 1.63811436, + "memory(GiB)": 117.38, + "step": 48645, + "train_speed(iter/s)": 1.637757 + }, + { + "acc": 0.63551197, + "epoch": 1.2341451040081177, + "grad_norm": 5.65625, + "learning_rate": 3.501022096645913e-06, + "loss": 1.67545319, + "memory(GiB)": 117.38, + "step": 48650, + "train_speed(iter/s)": 1.637774 + }, + { + "acc": 0.64973135, + "epoch": 1.2342719431760527, + "grad_norm": 6.21875, + "learning_rate": 3.5000217389962685e-06, + "loss": 1.64544411, + "memory(GiB)": 117.38, + "step": 48655, + "train_speed(iter/s)": 1.637793 + }, + { + "acc": 0.65190687, + "epoch": 1.234398782343988, + "grad_norm": 5.0, + "learning_rate": 3.499021447322365e-06, + "loss": 1.66008663, + "memory(GiB)": 117.38, + "step": 48660, + "train_speed(iter/s)": 1.637812 + }, + { + "acc": 0.65976696, + "epoch": 1.234525621511923, + "grad_norm": 5.0, + "learning_rate": 3.4980212216681997e-06, + "loss": 1.61526642, + "memory(GiB)": 117.38, + "step": 48665, + "train_speed(iter/s)": 1.637829 + }, + { + "acc": 0.65877628, + "epoch": 1.234652460679858, + "grad_norm": 4.84375, + "learning_rate": 3.4970210620777687e-06, + "loss": 1.59925518, + "memory(GiB)": 117.38, + "step": 48670, + "train_speed(iter/s)": 1.637847 + }, + { + "acc": 0.67364249, + "epoch": 1.2347792998477929, + "grad_norm": 4.84375, + "learning_rate": 3.496020968595059e-06, + "loss": 1.60163021, + "memory(GiB)": 117.38, + "step": 48675, + "train_speed(iter/s)": 1.637866 + }, + { + "acc": 0.67047882, + "epoch": 1.234906139015728, + "grad_norm": 4.8125, + "learning_rate": 3.4950209412640634e-06, + "loss": 1.54944868, + "memory(GiB)": 117.38, + "step": 48680, + "train_speed(iter/s)": 1.637885 + }, + { + "acc": 0.66833687, + "epoch": 1.235032978183663, + "grad_norm": 5.5625, + "learning_rate": 3.494020980128766e-06, + "loss": 1.5363266, + "memory(GiB)": 117.38, + "step": 48685, + "train_speed(iter/s)": 1.637903 + }, + { + "acc": 0.66244659, + "epoch": 1.235159817351598, + "grad_norm": 6.34375, + "learning_rate": 3.4930210852331505e-06, + "loss": 1.52262459, + "memory(GiB)": 117.38, + "step": 48690, + "train_speed(iter/s)": 1.637922 + }, + { + "acc": 0.66017919, + "epoch": 1.2352866565195333, + "grad_norm": 6.90625, + "learning_rate": 3.4920212566211943e-06, + "loss": 1.49845982, + "memory(GiB)": 117.38, + "step": 48695, + "train_speed(iter/s)": 1.637942 + }, + { + "acc": 0.6517664, + "epoch": 1.2354134956874683, + "grad_norm": 5.9375, + "learning_rate": 3.491021494336876e-06, + "loss": 1.60274467, + "memory(GiB)": 117.38, + "step": 48700, + "train_speed(iter/s)": 1.637961 + }, + { + "acc": 0.64981036, + "epoch": 1.2355403348554033, + "grad_norm": 5.6875, + "learning_rate": 3.4900217984241692e-06, + "loss": 1.60644913, + "memory(GiB)": 117.38, + "step": 48705, + "train_speed(iter/s)": 1.637979 + }, + { + "acc": 0.65412197, + "epoch": 1.2356671740233385, + "grad_norm": 7.625, + "learning_rate": 3.4890221689270466e-06, + "loss": 1.58664608, + "memory(GiB)": 117.38, + "step": 48710, + "train_speed(iter/s)": 1.637997 + }, + { + "acc": 0.66361661, + "epoch": 1.2357940131912735, + "grad_norm": 5.96875, + "learning_rate": 3.488022605889475e-06, + "loss": 1.57928696, + "memory(GiB)": 117.38, + "step": 48715, + "train_speed(iter/s)": 1.638015 + }, + { + "acc": 0.65932579, + "epoch": 1.2359208523592085, + "grad_norm": 7.15625, + "learning_rate": 3.4870231093554172e-06, + "loss": 1.54066334, + "memory(GiB)": 117.38, + "step": 48720, + "train_speed(iter/s)": 1.638034 + }, + { + "acc": 0.6463203, + "epoch": 1.2360476915271437, + "grad_norm": 5.46875, + "learning_rate": 3.4860236793688407e-06, + "loss": 1.64001732, + "memory(GiB)": 117.38, + "step": 48725, + "train_speed(iter/s)": 1.638052 + }, + { + "acc": 0.65029235, + "epoch": 1.2361745306950787, + "grad_norm": 5.96875, + "learning_rate": 3.4850243159737024e-06, + "loss": 1.59087286, + "memory(GiB)": 117.38, + "step": 48730, + "train_speed(iter/s)": 1.638071 + }, + { + "acc": 0.6542088, + "epoch": 1.2363013698630136, + "grad_norm": 5.84375, + "learning_rate": 3.4840250192139574e-06, + "loss": 1.64720364, + "memory(GiB)": 117.38, + "step": 48735, + "train_speed(iter/s)": 1.638089 + }, + { + "acc": 0.66338096, + "epoch": 1.2364282090309486, + "grad_norm": 6.34375, + "learning_rate": 3.4830257891335595e-06, + "loss": 1.52913914, + "memory(GiB)": 117.38, + "step": 48740, + "train_speed(iter/s)": 1.638108 + }, + { + "acc": 0.65706387, + "epoch": 1.2365550481988838, + "grad_norm": 4.9375, + "learning_rate": 3.4820266257764613e-06, + "loss": 1.51138716, + "memory(GiB)": 117.38, + "step": 48745, + "train_speed(iter/s)": 1.638128 + }, + { + "acc": 0.66781425, + "epoch": 1.2366818873668188, + "grad_norm": 4.8125, + "learning_rate": 3.4810275291866103e-06, + "loss": 1.55542421, + "memory(GiB)": 117.38, + "step": 48750, + "train_speed(iter/s)": 1.638146 + }, + { + "acc": 0.65449901, + "epoch": 1.236808726534754, + "grad_norm": 5.71875, + "learning_rate": 3.4800284994079487e-06, + "loss": 1.5806776, + "memory(GiB)": 117.38, + "step": 48755, + "train_speed(iter/s)": 1.638165 + }, + { + "acc": 0.64442616, + "epoch": 1.236935565702689, + "grad_norm": 5.25, + "learning_rate": 3.4790295364844207e-06, + "loss": 1.63501282, + "memory(GiB)": 117.38, + "step": 48760, + "train_speed(iter/s)": 1.638184 + }, + { + "acc": 0.64733019, + "epoch": 1.237062404870624, + "grad_norm": 6.03125, + "learning_rate": 3.4780306404599628e-06, + "loss": 1.60623474, + "memory(GiB)": 117.38, + "step": 48765, + "train_speed(iter/s)": 1.638202 + }, + { + "acc": 0.64679103, + "epoch": 1.237189244038559, + "grad_norm": 7.125, + "learning_rate": 3.4770318113785164e-06, + "loss": 1.67006912, + "memory(GiB)": 117.38, + "step": 48770, + "train_speed(iter/s)": 1.638221 + }, + { + "acc": 0.65661383, + "epoch": 1.2373160832064942, + "grad_norm": 6.625, + "learning_rate": 3.4760330492840065e-06, + "loss": 1.64981728, + "memory(GiB)": 117.38, + "step": 48775, + "train_speed(iter/s)": 1.638239 + }, + { + "acc": 0.65511599, + "epoch": 1.2374429223744292, + "grad_norm": 6.25, + "learning_rate": 3.4750343542203684e-06, + "loss": 1.62428207, + "memory(GiB)": 117.38, + "step": 48780, + "train_speed(iter/s)": 1.638258 + }, + { + "acc": 0.649823, + "epoch": 1.2375697615423642, + "grad_norm": 5.65625, + "learning_rate": 3.474035726231527e-06, + "loss": 1.64878502, + "memory(GiB)": 117.38, + "step": 48785, + "train_speed(iter/s)": 1.638276 + }, + { + "acc": 0.66283646, + "epoch": 1.2376966007102994, + "grad_norm": 5.625, + "learning_rate": 3.473037165361409e-06, + "loss": 1.63088703, + "memory(GiB)": 117.38, + "step": 48790, + "train_speed(iter/s)": 1.638294 + }, + { + "acc": 0.66233521, + "epoch": 1.2378234398782344, + "grad_norm": 6.1875, + "learning_rate": 3.4720386716539333e-06, + "loss": 1.5894309, + "memory(GiB)": 117.38, + "step": 48795, + "train_speed(iter/s)": 1.638313 + }, + { + "acc": 0.65756836, + "epoch": 1.2379502790461694, + "grad_norm": 5.21875, + "learning_rate": 3.471040245153018e-06, + "loss": 1.59217596, + "memory(GiB)": 117.38, + "step": 48800, + "train_speed(iter/s)": 1.638332 + }, + { + "acc": 0.65424194, + "epoch": 1.2380771182141046, + "grad_norm": 5.21875, + "learning_rate": 3.4700418859025793e-06, + "loss": 1.61271763, + "memory(GiB)": 117.38, + "step": 48805, + "train_speed(iter/s)": 1.63835 + }, + { + "acc": 0.66022105, + "epoch": 1.2382039573820396, + "grad_norm": 6.84375, + "learning_rate": 3.4690435939465307e-06, + "loss": 1.57620773, + "memory(GiB)": 117.38, + "step": 48810, + "train_speed(iter/s)": 1.638368 + }, + { + "acc": 0.65015993, + "epoch": 1.2383307965499746, + "grad_norm": 4.90625, + "learning_rate": 3.4680453693287786e-06, + "loss": 1.56861115, + "memory(GiB)": 117.38, + "step": 48815, + "train_speed(iter/s)": 1.638386 + }, + { + "acc": 0.6541832, + "epoch": 1.2384576357179098, + "grad_norm": 6.6875, + "learning_rate": 3.4670472120932297e-06, + "loss": 1.62443256, + "memory(GiB)": 117.38, + "step": 48820, + "train_speed(iter/s)": 1.638402 + }, + { + "acc": 0.65462232, + "epoch": 1.2385844748858448, + "grad_norm": 6.84375, + "learning_rate": 3.46604912228379e-06, + "loss": 1.63511925, + "memory(GiB)": 117.38, + "step": 48825, + "train_speed(iter/s)": 1.63842 + }, + { + "acc": 0.66376519, + "epoch": 1.2387113140537798, + "grad_norm": 5.84375, + "learning_rate": 3.46505109994436e-06, + "loss": 1.58396301, + "memory(GiB)": 117.38, + "step": 48830, + "train_speed(iter/s)": 1.638438 + }, + { + "acc": 0.6551754, + "epoch": 1.2388381532217148, + "grad_norm": 5.40625, + "learning_rate": 3.464053145118833e-06, + "loss": 1.53278046, + "memory(GiB)": 117.38, + "step": 48835, + "train_speed(iter/s)": 1.638454 + }, + { + "acc": 0.66460633, + "epoch": 1.23896499238965, + "grad_norm": 7.59375, + "learning_rate": 3.4630552578511073e-06, + "loss": 1.62619667, + "memory(GiB)": 117.38, + "step": 48840, + "train_speed(iter/s)": 1.638472 + }, + { + "acc": 0.64749804, + "epoch": 1.239091831557585, + "grad_norm": 6.125, + "learning_rate": 3.4620574381850723e-06, + "loss": 1.61395226, + "memory(GiB)": 117.38, + "step": 48845, + "train_speed(iter/s)": 1.638491 + }, + { + "acc": 0.65421524, + "epoch": 1.23921867072552, + "grad_norm": 5.8125, + "learning_rate": 3.4610596861646194e-06, + "loss": 1.63729897, + "memory(GiB)": 117.38, + "step": 48850, + "train_speed(iter/s)": 1.638508 + }, + { + "acc": 0.66746111, + "epoch": 1.2393455098934552, + "grad_norm": 6.125, + "learning_rate": 3.460062001833632e-06, + "loss": 1.5572773, + "memory(GiB)": 117.38, + "step": 48855, + "train_speed(iter/s)": 1.638526 + }, + { + "acc": 0.68085127, + "epoch": 1.2394723490613901, + "grad_norm": 6.5, + "learning_rate": 3.459064385235993e-06, + "loss": 1.56359329, + "memory(GiB)": 117.38, + "step": 48860, + "train_speed(iter/s)": 1.638542 + }, + { + "acc": 0.64866028, + "epoch": 1.2395991882293251, + "grad_norm": 6.25, + "learning_rate": 3.45806683641558e-06, + "loss": 1.61035519, + "memory(GiB)": 117.38, + "step": 48865, + "train_speed(iter/s)": 1.63856 + }, + { + "acc": 0.66534638, + "epoch": 1.2397260273972603, + "grad_norm": 5.21875, + "learning_rate": 3.457069355416275e-06, + "loss": 1.50436878, + "memory(GiB)": 117.38, + "step": 48870, + "train_speed(iter/s)": 1.638578 + }, + { + "acc": 0.65916786, + "epoch": 1.2398528665651953, + "grad_norm": 6.03125, + "learning_rate": 3.456071942281947e-06, + "loss": 1.62215271, + "memory(GiB)": 117.38, + "step": 48875, + "train_speed(iter/s)": 1.638598 + }, + { + "acc": 0.64199252, + "epoch": 1.2399797057331303, + "grad_norm": 6.09375, + "learning_rate": 3.455074597056467e-06, + "loss": 1.66741104, + "memory(GiB)": 117.38, + "step": 48880, + "train_speed(iter/s)": 1.638615 + }, + { + "acc": 0.66439462, + "epoch": 1.2401065449010655, + "grad_norm": 5.90625, + "learning_rate": 3.454077319783705e-06, + "loss": 1.50258551, + "memory(GiB)": 117.38, + "step": 48885, + "train_speed(iter/s)": 1.638633 + }, + { + "acc": 0.64252748, + "epoch": 1.2402333840690005, + "grad_norm": 5.34375, + "learning_rate": 3.4530801105075257e-06, + "loss": 1.6697279, + "memory(GiB)": 117.38, + "step": 48890, + "train_speed(iter/s)": 1.638651 + }, + { + "acc": 0.64962044, + "epoch": 1.2403602232369355, + "grad_norm": 5.84375, + "learning_rate": 3.4520829692717874e-06, + "loss": 1.62925987, + "memory(GiB)": 117.38, + "step": 48895, + "train_speed(iter/s)": 1.638669 + }, + { + "acc": 0.64629426, + "epoch": 1.2404870624048705, + "grad_norm": 5.0625, + "learning_rate": 3.451085896120352e-06, + "loss": 1.63187428, + "memory(GiB)": 117.38, + "step": 48900, + "train_speed(iter/s)": 1.638686 + }, + { + "acc": 0.65156045, + "epoch": 1.2406139015728057, + "grad_norm": 5.21875, + "learning_rate": 3.450088891097074e-06, + "loss": 1.60220413, + "memory(GiB)": 117.38, + "step": 48905, + "train_speed(iter/s)": 1.638704 + }, + { + "acc": 0.65528502, + "epoch": 1.2407407407407407, + "grad_norm": 5.78125, + "learning_rate": 3.4490919542458085e-06, + "loss": 1.59906931, + "memory(GiB)": 117.38, + "step": 48910, + "train_speed(iter/s)": 1.638721 + }, + { + "acc": 0.63408594, + "epoch": 1.240867579908676, + "grad_norm": 5.25, + "learning_rate": 3.4480950856104002e-06, + "loss": 1.68478489, + "memory(GiB)": 117.38, + "step": 48915, + "train_speed(iter/s)": 1.63874 + }, + { + "acc": 0.64956036, + "epoch": 1.240994419076611, + "grad_norm": 6.0625, + "learning_rate": 3.4470982852347e-06, + "loss": 1.59546766, + "memory(GiB)": 117.38, + "step": 48920, + "train_speed(iter/s)": 1.638758 + }, + { + "acc": 0.63675089, + "epoch": 1.2411212582445459, + "grad_norm": 5.6875, + "learning_rate": 3.44610155316255e-06, + "loss": 1.65878162, + "memory(GiB)": 117.38, + "step": 48925, + "train_speed(iter/s)": 1.638775 + }, + { + "acc": 0.6600543, + "epoch": 1.2412480974124809, + "grad_norm": 5.5625, + "learning_rate": 3.4451048894377925e-06, + "loss": 1.58657312, + "memory(GiB)": 117.38, + "step": 48930, + "train_speed(iter/s)": 1.638793 + }, + { + "acc": 0.65964618, + "epoch": 1.241374936580416, + "grad_norm": 5.6875, + "learning_rate": 3.444108294104264e-06, + "loss": 1.59323673, + "memory(GiB)": 117.38, + "step": 48935, + "train_speed(iter/s)": 1.638811 + }, + { + "acc": 0.66680927, + "epoch": 1.241501775748351, + "grad_norm": 5.0625, + "learning_rate": 3.443111767205797e-06, + "loss": 1.60281677, + "memory(GiB)": 117.38, + "step": 48940, + "train_speed(iter/s)": 1.638828 + }, + { + "acc": 0.64805794, + "epoch": 1.241628614916286, + "grad_norm": 6.4375, + "learning_rate": 3.442115308786227e-06, + "loss": 1.61931705, + "memory(GiB)": 117.38, + "step": 48945, + "train_speed(iter/s)": 1.638847 + }, + { + "acc": 0.65316944, + "epoch": 1.2417554540842213, + "grad_norm": 6.3125, + "learning_rate": 3.4411189188893822e-06, + "loss": 1.61966019, + "memory(GiB)": 117.38, + "step": 48950, + "train_speed(iter/s)": 1.638865 + }, + { + "acc": 0.65052328, + "epoch": 1.2418822932521563, + "grad_norm": 5.15625, + "learning_rate": 3.4401225975590867e-06, + "loss": 1.65285664, + "memory(GiB)": 117.38, + "step": 48955, + "train_speed(iter/s)": 1.638883 + }, + { + "acc": 0.63330669, + "epoch": 1.2420091324200913, + "grad_norm": 4.5, + "learning_rate": 3.439126344839163e-06, + "loss": 1.61611805, + "memory(GiB)": 117.38, + "step": 48960, + "train_speed(iter/s)": 1.638901 + }, + { + "acc": 0.65963011, + "epoch": 1.2421359715880265, + "grad_norm": 5.96875, + "learning_rate": 3.438130160773431e-06, + "loss": 1.59650936, + "memory(GiB)": 117.38, + "step": 48965, + "train_speed(iter/s)": 1.638918 + }, + { + "acc": 0.65164604, + "epoch": 1.2422628107559615, + "grad_norm": 4.78125, + "learning_rate": 3.43713404540571e-06, + "loss": 1.65793076, + "memory(GiB)": 117.38, + "step": 48970, + "train_speed(iter/s)": 1.638937 + }, + { + "acc": 0.65964561, + "epoch": 1.2423896499238964, + "grad_norm": 5.125, + "learning_rate": 3.4361379987798094e-06, + "loss": 1.64129562, + "memory(GiB)": 117.38, + "step": 48975, + "train_speed(iter/s)": 1.638954 + }, + { + "acc": 0.65840273, + "epoch": 1.2425164890918317, + "grad_norm": 6.3125, + "learning_rate": 3.435142020939542e-06, + "loss": 1.568015, + "memory(GiB)": 117.38, + "step": 48980, + "train_speed(iter/s)": 1.638971 + }, + { + "acc": 0.66496153, + "epoch": 1.2426433282597666, + "grad_norm": 6.0, + "learning_rate": 3.4341461119287144e-06, + "loss": 1.55102015, + "memory(GiB)": 117.38, + "step": 48985, + "train_speed(iter/s)": 1.638989 + }, + { + "acc": 0.66440287, + "epoch": 1.2427701674277016, + "grad_norm": 7.28125, + "learning_rate": 3.433150271791135e-06, + "loss": 1.5986311, + "memory(GiB)": 117.38, + "step": 48990, + "train_speed(iter/s)": 1.639007 + }, + { + "acc": 0.66295815, + "epoch": 1.2428970065956366, + "grad_norm": 6.4375, + "learning_rate": 3.432154500570599e-06, + "loss": 1.54974756, + "memory(GiB)": 117.38, + "step": 48995, + "train_speed(iter/s)": 1.639025 + }, + { + "acc": 0.66154995, + "epoch": 1.2430238457635718, + "grad_norm": 4.78125, + "learning_rate": 3.431158798310909e-06, + "loss": 1.56376553, + "memory(GiB)": 117.38, + "step": 49000, + "train_speed(iter/s)": 1.639043 + }, + { + "epoch": 1.2430238457635718, + "eval_acc": 0.6462434505499555, + "eval_loss": 1.5734843015670776, + "eval_runtime": 58.7744, + "eval_samples_per_second": 108.381, + "eval_steps_per_second": 27.104, + "step": 49000 + }, + { + "acc": 0.64433823, + "epoch": 1.2431506849315068, + "grad_norm": 5.6875, + "learning_rate": 3.4301631650558588e-06, + "loss": 1.63979473, + "memory(GiB)": 117.38, + "step": 49005, + "train_speed(iter/s)": 1.635609 + }, + { + "acc": 0.6495863, + "epoch": 1.2432775240994418, + "grad_norm": 6.03125, + "learning_rate": 3.4291676008492424e-06, + "loss": 1.57677193, + "memory(GiB)": 117.38, + "step": 49010, + "train_speed(iter/s)": 1.635627 + }, + { + "acc": 0.6675385, + "epoch": 1.243404363267377, + "grad_norm": 5.34375, + "learning_rate": 3.428172105734848e-06, + "loss": 1.57157698, + "memory(GiB)": 117.38, + "step": 49015, + "train_speed(iter/s)": 1.635644 + }, + { + "acc": 0.65708847, + "epoch": 1.243531202435312, + "grad_norm": 5.5, + "learning_rate": 3.4271766797564608e-06, + "loss": 1.62360497, + "memory(GiB)": 117.38, + "step": 49020, + "train_speed(iter/s)": 1.635662 + }, + { + "acc": 0.65542431, + "epoch": 1.243658041603247, + "grad_norm": 7.1875, + "learning_rate": 3.4261813229578665e-06, + "loss": 1.61267757, + "memory(GiB)": 117.38, + "step": 49025, + "train_speed(iter/s)": 1.63568 + }, + { + "acc": 0.66790047, + "epoch": 1.2437848807711822, + "grad_norm": 6.15625, + "learning_rate": 3.425186035382846e-06, + "loss": 1.55474529, + "memory(GiB)": 117.38, + "step": 49030, + "train_speed(iter/s)": 1.635696 + }, + { + "acc": 0.65289712, + "epoch": 1.2439117199391172, + "grad_norm": 6.25, + "learning_rate": 3.4241908170751727e-06, + "loss": 1.5711834, + "memory(GiB)": 117.38, + "step": 49035, + "train_speed(iter/s)": 1.635713 + }, + { + "acc": 0.65860806, + "epoch": 1.2440385591070522, + "grad_norm": 5.65625, + "learning_rate": 3.4231956680786217e-06, + "loss": 1.54433584, + "memory(GiB)": 117.38, + "step": 49040, + "train_speed(iter/s)": 1.635731 + }, + { + "acc": 0.65533757, + "epoch": 1.2441653982749874, + "grad_norm": 5.375, + "learning_rate": 3.422200588436967e-06, + "loss": 1.62314548, + "memory(GiB)": 117.38, + "step": 49045, + "train_speed(iter/s)": 1.635748 + }, + { + "acc": 0.64220061, + "epoch": 1.2442922374429224, + "grad_norm": 4.59375, + "learning_rate": 3.4212055781939744e-06, + "loss": 1.65563011, + "memory(GiB)": 117.38, + "step": 49050, + "train_speed(iter/s)": 1.635765 + }, + { + "acc": 0.65179672, + "epoch": 1.2444190766108574, + "grad_norm": 6.4375, + "learning_rate": 3.4202106373934085e-06, + "loss": 1.67295418, + "memory(GiB)": 117.38, + "step": 49055, + "train_speed(iter/s)": 1.635783 + }, + { + "acc": 0.65387635, + "epoch": 1.2445459157787924, + "grad_norm": 6.21875, + "learning_rate": 3.4192157660790324e-06, + "loss": 1.64216576, + "memory(GiB)": 117.38, + "step": 49060, + "train_speed(iter/s)": 1.635796 + }, + { + "acc": 0.64455962, + "epoch": 1.2446727549467276, + "grad_norm": 5.875, + "learning_rate": 3.418220964294604e-06, + "loss": 1.64652824, + "memory(GiB)": 117.38, + "step": 49065, + "train_speed(iter/s)": 1.635813 + }, + { + "acc": 0.64922309, + "epoch": 1.2447995941146626, + "grad_norm": 5.71875, + "learning_rate": 3.417226232083881e-06, + "loss": 1.63553753, + "memory(GiB)": 117.38, + "step": 49070, + "train_speed(iter/s)": 1.635831 + }, + { + "acc": 0.65225453, + "epoch": 1.2449264332825978, + "grad_norm": 6.4375, + "learning_rate": 3.416231569490615e-06, + "loss": 1.64579239, + "memory(GiB)": 117.38, + "step": 49075, + "train_speed(iter/s)": 1.635847 + }, + { + "acc": 0.69131317, + "epoch": 1.2450532724505328, + "grad_norm": 6.5625, + "learning_rate": 3.4152369765585545e-06, + "loss": 1.42371483, + "memory(GiB)": 117.38, + "step": 49080, + "train_speed(iter/s)": 1.635865 + }, + { + "acc": 0.6623085, + "epoch": 1.2451801116184678, + "grad_norm": 6.03125, + "learning_rate": 3.4142424533314474e-06, + "loss": 1.57119064, + "memory(GiB)": 117.38, + "step": 49085, + "train_speed(iter/s)": 1.635883 + }, + { + "acc": 0.63752794, + "epoch": 1.2453069507864027, + "grad_norm": 5.90625, + "learning_rate": 3.4132479998530383e-06, + "loss": 1.66836777, + "memory(GiB)": 117.38, + "step": 49090, + "train_speed(iter/s)": 1.635901 + }, + { + "acc": 0.65183396, + "epoch": 1.245433789954338, + "grad_norm": 5.28125, + "learning_rate": 3.4122536161670656e-06, + "loss": 1.59478483, + "memory(GiB)": 117.38, + "step": 49095, + "train_speed(iter/s)": 1.635919 + }, + { + "acc": 0.66704235, + "epoch": 1.245560629122273, + "grad_norm": 7.75, + "learning_rate": 3.411259302317267e-06, + "loss": 1.54212542, + "memory(GiB)": 117.38, + "step": 49100, + "train_speed(iter/s)": 1.635936 + }, + { + "acc": 0.65002127, + "epoch": 1.245687468290208, + "grad_norm": 5.4375, + "learning_rate": 3.410265058347378e-06, + "loss": 1.55483828, + "memory(GiB)": 117.38, + "step": 49105, + "train_speed(iter/s)": 1.635954 + }, + { + "acc": 0.66530871, + "epoch": 1.2458143074581431, + "grad_norm": 7.3125, + "learning_rate": 3.4092708843011303e-06, + "loss": 1.57090197, + "memory(GiB)": 117.38, + "step": 49110, + "train_speed(iter/s)": 1.635971 + }, + { + "acc": 0.64639974, + "epoch": 1.2459411466260781, + "grad_norm": 5.71875, + "learning_rate": 3.4082767802222493e-06, + "loss": 1.63317451, + "memory(GiB)": 117.38, + "step": 49115, + "train_speed(iter/s)": 1.635989 + }, + { + "acc": 0.65806675, + "epoch": 1.2460679857940131, + "grad_norm": 6.21875, + "learning_rate": 3.4072827461544635e-06, + "loss": 1.61141415, + "memory(GiB)": 117.38, + "step": 49120, + "train_speed(iter/s)": 1.636005 + }, + { + "acc": 0.65344653, + "epoch": 1.2461948249619483, + "grad_norm": 5.8125, + "learning_rate": 3.4062887821414935e-06, + "loss": 1.57752876, + "memory(GiB)": 117.38, + "step": 49125, + "train_speed(iter/s)": 1.636023 + }, + { + "acc": 0.65416827, + "epoch": 1.2463216641298833, + "grad_norm": 5.40625, + "learning_rate": 3.4052948882270585e-06, + "loss": 1.66028385, + "memory(GiB)": 117.38, + "step": 49130, + "train_speed(iter/s)": 1.63604 + }, + { + "acc": 0.65115395, + "epoch": 1.2464485032978183, + "grad_norm": 6.96875, + "learning_rate": 3.404301064454873e-06, + "loss": 1.69198875, + "memory(GiB)": 117.38, + "step": 49135, + "train_speed(iter/s)": 1.636057 + }, + { + "acc": 0.65345488, + "epoch": 1.2465753424657535, + "grad_norm": 5.96875, + "learning_rate": 3.4033073108686515e-06, + "loss": 1.61722412, + "memory(GiB)": 117.38, + "step": 49140, + "train_speed(iter/s)": 1.636074 + }, + { + "acc": 0.64142108, + "epoch": 1.2467021816336885, + "grad_norm": 5.5, + "learning_rate": 3.4023136275121026e-06, + "loss": 1.62568779, + "memory(GiB)": 117.38, + "step": 49145, + "train_speed(iter/s)": 1.636092 + }, + { + "acc": 0.63762274, + "epoch": 1.2468290208016235, + "grad_norm": 6.59375, + "learning_rate": 3.401320014428935e-06, + "loss": 1.70274925, + "memory(GiB)": 117.38, + "step": 49150, + "train_speed(iter/s)": 1.636111 + }, + { + "acc": 0.65560827, + "epoch": 1.2469558599695585, + "grad_norm": 5.34375, + "learning_rate": 3.40032647166285e-06, + "loss": 1.56986227, + "memory(GiB)": 117.38, + "step": 49155, + "train_speed(iter/s)": 1.636129 + }, + { + "acc": 0.6660059, + "epoch": 1.2470826991374937, + "grad_norm": 5.78125, + "learning_rate": 3.3993329992575473e-06, + "loss": 1.56397486, + "memory(GiB)": 117.38, + "step": 49160, + "train_speed(iter/s)": 1.636146 + }, + { + "acc": 0.64460168, + "epoch": 1.2472095383054287, + "grad_norm": 5.6875, + "learning_rate": 3.3983395972567277e-06, + "loss": 1.6256506, + "memory(GiB)": 117.38, + "step": 49165, + "train_speed(iter/s)": 1.636165 + }, + { + "acc": 0.65856991, + "epoch": 1.2473363774733637, + "grad_norm": 5.71875, + "learning_rate": 3.397346265704084e-06, + "loss": 1.56199818, + "memory(GiB)": 117.38, + "step": 49170, + "train_speed(iter/s)": 1.636182 + }, + { + "acc": 0.66867533, + "epoch": 1.2474632166412989, + "grad_norm": 6.125, + "learning_rate": 3.396353004643306e-06, + "loss": 1.54220743, + "memory(GiB)": 117.38, + "step": 49175, + "train_speed(iter/s)": 1.636201 + }, + { + "acc": 0.64999056, + "epoch": 1.2475900558092339, + "grad_norm": 5.15625, + "learning_rate": 3.3953598141180817e-06, + "loss": 1.62617893, + "memory(GiB)": 117.38, + "step": 49180, + "train_speed(iter/s)": 1.636219 + }, + { + "acc": 0.66229234, + "epoch": 1.2477168949771689, + "grad_norm": 9.3125, + "learning_rate": 3.3943666941720978e-06, + "loss": 1.57770357, + "memory(GiB)": 117.38, + "step": 49185, + "train_speed(iter/s)": 1.636236 + }, + { + "acc": 0.65444107, + "epoch": 1.247843734145104, + "grad_norm": 5.4375, + "learning_rate": 3.3933736448490363e-06, + "loss": 1.58703175, + "memory(GiB)": 117.38, + "step": 49190, + "train_speed(iter/s)": 1.636255 + }, + { + "acc": 0.65784883, + "epoch": 1.247970573313039, + "grad_norm": 7.59375, + "learning_rate": 3.392380666192573e-06, + "loss": 1.59426956, + "memory(GiB)": 117.38, + "step": 49195, + "train_speed(iter/s)": 1.636273 + }, + { + "acc": 0.64774876, + "epoch": 1.248097412480974, + "grad_norm": 5.28125, + "learning_rate": 3.391387758246386e-06, + "loss": 1.66628914, + "memory(GiB)": 117.38, + "step": 49200, + "train_speed(iter/s)": 1.636291 + }, + { + "acc": 0.67271008, + "epoch": 1.2482242516489093, + "grad_norm": 5.96875, + "learning_rate": 3.3903949210541477e-06, + "loss": 1.5133131, + "memory(GiB)": 117.38, + "step": 49205, + "train_speed(iter/s)": 1.636309 + }, + { + "acc": 0.65915227, + "epoch": 1.2483510908168443, + "grad_norm": 4.78125, + "learning_rate": 3.389402154659529e-06, + "loss": 1.61975937, + "memory(GiB)": 117.38, + "step": 49210, + "train_speed(iter/s)": 1.636326 + }, + { + "acc": 0.64514656, + "epoch": 1.2484779299847792, + "grad_norm": 6.3125, + "learning_rate": 3.388409459106192e-06, + "loss": 1.65869923, + "memory(GiB)": 117.38, + "step": 49215, + "train_speed(iter/s)": 1.636344 + }, + { + "acc": 0.6475831, + "epoch": 1.2486047691527142, + "grad_norm": 5.34375, + "learning_rate": 3.3874168344378024e-06, + "loss": 1.57521687, + "memory(GiB)": 117.38, + "step": 49220, + "train_speed(iter/s)": 1.636358 + }, + { + "acc": 0.66036663, + "epoch": 1.2487316083206494, + "grad_norm": 6.3125, + "learning_rate": 3.38642428069802e-06, + "loss": 1.58773098, + "memory(GiB)": 117.38, + "step": 49225, + "train_speed(iter/s)": 1.636376 + }, + { + "acc": 0.66770535, + "epoch": 1.2488584474885844, + "grad_norm": 7.15625, + "learning_rate": 3.385431797930503e-06, + "loss": 1.58640594, + "memory(GiB)": 117.38, + "step": 49230, + "train_speed(iter/s)": 1.636394 + }, + { + "acc": 0.64684696, + "epoch": 1.2489852866565196, + "grad_norm": 5.0625, + "learning_rate": 3.3844393861789036e-06, + "loss": 1.6708519, + "memory(GiB)": 117.38, + "step": 49235, + "train_speed(iter/s)": 1.636411 + }, + { + "acc": 0.64961667, + "epoch": 1.2491121258244546, + "grad_norm": 6.875, + "learning_rate": 3.383447045486872e-06, + "loss": 1.66736317, + "memory(GiB)": 117.38, + "step": 49240, + "train_speed(iter/s)": 1.636428 + }, + { + "acc": 0.6387187, + "epoch": 1.2492389649923896, + "grad_norm": 5.28125, + "learning_rate": 3.382454775898057e-06, + "loss": 1.6750761, + "memory(GiB)": 117.38, + "step": 49245, + "train_speed(iter/s)": 1.636447 + }, + { + "acc": 0.65937762, + "epoch": 1.2493658041603246, + "grad_norm": 6.875, + "learning_rate": 3.381462577456104e-06, + "loss": 1.60116062, + "memory(GiB)": 117.38, + "step": 49250, + "train_speed(iter/s)": 1.636465 + }, + { + "acc": 0.66407385, + "epoch": 1.2494926433282598, + "grad_norm": 5.96875, + "learning_rate": 3.3804704502046527e-06, + "loss": 1.5733098, + "memory(GiB)": 117.38, + "step": 49255, + "train_speed(iter/s)": 1.636482 + }, + { + "acc": 0.66543508, + "epoch": 1.2496194824961948, + "grad_norm": 5.46875, + "learning_rate": 3.3794783941873406e-06, + "loss": 1.54474564, + "memory(GiB)": 117.38, + "step": 49260, + "train_speed(iter/s)": 1.6365 + }, + { + "acc": 0.65033073, + "epoch": 1.2497463216641298, + "grad_norm": 7.9375, + "learning_rate": 3.3784864094478044e-06, + "loss": 1.64596825, + "memory(GiB)": 117.38, + "step": 49265, + "train_speed(iter/s)": 1.636518 + }, + { + "acc": 0.65129375, + "epoch": 1.249873160832065, + "grad_norm": 5.21875, + "learning_rate": 3.377494496029677e-06, + "loss": 1.59719515, + "memory(GiB)": 117.38, + "step": 49270, + "train_speed(iter/s)": 1.636534 + }, + { + "acc": 0.66075315, + "epoch": 1.25, + "grad_norm": 6.0, + "learning_rate": 3.3765026539765832e-06, + "loss": 1.59290981, + "memory(GiB)": 117.38, + "step": 49275, + "train_speed(iter/s)": 1.636553 + }, + { + "acc": 0.65610685, + "epoch": 1.250126839167935, + "grad_norm": 6.3125, + "learning_rate": 3.375510883332152e-06, + "loss": 1.6449049, + "memory(GiB)": 117.38, + "step": 49280, + "train_speed(iter/s)": 1.636572 + }, + { + "acc": 0.63608141, + "epoch": 1.2502536783358702, + "grad_norm": 5.1875, + "learning_rate": 3.3745191841400037e-06, + "loss": 1.6381237, + "memory(GiB)": 117.38, + "step": 49285, + "train_speed(iter/s)": 1.636588 + }, + { + "acc": 0.64765563, + "epoch": 1.2503805175038052, + "grad_norm": 4.8125, + "learning_rate": 3.373527556443762e-06, + "loss": 1.65049858, + "memory(GiB)": 117.38, + "step": 49290, + "train_speed(iter/s)": 1.636605 + }, + { + "acc": 0.65141196, + "epoch": 1.2505073566717402, + "grad_norm": 5.34375, + "learning_rate": 3.372536000287038e-06, + "loss": 1.60003185, + "memory(GiB)": 117.38, + "step": 49295, + "train_speed(iter/s)": 1.636623 + }, + { + "acc": 0.66425533, + "epoch": 1.2506341958396754, + "grad_norm": 7.53125, + "learning_rate": 3.3715445157134474e-06, + "loss": 1.57765913, + "memory(GiB)": 117.38, + "step": 49300, + "train_speed(iter/s)": 1.636641 + }, + { + "acc": 0.65659471, + "epoch": 1.2507610350076104, + "grad_norm": 5.8125, + "learning_rate": 3.370553102766598e-06, + "loss": 1.58344316, + "memory(GiB)": 117.38, + "step": 49305, + "train_speed(iter/s)": 1.636658 + }, + { + "acc": 0.64507413, + "epoch": 1.2508878741755454, + "grad_norm": 5.3125, + "learning_rate": 3.369561761490101e-06, + "loss": 1.6362751, + "memory(GiB)": 117.38, + "step": 49310, + "train_speed(iter/s)": 1.636675 + }, + { + "acc": 0.65808887, + "epoch": 1.2510147133434804, + "grad_norm": 7.0, + "learning_rate": 3.3685704919275553e-06, + "loss": 1.64250736, + "memory(GiB)": 117.38, + "step": 49315, + "train_speed(iter/s)": 1.636692 + }, + { + "acc": 0.67082272, + "epoch": 1.2511415525114156, + "grad_norm": 6.0625, + "learning_rate": 3.3675792941225625e-06, + "loss": 1.50596876, + "memory(GiB)": 117.38, + "step": 49320, + "train_speed(iter/s)": 1.63671 + }, + { + "acc": 0.66129255, + "epoch": 1.2512683916793506, + "grad_norm": 6.25, + "learning_rate": 3.3665881681187214e-06, + "loss": 1.62942715, + "memory(GiB)": 117.38, + "step": 49325, + "train_speed(iter/s)": 1.636727 + }, + { + "acc": 0.66666231, + "epoch": 1.2513952308472858, + "grad_norm": 6.5625, + "learning_rate": 3.3655971139596265e-06, + "loss": 1.6081974, + "memory(GiB)": 117.38, + "step": 49330, + "train_speed(iter/s)": 1.636744 + }, + { + "acc": 0.65374498, + "epoch": 1.2515220700152208, + "grad_norm": 5.4375, + "learning_rate": 3.3646061316888655e-06, + "loss": 1.63329544, + "memory(GiB)": 117.38, + "step": 49335, + "train_speed(iter/s)": 1.636761 + }, + { + "acc": 0.65066166, + "epoch": 1.2516489091831557, + "grad_norm": 6.96875, + "learning_rate": 3.3636152213500295e-06, + "loss": 1.66578007, + "memory(GiB)": 117.38, + "step": 49340, + "train_speed(iter/s)": 1.636779 + }, + { + "acc": 0.64874635, + "epoch": 1.2517757483510907, + "grad_norm": 6.21875, + "learning_rate": 3.362624382986702e-06, + "loss": 1.63890839, + "memory(GiB)": 117.38, + "step": 49345, + "train_speed(iter/s)": 1.636796 + }, + { + "acc": 0.66319718, + "epoch": 1.251902587519026, + "grad_norm": 5.03125, + "learning_rate": 3.3616336166424653e-06, + "loss": 1.60451412, + "memory(GiB)": 117.38, + "step": 49350, + "train_speed(iter/s)": 1.636814 + }, + { + "acc": 0.67067146, + "epoch": 1.252029426686961, + "grad_norm": 6.03125, + "learning_rate": 3.360642922360895e-06, + "loss": 1.50495701, + "memory(GiB)": 117.38, + "step": 49355, + "train_speed(iter/s)": 1.636831 + }, + { + "acc": 0.64086423, + "epoch": 1.252156265854896, + "grad_norm": 5.84375, + "learning_rate": 3.3596523001855684e-06, + "loss": 1.71857986, + "memory(GiB)": 117.38, + "step": 49360, + "train_speed(iter/s)": 1.636848 + }, + { + "acc": 0.67533321, + "epoch": 1.2522831050228311, + "grad_norm": 5.40625, + "learning_rate": 3.358661750160057e-06, + "loss": 1.50119133, + "memory(GiB)": 117.38, + "step": 49365, + "train_speed(iter/s)": 1.636865 + }, + { + "acc": 0.65372791, + "epoch": 1.2524099441907661, + "grad_norm": 4.8125, + "learning_rate": 3.3576712723279326e-06, + "loss": 1.57447681, + "memory(GiB)": 117.38, + "step": 49370, + "train_speed(iter/s)": 1.636882 + }, + { + "acc": 0.64516697, + "epoch": 1.2525367833587011, + "grad_norm": 5.25, + "learning_rate": 3.3566808667327566e-06, + "loss": 1.69332848, + "memory(GiB)": 117.38, + "step": 49375, + "train_speed(iter/s)": 1.636901 + }, + { + "acc": 0.67300053, + "epoch": 1.252663622526636, + "grad_norm": 6.0625, + "learning_rate": 3.355690533418091e-06, + "loss": 1.58054142, + "memory(GiB)": 117.38, + "step": 49380, + "train_speed(iter/s)": 1.636918 + }, + { + "acc": 0.64516811, + "epoch": 1.2527904616945713, + "grad_norm": 5.125, + "learning_rate": 3.354700272427499e-06, + "loss": 1.65030861, + "memory(GiB)": 117.38, + "step": 49385, + "train_speed(iter/s)": 1.636935 + }, + { + "acc": 0.65779667, + "epoch": 1.2529173008625063, + "grad_norm": 6.75, + "learning_rate": 3.3537100838045356e-06, + "loss": 1.66063805, + "memory(GiB)": 117.38, + "step": 49390, + "train_speed(iter/s)": 1.636952 + }, + { + "acc": 0.66360779, + "epoch": 1.2530441400304415, + "grad_norm": 6.53125, + "learning_rate": 3.3527199675927526e-06, + "loss": 1.63214455, + "memory(GiB)": 117.38, + "step": 49395, + "train_speed(iter/s)": 1.63697 + }, + { + "acc": 0.64223914, + "epoch": 1.2531709791983765, + "grad_norm": 6.21875, + "learning_rate": 3.3517299238356982e-06, + "loss": 1.67133522, + "memory(GiB)": 117.38, + "step": 49400, + "train_speed(iter/s)": 1.636989 + }, + { + "acc": 0.65898533, + "epoch": 1.2532978183663115, + "grad_norm": 5.0, + "learning_rate": 3.3507399525769214e-06, + "loss": 1.57467442, + "memory(GiB)": 117.38, + "step": 49405, + "train_speed(iter/s)": 1.637006 + }, + { + "acc": 0.64663792, + "epoch": 1.2534246575342465, + "grad_norm": 6.71875, + "learning_rate": 3.3497500538599664e-06, + "loss": 1.65558052, + "memory(GiB)": 117.38, + "step": 49410, + "train_speed(iter/s)": 1.637024 + }, + { + "acc": 0.6522294, + "epoch": 1.2535514967021817, + "grad_norm": 5.34375, + "learning_rate": 3.34876022772837e-06, + "loss": 1.64799156, + "memory(GiB)": 117.38, + "step": 49415, + "train_speed(iter/s)": 1.637043 + }, + { + "acc": 0.65776939, + "epoch": 1.2536783358701167, + "grad_norm": 5.53125, + "learning_rate": 3.347770474225672e-06, + "loss": 1.67325172, + "memory(GiB)": 117.38, + "step": 49420, + "train_speed(iter/s)": 1.637061 + }, + { + "acc": 0.65369911, + "epoch": 1.2538051750380519, + "grad_norm": 5.90625, + "learning_rate": 3.3467807933954034e-06, + "loss": 1.59110832, + "memory(GiB)": 117.38, + "step": 49425, + "train_speed(iter/s)": 1.637079 + }, + { + "acc": 0.64510088, + "epoch": 1.2539320142059869, + "grad_norm": 4.90625, + "learning_rate": 3.345791185281101e-06, + "loss": 1.60621529, + "memory(GiB)": 117.38, + "step": 49430, + "train_speed(iter/s)": 1.637098 + }, + { + "acc": 0.66607342, + "epoch": 1.2540588533739219, + "grad_norm": 5.875, + "learning_rate": 3.3448016499262836e-06, + "loss": 1.57269688, + "memory(GiB)": 117.38, + "step": 49435, + "train_speed(iter/s)": 1.637116 + }, + { + "acc": 0.64941864, + "epoch": 1.2541856925418569, + "grad_norm": 5.71875, + "learning_rate": 3.3438121873744812e-06, + "loss": 1.63756599, + "memory(GiB)": 117.38, + "step": 49440, + "train_speed(iter/s)": 1.637134 + }, + { + "acc": 0.64895248, + "epoch": 1.254312531709792, + "grad_norm": 5.40625, + "learning_rate": 3.342822797669212e-06, + "loss": 1.60085907, + "memory(GiB)": 117.38, + "step": 49445, + "train_speed(iter/s)": 1.637152 + }, + { + "acc": 0.65342312, + "epoch": 1.254439370877727, + "grad_norm": 5.90625, + "learning_rate": 3.3418334808539966e-06, + "loss": 1.66911201, + "memory(GiB)": 117.38, + "step": 49450, + "train_speed(iter/s)": 1.63717 + }, + { + "acc": 0.63991094, + "epoch": 1.254566210045662, + "grad_norm": 5.125, + "learning_rate": 3.340844236972347e-06, + "loss": 1.67086163, + "memory(GiB)": 117.38, + "step": 49455, + "train_speed(iter/s)": 1.637188 + }, + { + "acc": 0.6574192, + "epoch": 1.2546930492135973, + "grad_norm": 6.5, + "learning_rate": 3.3398550660677748e-06, + "loss": 1.6120245, + "memory(GiB)": 117.38, + "step": 49460, + "train_speed(iter/s)": 1.637207 + }, + { + "acc": 0.6440146, + "epoch": 1.2548198883815322, + "grad_norm": 7.375, + "learning_rate": 3.3388659681837898e-06, + "loss": 1.62507801, + "memory(GiB)": 117.38, + "step": 49465, + "train_speed(iter/s)": 1.637225 + }, + { + "acc": 0.64015174, + "epoch": 1.2549467275494672, + "grad_norm": 7.5625, + "learning_rate": 3.3378769433638965e-06, + "loss": 1.6876976, + "memory(GiB)": 117.38, + "step": 49470, + "train_speed(iter/s)": 1.637243 + }, + { + "acc": 0.6564045, + "epoch": 1.2550735667174022, + "grad_norm": 6.25, + "learning_rate": 3.336887991651595e-06, + "loss": 1.6462719, + "memory(GiB)": 117.38, + "step": 49475, + "train_speed(iter/s)": 1.637261 + }, + { + "acc": 0.65404158, + "epoch": 1.2552004058853374, + "grad_norm": 5.1875, + "learning_rate": 3.3358991130903845e-06, + "loss": 1.61365509, + "memory(GiB)": 117.38, + "step": 49480, + "train_speed(iter/s)": 1.637279 + }, + { + "acc": 0.68134108, + "epoch": 1.2553272450532724, + "grad_norm": 6.25, + "learning_rate": 3.334910307723761e-06, + "loss": 1.51830635, + "memory(GiB)": 117.38, + "step": 49485, + "train_speed(iter/s)": 1.637298 + }, + { + "acc": 0.66393123, + "epoch": 1.2554540842212076, + "grad_norm": 6.75, + "learning_rate": 3.333921575595218e-06, + "loss": 1.57889557, + "memory(GiB)": 117.38, + "step": 49490, + "train_speed(iter/s)": 1.637316 + }, + { + "acc": 0.63359385, + "epoch": 1.2555809233891426, + "grad_norm": 5.53125, + "learning_rate": 3.3329329167482404e-06, + "loss": 1.65368576, + "memory(GiB)": 117.38, + "step": 49495, + "train_speed(iter/s)": 1.637335 + }, + { + "acc": 0.67127409, + "epoch": 1.2557077625570776, + "grad_norm": 6.46875, + "learning_rate": 3.331944331226317e-06, + "loss": 1.58295994, + "memory(GiB)": 117.38, + "step": 49500, + "train_speed(iter/s)": 1.637353 + }, + { + "acc": 0.67291746, + "epoch": 1.2558346017250126, + "grad_norm": 5.03125, + "learning_rate": 3.330955819072928e-06, + "loss": 1.58522968, + "memory(GiB)": 117.38, + "step": 49505, + "train_speed(iter/s)": 1.637371 + }, + { + "acc": 0.68650851, + "epoch": 1.2559614408929478, + "grad_norm": 6.15625, + "learning_rate": 3.329967380331556e-06, + "loss": 1.51698895, + "memory(GiB)": 117.38, + "step": 49510, + "train_speed(iter/s)": 1.63739 + }, + { + "acc": 0.64403601, + "epoch": 1.2560882800608828, + "grad_norm": 9.25, + "learning_rate": 3.3289790150456737e-06, + "loss": 1.64234772, + "memory(GiB)": 117.38, + "step": 49515, + "train_speed(iter/s)": 1.637408 + }, + { + "acc": 0.64918604, + "epoch": 1.2562151192288178, + "grad_norm": 5.34375, + "learning_rate": 3.327990723258755e-06, + "loss": 1.61009407, + "memory(GiB)": 117.38, + "step": 49520, + "train_speed(iter/s)": 1.637427 + }, + { + "acc": 0.65023804, + "epoch": 1.256341958396753, + "grad_norm": 5.15625, + "learning_rate": 3.3270025050142684e-06, + "loss": 1.59021721, + "memory(GiB)": 117.38, + "step": 49525, + "train_speed(iter/s)": 1.637444 + }, + { + "acc": 0.65463924, + "epoch": 1.256468797564688, + "grad_norm": 4.90625, + "learning_rate": 3.3260143603556827e-06, + "loss": 1.58479223, + "memory(GiB)": 117.38, + "step": 49530, + "train_speed(iter/s)": 1.637461 + }, + { + "acc": 0.64296741, + "epoch": 1.256595636732623, + "grad_norm": 5.75, + "learning_rate": 3.3250262893264583e-06, + "loss": 1.62325058, + "memory(GiB)": 117.38, + "step": 49535, + "train_speed(iter/s)": 1.637479 + }, + { + "acc": 0.64920883, + "epoch": 1.256722475900558, + "grad_norm": 6.03125, + "learning_rate": 3.3240382919700555e-06, + "loss": 1.61260529, + "memory(GiB)": 117.38, + "step": 49540, + "train_speed(iter/s)": 1.637497 + }, + { + "acc": 0.64705167, + "epoch": 1.2568493150684932, + "grad_norm": 5.84375, + "learning_rate": 3.3230503683299316e-06, + "loss": 1.59736347, + "memory(GiB)": 117.38, + "step": 49545, + "train_speed(iter/s)": 1.637515 + }, + { + "acc": 0.65897007, + "epoch": 1.2569761542364282, + "grad_norm": 6.40625, + "learning_rate": 3.3220625184495404e-06, + "loss": 1.60819931, + "memory(GiB)": 117.38, + "step": 49550, + "train_speed(iter/s)": 1.637534 + }, + { + "acc": 0.6525991, + "epoch": 1.2571029934043634, + "grad_norm": 6.75, + "learning_rate": 3.3210747423723293e-06, + "loss": 1.5896122, + "memory(GiB)": 117.38, + "step": 49555, + "train_speed(iter/s)": 1.637553 + }, + { + "acc": 0.6589148, + "epoch": 1.2572298325722984, + "grad_norm": 5.21875, + "learning_rate": 3.3200870401417486e-06, + "loss": 1.61852245, + "memory(GiB)": 117.38, + "step": 49560, + "train_speed(iter/s)": 1.637569 + }, + { + "acc": 0.66295719, + "epoch": 1.2573566717402334, + "grad_norm": 5.28125, + "learning_rate": 3.3190994118012387e-06, + "loss": 1.63465843, + "memory(GiB)": 117.38, + "step": 49565, + "train_speed(iter/s)": 1.637587 + }, + { + "acc": 0.65516825, + "epoch": 1.2574835109081683, + "grad_norm": 7.3125, + "learning_rate": 3.318111857394244e-06, + "loss": 1.64129333, + "memory(GiB)": 117.38, + "step": 49570, + "train_speed(iter/s)": 1.637604 + }, + { + "acc": 0.66525354, + "epoch": 1.2576103500761036, + "grad_norm": 5.8125, + "learning_rate": 3.3171243769641957e-06, + "loss": 1.56072397, + "memory(GiB)": 117.38, + "step": 49575, + "train_speed(iter/s)": 1.637622 + }, + { + "acc": 0.65193467, + "epoch": 1.2577371892440385, + "grad_norm": 6.34375, + "learning_rate": 3.316136970554532e-06, + "loss": 1.6028141, + "memory(GiB)": 117.38, + "step": 49580, + "train_speed(iter/s)": 1.63764 + }, + { + "acc": 0.66604323, + "epoch": 1.2578640284119738, + "grad_norm": 6.03125, + "learning_rate": 3.315149638208681e-06, + "loss": 1.62820702, + "memory(GiB)": 117.38, + "step": 49585, + "train_speed(iter/s)": 1.637657 + }, + { + "acc": 0.66368127, + "epoch": 1.2579908675799087, + "grad_norm": 6.6875, + "learning_rate": 3.3141623799700738e-06, + "loss": 1.59735012, + "memory(GiB)": 117.38, + "step": 49590, + "train_speed(iter/s)": 1.637675 + }, + { + "acc": 0.63849754, + "epoch": 1.2581177067478437, + "grad_norm": 6.875, + "learning_rate": 3.3131751958821313e-06, + "loss": 1.68269958, + "memory(GiB)": 117.38, + "step": 49595, + "train_speed(iter/s)": 1.637693 + }, + { + "acc": 0.66122522, + "epoch": 1.2582445459157787, + "grad_norm": 6.0, + "learning_rate": 3.312188085988273e-06, + "loss": 1.552705, + "memory(GiB)": 117.38, + "step": 49600, + "train_speed(iter/s)": 1.637712 + }, + { + "acc": 0.66504526, + "epoch": 1.258371385083714, + "grad_norm": 5.25, + "learning_rate": 3.311201050331919e-06, + "loss": 1.59725103, + "memory(GiB)": 117.38, + "step": 49605, + "train_speed(iter/s)": 1.637731 + }, + { + "acc": 0.66653099, + "epoch": 1.258498224251649, + "grad_norm": 5.5625, + "learning_rate": 3.310214088956485e-06, + "loss": 1.58143606, + "memory(GiB)": 117.38, + "step": 49610, + "train_speed(iter/s)": 1.637749 + }, + { + "acc": 0.67515736, + "epoch": 1.258625063419584, + "grad_norm": 5.78125, + "learning_rate": 3.3092272019053773e-06, + "loss": 1.47481222, + "memory(GiB)": 117.38, + "step": 49615, + "train_speed(iter/s)": 1.637766 + }, + { + "acc": 0.68036356, + "epoch": 1.2587519025875191, + "grad_norm": 4.75, + "learning_rate": 3.308240389222006e-06, + "loss": 1.49470243, + "memory(GiB)": 117.38, + "step": 49620, + "train_speed(iter/s)": 1.637782 + }, + { + "acc": 0.6625773, + "epoch": 1.2588787417554541, + "grad_norm": 5.65625, + "learning_rate": 3.3072536509497762e-06, + "loss": 1.57462616, + "memory(GiB)": 117.38, + "step": 49625, + "train_speed(iter/s)": 1.637801 + }, + { + "acc": 0.65595551, + "epoch": 1.259005580923389, + "grad_norm": 5.46875, + "learning_rate": 3.306266987132089e-06, + "loss": 1.58381948, + "memory(GiB)": 117.38, + "step": 49630, + "train_speed(iter/s)": 1.63782 + }, + { + "acc": 0.66937542, + "epoch": 1.259132420091324, + "grad_norm": 6.6875, + "learning_rate": 3.3052803978123405e-06, + "loss": 1.53871021, + "memory(GiB)": 117.38, + "step": 49635, + "train_speed(iter/s)": 1.637838 + }, + { + "acc": 0.66464615, + "epoch": 1.2592592592592593, + "grad_norm": 4.625, + "learning_rate": 3.3042938830339264e-06, + "loss": 1.55959225, + "memory(GiB)": 117.38, + "step": 49640, + "train_speed(iter/s)": 1.637856 + }, + { + "acc": 0.65394125, + "epoch": 1.2593860984271943, + "grad_norm": 5.625, + "learning_rate": 3.303307442840238e-06, + "loss": 1.54883499, + "memory(GiB)": 117.38, + "step": 49645, + "train_speed(iter/s)": 1.637873 + }, + { + "acc": 0.66544194, + "epoch": 1.2595129375951295, + "grad_norm": 5.75, + "learning_rate": 3.302321077274666e-06, + "loss": 1.52851467, + "memory(GiB)": 117.38, + "step": 49650, + "train_speed(iter/s)": 1.637892 + }, + { + "acc": 0.63880501, + "epoch": 1.2596397767630645, + "grad_norm": 4.96875, + "learning_rate": 3.30133478638059e-06, + "loss": 1.67572861, + "memory(GiB)": 117.38, + "step": 49655, + "train_speed(iter/s)": 1.637911 + }, + { + "acc": 0.65686507, + "epoch": 1.2597666159309995, + "grad_norm": 4.6875, + "learning_rate": 3.300348570201395e-06, + "loss": 1.56951628, + "memory(GiB)": 117.38, + "step": 49660, + "train_speed(iter/s)": 1.637927 + }, + { + "acc": 0.64188352, + "epoch": 1.2598934550989345, + "grad_norm": 5.375, + "learning_rate": 3.299362428780457e-06, + "loss": 1.62201042, + "memory(GiB)": 117.38, + "step": 49665, + "train_speed(iter/s)": 1.637946 + }, + { + "acc": 0.66775746, + "epoch": 1.2600202942668697, + "grad_norm": 5.78125, + "learning_rate": 3.298376362161154e-06, + "loss": 1.44218502, + "memory(GiB)": 117.38, + "step": 49670, + "train_speed(iter/s)": 1.637964 + }, + { + "acc": 0.66670451, + "epoch": 1.2601471334348047, + "grad_norm": 6.28125, + "learning_rate": 3.297390370386856e-06, + "loss": 1.61631966, + "memory(GiB)": 117.38, + "step": 49675, + "train_speed(iter/s)": 1.637982 + }, + { + "acc": 0.64637809, + "epoch": 1.2602739726027397, + "grad_norm": 5.28125, + "learning_rate": 3.2964044535009288e-06, + "loss": 1.64414711, + "memory(GiB)": 117.38, + "step": 49680, + "train_speed(iter/s)": 1.637999 + }, + { + "acc": 0.66050034, + "epoch": 1.2604008117706749, + "grad_norm": 5.59375, + "learning_rate": 3.2954186115467412e-06, + "loss": 1.57813931, + "memory(GiB)": 117.38, + "step": 49685, + "train_speed(iter/s)": 1.638018 + }, + { + "acc": 0.64761658, + "epoch": 1.2605276509386099, + "grad_norm": 6.34375, + "learning_rate": 3.2944328445676543e-06, + "loss": 1.64748592, + "memory(GiB)": 117.38, + "step": 49690, + "train_speed(iter/s)": 1.638036 + }, + { + "acc": 0.64268999, + "epoch": 1.2606544901065448, + "grad_norm": 5.875, + "learning_rate": 3.2934471526070254e-06, + "loss": 1.6700901, + "memory(GiB)": 117.38, + "step": 49695, + "train_speed(iter/s)": 1.638054 + }, + { + "acc": 0.66521635, + "epoch": 1.2607813292744798, + "grad_norm": 6.96875, + "learning_rate": 3.2924615357082078e-06, + "loss": 1.56490726, + "memory(GiB)": 117.38, + "step": 49700, + "train_speed(iter/s)": 1.638072 + }, + { + "acc": 0.6459219, + "epoch": 1.260908168442415, + "grad_norm": 6.28125, + "learning_rate": 3.2914759939145574e-06, + "loss": 1.64286423, + "memory(GiB)": 117.38, + "step": 49705, + "train_speed(iter/s)": 1.638089 + }, + { + "acc": 0.64992886, + "epoch": 1.26103500761035, + "grad_norm": 6.65625, + "learning_rate": 3.2904905272694214e-06, + "loss": 1.63684788, + "memory(GiB)": 117.38, + "step": 49710, + "train_speed(iter/s)": 1.638107 + }, + { + "acc": 0.65978479, + "epoch": 1.2611618467782852, + "grad_norm": 5.0625, + "learning_rate": 3.289505135816142e-06, + "loss": 1.56385193, + "memory(GiB)": 117.38, + "step": 49715, + "train_speed(iter/s)": 1.638124 + }, + { + "acc": 0.65311298, + "epoch": 1.2612886859462202, + "grad_norm": 5.59375, + "learning_rate": 3.2885198195980653e-06, + "loss": 1.62536316, + "memory(GiB)": 117.38, + "step": 49720, + "train_speed(iter/s)": 1.638143 + }, + { + "acc": 0.65693417, + "epoch": 1.2614155251141552, + "grad_norm": 5.8125, + "learning_rate": 3.287534578658527e-06, + "loss": 1.6061451, + "memory(GiB)": 117.38, + "step": 49725, + "train_speed(iter/s)": 1.638161 + }, + { + "acc": 0.64645748, + "epoch": 1.2615423642820902, + "grad_norm": 5.71875, + "learning_rate": 3.2865494130408657e-06, + "loss": 1.60314655, + "memory(GiB)": 117.38, + "step": 49730, + "train_speed(iter/s)": 1.638179 + }, + { + "acc": 0.64851303, + "epoch": 1.2616692034500254, + "grad_norm": 8.0, + "learning_rate": 3.2855643227884097e-06, + "loss": 1.64891071, + "memory(GiB)": 117.38, + "step": 49735, + "train_speed(iter/s)": 1.638198 + }, + { + "acc": 0.66854925, + "epoch": 1.2617960426179604, + "grad_norm": 5.59375, + "learning_rate": 3.28457930794449e-06, + "loss": 1.54447021, + "memory(GiB)": 117.38, + "step": 49740, + "train_speed(iter/s)": 1.638215 + }, + { + "acc": 0.67055426, + "epoch": 1.2619228817858956, + "grad_norm": 7.75, + "learning_rate": 3.283594368552429e-06, + "loss": 1.55973034, + "memory(GiB)": 117.38, + "step": 49745, + "train_speed(iter/s)": 1.638235 + }, + { + "acc": 0.65528841, + "epoch": 1.2620497209538306, + "grad_norm": 6.1875, + "learning_rate": 3.282609504655554e-06, + "loss": 1.62334137, + "memory(GiB)": 117.38, + "step": 49750, + "train_speed(iter/s)": 1.638253 + }, + { + "acc": 0.65968323, + "epoch": 1.2621765601217656, + "grad_norm": 7.28125, + "learning_rate": 3.281624716297179e-06, + "loss": 1.55405102, + "memory(GiB)": 117.38, + "step": 49755, + "train_speed(iter/s)": 1.638271 + }, + { + "acc": 0.64967728, + "epoch": 1.2623033992897006, + "grad_norm": 7.84375, + "learning_rate": 3.28064000352062e-06, + "loss": 1.62208481, + "memory(GiB)": 117.38, + "step": 49760, + "train_speed(iter/s)": 1.63829 + }, + { + "acc": 0.65230274, + "epoch": 1.2624302384576358, + "grad_norm": 5.59375, + "learning_rate": 3.279655366369191e-06, + "loss": 1.5856945, + "memory(GiB)": 117.38, + "step": 49765, + "train_speed(iter/s)": 1.638308 + }, + { + "acc": 0.65068178, + "epoch": 1.2625570776255708, + "grad_norm": 5.59375, + "learning_rate": 3.2786708048862e-06, + "loss": 1.64711056, + "memory(GiB)": 117.38, + "step": 49770, + "train_speed(iter/s)": 1.638326 + }, + { + "acc": 0.66608849, + "epoch": 1.2626839167935058, + "grad_norm": 7.4375, + "learning_rate": 3.2776863191149517e-06, + "loss": 1.58934221, + "memory(GiB)": 117.38, + "step": 49775, + "train_speed(iter/s)": 1.638345 + }, + { + "acc": 0.65145817, + "epoch": 1.262810755961441, + "grad_norm": 6.96875, + "learning_rate": 3.2767019090987483e-06, + "loss": 1.64121132, + "memory(GiB)": 117.38, + "step": 49780, + "train_speed(iter/s)": 1.638364 + }, + { + "acc": 0.64563627, + "epoch": 1.262937595129376, + "grad_norm": 4.96875, + "learning_rate": 3.27571757488089e-06, + "loss": 1.69266663, + "memory(GiB)": 117.38, + "step": 49785, + "train_speed(iter/s)": 1.638381 + }, + { + "acc": 0.66047411, + "epoch": 1.263064434297311, + "grad_norm": 5.5625, + "learning_rate": 3.274733316504672e-06, + "loss": 1.55904579, + "memory(GiB)": 117.38, + "step": 49790, + "train_speed(iter/s)": 1.6384 + }, + { + "acc": 0.65889931, + "epoch": 1.263191273465246, + "grad_norm": 4.8125, + "learning_rate": 3.273749134013383e-06, + "loss": 1.52647629, + "memory(GiB)": 117.38, + "step": 49795, + "train_speed(iter/s)": 1.638418 + }, + { + "acc": 0.66125517, + "epoch": 1.2633181126331812, + "grad_norm": 5.125, + "learning_rate": 3.2727650274503154e-06, + "loss": 1.56951008, + "memory(GiB)": 117.38, + "step": 49800, + "train_speed(iter/s)": 1.638436 + }, + { + "acc": 0.65565701, + "epoch": 1.2634449518011162, + "grad_norm": 6.75, + "learning_rate": 3.2717809968587523e-06, + "loss": 1.5748394, + "memory(GiB)": 117.38, + "step": 49805, + "train_speed(iter/s)": 1.638455 + }, + { + "acc": 0.65837555, + "epoch": 1.2635717909690514, + "grad_norm": 6.0625, + "learning_rate": 3.270797042281979e-06, + "loss": 1.57841663, + "memory(GiB)": 117.38, + "step": 49810, + "train_speed(iter/s)": 1.638474 + }, + { + "acc": 0.64887285, + "epoch": 1.2636986301369864, + "grad_norm": 5.09375, + "learning_rate": 3.269813163763271e-06, + "loss": 1.68857403, + "memory(GiB)": 117.38, + "step": 49815, + "train_speed(iter/s)": 1.638492 + }, + { + "acc": 0.65504847, + "epoch": 1.2638254693049213, + "grad_norm": 6.0, + "learning_rate": 3.268829361345904e-06, + "loss": 1.6133667, + "memory(GiB)": 117.38, + "step": 49820, + "train_speed(iter/s)": 1.63851 + }, + { + "acc": 0.66069002, + "epoch": 1.2639523084728563, + "grad_norm": 6.1875, + "learning_rate": 3.2678456350731526e-06, + "loss": 1.56747561, + "memory(GiB)": 117.38, + "step": 49825, + "train_speed(iter/s)": 1.638529 + }, + { + "acc": 0.67424536, + "epoch": 1.2640791476407915, + "grad_norm": 7.3125, + "learning_rate": 3.266861984988283e-06, + "loss": 1.52973499, + "memory(GiB)": 117.38, + "step": 49830, + "train_speed(iter/s)": 1.638547 + }, + { + "acc": 0.66885996, + "epoch": 1.2642059868087265, + "grad_norm": 6.65625, + "learning_rate": 3.2658784111345614e-06, + "loss": 1.60544052, + "memory(GiB)": 117.38, + "step": 49835, + "train_speed(iter/s)": 1.638566 + }, + { + "acc": 0.65317492, + "epoch": 1.2643328259766615, + "grad_norm": 5.6875, + "learning_rate": 3.2648949135552482e-06, + "loss": 1.5562499, + "memory(GiB)": 117.38, + "step": 49840, + "train_speed(iter/s)": 1.638585 + }, + { + "acc": 0.65379677, + "epoch": 1.2644596651445967, + "grad_norm": 9.0, + "learning_rate": 3.2639114922936045e-06, + "loss": 1.64710789, + "memory(GiB)": 117.38, + "step": 49845, + "train_speed(iter/s)": 1.638604 + }, + { + "acc": 0.66617098, + "epoch": 1.2645865043125317, + "grad_norm": 7.75, + "learning_rate": 3.2629281473928855e-06, + "loss": 1.52495041, + "memory(GiB)": 117.38, + "step": 49850, + "train_speed(iter/s)": 1.638622 + }, + { + "acc": 0.65657992, + "epoch": 1.2647133434804667, + "grad_norm": 4.875, + "learning_rate": 3.26194487889634e-06, + "loss": 1.57321072, + "memory(GiB)": 117.38, + "step": 49855, + "train_speed(iter/s)": 1.638642 + }, + { + "acc": 0.66751165, + "epoch": 1.2648401826484017, + "grad_norm": 6.6875, + "learning_rate": 3.2609616868472192e-06, + "loss": 1.52704563, + "memory(GiB)": 117.38, + "step": 49860, + "train_speed(iter/s)": 1.63866 + }, + { + "acc": 0.65924549, + "epoch": 1.264967021816337, + "grad_norm": 7.09375, + "learning_rate": 3.259978571288767e-06, + "loss": 1.5563652, + "memory(GiB)": 117.38, + "step": 49865, + "train_speed(iter/s)": 1.638678 + }, + { + "acc": 0.64887381, + "epoch": 1.265093860984272, + "grad_norm": 5.0625, + "learning_rate": 3.2589955322642293e-06, + "loss": 1.68247833, + "memory(GiB)": 117.38, + "step": 49870, + "train_speed(iter/s)": 1.638697 + }, + { + "acc": 0.64818001, + "epoch": 1.2652207001522071, + "grad_norm": 4.375, + "learning_rate": 3.2580125698168376e-06, + "loss": 1.58861504, + "memory(GiB)": 117.38, + "step": 49875, + "train_speed(iter/s)": 1.638716 + }, + { + "acc": 0.65233889, + "epoch": 1.265347539320142, + "grad_norm": 5.46875, + "learning_rate": 3.2570296839898314e-06, + "loss": 1.64512596, + "memory(GiB)": 117.38, + "step": 49880, + "train_speed(iter/s)": 1.638734 + }, + { + "acc": 0.65906014, + "epoch": 1.265474378488077, + "grad_norm": 6.8125, + "learning_rate": 3.2560468748264405e-06, + "loss": 1.57112417, + "memory(GiB)": 117.38, + "step": 49885, + "train_speed(iter/s)": 1.638752 + }, + { + "acc": 0.64581718, + "epoch": 1.265601217656012, + "grad_norm": 5.5625, + "learning_rate": 3.2550641423698965e-06, + "loss": 1.62922554, + "memory(GiB)": 117.38, + "step": 49890, + "train_speed(iter/s)": 1.638771 + }, + { + "acc": 0.63296289, + "epoch": 1.2657280568239473, + "grad_norm": 5.53125, + "learning_rate": 3.2540814866634206e-06, + "loss": 1.68817787, + "memory(GiB)": 117.38, + "step": 49895, + "train_speed(iter/s)": 1.638789 + }, + { + "acc": 0.65115576, + "epoch": 1.2658548959918823, + "grad_norm": 5.5, + "learning_rate": 3.2530989077502355e-06, + "loss": 1.57354288, + "memory(GiB)": 117.38, + "step": 49900, + "train_speed(iter/s)": 1.638807 + }, + { + "acc": 0.66715145, + "epoch": 1.2659817351598175, + "grad_norm": 7.09375, + "learning_rate": 3.252116405673561e-06, + "loss": 1.52467728, + "memory(GiB)": 117.38, + "step": 49905, + "train_speed(iter/s)": 1.638824 + }, + { + "acc": 0.66883411, + "epoch": 1.2661085743277525, + "grad_norm": 6.3125, + "learning_rate": 3.2511339804766107e-06, + "loss": 1.53025894, + "memory(GiB)": 117.38, + "step": 49910, + "train_speed(iter/s)": 1.638842 + }, + { + "acc": 0.67596464, + "epoch": 1.2662354134956875, + "grad_norm": 6.125, + "learning_rate": 3.250151632202596e-06, + "loss": 1.56433115, + "memory(GiB)": 117.38, + "step": 49915, + "train_speed(iter/s)": 1.63886 + }, + { + "acc": 0.66348505, + "epoch": 1.2663622526636225, + "grad_norm": 6.75, + "learning_rate": 3.249169360894724e-06, + "loss": 1.61058025, + "memory(GiB)": 117.38, + "step": 49920, + "train_speed(iter/s)": 1.638878 + }, + { + "acc": 0.64913659, + "epoch": 1.2664890918315577, + "grad_norm": 8.125, + "learning_rate": 3.2481871665962006e-06, + "loss": 1.67259274, + "memory(GiB)": 117.38, + "step": 49925, + "train_speed(iter/s)": 1.638896 + }, + { + "acc": 0.66692619, + "epoch": 1.2666159309994927, + "grad_norm": 5.875, + "learning_rate": 3.2472050493502282e-06, + "loss": 1.53537865, + "memory(GiB)": 117.38, + "step": 49930, + "train_speed(iter/s)": 1.638915 + }, + { + "acc": 0.66921182, + "epoch": 1.2667427701674276, + "grad_norm": 6.46875, + "learning_rate": 3.2462230092000017e-06, + "loss": 1.5311655, + "memory(GiB)": 117.38, + "step": 49935, + "train_speed(iter/s)": 1.638933 + }, + { + "acc": 0.66158123, + "epoch": 1.2668696093353629, + "grad_norm": 6.25, + "learning_rate": 3.2452410461887184e-06, + "loss": 1.54242058, + "memory(GiB)": 117.38, + "step": 49940, + "train_speed(iter/s)": 1.63895 + }, + { + "acc": 0.66830521, + "epoch": 1.2669964485032978, + "grad_norm": 5.59375, + "learning_rate": 3.244259160359567e-06, + "loss": 1.54080524, + "memory(GiB)": 117.38, + "step": 49945, + "train_speed(iter/s)": 1.638967 + }, + { + "acc": 0.65941219, + "epoch": 1.2671232876712328, + "grad_norm": 5.53125, + "learning_rate": 3.2432773517557385e-06, + "loss": 1.61795483, + "memory(GiB)": 117.38, + "step": 49950, + "train_speed(iter/s)": 1.638985 + }, + { + "acc": 0.6512094, + "epoch": 1.2672501268391678, + "grad_norm": 4.8125, + "learning_rate": 3.2422956204204147e-06, + "loss": 1.59664774, + "memory(GiB)": 117.38, + "step": 49955, + "train_speed(iter/s)": 1.639002 + }, + { + "acc": 0.66292543, + "epoch": 1.267376966007103, + "grad_norm": 6.03125, + "learning_rate": 3.2413139663967763e-06, + "loss": 1.61979713, + "memory(GiB)": 117.38, + "step": 49960, + "train_speed(iter/s)": 1.63902 + }, + { + "acc": 0.67166414, + "epoch": 1.267503805175038, + "grad_norm": 5.46875, + "learning_rate": 3.2403323897280013e-06, + "loss": 1.49602633, + "memory(GiB)": 117.38, + "step": 49965, + "train_speed(iter/s)": 1.639039 + }, + { + "acc": 0.65016065, + "epoch": 1.2676306443429732, + "grad_norm": 6.375, + "learning_rate": 3.2393508904572663e-06, + "loss": 1.55993385, + "memory(GiB)": 117.38, + "step": 49970, + "train_speed(iter/s)": 1.639057 + }, + { + "acc": 0.64739633, + "epoch": 1.2677574835109082, + "grad_norm": 4.59375, + "learning_rate": 3.2383694686277382e-06, + "loss": 1.63961601, + "memory(GiB)": 117.38, + "step": 49975, + "train_speed(iter/s)": 1.639075 + }, + { + "acc": 0.65471816, + "epoch": 1.2678843226788432, + "grad_norm": 5.09375, + "learning_rate": 3.2373881242825857e-06, + "loss": 1.61855984, + "memory(GiB)": 117.38, + "step": 49980, + "train_speed(iter/s)": 1.639092 + }, + { + "acc": 0.65718489, + "epoch": 1.2680111618467782, + "grad_norm": 5.125, + "learning_rate": 3.236406857464973e-06, + "loss": 1.56859512, + "memory(GiB)": 117.38, + "step": 49985, + "train_speed(iter/s)": 1.63911 + }, + { + "acc": 0.65096259, + "epoch": 1.2681380010147134, + "grad_norm": 5.28125, + "learning_rate": 3.235425668218063e-06, + "loss": 1.61637688, + "memory(GiB)": 117.38, + "step": 49990, + "train_speed(iter/s)": 1.639128 + }, + { + "acc": 0.6488801, + "epoch": 1.2682648401826484, + "grad_norm": 5.15625, + "learning_rate": 3.234444556585007e-06, + "loss": 1.60163269, + "memory(GiB)": 117.38, + "step": 49995, + "train_speed(iter/s)": 1.639147 + }, + { + "acc": 0.65126643, + "epoch": 1.2683916793505834, + "grad_norm": 5.25, + "learning_rate": 3.233463522608964e-06, + "loss": 1.58340807, + "memory(GiB)": 117.38, + "step": 50000, + "train_speed(iter/s)": 1.639165 + }, + { + "epoch": 1.2683916793505834, + "eval_acc": 0.6463165412638846, + "eval_loss": 1.5733541250228882, + "eval_runtime": 59.0635, + "eval_samples_per_second": 107.85, + "eval_steps_per_second": 26.971, + "step": 50000 + }, + { + "acc": 0.67112527, + "epoch": 1.2685185185185186, + "grad_norm": 6.75, + "learning_rate": 3.2324825663330818e-06, + "loss": 1.59025059, + "memory(GiB)": 117.38, + "step": 50005, + "train_speed(iter/s)": 1.635785 + }, + { + "acc": 0.66425056, + "epoch": 1.2686453576864536, + "grad_norm": 6.75, + "learning_rate": 3.231501687800509e-06, + "loss": 1.5514925, + "memory(GiB)": 117.38, + "step": 50010, + "train_speed(iter/s)": 1.635802 + }, + { + "acc": 0.65746942, + "epoch": 1.2687721968543886, + "grad_norm": 5.5625, + "learning_rate": 3.2305208870543857e-06, + "loss": 1.58647041, + "memory(GiB)": 117.38, + "step": 50015, + "train_speed(iter/s)": 1.635819 + }, + { + "acc": 0.66604352, + "epoch": 1.2688990360223236, + "grad_norm": 6.21875, + "learning_rate": 3.2295401641378544e-06, + "loss": 1.59931889, + "memory(GiB)": 117.38, + "step": 50020, + "train_speed(iter/s)": 1.635837 + }, + { + "acc": 0.66053581, + "epoch": 1.2690258751902588, + "grad_norm": 6.125, + "learning_rate": 3.2285595190940513e-06, + "loss": 1.61431999, + "memory(GiB)": 117.38, + "step": 50025, + "train_speed(iter/s)": 1.635854 + }, + { + "acc": 0.65378671, + "epoch": 1.2691527143581938, + "grad_norm": 6.0625, + "learning_rate": 3.2275789519661103e-06, + "loss": 1.69963989, + "memory(GiB)": 117.38, + "step": 50030, + "train_speed(iter/s)": 1.635872 + }, + { + "acc": 0.65933676, + "epoch": 1.269279553526129, + "grad_norm": 5.65625, + "learning_rate": 3.2265984627971595e-06, + "loss": 1.63584557, + "memory(GiB)": 117.38, + "step": 50035, + "train_speed(iter/s)": 1.63589 + }, + { + "acc": 0.64069567, + "epoch": 1.269406392694064, + "grad_norm": 5.4375, + "learning_rate": 3.225618051630326e-06, + "loss": 1.65519104, + "memory(GiB)": 117.38, + "step": 50040, + "train_speed(iter/s)": 1.635908 + }, + { + "acc": 0.66593528, + "epoch": 1.269533231861999, + "grad_norm": 5.5, + "learning_rate": 3.2246377185087325e-06, + "loss": 1.61460819, + "memory(GiB)": 117.38, + "step": 50045, + "train_speed(iter/s)": 1.635926 + }, + { + "acc": 0.6475225, + "epoch": 1.269660071029934, + "grad_norm": 6.75, + "learning_rate": 3.2236574634755003e-06, + "loss": 1.66770687, + "memory(GiB)": 117.38, + "step": 50050, + "train_speed(iter/s)": 1.635944 + }, + { + "acc": 0.63543253, + "epoch": 1.2697869101978692, + "grad_norm": 5.78125, + "learning_rate": 3.222677286573742e-06, + "loss": 1.72865925, + "memory(GiB)": 117.38, + "step": 50055, + "train_speed(iter/s)": 1.63596 + }, + { + "acc": 0.65067654, + "epoch": 1.2699137493658041, + "grad_norm": 5.9375, + "learning_rate": 3.221697187846571e-06, + "loss": 1.6313343, + "memory(GiB)": 117.38, + "step": 50060, + "train_speed(iter/s)": 1.635976 + }, + { + "acc": 0.6621747, + "epoch": 1.2700405885337394, + "grad_norm": 7.5, + "learning_rate": 3.2207171673370984e-06, + "loss": 1.63395157, + "memory(GiB)": 117.38, + "step": 50065, + "train_speed(iter/s)": 1.635993 + }, + { + "acc": 0.6543478, + "epoch": 1.2701674277016743, + "grad_norm": 5.125, + "learning_rate": 3.2197372250884295e-06, + "loss": 1.6200882, + "memory(GiB)": 117.38, + "step": 50070, + "train_speed(iter/s)": 1.636011 + }, + { + "acc": 0.65862293, + "epoch": 1.2702942668696093, + "grad_norm": 6.25, + "learning_rate": 3.218757361143664e-06, + "loss": 1.62272625, + "memory(GiB)": 117.38, + "step": 50075, + "train_speed(iter/s)": 1.636028 + }, + { + "acc": 0.65577483, + "epoch": 1.2704211060375443, + "grad_norm": 6.0625, + "learning_rate": 3.2177775755459034e-06, + "loss": 1.62437916, + "memory(GiB)": 117.38, + "step": 50080, + "train_speed(iter/s)": 1.636045 + }, + { + "acc": 0.64790387, + "epoch": 1.2705479452054795, + "grad_norm": 6.0625, + "learning_rate": 3.216797868338241e-06, + "loss": 1.62575417, + "memory(GiB)": 117.38, + "step": 50085, + "train_speed(iter/s)": 1.636062 + }, + { + "acc": 0.65712094, + "epoch": 1.2706747843734145, + "grad_norm": 5.4375, + "learning_rate": 3.215818239563773e-06, + "loss": 1.60138454, + "memory(GiB)": 117.38, + "step": 50090, + "train_speed(iter/s)": 1.636078 + }, + { + "acc": 0.66571302, + "epoch": 1.2708016235413495, + "grad_norm": 5.125, + "learning_rate": 3.2148386892655814e-06, + "loss": 1.53991909, + "memory(GiB)": 117.38, + "step": 50095, + "train_speed(iter/s)": 1.63609 + }, + { + "acc": 0.65168362, + "epoch": 1.2709284627092847, + "grad_norm": 6.4375, + "learning_rate": 3.2138592174867556e-06, + "loss": 1.66510296, + "memory(GiB)": 117.38, + "step": 50100, + "train_speed(iter/s)": 1.635843 + }, + { + "acc": 0.66659975, + "epoch": 1.2710553018772197, + "grad_norm": 6.9375, + "learning_rate": 3.2128798242703745e-06, + "loss": 1.58371181, + "memory(GiB)": 117.38, + "step": 50105, + "train_speed(iter/s)": 1.635857 + }, + { + "acc": 0.66018152, + "epoch": 1.2711821410451547, + "grad_norm": 5.25, + "learning_rate": 3.2119005096595203e-06, + "loss": 1.61099701, + "memory(GiB)": 117.38, + "step": 50110, + "train_speed(iter/s)": 1.635874 + }, + { + "acc": 0.66646771, + "epoch": 1.2713089802130897, + "grad_norm": 4.875, + "learning_rate": 3.2109212736972636e-06, + "loss": 1.57505798, + "memory(GiB)": 117.38, + "step": 50115, + "train_speed(iter/s)": 1.635891 + }, + { + "acc": 0.66070452, + "epoch": 1.271435819381025, + "grad_norm": 5.84375, + "learning_rate": 3.2099421164266758e-06, + "loss": 1.53111658, + "memory(GiB)": 117.38, + "step": 50120, + "train_speed(iter/s)": 1.635908 + }, + { + "acc": 0.65686388, + "epoch": 1.27156265854896, + "grad_norm": 7.09375, + "learning_rate": 3.2089630378908264e-06, + "loss": 1.60889378, + "memory(GiB)": 117.38, + "step": 50125, + "train_speed(iter/s)": 1.635925 + }, + { + "acc": 0.66921763, + "epoch": 1.271689497716895, + "grad_norm": 4.4375, + "learning_rate": 3.207984038132781e-06, + "loss": 1.54144535, + "memory(GiB)": 117.38, + "step": 50130, + "train_speed(iter/s)": 1.635944 + }, + { + "acc": 0.66131573, + "epoch": 1.27181633688483, + "grad_norm": 5.6875, + "learning_rate": 3.2070051171955966e-06, + "loss": 1.59992342, + "memory(GiB)": 117.38, + "step": 50135, + "train_speed(iter/s)": 1.635961 + }, + { + "acc": 0.68003874, + "epoch": 1.271943176052765, + "grad_norm": 5.0625, + "learning_rate": 3.206026275122332e-06, + "loss": 1.49340334, + "memory(GiB)": 117.38, + "step": 50140, + "train_speed(iter/s)": 1.635978 + }, + { + "acc": 0.67815485, + "epoch": 1.2720700152207, + "grad_norm": 5.8125, + "learning_rate": 3.205047511956042e-06, + "loss": 1.51377974, + "memory(GiB)": 117.38, + "step": 50145, + "train_speed(iter/s)": 1.635995 + }, + { + "acc": 0.6564024, + "epoch": 1.2721968543886353, + "grad_norm": 5.6875, + "learning_rate": 3.204068827739777e-06, + "loss": 1.69637527, + "memory(GiB)": 117.38, + "step": 50150, + "train_speed(iter/s)": 1.636014 + }, + { + "acc": 0.65682774, + "epoch": 1.2723236935565703, + "grad_norm": 5.8125, + "learning_rate": 3.2030902225165814e-06, + "loss": 1.63150272, + "memory(GiB)": 117.38, + "step": 50155, + "train_speed(iter/s)": 1.636031 + }, + { + "acc": 0.66434259, + "epoch": 1.2724505327245053, + "grad_norm": 4.625, + "learning_rate": 3.2021116963295016e-06, + "loss": 1.57543173, + "memory(GiB)": 117.38, + "step": 50160, + "train_speed(iter/s)": 1.636049 + }, + { + "acc": 0.64593782, + "epoch": 1.2725773718924405, + "grad_norm": 4.90625, + "learning_rate": 3.2011332492215753e-06, + "loss": 1.61526718, + "memory(GiB)": 117.38, + "step": 50165, + "train_speed(iter/s)": 1.636067 + }, + { + "acc": 0.65929508, + "epoch": 1.2727042110603755, + "grad_norm": 5.875, + "learning_rate": 3.200154881235842e-06, + "loss": 1.60331402, + "memory(GiB)": 117.38, + "step": 50170, + "train_speed(iter/s)": 1.636085 + }, + { + "acc": 0.66562729, + "epoch": 1.2728310502283104, + "grad_norm": 6.09375, + "learning_rate": 3.1991765924153316e-06, + "loss": 1.59988985, + "memory(GiB)": 117.38, + "step": 50175, + "train_speed(iter/s)": 1.636102 + }, + { + "acc": 0.6452199, + "epoch": 1.2729578893962454, + "grad_norm": 5.28125, + "learning_rate": 3.198198382803075e-06, + "loss": 1.59723969, + "memory(GiB)": 117.38, + "step": 50180, + "train_speed(iter/s)": 1.636119 + }, + { + "acc": 0.64359026, + "epoch": 1.2730847285641806, + "grad_norm": 6.46875, + "learning_rate": 3.197220252442097e-06, + "loss": 1.61370697, + "memory(GiB)": 117.38, + "step": 50185, + "train_speed(iter/s)": 1.636137 + }, + { + "acc": 0.64292336, + "epoch": 1.2732115677321156, + "grad_norm": 5.3125, + "learning_rate": 3.1962422013754237e-06, + "loss": 1.63647594, + "memory(GiB)": 117.38, + "step": 50190, + "train_speed(iter/s)": 1.636155 + }, + { + "acc": 0.67618837, + "epoch": 1.2733384069000508, + "grad_norm": 6.5625, + "learning_rate": 3.1952642296460696e-06, + "loss": 1.52273808, + "memory(GiB)": 117.38, + "step": 50195, + "train_speed(iter/s)": 1.636173 + }, + { + "acc": 0.63622956, + "epoch": 1.2734652460679858, + "grad_norm": 5.71875, + "learning_rate": 3.194286337297051e-06, + "loss": 1.6601965, + "memory(GiB)": 117.38, + "step": 50200, + "train_speed(iter/s)": 1.636191 + }, + { + "acc": 0.64998941, + "epoch": 1.2735920852359208, + "grad_norm": 9.125, + "learning_rate": 3.1933085243713837e-06, + "loss": 1.54855881, + "memory(GiB)": 117.38, + "step": 50205, + "train_speed(iter/s)": 1.63621 + }, + { + "acc": 0.66501269, + "epoch": 1.2737189244038558, + "grad_norm": 5.5625, + "learning_rate": 3.1923307909120736e-06, + "loss": 1.64308395, + "memory(GiB)": 117.38, + "step": 50210, + "train_speed(iter/s)": 1.636228 + }, + { + "acc": 0.65833664, + "epoch": 1.273845763571791, + "grad_norm": 5.15625, + "learning_rate": 3.1913531369621243e-06, + "loss": 1.60546989, + "memory(GiB)": 117.38, + "step": 50215, + "train_speed(iter/s)": 1.636245 + }, + { + "acc": 0.65798097, + "epoch": 1.273972602739726, + "grad_norm": 5.96875, + "learning_rate": 3.19037556256454e-06, + "loss": 1.60512352, + "memory(GiB)": 117.38, + "step": 50220, + "train_speed(iter/s)": 1.636263 + }, + { + "acc": 0.64735341, + "epoch": 1.2740994419076612, + "grad_norm": 6.03125, + "learning_rate": 3.189398067762318e-06, + "loss": 1.6562561, + "memory(GiB)": 117.38, + "step": 50225, + "train_speed(iter/s)": 1.63628 + }, + { + "acc": 0.66178579, + "epoch": 1.2742262810755962, + "grad_norm": 6.9375, + "learning_rate": 3.1884206525984535e-06, + "loss": 1.6252594, + "memory(GiB)": 117.38, + "step": 50230, + "train_speed(iter/s)": 1.636297 + }, + { + "acc": 0.64683838, + "epoch": 1.2743531202435312, + "grad_norm": 5.53125, + "learning_rate": 3.1874433171159348e-06, + "loss": 1.64285927, + "memory(GiB)": 117.38, + "step": 50235, + "train_speed(iter/s)": 1.636316 + }, + { + "acc": 0.65800476, + "epoch": 1.2744799594114662, + "grad_norm": 5.15625, + "learning_rate": 3.1864660613577523e-06, + "loss": 1.62914963, + "memory(GiB)": 117.38, + "step": 50240, + "train_speed(iter/s)": 1.636333 + }, + { + "acc": 0.66175122, + "epoch": 1.2746067985794014, + "grad_norm": 8.4375, + "learning_rate": 3.185488885366889e-06, + "loss": 1.55219488, + "memory(GiB)": 117.38, + "step": 50245, + "train_speed(iter/s)": 1.636349 + }, + { + "acc": 0.65221691, + "epoch": 1.2747336377473364, + "grad_norm": 4.84375, + "learning_rate": 3.1845117891863274e-06, + "loss": 1.54298582, + "memory(GiB)": 117.38, + "step": 50250, + "train_speed(iter/s)": 1.636366 + }, + { + "acc": 0.65278244, + "epoch": 1.2748604769152714, + "grad_norm": 5.21875, + "learning_rate": 3.1835347728590414e-06, + "loss": 1.60195274, + "memory(GiB)": 117.38, + "step": 50255, + "train_speed(iter/s)": 1.636382 + }, + { + "acc": 0.65044451, + "epoch": 1.2749873160832066, + "grad_norm": 5.53125, + "learning_rate": 3.1825578364280064e-06, + "loss": 1.55243683, + "memory(GiB)": 117.38, + "step": 50260, + "train_speed(iter/s)": 1.636398 + }, + { + "acc": 0.66600599, + "epoch": 1.2751141552511416, + "grad_norm": 6.875, + "learning_rate": 3.181580979936192e-06, + "loss": 1.59391289, + "memory(GiB)": 117.38, + "step": 50265, + "train_speed(iter/s)": 1.636415 + }, + { + "acc": 0.64651403, + "epoch": 1.2752409944190766, + "grad_norm": 5.375, + "learning_rate": 3.1806042034265656e-06, + "loss": 1.64264088, + "memory(GiB)": 117.38, + "step": 50270, + "train_speed(iter/s)": 1.636432 + }, + { + "acc": 0.65382719, + "epoch": 1.2753678335870116, + "grad_norm": 6.375, + "learning_rate": 3.179627506942089e-06, + "loss": 1.60444183, + "memory(GiB)": 117.38, + "step": 50275, + "train_speed(iter/s)": 1.63645 + }, + { + "acc": 0.66716862, + "epoch": 1.2754946727549468, + "grad_norm": 5.4375, + "learning_rate": 3.17865089052572e-06, + "loss": 1.59919567, + "memory(GiB)": 117.38, + "step": 50280, + "train_speed(iter/s)": 1.636466 + }, + { + "acc": 0.66760807, + "epoch": 1.2756215119228818, + "grad_norm": 5.09375, + "learning_rate": 3.177674354220418e-06, + "loss": 1.55120783, + "memory(GiB)": 117.38, + "step": 50285, + "train_speed(iter/s)": 1.636484 + }, + { + "acc": 0.65816998, + "epoch": 1.275748351090817, + "grad_norm": 5.5625, + "learning_rate": 3.1766978980691355e-06, + "loss": 1.59152641, + "memory(GiB)": 117.38, + "step": 50290, + "train_speed(iter/s)": 1.636501 + }, + { + "acc": 0.67199364, + "epoch": 1.275875190258752, + "grad_norm": 5.0625, + "learning_rate": 3.1757215221148173e-06, + "loss": 1.59214211, + "memory(GiB)": 117.38, + "step": 50295, + "train_speed(iter/s)": 1.636518 + }, + { + "acc": 0.65828438, + "epoch": 1.276002029426687, + "grad_norm": 5.25, + "learning_rate": 3.174745226400413e-06, + "loss": 1.60132904, + "memory(GiB)": 117.38, + "step": 50300, + "train_speed(iter/s)": 1.636536 + }, + { + "acc": 0.66523685, + "epoch": 1.276128868594622, + "grad_norm": 5.65625, + "learning_rate": 3.1737690109688613e-06, + "loss": 1.55143909, + "memory(GiB)": 117.38, + "step": 50305, + "train_speed(iter/s)": 1.636554 + }, + { + "acc": 0.65723853, + "epoch": 1.2762557077625571, + "grad_norm": 5.8125, + "learning_rate": 3.1727928758631054e-06, + "loss": 1.56019688, + "memory(GiB)": 117.38, + "step": 50310, + "train_speed(iter/s)": 1.636572 + }, + { + "acc": 0.65220823, + "epoch": 1.2763825469304921, + "grad_norm": 5.71875, + "learning_rate": 3.1718168211260734e-06, + "loss": 1.66425323, + "memory(GiB)": 117.38, + "step": 50315, + "train_speed(iter/s)": 1.636589 + }, + { + "acc": 0.64509587, + "epoch": 1.2765093860984271, + "grad_norm": 7.40625, + "learning_rate": 3.1708408468007014e-06, + "loss": 1.66488075, + "memory(GiB)": 117.38, + "step": 50320, + "train_speed(iter/s)": 1.636607 + }, + { + "acc": 0.65077329, + "epoch": 1.2766362252663623, + "grad_norm": 5.21875, + "learning_rate": 3.169864952929914e-06, + "loss": 1.64300232, + "memory(GiB)": 117.38, + "step": 50325, + "train_speed(iter/s)": 1.636624 + }, + { + "acc": 0.65535846, + "epoch": 1.2767630644342973, + "grad_norm": 4.84375, + "learning_rate": 3.168889139556639e-06, + "loss": 1.61614685, + "memory(GiB)": 117.38, + "step": 50330, + "train_speed(iter/s)": 1.636641 + }, + { + "acc": 0.65707207, + "epoch": 1.2768899036022323, + "grad_norm": 4.71875, + "learning_rate": 3.1679134067237942e-06, + "loss": 1.60960522, + "memory(GiB)": 117.38, + "step": 50335, + "train_speed(iter/s)": 1.636659 + }, + { + "acc": 0.65848408, + "epoch": 1.2770167427701673, + "grad_norm": 5.8125, + "learning_rate": 3.166937754474296e-06, + "loss": 1.59378109, + "memory(GiB)": 117.38, + "step": 50340, + "train_speed(iter/s)": 1.636676 + }, + { + "acc": 0.65967274, + "epoch": 1.2771435819381025, + "grad_norm": 5.46875, + "learning_rate": 3.165962182851061e-06, + "loss": 1.5828186, + "memory(GiB)": 117.38, + "step": 50345, + "train_speed(iter/s)": 1.636693 + }, + { + "acc": 0.66474075, + "epoch": 1.2772704211060375, + "grad_norm": 6.28125, + "learning_rate": 3.1649866918969984e-06, + "loss": 1.57147827, + "memory(GiB)": 117.38, + "step": 50350, + "train_speed(iter/s)": 1.636711 + }, + { + "acc": 0.64480772, + "epoch": 1.2773972602739727, + "grad_norm": 6.40625, + "learning_rate": 3.164011281655013e-06, + "loss": 1.60990448, + "memory(GiB)": 117.38, + "step": 50355, + "train_speed(iter/s)": 1.636729 + }, + { + "acc": 0.64572506, + "epoch": 1.2775240994419077, + "grad_norm": 5.75, + "learning_rate": 3.163035952168007e-06, + "loss": 1.66291122, + "memory(GiB)": 117.38, + "step": 50360, + "train_speed(iter/s)": 1.636745 + }, + { + "acc": 0.66791496, + "epoch": 1.2776509386098427, + "grad_norm": 7.625, + "learning_rate": 3.1620607034788832e-06, + "loss": 1.64908981, + "memory(GiB)": 117.38, + "step": 50365, + "train_speed(iter/s)": 1.636762 + }, + { + "acc": 0.65327721, + "epoch": 1.2777777777777777, + "grad_norm": 4.96875, + "learning_rate": 3.1610855356305354e-06, + "loss": 1.65682831, + "memory(GiB)": 117.38, + "step": 50370, + "train_speed(iter/s)": 1.63678 + }, + { + "acc": 0.64920874, + "epoch": 1.277904616945713, + "grad_norm": 5.84375, + "learning_rate": 3.160110448665854e-06, + "loss": 1.61971951, + "memory(GiB)": 117.38, + "step": 50375, + "train_speed(iter/s)": 1.636797 + }, + { + "acc": 0.66433744, + "epoch": 1.2780314561136479, + "grad_norm": 4.71875, + "learning_rate": 3.159135442627731e-06, + "loss": 1.54368992, + "memory(GiB)": 117.38, + "step": 50380, + "train_speed(iter/s)": 1.636814 + }, + { + "acc": 0.66488256, + "epoch": 1.278158295281583, + "grad_norm": 5.4375, + "learning_rate": 3.158160517559049e-06, + "loss": 1.58785248, + "memory(GiB)": 117.38, + "step": 50385, + "train_speed(iter/s)": 1.636831 + }, + { + "acc": 0.65733833, + "epoch": 1.278285134449518, + "grad_norm": 5.375, + "learning_rate": 3.157185673502693e-06, + "loss": 1.61690845, + "memory(GiB)": 117.38, + "step": 50390, + "train_speed(iter/s)": 1.636846 + }, + { + "acc": 0.65631394, + "epoch": 1.278411973617453, + "grad_norm": 6.84375, + "learning_rate": 3.156210910501537e-06, + "loss": 1.58626194, + "memory(GiB)": 117.38, + "step": 50395, + "train_speed(iter/s)": 1.636864 + }, + { + "acc": 0.64931765, + "epoch": 1.278538812785388, + "grad_norm": 5.0625, + "learning_rate": 3.155236228598457e-06, + "loss": 1.59983044, + "memory(GiB)": 117.38, + "step": 50400, + "train_speed(iter/s)": 1.636882 + }, + { + "acc": 0.65192442, + "epoch": 1.2786656519533233, + "grad_norm": 6.0625, + "learning_rate": 3.1542616278363238e-06, + "loss": 1.60695801, + "memory(GiB)": 117.38, + "step": 50405, + "train_speed(iter/s)": 1.636898 + }, + { + "acc": 0.65318117, + "epoch": 1.2787924911212583, + "grad_norm": 5.9375, + "learning_rate": 3.1532871082580064e-06, + "loss": 1.63932648, + "memory(GiB)": 117.38, + "step": 50410, + "train_speed(iter/s)": 1.636915 + }, + { + "acc": 0.65757351, + "epoch": 1.2789193302891932, + "grad_norm": 5.53125, + "learning_rate": 3.152312669906366e-06, + "loss": 1.60349197, + "memory(GiB)": 117.38, + "step": 50415, + "train_speed(iter/s)": 1.636933 + }, + { + "acc": 0.67174292, + "epoch": 1.2790461694571285, + "grad_norm": 5.53125, + "learning_rate": 3.1513383128242624e-06, + "loss": 1.52506351, + "memory(GiB)": 117.38, + "step": 50420, + "train_speed(iter/s)": 1.636951 + }, + { + "acc": 0.66882143, + "epoch": 1.2791730086250634, + "grad_norm": 6.90625, + "learning_rate": 3.150364037054555e-06, + "loss": 1.61610775, + "memory(GiB)": 117.38, + "step": 50425, + "train_speed(iter/s)": 1.636967 + }, + { + "acc": 0.65421519, + "epoch": 1.2792998477929984, + "grad_norm": 5.8125, + "learning_rate": 3.149389842640096e-06, + "loss": 1.55048885, + "memory(GiB)": 117.38, + "step": 50430, + "train_speed(iter/s)": 1.636984 + }, + { + "acc": 0.66348271, + "epoch": 1.2794266869609334, + "grad_norm": 6.46875, + "learning_rate": 3.148415729623732e-06, + "loss": 1.5821991, + "memory(GiB)": 117.38, + "step": 50435, + "train_speed(iter/s)": 1.637002 + }, + { + "acc": 0.6693789, + "epoch": 1.2795535261288686, + "grad_norm": 4.59375, + "learning_rate": 3.1474416980483126e-06, + "loss": 1.61463356, + "memory(GiB)": 117.38, + "step": 50440, + "train_speed(iter/s)": 1.63702 + }, + { + "acc": 0.67107062, + "epoch": 1.2796803652968036, + "grad_norm": 6.8125, + "learning_rate": 3.1464677479566774e-06, + "loss": 1.58897066, + "memory(GiB)": 117.38, + "step": 50445, + "train_speed(iter/s)": 1.637037 + }, + { + "acc": 0.65986547, + "epoch": 1.2798072044647388, + "grad_norm": 5.875, + "learning_rate": 3.1454938793916677e-06, + "loss": 1.62581615, + "memory(GiB)": 117.38, + "step": 50450, + "train_speed(iter/s)": 1.637055 + }, + { + "acc": 0.64644847, + "epoch": 1.2799340436326738, + "grad_norm": 5.8125, + "learning_rate": 3.144520092396115e-06, + "loss": 1.63034859, + "memory(GiB)": 117.38, + "step": 50455, + "train_speed(iter/s)": 1.637073 + }, + { + "acc": 0.64993873, + "epoch": 1.2800608828006088, + "grad_norm": 4.84375, + "learning_rate": 3.1435463870128536e-06, + "loss": 1.6300766, + "memory(GiB)": 117.38, + "step": 50460, + "train_speed(iter/s)": 1.63709 + }, + { + "acc": 0.67384334, + "epoch": 1.2801877219685438, + "grad_norm": 6.125, + "learning_rate": 3.142572763284709e-06, + "loss": 1.54737701, + "memory(GiB)": 117.38, + "step": 50465, + "train_speed(iter/s)": 1.637109 + }, + { + "acc": 0.65003557, + "epoch": 1.280314561136479, + "grad_norm": 5.625, + "learning_rate": 3.14159922125451e-06, + "loss": 1.63682899, + "memory(GiB)": 117.38, + "step": 50470, + "train_speed(iter/s)": 1.637128 + }, + { + "acc": 0.65351772, + "epoch": 1.280441400304414, + "grad_norm": 4.6875, + "learning_rate": 3.1406257609650724e-06, + "loss": 1.58972874, + "memory(GiB)": 117.38, + "step": 50475, + "train_speed(iter/s)": 1.637145 + }, + { + "acc": 0.66596484, + "epoch": 1.280568239472349, + "grad_norm": 7.40625, + "learning_rate": 3.139652382459215e-06, + "loss": 1.60235176, + "memory(GiB)": 117.38, + "step": 50480, + "train_speed(iter/s)": 1.637163 + }, + { + "acc": 0.64224672, + "epoch": 1.2806950786402842, + "grad_norm": 5.96875, + "learning_rate": 3.1386790857797535e-06, + "loss": 1.64954071, + "memory(GiB)": 117.38, + "step": 50485, + "train_speed(iter/s)": 1.637181 + }, + { + "acc": 0.67724533, + "epoch": 1.2808219178082192, + "grad_norm": 7.40625, + "learning_rate": 3.1377058709694957e-06, + "loss": 1.5474041, + "memory(GiB)": 117.38, + "step": 50490, + "train_speed(iter/s)": 1.637198 + }, + { + "acc": 0.65323787, + "epoch": 1.2809487569761542, + "grad_norm": 5.0625, + "learning_rate": 3.1367327380712477e-06, + "loss": 1.54239893, + "memory(GiB)": 117.38, + "step": 50495, + "train_speed(iter/s)": 1.637215 + }, + { + "acc": 0.64975319, + "epoch": 1.2810755961440892, + "grad_norm": 4.875, + "learning_rate": 3.135759687127812e-06, + "loss": 1.59814892, + "memory(GiB)": 117.38, + "step": 50500, + "train_speed(iter/s)": 1.637232 + }, + { + "acc": 0.65838127, + "epoch": 1.2812024353120244, + "grad_norm": 4.5625, + "learning_rate": 3.134786718181989e-06, + "loss": 1.60836372, + "memory(GiB)": 117.38, + "step": 50505, + "train_speed(iter/s)": 1.637249 + }, + { + "acc": 0.65520487, + "epoch": 1.2813292744799594, + "grad_norm": 5.78125, + "learning_rate": 3.133813831276575e-06, + "loss": 1.58764572, + "memory(GiB)": 117.38, + "step": 50510, + "train_speed(iter/s)": 1.637267 + }, + { + "acc": 0.65381041, + "epoch": 1.2814561136478946, + "grad_norm": 6.75, + "learning_rate": 3.1328410264543585e-06, + "loss": 1.61964016, + "memory(GiB)": 117.38, + "step": 50515, + "train_speed(iter/s)": 1.637284 + }, + { + "acc": 0.65839658, + "epoch": 1.2815829528158296, + "grad_norm": 5.75, + "learning_rate": 3.131868303758131e-06, + "loss": 1.54832602, + "memory(GiB)": 117.38, + "step": 50520, + "train_speed(iter/s)": 1.6373 + }, + { + "acc": 0.65777388, + "epoch": 1.2817097919837646, + "grad_norm": 6.53125, + "learning_rate": 3.1308956632306754e-06, + "loss": 1.5596983, + "memory(GiB)": 117.38, + "step": 50525, + "train_speed(iter/s)": 1.637319 + }, + { + "acc": 0.65868034, + "epoch": 1.2818366311516995, + "grad_norm": 5.5, + "learning_rate": 3.129923104914776e-06, + "loss": 1.60860634, + "memory(GiB)": 117.38, + "step": 50530, + "train_speed(iter/s)": 1.637337 + }, + { + "acc": 0.66887999, + "epoch": 1.2819634703196348, + "grad_norm": 6.09375, + "learning_rate": 3.1289506288532045e-06, + "loss": 1.47076874, + "memory(GiB)": 117.38, + "step": 50535, + "train_speed(iter/s)": 1.637354 + }, + { + "acc": 0.64477448, + "epoch": 1.2820903094875697, + "grad_norm": 6.0, + "learning_rate": 3.12797823508874e-06, + "loss": 1.62783394, + "memory(GiB)": 117.38, + "step": 50540, + "train_speed(iter/s)": 1.637372 + }, + { + "acc": 0.66084676, + "epoch": 1.282217148655505, + "grad_norm": 5.5625, + "learning_rate": 3.127005923664149e-06, + "loss": 1.59168491, + "memory(GiB)": 117.38, + "step": 50545, + "train_speed(iter/s)": 1.637388 + }, + { + "acc": 0.65986476, + "epoch": 1.28234398782344, + "grad_norm": 5.21875, + "learning_rate": 3.1260336946222014e-06, + "loss": 1.55225887, + "memory(GiB)": 117.38, + "step": 50550, + "train_speed(iter/s)": 1.637406 + }, + { + "acc": 0.6664011, + "epoch": 1.282470826991375, + "grad_norm": 6.3125, + "learning_rate": 3.1250615480056584e-06, + "loss": 1.51310654, + "memory(GiB)": 117.38, + "step": 50555, + "train_speed(iter/s)": 1.637423 + }, + { + "acc": 0.67312164, + "epoch": 1.28259766615931, + "grad_norm": 5.0, + "learning_rate": 3.124089483857278e-06, + "loss": 1.48702517, + "memory(GiB)": 117.38, + "step": 50560, + "train_speed(iter/s)": 1.637439 + }, + { + "acc": 0.65216436, + "epoch": 1.2827245053272451, + "grad_norm": 8.0625, + "learning_rate": 3.123117502219819e-06, + "loss": 1.62879372, + "memory(GiB)": 117.38, + "step": 50565, + "train_speed(iter/s)": 1.637457 + }, + { + "acc": 0.65934901, + "epoch": 1.2828513444951801, + "grad_norm": 6.375, + "learning_rate": 3.122145603136032e-06, + "loss": 1.59183836, + "memory(GiB)": 117.38, + "step": 50570, + "train_speed(iter/s)": 1.637474 + }, + { + "acc": 0.66513715, + "epoch": 1.2829781836631151, + "grad_norm": 6.5, + "learning_rate": 3.1211737866486653e-06, + "loss": 1.63434715, + "memory(GiB)": 117.38, + "step": 50575, + "train_speed(iter/s)": 1.63749 + }, + { + "acc": 0.66325121, + "epoch": 1.2831050228310503, + "grad_norm": 4.84375, + "learning_rate": 3.1202020528004627e-06, + "loss": 1.57187872, + "memory(GiB)": 117.38, + "step": 50580, + "train_speed(iter/s)": 1.637507 + }, + { + "acc": 0.65429831, + "epoch": 1.2832318619989853, + "grad_norm": 5.1875, + "learning_rate": 3.119230401634167e-06, + "loss": 1.59203625, + "memory(GiB)": 117.38, + "step": 50585, + "train_speed(iter/s)": 1.637523 + }, + { + "acc": 0.65885706, + "epoch": 1.2833587011669203, + "grad_norm": 5.4375, + "learning_rate": 3.118258833192517e-06, + "loss": 1.55022364, + "memory(GiB)": 117.38, + "step": 50590, + "train_speed(iter/s)": 1.637541 + }, + { + "acc": 0.64973927, + "epoch": 1.2834855403348553, + "grad_norm": 6.875, + "learning_rate": 3.117287347518242e-06, + "loss": 1.59403229, + "memory(GiB)": 117.38, + "step": 50595, + "train_speed(iter/s)": 1.637558 + }, + { + "acc": 0.65729885, + "epoch": 1.2836123795027905, + "grad_norm": 6.71875, + "learning_rate": 3.116315944654077e-06, + "loss": 1.69018822, + "memory(GiB)": 117.38, + "step": 50600, + "train_speed(iter/s)": 1.637575 + }, + { + "acc": 0.64665127, + "epoch": 1.2837392186707255, + "grad_norm": 6.09375, + "learning_rate": 3.115344624642745e-06, + "loss": 1.64469833, + "memory(GiB)": 117.38, + "step": 50605, + "train_speed(iter/s)": 1.637592 + }, + { + "acc": 0.67346849, + "epoch": 1.2838660578386607, + "grad_norm": 7.34375, + "learning_rate": 3.1143733875269734e-06, + "loss": 1.61861629, + "memory(GiB)": 117.38, + "step": 50610, + "train_speed(iter/s)": 1.63761 + }, + { + "acc": 0.67737298, + "epoch": 1.2839928970065957, + "grad_norm": 5.8125, + "learning_rate": 3.1134022333494774e-06, + "loss": 1.52774324, + "memory(GiB)": 117.38, + "step": 50615, + "train_speed(iter/s)": 1.637627 + }, + { + "acc": 0.67497382, + "epoch": 1.2841197361745307, + "grad_norm": 5.8125, + "learning_rate": 3.1124311621529744e-06, + "loss": 1.51304722, + "memory(GiB)": 117.38, + "step": 50620, + "train_speed(iter/s)": 1.637644 + }, + { + "acc": 0.65862589, + "epoch": 1.2842465753424657, + "grad_norm": 6.5, + "learning_rate": 3.111460173980175e-06, + "loss": 1.63301544, + "memory(GiB)": 117.38, + "step": 50625, + "train_speed(iter/s)": 1.637661 + }, + { + "acc": 0.65544624, + "epoch": 1.2843734145104009, + "grad_norm": 5.34375, + "learning_rate": 3.110489268873792e-06, + "loss": 1.55189438, + "memory(GiB)": 117.38, + "step": 50630, + "train_speed(iter/s)": 1.637679 + }, + { + "acc": 0.66510191, + "epoch": 1.2845002536783359, + "grad_norm": 6.3125, + "learning_rate": 3.1095184468765248e-06, + "loss": 1.61021194, + "memory(GiB)": 117.38, + "step": 50635, + "train_speed(iter/s)": 1.637696 + }, + { + "acc": 0.65029812, + "epoch": 1.2846270928462709, + "grad_norm": 4.625, + "learning_rate": 3.1085477080310766e-06, + "loss": 1.59244518, + "memory(GiB)": 117.38, + "step": 50640, + "train_speed(iter/s)": 1.637714 + }, + { + "acc": 0.66593041, + "epoch": 1.284753932014206, + "grad_norm": 5.75, + "learning_rate": 3.1075770523801453e-06, + "loss": 1.52028494, + "memory(GiB)": 117.38, + "step": 50645, + "train_speed(iter/s)": 1.63773 + }, + { + "acc": 0.65899096, + "epoch": 1.284880771182141, + "grad_norm": 5.34375, + "learning_rate": 3.106606479966426e-06, + "loss": 1.58623772, + "memory(GiB)": 117.38, + "step": 50650, + "train_speed(iter/s)": 1.637747 + }, + { + "acc": 0.67684488, + "epoch": 1.285007610350076, + "grad_norm": 6.0, + "learning_rate": 3.1056359908326044e-06, + "loss": 1.57980261, + "memory(GiB)": 117.38, + "step": 50655, + "train_speed(iter/s)": 1.637765 + }, + { + "acc": 0.64775853, + "epoch": 1.285134449518011, + "grad_norm": 6.84375, + "learning_rate": 3.1046655850213707e-06, + "loss": 1.59221067, + "memory(GiB)": 117.38, + "step": 50660, + "train_speed(iter/s)": 1.637783 + }, + { + "acc": 0.64618044, + "epoch": 1.2852612886859462, + "grad_norm": 6.03125, + "learning_rate": 3.103695262575407e-06, + "loss": 1.6291193, + "memory(GiB)": 117.38, + "step": 50665, + "train_speed(iter/s)": 1.6378 + }, + { + "acc": 0.65867229, + "epoch": 1.2853881278538812, + "grad_norm": 5.75, + "learning_rate": 3.102725023537393e-06, + "loss": 1.61899948, + "memory(GiB)": 117.38, + "step": 50670, + "train_speed(iter/s)": 1.637818 + }, + { + "acc": 0.66438456, + "epoch": 1.2855149670218164, + "grad_norm": 5.46875, + "learning_rate": 3.1017548679500008e-06, + "loss": 1.52022285, + "memory(GiB)": 117.38, + "step": 50675, + "train_speed(iter/s)": 1.637835 + }, + { + "acc": 0.64881101, + "epoch": 1.2856418061897514, + "grad_norm": 7.46875, + "learning_rate": 3.1007847958559057e-06, + "loss": 1.61680336, + "memory(GiB)": 117.38, + "step": 50680, + "train_speed(iter/s)": 1.637853 + }, + { + "acc": 0.64892354, + "epoch": 1.2857686453576864, + "grad_norm": 5.5625, + "learning_rate": 3.099814807297774e-06, + "loss": 1.63759174, + "memory(GiB)": 117.38, + "step": 50685, + "train_speed(iter/s)": 1.637871 + }, + { + "acc": 0.67104535, + "epoch": 1.2858954845256214, + "grad_norm": 5.5625, + "learning_rate": 3.098844902318272e-06, + "loss": 1.51733265, + "memory(GiB)": 117.38, + "step": 50690, + "train_speed(iter/s)": 1.637888 + }, + { + "acc": 0.66727409, + "epoch": 1.2860223236935566, + "grad_norm": 5.78125, + "learning_rate": 3.0978750809600596e-06, + "loss": 1.59357586, + "memory(GiB)": 117.38, + "step": 50695, + "train_speed(iter/s)": 1.637903 + }, + { + "acc": 0.66900167, + "epoch": 1.2861491628614916, + "grad_norm": 5.8125, + "learning_rate": 3.0969053432657913e-06, + "loss": 1.56347475, + "memory(GiB)": 117.38, + "step": 50700, + "train_speed(iter/s)": 1.63792 + }, + { + "acc": 0.66129742, + "epoch": 1.2862760020294268, + "grad_norm": 5.65625, + "learning_rate": 3.0959356892781246e-06, + "loss": 1.57301655, + "memory(GiB)": 117.38, + "step": 50705, + "train_speed(iter/s)": 1.637937 + }, + { + "acc": 0.66649795, + "epoch": 1.2864028411973618, + "grad_norm": 6.5, + "learning_rate": 3.0949661190397072e-06, + "loss": 1.5851676, + "memory(GiB)": 117.38, + "step": 50710, + "train_speed(iter/s)": 1.637955 + }, + { + "acc": 0.65414267, + "epoch": 1.2865296803652968, + "grad_norm": 6.09375, + "learning_rate": 3.0939966325931852e-06, + "loss": 1.64785309, + "memory(GiB)": 117.38, + "step": 50715, + "train_speed(iter/s)": 1.637972 + }, + { + "acc": 0.63294878, + "epoch": 1.2866565195332318, + "grad_norm": 5.1875, + "learning_rate": 3.0930272299812e-06, + "loss": 1.68220959, + "memory(GiB)": 117.38, + "step": 50720, + "train_speed(iter/s)": 1.63799 + }, + { + "acc": 0.65390005, + "epoch": 1.286783358701167, + "grad_norm": 5.21875, + "learning_rate": 3.0920579112463916e-06, + "loss": 1.59368076, + "memory(GiB)": 117.38, + "step": 50725, + "train_speed(iter/s)": 1.638009 + }, + { + "acc": 0.65120931, + "epoch": 1.286910197869102, + "grad_norm": 5.40625, + "learning_rate": 3.0910886764313964e-06, + "loss": 1.68505592, + "memory(GiB)": 117.38, + "step": 50730, + "train_speed(iter/s)": 1.638026 + }, + { + "acc": 0.66721745, + "epoch": 1.287037037037037, + "grad_norm": 5.34375, + "learning_rate": 3.0901195255788406e-06, + "loss": 1.54817085, + "memory(GiB)": 117.38, + "step": 50735, + "train_speed(iter/s)": 1.638044 + }, + { + "acc": 0.6673748, + "epoch": 1.2871638762049722, + "grad_norm": 5.9375, + "learning_rate": 3.089150458731357e-06, + "loss": 1.54632559, + "memory(GiB)": 117.38, + "step": 50740, + "train_speed(iter/s)": 1.638061 + }, + { + "acc": 0.67838879, + "epoch": 1.2872907153729072, + "grad_norm": 5.4375, + "learning_rate": 3.0881814759315666e-06, + "loss": 1.48835278, + "memory(GiB)": 117.38, + "step": 50745, + "train_speed(iter/s)": 1.638078 + }, + { + "acc": 0.65278931, + "epoch": 1.2874175545408422, + "grad_norm": 5.625, + "learning_rate": 3.0872125772220934e-06, + "loss": 1.64299603, + "memory(GiB)": 117.38, + "step": 50750, + "train_speed(iter/s)": 1.638094 + }, + { + "acc": 0.65849876, + "epoch": 1.2875443937087772, + "grad_norm": 6.15625, + "learning_rate": 3.0862437626455483e-06, + "loss": 1.62352982, + "memory(GiB)": 117.38, + "step": 50755, + "train_speed(iter/s)": 1.638111 + }, + { + "acc": 0.66881943, + "epoch": 1.2876712328767124, + "grad_norm": 6.1875, + "learning_rate": 3.0852750322445473e-06, + "loss": 1.51064625, + "memory(GiB)": 117.38, + "step": 50760, + "train_speed(iter/s)": 1.638128 + }, + { + "acc": 0.66159663, + "epoch": 1.2877980720446474, + "grad_norm": 5.84375, + "learning_rate": 3.0843063860616982e-06, + "loss": 1.55973902, + "memory(GiB)": 117.38, + "step": 50765, + "train_speed(iter/s)": 1.638146 + }, + { + "acc": 0.66133451, + "epoch": 1.2879249112125826, + "grad_norm": 5.59375, + "learning_rate": 3.0833378241396094e-06, + "loss": 1.58226347, + "memory(GiB)": 117.38, + "step": 50770, + "train_speed(iter/s)": 1.638165 + }, + { + "acc": 0.64204926, + "epoch": 1.2880517503805176, + "grad_norm": 5.09375, + "learning_rate": 3.0823693465208794e-06, + "loss": 1.62418671, + "memory(GiB)": 117.38, + "step": 50775, + "train_speed(iter/s)": 1.638182 + }, + { + "acc": 0.65554996, + "epoch": 1.2881785895484525, + "grad_norm": 7.96875, + "learning_rate": 3.081400953248106e-06, + "loss": 1.58124094, + "memory(GiB)": 117.38, + "step": 50780, + "train_speed(iter/s)": 1.6382 + }, + { + "acc": 0.65164385, + "epoch": 1.2883054287163875, + "grad_norm": 7.875, + "learning_rate": 3.0804326443638854e-06, + "loss": 1.59955006, + "memory(GiB)": 117.38, + "step": 50785, + "train_speed(iter/s)": 1.638218 + }, + { + "acc": 0.65098739, + "epoch": 1.2884322678843227, + "grad_norm": 6.96875, + "learning_rate": 3.0794644199108087e-06, + "loss": 1.67862701, + "memory(GiB)": 117.38, + "step": 50790, + "train_speed(iter/s)": 1.638236 + }, + { + "acc": 0.65335464, + "epoch": 1.2885591070522577, + "grad_norm": 5.875, + "learning_rate": 3.07849627993146e-06, + "loss": 1.6175621, + "memory(GiB)": 117.38, + "step": 50795, + "train_speed(iter/s)": 1.638253 + }, + { + "acc": 0.65353198, + "epoch": 1.2886859462201927, + "grad_norm": 5.03125, + "learning_rate": 3.0775282244684233e-06, + "loss": 1.56455421, + "memory(GiB)": 117.38, + "step": 50800, + "train_speed(iter/s)": 1.63827 + }, + { + "acc": 0.64816279, + "epoch": 1.288812785388128, + "grad_norm": 7.34375, + "learning_rate": 3.076560253564279e-06, + "loss": 1.63729172, + "memory(GiB)": 117.38, + "step": 50805, + "train_speed(iter/s)": 1.638288 + }, + { + "acc": 0.65356383, + "epoch": 1.288939624556063, + "grad_norm": 7.46875, + "learning_rate": 3.0755923672616038e-06, + "loss": 1.6340889, + "memory(GiB)": 117.38, + "step": 50810, + "train_speed(iter/s)": 1.638305 + }, + { + "acc": 0.6475235, + "epoch": 1.289066463723998, + "grad_norm": 6.4375, + "learning_rate": 3.074624565602966e-06, + "loss": 1.68175354, + "memory(GiB)": 117.38, + "step": 50815, + "train_speed(iter/s)": 1.638322 + }, + { + "acc": 0.6499897, + "epoch": 1.289193302891933, + "grad_norm": 5.8125, + "learning_rate": 3.073656848630937e-06, + "loss": 1.59709034, + "memory(GiB)": 117.38, + "step": 50820, + "train_speed(iter/s)": 1.638339 + }, + { + "acc": 0.64909759, + "epoch": 1.2893201420598681, + "grad_norm": 5.59375, + "learning_rate": 3.0726892163880784e-06, + "loss": 1.61072235, + "memory(GiB)": 117.38, + "step": 50825, + "train_speed(iter/s)": 1.638356 + }, + { + "acc": 0.65183229, + "epoch": 1.289446981227803, + "grad_norm": 4.90625, + "learning_rate": 3.071721668916956e-06, + "loss": 1.61939106, + "memory(GiB)": 117.38, + "step": 50830, + "train_speed(iter/s)": 1.638372 + }, + { + "acc": 0.67048807, + "epoch": 1.2895738203957383, + "grad_norm": 6.71875, + "learning_rate": 3.0707542062601225e-06, + "loss": 1.52533302, + "memory(GiB)": 117.38, + "step": 50835, + "train_speed(iter/s)": 1.63839 + }, + { + "acc": 0.645679, + "epoch": 1.2897006595636733, + "grad_norm": 5.125, + "learning_rate": 3.0697868284601323e-06, + "loss": 1.6127409, + "memory(GiB)": 117.38, + "step": 50840, + "train_speed(iter/s)": 1.638408 + }, + { + "acc": 0.6747376, + "epoch": 1.2898274987316083, + "grad_norm": 6.34375, + "learning_rate": 3.068819535559534e-06, + "loss": 1.50929184, + "memory(GiB)": 117.38, + "step": 50845, + "train_speed(iter/s)": 1.638425 + }, + { + "acc": 0.65932112, + "epoch": 1.2899543378995433, + "grad_norm": 5.78125, + "learning_rate": 3.0678523276008774e-06, + "loss": 1.56526995, + "memory(GiB)": 117.38, + "step": 50850, + "train_speed(iter/s)": 1.638443 + }, + { + "acc": 0.6631443, + "epoch": 1.2900811770674785, + "grad_norm": 6.21875, + "learning_rate": 3.0668852046267e-06, + "loss": 1.56042929, + "memory(GiB)": 117.38, + "step": 50855, + "train_speed(iter/s)": 1.63846 + }, + { + "acc": 0.65926046, + "epoch": 1.2902080162354135, + "grad_norm": 6.3125, + "learning_rate": 3.0659181666795413e-06, + "loss": 1.56487341, + "memory(GiB)": 117.38, + "step": 50860, + "train_speed(iter/s)": 1.638477 + }, + { + "acc": 0.65164356, + "epoch": 1.2903348554033487, + "grad_norm": 6.0625, + "learning_rate": 3.0649512138019376e-06, + "loss": 1.61396122, + "memory(GiB)": 117.38, + "step": 50865, + "train_speed(iter/s)": 1.638495 + }, + { + "acc": 0.65864949, + "epoch": 1.2904616945712837, + "grad_norm": 5.71875, + "learning_rate": 3.0639843460364203e-06, + "loss": 1.58406963, + "memory(GiB)": 117.38, + "step": 50870, + "train_speed(iter/s)": 1.638241 + }, + { + "acc": 0.64596262, + "epoch": 1.2905885337392187, + "grad_norm": 6.0625, + "learning_rate": 3.0630175634255134e-06, + "loss": 1.66630001, + "memory(GiB)": 117.38, + "step": 50875, + "train_speed(iter/s)": 1.638257 + }, + { + "acc": 0.64331188, + "epoch": 1.2907153729071537, + "grad_norm": 5.34375, + "learning_rate": 3.062050866011742e-06, + "loss": 1.6839241, + "memory(GiB)": 117.38, + "step": 50880, + "train_speed(iter/s)": 1.638274 + }, + { + "acc": 0.646241, + "epoch": 1.2908422120750889, + "grad_norm": 7.71875, + "learning_rate": 3.0610842538376264e-06, + "loss": 1.71417999, + "memory(GiB)": 117.38, + "step": 50885, + "train_speed(iter/s)": 1.638291 + }, + { + "acc": 0.66505051, + "epoch": 1.2909690512430239, + "grad_norm": 5.4375, + "learning_rate": 3.060117726945683e-06, + "loss": 1.60677147, + "memory(GiB)": 117.38, + "step": 50890, + "train_speed(iter/s)": 1.638309 + }, + { + "acc": 0.63203182, + "epoch": 1.2910958904109588, + "grad_norm": 5.53125, + "learning_rate": 3.059151285378421e-06, + "loss": 1.70354424, + "memory(GiB)": 117.38, + "step": 50895, + "train_speed(iter/s)": 1.638327 + }, + { + "acc": 0.66179781, + "epoch": 1.291222729578894, + "grad_norm": 6.0, + "learning_rate": 3.0581849291783518e-06, + "loss": 1.54617586, + "memory(GiB)": 117.38, + "step": 50900, + "train_speed(iter/s)": 1.638345 + }, + { + "acc": 0.65977721, + "epoch": 1.291349568746829, + "grad_norm": 5.90625, + "learning_rate": 3.057218658387977e-06, + "loss": 1.56992302, + "memory(GiB)": 117.38, + "step": 50905, + "train_speed(iter/s)": 1.638363 + }, + { + "acc": 0.65622191, + "epoch": 1.291476407914764, + "grad_norm": 6.96875, + "learning_rate": 3.0562524730498023e-06, + "loss": 1.62710361, + "memory(GiB)": 117.38, + "step": 50910, + "train_speed(iter/s)": 1.638382 + }, + { + "acc": 0.64522934, + "epoch": 1.291603247082699, + "grad_norm": 6.125, + "learning_rate": 3.055286373206321e-06, + "loss": 1.67231064, + "memory(GiB)": 117.38, + "step": 50915, + "train_speed(iter/s)": 1.6384 + }, + { + "acc": 0.66228938, + "epoch": 1.2917300862506342, + "grad_norm": 5.0625, + "learning_rate": 3.054320358900027e-06, + "loss": 1.62173042, + "memory(GiB)": 117.38, + "step": 50920, + "train_speed(iter/s)": 1.638417 + }, + { + "acc": 0.64072666, + "epoch": 1.2918569254185692, + "grad_norm": 5.6875, + "learning_rate": 3.053354430173411e-06, + "loss": 1.66871567, + "memory(GiB)": 117.38, + "step": 50925, + "train_speed(iter/s)": 1.638435 + }, + { + "acc": 0.67633533, + "epoch": 1.2919837645865044, + "grad_norm": 7.8125, + "learning_rate": 3.0523885870689595e-06, + "loss": 1.53640633, + "memory(GiB)": 117.38, + "step": 50930, + "train_speed(iter/s)": 1.638454 + }, + { + "acc": 0.66682391, + "epoch": 1.2921106037544394, + "grad_norm": 6.75, + "learning_rate": 3.051422829629152e-06, + "loss": 1.53191719, + "memory(GiB)": 117.38, + "step": 50935, + "train_speed(iter/s)": 1.638472 + }, + { + "acc": 0.65137725, + "epoch": 1.2922374429223744, + "grad_norm": 5.21875, + "learning_rate": 3.050457157896467e-06, + "loss": 1.6059185, + "memory(GiB)": 117.38, + "step": 50940, + "train_speed(iter/s)": 1.63849 + }, + { + "acc": 0.65863485, + "epoch": 1.2923642820903094, + "grad_norm": 4.75, + "learning_rate": 3.049491571913382e-06, + "loss": 1.56069822, + "memory(GiB)": 117.38, + "step": 50945, + "train_speed(iter/s)": 1.638508 + }, + { + "acc": 0.64616456, + "epoch": 1.2924911212582446, + "grad_norm": 5.40625, + "learning_rate": 3.048526071722367e-06, + "loss": 1.61379967, + "memory(GiB)": 117.38, + "step": 50950, + "train_speed(iter/s)": 1.638527 + }, + { + "acc": 0.65950646, + "epoch": 1.2926179604261796, + "grad_norm": 6.125, + "learning_rate": 3.047560657365886e-06, + "loss": 1.59002857, + "memory(GiB)": 117.38, + "step": 50955, + "train_speed(iter/s)": 1.638546 + }, + { + "acc": 0.63919864, + "epoch": 1.2927447995941146, + "grad_norm": 5.71875, + "learning_rate": 3.046595328886405e-06, + "loss": 1.69314194, + "memory(GiB)": 117.38, + "step": 50960, + "train_speed(iter/s)": 1.638565 + }, + { + "acc": 0.65888104, + "epoch": 1.2928716387620498, + "grad_norm": 5.53125, + "learning_rate": 3.0456300863263815e-06, + "loss": 1.5710393, + "memory(GiB)": 117.38, + "step": 50965, + "train_speed(iter/s)": 1.638583 + }, + { + "acc": 0.65923815, + "epoch": 1.2929984779299848, + "grad_norm": 5.0625, + "learning_rate": 3.044664929728276e-06, + "loss": 1.55587101, + "memory(GiB)": 117.38, + "step": 50970, + "train_speed(iter/s)": 1.638602 + }, + { + "acc": 0.67046094, + "epoch": 1.2931253170979198, + "grad_norm": 6.0625, + "learning_rate": 3.0436998591345336e-06, + "loss": 1.5219985, + "memory(GiB)": 117.38, + "step": 50975, + "train_speed(iter/s)": 1.63862 + }, + { + "acc": 0.64892979, + "epoch": 1.2932521562658548, + "grad_norm": 8.0, + "learning_rate": 3.042734874587607e-06, + "loss": 1.6151371, + "memory(GiB)": 117.38, + "step": 50980, + "train_speed(iter/s)": 1.638638 + }, + { + "acc": 0.64846973, + "epoch": 1.29337899543379, + "grad_norm": 5.46875, + "learning_rate": 3.041769976129938e-06, + "loss": 1.56504183, + "memory(GiB)": 117.38, + "step": 50985, + "train_speed(iter/s)": 1.638656 + }, + { + "acc": 0.66654482, + "epoch": 1.293505834601725, + "grad_norm": 5.375, + "learning_rate": 3.0408051638039697e-06, + "loss": 1.59809179, + "memory(GiB)": 117.38, + "step": 50990, + "train_speed(iter/s)": 1.638672 + }, + { + "acc": 0.63057394, + "epoch": 1.2936326737696602, + "grad_norm": 9.0, + "learning_rate": 3.039840437652137e-06, + "loss": 1.69659061, + "memory(GiB)": 117.38, + "step": 50995, + "train_speed(iter/s)": 1.63869 + }, + { + "acc": 0.65516634, + "epoch": 1.2937595129375952, + "grad_norm": 5.9375, + "learning_rate": 3.0388757977168724e-06, + "loss": 1.63966293, + "memory(GiB)": 117.38, + "step": 51000, + "train_speed(iter/s)": 1.638707 + }, + { + "epoch": 1.2937595129375952, + "eval_acc": 0.6462405269213982, + "eval_loss": 1.573349118232727, + "eval_runtime": 58.8372, + "eval_samples_per_second": 108.265, + "eval_steps_per_second": 27.075, + "step": 51000 + }, + { + "acc": 0.66557379, + "epoch": 1.2938863521055302, + "grad_norm": 6.84375, + "learning_rate": 3.0379112440406066e-06, + "loss": 1.547052, + "memory(GiB)": 117.38, + "step": 51005, + "train_speed(iter/s)": 1.635404 + }, + { + "acc": 0.6597537, + "epoch": 1.2940131912734651, + "grad_norm": 4.96875, + "learning_rate": 3.036946776665766e-06, + "loss": 1.57173166, + "memory(GiB)": 117.38, + "step": 51010, + "train_speed(iter/s)": 1.63542 + }, + { + "acc": 0.65995388, + "epoch": 1.2941400304414004, + "grad_norm": 6.125, + "learning_rate": 3.0359823956347695e-06, + "loss": 1.56989346, + "memory(GiB)": 117.38, + "step": 51015, + "train_speed(iter/s)": 1.635435 + }, + { + "acc": 0.65556507, + "epoch": 1.2942668696093353, + "grad_norm": 4.875, + "learning_rate": 3.0350181009900347e-06, + "loss": 1.68399258, + "memory(GiB)": 117.38, + "step": 51020, + "train_speed(iter/s)": 1.635452 + }, + { + "acc": 0.67168455, + "epoch": 1.2943937087772706, + "grad_norm": 5.6875, + "learning_rate": 3.0340538927739784e-06, + "loss": 1.53910589, + "memory(GiB)": 117.38, + "step": 51025, + "train_speed(iter/s)": 1.635468 + }, + { + "acc": 0.67344546, + "epoch": 1.2945205479452055, + "grad_norm": 5.5, + "learning_rate": 3.0330897710290093e-06, + "loss": 1.60014553, + "memory(GiB)": 117.38, + "step": 51030, + "train_speed(iter/s)": 1.635485 + }, + { + "acc": 0.64574738, + "epoch": 1.2946473871131405, + "grad_norm": 5.4375, + "learning_rate": 3.032125735797532e-06, + "loss": 1.60611382, + "memory(GiB)": 117.38, + "step": 51035, + "train_speed(iter/s)": 1.6355 + }, + { + "acc": 0.64362578, + "epoch": 1.2947742262810755, + "grad_norm": 6.40625, + "learning_rate": 3.031161787121952e-06, + "loss": 1.67409859, + "memory(GiB)": 117.38, + "step": 51040, + "train_speed(iter/s)": 1.635516 + }, + { + "acc": 0.67234669, + "epoch": 1.2949010654490107, + "grad_norm": 4.71875, + "learning_rate": 3.0301979250446655e-06, + "loss": 1.63208065, + "memory(GiB)": 117.38, + "step": 51045, + "train_speed(iter/s)": 1.635533 + }, + { + "acc": 0.68056087, + "epoch": 1.2950279046169457, + "grad_norm": 9.1875, + "learning_rate": 3.029234149608071e-06, + "loss": 1.54029303, + "memory(GiB)": 117.38, + "step": 51050, + "train_speed(iter/s)": 1.635548 + }, + { + "acc": 0.668679, + "epoch": 1.2951547437848807, + "grad_norm": 5.9375, + "learning_rate": 3.0282704608545566e-06, + "loss": 1.55561857, + "memory(GiB)": 117.38, + "step": 51055, + "train_speed(iter/s)": 1.635565 + }, + { + "acc": 0.66406183, + "epoch": 1.295281582952816, + "grad_norm": 5.8125, + "learning_rate": 3.0273068588265097e-06, + "loss": 1.59801769, + "memory(GiB)": 117.38, + "step": 51060, + "train_speed(iter/s)": 1.635581 + }, + { + "acc": 0.64535794, + "epoch": 1.295408422120751, + "grad_norm": 4.84375, + "learning_rate": 3.0263433435663143e-06, + "loss": 1.64406738, + "memory(GiB)": 117.38, + "step": 51065, + "train_speed(iter/s)": 1.635595 + }, + { + "acc": 0.64878621, + "epoch": 1.295535261288686, + "grad_norm": 5.03125, + "learning_rate": 3.0253799151163522e-06, + "loss": 1.63740387, + "memory(GiB)": 117.38, + "step": 51070, + "train_speed(iter/s)": 1.635612 + }, + { + "acc": 0.64827609, + "epoch": 1.295662100456621, + "grad_norm": 5.46875, + "learning_rate": 3.0244165735189967e-06, + "loss": 1.59974203, + "memory(GiB)": 117.38, + "step": 51075, + "train_speed(iter/s)": 1.635628 + }, + { + "acc": 0.64578218, + "epoch": 1.295788939624556, + "grad_norm": 5.40625, + "learning_rate": 3.023453318816619e-06, + "loss": 1.65839882, + "memory(GiB)": 117.38, + "step": 51080, + "train_speed(iter/s)": 1.635643 + }, + { + "acc": 0.64653873, + "epoch": 1.295915778792491, + "grad_norm": 6.625, + "learning_rate": 3.022490151051591e-06, + "loss": 1.58992701, + "memory(GiB)": 117.38, + "step": 51085, + "train_speed(iter/s)": 1.635658 + }, + { + "acc": 0.65987306, + "epoch": 1.2960426179604263, + "grad_norm": 8.3125, + "learning_rate": 3.0215270702662753e-06, + "loss": 1.61896954, + "memory(GiB)": 117.38, + "step": 51090, + "train_speed(iter/s)": 1.635674 + }, + { + "acc": 0.67181969, + "epoch": 1.2961694571283613, + "grad_norm": 5.4375, + "learning_rate": 3.020564076503031e-06, + "loss": 1.50718708, + "memory(GiB)": 117.38, + "step": 51095, + "train_speed(iter/s)": 1.635689 + }, + { + "acc": 0.65677891, + "epoch": 1.2962962962962963, + "grad_norm": 4.9375, + "learning_rate": 3.019601169804216e-06, + "loss": 1.60704212, + "memory(GiB)": 117.38, + "step": 51100, + "train_speed(iter/s)": 1.635705 + }, + { + "acc": 0.66070251, + "epoch": 1.2964231354642313, + "grad_norm": 4.65625, + "learning_rate": 3.018638350212184e-06, + "loss": 1.6213335, + "memory(GiB)": 117.38, + "step": 51105, + "train_speed(iter/s)": 1.635721 + }, + { + "acc": 0.65107613, + "epoch": 1.2965499746321665, + "grad_norm": 5.09375, + "learning_rate": 3.0176756177692845e-06, + "loss": 1.57734585, + "memory(GiB)": 117.38, + "step": 51110, + "train_speed(iter/s)": 1.635738 + }, + { + "acc": 0.65010014, + "epoch": 1.2966768138001015, + "grad_norm": 5.625, + "learning_rate": 3.01671297251786e-06, + "loss": 1.63011723, + "memory(GiB)": 117.38, + "step": 51115, + "train_speed(iter/s)": 1.635754 + }, + { + "acc": 0.6633954, + "epoch": 1.2968036529680365, + "grad_norm": 5.21875, + "learning_rate": 3.0157504145002546e-06, + "loss": 1.56812382, + "memory(GiB)": 117.38, + "step": 51120, + "train_speed(iter/s)": 1.63577 + }, + { + "acc": 0.64187508, + "epoch": 1.2969304921359717, + "grad_norm": 5.625, + "learning_rate": 3.0147879437588046e-06, + "loss": 1.62867241, + "memory(GiB)": 117.38, + "step": 51125, + "train_speed(iter/s)": 1.635786 + }, + { + "acc": 0.65675321, + "epoch": 1.2970573313039067, + "grad_norm": 7.375, + "learning_rate": 3.013825560335845e-06, + "loss": 1.60088959, + "memory(GiB)": 117.38, + "step": 51130, + "train_speed(iter/s)": 1.635803 + }, + { + "acc": 0.65545635, + "epoch": 1.2971841704718416, + "grad_norm": 7.1875, + "learning_rate": 3.0128632642737044e-06, + "loss": 1.56130943, + "memory(GiB)": 117.38, + "step": 51135, + "train_speed(iter/s)": 1.63582 + }, + { + "acc": 0.67316637, + "epoch": 1.2973110096397766, + "grad_norm": 6.9375, + "learning_rate": 3.0119010556147088e-06, + "loss": 1.57268896, + "memory(GiB)": 117.38, + "step": 51140, + "train_speed(iter/s)": 1.635835 + }, + { + "acc": 0.6693522, + "epoch": 1.2974378488077118, + "grad_norm": 5.46875, + "learning_rate": 3.0109389344011813e-06, + "loss": 1.5877161, + "memory(GiB)": 117.38, + "step": 51145, + "train_speed(iter/s)": 1.635851 + }, + { + "acc": 0.65841799, + "epoch": 1.2975646879756468, + "grad_norm": 5.6875, + "learning_rate": 3.0099769006754415e-06, + "loss": 1.57503414, + "memory(GiB)": 117.38, + "step": 51150, + "train_speed(iter/s)": 1.635869 + }, + { + "acc": 0.65028243, + "epoch": 1.297691527143582, + "grad_norm": 5.625, + "learning_rate": 3.0090149544798007e-06, + "loss": 1.5683279, + "memory(GiB)": 117.38, + "step": 51155, + "train_speed(iter/s)": 1.635884 + }, + { + "acc": 0.64297285, + "epoch": 1.297818366311517, + "grad_norm": 6.875, + "learning_rate": 3.0080530958565712e-06, + "loss": 1.63781223, + "memory(GiB)": 117.38, + "step": 51160, + "train_speed(iter/s)": 1.6359 + }, + { + "acc": 0.66097555, + "epoch": 1.297945205479452, + "grad_norm": 6.5625, + "learning_rate": 3.0070913248480602e-06, + "loss": 1.58337421, + "memory(GiB)": 117.38, + "step": 51165, + "train_speed(iter/s)": 1.635916 + }, + { + "acc": 0.66105242, + "epoch": 1.298072044647387, + "grad_norm": 5.4375, + "learning_rate": 3.0061296414965724e-06, + "loss": 1.59135027, + "memory(GiB)": 117.38, + "step": 51170, + "train_speed(iter/s)": 1.635933 + }, + { + "acc": 0.67142873, + "epoch": 1.2981988838153222, + "grad_norm": 4.6875, + "learning_rate": 3.005168045844402e-06, + "loss": 1.58040609, + "memory(GiB)": 117.38, + "step": 51175, + "train_speed(iter/s)": 1.635949 + }, + { + "acc": 0.65379391, + "epoch": 1.2983257229832572, + "grad_norm": 6.0625, + "learning_rate": 3.0042065379338486e-06, + "loss": 1.61228695, + "memory(GiB)": 117.38, + "step": 51180, + "train_speed(iter/s)": 1.635966 + }, + { + "acc": 0.65575447, + "epoch": 1.2984525621511924, + "grad_norm": 6.96875, + "learning_rate": 3.003245117807201e-06, + "loss": 1.61400337, + "memory(GiB)": 117.38, + "step": 51185, + "train_speed(iter/s)": 1.635983 + }, + { + "acc": 0.66946974, + "epoch": 1.2985794013191274, + "grad_norm": 5.3125, + "learning_rate": 3.0022837855067514e-06, + "loss": 1.51799068, + "memory(GiB)": 117.38, + "step": 51190, + "train_speed(iter/s)": 1.636 + }, + { + "acc": 0.6639267, + "epoch": 1.2987062404870624, + "grad_norm": 5.0625, + "learning_rate": 3.0013225410747772e-06, + "loss": 1.53750954, + "memory(GiB)": 117.38, + "step": 51195, + "train_speed(iter/s)": 1.636015 + }, + { + "acc": 0.64466162, + "epoch": 1.2988330796549974, + "grad_norm": 6.09375, + "learning_rate": 3.0003613845535617e-06, + "loss": 1.62723694, + "memory(GiB)": 117.38, + "step": 51200, + "train_speed(iter/s)": 1.636033 + }, + { + "acc": 0.65698643, + "epoch": 1.2989599188229326, + "grad_norm": 6.4375, + "learning_rate": 2.9994003159853793e-06, + "loss": 1.61211662, + "memory(GiB)": 117.38, + "step": 51205, + "train_speed(iter/s)": 1.636049 + }, + { + "acc": 0.65932074, + "epoch": 1.2990867579908676, + "grad_norm": 5.8125, + "learning_rate": 2.998439335412505e-06, + "loss": 1.54120035, + "memory(GiB)": 117.38, + "step": 51210, + "train_speed(iter/s)": 1.636065 + }, + { + "acc": 0.65672312, + "epoch": 1.2992135971588026, + "grad_norm": 4.6875, + "learning_rate": 2.9974784428772043e-06, + "loss": 1.56844044, + "memory(GiB)": 117.38, + "step": 51215, + "train_speed(iter/s)": 1.636079 + }, + { + "acc": 0.66167655, + "epoch": 1.2993404363267378, + "grad_norm": 5.3125, + "learning_rate": 2.996517638421741e-06, + "loss": 1.61957092, + "memory(GiB)": 117.38, + "step": 51220, + "train_speed(iter/s)": 1.636097 + }, + { + "acc": 0.658776, + "epoch": 1.2994672754946728, + "grad_norm": 5.375, + "learning_rate": 2.9955569220883777e-06, + "loss": 1.58248234, + "memory(GiB)": 117.38, + "step": 51225, + "train_speed(iter/s)": 1.636112 + }, + { + "acc": 0.6524682, + "epoch": 1.2995941146626078, + "grad_norm": 6.28125, + "learning_rate": 2.9945962939193718e-06, + "loss": 1.6409996, + "memory(GiB)": 117.38, + "step": 51230, + "train_speed(iter/s)": 1.636128 + }, + { + "acc": 0.66665049, + "epoch": 1.2997209538305428, + "grad_norm": 4.96875, + "learning_rate": 2.9936357539569728e-06, + "loss": 1.54585724, + "memory(GiB)": 117.38, + "step": 51235, + "train_speed(iter/s)": 1.636146 + }, + { + "acc": 0.6669415, + "epoch": 1.299847792998478, + "grad_norm": 6.1875, + "learning_rate": 2.9926753022434306e-06, + "loss": 1.53547039, + "memory(GiB)": 117.38, + "step": 51240, + "train_speed(iter/s)": 1.63616 + }, + { + "acc": 0.67339115, + "epoch": 1.299974632166413, + "grad_norm": 5.25, + "learning_rate": 2.9917149388209908e-06, + "loss": 1.58321886, + "memory(GiB)": 117.38, + "step": 51245, + "train_speed(iter/s)": 1.636175 + }, + { + "acc": 0.65516682, + "epoch": 1.3001014713343482, + "grad_norm": 5.15625, + "learning_rate": 2.9907546637318964e-06, + "loss": 1.6069809, + "memory(GiB)": 117.38, + "step": 51250, + "train_speed(iter/s)": 1.636192 + }, + { + "acc": 0.65986772, + "epoch": 1.3002283105022832, + "grad_norm": 8.25, + "learning_rate": 2.98979447701838e-06, + "loss": 1.60593185, + "memory(GiB)": 117.38, + "step": 51255, + "train_speed(iter/s)": 1.636209 + }, + { + "acc": 0.66070147, + "epoch": 1.3003551496702181, + "grad_norm": 10.1875, + "learning_rate": 2.988834378722679e-06, + "loss": 1.61141033, + "memory(GiB)": 117.38, + "step": 51260, + "train_speed(iter/s)": 1.636227 + }, + { + "acc": 0.65299373, + "epoch": 1.3004819888381531, + "grad_norm": 5.0625, + "learning_rate": 2.9878743688870193e-06, + "loss": 1.58934889, + "memory(GiB)": 117.38, + "step": 51265, + "train_speed(iter/s)": 1.636243 + }, + { + "acc": 0.65360303, + "epoch": 1.3006088280060883, + "grad_norm": 5.03125, + "learning_rate": 2.9869144475536306e-06, + "loss": 1.61109886, + "memory(GiB)": 117.38, + "step": 51270, + "train_speed(iter/s)": 1.63626 + }, + { + "acc": 0.64588294, + "epoch": 1.3007356671740233, + "grad_norm": 5.1875, + "learning_rate": 2.9859546147647316e-06, + "loss": 1.69761391, + "memory(GiB)": 117.38, + "step": 51275, + "train_speed(iter/s)": 1.636276 + }, + { + "acc": 0.650596, + "epoch": 1.3008625063419583, + "grad_norm": 5.4375, + "learning_rate": 2.984994870562541e-06, + "loss": 1.64292259, + "memory(GiB)": 117.38, + "step": 51280, + "train_speed(iter/s)": 1.636291 + }, + { + "acc": 0.65538807, + "epoch": 1.3009893455098935, + "grad_norm": 5.375, + "learning_rate": 2.9840352149892703e-06, + "loss": 1.59412556, + "memory(GiB)": 117.38, + "step": 51285, + "train_speed(iter/s)": 1.636307 + }, + { + "acc": 0.67346067, + "epoch": 1.3011161846778285, + "grad_norm": 5.6875, + "learning_rate": 2.9830756480871343e-06, + "loss": 1.61384506, + "memory(GiB)": 117.38, + "step": 51290, + "train_speed(iter/s)": 1.636323 + }, + { + "acc": 0.65672727, + "epoch": 1.3012430238457635, + "grad_norm": 5.6875, + "learning_rate": 2.9821161698983347e-06, + "loss": 1.62566128, + "memory(GiB)": 117.38, + "step": 51295, + "train_speed(iter/s)": 1.636341 + }, + { + "acc": 0.65628948, + "epoch": 1.3013698630136985, + "grad_norm": 5.78125, + "learning_rate": 2.9811567804650747e-06, + "loss": 1.59102726, + "memory(GiB)": 117.38, + "step": 51300, + "train_speed(iter/s)": 1.636359 + }, + { + "acc": 0.65287066, + "epoch": 1.3014967021816337, + "grad_norm": 5.96875, + "learning_rate": 2.980197479829554e-06, + "loss": 1.59878273, + "memory(GiB)": 117.38, + "step": 51305, + "train_speed(iter/s)": 1.636376 + }, + { + "acc": 0.66711597, + "epoch": 1.3016235413495687, + "grad_norm": 6.65625, + "learning_rate": 2.9792382680339666e-06, + "loss": 1.55396156, + "memory(GiB)": 117.38, + "step": 51310, + "train_speed(iter/s)": 1.636393 + }, + { + "acc": 0.64026814, + "epoch": 1.301750380517504, + "grad_norm": 6.03125, + "learning_rate": 2.9782791451205e-06, + "loss": 1.62613811, + "memory(GiB)": 117.38, + "step": 51315, + "train_speed(iter/s)": 1.636409 + }, + { + "acc": 0.63746824, + "epoch": 1.301877219685439, + "grad_norm": 6.96875, + "learning_rate": 2.9773201111313444e-06, + "loss": 1.66666603, + "memory(GiB)": 117.38, + "step": 51320, + "train_speed(iter/s)": 1.636427 + }, + { + "acc": 0.66756029, + "epoch": 1.302004058853374, + "grad_norm": 6.25, + "learning_rate": 2.9763611661086806e-06, + "loss": 1.58938808, + "memory(GiB)": 117.38, + "step": 51325, + "train_speed(iter/s)": 1.636443 + }, + { + "acc": 0.66998334, + "epoch": 1.3021308980213089, + "grad_norm": 6.34375, + "learning_rate": 2.975402310094689e-06, + "loss": 1.55547972, + "memory(GiB)": 117.38, + "step": 51330, + "train_speed(iter/s)": 1.636461 + }, + { + "acc": 0.64836669, + "epoch": 1.302257737189244, + "grad_norm": 4.71875, + "learning_rate": 2.9744435431315403e-06, + "loss": 1.61106319, + "memory(GiB)": 117.38, + "step": 51335, + "train_speed(iter/s)": 1.636476 + }, + { + "acc": 0.65225735, + "epoch": 1.302384576357179, + "grad_norm": 7.40625, + "learning_rate": 2.9734848652614097e-06, + "loss": 1.69089603, + "memory(GiB)": 117.38, + "step": 51340, + "train_speed(iter/s)": 1.636494 + }, + { + "acc": 0.66204529, + "epoch": 1.3025114155251143, + "grad_norm": 6.46875, + "learning_rate": 2.972526276526461e-06, + "loss": 1.60239105, + "memory(GiB)": 117.38, + "step": 51345, + "train_speed(iter/s)": 1.63651 + }, + { + "acc": 0.66751919, + "epoch": 1.3026382546930493, + "grad_norm": 6.21875, + "learning_rate": 2.971567776968861e-06, + "loss": 1.55322838, + "memory(GiB)": 117.38, + "step": 51350, + "train_speed(iter/s)": 1.636527 + }, + { + "acc": 0.67617989, + "epoch": 1.3027650938609843, + "grad_norm": 5.5, + "learning_rate": 2.9706093666307654e-06, + "loss": 1.56108465, + "memory(GiB)": 117.38, + "step": 51355, + "train_speed(iter/s)": 1.636542 + }, + { + "acc": 0.65380955, + "epoch": 1.3028919330289193, + "grad_norm": 5.25, + "learning_rate": 2.969651045554329e-06, + "loss": 1.57948246, + "memory(GiB)": 117.38, + "step": 51360, + "train_speed(iter/s)": 1.63656 + }, + { + "acc": 0.65414896, + "epoch": 1.3030187721968545, + "grad_norm": 5.3125, + "learning_rate": 2.968692813781706e-06, + "loss": 1.61331062, + "memory(GiB)": 117.38, + "step": 51365, + "train_speed(iter/s)": 1.636576 + }, + { + "acc": 0.6445148, + "epoch": 1.3031456113647895, + "grad_norm": 5.9375, + "learning_rate": 2.9677346713550437e-06, + "loss": 1.61052322, + "memory(GiB)": 117.38, + "step": 51370, + "train_speed(iter/s)": 1.636594 + }, + { + "acc": 0.67251377, + "epoch": 1.3032724505327244, + "grad_norm": 4.1875, + "learning_rate": 2.966776618316482e-06, + "loss": 1.51120205, + "memory(GiB)": 117.38, + "step": 51375, + "train_speed(iter/s)": 1.636611 + }, + { + "acc": 0.64410119, + "epoch": 1.3033992897006597, + "grad_norm": 5.75, + "learning_rate": 2.9658186547081612e-06, + "loss": 1.64027081, + "memory(GiB)": 117.38, + "step": 51380, + "train_speed(iter/s)": 1.636626 + }, + { + "acc": 0.67041311, + "epoch": 1.3035261288685946, + "grad_norm": 6.1875, + "learning_rate": 2.9648607805722197e-06, + "loss": 1.56244469, + "memory(GiB)": 117.38, + "step": 51385, + "train_speed(iter/s)": 1.636642 + }, + { + "acc": 0.65798521, + "epoch": 1.3036529680365296, + "grad_norm": 5.53125, + "learning_rate": 2.963902995950788e-06, + "loss": 1.5554759, + "memory(GiB)": 117.38, + "step": 51390, + "train_speed(iter/s)": 1.636659 + }, + { + "acc": 0.64611349, + "epoch": 1.3037798072044646, + "grad_norm": 6.25, + "learning_rate": 2.962945300885991e-06, + "loss": 1.65860405, + "memory(GiB)": 117.38, + "step": 51395, + "train_speed(iter/s)": 1.636675 + }, + { + "acc": 0.65637007, + "epoch": 1.3039066463723998, + "grad_norm": 6.71875, + "learning_rate": 2.9619876954199557e-06, + "loss": 1.57655602, + "memory(GiB)": 117.38, + "step": 51400, + "train_speed(iter/s)": 1.636692 + }, + { + "acc": 0.67129002, + "epoch": 1.3040334855403348, + "grad_norm": 4.90625, + "learning_rate": 2.9610301795947992e-06, + "loss": 1.54279785, + "memory(GiB)": 117.38, + "step": 51405, + "train_speed(iter/s)": 1.636707 + }, + { + "acc": 0.65228286, + "epoch": 1.30416032470827, + "grad_norm": 6.53125, + "learning_rate": 2.9600727534526417e-06, + "loss": 1.64160862, + "memory(GiB)": 117.38, + "step": 51410, + "train_speed(iter/s)": 1.636724 + }, + { + "acc": 0.65549097, + "epoch": 1.304287163876205, + "grad_norm": 6.25, + "learning_rate": 2.9591154170355895e-06, + "loss": 1.67537041, + "memory(GiB)": 117.38, + "step": 51415, + "train_speed(iter/s)": 1.63674 + }, + { + "acc": 0.64177289, + "epoch": 1.30441400304414, + "grad_norm": 5.5, + "learning_rate": 2.9581581703857545e-06, + "loss": 1.66877747, + "memory(GiB)": 117.38, + "step": 51420, + "train_speed(iter/s)": 1.636757 + }, + { + "acc": 0.65517545, + "epoch": 1.304540842212075, + "grad_norm": 6.15625, + "learning_rate": 2.9572010135452377e-06, + "loss": 1.57466717, + "memory(GiB)": 117.38, + "step": 51425, + "train_speed(iter/s)": 1.636772 + }, + { + "acc": 0.65859098, + "epoch": 1.3046676813800102, + "grad_norm": 8.625, + "learning_rate": 2.9562439465561425e-06, + "loss": 1.63903542, + "memory(GiB)": 117.38, + "step": 51430, + "train_speed(iter/s)": 1.636789 + }, + { + "acc": 0.66389856, + "epoch": 1.3047945205479452, + "grad_norm": 6.40625, + "learning_rate": 2.955286969460563e-06, + "loss": 1.55800343, + "memory(GiB)": 117.38, + "step": 51435, + "train_speed(iter/s)": 1.636804 + }, + { + "acc": 0.64793568, + "epoch": 1.3049213597158802, + "grad_norm": 6.40625, + "learning_rate": 2.9543300823005903e-06, + "loss": 1.67616043, + "memory(GiB)": 117.38, + "step": 51440, + "train_speed(iter/s)": 1.63682 + }, + { + "acc": 0.6390697, + "epoch": 1.3050481988838154, + "grad_norm": 5.65625, + "learning_rate": 2.953373285118315e-06, + "loss": 1.63063946, + "memory(GiB)": 117.38, + "step": 51445, + "train_speed(iter/s)": 1.636836 + }, + { + "acc": 0.65421939, + "epoch": 1.3051750380517504, + "grad_norm": 6.15625, + "learning_rate": 2.9524165779558206e-06, + "loss": 1.65491867, + "memory(GiB)": 117.38, + "step": 51450, + "train_speed(iter/s)": 1.636853 + }, + { + "acc": 0.6562438, + "epoch": 1.3053018772196854, + "grad_norm": 5.78125, + "learning_rate": 2.9514599608551865e-06, + "loss": 1.60461826, + "memory(GiB)": 117.38, + "step": 51455, + "train_speed(iter/s)": 1.636868 + }, + { + "acc": 0.64334412, + "epoch": 1.3054287163876204, + "grad_norm": 5.1875, + "learning_rate": 2.9505034338584882e-06, + "loss": 1.72497978, + "memory(GiB)": 117.38, + "step": 51460, + "train_speed(iter/s)": 1.636884 + }, + { + "acc": 0.67794304, + "epoch": 1.3055555555555556, + "grad_norm": 5.875, + "learning_rate": 2.9495469970078e-06, + "loss": 1.58000813, + "memory(GiB)": 117.38, + "step": 51465, + "train_speed(iter/s)": 1.6369 + }, + { + "acc": 0.64928265, + "epoch": 1.3056823947234906, + "grad_norm": 5.6875, + "learning_rate": 2.9485906503451907e-06, + "loss": 1.66637421, + "memory(GiB)": 117.38, + "step": 51470, + "train_speed(iter/s)": 1.636916 + }, + { + "acc": 0.64091649, + "epoch": 1.3058092338914258, + "grad_norm": 5.21875, + "learning_rate": 2.9476343939127217e-06, + "loss": 1.62044888, + "memory(GiB)": 117.38, + "step": 51475, + "train_speed(iter/s)": 1.636931 + }, + { + "acc": 0.67019229, + "epoch": 1.3059360730593608, + "grad_norm": 5.625, + "learning_rate": 2.9466782277524554e-06, + "loss": 1.48893566, + "memory(GiB)": 117.38, + "step": 51480, + "train_speed(iter/s)": 1.636948 + }, + { + "acc": 0.65354128, + "epoch": 1.3060629122272958, + "grad_norm": 9.5625, + "learning_rate": 2.9457221519064477e-06, + "loss": 1.64324951, + "memory(GiB)": 117.38, + "step": 51485, + "train_speed(iter/s)": 1.636964 + }, + { + "acc": 0.66596785, + "epoch": 1.3061897513952307, + "grad_norm": 5.6875, + "learning_rate": 2.944766166416754e-06, + "loss": 1.57936859, + "memory(GiB)": 117.38, + "step": 51490, + "train_speed(iter/s)": 1.636981 + }, + { + "acc": 0.65529156, + "epoch": 1.306316590563166, + "grad_norm": 5.21875, + "learning_rate": 2.943810271325418e-06, + "loss": 1.59710217, + "memory(GiB)": 117.38, + "step": 51495, + "train_speed(iter/s)": 1.636996 + }, + { + "acc": 0.64954548, + "epoch": 1.306443429731101, + "grad_norm": 5.78125, + "learning_rate": 2.9428544666744873e-06, + "loss": 1.57202291, + "memory(GiB)": 117.38, + "step": 51500, + "train_speed(iter/s)": 1.637012 + }, + { + "acc": 0.63954611, + "epoch": 1.3065702688990362, + "grad_norm": 5.96875, + "learning_rate": 2.9418987525060004e-06, + "loss": 1.74219055, + "memory(GiB)": 117.38, + "step": 51505, + "train_speed(iter/s)": 1.637028 + }, + { + "acc": 0.6643631, + "epoch": 1.3066971080669711, + "grad_norm": 7.1875, + "learning_rate": 2.9409431288619973e-06, + "loss": 1.60146503, + "memory(GiB)": 117.38, + "step": 51510, + "train_speed(iter/s)": 1.637044 + }, + { + "acc": 0.65021467, + "epoch": 1.3068239472349061, + "grad_norm": 5.5, + "learning_rate": 2.939987595784507e-06, + "loss": 1.6500515, + "memory(GiB)": 117.38, + "step": 51515, + "train_speed(iter/s)": 1.637061 + }, + { + "acc": 0.6537488, + "epoch": 1.3069507864028411, + "grad_norm": 5.59375, + "learning_rate": 2.9390321533155585e-06, + "loss": 1.68441391, + "memory(GiB)": 117.38, + "step": 51520, + "train_speed(iter/s)": 1.637076 + }, + { + "acc": 0.66419053, + "epoch": 1.3070776255707763, + "grad_norm": 5.375, + "learning_rate": 2.9380768014971794e-06, + "loss": 1.63850346, + "memory(GiB)": 117.38, + "step": 51525, + "train_speed(iter/s)": 1.637094 + }, + { + "acc": 0.67292109, + "epoch": 1.3072044647387113, + "grad_norm": 6.21875, + "learning_rate": 2.937121540371389e-06, + "loss": 1.52047367, + "memory(GiB)": 117.38, + "step": 51530, + "train_speed(iter/s)": 1.637111 + }, + { + "acc": 0.66494627, + "epoch": 1.3073313039066463, + "grad_norm": 7.5, + "learning_rate": 2.9361663699802007e-06, + "loss": 1.60929298, + "memory(GiB)": 117.38, + "step": 51535, + "train_speed(iter/s)": 1.637126 + }, + { + "acc": 0.65330281, + "epoch": 1.3074581430745815, + "grad_norm": 5.90625, + "learning_rate": 2.9352112903656315e-06, + "loss": 1.59436474, + "memory(GiB)": 117.38, + "step": 51540, + "train_speed(iter/s)": 1.637144 + }, + { + "acc": 0.65280647, + "epoch": 1.3075849822425165, + "grad_norm": 5.15625, + "learning_rate": 2.9342563015696866e-06, + "loss": 1.68462639, + "memory(GiB)": 117.38, + "step": 51545, + "train_speed(iter/s)": 1.637159 + }, + { + "acc": 0.67475739, + "epoch": 1.3077118214104515, + "grad_norm": 6.28125, + "learning_rate": 2.9333014036343765e-06, + "loss": 1.53416767, + "memory(GiB)": 117.38, + "step": 51550, + "train_speed(iter/s)": 1.637177 + }, + { + "acc": 0.65203185, + "epoch": 1.3078386605783865, + "grad_norm": 5.3125, + "learning_rate": 2.932346596601694e-06, + "loss": 1.5928648, + "memory(GiB)": 117.38, + "step": 51555, + "train_speed(iter/s)": 1.637192 + }, + { + "acc": 0.64538231, + "epoch": 1.3079654997463217, + "grad_norm": 5.46875, + "learning_rate": 2.931391880513641e-06, + "loss": 1.5798933, + "memory(GiB)": 117.38, + "step": 51560, + "train_speed(iter/s)": 1.637209 + }, + { + "acc": 0.64531078, + "epoch": 1.3080923389142567, + "grad_norm": 6.28125, + "learning_rate": 2.9304372554122074e-06, + "loss": 1.65907631, + "memory(GiB)": 117.38, + "step": 51565, + "train_speed(iter/s)": 1.637225 + }, + { + "acc": 0.64039068, + "epoch": 1.308219178082192, + "grad_norm": 5.875, + "learning_rate": 2.9294827213393857e-06, + "loss": 1.64682217, + "memory(GiB)": 117.38, + "step": 51570, + "train_speed(iter/s)": 1.637241 + }, + { + "acc": 0.66979671, + "epoch": 1.308346017250127, + "grad_norm": 5.25, + "learning_rate": 2.9285282783371567e-06, + "loss": 1.50388784, + "memory(GiB)": 117.38, + "step": 51575, + "train_speed(iter/s)": 1.637257 + }, + { + "acc": 0.65458665, + "epoch": 1.3084728564180619, + "grad_norm": 6.0, + "learning_rate": 2.9275739264475013e-06, + "loss": 1.57106981, + "memory(GiB)": 117.38, + "step": 51580, + "train_speed(iter/s)": 1.637274 + }, + { + "acc": 0.64749331, + "epoch": 1.3085996955859969, + "grad_norm": 5.5, + "learning_rate": 2.926619665712399e-06, + "loss": 1.5768364, + "memory(GiB)": 117.38, + "step": 51585, + "train_speed(iter/s)": 1.63729 + }, + { + "acc": 0.64772253, + "epoch": 1.308726534753932, + "grad_norm": 7.84375, + "learning_rate": 2.9256654961738217e-06, + "loss": 1.61024647, + "memory(GiB)": 117.38, + "step": 51590, + "train_speed(iter/s)": 1.637306 + }, + { + "acc": 0.66716747, + "epoch": 1.308853373921867, + "grad_norm": 5.5, + "learning_rate": 2.9247114178737356e-06, + "loss": 1.52597733, + "memory(GiB)": 117.38, + "step": 51595, + "train_speed(iter/s)": 1.637322 + }, + { + "acc": 0.65529041, + "epoch": 1.308980213089802, + "grad_norm": 5.03125, + "learning_rate": 2.9237574308541063e-06, + "loss": 1.59420547, + "memory(GiB)": 117.38, + "step": 51600, + "train_speed(iter/s)": 1.637338 + }, + { + "acc": 0.64674397, + "epoch": 1.3091070522577373, + "grad_norm": 5.0, + "learning_rate": 2.9228035351568955e-06, + "loss": 1.63080368, + "memory(GiB)": 117.38, + "step": 51605, + "train_speed(iter/s)": 1.637354 + }, + { + "acc": 0.67508087, + "epoch": 1.3092338914256723, + "grad_norm": 6.90625, + "learning_rate": 2.92184973082406e-06, + "loss": 1.47398796, + "memory(GiB)": 117.38, + "step": 51610, + "train_speed(iter/s)": 1.63737 + }, + { + "acc": 0.66170506, + "epoch": 1.3093607305936072, + "grad_norm": 5.8125, + "learning_rate": 2.920896017897551e-06, + "loss": 1.55876312, + "memory(GiB)": 117.38, + "step": 51615, + "train_speed(iter/s)": 1.637386 + }, + { + "acc": 0.64708872, + "epoch": 1.3094875697615422, + "grad_norm": 4.71875, + "learning_rate": 2.9199423964193176e-06, + "loss": 1.59538851, + "memory(GiB)": 117.38, + "step": 51620, + "train_speed(iter/s)": 1.637402 + }, + { + "acc": 0.65633287, + "epoch": 1.3096144089294774, + "grad_norm": 5.78125, + "learning_rate": 2.9189888664313045e-06, + "loss": 1.60734482, + "memory(GiB)": 117.38, + "step": 51625, + "train_speed(iter/s)": 1.637418 + }, + { + "acc": 0.65228448, + "epoch": 1.3097412480974124, + "grad_norm": 5.40625, + "learning_rate": 2.9180354279754517e-06, + "loss": 1.60209885, + "memory(GiB)": 117.38, + "step": 51630, + "train_speed(iter/s)": 1.637434 + }, + { + "acc": 0.65717149, + "epoch": 1.3098680872653476, + "grad_norm": 7.21875, + "learning_rate": 2.9170820810936968e-06, + "loss": 1.57594357, + "memory(GiB)": 117.38, + "step": 51635, + "train_speed(iter/s)": 1.637449 + }, + { + "acc": 0.65055823, + "epoch": 1.3099949264332826, + "grad_norm": 5.75, + "learning_rate": 2.9161288258279715e-06, + "loss": 1.61533432, + "memory(GiB)": 117.38, + "step": 51640, + "train_speed(iter/s)": 1.637465 + }, + { + "acc": 0.65223579, + "epoch": 1.3101217656012176, + "grad_norm": 6.0625, + "learning_rate": 2.9151756622202037e-06, + "loss": 1.6044939, + "memory(GiB)": 117.38, + "step": 51645, + "train_speed(iter/s)": 1.637481 + }, + { + "acc": 0.65353446, + "epoch": 1.3102486047691526, + "grad_norm": 9.0625, + "learning_rate": 2.914222590312319e-06, + "loss": 1.61331139, + "memory(GiB)": 117.38, + "step": 51650, + "train_speed(iter/s)": 1.637498 + }, + { + "acc": 0.65009022, + "epoch": 1.3103754439370878, + "grad_norm": 8.25, + "learning_rate": 2.9132696101462366e-06, + "loss": 1.60575142, + "memory(GiB)": 117.38, + "step": 51655, + "train_speed(iter/s)": 1.637515 + }, + { + "acc": 0.64684038, + "epoch": 1.3105022831050228, + "grad_norm": 5.96875, + "learning_rate": 2.912316721763874e-06, + "loss": 1.70278587, + "memory(GiB)": 117.38, + "step": 51660, + "train_speed(iter/s)": 1.637532 + }, + { + "acc": 0.66801019, + "epoch": 1.310629122272958, + "grad_norm": 5.78125, + "learning_rate": 2.9113639252071395e-06, + "loss": 1.50198326, + "memory(GiB)": 117.38, + "step": 51665, + "train_speed(iter/s)": 1.637549 + }, + { + "acc": 0.64791107, + "epoch": 1.310755961440893, + "grad_norm": 6.5, + "learning_rate": 2.91041122051795e-06, + "loss": 1.61493797, + "memory(GiB)": 117.38, + "step": 51670, + "train_speed(iter/s)": 1.637565 + }, + { + "acc": 0.64566798, + "epoch": 1.310882800608828, + "grad_norm": 5.0, + "learning_rate": 2.9094586077382016e-06, + "loss": 1.63597908, + "memory(GiB)": 117.38, + "step": 51675, + "train_speed(iter/s)": 1.63758 + }, + { + "acc": 0.65538793, + "epoch": 1.311009639776763, + "grad_norm": 6.78125, + "learning_rate": 2.9085060869097977e-06, + "loss": 1.59805403, + "memory(GiB)": 117.38, + "step": 51680, + "train_speed(iter/s)": 1.637596 + }, + { + "acc": 0.66358232, + "epoch": 1.3111364789446982, + "grad_norm": 5.25, + "learning_rate": 2.907553658074631e-06, + "loss": 1.52724018, + "memory(GiB)": 117.38, + "step": 51685, + "train_speed(iter/s)": 1.63761 + }, + { + "acc": 0.65128002, + "epoch": 1.3112633181126332, + "grad_norm": 5.21875, + "learning_rate": 2.906601321274601e-06, + "loss": 1.66135139, + "memory(GiB)": 117.38, + "step": 51690, + "train_speed(iter/s)": 1.637627 + }, + { + "acc": 0.6644032, + "epoch": 1.3113901572805682, + "grad_norm": 4.9375, + "learning_rate": 2.90564907655159e-06, + "loss": 1.61644211, + "memory(GiB)": 117.38, + "step": 51695, + "train_speed(iter/s)": 1.637643 + }, + { + "acc": 0.66594381, + "epoch": 1.3115169964485034, + "grad_norm": 5.53125, + "learning_rate": 2.9046969239474808e-06, + "loss": 1.53055096, + "memory(GiB)": 117.38, + "step": 51700, + "train_speed(iter/s)": 1.63766 + }, + { + "acc": 0.65734358, + "epoch": 1.3116438356164384, + "grad_norm": 5.25, + "learning_rate": 2.9037448635041574e-06, + "loss": 1.58900261, + "memory(GiB)": 117.38, + "step": 51705, + "train_speed(iter/s)": 1.637677 + }, + { + "acc": 0.6529006, + "epoch": 1.3117706747843734, + "grad_norm": 5.375, + "learning_rate": 2.9027928952634964e-06, + "loss": 1.57910213, + "memory(GiB)": 117.38, + "step": 51710, + "train_speed(iter/s)": 1.637694 + }, + { + "acc": 0.66224136, + "epoch": 1.3118975139523084, + "grad_norm": 5.4375, + "learning_rate": 2.901841019267363e-06, + "loss": 1.53208561, + "memory(GiB)": 117.38, + "step": 51715, + "train_speed(iter/s)": 1.637709 + }, + { + "acc": 0.6627141, + "epoch": 1.3120243531202436, + "grad_norm": 5.65625, + "learning_rate": 2.900889235557631e-06, + "loss": 1.63631554, + "memory(GiB)": 117.38, + "step": 51720, + "train_speed(iter/s)": 1.637727 + }, + { + "acc": 0.6683043, + "epoch": 1.3121511922881786, + "grad_norm": 5.625, + "learning_rate": 2.8999375441761627e-06, + "loss": 1.57904625, + "memory(GiB)": 117.38, + "step": 51725, + "train_speed(iter/s)": 1.637741 + }, + { + "acc": 0.66034756, + "epoch": 1.3122780314561138, + "grad_norm": 5.25, + "learning_rate": 2.8989859451648193e-06, + "loss": 1.54935474, + "memory(GiB)": 117.38, + "step": 51730, + "train_speed(iter/s)": 1.637758 + }, + { + "acc": 0.66374736, + "epoch": 1.3124048706240488, + "grad_norm": 7.125, + "learning_rate": 2.89803443856545e-06, + "loss": 1.50767307, + "memory(GiB)": 117.38, + "step": 51735, + "train_speed(iter/s)": 1.637773 + }, + { + "acc": 0.65610485, + "epoch": 1.3125317097919837, + "grad_norm": 5.28125, + "learning_rate": 2.897083024419913e-06, + "loss": 1.63265266, + "memory(GiB)": 117.38, + "step": 51740, + "train_speed(iter/s)": 1.637789 + }, + { + "acc": 0.65229473, + "epoch": 1.3126585489599187, + "grad_norm": 6.28125, + "learning_rate": 2.8961317027700534e-06, + "loss": 1.58904819, + "memory(GiB)": 117.38, + "step": 51745, + "train_speed(iter/s)": 1.637805 + }, + { + "acc": 0.65377903, + "epoch": 1.312785388127854, + "grad_norm": 6.9375, + "learning_rate": 2.8951804736577148e-06, + "loss": 1.61027641, + "memory(GiB)": 117.38, + "step": 51750, + "train_speed(iter/s)": 1.637821 + }, + { + "acc": 0.67582321, + "epoch": 1.312912227295789, + "grad_norm": 6.09375, + "learning_rate": 2.894229337124736e-06, + "loss": 1.57504196, + "memory(GiB)": 117.38, + "step": 51755, + "train_speed(iter/s)": 1.637836 + }, + { + "acc": 0.63971281, + "epoch": 1.313039066463724, + "grad_norm": 5.5, + "learning_rate": 2.8932782932129524e-06, + "loss": 1.60785809, + "memory(GiB)": 117.38, + "step": 51760, + "train_speed(iter/s)": 1.637852 + }, + { + "acc": 0.6622704, + "epoch": 1.3131659056316591, + "grad_norm": 5.875, + "learning_rate": 2.8923273419641956e-06, + "loss": 1.50102777, + "memory(GiB)": 117.38, + "step": 51765, + "train_speed(iter/s)": 1.637869 + }, + { + "acc": 0.66209106, + "epoch": 1.3132927447995941, + "grad_norm": 6.28125, + "learning_rate": 2.891376483420292e-06, + "loss": 1.58476448, + "memory(GiB)": 117.38, + "step": 51770, + "train_speed(iter/s)": 1.637884 + }, + { + "acc": 0.66455984, + "epoch": 1.3134195839675291, + "grad_norm": 6.28125, + "learning_rate": 2.8904257176230655e-06, + "loss": 1.56132154, + "memory(GiB)": 117.38, + "step": 51775, + "train_speed(iter/s)": 1.637901 + }, + { + "acc": 0.66377859, + "epoch": 1.313546423135464, + "grad_norm": 5.8125, + "learning_rate": 2.8894750446143345e-06, + "loss": 1.5216713, + "memory(GiB)": 117.38, + "step": 51780, + "train_speed(iter/s)": 1.637918 + }, + { + "acc": 0.64870396, + "epoch": 1.3136732623033993, + "grad_norm": 6.53125, + "learning_rate": 2.8885244644359134e-06, + "loss": 1.59811602, + "memory(GiB)": 117.38, + "step": 51785, + "train_speed(iter/s)": 1.637935 + }, + { + "acc": 0.64730048, + "epoch": 1.3138001014713343, + "grad_norm": 6.28125, + "learning_rate": 2.887573977129614e-06, + "loss": 1.649617, + "memory(GiB)": 117.38, + "step": 51790, + "train_speed(iter/s)": 1.637953 + }, + { + "acc": 0.66811953, + "epoch": 1.3139269406392695, + "grad_norm": 6.96875, + "learning_rate": 2.886623582737242e-06, + "loss": 1.58251591, + "memory(GiB)": 117.38, + "step": 51795, + "train_speed(iter/s)": 1.637968 + }, + { + "acc": 0.66126637, + "epoch": 1.3140537798072045, + "grad_norm": 5.5, + "learning_rate": 2.8856732813006007e-06, + "loss": 1.64648476, + "memory(GiB)": 117.38, + "step": 51800, + "train_speed(iter/s)": 1.637985 + }, + { + "acc": 0.65193205, + "epoch": 1.3141806189751395, + "grad_norm": 5.4375, + "learning_rate": 2.8847230728614854e-06, + "loss": 1.66526375, + "memory(GiB)": 117.38, + "step": 51805, + "train_speed(iter/s)": 1.638 + }, + { + "acc": 0.67225418, + "epoch": 1.3143074581430745, + "grad_norm": 5.5, + "learning_rate": 2.883772957461698e-06, + "loss": 1.50958004, + "memory(GiB)": 117.38, + "step": 51810, + "train_speed(iter/s)": 1.638016 + }, + { + "acc": 0.64741755, + "epoch": 1.3144342973110097, + "grad_norm": 6.96875, + "learning_rate": 2.8828229351430224e-06, + "loss": 1.67167568, + "memory(GiB)": 117.38, + "step": 51815, + "train_speed(iter/s)": 1.638033 + }, + { + "acc": 0.66516385, + "epoch": 1.3145611364789447, + "grad_norm": 6.28125, + "learning_rate": 2.881873005947247e-06, + "loss": 1.56833324, + "memory(GiB)": 117.38, + "step": 51820, + "train_speed(iter/s)": 1.638048 + }, + { + "acc": 0.64847803, + "epoch": 1.31468797564688, + "grad_norm": 5.25, + "learning_rate": 2.88092316991615e-06, + "loss": 1.6051199, + "memory(GiB)": 117.38, + "step": 51825, + "train_speed(iter/s)": 1.638064 + }, + { + "acc": 0.66874151, + "epoch": 1.3148148148148149, + "grad_norm": 5.78125, + "learning_rate": 2.879973427091518e-06, + "loss": 1.56490974, + "memory(GiB)": 117.38, + "step": 51830, + "train_speed(iter/s)": 1.638081 + }, + { + "acc": 0.66776762, + "epoch": 1.3149416539827499, + "grad_norm": 7.03125, + "learning_rate": 2.879023777515118e-06, + "loss": 1.54060535, + "memory(GiB)": 117.38, + "step": 51835, + "train_speed(iter/s)": 1.638097 + }, + { + "acc": 0.64110527, + "epoch": 1.3150684931506849, + "grad_norm": 5.5625, + "learning_rate": 2.8780742212287192e-06, + "loss": 1.59418535, + "memory(GiB)": 117.38, + "step": 51840, + "train_speed(iter/s)": 1.638114 + }, + { + "acc": 0.64460273, + "epoch": 1.31519533231862, + "grad_norm": 5.96875, + "learning_rate": 2.8771247582740924e-06, + "loss": 1.63289852, + "memory(GiB)": 117.38, + "step": 51845, + "train_speed(iter/s)": 1.63813 + }, + { + "acc": 0.66308203, + "epoch": 1.315322171486555, + "grad_norm": 5.625, + "learning_rate": 2.876175388692999e-06, + "loss": 1.57990704, + "memory(GiB)": 117.38, + "step": 51850, + "train_speed(iter/s)": 1.638147 + }, + { + "acc": 0.64651594, + "epoch": 1.31544901065449, + "grad_norm": 7.03125, + "learning_rate": 2.875226112527192e-06, + "loss": 1.67886295, + "memory(GiB)": 117.38, + "step": 51855, + "train_speed(iter/s)": 1.638163 + }, + { + "acc": 0.66003923, + "epoch": 1.3155758498224253, + "grad_norm": 5.75, + "learning_rate": 2.8742769298184246e-06, + "loss": 1.6152523, + "memory(GiB)": 117.38, + "step": 51860, + "train_speed(iter/s)": 1.63818 + }, + { + "acc": 0.65803847, + "epoch": 1.3157026889903602, + "grad_norm": 4.875, + "learning_rate": 2.8733278406084507e-06, + "loss": 1.60375576, + "memory(GiB)": 117.38, + "step": 51865, + "train_speed(iter/s)": 1.638195 + }, + { + "acc": 0.66233673, + "epoch": 1.3158295281582952, + "grad_norm": 5.90625, + "learning_rate": 2.872378844939015e-06, + "loss": 1.6214613, + "memory(GiB)": 117.38, + "step": 51870, + "train_speed(iter/s)": 1.638212 + }, + { + "acc": 0.65733314, + "epoch": 1.3159563673262302, + "grad_norm": 5.59375, + "learning_rate": 2.871429942851853e-06, + "loss": 1.66374626, + "memory(GiB)": 117.38, + "step": 51875, + "train_speed(iter/s)": 1.638227 + }, + { + "acc": 0.66757278, + "epoch": 1.3160832064941654, + "grad_norm": 7.0625, + "learning_rate": 2.8704811343887075e-06, + "loss": 1.53596821, + "memory(GiB)": 117.38, + "step": 51880, + "train_speed(iter/s)": 1.638243 + }, + { + "acc": 0.66010361, + "epoch": 1.3162100456621004, + "grad_norm": 6.25, + "learning_rate": 2.86953241959131e-06, + "loss": 1.58630199, + "memory(GiB)": 117.38, + "step": 51885, + "train_speed(iter/s)": 1.638259 + }, + { + "acc": 0.67056479, + "epoch": 1.3163368848300356, + "grad_norm": 5.71875, + "learning_rate": 2.8685837985013874e-06, + "loss": 1.56700478, + "memory(GiB)": 117.38, + "step": 51890, + "train_speed(iter/s)": 1.638275 + }, + { + "acc": 0.66395802, + "epoch": 1.3164637239979706, + "grad_norm": 6.0, + "learning_rate": 2.867635271160666e-06, + "loss": 1.55505075, + "memory(GiB)": 117.38, + "step": 51895, + "train_speed(iter/s)": 1.638291 + }, + { + "acc": 0.64235859, + "epoch": 1.3165905631659056, + "grad_norm": 6.4375, + "learning_rate": 2.8666868376108658e-06, + "loss": 1.71422424, + "memory(GiB)": 117.38, + "step": 51900, + "train_speed(iter/s)": 1.638308 + }, + { + "acc": 0.6542592, + "epoch": 1.3167174023338406, + "grad_norm": 6.46875, + "learning_rate": 2.865738497893703e-06, + "loss": 1.5798089, + "memory(GiB)": 117.38, + "step": 51905, + "train_speed(iter/s)": 1.638324 + }, + { + "acc": 0.64640779, + "epoch": 1.3168442415017758, + "grad_norm": 5.90625, + "learning_rate": 2.8647902520508896e-06, + "loss": 1.576754, + "memory(GiB)": 117.38, + "step": 51910, + "train_speed(iter/s)": 1.638341 + }, + { + "acc": 0.64876933, + "epoch": 1.3169710806697108, + "grad_norm": 5.4375, + "learning_rate": 2.8638421001241346e-06, + "loss": 1.61733742, + "memory(GiB)": 117.38, + "step": 51915, + "train_speed(iter/s)": 1.638357 + }, + { + "acc": 0.65364203, + "epoch": 1.3170979198376458, + "grad_norm": 6.40625, + "learning_rate": 2.8628940421551404e-06, + "loss": 1.6147459, + "memory(GiB)": 117.38, + "step": 51920, + "train_speed(iter/s)": 1.638373 + }, + { + "acc": 0.64499273, + "epoch": 1.317224759005581, + "grad_norm": 5.9375, + "learning_rate": 2.861946078185608e-06, + "loss": 1.66424904, + "memory(GiB)": 117.38, + "step": 51925, + "train_speed(iter/s)": 1.638388 + }, + { + "acc": 0.66102796, + "epoch": 1.317351598173516, + "grad_norm": 5.34375, + "learning_rate": 2.860998208257233e-06, + "loss": 1.56621389, + "memory(GiB)": 117.38, + "step": 51930, + "train_speed(iter/s)": 1.638406 + }, + { + "acc": 0.65145807, + "epoch": 1.317478437341451, + "grad_norm": 5.3125, + "learning_rate": 2.860050432411707e-06, + "loss": 1.654772, + "memory(GiB)": 117.38, + "step": 51935, + "train_speed(iter/s)": 1.638422 + }, + { + "acc": 0.64750786, + "epoch": 1.317605276509386, + "grad_norm": 5.375, + "learning_rate": 2.8591027506907167e-06, + "loss": 1.66390533, + "memory(GiB)": 117.38, + "step": 51940, + "train_speed(iter/s)": 1.638437 + }, + { + "acc": 0.6699873, + "epoch": 1.3177321156773212, + "grad_norm": 5.09375, + "learning_rate": 2.858155163135946e-06, + "loss": 1.59997644, + "memory(GiB)": 117.38, + "step": 51945, + "train_speed(iter/s)": 1.638453 + }, + { + "acc": 0.64035473, + "epoch": 1.3178589548452562, + "grad_norm": 5.375, + "learning_rate": 2.857207669789074e-06, + "loss": 1.64997883, + "memory(GiB)": 117.38, + "step": 51950, + "train_speed(iter/s)": 1.63847 + }, + { + "acc": 0.64072218, + "epoch": 1.3179857940131914, + "grad_norm": 5.40625, + "learning_rate": 2.8562602706917754e-06, + "loss": 1.65395164, + "memory(GiB)": 117.38, + "step": 51955, + "train_speed(iter/s)": 1.638486 + }, + { + "acc": 0.66478128, + "epoch": 1.3181126331811264, + "grad_norm": 5.15625, + "learning_rate": 2.8553129658857215e-06, + "loss": 1.61188622, + "memory(GiB)": 117.38, + "step": 51960, + "train_speed(iter/s)": 1.638503 + }, + { + "acc": 0.6422164, + "epoch": 1.3182394723490614, + "grad_norm": 5.84375, + "learning_rate": 2.854365755412576e-06, + "loss": 1.59453106, + "memory(GiB)": 117.38, + "step": 51965, + "train_speed(iter/s)": 1.638517 + }, + { + "acc": 0.66423454, + "epoch": 1.3183663115169963, + "grad_norm": 6.28125, + "learning_rate": 2.8534186393140083e-06, + "loss": 1.53146667, + "memory(GiB)": 117.38, + "step": 51970, + "train_speed(iter/s)": 1.638534 + }, + { + "acc": 0.65198679, + "epoch": 1.3184931506849316, + "grad_norm": 5.34375, + "learning_rate": 2.8524716176316715e-06, + "loss": 1.55946875, + "memory(GiB)": 117.38, + "step": 51975, + "train_speed(iter/s)": 1.638548 + }, + { + "acc": 0.63573198, + "epoch": 1.3186199898528665, + "grad_norm": 6.84375, + "learning_rate": 2.851524690407218e-06, + "loss": 1.69592514, + "memory(GiB)": 117.38, + "step": 51980, + "train_speed(iter/s)": 1.638565 + }, + { + "acc": 0.65509243, + "epoch": 1.3187468290208018, + "grad_norm": 4.59375, + "learning_rate": 2.8505778576823036e-06, + "loss": 1.6254961, + "memory(GiB)": 117.38, + "step": 51985, + "train_speed(iter/s)": 1.638581 + }, + { + "acc": 0.65358362, + "epoch": 1.3188736681887367, + "grad_norm": 6.5, + "learning_rate": 2.849631119498573e-06, + "loss": 1.65867023, + "memory(GiB)": 117.38, + "step": 51990, + "train_speed(iter/s)": 1.638597 + }, + { + "acc": 0.64058247, + "epoch": 1.3190005073566717, + "grad_norm": 5.21875, + "learning_rate": 2.8486844758976652e-06, + "loss": 1.633815, + "memory(GiB)": 117.38, + "step": 51995, + "train_speed(iter/s)": 1.638613 + }, + { + "acc": 0.66000299, + "epoch": 1.3191273465246067, + "grad_norm": 5.90625, + "learning_rate": 2.8477379269212157e-06, + "loss": 1.57531452, + "memory(GiB)": 117.38, + "step": 52000, + "train_speed(iter/s)": 1.638631 + }, + { + "epoch": 1.3191273465246067, + "eval_acc": 0.6463119469904376, + "eval_loss": 1.5733332633972168, + "eval_runtime": 58.9081, + "eval_samples_per_second": 108.134, + "eval_steps_per_second": 27.042, + "step": 52000 + }, + { + "acc": 0.65874939, + "epoch": 1.319254185692542, + "grad_norm": 5.3125, + "learning_rate": 2.846791472610865e-06, + "loss": 1.58662806, + "memory(GiB)": 117.38, + "step": 52005, + "train_speed(iter/s)": 1.635387 + }, + { + "acc": 0.64676714, + "epoch": 1.319381024860477, + "grad_norm": 6.0, + "learning_rate": 2.845845113008239e-06, + "loss": 1.65532303, + "memory(GiB)": 117.38, + "step": 52010, + "train_speed(iter/s)": 1.635404 + }, + { + "acc": 0.65876184, + "epoch": 1.319507864028412, + "grad_norm": 5.6875, + "learning_rate": 2.84489884815496e-06, + "loss": 1.59165421, + "memory(GiB)": 117.38, + "step": 52015, + "train_speed(iter/s)": 1.635421 + }, + { + "acc": 0.64427605, + "epoch": 1.3196347031963471, + "grad_norm": 5.0625, + "learning_rate": 2.843952678092653e-06, + "loss": 1.57793026, + "memory(GiB)": 117.38, + "step": 52020, + "train_speed(iter/s)": 1.635437 + }, + { + "acc": 0.64241514, + "epoch": 1.3197615423642821, + "grad_norm": 6.3125, + "learning_rate": 2.8430066028629328e-06, + "loss": 1.65505943, + "memory(GiB)": 117.38, + "step": 52025, + "train_speed(iter/s)": 1.635453 + }, + { + "acc": 0.66073937, + "epoch": 1.319888381532217, + "grad_norm": 5.4375, + "learning_rate": 2.842060622507415e-06, + "loss": 1.54876308, + "memory(GiB)": 117.38, + "step": 52030, + "train_speed(iter/s)": 1.63547 + }, + { + "acc": 0.66686354, + "epoch": 1.320015220700152, + "grad_norm": 5.53125, + "learning_rate": 2.841114737067702e-06, + "loss": 1.53359814, + "memory(GiB)": 117.38, + "step": 52035, + "train_speed(iter/s)": 1.635487 + }, + { + "acc": 0.67591119, + "epoch": 1.3201420598680873, + "grad_norm": 5.09375, + "learning_rate": 2.840168946585402e-06, + "loss": 1.49785595, + "memory(GiB)": 117.38, + "step": 52040, + "train_speed(iter/s)": 1.635503 + }, + { + "acc": 0.66978974, + "epoch": 1.3202688990360223, + "grad_norm": 6.15625, + "learning_rate": 2.8392232511021158e-06, + "loss": 1.56766491, + "memory(GiB)": 117.38, + "step": 52045, + "train_speed(iter/s)": 1.635518 + }, + { + "acc": 0.64868822, + "epoch": 1.3203957382039575, + "grad_norm": 7.75, + "learning_rate": 2.8382776506594385e-06, + "loss": 1.64825268, + "memory(GiB)": 117.38, + "step": 52050, + "train_speed(iter/s)": 1.635534 + }, + { + "acc": 0.65092087, + "epoch": 1.3205225773718925, + "grad_norm": 5.75, + "learning_rate": 2.837332145298961e-06, + "loss": 1.60266342, + "memory(GiB)": 117.38, + "step": 52055, + "train_speed(iter/s)": 1.63555 + }, + { + "acc": 0.662743, + "epoch": 1.3206494165398275, + "grad_norm": 7.34375, + "learning_rate": 2.836386735062271e-06, + "loss": 1.61657543, + "memory(GiB)": 117.38, + "step": 52060, + "train_speed(iter/s)": 1.635565 + }, + { + "acc": 0.67379699, + "epoch": 1.3207762557077625, + "grad_norm": 6.03125, + "learning_rate": 2.835441419990953e-06, + "loss": 1.52454882, + "memory(GiB)": 117.38, + "step": 52065, + "train_speed(iter/s)": 1.635581 + }, + { + "acc": 0.64366484, + "epoch": 1.3209030948756977, + "grad_norm": 4.90625, + "learning_rate": 2.834496200126585e-06, + "loss": 1.60783329, + "memory(GiB)": 117.38, + "step": 52070, + "train_speed(iter/s)": 1.635597 + }, + { + "acc": 0.63625898, + "epoch": 1.3210299340436327, + "grad_norm": 6.75, + "learning_rate": 2.8335510755107426e-06, + "loss": 1.69429092, + "memory(GiB)": 117.38, + "step": 52075, + "train_speed(iter/s)": 1.635613 + }, + { + "acc": 0.66968241, + "epoch": 1.3211567732115677, + "grad_norm": 7.3125, + "learning_rate": 2.8326060461849966e-06, + "loss": 1.59520359, + "memory(GiB)": 117.38, + "step": 52080, + "train_speed(iter/s)": 1.635628 + }, + { + "acc": 0.6709352, + "epoch": 1.3212836123795029, + "grad_norm": 7.25, + "learning_rate": 2.8316611121909126e-06, + "loss": 1.57514286, + "memory(GiB)": 117.38, + "step": 52085, + "train_speed(iter/s)": 1.635643 + }, + { + "acc": 0.65509658, + "epoch": 1.3214104515474379, + "grad_norm": 5.34375, + "learning_rate": 2.8307162735700544e-06, + "loss": 1.7069088, + "memory(GiB)": 117.38, + "step": 52090, + "train_speed(iter/s)": 1.635659 + }, + { + "acc": 0.66510782, + "epoch": 1.3215372907153728, + "grad_norm": 5.84375, + "learning_rate": 2.8297715303639796e-06, + "loss": 1.53103218, + "memory(GiB)": 117.38, + "step": 52095, + "train_speed(iter/s)": 1.635675 + }, + { + "acc": 0.64029341, + "epoch": 1.3216641298833078, + "grad_norm": 5.0, + "learning_rate": 2.8288268826142423e-06, + "loss": 1.67757034, + "memory(GiB)": 117.38, + "step": 52100, + "train_speed(iter/s)": 1.635691 + }, + { + "acc": 0.6592268, + "epoch": 1.321790969051243, + "grad_norm": 7.21875, + "learning_rate": 2.8278823303623905e-06, + "loss": 1.60833187, + "memory(GiB)": 117.38, + "step": 52105, + "train_speed(iter/s)": 1.635707 + }, + { + "acc": 0.64504409, + "epoch": 1.321917808219178, + "grad_norm": 5.625, + "learning_rate": 2.8269378736499754e-06, + "loss": 1.65360146, + "memory(GiB)": 117.38, + "step": 52110, + "train_speed(iter/s)": 1.635724 + }, + { + "acc": 0.65946589, + "epoch": 1.3220446473871132, + "grad_norm": 5.84375, + "learning_rate": 2.8259935125185323e-06, + "loss": 1.64948692, + "memory(GiB)": 117.38, + "step": 52115, + "train_speed(iter/s)": 1.635741 + }, + { + "acc": 0.6404623, + "epoch": 1.3221714865550482, + "grad_norm": 6.8125, + "learning_rate": 2.8250492470096008e-06, + "loss": 1.69871597, + "memory(GiB)": 117.38, + "step": 52120, + "train_speed(iter/s)": 1.635757 + }, + { + "acc": 0.65764208, + "epoch": 1.3222983257229832, + "grad_norm": 7.8125, + "learning_rate": 2.824105077164712e-06, + "loss": 1.57595005, + "memory(GiB)": 117.38, + "step": 52125, + "train_speed(iter/s)": 1.635769 + }, + { + "acc": 0.64921436, + "epoch": 1.3224251648909182, + "grad_norm": 5.03125, + "learning_rate": 2.823161003025401e-06, + "loss": 1.62484474, + "memory(GiB)": 117.38, + "step": 52130, + "train_speed(iter/s)": 1.635785 + }, + { + "acc": 0.6612052, + "epoch": 1.3225520040588534, + "grad_norm": 9.8125, + "learning_rate": 2.822217024633186e-06, + "loss": 1.67833023, + "memory(GiB)": 117.38, + "step": 52135, + "train_speed(iter/s)": 1.635801 + }, + { + "acc": 0.65908689, + "epoch": 1.3226788432267884, + "grad_norm": 5.59375, + "learning_rate": 2.821273142029587e-06, + "loss": 1.60943222, + "memory(GiB)": 117.38, + "step": 52140, + "train_speed(iter/s)": 1.635815 + }, + { + "acc": 0.64429984, + "epoch": 1.3228056823947236, + "grad_norm": 6.0, + "learning_rate": 2.820329355256124e-06, + "loss": 1.5831439, + "memory(GiB)": 117.38, + "step": 52145, + "train_speed(iter/s)": 1.635831 + }, + { + "acc": 0.65499277, + "epoch": 1.3229325215626586, + "grad_norm": 5.65625, + "learning_rate": 2.8193856643543106e-06, + "loss": 1.58729792, + "memory(GiB)": 117.38, + "step": 52150, + "train_speed(iter/s)": 1.635846 + }, + { + "acc": 0.65308132, + "epoch": 1.3230593607305936, + "grad_norm": 7.09375, + "learning_rate": 2.8184420693656468e-06, + "loss": 1.62044601, + "memory(GiB)": 117.38, + "step": 52155, + "train_speed(iter/s)": 1.635862 + }, + { + "acc": 0.66029449, + "epoch": 1.3231861998985286, + "grad_norm": 5.15625, + "learning_rate": 2.817498570331643e-06, + "loss": 1.56551914, + "memory(GiB)": 117.38, + "step": 52160, + "train_speed(iter/s)": 1.635878 + }, + { + "acc": 0.66244607, + "epoch": 1.3233130390664638, + "grad_norm": 5.0, + "learning_rate": 2.816555167293795e-06, + "loss": 1.54772625, + "memory(GiB)": 117.38, + "step": 52165, + "train_speed(iter/s)": 1.635894 + }, + { + "acc": 0.65782738, + "epoch": 1.3234398782343988, + "grad_norm": 5.25, + "learning_rate": 2.815611860293603e-06, + "loss": 1.65788116, + "memory(GiB)": 117.38, + "step": 52170, + "train_speed(iter/s)": 1.635909 + }, + { + "acc": 0.65940089, + "epoch": 1.3235667174023338, + "grad_norm": 5.15625, + "learning_rate": 2.814668649372549e-06, + "loss": 1.61913071, + "memory(GiB)": 117.38, + "step": 52175, + "train_speed(iter/s)": 1.635924 + }, + { + "acc": 0.65202379, + "epoch": 1.323693556570269, + "grad_norm": 6.25, + "learning_rate": 2.8137255345721266e-06, + "loss": 1.64482841, + "memory(GiB)": 117.38, + "step": 52180, + "train_speed(iter/s)": 1.635939 + }, + { + "acc": 0.65637221, + "epoch": 1.323820395738204, + "grad_norm": 6.65625, + "learning_rate": 2.8127825159338163e-06, + "loss": 1.63288727, + "memory(GiB)": 117.38, + "step": 52185, + "train_speed(iter/s)": 1.635954 + }, + { + "acc": 0.64874058, + "epoch": 1.323947234906139, + "grad_norm": 5.96875, + "learning_rate": 2.8118395934990962e-06, + "loss": 1.69786911, + "memory(GiB)": 117.38, + "step": 52190, + "train_speed(iter/s)": 1.635968 + }, + { + "acc": 0.63527327, + "epoch": 1.324074074074074, + "grad_norm": 5.53125, + "learning_rate": 2.81089676730944e-06, + "loss": 1.64451103, + "memory(GiB)": 117.38, + "step": 52195, + "train_speed(iter/s)": 1.635982 + }, + { + "acc": 0.66795893, + "epoch": 1.3242009132420092, + "grad_norm": 4.875, + "learning_rate": 2.8099540374063185e-06, + "loss": 1.54161692, + "memory(GiB)": 117.38, + "step": 52200, + "train_speed(iter/s)": 1.635996 + }, + { + "acc": 0.64185085, + "epoch": 1.3243277524099442, + "grad_norm": 4.65625, + "learning_rate": 2.8090114038311956e-06, + "loss": 1.660499, + "memory(GiB)": 117.38, + "step": 52205, + "train_speed(iter/s)": 1.636011 + }, + { + "acc": 0.65227818, + "epoch": 1.3244545915778794, + "grad_norm": 6.125, + "learning_rate": 2.8080688666255328e-06, + "loss": 1.58447628, + "memory(GiB)": 117.38, + "step": 52210, + "train_speed(iter/s)": 1.636026 + }, + { + "acc": 0.6516387, + "epoch": 1.3245814307458144, + "grad_norm": 5.40625, + "learning_rate": 2.8071264258307884e-06, + "loss": 1.58347549, + "memory(GiB)": 117.38, + "step": 52215, + "train_speed(iter/s)": 1.636038 + }, + { + "acc": 0.66553926, + "epoch": 1.3247082699137493, + "grad_norm": 5.1875, + "learning_rate": 2.8061840814884133e-06, + "loss": 1.56262131, + "memory(GiB)": 117.38, + "step": 52220, + "train_speed(iter/s)": 1.636053 + }, + { + "acc": 0.66843071, + "epoch": 1.3248351090816843, + "grad_norm": 4.84375, + "learning_rate": 2.805241833639858e-06, + "loss": 1.57691097, + "memory(GiB)": 117.38, + "step": 52225, + "train_speed(iter/s)": 1.636068 + }, + { + "acc": 0.6546845, + "epoch": 1.3249619482496195, + "grad_norm": 5.09375, + "learning_rate": 2.804299682326565e-06, + "loss": 1.62780075, + "memory(GiB)": 117.38, + "step": 52230, + "train_speed(iter/s)": 1.636082 + }, + { + "acc": 0.65772219, + "epoch": 1.3250887874175545, + "grad_norm": 5.5625, + "learning_rate": 2.8033576275899752e-06, + "loss": 1.62959042, + "memory(GiB)": 117.38, + "step": 52235, + "train_speed(iter/s)": 1.636096 + }, + { + "acc": 0.65651793, + "epoch": 1.3252156265854895, + "grad_norm": 5.125, + "learning_rate": 2.8024156694715242e-06, + "loss": 1.62317276, + "memory(GiB)": 117.38, + "step": 52240, + "train_speed(iter/s)": 1.63611 + }, + { + "acc": 0.64496408, + "epoch": 1.3253424657534247, + "grad_norm": 10.4375, + "learning_rate": 2.8014738080126424e-06, + "loss": 1.64377098, + "memory(GiB)": 117.38, + "step": 52245, + "train_speed(iter/s)": 1.636124 + }, + { + "acc": 0.65751052, + "epoch": 1.3254693049213597, + "grad_norm": 4.46875, + "learning_rate": 2.8005320432547612e-06, + "loss": 1.6021944, + "memory(GiB)": 117.38, + "step": 52250, + "train_speed(iter/s)": 1.636139 + }, + { + "acc": 0.64199843, + "epoch": 1.3255961440892947, + "grad_norm": 6.3125, + "learning_rate": 2.7995903752392993e-06, + "loss": 1.63557053, + "memory(GiB)": 117.38, + "step": 52255, + "train_speed(iter/s)": 1.636154 + }, + { + "acc": 0.67634807, + "epoch": 1.3257229832572297, + "grad_norm": 7.09375, + "learning_rate": 2.7986488040076764e-06, + "loss": 1.50680571, + "memory(GiB)": 117.38, + "step": 52260, + "train_speed(iter/s)": 1.63617 + }, + { + "acc": 0.65127783, + "epoch": 1.325849822425165, + "grad_norm": 7.1875, + "learning_rate": 2.797707329601306e-06, + "loss": 1.66753712, + "memory(GiB)": 117.38, + "step": 52265, + "train_speed(iter/s)": 1.636185 + }, + { + "acc": 0.654738, + "epoch": 1.3259766615931, + "grad_norm": 7.28125, + "learning_rate": 2.7967659520616032e-06, + "loss": 1.60343819, + "memory(GiB)": 117.38, + "step": 52270, + "train_speed(iter/s)": 1.636201 + }, + { + "acc": 0.66296921, + "epoch": 1.3261035007610351, + "grad_norm": 5.09375, + "learning_rate": 2.7958246714299685e-06, + "loss": 1.48076677, + "memory(GiB)": 117.38, + "step": 52275, + "train_speed(iter/s)": 1.636215 + }, + { + "acc": 0.64425993, + "epoch": 1.32623033992897, + "grad_norm": 4.84375, + "learning_rate": 2.7948834877478035e-06, + "loss": 1.64650574, + "memory(GiB)": 117.38, + "step": 52280, + "train_speed(iter/s)": 1.636231 + }, + { + "acc": 0.63907843, + "epoch": 1.326357179096905, + "grad_norm": 6.8125, + "learning_rate": 2.7939424010565107e-06, + "loss": 1.67418308, + "memory(GiB)": 117.38, + "step": 52285, + "train_speed(iter/s)": 1.636246 + }, + { + "acc": 0.66446128, + "epoch": 1.32648401826484, + "grad_norm": 5.4375, + "learning_rate": 2.793001411397482e-06, + "loss": 1.63926468, + "memory(GiB)": 117.38, + "step": 52290, + "train_speed(iter/s)": 1.636262 + }, + { + "acc": 0.65369778, + "epoch": 1.3266108574327753, + "grad_norm": 6.8125, + "learning_rate": 2.792060518812103e-06, + "loss": 1.65218544, + "memory(GiB)": 117.38, + "step": 52295, + "train_speed(iter/s)": 1.636278 + }, + { + "acc": 0.66207428, + "epoch": 1.3267376966007103, + "grad_norm": 6.46875, + "learning_rate": 2.7911197233417574e-06, + "loss": 1.60361862, + "memory(GiB)": 117.38, + "step": 52300, + "train_speed(iter/s)": 1.636293 + }, + { + "acc": 0.65392046, + "epoch": 1.3268645357686455, + "grad_norm": 6.125, + "learning_rate": 2.790179025027831e-06, + "loss": 1.62812309, + "memory(GiB)": 117.38, + "step": 52305, + "train_speed(iter/s)": 1.63631 + }, + { + "acc": 0.64088154, + "epoch": 1.3269913749365805, + "grad_norm": 6.5625, + "learning_rate": 2.789238423911699e-06, + "loss": 1.64108238, + "memory(GiB)": 117.38, + "step": 52310, + "train_speed(iter/s)": 1.636328 + }, + { + "acc": 0.66546278, + "epoch": 1.3271182141045155, + "grad_norm": 5.5625, + "learning_rate": 2.788297920034727e-06, + "loss": 1.53426723, + "memory(GiB)": 117.38, + "step": 52315, + "train_speed(iter/s)": 1.636344 + }, + { + "acc": 0.66275949, + "epoch": 1.3272450532724505, + "grad_norm": 6.5, + "learning_rate": 2.78735751343829e-06, + "loss": 1.59943008, + "memory(GiB)": 117.38, + "step": 52320, + "train_speed(iter/s)": 1.636362 + }, + { + "acc": 0.65351934, + "epoch": 1.3273718924403857, + "grad_norm": 6.1875, + "learning_rate": 2.786417204163748e-06, + "loss": 1.6237751, + "memory(GiB)": 117.38, + "step": 52325, + "train_speed(iter/s)": 1.636378 + }, + { + "acc": 0.64910564, + "epoch": 1.3274987316083207, + "grad_norm": 4.9375, + "learning_rate": 2.7854769922524593e-06, + "loss": 1.59807625, + "memory(GiB)": 117.38, + "step": 52330, + "train_speed(iter/s)": 1.636395 + }, + { + "acc": 0.6614953, + "epoch": 1.3276255707762556, + "grad_norm": 5.65625, + "learning_rate": 2.7845368777457803e-06, + "loss": 1.61577263, + "memory(GiB)": 117.38, + "step": 52335, + "train_speed(iter/s)": 1.636412 + }, + { + "acc": 0.65810719, + "epoch": 1.3277524099441909, + "grad_norm": 5.40625, + "learning_rate": 2.7835968606850616e-06, + "loss": 1.60805588, + "memory(GiB)": 117.38, + "step": 52340, + "train_speed(iter/s)": 1.636428 + }, + { + "acc": 0.64208531, + "epoch": 1.3278792491121258, + "grad_norm": 7.03125, + "learning_rate": 2.782656941111648e-06, + "loss": 1.63031311, + "memory(GiB)": 117.38, + "step": 52345, + "train_speed(iter/s)": 1.636446 + }, + { + "acc": 0.67260337, + "epoch": 1.3280060882800608, + "grad_norm": 8.0625, + "learning_rate": 2.7817171190668812e-06, + "loss": 1.49973927, + "memory(GiB)": 117.38, + "step": 52350, + "train_speed(iter/s)": 1.636464 + }, + { + "acc": 0.65336275, + "epoch": 1.3281329274479958, + "grad_norm": 6.34375, + "learning_rate": 2.7807773945921e-06, + "loss": 1.60625591, + "memory(GiB)": 117.38, + "step": 52355, + "train_speed(iter/s)": 1.636481 + }, + { + "acc": 0.66692867, + "epoch": 1.328259766615931, + "grad_norm": 5.28125, + "learning_rate": 2.7798377677286363e-06, + "loss": 1.57732439, + "memory(GiB)": 117.38, + "step": 52360, + "train_speed(iter/s)": 1.636497 + }, + { + "acc": 0.66101351, + "epoch": 1.328386605783866, + "grad_norm": 5.375, + "learning_rate": 2.778898238517821e-06, + "loss": 1.53063793, + "memory(GiB)": 117.38, + "step": 52365, + "train_speed(iter/s)": 1.636512 + }, + { + "acc": 0.65004921, + "epoch": 1.3285134449518012, + "grad_norm": 5.46875, + "learning_rate": 2.7779588070009767e-06, + "loss": 1.58471775, + "memory(GiB)": 117.38, + "step": 52370, + "train_speed(iter/s)": 1.636528 + }, + { + "acc": 0.66250114, + "epoch": 1.3286402841197362, + "grad_norm": 5.5, + "learning_rate": 2.7770194732194256e-06, + "loss": 1.57993488, + "memory(GiB)": 117.38, + "step": 52375, + "train_speed(iter/s)": 1.636545 + }, + { + "acc": 0.65336142, + "epoch": 1.3287671232876712, + "grad_norm": 4.78125, + "learning_rate": 2.7760802372144825e-06, + "loss": 1.65320168, + "memory(GiB)": 117.38, + "step": 52380, + "train_speed(iter/s)": 1.636563 + }, + { + "acc": 0.66621313, + "epoch": 1.3288939624556062, + "grad_norm": 6.65625, + "learning_rate": 2.7751410990274596e-06, + "loss": 1.56870127, + "memory(GiB)": 117.38, + "step": 52385, + "train_speed(iter/s)": 1.636579 + }, + { + "acc": 0.65803304, + "epoch": 1.3290208016235414, + "grad_norm": 6.625, + "learning_rate": 2.774202058699664e-06, + "loss": 1.53034534, + "memory(GiB)": 117.38, + "step": 52390, + "train_speed(iter/s)": 1.636596 + }, + { + "acc": 0.6521492, + "epoch": 1.3291476407914764, + "grad_norm": 4.96875, + "learning_rate": 2.7732631162724005e-06, + "loss": 1.57482338, + "memory(GiB)": 117.38, + "step": 52395, + "train_speed(iter/s)": 1.636611 + }, + { + "acc": 0.65294418, + "epoch": 1.3292744799594114, + "grad_norm": 4.59375, + "learning_rate": 2.772324271786966e-06, + "loss": 1.61312847, + "memory(GiB)": 117.38, + "step": 52400, + "train_speed(iter/s)": 1.636627 + }, + { + "acc": 0.65981736, + "epoch": 1.3294013191273466, + "grad_norm": 6.0, + "learning_rate": 2.7713855252846545e-06, + "loss": 1.58860378, + "memory(GiB)": 117.38, + "step": 52405, + "train_speed(iter/s)": 1.636643 + }, + { + "acc": 0.66145015, + "epoch": 1.3295281582952816, + "grad_norm": 7.4375, + "learning_rate": 2.7704468768067616e-06, + "loss": 1.55884542, + "memory(GiB)": 117.38, + "step": 52410, + "train_speed(iter/s)": 1.63666 + }, + { + "acc": 0.64659901, + "epoch": 1.3296549974632166, + "grad_norm": 7.34375, + "learning_rate": 2.7695083263945664e-06, + "loss": 1.66289177, + "memory(GiB)": 117.38, + "step": 52415, + "train_speed(iter/s)": 1.636676 + }, + { + "acc": 0.64792838, + "epoch": 1.3297818366311516, + "grad_norm": 6.09375, + "learning_rate": 2.7685698740893516e-06, + "loss": 1.68667831, + "memory(GiB)": 117.38, + "step": 52420, + "train_speed(iter/s)": 1.636694 + }, + { + "acc": 0.65250673, + "epoch": 1.3299086757990868, + "grad_norm": 6.28125, + "learning_rate": 2.7676315199323995e-06, + "loss": 1.5916996, + "memory(GiB)": 117.38, + "step": 52425, + "train_speed(iter/s)": 1.636711 + }, + { + "acc": 0.6755506, + "epoch": 1.3300355149670218, + "grad_norm": 5.8125, + "learning_rate": 2.7666932639649814e-06, + "loss": 1.50242176, + "memory(GiB)": 117.38, + "step": 52430, + "train_speed(iter/s)": 1.636729 + }, + { + "acc": 0.64364338, + "epoch": 1.330162354134957, + "grad_norm": 5.375, + "learning_rate": 2.765755106228362e-06, + "loss": 1.60989609, + "memory(GiB)": 117.38, + "step": 52435, + "train_speed(iter/s)": 1.636745 + }, + { + "acc": 0.64933214, + "epoch": 1.330289193302892, + "grad_norm": 6.1875, + "learning_rate": 2.764817046763807e-06, + "loss": 1.59384193, + "memory(GiB)": 117.38, + "step": 52440, + "train_speed(iter/s)": 1.636762 + }, + { + "acc": 0.6595407, + "epoch": 1.330416032470827, + "grad_norm": 5.78125, + "learning_rate": 2.7638790856125786e-06, + "loss": 1.64224262, + "memory(GiB)": 117.38, + "step": 52445, + "train_speed(iter/s)": 1.636779 + }, + { + "acc": 0.65388603, + "epoch": 1.330542871638762, + "grad_norm": 5.84375, + "learning_rate": 2.7629412228159346e-06, + "loss": 1.64019585, + "memory(GiB)": 117.38, + "step": 52450, + "train_speed(iter/s)": 1.636795 + }, + { + "acc": 0.64686003, + "epoch": 1.3306697108066972, + "grad_norm": 7.28125, + "learning_rate": 2.762003458415119e-06, + "loss": 1.65340157, + "memory(GiB)": 117.38, + "step": 52455, + "train_speed(iter/s)": 1.636813 + }, + { + "acc": 0.66051583, + "epoch": 1.3307965499746321, + "grad_norm": 5.375, + "learning_rate": 2.7610657924513853e-06, + "loss": 1.54742651, + "memory(GiB)": 117.38, + "step": 52460, + "train_speed(iter/s)": 1.63683 + }, + { + "acc": 0.66580324, + "epoch": 1.3309233891425674, + "grad_norm": 5.84375, + "learning_rate": 2.7601282249659737e-06, + "loss": 1.61788673, + "memory(GiB)": 117.38, + "step": 52465, + "train_speed(iter/s)": 1.636846 + }, + { + "acc": 0.64221544, + "epoch": 1.3310502283105023, + "grad_norm": 5.90625, + "learning_rate": 2.759190756000126e-06, + "loss": 1.67348537, + "memory(GiB)": 117.38, + "step": 52470, + "train_speed(iter/s)": 1.636862 + }, + { + "acc": 0.64889803, + "epoch": 1.3311770674784373, + "grad_norm": 6.9375, + "learning_rate": 2.7582533855950687e-06, + "loss": 1.63913956, + "memory(GiB)": 117.38, + "step": 52475, + "train_speed(iter/s)": 1.636878 + }, + { + "acc": 0.6537281, + "epoch": 1.3313039066463723, + "grad_norm": 5.34375, + "learning_rate": 2.757316113792038e-06, + "loss": 1.63948975, + "memory(GiB)": 117.38, + "step": 52480, + "train_speed(iter/s)": 1.636896 + }, + { + "acc": 0.63775034, + "epoch": 1.3314307458143075, + "grad_norm": 4.8125, + "learning_rate": 2.756378940632258e-06, + "loss": 1.65937729, + "memory(GiB)": 117.38, + "step": 52485, + "train_speed(iter/s)": 1.636914 + }, + { + "acc": 0.64764094, + "epoch": 1.3315575849822425, + "grad_norm": 6.71875, + "learning_rate": 2.755441866156949e-06, + "loss": 1.65567741, + "memory(GiB)": 117.38, + "step": 52490, + "train_speed(iter/s)": 1.63693 + }, + { + "acc": 0.66028323, + "epoch": 1.3316844241501775, + "grad_norm": 5.40625, + "learning_rate": 2.7545048904073278e-06, + "loss": 1.62102776, + "memory(GiB)": 117.38, + "step": 52495, + "train_speed(iter/s)": 1.636947 + }, + { + "acc": 0.66409416, + "epoch": 1.3318112633181127, + "grad_norm": 5.53125, + "learning_rate": 2.7535680134246067e-06, + "loss": 1.57515907, + "memory(GiB)": 117.38, + "step": 52500, + "train_speed(iter/s)": 1.636964 + }, + { + "acc": 0.67666497, + "epoch": 1.3319381024860477, + "grad_norm": 5.15625, + "learning_rate": 2.752631235249995e-06, + "loss": 1.61476402, + "memory(GiB)": 117.38, + "step": 52505, + "train_speed(iter/s)": 1.63698 + }, + { + "acc": 0.66616154, + "epoch": 1.3320649416539827, + "grad_norm": 8.1875, + "learning_rate": 2.7516945559246945e-06, + "loss": 1.49781132, + "memory(GiB)": 117.38, + "step": 52510, + "train_speed(iter/s)": 1.636996 + }, + { + "acc": 0.66098967, + "epoch": 1.3321917808219177, + "grad_norm": 5.5, + "learning_rate": 2.7507579754899053e-06, + "loss": 1.57758045, + "memory(GiB)": 117.38, + "step": 52515, + "train_speed(iter/s)": 1.637014 + }, + { + "acc": 0.67135019, + "epoch": 1.332318619989853, + "grad_norm": 6.15625, + "learning_rate": 2.749821493986823e-06, + "loss": 1.50583439, + "memory(GiB)": 117.38, + "step": 52520, + "train_speed(iter/s)": 1.637031 + }, + { + "acc": 0.66961088, + "epoch": 1.332445459157788, + "grad_norm": 6.4375, + "learning_rate": 2.748885111456637e-06, + "loss": 1.60621929, + "memory(GiB)": 117.38, + "step": 52525, + "train_speed(iter/s)": 1.637049 + }, + { + "acc": 0.66691132, + "epoch": 1.332572298325723, + "grad_norm": 6.375, + "learning_rate": 2.7479488279405354e-06, + "loss": 1.57460241, + "memory(GiB)": 117.38, + "step": 52530, + "train_speed(iter/s)": 1.637067 + }, + { + "acc": 0.65898952, + "epoch": 1.332699137493658, + "grad_norm": 6.15625, + "learning_rate": 2.7470126434796984e-06, + "loss": 1.57970161, + "memory(GiB)": 117.38, + "step": 52535, + "train_speed(iter/s)": 1.637085 + }, + { + "acc": 0.63932762, + "epoch": 1.332825976661593, + "grad_norm": 5.625, + "learning_rate": 2.746076558115304e-06, + "loss": 1.62135773, + "memory(GiB)": 117.38, + "step": 52540, + "train_speed(iter/s)": 1.6371 + }, + { + "acc": 0.66982231, + "epoch": 1.332952815829528, + "grad_norm": 5.375, + "learning_rate": 2.7451405718885237e-06, + "loss": 1.54981756, + "memory(GiB)": 117.38, + "step": 52545, + "train_speed(iter/s)": 1.637117 + }, + { + "acc": 0.65233712, + "epoch": 1.3330796549974633, + "grad_norm": 5.625, + "learning_rate": 2.7442046848405328e-06, + "loss": 1.5745225, + "memory(GiB)": 117.38, + "step": 52550, + "train_speed(iter/s)": 1.637134 + }, + { + "acc": 0.66620231, + "epoch": 1.3332064941653983, + "grad_norm": 5.15625, + "learning_rate": 2.743268897012489e-06, + "loss": 1.54645576, + "memory(GiB)": 117.38, + "step": 52555, + "train_speed(iter/s)": 1.637151 + }, + { + "acc": 0.65001745, + "epoch": 1.3333333333333333, + "grad_norm": 6.0, + "learning_rate": 2.7423332084455543e-06, + "loss": 1.62158928, + "memory(GiB)": 117.38, + "step": 52560, + "train_speed(iter/s)": 1.637168 + }, + { + "acc": 0.67157116, + "epoch": 1.3334601725012685, + "grad_norm": 5.78125, + "learning_rate": 2.741397619180883e-06, + "loss": 1.49711065, + "memory(GiB)": 117.38, + "step": 52565, + "train_speed(iter/s)": 1.637186 + }, + { + "acc": 0.64873924, + "epoch": 1.3335870116692035, + "grad_norm": 5.5, + "learning_rate": 2.740462129259633e-06, + "loss": 1.65579453, + "memory(GiB)": 117.38, + "step": 52570, + "train_speed(iter/s)": 1.637204 + }, + { + "acc": 0.65199451, + "epoch": 1.3337138508371384, + "grad_norm": 5.28125, + "learning_rate": 2.739526738722944e-06, + "loss": 1.66223907, + "memory(GiB)": 117.38, + "step": 52575, + "train_speed(iter/s)": 1.637221 + }, + { + "acc": 0.65989714, + "epoch": 1.3338406900050734, + "grad_norm": 5.3125, + "learning_rate": 2.738591447611959e-06, + "loss": 1.54689894, + "memory(GiB)": 117.38, + "step": 52580, + "train_speed(iter/s)": 1.637238 + }, + { + "acc": 0.6383184, + "epoch": 1.3339675291730086, + "grad_norm": 5.78125, + "learning_rate": 2.7376562559678214e-06, + "loss": 1.65324364, + "memory(GiB)": 117.38, + "step": 52585, + "train_speed(iter/s)": 1.637255 + }, + { + "acc": 0.65521321, + "epoch": 1.3340943683409436, + "grad_norm": 6.53125, + "learning_rate": 2.7367211638316637e-06, + "loss": 1.64562035, + "memory(GiB)": 117.38, + "step": 52590, + "train_speed(iter/s)": 1.637272 + }, + { + "acc": 0.65895061, + "epoch": 1.3342212075088788, + "grad_norm": 5.0625, + "learning_rate": 2.735786171244611e-06, + "loss": 1.60423737, + "memory(GiB)": 117.38, + "step": 52595, + "train_speed(iter/s)": 1.63729 + }, + { + "acc": 0.64557638, + "epoch": 1.3343480466768138, + "grad_norm": 5.4375, + "learning_rate": 2.7348512782477922e-06, + "loss": 1.61355, + "memory(GiB)": 117.38, + "step": 52600, + "train_speed(iter/s)": 1.637306 + }, + { + "acc": 0.66050453, + "epoch": 1.3344748858447488, + "grad_norm": 7.15625, + "learning_rate": 2.7339164848823287e-06, + "loss": 1.58573818, + "memory(GiB)": 117.38, + "step": 52605, + "train_speed(iter/s)": 1.637323 + }, + { + "acc": 0.65355721, + "epoch": 1.3346017250126838, + "grad_norm": 5.78125, + "learning_rate": 2.7329817911893365e-06, + "loss": 1.63428097, + "memory(GiB)": 117.38, + "step": 52610, + "train_speed(iter/s)": 1.637341 + }, + { + "acc": 0.65812111, + "epoch": 1.334728564180619, + "grad_norm": 6.09375, + "learning_rate": 2.7320471972099226e-06, + "loss": 1.58594646, + "memory(GiB)": 117.38, + "step": 52615, + "train_speed(iter/s)": 1.637358 + }, + { + "acc": 0.6591898, + "epoch": 1.334855403348554, + "grad_norm": 5.5625, + "learning_rate": 2.7311127029852007e-06, + "loss": 1.59654484, + "memory(GiB)": 117.38, + "step": 52620, + "train_speed(iter/s)": 1.637376 + }, + { + "acc": 0.64393187, + "epoch": 1.3349822425164892, + "grad_norm": 5.40625, + "learning_rate": 2.7301783085562726e-06, + "loss": 1.65579853, + "memory(GiB)": 117.38, + "step": 52625, + "train_speed(iter/s)": 1.637393 + }, + { + "acc": 0.65317993, + "epoch": 1.3351090816844242, + "grad_norm": 5.59375, + "learning_rate": 2.7292440139642364e-06, + "loss": 1.60785484, + "memory(GiB)": 117.38, + "step": 52630, + "train_speed(iter/s)": 1.637409 + }, + { + "acc": 0.64856644, + "epoch": 1.3352359208523592, + "grad_norm": 5.96875, + "learning_rate": 2.7283098192501855e-06, + "loss": 1.59053631, + "memory(GiB)": 117.38, + "step": 52635, + "train_speed(iter/s)": 1.637425 + }, + { + "acc": 0.66258683, + "epoch": 1.3353627600202942, + "grad_norm": 6.0, + "learning_rate": 2.7273757244552124e-06, + "loss": 1.53788548, + "memory(GiB)": 117.38, + "step": 52640, + "train_speed(iter/s)": 1.637442 + }, + { + "acc": 0.65966253, + "epoch": 1.3354895991882294, + "grad_norm": 5.40625, + "learning_rate": 2.726441729620401e-06, + "loss": 1.56639271, + "memory(GiB)": 117.38, + "step": 52645, + "train_speed(iter/s)": 1.637458 + }, + { + "acc": 0.66232243, + "epoch": 1.3356164383561644, + "grad_norm": 5.5, + "learning_rate": 2.725507834786833e-06, + "loss": 1.65225945, + "memory(GiB)": 117.38, + "step": 52650, + "train_speed(iter/s)": 1.637477 + }, + { + "acc": 0.6623137, + "epoch": 1.3357432775240994, + "grad_norm": 6.71875, + "learning_rate": 2.7245740399955857e-06, + "loss": 1.59502268, + "memory(GiB)": 117.38, + "step": 52655, + "train_speed(iter/s)": 1.637494 + }, + { + "acc": 0.65993319, + "epoch": 1.3358701166920346, + "grad_norm": 5.375, + "learning_rate": 2.72364034528773e-06, + "loss": 1.58483095, + "memory(GiB)": 117.38, + "step": 52660, + "train_speed(iter/s)": 1.637512 + }, + { + "acc": 0.65926423, + "epoch": 1.3359969558599696, + "grad_norm": 5.625, + "learning_rate": 2.722706750704337e-06, + "loss": 1.59527245, + "memory(GiB)": 117.38, + "step": 52665, + "train_speed(iter/s)": 1.637528 + }, + { + "acc": 0.65151391, + "epoch": 1.3361237950279046, + "grad_norm": 7.1875, + "learning_rate": 2.7217732562864673e-06, + "loss": 1.61700287, + "memory(GiB)": 117.38, + "step": 52670, + "train_speed(iter/s)": 1.637545 + }, + { + "acc": 0.64332972, + "epoch": 1.3362506341958396, + "grad_norm": 6.9375, + "learning_rate": 2.720839862075181e-06, + "loss": 1.65035, + "memory(GiB)": 117.38, + "step": 52675, + "train_speed(iter/s)": 1.637561 + }, + { + "acc": 0.66069026, + "epoch": 1.3363774733637748, + "grad_norm": 5.0625, + "learning_rate": 2.7199065681115344e-06, + "loss": 1.61837101, + "memory(GiB)": 117.38, + "step": 52680, + "train_speed(iter/s)": 1.637577 + }, + { + "acc": 0.66102395, + "epoch": 1.3365043125317098, + "grad_norm": 6.1875, + "learning_rate": 2.7189733744365742e-06, + "loss": 1.60382118, + "memory(GiB)": 117.38, + "step": 52685, + "train_speed(iter/s)": 1.637594 + }, + { + "acc": 0.66155758, + "epoch": 1.336631151699645, + "grad_norm": 5.625, + "learning_rate": 2.718040281091353e-06, + "loss": 1.52586889, + "memory(GiB)": 117.38, + "step": 52690, + "train_speed(iter/s)": 1.637611 + }, + { + "acc": 0.65605564, + "epoch": 1.33675799086758, + "grad_norm": 5.53125, + "learning_rate": 2.717107288116906e-06, + "loss": 1.567558, + "memory(GiB)": 117.38, + "step": 52695, + "train_speed(iter/s)": 1.637629 + }, + { + "acc": 0.65056314, + "epoch": 1.336884830035515, + "grad_norm": 6.0, + "learning_rate": 2.716174395554274e-06, + "loss": 1.62484398, + "memory(GiB)": 117.38, + "step": 52700, + "train_speed(iter/s)": 1.637645 + }, + { + "acc": 0.6628602, + "epoch": 1.33701166920345, + "grad_norm": 4.90625, + "learning_rate": 2.715241603444486e-06, + "loss": 1.62239742, + "memory(GiB)": 117.38, + "step": 52705, + "train_speed(iter/s)": 1.637663 + }, + { + "acc": 0.65894337, + "epoch": 1.3371385083713851, + "grad_norm": 6.3125, + "learning_rate": 2.714308911828577e-06, + "loss": 1.56882267, + "memory(GiB)": 117.38, + "step": 52710, + "train_speed(iter/s)": 1.637678 + }, + { + "acc": 0.64816999, + "epoch": 1.3372653475393201, + "grad_norm": 6.03125, + "learning_rate": 2.713376320747565e-06, + "loss": 1.66252346, + "memory(GiB)": 117.38, + "step": 52715, + "train_speed(iter/s)": 1.637694 + }, + { + "acc": 0.65537429, + "epoch": 1.3373921867072551, + "grad_norm": 5.71875, + "learning_rate": 2.7124438302424696e-06, + "loss": 1.58291273, + "memory(GiB)": 117.38, + "step": 52720, + "train_speed(iter/s)": 1.637711 + }, + { + "acc": 0.65098572, + "epoch": 1.3375190258751903, + "grad_norm": 7.25, + "learning_rate": 2.711511440354309e-06, + "loss": 1.61649456, + "memory(GiB)": 117.38, + "step": 52725, + "train_speed(iter/s)": 1.637727 + }, + { + "acc": 0.65720758, + "epoch": 1.3376458650431253, + "grad_norm": 6.71875, + "learning_rate": 2.710579151124095e-06, + "loss": 1.6403759, + "memory(GiB)": 117.38, + "step": 52730, + "train_speed(iter/s)": 1.637744 + }, + { + "acc": 0.65557075, + "epoch": 1.3377727042110603, + "grad_norm": 5.5, + "learning_rate": 2.70964696259283e-06, + "loss": 1.56805096, + "memory(GiB)": 117.38, + "step": 52735, + "train_speed(iter/s)": 1.637761 + }, + { + "acc": 0.6476429, + "epoch": 1.3378995433789953, + "grad_norm": 5.375, + "learning_rate": 2.7087148748015146e-06, + "loss": 1.57338219, + "memory(GiB)": 117.38, + "step": 52740, + "train_speed(iter/s)": 1.637777 + }, + { + "acc": 0.63488717, + "epoch": 1.3380263825469305, + "grad_norm": 5.28125, + "learning_rate": 2.7077828877911517e-06, + "loss": 1.73790627, + "memory(GiB)": 117.38, + "step": 52745, + "train_speed(iter/s)": 1.637794 + }, + { + "acc": 0.64949837, + "epoch": 1.3381532217148655, + "grad_norm": 6.84375, + "learning_rate": 2.706851001602733e-06, + "loss": 1.55197601, + "memory(GiB)": 117.38, + "step": 52750, + "train_speed(iter/s)": 1.637811 + }, + { + "acc": 0.6568469, + "epoch": 1.3382800608828007, + "grad_norm": 5.03125, + "learning_rate": 2.7059192162772407e-06, + "loss": 1.60443554, + "memory(GiB)": 117.38, + "step": 52755, + "train_speed(iter/s)": 1.637829 + }, + { + "acc": 0.65990729, + "epoch": 1.3384069000507357, + "grad_norm": 6.3125, + "learning_rate": 2.704987531855666e-06, + "loss": 1.61475754, + "memory(GiB)": 117.38, + "step": 52760, + "train_speed(iter/s)": 1.637847 + }, + { + "acc": 0.66679478, + "epoch": 1.3385337392186707, + "grad_norm": 5.34375, + "learning_rate": 2.704055948378986e-06, + "loss": 1.60053635, + "memory(GiB)": 117.38, + "step": 52765, + "train_speed(iter/s)": 1.637865 + }, + { + "acc": 0.65497775, + "epoch": 1.3386605783866057, + "grad_norm": 5.9375, + "learning_rate": 2.7031244658881773e-06, + "loss": 1.557059, + "memory(GiB)": 117.38, + "step": 52770, + "train_speed(iter/s)": 1.637883 + }, + { + "acc": 0.65754519, + "epoch": 1.338787417554541, + "grad_norm": 7.65625, + "learning_rate": 2.7021930844242085e-06, + "loss": 1.62008839, + "memory(GiB)": 117.38, + "step": 52775, + "train_speed(iter/s)": 1.637901 + }, + { + "acc": 0.65888691, + "epoch": 1.3389142567224759, + "grad_norm": 7.21875, + "learning_rate": 2.7012618040280463e-06, + "loss": 1.61328239, + "memory(GiB)": 117.38, + "step": 52780, + "train_speed(iter/s)": 1.637918 + }, + { + "acc": 0.67575898, + "epoch": 1.339041095890411, + "grad_norm": 6.3125, + "learning_rate": 2.7003306247406536e-06, + "loss": 1.54389858, + "memory(GiB)": 117.38, + "step": 52785, + "train_speed(iter/s)": 1.637936 + }, + { + "acc": 0.65001764, + "epoch": 1.339167935058346, + "grad_norm": 5.40625, + "learning_rate": 2.6993995466029877e-06, + "loss": 1.69043427, + "memory(GiB)": 117.38, + "step": 52790, + "train_speed(iter/s)": 1.637954 + }, + { + "acc": 0.6416666, + "epoch": 1.339294774226281, + "grad_norm": 4.84375, + "learning_rate": 2.6984685696560002e-06, + "loss": 1.62393398, + "memory(GiB)": 117.38, + "step": 52795, + "train_speed(iter/s)": 1.637971 + }, + { + "acc": 0.65619736, + "epoch": 1.339421613394216, + "grad_norm": 5.28125, + "learning_rate": 2.6975376939406418e-06, + "loss": 1.5741333, + "memory(GiB)": 117.38, + "step": 52800, + "train_speed(iter/s)": 1.637988 + }, + { + "acc": 0.65987396, + "epoch": 1.3395484525621513, + "grad_norm": 5.21875, + "learning_rate": 2.6966069194978537e-06, + "loss": 1.56193695, + "memory(GiB)": 117.38, + "step": 52805, + "train_speed(iter/s)": 1.638006 + }, + { + "acc": 0.64126892, + "epoch": 1.3396752917300863, + "grad_norm": 6.15625, + "learning_rate": 2.6956762463685787e-06, + "loss": 1.65374451, + "memory(GiB)": 117.38, + "step": 52810, + "train_speed(iter/s)": 1.638022 + }, + { + "acc": 0.66136961, + "epoch": 1.3398021308980212, + "grad_norm": 5.25, + "learning_rate": 2.69474567459375e-06, + "loss": 1.63458233, + "memory(GiB)": 117.38, + "step": 52815, + "train_speed(iter/s)": 1.638039 + }, + { + "acc": 0.65879898, + "epoch": 1.3399289700659565, + "grad_norm": 6.4375, + "learning_rate": 2.693815204214299e-06, + "loss": 1.61339455, + "memory(GiB)": 117.38, + "step": 52820, + "train_speed(iter/s)": 1.638056 + }, + { + "acc": 0.65663776, + "epoch": 1.3400558092338914, + "grad_norm": 5.625, + "learning_rate": 2.692884835271151e-06, + "loss": 1.59304152, + "memory(GiB)": 117.38, + "step": 52825, + "train_speed(iter/s)": 1.638074 + }, + { + "acc": 0.67202492, + "epoch": 1.3401826484018264, + "grad_norm": 6.875, + "learning_rate": 2.6919545678052296e-06, + "loss": 1.58589973, + "memory(GiB)": 117.38, + "step": 52830, + "train_speed(iter/s)": 1.638091 + }, + { + "acc": 0.67168484, + "epoch": 1.3403094875697614, + "grad_norm": 6.84375, + "learning_rate": 2.69102440185745e-06, + "loss": 1.50302792, + "memory(GiB)": 117.38, + "step": 52835, + "train_speed(iter/s)": 1.638108 + }, + { + "acc": 0.64203653, + "epoch": 1.3404363267376966, + "grad_norm": 5.21875, + "learning_rate": 2.690094337468726e-06, + "loss": 1.73719349, + "memory(GiB)": 117.38, + "step": 52840, + "train_speed(iter/s)": 1.638125 + }, + { + "acc": 0.64537287, + "epoch": 1.3405631659056316, + "grad_norm": 6.65625, + "learning_rate": 2.6891643746799643e-06, + "loss": 1.62930698, + "memory(GiB)": 117.38, + "step": 52845, + "train_speed(iter/s)": 1.638143 + }, + { + "acc": 0.6239182, + "epoch": 1.3406900050735668, + "grad_norm": 5.0625, + "learning_rate": 2.6882345135320753e-06, + "loss": 1.65400143, + "memory(GiB)": 117.38, + "step": 52850, + "train_speed(iter/s)": 1.638161 + }, + { + "acc": 0.65918274, + "epoch": 1.3408168442415018, + "grad_norm": 5.375, + "learning_rate": 2.68730475406595e-06, + "loss": 1.62808533, + "memory(GiB)": 117.38, + "step": 52855, + "train_speed(iter/s)": 1.638178 + }, + { + "acc": 0.64438286, + "epoch": 1.3409436834094368, + "grad_norm": 5.5625, + "learning_rate": 2.6863750963224867e-06, + "loss": 1.66250687, + "memory(GiB)": 117.38, + "step": 52860, + "train_speed(iter/s)": 1.638196 + }, + { + "acc": 0.63586493, + "epoch": 1.3410705225773718, + "grad_norm": 5.84375, + "learning_rate": 2.685445540342577e-06, + "loss": 1.65475388, + "memory(GiB)": 117.38, + "step": 52865, + "train_speed(iter/s)": 1.638212 + }, + { + "acc": 0.64148946, + "epoch": 1.341197361745307, + "grad_norm": 6.15625, + "learning_rate": 2.6845160861671094e-06, + "loss": 1.70497818, + "memory(GiB)": 117.38, + "step": 52870, + "train_speed(iter/s)": 1.63823 + }, + { + "acc": 0.65571108, + "epoch": 1.341324200913242, + "grad_norm": 5.84375, + "learning_rate": 2.6835867338369593e-06, + "loss": 1.54213448, + "memory(GiB)": 117.38, + "step": 52875, + "train_speed(iter/s)": 1.638248 + }, + { + "acc": 0.656812, + "epoch": 1.341451040081177, + "grad_norm": 5.53125, + "learning_rate": 2.6826574833930053e-06, + "loss": 1.52983913, + "memory(GiB)": 117.38, + "step": 52880, + "train_speed(iter/s)": 1.638266 + }, + { + "acc": 0.64465132, + "epoch": 1.3415778792491122, + "grad_norm": 6.71875, + "learning_rate": 2.681728334876123e-06, + "loss": 1.6092041, + "memory(GiB)": 117.38, + "step": 52885, + "train_speed(iter/s)": 1.638285 + }, + { + "acc": 0.66320653, + "epoch": 1.3417047184170472, + "grad_norm": 6.4375, + "learning_rate": 2.6807992883271806e-06, + "loss": 1.51731644, + "memory(GiB)": 117.38, + "step": 52890, + "train_speed(iter/s)": 1.638303 + }, + { + "acc": 0.65563526, + "epoch": 1.3418315575849822, + "grad_norm": 6.46875, + "learning_rate": 2.6798703437870364e-06, + "loss": 1.5687912, + "memory(GiB)": 117.38, + "step": 52895, + "train_speed(iter/s)": 1.638061 + }, + { + "acc": 0.66147747, + "epoch": 1.3419583967529172, + "grad_norm": 5.90625, + "learning_rate": 2.678941501296555e-06, + "loss": 1.60652409, + "memory(GiB)": 117.38, + "step": 52900, + "train_speed(iter/s)": 1.638081 + }, + { + "acc": 0.67350202, + "epoch": 1.3420852359208524, + "grad_norm": 5.40625, + "learning_rate": 2.6780127608965896e-06, + "loss": 1.56545982, + "memory(GiB)": 117.38, + "step": 52905, + "train_speed(iter/s)": 1.638098 + }, + { + "acc": 0.66206641, + "epoch": 1.3422120750887874, + "grad_norm": 5.90625, + "learning_rate": 2.677084122627991e-06, + "loss": 1.59041185, + "memory(GiB)": 117.38, + "step": 52910, + "train_speed(iter/s)": 1.637857 + }, + { + "acc": 0.65431671, + "epoch": 1.3423389142567226, + "grad_norm": 4.28125, + "learning_rate": 2.6761555865316003e-06, + "loss": 1.66435547, + "memory(GiB)": 117.38, + "step": 52915, + "train_speed(iter/s)": 1.637874 + }, + { + "acc": 0.67400656, + "epoch": 1.3424657534246576, + "grad_norm": 6.59375, + "learning_rate": 2.6752271526482644e-06, + "loss": 1.60946808, + "memory(GiB)": 117.38, + "step": 52920, + "train_speed(iter/s)": 1.637892 + }, + { + "acc": 0.64252248, + "epoch": 1.3425925925925926, + "grad_norm": 5.75, + "learning_rate": 2.674298821018817e-06, + "loss": 1.65829601, + "memory(GiB)": 117.38, + "step": 52925, + "train_speed(iter/s)": 1.637909 + }, + { + "acc": 0.65550051, + "epoch": 1.3427194317605275, + "grad_norm": 6.8125, + "learning_rate": 2.673370591684091e-06, + "loss": 1.66814632, + "memory(GiB)": 117.38, + "step": 52930, + "train_speed(iter/s)": 1.637926 + }, + { + "acc": 0.64613943, + "epoch": 1.3428462709284628, + "grad_norm": 5.34375, + "learning_rate": 2.672442464684915e-06, + "loss": 1.68707199, + "memory(GiB)": 117.38, + "step": 52935, + "train_speed(iter/s)": 1.637942 + }, + { + "acc": 0.65846262, + "epoch": 1.3429731100963977, + "grad_norm": 4.78125, + "learning_rate": 2.671514440062111e-06, + "loss": 1.57678671, + "memory(GiB)": 117.38, + "step": 52940, + "train_speed(iter/s)": 1.637959 + }, + { + "acc": 0.66221523, + "epoch": 1.343099949264333, + "grad_norm": 5.6875, + "learning_rate": 2.6705865178564973e-06, + "loss": 1.5938448, + "memory(GiB)": 117.38, + "step": 52945, + "train_speed(iter/s)": 1.637975 + }, + { + "acc": 0.64873471, + "epoch": 1.343226788432268, + "grad_norm": 5.4375, + "learning_rate": 2.6696586981088886e-06, + "loss": 1.65513458, + "memory(GiB)": 117.38, + "step": 52950, + "train_speed(iter/s)": 1.637992 + }, + { + "acc": 0.64291525, + "epoch": 1.343353627600203, + "grad_norm": 6.96875, + "learning_rate": 2.6687309808600947e-06, + "loss": 1.65151024, + "memory(GiB)": 117.38, + "step": 52955, + "train_speed(iter/s)": 1.63801 + }, + { + "acc": 0.6530447, + "epoch": 1.343480466768138, + "grad_norm": 6.15625, + "learning_rate": 2.6678033661509208e-06, + "loss": 1.62684059, + "memory(GiB)": 117.38, + "step": 52960, + "train_speed(iter/s)": 1.638027 + }, + { + "acc": 0.64571304, + "epoch": 1.3436073059360731, + "grad_norm": 5.78125, + "learning_rate": 2.6668758540221665e-06, + "loss": 1.66499176, + "memory(GiB)": 117.38, + "step": 52965, + "train_speed(iter/s)": 1.638044 + }, + { + "acc": 0.65314083, + "epoch": 1.3437341451040081, + "grad_norm": 6.28125, + "learning_rate": 2.66594844451463e-06, + "loss": 1.55702152, + "memory(GiB)": 117.38, + "step": 52970, + "train_speed(iter/s)": 1.63806 + }, + { + "acc": 0.65581408, + "epoch": 1.3438609842719431, + "grad_norm": 8.0, + "learning_rate": 2.6650211376691008e-06, + "loss": 1.70864468, + "memory(GiB)": 117.38, + "step": 52975, + "train_speed(iter/s)": 1.638077 + }, + { + "acc": 0.65982456, + "epoch": 1.3439878234398783, + "grad_norm": 4.8125, + "learning_rate": 2.664093933526368e-06, + "loss": 1.59009027, + "memory(GiB)": 117.38, + "step": 52980, + "train_speed(iter/s)": 1.638093 + }, + { + "acc": 0.6477129, + "epoch": 1.3441146626078133, + "grad_norm": 5.6875, + "learning_rate": 2.6631668321272097e-06, + "loss": 1.64277306, + "memory(GiB)": 117.38, + "step": 52985, + "train_speed(iter/s)": 1.638111 + }, + { + "acc": 0.64682226, + "epoch": 1.3442415017757483, + "grad_norm": 6.90625, + "learning_rate": 2.6622398335124122e-06, + "loss": 1.62259197, + "memory(GiB)": 117.38, + "step": 52990, + "train_speed(iter/s)": 1.638128 + }, + { + "acc": 0.6515841, + "epoch": 1.3443683409436833, + "grad_norm": 5.4375, + "learning_rate": 2.661312937722742e-06, + "loss": 1.62858467, + "memory(GiB)": 117.38, + "step": 52995, + "train_speed(iter/s)": 1.638145 + }, + { + "acc": 0.65542135, + "epoch": 1.3444951801116185, + "grad_norm": 5.0625, + "learning_rate": 2.6603861447989703e-06, + "loss": 1.57048187, + "memory(GiB)": 117.38, + "step": 53000, + "train_speed(iter/s)": 1.638163 + }, + { + "epoch": 1.3444951801116185, + "eval_acc": 0.6462505507907371, + "eval_loss": 1.5732170343399048, + "eval_runtime": 59.1654, + "eval_samples_per_second": 107.664, + "eval_steps_per_second": 26.925, + "step": 53000 + }, + { + "acc": 0.65749884, + "epoch": 1.3446220192795535, + "grad_norm": 5.875, + "learning_rate": 2.65945945478186e-06, + "loss": 1.5651413, + "memory(GiB)": 117.38, + "step": 53005, + "train_speed(iter/s)": 1.63497 + }, + { + "acc": 0.6618412, + "epoch": 1.3447488584474887, + "grad_norm": 5.65625, + "learning_rate": 2.658532867712176e-06, + "loss": 1.59753542, + "memory(GiB)": 117.38, + "step": 53010, + "train_speed(iter/s)": 1.634986 + }, + { + "acc": 0.6620615, + "epoch": 1.3448756976154237, + "grad_norm": 5.46875, + "learning_rate": 2.6576063836306687e-06, + "loss": 1.55156517, + "memory(GiB)": 117.38, + "step": 53015, + "train_speed(iter/s)": 1.635002 + }, + { + "acc": 0.65507374, + "epoch": 1.3450025367833587, + "grad_norm": 7.15625, + "learning_rate": 2.656680002578088e-06, + "loss": 1.60954323, + "memory(GiB)": 117.38, + "step": 53020, + "train_speed(iter/s)": 1.63502 + }, + { + "acc": 0.65546637, + "epoch": 1.3451293759512937, + "grad_norm": 5.28125, + "learning_rate": 2.655753724595186e-06, + "loss": 1.56377649, + "memory(GiB)": 117.38, + "step": 53025, + "train_speed(iter/s)": 1.635037 + }, + { + "acc": 0.64627085, + "epoch": 1.3452562151192289, + "grad_norm": 5.84375, + "learning_rate": 2.6548275497227028e-06, + "loss": 1.58946438, + "memory(GiB)": 117.38, + "step": 53030, + "train_speed(iter/s)": 1.635054 + }, + { + "acc": 0.66005816, + "epoch": 1.3453830542871639, + "grad_norm": 5.09375, + "learning_rate": 2.6539014780013707e-06, + "loss": 1.58129702, + "memory(GiB)": 117.38, + "step": 53035, + "train_speed(iter/s)": 1.635071 + }, + { + "acc": 0.64971399, + "epoch": 1.3455098934550989, + "grad_norm": 6.0, + "learning_rate": 2.6529755094719276e-06, + "loss": 1.66034966, + "memory(GiB)": 117.38, + "step": 53040, + "train_speed(iter/s)": 1.635088 + }, + { + "acc": 0.64803171, + "epoch": 1.345636732623034, + "grad_norm": 6.21875, + "learning_rate": 2.652049644175101e-06, + "loss": 1.63495255, + "memory(GiB)": 117.38, + "step": 53045, + "train_speed(iter/s)": 1.635105 + }, + { + "acc": 0.65598598, + "epoch": 1.345763571790969, + "grad_norm": 4.84375, + "learning_rate": 2.6511238821516154e-06, + "loss": 1.5769042, + "memory(GiB)": 117.38, + "step": 53050, + "train_speed(iter/s)": 1.635122 + }, + { + "acc": 0.65694847, + "epoch": 1.345890410958904, + "grad_norm": 5.1875, + "learning_rate": 2.650198223442185e-06, + "loss": 1.55485935, + "memory(GiB)": 117.38, + "step": 53055, + "train_speed(iter/s)": 1.635138 + }, + { + "acc": 0.648349, + "epoch": 1.346017250126839, + "grad_norm": 5.71875, + "learning_rate": 2.6492726680875296e-06, + "loss": 1.6262619, + "memory(GiB)": 117.38, + "step": 53060, + "train_speed(iter/s)": 1.635156 + }, + { + "acc": 0.67691779, + "epoch": 1.3461440892947742, + "grad_norm": 6.09375, + "learning_rate": 2.6483472161283576e-06, + "loss": 1.57359123, + "memory(GiB)": 117.38, + "step": 53065, + "train_speed(iter/s)": 1.635173 + }, + { + "acc": 0.65168762, + "epoch": 1.3462709284627092, + "grad_norm": 6.25, + "learning_rate": 2.647421867605374e-06, + "loss": 1.57416496, + "memory(GiB)": 117.38, + "step": 53070, + "train_speed(iter/s)": 1.63519 + }, + { + "acc": 0.65546989, + "epoch": 1.3463977676306444, + "grad_norm": 6.15625, + "learning_rate": 2.6464966225592804e-06, + "loss": 1.59917831, + "memory(GiB)": 117.38, + "step": 53075, + "train_speed(iter/s)": 1.635207 + }, + { + "acc": 0.64507484, + "epoch": 1.3465246067985794, + "grad_norm": 6.40625, + "learning_rate": 2.645571481030773e-06, + "loss": 1.65069351, + "memory(GiB)": 117.38, + "step": 53080, + "train_speed(iter/s)": 1.635225 + }, + { + "acc": 0.66880908, + "epoch": 1.3466514459665144, + "grad_norm": 7.59375, + "learning_rate": 2.6446464430605434e-06, + "loss": 1.58419876, + "memory(GiB)": 117.38, + "step": 53085, + "train_speed(iter/s)": 1.635242 + }, + { + "acc": 0.65629783, + "epoch": 1.3467782851344494, + "grad_norm": 5.1875, + "learning_rate": 2.6437215086892797e-06, + "loss": 1.57410336, + "memory(GiB)": 117.38, + "step": 53090, + "train_speed(iter/s)": 1.635259 + }, + { + "acc": 0.66677265, + "epoch": 1.3469051243023846, + "grad_norm": 6.03125, + "learning_rate": 2.642796677957664e-06, + "loss": 1.57001553, + "memory(GiB)": 117.38, + "step": 53095, + "train_speed(iter/s)": 1.635277 + }, + { + "acc": 0.64633703, + "epoch": 1.3470319634703196, + "grad_norm": 5.15625, + "learning_rate": 2.641871950906374e-06, + "loss": 1.65141735, + "memory(GiB)": 117.38, + "step": 53100, + "train_speed(iter/s)": 1.635296 + }, + { + "acc": 0.65298882, + "epoch": 1.3471588026382548, + "grad_norm": 4.96875, + "learning_rate": 2.6409473275760843e-06, + "loss": 1.62078323, + "memory(GiB)": 117.38, + "step": 53105, + "train_speed(iter/s)": 1.635313 + }, + { + "acc": 0.65995712, + "epoch": 1.3472856418061898, + "grad_norm": 5.84375, + "learning_rate": 2.640022808007463e-06, + "loss": 1.67465019, + "memory(GiB)": 117.38, + "step": 53110, + "train_speed(iter/s)": 1.63533 + }, + { + "acc": 0.65148191, + "epoch": 1.3474124809741248, + "grad_norm": 6.0, + "learning_rate": 2.639098392241176e-06, + "loss": 1.6521965, + "memory(GiB)": 117.38, + "step": 53115, + "train_speed(iter/s)": 1.635347 + }, + { + "acc": 0.64239984, + "epoch": 1.3475393201420598, + "grad_norm": 5.5625, + "learning_rate": 2.6381740803178826e-06, + "loss": 1.61989326, + "memory(GiB)": 117.38, + "step": 53120, + "train_speed(iter/s)": 1.635364 + }, + { + "acc": 0.65592341, + "epoch": 1.347666159309995, + "grad_norm": 5.46875, + "learning_rate": 2.6372498722782346e-06, + "loss": 1.58394699, + "memory(GiB)": 117.38, + "step": 53125, + "train_speed(iter/s)": 1.635382 + }, + { + "acc": 0.65077953, + "epoch": 1.34779299847793, + "grad_norm": 5.28125, + "learning_rate": 2.6363257681628907e-06, + "loss": 1.62168217, + "memory(GiB)": 117.38, + "step": 53130, + "train_speed(iter/s)": 1.635398 + }, + { + "acc": 0.66489758, + "epoch": 1.347919837645865, + "grad_norm": 8.4375, + "learning_rate": 2.635401768012491e-06, + "loss": 1.55354099, + "memory(GiB)": 117.38, + "step": 53135, + "train_speed(iter/s)": 1.635415 + }, + { + "acc": 0.66087213, + "epoch": 1.3480466768138002, + "grad_norm": 6.5625, + "learning_rate": 2.6344778718676783e-06, + "loss": 1.56101646, + "memory(GiB)": 117.38, + "step": 53140, + "train_speed(iter/s)": 1.63543 + }, + { + "acc": 0.64854054, + "epoch": 1.3481735159817352, + "grad_norm": 6.625, + "learning_rate": 2.6335540797690886e-06, + "loss": 1.62264214, + "memory(GiB)": 117.38, + "step": 53145, + "train_speed(iter/s)": 1.635447 + }, + { + "acc": 0.67462988, + "epoch": 1.3483003551496702, + "grad_norm": 5.03125, + "learning_rate": 2.632630391757359e-06, + "loss": 1.54717245, + "memory(GiB)": 117.38, + "step": 53150, + "train_speed(iter/s)": 1.635464 + }, + { + "acc": 0.64381123, + "epoch": 1.3484271943176052, + "grad_norm": 5.96875, + "learning_rate": 2.6317068078731126e-06, + "loss": 1.554562, + "memory(GiB)": 117.38, + "step": 53155, + "train_speed(iter/s)": 1.63548 + }, + { + "acc": 0.66206274, + "epoch": 1.3485540334855404, + "grad_norm": 5.46875, + "learning_rate": 2.630783328156973e-06, + "loss": 1.52031784, + "memory(GiB)": 117.38, + "step": 53160, + "train_speed(iter/s)": 1.635498 + }, + { + "acc": 0.64649653, + "epoch": 1.3486808726534754, + "grad_norm": 5.78125, + "learning_rate": 2.629859952649562e-06, + "loss": 1.57510061, + "memory(GiB)": 117.38, + "step": 53165, + "train_speed(iter/s)": 1.635515 + }, + { + "acc": 0.67456112, + "epoch": 1.3488077118214106, + "grad_norm": 6.0, + "learning_rate": 2.628936681391494e-06, + "loss": 1.53614311, + "memory(GiB)": 117.38, + "step": 53170, + "train_speed(iter/s)": 1.635533 + }, + { + "acc": 0.65889921, + "epoch": 1.3489345509893456, + "grad_norm": 6.0625, + "learning_rate": 2.628013514423375e-06, + "loss": 1.65717278, + "memory(GiB)": 117.38, + "step": 53175, + "train_speed(iter/s)": 1.635548 + }, + { + "acc": 0.64561205, + "epoch": 1.3490613901572805, + "grad_norm": 5.625, + "learning_rate": 2.6270904517858102e-06, + "loss": 1.651898, + "memory(GiB)": 117.38, + "step": 53180, + "train_speed(iter/s)": 1.635565 + }, + { + "acc": 0.65892339, + "epoch": 1.3491882293252155, + "grad_norm": 6.90625, + "learning_rate": 2.6261674935194036e-06, + "loss": 1.60423622, + "memory(GiB)": 117.38, + "step": 53185, + "train_speed(iter/s)": 1.635582 + }, + { + "acc": 0.66684794, + "epoch": 1.3493150684931507, + "grad_norm": 6.59375, + "learning_rate": 2.6252446396647503e-06, + "loss": 1.49408035, + "memory(GiB)": 117.38, + "step": 53190, + "train_speed(iter/s)": 1.635599 + }, + { + "acc": 0.63280678, + "epoch": 1.3494419076610857, + "grad_norm": 5.5, + "learning_rate": 2.6243218902624367e-06, + "loss": 1.68768597, + "memory(GiB)": 117.38, + "step": 53195, + "train_speed(iter/s)": 1.635616 + }, + { + "acc": 0.65492496, + "epoch": 1.3495687468290207, + "grad_norm": 5.84375, + "learning_rate": 2.6233992453530555e-06, + "loss": 1.58912382, + "memory(GiB)": 117.38, + "step": 53200, + "train_speed(iter/s)": 1.635631 + }, + { + "acc": 0.67314806, + "epoch": 1.349695585996956, + "grad_norm": 5.03125, + "learning_rate": 2.6224767049771856e-06, + "loss": 1.52840376, + "memory(GiB)": 117.38, + "step": 53205, + "train_speed(iter/s)": 1.635648 + }, + { + "acc": 0.63613391, + "epoch": 1.349822425164891, + "grad_norm": 6.875, + "learning_rate": 2.621554269175405e-06, + "loss": 1.68029366, + "memory(GiB)": 117.38, + "step": 53210, + "train_speed(iter/s)": 1.635665 + }, + { + "acc": 0.63658056, + "epoch": 1.349949264332826, + "grad_norm": 6.03125, + "learning_rate": 2.620631937988287e-06, + "loss": 1.69095383, + "memory(GiB)": 117.38, + "step": 53215, + "train_speed(iter/s)": 1.635682 + }, + { + "acc": 0.64621515, + "epoch": 1.350076103500761, + "grad_norm": 5.84375, + "learning_rate": 2.6197097114564e-06, + "loss": 1.62177086, + "memory(GiB)": 117.38, + "step": 53220, + "train_speed(iter/s)": 1.635698 + }, + { + "acc": 0.6440197, + "epoch": 1.3502029426686961, + "grad_norm": 5.75, + "learning_rate": 2.618787589620306e-06, + "loss": 1.63779526, + "memory(GiB)": 117.38, + "step": 53225, + "train_speed(iter/s)": 1.635716 + }, + { + "acc": 0.66838298, + "epoch": 1.350329781836631, + "grad_norm": 6.46875, + "learning_rate": 2.6178655725205653e-06, + "loss": 1.54495974, + "memory(GiB)": 117.38, + "step": 53230, + "train_speed(iter/s)": 1.635733 + }, + { + "acc": 0.67129593, + "epoch": 1.3504566210045663, + "grad_norm": 6.46875, + "learning_rate": 2.6169436601977325e-06, + "loss": 1.5674818, + "memory(GiB)": 117.38, + "step": 53235, + "train_speed(iter/s)": 1.635749 + }, + { + "acc": 0.65012026, + "epoch": 1.3505834601725013, + "grad_norm": 5.03125, + "learning_rate": 2.6160218526923576e-06, + "loss": 1.64780731, + "memory(GiB)": 117.38, + "step": 53240, + "train_speed(iter/s)": 1.635765 + }, + { + "acc": 0.63448963, + "epoch": 1.3507102993404363, + "grad_norm": 7.28125, + "learning_rate": 2.6151001500449847e-06, + "loss": 1.67943039, + "memory(GiB)": 117.38, + "step": 53245, + "train_speed(iter/s)": 1.635783 + }, + { + "acc": 0.65939102, + "epoch": 1.3508371385083713, + "grad_norm": 4.96875, + "learning_rate": 2.614178552296155e-06, + "loss": 1.62487183, + "memory(GiB)": 117.38, + "step": 53250, + "train_speed(iter/s)": 1.635798 + }, + { + "acc": 0.6523737, + "epoch": 1.3509639776763065, + "grad_norm": 6.03125, + "learning_rate": 2.6132570594864047e-06, + "loss": 1.63586292, + "memory(GiB)": 117.38, + "step": 53255, + "train_speed(iter/s)": 1.635815 + }, + { + "acc": 0.67681699, + "epoch": 1.3510908168442415, + "grad_norm": 5.375, + "learning_rate": 2.612335671656265e-06, + "loss": 1.61519775, + "memory(GiB)": 117.38, + "step": 53260, + "train_speed(iter/s)": 1.635831 + }, + { + "acc": 0.68297505, + "epoch": 1.3512176560121767, + "grad_norm": 6.03125, + "learning_rate": 2.6114143888462607e-06, + "loss": 1.52014666, + "memory(GiB)": 117.38, + "step": 53265, + "train_speed(iter/s)": 1.635845 + }, + { + "acc": 0.661343, + "epoch": 1.3513444951801117, + "grad_norm": 6.0625, + "learning_rate": 2.6104932110969195e-06, + "loss": 1.61415863, + "memory(GiB)": 117.38, + "step": 53270, + "train_speed(iter/s)": 1.635862 + }, + { + "acc": 0.65879097, + "epoch": 1.3514713343480467, + "grad_norm": 5.15625, + "learning_rate": 2.609572138448753e-06, + "loss": 1.55318985, + "memory(GiB)": 117.38, + "step": 53275, + "train_speed(iter/s)": 1.635879 + }, + { + "acc": 0.65854053, + "epoch": 1.3515981735159817, + "grad_norm": 5.15625, + "learning_rate": 2.608651170942277e-06, + "loss": 1.61781635, + "memory(GiB)": 117.38, + "step": 53280, + "train_speed(iter/s)": 1.635896 + }, + { + "acc": 0.662182, + "epoch": 1.3517250126839169, + "grad_norm": 5.3125, + "learning_rate": 2.607730308617997e-06, + "loss": 1.56530571, + "memory(GiB)": 117.38, + "step": 53285, + "train_speed(iter/s)": 1.635914 + }, + { + "acc": 0.67736173, + "epoch": 1.3518518518518519, + "grad_norm": 6.46875, + "learning_rate": 2.6068095515164226e-06, + "loss": 1.54855499, + "memory(GiB)": 117.38, + "step": 53290, + "train_speed(iter/s)": 1.63593 + }, + { + "acc": 0.64335728, + "epoch": 1.3519786910197868, + "grad_norm": 5.71875, + "learning_rate": 2.605888899678047e-06, + "loss": 1.63052826, + "memory(GiB)": 117.38, + "step": 53295, + "train_speed(iter/s)": 1.635947 + }, + { + "acc": 0.65918455, + "epoch": 1.352105530187722, + "grad_norm": 7.40625, + "learning_rate": 2.6049683531433645e-06, + "loss": 1.61806126, + "memory(GiB)": 117.38, + "step": 53300, + "train_speed(iter/s)": 1.635964 + }, + { + "acc": 0.64922285, + "epoch": 1.352232369355657, + "grad_norm": 7.6875, + "learning_rate": 2.6040479119528683e-06, + "loss": 1.64680138, + "memory(GiB)": 117.38, + "step": 53305, + "train_speed(iter/s)": 1.635981 + }, + { + "acc": 0.66121531, + "epoch": 1.352359208523592, + "grad_norm": 5.75, + "learning_rate": 2.6031275761470447e-06, + "loss": 1.63919601, + "memory(GiB)": 117.38, + "step": 53310, + "train_speed(iter/s)": 1.635998 + }, + { + "acc": 0.63653183, + "epoch": 1.352486047691527, + "grad_norm": 6.6875, + "learning_rate": 2.60220734576637e-06, + "loss": 1.6381813, + "memory(GiB)": 117.38, + "step": 53315, + "train_speed(iter/s)": 1.636016 + }, + { + "acc": 0.63646011, + "epoch": 1.3526128868594622, + "grad_norm": 6.15625, + "learning_rate": 2.601287220851319e-06, + "loss": 1.73433304, + "memory(GiB)": 117.38, + "step": 53320, + "train_speed(iter/s)": 1.636032 + }, + { + "acc": 0.67886047, + "epoch": 1.3527397260273972, + "grad_norm": 9.9375, + "learning_rate": 2.6003672014423677e-06, + "loss": 1.60608101, + "memory(GiB)": 117.38, + "step": 53325, + "train_speed(iter/s)": 1.636049 + }, + { + "acc": 0.65038939, + "epoch": 1.3528665651953324, + "grad_norm": 5.3125, + "learning_rate": 2.5994472875799827e-06, + "loss": 1.62295265, + "memory(GiB)": 117.38, + "step": 53330, + "train_speed(iter/s)": 1.636066 + }, + { + "acc": 0.65853438, + "epoch": 1.3529934043632674, + "grad_norm": 4.59375, + "learning_rate": 2.598527479304619e-06, + "loss": 1.61953373, + "memory(GiB)": 117.38, + "step": 53335, + "train_speed(iter/s)": 1.636083 + }, + { + "acc": 0.67081928, + "epoch": 1.3531202435312024, + "grad_norm": 5.34375, + "learning_rate": 2.597607776656741e-06, + "loss": 1.55799046, + "memory(GiB)": 117.38, + "step": 53340, + "train_speed(iter/s)": 1.636099 + }, + { + "acc": 0.6609271, + "epoch": 1.3532470826991374, + "grad_norm": 6.09375, + "learning_rate": 2.5966881796767984e-06, + "loss": 1.5861496, + "memory(GiB)": 117.38, + "step": 53345, + "train_speed(iter/s)": 1.636117 + }, + { + "acc": 0.64542503, + "epoch": 1.3533739218670726, + "grad_norm": 9.0625, + "learning_rate": 2.5957686884052423e-06, + "loss": 1.64579029, + "memory(GiB)": 117.38, + "step": 53350, + "train_speed(iter/s)": 1.636134 + }, + { + "acc": 0.65269003, + "epoch": 1.3535007610350076, + "grad_norm": 9.0, + "learning_rate": 2.5948493028825093e-06, + "loss": 1.64242401, + "memory(GiB)": 117.38, + "step": 53355, + "train_speed(iter/s)": 1.636152 + }, + { + "acc": 0.65905704, + "epoch": 1.3536276002029426, + "grad_norm": 6.15625, + "learning_rate": 2.593930023149044e-06, + "loss": 1.62445793, + "memory(GiB)": 117.38, + "step": 53360, + "train_speed(iter/s)": 1.636168 + }, + { + "acc": 0.65242438, + "epoch": 1.3537544393708778, + "grad_norm": 5.5625, + "learning_rate": 2.593010849245279e-06, + "loss": 1.58600121, + "memory(GiB)": 117.38, + "step": 53365, + "train_speed(iter/s)": 1.636186 + }, + { + "acc": 0.64623499, + "epoch": 1.3538812785388128, + "grad_norm": 5.875, + "learning_rate": 2.592091781211643e-06, + "loss": 1.60641956, + "memory(GiB)": 117.38, + "step": 53370, + "train_speed(iter/s)": 1.636203 + }, + { + "acc": 0.68461699, + "epoch": 1.3540081177067478, + "grad_norm": 6.15625, + "learning_rate": 2.591172819088561e-06, + "loss": 1.46721954, + "memory(GiB)": 117.38, + "step": 53375, + "train_speed(iter/s)": 1.63622 + }, + { + "acc": 0.64659529, + "epoch": 1.3541349568746828, + "grad_norm": 4.59375, + "learning_rate": 2.590253962916453e-06, + "loss": 1.62378082, + "memory(GiB)": 117.38, + "step": 53380, + "train_speed(iter/s)": 1.636236 + }, + { + "acc": 0.65734396, + "epoch": 1.354261796042618, + "grad_norm": 5.34375, + "learning_rate": 2.5893352127357347e-06, + "loss": 1.54402256, + "memory(GiB)": 117.38, + "step": 53385, + "train_speed(iter/s)": 1.636254 + }, + { + "acc": 0.65798941, + "epoch": 1.354388635210553, + "grad_norm": 5.625, + "learning_rate": 2.5884165685868164e-06, + "loss": 1.58096571, + "memory(GiB)": 117.38, + "step": 53390, + "train_speed(iter/s)": 1.63627 + }, + { + "acc": 0.64605174, + "epoch": 1.3545154743784882, + "grad_norm": 5.90625, + "learning_rate": 2.5874980305101045e-06, + "loss": 1.64534111, + "memory(GiB)": 117.38, + "step": 53395, + "train_speed(iter/s)": 1.636288 + }, + { + "acc": 0.65400658, + "epoch": 1.3546423135464232, + "grad_norm": 6.09375, + "learning_rate": 2.586579598546e-06, + "loss": 1.65377083, + "memory(GiB)": 117.38, + "step": 53400, + "train_speed(iter/s)": 1.636305 + }, + { + "acc": 0.67116776, + "epoch": 1.3547691527143582, + "grad_norm": 7.59375, + "learning_rate": 2.5856612727348995e-06, + "loss": 1.51648579, + "memory(GiB)": 117.38, + "step": 53405, + "train_speed(iter/s)": 1.636322 + }, + { + "acc": 0.66574535, + "epoch": 1.3548959918822931, + "grad_norm": 5.46875, + "learning_rate": 2.584743053117196e-06, + "loss": 1.55454884, + "memory(GiB)": 117.38, + "step": 53410, + "train_speed(iter/s)": 1.636339 + }, + { + "acc": 0.66346292, + "epoch": 1.3550228310502284, + "grad_norm": 5.59375, + "learning_rate": 2.583824939733277e-06, + "loss": 1.52395287, + "memory(GiB)": 117.38, + "step": 53415, + "train_speed(iter/s)": 1.636355 + }, + { + "acc": 0.64766903, + "epoch": 1.3551496702181633, + "grad_norm": 5.625, + "learning_rate": 2.5829069326235234e-06, + "loss": 1.66314182, + "memory(GiB)": 117.38, + "step": 53420, + "train_speed(iter/s)": 1.636372 + }, + { + "acc": 0.63730884, + "epoch": 1.3552765093860986, + "grad_norm": 5.96875, + "learning_rate": 2.5819890318283137e-06, + "loss": 1.60120277, + "memory(GiB)": 117.38, + "step": 53425, + "train_speed(iter/s)": 1.636389 + }, + { + "acc": 0.65782394, + "epoch": 1.3554033485540335, + "grad_norm": 6.1875, + "learning_rate": 2.5810712373880253e-06, + "loss": 1.65483608, + "memory(GiB)": 117.38, + "step": 53430, + "train_speed(iter/s)": 1.636407 + }, + { + "acc": 0.66530867, + "epoch": 1.3555301877219685, + "grad_norm": 4.875, + "learning_rate": 2.5801535493430215e-06, + "loss": 1.59025354, + "memory(GiB)": 117.38, + "step": 53435, + "train_speed(iter/s)": 1.636424 + }, + { + "acc": 0.67099171, + "epoch": 1.3556570268899035, + "grad_norm": 5.6875, + "learning_rate": 2.5792359677336685e-06, + "loss": 1.5561408, + "memory(GiB)": 117.38, + "step": 53440, + "train_speed(iter/s)": 1.636441 + }, + { + "acc": 0.66039257, + "epoch": 1.3557838660578387, + "grad_norm": 5.59375, + "learning_rate": 2.5783184926003237e-06, + "loss": 1.6002964, + "memory(GiB)": 117.38, + "step": 53445, + "train_speed(iter/s)": 1.636457 + }, + { + "acc": 0.64508629, + "epoch": 1.3559107052257737, + "grad_norm": 6.3125, + "learning_rate": 2.5774011239833473e-06, + "loss": 1.64640236, + "memory(GiB)": 117.38, + "step": 53450, + "train_speed(iter/s)": 1.636475 + }, + { + "acc": 0.66321273, + "epoch": 1.3560375443937087, + "grad_norm": 6.71875, + "learning_rate": 2.5764838619230843e-06, + "loss": 1.62081356, + "memory(GiB)": 117.38, + "step": 53455, + "train_speed(iter/s)": 1.636491 + }, + { + "acc": 0.67482982, + "epoch": 1.356164383561644, + "grad_norm": 7.53125, + "learning_rate": 2.575566706459879e-06, + "loss": 1.52221003, + "memory(GiB)": 117.38, + "step": 53460, + "train_speed(iter/s)": 1.636507 + }, + { + "acc": 0.65256124, + "epoch": 1.356291222729579, + "grad_norm": 5.78125, + "learning_rate": 2.574649657634076e-06, + "loss": 1.63283176, + "memory(GiB)": 117.38, + "step": 53465, + "train_speed(iter/s)": 1.636524 + }, + { + "acc": 0.66210742, + "epoch": 1.356418061897514, + "grad_norm": 6.53125, + "learning_rate": 2.5737327154860116e-06, + "loss": 1.61089935, + "memory(GiB)": 117.38, + "step": 53470, + "train_speed(iter/s)": 1.636542 + }, + { + "acc": 0.6434392, + "epoch": 1.356544901065449, + "grad_norm": 5.8125, + "learning_rate": 2.572815880056011e-06, + "loss": 1.67750626, + "memory(GiB)": 117.38, + "step": 53475, + "train_speed(iter/s)": 1.636559 + }, + { + "acc": 0.66380305, + "epoch": 1.356671740233384, + "grad_norm": 5.9375, + "learning_rate": 2.571899151384406e-06, + "loss": 1.53025331, + "memory(GiB)": 117.38, + "step": 53480, + "train_speed(iter/s)": 1.636574 + }, + { + "acc": 0.66234779, + "epoch": 1.356798579401319, + "grad_norm": 5.21875, + "learning_rate": 2.5709825295115178e-06, + "loss": 1.62158527, + "memory(GiB)": 117.38, + "step": 53485, + "train_speed(iter/s)": 1.636591 + }, + { + "acc": 0.64412508, + "epoch": 1.3569254185692543, + "grad_norm": 5.4375, + "learning_rate": 2.5700660144776647e-06, + "loss": 1.6475872, + "memory(GiB)": 117.38, + "step": 53490, + "train_speed(iter/s)": 1.636608 + }, + { + "acc": 0.65411391, + "epoch": 1.3570522577371893, + "grad_norm": 6.46875, + "learning_rate": 2.5691496063231527e-06, + "loss": 1.61621971, + "memory(GiB)": 117.38, + "step": 53495, + "train_speed(iter/s)": 1.636625 + }, + { + "acc": 0.65778389, + "epoch": 1.3571790969051243, + "grad_norm": 9.1875, + "learning_rate": 2.568233305088296e-06, + "loss": 1.53290215, + "memory(GiB)": 117.38, + "step": 53500, + "train_speed(iter/s)": 1.636641 + }, + { + "acc": 0.66203542, + "epoch": 1.3573059360730593, + "grad_norm": 5.375, + "learning_rate": 2.5673171108133956e-06, + "loss": 1.56835804, + "memory(GiB)": 117.38, + "step": 53505, + "train_speed(iter/s)": 1.636658 + }, + { + "acc": 0.64234896, + "epoch": 1.3574327752409945, + "grad_norm": 5.0, + "learning_rate": 2.5664010235387503e-06, + "loss": 1.62272148, + "memory(GiB)": 117.38, + "step": 53510, + "train_speed(iter/s)": 1.636676 + }, + { + "acc": 0.66369486, + "epoch": 1.3575596144089295, + "grad_norm": 7.46875, + "learning_rate": 2.565485043304653e-06, + "loss": 1.58520908, + "memory(GiB)": 117.38, + "step": 53515, + "train_speed(iter/s)": 1.636692 + }, + { + "acc": 0.64480968, + "epoch": 1.3576864535768645, + "grad_norm": 6.15625, + "learning_rate": 2.564569170151392e-06, + "loss": 1.62558746, + "memory(GiB)": 117.38, + "step": 53520, + "train_speed(iter/s)": 1.63671 + }, + { + "acc": 0.65705748, + "epoch": 1.3578132927447997, + "grad_norm": 5.34375, + "learning_rate": 2.5636534041192534e-06, + "loss": 1.54227839, + "memory(GiB)": 117.38, + "step": 53525, + "train_speed(iter/s)": 1.636726 + }, + { + "acc": 0.65346851, + "epoch": 1.3579401319127347, + "grad_norm": 5.28125, + "learning_rate": 2.5627377452485153e-06, + "loss": 1.59480934, + "memory(GiB)": 117.38, + "step": 53530, + "train_speed(iter/s)": 1.636744 + }, + { + "acc": 0.66570468, + "epoch": 1.3580669710806696, + "grad_norm": 4.90625, + "learning_rate": 2.561822193579453e-06, + "loss": 1.52571135, + "memory(GiB)": 117.38, + "step": 53535, + "train_speed(iter/s)": 1.636762 + }, + { + "acc": 0.6710207, + "epoch": 1.3581938102486046, + "grad_norm": 5.6875, + "learning_rate": 2.560906749152335e-06, + "loss": 1.60008297, + "memory(GiB)": 117.38, + "step": 53540, + "train_speed(iter/s)": 1.636778 + }, + { + "acc": 0.65436282, + "epoch": 1.3583206494165398, + "grad_norm": 5.71875, + "learning_rate": 2.55999141200743e-06, + "loss": 1.56700296, + "memory(GiB)": 117.38, + "step": 53545, + "train_speed(iter/s)": 1.636795 + }, + { + "acc": 0.66621675, + "epoch": 1.3584474885844748, + "grad_norm": 4.90625, + "learning_rate": 2.5590761821849954e-06, + "loss": 1.50343142, + "memory(GiB)": 117.38, + "step": 53550, + "train_speed(iter/s)": 1.636812 + }, + { + "acc": 0.66937995, + "epoch": 1.35857432775241, + "grad_norm": 6.03125, + "learning_rate": 2.5581610597252883e-06, + "loss": 1.57336159, + "memory(GiB)": 117.38, + "step": 53555, + "train_speed(iter/s)": 1.63683 + }, + { + "acc": 0.65682297, + "epoch": 1.358701166920345, + "grad_norm": 5.3125, + "learning_rate": 2.5572460446685593e-06, + "loss": 1.53592196, + "memory(GiB)": 117.38, + "step": 53560, + "train_speed(iter/s)": 1.636846 + }, + { + "acc": 0.65299702, + "epoch": 1.35882800608828, + "grad_norm": 5.9375, + "learning_rate": 2.5563311370550535e-06, + "loss": 1.61211033, + "memory(GiB)": 117.38, + "step": 53565, + "train_speed(iter/s)": 1.636862 + }, + { + "acc": 0.64969316, + "epoch": 1.358954845256215, + "grad_norm": 6.25, + "learning_rate": 2.5554163369250194e-06, + "loss": 1.67772903, + "memory(GiB)": 117.38, + "step": 53570, + "train_speed(iter/s)": 1.636878 + }, + { + "acc": 0.66940522, + "epoch": 1.3590816844241502, + "grad_norm": 6.03125, + "learning_rate": 2.5545016443186867e-06, + "loss": 1.53773775, + "memory(GiB)": 117.38, + "step": 53575, + "train_speed(iter/s)": 1.636895 + }, + { + "acc": 0.65389729, + "epoch": 1.3592085235920852, + "grad_norm": 6.0625, + "learning_rate": 2.55358705927629e-06, + "loss": 1.58248663, + "memory(GiB)": 117.38, + "step": 53580, + "train_speed(iter/s)": 1.63691 + }, + { + "acc": 0.64850359, + "epoch": 1.3593353627600204, + "grad_norm": 5.09375, + "learning_rate": 2.552672581838055e-06, + "loss": 1.62079391, + "memory(GiB)": 117.38, + "step": 53585, + "train_speed(iter/s)": 1.636927 + }, + { + "acc": 0.68043618, + "epoch": 1.3594622019279554, + "grad_norm": 6.25, + "learning_rate": 2.5517582120442095e-06, + "loss": 1.54785957, + "memory(GiB)": 117.38, + "step": 53590, + "train_speed(iter/s)": 1.636944 + }, + { + "acc": 0.64452925, + "epoch": 1.3595890410958904, + "grad_norm": 5.6875, + "learning_rate": 2.5508439499349675e-06, + "loss": 1.61808472, + "memory(GiB)": 117.38, + "step": 53595, + "train_speed(iter/s)": 1.636961 + }, + { + "acc": 0.6411911, + "epoch": 1.3597158802638254, + "grad_norm": 5.375, + "learning_rate": 2.549929795550541e-06, + "loss": 1.62348404, + "memory(GiB)": 117.38, + "step": 53600, + "train_speed(iter/s)": 1.636978 + }, + { + "acc": 0.66476412, + "epoch": 1.3598427194317606, + "grad_norm": 7.28125, + "learning_rate": 2.549015748931143e-06, + "loss": 1.56165237, + "memory(GiB)": 117.38, + "step": 53605, + "train_speed(iter/s)": 1.636994 + }, + { + "acc": 0.66282101, + "epoch": 1.3599695585996956, + "grad_norm": 5.875, + "learning_rate": 2.5481018101169763e-06, + "loss": 1.59731636, + "memory(GiB)": 117.38, + "step": 53610, + "train_speed(iter/s)": 1.637011 + }, + { + "acc": 0.65457664, + "epoch": 1.3600963977676306, + "grad_norm": 6.40625, + "learning_rate": 2.547187979148238e-06, + "loss": 1.56694136, + "memory(GiB)": 117.38, + "step": 53615, + "train_speed(iter/s)": 1.637029 + }, + { + "acc": 0.66665554, + "epoch": 1.3602232369355658, + "grad_norm": 4.875, + "learning_rate": 2.546274256065121e-06, + "loss": 1.59870129, + "memory(GiB)": 117.38, + "step": 53620, + "train_speed(iter/s)": 1.637047 + }, + { + "acc": 0.65498114, + "epoch": 1.3603500761035008, + "grad_norm": 5.625, + "learning_rate": 2.545360640907819e-06, + "loss": 1.59269962, + "memory(GiB)": 117.38, + "step": 53625, + "train_speed(iter/s)": 1.637064 + }, + { + "acc": 0.64923649, + "epoch": 1.3604769152714358, + "grad_norm": 5.8125, + "learning_rate": 2.544447133716518e-06, + "loss": 1.56173363, + "memory(GiB)": 117.38, + "step": 53630, + "train_speed(iter/s)": 1.637079 + }, + { + "acc": 0.637815, + "epoch": 1.3606037544393708, + "grad_norm": 5.21875, + "learning_rate": 2.5435337345313904e-06, + "loss": 1.64669476, + "memory(GiB)": 117.38, + "step": 53635, + "train_speed(iter/s)": 1.637095 + }, + { + "acc": 0.64696646, + "epoch": 1.360730593607306, + "grad_norm": 6.28125, + "learning_rate": 2.5426204433926194e-06, + "loss": 1.5872736, + "memory(GiB)": 117.38, + "step": 53640, + "train_speed(iter/s)": 1.637111 + }, + { + "acc": 0.64977741, + "epoch": 1.360857432775241, + "grad_norm": 5.28125, + "learning_rate": 2.541707260340372e-06, + "loss": 1.65675945, + "memory(GiB)": 117.38, + "step": 53645, + "train_speed(iter/s)": 1.637127 + }, + { + "acc": 0.66039391, + "epoch": 1.3609842719431762, + "grad_norm": 5.125, + "learning_rate": 2.5407941854148156e-06, + "loss": 1.61841202, + "memory(GiB)": 117.38, + "step": 53650, + "train_speed(iter/s)": 1.637144 + }, + { + "acc": 0.64172859, + "epoch": 1.3611111111111112, + "grad_norm": 5.65625, + "learning_rate": 2.5398812186561095e-06, + "loss": 1.67692986, + "memory(GiB)": 117.38, + "step": 53655, + "train_speed(iter/s)": 1.63716 + }, + { + "acc": 0.64275894, + "epoch": 1.3612379502790461, + "grad_norm": 4.875, + "learning_rate": 2.5389683601044114e-06, + "loss": 1.61739941, + "memory(GiB)": 117.38, + "step": 53660, + "train_speed(iter/s)": 1.637177 + }, + { + "acc": 0.65812073, + "epoch": 1.3613647894469811, + "grad_norm": 6.46875, + "learning_rate": 2.538055609799873e-06, + "loss": 1.59457846, + "memory(GiB)": 117.38, + "step": 53665, + "train_speed(iter/s)": 1.637194 + }, + { + "acc": 0.65167179, + "epoch": 1.3614916286149163, + "grad_norm": 5.625, + "learning_rate": 2.5371429677826397e-06, + "loss": 1.59267788, + "memory(GiB)": 117.38, + "step": 53670, + "train_speed(iter/s)": 1.637211 + }, + { + "acc": 0.68152027, + "epoch": 1.3616184677828513, + "grad_norm": 6.15625, + "learning_rate": 2.5362304340928556e-06, + "loss": 1.51827164, + "memory(GiB)": 117.38, + "step": 53675, + "train_speed(iter/s)": 1.637228 + }, + { + "acc": 0.64996185, + "epoch": 1.3617453069507863, + "grad_norm": 6.125, + "learning_rate": 2.535318008770656e-06, + "loss": 1.6348875, + "memory(GiB)": 117.38, + "step": 53680, + "train_speed(iter/s)": 1.637244 + }, + { + "acc": 0.66188283, + "epoch": 1.3618721461187215, + "grad_norm": 5.53125, + "learning_rate": 2.534405691856175e-06, + "loss": 1.57440948, + "memory(GiB)": 117.38, + "step": 53685, + "train_speed(iter/s)": 1.63726 + }, + { + "acc": 0.6661376, + "epoch": 1.3619989852866565, + "grad_norm": 5.1875, + "learning_rate": 2.5334934833895396e-06, + "loss": 1.54691305, + "memory(GiB)": 117.38, + "step": 53690, + "train_speed(iter/s)": 1.637276 + }, + { + "acc": 0.65001383, + "epoch": 1.3621258244545915, + "grad_norm": 5.0, + "learning_rate": 2.5325813834108724e-06, + "loss": 1.61841278, + "memory(GiB)": 117.38, + "step": 53695, + "train_speed(iter/s)": 1.637292 + }, + { + "acc": 0.6714757, + "epoch": 1.3622526636225265, + "grad_norm": 5.1875, + "learning_rate": 2.531669391960293e-06, + "loss": 1.57934675, + "memory(GiB)": 117.38, + "step": 53700, + "train_speed(iter/s)": 1.637309 + }, + { + "acc": 0.65481548, + "epoch": 1.3623795027904617, + "grad_norm": 5.03125, + "learning_rate": 2.5307575090779125e-06, + "loss": 1.65041409, + "memory(GiB)": 117.38, + "step": 53705, + "train_speed(iter/s)": 1.637325 + }, + { + "acc": 0.65153427, + "epoch": 1.3625063419583967, + "grad_norm": 6.15625, + "learning_rate": 2.529845734803844e-06, + "loss": 1.59865665, + "memory(GiB)": 117.38, + "step": 53710, + "train_speed(iter/s)": 1.637341 + }, + { + "acc": 0.66214414, + "epoch": 1.362633181126332, + "grad_norm": 6.625, + "learning_rate": 2.5289340691781872e-06, + "loss": 1.65031738, + "memory(GiB)": 117.38, + "step": 53715, + "train_speed(iter/s)": 1.637359 + }, + { + "acc": 0.65672021, + "epoch": 1.362760020294267, + "grad_norm": 6.28125, + "learning_rate": 2.528022512241042e-06, + "loss": 1.57944927, + "memory(GiB)": 117.38, + "step": 53720, + "train_speed(iter/s)": 1.637376 + }, + { + "acc": 0.66379671, + "epoch": 1.362886859462202, + "grad_norm": 5.53125, + "learning_rate": 2.5271110640325013e-06, + "loss": 1.57880936, + "memory(GiB)": 117.38, + "step": 53725, + "train_speed(iter/s)": 1.637392 + }, + { + "acc": 0.65716343, + "epoch": 1.3630136986301369, + "grad_norm": 5.0625, + "learning_rate": 2.5261997245926612e-06, + "loss": 1.59805593, + "memory(GiB)": 117.38, + "step": 53730, + "train_speed(iter/s)": 1.637408 + }, + { + "acc": 0.64645138, + "epoch": 1.363140537798072, + "grad_norm": 5.875, + "learning_rate": 2.5252884939615995e-06, + "loss": 1.62116013, + "memory(GiB)": 117.38, + "step": 53735, + "train_speed(iter/s)": 1.637424 + }, + { + "acc": 0.65044165, + "epoch": 1.363267376966007, + "grad_norm": 4.6875, + "learning_rate": 2.5243773721793973e-06, + "loss": 1.59862518, + "memory(GiB)": 117.38, + "step": 53740, + "train_speed(iter/s)": 1.637441 + }, + { + "acc": 0.66431432, + "epoch": 1.3633942161339423, + "grad_norm": 6.5, + "learning_rate": 2.5234663592861325e-06, + "loss": 1.55725327, + "memory(GiB)": 117.38, + "step": 53745, + "train_speed(iter/s)": 1.637457 + }, + { + "acc": 0.65912647, + "epoch": 1.3635210553018773, + "grad_norm": 5.75, + "learning_rate": 2.522555455321876e-06, + "loss": 1.57857447, + "memory(GiB)": 117.38, + "step": 53750, + "train_speed(iter/s)": 1.637473 + }, + { + "acc": 0.65796304, + "epoch": 1.3636478944698123, + "grad_norm": 6.4375, + "learning_rate": 2.52164466032669e-06, + "loss": 1.56846848, + "memory(GiB)": 117.38, + "step": 53755, + "train_speed(iter/s)": 1.63749 + }, + { + "acc": 0.67043819, + "epoch": 1.3637747336377473, + "grad_norm": 4.96875, + "learning_rate": 2.5207339743406344e-06, + "loss": 1.61434784, + "memory(GiB)": 117.38, + "step": 53760, + "train_speed(iter/s)": 1.637507 + }, + { + "acc": 0.63644285, + "epoch": 1.3639015728056825, + "grad_norm": 4.46875, + "learning_rate": 2.5198233974037705e-06, + "loss": 1.60361748, + "memory(GiB)": 117.38, + "step": 53765, + "train_speed(iter/s)": 1.637523 + }, + { + "acc": 0.64687481, + "epoch": 1.3640284119736175, + "grad_norm": 6.5, + "learning_rate": 2.5189129295561486e-06, + "loss": 1.64974556, + "memory(GiB)": 117.38, + "step": 53770, + "train_speed(iter/s)": 1.63754 + }, + { + "acc": 0.65588765, + "epoch": 1.3641552511415524, + "grad_norm": 5.03125, + "learning_rate": 2.518002570837809e-06, + "loss": 1.60140057, + "memory(GiB)": 117.38, + "step": 53775, + "train_speed(iter/s)": 1.637557 + }, + { + "acc": 0.64455504, + "epoch": 1.3642820903094877, + "grad_norm": 6.59375, + "learning_rate": 2.5170923212887997e-06, + "loss": 1.70492039, + "memory(GiB)": 117.38, + "step": 53780, + "train_speed(iter/s)": 1.637574 + }, + { + "acc": 0.68089471, + "epoch": 1.3644089294774226, + "grad_norm": 5.75, + "learning_rate": 2.5161821809491554e-06, + "loss": 1.55858107, + "memory(GiB)": 117.38, + "step": 53785, + "train_speed(iter/s)": 1.637591 + }, + { + "acc": 0.66361413, + "epoch": 1.3645357686453576, + "grad_norm": 5.46875, + "learning_rate": 2.5152721498589104e-06, + "loss": 1.57757607, + "memory(GiB)": 117.38, + "step": 53790, + "train_speed(iter/s)": 1.637607 + }, + { + "acc": 0.64991264, + "epoch": 1.3646626078132926, + "grad_norm": 7.875, + "learning_rate": 2.514362228058086e-06, + "loss": 1.62747116, + "memory(GiB)": 117.38, + "step": 53795, + "train_speed(iter/s)": 1.637623 + }, + { + "acc": 0.65238428, + "epoch": 1.3647894469812278, + "grad_norm": 6.3125, + "learning_rate": 2.51345241558671e-06, + "loss": 1.64885216, + "memory(GiB)": 117.38, + "step": 53800, + "train_speed(iter/s)": 1.63764 + }, + { + "acc": 0.65161285, + "epoch": 1.3649162861491628, + "grad_norm": 5.875, + "learning_rate": 2.5125427124847985e-06, + "loss": 1.67882156, + "memory(GiB)": 117.38, + "step": 53805, + "train_speed(iter/s)": 1.637658 + }, + { + "acc": 0.66031466, + "epoch": 1.365043125317098, + "grad_norm": 6.03125, + "learning_rate": 2.5116331187923645e-06, + "loss": 1.55103235, + "memory(GiB)": 117.38, + "step": 53810, + "train_speed(iter/s)": 1.637674 + }, + { + "acc": 0.65867724, + "epoch": 1.365169964485033, + "grad_norm": 6.28125, + "learning_rate": 2.510723634549415e-06, + "loss": 1.60701294, + "memory(GiB)": 117.38, + "step": 53815, + "train_speed(iter/s)": 1.63769 + }, + { + "acc": 0.66362724, + "epoch": 1.365296803652968, + "grad_norm": 6.84375, + "learning_rate": 2.509814259795954e-06, + "loss": 1.59920597, + "memory(GiB)": 117.38, + "step": 53820, + "train_speed(iter/s)": 1.637706 + }, + { + "acc": 0.66874352, + "epoch": 1.365423642820903, + "grad_norm": 6.03125, + "learning_rate": 2.50890499457198e-06, + "loss": 1.58843727, + "memory(GiB)": 117.38, + "step": 53825, + "train_speed(iter/s)": 1.637723 + }, + { + "acc": 0.67149086, + "epoch": 1.3655504819888382, + "grad_norm": 5.4375, + "learning_rate": 2.5079958389174865e-06, + "loss": 1.51825161, + "memory(GiB)": 117.38, + "step": 53830, + "train_speed(iter/s)": 1.637738 + }, + { + "acc": 0.64371719, + "epoch": 1.3656773211567732, + "grad_norm": 6.5, + "learning_rate": 2.5070867928724618e-06, + "loss": 1.6810482, + "memory(GiB)": 117.38, + "step": 53835, + "train_speed(iter/s)": 1.637754 + }, + { + "acc": 0.65660286, + "epoch": 1.3658041603247082, + "grad_norm": 5.625, + "learning_rate": 2.50617785647689e-06, + "loss": 1.55161934, + "memory(GiB)": 117.38, + "step": 53840, + "train_speed(iter/s)": 1.637771 + }, + { + "acc": 0.65005198, + "epoch": 1.3659309994926434, + "grad_norm": 7.09375, + "learning_rate": 2.5052690297707506e-06, + "loss": 1.58277245, + "memory(GiB)": 117.38, + "step": 53845, + "train_speed(iter/s)": 1.637787 + }, + { + "acc": 0.66490679, + "epoch": 1.3660578386605784, + "grad_norm": 6.4375, + "learning_rate": 2.5043603127940164e-06, + "loss": 1.48484783, + "memory(GiB)": 117.38, + "step": 53850, + "train_speed(iter/s)": 1.637804 + }, + { + "acc": 0.65216084, + "epoch": 1.3661846778285134, + "grad_norm": 6.5625, + "learning_rate": 2.503451705586659e-06, + "loss": 1.61127319, + "memory(GiB)": 117.38, + "step": 53855, + "train_speed(iter/s)": 1.63782 + }, + { + "acc": 0.66695738, + "epoch": 1.3663115169964484, + "grad_norm": 5.3125, + "learning_rate": 2.5025432081886412e-06, + "loss": 1.54849424, + "memory(GiB)": 117.38, + "step": 53860, + "train_speed(iter/s)": 1.637837 + }, + { + "acc": 0.65645399, + "epoch": 1.3664383561643836, + "grad_norm": 6.40625, + "learning_rate": 2.5016348206399215e-06, + "loss": 1.57486687, + "memory(GiB)": 117.38, + "step": 53865, + "train_speed(iter/s)": 1.637854 + }, + { + "acc": 0.66213379, + "epoch": 1.3665651953323186, + "grad_norm": 6.6875, + "learning_rate": 2.500726542980461e-06, + "loss": 1.6468626, + "memory(GiB)": 117.38, + "step": 53870, + "train_speed(iter/s)": 1.63787 + }, + { + "acc": 0.65925865, + "epoch": 1.3666920345002538, + "grad_norm": 5.5, + "learning_rate": 2.499818375250204e-06, + "loss": 1.59423265, + "memory(GiB)": 117.38, + "step": 53875, + "train_speed(iter/s)": 1.637885 + }, + { + "acc": 0.65358038, + "epoch": 1.3668188736681888, + "grad_norm": 7.0, + "learning_rate": 2.4989103174890946e-06, + "loss": 1.63917847, + "memory(GiB)": 117.38, + "step": 53880, + "train_speed(iter/s)": 1.637901 + }, + { + "acc": 0.65485501, + "epoch": 1.3669457128361238, + "grad_norm": 4.96875, + "learning_rate": 2.498002369737078e-06, + "loss": 1.64345894, + "memory(GiB)": 117.38, + "step": 53885, + "train_speed(iter/s)": 1.637916 + }, + { + "acc": 0.66022215, + "epoch": 1.3670725520040587, + "grad_norm": 7.0625, + "learning_rate": 2.49709453203409e-06, + "loss": 1.58046474, + "memory(GiB)": 117.38, + "step": 53890, + "train_speed(iter/s)": 1.637933 + }, + { + "acc": 0.65218558, + "epoch": 1.367199391171994, + "grad_norm": 5.0, + "learning_rate": 2.496186804420057e-06, + "loss": 1.62577248, + "memory(GiB)": 117.38, + "step": 53895, + "train_speed(iter/s)": 1.637948 + }, + { + "acc": 0.65749688, + "epoch": 1.367326230339929, + "grad_norm": 4.65625, + "learning_rate": 2.4952791869349056e-06, + "loss": 1.61458359, + "memory(GiB)": 117.38, + "step": 53900, + "train_speed(iter/s)": 1.637965 + }, + { + "acc": 0.66395006, + "epoch": 1.3674530695078642, + "grad_norm": 8.4375, + "learning_rate": 2.4943716796185603e-06, + "loss": 1.59068127, + "memory(GiB)": 117.38, + "step": 53905, + "train_speed(iter/s)": 1.637981 + }, + { + "acc": 0.65934453, + "epoch": 1.3675799086757991, + "grad_norm": 6.0625, + "learning_rate": 2.493464282510937e-06, + "loss": 1.65735378, + "memory(GiB)": 117.38, + "step": 53910, + "train_speed(iter/s)": 1.637999 + }, + { + "acc": 0.67474036, + "epoch": 1.3677067478437341, + "grad_norm": 5.84375, + "learning_rate": 2.4925569956519414e-06, + "loss": 1.548738, + "memory(GiB)": 117.38, + "step": 53915, + "train_speed(iter/s)": 1.638014 + }, + { + "acc": 0.6487464, + "epoch": 1.3678335870116691, + "grad_norm": 5.4375, + "learning_rate": 2.491649819081486e-06, + "loss": 1.66526947, + "memory(GiB)": 117.38, + "step": 53920, + "train_speed(iter/s)": 1.638031 + }, + { + "acc": 0.66771212, + "epoch": 1.3679604261796043, + "grad_norm": 5.4375, + "learning_rate": 2.490742752839471e-06, + "loss": 1.63681526, + "memory(GiB)": 117.38, + "step": 53925, + "train_speed(iter/s)": 1.638048 + }, + { + "acc": 0.65810323, + "epoch": 1.3680872653475393, + "grad_norm": 6.90625, + "learning_rate": 2.4898357969657943e-06, + "loss": 1.5605608, + "memory(GiB)": 117.38, + "step": 53930, + "train_speed(iter/s)": 1.638066 + }, + { + "acc": 0.66883926, + "epoch": 1.3682141045154743, + "grad_norm": 5.53125, + "learning_rate": 2.4889289515003425e-06, + "loss": 1.58948851, + "memory(GiB)": 117.38, + "step": 53935, + "train_speed(iter/s)": 1.638082 + }, + { + "acc": 0.67358646, + "epoch": 1.3683409436834095, + "grad_norm": 7.1875, + "learning_rate": 2.4880222164830085e-06, + "loss": 1.56756659, + "memory(GiB)": 117.38, + "step": 53940, + "train_speed(iter/s)": 1.6381 + }, + { + "acc": 0.66199064, + "epoch": 1.3684677828513445, + "grad_norm": 4.9375, + "learning_rate": 2.4871155919536725e-06, + "loss": 1.59989166, + "memory(GiB)": 117.38, + "step": 53945, + "train_speed(iter/s)": 1.638116 + }, + { + "acc": 0.6757699, + "epoch": 1.3685946220192795, + "grad_norm": 7.96875, + "learning_rate": 2.486209077952212e-06, + "loss": 1.54847317, + "memory(GiB)": 117.38, + "step": 53950, + "train_speed(iter/s)": 1.638131 + }, + { + "acc": 0.66980858, + "epoch": 1.3687214611872145, + "grad_norm": 8.5, + "learning_rate": 2.4853026745185e-06, + "loss": 1.59327583, + "memory(GiB)": 117.38, + "step": 53955, + "train_speed(iter/s)": 1.638147 + }, + { + "acc": 0.66415458, + "epoch": 1.3688483003551497, + "grad_norm": 5.125, + "learning_rate": 2.4843963816924035e-06, + "loss": 1.5571785, + "memory(GiB)": 117.38, + "step": 53960, + "train_speed(iter/s)": 1.638162 + }, + { + "acc": 0.65693302, + "epoch": 1.3689751395230847, + "grad_norm": 5.15625, + "learning_rate": 2.483490199513785e-06, + "loss": 1.55262871, + "memory(GiB)": 117.38, + "step": 53965, + "train_speed(iter/s)": 1.638178 + }, + { + "acc": 0.67094212, + "epoch": 1.36910197869102, + "grad_norm": 7.09375, + "learning_rate": 2.4825841280225033e-06, + "loss": 1.60100746, + "memory(GiB)": 117.38, + "step": 53970, + "train_speed(iter/s)": 1.638195 + }, + { + "acc": 0.64782677, + "epoch": 1.369228817858955, + "grad_norm": 5.71875, + "learning_rate": 2.4816781672584107e-06, + "loss": 1.62448101, + "memory(GiB)": 117.38, + "step": 53975, + "train_speed(iter/s)": 1.638212 + }, + { + "acc": 0.64191036, + "epoch": 1.3693556570268899, + "grad_norm": 5.4375, + "learning_rate": 2.480772317261356e-06, + "loss": 1.62860718, + "memory(GiB)": 117.38, + "step": 53980, + "train_speed(iter/s)": 1.638227 + }, + { + "acc": 0.65724363, + "epoch": 1.3694824961948249, + "grad_norm": 5.75, + "learning_rate": 2.479866578071183e-06, + "loss": 1.60973969, + "memory(GiB)": 117.38, + "step": 53985, + "train_speed(iter/s)": 1.638244 + }, + { + "acc": 0.65571737, + "epoch": 1.36960933536276, + "grad_norm": 7.96875, + "learning_rate": 2.4789609497277284e-06, + "loss": 1.56727686, + "memory(GiB)": 117.38, + "step": 53990, + "train_speed(iter/s)": 1.638261 + }, + { + "acc": 0.65684423, + "epoch": 1.369736174530695, + "grad_norm": 6.28125, + "learning_rate": 2.478055432270828e-06, + "loss": 1.60636482, + "memory(GiB)": 117.38, + "step": 53995, + "train_speed(iter/s)": 1.638277 + }, + { + "acc": 0.69125848, + "epoch": 1.36986301369863, + "grad_norm": 4.625, + "learning_rate": 2.4771500257403086e-06, + "loss": 1.42107573, + "memory(GiB)": 117.38, + "step": 54000, + "train_speed(iter/s)": 1.638292 + }, + { + "epoch": 1.36986301369863, + "eval_acc": 0.646273522157972, + "eval_loss": 1.5731202363967896, + "eval_runtime": 58.4127, + "eval_samples_per_second": 109.052, + "eval_steps_per_second": 27.271, + "step": 54000 + }, + { + "acc": 0.6596487, + "epoch": 1.3699898528665653, + "grad_norm": 4.9375, + "learning_rate": 2.476244730175993e-06, + "loss": 1.59097767, + "memory(GiB)": 117.38, + "step": 54005, + "train_speed(iter/s)": 1.635195 + }, + { + "acc": 0.65782223, + "epoch": 1.3701166920345003, + "grad_norm": 5.90625, + "learning_rate": 2.4753395456177056e-06, + "loss": 1.6168663, + "memory(GiB)": 117.38, + "step": 54010, + "train_speed(iter/s)": 1.635211 + }, + { + "acc": 0.64816422, + "epoch": 1.3702435312024352, + "grad_norm": 5.25, + "learning_rate": 2.474434472105255e-06, + "loss": 1.63149796, + "memory(GiB)": 117.38, + "step": 54015, + "train_speed(iter/s)": 1.635228 + }, + { + "acc": 0.6532033, + "epoch": 1.3703703703703702, + "grad_norm": 5.96875, + "learning_rate": 2.473529509678452e-06, + "loss": 1.57334414, + "memory(GiB)": 117.38, + "step": 54020, + "train_speed(iter/s)": 1.635245 + }, + { + "acc": 0.65062523, + "epoch": 1.3704972095383054, + "grad_norm": 5.9375, + "learning_rate": 2.4726246583770996e-06, + "loss": 1.65277958, + "memory(GiB)": 117.38, + "step": 54025, + "train_speed(iter/s)": 1.635262 + }, + { + "acc": 0.65357847, + "epoch": 1.3706240487062404, + "grad_norm": 5.0, + "learning_rate": 2.4717199182410025e-06, + "loss": 1.54226589, + "memory(GiB)": 117.38, + "step": 54030, + "train_speed(iter/s)": 1.635278 + }, + { + "acc": 0.67414503, + "epoch": 1.3707508878741756, + "grad_norm": 7.46875, + "learning_rate": 2.4708152893099493e-06, + "loss": 1.57174397, + "memory(GiB)": 117.38, + "step": 54035, + "train_speed(iter/s)": 1.635294 + }, + { + "acc": 0.66791396, + "epoch": 1.3708777270421106, + "grad_norm": 7.3125, + "learning_rate": 2.4699107716237293e-06, + "loss": 1.55494394, + "memory(GiB)": 117.38, + "step": 54040, + "train_speed(iter/s)": 1.63531 + }, + { + "acc": 0.66002736, + "epoch": 1.3710045662100456, + "grad_norm": 4.84375, + "learning_rate": 2.469006365222132e-06, + "loss": 1.55516348, + "memory(GiB)": 117.38, + "step": 54045, + "train_speed(iter/s)": 1.635325 + }, + { + "acc": 0.64921136, + "epoch": 1.3711314053779806, + "grad_norm": 6.3125, + "learning_rate": 2.4681020701449365e-06, + "loss": 1.60622787, + "memory(GiB)": 117.38, + "step": 54050, + "train_speed(iter/s)": 1.635341 + }, + { + "acc": 0.64892731, + "epoch": 1.3712582445459158, + "grad_norm": 5.5625, + "learning_rate": 2.4671978864319123e-06, + "loss": 1.60169106, + "memory(GiB)": 117.38, + "step": 54055, + "train_speed(iter/s)": 1.635358 + }, + { + "acc": 0.65513239, + "epoch": 1.3713850837138508, + "grad_norm": 6.40625, + "learning_rate": 2.466293814122835e-06, + "loss": 1.64919014, + "memory(GiB)": 117.38, + "step": 54060, + "train_speed(iter/s)": 1.635374 + }, + { + "acc": 0.67323503, + "epoch": 1.371511922881786, + "grad_norm": 6.875, + "learning_rate": 2.4653898532574684e-06, + "loss": 1.53550606, + "memory(GiB)": 117.38, + "step": 54065, + "train_speed(iter/s)": 1.635391 + }, + { + "acc": 0.66631775, + "epoch": 1.371638762049721, + "grad_norm": 5.6875, + "learning_rate": 2.4644860038755737e-06, + "loss": 1.58006268, + "memory(GiB)": 117.38, + "step": 54070, + "train_speed(iter/s)": 1.635407 + }, + { + "acc": 0.64985061, + "epoch": 1.371765601217656, + "grad_norm": 7.5625, + "learning_rate": 2.4635822660169007e-06, + "loss": 1.6397522, + "memory(GiB)": 117.38, + "step": 54075, + "train_speed(iter/s)": 1.635424 + }, + { + "acc": 0.65005298, + "epoch": 1.371892440385591, + "grad_norm": 6.9375, + "learning_rate": 2.4626786397212065e-06, + "loss": 1.67704964, + "memory(GiB)": 117.38, + "step": 54080, + "train_speed(iter/s)": 1.63544 + }, + { + "acc": 0.6678525, + "epoch": 1.3720192795535262, + "grad_norm": 7.25, + "learning_rate": 2.461775125028234e-06, + "loss": 1.61338387, + "memory(GiB)": 117.38, + "step": 54085, + "train_speed(iter/s)": 1.635456 + }, + { + "acc": 0.63961186, + "epoch": 1.3721461187214612, + "grad_norm": 8.4375, + "learning_rate": 2.4608717219777236e-06, + "loss": 1.65973167, + "memory(GiB)": 117.38, + "step": 54090, + "train_speed(iter/s)": 1.635472 + }, + { + "acc": 0.65916772, + "epoch": 1.3722729578893962, + "grad_norm": 5.5625, + "learning_rate": 2.459968430609411e-06, + "loss": 1.58463268, + "memory(GiB)": 117.38, + "step": 54095, + "train_speed(iter/s)": 1.635487 + }, + { + "acc": 0.65970016, + "epoch": 1.3723997970573314, + "grad_norm": 5.78125, + "learning_rate": 2.459065250963028e-06, + "loss": 1.54564285, + "memory(GiB)": 117.38, + "step": 54100, + "train_speed(iter/s)": 1.635504 + }, + { + "acc": 0.6704771, + "epoch": 1.3725266362252664, + "grad_norm": 5.84375, + "learning_rate": 2.458162183078299e-06, + "loss": 1.56549339, + "memory(GiB)": 117.38, + "step": 54105, + "train_speed(iter/s)": 1.63552 + }, + { + "acc": 0.67744837, + "epoch": 1.3726534753932014, + "grad_norm": 5.875, + "learning_rate": 2.4572592269949464e-06, + "loss": 1.47730169, + "memory(GiB)": 117.38, + "step": 54110, + "train_speed(iter/s)": 1.635536 + }, + { + "acc": 0.65629334, + "epoch": 1.3727803145611364, + "grad_norm": 5.90625, + "learning_rate": 2.4563563827526848e-06, + "loss": 1.59929123, + "memory(GiB)": 117.38, + "step": 54115, + "train_speed(iter/s)": 1.635552 + }, + { + "acc": 0.64446516, + "epoch": 1.3729071537290716, + "grad_norm": 6.3125, + "learning_rate": 2.455453650391226e-06, + "loss": 1.5962491, + "memory(GiB)": 117.38, + "step": 54120, + "train_speed(iter/s)": 1.635569 + }, + { + "acc": 0.66375628, + "epoch": 1.3730339928970066, + "grad_norm": 5.6875, + "learning_rate": 2.454551029950277e-06, + "loss": 1.60608788, + "memory(GiB)": 117.38, + "step": 54125, + "train_speed(iter/s)": 1.635586 + }, + { + "acc": 0.66493359, + "epoch": 1.3731608320649418, + "grad_norm": 5.3125, + "learning_rate": 2.4536485214695377e-06, + "loss": 1.51064301, + "memory(GiB)": 117.38, + "step": 54130, + "train_speed(iter/s)": 1.635603 + }, + { + "acc": 0.65660586, + "epoch": 1.3732876712328768, + "grad_norm": 4.59375, + "learning_rate": 2.4527461249887054e-06, + "loss": 1.5727293, + "memory(GiB)": 117.38, + "step": 54135, + "train_speed(iter/s)": 1.635619 + }, + { + "acc": 0.65723758, + "epoch": 1.3734145104008117, + "grad_norm": 5.34375, + "learning_rate": 2.451843840547471e-06, + "loss": 1.57690506, + "memory(GiB)": 117.38, + "step": 54140, + "train_speed(iter/s)": 1.635637 + }, + { + "acc": 0.65991144, + "epoch": 1.3735413495687467, + "grad_norm": 5.6875, + "learning_rate": 2.4509416681855193e-06, + "loss": 1.63394127, + "memory(GiB)": 117.38, + "step": 54145, + "train_speed(iter/s)": 1.635649 + }, + { + "acc": 0.65920591, + "epoch": 1.373668188736682, + "grad_norm": 6.375, + "learning_rate": 2.4500396079425377e-06, + "loss": 1.62301826, + "memory(GiB)": 117.38, + "step": 54150, + "train_speed(iter/s)": 1.635665 + }, + { + "acc": 0.65910325, + "epoch": 1.373795027904617, + "grad_norm": 7.15625, + "learning_rate": 2.4491376598581967e-06, + "loss": 1.60065994, + "memory(GiB)": 117.38, + "step": 54155, + "train_speed(iter/s)": 1.635681 + }, + { + "acc": 0.65766726, + "epoch": 1.373921867072552, + "grad_norm": 6.96875, + "learning_rate": 2.4482358239721704e-06, + "loss": 1.62270088, + "memory(GiB)": 117.38, + "step": 54160, + "train_speed(iter/s)": 1.635696 + }, + { + "acc": 0.66118236, + "epoch": 1.3740487062404871, + "grad_norm": 5.71875, + "learning_rate": 2.4473341003241234e-06, + "loss": 1.60639324, + "memory(GiB)": 117.38, + "step": 54165, + "train_speed(iter/s)": 1.635712 + }, + { + "acc": 0.6666163, + "epoch": 1.3741755454084221, + "grad_norm": 6.125, + "learning_rate": 2.446432488953724e-06, + "loss": 1.46311569, + "memory(GiB)": 117.38, + "step": 54170, + "train_speed(iter/s)": 1.635729 + }, + { + "acc": 0.64757438, + "epoch": 1.3743023845763571, + "grad_norm": 6.9375, + "learning_rate": 2.445530989900622e-06, + "loss": 1.58069305, + "memory(GiB)": 117.38, + "step": 54175, + "train_speed(iter/s)": 1.635744 + }, + { + "acc": 0.65932388, + "epoch": 1.374429223744292, + "grad_norm": 6.375, + "learning_rate": 2.4446296032044697e-06, + "loss": 1.6389431, + "memory(GiB)": 117.38, + "step": 54180, + "train_speed(iter/s)": 1.635761 + }, + { + "acc": 0.65855026, + "epoch": 1.3745560629122273, + "grad_norm": 6.0, + "learning_rate": 2.443728328904919e-06, + "loss": 1.57429266, + "memory(GiB)": 117.38, + "step": 54185, + "train_speed(iter/s)": 1.635778 + }, + { + "acc": 0.64706569, + "epoch": 1.3746829020801623, + "grad_norm": 6.78125, + "learning_rate": 2.442827167041611e-06, + "loss": 1.63452606, + "memory(GiB)": 117.38, + "step": 54190, + "train_speed(iter/s)": 1.635794 + }, + { + "acc": 0.66797485, + "epoch": 1.3748097412480975, + "grad_norm": 6.125, + "learning_rate": 2.441926117654179e-06, + "loss": 1.57827721, + "memory(GiB)": 117.38, + "step": 54195, + "train_speed(iter/s)": 1.635811 + }, + { + "acc": 0.65382614, + "epoch": 1.3749365804160325, + "grad_norm": 6.5, + "learning_rate": 2.4410251807822555e-06, + "loss": 1.66293869, + "memory(GiB)": 117.38, + "step": 54200, + "train_speed(iter/s)": 1.635828 + }, + { + "acc": 0.6648035, + "epoch": 1.3750634195839675, + "grad_norm": 5.71875, + "learning_rate": 2.4401243564654713e-06, + "loss": 1.59469013, + "memory(GiB)": 117.38, + "step": 54205, + "train_speed(iter/s)": 1.635845 + }, + { + "acc": 0.66369085, + "epoch": 1.3751902587519025, + "grad_norm": 5.5625, + "learning_rate": 2.4392236447434494e-06, + "loss": 1.56073608, + "memory(GiB)": 117.38, + "step": 54210, + "train_speed(iter/s)": 1.635862 + }, + { + "acc": 0.66230984, + "epoch": 1.3753170979198377, + "grad_norm": 5.9375, + "learning_rate": 2.4383230456558005e-06, + "loss": 1.5442441, + "memory(GiB)": 117.38, + "step": 54215, + "train_speed(iter/s)": 1.635879 + }, + { + "acc": 0.65729065, + "epoch": 1.3754439370877727, + "grad_norm": 6.4375, + "learning_rate": 2.437422559242143e-06, + "loss": 1.56194792, + "memory(GiB)": 117.38, + "step": 54220, + "train_speed(iter/s)": 1.635895 + }, + { + "acc": 0.66322613, + "epoch": 1.375570776255708, + "grad_norm": 5.59375, + "learning_rate": 2.4365221855420822e-06, + "loss": 1.61534748, + "memory(GiB)": 117.38, + "step": 54225, + "train_speed(iter/s)": 1.635913 + }, + { + "acc": 0.65688257, + "epoch": 1.3756976154236429, + "grad_norm": 5.71875, + "learning_rate": 2.435621924595221e-06, + "loss": 1.56321974, + "memory(GiB)": 117.38, + "step": 54230, + "train_speed(iter/s)": 1.63593 + }, + { + "acc": 0.6661449, + "epoch": 1.3758244545915779, + "grad_norm": 5.6875, + "learning_rate": 2.4347217764411567e-06, + "loss": 1.65239201, + "memory(GiB)": 117.38, + "step": 54235, + "train_speed(iter/s)": 1.635946 + }, + { + "acc": 0.66999512, + "epoch": 1.3759512937595129, + "grad_norm": 6.09375, + "learning_rate": 2.433821741119482e-06, + "loss": 1.53836136, + "memory(GiB)": 117.38, + "step": 54240, + "train_speed(iter/s)": 1.635964 + }, + { + "acc": 0.64185319, + "epoch": 1.376078132927448, + "grad_norm": 6.875, + "learning_rate": 2.432921818669784e-06, + "loss": 1.63987522, + "memory(GiB)": 117.38, + "step": 54245, + "train_speed(iter/s)": 1.63598 + }, + { + "acc": 0.67233548, + "epoch": 1.376204972095383, + "grad_norm": 5.3125, + "learning_rate": 2.432022009131646e-06, + "loss": 1.55459204, + "memory(GiB)": 117.38, + "step": 54250, + "train_speed(iter/s)": 1.635997 + }, + { + "acc": 0.64748497, + "epoch": 1.376331811263318, + "grad_norm": 5.71875, + "learning_rate": 2.4311223125446447e-06, + "loss": 1.68569221, + "memory(GiB)": 117.38, + "step": 54255, + "train_speed(iter/s)": 1.636014 + }, + { + "acc": 0.64551373, + "epoch": 1.3764586504312533, + "grad_norm": 5.1875, + "learning_rate": 2.4302227289483537e-06, + "loss": 1.67029037, + "memory(GiB)": 117.38, + "step": 54260, + "train_speed(iter/s)": 1.636031 + }, + { + "acc": 0.65856504, + "epoch": 1.3765854895991883, + "grad_norm": 6.5625, + "learning_rate": 2.42932325838234e-06, + "loss": 1.62255516, + "memory(GiB)": 117.38, + "step": 54265, + "train_speed(iter/s)": 1.636048 + }, + { + "acc": 0.66313119, + "epoch": 1.3767123287671232, + "grad_norm": 5.34375, + "learning_rate": 2.4284239008861665e-06, + "loss": 1.58286352, + "memory(GiB)": 117.38, + "step": 54270, + "train_speed(iter/s)": 1.636065 + }, + { + "acc": 0.65616484, + "epoch": 1.3768391679350582, + "grad_norm": 6.5625, + "learning_rate": 2.4275246564993917e-06, + "loss": 1.62755585, + "memory(GiB)": 117.38, + "step": 54275, + "train_speed(iter/s)": 1.636081 + }, + { + "acc": 0.65982361, + "epoch": 1.3769660071029934, + "grad_norm": 5.6875, + "learning_rate": 2.426625525261567e-06, + "loss": 1.55663738, + "memory(GiB)": 117.38, + "step": 54280, + "train_speed(iter/s)": 1.636097 + }, + { + "acc": 0.6573525, + "epoch": 1.3770928462709284, + "grad_norm": 5.0, + "learning_rate": 2.425726507212242e-06, + "loss": 1.60071392, + "memory(GiB)": 117.38, + "step": 54285, + "train_speed(iter/s)": 1.636114 + }, + { + "acc": 0.66005955, + "epoch": 1.3772196854388636, + "grad_norm": 5.46875, + "learning_rate": 2.424827602390958e-06, + "loss": 1.60382023, + "memory(GiB)": 117.38, + "step": 54290, + "train_speed(iter/s)": 1.636132 + }, + { + "acc": 0.64934292, + "epoch": 1.3773465246067986, + "grad_norm": 4.84375, + "learning_rate": 2.4239288108372534e-06, + "loss": 1.59679947, + "memory(GiB)": 117.38, + "step": 54295, + "train_speed(iter/s)": 1.636147 + }, + { + "acc": 0.66654916, + "epoch": 1.3774733637747336, + "grad_norm": 6.65625, + "learning_rate": 2.4230301325906606e-06, + "loss": 1.58921251, + "memory(GiB)": 117.38, + "step": 54300, + "train_speed(iter/s)": 1.636165 + }, + { + "acc": 0.64945903, + "epoch": 1.3776002029426686, + "grad_norm": 5.78125, + "learning_rate": 2.4221315676907066e-06, + "loss": 1.64934788, + "memory(GiB)": 117.38, + "step": 54305, + "train_speed(iter/s)": 1.636182 + }, + { + "acc": 0.65386157, + "epoch": 1.3777270421106038, + "grad_norm": 6.09375, + "learning_rate": 2.4212331161769194e-06, + "loss": 1.6603405, + "memory(GiB)": 117.38, + "step": 54310, + "train_speed(iter/s)": 1.636199 + }, + { + "acc": 0.6477253, + "epoch": 1.3778538812785388, + "grad_norm": 5.6875, + "learning_rate": 2.420334778088811e-06, + "loss": 1.60431728, + "memory(GiB)": 117.38, + "step": 54315, + "train_speed(iter/s)": 1.636216 + }, + { + "acc": 0.66344695, + "epoch": 1.3779807204464738, + "grad_norm": 5.15625, + "learning_rate": 2.4194365534658944e-06, + "loss": 1.60724869, + "memory(GiB)": 117.38, + "step": 54320, + "train_speed(iter/s)": 1.636233 + }, + { + "acc": 0.64429431, + "epoch": 1.378107559614409, + "grad_norm": 6.125, + "learning_rate": 2.4185384423476817e-06, + "loss": 1.67158604, + "memory(GiB)": 117.38, + "step": 54325, + "train_speed(iter/s)": 1.63625 + }, + { + "acc": 0.64293561, + "epoch": 1.378234398782344, + "grad_norm": 6.15625, + "learning_rate": 2.4176404447736758e-06, + "loss": 1.66435394, + "memory(GiB)": 117.38, + "step": 54330, + "train_speed(iter/s)": 1.636268 + }, + { + "acc": 0.64840803, + "epoch": 1.378361237950279, + "grad_norm": 6.59375, + "learning_rate": 2.41674256078337e-06, + "loss": 1.59952517, + "memory(GiB)": 117.38, + "step": 54335, + "train_speed(iter/s)": 1.636285 + }, + { + "acc": 0.64119782, + "epoch": 1.378488077118214, + "grad_norm": 5.59375, + "learning_rate": 2.4158447904162585e-06, + "loss": 1.68831482, + "memory(GiB)": 117.38, + "step": 54340, + "train_speed(iter/s)": 1.636302 + }, + { + "acc": 0.65265064, + "epoch": 1.3786149162861492, + "grad_norm": 6.3125, + "learning_rate": 2.414947133711832e-06, + "loss": 1.60845928, + "memory(GiB)": 117.38, + "step": 54345, + "train_speed(iter/s)": 1.636319 + }, + { + "acc": 0.64305625, + "epoch": 1.3787417554540842, + "grad_norm": 5.375, + "learning_rate": 2.414049590709574e-06, + "loss": 1.60407028, + "memory(GiB)": 117.38, + "step": 54350, + "train_speed(iter/s)": 1.636337 + }, + { + "acc": 0.6422936, + "epoch": 1.3788685946220194, + "grad_norm": 5.84375, + "learning_rate": 2.4131521614489567e-06, + "loss": 1.6680769, + "memory(GiB)": 117.38, + "step": 54355, + "train_speed(iter/s)": 1.636355 + }, + { + "acc": 0.66786118, + "epoch": 1.3789954337899544, + "grad_norm": 5.0625, + "learning_rate": 2.412254845969459e-06, + "loss": 1.52127104, + "memory(GiB)": 117.38, + "step": 54360, + "train_speed(iter/s)": 1.636372 + }, + { + "acc": 0.66609874, + "epoch": 1.3791222729578894, + "grad_norm": 5.0625, + "learning_rate": 2.4113576443105464e-06, + "loss": 1.54702654, + "memory(GiB)": 117.38, + "step": 54365, + "train_speed(iter/s)": 1.63639 + }, + { + "acc": 0.64409099, + "epoch": 1.3792491121258244, + "grad_norm": 5.6875, + "learning_rate": 2.410460556511684e-06, + "loss": 1.60263042, + "memory(GiB)": 117.38, + "step": 54370, + "train_speed(iter/s)": 1.636407 + }, + { + "acc": 0.6562439, + "epoch": 1.3793759512937596, + "grad_norm": 5.71875, + "learning_rate": 2.4095635826123235e-06, + "loss": 1.56911697, + "memory(GiB)": 117.38, + "step": 54375, + "train_speed(iter/s)": 1.636424 + }, + { + "acc": 0.66322141, + "epoch": 1.3795027904616946, + "grad_norm": 5.21875, + "learning_rate": 2.4086667226519245e-06, + "loss": 1.57778282, + "memory(GiB)": 117.38, + "step": 54380, + "train_speed(iter/s)": 1.63644 + }, + { + "acc": 0.65481992, + "epoch": 1.3796296296296298, + "grad_norm": 5.875, + "learning_rate": 2.4077699766699323e-06, + "loss": 1.64075966, + "memory(GiB)": 117.38, + "step": 54385, + "train_speed(iter/s)": 1.636457 + }, + { + "acc": 0.66791763, + "epoch": 1.3797564687975648, + "grad_norm": 4.96875, + "learning_rate": 2.4068733447057903e-06, + "loss": 1.5885067, + "memory(GiB)": 117.38, + "step": 54390, + "train_speed(iter/s)": 1.636475 + }, + { + "acc": 0.62991519, + "epoch": 1.3798833079654997, + "grad_norm": 5.375, + "learning_rate": 2.405976826798936e-06, + "loss": 1.69023056, + "memory(GiB)": 117.38, + "step": 54395, + "train_speed(iter/s)": 1.636493 + }, + { + "acc": 0.66863132, + "epoch": 1.3800101471334347, + "grad_norm": 4.8125, + "learning_rate": 2.405080422988802e-06, + "loss": 1.57402372, + "memory(GiB)": 117.38, + "step": 54400, + "train_speed(iter/s)": 1.63651 + }, + { + "acc": 0.65594616, + "epoch": 1.38013698630137, + "grad_norm": 5.6875, + "learning_rate": 2.404184133314817e-06, + "loss": 1.66446877, + "memory(GiB)": 117.38, + "step": 54405, + "train_speed(iter/s)": 1.636526 + }, + { + "acc": 0.66494308, + "epoch": 1.380263825469305, + "grad_norm": 6.71875, + "learning_rate": 2.4032879578164027e-06, + "loss": 1.59335918, + "memory(GiB)": 117.38, + "step": 54410, + "train_speed(iter/s)": 1.636542 + }, + { + "acc": 0.65673313, + "epoch": 1.38039066463724, + "grad_norm": 5.71875, + "learning_rate": 2.402391896532978e-06, + "loss": 1.58920755, + "memory(GiB)": 117.38, + "step": 54415, + "train_speed(iter/s)": 1.636559 + }, + { + "acc": 0.64633751, + "epoch": 1.3805175038051751, + "grad_norm": 5.59375, + "learning_rate": 2.4014959495039548e-06, + "loss": 1.6979847, + "memory(GiB)": 117.38, + "step": 54420, + "train_speed(iter/s)": 1.636575 + }, + { + "acc": 0.65312424, + "epoch": 1.3806443429731101, + "grad_norm": 5.75, + "learning_rate": 2.4006001167687416e-06, + "loss": 1.63549957, + "memory(GiB)": 117.38, + "step": 54425, + "train_speed(iter/s)": 1.636592 + }, + { + "acc": 0.66206479, + "epoch": 1.380771182141045, + "grad_norm": 5.03125, + "learning_rate": 2.39970439836674e-06, + "loss": 1.5907217, + "memory(GiB)": 117.38, + "step": 54430, + "train_speed(iter/s)": 1.636608 + }, + { + "acc": 0.647054, + "epoch": 1.38089802130898, + "grad_norm": 5.25, + "learning_rate": 2.3988087943373497e-06, + "loss": 1.59917736, + "memory(GiB)": 117.38, + "step": 54435, + "train_speed(iter/s)": 1.636625 + }, + { + "acc": 0.67288084, + "epoch": 1.3810248604769153, + "grad_norm": 5.1875, + "learning_rate": 2.397913304719961e-06, + "loss": 1.51494026, + "memory(GiB)": 117.38, + "step": 54440, + "train_speed(iter/s)": 1.636642 + }, + { + "acc": 0.65642438, + "epoch": 1.3811516996448503, + "grad_norm": 6.1875, + "learning_rate": 2.397017929553961e-06, + "loss": 1.56472073, + "memory(GiB)": 117.38, + "step": 54445, + "train_speed(iter/s)": 1.636658 + }, + { + "acc": 0.66393561, + "epoch": 1.3812785388127855, + "grad_norm": 5.71875, + "learning_rate": 2.396122668878738e-06, + "loss": 1.5561264, + "memory(GiB)": 117.38, + "step": 54450, + "train_speed(iter/s)": 1.636675 + }, + { + "acc": 0.64073334, + "epoch": 1.3814053779807205, + "grad_norm": 5.625, + "learning_rate": 2.3952275227336636e-06, + "loss": 1.73151112, + "memory(GiB)": 117.38, + "step": 54455, + "train_speed(iter/s)": 1.636692 + }, + { + "acc": 0.65882759, + "epoch": 1.3815322171486555, + "grad_norm": 5.15625, + "learning_rate": 2.3943324911581117e-06, + "loss": 1.56787491, + "memory(GiB)": 117.38, + "step": 54460, + "train_speed(iter/s)": 1.636709 + }, + { + "acc": 0.65736914, + "epoch": 1.3816590563165905, + "grad_norm": 5.0625, + "learning_rate": 2.393437574191449e-06, + "loss": 1.57225094, + "memory(GiB)": 117.38, + "step": 54465, + "train_speed(iter/s)": 1.636725 + }, + { + "acc": 0.65833235, + "epoch": 1.3817858954845257, + "grad_norm": 5.21875, + "learning_rate": 2.3925427718730426e-06, + "loss": 1.59292936, + "memory(GiB)": 117.38, + "step": 54470, + "train_speed(iter/s)": 1.636742 + }, + { + "acc": 0.6675139, + "epoch": 1.3819127346524607, + "grad_norm": 6.3125, + "learning_rate": 2.391648084242245e-06, + "loss": 1.52889223, + "memory(GiB)": 117.38, + "step": 54475, + "train_speed(iter/s)": 1.636757 + }, + { + "acc": 0.65570593, + "epoch": 1.3820395738203957, + "grad_norm": 5.65625, + "learning_rate": 2.3907535113384084e-06, + "loss": 1.64451466, + "memory(GiB)": 117.38, + "step": 54480, + "train_speed(iter/s)": 1.636773 + }, + { + "acc": 0.65295038, + "epoch": 1.3821664129883309, + "grad_norm": 6.0, + "learning_rate": 2.389859053200883e-06, + "loss": 1.63188324, + "memory(GiB)": 117.38, + "step": 54485, + "train_speed(iter/s)": 1.63679 + }, + { + "acc": 0.66879358, + "epoch": 1.3822932521562659, + "grad_norm": 5.875, + "learning_rate": 2.3889647098690127e-06, + "loss": 1.58088703, + "memory(GiB)": 117.38, + "step": 54490, + "train_speed(iter/s)": 1.636805 + }, + { + "acc": 0.65506191, + "epoch": 1.3824200913242009, + "grad_norm": 5.53125, + "learning_rate": 2.3880704813821275e-06, + "loss": 1.59755859, + "memory(GiB)": 117.38, + "step": 54495, + "train_speed(iter/s)": 1.636822 + }, + { + "acc": 0.65546737, + "epoch": 1.3825469304921358, + "grad_norm": 4.9375, + "learning_rate": 2.3871763677795656e-06, + "loss": 1.62362061, + "memory(GiB)": 117.38, + "step": 54500, + "train_speed(iter/s)": 1.63684 + }, + { + "acc": 0.65757484, + "epoch": 1.382673769660071, + "grad_norm": 6.0625, + "learning_rate": 2.386282369100653e-06, + "loss": 1.58688421, + "memory(GiB)": 117.38, + "step": 54505, + "train_speed(iter/s)": 1.636857 + }, + { + "acc": 0.65478721, + "epoch": 1.382800608828006, + "grad_norm": 5.375, + "learning_rate": 2.385388485384713e-06, + "loss": 1.65988483, + "memory(GiB)": 117.38, + "step": 54510, + "train_speed(iter/s)": 1.636873 + }, + { + "acc": 0.66759429, + "epoch": 1.3829274479959413, + "grad_norm": 6.28125, + "learning_rate": 2.384494716671057e-06, + "loss": 1.59211254, + "memory(GiB)": 117.38, + "step": 54515, + "train_speed(iter/s)": 1.63689 + }, + { + "acc": 0.67711535, + "epoch": 1.3830542871638762, + "grad_norm": 6.03125, + "learning_rate": 2.3836010629990027e-06, + "loss": 1.47437019, + "memory(GiB)": 117.38, + "step": 54520, + "train_speed(iter/s)": 1.636907 + }, + { + "acc": 0.65576553, + "epoch": 1.3831811263318112, + "grad_norm": 7.9375, + "learning_rate": 2.382707524407855e-06, + "loss": 1.62105312, + "memory(GiB)": 117.38, + "step": 54525, + "train_speed(iter/s)": 1.636924 + }, + { + "acc": 0.65163727, + "epoch": 1.3833079654997462, + "grad_norm": 5.71875, + "learning_rate": 2.3818141009369155e-06, + "loss": 1.61962643, + "memory(GiB)": 117.38, + "step": 54530, + "train_speed(iter/s)": 1.636941 + }, + { + "acc": 0.65830121, + "epoch": 1.3834348046676814, + "grad_norm": 5.4375, + "learning_rate": 2.3809207926254813e-06, + "loss": 1.61833763, + "memory(GiB)": 117.38, + "step": 54535, + "train_speed(iter/s)": 1.636959 + }, + { + "acc": 0.67367992, + "epoch": 1.3835616438356164, + "grad_norm": 5.40625, + "learning_rate": 2.380027599512844e-06, + "loss": 1.59038191, + "memory(GiB)": 117.38, + "step": 54540, + "train_speed(iter/s)": 1.636976 + }, + { + "acc": 0.65162201, + "epoch": 1.3836884830035516, + "grad_norm": 7.59375, + "learning_rate": 2.3791345216382906e-06, + "loss": 1.6196228, + "memory(GiB)": 117.38, + "step": 54545, + "train_speed(iter/s)": 1.636994 + }, + { + "acc": 0.65452595, + "epoch": 1.3838153221714866, + "grad_norm": 5.78125, + "learning_rate": 2.378241559041102e-06, + "loss": 1.63103619, + "memory(GiB)": 117.38, + "step": 54550, + "train_speed(iter/s)": 1.637011 + }, + { + "acc": 0.6582305, + "epoch": 1.3839421613394216, + "grad_norm": 5.875, + "learning_rate": 2.377348711760555e-06, + "loss": 1.6536974, + "memory(GiB)": 117.38, + "step": 54555, + "train_speed(iter/s)": 1.637027 + }, + { + "acc": 0.64812927, + "epoch": 1.3840690005073566, + "grad_norm": 6.53125, + "learning_rate": 2.3764559798359204e-06, + "loss": 1.6553772, + "memory(GiB)": 117.38, + "step": 54560, + "train_speed(iter/s)": 1.637046 + }, + { + "acc": 0.6654295, + "epoch": 1.3841958396752918, + "grad_norm": 5.125, + "learning_rate": 2.3755633633064658e-06, + "loss": 1.51865454, + "memory(GiB)": 117.38, + "step": 54565, + "train_speed(iter/s)": 1.637062 + }, + { + "acc": 0.62914252, + "epoch": 1.3843226788432268, + "grad_norm": 4.71875, + "learning_rate": 2.374670862211451e-06, + "loss": 1.70866776, + "memory(GiB)": 117.38, + "step": 54570, + "train_speed(iter/s)": 1.63708 + }, + { + "acc": 0.66669559, + "epoch": 1.3844495180111618, + "grad_norm": 6.5625, + "learning_rate": 2.373778476590134e-06, + "loss": 1.56118927, + "memory(GiB)": 117.38, + "step": 54575, + "train_speed(iter/s)": 1.637097 + }, + { + "acc": 0.66218114, + "epoch": 1.384576357179097, + "grad_norm": 5.4375, + "learning_rate": 2.372886206481764e-06, + "loss": 1.58207045, + "memory(GiB)": 117.38, + "step": 54580, + "train_speed(iter/s)": 1.637114 + }, + { + "acc": 0.64545379, + "epoch": 1.384703196347032, + "grad_norm": 4.5625, + "learning_rate": 2.3719940519255864e-06, + "loss": 1.64367085, + "memory(GiB)": 117.38, + "step": 54585, + "train_speed(iter/s)": 1.637131 + }, + { + "acc": 0.64167919, + "epoch": 1.384830035514967, + "grad_norm": 6.21875, + "learning_rate": 2.371102012960847e-06, + "loss": 1.66074333, + "memory(GiB)": 117.38, + "step": 54590, + "train_speed(iter/s)": 1.637147 + }, + { + "acc": 0.65991077, + "epoch": 1.384956874682902, + "grad_norm": 7.5625, + "learning_rate": 2.3702100896267767e-06, + "loss": 1.58650208, + "memory(GiB)": 117.38, + "step": 54595, + "train_speed(iter/s)": 1.637164 + }, + { + "acc": 0.66061969, + "epoch": 1.3850837138508372, + "grad_norm": 4.8125, + "learning_rate": 2.3693182819626077e-06, + "loss": 1.59709663, + "memory(GiB)": 117.38, + "step": 54600, + "train_speed(iter/s)": 1.637179 + }, + { + "acc": 0.64905834, + "epoch": 1.3852105530187722, + "grad_norm": 6.1875, + "learning_rate": 2.3684265900075637e-06, + "loss": 1.61571255, + "memory(GiB)": 117.38, + "step": 54605, + "train_speed(iter/s)": 1.637196 + }, + { + "acc": 0.662532, + "epoch": 1.3853373921867074, + "grad_norm": 5.21875, + "learning_rate": 2.3675350138008714e-06, + "loss": 1.61051311, + "memory(GiB)": 117.38, + "step": 54610, + "train_speed(iter/s)": 1.637212 + }, + { + "acc": 0.66010008, + "epoch": 1.3854642313546424, + "grad_norm": 5.53125, + "learning_rate": 2.3666435533817406e-06, + "loss": 1.6194458, + "memory(GiB)": 117.38, + "step": 54615, + "train_speed(iter/s)": 1.637227 + }, + { + "acc": 0.65904284, + "epoch": 1.3855910705225774, + "grad_norm": 7.125, + "learning_rate": 2.3657522087893806e-06, + "loss": 1.61277561, + "memory(GiB)": 117.38, + "step": 54620, + "train_speed(iter/s)": 1.637244 + }, + { + "acc": 0.66919174, + "epoch": 1.3857179096905123, + "grad_norm": 5.34375, + "learning_rate": 2.3648609800630022e-06, + "loss": 1.57579651, + "memory(GiB)": 117.38, + "step": 54625, + "train_speed(iter/s)": 1.63726 + }, + { + "acc": 0.64660482, + "epoch": 1.3858447488584476, + "grad_norm": 5.3125, + "learning_rate": 2.363969867241805e-06, + "loss": 1.59215279, + "memory(GiB)": 117.38, + "step": 54630, + "train_speed(iter/s)": 1.637278 + }, + { + "acc": 0.64593916, + "epoch": 1.3859715880263825, + "grad_norm": 6.09375, + "learning_rate": 2.36307887036498e-06, + "loss": 1.60736256, + "memory(GiB)": 117.38, + "step": 54635, + "train_speed(iter/s)": 1.637294 + }, + { + "acc": 0.64959455, + "epoch": 1.3860984271943175, + "grad_norm": 5.4375, + "learning_rate": 2.3621879894717177e-06, + "loss": 1.57939568, + "memory(GiB)": 117.38, + "step": 54640, + "train_speed(iter/s)": 1.637311 + }, + { + "acc": 0.67929225, + "epoch": 1.3862252663622527, + "grad_norm": 6.40625, + "learning_rate": 2.361297224601206e-06, + "loss": 1.55587587, + "memory(GiB)": 117.38, + "step": 54645, + "train_speed(iter/s)": 1.637327 + }, + { + "acc": 0.67475038, + "epoch": 1.3863521055301877, + "grad_norm": 5.625, + "learning_rate": 2.360406575792625e-06, + "loss": 1.52268801, + "memory(GiB)": 117.38, + "step": 54650, + "train_speed(iter/s)": 1.637342 + }, + { + "acc": 0.67204671, + "epoch": 1.3864789446981227, + "grad_norm": 5.34375, + "learning_rate": 2.3595160430851445e-06, + "loss": 1.49425068, + "memory(GiB)": 117.38, + "step": 54655, + "train_speed(iter/s)": 1.637359 + }, + { + "acc": 0.6472033, + "epoch": 1.3866057838660577, + "grad_norm": 7.28125, + "learning_rate": 2.3586256265179392e-06, + "loss": 1.62397461, + "memory(GiB)": 117.38, + "step": 54660, + "train_speed(iter/s)": 1.637375 + }, + { + "acc": 0.66439342, + "epoch": 1.386732623033993, + "grad_norm": 6.09375, + "learning_rate": 2.3577353261301715e-06, + "loss": 1.52258282, + "memory(GiB)": 117.38, + "step": 54665, + "train_speed(iter/s)": 1.637391 + }, + { + "acc": 0.65613689, + "epoch": 1.386859462201928, + "grad_norm": 6.40625, + "learning_rate": 2.356845141961001e-06, + "loss": 1.63419609, + "memory(GiB)": 117.38, + "step": 54670, + "train_speed(iter/s)": 1.637406 + }, + { + "acc": 0.64701138, + "epoch": 1.3869863013698631, + "grad_norm": 5.65625, + "learning_rate": 2.355955074049582e-06, + "loss": 1.600383, + "memory(GiB)": 117.38, + "step": 54675, + "train_speed(iter/s)": 1.637423 + }, + { + "acc": 0.66493602, + "epoch": 1.387113140537798, + "grad_norm": 6.65625, + "learning_rate": 2.355065122435064e-06, + "loss": 1.57711973, + "memory(GiB)": 117.38, + "step": 54680, + "train_speed(iter/s)": 1.63744 + }, + { + "acc": 0.65375957, + "epoch": 1.387239979705733, + "grad_norm": 6.40625, + "learning_rate": 2.3541752871565902e-06, + "loss": 1.63986969, + "memory(GiB)": 117.38, + "step": 54685, + "train_speed(iter/s)": 1.637456 + }, + { + "acc": 0.65510831, + "epoch": 1.387366818873668, + "grad_norm": 4.78125, + "learning_rate": 2.3532855682533003e-06, + "loss": 1.59486332, + "memory(GiB)": 117.38, + "step": 54690, + "train_speed(iter/s)": 1.637472 + }, + { + "acc": 0.66585188, + "epoch": 1.3874936580416033, + "grad_norm": 6.34375, + "learning_rate": 2.352395965764328e-06, + "loss": 1.59426537, + "memory(GiB)": 117.38, + "step": 54695, + "train_speed(iter/s)": 1.637488 + }, + { + "acc": 0.65251646, + "epoch": 1.3876204972095383, + "grad_norm": 4.96875, + "learning_rate": 2.3515064797288013e-06, + "loss": 1.5913372, + "memory(GiB)": 117.38, + "step": 54700, + "train_speed(iter/s)": 1.637504 + }, + { + "acc": 0.65431547, + "epoch": 1.3877473363774735, + "grad_norm": 5.125, + "learning_rate": 2.350617110185845e-06, + "loss": 1.65106812, + "memory(GiB)": 117.38, + "step": 54705, + "train_speed(iter/s)": 1.63752 + }, + { + "acc": 0.65923738, + "epoch": 1.3878741755454085, + "grad_norm": 5.21875, + "learning_rate": 2.3497278571745763e-06, + "loss": 1.59178038, + "memory(GiB)": 117.38, + "step": 54710, + "train_speed(iter/s)": 1.637537 + }, + { + "acc": 0.66219721, + "epoch": 1.3880010147133435, + "grad_norm": 5.875, + "learning_rate": 2.348838720734109e-06, + "loss": 1.55047865, + "memory(GiB)": 117.38, + "step": 54715, + "train_speed(iter/s)": 1.637554 + }, + { + "acc": 0.66923361, + "epoch": 1.3881278538812785, + "grad_norm": 6.125, + "learning_rate": 2.347949700903552e-06, + "loss": 1.62030277, + "memory(GiB)": 117.38, + "step": 54720, + "train_speed(iter/s)": 1.63757 + }, + { + "acc": 0.65253315, + "epoch": 1.3882546930492137, + "grad_norm": 6.65625, + "learning_rate": 2.3470607977220066e-06, + "loss": 1.57142239, + "memory(GiB)": 117.38, + "step": 54725, + "train_speed(iter/s)": 1.637586 + }, + { + "acc": 0.65744362, + "epoch": 1.3883815322171487, + "grad_norm": 5.34375, + "learning_rate": 2.346172011228573e-06, + "loss": 1.62530308, + "memory(GiB)": 117.38, + "step": 54730, + "train_speed(iter/s)": 1.637604 + }, + { + "acc": 0.6665256, + "epoch": 1.3885083713850837, + "grad_norm": 5.5625, + "learning_rate": 2.345283341462342e-06, + "loss": 1.56241436, + "memory(GiB)": 117.38, + "step": 54735, + "train_speed(iter/s)": 1.637621 + }, + { + "acc": 0.65214968, + "epoch": 1.3886352105530189, + "grad_norm": 5.46875, + "learning_rate": 2.3443947884624026e-06, + "loss": 1.6450182, + "memory(GiB)": 117.38, + "step": 54740, + "train_speed(iter/s)": 1.637636 + }, + { + "acc": 0.66708302, + "epoch": 1.3887620497209539, + "grad_norm": 5.625, + "learning_rate": 2.3435063522678346e-06, + "loss": 1.54241266, + "memory(GiB)": 117.38, + "step": 54745, + "train_speed(iter/s)": 1.637652 + }, + { + "acc": 0.65314879, + "epoch": 1.3888888888888888, + "grad_norm": 4.875, + "learning_rate": 2.3426180329177217e-06, + "loss": 1.61339149, + "memory(GiB)": 117.38, + "step": 54750, + "train_speed(iter/s)": 1.637668 + }, + { + "acc": 0.6494813, + "epoch": 1.3890157280568238, + "grad_norm": 5.03125, + "learning_rate": 2.3417298304511297e-06, + "loss": 1.5931282, + "memory(GiB)": 117.38, + "step": 54755, + "train_speed(iter/s)": 1.637686 + }, + { + "acc": 0.63992729, + "epoch": 1.389142567224759, + "grad_norm": 5.53125, + "learning_rate": 2.340841744907127e-06, + "loss": 1.70688591, + "memory(GiB)": 117.38, + "step": 54760, + "train_speed(iter/s)": 1.637701 + }, + { + "acc": 0.66486187, + "epoch": 1.389269406392694, + "grad_norm": 6.5, + "learning_rate": 2.3399537763247783e-06, + "loss": 1.53978243, + "memory(GiB)": 117.38, + "step": 54765, + "train_speed(iter/s)": 1.637718 + }, + { + "acc": 0.64669728, + "epoch": 1.3893962455606292, + "grad_norm": 5.375, + "learning_rate": 2.3390659247431404e-06, + "loss": 1.60624332, + "memory(GiB)": 117.38, + "step": 54770, + "train_speed(iter/s)": 1.637734 + }, + { + "acc": 0.66217871, + "epoch": 1.3895230847285642, + "grad_norm": 6.34375, + "learning_rate": 2.338178190201261e-06, + "loss": 1.56973162, + "memory(GiB)": 117.38, + "step": 54775, + "train_speed(iter/s)": 1.637751 + }, + { + "acc": 0.65562439, + "epoch": 1.3896499238964992, + "grad_norm": 5.8125, + "learning_rate": 2.3372905727381877e-06, + "loss": 1.60259323, + "memory(GiB)": 117.38, + "step": 54780, + "train_speed(iter/s)": 1.637767 + }, + { + "acc": 0.67119951, + "epoch": 1.3897767630644342, + "grad_norm": 5.8125, + "learning_rate": 2.3364030723929647e-06, + "loss": 1.54498358, + "memory(GiB)": 117.38, + "step": 54785, + "train_speed(iter/s)": 1.637783 + }, + { + "acc": 0.66917076, + "epoch": 1.3899036022323694, + "grad_norm": 5.375, + "learning_rate": 2.335515689204629e-06, + "loss": 1.55101547, + "memory(GiB)": 117.38, + "step": 54790, + "train_speed(iter/s)": 1.637799 + }, + { + "acc": 0.65899162, + "epoch": 1.3900304414003044, + "grad_norm": 6.25, + "learning_rate": 2.334628423212206e-06, + "loss": 1.59170866, + "memory(GiB)": 117.38, + "step": 54795, + "train_speed(iter/s)": 1.637816 + }, + { + "acc": 0.65975137, + "epoch": 1.3901572805682394, + "grad_norm": 6.40625, + "learning_rate": 2.3337412744547256e-06, + "loss": 1.58847504, + "memory(GiB)": 117.38, + "step": 54800, + "train_speed(iter/s)": 1.637832 + }, + { + "acc": 0.66842957, + "epoch": 1.3902841197361746, + "grad_norm": 5.59375, + "learning_rate": 2.332854242971209e-06, + "loss": 1.56257715, + "memory(GiB)": 117.38, + "step": 54805, + "train_speed(iter/s)": 1.637849 + }, + { + "acc": 0.65224628, + "epoch": 1.3904109589041096, + "grad_norm": 8.0625, + "learning_rate": 2.331967328800672e-06, + "loss": 1.58485088, + "memory(GiB)": 117.38, + "step": 54810, + "train_speed(iter/s)": 1.637866 + }, + { + "acc": 0.65946445, + "epoch": 1.3905377980720446, + "grad_norm": 6.1875, + "learning_rate": 2.33108053198212e-06, + "loss": 1.53212433, + "memory(GiB)": 117.38, + "step": 54815, + "train_speed(iter/s)": 1.637882 + }, + { + "acc": 0.65844302, + "epoch": 1.3906646372399796, + "grad_norm": 5.4375, + "learning_rate": 2.330193852554564e-06, + "loss": 1.57762346, + "memory(GiB)": 117.38, + "step": 54820, + "train_speed(iter/s)": 1.637899 + }, + { + "acc": 0.66593008, + "epoch": 1.3907914764079148, + "grad_norm": 6.90625, + "learning_rate": 2.3293072905570024e-06, + "loss": 1.60221062, + "memory(GiB)": 117.38, + "step": 54825, + "train_speed(iter/s)": 1.637914 + }, + { + "acc": 0.67084436, + "epoch": 1.3909183155758498, + "grad_norm": 4.78125, + "learning_rate": 2.3284208460284303e-06, + "loss": 1.56453915, + "memory(GiB)": 117.38, + "step": 54830, + "train_speed(iter/s)": 1.63793 + }, + { + "acc": 0.66992269, + "epoch": 1.391045154743785, + "grad_norm": 6.46875, + "learning_rate": 2.3275345190078364e-06, + "loss": 1.60624962, + "memory(GiB)": 117.38, + "step": 54835, + "train_speed(iter/s)": 1.637947 + }, + { + "acc": 0.64309888, + "epoch": 1.39117199391172, + "grad_norm": 5.40625, + "learning_rate": 2.3266483095342064e-06, + "loss": 1.62993565, + "memory(GiB)": 117.38, + "step": 54840, + "train_speed(iter/s)": 1.637964 + }, + { + "acc": 0.66009889, + "epoch": 1.391298833079655, + "grad_norm": 5.6875, + "learning_rate": 2.3257622176465194e-06, + "loss": 1.54410744, + "memory(GiB)": 117.38, + "step": 54845, + "train_speed(iter/s)": 1.63798 + }, + { + "acc": 0.65634217, + "epoch": 1.39142567224759, + "grad_norm": 6.34375, + "learning_rate": 2.3248762433837494e-06, + "loss": 1.63350487, + "memory(GiB)": 117.38, + "step": 54850, + "train_speed(iter/s)": 1.637997 + }, + { + "acc": 0.64866471, + "epoch": 1.3915525114155252, + "grad_norm": 5.28125, + "learning_rate": 2.323990386784867e-06, + "loss": 1.58710537, + "memory(GiB)": 117.38, + "step": 54855, + "train_speed(iter/s)": 1.638014 + }, + { + "acc": 0.66248913, + "epoch": 1.3916793505834602, + "grad_norm": 8.5, + "learning_rate": 2.3231046478888335e-06, + "loss": 1.60312004, + "memory(GiB)": 117.38, + "step": 54860, + "train_speed(iter/s)": 1.638031 + }, + { + "acc": 0.65088711, + "epoch": 1.3918061897513954, + "grad_norm": 5.75, + "learning_rate": 2.3222190267346094e-06, + "loss": 1.60557976, + "memory(GiB)": 117.38, + "step": 54865, + "train_speed(iter/s)": 1.638049 + }, + { + "acc": 0.65654144, + "epoch": 1.3919330289193304, + "grad_norm": 5.78125, + "learning_rate": 2.3213335233611484e-06, + "loss": 1.60163422, + "memory(GiB)": 117.38, + "step": 54870, + "train_speed(iter/s)": 1.638066 + }, + { + "acc": 0.65225067, + "epoch": 1.3920598680872653, + "grad_norm": 6.96875, + "learning_rate": 2.320448137807398e-06, + "loss": 1.59785175, + "memory(GiB)": 117.38, + "step": 54875, + "train_speed(iter/s)": 1.638083 + }, + { + "acc": 0.65564923, + "epoch": 1.3921867072552003, + "grad_norm": 5.8125, + "learning_rate": 2.3195628701123017e-06, + "loss": 1.6506506, + "memory(GiB)": 117.38, + "step": 54880, + "train_speed(iter/s)": 1.6381 + }, + { + "acc": 0.66144891, + "epoch": 1.3923135464231355, + "grad_norm": 5.25, + "learning_rate": 2.3186777203147964e-06, + "loss": 1.57962303, + "memory(GiB)": 117.38, + "step": 54885, + "train_speed(iter/s)": 1.638117 + }, + { + "acc": 0.66502342, + "epoch": 1.3924403855910705, + "grad_norm": 6.09375, + "learning_rate": 2.3177926884538193e-06, + "loss": 1.63195229, + "memory(GiB)": 117.38, + "step": 54890, + "train_speed(iter/s)": 1.638135 + }, + { + "acc": 0.66079597, + "epoch": 1.3925672247590055, + "grad_norm": 6.15625, + "learning_rate": 2.3169077745682933e-06, + "loss": 1.575457, + "memory(GiB)": 117.38, + "step": 54895, + "train_speed(iter/s)": 1.638153 + }, + { + "acc": 0.65259972, + "epoch": 1.3926940639269407, + "grad_norm": 6.0, + "learning_rate": 2.316022978697143e-06, + "loss": 1.58997059, + "memory(GiB)": 117.38, + "step": 54900, + "train_speed(iter/s)": 1.63817 + }, + { + "acc": 0.65334277, + "epoch": 1.3928209030948757, + "grad_norm": 5.15625, + "learning_rate": 2.3151383008792826e-06, + "loss": 1.59839573, + "memory(GiB)": 117.38, + "step": 54905, + "train_speed(iter/s)": 1.638187 + }, + { + "acc": 0.65705514, + "epoch": 1.3929477422628107, + "grad_norm": 5.8125, + "learning_rate": 2.314253741153631e-06, + "loss": 1.58769064, + "memory(GiB)": 117.38, + "step": 54910, + "train_speed(iter/s)": 1.638203 + }, + { + "acc": 0.65232139, + "epoch": 1.3930745814307457, + "grad_norm": 6.34375, + "learning_rate": 2.313369299559088e-06, + "loss": 1.55695667, + "memory(GiB)": 117.38, + "step": 54915, + "train_speed(iter/s)": 1.63822 + }, + { + "acc": 0.6594377, + "epoch": 1.393201420598681, + "grad_norm": 4.6875, + "learning_rate": 2.3124849761345576e-06, + "loss": 1.50405865, + "memory(GiB)": 117.38, + "step": 54920, + "train_speed(iter/s)": 1.638236 + }, + { + "acc": 0.66456151, + "epoch": 1.393328259766616, + "grad_norm": 4.71875, + "learning_rate": 2.311600770918938e-06, + "loss": 1.59530659, + "memory(GiB)": 117.38, + "step": 54925, + "train_speed(iter/s)": 1.638254 + }, + { + "acc": 0.66930633, + "epoch": 1.393455098934551, + "grad_norm": 5.15625, + "learning_rate": 2.310716683951122e-06, + "loss": 1.57571974, + "memory(GiB)": 117.38, + "step": 54930, + "train_speed(iter/s)": 1.63827 + }, + { + "acc": 0.6582674, + "epoch": 1.393581938102486, + "grad_norm": 7.53125, + "learning_rate": 2.3098327152699884e-06, + "loss": 1.65666256, + "memory(GiB)": 117.38, + "step": 54935, + "train_speed(iter/s)": 1.638288 + }, + { + "acc": 0.6635499, + "epoch": 1.393708777270421, + "grad_norm": 5.84375, + "learning_rate": 2.308948864914425e-06, + "loss": 1.60049973, + "memory(GiB)": 117.38, + "step": 54940, + "train_speed(iter/s)": 1.638304 + }, + { + "acc": 0.65124331, + "epoch": 1.393835616438356, + "grad_norm": 5.84375, + "learning_rate": 2.308065132923305e-06, + "loss": 1.63863602, + "memory(GiB)": 117.38, + "step": 54945, + "train_speed(iter/s)": 1.638321 + }, + { + "acc": 0.6608758, + "epoch": 1.3939624556062913, + "grad_norm": 5.4375, + "learning_rate": 2.3071815193355005e-06, + "loss": 1.55960388, + "memory(GiB)": 117.38, + "step": 54950, + "train_speed(iter/s)": 1.638339 + }, + { + "acc": 0.66309366, + "epoch": 1.3940892947742263, + "grad_norm": 5.625, + "learning_rate": 2.3062980241898725e-06, + "loss": 1.5562542, + "memory(GiB)": 117.38, + "step": 54955, + "train_speed(iter/s)": 1.638356 + }, + { + "acc": 0.64930382, + "epoch": 1.3942161339421613, + "grad_norm": 6.625, + "learning_rate": 2.3054146475252852e-06, + "loss": 1.63108883, + "memory(GiB)": 117.38, + "step": 54960, + "train_speed(iter/s)": 1.638374 + }, + { + "acc": 0.67725663, + "epoch": 1.3943429731100965, + "grad_norm": 6.125, + "learning_rate": 2.3045313893805926e-06, + "loss": 1.45865288, + "memory(GiB)": 117.38, + "step": 54965, + "train_speed(iter/s)": 1.638393 + }, + { + "acc": 0.6610034, + "epoch": 1.3944698122780315, + "grad_norm": 7.59375, + "learning_rate": 2.303648249794644e-06, + "loss": 1.62752399, + "memory(GiB)": 117.38, + "step": 54970, + "train_speed(iter/s)": 1.63841 + }, + { + "acc": 0.65557089, + "epoch": 1.3945966514459665, + "grad_norm": 5.25, + "learning_rate": 2.302765228806283e-06, + "loss": 1.57916603, + "memory(GiB)": 117.38, + "step": 54975, + "train_speed(iter/s)": 1.638428 + }, + { + "acc": 0.65172729, + "epoch": 1.3947234906139014, + "grad_norm": 6.4375, + "learning_rate": 2.30188232645435e-06, + "loss": 1.6548996, + "memory(GiB)": 117.38, + "step": 54980, + "train_speed(iter/s)": 1.638444 + }, + { + "acc": 0.64850979, + "epoch": 1.3948503297818367, + "grad_norm": 5.84375, + "learning_rate": 2.300999542777678e-06, + "loss": 1.67980804, + "memory(GiB)": 117.38, + "step": 54985, + "train_speed(iter/s)": 1.638461 + }, + { + "acc": 0.6555099, + "epoch": 1.3949771689497716, + "grad_norm": 5.6875, + "learning_rate": 2.300116877815097e-06, + "loss": 1.53745718, + "memory(GiB)": 117.38, + "step": 54990, + "train_speed(iter/s)": 1.638478 + }, + { + "acc": 0.64655685, + "epoch": 1.3951040081177069, + "grad_norm": 5.78125, + "learning_rate": 2.2992343316054296e-06, + "loss": 1.63145065, + "memory(GiB)": 117.38, + "step": 54995, + "train_speed(iter/s)": 1.638496 + }, + { + "acc": 0.65918884, + "epoch": 1.3952308472856418, + "grad_norm": 5.1875, + "learning_rate": 2.298351904187494e-06, + "loss": 1.57708597, + "memory(GiB)": 117.38, + "step": 55000, + "train_speed(iter/s)": 1.638513 + }, + { + "epoch": 1.3952308472856418, + "eval_acc": 0.6461933312032611, + "eval_loss": 1.5732694864273071, + "eval_runtime": 58.4638, + "eval_samples_per_second": 108.956, + "eval_steps_per_second": 27.248, + "step": 55000 + }, + { + "acc": 0.65558567, + "epoch": 1.3953576864535768, + "grad_norm": 5.59375, + "learning_rate": 2.297469595600104e-06, + "loss": 1.62075157, + "memory(GiB)": 117.38, + "step": 55005, + "train_speed(iter/s)": 1.635469 + }, + { + "acc": 0.65070286, + "epoch": 1.3954845256215118, + "grad_norm": 5.28125, + "learning_rate": 2.2965874058820668e-06, + "loss": 1.64342937, + "memory(GiB)": 117.38, + "step": 55010, + "train_speed(iter/s)": 1.635485 + }, + { + "acc": 0.65479927, + "epoch": 1.395611364789447, + "grad_norm": 7.34375, + "learning_rate": 2.2957053350721857e-06, + "loss": 1.60056801, + "memory(GiB)": 117.38, + "step": 55015, + "train_speed(iter/s)": 1.635501 + }, + { + "acc": 0.65444026, + "epoch": 1.395738203957382, + "grad_norm": 6.4375, + "learning_rate": 2.294823383209258e-06, + "loss": 1.620327, + "memory(GiB)": 117.38, + "step": 55020, + "train_speed(iter/s)": 1.635516 + }, + { + "acc": 0.65791359, + "epoch": 1.3958650431253172, + "grad_norm": 5.6875, + "learning_rate": 2.2939415503320733e-06, + "loss": 1.57491636, + "memory(GiB)": 117.38, + "step": 55025, + "train_speed(iter/s)": 1.635529 + }, + { + "acc": 0.63341837, + "epoch": 1.3959918822932522, + "grad_norm": 5.15625, + "learning_rate": 2.293059836479425e-06, + "loss": 1.65767384, + "memory(GiB)": 117.38, + "step": 55030, + "train_speed(iter/s)": 1.635546 + }, + { + "acc": 0.66097469, + "epoch": 1.3961187214611872, + "grad_norm": 5.46875, + "learning_rate": 2.2921782416900883e-06, + "loss": 1.55248327, + "memory(GiB)": 117.38, + "step": 55035, + "train_speed(iter/s)": 1.635562 + }, + { + "acc": 0.66823359, + "epoch": 1.3962455606291222, + "grad_norm": 6.59375, + "learning_rate": 2.2912967660028425e-06, + "loss": 1.57226105, + "memory(GiB)": 117.38, + "step": 55040, + "train_speed(iter/s)": 1.635578 + }, + { + "acc": 0.66065459, + "epoch": 1.3963723997970574, + "grad_norm": 4.28125, + "learning_rate": 2.2904154094564568e-06, + "loss": 1.56496401, + "memory(GiB)": 117.38, + "step": 55045, + "train_speed(iter/s)": 1.635594 + }, + { + "acc": 0.65135527, + "epoch": 1.3964992389649924, + "grad_norm": 6.21875, + "learning_rate": 2.2895341720897018e-06, + "loss": 1.6189209, + "memory(GiB)": 117.38, + "step": 55050, + "train_speed(iter/s)": 1.63561 + }, + { + "acc": 0.64938097, + "epoch": 1.3966260781329274, + "grad_norm": 5.90625, + "learning_rate": 2.2886530539413336e-06, + "loss": 1.66383324, + "memory(GiB)": 117.38, + "step": 55055, + "train_speed(iter/s)": 1.635627 + }, + { + "acc": 0.66291885, + "epoch": 1.3967529173008626, + "grad_norm": 5.53125, + "learning_rate": 2.2877720550501082e-06, + "loss": 1.59094162, + "memory(GiB)": 117.38, + "step": 55060, + "train_speed(iter/s)": 1.635644 + }, + { + "acc": 0.68015699, + "epoch": 1.3968797564687976, + "grad_norm": 6.40625, + "learning_rate": 2.2868911754547783e-06, + "loss": 1.5454298, + "memory(GiB)": 117.38, + "step": 55065, + "train_speed(iter/s)": 1.63566 + }, + { + "acc": 0.65581703, + "epoch": 1.3970065956367326, + "grad_norm": 5.5, + "learning_rate": 2.28601041519409e-06, + "loss": 1.55169697, + "memory(GiB)": 117.38, + "step": 55070, + "train_speed(iter/s)": 1.635676 + }, + { + "acc": 0.6483285, + "epoch": 1.3971334348046676, + "grad_norm": 6.3125, + "learning_rate": 2.2851297743067786e-06, + "loss": 1.6016367, + "memory(GiB)": 117.38, + "step": 55075, + "train_speed(iter/s)": 1.635692 + }, + { + "acc": 0.66045046, + "epoch": 1.3972602739726028, + "grad_norm": 6.0, + "learning_rate": 2.2842492528315784e-06, + "loss": 1.52488384, + "memory(GiB)": 117.38, + "step": 55080, + "train_speed(iter/s)": 1.635709 + }, + { + "acc": 0.66070461, + "epoch": 1.3973871131405378, + "grad_norm": 5.8125, + "learning_rate": 2.283368850807223e-06, + "loss": 1.57361317, + "memory(GiB)": 117.38, + "step": 55085, + "train_speed(iter/s)": 1.635726 + }, + { + "acc": 0.66746345, + "epoch": 1.397513952308473, + "grad_norm": 5.78125, + "learning_rate": 2.282488568272437e-06, + "loss": 1.58817005, + "memory(GiB)": 117.38, + "step": 55090, + "train_speed(iter/s)": 1.635737 + }, + { + "acc": 0.64651456, + "epoch": 1.397640791476408, + "grad_norm": 5.65625, + "learning_rate": 2.2816084052659326e-06, + "loss": 1.62980385, + "memory(GiB)": 117.38, + "step": 55095, + "train_speed(iter/s)": 1.635752 + }, + { + "acc": 0.65791364, + "epoch": 1.397767630644343, + "grad_norm": 5.375, + "learning_rate": 2.2807283618264288e-06, + "loss": 1.62679749, + "memory(GiB)": 117.38, + "step": 55100, + "train_speed(iter/s)": 1.635769 + }, + { + "acc": 0.65377564, + "epoch": 1.397894469812278, + "grad_norm": 6.28125, + "learning_rate": 2.2798484379926324e-06, + "loss": 1.57475939, + "memory(GiB)": 117.38, + "step": 55105, + "train_speed(iter/s)": 1.635786 + }, + { + "acc": 0.66424837, + "epoch": 1.3980213089802132, + "grad_norm": 5.78125, + "learning_rate": 2.278968633803246e-06, + "loss": 1.52081919, + "memory(GiB)": 117.38, + "step": 55110, + "train_speed(iter/s)": 1.635802 + }, + { + "acc": 0.65621176, + "epoch": 1.3981481481481481, + "grad_norm": 5.625, + "learning_rate": 2.2780889492969684e-06, + "loss": 1.56401024, + "memory(GiB)": 117.38, + "step": 55115, + "train_speed(iter/s)": 1.635819 + }, + { + "acc": 0.6706502, + "epoch": 1.3982749873160831, + "grad_norm": 6.21875, + "learning_rate": 2.277209384512491e-06, + "loss": 1.53983889, + "memory(GiB)": 117.38, + "step": 55120, + "train_speed(iter/s)": 1.635836 + }, + { + "acc": 0.64606833, + "epoch": 1.3984018264840183, + "grad_norm": 8.25, + "learning_rate": 2.2763299394885013e-06, + "loss": 1.61826744, + "memory(GiB)": 117.38, + "step": 55125, + "train_speed(iter/s)": 1.635853 + }, + { + "acc": 0.65655813, + "epoch": 1.3985286656519533, + "grad_norm": 5.0, + "learning_rate": 2.2754506142636808e-06, + "loss": 1.62069283, + "memory(GiB)": 117.38, + "step": 55130, + "train_speed(iter/s)": 1.63587 + }, + { + "acc": 0.66974339, + "epoch": 1.3986555048198883, + "grad_norm": 5.0625, + "learning_rate": 2.274571408876707e-06, + "loss": 1.5615572, + "memory(GiB)": 117.38, + "step": 55135, + "train_speed(iter/s)": 1.635886 + }, + { + "acc": 0.66698098, + "epoch": 1.3987823439878233, + "grad_norm": 6.53125, + "learning_rate": 2.2736923233662504e-06, + "loss": 1.58785343, + "memory(GiB)": 117.38, + "step": 55140, + "train_speed(iter/s)": 1.635903 + }, + { + "acc": 0.65583334, + "epoch": 1.3989091831557585, + "grad_norm": 5.0, + "learning_rate": 2.2728133577709776e-06, + "loss": 1.65006905, + "memory(GiB)": 117.38, + "step": 55145, + "train_speed(iter/s)": 1.63592 + }, + { + "acc": 0.65484171, + "epoch": 1.3990360223236935, + "grad_norm": 6.78125, + "learning_rate": 2.2719345121295495e-06, + "loss": 1.61099815, + "memory(GiB)": 117.38, + "step": 55150, + "train_speed(iter/s)": 1.635937 + }, + { + "acc": 0.64947262, + "epoch": 1.3991628614916287, + "grad_norm": 5.3125, + "learning_rate": 2.2710557864806214e-06, + "loss": 1.67447109, + "memory(GiB)": 117.38, + "step": 55155, + "train_speed(iter/s)": 1.635953 + }, + { + "acc": 0.6540925, + "epoch": 1.3992897006595637, + "grad_norm": 5.59375, + "learning_rate": 2.2701771808628438e-06, + "loss": 1.61505032, + "memory(GiB)": 117.38, + "step": 55160, + "train_speed(iter/s)": 1.63597 + }, + { + "acc": 0.65101833, + "epoch": 1.3994165398274987, + "grad_norm": 5.78125, + "learning_rate": 2.269298695314861e-06, + "loss": 1.64927082, + "memory(GiB)": 117.38, + "step": 55165, + "train_speed(iter/s)": 1.635986 + }, + { + "acc": 0.65803304, + "epoch": 1.3995433789954337, + "grad_norm": 5.90625, + "learning_rate": 2.268420329875314e-06, + "loss": 1.61427803, + "memory(GiB)": 117.38, + "step": 55170, + "train_speed(iter/s)": 1.636003 + }, + { + "acc": 0.65885649, + "epoch": 1.399670218163369, + "grad_norm": 5.40625, + "learning_rate": 2.2675420845828363e-06, + "loss": 1.52747374, + "memory(GiB)": 117.38, + "step": 55175, + "train_speed(iter/s)": 1.636019 + }, + { + "acc": 0.65031261, + "epoch": 1.3997970573313039, + "grad_norm": 7.46875, + "learning_rate": 2.266663959476057e-06, + "loss": 1.58954802, + "memory(GiB)": 117.38, + "step": 55180, + "train_speed(iter/s)": 1.636036 + }, + { + "acc": 0.66151996, + "epoch": 1.399923896499239, + "grad_norm": 6.6875, + "learning_rate": 2.265785954593598e-06, + "loss": 1.5725173, + "memory(GiB)": 117.38, + "step": 55185, + "train_speed(iter/s)": 1.636054 + }, + { + "acc": 0.64516759, + "epoch": 1.400050735667174, + "grad_norm": 6.0, + "learning_rate": 2.264908069974085e-06, + "loss": 1.59116058, + "memory(GiB)": 117.38, + "step": 55190, + "train_speed(iter/s)": 1.636071 + }, + { + "acc": 0.66232872, + "epoch": 1.400177574835109, + "grad_norm": 6.4375, + "learning_rate": 2.2640303056561236e-06, + "loss": 1.54893026, + "memory(GiB)": 117.38, + "step": 55195, + "train_speed(iter/s)": 1.636086 + }, + { + "acc": 0.64688368, + "epoch": 1.400304414003044, + "grad_norm": 7.625, + "learning_rate": 2.2631526616783234e-06, + "loss": 1.59646835, + "memory(GiB)": 117.38, + "step": 55200, + "train_speed(iter/s)": 1.636104 + }, + { + "acc": 0.66893501, + "epoch": 1.4004312531709793, + "grad_norm": 5.6875, + "learning_rate": 2.2622751380792896e-06, + "loss": 1.55401516, + "memory(GiB)": 117.38, + "step": 55205, + "train_speed(iter/s)": 1.636121 + }, + { + "acc": 0.64609971, + "epoch": 1.4005580923389143, + "grad_norm": 5.75, + "learning_rate": 2.26139773489762e-06, + "loss": 1.64581604, + "memory(GiB)": 117.38, + "step": 55210, + "train_speed(iter/s)": 1.636138 + }, + { + "acc": 0.66110115, + "epoch": 1.4006849315068493, + "grad_norm": 4.84375, + "learning_rate": 2.260520452171904e-06, + "loss": 1.56986027, + "memory(GiB)": 117.38, + "step": 55215, + "train_speed(iter/s)": 1.636155 + }, + { + "acc": 0.65141363, + "epoch": 1.4008117706747845, + "grad_norm": 5.4375, + "learning_rate": 2.259643289940727e-06, + "loss": 1.57192612, + "memory(GiB)": 117.38, + "step": 55220, + "train_speed(iter/s)": 1.636171 + }, + { + "acc": 0.66034131, + "epoch": 1.4009386098427195, + "grad_norm": 5.59375, + "learning_rate": 2.2587662482426748e-06, + "loss": 1.59899263, + "memory(GiB)": 117.38, + "step": 55225, + "train_speed(iter/s)": 1.636188 + }, + { + "acc": 0.65564446, + "epoch": 1.4010654490106544, + "grad_norm": 7.125, + "learning_rate": 2.2578893271163234e-06, + "loss": 1.65590668, + "memory(GiB)": 117.38, + "step": 55230, + "train_speed(iter/s)": 1.636204 + }, + { + "acc": 0.6525218, + "epoch": 1.4011922881785894, + "grad_norm": 5.6875, + "learning_rate": 2.2570125266002385e-06, + "loss": 1.5934391, + "memory(GiB)": 117.38, + "step": 55235, + "train_speed(iter/s)": 1.636221 + }, + { + "acc": 0.66293912, + "epoch": 1.4013191273465246, + "grad_norm": 6.71875, + "learning_rate": 2.2561358467329907e-06, + "loss": 1.59095764, + "memory(GiB)": 117.38, + "step": 55240, + "train_speed(iter/s)": 1.636238 + }, + { + "acc": 0.66244698, + "epoch": 1.4014459665144596, + "grad_norm": 6.375, + "learning_rate": 2.2552592875531397e-06, + "loss": 1.59582348, + "memory(GiB)": 117.38, + "step": 55245, + "train_speed(iter/s)": 1.636254 + }, + { + "acc": 0.66017885, + "epoch": 1.4015728056823948, + "grad_norm": 5.5, + "learning_rate": 2.254382849099241e-06, + "loss": 1.57824068, + "memory(GiB)": 117.38, + "step": 55250, + "train_speed(iter/s)": 1.63627 + }, + { + "acc": 0.66618204, + "epoch": 1.4016996448503298, + "grad_norm": 6.1875, + "learning_rate": 2.253506531409839e-06, + "loss": 1.54128647, + "memory(GiB)": 117.38, + "step": 55255, + "train_speed(iter/s)": 1.636287 + }, + { + "acc": 0.6507659, + "epoch": 1.4018264840182648, + "grad_norm": 6.71875, + "learning_rate": 2.252630334523484e-06, + "loss": 1.64957542, + "memory(GiB)": 117.38, + "step": 55260, + "train_speed(iter/s)": 1.636303 + }, + { + "acc": 0.64942713, + "epoch": 1.4019533231861998, + "grad_norm": 6.5, + "learning_rate": 2.2517542584787134e-06, + "loss": 1.66868362, + "memory(GiB)": 117.38, + "step": 55265, + "train_speed(iter/s)": 1.636319 + }, + { + "acc": 0.66312199, + "epoch": 1.402080162354135, + "grad_norm": 5.9375, + "learning_rate": 2.2508783033140596e-06, + "loss": 1.56169405, + "memory(GiB)": 117.38, + "step": 55270, + "train_speed(iter/s)": 1.636335 + }, + { + "acc": 0.63883991, + "epoch": 1.40220700152207, + "grad_norm": 7.15625, + "learning_rate": 2.2500024690680528e-06, + "loss": 1.68168755, + "memory(GiB)": 117.38, + "step": 55275, + "train_speed(iter/s)": 1.636353 + }, + { + "acc": 0.66981516, + "epoch": 1.402333840690005, + "grad_norm": 5.40625, + "learning_rate": 2.249126755779215e-06, + "loss": 1.51545362, + "memory(GiB)": 117.38, + "step": 55280, + "train_speed(iter/s)": 1.636369 + }, + { + "acc": 0.65273981, + "epoch": 1.4024606798579402, + "grad_norm": 5.40625, + "learning_rate": 2.2482511634860645e-06, + "loss": 1.5833827, + "memory(GiB)": 117.38, + "step": 55285, + "train_speed(iter/s)": 1.636385 + }, + { + "acc": 0.64515276, + "epoch": 1.4025875190258752, + "grad_norm": 6.75, + "learning_rate": 2.247375692227113e-06, + "loss": 1.6318943, + "memory(GiB)": 117.38, + "step": 55290, + "train_speed(iter/s)": 1.636402 + }, + { + "acc": 0.6652565, + "epoch": 1.4027143581938102, + "grad_norm": 6.375, + "learning_rate": 2.2465003420408683e-06, + "loss": 1.60801868, + "memory(GiB)": 117.38, + "step": 55295, + "train_speed(iter/s)": 1.636419 + }, + { + "acc": 0.65893621, + "epoch": 1.4028411973617452, + "grad_norm": 5.65625, + "learning_rate": 2.2456251129658325e-06, + "loss": 1.60439358, + "memory(GiB)": 117.38, + "step": 55300, + "train_speed(iter/s)": 1.636434 + }, + { + "acc": 0.64047923, + "epoch": 1.4029680365296804, + "grad_norm": 5.78125, + "learning_rate": 2.2447500050405008e-06, + "loss": 1.64773216, + "memory(GiB)": 117.38, + "step": 55305, + "train_speed(iter/s)": 1.63645 + }, + { + "acc": 0.66279316, + "epoch": 1.4030948756976154, + "grad_norm": 9.9375, + "learning_rate": 2.2438750183033657e-06, + "loss": 1.60474739, + "memory(GiB)": 117.38, + "step": 55310, + "train_speed(iter/s)": 1.636468 + }, + { + "acc": 0.66279058, + "epoch": 1.4032217148655506, + "grad_norm": 5.5, + "learning_rate": 2.2430001527929123e-06, + "loss": 1.59033442, + "memory(GiB)": 117.38, + "step": 55315, + "train_speed(iter/s)": 1.636486 + }, + { + "acc": 0.65323105, + "epoch": 1.4033485540334856, + "grad_norm": 7.0, + "learning_rate": 2.242125408547622e-06, + "loss": 1.65653687, + "memory(GiB)": 117.38, + "step": 55320, + "train_speed(iter/s)": 1.636503 + }, + { + "acc": 0.65559368, + "epoch": 1.4034753932014206, + "grad_norm": 7.5, + "learning_rate": 2.2412507856059667e-06, + "loss": 1.64404964, + "memory(GiB)": 117.38, + "step": 55325, + "train_speed(iter/s)": 1.636521 + }, + { + "acc": 0.65261588, + "epoch": 1.4036022323693556, + "grad_norm": 5.65625, + "learning_rate": 2.2403762840064223e-06, + "loss": 1.58226433, + "memory(GiB)": 117.38, + "step": 55330, + "train_speed(iter/s)": 1.636538 + }, + { + "acc": 0.67162457, + "epoch": 1.4037290715372908, + "grad_norm": 4.8125, + "learning_rate": 2.239501903787448e-06, + "loss": 1.57336235, + "memory(GiB)": 117.38, + "step": 55335, + "train_speed(iter/s)": 1.636554 + }, + { + "acc": 0.65162492, + "epoch": 1.4038559107052258, + "grad_norm": 7.375, + "learning_rate": 2.2386276449875057e-06, + "loss": 1.62444992, + "memory(GiB)": 117.38, + "step": 55340, + "train_speed(iter/s)": 1.63657 + }, + { + "acc": 0.64949026, + "epoch": 1.403982749873161, + "grad_norm": 6.78125, + "learning_rate": 2.2377535076450452e-06, + "loss": 1.61994133, + "memory(GiB)": 117.38, + "step": 55345, + "train_speed(iter/s)": 1.636586 + }, + { + "acc": 0.65781751, + "epoch": 1.404109589041096, + "grad_norm": 6.0, + "learning_rate": 2.236879491798522e-06, + "loss": 1.5575325, + "memory(GiB)": 117.38, + "step": 55350, + "train_speed(iter/s)": 1.636602 + }, + { + "acc": 0.66194191, + "epoch": 1.404236428209031, + "grad_norm": 6.375, + "learning_rate": 2.2360055974863737e-06, + "loss": 1.61743317, + "memory(GiB)": 117.38, + "step": 55355, + "train_speed(iter/s)": 1.63662 + }, + { + "acc": 0.66451869, + "epoch": 1.404363267376966, + "grad_norm": 5.28125, + "learning_rate": 2.2351318247470376e-06, + "loss": 1.56740646, + "memory(GiB)": 117.38, + "step": 55360, + "train_speed(iter/s)": 1.636636 + }, + { + "acc": 0.64429374, + "epoch": 1.4044901065449011, + "grad_norm": 5.0625, + "learning_rate": 2.2342581736189496e-06, + "loss": 1.62785683, + "memory(GiB)": 117.38, + "step": 55365, + "train_speed(iter/s)": 1.636651 + }, + { + "acc": 0.65563703, + "epoch": 1.4046169457128361, + "grad_norm": 5.125, + "learning_rate": 2.233384644140537e-06, + "loss": 1.59156523, + "memory(GiB)": 117.38, + "step": 55370, + "train_speed(iter/s)": 1.636667 + }, + { + "acc": 0.67236867, + "epoch": 1.4047437848807711, + "grad_norm": 7.1875, + "learning_rate": 2.2325112363502167e-06, + "loss": 1.51769981, + "memory(GiB)": 117.38, + "step": 55375, + "train_speed(iter/s)": 1.636683 + }, + { + "acc": 0.66594553, + "epoch": 1.4048706240487063, + "grad_norm": 5.78125, + "learning_rate": 2.23163795028641e-06, + "loss": 1.54175014, + "memory(GiB)": 117.38, + "step": 55380, + "train_speed(iter/s)": 1.636697 + }, + { + "acc": 0.65792103, + "epoch": 1.4049974632166413, + "grad_norm": 5.21875, + "learning_rate": 2.230764785987526e-06, + "loss": 1.56255751, + "memory(GiB)": 117.38, + "step": 55385, + "train_speed(iter/s)": 1.636714 + }, + { + "acc": 0.65201612, + "epoch": 1.4051243023845763, + "grad_norm": 6.90625, + "learning_rate": 2.229891743491972e-06, + "loss": 1.54268131, + "memory(GiB)": 117.38, + "step": 55390, + "train_speed(iter/s)": 1.63673 + }, + { + "acc": 0.65024819, + "epoch": 1.4052511415525113, + "grad_norm": 5.28125, + "learning_rate": 2.2290188228381434e-06, + "loss": 1.58718824, + "memory(GiB)": 117.38, + "step": 55395, + "train_speed(iter/s)": 1.636745 + }, + { + "acc": 0.66040421, + "epoch": 1.4053779807204465, + "grad_norm": 5.0625, + "learning_rate": 2.2281460240644397e-06, + "loss": 1.59405689, + "memory(GiB)": 117.38, + "step": 55400, + "train_speed(iter/s)": 1.636761 + }, + { + "acc": 0.64602785, + "epoch": 1.4055048198883815, + "grad_norm": 5.6875, + "learning_rate": 2.22727334720925e-06, + "loss": 1.61734486, + "memory(GiB)": 117.38, + "step": 55405, + "train_speed(iter/s)": 1.636778 + }, + { + "acc": 0.66305494, + "epoch": 1.4056316590563167, + "grad_norm": 5.25, + "learning_rate": 2.2264007923109575e-06, + "loss": 1.57885513, + "memory(GiB)": 117.38, + "step": 55410, + "train_speed(iter/s)": 1.636792 + }, + { + "acc": 0.66346731, + "epoch": 1.4057584982242517, + "grad_norm": 6.8125, + "learning_rate": 2.225528359407942e-06, + "loss": 1.60510044, + "memory(GiB)": 117.38, + "step": 55415, + "train_speed(iter/s)": 1.636808 + }, + { + "acc": 0.65615096, + "epoch": 1.4058853373921867, + "grad_norm": 5.96875, + "learning_rate": 2.2246560485385756e-06, + "loss": 1.56561718, + "memory(GiB)": 117.38, + "step": 55420, + "train_speed(iter/s)": 1.636824 + }, + { + "acc": 0.65666571, + "epoch": 1.4060121765601217, + "grad_norm": 5.28125, + "learning_rate": 2.2237838597412277e-06, + "loss": 1.59757919, + "memory(GiB)": 117.38, + "step": 55425, + "train_speed(iter/s)": 1.636841 + }, + { + "acc": 0.66810236, + "epoch": 1.4061390157280569, + "grad_norm": 5.40625, + "learning_rate": 2.22291179305426e-06, + "loss": 1.51167374, + "memory(GiB)": 117.38, + "step": 55430, + "train_speed(iter/s)": 1.636857 + }, + { + "acc": 0.66811314, + "epoch": 1.4062658548959919, + "grad_norm": 5.6875, + "learning_rate": 2.222039848516031e-06, + "loss": 1.56607914, + "memory(GiB)": 117.38, + "step": 55435, + "train_speed(iter/s)": 1.636873 + }, + { + "acc": 0.65201788, + "epoch": 1.4063926940639269, + "grad_norm": 6.03125, + "learning_rate": 2.2211680261648918e-06, + "loss": 1.60019817, + "memory(GiB)": 117.38, + "step": 55440, + "train_speed(iter/s)": 1.63689 + }, + { + "acc": 0.65698485, + "epoch": 1.406519533231862, + "grad_norm": 5.40625, + "learning_rate": 2.22029632603919e-06, + "loss": 1.59546318, + "memory(GiB)": 117.38, + "step": 55445, + "train_speed(iter/s)": 1.636906 + }, + { + "acc": 0.65754786, + "epoch": 1.406646372399797, + "grad_norm": 6.125, + "learning_rate": 2.2194247481772652e-06, + "loss": 1.59910812, + "memory(GiB)": 117.38, + "step": 55450, + "train_speed(iter/s)": 1.636924 + }, + { + "acc": 0.65745711, + "epoch": 1.406773211567732, + "grad_norm": 5.5625, + "learning_rate": 2.218553292617455e-06, + "loss": 1.6074646, + "memory(GiB)": 117.38, + "step": 55455, + "train_speed(iter/s)": 1.636941 + }, + { + "acc": 0.65728054, + "epoch": 1.406900050735667, + "grad_norm": 5.125, + "learning_rate": 2.2176819593980892e-06, + "loss": 1.55498753, + "memory(GiB)": 117.38, + "step": 55460, + "train_speed(iter/s)": 1.636957 + }, + { + "acc": 0.65538864, + "epoch": 1.4070268899036023, + "grad_norm": 7.125, + "learning_rate": 2.2168107485574914e-06, + "loss": 1.68885326, + "memory(GiB)": 117.38, + "step": 55465, + "train_speed(iter/s)": 1.636973 + }, + { + "acc": 0.64546318, + "epoch": 1.4071537290715372, + "grad_norm": 5.875, + "learning_rate": 2.215939660133986e-06, + "loss": 1.5962822, + "memory(GiB)": 117.38, + "step": 55470, + "train_speed(iter/s)": 1.63699 + }, + { + "acc": 0.66275277, + "epoch": 1.4072805682394725, + "grad_norm": 5.6875, + "learning_rate": 2.215068694165883e-06, + "loss": 1.60228329, + "memory(GiB)": 117.38, + "step": 55475, + "train_speed(iter/s)": 1.637007 + }, + { + "acc": 0.66169815, + "epoch": 1.4074074074074074, + "grad_norm": 5.15625, + "learning_rate": 2.2141978506914922e-06, + "loss": 1.50000324, + "memory(GiB)": 117.38, + "step": 55480, + "train_speed(iter/s)": 1.637024 + }, + { + "acc": 0.63967075, + "epoch": 1.4075342465753424, + "grad_norm": 5.3125, + "learning_rate": 2.2133271297491165e-06, + "loss": 1.69420967, + "memory(GiB)": 117.38, + "step": 55485, + "train_speed(iter/s)": 1.637041 + }, + { + "acc": 0.66241055, + "epoch": 1.4076610857432774, + "grad_norm": 5.625, + "learning_rate": 2.2124565313770584e-06, + "loss": 1.56081228, + "memory(GiB)": 117.38, + "step": 55490, + "train_speed(iter/s)": 1.637056 + }, + { + "acc": 0.66528988, + "epoch": 1.4077879249112126, + "grad_norm": 5.09375, + "learning_rate": 2.211586055613606e-06, + "loss": 1.57082357, + "memory(GiB)": 117.38, + "step": 55495, + "train_speed(iter/s)": 1.637072 + }, + { + "acc": 0.65307875, + "epoch": 1.4079147640791476, + "grad_norm": 5.09375, + "learning_rate": 2.210715702497046e-06, + "loss": 1.58034525, + "memory(GiB)": 117.38, + "step": 55500, + "train_speed(iter/s)": 1.63709 + }, + { + "acc": 0.66315298, + "epoch": 1.4080416032470828, + "grad_norm": 6.0, + "learning_rate": 2.2098454720656647e-06, + "loss": 1.6256485, + "memory(GiB)": 117.38, + "step": 55505, + "train_speed(iter/s)": 1.637106 + }, + { + "acc": 0.66349649, + "epoch": 1.4081684424150178, + "grad_norm": 7.0, + "learning_rate": 2.2089753643577384e-06, + "loss": 1.55409184, + "memory(GiB)": 117.38, + "step": 55510, + "train_speed(iter/s)": 1.637123 + }, + { + "acc": 0.66498456, + "epoch": 1.4082952815829528, + "grad_norm": 5.3125, + "learning_rate": 2.208105379411535e-06, + "loss": 1.56644669, + "memory(GiB)": 117.38, + "step": 55515, + "train_speed(iter/s)": 1.637141 + }, + { + "acc": 0.65405598, + "epoch": 1.4084221207508878, + "grad_norm": 5.78125, + "learning_rate": 2.2072355172653197e-06, + "loss": 1.56064854, + "memory(GiB)": 117.38, + "step": 55520, + "train_speed(iter/s)": 1.637158 + }, + { + "acc": 0.65392909, + "epoch": 1.408548959918823, + "grad_norm": 6.03125, + "learning_rate": 2.2063657779573573e-06, + "loss": 1.56219311, + "memory(GiB)": 117.38, + "step": 55525, + "train_speed(iter/s)": 1.637175 + }, + { + "acc": 0.66338797, + "epoch": 1.408675799086758, + "grad_norm": 5.59375, + "learning_rate": 2.2054961615259023e-06, + "loss": 1.57238331, + "memory(GiB)": 117.38, + "step": 55530, + "train_speed(iter/s)": 1.637192 + }, + { + "acc": 0.67478223, + "epoch": 1.408802638254693, + "grad_norm": 4.71875, + "learning_rate": 2.2046266680091994e-06, + "loss": 1.52098751, + "memory(GiB)": 117.38, + "step": 55535, + "train_speed(iter/s)": 1.63721 + }, + { + "acc": 0.64974861, + "epoch": 1.4089294774226282, + "grad_norm": 5.46875, + "learning_rate": 2.2037572974454974e-06, + "loss": 1.69125538, + "memory(GiB)": 117.38, + "step": 55540, + "train_speed(iter/s)": 1.637227 + }, + { + "acc": 0.66572843, + "epoch": 1.4090563165905632, + "grad_norm": 6.8125, + "learning_rate": 2.202888049873034e-06, + "loss": 1.58676138, + "memory(GiB)": 117.38, + "step": 55545, + "train_speed(iter/s)": 1.637243 + }, + { + "acc": 0.65373125, + "epoch": 1.4091831557584982, + "grad_norm": 6.0, + "learning_rate": 2.2020189253300428e-06, + "loss": 1.71913567, + "memory(GiB)": 117.38, + "step": 55550, + "train_speed(iter/s)": 1.63726 + }, + { + "acc": 0.6474061, + "epoch": 1.4093099949264332, + "grad_norm": 5.46875, + "learning_rate": 2.2011499238547506e-06, + "loss": 1.66344872, + "memory(GiB)": 117.38, + "step": 55555, + "train_speed(iter/s)": 1.637278 + }, + { + "acc": 0.64668779, + "epoch": 1.4094368340943684, + "grad_norm": 7.125, + "learning_rate": 2.2002810454853813e-06, + "loss": 1.646521, + "memory(GiB)": 117.38, + "step": 55560, + "train_speed(iter/s)": 1.637296 + }, + { + "acc": 0.65319386, + "epoch": 1.4095636732623034, + "grad_norm": 6.46875, + "learning_rate": 2.1994122902601513e-06, + "loss": 1.62287388, + "memory(GiB)": 117.38, + "step": 55565, + "train_speed(iter/s)": 1.637313 + }, + { + "acc": 0.65166717, + "epoch": 1.4096905124302386, + "grad_norm": 5.65625, + "learning_rate": 2.1985436582172724e-06, + "loss": 1.58232927, + "memory(GiB)": 117.38, + "step": 55570, + "train_speed(iter/s)": 1.637329 + }, + { + "acc": 0.65493631, + "epoch": 1.4098173515981736, + "grad_norm": 5.65625, + "learning_rate": 2.1976751493949512e-06, + "loss": 1.64372635, + "memory(GiB)": 117.38, + "step": 55575, + "train_speed(iter/s)": 1.637345 + }, + { + "acc": 0.65079679, + "epoch": 1.4099441907661086, + "grad_norm": 5.9375, + "learning_rate": 2.196806763831388e-06, + "loss": 1.59450188, + "memory(GiB)": 117.38, + "step": 55580, + "train_speed(iter/s)": 1.637362 + }, + { + "acc": 0.66014271, + "epoch": 1.4100710299340435, + "grad_norm": 5.875, + "learning_rate": 2.1959385015647794e-06, + "loss": 1.65788307, + "memory(GiB)": 117.38, + "step": 55585, + "train_speed(iter/s)": 1.637379 + }, + { + "acc": 0.65923615, + "epoch": 1.4101978691019788, + "grad_norm": 5.1875, + "learning_rate": 2.195070362633314e-06, + "loss": 1.6290554, + "memory(GiB)": 117.38, + "step": 55590, + "train_speed(iter/s)": 1.637396 + }, + { + "acc": 0.65366058, + "epoch": 1.4103247082699137, + "grad_norm": 7.75, + "learning_rate": 2.194202347075178e-06, + "loss": 1.62625923, + "memory(GiB)": 117.38, + "step": 55595, + "train_speed(iter/s)": 1.637414 + }, + { + "acc": 0.6517252, + "epoch": 1.4104515474378487, + "grad_norm": 5.75, + "learning_rate": 2.1933344549285485e-06, + "loss": 1.64228897, + "memory(GiB)": 117.38, + "step": 55600, + "train_speed(iter/s)": 1.63743 + }, + { + "acc": 0.65299754, + "epoch": 1.410578386605784, + "grad_norm": 9.0, + "learning_rate": 2.1924666862316015e-06, + "loss": 1.62476692, + "memory(GiB)": 117.38, + "step": 55605, + "train_speed(iter/s)": 1.637448 + }, + { + "acc": 0.66371651, + "epoch": 1.410705225773719, + "grad_norm": 5.9375, + "learning_rate": 2.191599041022504e-06, + "loss": 1.564042, + "memory(GiB)": 117.38, + "step": 55610, + "train_speed(iter/s)": 1.637465 + }, + { + "acc": 0.66063395, + "epoch": 1.410832064941654, + "grad_norm": 5.1875, + "learning_rate": 2.19073151933942e-06, + "loss": 1.53701601, + "memory(GiB)": 117.38, + "step": 55615, + "train_speed(iter/s)": 1.637481 + }, + { + "acc": 0.65441561, + "epoch": 1.410958904109589, + "grad_norm": 5.625, + "learning_rate": 2.1898641212205053e-06, + "loss": 1.59882078, + "memory(GiB)": 117.38, + "step": 55620, + "train_speed(iter/s)": 1.637498 + }, + { + "acc": 0.64533687, + "epoch": 1.4110857432775241, + "grad_norm": 5.28125, + "learning_rate": 2.1889968467039114e-06, + "loss": 1.60900536, + "memory(GiB)": 117.38, + "step": 55625, + "train_speed(iter/s)": 1.637516 + }, + { + "acc": 0.65276604, + "epoch": 1.411212582445459, + "grad_norm": 5.53125, + "learning_rate": 2.1881296958277897e-06, + "loss": 1.59919558, + "memory(GiB)": 117.38, + "step": 55630, + "train_speed(iter/s)": 1.637532 + }, + { + "acc": 0.67059474, + "epoch": 1.4113394216133943, + "grad_norm": 6.21875, + "learning_rate": 2.1872626686302767e-06, + "loss": 1.54859467, + "memory(GiB)": 117.38, + "step": 55635, + "train_speed(iter/s)": 1.637549 + }, + { + "acc": 0.6617178, + "epoch": 1.4114662607813293, + "grad_norm": 5.9375, + "learning_rate": 2.186395765149508e-06, + "loss": 1.50149059, + "memory(GiB)": 117.38, + "step": 55640, + "train_speed(iter/s)": 1.637566 + }, + { + "acc": 0.6456203, + "epoch": 1.4115930999492643, + "grad_norm": 5.3125, + "learning_rate": 2.1855289854236165e-06, + "loss": 1.66820869, + "memory(GiB)": 117.38, + "step": 55645, + "train_speed(iter/s)": 1.637582 + }, + { + "acc": 0.65637083, + "epoch": 1.4117199391171993, + "grad_norm": 5.0625, + "learning_rate": 2.184662329490728e-06, + "loss": 1.58305645, + "memory(GiB)": 117.38, + "step": 55650, + "train_speed(iter/s)": 1.637598 + }, + { + "acc": 0.65677662, + "epoch": 1.4118467782851345, + "grad_norm": 6.75, + "learning_rate": 2.1837957973889584e-06, + "loss": 1.60450344, + "memory(GiB)": 117.38, + "step": 55655, + "train_speed(iter/s)": 1.637614 + }, + { + "acc": 0.63853054, + "epoch": 1.4119736174530695, + "grad_norm": 6.4375, + "learning_rate": 2.1829293891564212e-06, + "loss": 1.64831276, + "memory(GiB)": 117.38, + "step": 55660, + "train_speed(iter/s)": 1.637631 + }, + { + "acc": 0.6536479, + "epoch": 1.4121004566210047, + "grad_norm": 5.4375, + "learning_rate": 2.182063104831228e-06, + "loss": 1.60060005, + "memory(GiB)": 117.38, + "step": 55665, + "train_speed(iter/s)": 1.637648 + }, + { + "acc": 0.65629382, + "epoch": 1.4122272957889397, + "grad_norm": 5.21875, + "learning_rate": 2.181196944451483e-06, + "loss": 1.59252434, + "memory(GiB)": 117.38, + "step": 55670, + "train_speed(iter/s)": 1.637664 + }, + { + "acc": 0.65572109, + "epoch": 1.4123541349568747, + "grad_norm": 6.40625, + "learning_rate": 2.180330908055278e-06, + "loss": 1.65411358, + "memory(GiB)": 117.38, + "step": 55675, + "train_speed(iter/s)": 1.63768 + }, + { + "acc": 0.64828358, + "epoch": 1.4124809741248097, + "grad_norm": 6.3125, + "learning_rate": 2.1794649956807094e-06, + "loss": 1.61419334, + "memory(GiB)": 117.38, + "step": 55680, + "train_speed(iter/s)": 1.637697 + }, + { + "acc": 0.66647234, + "epoch": 1.4126078132927449, + "grad_norm": 5.65625, + "learning_rate": 2.178599207365864e-06, + "loss": 1.50949287, + "memory(GiB)": 117.38, + "step": 55685, + "train_speed(iter/s)": 1.637714 + }, + { + "acc": 0.66886826, + "epoch": 1.4127346524606799, + "grad_norm": 5.1875, + "learning_rate": 2.1777335431488234e-06, + "loss": 1.55648022, + "memory(GiB)": 117.38, + "step": 55690, + "train_speed(iter/s)": 1.63773 + }, + { + "acc": 0.66577911, + "epoch": 1.4128614916286149, + "grad_norm": 5.875, + "learning_rate": 2.176868003067659e-06, + "loss": 1.57494259, + "memory(GiB)": 117.38, + "step": 55695, + "train_speed(iter/s)": 1.637748 + }, + { + "acc": 0.65630693, + "epoch": 1.41298833079655, + "grad_norm": 5.96875, + "learning_rate": 2.1760025871604456e-06, + "loss": 1.59197769, + "memory(GiB)": 117.38, + "step": 55700, + "train_speed(iter/s)": 1.637766 + }, + { + "acc": 0.66285625, + "epoch": 1.413115169964485, + "grad_norm": 6.90625, + "learning_rate": 2.175137295465247e-06, + "loss": 1.60119648, + "memory(GiB)": 117.38, + "step": 55705, + "train_speed(iter/s)": 1.637782 + }, + { + "acc": 0.65775256, + "epoch": 1.41324200913242, + "grad_norm": 4.71875, + "learning_rate": 2.174272128020122e-06, + "loss": 1.62330437, + "memory(GiB)": 117.38, + "step": 55710, + "train_speed(iter/s)": 1.637799 + }, + { + "acc": 0.6739552, + "epoch": 1.413368848300355, + "grad_norm": 6.0625, + "learning_rate": 2.1734070848631245e-06, + "loss": 1.56818304, + "memory(GiB)": 117.38, + "step": 55715, + "train_speed(iter/s)": 1.637815 + }, + { + "acc": 0.66088099, + "epoch": 1.4134956874682902, + "grad_norm": 10.4375, + "learning_rate": 2.172542166032303e-06, + "loss": 1.54413128, + "memory(GiB)": 117.38, + "step": 55720, + "train_speed(iter/s)": 1.637833 + }, + { + "acc": 0.65214715, + "epoch": 1.4136225266362252, + "grad_norm": 5.3125, + "learning_rate": 2.171677371565701e-06, + "loss": 1.6402565, + "memory(GiB)": 117.38, + "step": 55725, + "train_speed(iter/s)": 1.637848 + }, + { + "acc": 0.66133156, + "epoch": 1.4137493658041604, + "grad_norm": 6.3125, + "learning_rate": 2.1708127015013565e-06, + "loss": 1.61832027, + "memory(GiB)": 117.38, + "step": 55730, + "train_speed(iter/s)": 1.637864 + }, + { + "acc": 0.64802055, + "epoch": 1.4138762049720954, + "grad_norm": 5.84375, + "learning_rate": 2.1699481558773e-06, + "loss": 1.58707027, + "memory(GiB)": 117.38, + "step": 55735, + "train_speed(iter/s)": 1.63788 + }, + { + "acc": 0.6499661, + "epoch": 1.4140030441400304, + "grad_norm": 4.4375, + "learning_rate": 2.1690837347315597e-06, + "loss": 1.6137928, + "memory(GiB)": 117.38, + "step": 55740, + "train_speed(iter/s)": 1.637897 + }, + { + "acc": 0.66404982, + "epoch": 1.4141298833079654, + "grad_norm": 6.34375, + "learning_rate": 2.168219438102155e-06, + "loss": 1.57214193, + "memory(GiB)": 117.38, + "step": 55745, + "train_speed(iter/s)": 1.637913 + }, + { + "acc": 0.65018663, + "epoch": 1.4142567224759006, + "grad_norm": 6.84375, + "learning_rate": 2.167355266027103e-06, + "loss": 1.60913467, + "memory(GiB)": 117.38, + "step": 55750, + "train_speed(iter/s)": 1.63793 + }, + { + "acc": 0.66313162, + "epoch": 1.4143835616438356, + "grad_norm": 6.125, + "learning_rate": 2.1664912185444127e-06, + "loss": 1.5276576, + "memory(GiB)": 117.38, + "step": 55755, + "train_speed(iter/s)": 1.637948 + }, + { + "acc": 0.65130501, + "epoch": 1.4145104008117706, + "grad_norm": 5.65625, + "learning_rate": 2.16562729569209e-06, + "loss": 1.69024944, + "memory(GiB)": 117.38, + "step": 55760, + "train_speed(iter/s)": 1.637964 + }, + { + "acc": 0.66500769, + "epoch": 1.4146372399797058, + "grad_norm": 5.84375, + "learning_rate": 2.164763497508131e-06, + "loss": 1.54899521, + "memory(GiB)": 117.38, + "step": 55765, + "train_speed(iter/s)": 1.63798 + }, + { + "acc": 0.67111855, + "epoch": 1.4147640791476408, + "grad_norm": 5.625, + "learning_rate": 2.1638998240305355e-06, + "loss": 1.57638149, + "memory(GiB)": 117.38, + "step": 55770, + "train_speed(iter/s)": 1.637998 + }, + { + "acc": 0.6395658, + "epoch": 1.4148909183155758, + "grad_norm": 6.125, + "learning_rate": 2.163036275297286e-06, + "loss": 1.65650291, + "memory(GiB)": 117.38, + "step": 55775, + "train_speed(iter/s)": 1.638014 + }, + { + "acc": 0.65556026, + "epoch": 1.4150177574835108, + "grad_norm": 5.53125, + "learning_rate": 2.162172851346368e-06, + "loss": 1.57244339, + "memory(GiB)": 117.38, + "step": 55780, + "train_speed(iter/s)": 1.63803 + }, + { + "acc": 0.66797991, + "epoch": 1.415144596651446, + "grad_norm": 6.28125, + "learning_rate": 2.1613095522157557e-06, + "loss": 1.57551298, + "memory(GiB)": 117.38, + "step": 55785, + "train_speed(iter/s)": 1.638047 + }, + { + "acc": 0.65960522, + "epoch": 1.415271435819381, + "grad_norm": 5.40625, + "learning_rate": 2.1604463779434267e-06, + "loss": 1.56033268, + "memory(GiB)": 117.38, + "step": 55790, + "train_speed(iter/s)": 1.638063 + }, + { + "acc": 0.65391316, + "epoch": 1.4153982749873162, + "grad_norm": 5.78125, + "learning_rate": 2.159583328567342e-06, + "loss": 1.63294563, + "memory(GiB)": 117.38, + "step": 55795, + "train_speed(iter/s)": 1.63808 + }, + { + "acc": 0.66140633, + "epoch": 1.4155251141552512, + "grad_norm": 5.65625, + "learning_rate": 2.158720404125462e-06, + "loss": 1.55224876, + "memory(GiB)": 117.38, + "step": 55800, + "train_speed(iter/s)": 1.638096 + }, + { + "acc": 0.66071339, + "epoch": 1.4156519533231862, + "grad_norm": 5.4375, + "learning_rate": 2.1578576046557463e-06, + "loss": 1.57883329, + "memory(GiB)": 117.38, + "step": 55805, + "train_speed(iter/s)": 1.638113 + }, + { + "acc": 0.64482632, + "epoch": 1.4157787924911212, + "grad_norm": 5.8125, + "learning_rate": 2.156994930196144e-06, + "loss": 1.6230217, + "memory(GiB)": 117.38, + "step": 55810, + "train_speed(iter/s)": 1.63813 + }, + { + "acc": 0.66490197, + "epoch": 1.4159056316590564, + "grad_norm": 5.5625, + "learning_rate": 2.156132380784594e-06, + "loss": 1.53020048, + "memory(GiB)": 117.38, + "step": 55815, + "train_speed(iter/s)": 1.638146 + }, + { + "acc": 0.64963198, + "epoch": 1.4160324708269914, + "grad_norm": 7.40625, + "learning_rate": 2.155269956459041e-06, + "loss": 1.61162949, + "memory(GiB)": 117.38, + "step": 55820, + "train_speed(iter/s)": 1.638163 + }, + { + "acc": 0.67074494, + "epoch": 1.4161593099949266, + "grad_norm": 5.78125, + "learning_rate": 2.1544076572574156e-06, + "loss": 1.57712784, + "memory(GiB)": 117.38, + "step": 55825, + "train_speed(iter/s)": 1.638179 + }, + { + "acc": 0.65239029, + "epoch": 1.4162861491628616, + "grad_norm": 5.15625, + "learning_rate": 2.1535454832176482e-06, + "loss": 1.59608822, + "memory(GiB)": 117.38, + "step": 55830, + "train_speed(iter/s)": 1.638195 + }, + { + "acc": 0.64519591, + "epoch": 1.4164129883307965, + "grad_norm": 6.03125, + "learning_rate": 2.1526834343776556e-06, + "loss": 1.67440529, + "memory(GiB)": 117.38, + "step": 55835, + "train_speed(iter/s)": 1.638213 + }, + { + "acc": 0.6316041, + "epoch": 1.4165398274987315, + "grad_norm": 7.4375, + "learning_rate": 2.1518215107753593e-06, + "loss": 1.67723885, + "memory(GiB)": 117.38, + "step": 55840, + "train_speed(iter/s)": 1.63823 + }, + { + "acc": 0.63944454, + "epoch": 1.4166666666666667, + "grad_norm": 5.46875, + "learning_rate": 2.1509597124486693e-06, + "loss": 1.62081108, + "memory(GiB)": 117.38, + "step": 55845, + "train_speed(iter/s)": 1.638247 + }, + { + "acc": 0.66900139, + "epoch": 1.4167935058346017, + "grad_norm": 5.0625, + "learning_rate": 2.1500980394354907e-06, + "loss": 1.5674037, + "memory(GiB)": 117.38, + "step": 55850, + "train_speed(iter/s)": 1.638263 + }, + { + "acc": 0.6554409, + "epoch": 1.4169203450025367, + "grad_norm": 5.78125, + "learning_rate": 2.1492364917737252e-06, + "loss": 1.60124626, + "memory(GiB)": 117.38, + "step": 55855, + "train_speed(iter/s)": 1.63828 + }, + { + "acc": 0.66146145, + "epoch": 1.417047184170472, + "grad_norm": 5.71875, + "learning_rate": 2.148375069501266e-06, + "loss": 1.56895208, + "memory(GiB)": 117.38, + "step": 55860, + "train_speed(iter/s)": 1.638296 + }, + { + "acc": 0.65574379, + "epoch": 1.417174023338407, + "grad_norm": 4.46875, + "learning_rate": 2.147513772656003e-06, + "loss": 1.57744589, + "memory(GiB)": 117.38, + "step": 55865, + "train_speed(iter/s)": 1.638313 + }, + { + "acc": 0.64747858, + "epoch": 1.417300862506342, + "grad_norm": 7.5625, + "learning_rate": 2.1466526012758194e-06, + "loss": 1.59985256, + "memory(GiB)": 117.38, + "step": 55870, + "train_speed(iter/s)": 1.63833 + }, + { + "acc": 0.66212597, + "epoch": 1.417427701674277, + "grad_norm": 5.84375, + "learning_rate": 2.145791555398594e-06, + "loss": 1.54324026, + "memory(GiB)": 117.38, + "step": 55875, + "train_speed(iter/s)": 1.638346 + }, + { + "acc": 0.66027417, + "epoch": 1.417554540842212, + "grad_norm": 6.46875, + "learning_rate": 2.144930635062199e-06, + "loss": 1.51939335, + "memory(GiB)": 117.38, + "step": 55880, + "train_speed(iter/s)": 1.638363 + }, + { + "acc": 0.63951788, + "epoch": 1.417681380010147, + "grad_norm": 6.15625, + "learning_rate": 2.144069840304502e-06, + "loss": 1.6772913, + "memory(GiB)": 117.38, + "step": 55885, + "train_speed(iter/s)": 1.63838 + }, + { + "acc": 0.66648498, + "epoch": 1.4178082191780823, + "grad_norm": 8.8125, + "learning_rate": 2.1432091711633634e-06, + "loss": 1.62115002, + "memory(GiB)": 117.38, + "step": 55890, + "train_speed(iter/s)": 1.638396 + }, + { + "acc": 0.65990505, + "epoch": 1.4179350583460173, + "grad_norm": 5.8125, + "learning_rate": 2.142348627676641e-06, + "loss": 1.58954868, + "memory(GiB)": 117.38, + "step": 55895, + "train_speed(iter/s)": 1.638412 + }, + { + "acc": 0.65212393, + "epoch": 1.4180618975139523, + "grad_norm": 5.375, + "learning_rate": 2.1414882098821836e-06, + "loss": 1.59152317, + "memory(GiB)": 117.38, + "step": 55900, + "train_speed(iter/s)": 1.638429 + }, + { + "acc": 0.66284485, + "epoch": 1.4181887366818873, + "grad_norm": 6.03125, + "learning_rate": 2.1406279178178355e-06, + "loss": 1.61916275, + "memory(GiB)": 117.38, + "step": 55905, + "train_speed(iter/s)": 1.638445 + }, + { + "acc": 0.65922441, + "epoch": 1.4183155758498225, + "grad_norm": 5.875, + "learning_rate": 2.1397677515214422e-06, + "loss": 1.60093956, + "memory(GiB)": 117.38, + "step": 55910, + "train_speed(iter/s)": 1.638462 + }, + { + "acc": 0.65031242, + "epoch": 1.4184424150177575, + "grad_norm": 7.53125, + "learning_rate": 2.1389077110308304e-06, + "loss": 1.63191299, + "memory(GiB)": 117.38, + "step": 55915, + "train_speed(iter/s)": 1.638478 + }, + { + "acc": 0.64999566, + "epoch": 1.4185692541856925, + "grad_norm": 6.40625, + "learning_rate": 2.138047796383832e-06, + "loss": 1.66177025, + "memory(GiB)": 117.38, + "step": 55920, + "train_speed(iter/s)": 1.638496 + }, + { + "acc": 0.65035777, + "epoch": 1.4186960933536277, + "grad_norm": 9.3125, + "learning_rate": 2.1371880076182666e-06, + "loss": 1.66402454, + "memory(GiB)": 117.38, + "step": 55925, + "train_speed(iter/s)": 1.638512 + }, + { + "acc": 0.64330549, + "epoch": 1.4188229325215627, + "grad_norm": 6.8125, + "learning_rate": 2.1363283447719584e-06, + "loss": 1.67426949, + "memory(GiB)": 117.38, + "step": 55930, + "train_speed(iter/s)": 1.638529 + }, + { + "acc": 0.6642303, + "epoch": 1.4189497716894977, + "grad_norm": 7.21875, + "learning_rate": 2.135468807882713e-06, + "loss": 1.55817451, + "memory(GiB)": 117.38, + "step": 55935, + "train_speed(iter/s)": 1.638547 + }, + { + "acc": 0.64840069, + "epoch": 1.4190766108574326, + "grad_norm": 6.5, + "learning_rate": 2.1346093969883367e-06, + "loss": 1.63250809, + "memory(GiB)": 117.38, + "step": 55940, + "train_speed(iter/s)": 1.638564 + }, + { + "acc": 0.65852861, + "epoch": 1.4192034500253679, + "grad_norm": 5.5625, + "learning_rate": 2.1337501121266345e-06, + "loss": 1.56873293, + "memory(GiB)": 117.38, + "step": 55945, + "train_speed(iter/s)": 1.63858 + }, + { + "acc": 0.64395843, + "epoch": 1.4193302891933028, + "grad_norm": 6.90625, + "learning_rate": 2.132890953335401e-06, + "loss": 1.67318535, + "memory(GiB)": 117.38, + "step": 55950, + "train_speed(iter/s)": 1.638599 + }, + { + "acc": 0.65122981, + "epoch": 1.419457128361238, + "grad_norm": 5.75, + "learning_rate": 2.1320319206524215e-06, + "loss": 1.69549026, + "memory(GiB)": 117.38, + "step": 55955, + "train_speed(iter/s)": 1.638616 + }, + { + "acc": 0.65861716, + "epoch": 1.419583967529173, + "grad_norm": 5.0625, + "learning_rate": 2.1311730141154813e-06, + "loss": 1.60972843, + "memory(GiB)": 117.38, + "step": 55960, + "train_speed(iter/s)": 1.638632 + }, + { + "acc": 0.6550437, + "epoch": 1.419710806697108, + "grad_norm": 5.15625, + "learning_rate": 2.1303142337623623e-06, + "loss": 1.60616798, + "memory(GiB)": 117.38, + "step": 55965, + "train_speed(iter/s)": 1.638649 + }, + { + "acc": 0.65619144, + "epoch": 1.419837645865043, + "grad_norm": 6.34375, + "learning_rate": 2.1294555796308375e-06, + "loss": 1.57014132, + "memory(GiB)": 117.38, + "step": 55970, + "train_speed(iter/s)": 1.638665 + }, + { + "acc": 0.65816698, + "epoch": 1.4199644850329782, + "grad_norm": 5.90625, + "learning_rate": 2.1285970517586686e-06, + "loss": 1.6872406, + "memory(GiB)": 117.38, + "step": 55975, + "train_speed(iter/s)": 1.638683 + }, + { + "acc": 0.64084349, + "epoch": 1.4200913242009132, + "grad_norm": 6.8125, + "learning_rate": 2.127738650183623e-06, + "loss": 1.60930195, + "memory(GiB)": 117.38, + "step": 55980, + "train_speed(iter/s)": 1.638699 + }, + { + "acc": 0.65255833, + "epoch": 1.4202181633688484, + "grad_norm": 5.46875, + "learning_rate": 2.1268803749434546e-06, + "loss": 1.6115757, + "memory(GiB)": 117.38, + "step": 55985, + "train_speed(iter/s)": 1.638715 + }, + { + "acc": 0.64217582, + "epoch": 1.4203450025367834, + "grad_norm": 8.25, + "learning_rate": 2.1260222260759158e-06, + "loss": 1.5963171, + "memory(GiB)": 117.38, + "step": 55990, + "train_speed(iter/s)": 1.638732 + }, + { + "acc": 0.66537237, + "epoch": 1.4204718417047184, + "grad_norm": 5.46875, + "learning_rate": 2.1251642036187502e-06, + "loss": 1.60161285, + "memory(GiB)": 117.38, + "step": 55995, + "train_speed(iter/s)": 1.638748 + }, + { + "acc": 0.66448884, + "epoch": 1.4205986808726534, + "grad_norm": 5.46875, + "learning_rate": 2.124306307609699e-06, + "loss": 1.59291735, + "memory(GiB)": 117.38, + "step": 56000, + "train_speed(iter/s)": 1.638764 + }, + { + "epoch": 1.4205986808726534, + "eval_acc": 0.6462279970847247, + "eval_loss": 1.573153018951416, + "eval_runtime": 58.4646, + "eval_samples_per_second": 108.955, + "eval_steps_per_second": 27.247, + "step": 56000 + }, + { + "acc": 0.63351436, + "epoch": 1.4207255200405886, + "grad_norm": 5.84375, + "learning_rate": 2.123448538086495e-06, + "loss": 1.70207481, + "memory(GiB)": 117.38, + "step": 56005, + "train_speed(iter/s)": 1.635773 + }, + { + "acc": 0.64949937, + "epoch": 1.4208523592085236, + "grad_norm": 6.65625, + "learning_rate": 2.122590895086867e-06, + "loss": 1.66971092, + "memory(GiB)": 117.38, + "step": 56010, + "train_speed(iter/s)": 1.635787 + }, + { + "acc": 0.65008979, + "epoch": 1.4209791983764586, + "grad_norm": 6.28125, + "learning_rate": 2.1217333786485385e-06, + "loss": 1.58441048, + "memory(GiB)": 117.38, + "step": 56015, + "train_speed(iter/s)": 1.635804 + }, + { + "acc": 0.65072289, + "epoch": 1.4211060375443938, + "grad_norm": 5.96875, + "learning_rate": 2.120875988809226e-06, + "loss": 1.63161125, + "memory(GiB)": 117.38, + "step": 56020, + "train_speed(iter/s)": 1.635819 + }, + { + "acc": 0.65938396, + "epoch": 1.4212328767123288, + "grad_norm": 6.0, + "learning_rate": 2.1200187256066425e-06, + "loss": 1.57050209, + "memory(GiB)": 117.38, + "step": 56025, + "train_speed(iter/s)": 1.635835 + }, + { + "acc": 0.66699615, + "epoch": 1.4213597158802638, + "grad_norm": 6.53125, + "learning_rate": 2.119161589078493e-06, + "loss": 1.59796228, + "memory(GiB)": 117.38, + "step": 56030, + "train_speed(iter/s)": 1.63585 + }, + { + "acc": 0.6388308, + "epoch": 1.4214865550481988, + "grad_norm": 5.90625, + "learning_rate": 2.118304579262479e-06, + "loss": 1.70197716, + "memory(GiB)": 117.38, + "step": 56035, + "train_speed(iter/s)": 1.635865 + }, + { + "acc": 0.67194672, + "epoch": 1.421613394216134, + "grad_norm": 6.4375, + "learning_rate": 2.1174476961962957e-06, + "loss": 1.58991671, + "memory(GiB)": 117.38, + "step": 56040, + "train_speed(iter/s)": 1.635881 + }, + { + "acc": 0.64458737, + "epoch": 1.421740233384069, + "grad_norm": 6.125, + "learning_rate": 2.1165909399176328e-06, + "loss": 1.6444334, + "memory(GiB)": 117.38, + "step": 56045, + "train_speed(iter/s)": 1.635898 + }, + { + "acc": 0.64549866, + "epoch": 1.4218670725520042, + "grad_norm": 5.5625, + "learning_rate": 2.1157343104641733e-06, + "loss": 1.65981789, + "memory(GiB)": 117.38, + "step": 56050, + "train_speed(iter/s)": 1.635914 + }, + { + "acc": 0.65684814, + "epoch": 1.4219939117199392, + "grad_norm": 6.25, + "learning_rate": 2.114877807873596e-06, + "loss": 1.55561247, + "memory(GiB)": 117.38, + "step": 56055, + "train_speed(iter/s)": 1.63593 + }, + { + "acc": 0.64241743, + "epoch": 1.4221207508878742, + "grad_norm": 5.28125, + "learning_rate": 2.114021432183574e-06, + "loss": 1.6173317, + "memory(GiB)": 117.38, + "step": 56060, + "train_speed(iter/s)": 1.635945 + }, + { + "acc": 0.64835644, + "epoch": 1.4222475900558091, + "grad_norm": 5.71875, + "learning_rate": 2.113165183431773e-06, + "loss": 1.61581402, + "memory(GiB)": 117.38, + "step": 56065, + "train_speed(iter/s)": 1.635961 + }, + { + "acc": 0.67609644, + "epoch": 1.4223744292237444, + "grad_norm": 6.8125, + "learning_rate": 2.112309061655859e-06, + "loss": 1.50941238, + "memory(GiB)": 117.38, + "step": 56070, + "train_speed(iter/s)": 1.635976 + }, + { + "acc": 0.68436561, + "epoch": 1.4225012683916793, + "grad_norm": 6.15625, + "learning_rate": 2.1114530668934836e-06, + "loss": 1.51605415, + "memory(GiB)": 117.38, + "step": 56075, + "train_speed(iter/s)": 1.635991 + }, + { + "acc": 0.64157991, + "epoch": 1.4226281075596143, + "grad_norm": 5.59375, + "learning_rate": 2.1105971991822966e-06, + "loss": 1.61358604, + "memory(GiB)": 117.38, + "step": 56080, + "train_speed(iter/s)": 1.636007 + }, + { + "acc": 0.64268394, + "epoch": 1.4227549467275495, + "grad_norm": 7.59375, + "learning_rate": 2.1097414585599474e-06, + "loss": 1.65658455, + "memory(GiB)": 117.38, + "step": 56085, + "train_speed(iter/s)": 1.636024 + }, + { + "acc": 0.66413655, + "epoch": 1.4228817858954845, + "grad_norm": 6.71875, + "learning_rate": 2.1088858450640743e-06, + "loss": 1.63587303, + "memory(GiB)": 117.38, + "step": 56090, + "train_speed(iter/s)": 1.636039 + }, + { + "acc": 0.66147404, + "epoch": 1.4230086250634195, + "grad_norm": 7.3125, + "learning_rate": 2.108030358732308e-06, + "loss": 1.57082043, + "memory(GiB)": 117.38, + "step": 56095, + "train_speed(iter/s)": 1.636056 + }, + { + "acc": 0.65377502, + "epoch": 1.4231354642313545, + "grad_norm": 5.09375, + "learning_rate": 2.107174999602277e-06, + "loss": 1.62279377, + "memory(GiB)": 117.38, + "step": 56100, + "train_speed(iter/s)": 1.636071 + }, + { + "acc": 0.66465039, + "epoch": 1.4232623033992897, + "grad_norm": 6.03125, + "learning_rate": 2.106319767711606e-06, + "loss": 1.561306, + "memory(GiB)": 117.38, + "step": 56105, + "train_speed(iter/s)": 1.636087 + }, + { + "acc": 0.64482765, + "epoch": 1.4233891425672247, + "grad_norm": 6.34375, + "learning_rate": 2.105464663097913e-06, + "loss": 1.64756794, + "memory(GiB)": 117.38, + "step": 56110, + "train_speed(iter/s)": 1.636103 + }, + { + "acc": 0.66628647, + "epoch": 1.42351598173516, + "grad_norm": 6.03125, + "learning_rate": 2.1046096857988033e-06, + "loss": 1.55195751, + "memory(GiB)": 117.38, + "step": 56115, + "train_speed(iter/s)": 1.636117 + }, + { + "acc": 0.6538125, + "epoch": 1.423642820903095, + "grad_norm": 5.0625, + "learning_rate": 2.103754835851889e-06, + "loss": 1.5825346, + "memory(GiB)": 117.38, + "step": 56120, + "train_speed(iter/s)": 1.636133 + }, + { + "acc": 0.66755147, + "epoch": 1.42376966007103, + "grad_norm": 5.40625, + "learning_rate": 2.102900113294768e-06, + "loss": 1.54982643, + "memory(GiB)": 117.38, + "step": 56125, + "train_speed(iter/s)": 1.636148 + }, + { + "acc": 0.647686, + "epoch": 1.4238964992389649, + "grad_norm": 5.90625, + "learning_rate": 2.1020455181650356e-06, + "loss": 1.58786545, + "memory(GiB)": 117.38, + "step": 56130, + "train_speed(iter/s)": 1.636164 + }, + { + "acc": 0.66273246, + "epoch": 1.4240233384069, + "grad_norm": 6.0625, + "learning_rate": 2.101191050500277e-06, + "loss": 1.59326544, + "memory(GiB)": 117.38, + "step": 56135, + "train_speed(iter/s)": 1.63618 + }, + { + "acc": 0.64656487, + "epoch": 1.424150177574835, + "grad_norm": 6.03125, + "learning_rate": 2.1003367103380797e-06, + "loss": 1.61582146, + "memory(GiB)": 117.38, + "step": 56140, + "train_speed(iter/s)": 1.636196 + }, + { + "acc": 0.65650616, + "epoch": 1.4242770167427703, + "grad_norm": 6.46875, + "learning_rate": 2.09948249771602e-06, + "loss": 1.66964779, + "memory(GiB)": 117.38, + "step": 56145, + "train_speed(iter/s)": 1.636211 + }, + { + "acc": 0.65303869, + "epoch": 1.4244038559107053, + "grad_norm": 6.96875, + "learning_rate": 2.09862841267167e-06, + "loss": 1.52922688, + "memory(GiB)": 117.38, + "step": 56150, + "train_speed(iter/s)": 1.636226 + }, + { + "acc": 0.65279756, + "epoch": 1.4245306950786403, + "grad_norm": 7.125, + "learning_rate": 2.097774455242596e-06, + "loss": 1.62442284, + "memory(GiB)": 117.38, + "step": 56155, + "train_speed(iter/s)": 1.636242 + }, + { + "acc": 0.65691862, + "epoch": 1.4246575342465753, + "grad_norm": 5.53125, + "learning_rate": 2.096920625466359e-06, + "loss": 1.58108883, + "memory(GiB)": 117.38, + "step": 56160, + "train_speed(iter/s)": 1.636257 + }, + { + "acc": 0.64106445, + "epoch": 1.4247843734145105, + "grad_norm": 5.4375, + "learning_rate": 2.0960669233805136e-06, + "loss": 1.66551342, + "memory(GiB)": 117.38, + "step": 56165, + "train_speed(iter/s)": 1.636269 + }, + { + "acc": 0.65888739, + "epoch": 1.4249112125824455, + "grad_norm": 4.78125, + "learning_rate": 2.0952133490226106e-06, + "loss": 1.66334324, + "memory(GiB)": 117.38, + "step": 56170, + "train_speed(iter/s)": 1.636285 + }, + { + "acc": 0.649435, + "epoch": 1.4250380517503805, + "grad_norm": 6.28125, + "learning_rate": 2.0943599024301935e-06, + "loss": 1.64008312, + "memory(GiB)": 117.38, + "step": 56175, + "train_speed(iter/s)": 1.636301 + }, + { + "acc": 0.66099072, + "epoch": 1.4251648909183157, + "grad_norm": 6.21875, + "learning_rate": 2.093506583640801e-06, + "loss": 1.55307102, + "memory(GiB)": 117.38, + "step": 56180, + "train_speed(iter/s)": 1.636317 + }, + { + "acc": 0.65816002, + "epoch": 1.4252917300862507, + "grad_norm": 6.0625, + "learning_rate": 2.092653392691965e-06, + "loss": 1.56229925, + "memory(GiB)": 117.38, + "step": 56185, + "train_speed(iter/s)": 1.636333 + }, + { + "acc": 0.65581341, + "epoch": 1.4254185692541856, + "grad_norm": 6.1875, + "learning_rate": 2.091800329621213e-06, + "loss": 1.59925108, + "memory(GiB)": 117.38, + "step": 56190, + "train_speed(iter/s)": 1.636349 + }, + { + "acc": 0.67187867, + "epoch": 1.4255454084221206, + "grad_norm": 5.0625, + "learning_rate": 2.0909473944660667e-06, + "loss": 1.5537281, + "memory(GiB)": 117.38, + "step": 56195, + "train_speed(iter/s)": 1.636365 + }, + { + "acc": 0.65392485, + "epoch": 1.4256722475900558, + "grad_norm": 5.875, + "learning_rate": 2.0900945872640427e-06, + "loss": 1.64563847, + "memory(GiB)": 117.38, + "step": 56200, + "train_speed(iter/s)": 1.636382 + }, + { + "acc": 0.66792488, + "epoch": 1.4257990867579908, + "grad_norm": 6.59375, + "learning_rate": 2.0892419080526484e-06, + "loss": 1.5796175, + "memory(GiB)": 117.38, + "step": 56205, + "train_speed(iter/s)": 1.636397 + }, + { + "acc": 0.66316986, + "epoch": 1.425925925925926, + "grad_norm": 5.875, + "learning_rate": 2.0883893568693935e-06, + "loss": 1.56624556, + "memory(GiB)": 117.38, + "step": 56210, + "train_speed(iter/s)": 1.636413 + }, + { + "acc": 0.66564879, + "epoch": 1.426052765093861, + "grad_norm": 5.25, + "learning_rate": 2.087536933751773e-06, + "loss": 1.59430733, + "memory(GiB)": 117.38, + "step": 56215, + "train_speed(iter/s)": 1.63643 + }, + { + "acc": 0.65701547, + "epoch": 1.426179604261796, + "grad_norm": 7.78125, + "learning_rate": 2.0866846387372814e-06, + "loss": 1.61675491, + "memory(GiB)": 117.38, + "step": 56220, + "train_speed(iter/s)": 1.636445 + }, + { + "acc": 0.64128389, + "epoch": 1.426306443429731, + "grad_norm": 5.75, + "learning_rate": 2.085832471863404e-06, + "loss": 1.69387493, + "memory(GiB)": 117.38, + "step": 56225, + "train_speed(iter/s)": 1.636462 + }, + { + "acc": 0.64727774, + "epoch": 1.4264332825976662, + "grad_norm": 6.0625, + "learning_rate": 2.08498043316763e-06, + "loss": 1.65560341, + "memory(GiB)": 117.38, + "step": 56230, + "train_speed(iter/s)": 1.636479 + }, + { + "acc": 0.6692924, + "epoch": 1.4265601217656012, + "grad_norm": 6.15625, + "learning_rate": 2.0841285226874282e-06, + "loss": 1.59545631, + "memory(GiB)": 117.38, + "step": 56235, + "train_speed(iter/s)": 1.636494 + }, + { + "acc": 0.6468823, + "epoch": 1.4266869609335362, + "grad_norm": 5.53125, + "learning_rate": 2.083276740460271e-06, + "loss": 1.63130455, + "memory(GiB)": 117.38, + "step": 56240, + "train_speed(iter/s)": 1.636511 + }, + { + "acc": 0.66154289, + "epoch": 1.4268138001014714, + "grad_norm": 5.25, + "learning_rate": 2.082425086523627e-06, + "loss": 1.52195644, + "memory(GiB)": 117.38, + "step": 56245, + "train_speed(iter/s)": 1.636526 + }, + { + "acc": 0.65864873, + "epoch": 1.4269406392694064, + "grad_norm": 5.875, + "learning_rate": 2.0815735609149556e-06, + "loss": 1.57371788, + "memory(GiB)": 117.38, + "step": 56250, + "train_speed(iter/s)": 1.636543 + }, + { + "acc": 0.67760506, + "epoch": 1.4270674784373414, + "grad_norm": 5.46875, + "learning_rate": 2.080722163671705e-06, + "loss": 1.54128227, + "memory(GiB)": 117.38, + "step": 56255, + "train_speed(iter/s)": 1.63656 + }, + { + "acc": 0.66518574, + "epoch": 1.4271943176052764, + "grad_norm": 5.40625, + "learning_rate": 2.079870894831329e-06, + "loss": 1.60316563, + "memory(GiB)": 117.38, + "step": 56260, + "train_speed(iter/s)": 1.636575 + }, + { + "acc": 0.65389404, + "epoch": 1.4273211567732116, + "grad_norm": 5.4375, + "learning_rate": 2.0790197544312683e-06, + "loss": 1.64111633, + "memory(GiB)": 117.38, + "step": 56265, + "train_speed(iter/s)": 1.636591 + }, + { + "acc": 0.6464653, + "epoch": 1.4274479959411466, + "grad_norm": 5.375, + "learning_rate": 2.0781687425089624e-06, + "loss": 1.64246216, + "memory(GiB)": 117.38, + "step": 56270, + "train_speed(iter/s)": 1.636606 + }, + { + "acc": 0.65611019, + "epoch": 1.4275748351090818, + "grad_norm": 6.90625, + "learning_rate": 2.0773178591018363e-06, + "loss": 1.67025375, + "memory(GiB)": 117.38, + "step": 56275, + "train_speed(iter/s)": 1.636622 + }, + { + "acc": 0.67640247, + "epoch": 1.4277016742770168, + "grad_norm": 6.21875, + "learning_rate": 2.076467104247322e-06, + "loss": 1.51072865, + "memory(GiB)": 117.38, + "step": 56280, + "train_speed(iter/s)": 1.636638 + }, + { + "acc": 0.67425675, + "epoch": 1.4278285134449518, + "grad_norm": 5.28125, + "learning_rate": 2.0756164779828365e-06, + "loss": 1.51532183, + "memory(GiB)": 117.38, + "step": 56285, + "train_speed(iter/s)": 1.636655 + }, + { + "acc": 0.65487924, + "epoch": 1.4279553526128868, + "grad_norm": 4.90625, + "learning_rate": 2.0747659803457943e-06, + "loss": 1.58731823, + "memory(GiB)": 117.38, + "step": 56290, + "train_speed(iter/s)": 1.636671 + }, + { + "acc": 0.67112637, + "epoch": 1.428082191780822, + "grad_norm": 5.25, + "learning_rate": 2.073915611373605e-06, + "loss": 1.5472847, + "memory(GiB)": 117.38, + "step": 56295, + "train_speed(iter/s)": 1.636687 + }, + { + "acc": 0.64999266, + "epoch": 1.428209030948757, + "grad_norm": 4.9375, + "learning_rate": 2.0730653711036713e-06, + "loss": 1.59760695, + "memory(GiB)": 117.38, + "step": 56300, + "train_speed(iter/s)": 1.636703 + }, + { + "acc": 0.66855268, + "epoch": 1.4283358701166922, + "grad_norm": 6.25, + "learning_rate": 2.0722152595733903e-06, + "loss": 1.51363525, + "memory(GiB)": 117.38, + "step": 56305, + "train_speed(iter/s)": 1.636719 + }, + { + "acc": 0.66257396, + "epoch": 1.4284627092846272, + "grad_norm": 6.0, + "learning_rate": 2.0713652768201536e-06, + "loss": 1.56442022, + "memory(GiB)": 117.38, + "step": 56310, + "train_speed(iter/s)": 1.636737 + }, + { + "acc": 0.65923796, + "epoch": 1.4285895484525621, + "grad_norm": 7.0625, + "learning_rate": 2.0705154228813477e-06, + "loss": 1.60552979, + "memory(GiB)": 117.38, + "step": 56315, + "train_speed(iter/s)": 1.636754 + }, + { + "acc": 0.66712313, + "epoch": 1.4287163876204971, + "grad_norm": 5.03125, + "learning_rate": 2.0696656977943524e-06, + "loss": 1.52747192, + "memory(GiB)": 117.38, + "step": 56320, + "train_speed(iter/s)": 1.636769 + }, + { + "acc": 0.64965863, + "epoch": 1.4288432267884323, + "grad_norm": 5.78125, + "learning_rate": 2.068816101596543e-06, + "loss": 1.63623161, + "memory(GiB)": 117.38, + "step": 56325, + "train_speed(iter/s)": 1.636784 + }, + { + "acc": 0.6621418, + "epoch": 1.4289700659563673, + "grad_norm": 5.0625, + "learning_rate": 2.067966634325288e-06, + "loss": 1.5722641, + "memory(GiB)": 117.38, + "step": 56330, + "train_speed(iter/s)": 1.6368 + }, + { + "acc": 0.66067924, + "epoch": 1.4290969051243023, + "grad_norm": 6.78125, + "learning_rate": 2.0671172960179513e-06, + "loss": 1.57224522, + "memory(GiB)": 117.38, + "step": 56335, + "train_speed(iter/s)": 1.636816 + }, + { + "acc": 0.66659017, + "epoch": 1.4292237442922375, + "grad_norm": 5.65625, + "learning_rate": 2.06626808671189e-06, + "loss": 1.5469346, + "memory(GiB)": 117.38, + "step": 56340, + "train_speed(iter/s)": 1.636832 + }, + { + "acc": 0.65840988, + "epoch": 1.4293505834601725, + "grad_norm": 6.1875, + "learning_rate": 2.065419006444455e-06, + "loss": 1.60647335, + "memory(GiB)": 117.38, + "step": 56345, + "train_speed(iter/s)": 1.636848 + }, + { + "acc": 0.66085382, + "epoch": 1.4294774226281075, + "grad_norm": 5.8125, + "learning_rate": 2.0645700552529973e-06, + "loss": 1.60675125, + "memory(GiB)": 117.38, + "step": 56350, + "train_speed(iter/s)": 1.636865 + }, + { + "acc": 0.63569336, + "epoch": 1.4296042617960425, + "grad_norm": 5.40625, + "learning_rate": 2.063721233174853e-06, + "loss": 1.6966711, + "memory(GiB)": 117.38, + "step": 56355, + "train_speed(iter/s)": 1.63688 + }, + { + "acc": 0.66826468, + "epoch": 1.4297311009639777, + "grad_norm": 6.6875, + "learning_rate": 2.0628725402473574e-06, + "loss": 1.50753717, + "memory(GiB)": 117.38, + "step": 56360, + "train_speed(iter/s)": 1.636896 + }, + { + "acc": 0.66465473, + "epoch": 1.4298579401319127, + "grad_norm": 7.0, + "learning_rate": 2.0620239765078404e-06, + "loss": 1.55679455, + "memory(GiB)": 117.38, + "step": 56365, + "train_speed(iter/s)": 1.636912 + }, + { + "acc": 0.65117722, + "epoch": 1.429984779299848, + "grad_norm": 5.1875, + "learning_rate": 2.0611755419936285e-06, + "loss": 1.57975626, + "memory(GiB)": 117.38, + "step": 56370, + "train_speed(iter/s)": 1.636928 + }, + { + "acc": 0.66004658, + "epoch": 1.430111618467783, + "grad_norm": 5.3125, + "learning_rate": 2.0603272367420357e-06, + "loss": 1.58404722, + "memory(GiB)": 117.38, + "step": 56375, + "train_speed(iter/s)": 1.636945 + }, + { + "acc": 0.66127939, + "epoch": 1.4302384576357179, + "grad_norm": 6.0, + "learning_rate": 2.0594790607903743e-06, + "loss": 1.55673847, + "memory(GiB)": 117.38, + "step": 56380, + "train_speed(iter/s)": 1.636961 + }, + { + "acc": 0.66097116, + "epoch": 1.4303652968036529, + "grad_norm": 5.375, + "learning_rate": 2.0586310141759534e-06, + "loss": 1.58874025, + "memory(GiB)": 117.38, + "step": 56385, + "train_speed(iter/s)": 1.636977 + }, + { + "acc": 0.64952879, + "epoch": 1.430492135971588, + "grad_norm": 5.71875, + "learning_rate": 2.0577830969360753e-06, + "loss": 1.61046181, + "memory(GiB)": 117.38, + "step": 56390, + "train_speed(iter/s)": 1.636995 + }, + { + "acc": 0.66503525, + "epoch": 1.430618975139523, + "grad_norm": 6.84375, + "learning_rate": 2.0569353091080304e-06, + "loss": 1.56640615, + "memory(GiB)": 117.38, + "step": 56395, + "train_speed(iter/s)": 1.63701 + }, + { + "acc": 0.66329212, + "epoch": 1.430745814307458, + "grad_norm": 5.6875, + "learning_rate": 2.056087650729109e-06, + "loss": 1.52314224, + "memory(GiB)": 117.38, + "step": 56400, + "train_speed(iter/s)": 1.637026 + }, + { + "acc": 0.6475297, + "epoch": 1.4308726534753933, + "grad_norm": 5.4375, + "learning_rate": 2.0552401218365975e-06, + "loss": 1.63864059, + "memory(GiB)": 117.38, + "step": 56405, + "train_speed(iter/s)": 1.637041 + }, + { + "acc": 0.65295458, + "epoch": 1.4309994926433283, + "grad_norm": 4.90625, + "learning_rate": 2.054392722467775e-06, + "loss": 1.56698761, + "memory(GiB)": 117.38, + "step": 56410, + "train_speed(iter/s)": 1.637057 + }, + { + "acc": 0.63664794, + "epoch": 1.4311263318112633, + "grad_norm": 5.46875, + "learning_rate": 2.0535454526599086e-06, + "loss": 1.70207424, + "memory(GiB)": 117.38, + "step": 56415, + "train_speed(iter/s)": 1.637073 + }, + { + "acc": 0.65941172, + "epoch": 1.4312531709791982, + "grad_norm": 6.46875, + "learning_rate": 2.0526983124502692e-06, + "loss": 1.55500393, + "memory(GiB)": 117.38, + "step": 56420, + "train_speed(iter/s)": 1.637087 + }, + { + "acc": 0.64197426, + "epoch": 1.4313800101471335, + "grad_norm": 6.46875, + "learning_rate": 2.051851301876117e-06, + "loss": 1.64885674, + "memory(GiB)": 117.38, + "step": 56425, + "train_speed(iter/s)": 1.637103 + }, + { + "acc": 0.65074062, + "epoch": 1.4315068493150684, + "grad_norm": 6.59375, + "learning_rate": 2.0510044209747078e-06, + "loss": 1.64282455, + "memory(GiB)": 117.38, + "step": 56430, + "train_speed(iter/s)": 1.637119 + }, + { + "acc": 0.65405674, + "epoch": 1.4316336884830037, + "grad_norm": 5.84375, + "learning_rate": 2.05015766978329e-06, + "loss": 1.60566139, + "memory(GiB)": 117.38, + "step": 56435, + "train_speed(iter/s)": 1.637135 + }, + { + "acc": 0.64742441, + "epoch": 1.4317605276509386, + "grad_norm": 5.25, + "learning_rate": 2.049311048339108e-06, + "loss": 1.67665367, + "memory(GiB)": 117.38, + "step": 56440, + "train_speed(iter/s)": 1.637151 + }, + { + "acc": 0.65694876, + "epoch": 1.4318873668188736, + "grad_norm": 5.3125, + "learning_rate": 2.0484645566793993e-06, + "loss": 1.62892342, + "memory(GiB)": 117.38, + "step": 56445, + "train_speed(iter/s)": 1.637166 + }, + { + "acc": 0.64785614, + "epoch": 1.4320142059868086, + "grad_norm": 6.03125, + "learning_rate": 2.0476181948413975e-06, + "loss": 1.67124596, + "memory(GiB)": 117.38, + "step": 56450, + "train_speed(iter/s)": 1.637183 + }, + { + "acc": 0.65103188, + "epoch": 1.4321410451547438, + "grad_norm": 6.40625, + "learning_rate": 2.0467719628623293e-06, + "loss": 1.59926062, + "memory(GiB)": 117.38, + "step": 56455, + "train_speed(iter/s)": 1.637199 + }, + { + "acc": 0.66634035, + "epoch": 1.4322678843226788, + "grad_norm": 5.53125, + "learning_rate": 2.045925860779415e-06, + "loss": 1.52695847, + "memory(GiB)": 117.38, + "step": 56460, + "train_speed(iter/s)": 1.637215 + }, + { + "acc": 0.64607854, + "epoch": 1.432394723490614, + "grad_norm": 5.96875, + "learning_rate": 2.0450798886298707e-06, + "loss": 1.60617828, + "memory(GiB)": 117.38, + "step": 56465, + "train_speed(iter/s)": 1.63723 + }, + { + "acc": 0.65271149, + "epoch": 1.432521562658549, + "grad_norm": 5.5, + "learning_rate": 2.044234046450905e-06, + "loss": 1.57414618, + "memory(GiB)": 117.38, + "step": 56470, + "train_speed(iter/s)": 1.637245 + }, + { + "acc": 0.66269073, + "epoch": 1.432648401826484, + "grad_norm": 5.5, + "learning_rate": 2.0433883342797233e-06, + "loss": 1.54614553, + "memory(GiB)": 117.38, + "step": 56475, + "train_speed(iter/s)": 1.637261 + }, + { + "acc": 0.66632228, + "epoch": 1.432775240994419, + "grad_norm": 5.5, + "learning_rate": 2.042542752153522e-06, + "loss": 1.56111469, + "memory(GiB)": 117.38, + "step": 56480, + "train_speed(iter/s)": 1.637277 + }, + { + "acc": 0.65908566, + "epoch": 1.4329020801623542, + "grad_norm": 6.8125, + "learning_rate": 2.0416973001094953e-06, + "loss": 1.60543747, + "memory(GiB)": 117.38, + "step": 56485, + "train_speed(iter/s)": 1.637293 + }, + { + "acc": 0.66335478, + "epoch": 1.4330289193302892, + "grad_norm": 6.0625, + "learning_rate": 2.0408519781848292e-06, + "loss": 1.60566158, + "memory(GiB)": 117.38, + "step": 56490, + "train_speed(iter/s)": 1.637309 + }, + { + "acc": 0.673488, + "epoch": 1.4331557584982242, + "grad_norm": 4.9375, + "learning_rate": 2.0400067864167044e-06, + "loss": 1.52056847, + "memory(GiB)": 117.38, + "step": 56495, + "train_speed(iter/s)": 1.637324 + }, + { + "acc": 0.66895247, + "epoch": 1.4332825976661594, + "grad_norm": 5.625, + "learning_rate": 2.0391617248422967e-06, + "loss": 1.54592705, + "memory(GiB)": 117.38, + "step": 56500, + "train_speed(iter/s)": 1.63734 + }, + { + "acc": 0.65170913, + "epoch": 1.4334094368340944, + "grad_norm": 6.28125, + "learning_rate": 2.038316793498774e-06, + "loss": 1.66394424, + "memory(GiB)": 117.38, + "step": 56505, + "train_speed(iter/s)": 1.637355 + }, + { + "acc": 0.65481281, + "epoch": 1.4335362760020294, + "grad_norm": 7.40625, + "learning_rate": 2.037471992423305e-06, + "loss": 1.59310789, + "memory(GiB)": 117.38, + "step": 56510, + "train_speed(iter/s)": 1.637371 + }, + { + "acc": 0.63660326, + "epoch": 1.4336631151699644, + "grad_norm": 6.34375, + "learning_rate": 2.036627321653043e-06, + "loss": 1.66875992, + "memory(GiB)": 117.38, + "step": 56515, + "train_speed(iter/s)": 1.637387 + }, + { + "acc": 0.64011612, + "epoch": 1.4337899543378996, + "grad_norm": 7.0, + "learning_rate": 2.0357827812251403e-06, + "loss": 1.67292538, + "memory(GiB)": 117.38, + "step": 56520, + "train_speed(iter/s)": 1.637403 + }, + { + "acc": 0.65720253, + "epoch": 1.4339167935058346, + "grad_norm": 7.0, + "learning_rate": 2.0349383711767463e-06, + "loss": 1.62096252, + "memory(GiB)": 117.38, + "step": 56525, + "train_speed(iter/s)": 1.63742 + }, + { + "acc": 0.64625545, + "epoch": 1.4340436326737698, + "grad_norm": 5.78125, + "learning_rate": 2.0340940915450026e-06, + "loss": 1.59795609, + "memory(GiB)": 117.38, + "step": 56530, + "train_speed(iter/s)": 1.637435 + }, + { + "acc": 0.64653025, + "epoch": 1.4341704718417048, + "grad_norm": 6.03125, + "learning_rate": 2.033249942367041e-06, + "loss": 1.62711067, + "memory(GiB)": 117.38, + "step": 56535, + "train_speed(iter/s)": 1.637451 + }, + { + "acc": 0.66112103, + "epoch": 1.4342973110096398, + "grad_norm": 6.03125, + "learning_rate": 2.032405923679991e-06, + "loss": 1.54778318, + "memory(GiB)": 117.38, + "step": 56540, + "train_speed(iter/s)": 1.637466 + }, + { + "acc": 0.65065727, + "epoch": 1.4344241501775747, + "grad_norm": 4.96875, + "learning_rate": 2.0315620355209792e-06, + "loss": 1.64242725, + "memory(GiB)": 117.38, + "step": 56545, + "train_speed(iter/s)": 1.637482 + }, + { + "acc": 0.6565176, + "epoch": 1.43455098934551, + "grad_norm": 8.625, + "learning_rate": 2.030718277927124e-06, + "loss": 1.61386261, + "memory(GiB)": 117.38, + "step": 56550, + "train_speed(iter/s)": 1.637498 + }, + { + "acc": 0.67432275, + "epoch": 1.434677828513445, + "grad_norm": 5.84375, + "learning_rate": 2.0298746509355326e-06, + "loss": 1.53392601, + "memory(GiB)": 117.38, + "step": 56555, + "train_speed(iter/s)": 1.637514 + }, + { + "acc": 0.66257658, + "epoch": 1.43480466768138, + "grad_norm": 5.09375, + "learning_rate": 2.0290311545833166e-06, + "loss": 1.58483057, + "memory(GiB)": 117.38, + "step": 56560, + "train_speed(iter/s)": 1.63753 + }, + { + "acc": 0.67124596, + "epoch": 1.4349315068493151, + "grad_norm": 5.1875, + "learning_rate": 2.028187788907574e-06, + "loss": 1.58956041, + "memory(GiB)": 117.38, + "step": 56565, + "train_speed(iter/s)": 1.637546 + }, + { + "acc": 0.66505065, + "epoch": 1.4350583460172501, + "grad_norm": 4.625, + "learning_rate": 2.027344553945403e-06, + "loss": 1.60864868, + "memory(GiB)": 117.38, + "step": 56570, + "train_speed(iter/s)": 1.637561 + }, + { + "acc": 0.66054506, + "epoch": 1.4351851851851851, + "grad_norm": 5.09375, + "learning_rate": 2.0265014497338868e-06, + "loss": 1.54737091, + "memory(GiB)": 117.38, + "step": 56575, + "train_speed(iter/s)": 1.637577 + }, + { + "acc": 0.64677486, + "epoch": 1.43531202435312, + "grad_norm": 7.0, + "learning_rate": 2.0256584763101145e-06, + "loss": 1.61277962, + "memory(GiB)": 117.38, + "step": 56580, + "train_speed(iter/s)": 1.637592 + }, + { + "acc": 0.6490932, + "epoch": 1.4354388635210553, + "grad_norm": 6.03125, + "learning_rate": 2.024815633711162e-06, + "loss": 1.6407465, + "memory(GiB)": 117.38, + "step": 56585, + "train_speed(iter/s)": 1.637607 + }, + { + "acc": 0.65596676, + "epoch": 1.4355657026889903, + "grad_norm": 5.28125, + "learning_rate": 2.0239729219741005e-06, + "loss": 1.56711426, + "memory(GiB)": 117.38, + "step": 56590, + "train_speed(iter/s)": 1.637621 + }, + { + "acc": 0.65874748, + "epoch": 1.4356925418569255, + "grad_norm": 6.5, + "learning_rate": 2.0231303411359975e-06, + "loss": 1.56645145, + "memory(GiB)": 117.38, + "step": 56595, + "train_speed(iter/s)": 1.637636 + }, + { + "acc": 0.6512485, + "epoch": 1.4358193810248605, + "grad_norm": 6.8125, + "learning_rate": 2.0222878912339127e-06, + "loss": 1.62066383, + "memory(GiB)": 117.38, + "step": 56600, + "train_speed(iter/s)": 1.637652 + }, + { + "acc": 0.66414938, + "epoch": 1.4359462201927955, + "grad_norm": 5.4375, + "learning_rate": 2.021445572304901e-06, + "loss": 1.61221886, + "memory(GiB)": 117.38, + "step": 56605, + "train_speed(iter/s)": 1.637667 + }, + { + "acc": 0.66144352, + "epoch": 1.4360730593607305, + "grad_norm": 6.46875, + "learning_rate": 2.0206033843860113e-06, + "loss": 1.56715546, + "memory(GiB)": 117.38, + "step": 56610, + "train_speed(iter/s)": 1.637682 + }, + { + "acc": 0.65715666, + "epoch": 1.4361998985286657, + "grad_norm": 5.21875, + "learning_rate": 2.0197613275142868e-06, + "loss": 1.60859222, + "memory(GiB)": 117.38, + "step": 56615, + "train_speed(iter/s)": 1.637698 + }, + { + "acc": 0.66180749, + "epoch": 1.4363267376966007, + "grad_norm": 6.78125, + "learning_rate": 2.018919401726765e-06, + "loss": 1.56663246, + "memory(GiB)": 117.38, + "step": 56620, + "train_speed(iter/s)": 1.637713 + }, + { + "acc": 0.65509725, + "epoch": 1.436453576864536, + "grad_norm": 4.84375, + "learning_rate": 2.0180776070604773e-06, + "loss": 1.61059113, + "memory(GiB)": 117.38, + "step": 56625, + "train_speed(iter/s)": 1.63773 + }, + { + "acc": 0.66102705, + "epoch": 1.4365804160324709, + "grad_norm": 5.71875, + "learning_rate": 2.0172359435524497e-06, + "loss": 1.64509811, + "memory(GiB)": 117.38, + "step": 56630, + "train_speed(iter/s)": 1.637745 + }, + { + "acc": 0.65620785, + "epoch": 1.4367072552004059, + "grad_norm": 6.875, + "learning_rate": 2.0163944112397027e-06, + "loss": 1.57133694, + "memory(GiB)": 117.38, + "step": 56635, + "train_speed(iter/s)": 1.63776 + }, + { + "acc": 0.65012503, + "epoch": 1.4368340943683409, + "grad_norm": 5.3125, + "learning_rate": 2.01555301015925e-06, + "loss": 1.64002457, + "memory(GiB)": 117.38, + "step": 56640, + "train_speed(iter/s)": 1.637775 + }, + { + "acc": 0.6535697, + "epoch": 1.436960933536276, + "grad_norm": 4.875, + "learning_rate": 2.0147117403480994e-06, + "loss": 1.62286396, + "memory(GiB)": 117.38, + "step": 56645, + "train_speed(iter/s)": 1.637792 + }, + { + "acc": 0.66047297, + "epoch": 1.437087772704211, + "grad_norm": 5.34375, + "learning_rate": 2.0138706018432576e-06, + "loss": 1.54875898, + "memory(GiB)": 117.38, + "step": 56650, + "train_speed(iter/s)": 1.637808 + }, + { + "acc": 0.64496484, + "epoch": 1.437214611872146, + "grad_norm": 4.5625, + "learning_rate": 2.0130295946817176e-06, + "loss": 1.6263237, + "memory(GiB)": 117.38, + "step": 56655, + "train_speed(iter/s)": 1.637823 + }, + { + "acc": 0.66204586, + "epoch": 1.4373414510400813, + "grad_norm": 5.03125, + "learning_rate": 2.0121887189004713e-06, + "loss": 1.57126503, + "memory(GiB)": 117.38, + "step": 56660, + "train_speed(iter/s)": 1.637839 + }, + { + "acc": 0.66267447, + "epoch": 1.4374682902080163, + "grad_norm": 4.90625, + "learning_rate": 2.0113479745365033e-06, + "loss": 1.53914337, + "memory(GiB)": 117.38, + "step": 56665, + "train_speed(iter/s)": 1.637854 + }, + { + "acc": 0.65045338, + "epoch": 1.4375951293759512, + "grad_norm": 6.25, + "learning_rate": 2.0105073616267984e-06, + "loss": 1.64538155, + "memory(GiB)": 117.38, + "step": 56670, + "train_speed(iter/s)": 1.637869 + }, + { + "acc": 0.65917873, + "epoch": 1.4377219685438862, + "grad_norm": 6.4375, + "learning_rate": 2.0096668802083254e-06, + "loss": 1.62476234, + "memory(GiB)": 117.38, + "step": 56675, + "train_speed(iter/s)": 1.637884 + }, + { + "acc": 0.65910206, + "epoch": 1.4378488077118214, + "grad_norm": 5.4375, + "learning_rate": 2.0088265303180516e-06, + "loss": 1.57755928, + "memory(GiB)": 117.38, + "step": 56680, + "train_speed(iter/s)": 1.637898 + }, + { + "acc": 0.64925337, + "epoch": 1.4379756468797564, + "grad_norm": 6.875, + "learning_rate": 2.0079863119929434e-06, + "loss": 1.61306782, + "memory(GiB)": 117.38, + "step": 56685, + "train_speed(iter/s)": 1.637914 + }, + { + "acc": 0.654246, + "epoch": 1.4381024860476916, + "grad_norm": 5.59375, + "learning_rate": 2.0071462252699575e-06, + "loss": 1.65449734, + "memory(GiB)": 117.38, + "step": 56690, + "train_speed(iter/s)": 1.637929 + }, + { + "acc": 0.64911981, + "epoch": 1.4382293252156266, + "grad_norm": 5.5, + "learning_rate": 2.006306270186039e-06, + "loss": 1.58105955, + "memory(GiB)": 117.38, + "step": 56695, + "train_speed(iter/s)": 1.637945 + }, + { + "acc": 0.65274992, + "epoch": 1.4383561643835616, + "grad_norm": 5.90625, + "learning_rate": 2.005466446778139e-06, + "loss": 1.59600353, + "memory(GiB)": 117.38, + "step": 56700, + "train_speed(iter/s)": 1.63796 + }, + { + "acc": 0.64707499, + "epoch": 1.4384830035514966, + "grad_norm": 5.78125, + "learning_rate": 2.0046267550831935e-06, + "loss": 1.66900558, + "memory(GiB)": 117.38, + "step": 56705, + "train_speed(iter/s)": 1.637977 + }, + { + "acc": 0.65609694, + "epoch": 1.4386098427194318, + "grad_norm": 5.25, + "learning_rate": 2.003787195138139e-06, + "loss": 1.63397541, + "memory(GiB)": 117.38, + "step": 56710, + "train_speed(iter/s)": 1.637992 + }, + { + "acc": 0.65700374, + "epoch": 1.4387366818873668, + "grad_norm": 5.3125, + "learning_rate": 2.002947766979897e-06, + "loss": 1.63378906, + "memory(GiB)": 117.38, + "step": 56715, + "train_speed(iter/s)": 1.638007 + }, + { + "acc": 0.65730853, + "epoch": 1.4388635210553018, + "grad_norm": 5.28125, + "learning_rate": 2.0021084706453945e-06, + "loss": 1.55307751, + "memory(GiB)": 117.38, + "step": 56720, + "train_speed(iter/s)": 1.638022 + }, + { + "acc": 0.64480267, + "epoch": 1.438990360223237, + "grad_norm": 5.25, + "learning_rate": 2.0012693061715467e-06, + "loss": 1.59397993, + "memory(GiB)": 117.38, + "step": 56725, + "train_speed(iter/s)": 1.638037 + }, + { + "acc": 0.67191734, + "epoch": 1.439117199391172, + "grad_norm": 5.96875, + "learning_rate": 2.000430273595263e-06, + "loss": 1.53846283, + "memory(GiB)": 117.38, + "step": 56730, + "train_speed(iter/s)": 1.638052 + }, + { + "acc": 0.64228039, + "epoch": 1.439244038559107, + "grad_norm": 5.84375, + "learning_rate": 1.9995913729534477e-06, + "loss": 1.60652103, + "memory(GiB)": 117.38, + "step": 56735, + "train_speed(iter/s)": 1.638066 + }, + { + "acc": 0.64356232, + "epoch": 1.439370877727042, + "grad_norm": 4.8125, + "learning_rate": 1.9987526042830003e-06, + "loss": 1.64345322, + "memory(GiB)": 117.38, + "step": 56740, + "train_speed(iter/s)": 1.638082 + }, + { + "acc": 0.66152668, + "epoch": 1.4394977168949772, + "grad_norm": 4.875, + "learning_rate": 1.9979139676208124e-06, + "loss": 1.565271, + "memory(GiB)": 117.38, + "step": 56745, + "train_speed(iter/s)": 1.638096 + }, + { + "acc": 0.6499929, + "epoch": 1.4396245560629122, + "grad_norm": 6.9375, + "learning_rate": 1.9970754630037718e-06, + "loss": 1.5653801, + "memory(GiB)": 117.38, + "step": 56750, + "train_speed(iter/s)": 1.638111 + }, + { + "acc": 0.66611886, + "epoch": 1.4397513952308474, + "grad_norm": 5.53125, + "learning_rate": 1.9962370904687596e-06, + "loss": 1.5523344, + "memory(GiB)": 117.38, + "step": 56755, + "train_speed(iter/s)": 1.638126 + }, + { + "acc": 0.64757495, + "epoch": 1.4398782343987824, + "grad_norm": 7.0, + "learning_rate": 1.9953988500526506e-06, + "loss": 1.66170254, + "memory(GiB)": 117.38, + "step": 56760, + "train_speed(iter/s)": 1.638141 + }, + { + "acc": 0.66140223, + "epoch": 1.4400050735667174, + "grad_norm": 5.5, + "learning_rate": 1.994560741792315e-06, + "loss": 1.56403284, + "memory(GiB)": 117.38, + "step": 56765, + "train_speed(iter/s)": 1.638157 + }, + { + "acc": 0.65791788, + "epoch": 1.4401319127346524, + "grad_norm": 5.9375, + "learning_rate": 1.993722765724616e-06, + "loss": 1.59685392, + "memory(GiB)": 117.38, + "step": 56770, + "train_speed(iter/s)": 1.638172 + }, + { + "acc": 0.65203524, + "epoch": 1.4402587519025876, + "grad_norm": 5.78125, + "learning_rate": 1.992884921886412e-06, + "loss": 1.63024254, + "memory(GiB)": 117.38, + "step": 56775, + "train_speed(iter/s)": 1.638187 + }, + { + "acc": 0.65303907, + "epoch": 1.4403855910705226, + "grad_norm": 5.5625, + "learning_rate": 1.9920472103145555e-06, + "loss": 1.65169296, + "memory(GiB)": 117.38, + "step": 56780, + "train_speed(iter/s)": 1.638202 + }, + { + "acc": 0.65772057, + "epoch": 1.4405124302384578, + "grad_norm": 5.21875, + "learning_rate": 1.99120963104589e-06, + "loss": 1.60849762, + "memory(GiB)": 117.38, + "step": 56785, + "train_speed(iter/s)": 1.638217 + }, + { + "acc": 0.65142798, + "epoch": 1.4406392694063928, + "grad_norm": 6.71875, + "learning_rate": 1.990372184117262e-06, + "loss": 1.62114925, + "memory(GiB)": 117.38, + "step": 56790, + "train_speed(iter/s)": 1.638232 + }, + { + "acc": 0.64926682, + "epoch": 1.4407661085743277, + "grad_norm": 6.15625, + "learning_rate": 1.9895348695655e-06, + "loss": 1.60237045, + "memory(GiB)": 117.38, + "step": 56795, + "train_speed(iter/s)": 1.638247 + }, + { + "acc": 0.67513599, + "epoch": 1.4408929477422627, + "grad_norm": 5.34375, + "learning_rate": 1.9886976874274356e-06, + "loss": 1.55294781, + "memory(GiB)": 117.38, + "step": 56800, + "train_speed(iter/s)": 1.638262 + }, + { + "acc": 0.66617613, + "epoch": 1.441019786910198, + "grad_norm": 5.875, + "learning_rate": 1.9878606377398895e-06, + "loss": 1.52820234, + "memory(GiB)": 117.38, + "step": 56805, + "train_speed(iter/s)": 1.638278 + }, + { + "acc": 0.65975051, + "epoch": 1.441146626078133, + "grad_norm": 4.375, + "learning_rate": 1.9870237205396844e-06, + "loss": 1.58299789, + "memory(GiB)": 117.38, + "step": 56810, + "train_speed(iter/s)": 1.638293 + }, + { + "acc": 0.65558829, + "epoch": 1.441273465246068, + "grad_norm": 5.71875, + "learning_rate": 1.986186935863626e-06, + "loss": 1.63474922, + "memory(GiB)": 117.38, + "step": 56815, + "train_speed(iter/s)": 1.638308 + }, + { + "acc": 0.65376973, + "epoch": 1.4414003044140031, + "grad_norm": 5.78125, + "learning_rate": 1.9853502837485207e-06, + "loss": 1.60126038, + "memory(GiB)": 117.38, + "step": 56820, + "train_speed(iter/s)": 1.638324 + }, + { + "acc": 0.64889507, + "epoch": 1.4415271435819381, + "grad_norm": 5.5, + "learning_rate": 1.9845137642311707e-06, + "loss": 1.59639645, + "memory(GiB)": 117.38, + "step": 56825, + "train_speed(iter/s)": 1.638339 + }, + { + "acc": 0.66802053, + "epoch": 1.441653982749873, + "grad_norm": 6.28125, + "learning_rate": 1.9836773773483704e-06, + "loss": 1.58461447, + "memory(GiB)": 117.38, + "step": 56830, + "train_speed(iter/s)": 1.638355 + }, + { + "acc": 0.64627705, + "epoch": 1.441780821917808, + "grad_norm": 5.65625, + "learning_rate": 1.982841123136904e-06, + "loss": 1.64518509, + "memory(GiB)": 117.38, + "step": 56835, + "train_speed(iter/s)": 1.63837 + }, + { + "acc": 0.645298, + "epoch": 1.4419076610857433, + "grad_norm": 5.40625, + "learning_rate": 1.982005001633554e-06, + "loss": 1.62318802, + "memory(GiB)": 117.38, + "step": 56840, + "train_speed(iter/s)": 1.638385 + }, + { + "acc": 0.63613777, + "epoch": 1.4420345002536783, + "grad_norm": 5.6875, + "learning_rate": 1.9811690128751002e-06, + "loss": 1.7154295, + "memory(GiB)": 117.38, + "step": 56845, + "train_speed(iter/s)": 1.6384 + }, + { + "acc": 0.65435224, + "epoch": 1.4421613394216135, + "grad_norm": 5.25, + "learning_rate": 1.980333156898313e-06, + "loss": 1.57885752, + "memory(GiB)": 117.38, + "step": 56850, + "train_speed(iter/s)": 1.638416 + }, + { + "acc": 0.64419703, + "epoch": 1.4422881785895485, + "grad_norm": 5.46875, + "learning_rate": 1.979497433739952e-06, + "loss": 1.63819427, + "memory(GiB)": 117.38, + "step": 56855, + "train_speed(iter/s)": 1.638432 + }, + { + "acc": 0.66411519, + "epoch": 1.4424150177574835, + "grad_norm": 5.96875, + "learning_rate": 1.9786618434367814e-06, + "loss": 1.55240612, + "memory(GiB)": 117.38, + "step": 56860, + "train_speed(iter/s)": 1.638448 + }, + { + "acc": 0.65778127, + "epoch": 1.4425418569254185, + "grad_norm": 6.09375, + "learning_rate": 1.977826386025552e-06, + "loss": 1.59033175, + "memory(GiB)": 117.38, + "step": 56865, + "train_speed(iter/s)": 1.638464 + }, + { + "acc": 0.65405087, + "epoch": 1.4426686960933537, + "grad_norm": 5.25, + "learning_rate": 1.976991061543011e-06, + "loss": 1.59910669, + "memory(GiB)": 117.38, + "step": 56870, + "train_speed(iter/s)": 1.638479 + }, + { + "acc": 0.64710789, + "epoch": 1.4427955352612887, + "grad_norm": 6.15625, + "learning_rate": 1.9761558700259e-06, + "loss": 1.62930298, + "memory(GiB)": 117.38, + "step": 56875, + "train_speed(iter/s)": 1.638494 + }, + { + "acc": 0.65761924, + "epoch": 1.4429223744292237, + "grad_norm": 5.90625, + "learning_rate": 1.9753208115109546e-06, + "loss": 1.56891394, + "memory(GiB)": 117.38, + "step": 56880, + "train_speed(iter/s)": 1.638508 + }, + { + "acc": 0.65390573, + "epoch": 1.4430492135971589, + "grad_norm": 5.59375, + "learning_rate": 1.9744858860349043e-06, + "loss": 1.59684925, + "memory(GiB)": 117.38, + "step": 56885, + "train_speed(iter/s)": 1.638524 + }, + { + "acc": 0.66021557, + "epoch": 1.4431760527650939, + "grad_norm": 6.1875, + "learning_rate": 1.9736510936344723e-06, + "loss": 1.60303688, + "memory(GiB)": 117.38, + "step": 56890, + "train_speed(iter/s)": 1.638307 + }, + { + "acc": 0.64649925, + "epoch": 1.4433028919330289, + "grad_norm": 5.59375, + "learning_rate": 1.9728164343463764e-06, + "loss": 1.66878395, + "memory(GiB)": 117.38, + "step": 56895, + "train_speed(iter/s)": 1.638322 + }, + { + "acc": 0.67600212, + "epoch": 1.4434297311009638, + "grad_norm": 7.90625, + "learning_rate": 1.97198190820733e-06, + "loss": 1.57320032, + "memory(GiB)": 117.38, + "step": 56900, + "train_speed(iter/s)": 1.638338 + }, + { + "acc": 0.64314008, + "epoch": 1.443556570268899, + "grad_norm": 4.84375, + "learning_rate": 1.9711475152540376e-06, + "loss": 1.65345192, + "memory(GiB)": 117.38, + "step": 56905, + "train_speed(iter/s)": 1.638354 + }, + { + "acc": 0.65243139, + "epoch": 1.443683409436834, + "grad_norm": 5.5625, + "learning_rate": 1.9703132555232007e-06, + "loss": 1.66248093, + "memory(GiB)": 117.38, + "step": 56910, + "train_speed(iter/s)": 1.63837 + }, + { + "acc": 0.67233706, + "epoch": 1.4438102486047693, + "grad_norm": 5.4375, + "learning_rate": 1.9694791290515135e-06, + "loss": 1.57454777, + "memory(GiB)": 117.38, + "step": 56915, + "train_speed(iter/s)": 1.638386 + }, + { + "acc": 0.66005259, + "epoch": 1.4439370877727042, + "grad_norm": 6.78125, + "learning_rate": 1.968645135875665e-06, + "loss": 1.60908051, + "memory(GiB)": 117.38, + "step": 56920, + "train_speed(iter/s)": 1.638402 + }, + { + "acc": 0.65143685, + "epoch": 1.4440639269406392, + "grad_norm": 5.25, + "learning_rate": 1.967811276032335e-06, + "loss": 1.58872051, + "memory(GiB)": 117.38, + "step": 56925, + "train_speed(iter/s)": 1.638418 + }, + { + "acc": 0.64992266, + "epoch": 1.4441907661085742, + "grad_norm": 5.5625, + "learning_rate": 1.966977549558206e-06, + "loss": 1.64243546, + "memory(GiB)": 117.38, + "step": 56930, + "train_speed(iter/s)": 1.638435 + }, + { + "acc": 0.67083831, + "epoch": 1.4443176052765094, + "grad_norm": 5.03125, + "learning_rate": 1.966143956489945e-06, + "loss": 1.56842413, + "memory(GiB)": 117.38, + "step": 56935, + "train_speed(iter/s)": 1.638449 + }, + { + "acc": 0.67404747, + "epoch": 1.4444444444444444, + "grad_norm": 5.25, + "learning_rate": 1.965310496864217e-06, + "loss": 1.59623775, + "memory(GiB)": 117.38, + "step": 56940, + "train_speed(iter/s)": 1.638464 + }, + { + "acc": 0.66441669, + "epoch": 1.4445712836123796, + "grad_norm": 6.3125, + "learning_rate": 1.9644771707176813e-06, + "loss": 1.56829948, + "memory(GiB)": 117.38, + "step": 56945, + "train_speed(iter/s)": 1.638479 + }, + { + "acc": 0.64249125, + "epoch": 1.4446981227803146, + "grad_norm": 4.90625, + "learning_rate": 1.963643978086996e-06, + "loss": 1.60039444, + "memory(GiB)": 117.38, + "step": 56950, + "train_speed(iter/s)": 1.638494 + }, + { + "acc": 0.65368681, + "epoch": 1.4448249619482496, + "grad_norm": 6.125, + "learning_rate": 1.9628109190088023e-06, + "loss": 1.58377132, + "memory(GiB)": 117.38, + "step": 56955, + "train_speed(iter/s)": 1.638509 + }, + { + "acc": 0.65559635, + "epoch": 1.4449518011161846, + "grad_norm": 6.53125, + "learning_rate": 1.961977993519743e-06, + "loss": 1.64477673, + "memory(GiB)": 117.38, + "step": 56960, + "train_speed(iter/s)": 1.638524 + }, + { + "acc": 0.65625477, + "epoch": 1.4450786402841198, + "grad_norm": 5.53125, + "learning_rate": 1.9611452016564574e-06, + "loss": 1.54010897, + "memory(GiB)": 117.38, + "step": 56965, + "train_speed(iter/s)": 1.638541 + }, + { + "acc": 0.65136757, + "epoch": 1.4452054794520548, + "grad_norm": 5.84375, + "learning_rate": 1.960312543455575e-06, + "loss": 1.63736649, + "memory(GiB)": 117.38, + "step": 56970, + "train_speed(iter/s)": 1.638556 + }, + { + "acc": 0.6611475, + "epoch": 1.4453323186199898, + "grad_norm": 6.0625, + "learning_rate": 1.959480018953716e-06, + "loss": 1.61653004, + "memory(GiB)": 117.38, + "step": 56975, + "train_speed(iter/s)": 1.638572 + }, + { + "acc": 0.63838387, + "epoch": 1.445459157787925, + "grad_norm": 4.25, + "learning_rate": 1.9586476281874994e-06, + "loss": 1.64717407, + "memory(GiB)": 117.38, + "step": 56980, + "train_speed(iter/s)": 1.638588 + }, + { + "acc": 0.6426403, + "epoch": 1.44558599695586, + "grad_norm": 6.59375, + "learning_rate": 1.9578153711935403e-06, + "loss": 1.62657452, + "memory(GiB)": 117.38, + "step": 56985, + "train_speed(iter/s)": 1.638604 + }, + { + "acc": 0.64297981, + "epoch": 1.445712836123795, + "grad_norm": 6.375, + "learning_rate": 1.9569832480084456e-06, + "loss": 1.63859196, + "memory(GiB)": 117.38, + "step": 56990, + "train_speed(iter/s)": 1.63862 + }, + { + "acc": 0.64794579, + "epoch": 1.44583967529173, + "grad_norm": 7.96875, + "learning_rate": 1.9561512586688096e-06, + "loss": 1.67420025, + "memory(GiB)": 117.38, + "step": 56995, + "train_speed(iter/s)": 1.638635 + }, + { + "acc": 0.65910435, + "epoch": 1.4459665144596652, + "grad_norm": 5.65625, + "learning_rate": 1.9553194032112334e-06, + "loss": 1.56917953, + "memory(GiB)": 117.38, + "step": 57000, + "train_speed(iter/s)": 1.638651 + }, + { + "epoch": 1.4459665144596652, + "eval_acc": 0.6462751928028618, + "eval_loss": 1.5732783079147339, + "eval_runtime": 58.8551, + "eval_samples_per_second": 108.232, + "eval_steps_per_second": 27.066, + "step": 57000 + }, + { + "acc": 0.65983486, + "epoch": 1.4460933536276002, + "grad_norm": 10.0625, + "learning_rate": 1.954487681672303e-06, + "loss": 1.63470917, + "memory(GiB)": 117.38, + "step": 57005, + "train_speed(iter/s)": 1.635694 + }, + { + "acc": 0.66568861, + "epoch": 1.4462201927955354, + "grad_norm": 7.96875, + "learning_rate": 1.9536560940886033e-06, + "loss": 1.5311286, + "memory(GiB)": 117.38, + "step": 57010, + "train_speed(iter/s)": 1.635711 + }, + { + "acc": 0.63711953, + "epoch": 1.4463470319634704, + "grad_norm": 5.625, + "learning_rate": 1.9528246404967067e-06, + "loss": 1.67905521, + "memory(GiB)": 117.38, + "step": 57015, + "train_speed(iter/s)": 1.635726 + }, + { + "acc": 0.65817933, + "epoch": 1.4464738711314054, + "grad_norm": 6.0, + "learning_rate": 1.951993320933188e-06, + "loss": 1.62685814, + "memory(GiB)": 117.38, + "step": 57020, + "train_speed(iter/s)": 1.635742 + }, + { + "acc": 0.6541234, + "epoch": 1.4466007102993403, + "grad_norm": 6.78125, + "learning_rate": 1.951162135434612e-06, + "loss": 1.66277428, + "memory(GiB)": 117.38, + "step": 57025, + "train_speed(iter/s)": 1.635757 + }, + { + "acc": 0.66161323, + "epoch": 1.4467275494672756, + "grad_norm": 5.6875, + "learning_rate": 1.9503310840375374e-06, + "loss": 1.62241707, + "memory(GiB)": 117.38, + "step": 57030, + "train_speed(iter/s)": 1.635774 + }, + { + "acc": 0.65783582, + "epoch": 1.4468543886352105, + "grad_norm": 6.0625, + "learning_rate": 1.949500166778517e-06, + "loss": 1.67539177, + "memory(GiB)": 117.38, + "step": 57035, + "train_speed(iter/s)": 1.63579 + }, + { + "acc": 0.6523447, + "epoch": 1.4469812278031455, + "grad_norm": 5.625, + "learning_rate": 1.948669383694099e-06, + "loss": 1.6104454, + "memory(GiB)": 117.38, + "step": 57040, + "train_speed(iter/s)": 1.635806 + }, + { + "acc": 0.66037321, + "epoch": 1.4471080669710807, + "grad_norm": 6.0, + "learning_rate": 1.947838734820825e-06, + "loss": 1.54887028, + "memory(GiB)": 117.38, + "step": 57045, + "train_speed(iter/s)": 1.635818 + }, + { + "acc": 0.65758562, + "epoch": 1.4472349061390157, + "grad_norm": 5.96875, + "learning_rate": 1.94700822019523e-06, + "loss": 1.566259, + "memory(GiB)": 117.38, + "step": 57050, + "train_speed(iter/s)": 1.635833 + }, + { + "acc": 0.67645731, + "epoch": 1.4473617453069507, + "grad_norm": 7.28125, + "learning_rate": 1.9461778398538447e-06, + "loss": 1.52994108, + "memory(GiB)": 117.38, + "step": 57055, + "train_speed(iter/s)": 1.635848 + }, + { + "acc": 0.62973032, + "epoch": 1.4474885844748857, + "grad_norm": 5.75, + "learning_rate": 1.945347593833191e-06, + "loss": 1.73797112, + "memory(GiB)": 117.38, + "step": 57060, + "train_speed(iter/s)": 1.635864 + }, + { + "acc": 0.64589596, + "epoch": 1.447615423642821, + "grad_norm": 6.375, + "learning_rate": 1.9445174821697893e-06, + "loss": 1.65696621, + "memory(GiB)": 117.38, + "step": 57065, + "train_speed(iter/s)": 1.635879 + }, + { + "acc": 0.62811861, + "epoch": 1.447742262810756, + "grad_norm": 5.125, + "learning_rate": 1.94368750490015e-06, + "loss": 1.6904026, + "memory(GiB)": 117.38, + "step": 57070, + "train_speed(iter/s)": 1.635895 + }, + { + "acc": 0.67131414, + "epoch": 1.4478691019786911, + "grad_norm": 6.8125, + "learning_rate": 1.94285766206078e-06, + "loss": 1.53685827, + "memory(GiB)": 117.38, + "step": 57075, + "train_speed(iter/s)": 1.635911 + }, + { + "acc": 0.657477, + "epoch": 1.447995941146626, + "grad_norm": 6.90625, + "learning_rate": 1.9420279536881794e-06, + "loss": 1.64505463, + "memory(GiB)": 117.38, + "step": 57080, + "train_speed(iter/s)": 1.635927 + }, + { + "acc": 0.65891447, + "epoch": 1.448122780314561, + "grad_norm": 5.0625, + "learning_rate": 1.9411983798188398e-06, + "loss": 1.64063225, + "memory(GiB)": 117.38, + "step": 57085, + "train_speed(iter/s)": 1.635942 + }, + { + "acc": 0.64775205, + "epoch": 1.448249619482496, + "grad_norm": 5.96875, + "learning_rate": 1.940368940489256e-06, + "loss": 1.64184151, + "memory(GiB)": 117.38, + "step": 57090, + "train_speed(iter/s)": 1.635958 + }, + { + "acc": 0.66083183, + "epoch": 1.4483764586504313, + "grad_norm": 6.46875, + "learning_rate": 1.939539635735905e-06, + "loss": 1.57934237, + "memory(GiB)": 117.38, + "step": 57095, + "train_speed(iter/s)": 1.635973 + }, + { + "acc": 0.67137265, + "epoch": 1.4485032978183663, + "grad_norm": 6.5, + "learning_rate": 1.9387104655952625e-06, + "loss": 1.58787498, + "memory(GiB)": 117.38, + "step": 57100, + "train_speed(iter/s)": 1.635989 + }, + { + "acc": 0.65753026, + "epoch": 1.4486301369863015, + "grad_norm": 6.0, + "learning_rate": 1.9378814301038033e-06, + "loss": 1.55950699, + "memory(GiB)": 117.38, + "step": 57105, + "train_speed(iter/s)": 1.636003 + }, + { + "acc": 0.65536742, + "epoch": 1.4487569761542365, + "grad_norm": 6.03125, + "learning_rate": 1.937052529297992e-06, + "loss": 1.59135695, + "memory(GiB)": 117.38, + "step": 57110, + "train_speed(iter/s)": 1.636019 + }, + { + "acc": 0.67116528, + "epoch": 1.4488838153221715, + "grad_norm": 6.28125, + "learning_rate": 1.9362237632142838e-06, + "loss": 1.55093994, + "memory(GiB)": 117.38, + "step": 57115, + "train_speed(iter/s)": 1.636035 + }, + { + "acc": 0.67192211, + "epoch": 1.4490106544901065, + "grad_norm": 5.4375, + "learning_rate": 1.9353951318891313e-06, + "loss": 1.52644215, + "memory(GiB)": 117.38, + "step": 57120, + "train_speed(iter/s)": 1.636049 + }, + { + "acc": 0.66900139, + "epoch": 1.4491374936580417, + "grad_norm": 6.75, + "learning_rate": 1.9345666353589855e-06, + "loss": 1.5844758, + "memory(GiB)": 117.38, + "step": 57125, + "train_speed(iter/s)": 1.636065 + }, + { + "acc": 0.66390681, + "epoch": 1.4492643328259767, + "grad_norm": 6.5, + "learning_rate": 1.9337382736602868e-06, + "loss": 1.57343369, + "memory(GiB)": 117.38, + "step": 57130, + "train_speed(iter/s)": 1.63608 + }, + { + "acc": 0.65626774, + "epoch": 1.4493911719939117, + "grad_norm": 7.3125, + "learning_rate": 1.9329100468294646e-06, + "loss": 1.59688339, + "memory(GiB)": 117.38, + "step": 57135, + "train_speed(iter/s)": 1.636095 + }, + { + "acc": 0.66577759, + "epoch": 1.4495180111618469, + "grad_norm": 5.0, + "learning_rate": 1.9320819549029546e-06, + "loss": 1.54080753, + "memory(GiB)": 117.38, + "step": 57140, + "train_speed(iter/s)": 1.636111 + }, + { + "acc": 0.65928102, + "epoch": 1.4496448503297819, + "grad_norm": 5.46875, + "learning_rate": 1.9312539979171774e-06, + "loss": 1.5902153, + "memory(GiB)": 117.38, + "step": 57145, + "train_speed(iter/s)": 1.636126 + }, + { + "acc": 0.64968801, + "epoch": 1.4497716894977168, + "grad_norm": 4.8125, + "learning_rate": 1.9304261759085525e-06, + "loss": 1.57117939, + "memory(GiB)": 117.38, + "step": 57150, + "train_speed(iter/s)": 1.636141 + }, + { + "acc": 0.66583667, + "epoch": 1.4498985286656518, + "grad_norm": 6.21875, + "learning_rate": 1.929598488913485e-06, + "loss": 1.59090929, + "memory(GiB)": 117.38, + "step": 57155, + "train_speed(iter/s)": 1.636157 + }, + { + "acc": 0.66483445, + "epoch": 1.450025367833587, + "grad_norm": 4.78125, + "learning_rate": 1.928770936968386e-06, + "loss": 1.54363451, + "memory(GiB)": 117.38, + "step": 57160, + "train_speed(iter/s)": 1.636172 + }, + { + "acc": 0.67496901, + "epoch": 1.450152207001522, + "grad_norm": 5.09375, + "learning_rate": 1.927943520109653e-06, + "loss": 1.51304855, + "memory(GiB)": 117.38, + "step": 57165, + "train_speed(iter/s)": 1.636189 + }, + { + "acc": 0.65530767, + "epoch": 1.4502790461694572, + "grad_norm": 6.1875, + "learning_rate": 1.9271162383736804e-06, + "loss": 1.65991783, + "memory(GiB)": 117.38, + "step": 57170, + "train_speed(iter/s)": 1.636205 + }, + { + "acc": 0.66496162, + "epoch": 1.4504058853373922, + "grad_norm": 6.3125, + "learning_rate": 1.9262890917968547e-06, + "loss": 1.58260727, + "memory(GiB)": 117.38, + "step": 57175, + "train_speed(iter/s)": 1.63622 + }, + { + "acc": 0.64246774, + "epoch": 1.4505327245053272, + "grad_norm": 6.0625, + "learning_rate": 1.925462080415558e-06, + "loss": 1.72915134, + "memory(GiB)": 117.38, + "step": 57180, + "train_speed(iter/s)": 1.636236 + }, + { + "acc": 0.64967732, + "epoch": 1.4506595636732622, + "grad_norm": 5.90625, + "learning_rate": 1.924635204266166e-06, + "loss": 1.63484974, + "memory(GiB)": 117.38, + "step": 57185, + "train_speed(iter/s)": 1.636251 + }, + { + "acc": 0.66788197, + "epoch": 1.4507864028411974, + "grad_norm": 4.71875, + "learning_rate": 1.923808463385048e-06, + "loss": 1.56585808, + "memory(GiB)": 117.38, + "step": 57190, + "train_speed(iter/s)": 1.636266 + }, + { + "acc": 0.64836664, + "epoch": 1.4509132420091324, + "grad_norm": 5.15625, + "learning_rate": 1.922981857808568e-06, + "loss": 1.63585682, + "memory(GiB)": 117.38, + "step": 57195, + "train_speed(iter/s)": 1.636282 + }, + { + "acc": 0.64952722, + "epoch": 1.4510400811770674, + "grad_norm": 4.6875, + "learning_rate": 1.9221553875730835e-06, + "loss": 1.63347435, + "memory(GiB)": 117.38, + "step": 57200, + "train_speed(iter/s)": 1.636298 + }, + { + "acc": 0.6486619, + "epoch": 1.4511669203450026, + "grad_norm": 5.3125, + "learning_rate": 1.921329052714947e-06, + "loss": 1.60259781, + "memory(GiB)": 117.38, + "step": 57205, + "train_speed(iter/s)": 1.636314 + }, + { + "acc": 0.66280117, + "epoch": 1.4512937595129376, + "grad_norm": 6.34375, + "learning_rate": 1.920502853270504e-06, + "loss": 1.57172356, + "memory(GiB)": 117.38, + "step": 57210, + "train_speed(iter/s)": 1.63633 + }, + { + "acc": 0.65556793, + "epoch": 1.4514205986808726, + "grad_norm": 5.1875, + "learning_rate": 1.919676789276094e-06, + "loss": 1.63801041, + "memory(GiB)": 117.38, + "step": 57215, + "train_speed(iter/s)": 1.636346 + }, + { + "acc": 0.65053039, + "epoch": 1.4515474378488076, + "grad_norm": 6.71875, + "learning_rate": 1.918850860768052e-06, + "loss": 1.67173576, + "memory(GiB)": 117.38, + "step": 57220, + "train_speed(iter/s)": 1.636361 + }, + { + "acc": 0.65983396, + "epoch": 1.4516742770167428, + "grad_norm": 6.3125, + "learning_rate": 1.918025067782704e-06, + "loss": 1.57795067, + "memory(GiB)": 117.38, + "step": 57225, + "train_speed(iter/s)": 1.636377 + }, + { + "acc": 0.6572083, + "epoch": 1.4518011161846778, + "grad_norm": 5.59375, + "learning_rate": 1.9171994103563766e-06, + "loss": 1.63053951, + "memory(GiB)": 117.38, + "step": 57230, + "train_speed(iter/s)": 1.636392 + }, + { + "acc": 0.63794103, + "epoch": 1.451927955352613, + "grad_norm": 5.84375, + "learning_rate": 1.916373888525381e-06, + "loss": 1.71828842, + "memory(GiB)": 117.38, + "step": 57235, + "train_speed(iter/s)": 1.636407 + }, + { + "acc": 0.63756962, + "epoch": 1.452054794520548, + "grad_norm": 6.3125, + "learning_rate": 1.9155485023260294e-06, + "loss": 1.65989151, + "memory(GiB)": 117.38, + "step": 57240, + "train_speed(iter/s)": 1.636423 + }, + { + "acc": 0.64877052, + "epoch": 1.452181633688483, + "grad_norm": 5.21875, + "learning_rate": 1.914723251794624e-06, + "loss": 1.57665539, + "memory(GiB)": 117.38, + "step": 57245, + "train_speed(iter/s)": 1.636437 + }, + { + "acc": 0.66091881, + "epoch": 1.452308472856418, + "grad_norm": 6.0, + "learning_rate": 1.9138981369674688e-06, + "loss": 1.64759636, + "memory(GiB)": 117.38, + "step": 57250, + "train_speed(iter/s)": 1.636453 + }, + { + "acc": 0.64368081, + "epoch": 1.4524353120243532, + "grad_norm": 5.25, + "learning_rate": 1.9130731578808493e-06, + "loss": 1.63277321, + "memory(GiB)": 117.38, + "step": 57255, + "train_speed(iter/s)": 1.636469 + }, + { + "acc": 0.64925041, + "epoch": 1.4525621511922882, + "grad_norm": 5.5, + "learning_rate": 1.912248314571053e-06, + "loss": 1.63647041, + "memory(GiB)": 117.38, + "step": 57260, + "train_speed(iter/s)": 1.636484 + }, + { + "acc": 0.6674449, + "epoch": 1.4526889903602234, + "grad_norm": 6.375, + "learning_rate": 1.9114236070743638e-06, + "loss": 1.58786201, + "memory(GiB)": 117.38, + "step": 57265, + "train_speed(iter/s)": 1.636499 + }, + { + "acc": 0.66013389, + "epoch": 1.4528158295281584, + "grad_norm": 5.46875, + "learning_rate": 1.910599035427055e-06, + "loss": 1.60119324, + "memory(GiB)": 117.38, + "step": 57270, + "train_speed(iter/s)": 1.636515 + }, + { + "acc": 0.6530014, + "epoch": 1.4529426686960933, + "grad_norm": 7.125, + "learning_rate": 1.909774599665392e-06, + "loss": 1.55767431, + "memory(GiB)": 117.38, + "step": 57275, + "train_speed(iter/s)": 1.636531 + }, + { + "acc": 0.66979566, + "epoch": 1.4530695078640283, + "grad_norm": 7.25, + "learning_rate": 1.9089502998256382e-06, + "loss": 1.53461914, + "memory(GiB)": 117.38, + "step": 57280, + "train_speed(iter/s)": 1.636546 + }, + { + "acc": 0.65918865, + "epoch": 1.4531963470319635, + "grad_norm": 6.71875, + "learning_rate": 1.9081261359440517e-06, + "loss": 1.58309984, + "memory(GiB)": 117.38, + "step": 57285, + "train_speed(iter/s)": 1.636561 + }, + { + "acc": 0.67190862, + "epoch": 1.4533231861998985, + "grad_norm": 6.3125, + "learning_rate": 1.9073021080568837e-06, + "loss": 1.55598526, + "memory(GiB)": 117.38, + "step": 57290, + "train_speed(iter/s)": 1.636577 + }, + { + "acc": 0.65918016, + "epoch": 1.4534500253678335, + "grad_norm": 6.5, + "learning_rate": 1.9064782162003737e-06, + "loss": 1.55874691, + "memory(GiB)": 117.38, + "step": 57295, + "train_speed(iter/s)": 1.636594 + }, + { + "acc": 0.65292559, + "epoch": 1.4535768645357687, + "grad_norm": 5.21875, + "learning_rate": 1.9056544604107646e-06, + "loss": 1.56381721, + "memory(GiB)": 117.38, + "step": 57300, + "train_speed(iter/s)": 1.636609 + }, + { + "acc": 0.65367217, + "epoch": 1.4537037037037037, + "grad_norm": 6.28125, + "learning_rate": 1.9048308407242882e-06, + "loss": 1.59002361, + "memory(GiB)": 117.38, + "step": 57305, + "train_speed(iter/s)": 1.636625 + }, + { + "acc": 0.65160046, + "epoch": 1.4538305428716387, + "grad_norm": 6.96875, + "learning_rate": 1.90400735717717e-06, + "loss": 1.62826805, + "memory(GiB)": 117.38, + "step": 57310, + "train_speed(iter/s)": 1.63664 + }, + { + "acc": 0.64498286, + "epoch": 1.4539573820395737, + "grad_norm": 5.90625, + "learning_rate": 1.903184009805631e-06, + "loss": 1.57962074, + "memory(GiB)": 117.38, + "step": 57315, + "train_speed(iter/s)": 1.636655 + }, + { + "acc": 0.64771957, + "epoch": 1.454084221207509, + "grad_norm": 5.25, + "learning_rate": 1.9023607986458854e-06, + "loss": 1.70104218, + "memory(GiB)": 117.38, + "step": 57320, + "train_speed(iter/s)": 1.636671 + }, + { + "acc": 0.66043119, + "epoch": 1.454211060375444, + "grad_norm": 5.4375, + "learning_rate": 1.901537723734142e-06, + "loss": 1.5954442, + "memory(GiB)": 117.38, + "step": 57325, + "train_speed(iter/s)": 1.636687 + }, + { + "acc": 0.66115427, + "epoch": 1.454337899543379, + "grad_norm": 4.84375, + "learning_rate": 1.9007147851066031e-06, + "loss": 1.56853418, + "memory(GiB)": 117.38, + "step": 57330, + "train_speed(iter/s)": 1.636702 + }, + { + "acc": 0.67260337, + "epoch": 1.454464738711314, + "grad_norm": 5.625, + "learning_rate": 1.8998919827994654e-06, + "loss": 1.49298887, + "memory(GiB)": 117.38, + "step": 57335, + "train_speed(iter/s)": 1.636718 + }, + { + "acc": 0.65983915, + "epoch": 1.454591577879249, + "grad_norm": 5.8125, + "learning_rate": 1.899069316848919e-06, + "loss": 1.64879665, + "memory(GiB)": 117.38, + "step": 57340, + "train_speed(iter/s)": 1.636734 + }, + { + "acc": 0.66017113, + "epoch": 1.454718417047184, + "grad_norm": 5.0, + "learning_rate": 1.8982467872911486e-06, + "loss": 1.68707008, + "memory(GiB)": 117.38, + "step": 57345, + "train_speed(iter/s)": 1.636749 + }, + { + "acc": 0.65039816, + "epoch": 1.4548452562151193, + "grad_norm": 5.15625, + "learning_rate": 1.8974243941623332e-06, + "loss": 1.61987057, + "memory(GiB)": 117.38, + "step": 57350, + "train_speed(iter/s)": 1.636765 + }, + { + "acc": 0.64611359, + "epoch": 1.4549720953830543, + "grad_norm": 6.59375, + "learning_rate": 1.896602137498645e-06, + "loss": 1.68173237, + "memory(GiB)": 117.38, + "step": 57355, + "train_speed(iter/s)": 1.63678 + }, + { + "acc": 0.65224314, + "epoch": 1.4550989345509893, + "grad_norm": 5.75, + "learning_rate": 1.89578001733625e-06, + "loss": 1.60310459, + "memory(GiB)": 117.38, + "step": 57360, + "train_speed(iter/s)": 1.636796 + }, + { + "acc": 0.66426563, + "epoch": 1.4552257737189245, + "grad_norm": 5.65625, + "learning_rate": 1.8949580337113078e-06, + "loss": 1.56433716, + "memory(GiB)": 117.38, + "step": 57365, + "train_speed(iter/s)": 1.636811 + }, + { + "acc": 0.64145288, + "epoch": 1.4553526128868595, + "grad_norm": 6.40625, + "learning_rate": 1.8941361866599778e-06, + "loss": 1.66888504, + "memory(GiB)": 117.38, + "step": 57370, + "train_speed(iter/s)": 1.636826 + }, + { + "acc": 0.64828243, + "epoch": 1.4554794520547945, + "grad_norm": 5.125, + "learning_rate": 1.893314476218403e-06, + "loss": 1.638554, + "memory(GiB)": 117.38, + "step": 57375, + "train_speed(iter/s)": 1.636842 + }, + { + "acc": 0.65154319, + "epoch": 1.4556062912227294, + "grad_norm": 5.0, + "learning_rate": 1.8924929024227279e-06, + "loss": 1.60959511, + "memory(GiB)": 117.38, + "step": 57380, + "train_speed(iter/s)": 1.636858 + }, + { + "acc": 0.64478154, + "epoch": 1.4557331303906647, + "grad_norm": 6.0625, + "learning_rate": 1.8916714653090874e-06, + "loss": 1.65695953, + "memory(GiB)": 117.38, + "step": 57385, + "train_speed(iter/s)": 1.636873 + }, + { + "acc": 0.65126963, + "epoch": 1.4558599695585996, + "grad_norm": 5.09375, + "learning_rate": 1.8908501649136174e-06, + "loss": 1.6137291, + "memory(GiB)": 117.38, + "step": 57390, + "train_speed(iter/s)": 1.636889 + }, + { + "acc": 0.6604413, + "epoch": 1.4559868087265349, + "grad_norm": 5.625, + "learning_rate": 1.8900290012724358e-06, + "loss": 1.57032309, + "memory(GiB)": 117.38, + "step": 57395, + "train_speed(iter/s)": 1.636905 + }, + { + "acc": 0.66509604, + "epoch": 1.4561136478944698, + "grad_norm": 6.125, + "learning_rate": 1.889207974421663e-06, + "loss": 1.53082504, + "memory(GiB)": 117.38, + "step": 57400, + "train_speed(iter/s)": 1.63692 + }, + { + "acc": 0.67246957, + "epoch": 1.4562404870624048, + "grad_norm": 14.0, + "learning_rate": 1.8883870843974134e-06, + "loss": 1.51925716, + "memory(GiB)": 117.38, + "step": 57405, + "train_speed(iter/s)": 1.636935 + }, + { + "acc": 0.63677759, + "epoch": 1.4563673262303398, + "grad_norm": 6.0, + "learning_rate": 1.887566331235794e-06, + "loss": 1.71867752, + "memory(GiB)": 117.38, + "step": 57410, + "train_speed(iter/s)": 1.636951 + }, + { + "acc": 0.64237099, + "epoch": 1.456494165398275, + "grad_norm": 7.34375, + "learning_rate": 1.8867457149729013e-06, + "loss": 1.74663906, + "memory(GiB)": 117.38, + "step": 57415, + "train_speed(iter/s)": 1.636967 + }, + { + "acc": 0.65436869, + "epoch": 1.45662100456621, + "grad_norm": 5.34375, + "learning_rate": 1.8859252356448305e-06, + "loss": 1.60367146, + "memory(GiB)": 117.38, + "step": 57420, + "train_speed(iter/s)": 1.636982 + }, + { + "acc": 0.66943574, + "epoch": 1.4567478437341452, + "grad_norm": 6.625, + "learning_rate": 1.8851048932876725e-06, + "loss": 1.52174606, + "memory(GiB)": 117.38, + "step": 57425, + "train_speed(iter/s)": 1.636996 + }, + { + "acc": 0.67208447, + "epoch": 1.4568746829020802, + "grad_norm": 5.59375, + "learning_rate": 1.8842846879375104e-06, + "loss": 1.5853405, + "memory(GiB)": 117.38, + "step": 57430, + "train_speed(iter/s)": 1.637012 + }, + { + "acc": 0.66352468, + "epoch": 1.4570015220700152, + "grad_norm": 6.5625, + "learning_rate": 1.8834646196304146e-06, + "loss": 1.58954268, + "memory(GiB)": 117.38, + "step": 57435, + "train_speed(iter/s)": 1.637027 + }, + { + "acc": 0.6562253, + "epoch": 1.4571283612379502, + "grad_norm": 6.59375, + "learning_rate": 1.8826446884024612e-06, + "loss": 1.68271179, + "memory(GiB)": 117.38, + "step": 57440, + "train_speed(iter/s)": 1.637043 + }, + { + "acc": 0.66146083, + "epoch": 1.4572552004058854, + "grad_norm": 4.90625, + "learning_rate": 1.8818248942897122e-06, + "loss": 1.5480011, + "memory(GiB)": 117.38, + "step": 57445, + "train_speed(iter/s)": 1.637058 + }, + { + "acc": 0.64915457, + "epoch": 1.4573820395738204, + "grad_norm": 11.625, + "learning_rate": 1.8810052373282277e-06, + "loss": 1.6108326, + "memory(GiB)": 117.38, + "step": 57450, + "train_speed(iter/s)": 1.637074 + }, + { + "acc": 0.65405035, + "epoch": 1.4575088787417554, + "grad_norm": 5.78125, + "learning_rate": 1.880185717554055e-06, + "loss": 1.62377968, + "memory(GiB)": 117.38, + "step": 57455, + "train_speed(iter/s)": 1.637089 + }, + { + "acc": 0.64473839, + "epoch": 1.4576357179096906, + "grad_norm": 5.625, + "learning_rate": 1.879366335003245e-06, + "loss": 1.64802246, + "memory(GiB)": 117.38, + "step": 57460, + "train_speed(iter/s)": 1.637104 + }, + { + "acc": 0.6480298, + "epoch": 1.4577625570776256, + "grad_norm": 6.53125, + "learning_rate": 1.8785470897118362e-06, + "loss": 1.64533787, + "memory(GiB)": 117.38, + "step": 57465, + "train_speed(iter/s)": 1.63712 + }, + { + "acc": 0.65634394, + "epoch": 1.4578893962455606, + "grad_norm": 5.9375, + "learning_rate": 1.8777279817158627e-06, + "loss": 1.59434662, + "memory(GiB)": 117.38, + "step": 57470, + "train_speed(iter/s)": 1.637136 + }, + { + "acc": 0.66031961, + "epoch": 1.4580162354134956, + "grad_norm": 5.15625, + "learning_rate": 1.8769090110513522e-06, + "loss": 1.59686947, + "memory(GiB)": 117.38, + "step": 57475, + "train_speed(iter/s)": 1.637152 + }, + { + "acc": 0.642873, + "epoch": 1.4581430745814308, + "grad_norm": 6.09375, + "learning_rate": 1.8760901777543273e-06, + "loss": 1.67422428, + "memory(GiB)": 117.38, + "step": 57480, + "train_speed(iter/s)": 1.637167 + }, + { + "acc": 0.66464052, + "epoch": 1.4582699137493658, + "grad_norm": 5.8125, + "learning_rate": 1.8752714818608036e-06, + "loss": 1.58630896, + "memory(GiB)": 117.38, + "step": 57485, + "train_speed(iter/s)": 1.637182 + }, + { + "acc": 0.66656523, + "epoch": 1.458396752917301, + "grad_norm": 5.40625, + "learning_rate": 1.874452923406791e-06, + "loss": 1.57180309, + "memory(GiB)": 117.38, + "step": 57490, + "train_speed(iter/s)": 1.637198 + }, + { + "acc": 0.65065546, + "epoch": 1.458523592085236, + "grad_norm": 5.4375, + "learning_rate": 1.8736345024282937e-06, + "loss": 1.63041382, + "memory(GiB)": 117.38, + "step": 57495, + "train_speed(iter/s)": 1.637214 + }, + { + "acc": 0.6569746, + "epoch": 1.458650431253171, + "grad_norm": 6.78125, + "learning_rate": 1.8728162189613085e-06, + "loss": 1.56969585, + "memory(GiB)": 117.38, + "step": 57500, + "train_speed(iter/s)": 1.637229 + }, + { + "acc": 0.66193199, + "epoch": 1.458777270421106, + "grad_norm": 7.21875, + "learning_rate": 1.8719980730418285e-06, + "loss": 1.61484318, + "memory(GiB)": 117.38, + "step": 57505, + "train_speed(iter/s)": 1.637243 + }, + { + "acc": 0.64654274, + "epoch": 1.4589041095890412, + "grad_norm": 6.875, + "learning_rate": 1.8711800647058388e-06, + "loss": 1.60167351, + "memory(GiB)": 117.38, + "step": 57510, + "train_speed(iter/s)": 1.637259 + }, + { + "acc": 0.64001398, + "epoch": 1.4590309487569761, + "grad_norm": 5.8125, + "learning_rate": 1.8703621939893185e-06, + "loss": 1.6887989, + "memory(GiB)": 117.38, + "step": 57515, + "train_speed(iter/s)": 1.637274 + }, + { + "acc": 0.64813566, + "epoch": 1.4591577879249111, + "grad_norm": 7.0625, + "learning_rate": 1.869544460928242e-06, + "loss": 1.58926125, + "memory(GiB)": 117.38, + "step": 57520, + "train_speed(iter/s)": 1.63729 + }, + { + "acc": 0.65228615, + "epoch": 1.4592846270928463, + "grad_norm": 5.5, + "learning_rate": 1.868726865558575e-06, + "loss": 1.59925556, + "memory(GiB)": 117.38, + "step": 57525, + "train_speed(iter/s)": 1.637304 + }, + { + "acc": 0.65611172, + "epoch": 1.4594114662607813, + "grad_norm": 5.0, + "learning_rate": 1.8679094079162835e-06, + "loss": 1.55919685, + "memory(GiB)": 117.38, + "step": 57530, + "train_speed(iter/s)": 1.637319 + }, + { + "acc": 0.65540648, + "epoch": 1.4595383054287163, + "grad_norm": 7.59375, + "learning_rate": 1.867092088037319e-06, + "loss": 1.57990208, + "memory(GiB)": 117.38, + "step": 57535, + "train_speed(iter/s)": 1.637335 + }, + { + "acc": 0.64379635, + "epoch": 1.4596651445966513, + "grad_norm": 5.9375, + "learning_rate": 1.8662749059576296e-06, + "loss": 1.65772171, + "memory(GiB)": 117.38, + "step": 57540, + "train_speed(iter/s)": 1.637352 + }, + { + "acc": 0.65981836, + "epoch": 1.4597919837645865, + "grad_norm": 7.28125, + "learning_rate": 1.865457861713163e-06, + "loss": 1.64376106, + "memory(GiB)": 117.38, + "step": 57545, + "train_speed(iter/s)": 1.637367 + }, + { + "acc": 0.67210875, + "epoch": 1.4599188229325215, + "grad_norm": 5.84375, + "learning_rate": 1.8646409553398558e-06, + "loss": 1.54521103, + "memory(GiB)": 117.38, + "step": 57550, + "train_speed(iter/s)": 1.637382 + }, + { + "acc": 0.66094265, + "epoch": 1.4600456621004567, + "grad_norm": 5.40625, + "learning_rate": 1.8638241868736367e-06, + "loss": 1.57712822, + "memory(GiB)": 117.38, + "step": 57555, + "train_speed(iter/s)": 1.637397 + }, + { + "acc": 0.65462923, + "epoch": 1.4601725012683917, + "grad_norm": 6.0625, + "learning_rate": 1.8630075563504297e-06, + "loss": 1.65783138, + "memory(GiB)": 117.38, + "step": 57560, + "train_speed(iter/s)": 1.637412 + }, + { + "acc": 0.6534667, + "epoch": 1.4602993404363267, + "grad_norm": 5.28125, + "learning_rate": 1.8621910638061575e-06, + "loss": 1.61833553, + "memory(GiB)": 117.38, + "step": 57565, + "train_speed(iter/s)": 1.637428 + }, + { + "acc": 0.64576778, + "epoch": 1.4604261796042617, + "grad_norm": 5.8125, + "learning_rate": 1.8613747092767336e-06, + "loss": 1.62031097, + "memory(GiB)": 117.38, + "step": 57570, + "train_speed(iter/s)": 1.637443 + }, + { + "acc": 0.65791631, + "epoch": 1.460553018772197, + "grad_norm": 5.5, + "learning_rate": 1.8605584927980596e-06, + "loss": 1.58129625, + "memory(GiB)": 117.38, + "step": 57575, + "train_speed(iter/s)": 1.637458 + }, + { + "acc": 0.65510769, + "epoch": 1.4606798579401319, + "grad_norm": 6.0625, + "learning_rate": 1.859742414406041e-06, + "loss": 1.54510098, + "memory(GiB)": 117.38, + "step": 57580, + "train_speed(iter/s)": 1.637473 + }, + { + "acc": 0.68095303, + "epoch": 1.460806697108067, + "grad_norm": 5.71875, + "learning_rate": 1.8589264741365714e-06, + "loss": 1.45404854, + "memory(GiB)": 117.38, + "step": 57585, + "train_speed(iter/s)": 1.637489 + }, + { + "acc": 0.64815416, + "epoch": 1.460933536276002, + "grad_norm": 4.71875, + "learning_rate": 1.8581106720255414e-06, + "loss": 1.6676609, + "memory(GiB)": 117.38, + "step": 57590, + "train_speed(iter/s)": 1.637504 + }, + { + "acc": 0.65925474, + "epoch": 1.461060375443937, + "grad_norm": 6.3125, + "learning_rate": 1.8572950081088282e-06, + "loss": 1.58584385, + "memory(GiB)": 117.38, + "step": 57595, + "train_speed(iter/s)": 1.637519 + }, + { + "acc": 0.65165644, + "epoch": 1.461187214611872, + "grad_norm": 6.15625, + "learning_rate": 1.856479482422313e-06, + "loss": 1.57179966, + "memory(GiB)": 117.38, + "step": 57600, + "train_speed(iter/s)": 1.637535 + }, + { + "acc": 0.66127176, + "epoch": 1.4613140537798073, + "grad_norm": 6.71875, + "learning_rate": 1.8556640950018651e-06, + "loss": 1.5802146, + "memory(GiB)": 117.38, + "step": 57605, + "train_speed(iter/s)": 1.63755 + }, + { + "acc": 0.67094517, + "epoch": 1.4614408929477423, + "grad_norm": 6.8125, + "learning_rate": 1.8548488458833485e-06, + "loss": 1.55746155, + "memory(GiB)": 117.38, + "step": 57610, + "train_speed(iter/s)": 1.637565 + }, + { + "acc": 0.66236334, + "epoch": 1.4615677321156773, + "grad_norm": 5.15625, + "learning_rate": 1.854033735102622e-06, + "loss": 1.60938721, + "memory(GiB)": 117.38, + "step": 57615, + "train_speed(iter/s)": 1.63758 + }, + { + "acc": 0.65521197, + "epoch": 1.4616945712836125, + "grad_norm": 6.90625, + "learning_rate": 1.8532187626955377e-06, + "loss": 1.6438303, + "memory(GiB)": 117.38, + "step": 57620, + "train_speed(iter/s)": 1.637597 + }, + { + "acc": 0.66102042, + "epoch": 1.4618214104515475, + "grad_norm": 5.6875, + "learning_rate": 1.8524039286979417e-06, + "loss": 1.62839165, + "memory(GiB)": 117.38, + "step": 57625, + "train_speed(iter/s)": 1.637612 + }, + { + "acc": 0.66023965, + "epoch": 1.4619482496194824, + "grad_norm": 5.65625, + "learning_rate": 1.8515892331456736e-06, + "loss": 1.61627388, + "memory(GiB)": 117.38, + "step": 57630, + "train_speed(iter/s)": 1.637627 + }, + { + "acc": 0.68634062, + "epoch": 1.4620750887874174, + "grad_norm": 5.53125, + "learning_rate": 1.850774676074568e-06, + "loss": 1.47016926, + "memory(GiB)": 117.38, + "step": 57635, + "train_speed(iter/s)": 1.637643 + }, + { + "acc": 0.65195532, + "epoch": 1.4622019279553526, + "grad_norm": 5.625, + "learning_rate": 1.8499602575204522e-06, + "loss": 1.64178848, + "memory(GiB)": 117.38, + "step": 57640, + "train_speed(iter/s)": 1.637657 + }, + { + "acc": 0.64850588, + "epoch": 1.4623287671232876, + "grad_norm": 5.3125, + "learning_rate": 1.8491459775191484e-06, + "loss": 1.58326969, + "memory(GiB)": 117.38, + "step": 57645, + "train_speed(iter/s)": 1.637673 + }, + { + "acc": 0.64244986, + "epoch": 1.4624556062912228, + "grad_norm": 6.09375, + "learning_rate": 1.8483318361064716e-06, + "loss": 1.6479784, + "memory(GiB)": 117.38, + "step": 57650, + "train_speed(iter/s)": 1.637689 + }, + { + "acc": 0.66887412, + "epoch": 1.4625824454591578, + "grad_norm": 6.09375, + "learning_rate": 1.847517833318232e-06, + "loss": 1.58741856, + "memory(GiB)": 117.38, + "step": 57655, + "train_speed(iter/s)": 1.637705 + }, + { + "acc": 0.67089691, + "epoch": 1.4627092846270928, + "grad_norm": 6.03125, + "learning_rate": 1.8467039691902334e-06, + "loss": 1.49151707, + "memory(GiB)": 117.38, + "step": 57660, + "train_speed(iter/s)": 1.63772 + }, + { + "acc": 0.66514549, + "epoch": 1.4628361237950278, + "grad_norm": 5.65625, + "learning_rate": 1.8458902437582705e-06, + "loss": 1.57454929, + "memory(GiB)": 117.38, + "step": 57665, + "train_speed(iter/s)": 1.637735 + }, + { + "acc": 0.65667682, + "epoch": 1.462962962962963, + "grad_norm": 5.25, + "learning_rate": 1.8450766570581402e-06, + "loss": 1.63363724, + "memory(GiB)": 117.38, + "step": 57670, + "train_speed(iter/s)": 1.63775 + }, + { + "acc": 0.65358829, + "epoch": 1.463089802130898, + "grad_norm": 5.9375, + "learning_rate": 1.8442632091256223e-06, + "loss": 1.60633087, + "memory(GiB)": 117.38, + "step": 57675, + "train_speed(iter/s)": 1.637766 + }, + { + "acc": 0.65549574, + "epoch": 1.463216641298833, + "grad_norm": 5.46875, + "learning_rate": 1.8434498999964983e-06, + "loss": 1.59054184, + "memory(GiB)": 117.38, + "step": 57680, + "train_speed(iter/s)": 1.63778 + }, + { + "acc": 0.66804919, + "epoch": 1.4633434804667682, + "grad_norm": 5.625, + "learning_rate": 1.8426367297065384e-06, + "loss": 1.56434231, + "memory(GiB)": 117.38, + "step": 57685, + "train_speed(iter/s)": 1.637795 + }, + { + "acc": 0.64665184, + "epoch": 1.4634703196347032, + "grad_norm": 5.25, + "learning_rate": 1.841823698291516e-06, + "loss": 1.66826744, + "memory(GiB)": 117.38, + "step": 57690, + "train_speed(iter/s)": 1.63781 + }, + { + "acc": 0.66567688, + "epoch": 1.4635971588026382, + "grad_norm": 5.8125, + "learning_rate": 1.8410108057871851e-06, + "loss": 1.53914165, + "memory(GiB)": 117.38, + "step": 57695, + "train_speed(iter/s)": 1.637825 + }, + { + "acc": 0.66354566, + "epoch": 1.4637239979705732, + "grad_norm": 8.0625, + "learning_rate": 1.8401980522293017e-06, + "loss": 1.55408611, + "memory(GiB)": 117.38, + "step": 57700, + "train_speed(iter/s)": 1.63784 + }, + { + "acc": 0.64570875, + "epoch": 1.4638508371385084, + "grad_norm": 8.125, + "learning_rate": 1.839385437653617e-06, + "loss": 1.68113461, + "memory(GiB)": 117.38, + "step": 57705, + "train_speed(iter/s)": 1.637855 + }, + { + "acc": 0.65801697, + "epoch": 1.4639776763064434, + "grad_norm": 4.78125, + "learning_rate": 1.8385729620958731e-06, + "loss": 1.64358425, + "memory(GiB)": 117.38, + "step": 57710, + "train_speed(iter/s)": 1.63787 + }, + { + "acc": 0.65845752, + "epoch": 1.4641045154743786, + "grad_norm": 6.6875, + "learning_rate": 1.8377606255918024e-06, + "loss": 1.59352703, + "memory(GiB)": 117.38, + "step": 57715, + "train_speed(iter/s)": 1.637885 + }, + { + "acc": 0.64405642, + "epoch": 1.4642313546423136, + "grad_norm": 5.90625, + "learning_rate": 1.8369484281771388e-06, + "loss": 1.65216274, + "memory(GiB)": 117.38, + "step": 57720, + "train_speed(iter/s)": 1.6379 + }, + { + "acc": 0.65615058, + "epoch": 1.4643581938102486, + "grad_norm": 5.28125, + "learning_rate": 1.836136369887606e-06, + "loss": 1.57136021, + "memory(GiB)": 117.38, + "step": 57725, + "train_speed(iter/s)": 1.637915 + }, + { + "acc": 0.66528053, + "epoch": 1.4644850329781836, + "grad_norm": 12.0, + "learning_rate": 1.8353244507589225e-06, + "loss": 1.58454514, + "memory(GiB)": 117.38, + "step": 57730, + "train_speed(iter/s)": 1.63793 + }, + { + "acc": 0.6485703, + "epoch": 1.4646118721461188, + "grad_norm": 5.3125, + "learning_rate": 1.8345126708267958e-06, + "loss": 1.60334282, + "memory(GiB)": 117.38, + "step": 57735, + "train_speed(iter/s)": 1.637945 + }, + { + "acc": 0.6711556, + "epoch": 1.4647387113140538, + "grad_norm": 6.25, + "learning_rate": 1.8337010301269364e-06, + "loss": 1.54415798, + "memory(GiB)": 117.38, + "step": 57740, + "train_speed(iter/s)": 1.63796 + }, + { + "acc": 0.65799427, + "epoch": 1.464865550481989, + "grad_norm": 7.125, + "learning_rate": 1.8328895286950422e-06, + "loss": 1.61259537, + "memory(GiB)": 117.38, + "step": 57745, + "train_speed(iter/s)": 1.637976 + }, + { + "acc": 0.64889278, + "epoch": 1.464992389649924, + "grad_norm": 5.78125, + "learning_rate": 1.8320781665668063e-06, + "loss": 1.61780052, + "memory(GiB)": 117.38, + "step": 57750, + "train_speed(iter/s)": 1.637991 + }, + { + "acc": 0.63451982, + "epoch": 1.465119228817859, + "grad_norm": 5.6875, + "learning_rate": 1.8312669437779167e-06, + "loss": 1.71501007, + "memory(GiB)": 117.38, + "step": 57755, + "train_speed(iter/s)": 1.638006 + }, + { + "acc": 0.65335474, + "epoch": 1.465246067985794, + "grad_norm": 5.59375, + "learning_rate": 1.8304558603640544e-06, + "loss": 1.64657059, + "memory(GiB)": 117.38, + "step": 57760, + "train_speed(iter/s)": 1.638021 + }, + { + "acc": 0.64696302, + "epoch": 1.4653729071537291, + "grad_norm": 5.125, + "learning_rate": 1.8296449163608942e-06, + "loss": 1.67038918, + "memory(GiB)": 117.38, + "step": 57765, + "train_speed(iter/s)": 1.638035 + }, + { + "acc": 0.67117066, + "epoch": 1.4654997463216641, + "grad_norm": 5.09375, + "learning_rate": 1.8288341118041052e-06, + "loss": 1.4601757, + "memory(GiB)": 117.38, + "step": 57770, + "train_speed(iter/s)": 1.63805 + }, + { + "acc": 0.66161356, + "epoch": 1.4656265854895991, + "grad_norm": 6.34375, + "learning_rate": 1.82802344672935e-06, + "loss": 1.62998199, + "memory(GiB)": 117.38, + "step": 57775, + "train_speed(iter/s)": 1.638066 + }, + { + "acc": 0.64639921, + "epoch": 1.4657534246575343, + "grad_norm": 5.40625, + "learning_rate": 1.8272129211722855e-06, + "loss": 1.60705299, + "memory(GiB)": 117.38, + "step": 57780, + "train_speed(iter/s)": 1.638081 + }, + { + "acc": 0.65424089, + "epoch": 1.4658802638254693, + "grad_norm": 6.4375, + "learning_rate": 1.8264025351685627e-06, + "loss": 1.63999844, + "memory(GiB)": 117.38, + "step": 57785, + "train_speed(iter/s)": 1.638096 + }, + { + "acc": 0.65920467, + "epoch": 1.4660071029934043, + "grad_norm": 6.0625, + "learning_rate": 1.8255922887538251e-06, + "loss": 1.56166487, + "memory(GiB)": 117.38, + "step": 57790, + "train_speed(iter/s)": 1.638111 + }, + { + "acc": 0.65805445, + "epoch": 1.4661339421613393, + "grad_norm": 9.625, + "learning_rate": 1.8247821819637112e-06, + "loss": 1.57964792, + "memory(GiB)": 117.38, + "step": 57795, + "train_speed(iter/s)": 1.638127 + }, + { + "acc": 0.65947523, + "epoch": 1.4662607813292745, + "grad_norm": 7.5625, + "learning_rate": 1.8239722148338534e-06, + "loss": 1.55246401, + "memory(GiB)": 117.38, + "step": 57800, + "train_speed(iter/s)": 1.638143 + }, + { + "acc": 0.65045786, + "epoch": 1.4663876204972095, + "grad_norm": 5.5625, + "learning_rate": 1.823162387399876e-06, + "loss": 1.64657764, + "memory(GiB)": 117.38, + "step": 57805, + "train_speed(iter/s)": 1.638157 + }, + { + "acc": 0.67498369, + "epoch": 1.4665144596651447, + "grad_norm": 5.90625, + "learning_rate": 1.822352699697404e-06, + "loss": 1.55461931, + "memory(GiB)": 117.38, + "step": 57810, + "train_speed(iter/s)": 1.638173 + }, + { + "acc": 0.66299582, + "epoch": 1.4666412988330797, + "grad_norm": 5.59375, + "learning_rate": 1.8215431517620452e-06, + "loss": 1.602038, + "memory(GiB)": 117.38, + "step": 57815, + "train_speed(iter/s)": 1.638188 + }, + { + "acc": 0.64773974, + "epoch": 1.4667681380010147, + "grad_norm": 5.8125, + "learning_rate": 1.8207337436294097e-06, + "loss": 1.71699619, + "memory(GiB)": 117.38, + "step": 57820, + "train_speed(iter/s)": 1.638203 + }, + { + "acc": 0.65654716, + "epoch": 1.4668949771689497, + "grad_norm": 6.53125, + "learning_rate": 1.819924475335097e-06, + "loss": 1.59594221, + "memory(GiB)": 117.38, + "step": 57825, + "train_speed(iter/s)": 1.638218 + }, + { + "acc": 0.65022173, + "epoch": 1.4670218163368849, + "grad_norm": 6.125, + "learning_rate": 1.8191153469147065e-06, + "loss": 1.57646351, + "memory(GiB)": 117.38, + "step": 57830, + "train_speed(iter/s)": 1.638233 + }, + { + "acc": 0.64668159, + "epoch": 1.4671486555048199, + "grad_norm": 5.28125, + "learning_rate": 1.8183063584038236e-06, + "loss": 1.68735371, + "memory(GiB)": 117.38, + "step": 57835, + "train_speed(iter/s)": 1.638249 + }, + { + "acc": 0.6517664, + "epoch": 1.4672754946727549, + "grad_norm": 5.5625, + "learning_rate": 1.8174975098380304e-06, + "loss": 1.62523823, + "memory(GiB)": 117.38, + "step": 57840, + "train_speed(iter/s)": 1.638265 + }, + { + "acc": 0.66627073, + "epoch": 1.46740233384069, + "grad_norm": 5.53125, + "learning_rate": 1.8166888012529078e-06, + "loss": 1.57713547, + "memory(GiB)": 117.38, + "step": 57845, + "train_speed(iter/s)": 1.638279 + }, + { + "acc": 0.65476637, + "epoch": 1.467529173008625, + "grad_norm": 6.0, + "learning_rate": 1.8158802326840252e-06, + "loss": 1.55509501, + "memory(GiB)": 117.38, + "step": 57850, + "train_speed(iter/s)": 1.638294 + }, + { + "acc": 0.6728941, + "epoch": 1.46765601217656, + "grad_norm": 4.625, + "learning_rate": 1.8150718041669447e-06, + "loss": 1.53818569, + "memory(GiB)": 117.38, + "step": 57855, + "train_speed(iter/s)": 1.638308 + }, + { + "acc": 0.65525231, + "epoch": 1.467782851344495, + "grad_norm": 5.75, + "learning_rate": 1.814263515737224e-06, + "loss": 1.61912804, + "memory(GiB)": 117.38, + "step": 57860, + "train_speed(iter/s)": 1.638322 + }, + { + "acc": 0.65495143, + "epoch": 1.4679096905124303, + "grad_norm": 4.5, + "learning_rate": 1.813455367430419e-06, + "loss": 1.55582657, + "memory(GiB)": 117.38, + "step": 57865, + "train_speed(iter/s)": 1.638337 + }, + { + "acc": 0.65118151, + "epoch": 1.4680365296803652, + "grad_norm": 5.46875, + "learning_rate": 1.812647359282076e-06, + "loss": 1.7126833, + "memory(GiB)": 117.38, + "step": 57870, + "train_speed(iter/s)": 1.638352 + }, + { + "acc": 0.64035082, + "epoch": 1.4681633688483005, + "grad_norm": 5.15625, + "learning_rate": 1.8118394913277287e-06, + "loss": 1.60532188, + "memory(GiB)": 117.38, + "step": 57875, + "train_speed(iter/s)": 1.638121 + }, + { + "acc": 0.64400587, + "epoch": 1.4682902080162354, + "grad_norm": 5.9375, + "learning_rate": 1.8110317636029162e-06, + "loss": 1.59559479, + "memory(GiB)": 117.38, + "step": 57880, + "train_speed(iter/s)": 1.638136 + }, + { + "acc": 0.66109982, + "epoch": 1.4684170471841704, + "grad_norm": 5.90625, + "learning_rate": 1.810224176143165e-06, + "loss": 1.64881477, + "memory(GiB)": 117.38, + "step": 57885, + "train_speed(iter/s)": 1.63815 + }, + { + "acc": 0.6538269, + "epoch": 1.4685438863521054, + "grad_norm": 6.34375, + "learning_rate": 1.8094167289839953e-06, + "loss": 1.52976017, + "memory(GiB)": 117.38, + "step": 57890, + "train_speed(iter/s)": 1.638164 + }, + { + "acc": 0.64798522, + "epoch": 1.4686707255200406, + "grad_norm": 6.28125, + "learning_rate": 1.808609422160923e-06, + "loss": 1.66295319, + "memory(GiB)": 117.38, + "step": 57895, + "train_speed(iter/s)": 1.638179 + }, + { + "acc": 0.65884228, + "epoch": 1.4687975646879756, + "grad_norm": 6.03125, + "learning_rate": 1.8078022557094571e-06, + "loss": 1.59045715, + "memory(GiB)": 117.38, + "step": 57900, + "train_speed(iter/s)": 1.638193 + }, + { + "acc": 0.6462811, + "epoch": 1.4689244038559108, + "grad_norm": 5.96875, + "learning_rate": 1.8069952296651e-06, + "loss": 1.63971462, + "memory(GiB)": 117.38, + "step": 57905, + "train_speed(iter/s)": 1.638208 + }, + { + "acc": 0.64991851, + "epoch": 1.4690512430238458, + "grad_norm": 6.9375, + "learning_rate": 1.8061883440633481e-06, + "loss": 1.6211853, + "memory(GiB)": 117.38, + "step": 57910, + "train_speed(iter/s)": 1.638223 + }, + { + "acc": 0.6713758, + "epoch": 1.4691780821917808, + "grad_norm": 5.78125, + "learning_rate": 1.8053815989396927e-06, + "loss": 1.58132153, + "memory(GiB)": 117.38, + "step": 57915, + "train_speed(iter/s)": 1.638238 + }, + { + "acc": 0.66793137, + "epoch": 1.4693049213597158, + "grad_norm": 5.8125, + "learning_rate": 1.8045749943296171e-06, + "loss": 1.54448557, + "memory(GiB)": 117.38, + "step": 57920, + "train_speed(iter/s)": 1.638253 + }, + { + "acc": 0.65428467, + "epoch": 1.469431760527651, + "grad_norm": 6.53125, + "learning_rate": 1.8037685302686003e-06, + "loss": 1.6049469, + "memory(GiB)": 117.38, + "step": 57925, + "train_speed(iter/s)": 1.638268 + }, + { + "acc": 0.65100832, + "epoch": 1.469558599695586, + "grad_norm": 5.84375, + "learning_rate": 1.8029622067921133e-06, + "loss": 1.67594109, + "memory(GiB)": 117.38, + "step": 57930, + "train_speed(iter/s)": 1.638283 + }, + { + "acc": 0.66949358, + "epoch": 1.469685438863521, + "grad_norm": 6.8125, + "learning_rate": 1.8021560239356223e-06, + "loss": 1.52854862, + "memory(GiB)": 117.38, + "step": 57935, + "train_speed(iter/s)": 1.638298 + }, + { + "acc": 0.66745014, + "epoch": 1.4698122780314562, + "grad_norm": 5.15625, + "learning_rate": 1.8013499817345865e-06, + "loss": 1.57253742, + "memory(GiB)": 117.38, + "step": 57940, + "train_speed(iter/s)": 1.638314 + }, + { + "acc": 0.66670198, + "epoch": 1.4699391171993912, + "grad_norm": 5.78125, + "learning_rate": 1.8005440802244595e-06, + "loss": 1.60223846, + "memory(GiB)": 117.38, + "step": 57945, + "train_speed(iter/s)": 1.638329 + }, + { + "acc": 0.66258121, + "epoch": 1.4700659563673262, + "grad_norm": 4.90625, + "learning_rate": 1.7997383194406887e-06, + "loss": 1.55077305, + "memory(GiB)": 117.38, + "step": 57950, + "train_speed(iter/s)": 1.638344 + }, + { + "acc": 0.66343145, + "epoch": 1.4701927955352612, + "grad_norm": 5.0625, + "learning_rate": 1.7989326994187146e-06, + "loss": 1.60424995, + "memory(GiB)": 117.38, + "step": 57955, + "train_speed(iter/s)": 1.638358 + }, + { + "acc": 0.66509609, + "epoch": 1.4703196347031964, + "grad_norm": 6.1875, + "learning_rate": 1.798127220193972e-06, + "loss": 1.61679001, + "memory(GiB)": 117.38, + "step": 57960, + "train_speed(iter/s)": 1.638374 + }, + { + "acc": 0.64575863, + "epoch": 1.4704464738711314, + "grad_norm": 6.59375, + "learning_rate": 1.7973218818018878e-06, + "loss": 1.60709801, + "memory(GiB)": 117.38, + "step": 57965, + "train_speed(iter/s)": 1.63839 + }, + { + "acc": 0.65588737, + "epoch": 1.4705733130390666, + "grad_norm": 5.40625, + "learning_rate": 1.7965166842778897e-06, + "loss": 1.60618763, + "memory(GiB)": 117.38, + "step": 57970, + "train_speed(iter/s)": 1.638405 + }, + { + "acc": 0.66633224, + "epoch": 1.4707001522070016, + "grad_norm": 5.15625, + "learning_rate": 1.7957116276573888e-06, + "loss": 1.60000439, + "memory(GiB)": 117.38, + "step": 57975, + "train_speed(iter/s)": 1.638419 + }, + { + "acc": 0.6571146, + "epoch": 1.4708269913749366, + "grad_norm": 6.5, + "learning_rate": 1.7949067119757951e-06, + "loss": 1.56607904, + "memory(GiB)": 117.38, + "step": 57980, + "train_speed(iter/s)": 1.638433 + }, + { + "acc": 0.64114771, + "epoch": 1.4709538305428715, + "grad_norm": 5.875, + "learning_rate": 1.7941019372685154e-06, + "loss": 1.65334702, + "memory(GiB)": 117.38, + "step": 57985, + "train_speed(iter/s)": 1.638448 + }, + { + "acc": 0.65712395, + "epoch": 1.4710806697108068, + "grad_norm": 7.1875, + "learning_rate": 1.7932973035709471e-06, + "loss": 1.53186407, + "memory(GiB)": 117.38, + "step": 57990, + "train_speed(iter/s)": 1.638462 + }, + { + "acc": 0.65712624, + "epoch": 1.4712075088787417, + "grad_norm": 6.875, + "learning_rate": 1.792492810918479e-06, + "loss": 1.60296822, + "memory(GiB)": 117.38, + "step": 57995, + "train_speed(iter/s)": 1.638477 + }, + { + "acc": 0.64174623, + "epoch": 1.4713343480466767, + "grad_norm": 5.53125, + "learning_rate": 1.7916884593464957e-06, + "loss": 1.651194, + "memory(GiB)": 117.38, + "step": 58000, + "train_speed(iter/s)": 1.638491 + }, + { + "epoch": 1.4713343480466767, + "eval_acc": 0.6463060997333233, + "eval_loss": 1.5731773376464844, + "eval_runtime": 58.4715, + "eval_samples_per_second": 108.942, + "eval_steps_per_second": 27.244, + "step": 58000 + }, + { + "acc": 0.65707092, + "epoch": 1.471461187214612, + "grad_norm": 6.59375, + "learning_rate": 1.79088424889038e-06, + "loss": 1.65774002, + "memory(GiB)": 117.38, + "step": 58005, + "train_speed(iter/s)": 1.635603 + }, + { + "acc": 0.65804262, + "epoch": 1.471588026382547, + "grad_norm": 6.78125, + "learning_rate": 1.7900801795855043e-06, + "loss": 1.56835489, + "memory(GiB)": 117.38, + "step": 58010, + "train_speed(iter/s)": 1.635619 + }, + { + "acc": 0.65304799, + "epoch": 1.471714865550482, + "grad_norm": 6.5, + "learning_rate": 1.7892762514672303e-06, + "loss": 1.59533587, + "memory(GiB)": 117.38, + "step": 58015, + "train_speed(iter/s)": 1.635634 + }, + { + "acc": 0.64941578, + "epoch": 1.471841704718417, + "grad_norm": 6.28125, + "learning_rate": 1.7884724645709228e-06, + "loss": 1.66165295, + "memory(GiB)": 117.38, + "step": 58020, + "train_speed(iter/s)": 1.635649 + }, + { + "acc": 0.66016474, + "epoch": 1.4719685438863521, + "grad_norm": 5.46875, + "learning_rate": 1.7876688189319353e-06, + "loss": 1.62394047, + "memory(GiB)": 117.38, + "step": 58025, + "train_speed(iter/s)": 1.635665 + }, + { + "acc": 0.66501589, + "epoch": 1.472095383054287, + "grad_norm": 4.59375, + "learning_rate": 1.7868653145856163e-06, + "loss": 1.54129066, + "memory(GiB)": 117.38, + "step": 58030, + "train_speed(iter/s)": 1.63568 + }, + { + "acc": 0.67722635, + "epoch": 1.4722222222222223, + "grad_norm": 6.5, + "learning_rate": 1.7860619515673034e-06, + "loss": 1.46738453, + "memory(GiB)": 117.38, + "step": 58035, + "train_speed(iter/s)": 1.635697 + }, + { + "acc": 0.67051888, + "epoch": 1.4723490613901573, + "grad_norm": 5.1875, + "learning_rate": 1.785258729912337e-06, + "loss": 1.58444004, + "memory(GiB)": 117.38, + "step": 58040, + "train_speed(iter/s)": 1.635712 + }, + { + "acc": 0.63772449, + "epoch": 1.4724759005580923, + "grad_norm": 5.59375, + "learning_rate": 1.784455649656044e-06, + "loss": 1.70479698, + "memory(GiB)": 117.38, + "step": 58045, + "train_speed(iter/s)": 1.635727 + }, + { + "acc": 0.66457853, + "epoch": 1.4726027397260273, + "grad_norm": 4.71875, + "learning_rate": 1.7836527108337482e-06, + "loss": 1.58913021, + "memory(GiB)": 117.38, + "step": 58050, + "train_speed(iter/s)": 1.635742 + }, + { + "acc": 0.66410203, + "epoch": 1.4727295788939625, + "grad_norm": 4.8125, + "learning_rate": 1.782849913480766e-06, + "loss": 1.58055525, + "memory(GiB)": 117.38, + "step": 58055, + "train_speed(iter/s)": 1.635758 + }, + { + "acc": 0.65571756, + "epoch": 1.4728564180618975, + "grad_norm": 5.34375, + "learning_rate": 1.7820472576324078e-06, + "loss": 1.60898399, + "memory(GiB)": 117.38, + "step": 58060, + "train_speed(iter/s)": 1.635774 + }, + { + "acc": 0.66803703, + "epoch": 1.4729832572298327, + "grad_norm": 5.5625, + "learning_rate": 1.7812447433239789e-06, + "loss": 1.54877119, + "memory(GiB)": 117.38, + "step": 58065, + "train_speed(iter/s)": 1.63579 + }, + { + "acc": 0.64985929, + "epoch": 1.4731100963977677, + "grad_norm": 5.46875, + "learning_rate": 1.7804423705907764e-06, + "loss": 1.71767006, + "memory(GiB)": 117.38, + "step": 58070, + "train_speed(iter/s)": 1.635806 + }, + { + "acc": 0.65960436, + "epoch": 1.4732369355657027, + "grad_norm": 5.03125, + "learning_rate": 1.779640139468093e-06, + "loss": 1.5335762, + "memory(GiB)": 117.38, + "step": 58075, + "train_speed(iter/s)": 1.635822 + }, + { + "acc": 0.65477872, + "epoch": 1.4733637747336377, + "grad_norm": 6.15625, + "learning_rate": 1.778838049991214e-06, + "loss": 1.63821888, + "memory(GiB)": 117.38, + "step": 58080, + "train_speed(iter/s)": 1.635838 + }, + { + "acc": 0.64955473, + "epoch": 1.4734906139015729, + "grad_norm": 5.53125, + "learning_rate": 1.778036102195419e-06, + "loss": 1.58565264, + "memory(GiB)": 117.38, + "step": 58085, + "train_speed(iter/s)": 1.635853 + }, + { + "acc": 0.65545125, + "epoch": 1.4736174530695079, + "grad_norm": 5.34375, + "learning_rate": 1.7772342961159817e-06, + "loss": 1.60906105, + "memory(GiB)": 117.38, + "step": 58090, + "train_speed(iter/s)": 1.635868 + }, + { + "acc": 0.65875525, + "epoch": 1.4737442922374429, + "grad_norm": 6.375, + "learning_rate": 1.7764326317881681e-06, + "loss": 1.59845238, + "memory(GiB)": 117.38, + "step": 58095, + "train_speed(iter/s)": 1.635884 + }, + { + "acc": 0.66537104, + "epoch": 1.473871131405378, + "grad_norm": 6.90625, + "learning_rate": 1.77563110924724e-06, + "loss": 1.59925146, + "memory(GiB)": 117.38, + "step": 58100, + "train_speed(iter/s)": 1.6359 + }, + { + "acc": 0.64920645, + "epoch": 1.473997970573313, + "grad_norm": 5.6875, + "learning_rate": 1.7748297285284494e-06, + "loss": 1.61591263, + "memory(GiB)": 117.38, + "step": 58105, + "train_speed(iter/s)": 1.635916 + }, + { + "acc": 0.66434312, + "epoch": 1.474124809741248, + "grad_norm": 7.3125, + "learning_rate": 1.7740284896670507e-06, + "loss": 1.56024122, + "memory(GiB)": 117.38, + "step": 58110, + "train_speed(iter/s)": 1.635932 + }, + { + "acc": 0.66220188, + "epoch": 1.474251648909183, + "grad_norm": 5.9375, + "learning_rate": 1.7732273926982796e-06, + "loss": 1.58656502, + "memory(GiB)": 117.38, + "step": 58115, + "train_speed(iter/s)": 1.635948 + }, + { + "acc": 0.65404601, + "epoch": 1.4743784880771182, + "grad_norm": 5.21875, + "learning_rate": 1.7724264376573747e-06, + "loss": 1.54699917, + "memory(GiB)": 117.38, + "step": 58120, + "train_speed(iter/s)": 1.635964 + }, + { + "acc": 0.64706726, + "epoch": 1.4745053272450532, + "grad_norm": 5.65625, + "learning_rate": 1.7716256245795631e-06, + "loss": 1.65383949, + "memory(GiB)": 117.38, + "step": 58125, + "train_speed(iter/s)": 1.63598 + }, + { + "acc": 0.65086303, + "epoch": 1.4746321664129884, + "grad_norm": 5.8125, + "learning_rate": 1.7708249535000737e-06, + "loss": 1.63307858, + "memory(GiB)": 117.38, + "step": 58130, + "train_speed(iter/s)": 1.635996 + }, + { + "acc": 0.64020905, + "epoch": 1.4747590055809234, + "grad_norm": 5.75, + "learning_rate": 1.7700244244541182e-06, + "loss": 1.61890812, + "memory(GiB)": 117.38, + "step": 58135, + "train_speed(iter/s)": 1.636012 + }, + { + "acc": 0.66835909, + "epoch": 1.4748858447488584, + "grad_norm": 6.1875, + "learning_rate": 1.7692240374769081e-06, + "loss": 1.59846554, + "memory(GiB)": 117.38, + "step": 58140, + "train_speed(iter/s)": 1.636028 + }, + { + "acc": 0.65756869, + "epoch": 1.4750126839167934, + "grad_norm": 6.90625, + "learning_rate": 1.7684237926036507e-06, + "loss": 1.65764236, + "memory(GiB)": 117.38, + "step": 58145, + "train_speed(iter/s)": 1.636043 + }, + { + "acc": 0.64486589, + "epoch": 1.4751395230847286, + "grad_norm": 5.40625, + "learning_rate": 1.7676236898695442e-06, + "loss": 1.6357933, + "memory(GiB)": 117.38, + "step": 58150, + "train_speed(iter/s)": 1.63606 + }, + { + "acc": 0.65813298, + "epoch": 1.4752663622526636, + "grad_norm": 5.78125, + "learning_rate": 1.7668237293097762e-06, + "loss": 1.63641014, + "memory(GiB)": 117.38, + "step": 58155, + "train_speed(iter/s)": 1.636077 + }, + { + "acc": 0.66899223, + "epoch": 1.4753932014205986, + "grad_norm": 5.71875, + "learning_rate": 1.7660239109595374e-06, + "loss": 1.54526062, + "memory(GiB)": 117.38, + "step": 58160, + "train_speed(iter/s)": 1.636091 + }, + { + "acc": 0.63701143, + "epoch": 1.4755200405885338, + "grad_norm": 8.9375, + "learning_rate": 1.7652242348540056e-06, + "loss": 1.69027996, + "memory(GiB)": 117.38, + "step": 58165, + "train_speed(iter/s)": 1.636108 + }, + { + "acc": 0.65848742, + "epoch": 1.4756468797564688, + "grad_norm": 5.40625, + "learning_rate": 1.764424701028356e-06, + "loss": 1.5350193, + "memory(GiB)": 117.38, + "step": 58170, + "train_speed(iter/s)": 1.636123 + }, + { + "acc": 0.64417906, + "epoch": 1.4757737189244038, + "grad_norm": 5.625, + "learning_rate": 1.7636253095177507e-06, + "loss": 1.6628521, + "memory(GiB)": 117.38, + "step": 58175, + "train_speed(iter/s)": 1.636139 + }, + { + "acc": 0.63725128, + "epoch": 1.4759005580923388, + "grad_norm": 5.5, + "learning_rate": 1.762826060357355e-06, + "loss": 1.59830732, + "memory(GiB)": 117.38, + "step": 58180, + "train_speed(iter/s)": 1.636155 + }, + { + "acc": 0.65749836, + "epoch": 1.476027397260274, + "grad_norm": 5.84375, + "learning_rate": 1.762026953582322e-06, + "loss": 1.56753473, + "memory(GiB)": 117.38, + "step": 58185, + "train_speed(iter/s)": 1.636167 + }, + { + "acc": 0.65688167, + "epoch": 1.476154236428209, + "grad_norm": 6.53125, + "learning_rate": 1.7612279892278006e-06, + "loss": 1.58181448, + "memory(GiB)": 117.38, + "step": 58190, + "train_speed(iter/s)": 1.636183 + }, + { + "acc": 0.66077704, + "epoch": 1.4762810755961442, + "grad_norm": 6.53125, + "learning_rate": 1.7604291673289314e-06, + "loss": 1.58817921, + "memory(GiB)": 117.38, + "step": 58195, + "train_speed(iter/s)": 1.636199 + }, + { + "acc": 0.66322813, + "epoch": 1.4764079147640792, + "grad_norm": 5.625, + "learning_rate": 1.759630487920852e-06, + "loss": 1.51315727, + "memory(GiB)": 117.38, + "step": 58200, + "train_speed(iter/s)": 1.636214 + }, + { + "acc": 0.65955381, + "epoch": 1.4765347539320142, + "grad_norm": 5.0, + "learning_rate": 1.7588319510386903e-06, + "loss": 1.564571, + "memory(GiB)": 117.38, + "step": 58205, + "train_speed(iter/s)": 1.63623 + }, + { + "acc": 0.65465112, + "epoch": 1.4766615930999492, + "grad_norm": 5.03125, + "learning_rate": 1.7580335567175704e-06, + "loss": 1.58541908, + "memory(GiB)": 117.38, + "step": 58210, + "train_speed(iter/s)": 1.636246 + }, + { + "acc": 0.63913035, + "epoch": 1.4767884322678844, + "grad_norm": 5.1875, + "learning_rate": 1.7572353049926094e-06, + "loss": 1.71614113, + "memory(GiB)": 117.38, + "step": 58215, + "train_speed(iter/s)": 1.636261 + }, + { + "acc": 0.66926317, + "epoch": 1.4769152714358194, + "grad_norm": 5.03125, + "learning_rate": 1.7564371958989173e-06, + "loss": 1.52571945, + "memory(GiB)": 117.38, + "step": 58220, + "train_speed(iter/s)": 1.636277 + }, + { + "acc": 0.66025009, + "epoch": 1.4770421106037546, + "grad_norm": 7.78125, + "learning_rate": 1.7556392294715984e-06, + "loss": 1.60552883, + "memory(GiB)": 117.38, + "step": 58225, + "train_speed(iter/s)": 1.636293 + }, + { + "acc": 0.64934721, + "epoch": 1.4771689497716896, + "grad_norm": 6.15625, + "learning_rate": 1.7548414057457518e-06, + "loss": 1.60746765, + "memory(GiB)": 117.38, + "step": 58230, + "train_speed(iter/s)": 1.636309 + }, + { + "acc": 0.65709004, + "epoch": 1.4772957889396245, + "grad_norm": 5.0, + "learning_rate": 1.7540437247564685e-06, + "loss": 1.60906181, + "memory(GiB)": 117.38, + "step": 58235, + "train_speed(iter/s)": 1.636325 + }, + { + "acc": 0.67588587, + "epoch": 1.4774226281075595, + "grad_norm": 4.65625, + "learning_rate": 1.7532461865388345e-06, + "loss": 1.51192713, + "memory(GiB)": 117.38, + "step": 58240, + "train_speed(iter/s)": 1.636341 + }, + { + "acc": 0.6588172, + "epoch": 1.4775494672754947, + "grad_norm": 5.875, + "learning_rate": 1.752448791127927e-06, + "loss": 1.59473209, + "memory(GiB)": 117.38, + "step": 58245, + "train_speed(iter/s)": 1.636357 + }, + { + "acc": 0.66801414, + "epoch": 1.4776763064434297, + "grad_norm": 5.875, + "learning_rate": 1.7516515385588245e-06, + "loss": 1.59867077, + "memory(GiB)": 117.38, + "step": 58250, + "train_speed(iter/s)": 1.636372 + }, + { + "acc": 0.64722624, + "epoch": 1.4778031456113647, + "grad_norm": 6.65625, + "learning_rate": 1.7508544288665885e-06, + "loss": 1.6347002, + "memory(GiB)": 117.38, + "step": 58255, + "train_speed(iter/s)": 1.636389 + }, + { + "acc": 0.66878166, + "epoch": 1.4779299847793, + "grad_norm": 5.21875, + "learning_rate": 1.750057462086281e-06, + "loss": 1.57623711, + "memory(GiB)": 117.38, + "step": 58260, + "train_speed(iter/s)": 1.636405 + }, + { + "acc": 0.66516342, + "epoch": 1.478056823947235, + "grad_norm": 5.375, + "learning_rate": 1.7492606382529542e-06, + "loss": 1.58060951, + "memory(GiB)": 117.38, + "step": 58265, + "train_speed(iter/s)": 1.636421 + }, + { + "acc": 0.66816592, + "epoch": 1.47818366311517, + "grad_norm": 6.15625, + "learning_rate": 1.748463957401662e-06, + "loss": 1.59277229, + "memory(GiB)": 117.38, + "step": 58270, + "train_speed(iter/s)": 1.636438 + }, + { + "acc": 0.65403109, + "epoch": 1.478310502283105, + "grad_norm": 6.28125, + "learning_rate": 1.7476674195674404e-06, + "loss": 1.60921288, + "memory(GiB)": 117.38, + "step": 58275, + "train_speed(iter/s)": 1.636454 + }, + { + "acc": 0.66395659, + "epoch": 1.47843734145104, + "grad_norm": 7.21875, + "learning_rate": 1.7468710247853244e-06, + "loss": 1.60885086, + "memory(GiB)": 117.38, + "step": 58280, + "train_speed(iter/s)": 1.63647 + }, + { + "acc": 0.65570054, + "epoch": 1.478564180618975, + "grad_norm": 5.78125, + "learning_rate": 1.7460747730903466e-06, + "loss": 1.54865913, + "memory(GiB)": 117.38, + "step": 58285, + "train_speed(iter/s)": 1.636486 + }, + { + "acc": 0.65777349, + "epoch": 1.4786910197869103, + "grad_norm": 7.71875, + "learning_rate": 1.7452786645175297e-06, + "loss": 1.59784489, + "memory(GiB)": 117.38, + "step": 58290, + "train_speed(iter/s)": 1.636503 + }, + { + "acc": 0.64883413, + "epoch": 1.4788178589548453, + "grad_norm": 5.46875, + "learning_rate": 1.7444826991018864e-06, + "loss": 1.65592422, + "memory(GiB)": 117.38, + "step": 58295, + "train_speed(iter/s)": 1.636518 + }, + { + "acc": 0.65616913, + "epoch": 1.4789446981227803, + "grad_norm": 5.5, + "learning_rate": 1.7436868768784276e-06, + "loss": 1.59346552, + "memory(GiB)": 117.38, + "step": 58300, + "train_speed(iter/s)": 1.636534 + }, + { + "acc": 0.66765242, + "epoch": 1.4790715372907153, + "grad_norm": 6.84375, + "learning_rate": 1.7428911978821594e-06, + "loss": 1.59195023, + "memory(GiB)": 117.38, + "step": 58305, + "train_speed(iter/s)": 1.63655 + }, + { + "acc": 0.64725127, + "epoch": 1.4791983764586505, + "grad_norm": 6.1875, + "learning_rate": 1.7420956621480806e-06, + "loss": 1.63658428, + "memory(GiB)": 117.38, + "step": 58310, + "train_speed(iter/s)": 1.636566 + }, + { + "acc": 0.66420803, + "epoch": 1.4793252156265855, + "grad_norm": 4.875, + "learning_rate": 1.7413002697111765e-06, + "loss": 1.53049049, + "memory(GiB)": 117.38, + "step": 58315, + "train_speed(iter/s)": 1.636582 + }, + { + "acc": 0.66570177, + "epoch": 1.4794520547945205, + "grad_norm": 5.96875, + "learning_rate": 1.7405050206064372e-06, + "loss": 1.60504017, + "memory(GiB)": 117.38, + "step": 58320, + "train_speed(iter/s)": 1.636597 + }, + { + "acc": 0.65544472, + "epoch": 1.4795788939624557, + "grad_norm": 5.375, + "learning_rate": 1.73970991486884e-06, + "loss": 1.61635227, + "memory(GiB)": 117.38, + "step": 58325, + "train_speed(iter/s)": 1.636614 + }, + { + "acc": 0.64594116, + "epoch": 1.4797057331303907, + "grad_norm": 8.25, + "learning_rate": 1.7389149525333565e-06, + "loss": 1.64471931, + "memory(GiB)": 117.38, + "step": 58330, + "train_speed(iter/s)": 1.636628 + }, + { + "acc": 0.63516736, + "epoch": 1.4798325722983257, + "grad_norm": 5.25, + "learning_rate": 1.7381201336349535e-06, + "loss": 1.60180054, + "memory(GiB)": 117.38, + "step": 58335, + "train_speed(iter/s)": 1.636644 + }, + { + "acc": 0.65250211, + "epoch": 1.4799594114662606, + "grad_norm": 5.625, + "learning_rate": 1.7373254582085896e-06, + "loss": 1.60495605, + "memory(GiB)": 117.38, + "step": 58340, + "train_speed(iter/s)": 1.636659 + }, + { + "acc": 0.65782995, + "epoch": 1.4800862506341959, + "grad_norm": 7.375, + "learning_rate": 1.7365309262892194e-06, + "loss": 1.56491375, + "memory(GiB)": 117.38, + "step": 58345, + "train_speed(iter/s)": 1.636675 + }, + { + "acc": 0.63674431, + "epoch": 1.4802130898021308, + "grad_norm": 6.90625, + "learning_rate": 1.735736537911789e-06, + "loss": 1.67274647, + "memory(GiB)": 117.38, + "step": 58350, + "train_speed(iter/s)": 1.636692 + }, + { + "acc": 0.63673859, + "epoch": 1.480339928970066, + "grad_norm": 4.875, + "learning_rate": 1.7349422931112403e-06, + "loss": 1.66101303, + "memory(GiB)": 117.38, + "step": 58355, + "train_speed(iter/s)": 1.636708 + }, + { + "acc": 0.66338797, + "epoch": 1.480466768138001, + "grad_norm": 4.65625, + "learning_rate": 1.7341481919225062e-06, + "loss": 1.59977922, + "memory(GiB)": 117.38, + "step": 58360, + "train_speed(iter/s)": 1.636724 + }, + { + "acc": 0.65562878, + "epoch": 1.480593607305936, + "grad_norm": 5.46875, + "learning_rate": 1.733354234380516e-06, + "loss": 1.5597949, + "memory(GiB)": 117.38, + "step": 58365, + "train_speed(iter/s)": 1.63674 + }, + { + "acc": 0.65842876, + "epoch": 1.480720446473871, + "grad_norm": 5.25, + "learning_rate": 1.7325604205201912e-06, + "loss": 1.53835945, + "memory(GiB)": 117.38, + "step": 58370, + "train_speed(iter/s)": 1.636755 + }, + { + "acc": 0.6519269, + "epoch": 1.4808472856418062, + "grad_norm": 4.875, + "learning_rate": 1.7317667503764468e-06, + "loss": 1.5427165, + "memory(GiB)": 117.38, + "step": 58375, + "train_speed(iter/s)": 1.636771 + }, + { + "acc": 0.63990617, + "epoch": 1.4809741248097412, + "grad_norm": 5.75, + "learning_rate": 1.7309732239841926e-06, + "loss": 1.73910656, + "memory(GiB)": 117.38, + "step": 58380, + "train_speed(iter/s)": 1.636787 + }, + { + "acc": 0.66312122, + "epoch": 1.4811009639776764, + "grad_norm": 5.3125, + "learning_rate": 1.730179841378331e-06, + "loss": 1.5415411, + "memory(GiB)": 117.38, + "step": 58385, + "train_speed(iter/s)": 1.636803 + }, + { + "acc": 0.65903182, + "epoch": 1.4812278031456114, + "grad_norm": 5.21875, + "learning_rate": 1.7293866025937589e-06, + "loss": 1.57861357, + "memory(GiB)": 117.38, + "step": 58390, + "train_speed(iter/s)": 1.636819 + }, + { + "acc": 0.66187086, + "epoch": 1.4813546423135464, + "grad_norm": 5.65625, + "learning_rate": 1.7285935076653659e-06, + "loss": 1.56488838, + "memory(GiB)": 117.38, + "step": 58395, + "train_speed(iter/s)": 1.636835 + }, + { + "acc": 0.67188573, + "epoch": 1.4814814814814814, + "grad_norm": 6.15625, + "learning_rate": 1.7278005566280365e-06, + "loss": 1.56783524, + "memory(GiB)": 117.38, + "step": 58400, + "train_speed(iter/s)": 1.636851 + }, + { + "acc": 0.64539118, + "epoch": 1.4816083206494166, + "grad_norm": 6.15625, + "learning_rate": 1.727007749516646e-06, + "loss": 1.67626705, + "memory(GiB)": 117.38, + "step": 58405, + "train_speed(iter/s)": 1.636867 + }, + { + "acc": 0.65323706, + "epoch": 1.4817351598173516, + "grad_norm": 5.125, + "learning_rate": 1.7262150863660709e-06, + "loss": 1.66554279, + "memory(GiB)": 117.38, + "step": 58410, + "train_speed(iter/s)": 1.636883 + }, + { + "acc": 0.65039778, + "epoch": 1.4818619989852866, + "grad_norm": 5.21875, + "learning_rate": 1.7254225672111713e-06, + "loss": 1.62035866, + "memory(GiB)": 117.38, + "step": 58415, + "train_speed(iter/s)": 1.636899 + }, + { + "acc": 0.65661621, + "epoch": 1.4819888381532218, + "grad_norm": 6.65625, + "learning_rate": 1.7246301920868052e-06, + "loss": 1.58803453, + "memory(GiB)": 117.38, + "step": 58420, + "train_speed(iter/s)": 1.636915 + }, + { + "acc": 0.66044226, + "epoch": 1.4821156773211568, + "grad_norm": 6.59375, + "learning_rate": 1.723837961027829e-06, + "loss": 1.4941515, + "memory(GiB)": 117.38, + "step": 58425, + "train_speed(iter/s)": 1.636931 + }, + { + "acc": 0.65878887, + "epoch": 1.4822425164890918, + "grad_norm": 6.84375, + "learning_rate": 1.723045874069087e-06, + "loss": 1.60147572, + "memory(GiB)": 117.38, + "step": 58430, + "train_speed(iter/s)": 1.636947 + }, + { + "acc": 0.66305676, + "epoch": 1.4823693556570268, + "grad_norm": 5.375, + "learning_rate": 1.7222539312454167e-06, + "loss": 1.64099464, + "memory(GiB)": 117.38, + "step": 58435, + "train_speed(iter/s)": 1.636964 + }, + { + "acc": 0.6523262, + "epoch": 1.482496194824962, + "grad_norm": 6.65625, + "learning_rate": 1.7214621325916515e-06, + "loss": 1.63348141, + "memory(GiB)": 117.38, + "step": 58440, + "train_speed(iter/s)": 1.636981 + }, + { + "acc": 0.67395077, + "epoch": 1.482623033992897, + "grad_norm": 9.0625, + "learning_rate": 1.7206704781426204e-06, + "loss": 1.55686607, + "memory(GiB)": 117.38, + "step": 58445, + "train_speed(iter/s)": 1.636996 + }, + { + "acc": 0.66666398, + "epoch": 1.4827498731608322, + "grad_norm": 5.90625, + "learning_rate": 1.7198789679331445e-06, + "loss": 1.58234825, + "memory(GiB)": 117.38, + "step": 58450, + "train_speed(iter/s)": 1.637012 + }, + { + "acc": 0.67018414, + "epoch": 1.4828767123287672, + "grad_norm": 4.09375, + "learning_rate": 1.7190876019980329e-06, + "loss": 1.52543058, + "memory(GiB)": 117.38, + "step": 58455, + "train_speed(iter/s)": 1.637027 + }, + { + "acc": 0.65292039, + "epoch": 1.4830035514967022, + "grad_norm": 5.5, + "learning_rate": 1.7182963803720987e-06, + "loss": 1.58325291, + "memory(GiB)": 117.38, + "step": 58460, + "train_speed(iter/s)": 1.637043 + }, + { + "acc": 0.63819237, + "epoch": 1.4831303906646371, + "grad_norm": 7.0625, + "learning_rate": 1.7175053030901418e-06, + "loss": 1.67336922, + "memory(GiB)": 117.38, + "step": 58465, + "train_speed(iter/s)": 1.637059 + }, + { + "acc": 0.66205082, + "epoch": 1.4832572298325724, + "grad_norm": 5.8125, + "learning_rate": 1.7167143701869582e-06, + "loss": 1.59941053, + "memory(GiB)": 117.38, + "step": 58470, + "train_speed(iter/s)": 1.637074 + }, + { + "acc": 0.65847273, + "epoch": 1.4833840690005073, + "grad_norm": 6.53125, + "learning_rate": 1.7159235816973318e-06, + "loss": 1.5616128, + "memory(GiB)": 117.38, + "step": 58475, + "train_speed(iter/s)": 1.637091 + }, + { + "acc": 0.66114902, + "epoch": 1.4835109081684423, + "grad_norm": 4.875, + "learning_rate": 1.7151329376560506e-06, + "loss": 1.62488689, + "memory(GiB)": 117.38, + "step": 58480, + "train_speed(iter/s)": 1.637106 + }, + { + "acc": 0.65020003, + "epoch": 1.4836377473363775, + "grad_norm": 4.625, + "learning_rate": 1.7143424380978885e-06, + "loss": 1.59014368, + "memory(GiB)": 117.38, + "step": 58485, + "train_speed(iter/s)": 1.637122 + }, + { + "acc": 0.65273342, + "epoch": 1.4837645865043125, + "grad_norm": 7.34375, + "learning_rate": 1.7135520830576157e-06, + "loss": 1.66563416, + "memory(GiB)": 117.38, + "step": 58490, + "train_speed(iter/s)": 1.637137 + }, + { + "acc": 0.66002617, + "epoch": 1.4838914256722475, + "grad_norm": 5.15625, + "learning_rate": 1.712761872569995e-06, + "loss": 1.58130112, + "memory(GiB)": 117.38, + "step": 58495, + "train_speed(iter/s)": 1.637153 + }, + { + "acc": 0.65367932, + "epoch": 1.4840182648401825, + "grad_norm": 5.375, + "learning_rate": 1.7119718066697838e-06, + "loss": 1.64396267, + "memory(GiB)": 117.38, + "step": 58500, + "train_speed(iter/s)": 1.637169 + }, + { + "acc": 0.65111408, + "epoch": 1.4841451040081177, + "grad_norm": 5.3125, + "learning_rate": 1.7111818853917323e-06, + "loss": 1.59455919, + "memory(GiB)": 117.38, + "step": 58505, + "train_speed(iter/s)": 1.637184 + }, + { + "acc": 0.6549758, + "epoch": 1.4842719431760527, + "grad_norm": 5.21875, + "learning_rate": 1.710392108770585e-06, + "loss": 1.65722713, + "memory(GiB)": 117.38, + "step": 58510, + "train_speed(iter/s)": 1.6372 + }, + { + "acc": 0.65516052, + "epoch": 1.484398782343988, + "grad_norm": 5.625, + "learning_rate": 1.7096024768410796e-06, + "loss": 1.65799637, + "memory(GiB)": 117.38, + "step": 58515, + "train_speed(iter/s)": 1.637216 + }, + { + "acc": 0.64082503, + "epoch": 1.484525621511923, + "grad_norm": 4.6875, + "learning_rate": 1.7088129896379484e-06, + "loss": 1.66720009, + "memory(GiB)": 117.38, + "step": 58520, + "train_speed(iter/s)": 1.637231 + }, + { + "acc": 0.63991313, + "epoch": 1.484652460679858, + "grad_norm": 5.375, + "learning_rate": 1.7080236471959155e-06, + "loss": 1.61568604, + "memory(GiB)": 117.38, + "step": 58525, + "train_speed(iter/s)": 1.637247 + }, + { + "acc": 0.65633616, + "epoch": 1.4847792998477929, + "grad_norm": 6.09375, + "learning_rate": 1.7072344495497007e-06, + "loss": 1.61981888, + "memory(GiB)": 117.38, + "step": 58530, + "train_speed(iter/s)": 1.637262 + }, + { + "acc": 0.68699226, + "epoch": 1.484906139015728, + "grad_norm": 5.6875, + "learning_rate": 1.7064453967340155e-06, + "loss": 1.50519199, + "memory(GiB)": 117.38, + "step": 58535, + "train_speed(iter/s)": 1.637278 + }, + { + "acc": 0.65298004, + "epoch": 1.485032978183663, + "grad_norm": 6.0, + "learning_rate": 1.7056564887835664e-06, + "loss": 1.5819809, + "memory(GiB)": 117.38, + "step": 58540, + "train_speed(iter/s)": 1.637294 + }, + { + "acc": 0.64004951, + "epoch": 1.4851598173515983, + "grad_norm": 5.65625, + "learning_rate": 1.704867725733052e-06, + "loss": 1.69057503, + "memory(GiB)": 117.38, + "step": 58545, + "train_speed(iter/s)": 1.637311 + }, + { + "acc": 0.65730515, + "epoch": 1.4852866565195333, + "grad_norm": 6.53125, + "learning_rate": 1.7040791076171692e-06, + "loss": 1.55900059, + "memory(GiB)": 117.38, + "step": 58550, + "train_speed(iter/s)": 1.637327 + }, + { + "acc": 0.65295768, + "epoch": 1.4854134956874683, + "grad_norm": 5.28125, + "learning_rate": 1.7032906344706017e-06, + "loss": 1.60283566, + "memory(GiB)": 117.38, + "step": 58555, + "train_speed(iter/s)": 1.637343 + }, + { + "acc": 0.63999052, + "epoch": 1.4855403348554033, + "grad_norm": 5.375, + "learning_rate": 1.7025023063280306e-06, + "loss": 1.64884109, + "memory(GiB)": 117.38, + "step": 58560, + "train_speed(iter/s)": 1.63736 + }, + { + "acc": 0.64783883, + "epoch": 1.4856671740233385, + "grad_norm": 5.53125, + "learning_rate": 1.701714123224128e-06, + "loss": 1.65695744, + "memory(GiB)": 117.38, + "step": 58565, + "train_speed(iter/s)": 1.637375 + }, + { + "acc": 0.64992628, + "epoch": 1.4857940131912735, + "grad_norm": 6.25, + "learning_rate": 1.7009260851935684e-06, + "loss": 1.6325901, + "memory(GiB)": 117.38, + "step": 58570, + "train_speed(iter/s)": 1.637392 + }, + { + "acc": 0.63948412, + "epoch": 1.4859208523592085, + "grad_norm": 5.0, + "learning_rate": 1.7001381922710064e-06, + "loss": 1.70411797, + "memory(GiB)": 117.38, + "step": 58575, + "train_speed(iter/s)": 1.637407 + }, + { + "acc": 0.65391884, + "epoch": 1.4860476915271437, + "grad_norm": 6.125, + "learning_rate": 1.699350444491098e-06, + "loss": 1.58855133, + "memory(GiB)": 117.38, + "step": 58580, + "train_speed(iter/s)": 1.637422 + }, + { + "acc": 0.64425001, + "epoch": 1.4861745306950787, + "grad_norm": 5.25, + "learning_rate": 1.6985628418884953e-06, + "loss": 1.63907661, + "memory(GiB)": 117.38, + "step": 58585, + "train_speed(iter/s)": 1.637438 + }, + { + "acc": 0.66493506, + "epoch": 1.4863013698630136, + "grad_norm": 5.84375, + "learning_rate": 1.6977753844978406e-06, + "loss": 1.59763718, + "memory(GiB)": 117.38, + "step": 58590, + "train_speed(iter/s)": 1.637454 + }, + { + "acc": 0.65835514, + "epoch": 1.4864282090309486, + "grad_norm": 5.6875, + "learning_rate": 1.696988072353764e-06, + "loss": 1.58148785, + "memory(GiB)": 117.38, + "step": 58595, + "train_speed(iter/s)": 1.63747 + }, + { + "acc": 0.64658823, + "epoch": 1.4865550481988838, + "grad_norm": 5.46875, + "learning_rate": 1.6962009054909007e-06, + "loss": 1.62080841, + "memory(GiB)": 117.38, + "step": 58600, + "train_speed(iter/s)": 1.637487 + }, + { + "acc": 0.66238937, + "epoch": 1.4866818873668188, + "grad_norm": 4.5625, + "learning_rate": 1.6954138839438723e-06, + "loss": 1.56551476, + "memory(GiB)": 117.38, + "step": 58605, + "train_speed(iter/s)": 1.637504 + }, + { + "acc": 0.65568867, + "epoch": 1.486808726534754, + "grad_norm": 5.96875, + "learning_rate": 1.6946270077472966e-06, + "loss": 1.63482723, + "memory(GiB)": 117.38, + "step": 58610, + "train_speed(iter/s)": 1.63752 + }, + { + "acc": 0.66745057, + "epoch": 1.486935565702689, + "grad_norm": 5.375, + "learning_rate": 1.6938402769357787e-06, + "loss": 1.53524981, + "memory(GiB)": 117.38, + "step": 58615, + "train_speed(iter/s)": 1.637537 + }, + { + "acc": 0.65915036, + "epoch": 1.487062404870624, + "grad_norm": 10.0625, + "learning_rate": 1.6930536915439288e-06, + "loss": 1.60668488, + "memory(GiB)": 117.38, + "step": 58620, + "train_speed(iter/s)": 1.637552 + }, + { + "acc": 0.65312009, + "epoch": 1.487189244038559, + "grad_norm": 6.625, + "learning_rate": 1.6922672516063415e-06, + "loss": 1.60617638, + "memory(GiB)": 117.38, + "step": 58625, + "train_speed(iter/s)": 1.637568 + }, + { + "acc": 0.6509902, + "epoch": 1.4873160832064942, + "grad_norm": 4.84375, + "learning_rate": 1.6914809571576086e-06, + "loss": 1.59868441, + "memory(GiB)": 117.38, + "step": 58630, + "train_speed(iter/s)": 1.637584 + }, + { + "acc": 0.6462038, + "epoch": 1.4874429223744292, + "grad_norm": 5.3125, + "learning_rate": 1.6906948082323149e-06, + "loss": 1.69357262, + "memory(GiB)": 117.38, + "step": 58635, + "train_speed(iter/s)": 1.6376 + }, + { + "acc": 0.65631628, + "epoch": 1.4875697615423642, + "grad_norm": 5.59375, + "learning_rate": 1.689908804865038e-06, + "loss": 1.64427738, + "memory(GiB)": 117.38, + "step": 58640, + "train_speed(iter/s)": 1.637616 + }, + { + "acc": 0.65549636, + "epoch": 1.4876966007102994, + "grad_norm": 5.375, + "learning_rate": 1.6891229470903509e-06, + "loss": 1.64407234, + "memory(GiB)": 117.38, + "step": 58645, + "train_speed(iter/s)": 1.637631 + }, + { + "acc": 0.647651, + "epoch": 1.4878234398782344, + "grad_norm": 6.15625, + "learning_rate": 1.6883372349428184e-06, + "loss": 1.67279606, + "memory(GiB)": 117.38, + "step": 58650, + "train_speed(iter/s)": 1.637649 + }, + { + "acc": 0.67380109, + "epoch": 1.4879502790461694, + "grad_norm": 6.46875, + "learning_rate": 1.6875516684569999e-06, + "loss": 1.56411819, + "memory(GiB)": 117.38, + "step": 58655, + "train_speed(iter/s)": 1.637665 + }, + { + "acc": 0.66175508, + "epoch": 1.4880771182141044, + "grad_norm": 5.5625, + "learning_rate": 1.686766247667448e-06, + "loss": 1.63842773, + "memory(GiB)": 117.38, + "step": 58660, + "train_speed(iter/s)": 1.637681 + }, + { + "acc": 0.65478916, + "epoch": 1.4882039573820396, + "grad_norm": 6.03125, + "learning_rate": 1.685980972608709e-06, + "loss": 1.6224247, + "memory(GiB)": 117.38, + "step": 58665, + "train_speed(iter/s)": 1.637697 + }, + { + "acc": 0.65100369, + "epoch": 1.4883307965499746, + "grad_norm": 5.0, + "learning_rate": 1.6851958433153227e-06, + "loss": 1.5648324, + "memory(GiB)": 117.38, + "step": 58670, + "train_speed(iter/s)": 1.637713 + }, + { + "acc": 0.64601846, + "epoch": 1.4884576357179098, + "grad_norm": 6.34375, + "learning_rate": 1.684410859821823e-06, + "loss": 1.67129402, + "memory(GiB)": 117.38, + "step": 58675, + "train_speed(iter/s)": 1.637727 + }, + { + "acc": 0.67288537, + "epoch": 1.4885844748858448, + "grad_norm": 5.1875, + "learning_rate": 1.6836260221627364e-06, + "loss": 1.56544971, + "memory(GiB)": 117.38, + "step": 58680, + "train_speed(iter/s)": 1.637741 + }, + { + "acc": 0.65607591, + "epoch": 1.4887113140537798, + "grad_norm": 5.15625, + "learning_rate": 1.682841330372582e-06, + "loss": 1.5784235, + "memory(GiB)": 117.38, + "step": 58685, + "train_speed(iter/s)": 1.637755 + }, + { + "acc": 0.66048188, + "epoch": 1.4888381532217148, + "grad_norm": 4.5625, + "learning_rate": 1.6820567844858798e-06, + "loss": 1.52764893, + "memory(GiB)": 117.38, + "step": 58690, + "train_speed(iter/s)": 1.63777 + }, + { + "acc": 0.65201206, + "epoch": 1.48896499238965, + "grad_norm": 6.03125, + "learning_rate": 1.681272384537132e-06, + "loss": 1.58908329, + "memory(GiB)": 117.38, + "step": 58695, + "train_speed(iter/s)": 1.637783 + }, + { + "acc": 0.6660645, + "epoch": 1.489091831557585, + "grad_norm": 5.375, + "learning_rate": 1.6804881305608423e-06, + "loss": 1.54748325, + "memory(GiB)": 117.38, + "step": 58700, + "train_speed(iter/s)": 1.637796 + }, + { + "acc": 0.64187741, + "epoch": 1.4892186707255202, + "grad_norm": 7.03125, + "learning_rate": 1.679704022591503e-06, + "loss": 1.65535927, + "memory(GiB)": 117.38, + "step": 58705, + "train_speed(iter/s)": 1.637813 + }, + { + "acc": 0.64683318, + "epoch": 1.4893455098934552, + "grad_norm": 6.34375, + "learning_rate": 1.678920060663608e-06, + "loss": 1.60943108, + "memory(GiB)": 117.38, + "step": 58710, + "train_speed(iter/s)": 1.637828 + }, + { + "acc": 0.65541849, + "epoch": 1.4894723490613901, + "grad_norm": 5.34375, + "learning_rate": 1.6781362448116344e-06, + "loss": 1.5401967, + "memory(GiB)": 117.38, + "step": 58715, + "train_speed(iter/s)": 1.637844 + }, + { + "acc": 0.64753494, + "epoch": 1.4895991882293251, + "grad_norm": 5.0625, + "learning_rate": 1.6773525750700586e-06, + "loss": 1.61058998, + "memory(GiB)": 117.38, + "step": 58720, + "train_speed(iter/s)": 1.63786 + }, + { + "acc": 0.65694828, + "epoch": 1.4897260273972603, + "grad_norm": 6.34375, + "learning_rate": 1.676569051473353e-06, + "loss": 1.57637215, + "memory(GiB)": 117.38, + "step": 58725, + "train_speed(iter/s)": 1.637876 + }, + { + "acc": 0.65121851, + "epoch": 1.4898528665651953, + "grad_norm": 5.28125, + "learning_rate": 1.6757856740559796e-06, + "loss": 1.587288, + "memory(GiB)": 117.38, + "step": 58730, + "train_speed(iter/s)": 1.637891 + }, + { + "acc": 0.6652545, + "epoch": 1.4899797057331303, + "grad_norm": 5.125, + "learning_rate": 1.6750024428523926e-06, + "loss": 1.53108788, + "memory(GiB)": 117.38, + "step": 58735, + "train_speed(iter/s)": 1.637907 + }, + { + "acc": 0.66659002, + "epoch": 1.4901065449010655, + "grad_norm": 4.96875, + "learning_rate": 1.6742193578970418e-06, + "loss": 1.60661926, + "memory(GiB)": 117.38, + "step": 58740, + "train_speed(iter/s)": 1.637923 + }, + { + "acc": 0.66636882, + "epoch": 1.4902333840690005, + "grad_norm": 7.1875, + "learning_rate": 1.673436419224373e-06, + "loss": 1.5998848, + "memory(GiB)": 117.38, + "step": 58745, + "train_speed(iter/s)": 1.637939 + }, + { + "acc": 0.64998679, + "epoch": 1.4903602232369355, + "grad_norm": 5.25, + "learning_rate": 1.6726536268688248e-06, + "loss": 1.68369675, + "memory(GiB)": 117.38, + "step": 58750, + "train_speed(iter/s)": 1.637955 + }, + { + "acc": 0.6552546, + "epoch": 1.4904870624048705, + "grad_norm": 6.71875, + "learning_rate": 1.671870980864822e-06, + "loss": 1.62197609, + "memory(GiB)": 117.38, + "step": 58755, + "train_speed(iter/s)": 1.63797 + }, + { + "acc": 0.66752291, + "epoch": 1.4906139015728057, + "grad_norm": 5.84375, + "learning_rate": 1.6710884812467943e-06, + "loss": 1.50893412, + "memory(GiB)": 117.38, + "step": 58760, + "train_speed(iter/s)": 1.637985 + }, + { + "acc": 0.65801306, + "epoch": 1.4907407407407407, + "grad_norm": 5.6875, + "learning_rate": 1.6703061280491579e-06, + "loss": 1.62512569, + "memory(GiB)": 117.38, + "step": 58765, + "train_speed(iter/s)": 1.638001 + }, + { + "acc": 0.66275606, + "epoch": 1.490867579908676, + "grad_norm": 5.40625, + "learning_rate": 1.6695239213063237e-06, + "loss": 1.56917524, + "memory(GiB)": 117.38, + "step": 58770, + "train_speed(iter/s)": 1.638017 + }, + { + "acc": 0.66071, + "epoch": 1.490994419076611, + "grad_norm": 6.5, + "learning_rate": 1.6687418610526972e-06, + "loss": 1.59299698, + "memory(GiB)": 117.38, + "step": 58775, + "train_speed(iter/s)": 1.638033 + }, + { + "acc": 0.6698513, + "epoch": 1.4911212582445459, + "grad_norm": 6.25, + "learning_rate": 1.6679599473226766e-06, + "loss": 1.51940384, + "memory(GiB)": 117.38, + "step": 58780, + "train_speed(iter/s)": 1.638047 + }, + { + "acc": 0.65332928, + "epoch": 1.4912480974124809, + "grad_norm": 6.46875, + "learning_rate": 1.6671781801506536e-06, + "loss": 1.65746822, + "memory(GiB)": 117.38, + "step": 58785, + "train_speed(iter/s)": 1.638062 + }, + { + "acc": 0.65322771, + "epoch": 1.491374936580416, + "grad_norm": 4.6875, + "learning_rate": 1.6663965595710147e-06, + "loss": 1.58195763, + "memory(GiB)": 117.38, + "step": 58790, + "train_speed(iter/s)": 1.638078 + }, + { + "acc": 0.66882024, + "epoch": 1.491501775748351, + "grad_norm": 7.28125, + "learning_rate": 1.6656150856181386e-06, + "loss": 1.5820878, + "memory(GiB)": 117.38, + "step": 58795, + "train_speed(iter/s)": 1.638094 + }, + { + "acc": 0.65017233, + "epoch": 1.491628614916286, + "grad_norm": 5.375, + "learning_rate": 1.6648337583263974e-06, + "loss": 1.65157356, + "memory(GiB)": 117.38, + "step": 58800, + "train_speed(iter/s)": 1.63811 + }, + { + "acc": 0.65503259, + "epoch": 1.4917554540842213, + "grad_norm": 5.1875, + "learning_rate": 1.6640525777301586e-06, + "loss": 1.61745586, + "memory(GiB)": 117.38, + "step": 58805, + "train_speed(iter/s)": 1.638125 + }, + { + "acc": 0.66357312, + "epoch": 1.4918822932521563, + "grad_norm": 6.65625, + "learning_rate": 1.663271543863781e-06, + "loss": 1.59421711, + "memory(GiB)": 117.38, + "step": 58810, + "train_speed(iter/s)": 1.638142 + }, + { + "acc": 0.66036906, + "epoch": 1.4920091324200913, + "grad_norm": 6.3125, + "learning_rate": 1.6624906567616183e-06, + "loss": 1.63050842, + "memory(GiB)": 117.38, + "step": 58815, + "train_speed(iter/s)": 1.638157 + }, + { + "acc": 0.65754991, + "epoch": 1.4921359715880262, + "grad_norm": 5.59375, + "learning_rate": 1.6617099164580175e-06, + "loss": 1.56847639, + "memory(GiB)": 117.38, + "step": 58820, + "train_speed(iter/s)": 1.638173 + }, + { + "acc": 0.65467749, + "epoch": 1.4922628107559615, + "grad_norm": 6.4375, + "learning_rate": 1.660929322987319e-06, + "loss": 1.58660259, + "memory(GiB)": 117.38, + "step": 58825, + "train_speed(iter/s)": 1.638188 + }, + { + "acc": 0.65053029, + "epoch": 1.4923896499238964, + "grad_norm": 7.0, + "learning_rate": 1.660148876383857e-06, + "loss": 1.59802055, + "memory(GiB)": 117.38, + "step": 58830, + "train_speed(iter/s)": 1.638204 + }, + { + "acc": 0.66431561, + "epoch": 1.4925164890918317, + "grad_norm": 6.0, + "learning_rate": 1.6593685766819584e-06, + "loss": 1.55291405, + "memory(GiB)": 117.38, + "step": 58835, + "train_speed(iter/s)": 1.63822 + }, + { + "acc": 0.65234203, + "epoch": 1.4926433282597666, + "grad_norm": 5.34375, + "learning_rate": 1.658588423915945e-06, + "loss": 1.56638832, + "memory(GiB)": 117.38, + "step": 58840, + "train_speed(iter/s)": 1.638235 + }, + { + "acc": 0.66264772, + "epoch": 1.4927701674277016, + "grad_norm": 5.125, + "learning_rate": 1.6578084181201293e-06, + "loss": 1.57011166, + "memory(GiB)": 117.38, + "step": 58845, + "train_speed(iter/s)": 1.63825 + }, + { + "acc": 0.65326414, + "epoch": 1.4928970065956366, + "grad_norm": 6.25, + "learning_rate": 1.6570285593288242e-06, + "loss": 1.64257793, + "memory(GiB)": 117.38, + "step": 58850, + "train_speed(iter/s)": 1.638265 + }, + { + "acc": 0.64946322, + "epoch": 1.4930238457635718, + "grad_norm": 6.375, + "learning_rate": 1.6562488475763267e-06, + "loss": 1.59806156, + "memory(GiB)": 117.38, + "step": 58855, + "train_speed(iter/s)": 1.638281 + }, + { + "acc": 0.65304618, + "epoch": 1.4931506849315068, + "grad_norm": 7.40625, + "learning_rate": 1.6554692828969321e-06, + "loss": 1.62616596, + "memory(GiB)": 117.38, + "step": 58860, + "train_speed(iter/s)": 1.638296 + }, + { + "acc": 0.64253006, + "epoch": 1.493277524099442, + "grad_norm": 7.65625, + "learning_rate": 1.6546898653249326e-06, + "loss": 1.60036926, + "memory(GiB)": 117.38, + "step": 58865, + "train_speed(iter/s)": 1.638311 + }, + { + "acc": 0.64423571, + "epoch": 1.493404363267377, + "grad_norm": 5.46875, + "learning_rate": 1.65391059489461e-06, + "loss": 1.68086586, + "memory(GiB)": 117.38, + "step": 58870, + "train_speed(iter/s)": 1.638327 + }, + { + "acc": 0.64976792, + "epoch": 1.493531202435312, + "grad_norm": 4.6875, + "learning_rate": 1.6531314716402369e-06, + "loss": 1.60481491, + "memory(GiB)": 117.38, + "step": 58875, + "train_speed(iter/s)": 1.638343 + }, + { + "acc": 0.6548851, + "epoch": 1.493658041603247, + "grad_norm": 5.8125, + "learning_rate": 1.652352495596083e-06, + "loss": 1.59254417, + "memory(GiB)": 117.38, + "step": 58880, + "train_speed(iter/s)": 1.638358 + }, + { + "acc": 0.65525436, + "epoch": 1.4937848807711822, + "grad_norm": 5.28125, + "learning_rate": 1.6515736667964144e-06, + "loss": 1.59278698, + "memory(GiB)": 117.38, + "step": 58885, + "train_speed(iter/s)": 1.638373 + }, + { + "acc": 0.65833759, + "epoch": 1.4939117199391172, + "grad_norm": 6.0625, + "learning_rate": 1.6507949852754867e-06, + "loss": 1.64661064, + "memory(GiB)": 117.38, + "step": 58890, + "train_speed(iter/s)": 1.638389 + }, + { + "acc": 0.65120153, + "epoch": 1.4940385591070522, + "grad_norm": 5.21875, + "learning_rate": 1.6500164510675453e-06, + "loss": 1.61090374, + "memory(GiB)": 117.38, + "step": 58895, + "train_speed(iter/s)": 1.638404 + }, + { + "acc": 0.66952782, + "epoch": 1.4941653982749874, + "grad_norm": 6.625, + "learning_rate": 1.649238064206839e-06, + "loss": 1.5535059, + "memory(GiB)": 117.38, + "step": 58900, + "train_speed(iter/s)": 1.638419 + }, + { + "acc": 0.65716972, + "epoch": 1.4942922374429224, + "grad_norm": 5.03125, + "learning_rate": 1.6484598247276023e-06, + "loss": 1.60048904, + "memory(GiB)": 117.38, + "step": 58905, + "train_speed(iter/s)": 1.638434 + }, + { + "acc": 0.63739233, + "epoch": 1.4944190766108574, + "grad_norm": 5.6875, + "learning_rate": 1.6476817326640682e-06, + "loss": 1.6312542, + "memory(GiB)": 117.38, + "step": 58910, + "train_speed(iter/s)": 1.638449 + }, + { + "acc": 0.66941986, + "epoch": 1.4945459157787924, + "grad_norm": 7.3125, + "learning_rate": 1.646903788050455e-06, + "loss": 1.54043446, + "memory(GiB)": 117.38, + "step": 58915, + "train_speed(iter/s)": 1.638464 + }, + { + "acc": 0.65407443, + "epoch": 1.4946727549467276, + "grad_norm": 5.0, + "learning_rate": 1.6461259909209853e-06, + "loss": 1.58598537, + "memory(GiB)": 117.38, + "step": 58920, + "train_speed(iter/s)": 1.638479 + }, + { + "acc": 0.66881928, + "epoch": 1.4947995941146626, + "grad_norm": 5.4375, + "learning_rate": 1.6453483413098687e-06, + "loss": 1.59661674, + "memory(GiB)": 117.38, + "step": 58925, + "train_speed(iter/s)": 1.638495 + }, + { + "acc": 0.65527649, + "epoch": 1.4949264332825978, + "grad_norm": 5.53125, + "learning_rate": 1.6445708392513093e-06, + "loss": 1.64659462, + "memory(GiB)": 117.38, + "step": 58930, + "train_speed(iter/s)": 1.63851 + }, + { + "acc": 0.64791303, + "epoch": 1.4950532724505328, + "grad_norm": 5.375, + "learning_rate": 1.643793484779505e-06, + "loss": 1.67163944, + "memory(GiB)": 117.38, + "step": 58935, + "train_speed(iter/s)": 1.638524 + }, + { + "acc": 0.66956196, + "epoch": 1.4951801116184678, + "grad_norm": 7.46875, + "learning_rate": 1.6430162779286484e-06, + "loss": 1.56772776, + "memory(GiB)": 117.38, + "step": 58940, + "train_speed(iter/s)": 1.638539 + }, + { + "acc": 0.6662281, + "epoch": 1.4953069507864027, + "grad_norm": 4.84375, + "learning_rate": 1.6422392187329233e-06, + "loss": 1.53606396, + "memory(GiB)": 117.38, + "step": 58945, + "train_speed(iter/s)": 1.638555 + }, + { + "acc": 0.64550695, + "epoch": 1.495433789954338, + "grad_norm": 7.15625, + "learning_rate": 1.6414623072265085e-06, + "loss": 1.69089851, + "memory(GiB)": 117.38, + "step": 58950, + "train_speed(iter/s)": 1.63857 + }, + { + "acc": 0.66228223, + "epoch": 1.495560629122273, + "grad_norm": 5.65625, + "learning_rate": 1.6406855434435765e-06, + "loss": 1.58152933, + "memory(GiB)": 117.38, + "step": 58955, + "train_speed(iter/s)": 1.638586 + }, + { + "acc": 0.64446435, + "epoch": 1.495687468290208, + "grad_norm": 4.90625, + "learning_rate": 1.6399089274182922e-06, + "loss": 1.62432594, + "memory(GiB)": 117.38, + "step": 58960, + "train_speed(iter/s)": 1.638601 + }, + { + "acc": 0.66495709, + "epoch": 1.4958143074581431, + "grad_norm": 4.75, + "learning_rate": 1.6391324591848156e-06, + "loss": 1.59021397, + "memory(GiB)": 117.38, + "step": 58965, + "train_speed(iter/s)": 1.638617 + }, + { + "acc": 0.66466942, + "epoch": 1.4959411466260781, + "grad_norm": 5.375, + "learning_rate": 1.6383561387772984e-06, + "loss": 1.5989727, + "memory(GiB)": 117.38, + "step": 58970, + "train_speed(iter/s)": 1.638632 + }, + { + "acc": 0.65867825, + "epoch": 1.4960679857940131, + "grad_norm": 5.1875, + "learning_rate": 1.6375799662298868e-06, + "loss": 1.59532871, + "memory(GiB)": 117.38, + "step": 58975, + "train_speed(iter/s)": 1.638647 + }, + { + "acc": 0.64182115, + "epoch": 1.4961948249619481, + "grad_norm": 6.125, + "learning_rate": 1.6368039415767201e-06, + "loss": 1.64483757, + "memory(GiB)": 117.38, + "step": 58980, + "train_speed(iter/s)": 1.638662 + }, + { + "acc": 0.64847851, + "epoch": 1.4963216641298833, + "grad_norm": 5.8125, + "learning_rate": 1.6360280648519305e-06, + "loss": 1.61589432, + "memory(GiB)": 117.38, + "step": 58985, + "train_speed(iter/s)": 1.638678 + }, + { + "acc": 0.67074327, + "epoch": 1.4964485032978183, + "grad_norm": 6.09375, + "learning_rate": 1.6352523360896488e-06, + "loss": 1.54374924, + "memory(GiB)": 117.38, + "step": 58990, + "train_speed(iter/s)": 1.638692 + }, + { + "acc": 0.65385571, + "epoch": 1.4965753424657535, + "grad_norm": 5.09375, + "learning_rate": 1.63447675532399e-06, + "loss": 1.61351757, + "memory(GiB)": 117.38, + "step": 58995, + "train_speed(iter/s)": 1.638708 + }, + { + "acc": 0.66785917, + "epoch": 1.4967021816336885, + "grad_norm": 6.03125, + "learning_rate": 1.6337013225890698e-06, + "loss": 1.64189816, + "memory(GiB)": 117.38, + "step": 59000, + "train_speed(iter/s)": 1.638723 + }, + { + "epoch": 1.4967021816336885, + "eval_acc": 0.6462726868355271, + "eval_loss": 1.5733360052108765, + "eval_runtime": 58.5083, + "eval_samples_per_second": 108.873, + "eval_steps_per_second": 27.227, + "step": 59000 + }, + { + "acc": 0.66507397, + "epoch": 1.4968290208016235, + "grad_norm": 4.71875, + "learning_rate": 1.6329260379189932e-06, + "loss": 1.5519351, + "memory(GiB)": 117.38, + "step": 59005, + "train_speed(iter/s)": 1.635881 + }, + { + "acc": 0.64430628, + "epoch": 1.4969558599695585, + "grad_norm": 5.78125, + "learning_rate": 1.6321509013478653e-06, + "loss": 1.58763018, + "memory(GiB)": 117.38, + "step": 59010, + "train_speed(iter/s)": 1.635894 + }, + { + "acc": 0.6571928, + "epoch": 1.4970826991374937, + "grad_norm": 5.28125, + "learning_rate": 1.6313759129097757e-06, + "loss": 1.62825317, + "memory(GiB)": 117.38, + "step": 59015, + "train_speed(iter/s)": 1.635907 + }, + { + "acc": 0.64946027, + "epoch": 1.4972095383054287, + "grad_norm": 5.53125, + "learning_rate": 1.6306010726388117e-06, + "loss": 1.6424118, + "memory(GiB)": 117.38, + "step": 59020, + "train_speed(iter/s)": 1.635921 + }, + { + "acc": 0.65573363, + "epoch": 1.497336377473364, + "grad_norm": 6.53125, + "learning_rate": 1.6298263805690573e-06, + "loss": 1.65517006, + "memory(GiB)": 117.38, + "step": 59025, + "train_speed(iter/s)": 1.635935 + }, + { + "acc": 0.65678153, + "epoch": 1.4974632166412989, + "grad_norm": 5.65625, + "learning_rate": 1.629051836734587e-06, + "loss": 1.58574963, + "memory(GiB)": 117.38, + "step": 59030, + "train_speed(iter/s)": 1.635949 + }, + { + "acc": 0.68190155, + "epoch": 1.4975900558092339, + "grad_norm": 7.03125, + "learning_rate": 1.6282774411694641e-06, + "loss": 1.54179535, + "memory(GiB)": 117.38, + "step": 59035, + "train_speed(iter/s)": 1.635964 + }, + { + "acc": 0.66086006, + "epoch": 1.4977168949771689, + "grad_norm": 7.0, + "learning_rate": 1.6275031939077545e-06, + "loss": 1.60549583, + "memory(GiB)": 117.38, + "step": 59040, + "train_speed(iter/s)": 1.635978 + }, + { + "acc": 0.66562696, + "epoch": 1.497843734145104, + "grad_norm": 5.375, + "learning_rate": 1.6267290949835119e-06, + "loss": 1.52672119, + "memory(GiB)": 117.38, + "step": 59045, + "train_speed(iter/s)": 1.635992 + }, + { + "acc": 0.64777427, + "epoch": 1.497970573313039, + "grad_norm": 5.0625, + "learning_rate": 1.6259551444307852e-06, + "loss": 1.5901722, + "memory(GiB)": 117.38, + "step": 59050, + "train_speed(iter/s)": 1.636007 + }, + { + "acc": 0.64258065, + "epoch": 1.498097412480974, + "grad_norm": 5.75, + "learning_rate": 1.6251813422836127e-06, + "loss": 1.62169666, + "memory(GiB)": 117.38, + "step": 59055, + "train_speed(iter/s)": 1.636023 + }, + { + "acc": 0.65736609, + "epoch": 1.4982242516489093, + "grad_norm": 6.71875, + "learning_rate": 1.6244076885760334e-06, + "loss": 1.56433506, + "memory(GiB)": 117.38, + "step": 59060, + "train_speed(iter/s)": 1.636038 + }, + { + "acc": 0.65047703, + "epoch": 1.4983510908168443, + "grad_norm": 5.75, + "learning_rate": 1.6236341833420755e-06, + "loss": 1.66114349, + "memory(GiB)": 117.38, + "step": 59065, + "train_speed(iter/s)": 1.636049 + }, + { + "acc": 0.65166845, + "epoch": 1.4984779299847792, + "grad_norm": 6.03125, + "learning_rate": 1.6228608266157596e-06, + "loss": 1.57607479, + "memory(GiB)": 117.38, + "step": 59070, + "train_speed(iter/s)": 1.636065 + }, + { + "acc": 0.66794491, + "epoch": 1.4986047691527142, + "grad_norm": 5.375, + "learning_rate": 1.6220876184311034e-06, + "loss": 1.56458521, + "memory(GiB)": 117.38, + "step": 59075, + "train_speed(iter/s)": 1.63608 + }, + { + "acc": 0.64701505, + "epoch": 1.4987316083206494, + "grad_norm": 7.90625, + "learning_rate": 1.6213145588221146e-06, + "loss": 1.57172394, + "memory(GiB)": 117.38, + "step": 59080, + "train_speed(iter/s)": 1.636088 + }, + { + "acc": 0.66153364, + "epoch": 1.4988584474885844, + "grad_norm": 8.1875, + "learning_rate": 1.620541647822796e-06, + "loss": 1.5491334, + "memory(GiB)": 117.38, + "step": 59085, + "train_speed(iter/s)": 1.636103 + }, + { + "acc": 0.65983672, + "epoch": 1.4989852866565196, + "grad_norm": 6.625, + "learning_rate": 1.6197688854671444e-06, + "loss": 1.60656357, + "memory(GiB)": 117.38, + "step": 59090, + "train_speed(iter/s)": 1.636119 + }, + { + "acc": 0.66430807, + "epoch": 1.4991121258244546, + "grad_norm": 5.34375, + "learning_rate": 1.6189962717891484e-06, + "loss": 1.52204475, + "memory(GiB)": 117.38, + "step": 59095, + "train_speed(iter/s)": 1.636135 + }, + { + "acc": 0.63608198, + "epoch": 1.4992389649923896, + "grad_norm": 5.5625, + "learning_rate": 1.6182238068227917e-06, + "loss": 1.68204269, + "memory(GiB)": 117.38, + "step": 59100, + "train_speed(iter/s)": 1.63615 + }, + { + "acc": 0.64643054, + "epoch": 1.4993658041603246, + "grad_norm": 5.53125, + "learning_rate": 1.6174514906020505e-06, + "loss": 1.62971668, + "memory(GiB)": 117.38, + "step": 59105, + "train_speed(iter/s)": 1.636166 + }, + { + "acc": 0.66912565, + "epoch": 1.4994926433282598, + "grad_norm": 4.71875, + "learning_rate": 1.6166793231608952e-06, + "loss": 1.5243782, + "memory(GiB)": 117.38, + "step": 59110, + "train_speed(iter/s)": 1.63618 + }, + { + "acc": 0.64613824, + "epoch": 1.4996194824961948, + "grad_norm": 5.65625, + "learning_rate": 1.615907304533288e-06, + "loss": 1.63399925, + "memory(GiB)": 117.38, + "step": 59115, + "train_speed(iter/s)": 1.636195 + }, + { + "acc": 0.65427275, + "epoch": 1.4997463216641298, + "grad_norm": 5.40625, + "learning_rate": 1.6151354347531868e-06, + "loss": 1.62950878, + "memory(GiB)": 117.38, + "step": 59120, + "train_speed(iter/s)": 1.636211 + }, + { + "acc": 0.66316547, + "epoch": 1.499873160832065, + "grad_norm": 6.0625, + "learning_rate": 1.61436371385454e-06, + "loss": 1.61837769, + "memory(GiB)": 117.38, + "step": 59125, + "train_speed(iter/s)": 1.636227 + }, + { + "acc": 0.65671606, + "epoch": 1.5, + "grad_norm": 6.21875, + "learning_rate": 1.6135921418712959e-06, + "loss": 1.60000172, + "memory(GiB)": 117.38, + "step": 59130, + "train_speed(iter/s)": 1.636242 + }, + { + "acc": 0.6654242, + "epoch": 1.500126839167935, + "grad_norm": 5.9375, + "learning_rate": 1.6128207188373867e-06, + "loss": 1.58890524, + "memory(GiB)": 117.38, + "step": 59135, + "train_speed(iter/s)": 1.636257 + }, + { + "acc": 0.64930043, + "epoch": 1.50025367833587, + "grad_norm": 5.0, + "learning_rate": 1.6120494447867451e-06, + "loss": 1.616535, + "memory(GiB)": 117.38, + "step": 59140, + "train_speed(iter/s)": 1.636272 + }, + { + "acc": 0.66388211, + "epoch": 1.5003805175038052, + "grad_norm": 9.0, + "learning_rate": 1.6112783197532932e-06, + "loss": 1.54436531, + "memory(GiB)": 117.38, + "step": 59145, + "train_speed(iter/s)": 1.636286 + }, + { + "acc": 0.65104041, + "epoch": 1.5005073566717404, + "grad_norm": 5.59375, + "learning_rate": 1.6105073437709545e-06, + "loss": 1.60603886, + "memory(GiB)": 117.38, + "step": 59150, + "train_speed(iter/s)": 1.636301 + }, + { + "acc": 0.64264188, + "epoch": 1.5006341958396754, + "grad_norm": 5.84375, + "learning_rate": 1.6097365168736335e-06, + "loss": 1.63858948, + "memory(GiB)": 117.38, + "step": 59155, + "train_speed(iter/s)": 1.636317 + }, + { + "acc": 0.64266996, + "epoch": 1.5007610350076104, + "grad_norm": 5.6875, + "learning_rate": 1.6089658390952351e-06, + "loss": 1.63354034, + "memory(GiB)": 117.38, + "step": 59160, + "train_speed(iter/s)": 1.636332 + }, + { + "acc": 0.65540562, + "epoch": 1.5008878741755454, + "grad_norm": 5.71875, + "learning_rate": 1.6081953104696612e-06, + "loss": 1.61162224, + "memory(GiB)": 117.38, + "step": 59165, + "train_speed(iter/s)": 1.636347 + }, + { + "acc": 0.66873426, + "epoch": 1.5010147133434804, + "grad_norm": 5.21875, + "learning_rate": 1.6074249310308021e-06, + "loss": 1.58953333, + "memory(GiB)": 117.38, + "step": 59170, + "train_speed(iter/s)": 1.636361 + }, + { + "acc": 0.6648479, + "epoch": 1.5011415525114156, + "grad_norm": 5.875, + "learning_rate": 1.6066547008125399e-06, + "loss": 1.55940399, + "memory(GiB)": 117.38, + "step": 59175, + "train_speed(iter/s)": 1.636376 + }, + { + "acc": 0.65285358, + "epoch": 1.5012683916793506, + "grad_norm": 6.6875, + "learning_rate": 1.6058846198487522e-06, + "loss": 1.6120573, + "memory(GiB)": 117.38, + "step": 59180, + "train_speed(iter/s)": 1.636391 + }, + { + "acc": 0.64392085, + "epoch": 1.5013952308472858, + "grad_norm": 5.21875, + "learning_rate": 1.6051146881733142e-06, + "loss": 1.5530077, + "memory(GiB)": 117.38, + "step": 59185, + "train_speed(iter/s)": 1.636407 + }, + { + "acc": 0.65379677, + "epoch": 1.5015220700152208, + "grad_norm": 6.9375, + "learning_rate": 1.6043449058200916e-06, + "loss": 1.61747742, + "memory(GiB)": 117.38, + "step": 59190, + "train_speed(iter/s)": 1.636421 + }, + { + "acc": 0.65701532, + "epoch": 1.5016489091831557, + "grad_norm": 5.59375, + "learning_rate": 1.6035752728229364e-06, + "loss": 1.59188852, + "memory(GiB)": 117.38, + "step": 59195, + "train_speed(iter/s)": 1.636436 + }, + { + "acc": 0.65050678, + "epoch": 1.5017757483510907, + "grad_norm": 4.9375, + "learning_rate": 1.6028057892157067e-06, + "loss": 1.69276161, + "memory(GiB)": 117.38, + "step": 59200, + "train_speed(iter/s)": 1.636452 + }, + { + "acc": 0.65947394, + "epoch": 1.5019025875190257, + "grad_norm": 6.21875, + "learning_rate": 1.602036455032246e-06, + "loss": 1.58296204, + "memory(GiB)": 117.38, + "step": 59205, + "train_speed(iter/s)": 1.636466 + }, + { + "acc": 0.65613728, + "epoch": 1.502029426686961, + "grad_norm": 6.40625, + "learning_rate": 1.6012672703063925e-06, + "loss": 1.63416252, + "memory(GiB)": 117.38, + "step": 59210, + "train_speed(iter/s)": 1.636481 + }, + { + "acc": 0.64894209, + "epoch": 1.5021562658548961, + "grad_norm": 6.25, + "learning_rate": 1.600498235071979e-06, + "loss": 1.64049835, + "memory(GiB)": 117.38, + "step": 59215, + "train_speed(iter/s)": 1.636496 + }, + { + "acc": 0.63984246, + "epoch": 1.5022831050228311, + "grad_norm": 5.625, + "learning_rate": 1.5997293493628301e-06, + "loss": 1.65934372, + "memory(GiB)": 117.38, + "step": 59220, + "train_speed(iter/s)": 1.636511 + }, + { + "acc": 0.65474558, + "epoch": 1.5024099441907661, + "grad_norm": 6.03125, + "learning_rate": 1.598960613212766e-06, + "loss": 1.60389862, + "memory(GiB)": 117.38, + "step": 59225, + "train_speed(iter/s)": 1.636527 + }, + { + "acc": 0.66082439, + "epoch": 1.5025367833587011, + "grad_norm": 5.90625, + "learning_rate": 1.598192026655599e-06, + "loss": 1.63202114, + "memory(GiB)": 117.38, + "step": 59230, + "train_speed(iter/s)": 1.636542 + }, + { + "acc": 0.64346704, + "epoch": 1.502663622526636, + "grad_norm": 5.6875, + "learning_rate": 1.5974235897251344e-06, + "loss": 1.60650558, + "memory(GiB)": 117.38, + "step": 59235, + "train_speed(iter/s)": 1.636555 + }, + { + "acc": 0.65211668, + "epoch": 1.5027904616945713, + "grad_norm": 7.1875, + "learning_rate": 1.5966553024551717e-06, + "loss": 1.63110733, + "memory(GiB)": 117.38, + "step": 59240, + "train_speed(iter/s)": 1.636571 + }, + { + "acc": 0.65274272, + "epoch": 1.5029173008625063, + "grad_norm": 5.53125, + "learning_rate": 1.5958871648795032e-06, + "loss": 1.62780304, + "memory(GiB)": 117.38, + "step": 59245, + "train_speed(iter/s)": 1.636587 + }, + { + "acc": 0.66678596, + "epoch": 1.5030441400304415, + "grad_norm": 5.59375, + "learning_rate": 1.5951191770319164e-06, + "loss": 1.55899258, + "memory(GiB)": 117.38, + "step": 59250, + "train_speed(iter/s)": 1.636602 + }, + { + "acc": 0.6561008, + "epoch": 1.5031709791983765, + "grad_norm": 8.3125, + "learning_rate": 1.594351338946189e-06, + "loss": 1.57421989, + "memory(GiB)": 117.38, + "step": 59255, + "train_speed(iter/s)": 1.636618 + }, + { + "acc": 0.66732426, + "epoch": 1.5032978183663115, + "grad_norm": 5.9375, + "learning_rate": 1.5935836506560953e-06, + "loss": 1.58876553, + "memory(GiB)": 117.38, + "step": 59260, + "train_speed(iter/s)": 1.636633 + }, + { + "acc": 0.65766935, + "epoch": 1.5034246575342465, + "grad_norm": 4.96875, + "learning_rate": 1.5928161121954012e-06, + "loss": 1.56830015, + "memory(GiB)": 117.38, + "step": 59265, + "train_speed(iter/s)": 1.636647 + }, + { + "acc": 0.6670042, + "epoch": 1.5035514967021817, + "grad_norm": 6.0, + "learning_rate": 1.592048723597866e-06, + "loss": 1.54631958, + "memory(GiB)": 117.38, + "step": 59270, + "train_speed(iter/s)": 1.636662 + }, + { + "acc": 0.67544322, + "epoch": 1.5036783358701167, + "grad_norm": 7.375, + "learning_rate": 1.591281484897244e-06, + "loss": 1.54305363, + "memory(GiB)": 117.38, + "step": 59275, + "train_speed(iter/s)": 1.636678 + }, + { + "acc": 0.66650209, + "epoch": 1.5038051750380519, + "grad_norm": 6.34375, + "learning_rate": 1.5905143961272807e-06, + "loss": 1.61135864, + "memory(GiB)": 117.38, + "step": 59280, + "train_speed(iter/s)": 1.636693 + }, + { + "acc": 0.66681828, + "epoch": 1.5039320142059869, + "grad_norm": 6.4375, + "learning_rate": 1.5897474573217153e-06, + "loss": 1.5059103, + "memory(GiB)": 117.38, + "step": 59285, + "train_speed(iter/s)": 1.636707 + }, + { + "acc": 0.64960423, + "epoch": 1.5040588533739219, + "grad_norm": 7.21875, + "learning_rate": 1.588980668514285e-06, + "loss": 1.60573177, + "memory(GiB)": 117.38, + "step": 59290, + "train_speed(iter/s)": 1.636722 + }, + { + "acc": 0.64444962, + "epoch": 1.5041856925418569, + "grad_norm": 5.3125, + "learning_rate": 1.5882140297387127e-06, + "loss": 1.58477182, + "memory(GiB)": 117.38, + "step": 59295, + "train_speed(iter/s)": 1.636737 + }, + { + "acc": 0.65833349, + "epoch": 1.5043125317097918, + "grad_norm": 5.375, + "learning_rate": 1.5874475410287189e-06, + "loss": 1.64329796, + "memory(GiB)": 117.38, + "step": 59300, + "train_speed(iter/s)": 1.636752 + }, + { + "acc": 0.66364546, + "epoch": 1.504439370877727, + "grad_norm": 5.625, + "learning_rate": 1.586681202418019e-06, + "loss": 1.59969435, + "memory(GiB)": 117.38, + "step": 59305, + "train_speed(iter/s)": 1.636765 + }, + { + "acc": 0.65419703, + "epoch": 1.5045662100456623, + "grad_norm": 6.40625, + "learning_rate": 1.5859150139403212e-06, + "loss": 1.69760265, + "memory(GiB)": 117.38, + "step": 59310, + "train_speed(iter/s)": 1.63678 + }, + { + "acc": 0.66216364, + "epoch": 1.5046930492135973, + "grad_norm": 5.71875, + "learning_rate": 1.585148975629322e-06, + "loss": 1.54835224, + "memory(GiB)": 117.38, + "step": 59315, + "train_speed(iter/s)": 1.636795 + }, + { + "acc": 0.65177455, + "epoch": 1.5048198883815322, + "grad_norm": 5.71875, + "learning_rate": 1.5843830875187155e-06, + "loss": 1.59192553, + "memory(GiB)": 117.38, + "step": 59320, + "train_speed(iter/s)": 1.63681 + }, + { + "acc": 0.66120262, + "epoch": 1.5049467275494672, + "grad_norm": 5.9375, + "learning_rate": 1.583617349642192e-06, + "loss": 1.63134613, + "memory(GiB)": 117.38, + "step": 59325, + "train_speed(iter/s)": 1.636826 + }, + { + "acc": 0.64850907, + "epoch": 1.5050735667174022, + "grad_norm": 4.8125, + "learning_rate": 1.5828517620334322e-06, + "loss": 1.63779488, + "memory(GiB)": 117.38, + "step": 59330, + "train_speed(iter/s)": 1.636841 + }, + { + "acc": 0.66626501, + "epoch": 1.5052004058853374, + "grad_norm": 5.15625, + "learning_rate": 1.5820863247261054e-06, + "loss": 1.59134541, + "memory(GiB)": 117.38, + "step": 59335, + "train_speed(iter/s)": 1.636857 + }, + { + "acc": 0.66665621, + "epoch": 1.5053272450532724, + "grad_norm": 5.46875, + "learning_rate": 1.5813210377538834e-06, + "loss": 1.58963318, + "memory(GiB)": 117.38, + "step": 59340, + "train_speed(iter/s)": 1.636872 + }, + { + "acc": 0.65015426, + "epoch": 1.5054540842212076, + "grad_norm": 5.1875, + "learning_rate": 1.5805559011504252e-06, + "loss": 1.64164085, + "memory(GiB)": 117.38, + "step": 59345, + "train_speed(iter/s)": 1.636887 + }, + { + "acc": 0.66221218, + "epoch": 1.5055809233891426, + "grad_norm": 5.40625, + "learning_rate": 1.5797909149493873e-06, + "loss": 1.51169243, + "memory(GiB)": 117.38, + "step": 59350, + "train_speed(iter/s)": 1.636903 + }, + { + "acc": 0.63912196, + "epoch": 1.5057077625570776, + "grad_norm": 5.6875, + "learning_rate": 1.5790260791844114e-06, + "loss": 1.70168667, + "memory(GiB)": 117.38, + "step": 59355, + "train_speed(iter/s)": 1.636918 + }, + { + "acc": 0.66426086, + "epoch": 1.5058346017250126, + "grad_norm": 5.21875, + "learning_rate": 1.5782613938891438e-06, + "loss": 1.57795753, + "memory(GiB)": 117.38, + "step": 59360, + "train_speed(iter/s)": 1.636933 + }, + { + "acc": 0.67836018, + "epoch": 1.5059614408929476, + "grad_norm": 7.46875, + "learning_rate": 1.5774968590972172e-06, + "loss": 1.58142729, + "memory(GiB)": 117.38, + "step": 59365, + "train_speed(iter/s)": 1.636949 + }, + { + "acc": 0.65407772, + "epoch": 1.5060882800608828, + "grad_norm": 6.46875, + "learning_rate": 1.5767324748422592e-06, + "loss": 1.60446033, + "memory(GiB)": 117.38, + "step": 59370, + "train_speed(iter/s)": 1.636964 + }, + { + "acc": 0.65910501, + "epoch": 1.506215119228818, + "grad_norm": 6.90625, + "learning_rate": 1.5759682411578909e-06, + "loss": 1.59832134, + "memory(GiB)": 117.38, + "step": 59375, + "train_speed(iter/s)": 1.636978 + }, + { + "acc": 0.66232061, + "epoch": 1.506341958396753, + "grad_norm": 5.5, + "learning_rate": 1.575204158077726e-06, + "loss": 1.63297272, + "memory(GiB)": 117.38, + "step": 59380, + "train_speed(iter/s)": 1.636994 + }, + { + "acc": 0.66719317, + "epoch": 1.506468797564688, + "grad_norm": 5.75, + "learning_rate": 1.574440225635373e-06, + "loss": 1.56606274, + "memory(GiB)": 117.38, + "step": 59385, + "train_speed(iter/s)": 1.63701 + }, + { + "acc": 0.65939445, + "epoch": 1.506595636732623, + "grad_norm": 5.21875, + "learning_rate": 1.5736764438644332e-06, + "loss": 1.57893143, + "memory(GiB)": 117.38, + "step": 59390, + "train_speed(iter/s)": 1.637025 + }, + { + "acc": 0.66406002, + "epoch": 1.506722475900558, + "grad_norm": 6.8125, + "learning_rate": 1.5729128127985004e-06, + "loss": 1.53627806, + "memory(GiB)": 117.38, + "step": 59395, + "train_speed(iter/s)": 1.637042 + }, + { + "acc": 0.63740206, + "epoch": 1.5068493150684932, + "grad_norm": 5.3125, + "learning_rate": 1.5721493324711633e-06, + "loss": 1.63831291, + "memory(GiB)": 117.38, + "step": 59400, + "train_speed(iter/s)": 1.637058 + }, + { + "acc": 0.65534267, + "epoch": 1.5069761542364282, + "grad_norm": 5.6875, + "learning_rate": 1.5713860029160028e-06, + "loss": 1.60846481, + "memory(GiB)": 117.38, + "step": 59405, + "train_speed(iter/s)": 1.637073 + }, + { + "acc": 0.64372668, + "epoch": 1.5071029934043634, + "grad_norm": 5.9375, + "learning_rate": 1.5706228241665932e-06, + "loss": 1.64875259, + "memory(GiB)": 117.38, + "step": 59410, + "train_speed(iter/s)": 1.637088 + }, + { + "acc": 0.66544375, + "epoch": 1.5072298325722984, + "grad_norm": 5.09375, + "learning_rate": 1.5698597962565032e-06, + "loss": 1.60581703, + "memory(GiB)": 117.38, + "step": 59415, + "train_speed(iter/s)": 1.637104 + }, + { + "acc": 0.66465173, + "epoch": 1.5073566717402334, + "grad_norm": 6.9375, + "learning_rate": 1.5690969192192933e-06, + "loss": 1.56793594, + "memory(GiB)": 117.38, + "step": 59420, + "train_speed(iter/s)": 1.63712 + }, + { + "acc": 0.66263733, + "epoch": 1.5074835109081683, + "grad_norm": 4.625, + "learning_rate": 1.5683341930885183e-06, + "loss": 1.56303272, + "memory(GiB)": 117.38, + "step": 59425, + "train_speed(iter/s)": 1.637135 + }, + { + "acc": 0.64411612, + "epoch": 1.5076103500761036, + "grad_norm": 5.90625, + "learning_rate": 1.567571617897729e-06, + "loss": 1.65568466, + "memory(GiB)": 117.38, + "step": 59430, + "train_speed(iter/s)": 1.637151 + }, + { + "acc": 0.66698713, + "epoch": 1.5077371892440385, + "grad_norm": 4.71875, + "learning_rate": 1.566809193680463e-06, + "loss": 1.53366222, + "memory(GiB)": 117.38, + "step": 59435, + "train_speed(iter/s)": 1.637167 + }, + { + "acc": 0.64838991, + "epoch": 1.5078640284119738, + "grad_norm": 7.15625, + "learning_rate": 1.566046920470257e-06, + "loss": 1.67905045, + "memory(GiB)": 117.38, + "step": 59440, + "train_speed(iter/s)": 1.637184 + }, + { + "acc": 0.67054396, + "epoch": 1.5079908675799087, + "grad_norm": 5.03125, + "learning_rate": 1.5652847983006376e-06, + "loss": 1.53800392, + "memory(GiB)": 117.38, + "step": 59445, + "train_speed(iter/s)": 1.637199 + }, + { + "acc": 0.66188765, + "epoch": 1.5081177067478437, + "grad_norm": 4.75, + "learning_rate": 1.564522827205131e-06, + "loss": 1.5667593, + "memory(GiB)": 117.38, + "step": 59450, + "train_speed(iter/s)": 1.637215 + }, + { + "acc": 0.64526963, + "epoch": 1.5082445459157787, + "grad_norm": 7.3125, + "learning_rate": 1.5637610072172464e-06, + "loss": 1.60111561, + "memory(GiB)": 117.38, + "step": 59455, + "train_speed(iter/s)": 1.637231 + }, + { + "acc": 0.65367398, + "epoch": 1.5083713850837137, + "grad_norm": 6.0625, + "learning_rate": 1.5629993383704933e-06, + "loss": 1.69005527, + "memory(GiB)": 117.38, + "step": 59460, + "train_speed(iter/s)": 1.637246 + }, + { + "acc": 0.66148043, + "epoch": 1.508498224251649, + "grad_norm": 6.15625, + "learning_rate": 1.5622378206983764e-06, + "loss": 1.62536736, + "memory(GiB)": 117.38, + "step": 59465, + "train_speed(iter/s)": 1.637262 + }, + { + "acc": 0.6600419, + "epoch": 1.5086250634195841, + "grad_norm": 5.8125, + "learning_rate": 1.5614764542343896e-06, + "loss": 1.56128855, + "memory(GiB)": 117.38, + "step": 59470, + "train_speed(iter/s)": 1.637277 + }, + { + "acc": 0.65574646, + "epoch": 1.5087519025875191, + "grad_norm": 5.59375, + "learning_rate": 1.5607152390120173e-06, + "loss": 1.54552221, + "memory(GiB)": 117.38, + "step": 59475, + "train_speed(iter/s)": 1.637293 + }, + { + "acc": 0.64856749, + "epoch": 1.5088787417554541, + "grad_norm": 5.71875, + "learning_rate": 1.5599541750647457e-06, + "loss": 1.61822491, + "memory(GiB)": 117.38, + "step": 59480, + "train_speed(iter/s)": 1.637308 + }, + { + "acc": 0.65341444, + "epoch": 1.509005580923389, + "grad_norm": 5.21875, + "learning_rate": 1.559193262426048e-06, + "loss": 1.56120615, + "memory(GiB)": 117.38, + "step": 59485, + "train_speed(iter/s)": 1.637324 + }, + { + "acc": 0.65711451, + "epoch": 1.509132420091324, + "grad_norm": 5.40625, + "learning_rate": 1.5584325011293943e-06, + "loss": 1.59803238, + "memory(GiB)": 117.38, + "step": 59490, + "train_speed(iter/s)": 1.637338 + }, + { + "acc": 0.64721985, + "epoch": 1.5092592592592593, + "grad_norm": 5.53125, + "learning_rate": 1.5576718912082417e-06, + "loss": 1.63914948, + "memory(GiB)": 117.38, + "step": 59495, + "train_speed(iter/s)": 1.637353 + }, + { + "acc": 0.65776649, + "epoch": 1.5093860984271943, + "grad_norm": 6.25, + "learning_rate": 1.5569114326960494e-06, + "loss": 1.60691013, + "memory(GiB)": 117.38, + "step": 59500, + "train_speed(iter/s)": 1.637368 + }, + { + "acc": 0.66006384, + "epoch": 1.5095129375951295, + "grad_norm": 6.125, + "learning_rate": 1.5561511256262651e-06, + "loss": 1.58847275, + "memory(GiB)": 117.38, + "step": 59505, + "train_speed(iter/s)": 1.637382 + }, + { + "acc": 0.65260458, + "epoch": 1.5096397767630645, + "grad_norm": 5.25, + "learning_rate": 1.55539097003233e-06, + "loss": 1.61360741, + "memory(GiB)": 117.38, + "step": 59510, + "train_speed(iter/s)": 1.637398 + }, + { + "acc": 0.66343279, + "epoch": 1.5097666159309995, + "grad_norm": 5.25, + "learning_rate": 1.5546309659476788e-06, + "loss": 1.58285294, + "memory(GiB)": 117.38, + "step": 59515, + "train_speed(iter/s)": 1.637413 + }, + { + "acc": 0.67283044, + "epoch": 1.5098934550989345, + "grad_norm": 5.21875, + "learning_rate": 1.55387111340574e-06, + "loss": 1.58267479, + "memory(GiB)": 117.38, + "step": 59520, + "train_speed(iter/s)": 1.637428 + }, + { + "acc": 0.66473532, + "epoch": 1.5100202942668695, + "grad_norm": 6.09375, + "learning_rate": 1.553111412439936e-06, + "loss": 1.56194286, + "memory(GiB)": 117.38, + "step": 59525, + "train_speed(iter/s)": 1.637442 + }, + { + "acc": 0.65723758, + "epoch": 1.5101471334348047, + "grad_norm": 5.34375, + "learning_rate": 1.5523518630836809e-06, + "loss": 1.64816818, + "memory(GiB)": 117.38, + "step": 59530, + "train_speed(iter/s)": 1.637457 + }, + { + "acc": 0.67080235, + "epoch": 1.5102739726027399, + "grad_norm": 6.8125, + "learning_rate": 1.551592465370384e-06, + "loss": 1.50460768, + "memory(GiB)": 117.38, + "step": 59535, + "train_speed(iter/s)": 1.637472 + }, + { + "acc": 0.63776093, + "epoch": 1.5104008117706749, + "grad_norm": 6.0, + "learning_rate": 1.5508332193334457e-06, + "loss": 1.60111427, + "memory(GiB)": 117.38, + "step": 59540, + "train_speed(iter/s)": 1.637487 + }, + { + "acc": 0.64542737, + "epoch": 1.5105276509386099, + "grad_norm": 5.46875, + "learning_rate": 1.5500741250062628e-06, + "loss": 1.68193874, + "memory(GiB)": 117.38, + "step": 59545, + "train_speed(iter/s)": 1.637502 + }, + { + "acc": 0.68000145, + "epoch": 1.5106544901065448, + "grad_norm": 5.9375, + "learning_rate": 1.549315182422222e-06, + "loss": 1.53957243, + "memory(GiB)": 117.38, + "step": 59550, + "train_speed(iter/s)": 1.637517 + }, + { + "acc": 0.66647639, + "epoch": 1.5107813292744798, + "grad_norm": 4.90625, + "learning_rate": 1.5485563916147062e-06, + "loss": 1.60158615, + "memory(GiB)": 117.38, + "step": 59555, + "train_speed(iter/s)": 1.637531 + }, + { + "acc": 0.63762536, + "epoch": 1.510908168442415, + "grad_norm": 5.8125, + "learning_rate": 1.5477977526170895e-06, + "loss": 1.59889803, + "memory(GiB)": 117.38, + "step": 59560, + "train_speed(iter/s)": 1.637546 + }, + { + "acc": 0.65561848, + "epoch": 1.51103500761035, + "grad_norm": 5.875, + "learning_rate": 1.5470392654627392e-06, + "loss": 1.62687626, + "memory(GiB)": 117.38, + "step": 59565, + "train_speed(iter/s)": 1.637561 + }, + { + "acc": 0.65773377, + "epoch": 1.5111618467782852, + "grad_norm": 5.375, + "learning_rate": 1.5462809301850212e-06, + "loss": 1.60167675, + "memory(GiB)": 117.38, + "step": 59570, + "train_speed(iter/s)": 1.637576 + }, + { + "acc": 0.65607738, + "epoch": 1.5112886859462202, + "grad_norm": 5.25, + "learning_rate": 1.5455227468172862e-06, + "loss": 1.64228477, + "memory(GiB)": 117.38, + "step": 59575, + "train_speed(iter/s)": 1.637591 + }, + { + "acc": 0.6501904, + "epoch": 1.5114155251141552, + "grad_norm": 7.125, + "learning_rate": 1.5447647153928842e-06, + "loss": 1.65314693, + "memory(GiB)": 117.38, + "step": 59580, + "train_speed(iter/s)": 1.637606 + }, + { + "acc": 0.65768318, + "epoch": 1.5115423642820902, + "grad_norm": 6.4375, + "learning_rate": 1.5440068359451548e-06, + "loss": 1.61936302, + "memory(GiB)": 117.38, + "step": 59585, + "train_speed(iter/s)": 1.637621 + }, + { + "acc": 0.66429658, + "epoch": 1.5116692034500254, + "grad_norm": 9.625, + "learning_rate": 1.5432491085074381e-06, + "loss": 1.59712486, + "memory(GiB)": 117.38, + "step": 59590, + "train_speed(iter/s)": 1.637637 + }, + { + "acc": 0.64664769, + "epoch": 1.5117960426179604, + "grad_norm": 4.875, + "learning_rate": 1.5424915331130568e-06, + "loss": 1.64883251, + "memory(GiB)": 117.38, + "step": 59595, + "train_speed(iter/s)": 1.637652 + }, + { + "acc": 0.66840744, + "epoch": 1.5119228817858956, + "grad_norm": 8.125, + "learning_rate": 1.5417341097953332e-06, + "loss": 1.5181076, + "memory(GiB)": 117.38, + "step": 59600, + "train_speed(iter/s)": 1.637666 + }, + { + "acc": 0.67108917, + "epoch": 1.5120497209538306, + "grad_norm": 7.1875, + "learning_rate": 1.540976838587585e-06, + "loss": 1.59098244, + "memory(GiB)": 117.38, + "step": 59605, + "train_speed(iter/s)": 1.637682 + }, + { + "acc": 0.65572405, + "epoch": 1.5121765601217656, + "grad_norm": 5.28125, + "learning_rate": 1.5402197195231205e-06, + "loss": 1.60146599, + "memory(GiB)": 117.38, + "step": 59610, + "train_speed(iter/s)": 1.637697 + }, + { + "acc": 0.66543159, + "epoch": 1.5123033992897006, + "grad_norm": 6.59375, + "learning_rate": 1.5394627526352379e-06, + "loss": 1.57943907, + "memory(GiB)": 117.38, + "step": 59615, + "train_speed(iter/s)": 1.637712 + }, + { + "acc": 0.65606475, + "epoch": 1.5124302384576356, + "grad_norm": 5.75, + "learning_rate": 1.5387059379572322e-06, + "loss": 1.60311127, + "memory(GiB)": 117.38, + "step": 59620, + "train_speed(iter/s)": 1.637728 + }, + { + "acc": 0.68065972, + "epoch": 1.5125570776255708, + "grad_norm": 6.0, + "learning_rate": 1.537949275522394e-06, + "loss": 1.54807396, + "memory(GiB)": 117.38, + "step": 59625, + "train_speed(iter/s)": 1.637743 + }, + { + "acc": 0.64313092, + "epoch": 1.512683916793506, + "grad_norm": 10.1875, + "learning_rate": 1.5371927653640056e-06, + "loss": 1.66140862, + "memory(GiB)": 117.38, + "step": 59630, + "train_speed(iter/s)": 1.637759 + }, + { + "acc": 0.65956144, + "epoch": 1.512810755961441, + "grad_norm": 5.1875, + "learning_rate": 1.5364364075153366e-06, + "loss": 1.57860355, + "memory(GiB)": 117.38, + "step": 59635, + "train_speed(iter/s)": 1.637774 + }, + { + "acc": 0.66728215, + "epoch": 1.512937595129376, + "grad_norm": 5.875, + "learning_rate": 1.5356802020096595e-06, + "loss": 1.5678194, + "memory(GiB)": 117.38, + "step": 59640, + "train_speed(iter/s)": 1.637789 + }, + { + "acc": 0.66470032, + "epoch": 1.513064434297311, + "grad_norm": 5.5625, + "learning_rate": 1.5349241488802346e-06, + "loss": 1.57913189, + "memory(GiB)": 117.38, + "step": 59645, + "train_speed(iter/s)": 1.637804 + }, + { + "acc": 0.65819478, + "epoch": 1.513191273465246, + "grad_norm": 5.90625, + "learning_rate": 1.5341682481603155e-06, + "loss": 1.63519821, + "memory(GiB)": 117.38, + "step": 59650, + "train_speed(iter/s)": 1.637819 + }, + { + "acc": 0.66518583, + "epoch": 1.5133181126331812, + "grad_norm": 6.125, + "learning_rate": 1.5334124998831512e-06, + "loss": 1.56683216, + "memory(GiB)": 117.38, + "step": 59655, + "train_speed(iter/s)": 1.637833 + }, + { + "acc": 0.63133111, + "epoch": 1.5134449518011162, + "grad_norm": 5.03125, + "learning_rate": 1.532656904081982e-06, + "loss": 1.64764099, + "memory(GiB)": 117.38, + "step": 59660, + "train_speed(iter/s)": 1.637849 + }, + { + "acc": 0.66431923, + "epoch": 1.5135717909690514, + "grad_norm": 6.0625, + "learning_rate": 1.5319014607900428e-06, + "loss": 1.56693516, + "memory(GiB)": 117.38, + "step": 59665, + "train_speed(iter/s)": 1.637863 + }, + { + "acc": 0.66129799, + "epoch": 1.5136986301369864, + "grad_norm": 5.65625, + "learning_rate": 1.5311461700405617e-06, + "loss": 1.60601902, + "memory(GiB)": 117.38, + "step": 59670, + "train_speed(iter/s)": 1.637877 + }, + { + "acc": 0.65645337, + "epoch": 1.5138254693049213, + "grad_norm": 6.71875, + "learning_rate": 1.5303910318667586e-06, + "loss": 1.6351366, + "memory(GiB)": 117.38, + "step": 59675, + "train_speed(iter/s)": 1.637893 + }, + { + "acc": 0.6603056, + "epoch": 1.5139523084728563, + "grad_norm": 5.625, + "learning_rate": 1.529636046301849e-06, + "loss": 1.61880131, + "memory(GiB)": 117.38, + "step": 59680, + "train_speed(iter/s)": 1.637908 + }, + { + "acc": 0.63773842, + "epoch": 1.5140791476407913, + "grad_norm": 6.3125, + "learning_rate": 1.5288812133790405e-06, + "loss": 1.68753853, + "memory(GiB)": 117.38, + "step": 59685, + "train_speed(iter/s)": 1.637924 + }, + { + "acc": 0.64779053, + "epoch": 1.5142059868087265, + "grad_norm": 5.21875, + "learning_rate": 1.5281265331315332e-06, + "loss": 1.6136404, + "memory(GiB)": 117.38, + "step": 59690, + "train_speed(iter/s)": 1.637938 + }, + { + "acc": 0.65977907, + "epoch": 1.5143328259766617, + "grad_norm": 5.8125, + "learning_rate": 1.5273720055925217e-06, + "loss": 1.58422794, + "memory(GiB)": 117.38, + "step": 59695, + "train_speed(iter/s)": 1.637954 + }, + { + "acc": 0.65920839, + "epoch": 1.5144596651445967, + "grad_norm": 5.21875, + "learning_rate": 1.5266176307951936e-06, + "loss": 1.61589317, + "memory(GiB)": 117.38, + "step": 59700, + "train_speed(iter/s)": 1.637968 + }, + { + "acc": 0.62904959, + "epoch": 1.5145865043125317, + "grad_norm": 5.28125, + "learning_rate": 1.5258634087727298e-06, + "loss": 1.59031744, + "memory(GiB)": 117.38, + "step": 59705, + "train_speed(iter/s)": 1.637983 + }, + { + "acc": 0.65859184, + "epoch": 1.5147133434804667, + "grad_norm": 6.1875, + "learning_rate": 1.5251093395583045e-06, + "loss": 1.60083256, + "memory(GiB)": 117.38, + "step": 59710, + "train_speed(iter/s)": 1.637998 + }, + { + "acc": 0.66071482, + "epoch": 1.5148401826484017, + "grad_norm": 5.28125, + "learning_rate": 1.5243554231850843e-06, + "loss": 1.57996674, + "memory(GiB)": 117.38, + "step": 59715, + "train_speed(iter/s)": 1.638012 + }, + { + "acc": 0.65562344, + "epoch": 1.514967021816337, + "grad_norm": 5.9375, + "learning_rate": 1.5236016596862302e-06, + "loss": 1.57238712, + "memory(GiB)": 117.38, + "step": 59720, + "train_speed(iter/s)": 1.638027 + }, + { + "acc": 0.65562716, + "epoch": 1.515093860984272, + "grad_norm": 4.90625, + "learning_rate": 1.5228480490948943e-06, + "loss": 1.5669693, + "memory(GiB)": 117.38, + "step": 59725, + "train_speed(iter/s)": 1.638042 + }, + { + "acc": 0.66293459, + "epoch": 1.5152207001522071, + "grad_norm": 5.78125, + "learning_rate": 1.5220945914442292e-06, + "loss": 1.61664486, + "memory(GiB)": 117.38, + "step": 59730, + "train_speed(iter/s)": 1.638057 + }, + { + "acc": 0.6535965, + "epoch": 1.515347539320142, + "grad_norm": 5.3125, + "learning_rate": 1.52134128676737e-06, + "loss": 1.56146193, + "memory(GiB)": 117.38, + "step": 59735, + "train_speed(iter/s)": 1.638072 + }, + { + "acc": 0.65644436, + "epoch": 1.515474378488077, + "grad_norm": 6.4375, + "learning_rate": 1.5205881350974504e-06, + "loss": 1.6927681, + "memory(GiB)": 117.38, + "step": 59740, + "train_speed(iter/s)": 1.638086 + }, + { + "acc": 0.64107752, + "epoch": 1.515601217656012, + "grad_norm": 7.25, + "learning_rate": 1.5198351364676012e-06, + "loss": 1.61112938, + "memory(GiB)": 117.38, + "step": 59745, + "train_speed(iter/s)": 1.638101 + }, + { + "acc": 0.6616334, + "epoch": 1.5157280568239473, + "grad_norm": 6.4375, + "learning_rate": 1.5190822909109415e-06, + "loss": 1.61259651, + "memory(GiB)": 117.38, + "step": 59750, + "train_speed(iter/s)": 1.638115 + }, + { + "acc": 0.65482903, + "epoch": 1.5158548959918823, + "grad_norm": 5.0625, + "learning_rate": 1.5183295984605824e-06, + "loss": 1.63801155, + "memory(GiB)": 117.38, + "step": 59755, + "train_speed(iter/s)": 1.638129 + }, + { + "acc": 0.66039543, + "epoch": 1.5159817351598175, + "grad_norm": 5.21875, + "learning_rate": 1.5175770591496303e-06, + "loss": 1.58310776, + "memory(GiB)": 117.38, + "step": 59760, + "train_speed(iter/s)": 1.638144 + }, + { + "acc": 0.65295467, + "epoch": 1.5161085743277525, + "grad_norm": 5.25, + "learning_rate": 1.5168246730111892e-06, + "loss": 1.61428986, + "memory(GiB)": 117.38, + "step": 59765, + "train_speed(iter/s)": 1.638158 + }, + { + "acc": 0.68336835, + "epoch": 1.5162354134956875, + "grad_norm": 5.5625, + "learning_rate": 1.5160724400783511e-06, + "loss": 1.59299622, + "memory(GiB)": 117.38, + "step": 59770, + "train_speed(iter/s)": 1.638172 + }, + { + "acc": 0.65406723, + "epoch": 1.5163622526636225, + "grad_norm": 5.1875, + "learning_rate": 1.5153203603841992e-06, + "loss": 1.59817829, + "memory(GiB)": 117.38, + "step": 59775, + "train_speed(iter/s)": 1.638186 + }, + { + "acc": 0.65257907, + "epoch": 1.5164890918315574, + "grad_norm": 5.84375, + "learning_rate": 1.5145684339618172e-06, + "loss": 1.66878014, + "memory(GiB)": 117.38, + "step": 59780, + "train_speed(iter/s)": 1.6382 + }, + { + "acc": 0.67384071, + "epoch": 1.5166159309994927, + "grad_norm": 7.625, + "learning_rate": 1.5138166608442768e-06, + "loss": 1.52178497, + "memory(GiB)": 117.38, + "step": 59785, + "train_speed(iter/s)": 1.638215 + }, + { + "acc": 0.65550747, + "epoch": 1.5167427701674279, + "grad_norm": 6.25, + "learning_rate": 1.5130650410646452e-06, + "loss": 1.57656002, + "memory(GiB)": 117.38, + "step": 59790, + "train_speed(iter/s)": 1.638229 + }, + { + "acc": 0.66310744, + "epoch": 1.5168696093353629, + "grad_norm": 4.53125, + "learning_rate": 1.5123135746559792e-06, + "loss": 1.59146099, + "memory(GiB)": 117.38, + "step": 59795, + "train_speed(iter/s)": 1.638243 + }, + { + "acc": 0.67225218, + "epoch": 1.5169964485032978, + "grad_norm": 6.46875, + "learning_rate": 1.5115622616513343e-06, + "loss": 1.52971506, + "memory(GiB)": 117.38, + "step": 59800, + "train_speed(iter/s)": 1.638257 + }, + { + "acc": 0.64547067, + "epoch": 1.5171232876712328, + "grad_norm": 5.28125, + "learning_rate": 1.5108111020837564e-06, + "loss": 1.71772346, + "memory(GiB)": 117.38, + "step": 59805, + "train_speed(iter/s)": 1.638271 + }, + { + "acc": 0.64823647, + "epoch": 1.5172501268391678, + "grad_norm": 5.15625, + "learning_rate": 1.5100600959862838e-06, + "loss": 1.58661766, + "memory(GiB)": 117.38, + "step": 59810, + "train_speed(iter/s)": 1.638285 + }, + { + "acc": 0.65941296, + "epoch": 1.517376966007103, + "grad_norm": 5.6875, + "learning_rate": 1.5093092433919497e-06, + "loss": 1.59504967, + "memory(GiB)": 117.38, + "step": 59815, + "train_speed(iter/s)": 1.638299 + }, + { + "acc": 0.65098925, + "epoch": 1.517503805175038, + "grad_norm": 5.8125, + "learning_rate": 1.5085585443337803e-06, + "loss": 1.5551198, + "memory(GiB)": 117.38, + "step": 59820, + "train_speed(iter/s)": 1.638314 + }, + { + "acc": 0.65557141, + "epoch": 1.5176306443429732, + "grad_norm": 6.5, + "learning_rate": 1.507807998844794e-06, + "loss": 1.68314095, + "memory(GiB)": 117.38, + "step": 59825, + "train_speed(iter/s)": 1.638329 + }, + { + "acc": 0.65469613, + "epoch": 1.5177574835109082, + "grad_norm": 5.875, + "learning_rate": 1.5070576069580039e-06, + "loss": 1.57277184, + "memory(GiB)": 117.38, + "step": 59830, + "train_speed(iter/s)": 1.638343 + }, + { + "acc": 0.65734887, + "epoch": 1.5178843226788432, + "grad_norm": 4.75, + "learning_rate": 1.5063073687064144e-06, + "loss": 1.63628693, + "memory(GiB)": 117.38, + "step": 59835, + "train_speed(iter/s)": 1.638356 + }, + { + "acc": 0.6680016, + "epoch": 1.5180111618467782, + "grad_norm": 5.71875, + "learning_rate": 1.5055572841230253e-06, + "loss": 1.57862234, + "memory(GiB)": 117.38, + "step": 59840, + "train_speed(iter/s)": 1.638371 + }, + { + "acc": 0.66577682, + "epoch": 1.5181380010147132, + "grad_norm": 6.1875, + "learning_rate": 1.5048073532408287e-06, + "loss": 1.5896265, + "memory(GiB)": 117.38, + "step": 59845, + "train_speed(iter/s)": 1.638385 + }, + { + "acc": 0.66489553, + "epoch": 1.5182648401826484, + "grad_norm": 4.84375, + "learning_rate": 1.5040575760928094e-06, + "loss": 1.58216515, + "memory(GiB)": 117.38, + "step": 59850, + "train_speed(iter/s)": 1.638399 + }, + { + "acc": 0.64914498, + "epoch": 1.5183916793505836, + "grad_norm": 6.25, + "learning_rate": 1.5033079527119466e-06, + "loss": 1.66169624, + "memory(GiB)": 117.38, + "step": 59855, + "train_speed(iter/s)": 1.638413 + }, + { + "acc": 0.66525979, + "epoch": 1.5185185185185186, + "grad_norm": 5.875, + "learning_rate": 1.5025584831312112e-06, + "loss": 1.58025627, + "memory(GiB)": 117.38, + "step": 59860, + "train_speed(iter/s)": 1.638427 + }, + { + "acc": 0.650809, + "epoch": 1.5186453576864536, + "grad_norm": 6.6875, + "learning_rate": 1.5018091673835667e-06, + "loss": 1.62069435, + "memory(GiB)": 117.38, + "step": 59865, + "train_speed(iter/s)": 1.638442 + }, + { + "acc": 0.66731596, + "epoch": 1.5187721968543886, + "grad_norm": 5.6875, + "learning_rate": 1.501060005501977e-06, + "loss": 1.57781525, + "memory(GiB)": 117.38, + "step": 59870, + "train_speed(iter/s)": 1.638456 + }, + { + "acc": 0.66133699, + "epoch": 1.5188990360223236, + "grad_norm": 5.96875, + "learning_rate": 1.500310997519388e-06, + "loss": 1.6319046, + "memory(GiB)": 117.38, + "step": 59875, + "train_speed(iter/s)": 1.63847 + }, + { + "acc": 0.66813779, + "epoch": 1.5190258751902588, + "grad_norm": 5.375, + "learning_rate": 1.4995621434687468e-06, + "loss": 1.58863544, + "memory(GiB)": 117.38, + "step": 59880, + "train_speed(iter/s)": 1.638483 + }, + { + "acc": 0.64411836, + "epoch": 1.5191527143581938, + "grad_norm": 5.65625, + "learning_rate": 1.4988134433829892e-06, + "loss": 1.66836929, + "memory(GiB)": 117.38, + "step": 59885, + "train_speed(iter/s)": 1.638496 + }, + { + "acc": 0.64596262, + "epoch": 1.519279553526129, + "grad_norm": 6.21875, + "learning_rate": 1.4980648972950507e-06, + "loss": 1.61121902, + "memory(GiB)": 117.38, + "step": 59890, + "train_speed(iter/s)": 1.63829 + }, + { + "acc": 0.65286098, + "epoch": 1.519406392694064, + "grad_norm": 6.1875, + "learning_rate": 1.4973165052378518e-06, + "loss": 1.5737771, + "memory(GiB)": 117.38, + "step": 59895, + "train_speed(iter/s)": 1.638303 + }, + { + "acc": 0.649895, + "epoch": 1.519533231861999, + "grad_norm": 5.9375, + "learning_rate": 1.49656826724431e-06, + "loss": 1.61807003, + "memory(GiB)": 117.38, + "step": 59900, + "train_speed(iter/s)": 1.638081 + }, + { + "acc": 0.66251173, + "epoch": 1.519660071029934, + "grad_norm": 5.5625, + "learning_rate": 1.4958201833473386e-06, + "loss": 1.55404568, + "memory(GiB)": 117.38, + "step": 59905, + "train_speed(iter/s)": 1.638095 + }, + { + "acc": 0.63647819, + "epoch": 1.5197869101978692, + "grad_norm": 5.90625, + "learning_rate": 1.4950722535798423e-06, + "loss": 1.65166512, + "memory(GiB)": 117.38, + "step": 59910, + "train_speed(iter/s)": 1.638109 + }, + { + "acc": 0.65204973, + "epoch": 1.5199137493658041, + "grad_norm": 5.5, + "learning_rate": 1.4943244779747134e-06, + "loss": 1.61760178, + "memory(GiB)": 117.38, + "step": 59915, + "train_speed(iter/s)": 1.638123 + }, + { + "acc": 0.64628162, + "epoch": 1.5200405885337394, + "grad_norm": 5.03125, + "learning_rate": 1.4935768565648478e-06, + "loss": 1.5935832, + "memory(GiB)": 117.38, + "step": 59920, + "train_speed(iter/s)": 1.638137 + }, + { + "acc": 0.65135431, + "epoch": 1.5201674277016743, + "grad_norm": 7.625, + "learning_rate": 1.4928293893831265e-06, + "loss": 1.66151772, + "memory(GiB)": 117.38, + "step": 59925, + "train_speed(iter/s)": 1.638152 + }, + { + "acc": 0.65062904, + "epoch": 1.5202942668696093, + "grad_norm": 6.25, + "learning_rate": 1.4920820764624288e-06, + "loss": 1.6067997, + "memory(GiB)": 117.38, + "step": 59930, + "train_speed(iter/s)": 1.638166 + }, + { + "acc": 0.65350237, + "epoch": 1.5204211060375443, + "grad_norm": 5.59375, + "learning_rate": 1.4913349178356202e-06, + "loss": 1.63344421, + "memory(GiB)": 117.38, + "step": 59935, + "train_speed(iter/s)": 1.63818 + }, + { + "acc": 0.64665518, + "epoch": 1.5205479452054793, + "grad_norm": 7.0, + "learning_rate": 1.4905879135355684e-06, + "loss": 1.66476669, + "memory(GiB)": 117.38, + "step": 59940, + "train_speed(iter/s)": 1.638195 + }, + { + "acc": 0.64801178, + "epoch": 1.5206747843734145, + "grad_norm": 6.625, + "learning_rate": 1.4898410635951282e-06, + "loss": 1.57352085, + "memory(GiB)": 117.38, + "step": 59945, + "train_speed(iter/s)": 1.638209 + }, + { + "acc": 0.65974984, + "epoch": 1.5208016235413497, + "grad_norm": 5.9375, + "learning_rate": 1.4890943680471503e-06, + "loss": 1.57228327, + "memory(GiB)": 117.38, + "step": 59950, + "train_speed(iter/s)": 1.638223 + }, + { + "acc": 0.65208964, + "epoch": 1.5209284627092847, + "grad_norm": 5.53125, + "learning_rate": 1.4883478269244766e-06, + "loss": 1.58871241, + "memory(GiB)": 117.38, + "step": 59955, + "train_speed(iter/s)": 1.638238 + }, + { + "acc": 0.65905828, + "epoch": 1.5210553018772197, + "grad_norm": 5.40625, + "learning_rate": 1.4876014402599443e-06, + "loss": 1.65592327, + "memory(GiB)": 117.38, + "step": 59960, + "train_speed(iter/s)": 1.638253 + }, + { + "acc": 0.64793797, + "epoch": 1.5211821410451547, + "grad_norm": 5.65625, + "learning_rate": 1.4868552080863824e-06, + "loss": 1.66602058, + "memory(GiB)": 117.38, + "step": 59965, + "train_speed(iter/s)": 1.638267 + }, + { + "acc": 0.67543235, + "epoch": 1.5213089802130897, + "grad_norm": 7.125, + "learning_rate": 1.4861091304366139e-06, + "loss": 1.5205225, + "memory(GiB)": 117.38, + "step": 59970, + "train_speed(iter/s)": 1.638281 + }, + { + "acc": 0.66942892, + "epoch": 1.521435819381025, + "grad_norm": 5.96875, + "learning_rate": 1.4853632073434533e-06, + "loss": 1.50150871, + "memory(GiB)": 117.38, + "step": 59975, + "train_speed(iter/s)": 1.638295 + }, + { + "acc": 0.64819031, + "epoch": 1.52156265854896, + "grad_norm": 10.0, + "learning_rate": 1.484617438839711e-06, + "loss": 1.66951733, + "memory(GiB)": 117.38, + "step": 59980, + "train_speed(iter/s)": 1.63831 + }, + { + "acc": 0.66218333, + "epoch": 1.521689497716895, + "grad_norm": 8.0, + "learning_rate": 1.483871824958189e-06, + "loss": 1.55853748, + "memory(GiB)": 117.38, + "step": 59985, + "train_speed(iter/s)": 1.638324 + }, + { + "acc": 0.64892235, + "epoch": 1.52181633688483, + "grad_norm": 6.3125, + "learning_rate": 1.483126365731682e-06, + "loss": 1.59185219, + "memory(GiB)": 117.38, + "step": 59990, + "train_speed(iter/s)": 1.638339 + }, + { + "acc": 0.66940746, + "epoch": 1.521943176052765, + "grad_norm": 5.375, + "learning_rate": 1.4823810611929795e-06, + "loss": 1.52065325, + "memory(GiB)": 117.38, + "step": 59995, + "train_speed(iter/s)": 1.638354 + }, + { + "acc": 0.67647648, + "epoch": 1.5220700152207, + "grad_norm": 5.34375, + "learning_rate": 1.481635911374863e-06, + "loss": 1.50588226, + "memory(GiB)": 117.38, + "step": 60000, + "train_speed(iter/s)": 1.638368 + }, + { + "epoch": 1.5220700152207, + "eval_acc": 0.6462238204725002, + "eval_loss": 1.5732791423797607, + "eval_runtime": 58.3904, + "eval_samples_per_second": 109.093, + "eval_steps_per_second": 27.282, + "step": 60000 + }, + { + "acc": 0.65408506, + "epoch": 1.522196854388635, + "grad_norm": 7.3125, + "learning_rate": 1.480890916310106e-06, + "loss": 1.63356438, + "memory(GiB)": 117.38, + "step": 60005, + "train_speed(iter/s)": 1.635583 + }, + { + "acc": 0.67194986, + "epoch": 1.5223236935565703, + "grad_norm": 6.25, + "learning_rate": 1.4801460760314811e-06, + "loss": 1.54215813, + "memory(GiB)": 117.38, + "step": 60010, + "train_speed(iter/s)": 1.635595 + }, + { + "acc": 0.64792438, + "epoch": 1.5224505327245055, + "grad_norm": 7.03125, + "learning_rate": 1.4794013905717453e-06, + "loss": 1.64431, + "memory(GiB)": 117.38, + "step": 60015, + "train_speed(iter/s)": 1.635609 + }, + { + "acc": 0.65688086, + "epoch": 1.5225773718924405, + "grad_norm": 5.25, + "learning_rate": 1.4786568599636548e-06, + "loss": 1.59861679, + "memory(GiB)": 117.38, + "step": 60020, + "train_speed(iter/s)": 1.635623 + }, + { + "acc": 0.64923539, + "epoch": 1.5227042110603755, + "grad_norm": 5.21875, + "learning_rate": 1.4779124842399556e-06, + "loss": 1.66376095, + "memory(GiB)": 117.38, + "step": 60025, + "train_speed(iter/s)": 1.635637 + }, + { + "acc": 0.65651307, + "epoch": 1.5228310502283104, + "grad_norm": 5.9375, + "learning_rate": 1.4771682634333933e-06, + "loss": 1.54960289, + "memory(GiB)": 117.38, + "step": 60030, + "train_speed(iter/s)": 1.635651 + }, + { + "acc": 0.66246595, + "epoch": 1.5229578893962454, + "grad_norm": 6.9375, + "learning_rate": 1.4764241975766975e-06, + "loss": 1.54694185, + "memory(GiB)": 117.38, + "step": 60035, + "train_speed(iter/s)": 1.635665 + }, + { + "acc": 0.65754814, + "epoch": 1.5230847285641806, + "grad_norm": 6.0625, + "learning_rate": 1.475680286702596e-06, + "loss": 1.57554607, + "memory(GiB)": 117.38, + "step": 60040, + "train_speed(iter/s)": 1.635679 + }, + { + "acc": 0.65941486, + "epoch": 1.5232115677321156, + "grad_norm": 5.28125, + "learning_rate": 1.474936530843812e-06, + "loss": 1.53971472, + "memory(GiB)": 117.38, + "step": 60045, + "train_speed(iter/s)": 1.635691 + }, + { + "acc": 0.6630558, + "epoch": 1.5233384069000508, + "grad_norm": 5.8125, + "learning_rate": 1.4741929300330588e-06, + "loss": 1.57260914, + "memory(GiB)": 117.38, + "step": 60050, + "train_speed(iter/s)": 1.635705 + }, + { + "acc": 0.65574389, + "epoch": 1.5234652460679858, + "grad_norm": 5.71875, + "learning_rate": 1.4734494843030405e-06, + "loss": 1.60111217, + "memory(GiB)": 117.38, + "step": 60055, + "train_speed(iter/s)": 1.63572 + }, + { + "acc": 0.65624018, + "epoch": 1.5235920852359208, + "grad_norm": 5.65625, + "learning_rate": 1.4727061936864573e-06, + "loss": 1.61748276, + "memory(GiB)": 117.38, + "step": 60060, + "train_speed(iter/s)": 1.635733 + }, + { + "acc": 0.64636483, + "epoch": 1.5237189244038558, + "grad_norm": 5.71875, + "learning_rate": 1.4719630582160056e-06, + "loss": 1.66449432, + "memory(GiB)": 117.38, + "step": 60065, + "train_speed(iter/s)": 1.635747 + }, + { + "acc": 0.65765944, + "epoch": 1.523845763571791, + "grad_norm": 5.5, + "learning_rate": 1.4712200779243718e-06, + "loss": 1.6092598, + "memory(GiB)": 117.38, + "step": 60070, + "train_speed(iter/s)": 1.635762 + }, + { + "acc": 0.65735688, + "epoch": 1.523972602739726, + "grad_norm": 5.8125, + "learning_rate": 1.4704772528442308e-06, + "loss": 1.63104286, + "memory(GiB)": 117.38, + "step": 60075, + "train_speed(iter/s)": 1.635776 + }, + { + "acc": 0.66829715, + "epoch": 1.5240994419076612, + "grad_norm": 5.03125, + "learning_rate": 1.46973458300826e-06, + "loss": 1.50231695, + "memory(GiB)": 117.38, + "step": 60080, + "train_speed(iter/s)": 1.63579 + }, + { + "acc": 0.65415154, + "epoch": 1.5242262810755962, + "grad_norm": 6.375, + "learning_rate": 1.4689920684491232e-06, + "loss": 1.59666777, + "memory(GiB)": 117.38, + "step": 60085, + "train_speed(iter/s)": 1.635804 + }, + { + "acc": 0.66662707, + "epoch": 1.5243531202435312, + "grad_norm": 5.78125, + "learning_rate": 1.4682497091994807e-06, + "loss": 1.59601307, + "memory(GiB)": 117.38, + "step": 60090, + "train_speed(iter/s)": 1.635818 + }, + { + "acc": 0.66017933, + "epoch": 1.5244799594114662, + "grad_norm": 5.40625, + "learning_rate": 1.467507505291984e-06, + "loss": 1.58230801, + "memory(GiB)": 117.38, + "step": 60095, + "train_speed(iter/s)": 1.635832 + }, + { + "acc": 0.64228773, + "epoch": 1.5246067985794012, + "grad_norm": 5.84375, + "learning_rate": 1.4667654567592781e-06, + "loss": 1.68955727, + "memory(GiB)": 117.38, + "step": 60100, + "train_speed(iter/s)": 1.635845 + }, + { + "acc": 0.64893932, + "epoch": 1.5247336377473364, + "grad_norm": 5.9375, + "learning_rate": 1.4660235636340025e-06, + "loss": 1.62020645, + "memory(GiB)": 117.38, + "step": 60105, + "train_speed(iter/s)": 1.635859 + }, + { + "acc": 0.65086508, + "epoch": 1.5248604769152716, + "grad_norm": 6.5, + "learning_rate": 1.465281825948789e-06, + "loss": 1.62364807, + "memory(GiB)": 117.38, + "step": 60110, + "train_speed(iter/s)": 1.635873 + }, + { + "acc": 0.64010787, + "epoch": 1.5249873160832066, + "grad_norm": 5.625, + "learning_rate": 1.464540243736262e-06, + "loss": 1.63286953, + "memory(GiB)": 117.38, + "step": 60115, + "train_speed(iter/s)": 1.635887 + }, + { + "acc": 0.65779972, + "epoch": 1.5251141552511416, + "grad_norm": 5.875, + "learning_rate": 1.4637988170290396e-06, + "loss": 1.5491478, + "memory(GiB)": 117.38, + "step": 60120, + "train_speed(iter/s)": 1.635902 + }, + { + "acc": 0.66906114, + "epoch": 1.5252409944190766, + "grad_norm": 5.53125, + "learning_rate": 1.4630575458597334e-06, + "loss": 1.56732073, + "memory(GiB)": 117.38, + "step": 60125, + "train_speed(iter/s)": 1.635916 + }, + { + "acc": 0.67282534, + "epoch": 1.5253678335870116, + "grad_norm": 6.75, + "learning_rate": 1.4623164302609472e-06, + "loss": 1.51577368, + "memory(GiB)": 117.38, + "step": 60130, + "train_speed(iter/s)": 1.63593 + }, + { + "acc": 0.64581718, + "epoch": 1.5254946727549468, + "grad_norm": 5.9375, + "learning_rate": 1.4615754702652796e-06, + "loss": 1.68193436, + "memory(GiB)": 117.38, + "step": 60135, + "train_speed(iter/s)": 1.635943 + }, + { + "acc": 0.66856804, + "epoch": 1.5256215119228818, + "grad_norm": 5.375, + "learning_rate": 1.4608346659053208e-06, + "loss": 1.53394117, + "memory(GiB)": 117.38, + "step": 60140, + "train_speed(iter/s)": 1.635958 + }, + { + "acc": 0.65752859, + "epoch": 1.525748351090817, + "grad_norm": 6.03125, + "learning_rate": 1.4600940172136541e-06, + "loss": 1.58509798, + "memory(GiB)": 117.38, + "step": 60145, + "train_speed(iter/s)": 1.635972 + }, + { + "acc": 0.66464953, + "epoch": 1.525875190258752, + "grad_norm": 6.46875, + "learning_rate": 1.4593535242228575e-06, + "loss": 1.55644188, + "memory(GiB)": 117.38, + "step": 60150, + "train_speed(iter/s)": 1.635985 + }, + { + "acc": 0.65431881, + "epoch": 1.526002029426687, + "grad_norm": 6.9375, + "learning_rate": 1.4586131869655001e-06, + "loss": 1.59162655, + "memory(GiB)": 117.38, + "step": 60155, + "train_speed(iter/s)": 1.636 + }, + { + "acc": 0.64266405, + "epoch": 1.526128868594622, + "grad_norm": 6.21875, + "learning_rate": 1.4578730054741462e-06, + "loss": 1.65194492, + "memory(GiB)": 117.38, + "step": 60160, + "train_speed(iter/s)": 1.636013 + }, + { + "acc": 0.65512137, + "epoch": 1.526255707762557, + "grad_norm": 5.5625, + "learning_rate": 1.4571329797813511e-06, + "loss": 1.56337051, + "memory(GiB)": 117.38, + "step": 60165, + "train_speed(iter/s)": 1.636028 + }, + { + "acc": 0.66711531, + "epoch": 1.5263825469304921, + "grad_norm": 6.4375, + "learning_rate": 1.4563931099196678e-06, + "loss": 1.54521618, + "memory(GiB)": 117.38, + "step": 60170, + "train_speed(iter/s)": 1.636043 + }, + { + "acc": 0.67029066, + "epoch": 1.5265093860984273, + "grad_norm": 6.0625, + "learning_rate": 1.455653395921635e-06, + "loss": 1.5556139, + "memory(GiB)": 117.38, + "step": 60175, + "train_speed(iter/s)": 1.636057 + }, + { + "acc": 0.66554408, + "epoch": 1.5266362252663623, + "grad_norm": 7.8125, + "learning_rate": 1.4549138378197891e-06, + "loss": 1.61882439, + "memory(GiB)": 117.38, + "step": 60180, + "train_speed(iter/s)": 1.636071 + }, + { + "acc": 0.65113807, + "epoch": 1.5267630644342973, + "grad_norm": 5.4375, + "learning_rate": 1.4541744356466615e-06, + "loss": 1.63148766, + "memory(GiB)": 117.38, + "step": 60185, + "train_speed(iter/s)": 1.636085 + }, + { + "acc": 0.65669012, + "epoch": 1.5268899036022323, + "grad_norm": 5.03125, + "learning_rate": 1.4534351894347748e-06, + "loss": 1.53870411, + "memory(GiB)": 117.38, + "step": 60190, + "train_speed(iter/s)": 1.636099 + }, + { + "acc": 0.66183615, + "epoch": 1.5270167427701673, + "grad_norm": 5.78125, + "learning_rate": 1.4526960992166412e-06, + "loss": 1.60380211, + "memory(GiB)": 117.38, + "step": 60195, + "train_speed(iter/s)": 1.636113 + }, + { + "acc": 0.65956669, + "epoch": 1.5271435819381025, + "grad_norm": 5.28125, + "learning_rate": 1.4519571650247687e-06, + "loss": 1.57495747, + "memory(GiB)": 117.38, + "step": 60200, + "train_speed(iter/s)": 1.636127 + }, + { + "acc": 0.66309075, + "epoch": 1.5272704211060375, + "grad_norm": 5.40625, + "learning_rate": 1.4512183868916629e-06, + "loss": 1.5950633, + "memory(GiB)": 117.38, + "step": 60205, + "train_speed(iter/s)": 1.636139 + }, + { + "acc": 0.6613862, + "epoch": 1.5273972602739727, + "grad_norm": 6.15625, + "learning_rate": 1.4504797648498186e-06, + "loss": 1.60185127, + "memory(GiB)": 117.38, + "step": 60210, + "train_speed(iter/s)": 1.636153 + }, + { + "acc": 0.6566926, + "epoch": 1.5275240994419077, + "grad_norm": 5.375, + "learning_rate": 1.4497412989317184e-06, + "loss": 1.57675056, + "memory(GiB)": 117.38, + "step": 60215, + "train_speed(iter/s)": 1.636168 + }, + { + "acc": 0.66068482, + "epoch": 1.5276509386098427, + "grad_norm": 5.65625, + "learning_rate": 1.4490029891698476e-06, + "loss": 1.6750309, + "memory(GiB)": 117.38, + "step": 60220, + "train_speed(iter/s)": 1.636182 + }, + { + "acc": 0.65352163, + "epoch": 1.5277777777777777, + "grad_norm": 5.84375, + "learning_rate": 1.44826483559668e-06, + "loss": 1.54409981, + "memory(GiB)": 117.38, + "step": 60225, + "train_speed(iter/s)": 1.636196 + }, + { + "acc": 0.66548691, + "epoch": 1.527904616945713, + "grad_norm": 6.03125, + "learning_rate": 1.4475268382446833e-06, + "loss": 1.52457895, + "memory(GiB)": 117.38, + "step": 60230, + "train_speed(iter/s)": 1.63621 + }, + { + "acc": 0.6594141, + "epoch": 1.5280314561136479, + "grad_norm": 6.125, + "learning_rate": 1.4467889971463144e-06, + "loss": 1.59770603, + "memory(GiB)": 117.38, + "step": 60235, + "train_speed(iter/s)": 1.636226 + }, + { + "acc": 0.66068048, + "epoch": 1.528158295281583, + "grad_norm": 4.875, + "learning_rate": 1.4460513123340308e-06, + "loss": 1.59796419, + "memory(GiB)": 117.38, + "step": 60240, + "train_speed(iter/s)": 1.63624 + }, + { + "acc": 0.66386471, + "epoch": 1.528285134449518, + "grad_norm": 6.75, + "learning_rate": 1.4453137838402775e-06, + "loss": 1.59874687, + "memory(GiB)": 117.38, + "step": 60245, + "train_speed(iter/s)": 1.636255 + }, + { + "acc": 0.65998678, + "epoch": 1.528411973617453, + "grad_norm": 5.6875, + "learning_rate": 1.4445764116974948e-06, + "loss": 1.54967842, + "memory(GiB)": 117.38, + "step": 60250, + "train_speed(iter/s)": 1.636271 + }, + { + "acc": 0.66433616, + "epoch": 1.528538812785388, + "grad_norm": 5.5625, + "learning_rate": 1.4438391959381149e-06, + "loss": 1.56788483, + "memory(GiB)": 117.38, + "step": 60255, + "train_speed(iter/s)": 1.636285 + }, + { + "acc": 0.65703015, + "epoch": 1.528665651953323, + "grad_norm": 6.0625, + "learning_rate": 1.4431021365945647e-06, + "loss": 1.58435555, + "memory(GiB)": 117.38, + "step": 60260, + "train_speed(iter/s)": 1.636299 + }, + { + "acc": 0.64913206, + "epoch": 1.5287924911212583, + "grad_norm": 6.375, + "learning_rate": 1.4423652336992627e-06, + "loss": 1.64434853, + "memory(GiB)": 117.38, + "step": 60265, + "train_speed(iter/s)": 1.636313 + }, + { + "acc": 0.65893993, + "epoch": 1.5289193302891935, + "grad_norm": 6.59375, + "learning_rate": 1.4416284872846215e-06, + "loss": 1.55997715, + "memory(GiB)": 117.38, + "step": 60270, + "train_speed(iter/s)": 1.636328 + }, + { + "acc": 0.67031555, + "epoch": 1.5290461694571285, + "grad_norm": 5.78125, + "learning_rate": 1.440891897383046e-06, + "loss": 1.48104639, + "memory(GiB)": 117.38, + "step": 60275, + "train_speed(iter/s)": 1.636343 + }, + { + "acc": 0.67872453, + "epoch": 1.5291730086250634, + "grad_norm": 7.15625, + "learning_rate": 1.4401554640269354e-06, + "loss": 1.45684309, + "memory(GiB)": 117.38, + "step": 60280, + "train_speed(iter/s)": 1.636357 + }, + { + "acc": 0.64625325, + "epoch": 1.5292998477929984, + "grad_norm": 5.9375, + "learning_rate": 1.4394191872486812e-06, + "loss": 1.64098358, + "memory(GiB)": 117.38, + "step": 60285, + "train_speed(iter/s)": 1.63637 + }, + { + "acc": 0.65403628, + "epoch": 1.5294266869609334, + "grad_norm": 6.125, + "learning_rate": 1.4386830670806684e-06, + "loss": 1.66142616, + "memory(GiB)": 117.38, + "step": 60290, + "train_speed(iter/s)": 1.636384 + }, + { + "acc": 0.66217656, + "epoch": 1.5295535261288686, + "grad_norm": 5.71875, + "learning_rate": 1.4379471035552738e-06, + "loss": 1.56906013, + "memory(GiB)": 117.38, + "step": 60295, + "train_speed(iter/s)": 1.636398 + }, + { + "acc": 0.65460219, + "epoch": 1.5296803652968036, + "grad_norm": 6.53125, + "learning_rate": 1.437211296704869e-06, + "loss": 1.63268814, + "memory(GiB)": 117.38, + "step": 60300, + "train_speed(iter/s)": 1.636413 + }, + { + "acc": 0.65590053, + "epoch": 1.5298072044647388, + "grad_norm": 4.71875, + "learning_rate": 1.4364756465618167e-06, + "loss": 1.62609615, + "memory(GiB)": 117.38, + "step": 60305, + "train_speed(iter/s)": 1.636427 + }, + { + "acc": 0.67777815, + "epoch": 1.5299340436326738, + "grad_norm": 5.9375, + "learning_rate": 1.4357401531584792e-06, + "loss": 1.53188763, + "memory(GiB)": 117.38, + "step": 60310, + "train_speed(iter/s)": 1.636441 + }, + { + "acc": 0.65823317, + "epoch": 1.5300608828006088, + "grad_norm": 6.59375, + "learning_rate": 1.4350048165272006e-06, + "loss": 1.5599124, + "memory(GiB)": 117.38, + "step": 60315, + "train_speed(iter/s)": 1.636455 + }, + { + "acc": 0.6543376, + "epoch": 1.5301877219685438, + "grad_norm": 6.5, + "learning_rate": 1.4342696367003272e-06, + "loss": 1.66451702, + "memory(GiB)": 117.38, + "step": 60320, + "train_speed(iter/s)": 1.636468 + }, + { + "acc": 0.64522972, + "epoch": 1.5303145611364788, + "grad_norm": 6.65625, + "learning_rate": 1.433534613710193e-06, + "loss": 1.55607014, + "memory(GiB)": 117.38, + "step": 60325, + "train_speed(iter/s)": 1.636482 + }, + { + "acc": 0.66271977, + "epoch": 1.530441400304414, + "grad_norm": 5.625, + "learning_rate": 1.4327997475891331e-06, + "loss": 1.57936897, + "memory(GiB)": 117.38, + "step": 60330, + "train_speed(iter/s)": 1.636496 + }, + { + "acc": 0.65755644, + "epoch": 1.5305682394723492, + "grad_norm": 6.75, + "learning_rate": 1.432065038369465e-06, + "loss": 1.58260489, + "memory(GiB)": 117.38, + "step": 60335, + "train_speed(iter/s)": 1.63651 + }, + { + "acc": 0.66864429, + "epoch": 1.5306950786402842, + "grad_norm": 5.5, + "learning_rate": 1.4313304860835048e-06, + "loss": 1.54122305, + "memory(GiB)": 117.38, + "step": 60340, + "train_speed(iter/s)": 1.636523 + }, + { + "acc": 0.65223794, + "epoch": 1.5308219178082192, + "grad_norm": 5.0625, + "learning_rate": 1.4305960907635641e-06, + "loss": 1.62357731, + "memory(GiB)": 117.38, + "step": 60345, + "train_speed(iter/s)": 1.636537 + }, + { + "acc": 0.66769967, + "epoch": 1.5309487569761542, + "grad_norm": 5.25, + "learning_rate": 1.4298618524419455e-06, + "loss": 1.58717089, + "memory(GiB)": 117.38, + "step": 60350, + "train_speed(iter/s)": 1.636552 + }, + { + "acc": 0.65245843, + "epoch": 1.5310755961440892, + "grad_norm": 5.1875, + "learning_rate": 1.4291277711509388e-06, + "loss": 1.64225197, + "memory(GiB)": 117.38, + "step": 60355, + "train_speed(iter/s)": 1.636566 + }, + { + "acc": 0.6647758, + "epoch": 1.5312024353120244, + "grad_norm": 5.71875, + "learning_rate": 1.428393846922837e-06, + "loss": 1.48412437, + "memory(GiB)": 117.38, + "step": 60360, + "train_speed(iter/s)": 1.636581 + }, + { + "acc": 0.67687397, + "epoch": 1.5313292744799594, + "grad_norm": 5.0625, + "learning_rate": 1.4276600797899199e-06, + "loss": 1.50028172, + "memory(GiB)": 117.38, + "step": 60365, + "train_speed(iter/s)": 1.636595 + }, + { + "acc": 0.66287665, + "epoch": 1.5314561136478946, + "grad_norm": 5.46875, + "learning_rate": 1.426926469784463e-06, + "loss": 1.56222439, + "memory(GiB)": 117.38, + "step": 60370, + "train_speed(iter/s)": 1.636609 + }, + { + "acc": 0.65942359, + "epoch": 1.5315829528158296, + "grad_norm": 7.375, + "learning_rate": 1.42619301693873e-06, + "loss": 1.66098785, + "memory(GiB)": 117.38, + "step": 60375, + "train_speed(iter/s)": 1.636624 + }, + { + "acc": 0.66916437, + "epoch": 1.5317097919837646, + "grad_norm": 5.375, + "learning_rate": 1.4254597212849858e-06, + "loss": 1.54763126, + "memory(GiB)": 117.38, + "step": 60380, + "train_speed(iter/s)": 1.636638 + }, + { + "acc": 0.65305176, + "epoch": 1.5318366311516995, + "grad_norm": 6.0625, + "learning_rate": 1.4247265828554819e-06, + "loss": 1.60305157, + "memory(GiB)": 117.38, + "step": 60385, + "train_speed(iter/s)": 1.636652 + }, + { + "acc": 0.64701223, + "epoch": 1.5319634703196348, + "grad_norm": 6.4375, + "learning_rate": 1.423993601682465e-06, + "loss": 1.64670525, + "memory(GiB)": 117.38, + "step": 60390, + "train_speed(iter/s)": 1.636666 + }, + { + "acc": 0.6553968, + "epoch": 1.5320903094875697, + "grad_norm": 5.46875, + "learning_rate": 1.423260777798176e-06, + "loss": 1.58378267, + "memory(GiB)": 117.38, + "step": 60395, + "train_speed(iter/s)": 1.636681 + }, + { + "acc": 0.65751104, + "epoch": 1.532217148655505, + "grad_norm": 6.78125, + "learning_rate": 1.4225281112348466e-06, + "loss": 1.61481094, + "memory(GiB)": 117.38, + "step": 60400, + "train_speed(iter/s)": 1.636695 + }, + { + "acc": 0.63155832, + "epoch": 1.53234398782344, + "grad_norm": 6.84375, + "learning_rate": 1.421795602024703e-06, + "loss": 1.69914551, + "memory(GiB)": 117.38, + "step": 60405, + "train_speed(iter/s)": 1.636709 + }, + { + "acc": 0.66281881, + "epoch": 1.532470826991375, + "grad_norm": 6.25, + "learning_rate": 1.4210632501999643e-06, + "loss": 1.60844517, + "memory(GiB)": 117.38, + "step": 60410, + "train_speed(iter/s)": 1.636723 + }, + { + "acc": 0.65548601, + "epoch": 1.53259766615931, + "grad_norm": 5.28125, + "learning_rate": 1.4203310557928428e-06, + "loss": 1.58384342, + "memory(GiB)": 117.38, + "step": 60415, + "train_speed(iter/s)": 1.636737 + }, + { + "acc": 0.6365365, + "epoch": 1.532724505327245, + "grad_norm": 5.78125, + "learning_rate": 1.4195990188355435e-06, + "loss": 1.71370163, + "memory(GiB)": 117.38, + "step": 60420, + "train_speed(iter/s)": 1.636752 + }, + { + "acc": 0.65308132, + "epoch": 1.5328513444951801, + "grad_norm": 7.625, + "learning_rate": 1.418867139360265e-06, + "loss": 1.57925014, + "memory(GiB)": 117.38, + "step": 60425, + "train_speed(iter/s)": 1.636765 + }, + { + "acc": 0.6641264, + "epoch": 1.5329781836631153, + "grad_norm": 7.0625, + "learning_rate": 1.418135417399198e-06, + "loss": 1.58189831, + "memory(GiB)": 117.38, + "step": 60430, + "train_speed(iter/s)": 1.63678 + }, + { + "acc": 0.64957228, + "epoch": 1.5331050228310503, + "grad_norm": 7.4375, + "learning_rate": 1.4174038529845273e-06, + "loss": 1.67629395, + "memory(GiB)": 117.38, + "step": 60435, + "train_speed(iter/s)": 1.636795 + }, + { + "acc": 0.64387145, + "epoch": 1.5332318619989853, + "grad_norm": 6.625, + "learning_rate": 1.4166724461484304e-06, + "loss": 1.64619179, + "memory(GiB)": 117.38, + "step": 60440, + "train_speed(iter/s)": 1.63681 + }, + { + "acc": 0.64260406, + "epoch": 1.5333587011669203, + "grad_norm": 4.8125, + "learning_rate": 1.4159411969230758e-06, + "loss": 1.6511631, + "memory(GiB)": 117.38, + "step": 60445, + "train_speed(iter/s)": 1.636824 + }, + { + "acc": 0.65834703, + "epoch": 1.5334855403348553, + "grad_norm": 5.1875, + "learning_rate": 1.4152101053406325e-06, + "loss": 1.5497858, + "memory(GiB)": 117.38, + "step": 60450, + "train_speed(iter/s)": 1.636838 + }, + { + "acc": 0.65695348, + "epoch": 1.5336123795027905, + "grad_norm": 5.59375, + "learning_rate": 1.4144791714332517e-06, + "loss": 1.56492538, + "memory(GiB)": 117.38, + "step": 60455, + "train_speed(iter/s)": 1.636852 + }, + { + "acc": 0.65435057, + "epoch": 1.5337392186707255, + "grad_norm": 5.375, + "learning_rate": 1.4137483952330855e-06, + "loss": 1.60241508, + "memory(GiB)": 117.38, + "step": 60460, + "train_speed(iter/s)": 1.636867 + }, + { + "acc": 0.66194444, + "epoch": 1.5338660578386607, + "grad_norm": 6.96875, + "learning_rate": 1.4130177767722753e-06, + "loss": 1.59549732, + "memory(GiB)": 117.38, + "step": 60465, + "train_speed(iter/s)": 1.636881 + }, + { + "acc": 0.66424203, + "epoch": 1.5339928970065957, + "grad_norm": 5.75, + "learning_rate": 1.4122873160829603e-06, + "loss": 1.5887455, + "memory(GiB)": 117.38, + "step": 60470, + "train_speed(iter/s)": 1.636894 + }, + { + "acc": 0.64726505, + "epoch": 1.5341197361745307, + "grad_norm": 4.65625, + "learning_rate": 1.4115570131972655e-06, + "loss": 1.62380295, + "memory(GiB)": 117.38, + "step": 60475, + "train_speed(iter/s)": 1.636909 + }, + { + "acc": 0.66210413, + "epoch": 1.5342465753424657, + "grad_norm": 5.21875, + "learning_rate": 1.4108268681473136e-06, + "loss": 1.61364632, + "memory(GiB)": 117.38, + "step": 60480, + "train_speed(iter/s)": 1.636923 + }, + { + "acc": 0.66844954, + "epoch": 1.5343734145104007, + "grad_norm": 9.0, + "learning_rate": 1.4100968809652215e-06, + "loss": 1.62208824, + "memory(GiB)": 117.38, + "step": 60485, + "train_speed(iter/s)": 1.636936 + }, + { + "acc": 0.66008363, + "epoch": 1.5345002536783359, + "grad_norm": 6.9375, + "learning_rate": 1.4093670516830982e-06, + "loss": 1.60053558, + "memory(GiB)": 117.38, + "step": 60490, + "train_speed(iter/s)": 1.636951 + }, + { + "acc": 0.6450129, + "epoch": 1.534627092846271, + "grad_norm": 6.28125, + "learning_rate": 1.4086373803330417e-06, + "loss": 1.60832767, + "memory(GiB)": 117.38, + "step": 60495, + "train_speed(iter/s)": 1.636965 + }, + { + "acc": 0.65314512, + "epoch": 1.534753932014206, + "grad_norm": 6.15625, + "learning_rate": 1.4079078669471457e-06, + "loss": 1.64581146, + "memory(GiB)": 117.38, + "step": 60500, + "train_speed(iter/s)": 1.636979 + }, + { + "acc": 0.65411048, + "epoch": 1.534880771182141, + "grad_norm": 6.15625, + "learning_rate": 1.4071785115575005e-06, + "loss": 1.56350098, + "memory(GiB)": 117.38, + "step": 60505, + "train_speed(iter/s)": 1.636993 + }, + { + "acc": 0.67226439, + "epoch": 1.535007610350076, + "grad_norm": 7.125, + "learning_rate": 1.4064493141961872e-06, + "loss": 1.5487051, + "memory(GiB)": 117.38, + "step": 60510, + "train_speed(iter/s)": 1.637007 + }, + { + "acc": 0.67400651, + "epoch": 1.535134449518011, + "grad_norm": 6.96875, + "learning_rate": 1.4057202748952736e-06, + "loss": 1.58081856, + "memory(GiB)": 117.38, + "step": 60515, + "train_speed(iter/s)": 1.637022 + }, + { + "acc": 0.66180763, + "epoch": 1.5352612886859462, + "grad_norm": 6.90625, + "learning_rate": 1.4049913936868314e-06, + "loss": 1.55781975, + "memory(GiB)": 117.38, + "step": 60520, + "train_speed(iter/s)": 1.637036 + }, + { + "acc": 0.64300485, + "epoch": 1.5353881278538812, + "grad_norm": 5.8125, + "learning_rate": 1.4042626706029184e-06, + "loss": 1.63914547, + "memory(GiB)": 117.38, + "step": 60525, + "train_speed(iter/s)": 1.63705 + }, + { + "acc": 0.6416573, + "epoch": 1.5355149670218164, + "grad_norm": 6.03125, + "learning_rate": 1.4035341056755864e-06, + "loss": 1.64014778, + "memory(GiB)": 117.38, + "step": 60530, + "train_speed(iter/s)": 1.637064 + }, + { + "acc": 0.67381473, + "epoch": 1.5356418061897514, + "grad_norm": 6.4375, + "learning_rate": 1.402805698936882e-06, + "loss": 1.56713543, + "memory(GiB)": 117.38, + "step": 60535, + "train_speed(iter/s)": 1.637078 + }, + { + "acc": 0.65821733, + "epoch": 1.5357686453576864, + "grad_norm": 5.1875, + "learning_rate": 1.4020774504188428e-06, + "loss": 1.6099659, + "memory(GiB)": 117.38, + "step": 60540, + "train_speed(iter/s)": 1.637093 + }, + { + "acc": 0.65900126, + "epoch": 1.5358954845256214, + "grad_norm": 5.625, + "learning_rate": 1.4013493601535016e-06, + "loss": 1.60410881, + "memory(GiB)": 117.38, + "step": 60545, + "train_speed(iter/s)": 1.637107 + }, + { + "acc": 0.6433095, + "epoch": 1.5360223236935566, + "grad_norm": 9.6875, + "learning_rate": 1.400621428172882e-06, + "loss": 1.68636627, + "memory(GiB)": 117.38, + "step": 60550, + "train_speed(iter/s)": 1.637121 + }, + { + "acc": 0.65696101, + "epoch": 1.5361491628614916, + "grad_norm": 6.875, + "learning_rate": 1.399893654509002e-06, + "loss": 1.64220657, + "memory(GiB)": 117.38, + "step": 60555, + "train_speed(iter/s)": 1.637136 + }, + { + "acc": 0.64825425, + "epoch": 1.5362760020294268, + "grad_norm": 5.40625, + "learning_rate": 1.3991660391938721e-06, + "loss": 1.62121353, + "memory(GiB)": 117.38, + "step": 60560, + "train_speed(iter/s)": 1.63715 + }, + { + "acc": 0.65843449, + "epoch": 1.5364028411973618, + "grad_norm": 7.1875, + "learning_rate": 1.398438582259497e-06, + "loss": 1.61310921, + "memory(GiB)": 117.38, + "step": 60565, + "train_speed(iter/s)": 1.637165 + }, + { + "acc": 0.65139036, + "epoch": 1.5365296803652968, + "grad_norm": 6.375, + "learning_rate": 1.3977112837378726e-06, + "loss": 1.60211563, + "memory(GiB)": 117.38, + "step": 60570, + "train_speed(iter/s)": 1.63718 + }, + { + "acc": 0.64712067, + "epoch": 1.5366565195332318, + "grad_norm": 5.59375, + "learning_rate": 1.3969841436609888e-06, + "loss": 1.6357235, + "memory(GiB)": 117.38, + "step": 60575, + "train_speed(iter/s)": 1.637194 + }, + { + "acc": 0.6533452, + "epoch": 1.5367833587011668, + "grad_norm": 4.90625, + "learning_rate": 1.396257162060829e-06, + "loss": 1.58391037, + "memory(GiB)": 117.38, + "step": 60580, + "train_speed(iter/s)": 1.637208 + }, + { + "acc": 0.63941755, + "epoch": 1.536910197869102, + "grad_norm": 4.625, + "learning_rate": 1.395530338969367e-06, + "loss": 1.67452774, + "memory(GiB)": 117.38, + "step": 60585, + "train_speed(iter/s)": 1.637222 + }, + { + "acc": 0.64957657, + "epoch": 1.5370370370370372, + "grad_norm": 5.5, + "learning_rate": 1.3948036744185767e-06, + "loss": 1.59090881, + "memory(GiB)": 117.38, + "step": 60590, + "train_speed(iter/s)": 1.637237 + }, + { + "acc": 0.64799085, + "epoch": 1.5371638762049722, + "grad_norm": 6.46875, + "learning_rate": 1.3940771684404153e-06, + "loss": 1.56088428, + "memory(GiB)": 117.38, + "step": 60595, + "train_speed(iter/s)": 1.637251 + }, + { + "acc": 0.66535153, + "epoch": 1.5372907153729072, + "grad_norm": 5.5, + "learning_rate": 1.393350821066839e-06, + "loss": 1.53039045, + "memory(GiB)": 117.38, + "step": 60600, + "train_speed(iter/s)": 1.637264 + }, + { + "acc": 0.67648168, + "epoch": 1.5374175545408422, + "grad_norm": 5.40625, + "learning_rate": 1.3926246323297948e-06, + "loss": 1.56906986, + "memory(GiB)": 117.38, + "step": 60605, + "train_speed(iter/s)": 1.637278 + }, + { + "acc": 0.6554822, + "epoch": 1.5375443937087772, + "grad_norm": 5.71875, + "learning_rate": 1.3918986022612285e-06, + "loss": 1.53117218, + "memory(GiB)": 117.38, + "step": 60610, + "train_speed(iter/s)": 1.637293 + }, + { + "acc": 0.66043124, + "epoch": 1.5376712328767124, + "grad_norm": 6.78125, + "learning_rate": 1.3911727308930684e-06, + "loss": 1.61840401, + "memory(GiB)": 117.38, + "step": 60615, + "train_speed(iter/s)": 1.637306 + }, + { + "acc": 0.64953861, + "epoch": 1.5377980720446474, + "grad_norm": 6.65625, + "learning_rate": 1.3904470182572428e-06, + "loss": 1.60523891, + "memory(GiB)": 117.38, + "step": 60620, + "train_speed(iter/s)": 1.637319 + }, + { + "acc": 0.6532434, + "epoch": 1.5379249112125826, + "grad_norm": 5.9375, + "learning_rate": 1.3897214643856744e-06, + "loss": 1.67258987, + "memory(GiB)": 117.38, + "step": 60625, + "train_speed(iter/s)": 1.637333 + }, + { + "acc": 0.67551403, + "epoch": 1.5380517503805176, + "grad_norm": 7.625, + "learning_rate": 1.388996069310276e-06, + "loss": 1.53150196, + "memory(GiB)": 117.38, + "step": 60630, + "train_speed(iter/s)": 1.637345 + }, + { + "acc": 0.65537319, + "epoch": 1.5381785895484525, + "grad_norm": 5.34375, + "learning_rate": 1.3882708330629514e-06, + "loss": 1.6425642, + "memory(GiB)": 117.38, + "step": 60635, + "train_speed(iter/s)": 1.63736 + }, + { + "acc": 0.65168991, + "epoch": 1.5383054287163875, + "grad_norm": 5.21875, + "learning_rate": 1.3875457556755989e-06, + "loss": 1.61624603, + "memory(GiB)": 117.38, + "step": 60640, + "train_speed(iter/s)": 1.637375 + }, + { + "acc": 0.65625429, + "epoch": 1.5384322678843225, + "grad_norm": 6.0, + "learning_rate": 1.386820837180114e-06, + "loss": 1.56737709, + "memory(GiB)": 117.38, + "step": 60645, + "train_speed(iter/s)": 1.637388 + }, + { + "acc": 0.66664801, + "epoch": 1.5385591070522577, + "grad_norm": 6.0, + "learning_rate": 1.386096077608382e-06, + "loss": 1.56442862, + "memory(GiB)": 117.38, + "step": 60650, + "train_speed(iter/s)": 1.637401 + }, + { + "acc": 0.65851536, + "epoch": 1.538685946220193, + "grad_norm": 5.3125, + "learning_rate": 1.385371476992276e-06, + "loss": 1.5890173, + "memory(GiB)": 117.38, + "step": 60655, + "train_speed(iter/s)": 1.637415 + }, + { + "acc": 0.6478879, + "epoch": 1.538812785388128, + "grad_norm": 5.8125, + "learning_rate": 1.3846470353636726e-06, + "loss": 1.63984299, + "memory(GiB)": 117.38, + "step": 60660, + "train_speed(iter/s)": 1.637429 + }, + { + "acc": 0.6499064, + "epoch": 1.538939624556063, + "grad_norm": 4.84375, + "learning_rate": 1.3839227527544336e-06, + "loss": 1.61538315, + "memory(GiB)": 117.38, + "step": 60665, + "train_speed(iter/s)": 1.637443 + }, + { + "acc": 0.6608439, + "epoch": 1.539066463723998, + "grad_norm": 5.4375, + "learning_rate": 1.3831986291964184e-06, + "loss": 1.5654541, + "memory(GiB)": 117.38, + "step": 60670, + "train_speed(iter/s)": 1.637457 + }, + { + "acc": 0.65856328, + "epoch": 1.539193302891933, + "grad_norm": 6.5, + "learning_rate": 1.382474664721472e-06, + "loss": 1.61815567, + "memory(GiB)": 117.38, + "step": 60675, + "train_speed(iter/s)": 1.63747 + }, + { + "acc": 0.65965481, + "epoch": 1.5393201420598681, + "grad_norm": 5.96875, + "learning_rate": 1.3817508593614425e-06, + "loss": 1.5282938, + "memory(GiB)": 117.38, + "step": 60680, + "train_speed(iter/s)": 1.637484 + }, + { + "acc": 0.65463648, + "epoch": 1.539446981227803, + "grad_norm": 5.28125, + "learning_rate": 1.381027213148165e-06, + "loss": 1.63583755, + "memory(GiB)": 117.38, + "step": 60685, + "train_speed(iter/s)": 1.637498 + }, + { + "acc": 0.64780493, + "epoch": 1.5395738203957383, + "grad_norm": 6.9375, + "learning_rate": 1.3803037261134678e-06, + "loss": 1.61269207, + "memory(GiB)": 117.38, + "step": 60690, + "train_speed(iter/s)": 1.637513 + }, + { + "acc": 0.65708666, + "epoch": 1.5397006595636733, + "grad_norm": 6.0625, + "learning_rate": 1.3795803982891736e-06, + "loss": 1.61737595, + "memory(GiB)": 117.38, + "step": 60695, + "train_speed(iter/s)": 1.637525 + }, + { + "acc": 0.65703225, + "epoch": 1.5398274987316083, + "grad_norm": 5.4375, + "learning_rate": 1.3788572297070974e-06, + "loss": 1.56723289, + "memory(GiB)": 117.38, + "step": 60700, + "train_speed(iter/s)": 1.63754 + }, + { + "acc": 0.66658363, + "epoch": 1.5399543378995433, + "grad_norm": 4.8125, + "learning_rate": 1.3781342203990478e-06, + "loss": 1.57033339, + "memory(GiB)": 117.38, + "step": 60705, + "train_speed(iter/s)": 1.637553 + }, + { + "acc": 0.65937982, + "epoch": 1.5400811770674785, + "grad_norm": 5.84375, + "learning_rate": 1.3774113703968255e-06, + "loss": 1.62580185, + "memory(GiB)": 117.38, + "step": 60710, + "train_speed(iter/s)": 1.637567 + }, + { + "acc": 0.6533318, + "epoch": 1.5402080162354135, + "grad_norm": 5.9375, + "learning_rate": 1.3766886797322248e-06, + "loss": 1.56688538, + "memory(GiB)": 117.38, + "step": 60715, + "train_speed(iter/s)": 1.637581 + }, + { + "acc": 0.67758055, + "epoch": 1.5403348554033487, + "grad_norm": 4.65625, + "learning_rate": 1.3759661484370324e-06, + "loss": 1.50598564, + "memory(GiB)": 117.38, + "step": 60720, + "train_speed(iter/s)": 1.637594 + }, + { + "acc": 0.66571622, + "epoch": 1.5404616945712837, + "grad_norm": 6.65625, + "learning_rate": 1.3752437765430294e-06, + "loss": 1.55234203, + "memory(GiB)": 117.38, + "step": 60725, + "train_speed(iter/s)": 1.637609 + }, + { + "acc": 0.66015892, + "epoch": 1.5405885337392187, + "grad_norm": 8.0625, + "learning_rate": 1.3745215640819886e-06, + "loss": 1.59730568, + "memory(GiB)": 117.38, + "step": 60730, + "train_speed(iter/s)": 1.637623 + }, + { + "acc": 0.67173948, + "epoch": 1.5407153729071537, + "grad_norm": 6.03125, + "learning_rate": 1.373799511085676e-06, + "loss": 1.5125762, + "memory(GiB)": 117.38, + "step": 60735, + "train_speed(iter/s)": 1.637637 + }, + { + "acc": 0.65377264, + "epoch": 1.5408422120750886, + "grad_norm": 5.34375, + "learning_rate": 1.3730776175858506e-06, + "loss": 1.63221302, + "memory(GiB)": 117.38, + "step": 60740, + "train_speed(iter/s)": 1.63765 + }, + { + "acc": 0.66715221, + "epoch": 1.5409690512430239, + "grad_norm": 5.78125, + "learning_rate": 1.3723558836142631e-06, + "loss": 1.58224907, + "memory(GiB)": 117.38, + "step": 60745, + "train_speed(iter/s)": 1.637664 + }, + { + "acc": 0.6688014, + "epoch": 1.541095890410959, + "grad_norm": 4.8125, + "learning_rate": 1.371634309202663e-06, + "loss": 1.55853424, + "memory(GiB)": 117.38, + "step": 60750, + "train_speed(iter/s)": 1.637678 + }, + { + "acc": 0.65355935, + "epoch": 1.541222729578894, + "grad_norm": 6.28125, + "learning_rate": 1.3709128943827842e-06, + "loss": 1.62020283, + "memory(GiB)": 117.38, + "step": 60755, + "train_speed(iter/s)": 1.637691 + }, + { + "acc": 0.63813562, + "epoch": 1.541349568746829, + "grad_norm": 6.03125, + "learning_rate": 1.3701916391863573e-06, + "loss": 1.64351025, + "memory(GiB)": 117.38, + "step": 60760, + "train_speed(iter/s)": 1.637706 + }, + { + "acc": 0.64319868, + "epoch": 1.541476407914764, + "grad_norm": 6.09375, + "learning_rate": 1.3694705436451093e-06, + "loss": 1.62536964, + "memory(GiB)": 117.38, + "step": 60765, + "train_speed(iter/s)": 1.63772 + }, + { + "acc": 0.6715888, + "epoch": 1.541603247082699, + "grad_norm": 5.84375, + "learning_rate": 1.368749607790758e-06, + "loss": 1.51326818, + "memory(GiB)": 117.38, + "step": 60770, + "train_speed(iter/s)": 1.637733 + }, + { + "acc": 0.66085086, + "epoch": 1.5417300862506342, + "grad_norm": 5.625, + "learning_rate": 1.3680288316550095e-06, + "loss": 1.55030251, + "memory(GiB)": 117.38, + "step": 60775, + "train_speed(iter/s)": 1.637747 + }, + { + "acc": 0.64841723, + "epoch": 1.5418569254185692, + "grad_norm": 6.90625, + "learning_rate": 1.3673082152695672e-06, + "loss": 1.66748238, + "memory(GiB)": 117.38, + "step": 60780, + "train_speed(iter/s)": 1.637761 + }, + { + "acc": 0.63058891, + "epoch": 1.5419837645865044, + "grad_norm": 4.96875, + "learning_rate": 1.3665877586661296e-06, + "loss": 1.67498512, + "memory(GiB)": 117.38, + "step": 60785, + "train_speed(iter/s)": 1.637775 + }, + { + "acc": 0.66198859, + "epoch": 1.5421106037544394, + "grad_norm": 5.5, + "learning_rate": 1.3658674618763862e-06, + "loss": 1.57020464, + "memory(GiB)": 117.38, + "step": 60790, + "train_speed(iter/s)": 1.637788 + }, + { + "acc": 0.64411054, + "epoch": 1.5422374429223744, + "grad_norm": 6.65625, + "learning_rate": 1.365147324932014e-06, + "loss": 1.59758244, + "memory(GiB)": 117.38, + "step": 60795, + "train_speed(iter/s)": 1.637802 + }, + { + "acc": 0.64423852, + "epoch": 1.5423642820903094, + "grad_norm": 6.53125, + "learning_rate": 1.3644273478646925e-06, + "loss": 1.68765011, + "memory(GiB)": 117.38, + "step": 60800, + "train_speed(iter/s)": 1.637816 + }, + { + "acc": 0.65495815, + "epoch": 1.5424911212582444, + "grad_norm": 5.75, + "learning_rate": 1.3637075307060877e-06, + "loss": 1.62945271, + "memory(GiB)": 117.38, + "step": 60805, + "train_speed(iter/s)": 1.63783 + }, + { + "acc": 0.66216636, + "epoch": 1.5426179604261796, + "grad_norm": 5.1875, + "learning_rate": 1.362987873487862e-06, + "loss": 1.5814949, + "memory(GiB)": 117.38, + "step": 60810, + "train_speed(iter/s)": 1.637845 + }, + { + "acc": 0.6472908, + "epoch": 1.5427447995941148, + "grad_norm": 5.4375, + "learning_rate": 1.362268376241665e-06, + "loss": 1.59065418, + "memory(GiB)": 117.38, + "step": 60815, + "train_speed(iter/s)": 1.637858 + }, + { + "acc": 0.63248982, + "epoch": 1.5428716387620498, + "grad_norm": 7.28125, + "learning_rate": 1.3615490389991476e-06, + "loss": 1.71146088, + "memory(GiB)": 117.38, + "step": 60820, + "train_speed(iter/s)": 1.637872 + }, + { + "acc": 0.65516539, + "epoch": 1.5429984779299848, + "grad_norm": 5.34375, + "learning_rate": 1.360829861791948e-06, + "loss": 1.57450037, + "memory(GiB)": 117.38, + "step": 60825, + "train_speed(iter/s)": 1.637886 + }, + { + "acc": 0.6589664, + "epoch": 1.5431253170979198, + "grad_norm": 6.46875, + "learning_rate": 1.3601108446516985e-06, + "loss": 1.6026823, + "memory(GiB)": 117.38, + "step": 60830, + "train_speed(iter/s)": 1.6379 + }, + { + "acc": 0.67016191, + "epoch": 1.5432521562658548, + "grad_norm": 5.875, + "learning_rate": 1.3593919876100254e-06, + "loss": 1.54794121, + "memory(GiB)": 117.38, + "step": 60835, + "train_speed(iter/s)": 1.637914 + }, + { + "acc": 0.64960527, + "epoch": 1.54337899543379, + "grad_norm": 6.15625, + "learning_rate": 1.3586732906985467e-06, + "loss": 1.63107777, + "memory(GiB)": 117.38, + "step": 60840, + "train_speed(iter/s)": 1.637928 + }, + { + "acc": 0.66171756, + "epoch": 1.543505834601725, + "grad_norm": 6.15625, + "learning_rate": 1.357954753948874e-06, + "loss": 1.58175297, + "memory(GiB)": 117.38, + "step": 60845, + "train_speed(iter/s)": 1.637942 + }, + { + "acc": 0.68012829, + "epoch": 1.5436326737696602, + "grad_norm": 5.375, + "learning_rate": 1.3572363773926117e-06, + "loss": 1.52260113, + "memory(GiB)": 117.38, + "step": 60850, + "train_speed(iter/s)": 1.637955 + }, + { + "acc": 0.66428514, + "epoch": 1.5437595129375952, + "grad_norm": 7.0, + "learning_rate": 1.3565181610613571e-06, + "loss": 1.61247501, + "memory(GiB)": 117.38, + "step": 60855, + "train_speed(iter/s)": 1.637968 + }, + { + "acc": 0.65341148, + "epoch": 1.5438863521055302, + "grad_norm": 6.3125, + "learning_rate": 1.3558001049867008e-06, + "loss": 1.58205357, + "memory(GiB)": 117.38, + "step": 60860, + "train_speed(iter/s)": 1.637981 + }, + { + "acc": 0.66341696, + "epoch": 1.5440131912734651, + "grad_norm": 5.375, + "learning_rate": 1.3550822092002264e-06, + "loss": 1.57977371, + "memory(GiB)": 117.38, + "step": 60865, + "train_speed(iter/s)": 1.637995 + }, + { + "acc": 0.66312504, + "epoch": 1.5441400304414004, + "grad_norm": 4.6875, + "learning_rate": 1.3543644737335099e-06, + "loss": 1.50969276, + "memory(GiB)": 117.38, + "step": 60870, + "train_speed(iter/s)": 1.638008 + }, + { + "acc": 0.64081087, + "epoch": 1.5442668696093353, + "grad_norm": 6.28125, + "learning_rate": 1.35364689861812e-06, + "loss": 1.63472366, + "memory(GiB)": 117.38, + "step": 60875, + "train_speed(iter/s)": 1.638023 + }, + { + "acc": 0.67057085, + "epoch": 1.5443937087772706, + "grad_norm": 6.53125, + "learning_rate": 1.3529294838856194e-06, + "loss": 1.56202316, + "memory(GiB)": 117.38, + "step": 60880, + "train_speed(iter/s)": 1.638037 + }, + { + "acc": 0.64279742, + "epoch": 1.5445205479452055, + "grad_norm": 6.25, + "learning_rate": 1.3522122295675616e-06, + "loss": 1.59809437, + "memory(GiB)": 117.38, + "step": 60885, + "train_speed(iter/s)": 1.638051 + }, + { + "acc": 0.65396233, + "epoch": 1.5446473871131405, + "grad_norm": 7.0625, + "learning_rate": 1.351495135695499e-06, + "loss": 1.61423817, + "memory(GiB)": 117.38, + "step": 60890, + "train_speed(iter/s)": 1.638065 + }, + { + "acc": 0.6627183, + "epoch": 1.5447742262810755, + "grad_norm": 5.8125, + "learning_rate": 1.3507782023009692e-06, + "loss": 1.58792992, + "memory(GiB)": 117.38, + "step": 60895, + "train_speed(iter/s)": 1.638079 + }, + { + "acc": 0.66423535, + "epoch": 1.5449010654490105, + "grad_norm": 5.15625, + "learning_rate": 1.3500614294155056e-06, + "loss": 1.57492504, + "memory(GiB)": 117.38, + "step": 60900, + "train_speed(iter/s)": 1.638092 + }, + { + "acc": 0.67106886, + "epoch": 1.5450279046169457, + "grad_norm": 5.625, + "learning_rate": 1.3493448170706347e-06, + "loss": 1.60399418, + "memory(GiB)": 117.38, + "step": 60905, + "train_speed(iter/s)": 1.638105 + }, + { + "acc": 0.65734005, + "epoch": 1.545154743784881, + "grad_norm": 5.25, + "learning_rate": 1.348628365297881e-06, + "loss": 1.59561119, + "memory(GiB)": 117.38, + "step": 60910, + "train_speed(iter/s)": 1.63812 + }, + { + "acc": 0.65490484, + "epoch": 1.545281582952816, + "grad_norm": 6.03125, + "learning_rate": 1.3479120741287526e-06, + "loss": 1.59032516, + "memory(GiB)": 117.38, + "step": 60915, + "train_speed(iter/s)": 1.638134 + }, + { + "acc": 0.65446835, + "epoch": 1.545408422120751, + "grad_norm": 5.40625, + "learning_rate": 1.3471959435947552e-06, + "loss": 1.66792221, + "memory(GiB)": 117.38, + "step": 60920, + "train_speed(iter/s)": 1.638147 + }, + { + "acc": 0.64600048, + "epoch": 1.545535261288686, + "grad_norm": 5.1875, + "learning_rate": 1.3464799737273898e-06, + "loss": 1.63425903, + "memory(GiB)": 117.38, + "step": 60925, + "train_speed(iter/s)": 1.63816 + }, + { + "acc": 0.65153418, + "epoch": 1.545662100456621, + "grad_norm": 6.1875, + "learning_rate": 1.3457641645581487e-06, + "loss": 1.60337391, + "memory(GiB)": 117.38, + "step": 60930, + "train_speed(iter/s)": 1.638174 + }, + { + "acc": 0.64624863, + "epoch": 1.545788939624556, + "grad_norm": 5.84375, + "learning_rate": 1.3450485161185133e-06, + "loss": 1.64083881, + "memory(GiB)": 117.38, + "step": 60935, + "train_speed(iter/s)": 1.638187 + }, + { + "acc": 0.65853739, + "epoch": 1.545915778792491, + "grad_norm": 7.6875, + "learning_rate": 1.344333028439961e-06, + "loss": 1.56394987, + "memory(GiB)": 117.38, + "step": 60940, + "train_speed(iter/s)": 1.638201 + }, + { + "acc": 0.67052164, + "epoch": 1.5460426179604263, + "grad_norm": 5.8125, + "learning_rate": 1.3436177015539647e-06, + "loss": 1.54976892, + "memory(GiB)": 117.38, + "step": 60945, + "train_speed(iter/s)": 1.638216 + }, + { + "acc": 0.67395668, + "epoch": 1.5461694571283613, + "grad_norm": 6.15625, + "learning_rate": 1.3429025354919877e-06, + "loss": 1.548804, + "memory(GiB)": 117.38, + "step": 60950, + "train_speed(iter/s)": 1.638229 + }, + { + "acc": 0.64560823, + "epoch": 1.5462962962962963, + "grad_norm": 6.09375, + "learning_rate": 1.3421875302854826e-06, + "loss": 1.62121315, + "memory(GiB)": 117.38, + "step": 60955, + "train_speed(iter/s)": 1.638243 + }, + { + "acc": 0.64463673, + "epoch": 1.5464231354642313, + "grad_norm": 5.625, + "learning_rate": 1.3414726859659016e-06, + "loss": 1.62692986, + "memory(GiB)": 117.38, + "step": 60960, + "train_speed(iter/s)": 1.638257 + }, + { + "acc": 0.63233428, + "epoch": 1.5465499746321663, + "grad_norm": 5.5625, + "learning_rate": 1.3407580025646866e-06, + "loss": 1.6654583, + "memory(GiB)": 117.38, + "step": 60965, + "train_speed(iter/s)": 1.638271 + }, + { + "acc": 0.64950213, + "epoch": 1.5466768138001015, + "grad_norm": 7.8125, + "learning_rate": 1.3400434801132716e-06, + "loss": 1.63694344, + "memory(GiB)": 117.38, + "step": 60970, + "train_speed(iter/s)": 1.638286 + }, + { + "acc": 0.65419989, + "epoch": 1.5468036529680367, + "grad_norm": 5.375, + "learning_rate": 1.3393291186430852e-06, + "loss": 1.61325874, + "memory(GiB)": 117.38, + "step": 60975, + "train_speed(iter/s)": 1.638299 + }, + { + "acc": 0.64392223, + "epoch": 1.5469304921359717, + "grad_norm": 10.625, + "learning_rate": 1.338614918185548e-06, + "loss": 1.6487709, + "memory(GiB)": 117.38, + "step": 60980, + "train_speed(iter/s)": 1.638313 + }, + { + "acc": 0.66089287, + "epoch": 1.5470573313039067, + "grad_norm": 4.90625, + "learning_rate": 1.3379008787720732e-06, + "loss": 1.59338989, + "memory(GiB)": 117.38, + "step": 60985, + "train_speed(iter/s)": 1.638326 + }, + { + "acc": 0.66049089, + "epoch": 1.5471841704718416, + "grad_norm": 6.15625, + "learning_rate": 1.3371870004340681e-06, + "loss": 1.61071091, + "memory(GiB)": 117.38, + "step": 60990, + "train_speed(iter/s)": 1.638341 + }, + { + "acc": 0.66481338, + "epoch": 1.5473110096397766, + "grad_norm": 5.90625, + "learning_rate": 1.3364732832029315e-06, + "loss": 1.57785244, + "memory(GiB)": 117.38, + "step": 60995, + "train_speed(iter/s)": 1.638355 + }, + { + "acc": 0.66319914, + "epoch": 1.5474378488077118, + "grad_norm": 5.6875, + "learning_rate": 1.335759727110057e-06, + "loss": 1.56670542, + "memory(GiB)": 117.38, + "step": 61000, + "train_speed(iter/s)": 1.63837 + }, + { + "epoch": 1.5474378488077118, + "eval_acc": 0.6462568157090739, + "eval_loss": 1.5732755661010742, + "eval_runtime": 58.4487, + "eval_samples_per_second": 108.985, + "eval_steps_per_second": 27.255, + "step": 61000 + }, + { + "acc": 0.65179029, + "epoch": 1.5475646879756468, + "grad_norm": 6.8125, + "learning_rate": 1.335046332186829e-06, + "loss": 1.62582741, + "memory(GiB)": 117.38, + "step": 61005, + "train_speed(iter/s)": 1.635626 + }, + { + "acc": 0.66695957, + "epoch": 1.547691527143582, + "grad_norm": 5.8125, + "learning_rate": 1.3343330984646262e-06, + "loss": 1.54848003, + "memory(GiB)": 117.38, + "step": 61010, + "train_speed(iter/s)": 1.63564 + }, + { + "acc": 0.67391047, + "epoch": 1.547818366311517, + "grad_norm": 5.09375, + "learning_rate": 1.33362002597482e-06, + "loss": 1.59511576, + "memory(GiB)": 117.38, + "step": 61015, + "train_speed(iter/s)": 1.635653 + }, + { + "acc": 0.64972095, + "epoch": 1.547945205479452, + "grad_norm": 5.53125, + "learning_rate": 1.3329071147487743e-06, + "loss": 1.62496414, + "memory(GiB)": 117.38, + "step": 61020, + "train_speed(iter/s)": 1.635665 + }, + { + "acc": 0.65210514, + "epoch": 1.548072044647387, + "grad_norm": 5.75, + "learning_rate": 1.3321943648178442e-06, + "loss": 1.5740551, + "memory(GiB)": 117.38, + "step": 61025, + "train_speed(iter/s)": 1.63568 + }, + { + "acc": 0.64864464, + "epoch": 1.5481988838153222, + "grad_norm": 5.625, + "learning_rate": 1.3314817762133848e-06, + "loss": 1.63391151, + "memory(GiB)": 117.38, + "step": 61030, + "train_speed(iter/s)": 1.635692 + }, + { + "acc": 0.66745377, + "epoch": 1.5483257229832572, + "grad_norm": 5.84375, + "learning_rate": 1.330769348966734e-06, + "loss": 1.61501598, + "memory(GiB)": 117.38, + "step": 61035, + "train_speed(iter/s)": 1.635705 + }, + { + "acc": 0.65837078, + "epoch": 1.5484525621511924, + "grad_norm": 5.8125, + "learning_rate": 1.3300570831092292e-06, + "loss": 1.61278992, + "memory(GiB)": 117.38, + "step": 61040, + "train_speed(iter/s)": 1.635718 + }, + { + "acc": 0.67180367, + "epoch": 1.5485794013191274, + "grad_norm": 8.125, + "learning_rate": 1.3293449786721973e-06, + "loss": 1.49866285, + "memory(GiB)": 117.38, + "step": 61045, + "train_speed(iter/s)": 1.635732 + }, + { + "acc": 0.65392499, + "epoch": 1.5487062404870624, + "grad_norm": 6.03125, + "learning_rate": 1.3286330356869648e-06, + "loss": 1.63412418, + "memory(GiB)": 117.38, + "step": 61050, + "train_speed(iter/s)": 1.635746 + }, + { + "acc": 0.64719868, + "epoch": 1.5488330796549974, + "grad_norm": 6.28125, + "learning_rate": 1.3279212541848413e-06, + "loss": 1.63076973, + "memory(GiB)": 117.38, + "step": 61055, + "train_speed(iter/s)": 1.63576 + }, + { + "acc": 0.64747467, + "epoch": 1.5489599188229324, + "grad_norm": 7.25, + "learning_rate": 1.3272096341971342e-06, + "loss": 1.60763969, + "memory(GiB)": 117.38, + "step": 61060, + "train_speed(iter/s)": 1.635774 + }, + { + "acc": 0.65634375, + "epoch": 1.5490867579908676, + "grad_norm": 7.59375, + "learning_rate": 1.326498175755147e-06, + "loss": 1.64439659, + "memory(GiB)": 117.38, + "step": 61065, + "train_speed(iter/s)": 1.635788 + }, + { + "acc": 0.66577773, + "epoch": 1.5492135971588028, + "grad_norm": 7.71875, + "learning_rate": 1.3257868788901722e-06, + "loss": 1.61822376, + "memory(GiB)": 117.38, + "step": 61070, + "train_speed(iter/s)": 1.635801 + }, + { + "acc": 0.65768957, + "epoch": 1.5493404363267378, + "grad_norm": 5.3125, + "learning_rate": 1.3250757436334932e-06, + "loss": 1.57822151, + "memory(GiB)": 117.38, + "step": 61075, + "train_speed(iter/s)": 1.635814 + }, + { + "acc": 0.66250062, + "epoch": 1.5494672754946728, + "grad_norm": 6.4375, + "learning_rate": 1.3243647700163887e-06, + "loss": 1.56040421, + "memory(GiB)": 117.38, + "step": 61080, + "train_speed(iter/s)": 1.635829 + }, + { + "acc": 0.65652628, + "epoch": 1.5495941146626078, + "grad_norm": 5.28125, + "learning_rate": 1.323653958070134e-06, + "loss": 1.61688766, + "memory(GiB)": 117.38, + "step": 61085, + "train_speed(iter/s)": 1.635841 + }, + { + "acc": 0.65840831, + "epoch": 1.5497209538305428, + "grad_norm": 6.90625, + "learning_rate": 1.3229433078259928e-06, + "loss": 1.57728958, + "memory(GiB)": 117.38, + "step": 61090, + "train_speed(iter/s)": 1.635856 + }, + { + "acc": 0.64791031, + "epoch": 1.549847792998478, + "grad_norm": 5.125, + "learning_rate": 1.3222328193152195e-06, + "loss": 1.56909704, + "memory(GiB)": 117.38, + "step": 61095, + "train_speed(iter/s)": 1.63587 + }, + { + "acc": 0.66923327, + "epoch": 1.549974632166413, + "grad_norm": 5.84375, + "learning_rate": 1.3215224925690683e-06, + "loss": 1.63195572, + "memory(GiB)": 117.38, + "step": 61100, + "train_speed(iter/s)": 1.635884 + }, + { + "acc": 0.67461915, + "epoch": 1.5501014713343482, + "grad_norm": 5.15625, + "learning_rate": 1.3208123276187807e-06, + "loss": 1.54255314, + "memory(GiB)": 117.38, + "step": 61105, + "train_speed(iter/s)": 1.635898 + }, + { + "acc": 0.65144835, + "epoch": 1.5502283105022832, + "grad_norm": 4.9375, + "learning_rate": 1.3201023244955952e-06, + "loss": 1.61530056, + "memory(GiB)": 117.38, + "step": 61110, + "train_speed(iter/s)": 1.63591 + }, + { + "acc": 0.66974187, + "epoch": 1.5503551496702181, + "grad_norm": 4.9375, + "learning_rate": 1.319392483230736e-06, + "loss": 1.56306629, + "memory(GiB)": 117.38, + "step": 61115, + "train_speed(iter/s)": 1.635925 + }, + { + "acc": 0.64651308, + "epoch": 1.5504819888381531, + "grad_norm": 5.625, + "learning_rate": 1.318682803855429e-06, + "loss": 1.67715492, + "memory(GiB)": 117.38, + "step": 61120, + "train_speed(iter/s)": 1.635939 + }, + { + "acc": 0.65884929, + "epoch": 1.5506088280060881, + "grad_norm": 6.375, + "learning_rate": 1.3179732864008888e-06, + "loss": 1.62272549, + "memory(GiB)": 117.38, + "step": 61125, + "train_speed(iter/s)": 1.635953 + }, + { + "acc": 0.6555727, + "epoch": 1.5507356671740233, + "grad_norm": 5.96875, + "learning_rate": 1.3172639308983226e-06, + "loss": 1.5814868, + "memory(GiB)": 117.38, + "step": 61130, + "train_speed(iter/s)": 1.635967 + }, + { + "acc": 0.6675652, + "epoch": 1.5508625063419585, + "grad_norm": 5.0, + "learning_rate": 1.3165547373789306e-06, + "loss": 1.55184765, + "memory(GiB)": 117.38, + "step": 61135, + "train_speed(iter/s)": 1.63598 + }, + { + "acc": 0.65238214, + "epoch": 1.5509893455098935, + "grad_norm": 5.75, + "learning_rate": 1.3158457058739066e-06, + "loss": 1.63189926, + "memory(GiB)": 117.38, + "step": 61140, + "train_speed(iter/s)": 1.635994 + }, + { + "acc": 0.66230302, + "epoch": 1.5511161846778285, + "grad_norm": 5.9375, + "learning_rate": 1.3151368364144373e-06, + "loss": 1.64738388, + "memory(GiB)": 117.38, + "step": 61145, + "train_speed(iter/s)": 1.636009 + }, + { + "acc": 0.65345531, + "epoch": 1.5512430238457635, + "grad_norm": 5.96875, + "learning_rate": 1.3144281290317012e-06, + "loss": 1.60982533, + "memory(GiB)": 117.38, + "step": 61150, + "train_speed(iter/s)": 1.636023 + }, + { + "acc": 0.66240487, + "epoch": 1.5513698630136985, + "grad_norm": 6.125, + "learning_rate": 1.3137195837568716e-06, + "loss": 1.51676483, + "memory(GiB)": 117.38, + "step": 61155, + "train_speed(iter/s)": 1.636036 + }, + { + "acc": 0.66346321, + "epoch": 1.5514967021816337, + "grad_norm": 5.71875, + "learning_rate": 1.313011200621112e-06, + "loss": 1.57237358, + "memory(GiB)": 117.38, + "step": 61160, + "train_speed(iter/s)": 1.636051 + }, + { + "acc": 0.65693192, + "epoch": 1.5516235413495687, + "grad_norm": 4.375, + "learning_rate": 1.312302979655582e-06, + "loss": 1.58477821, + "memory(GiB)": 117.38, + "step": 61165, + "train_speed(iter/s)": 1.636065 + }, + { + "acc": 0.65312538, + "epoch": 1.551750380517504, + "grad_norm": 6.40625, + "learning_rate": 1.3115949208914302e-06, + "loss": 1.59472466, + "memory(GiB)": 117.38, + "step": 61170, + "train_speed(iter/s)": 1.636079 + }, + { + "acc": 0.65050321, + "epoch": 1.551877219685439, + "grad_norm": 5.40625, + "learning_rate": 1.3108870243598022e-06, + "loss": 1.61206379, + "memory(GiB)": 117.38, + "step": 61175, + "train_speed(iter/s)": 1.636093 + }, + { + "acc": 0.65264826, + "epoch": 1.552004058853374, + "grad_norm": 5.75, + "learning_rate": 1.310179290091833e-06, + "loss": 1.65162907, + "memory(GiB)": 117.38, + "step": 61180, + "train_speed(iter/s)": 1.636107 + }, + { + "acc": 0.66704984, + "epoch": 1.5521308980213089, + "grad_norm": 5.90625, + "learning_rate": 1.3094717181186518e-06, + "loss": 1.55495434, + "memory(GiB)": 117.38, + "step": 61185, + "train_speed(iter/s)": 1.636122 + }, + { + "acc": 0.64566684, + "epoch": 1.552257737189244, + "grad_norm": 7.84375, + "learning_rate": 1.3087643084713836e-06, + "loss": 1.59781446, + "memory(GiB)": 117.38, + "step": 61190, + "train_speed(iter/s)": 1.636137 + }, + { + "acc": 0.66397443, + "epoch": 1.552384576357179, + "grad_norm": 6.53125, + "learning_rate": 1.30805706118114e-06, + "loss": 1.58068399, + "memory(GiB)": 117.38, + "step": 61195, + "train_speed(iter/s)": 1.636151 + }, + { + "acc": 0.65234904, + "epoch": 1.5525114155251143, + "grad_norm": 6.21875, + "learning_rate": 1.3073499762790287e-06, + "loss": 1.62983627, + "memory(GiB)": 117.38, + "step": 61200, + "train_speed(iter/s)": 1.636165 + }, + { + "acc": 0.65697637, + "epoch": 1.5526382546930493, + "grad_norm": 5.03125, + "learning_rate": 1.306643053796154e-06, + "loss": 1.59063282, + "memory(GiB)": 117.38, + "step": 61205, + "train_speed(iter/s)": 1.63618 + }, + { + "acc": 0.66361923, + "epoch": 1.5527650938609843, + "grad_norm": 7.1875, + "learning_rate": 1.3059362937636084e-06, + "loss": 1.61873455, + "memory(GiB)": 117.38, + "step": 61210, + "train_speed(iter/s)": 1.636195 + }, + { + "acc": 0.66971402, + "epoch": 1.5528919330289193, + "grad_norm": 5.875, + "learning_rate": 1.3052296962124756e-06, + "loss": 1.52288589, + "memory(GiB)": 117.38, + "step": 61215, + "train_speed(iter/s)": 1.636208 + }, + { + "acc": 0.6382885, + "epoch": 1.5530187721968542, + "grad_norm": 6.125, + "learning_rate": 1.3045232611738357e-06, + "loss": 1.63298988, + "memory(GiB)": 117.38, + "step": 61220, + "train_speed(iter/s)": 1.636222 + }, + { + "acc": 0.65036759, + "epoch": 1.5531456113647895, + "grad_norm": 5.84375, + "learning_rate": 1.3038169886787632e-06, + "loss": 1.60665169, + "memory(GiB)": 117.38, + "step": 61225, + "train_speed(iter/s)": 1.636236 + }, + { + "acc": 0.68696127, + "epoch": 1.5532724505327247, + "grad_norm": 8.875, + "learning_rate": 1.3031108787583235e-06, + "loss": 1.53828192, + "memory(GiB)": 117.38, + "step": 61230, + "train_speed(iter/s)": 1.63625 + }, + { + "acc": 0.64480381, + "epoch": 1.5533992897006597, + "grad_norm": 5.375, + "learning_rate": 1.3024049314435694e-06, + "loss": 1.60862217, + "memory(GiB)": 117.38, + "step": 61235, + "train_speed(iter/s)": 1.636264 + }, + { + "acc": 0.64851785, + "epoch": 1.5535261288685946, + "grad_norm": 5.75, + "learning_rate": 1.301699146765557e-06, + "loss": 1.62237988, + "memory(GiB)": 117.38, + "step": 61240, + "train_speed(iter/s)": 1.636278 + }, + { + "acc": 0.67204733, + "epoch": 1.5536529680365296, + "grad_norm": 5.8125, + "learning_rate": 1.3009935247553274e-06, + "loss": 1.60659752, + "memory(GiB)": 117.38, + "step": 61245, + "train_speed(iter/s)": 1.636292 + }, + { + "acc": 0.64988775, + "epoch": 1.5537798072044646, + "grad_norm": 5.65625, + "learning_rate": 1.3002880654439192e-06, + "loss": 1.67673302, + "memory(GiB)": 117.38, + "step": 61250, + "train_speed(iter/s)": 1.636307 + }, + { + "acc": 0.64734554, + "epoch": 1.5539066463723998, + "grad_norm": 4.90625, + "learning_rate": 1.2995827688623568e-06, + "loss": 1.66474113, + "memory(GiB)": 117.38, + "step": 61255, + "train_speed(iter/s)": 1.636321 + }, + { + "acc": 0.66390038, + "epoch": 1.5540334855403348, + "grad_norm": 5.28125, + "learning_rate": 1.298877635041667e-06, + "loss": 1.51859856, + "memory(GiB)": 117.38, + "step": 61260, + "train_speed(iter/s)": 1.636335 + }, + { + "acc": 0.64104948, + "epoch": 1.55416032470827, + "grad_norm": 5.125, + "learning_rate": 1.2981726640128633e-06, + "loss": 1.63175583, + "memory(GiB)": 117.38, + "step": 61265, + "train_speed(iter/s)": 1.636349 + }, + { + "acc": 0.65757418, + "epoch": 1.554287163876205, + "grad_norm": 4.71875, + "learning_rate": 1.297467855806953e-06, + "loss": 1.53976688, + "memory(GiB)": 117.38, + "step": 61270, + "train_speed(iter/s)": 1.636363 + }, + { + "acc": 0.66194124, + "epoch": 1.55441400304414, + "grad_norm": 9.875, + "learning_rate": 1.2967632104549371e-06, + "loss": 1.62684193, + "memory(GiB)": 117.38, + "step": 61275, + "train_speed(iter/s)": 1.636377 + }, + { + "acc": 0.65334568, + "epoch": 1.554540842212075, + "grad_norm": 5.28125, + "learning_rate": 1.296058727987809e-06, + "loss": 1.58629646, + "memory(GiB)": 117.38, + "step": 61280, + "train_speed(iter/s)": 1.636392 + }, + { + "acc": 0.65941591, + "epoch": 1.55466768138001, + "grad_norm": 4.96875, + "learning_rate": 1.295354408436555e-06, + "loss": 1.64778557, + "memory(GiB)": 117.38, + "step": 61285, + "train_speed(iter/s)": 1.636406 + }, + { + "acc": 0.66195574, + "epoch": 1.5547945205479452, + "grad_norm": 6.5625, + "learning_rate": 1.294650251832154e-06, + "loss": 1.56548967, + "memory(GiB)": 117.38, + "step": 61290, + "train_speed(iter/s)": 1.63642 + }, + { + "acc": 0.6524641, + "epoch": 1.5549213597158804, + "grad_norm": 6.25, + "learning_rate": 1.2939462582055784e-06, + "loss": 1.5599987, + "memory(GiB)": 117.38, + "step": 61295, + "train_speed(iter/s)": 1.636434 + }, + { + "acc": 0.66586266, + "epoch": 1.5550481988838154, + "grad_norm": 5.625, + "learning_rate": 1.2932424275877926e-06, + "loss": 1.5513875, + "memory(GiB)": 117.38, + "step": 61300, + "train_speed(iter/s)": 1.636448 + }, + { + "acc": 0.66176119, + "epoch": 1.5551750380517504, + "grad_norm": 6.59375, + "learning_rate": 1.2925387600097543e-06, + "loss": 1.57261553, + "memory(GiB)": 117.38, + "step": 61305, + "train_speed(iter/s)": 1.636462 + }, + { + "acc": 0.64702597, + "epoch": 1.5553018772196854, + "grad_norm": 4.375, + "learning_rate": 1.291835255502414e-06, + "loss": 1.562537, + "memory(GiB)": 117.38, + "step": 61310, + "train_speed(iter/s)": 1.636477 + }, + { + "acc": 0.64694924, + "epoch": 1.5554287163876204, + "grad_norm": 4.84375, + "learning_rate": 1.2911319140967148e-06, + "loss": 1.61142235, + "memory(GiB)": 117.38, + "step": 61315, + "train_speed(iter/s)": 1.636491 + }, + { + "acc": 0.64774475, + "epoch": 1.5555555555555556, + "grad_norm": 5.375, + "learning_rate": 1.290428735823593e-06, + "loss": 1.64048691, + "memory(GiB)": 117.38, + "step": 61320, + "train_speed(iter/s)": 1.636505 + }, + { + "acc": 0.64523745, + "epoch": 1.5556823947234906, + "grad_norm": 5.5625, + "learning_rate": 1.2897257207139758e-06, + "loss": 1.6832386, + "memory(GiB)": 117.38, + "step": 61325, + "train_speed(iter/s)": 1.636519 + }, + { + "acc": 0.65186181, + "epoch": 1.5558092338914258, + "grad_norm": 5.40625, + "learning_rate": 1.28902286879879e-06, + "loss": 1.58911114, + "memory(GiB)": 117.38, + "step": 61330, + "train_speed(iter/s)": 1.636534 + }, + { + "acc": 0.64767332, + "epoch": 1.5559360730593608, + "grad_norm": 6.125, + "learning_rate": 1.2883201801089445e-06, + "loss": 1.6280632, + "memory(GiB)": 117.38, + "step": 61335, + "train_speed(iter/s)": 1.636548 + }, + { + "acc": 0.64430685, + "epoch": 1.5560629122272958, + "grad_norm": 5.5625, + "learning_rate": 1.2876176546753494e-06, + "loss": 1.66660042, + "memory(GiB)": 117.38, + "step": 61340, + "train_speed(iter/s)": 1.636563 + }, + { + "acc": 0.65408363, + "epoch": 1.5561897513952307, + "grad_norm": 5.9375, + "learning_rate": 1.286915292528903e-06, + "loss": 1.65121346, + "memory(GiB)": 117.38, + "step": 61345, + "train_speed(iter/s)": 1.636577 + }, + { + "acc": 0.65273867, + "epoch": 1.556316590563166, + "grad_norm": 5.96875, + "learning_rate": 1.286213093700503e-06, + "loss": 1.57195454, + "memory(GiB)": 117.38, + "step": 61350, + "train_speed(iter/s)": 1.636592 + }, + { + "acc": 0.66110177, + "epoch": 1.556443429731101, + "grad_norm": 6.6875, + "learning_rate": 1.28551105822103e-06, + "loss": 1.551651, + "memory(GiB)": 117.38, + "step": 61355, + "train_speed(iter/s)": 1.636608 + }, + { + "acc": 0.65804434, + "epoch": 1.5565702688990362, + "grad_norm": 6.65625, + "learning_rate": 1.2848091861213636e-06, + "loss": 1.59232464, + "memory(GiB)": 117.38, + "step": 61360, + "train_speed(iter/s)": 1.636623 + }, + { + "acc": 0.64280934, + "epoch": 1.5566971080669711, + "grad_norm": 5.71875, + "learning_rate": 1.2841074774323775e-06, + "loss": 1.69599457, + "memory(GiB)": 117.38, + "step": 61365, + "train_speed(iter/s)": 1.636636 + }, + { + "acc": 0.6431088, + "epoch": 1.5568239472349061, + "grad_norm": 5.4375, + "learning_rate": 1.2834059321849363e-06, + "loss": 1.64311657, + "memory(GiB)": 117.38, + "step": 61370, + "train_speed(iter/s)": 1.636651 + }, + { + "acc": 0.68124781, + "epoch": 1.5569507864028411, + "grad_norm": 7.09375, + "learning_rate": 1.2827045504098928e-06, + "loss": 1.5941206, + "memory(GiB)": 117.38, + "step": 61375, + "train_speed(iter/s)": 1.636665 + }, + { + "acc": 0.65773225, + "epoch": 1.5570776255707761, + "grad_norm": 6.0, + "learning_rate": 1.2820033321381009e-06, + "loss": 1.56766624, + "memory(GiB)": 117.38, + "step": 61380, + "train_speed(iter/s)": 1.636678 + }, + { + "acc": 0.67315111, + "epoch": 1.5572044647387113, + "grad_norm": 5.8125, + "learning_rate": 1.2813022774004024e-06, + "loss": 1.54263153, + "memory(GiB)": 117.38, + "step": 61385, + "train_speed(iter/s)": 1.636691 + }, + { + "acc": 0.65441117, + "epoch": 1.5573313039066465, + "grad_norm": 5.5625, + "learning_rate": 1.280601386227634e-06, + "loss": 1.62208328, + "memory(GiB)": 117.38, + "step": 61390, + "train_speed(iter/s)": 1.636706 + }, + { + "acc": 0.65167103, + "epoch": 1.5574581430745815, + "grad_norm": 5.625, + "learning_rate": 1.279900658650619e-06, + "loss": 1.64166241, + "memory(GiB)": 117.38, + "step": 61395, + "train_speed(iter/s)": 1.63672 + }, + { + "acc": 0.64311442, + "epoch": 1.5575849822425165, + "grad_norm": 7.0, + "learning_rate": 1.2792000947001842e-06, + "loss": 1.65377274, + "memory(GiB)": 117.38, + "step": 61400, + "train_speed(iter/s)": 1.636734 + }, + { + "acc": 0.66006284, + "epoch": 1.5577118214104515, + "grad_norm": 5.125, + "learning_rate": 1.2784996944071415e-06, + "loss": 1.5988925, + "memory(GiB)": 117.38, + "step": 61405, + "train_speed(iter/s)": 1.636747 + }, + { + "acc": 0.65563731, + "epoch": 1.5578386605783865, + "grad_norm": 5.84375, + "learning_rate": 1.2777994578022972e-06, + "loss": 1.66416016, + "memory(GiB)": 117.38, + "step": 61410, + "train_speed(iter/s)": 1.636761 + }, + { + "acc": 0.65468316, + "epoch": 1.5579654997463217, + "grad_norm": 6.1875, + "learning_rate": 1.2770993849164514e-06, + "loss": 1.63350372, + "memory(GiB)": 117.38, + "step": 61415, + "train_speed(iter/s)": 1.636775 + }, + { + "acc": 0.64138761, + "epoch": 1.5580923389142567, + "grad_norm": 6.125, + "learning_rate": 1.276399475780396e-06, + "loss": 1.7182621, + "memory(GiB)": 117.38, + "step": 61420, + "train_speed(iter/s)": 1.636789 + }, + { + "acc": 0.66096191, + "epoch": 1.558219178082192, + "grad_norm": 5.09375, + "learning_rate": 1.2756997304249164e-06, + "loss": 1.5984087, + "memory(GiB)": 117.38, + "step": 61425, + "train_speed(iter/s)": 1.636802 + }, + { + "acc": 0.6529911, + "epoch": 1.558346017250127, + "grad_norm": 5.78125, + "learning_rate": 1.2750001488807906e-06, + "loss": 1.56206484, + "memory(GiB)": 117.38, + "step": 61430, + "train_speed(iter/s)": 1.636816 + }, + { + "acc": 0.66399221, + "epoch": 1.5584728564180619, + "grad_norm": 6.53125, + "learning_rate": 1.2743007311787892e-06, + "loss": 1.61260223, + "memory(GiB)": 117.38, + "step": 61435, + "train_speed(iter/s)": 1.63683 + }, + { + "acc": 0.66387944, + "epoch": 1.5585996955859969, + "grad_norm": 6.34375, + "learning_rate": 1.2736014773496757e-06, + "loss": 1.54993162, + "memory(GiB)": 117.38, + "step": 61440, + "train_speed(iter/s)": 1.636846 + }, + { + "acc": 0.66723323, + "epoch": 1.5587265347539319, + "grad_norm": 6.125, + "learning_rate": 1.2729023874242064e-06, + "loss": 1.52294521, + "memory(GiB)": 117.38, + "step": 61445, + "train_speed(iter/s)": 1.636859 + }, + { + "acc": 0.64380026, + "epoch": 1.558853373921867, + "grad_norm": 6.59375, + "learning_rate": 1.2722034614331303e-06, + "loss": 1.71321545, + "memory(GiB)": 117.38, + "step": 61450, + "train_speed(iter/s)": 1.636874 + }, + { + "acc": 0.64076867, + "epoch": 1.5589802130898023, + "grad_norm": 6.0, + "learning_rate": 1.2715046994071889e-06, + "loss": 1.66914959, + "memory(GiB)": 117.38, + "step": 61455, + "train_speed(iter/s)": 1.636888 + }, + { + "acc": 0.64837737, + "epoch": 1.5591070522577373, + "grad_norm": 5.78125, + "learning_rate": 1.2708061013771179e-06, + "loss": 1.60034485, + "memory(GiB)": 117.38, + "step": 61460, + "train_speed(iter/s)": 1.636902 + }, + { + "acc": 0.65644279, + "epoch": 1.5592338914256723, + "grad_norm": 4.75, + "learning_rate": 1.2701076673736428e-06, + "loss": 1.56525068, + "memory(GiB)": 117.38, + "step": 61465, + "train_speed(iter/s)": 1.636915 + }, + { + "acc": 0.65083151, + "epoch": 1.5593607305936072, + "grad_norm": 5.6875, + "learning_rate": 1.269409397427488e-06, + "loss": 1.62524376, + "memory(GiB)": 117.38, + "step": 61470, + "train_speed(iter/s)": 1.636928 + }, + { + "acc": 0.66408896, + "epoch": 1.5594875697615422, + "grad_norm": 5.875, + "learning_rate": 1.2687112915693622e-06, + "loss": 1.52112322, + "memory(GiB)": 117.38, + "step": 61475, + "train_speed(iter/s)": 1.636941 + }, + { + "acc": 0.64434872, + "epoch": 1.5596144089294774, + "grad_norm": 5.34375, + "learning_rate": 1.2680133498299729e-06, + "loss": 1.68178368, + "memory(GiB)": 117.38, + "step": 61480, + "train_speed(iter/s)": 1.636955 + }, + { + "acc": 0.6533885, + "epoch": 1.5597412480974124, + "grad_norm": 5.4375, + "learning_rate": 1.2673155722400177e-06, + "loss": 1.58763123, + "memory(GiB)": 117.38, + "step": 61485, + "train_speed(iter/s)": 1.63697 + }, + { + "acc": 0.64297004, + "epoch": 1.5598680872653476, + "grad_norm": 5.6875, + "learning_rate": 1.2666179588301908e-06, + "loss": 1.69776306, + "memory(GiB)": 117.38, + "step": 61490, + "train_speed(iter/s)": 1.636984 + }, + { + "acc": 0.66218548, + "epoch": 1.5599949264332826, + "grad_norm": 7.28125, + "learning_rate": 1.2659205096311738e-06, + "loss": 1.59236755, + "memory(GiB)": 117.38, + "step": 61495, + "train_speed(iter/s)": 1.636999 + }, + { + "acc": 0.64790192, + "epoch": 1.5601217656012176, + "grad_norm": 5.875, + "learning_rate": 1.2652232246736423e-06, + "loss": 1.60135746, + "memory(GiB)": 117.38, + "step": 61500, + "train_speed(iter/s)": 1.637013 + }, + { + "acc": 0.6695004, + "epoch": 1.5602486047691526, + "grad_norm": 5.78125, + "learning_rate": 1.2645261039882694e-06, + "loss": 1.65159931, + "memory(GiB)": 117.38, + "step": 61505, + "train_speed(iter/s)": 1.637027 + }, + { + "acc": 0.65085125, + "epoch": 1.5603754439370878, + "grad_norm": 6.03125, + "learning_rate": 1.263829147605718e-06, + "loss": 1.62512245, + "memory(GiB)": 117.38, + "step": 61510, + "train_speed(iter/s)": 1.637041 + }, + { + "acc": 0.65961914, + "epoch": 1.5605022831050228, + "grad_norm": 6.34375, + "learning_rate": 1.26313235555664e-06, + "loss": 1.61042614, + "memory(GiB)": 117.38, + "step": 61515, + "train_speed(iter/s)": 1.637055 + }, + { + "acc": 0.6738863, + "epoch": 1.560629122272958, + "grad_norm": 5.625, + "learning_rate": 1.2624357278716832e-06, + "loss": 1.5503334, + "memory(GiB)": 117.38, + "step": 61520, + "train_speed(iter/s)": 1.63707 + }, + { + "acc": 0.64084396, + "epoch": 1.560755961440893, + "grad_norm": 5.46875, + "learning_rate": 1.2617392645814913e-06, + "loss": 1.5916008, + "memory(GiB)": 117.38, + "step": 61525, + "train_speed(iter/s)": 1.637083 + }, + { + "acc": 0.6529985, + "epoch": 1.560882800608828, + "grad_norm": 5.4375, + "learning_rate": 1.2610429657166983e-06, + "loss": 1.67311039, + "memory(GiB)": 117.38, + "step": 61530, + "train_speed(iter/s)": 1.637097 + }, + { + "acc": 0.67050619, + "epoch": 1.561009639776763, + "grad_norm": 5.21875, + "learning_rate": 1.2603468313079265e-06, + "loss": 1.52844963, + "memory(GiB)": 117.38, + "step": 61535, + "train_speed(iter/s)": 1.637112 + }, + { + "acc": 0.66273241, + "epoch": 1.561136478944698, + "grad_norm": 5.34375, + "learning_rate": 1.2596508613857982e-06, + "loss": 1.54820318, + "memory(GiB)": 117.38, + "step": 61540, + "train_speed(iter/s)": 1.637126 + }, + { + "acc": 0.64780221, + "epoch": 1.5612633181126332, + "grad_norm": 5.34375, + "learning_rate": 1.258955055980925e-06, + "loss": 1.59088593, + "memory(GiB)": 117.38, + "step": 61545, + "train_speed(iter/s)": 1.63714 + }, + { + "acc": 0.64865084, + "epoch": 1.5613901572805684, + "grad_norm": 5.125, + "learning_rate": 1.258259415123911e-06, + "loss": 1.60054665, + "memory(GiB)": 117.38, + "step": 61550, + "train_speed(iter/s)": 1.637154 + }, + { + "acc": 0.6744133, + "epoch": 1.5615169964485034, + "grad_norm": 5.125, + "learning_rate": 1.2575639388453532e-06, + "loss": 1.49141979, + "memory(GiB)": 117.38, + "step": 61555, + "train_speed(iter/s)": 1.637169 + }, + { + "acc": 0.65708084, + "epoch": 1.5616438356164384, + "grad_norm": 6.125, + "learning_rate": 1.2568686271758423e-06, + "loss": 1.55554161, + "memory(GiB)": 117.38, + "step": 61560, + "train_speed(iter/s)": 1.637183 + }, + { + "acc": 0.64792876, + "epoch": 1.5617706747843734, + "grad_norm": 5.21875, + "learning_rate": 1.2561734801459612e-06, + "loss": 1.64314842, + "memory(GiB)": 117.38, + "step": 61565, + "train_speed(iter/s)": 1.637197 + }, + { + "acc": 0.66256962, + "epoch": 1.5618975139523084, + "grad_norm": 6.0, + "learning_rate": 1.2554784977862856e-06, + "loss": 1.5920846, + "memory(GiB)": 117.38, + "step": 61570, + "train_speed(iter/s)": 1.637212 + }, + { + "acc": 0.65574789, + "epoch": 1.5620243531202436, + "grad_norm": 5.625, + "learning_rate": 1.2547836801273833e-06, + "loss": 1.60377159, + "memory(GiB)": 117.38, + "step": 61575, + "train_speed(iter/s)": 1.637226 + }, + { + "acc": 0.65416446, + "epoch": 1.5621511922881786, + "grad_norm": 5.875, + "learning_rate": 1.2540890271998162e-06, + "loss": 1.64567146, + "memory(GiB)": 117.38, + "step": 61580, + "train_speed(iter/s)": 1.637241 + }, + { + "acc": 0.64260716, + "epoch": 1.5622780314561138, + "grad_norm": 4.8125, + "learning_rate": 1.2533945390341379e-06, + "loss": 1.68209915, + "memory(GiB)": 117.38, + "step": 61585, + "train_speed(iter/s)": 1.637254 + }, + { + "acc": 0.65278797, + "epoch": 1.5624048706240488, + "grad_norm": 5.75, + "learning_rate": 1.2527002156608946e-06, + "loss": 1.610886, + "memory(GiB)": 117.38, + "step": 61590, + "train_speed(iter/s)": 1.637269 + }, + { + "acc": 0.66167889, + "epoch": 1.5625317097919837, + "grad_norm": 6.1875, + "learning_rate": 1.2520060571106275e-06, + "loss": 1.57451191, + "memory(GiB)": 117.38, + "step": 61595, + "train_speed(iter/s)": 1.637284 + }, + { + "acc": 0.64473963, + "epoch": 1.5626585489599187, + "grad_norm": 5.90625, + "learning_rate": 1.2513120634138665e-06, + "loss": 1.6647213, + "memory(GiB)": 117.38, + "step": 61600, + "train_speed(iter/s)": 1.637297 + }, + { + "acc": 0.65826559, + "epoch": 1.5627853881278537, + "grad_norm": 5.875, + "learning_rate": 1.250618234601138e-06, + "loss": 1.54982643, + "memory(GiB)": 117.38, + "step": 61605, + "train_speed(iter/s)": 1.63731 + }, + { + "acc": 0.66492996, + "epoch": 1.562912227295789, + "grad_norm": 6.4375, + "learning_rate": 1.2499245707029595e-06, + "loss": 1.54981127, + "memory(GiB)": 117.38, + "step": 61610, + "train_speed(iter/s)": 1.637325 + }, + { + "acc": 0.64957094, + "epoch": 1.5630390664637241, + "grad_norm": 6.0, + "learning_rate": 1.2492310717498412e-06, + "loss": 1.59807701, + "memory(GiB)": 117.38, + "step": 61615, + "train_speed(iter/s)": 1.637339 + }, + { + "acc": 0.67292881, + "epoch": 1.5631659056316591, + "grad_norm": 6.25, + "learning_rate": 1.2485377377722863e-06, + "loss": 1.60729065, + "memory(GiB)": 117.38, + "step": 61620, + "train_speed(iter/s)": 1.637354 + }, + { + "acc": 0.67249317, + "epoch": 1.5632927447995941, + "grad_norm": 10.375, + "learning_rate": 1.2478445688007894e-06, + "loss": 1.5835084, + "memory(GiB)": 117.38, + "step": 61625, + "train_speed(iter/s)": 1.637368 + }, + { + "acc": 0.66082482, + "epoch": 1.5634195839675291, + "grad_norm": 5.9375, + "learning_rate": 1.2471515648658434e-06, + "loss": 1.56413441, + "memory(GiB)": 117.38, + "step": 61630, + "train_speed(iter/s)": 1.637383 + }, + { + "acc": 0.6674428, + "epoch": 1.563546423135464, + "grad_norm": 6.1875, + "learning_rate": 1.2464587259979254e-06, + "loss": 1.58378563, + "memory(GiB)": 117.38, + "step": 61635, + "train_speed(iter/s)": 1.637398 + }, + { + "acc": 0.65258722, + "epoch": 1.5636732623033993, + "grad_norm": 9.125, + "learning_rate": 1.2457660522275095e-06, + "loss": 1.60452137, + "memory(GiB)": 117.38, + "step": 61640, + "train_speed(iter/s)": 1.637412 + }, + { + "acc": 0.65679646, + "epoch": 1.5638001014713343, + "grad_norm": 8.875, + "learning_rate": 1.2450735435850654e-06, + "loss": 1.57998753, + "memory(GiB)": 117.38, + "step": 61645, + "train_speed(iter/s)": 1.637427 + }, + { + "acc": 0.65105896, + "epoch": 1.5639269406392695, + "grad_norm": 6.375, + "learning_rate": 1.244381200101053e-06, + "loss": 1.63169594, + "memory(GiB)": 117.38, + "step": 61650, + "train_speed(iter/s)": 1.637441 + }, + { + "acc": 0.65601263, + "epoch": 1.5640537798072045, + "grad_norm": 5.6875, + "learning_rate": 1.2436890218059217e-06, + "loss": 1.6355278, + "memory(GiB)": 117.38, + "step": 61655, + "train_speed(iter/s)": 1.637455 + }, + { + "acc": 0.65831928, + "epoch": 1.5641806189751395, + "grad_norm": 5.4375, + "learning_rate": 1.2429970087301163e-06, + "loss": 1.57492008, + "memory(GiB)": 117.38, + "step": 61660, + "train_speed(iter/s)": 1.637469 + }, + { + "acc": 0.650877, + "epoch": 1.5643074581430745, + "grad_norm": 5.9375, + "learning_rate": 1.2423051609040777e-06, + "loss": 1.65746803, + "memory(GiB)": 117.38, + "step": 61665, + "train_speed(iter/s)": 1.637483 + }, + { + "acc": 0.64344234, + "epoch": 1.5644342973110097, + "grad_norm": 6.09375, + "learning_rate": 1.2416134783582368e-06, + "loss": 1.63480892, + "memory(GiB)": 117.38, + "step": 61670, + "train_speed(iter/s)": 1.637497 + }, + { + "acc": 0.65756078, + "epoch": 1.5645611364789447, + "grad_norm": 5.15625, + "learning_rate": 1.2409219611230116e-06, + "loss": 1.6118103, + "memory(GiB)": 117.38, + "step": 61675, + "train_speed(iter/s)": 1.637511 + }, + { + "acc": 0.66029873, + "epoch": 1.56468797564688, + "grad_norm": 5.65625, + "learning_rate": 1.2402306092288236e-06, + "loss": 1.56962442, + "memory(GiB)": 117.38, + "step": 61680, + "train_speed(iter/s)": 1.637525 + }, + { + "acc": 0.66273575, + "epoch": 1.5648148148148149, + "grad_norm": 6.625, + "learning_rate": 1.2395394227060793e-06, + "loss": 1.62700691, + "memory(GiB)": 117.38, + "step": 61685, + "train_speed(iter/s)": 1.637539 + }, + { + "acc": 0.65293407, + "epoch": 1.5649416539827499, + "grad_norm": 6.875, + "learning_rate": 1.238848401585182e-06, + "loss": 1.61073303, + "memory(GiB)": 117.38, + "step": 61690, + "train_speed(iter/s)": 1.637553 + }, + { + "acc": 0.64637957, + "epoch": 1.5650684931506849, + "grad_norm": 5.75, + "learning_rate": 1.2381575458965218e-06, + "loss": 1.56928968, + "memory(GiB)": 117.38, + "step": 61695, + "train_speed(iter/s)": 1.637567 + }, + { + "acc": 0.65601873, + "epoch": 1.5651953323186198, + "grad_norm": 5.96875, + "learning_rate": 1.2374668556704888e-06, + "loss": 1.61248283, + "memory(GiB)": 117.38, + "step": 61700, + "train_speed(iter/s)": 1.637581 + }, + { + "acc": 0.66395602, + "epoch": 1.565322171486555, + "grad_norm": 6.5625, + "learning_rate": 1.2367763309374625e-06, + "loss": 1.56700306, + "memory(GiB)": 117.38, + "step": 61705, + "train_speed(iter/s)": 1.637596 + }, + { + "acc": 0.66082716, + "epoch": 1.5654490106544903, + "grad_norm": 5.1875, + "learning_rate": 1.2360859717278145e-06, + "loss": 1.59073658, + "memory(GiB)": 117.38, + "step": 61710, + "train_speed(iter/s)": 1.63761 + }, + { + "acc": 0.64166784, + "epoch": 1.5655758498224253, + "grad_norm": 5.875, + "learning_rate": 1.2353957780719106e-06, + "loss": 1.60281105, + "memory(GiB)": 117.38, + "step": 61715, + "train_speed(iter/s)": 1.637624 + }, + { + "acc": 0.64489803, + "epoch": 1.5657026889903602, + "grad_norm": 5.90625, + "learning_rate": 1.2347057500001075e-06, + "loss": 1.69349117, + "memory(GiB)": 117.38, + "step": 61720, + "train_speed(iter/s)": 1.637638 + }, + { + "acc": 0.65718765, + "epoch": 1.5658295281582952, + "grad_norm": 5.15625, + "learning_rate": 1.2340158875427566e-06, + "loss": 1.59059925, + "memory(GiB)": 117.38, + "step": 61725, + "train_speed(iter/s)": 1.637653 + }, + { + "acc": 0.64590435, + "epoch": 1.5659563673262302, + "grad_norm": 6.40625, + "learning_rate": 1.2333261907302013e-06, + "loss": 1.58984556, + "memory(GiB)": 117.38, + "step": 61730, + "train_speed(iter/s)": 1.637667 + }, + { + "acc": 0.67564478, + "epoch": 1.5660832064941654, + "grad_norm": 5.3125, + "learning_rate": 1.2326366595927763e-06, + "loss": 1.56249037, + "memory(GiB)": 117.38, + "step": 61735, + "train_speed(iter/s)": 1.637682 + }, + { + "acc": 0.65448256, + "epoch": 1.5662100456621004, + "grad_norm": 5.0625, + "learning_rate": 1.2319472941608118e-06, + "loss": 1.6299675, + "memory(GiB)": 117.38, + "step": 61740, + "train_speed(iter/s)": 1.637696 + }, + { + "acc": 0.65279875, + "epoch": 1.5663368848300356, + "grad_norm": 6.21875, + "learning_rate": 1.231258094464628e-06, + "loss": 1.61363525, + "memory(GiB)": 117.38, + "step": 61745, + "train_speed(iter/s)": 1.637711 + }, + { + "acc": 0.6548912, + "epoch": 1.5664637239979706, + "grad_norm": 6.84375, + "learning_rate": 1.23056906053454e-06, + "loss": 1.62874794, + "memory(GiB)": 117.38, + "step": 61750, + "train_speed(iter/s)": 1.637726 + }, + { + "acc": 0.64261389, + "epoch": 1.5665905631659056, + "grad_norm": 5.9375, + "learning_rate": 1.2298801924008535e-06, + "loss": 1.65138493, + "memory(GiB)": 117.38, + "step": 61755, + "train_speed(iter/s)": 1.63774 + }, + { + "acc": 0.64777794, + "epoch": 1.5667174023338406, + "grad_norm": 5.03125, + "learning_rate": 1.2291914900938685e-06, + "loss": 1.60229836, + "memory(GiB)": 117.38, + "step": 61760, + "train_speed(iter/s)": 1.637755 + }, + { + "acc": 0.65367351, + "epoch": 1.5668442415017756, + "grad_norm": 6.4375, + "learning_rate": 1.2285029536438759e-06, + "loss": 1.62405052, + "memory(GiB)": 117.38, + "step": 61765, + "train_speed(iter/s)": 1.63777 + }, + { + "acc": 0.6409234, + "epoch": 1.5669710806697108, + "grad_norm": 4.6875, + "learning_rate": 1.227814583081165e-06, + "loss": 1.62026291, + "memory(GiB)": 117.38, + "step": 61770, + "train_speed(iter/s)": 1.637784 + }, + { + "acc": 0.65885797, + "epoch": 1.567097919837646, + "grad_norm": 8.25, + "learning_rate": 1.2271263784360088e-06, + "loss": 1.58672199, + "memory(GiB)": 117.38, + "step": 61775, + "train_speed(iter/s)": 1.637798 + }, + { + "acc": 0.65834951, + "epoch": 1.567224759005581, + "grad_norm": 5.75, + "learning_rate": 1.2264383397386787e-06, + "loss": 1.64564991, + "memory(GiB)": 117.38, + "step": 61780, + "train_speed(iter/s)": 1.637813 + }, + { + "acc": 0.66006937, + "epoch": 1.567351598173516, + "grad_norm": 6.625, + "learning_rate": 1.225750467019437e-06, + "loss": 1.62769165, + "memory(GiB)": 117.38, + "step": 61785, + "train_speed(iter/s)": 1.637827 + }, + { + "acc": 0.6515975, + "epoch": 1.567478437341451, + "grad_norm": 5.53125, + "learning_rate": 1.2250627603085435e-06, + "loss": 1.65951691, + "memory(GiB)": 117.38, + "step": 61790, + "train_speed(iter/s)": 1.63784 + }, + { + "acc": 0.66303587, + "epoch": 1.567605276509386, + "grad_norm": 6.84375, + "learning_rate": 1.2243752196362423e-06, + "loss": 1.61594296, + "memory(GiB)": 117.38, + "step": 61795, + "train_speed(iter/s)": 1.637854 + }, + { + "acc": 0.65061016, + "epoch": 1.5677321156773212, + "grad_norm": 5.28125, + "learning_rate": 1.2236878450327743e-06, + "loss": 1.63613167, + "memory(GiB)": 117.38, + "step": 61800, + "train_speed(iter/s)": 1.637867 + }, + { + "acc": 0.65407839, + "epoch": 1.5678589548452562, + "grad_norm": 7.5, + "learning_rate": 1.223000636528377e-06, + "loss": 1.64742126, + "memory(GiB)": 117.38, + "step": 61805, + "train_speed(iter/s)": 1.637881 + }, + { + "acc": 0.67551012, + "epoch": 1.5679857940131914, + "grad_norm": 5.34375, + "learning_rate": 1.2223135941532754e-06, + "loss": 1.50869789, + "memory(GiB)": 117.38, + "step": 61810, + "train_speed(iter/s)": 1.637896 + }, + { + "acc": 0.6596117, + "epoch": 1.5681126331811264, + "grad_norm": 6.53125, + "learning_rate": 1.2216267179376857e-06, + "loss": 1.61112366, + "memory(GiB)": 117.38, + "step": 61815, + "train_speed(iter/s)": 1.637909 + }, + { + "acc": 0.66198697, + "epoch": 1.5682394723490614, + "grad_norm": 5.71875, + "learning_rate": 1.2209400079118233e-06, + "loss": 1.61957493, + "memory(GiB)": 117.38, + "step": 61820, + "train_speed(iter/s)": 1.637923 + }, + { + "acc": 0.66062198, + "epoch": 1.5683663115169963, + "grad_norm": 4.6875, + "learning_rate": 1.2202534641058916e-06, + "loss": 1.57511215, + "memory(GiB)": 117.38, + "step": 61825, + "train_speed(iter/s)": 1.637937 + }, + { + "acc": 0.68316174, + "epoch": 1.5684931506849316, + "grad_norm": 5.34375, + "learning_rate": 1.2195670865500896e-06, + "loss": 1.45706577, + "memory(GiB)": 117.38, + "step": 61830, + "train_speed(iter/s)": 1.63795 + }, + { + "acc": 0.64681873, + "epoch": 1.5686199898528665, + "grad_norm": 5.375, + "learning_rate": 1.2188808752746022e-06, + "loss": 1.65325012, + "memory(GiB)": 117.38, + "step": 61835, + "train_speed(iter/s)": 1.637964 + }, + { + "acc": 0.6711802, + "epoch": 1.5687468290208018, + "grad_norm": 6.6875, + "learning_rate": 1.2181948303096176e-06, + "loss": 1.57406139, + "memory(GiB)": 117.38, + "step": 61840, + "train_speed(iter/s)": 1.637978 + }, + { + "acc": 0.65753489, + "epoch": 1.5688736681887367, + "grad_norm": 5.625, + "learning_rate": 1.2175089516853083e-06, + "loss": 1.5384573, + "memory(GiB)": 117.38, + "step": 61845, + "train_speed(iter/s)": 1.637992 + }, + { + "acc": 0.67108984, + "epoch": 1.5690005073566717, + "grad_norm": 5.15625, + "learning_rate": 1.216823239431843e-06, + "loss": 1.54368925, + "memory(GiB)": 117.38, + "step": 61850, + "train_speed(iter/s)": 1.638006 + }, + { + "acc": 0.66490846, + "epoch": 1.5691273465246067, + "grad_norm": 7.6875, + "learning_rate": 1.2161376935793827e-06, + "loss": 1.57292805, + "memory(GiB)": 117.38, + "step": 61855, + "train_speed(iter/s)": 1.63802 + }, + { + "acc": 0.64931173, + "epoch": 1.5692541856925417, + "grad_norm": 5.90625, + "learning_rate": 1.21545231415808e-06, + "loss": 1.618433, + "memory(GiB)": 117.38, + "step": 61860, + "train_speed(iter/s)": 1.638033 + }, + { + "acc": 0.64918823, + "epoch": 1.569381024860477, + "grad_norm": 5.75, + "learning_rate": 1.2147671011980816e-06, + "loss": 1.65103722, + "memory(GiB)": 117.38, + "step": 61865, + "train_speed(iter/s)": 1.638048 + }, + { + "acc": 0.66030388, + "epoch": 1.5695078640284121, + "grad_norm": 5.96875, + "learning_rate": 1.2140820547295256e-06, + "loss": 1.55695314, + "memory(GiB)": 117.38, + "step": 61870, + "train_speed(iter/s)": 1.638063 + }, + { + "acc": 0.64868951, + "epoch": 1.5696347031963471, + "grad_norm": 5.4375, + "learning_rate": 1.2133971747825435e-06, + "loss": 1.53055067, + "memory(GiB)": 117.38, + "step": 61875, + "train_speed(iter/s)": 1.638076 + }, + { + "acc": 0.66051111, + "epoch": 1.5697615423642821, + "grad_norm": 9.0625, + "learning_rate": 1.2127124613872603e-06, + "loss": 1.60274849, + "memory(GiB)": 117.38, + "step": 61880, + "train_speed(iter/s)": 1.638091 + }, + { + "acc": 0.65472326, + "epoch": 1.569888381532217, + "grad_norm": 6.78125, + "learning_rate": 1.2120279145737918e-06, + "loss": 1.61206284, + "memory(GiB)": 117.38, + "step": 61885, + "train_speed(iter/s)": 1.638104 + }, + { + "acc": 0.67270355, + "epoch": 1.570015220700152, + "grad_norm": 5.75, + "learning_rate": 1.2113435343722474e-06, + "loss": 1.51774015, + "memory(GiB)": 117.38, + "step": 61890, + "train_speed(iter/s)": 1.638118 + }, + { + "acc": 0.64952636, + "epoch": 1.5701420598680873, + "grad_norm": 5.59375, + "learning_rate": 1.21065932081273e-06, + "loss": 1.60222054, + "memory(GiB)": 117.38, + "step": 61895, + "train_speed(iter/s)": 1.638131 + }, + { + "acc": 0.65741158, + "epoch": 1.5702688990360223, + "grad_norm": 5.34375, + "learning_rate": 1.2099752739253334e-06, + "loss": 1.61414833, + "memory(GiB)": 117.38, + "step": 61900, + "train_speed(iter/s)": 1.638144 + }, + { + "acc": 0.67015448, + "epoch": 1.5703957382039575, + "grad_norm": 6.125, + "learning_rate": 1.209291393740144e-06, + "loss": 1.56904812, + "memory(GiB)": 117.38, + "step": 61905, + "train_speed(iter/s)": 1.638159 + }, + { + "acc": 0.64554729, + "epoch": 1.5705225773718925, + "grad_norm": 5.65625, + "learning_rate": 1.2086076802872472e-06, + "loss": 1.63488846, + "memory(GiB)": 117.38, + "step": 61910, + "train_speed(iter/s)": 1.638172 + }, + { + "acc": 0.65189953, + "epoch": 1.5706494165398275, + "grad_norm": 5.53125, + "learning_rate": 1.2079241335967096e-06, + "loss": 1.6647541, + "memory(GiB)": 117.38, + "step": 61915, + "train_speed(iter/s)": 1.638186 + }, + { + "acc": 0.65260954, + "epoch": 1.5707762557077625, + "grad_norm": 5.75, + "learning_rate": 1.207240753698599e-06, + "loss": 1.63380394, + "memory(GiB)": 117.38, + "step": 61920, + "train_speed(iter/s)": 1.6382 + }, + { + "acc": 0.68474879, + "epoch": 1.5709030948756975, + "grad_norm": 6.34375, + "learning_rate": 1.2065575406229723e-06, + "loss": 1.44651051, + "memory(GiB)": 117.38, + "step": 61925, + "train_speed(iter/s)": 1.638213 + }, + { + "acc": 0.67783751, + "epoch": 1.5710299340436327, + "grad_norm": 5.53125, + "learning_rate": 1.2058744943998847e-06, + "loss": 1.54204235, + "memory(GiB)": 117.38, + "step": 61930, + "train_speed(iter/s)": 1.638227 + }, + { + "acc": 0.66343856, + "epoch": 1.5711567732115679, + "grad_norm": 6.84375, + "learning_rate": 1.2051916150593746e-06, + "loss": 1.588445, + "memory(GiB)": 117.38, + "step": 61935, + "train_speed(iter/s)": 1.638241 + }, + { + "acc": 0.64701262, + "epoch": 1.5712836123795029, + "grad_norm": 6.125, + "learning_rate": 1.2045089026314783e-06, + "loss": 1.61286488, + "memory(GiB)": 117.38, + "step": 61940, + "train_speed(iter/s)": 1.638254 + }, + { + "acc": 0.64579959, + "epoch": 1.5714104515474379, + "grad_norm": 6.34375, + "learning_rate": 1.2038263571462278e-06, + "loss": 1.59338303, + "memory(GiB)": 117.38, + "step": 61945, + "train_speed(iter/s)": 1.638268 + }, + { + "acc": 0.66114888, + "epoch": 1.5715372907153728, + "grad_norm": 8.625, + "learning_rate": 1.203143978633644e-06, + "loss": 1.62083321, + "memory(GiB)": 117.38, + "step": 61950, + "train_speed(iter/s)": 1.638282 + }, + { + "acc": 0.650282, + "epoch": 1.5716641298833078, + "grad_norm": 6.5625, + "learning_rate": 1.2024617671237388e-06, + "loss": 1.56989927, + "memory(GiB)": 117.38, + "step": 61955, + "train_speed(iter/s)": 1.638294 + }, + { + "acc": 0.64680548, + "epoch": 1.571790969051243, + "grad_norm": 4.09375, + "learning_rate": 1.2017797226465178e-06, + "loss": 1.622472, + "memory(GiB)": 117.38, + "step": 61960, + "train_speed(iter/s)": 1.638307 + }, + { + "acc": 0.63211393, + "epoch": 1.571917808219178, + "grad_norm": 5.125, + "learning_rate": 1.2010978452319843e-06, + "loss": 1.63500671, + "memory(GiB)": 117.38, + "step": 61965, + "train_speed(iter/s)": 1.63832 + }, + { + "acc": 0.64652376, + "epoch": 1.5720446473871132, + "grad_norm": 6.0625, + "learning_rate": 1.2004161349101295e-06, + "loss": 1.61188011, + "memory(GiB)": 117.38, + "step": 61970, + "train_speed(iter/s)": 1.638335 + }, + { + "acc": 0.66792326, + "epoch": 1.5721714865550482, + "grad_norm": 7.78125, + "learning_rate": 1.1997345917109348e-06, + "loss": 1.57882843, + "memory(GiB)": 117.38, + "step": 61975, + "train_speed(iter/s)": 1.638349 + }, + { + "acc": 0.65533686, + "epoch": 1.5722983257229832, + "grad_norm": 6.15625, + "learning_rate": 1.1990532156643808e-06, + "loss": 1.63751373, + "memory(GiB)": 117.38, + "step": 61980, + "train_speed(iter/s)": 1.638363 + }, + { + "acc": 0.65759177, + "epoch": 1.5724251648909182, + "grad_norm": 5.625, + "learning_rate": 1.198372006800436e-06, + "loss": 1.59721909, + "memory(GiB)": 117.38, + "step": 61985, + "train_speed(iter/s)": 1.638378 + }, + { + "acc": 0.64423833, + "epoch": 1.5725520040588534, + "grad_norm": 6.3125, + "learning_rate": 1.1976909651490637e-06, + "loss": 1.61672058, + "memory(GiB)": 117.38, + "step": 61990, + "train_speed(iter/s)": 1.638392 + }, + { + "acc": 0.66801519, + "epoch": 1.5726788432267884, + "grad_norm": 4.5, + "learning_rate": 1.1970100907402188e-06, + "loss": 1.56915073, + "memory(GiB)": 117.38, + "step": 61995, + "train_speed(iter/s)": 1.638406 + }, + { + "acc": 0.661198, + "epoch": 1.5728056823947236, + "grad_norm": 7.5, + "learning_rate": 1.1963293836038492e-06, + "loss": 1.62549477, + "memory(GiB)": 117.38, + "step": 62000, + "train_speed(iter/s)": 1.638419 + }, + { + "epoch": 1.5728056823947236, + "eval_acc": 0.6462472095009575, + "eval_loss": 1.5733178853988647, + "eval_runtime": 58.6077, + "eval_samples_per_second": 108.689, + "eval_steps_per_second": 27.181, + "step": 62000 + }, + { + "acc": 0.6578105, + "epoch": 1.5729325215626586, + "grad_norm": 6.34375, + "learning_rate": 1.195648843769896e-06, + "loss": 1.51423578, + "memory(GiB)": 117.38, + "step": 62005, + "train_speed(iter/s)": 1.635712 + }, + { + "acc": 0.6574604, + "epoch": 1.5730593607305936, + "grad_norm": 6.78125, + "learning_rate": 1.1949684712682912e-06, + "loss": 1.66762466, + "memory(GiB)": 117.38, + "step": 62010, + "train_speed(iter/s)": 1.635725 + }, + { + "acc": 0.66303735, + "epoch": 1.5731861998985286, + "grad_norm": 6.03125, + "learning_rate": 1.1942882661289618e-06, + "loss": 1.5493082, + "memory(GiB)": 117.38, + "step": 62015, + "train_speed(iter/s)": 1.635738 + }, + { + "acc": 0.66413956, + "epoch": 1.5733130390664636, + "grad_norm": 7.875, + "learning_rate": 1.1936082283818252e-06, + "loss": 1.57665606, + "memory(GiB)": 117.38, + "step": 62020, + "train_speed(iter/s)": 1.635752 + }, + { + "acc": 0.67713356, + "epoch": 1.5734398782343988, + "grad_norm": 5.34375, + "learning_rate": 1.1929283580567936e-06, + "loss": 1.51045151, + "memory(GiB)": 117.38, + "step": 62025, + "train_speed(iter/s)": 1.635765 + }, + { + "acc": 0.67225881, + "epoch": 1.573566717402334, + "grad_norm": 6.09375, + "learning_rate": 1.1922486551837697e-06, + "loss": 1.58824205, + "memory(GiB)": 117.38, + "step": 62030, + "train_speed(iter/s)": 1.635778 + }, + { + "acc": 0.65589252, + "epoch": 1.573693556570269, + "grad_norm": 6.0625, + "learning_rate": 1.1915691197926505e-06, + "loss": 1.65517998, + "memory(GiB)": 117.38, + "step": 62035, + "train_speed(iter/s)": 1.635791 + }, + { + "acc": 0.65546012, + "epoch": 1.573820395738204, + "grad_norm": 6.09375, + "learning_rate": 1.1908897519133244e-06, + "loss": 1.5825655, + "memory(GiB)": 117.38, + "step": 62040, + "train_speed(iter/s)": 1.635804 + }, + { + "acc": 0.66267967, + "epoch": 1.573947234906139, + "grad_norm": 4.71875, + "learning_rate": 1.190210551575674e-06, + "loss": 1.56688929, + "memory(GiB)": 117.38, + "step": 62045, + "train_speed(iter/s)": 1.635817 + }, + { + "acc": 0.66432581, + "epoch": 1.574074074074074, + "grad_norm": 5.71875, + "learning_rate": 1.189531518809573e-06, + "loss": 1.5060647, + "memory(GiB)": 117.38, + "step": 62050, + "train_speed(iter/s)": 1.63583 + }, + { + "acc": 0.63940964, + "epoch": 1.5742009132420092, + "grad_norm": 6.28125, + "learning_rate": 1.188852653644888e-06, + "loss": 1.72135315, + "memory(GiB)": 117.38, + "step": 62055, + "train_speed(iter/s)": 1.635843 + }, + { + "acc": 0.65729837, + "epoch": 1.5743277524099442, + "grad_norm": 6.5625, + "learning_rate": 1.1881739561114792e-06, + "loss": 1.55113869, + "memory(GiB)": 117.38, + "step": 62060, + "train_speed(iter/s)": 1.635857 + }, + { + "acc": 0.65653839, + "epoch": 1.5744545915778794, + "grad_norm": 6.15625, + "learning_rate": 1.1874954262391968e-06, + "loss": 1.62202301, + "memory(GiB)": 117.38, + "step": 62065, + "train_speed(iter/s)": 1.63587 + }, + { + "acc": 0.65237069, + "epoch": 1.5745814307458144, + "grad_norm": 6.65625, + "learning_rate": 1.1868170640578901e-06, + "loss": 1.67198792, + "memory(GiB)": 117.38, + "step": 62070, + "train_speed(iter/s)": 1.635884 + }, + { + "acc": 0.6636961, + "epoch": 1.5747082699137493, + "grad_norm": 4.84375, + "learning_rate": 1.1861388695973918e-06, + "loss": 1.56774473, + "memory(GiB)": 117.38, + "step": 62075, + "train_speed(iter/s)": 1.635897 + }, + { + "acc": 0.6428288, + "epoch": 1.5748351090816843, + "grad_norm": 5.8125, + "learning_rate": 1.1854608428875332e-06, + "loss": 1.62966423, + "memory(GiB)": 117.38, + "step": 62080, + "train_speed(iter/s)": 1.635911 + }, + { + "acc": 0.66092014, + "epoch": 1.5749619482496193, + "grad_norm": 5.90625, + "learning_rate": 1.1847829839581377e-06, + "loss": 1.58559055, + "memory(GiB)": 117.38, + "step": 62085, + "train_speed(iter/s)": 1.635924 + }, + { + "acc": 0.6670011, + "epoch": 1.5750887874175545, + "grad_norm": 4.4375, + "learning_rate": 1.1841052928390223e-06, + "loss": 1.5324194, + "memory(GiB)": 117.38, + "step": 62090, + "train_speed(iter/s)": 1.635938 + }, + { + "acc": 0.65734129, + "epoch": 1.5752156265854897, + "grad_norm": 5.53125, + "learning_rate": 1.183427769559991e-06, + "loss": 1.54112549, + "memory(GiB)": 117.38, + "step": 62095, + "train_speed(iter/s)": 1.635952 + }, + { + "acc": 0.66071706, + "epoch": 1.5753424657534247, + "grad_norm": 5.90625, + "learning_rate": 1.1827504141508456e-06, + "loss": 1.53575726, + "memory(GiB)": 117.38, + "step": 62100, + "train_speed(iter/s)": 1.635966 + }, + { + "acc": 0.64720974, + "epoch": 1.5754693049213597, + "grad_norm": 5.875, + "learning_rate": 1.1820732266413803e-06, + "loss": 1.64022408, + "memory(GiB)": 117.38, + "step": 62105, + "train_speed(iter/s)": 1.63598 + }, + { + "acc": 0.6631947, + "epoch": 1.5755961440892947, + "grad_norm": 6.6875, + "learning_rate": 1.181396207061382e-06, + "loss": 1.59497671, + "memory(GiB)": 117.38, + "step": 62110, + "train_speed(iter/s)": 1.635994 + }, + { + "acc": 0.66114702, + "epoch": 1.5757229832572297, + "grad_norm": 5.71875, + "learning_rate": 1.1807193554406248e-06, + "loss": 1.60235004, + "memory(GiB)": 117.38, + "step": 62115, + "train_speed(iter/s)": 1.636008 + }, + { + "acc": 0.64758363, + "epoch": 1.575849822425165, + "grad_norm": 6.28125, + "learning_rate": 1.1800426718088837e-06, + "loss": 1.62359772, + "memory(GiB)": 117.38, + "step": 62120, + "train_speed(iter/s)": 1.636021 + }, + { + "acc": 0.66658449, + "epoch": 1.5759766615931, + "grad_norm": 5.25, + "learning_rate": 1.1793661561959201e-06, + "loss": 1.6198698, + "memory(GiB)": 117.38, + "step": 62125, + "train_speed(iter/s)": 1.636036 + }, + { + "acc": 0.66156578, + "epoch": 1.5761035007610351, + "grad_norm": 8.3125, + "learning_rate": 1.178689808631493e-06, + "loss": 1.54968681, + "memory(GiB)": 117.38, + "step": 62130, + "train_speed(iter/s)": 1.636051 + }, + { + "acc": 0.65869455, + "epoch": 1.57623033992897, + "grad_norm": 5.90625, + "learning_rate": 1.178013629145346e-06, + "loss": 1.51563187, + "memory(GiB)": 117.38, + "step": 62135, + "train_speed(iter/s)": 1.636066 + }, + { + "acc": 0.6438611, + "epoch": 1.576357179096905, + "grad_norm": 6.40625, + "learning_rate": 1.1773376177672246e-06, + "loss": 1.61444626, + "memory(GiB)": 117.38, + "step": 62140, + "train_speed(iter/s)": 1.63608 + }, + { + "acc": 0.66838465, + "epoch": 1.57648401826484, + "grad_norm": 5.34375, + "learning_rate": 1.176661774526862e-06, + "loss": 1.56084623, + "memory(GiB)": 117.38, + "step": 62145, + "train_speed(iter/s)": 1.636094 + }, + { + "acc": 0.65313196, + "epoch": 1.5766108574327753, + "grad_norm": 5.25, + "learning_rate": 1.1759860994539846e-06, + "loss": 1.61813583, + "memory(GiB)": 117.38, + "step": 62150, + "train_speed(iter/s)": 1.636107 + }, + { + "acc": 0.67126603, + "epoch": 1.5767376966007103, + "grad_norm": 6.21875, + "learning_rate": 1.1753105925783114e-06, + "loss": 1.59181824, + "memory(GiB)": 117.38, + "step": 62155, + "train_speed(iter/s)": 1.636121 + }, + { + "acc": 0.65228877, + "epoch": 1.5768645357686455, + "grad_norm": 5.125, + "learning_rate": 1.174635253929554e-06, + "loss": 1.61933212, + "memory(GiB)": 117.38, + "step": 62160, + "train_speed(iter/s)": 1.636135 + }, + { + "acc": 0.6476645, + "epoch": 1.5769913749365805, + "grad_norm": 7.03125, + "learning_rate": 1.1739600835374177e-06, + "loss": 1.61889229, + "memory(GiB)": 117.38, + "step": 62165, + "train_speed(iter/s)": 1.636148 + }, + { + "acc": 0.65815201, + "epoch": 1.5771182141045155, + "grad_norm": 5.71875, + "learning_rate": 1.173285081431599e-06, + "loss": 1.55051842, + "memory(GiB)": 117.38, + "step": 62170, + "train_speed(iter/s)": 1.636162 + }, + { + "acc": 0.65677757, + "epoch": 1.5772450532724505, + "grad_norm": 4.75, + "learning_rate": 1.1726102476417871e-06, + "loss": 1.62125092, + "memory(GiB)": 117.38, + "step": 62175, + "train_speed(iter/s)": 1.636175 + }, + { + "acc": 0.66700225, + "epoch": 1.5773718924403854, + "grad_norm": 6.03125, + "learning_rate": 1.1719355821976647e-06, + "loss": 1.57801123, + "memory(GiB)": 117.38, + "step": 62180, + "train_speed(iter/s)": 1.636188 + }, + { + "acc": 0.64894843, + "epoch": 1.5774987316083207, + "grad_norm": 5.71875, + "learning_rate": 1.1712610851289069e-06, + "loss": 1.66683464, + "memory(GiB)": 117.38, + "step": 62185, + "train_speed(iter/s)": 1.636201 + }, + { + "acc": 0.64769721, + "epoch": 1.5776255707762559, + "grad_norm": 6.125, + "learning_rate": 1.1705867564651802e-06, + "loss": 1.61999207, + "memory(GiB)": 117.38, + "step": 62190, + "train_speed(iter/s)": 1.636215 + }, + { + "acc": 0.65408702, + "epoch": 1.5777524099441909, + "grad_norm": 5.34375, + "learning_rate": 1.1699125962361451e-06, + "loss": 1.67037792, + "memory(GiB)": 117.38, + "step": 62195, + "train_speed(iter/s)": 1.636229 + }, + { + "acc": 0.66943359, + "epoch": 1.5778792491121258, + "grad_norm": 5.8125, + "learning_rate": 1.1692386044714543e-06, + "loss": 1.51186943, + "memory(GiB)": 117.38, + "step": 62200, + "train_speed(iter/s)": 1.636242 + }, + { + "acc": 0.63414979, + "epoch": 1.5780060882800608, + "grad_norm": 6.1875, + "learning_rate": 1.1685647812007512e-06, + "loss": 1.66729527, + "memory(GiB)": 117.38, + "step": 62205, + "train_speed(iter/s)": 1.636256 + }, + { + "acc": 0.64741068, + "epoch": 1.5781329274479958, + "grad_norm": 4.96875, + "learning_rate": 1.167891126453678e-06, + "loss": 1.58616371, + "memory(GiB)": 117.38, + "step": 62210, + "train_speed(iter/s)": 1.636269 + }, + { + "acc": 0.65222759, + "epoch": 1.578259766615931, + "grad_norm": 5.5625, + "learning_rate": 1.1672176402598607e-06, + "loss": 1.64116974, + "memory(GiB)": 117.38, + "step": 62215, + "train_speed(iter/s)": 1.636284 + }, + { + "acc": 0.66092558, + "epoch": 1.578386605783866, + "grad_norm": 5.875, + "learning_rate": 1.1665443226489236e-06, + "loss": 1.50364008, + "memory(GiB)": 117.38, + "step": 62220, + "train_speed(iter/s)": 1.636298 + }, + { + "acc": 0.65456295, + "epoch": 1.5785134449518012, + "grad_norm": 5.6875, + "learning_rate": 1.1658711736504814e-06, + "loss": 1.5665453, + "memory(GiB)": 117.38, + "step": 62225, + "train_speed(iter/s)": 1.636308 + }, + { + "acc": 0.65833483, + "epoch": 1.5786402841197362, + "grad_norm": 5.1875, + "learning_rate": 1.165198193294146e-06, + "loss": 1.58607111, + "memory(GiB)": 117.38, + "step": 62230, + "train_speed(iter/s)": 1.636322 + }, + { + "acc": 0.67806826, + "epoch": 1.5787671232876712, + "grad_norm": 7.34375, + "learning_rate": 1.1645253816095131e-06, + "loss": 1.49834433, + "memory(GiB)": 117.38, + "step": 62235, + "train_speed(iter/s)": 1.636336 + }, + { + "acc": 0.66620808, + "epoch": 1.5788939624556062, + "grad_norm": 5.0, + "learning_rate": 1.1638527386261772e-06, + "loss": 1.63065815, + "memory(GiB)": 117.38, + "step": 62240, + "train_speed(iter/s)": 1.63635 + }, + { + "acc": 0.65975437, + "epoch": 1.5790208016235412, + "grad_norm": 5.5, + "learning_rate": 1.163180264373726e-06, + "loss": 1.54425268, + "memory(GiB)": 117.38, + "step": 62245, + "train_speed(iter/s)": 1.636364 + }, + { + "acc": 0.65600834, + "epoch": 1.5791476407914764, + "grad_norm": 6.75, + "learning_rate": 1.162507958881739e-06, + "loss": 1.61289635, + "memory(GiB)": 117.38, + "step": 62250, + "train_speed(iter/s)": 1.636378 + }, + { + "acc": 0.68163147, + "epoch": 1.5792744799594116, + "grad_norm": 4.90625, + "learning_rate": 1.161835822179782e-06, + "loss": 1.46188602, + "memory(GiB)": 117.38, + "step": 62255, + "train_speed(iter/s)": 1.636392 + }, + { + "acc": 0.64814558, + "epoch": 1.5794013191273466, + "grad_norm": 5.25, + "learning_rate": 1.1611638542974229e-06, + "loss": 1.63704948, + "memory(GiB)": 117.38, + "step": 62260, + "train_speed(iter/s)": 1.636405 + }, + { + "acc": 0.6573988, + "epoch": 1.5795281582952816, + "grad_norm": 7.4375, + "learning_rate": 1.160492055264217e-06, + "loss": 1.63067589, + "memory(GiB)": 117.38, + "step": 62265, + "train_speed(iter/s)": 1.636419 + }, + { + "acc": 0.65740538, + "epoch": 1.5796549974632166, + "grad_norm": 5.03125, + "learning_rate": 1.159820425109714e-06, + "loss": 1.57271214, + "memory(GiB)": 117.38, + "step": 62270, + "train_speed(iter/s)": 1.636433 + }, + { + "acc": 0.65450792, + "epoch": 1.5797818366311516, + "grad_norm": 5.28125, + "learning_rate": 1.1591489638634513e-06, + "loss": 1.61792564, + "memory(GiB)": 117.38, + "step": 62275, + "train_speed(iter/s)": 1.636448 + }, + { + "acc": 0.66762323, + "epoch": 1.5799086757990868, + "grad_norm": 5.21875, + "learning_rate": 1.1584776715549662e-06, + "loss": 1.61795654, + "memory(GiB)": 117.38, + "step": 62280, + "train_speed(iter/s)": 1.636462 + }, + { + "acc": 0.659761, + "epoch": 1.5800355149670218, + "grad_norm": 5.46875, + "learning_rate": 1.1578065482137845e-06, + "loss": 1.5779583, + "memory(GiB)": 117.38, + "step": 62285, + "train_speed(iter/s)": 1.636475 + }, + { + "acc": 0.63942657, + "epoch": 1.580162354134957, + "grad_norm": 5.78125, + "learning_rate": 1.157135593869425e-06, + "loss": 1.66861153, + "memory(GiB)": 117.38, + "step": 62290, + "train_speed(iter/s)": 1.636489 + }, + { + "acc": 0.65551019, + "epoch": 1.580289193302892, + "grad_norm": 5.21875, + "learning_rate": 1.156464808551399e-06, + "loss": 1.59630804, + "memory(GiB)": 117.38, + "step": 62295, + "train_speed(iter/s)": 1.636502 + }, + { + "acc": 0.65974436, + "epoch": 1.580416032470827, + "grad_norm": 5.71875, + "learning_rate": 1.1557941922892113e-06, + "loss": 1.59582405, + "memory(GiB)": 117.38, + "step": 62300, + "train_speed(iter/s)": 1.636516 + }, + { + "acc": 0.65655079, + "epoch": 1.580542871638762, + "grad_norm": 5.15625, + "learning_rate": 1.155123745112358e-06, + "loss": 1.52191238, + "memory(GiB)": 117.38, + "step": 62305, + "train_speed(iter/s)": 1.63653 + }, + { + "acc": 0.68731256, + "epoch": 1.5806697108066972, + "grad_norm": 6.96875, + "learning_rate": 1.1544534670503282e-06, + "loss": 1.47692757, + "memory(GiB)": 117.38, + "step": 62310, + "train_speed(iter/s)": 1.636544 + }, + { + "acc": 0.65880308, + "epoch": 1.5807965499746321, + "grad_norm": 6.125, + "learning_rate": 1.1537833581326048e-06, + "loss": 1.54738388, + "memory(GiB)": 117.38, + "step": 62315, + "train_speed(iter/s)": 1.636558 + }, + { + "acc": 0.66302571, + "epoch": 1.5809233891425674, + "grad_norm": 7.125, + "learning_rate": 1.1531134183886606e-06, + "loss": 1.57727394, + "memory(GiB)": 117.38, + "step": 62320, + "train_speed(iter/s)": 1.636571 + }, + { + "acc": 0.66358538, + "epoch": 1.5810502283105023, + "grad_norm": 5.875, + "learning_rate": 1.1524436478479638e-06, + "loss": 1.59886723, + "memory(GiB)": 117.38, + "step": 62325, + "train_speed(iter/s)": 1.636585 + }, + { + "acc": 0.65415287, + "epoch": 1.5811770674784373, + "grad_norm": 7.3125, + "learning_rate": 1.1517740465399736e-06, + "loss": 1.62806416, + "memory(GiB)": 117.38, + "step": 62330, + "train_speed(iter/s)": 1.636599 + }, + { + "acc": 0.68105745, + "epoch": 1.5813039066463723, + "grad_norm": 6.15625, + "learning_rate": 1.1511046144941417e-06, + "loss": 1.56159506, + "memory(GiB)": 117.38, + "step": 62335, + "train_speed(iter/s)": 1.636612 + }, + { + "acc": 0.66029892, + "epoch": 1.5814307458143073, + "grad_norm": 5.9375, + "learning_rate": 1.150435351739913e-06, + "loss": 1.5606842, + "memory(GiB)": 117.38, + "step": 62340, + "train_speed(iter/s)": 1.636626 + }, + { + "acc": 0.66902351, + "epoch": 1.5815575849822425, + "grad_norm": 4.90625, + "learning_rate": 1.1497662583067231e-06, + "loss": 1.54241104, + "memory(GiB)": 117.38, + "step": 62345, + "train_speed(iter/s)": 1.636638 + }, + { + "acc": 0.67156844, + "epoch": 1.5816844241501777, + "grad_norm": 6.0625, + "learning_rate": 1.1490973342240063e-06, + "loss": 1.56138258, + "memory(GiB)": 117.38, + "step": 62350, + "train_speed(iter/s)": 1.636652 + }, + { + "acc": 0.64750681, + "epoch": 1.5818112633181127, + "grad_norm": 5.90625, + "learning_rate": 1.1484285795211803e-06, + "loss": 1.66835213, + "memory(GiB)": 117.38, + "step": 62355, + "train_speed(iter/s)": 1.636666 + }, + { + "acc": 0.66828451, + "epoch": 1.5819381024860477, + "grad_norm": 7.0625, + "learning_rate": 1.1477599942276613e-06, + "loss": 1.60028343, + "memory(GiB)": 117.38, + "step": 62360, + "train_speed(iter/s)": 1.63668 + }, + { + "acc": 0.6553627, + "epoch": 1.5820649416539827, + "grad_norm": 5.65625, + "learning_rate": 1.1470915783728547e-06, + "loss": 1.54887571, + "memory(GiB)": 117.38, + "step": 62365, + "train_speed(iter/s)": 1.636693 + }, + { + "acc": 0.66102877, + "epoch": 1.5821917808219177, + "grad_norm": 5.09375, + "learning_rate": 1.1464233319861661e-06, + "loss": 1.55981331, + "memory(GiB)": 117.38, + "step": 62370, + "train_speed(iter/s)": 1.636706 + }, + { + "acc": 0.6541666, + "epoch": 1.582318619989853, + "grad_norm": 5.5625, + "learning_rate": 1.145755255096983e-06, + "loss": 1.55644989, + "memory(GiB)": 117.38, + "step": 62375, + "train_speed(iter/s)": 1.63672 + }, + { + "acc": 0.64556084, + "epoch": 1.582445459157788, + "grad_norm": 5.5625, + "learning_rate": 1.1450873477346901e-06, + "loss": 1.60299301, + "memory(GiB)": 117.38, + "step": 62380, + "train_speed(iter/s)": 1.636734 + }, + { + "acc": 0.6577282, + "epoch": 1.582572298325723, + "grad_norm": 5.90625, + "learning_rate": 1.1444196099286682e-06, + "loss": 1.55571156, + "memory(GiB)": 117.38, + "step": 62385, + "train_speed(iter/s)": 1.636747 + }, + { + "acc": 0.64839506, + "epoch": 1.582699137493658, + "grad_norm": 5.1875, + "learning_rate": 1.143752041708287e-06, + "loss": 1.64312897, + "memory(GiB)": 117.38, + "step": 62390, + "train_speed(iter/s)": 1.63676 + }, + { + "acc": 0.65850468, + "epoch": 1.582825976661593, + "grad_norm": 10.125, + "learning_rate": 1.1430846431029062e-06, + "loss": 1.57215195, + "memory(GiB)": 117.38, + "step": 62395, + "train_speed(iter/s)": 1.636774 + }, + { + "acc": 0.64436893, + "epoch": 1.582952815829528, + "grad_norm": 6.8125, + "learning_rate": 1.1424174141418815e-06, + "loss": 1.68215828, + "memory(GiB)": 117.38, + "step": 62400, + "train_speed(iter/s)": 1.636787 + }, + { + "acc": 0.65718699, + "epoch": 1.583079654997463, + "grad_norm": 6.59375, + "learning_rate": 1.1417503548545634e-06, + "loss": 1.60325508, + "memory(GiB)": 117.38, + "step": 62405, + "train_speed(iter/s)": 1.6368 + }, + { + "acc": 0.64169102, + "epoch": 1.5832064941653983, + "grad_norm": 5.28125, + "learning_rate": 1.1410834652702918e-06, + "loss": 1.63935871, + "memory(GiB)": 117.38, + "step": 62410, + "train_speed(iter/s)": 1.636815 + }, + { + "acc": 0.6585021, + "epoch": 1.5833333333333335, + "grad_norm": 7.03125, + "learning_rate": 1.1404167454183957e-06, + "loss": 1.56873226, + "memory(GiB)": 117.38, + "step": 62415, + "train_speed(iter/s)": 1.636829 + }, + { + "acc": 0.63946252, + "epoch": 1.5834601725012685, + "grad_norm": 5.28125, + "learning_rate": 1.1397501953282042e-06, + "loss": 1.64788818, + "memory(GiB)": 117.38, + "step": 62420, + "train_speed(iter/s)": 1.636841 + }, + { + "acc": 0.64478893, + "epoch": 1.5835870116692035, + "grad_norm": 8.5, + "learning_rate": 1.1390838150290334e-06, + "loss": 1.61939316, + "memory(GiB)": 117.38, + "step": 62425, + "train_speed(iter/s)": 1.636854 + }, + { + "acc": 0.66862283, + "epoch": 1.5837138508371384, + "grad_norm": 7.125, + "learning_rate": 1.1384176045501944e-06, + "loss": 1.56827202, + "memory(GiB)": 117.38, + "step": 62430, + "train_speed(iter/s)": 1.636869 + }, + { + "acc": 0.64439735, + "epoch": 1.5838406900050734, + "grad_norm": 5.4375, + "learning_rate": 1.1377515639209902e-06, + "loss": 1.6099762, + "memory(GiB)": 117.38, + "step": 62435, + "train_speed(iter/s)": 1.636884 + }, + { + "acc": 0.66032171, + "epoch": 1.5839675291730086, + "grad_norm": 7.34375, + "learning_rate": 1.1370856931707159e-06, + "loss": 1.63359871, + "memory(GiB)": 117.38, + "step": 62440, + "train_speed(iter/s)": 1.636898 + }, + { + "acc": 0.65483494, + "epoch": 1.5840943683409436, + "grad_norm": 7.34375, + "learning_rate": 1.1364199923286589e-06, + "loss": 1.60862846, + "memory(GiB)": 117.38, + "step": 62445, + "train_speed(iter/s)": 1.63691 + }, + { + "acc": 0.67228365, + "epoch": 1.5842212075088788, + "grad_norm": 7.59375, + "learning_rate": 1.1357544614241012e-06, + "loss": 1.57928505, + "memory(GiB)": 117.38, + "step": 62450, + "train_speed(iter/s)": 1.636925 + }, + { + "acc": 0.66140218, + "epoch": 1.5843480466768138, + "grad_norm": 6.9375, + "learning_rate": 1.135089100486314e-06, + "loss": 1.59014063, + "memory(GiB)": 117.38, + "step": 62455, + "train_speed(iter/s)": 1.63694 + }, + { + "acc": 0.659905, + "epoch": 1.5844748858447488, + "grad_norm": 6.0, + "learning_rate": 1.1344239095445642e-06, + "loss": 1.57373943, + "memory(GiB)": 117.38, + "step": 62460, + "train_speed(iter/s)": 1.636952 + }, + { + "acc": 0.66732683, + "epoch": 1.5846017250126838, + "grad_norm": 4.8125, + "learning_rate": 1.1337588886281092e-06, + "loss": 1.50141506, + "memory(GiB)": 117.38, + "step": 62465, + "train_speed(iter/s)": 1.636965 + }, + { + "acc": 0.66228685, + "epoch": 1.584728564180619, + "grad_norm": 5.9375, + "learning_rate": 1.1330940377662002e-06, + "loss": 1.59462929, + "memory(GiB)": 117.38, + "step": 62470, + "train_speed(iter/s)": 1.636978 + }, + { + "acc": 0.65680861, + "epoch": 1.584855403348554, + "grad_norm": 5.4375, + "learning_rate": 1.1324293569880795e-06, + "loss": 1.62669182, + "memory(GiB)": 117.38, + "step": 62475, + "train_speed(iter/s)": 1.636992 + }, + { + "acc": 0.66022425, + "epoch": 1.5849822425164892, + "grad_norm": 6.8125, + "learning_rate": 1.1317648463229835e-06, + "loss": 1.56034737, + "memory(GiB)": 117.38, + "step": 62480, + "train_speed(iter/s)": 1.637007 + }, + { + "acc": 0.64006104, + "epoch": 1.5851090816844242, + "grad_norm": 6.75, + "learning_rate": 1.1311005058001396e-06, + "loss": 1.63513985, + "memory(GiB)": 117.38, + "step": 62485, + "train_speed(iter/s)": 1.637021 + }, + { + "acc": 0.64555693, + "epoch": 1.5852359208523592, + "grad_norm": 5.375, + "learning_rate": 1.130436335448769e-06, + "loss": 1.64385014, + "memory(GiB)": 117.38, + "step": 62490, + "train_speed(iter/s)": 1.637033 + }, + { + "acc": 0.65974517, + "epoch": 1.5853627600202942, + "grad_norm": 5.9375, + "learning_rate": 1.129772335298085e-06, + "loss": 1.60770988, + "memory(GiB)": 117.38, + "step": 62495, + "train_speed(iter/s)": 1.637046 + }, + { + "acc": 0.64732871, + "epoch": 1.5854895991882292, + "grad_norm": 6.03125, + "learning_rate": 1.1291085053772926e-06, + "loss": 1.62634773, + "memory(GiB)": 117.38, + "step": 62500, + "train_speed(iter/s)": 1.637059 + }, + { + "acc": 0.65343914, + "epoch": 1.5856164383561644, + "grad_norm": 5.625, + "learning_rate": 1.1284448457155893e-06, + "loss": 1.64265175, + "memory(GiB)": 117.38, + "step": 62505, + "train_speed(iter/s)": 1.637072 + }, + { + "acc": 0.65827475, + "epoch": 1.5857432775240996, + "grad_norm": 6.53125, + "learning_rate": 1.1277813563421697e-06, + "loss": 1.591856, + "memory(GiB)": 117.38, + "step": 62510, + "train_speed(iter/s)": 1.637085 + }, + { + "acc": 0.66083798, + "epoch": 1.5858701166920346, + "grad_norm": 5.15625, + "learning_rate": 1.127118037286213e-06, + "loss": 1.55919638, + "memory(GiB)": 117.38, + "step": 62515, + "train_speed(iter/s)": 1.637098 + }, + { + "acc": 0.65217314, + "epoch": 1.5859969558599696, + "grad_norm": 5.65625, + "learning_rate": 1.1264548885768944e-06, + "loss": 1.66456738, + "memory(GiB)": 117.38, + "step": 62520, + "train_speed(iter/s)": 1.637112 + }, + { + "acc": 0.6598146, + "epoch": 1.5861237950279046, + "grad_norm": 5.9375, + "learning_rate": 1.1257919102433856e-06, + "loss": 1.6061161, + "memory(GiB)": 117.38, + "step": 62525, + "train_speed(iter/s)": 1.637125 + }, + { + "acc": 0.65738888, + "epoch": 1.5862506341958396, + "grad_norm": 5.34375, + "learning_rate": 1.125129102314847e-06, + "loss": 1.58573866, + "memory(GiB)": 117.38, + "step": 62530, + "train_speed(iter/s)": 1.637138 + }, + { + "acc": 0.63902454, + "epoch": 1.5863774733637748, + "grad_norm": 5.28125, + "learning_rate": 1.124466464820429e-06, + "loss": 1.64181061, + "memory(GiB)": 117.38, + "step": 62535, + "train_speed(iter/s)": 1.637152 + }, + { + "acc": 0.65840492, + "epoch": 1.5865043125317098, + "grad_norm": 6.65625, + "learning_rate": 1.123803997789278e-06, + "loss": 1.54912586, + "memory(GiB)": 117.38, + "step": 62540, + "train_speed(iter/s)": 1.637164 + }, + { + "acc": 0.65408926, + "epoch": 1.586631151699645, + "grad_norm": 5.59375, + "learning_rate": 1.1231417012505342e-06, + "loss": 1.61890259, + "memory(GiB)": 117.38, + "step": 62545, + "train_speed(iter/s)": 1.637177 + }, + { + "acc": 0.66086903, + "epoch": 1.58675799086758, + "grad_norm": 6.0625, + "learning_rate": 1.1224795752333283e-06, + "loss": 1.58828669, + "memory(GiB)": 117.38, + "step": 62550, + "train_speed(iter/s)": 1.63719 + }, + { + "acc": 0.66073389, + "epoch": 1.586884830035515, + "grad_norm": 6.03125, + "learning_rate": 1.1218176197667802e-06, + "loss": 1.65278244, + "memory(GiB)": 117.38, + "step": 62555, + "train_speed(iter/s)": 1.637203 + }, + { + "acc": 0.64921055, + "epoch": 1.58701166920345, + "grad_norm": 4.875, + "learning_rate": 1.1211558348800095e-06, + "loss": 1.60193367, + "memory(GiB)": 117.38, + "step": 62560, + "train_speed(iter/s)": 1.637216 + }, + { + "acc": 0.65964622, + "epoch": 1.587138508371385, + "grad_norm": 5.90625, + "learning_rate": 1.120494220602123e-06, + "loss": 1.56493664, + "memory(GiB)": 117.38, + "step": 62565, + "train_speed(iter/s)": 1.637229 + }, + { + "acc": 0.6770113, + "epoch": 1.5872653475393201, + "grad_norm": 7.375, + "learning_rate": 1.1198327769622224e-06, + "loss": 1.57329111, + "memory(GiB)": 117.38, + "step": 62570, + "train_speed(iter/s)": 1.637243 + }, + { + "acc": 0.65843763, + "epoch": 1.5873921867072553, + "grad_norm": 5.03125, + "learning_rate": 1.1191715039893975e-06, + "loss": 1.55712767, + "memory(GiB)": 117.38, + "step": 62575, + "train_speed(iter/s)": 1.637256 + }, + { + "acc": 0.65831566, + "epoch": 1.5875190258751903, + "grad_norm": 6.03125, + "learning_rate": 1.1185104017127379e-06, + "loss": 1.71317406, + "memory(GiB)": 117.38, + "step": 62580, + "train_speed(iter/s)": 1.63727 + }, + { + "acc": 0.6672946, + "epoch": 1.5876458650431253, + "grad_norm": 6.34375, + "learning_rate": 1.1178494701613202e-06, + "loss": 1.63364353, + "memory(GiB)": 117.38, + "step": 62585, + "train_speed(iter/s)": 1.637282 + }, + { + "acc": 0.66725416, + "epoch": 1.5877727042110603, + "grad_norm": 5.84375, + "learning_rate": 1.1171887093642158e-06, + "loss": 1.55805531, + "memory(GiB)": 117.38, + "step": 62590, + "train_speed(iter/s)": 1.637296 + }, + { + "acc": 0.68033695, + "epoch": 1.5878995433789953, + "grad_norm": 5.09375, + "learning_rate": 1.1165281193504873e-06, + "loss": 1.53453856, + "memory(GiB)": 117.38, + "step": 62595, + "train_speed(iter/s)": 1.63731 + }, + { + "acc": 0.64932861, + "epoch": 1.5880263825469305, + "grad_norm": 6.25, + "learning_rate": 1.1158677001491902e-06, + "loss": 1.6040556, + "memory(GiB)": 117.38, + "step": 62600, + "train_speed(iter/s)": 1.637323 + }, + { + "acc": 0.64266424, + "epoch": 1.5881532217148655, + "grad_norm": 6.90625, + "learning_rate": 1.1152074517893735e-06, + "loss": 1.69743309, + "memory(GiB)": 117.38, + "step": 62605, + "train_speed(iter/s)": 1.637336 + }, + { + "acc": 0.64480076, + "epoch": 1.5882800608828007, + "grad_norm": 4.8125, + "learning_rate": 1.1145473743000773e-06, + "loss": 1.67770576, + "memory(GiB)": 117.38, + "step": 62610, + "train_speed(iter/s)": 1.63735 + }, + { + "acc": 0.66311855, + "epoch": 1.5884069000507357, + "grad_norm": 6.5625, + "learning_rate": 1.1138874677103345e-06, + "loss": 1.55446396, + "memory(GiB)": 117.38, + "step": 62615, + "train_speed(iter/s)": 1.637363 + }, + { + "acc": 0.66786499, + "epoch": 1.5885337392186707, + "grad_norm": 5.875, + "learning_rate": 1.1132277320491713e-06, + "loss": 1.55443649, + "memory(GiB)": 117.38, + "step": 62620, + "train_speed(iter/s)": 1.637377 + }, + { + "acc": 0.64248624, + "epoch": 1.5886605783866057, + "grad_norm": 5.3125, + "learning_rate": 1.1125681673456062e-06, + "loss": 1.60297012, + "memory(GiB)": 117.38, + "step": 62625, + "train_speed(iter/s)": 1.63739 + }, + { + "acc": 0.6707387, + "epoch": 1.588787417554541, + "grad_norm": 6.65625, + "learning_rate": 1.1119087736286489e-06, + "loss": 1.56099186, + "memory(GiB)": 117.38, + "step": 62630, + "train_speed(iter/s)": 1.637403 + }, + { + "acc": 0.64204235, + "epoch": 1.5889142567224759, + "grad_norm": 5.375, + "learning_rate": 1.1112495509273025e-06, + "loss": 1.65730133, + "memory(GiB)": 117.38, + "step": 62635, + "train_speed(iter/s)": 1.637417 + }, + { + "acc": 0.65478725, + "epoch": 1.589041095890411, + "grad_norm": 5.53125, + "learning_rate": 1.110590499270563e-06, + "loss": 1.57113113, + "memory(GiB)": 117.38, + "step": 62640, + "train_speed(iter/s)": 1.637432 + }, + { + "acc": 0.66281033, + "epoch": 1.589167935058346, + "grad_norm": 5.375, + "learning_rate": 1.109931618687417e-06, + "loss": 1.57591667, + "memory(GiB)": 117.38, + "step": 62645, + "train_speed(iter/s)": 1.637446 + }, + { + "acc": 0.64522371, + "epoch": 1.589294774226281, + "grad_norm": 6.40625, + "learning_rate": 1.1092729092068495e-06, + "loss": 1.659021, + "memory(GiB)": 117.38, + "step": 62650, + "train_speed(iter/s)": 1.637458 + }, + { + "acc": 0.65676346, + "epoch": 1.589421613394216, + "grad_norm": 6.15625, + "learning_rate": 1.1086143708578285e-06, + "loss": 1.67405357, + "memory(GiB)": 117.38, + "step": 62655, + "train_speed(iter/s)": 1.637471 + }, + { + "acc": 0.66228676, + "epoch": 1.589548452562151, + "grad_norm": 8.9375, + "learning_rate": 1.107956003669321e-06, + "loss": 1.5373064, + "memory(GiB)": 117.38, + "step": 62660, + "train_speed(iter/s)": 1.637484 + }, + { + "acc": 0.66487112, + "epoch": 1.5896752917300863, + "grad_norm": 6.03125, + "learning_rate": 1.107297807670284e-06, + "loss": 1.58625841, + "memory(GiB)": 117.38, + "step": 62665, + "train_speed(iter/s)": 1.637496 + }, + { + "acc": 0.65883241, + "epoch": 1.5898021308980215, + "grad_norm": 6.21875, + "learning_rate": 1.106639782889672e-06, + "loss": 1.60930405, + "memory(GiB)": 117.38, + "step": 62670, + "train_speed(iter/s)": 1.637509 + }, + { + "acc": 0.66167426, + "epoch": 1.5899289700659565, + "grad_norm": 5.59375, + "learning_rate": 1.1059819293564233e-06, + "loss": 1.54043045, + "memory(GiB)": 117.38, + "step": 62675, + "train_speed(iter/s)": 1.637524 + }, + { + "acc": 0.65661821, + "epoch": 1.5900558092338914, + "grad_norm": 5.4375, + "learning_rate": 1.105324247099474e-06, + "loss": 1.61117821, + "memory(GiB)": 117.38, + "step": 62680, + "train_speed(iter/s)": 1.637536 + }, + { + "acc": 0.65390182, + "epoch": 1.5901826484018264, + "grad_norm": 5.15625, + "learning_rate": 1.1046667361477537e-06, + "loss": 1.53831892, + "memory(GiB)": 117.38, + "step": 62685, + "train_speed(iter/s)": 1.63755 + }, + { + "acc": 0.67002668, + "epoch": 1.5903094875697614, + "grad_norm": 6.21875, + "learning_rate": 1.1040093965301835e-06, + "loss": 1.56735458, + "memory(GiB)": 117.38, + "step": 62690, + "train_speed(iter/s)": 1.637563 + }, + { + "acc": 0.6513566, + "epoch": 1.5904363267376966, + "grad_norm": 6.25, + "learning_rate": 1.1033522282756716e-06, + "loss": 1.60883369, + "memory(GiB)": 117.38, + "step": 62695, + "train_speed(iter/s)": 1.637577 + }, + { + "acc": 0.66003828, + "epoch": 1.5905631659056316, + "grad_norm": 5.625, + "learning_rate": 1.1026952314131268e-06, + "loss": 1.58817825, + "memory(GiB)": 117.38, + "step": 62700, + "train_speed(iter/s)": 1.63759 + }, + { + "acc": 0.67945404, + "epoch": 1.5906900050735668, + "grad_norm": 4.90625, + "learning_rate": 1.1020384059714463e-06, + "loss": 1.52609806, + "memory(GiB)": 117.38, + "step": 62705, + "train_speed(iter/s)": 1.637602 + }, + { + "acc": 0.67456856, + "epoch": 1.5908168442415018, + "grad_norm": 5.4375, + "learning_rate": 1.1013817519795211e-06, + "loss": 1.5038702, + "memory(GiB)": 117.38, + "step": 62710, + "train_speed(iter/s)": 1.637615 + }, + { + "acc": 0.65819688, + "epoch": 1.5909436834094368, + "grad_norm": 6.34375, + "learning_rate": 1.1007252694662302e-06, + "loss": 1.62090111, + "memory(GiB)": 117.38, + "step": 62715, + "train_speed(iter/s)": 1.637629 + }, + { + "acc": 0.67165565, + "epoch": 1.5910705225773718, + "grad_norm": 6.28125, + "learning_rate": 1.1000689584604519e-06, + "loss": 1.50161076, + "memory(GiB)": 117.38, + "step": 62720, + "train_speed(iter/s)": 1.637643 + }, + { + "acc": 0.64866056, + "epoch": 1.5911973617453068, + "grad_norm": 5.8125, + "learning_rate": 1.099412818991053e-06, + "loss": 1.63522167, + "memory(GiB)": 117.38, + "step": 62725, + "train_speed(iter/s)": 1.637656 + }, + { + "acc": 0.66707015, + "epoch": 1.591324200913242, + "grad_norm": 6.71875, + "learning_rate": 1.098756851086893e-06, + "loss": 1.56834793, + "memory(GiB)": 117.38, + "step": 62730, + "train_speed(iter/s)": 1.637668 + }, + { + "acc": 0.64237709, + "epoch": 1.5914510400811772, + "grad_norm": 5.5, + "learning_rate": 1.0981010547768244e-06, + "loss": 1.66979275, + "memory(GiB)": 117.38, + "step": 62735, + "train_speed(iter/s)": 1.637681 + }, + { + "acc": 0.65112667, + "epoch": 1.5915778792491122, + "grad_norm": 5.78125, + "learning_rate": 1.0974454300896924e-06, + "loss": 1.63156357, + "memory(GiB)": 117.38, + "step": 62740, + "train_speed(iter/s)": 1.637695 + }, + { + "acc": 0.65356369, + "epoch": 1.5917047184170472, + "grad_norm": 5.6875, + "learning_rate": 1.0967899770543344e-06, + "loss": 1.68217621, + "memory(GiB)": 117.38, + "step": 62745, + "train_speed(iter/s)": 1.637709 + }, + { + "acc": 0.65835867, + "epoch": 1.5918315575849822, + "grad_norm": 6.3125, + "learning_rate": 1.0961346956995795e-06, + "loss": 1.62671337, + "memory(GiB)": 117.38, + "step": 62750, + "train_speed(iter/s)": 1.637721 + }, + { + "acc": 0.663344, + "epoch": 1.5919583967529172, + "grad_norm": 6.40625, + "learning_rate": 1.0954795860542495e-06, + "loss": 1.54263611, + "memory(GiB)": 117.38, + "step": 62755, + "train_speed(iter/s)": 1.637736 + }, + { + "acc": 0.64407768, + "epoch": 1.5920852359208524, + "grad_norm": 5.3125, + "learning_rate": 1.0948246481471603e-06, + "loss": 1.66214676, + "memory(GiB)": 117.38, + "step": 62760, + "train_speed(iter/s)": 1.637749 + }, + { + "acc": 0.66005678, + "epoch": 1.5922120750887874, + "grad_norm": 6.6875, + "learning_rate": 1.0941698820071183e-06, + "loss": 1.54358101, + "memory(GiB)": 117.38, + "step": 62765, + "train_speed(iter/s)": 1.637762 + }, + { + "acc": 0.65918379, + "epoch": 1.5923389142567226, + "grad_norm": 5.46875, + "learning_rate": 1.0935152876629234e-06, + "loss": 1.49937801, + "memory(GiB)": 117.38, + "step": 62770, + "train_speed(iter/s)": 1.637776 + }, + { + "acc": 0.66754518, + "epoch": 1.5924657534246576, + "grad_norm": 6.46875, + "learning_rate": 1.0928608651433675e-06, + "loss": 1.55539265, + "memory(GiB)": 117.38, + "step": 62775, + "train_speed(iter/s)": 1.637789 + }, + { + "acc": 0.64254384, + "epoch": 1.5925925925925926, + "grad_norm": 6.84375, + "learning_rate": 1.0922066144772342e-06, + "loss": 1.6642046, + "memory(GiB)": 117.38, + "step": 62780, + "train_speed(iter/s)": 1.637804 + }, + { + "acc": 0.65602951, + "epoch": 1.5927194317605275, + "grad_norm": 6.375, + "learning_rate": 1.0915525356933004e-06, + "loss": 1.67031956, + "memory(GiB)": 117.38, + "step": 62785, + "train_speed(iter/s)": 1.637817 + }, + { + "acc": 0.66519742, + "epoch": 1.5928462709284628, + "grad_norm": 6.1875, + "learning_rate": 1.0908986288203382e-06, + "loss": 1.60669537, + "memory(GiB)": 117.38, + "step": 62790, + "train_speed(iter/s)": 1.63783 + }, + { + "acc": 0.66883984, + "epoch": 1.5929731100963977, + "grad_norm": 5.53125, + "learning_rate": 1.0902448938871064e-06, + "loss": 1.52913857, + "memory(GiB)": 117.38, + "step": 62795, + "train_speed(iter/s)": 1.637843 + }, + { + "acc": 0.66484003, + "epoch": 1.593099949264333, + "grad_norm": 5.03125, + "learning_rate": 1.0895913309223594e-06, + "loss": 1.58305054, + "memory(GiB)": 117.38, + "step": 62800, + "train_speed(iter/s)": 1.637856 + }, + { + "acc": 0.66887884, + "epoch": 1.593226788432268, + "grad_norm": 5.84375, + "learning_rate": 1.0889379399548432e-06, + "loss": 1.54780502, + "memory(GiB)": 117.38, + "step": 62805, + "train_speed(iter/s)": 1.637869 + }, + { + "acc": 0.64801464, + "epoch": 1.593353627600203, + "grad_norm": 6.84375, + "learning_rate": 1.0882847210133007e-06, + "loss": 1.62212696, + "memory(GiB)": 117.38, + "step": 62810, + "train_speed(iter/s)": 1.637882 + }, + { + "acc": 0.66171775, + "epoch": 1.593480466768138, + "grad_norm": 4.96875, + "learning_rate": 1.0876316741264598e-06, + "loss": 1.59099846, + "memory(GiB)": 117.38, + "step": 62815, + "train_speed(iter/s)": 1.637895 + }, + { + "acc": 0.65383253, + "epoch": 1.593607305936073, + "grad_norm": 5.0, + "learning_rate": 1.0869787993230435e-06, + "loss": 1.66024075, + "memory(GiB)": 117.38, + "step": 62820, + "train_speed(iter/s)": 1.637908 + }, + { + "acc": 0.66324282, + "epoch": 1.5937341451040081, + "grad_norm": 5.40625, + "learning_rate": 1.0863260966317713e-06, + "loss": 1.57417755, + "memory(GiB)": 117.38, + "step": 62825, + "train_speed(iter/s)": 1.637922 + }, + { + "acc": 0.65766764, + "epoch": 1.5938609842719433, + "grad_norm": 4.90625, + "learning_rate": 1.0856735660813523e-06, + "loss": 1.55127659, + "memory(GiB)": 117.38, + "step": 62830, + "train_speed(iter/s)": 1.637935 + }, + { + "acc": 0.65404606, + "epoch": 1.5939878234398783, + "grad_norm": 5.09375, + "learning_rate": 1.0850212077004845e-06, + "loss": 1.6048912, + "memory(GiB)": 117.38, + "step": 62835, + "train_speed(iter/s)": 1.637948 + }, + { + "acc": 0.66554499, + "epoch": 1.5941146626078133, + "grad_norm": 5.5625, + "learning_rate": 1.084369021517862e-06, + "loss": 1.57158184, + "memory(GiB)": 117.38, + "step": 62840, + "train_speed(iter/s)": 1.637962 + }, + { + "acc": 0.66656065, + "epoch": 1.5942415017757483, + "grad_norm": 5.625, + "learning_rate": 1.0837170075621733e-06, + "loss": 1.56499786, + "memory(GiB)": 117.38, + "step": 62845, + "train_speed(iter/s)": 1.637976 + }, + { + "acc": 0.66513052, + "epoch": 1.5943683409436833, + "grad_norm": 5.6875, + "learning_rate": 1.0830651658620966e-06, + "loss": 1.56380625, + "memory(GiB)": 117.38, + "step": 62850, + "train_speed(iter/s)": 1.637989 + }, + { + "acc": 0.68336992, + "epoch": 1.5944951801116185, + "grad_norm": 4.96875, + "learning_rate": 1.0824134964462996e-06, + "loss": 1.49707603, + "memory(GiB)": 117.38, + "step": 62855, + "train_speed(iter/s)": 1.638003 + }, + { + "acc": 0.66165628, + "epoch": 1.5946220192795535, + "grad_norm": 5.03125, + "learning_rate": 1.0817619993434486e-06, + "loss": 1.5601078, + "memory(GiB)": 117.38, + "step": 62860, + "train_speed(iter/s)": 1.638017 + }, + { + "acc": 0.64193153, + "epoch": 1.5947488584474887, + "grad_norm": 5.28125, + "learning_rate": 1.0811106745821987e-06, + "loss": 1.6751091, + "memory(GiB)": 117.38, + "step": 62865, + "train_speed(iter/s)": 1.638031 + }, + { + "acc": 0.66296811, + "epoch": 1.5948756976154237, + "grad_norm": 5.375, + "learning_rate": 1.0804595221911978e-06, + "loss": 1.60402641, + "memory(GiB)": 117.38, + "step": 62870, + "train_speed(iter/s)": 1.638045 + }, + { + "acc": 0.65604777, + "epoch": 1.5950025367833587, + "grad_norm": 5.78125, + "learning_rate": 1.0798085421990867e-06, + "loss": 1.59422054, + "memory(GiB)": 117.38, + "step": 62875, + "train_speed(iter/s)": 1.638059 + }, + { + "acc": 0.67371092, + "epoch": 1.5951293759512937, + "grad_norm": 6.3125, + "learning_rate": 1.079157734634499e-06, + "loss": 1.55721741, + "memory(GiB)": 117.38, + "step": 62880, + "train_speed(iter/s)": 1.638074 + }, + { + "acc": 0.65964994, + "epoch": 1.5952562151192287, + "grad_norm": 4.28125, + "learning_rate": 1.0785070995260582e-06, + "loss": 1.63469543, + "memory(GiB)": 117.38, + "step": 62885, + "train_speed(iter/s)": 1.638087 + }, + { + "acc": 0.67139902, + "epoch": 1.5953830542871639, + "grad_norm": 5.1875, + "learning_rate": 1.0778566369023841e-06, + "loss": 1.56823635, + "memory(GiB)": 117.38, + "step": 62890, + "train_speed(iter/s)": 1.6381 + }, + { + "acc": 0.6539577, + "epoch": 1.595509893455099, + "grad_norm": 5.125, + "learning_rate": 1.0772063467920863e-06, + "loss": 1.59358273, + "memory(GiB)": 117.38, + "step": 62895, + "train_speed(iter/s)": 1.638113 + }, + { + "acc": 0.65581532, + "epoch": 1.595636732623034, + "grad_norm": 5.75, + "learning_rate": 1.076556229223767e-06, + "loss": 1.61210937, + "memory(GiB)": 117.38, + "step": 62900, + "train_speed(iter/s)": 1.638127 + }, + { + "acc": 0.67634935, + "epoch": 1.595763571790969, + "grad_norm": 5.90625, + "learning_rate": 1.0759062842260221e-06, + "loss": 1.54376431, + "memory(GiB)": 117.38, + "step": 62905, + "train_speed(iter/s)": 1.63814 + }, + { + "acc": 0.66490984, + "epoch": 1.595890410958904, + "grad_norm": 6.15625, + "learning_rate": 1.0752565118274383e-06, + "loss": 1.62718925, + "memory(GiB)": 117.38, + "step": 62910, + "train_speed(iter/s)": 1.638154 + }, + { + "acc": 0.66052189, + "epoch": 1.596017250126839, + "grad_norm": 8.3125, + "learning_rate": 1.0746069120565961e-06, + "loss": 1.61135654, + "memory(GiB)": 117.38, + "step": 62915, + "train_speed(iter/s)": 1.638166 + }, + { + "acc": 0.67061701, + "epoch": 1.5961440892947742, + "grad_norm": 5.59375, + "learning_rate": 1.073957484942067e-06, + "loss": 1.55945606, + "memory(GiB)": 117.38, + "step": 62920, + "train_speed(iter/s)": 1.638179 + }, + { + "acc": 0.65347948, + "epoch": 1.5962709284627092, + "grad_norm": 5.1875, + "learning_rate": 1.0733082305124166e-06, + "loss": 1.58091908, + "memory(GiB)": 117.38, + "step": 62925, + "train_speed(iter/s)": 1.638193 + }, + { + "acc": 0.65152078, + "epoch": 1.5963977676306444, + "grad_norm": 6.40625, + "learning_rate": 1.0726591487962018e-06, + "loss": 1.61406746, + "memory(GiB)": 117.38, + "step": 62930, + "train_speed(iter/s)": 1.638207 + }, + { + "acc": 0.65219269, + "epoch": 1.5965246067985794, + "grad_norm": 6.0625, + "learning_rate": 1.0720102398219716e-06, + "loss": 1.61286201, + "memory(GiB)": 117.38, + "step": 62935, + "train_speed(iter/s)": 1.638222 + }, + { + "acc": 0.6677989, + "epoch": 1.5966514459665144, + "grad_norm": 6.0625, + "learning_rate": 1.0713615036182684e-06, + "loss": 1.56803465, + "memory(GiB)": 117.38, + "step": 62940, + "train_speed(iter/s)": 1.638236 + }, + { + "acc": 0.66225033, + "epoch": 1.5967782851344494, + "grad_norm": 7.0625, + "learning_rate": 1.0707129402136252e-06, + "loss": 1.59018974, + "memory(GiB)": 117.38, + "step": 62945, + "train_speed(iter/s)": 1.638248 + }, + { + "acc": 0.65961542, + "epoch": 1.5969051243023846, + "grad_norm": 6.40625, + "learning_rate": 1.0700645496365725e-06, + "loss": 1.57412443, + "memory(GiB)": 117.38, + "step": 62950, + "train_speed(iter/s)": 1.638262 + }, + { + "acc": 0.65433278, + "epoch": 1.5970319634703196, + "grad_norm": 5.28125, + "learning_rate": 1.0694163319156254e-06, + "loss": 1.58099747, + "memory(GiB)": 117.38, + "step": 62955, + "train_speed(iter/s)": 1.638275 + }, + { + "acc": 0.66535811, + "epoch": 1.5971588026382548, + "grad_norm": 5.625, + "learning_rate": 1.0687682870792953e-06, + "loss": 1.55464916, + "memory(GiB)": 117.38, + "step": 62960, + "train_speed(iter/s)": 1.638288 + }, + { + "acc": 0.64707308, + "epoch": 1.5972856418061898, + "grad_norm": 7.65625, + "learning_rate": 1.0681204151560891e-06, + "loss": 1.60858459, + "memory(GiB)": 117.38, + "step": 62965, + "train_speed(iter/s)": 1.638302 + }, + { + "acc": 0.64534931, + "epoch": 1.5974124809741248, + "grad_norm": 5.625, + "learning_rate": 1.067472716174503e-06, + "loss": 1.64561844, + "memory(GiB)": 117.38, + "step": 62970, + "train_speed(iter/s)": 1.638315 + }, + { + "acc": 0.65165234, + "epoch": 1.5975393201420598, + "grad_norm": 5.9375, + "learning_rate": 1.0668251901630227e-06, + "loss": 1.6185854, + "memory(GiB)": 117.38, + "step": 62975, + "train_speed(iter/s)": 1.638328 + }, + { + "acc": 0.67528396, + "epoch": 1.5976661593099948, + "grad_norm": 6.65625, + "learning_rate": 1.0661778371501303e-06, + "loss": 1.55878162, + "memory(GiB)": 117.38, + "step": 62980, + "train_speed(iter/s)": 1.638342 + }, + { + "acc": 0.67354817, + "epoch": 1.59779299847793, + "grad_norm": 6.84375, + "learning_rate": 1.0655306571643004e-06, + "loss": 1.61317043, + "memory(GiB)": 117.38, + "step": 62985, + "train_speed(iter/s)": 1.638355 + }, + { + "acc": 0.63953276, + "epoch": 1.5979198376458652, + "grad_norm": 5.9375, + "learning_rate": 1.0648836502339998e-06, + "loss": 1.66579113, + "memory(GiB)": 117.38, + "step": 62990, + "train_speed(iter/s)": 1.638369 + }, + { + "acc": 0.65304585, + "epoch": 1.5980466768138002, + "grad_norm": 5.90625, + "learning_rate": 1.0642368163876832e-06, + "loss": 1.58958511, + "memory(GiB)": 117.38, + "step": 62995, + "train_speed(iter/s)": 1.638382 + }, + { + "acc": 0.64993043, + "epoch": 1.5981735159817352, + "grad_norm": 5.21875, + "learning_rate": 1.0635901556538042e-06, + "loss": 1.58726244, + "memory(GiB)": 117.38, + "step": 63000, + "train_speed(iter/s)": 1.638394 + }, + { + "epoch": 1.5981735159817352, + "eval_acc": 0.6463198825536642, + "eval_loss": 1.5732152462005615, + "eval_runtime": 58.4796, + "eval_samples_per_second": 108.927, + "eval_steps_per_second": 27.24, + "step": 63000 + }, + { + "acc": 0.67321014, + "epoch": 1.5983003551496702, + "grad_norm": 5.1875, + "learning_rate": 1.0629436680608051e-06, + "loss": 1.52636433, + "memory(GiB)": 117.38, + "step": 63005, + "train_speed(iter/s)": 1.635737 + }, + { + "acc": 0.66551385, + "epoch": 1.5984271943176052, + "grad_norm": 5.375, + "learning_rate": 1.0622973536371223e-06, + "loss": 1.53802767, + "memory(GiB)": 117.38, + "step": 63010, + "train_speed(iter/s)": 1.63575 + }, + { + "acc": 0.66202545, + "epoch": 1.5985540334855404, + "grad_norm": 5.84375, + "learning_rate": 1.06165121241118e-06, + "loss": 1.53575535, + "memory(GiB)": 117.38, + "step": 63015, + "train_speed(iter/s)": 1.635764 + }, + { + "acc": 0.65583754, + "epoch": 1.5986808726534754, + "grad_norm": 6.25, + "learning_rate": 1.0610052444114023e-06, + "loss": 1.59028873, + "memory(GiB)": 117.38, + "step": 63020, + "train_speed(iter/s)": 1.635778 + }, + { + "acc": 0.64928775, + "epoch": 1.5988077118214106, + "grad_norm": 6.375, + "learning_rate": 1.0603594496662001e-06, + "loss": 1.64033756, + "memory(GiB)": 117.38, + "step": 63025, + "train_speed(iter/s)": 1.635791 + }, + { + "acc": 0.6517415, + "epoch": 1.5989345509893456, + "grad_norm": 6.1875, + "learning_rate": 1.0597138282039786e-06, + "loss": 1.55655165, + "memory(GiB)": 117.38, + "step": 63030, + "train_speed(iter/s)": 1.635804 + }, + { + "acc": 0.66357374, + "epoch": 1.5990613901572805, + "grad_norm": 5.5625, + "learning_rate": 1.0590683800531348e-06, + "loss": 1.53634586, + "memory(GiB)": 117.38, + "step": 63035, + "train_speed(iter/s)": 1.635818 + }, + { + "acc": 0.66937804, + "epoch": 1.5991882293252155, + "grad_norm": 5.53125, + "learning_rate": 1.058423105242059e-06, + "loss": 1.56521683, + "memory(GiB)": 117.38, + "step": 63040, + "train_speed(iter/s)": 1.635831 + }, + { + "acc": 0.66186819, + "epoch": 1.5993150684931505, + "grad_norm": 5.90625, + "learning_rate": 1.057778003799133e-06, + "loss": 1.59319096, + "memory(GiB)": 117.38, + "step": 63045, + "train_speed(iter/s)": 1.635843 + }, + { + "acc": 0.65413027, + "epoch": 1.5994419076610857, + "grad_norm": 5.875, + "learning_rate": 1.0571330757527309e-06, + "loss": 1.63452091, + "memory(GiB)": 117.38, + "step": 63050, + "train_speed(iter/s)": 1.635856 + }, + { + "acc": 0.64303432, + "epoch": 1.599568746829021, + "grad_norm": 5.8125, + "learning_rate": 1.0564883211312199e-06, + "loss": 1.68220978, + "memory(GiB)": 117.38, + "step": 63055, + "train_speed(iter/s)": 1.63587 + }, + { + "acc": 0.65824366, + "epoch": 1.599695585996956, + "grad_norm": 6.28125, + "learning_rate": 1.0558437399629584e-06, + "loss": 1.58481245, + "memory(GiB)": 117.38, + "step": 63060, + "train_speed(iter/s)": 1.635884 + }, + { + "acc": 0.67312307, + "epoch": 1.599822425164891, + "grad_norm": 8.9375, + "learning_rate": 1.0551993322762994e-06, + "loss": 1.59917183, + "memory(GiB)": 117.38, + "step": 63065, + "train_speed(iter/s)": 1.635897 + }, + { + "acc": 0.65208092, + "epoch": 1.599949264332826, + "grad_norm": 6.28125, + "learning_rate": 1.0545550980995857e-06, + "loss": 1.56832161, + "memory(GiB)": 117.38, + "step": 63070, + "train_speed(iter/s)": 1.63591 + }, + { + "acc": 0.67117381, + "epoch": 1.600076103500761, + "grad_norm": 5.96875, + "learning_rate": 1.0539110374611538e-06, + "loss": 1.56425953, + "memory(GiB)": 117.38, + "step": 63075, + "train_speed(iter/s)": 1.635923 + }, + { + "acc": 0.66582322, + "epoch": 1.6002029426686961, + "grad_norm": 5.84375, + "learning_rate": 1.0532671503893328e-06, + "loss": 1.51314793, + "memory(GiB)": 117.38, + "step": 63080, + "train_speed(iter/s)": 1.635936 + }, + { + "acc": 0.65818939, + "epoch": 1.600329781836631, + "grad_norm": 7.0625, + "learning_rate": 1.052623436912442e-06, + "loss": 1.55100384, + "memory(GiB)": 117.38, + "step": 63085, + "train_speed(iter/s)": 1.63595 + }, + { + "acc": 0.65439148, + "epoch": 1.6004566210045663, + "grad_norm": 5.78125, + "learning_rate": 1.0519798970587992e-06, + "loss": 1.58639488, + "memory(GiB)": 117.38, + "step": 63090, + "train_speed(iter/s)": 1.635963 + }, + { + "acc": 0.64888353, + "epoch": 1.6005834601725013, + "grad_norm": 5.1875, + "learning_rate": 1.0513365308567054e-06, + "loss": 1.61622467, + "memory(GiB)": 117.38, + "step": 63095, + "train_speed(iter/s)": 1.635978 + }, + { + "acc": 0.64919519, + "epoch": 1.6007102993404363, + "grad_norm": 4.9375, + "learning_rate": 1.0506933383344602e-06, + "loss": 1.64282627, + "memory(GiB)": 117.38, + "step": 63100, + "train_speed(iter/s)": 1.635991 + }, + { + "acc": 0.65175028, + "epoch": 1.6008371385083713, + "grad_norm": 5.15625, + "learning_rate": 1.0500503195203537e-06, + "loss": 1.62532806, + "memory(GiB)": 117.38, + "step": 63105, + "train_speed(iter/s)": 1.636002 + }, + { + "acc": 0.65444393, + "epoch": 1.6009639776763065, + "grad_norm": 5.1875, + "learning_rate": 1.049407474442672e-06, + "loss": 1.62911377, + "memory(GiB)": 117.38, + "step": 63110, + "train_speed(iter/s)": 1.636016 + }, + { + "acc": 0.64304824, + "epoch": 1.6010908168442415, + "grad_norm": 5.59375, + "learning_rate": 1.048764803129686e-06, + "loss": 1.64158211, + "memory(GiB)": 117.38, + "step": 63115, + "train_speed(iter/s)": 1.636028 + }, + { + "acc": 0.67096558, + "epoch": 1.6012176560121767, + "grad_norm": 6.28125, + "learning_rate": 1.0481223056096635e-06, + "loss": 1.54007549, + "memory(GiB)": 117.38, + "step": 63120, + "train_speed(iter/s)": 1.636042 + }, + { + "acc": 0.65090814, + "epoch": 1.6013444951801117, + "grad_norm": 5.875, + "learning_rate": 1.0474799819108677e-06, + "loss": 1.61761513, + "memory(GiB)": 117.38, + "step": 63125, + "train_speed(iter/s)": 1.636054 + }, + { + "acc": 0.65764327, + "epoch": 1.6014713343480467, + "grad_norm": 6.4375, + "learning_rate": 1.0468378320615502e-06, + "loss": 1.57159176, + "memory(GiB)": 117.38, + "step": 63130, + "train_speed(iter/s)": 1.636068 + }, + { + "acc": 0.65537605, + "epoch": 1.6015981735159817, + "grad_norm": 5.5, + "learning_rate": 1.0461958560899516e-06, + "loss": 1.60358562, + "memory(GiB)": 117.38, + "step": 63135, + "train_speed(iter/s)": 1.636081 + }, + { + "acc": 0.66216707, + "epoch": 1.6017250126839166, + "grad_norm": 5.875, + "learning_rate": 1.045554054024313e-06, + "loss": 1.53025026, + "memory(GiB)": 117.38, + "step": 63140, + "train_speed(iter/s)": 1.636095 + }, + { + "acc": 0.65361862, + "epoch": 1.6018518518518519, + "grad_norm": 5.6875, + "learning_rate": 1.0449124258928627e-06, + "loss": 1.59925518, + "memory(GiB)": 117.38, + "step": 63145, + "train_speed(iter/s)": 1.636109 + }, + { + "acc": 0.65425138, + "epoch": 1.601978691019787, + "grad_norm": 6.125, + "learning_rate": 1.044270971723823e-06, + "loss": 1.58673649, + "memory(GiB)": 117.38, + "step": 63150, + "train_speed(iter/s)": 1.636122 + }, + { + "acc": 0.65936747, + "epoch": 1.602105530187722, + "grad_norm": 6.875, + "learning_rate": 1.0436296915454048e-06, + "loss": 1.60257912, + "memory(GiB)": 117.38, + "step": 63155, + "train_speed(iter/s)": 1.636135 + }, + { + "acc": 0.64741063, + "epoch": 1.602232369355657, + "grad_norm": 6.25, + "learning_rate": 1.042988585385818e-06, + "loss": 1.65389118, + "memory(GiB)": 117.38, + "step": 63160, + "train_speed(iter/s)": 1.636149 + }, + { + "acc": 0.66876011, + "epoch": 1.602359208523592, + "grad_norm": 5.9375, + "learning_rate": 1.0423476532732602e-06, + "loss": 1.5550148, + "memory(GiB)": 117.38, + "step": 63165, + "train_speed(iter/s)": 1.636162 + }, + { + "acc": 0.65169382, + "epoch": 1.602486047691527, + "grad_norm": 7.375, + "learning_rate": 1.0417068952359216e-06, + "loss": 1.58853741, + "memory(GiB)": 117.38, + "step": 63170, + "train_speed(iter/s)": 1.636176 + }, + { + "acc": 0.68113117, + "epoch": 1.6026128868594622, + "grad_norm": 4.65625, + "learning_rate": 1.0410663113019869e-06, + "loss": 1.53986454, + "memory(GiB)": 117.38, + "step": 63175, + "train_speed(iter/s)": 1.636191 + }, + { + "acc": 0.66872358, + "epoch": 1.6027397260273972, + "grad_norm": 5.53125, + "learning_rate": 1.040425901499631e-06, + "loss": 1.57916679, + "memory(GiB)": 117.38, + "step": 63180, + "train_speed(iter/s)": 1.636204 + }, + { + "acc": 0.67065878, + "epoch": 1.6028665651953324, + "grad_norm": 6.375, + "learning_rate": 1.0397856658570216e-06, + "loss": 1.52845459, + "memory(GiB)": 117.38, + "step": 63185, + "train_speed(iter/s)": 1.636218 + }, + { + "acc": 0.65603676, + "epoch": 1.6029934043632674, + "grad_norm": 6.875, + "learning_rate": 1.03914560440232e-06, + "loss": 1.60039139, + "memory(GiB)": 117.38, + "step": 63190, + "train_speed(iter/s)": 1.636232 + }, + { + "acc": 0.64850883, + "epoch": 1.6031202435312024, + "grad_norm": 5.84375, + "learning_rate": 1.0385057171636791e-06, + "loss": 1.64557247, + "memory(GiB)": 117.38, + "step": 63195, + "train_speed(iter/s)": 1.636245 + }, + { + "acc": 0.67557688, + "epoch": 1.6032470826991374, + "grad_norm": 5.71875, + "learning_rate": 1.0378660041692435e-06, + "loss": 1.58251038, + "memory(GiB)": 117.38, + "step": 63200, + "train_speed(iter/s)": 1.63626 + }, + { + "acc": 0.66818404, + "epoch": 1.6033739218670724, + "grad_norm": 5.03125, + "learning_rate": 1.0372264654471504e-06, + "loss": 1.54091148, + "memory(GiB)": 117.38, + "step": 63205, + "train_speed(iter/s)": 1.636273 + }, + { + "acc": 0.66613693, + "epoch": 1.6035007610350076, + "grad_norm": 5.71875, + "learning_rate": 1.0365871010255303e-06, + "loss": 1.61775627, + "memory(GiB)": 117.38, + "step": 63210, + "train_speed(iter/s)": 1.636286 + }, + { + "acc": 0.65165963, + "epoch": 1.6036276002029428, + "grad_norm": 4.90625, + "learning_rate": 1.0359479109325043e-06, + "loss": 1.61288433, + "memory(GiB)": 117.38, + "step": 63215, + "train_speed(iter/s)": 1.6363 + }, + { + "acc": 0.67193794, + "epoch": 1.6037544393708778, + "grad_norm": 6.375, + "learning_rate": 1.0353088951961877e-06, + "loss": 1.5572052, + "memory(GiB)": 117.38, + "step": 63220, + "train_speed(iter/s)": 1.636313 + }, + { + "acc": 0.66676292, + "epoch": 1.6038812785388128, + "grad_norm": 5.8125, + "learning_rate": 1.0346700538446853e-06, + "loss": 1.55466948, + "memory(GiB)": 117.38, + "step": 63225, + "train_speed(iter/s)": 1.636327 + }, + { + "acc": 0.66083107, + "epoch": 1.6040081177067478, + "grad_norm": 5.34375, + "learning_rate": 1.034031386906101e-06, + "loss": 1.55668364, + "memory(GiB)": 117.38, + "step": 63230, + "train_speed(iter/s)": 1.636341 + }, + { + "acc": 0.65881033, + "epoch": 1.6041349568746828, + "grad_norm": 4.40625, + "learning_rate": 1.0333928944085214e-06, + "loss": 1.57668314, + "memory(GiB)": 117.38, + "step": 63235, + "train_speed(iter/s)": 1.636354 + }, + { + "acc": 0.65985451, + "epoch": 1.604261796042618, + "grad_norm": 6.21875, + "learning_rate": 1.0327545763800322e-06, + "loss": 1.60206566, + "memory(GiB)": 117.38, + "step": 63240, + "train_speed(iter/s)": 1.636368 + }, + { + "acc": 0.66663475, + "epoch": 1.604388635210553, + "grad_norm": 12.8125, + "learning_rate": 1.0321164328487077e-06, + "loss": 1.5337532, + "memory(GiB)": 117.38, + "step": 63245, + "train_speed(iter/s)": 1.636381 + }, + { + "acc": 0.64081488, + "epoch": 1.6045154743784882, + "grad_norm": 5.46875, + "learning_rate": 1.0314784638426207e-06, + "loss": 1.62483768, + "memory(GiB)": 117.38, + "step": 63250, + "train_speed(iter/s)": 1.636394 + }, + { + "acc": 0.65010276, + "epoch": 1.6046423135464232, + "grad_norm": 5.5, + "learning_rate": 1.030840669389827e-06, + "loss": 1.60685387, + "memory(GiB)": 117.38, + "step": 63255, + "train_speed(iter/s)": 1.636408 + }, + { + "acc": 0.65588932, + "epoch": 1.6047691527143582, + "grad_norm": 6.34375, + "learning_rate": 1.0302030495183812e-06, + "loss": 1.61661301, + "memory(GiB)": 117.38, + "step": 63260, + "train_speed(iter/s)": 1.636421 + }, + { + "acc": 0.65224919, + "epoch": 1.6048959918822931, + "grad_norm": 6.15625, + "learning_rate": 1.0295656042563302e-06, + "loss": 1.58130646, + "memory(GiB)": 117.38, + "step": 63265, + "train_speed(iter/s)": 1.636433 + }, + { + "acc": 0.65845861, + "epoch": 1.6050228310502284, + "grad_norm": 5.34375, + "learning_rate": 1.0289283336317119e-06, + "loss": 1.59415398, + "memory(GiB)": 117.38, + "step": 63270, + "train_speed(iter/s)": 1.636448 + }, + { + "acc": 0.66230006, + "epoch": 1.6051496702181633, + "grad_norm": 5.84375, + "learning_rate": 1.0282912376725535e-06, + "loss": 1.62725029, + "memory(GiB)": 117.38, + "step": 63275, + "train_speed(iter/s)": 1.636463 + }, + { + "acc": 0.65525117, + "epoch": 1.6052765093860986, + "grad_norm": 5.71875, + "learning_rate": 1.0276543164068776e-06, + "loss": 1.63565178, + "memory(GiB)": 117.38, + "step": 63280, + "train_speed(iter/s)": 1.636477 + }, + { + "acc": 0.66614847, + "epoch": 1.6054033485540335, + "grad_norm": 6.40625, + "learning_rate": 1.0270175698627015e-06, + "loss": 1.54426994, + "memory(GiB)": 117.38, + "step": 63285, + "train_speed(iter/s)": 1.636491 + }, + { + "acc": 0.65716567, + "epoch": 1.6055301877219685, + "grad_norm": 5.6875, + "learning_rate": 1.0263809980680323e-06, + "loss": 1.57237215, + "memory(GiB)": 117.38, + "step": 63290, + "train_speed(iter/s)": 1.636504 + }, + { + "acc": 0.65186343, + "epoch": 1.6056570268899035, + "grad_norm": 6.4375, + "learning_rate": 1.0257446010508648e-06, + "loss": 1.56913452, + "memory(GiB)": 117.38, + "step": 63295, + "train_speed(iter/s)": 1.636519 + }, + { + "acc": 0.64145284, + "epoch": 1.6057838660578385, + "grad_norm": 5.46875, + "learning_rate": 1.0251083788391952e-06, + "loss": 1.75254288, + "memory(GiB)": 117.38, + "step": 63300, + "train_speed(iter/s)": 1.636533 + }, + { + "acc": 0.64904003, + "epoch": 1.6059107052257737, + "grad_norm": 6.84375, + "learning_rate": 1.0244723314610055e-06, + "loss": 1.60689201, + "memory(GiB)": 117.38, + "step": 63305, + "train_speed(iter/s)": 1.636547 + }, + { + "acc": 0.65746627, + "epoch": 1.606037544393709, + "grad_norm": 5.0, + "learning_rate": 1.0238364589442723e-06, + "loss": 1.67799797, + "memory(GiB)": 117.38, + "step": 63310, + "train_speed(iter/s)": 1.63656 + }, + { + "acc": 0.66065416, + "epoch": 1.606164383561644, + "grad_norm": 5.65625, + "learning_rate": 1.0232007613169637e-06, + "loss": 1.62901859, + "memory(GiB)": 117.38, + "step": 63315, + "train_speed(iter/s)": 1.636575 + }, + { + "acc": 0.66691303, + "epoch": 1.606291222729579, + "grad_norm": 7.625, + "learning_rate": 1.0225652386070406e-06, + "loss": 1.51083708, + "memory(GiB)": 117.38, + "step": 63320, + "train_speed(iter/s)": 1.636587 + }, + { + "acc": 0.65998182, + "epoch": 1.606418061897514, + "grad_norm": 6.59375, + "learning_rate": 1.0219298908424568e-06, + "loss": 1.60301971, + "memory(GiB)": 117.38, + "step": 63325, + "train_speed(iter/s)": 1.636601 + }, + { + "acc": 0.65262203, + "epoch": 1.606544901065449, + "grad_norm": 7.28125, + "learning_rate": 1.0212947180511567e-06, + "loss": 1.6119339, + "memory(GiB)": 117.38, + "step": 63330, + "train_speed(iter/s)": 1.636615 + }, + { + "acc": 0.6560483, + "epoch": 1.606671740233384, + "grad_norm": 6.09375, + "learning_rate": 1.020659720261079e-06, + "loss": 1.63825474, + "memory(GiB)": 117.38, + "step": 63335, + "train_speed(iter/s)": 1.636629 + }, + { + "acc": 0.66190338, + "epoch": 1.606798579401319, + "grad_norm": 5.46875, + "learning_rate": 1.020024897500153e-06, + "loss": 1.58186569, + "memory(GiB)": 117.38, + "step": 63340, + "train_speed(iter/s)": 1.636643 + }, + { + "acc": 0.66282234, + "epoch": 1.6069254185692543, + "grad_norm": 5.625, + "learning_rate": 1.0193902497963014e-06, + "loss": 1.55005302, + "memory(GiB)": 117.38, + "step": 63345, + "train_speed(iter/s)": 1.636657 + }, + { + "acc": 0.66492939, + "epoch": 1.6070522577371893, + "grad_norm": 7.4375, + "learning_rate": 1.0187557771774387e-06, + "loss": 1.61549568, + "memory(GiB)": 117.38, + "step": 63350, + "train_speed(iter/s)": 1.636671 + }, + { + "acc": 0.65569873, + "epoch": 1.6071790969051243, + "grad_norm": 5.84375, + "learning_rate": 1.0181214796714717e-06, + "loss": 1.66181927, + "memory(GiB)": 117.38, + "step": 63355, + "train_speed(iter/s)": 1.636684 + }, + { + "acc": 0.65417132, + "epoch": 1.6073059360730593, + "grad_norm": 5.0, + "learning_rate": 1.0174873573062998e-06, + "loss": 1.67353935, + "memory(GiB)": 117.38, + "step": 63360, + "train_speed(iter/s)": 1.636697 + }, + { + "acc": 0.66234956, + "epoch": 1.6074327752409943, + "grad_norm": 6.3125, + "learning_rate": 1.0168534101098148e-06, + "loss": 1.59113045, + "memory(GiB)": 117.38, + "step": 63365, + "train_speed(iter/s)": 1.636711 + }, + { + "acc": 0.6557631, + "epoch": 1.6075596144089295, + "grad_norm": 5.40625, + "learning_rate": 1.0162196381099004e-06, + "loss": 1.61100235, + "memory(GiB)": 117.38, + "step": 63370, + "train_speed(iter/s)": 1.636725 + }, + { + "acc": 0.66415567, + "epoch": 1.6076864535768647, + "grad_norm": 6.09375, + "learning_rate": 1.0155860413344327e-06, + "loss": 1.57860041, + "memory(GiB)": 117.38, + "step": 63375, + "train_speed(iter/s)": 1.636738 + }, + { + "acc": 0.66326399, + "epoch": 1.6078132927447997, + "grad_norm": 6.53125, + "learning_rate": 1.0149526198112797e-06, + "loss": 1.55059662, + "memory(GiB)": 117.38, + "step": 63380, + "train_speed(iter/s)": 1.636751 + }, + { + "acc": 0.66914272, + "epoch": 1.6079401319127347, + "grad_norm": 5.3125, + "learning_rate": 1.0143193735683016e-06, + "loss": 1.58386917, + "memory(GiB)": 117.38, + "step": 63385, + "train_speed(iter/s)": 1.636764 + }, + { + "acc": 0.6591526, + "epoch": 1.6080669710806696, + "grad_norm": 6.40625, + "learning_rate": 1.0136863026333543e-06, + "loss": 1.60580235, + "memory(GiB)": 117.38, + "step": 63390, + "train_speed(iter/s)": 1.636778 + }, + { + "acc": 0.65191479, + "epoch": 1.6081938102486046, + "grad_norm": 5.65625, + "learning_rate": 1.0130534070342802e-06, + "loss": 1.62703438, + "memory(GiB)": 117.38, + "step": 63395, + "train_speed(iter/s)": 1.636791 + }, + { + "acc": 0.66894474, + "epoch": 1.6083206494165398, + "grad_norm": 6.09375, + "learning_rate": 1.0124206867989157e-06, + "loss": 1.61279545, + "memory(GiB)": 117.38, + "step": 63400, + "train_speed(iter/s)": 1.636804 + }, + { + "acc": 0.64818106, + "epoch": 1.6084474885844748, + "grad_norm": 5.1875, + "learning_rate": 1.0117881419550945e-06, + "loss": 1.62728996, + "memory(GiB)": 117.38, + "step": 63405, + "train_speed(iter/s)": 1.636818 + }, + { + "acc": 0.66701317, + "epoch": 1.60857432775241, + "grad_norm": 6.125, + "learning_rate": 1.0111557725306382e-06, + "loss": 1.57233353, + "memory(GiB)": 117.38, + "step": 63410, + "train_speed(iter/s)": 1.63683 + }, + { + "acc": 0.63460493, + "epoch": 1.608701166920345, + "grad_norm": 6.4375, + "learning_rate": 1.0105235785533585e-06, + "loss": 1.6997776, + "memory(GiB)": 117.38, + "step": 63415, + "train_speed(iter/s)": 1.636844 + }, + { + "acc": 0.64710674, + "epoch": 1.60882800608828, + "grad_norm": 5.8125, + "learning_rate": 1.0098915600510623e-06, + "loss": 1.64802704, + "memory(GiB)": 117.38, + "step": 63420, + "train_speed(iter/s)": 1.636858 + }, + { + "acc": 0.66063366, + "epoch": 1.608954845256215, + "grad_norm": 9.5, + "learning_rate": 1.0092597170515512e-06, + "loss": 1.55857086, + "memory(GiB)": 117.38, + "step": 63425, + "train_speed(iter/s)": 1.636872 + }, + { + "acc": 0.65296106, + "epoch": 1.6090816844241502, + "grad_norm": 5.25, + "learning_rate": 1.0086280495826161e-06, + "loss": 1.61271439, + "memory(GiB)": 117.38, + "step": 63430, + "train_speed(iter/s)": 1.636885 + }, + { + "acc": 0.66210623, + "epoch": 1.6092085235920852, + "grad_norm": 5.125, + "learning_rate": 1.0079965576720375e-06, + "loss": 1.54774313, + "memory(GiB)": 117.38, + "step": 63435, + "train_speed(iter/s)": 1.636898 + }, + { + "acc": 0.66705546, + "epoch": 1.6093353627600204, + "grad_norm": 6.125, + "learning_rate": 1.0073652413475936e-06, + "loss": 1.57116737, + "memory(GiB)": 117.38, + "step": 63440, + "train_speed(iter/s)": 1.636911 + }, + { + "acc": 0.67672005, + "epoch": 1.6094622019279554, + "grad_norm": 7.15625, + "learning_rate": 1.0067341006370535e-06, + "loss": 1.5560276, + "memory(GiB)": 117.38, + "step": 63445, + "train_speed(iter/s)": 1.636923 + }, + { + "acc": 0.64681377, + "epoch": 1.6095890410958904, + "grad_norm": 5.53125, + "learning_rate": 1.0061031355681766e-06, + "loss": 1.55814152, + "memory(GiB)": 117.38, + "step": 63450, + "train_speed(iter/s)": 1.636937 + }, + { + "acc": 0.65562639, + "epoch": 1.6097158802638254, + "grad_norm": 5.15625, + "learning_rate": 1.0054723461687133e-06, + "loss": 1.63386269, + "memory(GiB)": 117.38, + "step": 63455, + "train_speed(iter/s)": 1.63695 + }, + { + "acc": 0.67083125, + "epoch": 1.6098427194317604, + "grad_norm": 4.875, + "learning_rate": 1.0048417324664118e-06, + "loss": 1.5490263, + "memory(GiB)": 117.38, + "step": 63460, + "train_speed(iter/s)": 1.636964 + }, + { + "acc": 0.66167526, + "epoch": 1.6099695585996956, + "grad_norm": 5.28125, + "learning_rate": 1.0042112944890075e-06, + "loss": 1.60895329, + "memory(GiB)": 117.38, + "step": 63465, + "train_speed(iter/s)": 1.636977 + }, + { + "acc": 0.67047067, + "epoch": 1.6100963977676308, + "grad_norm": 6.875, + "learning_rate": 1.003581032264231e-06, + "loss": 1.5283864, + "memory(GiB)": 117.38, + "step": 63470, + "train_speed(iter/s)": 1.63699 + }, + { + "acc": 0.66012039, + "epoch": 1.6102232369355658, + "grad_norm": 6.0, + "learning_rate": 1.0029509458198027e-06, + "loss": 1.56061935, + "memory(GiB)": 117.38, + "step": 63475, + "train_speed(iter/s)": 1.637005 + }, + { + "acc": 0.66320295, + "epoch": 1.6103500761035008, + "grad_norm": 5.84375, + "learning_rate": 1.0023210351834378e-06, + "loss": 1.61925793, + "memory(GiB)": 117.38, + "step": 63480, + "train_speed(iter/s)": 1.637018 + }, + { + "acc": 0.65719404, + "epoch": 1.6104769152714358, + "grad_norm": 5.8125, + "learning_rate": 1.001691300382842e-06, + "loss": 1.61541443, + "memory(GiB)": 117.38, + "step": 63485, + "train_speed(iter/s)": 1.637031 + }, + { + "acc": 0.66457138, + "epoch": 1.6106037544393708, + "grad_norm": 5.3125, + "learning_rate": 1.001061741445714e-06, + "loss": 1.53346319, + "memory(GiB)": 117.38, + "step": 63490, + "train_speed(iter/s)": 1.637044 + }, + { + "acc": 0.64835467, + "epoch": 1.610730593607306, + "grad_norm": 6.0625, + "learning_rate": 1.000432358399745e-06, + "loss": 1.59138069, + "memory(GiB)": 117.38, + "step": 63495, + "train_speed(iter/s)": 1.637057 + }, + { + "acc": 0.67487812, + "epoch": 1.610857432775241, + "grad_norm": 6.125, + "learning_rate": 9.99803151272617e-07, + "loss": 1.54327688, + "memory(GiB)": 117.38, + "step": 63500, + "train_speed(iter/s)": 1.637069 + }, + { + "acc": 0.66052384, + "epoch": 1.6109842719431762, + "grad_norm": 6.96875, + "learning_rate": 9.991741200920062e-07, + "loss": 1.64372787, + "memory(GiB)": 117.38, + "step": 63505, + "train_speed(iter/s)": 1.637083 + }, + { + "acc": 0.6670166, + "epoch": 1.6111111111111112, + "grad_norm": 5.28125, + "learning_rate": 9.985452648855803e-07, + "loss": 1.56333094, + "memory(GiB)": 117.38, + "step": 63510, + "train_speed(iter/s)": 1.637097 + }, + { + "acc": 0.65294228, + "epoch": 1.6112379502790461, + "grad_norm": 5.4375, + "learning_rate": 9.979165856809985e-07, + "loss": 1.5945447, + "memory(GiB)": 117.38, + "step": 63515, + "train_speed(iter/s)": 1.63711 + }, + { + "acc": 0.65545826, + "epoch": 1.6113647894469811, + "grad_norm": 6.21875, + "learning_rate": 9.972880825059134e-07, + "loss": 1.64409695, + "memory(GiB)": 117.38, + "step": 63520, + "train_speed(iter/s)": 1.637124 + }, + { + "acc": 0.65820494, + "epoch": 1.6114916286149161, + "grad_norm": 5.65625, + "learning_rate": 9.966597553879681e-07, + "loss": 1.5589798, + "memory(GiB)": 117.38, + "step": 63525, + "train_speed(iter/s)": 1.637137 + }, + { + "acc": 0.66281343, + "epoch": 1.6116184677828513, + "grad_norm": 6.03125, + "learning_rate": 9.96031604354803e-07, + "loss": 1.65268631, + "memory(GiB)": 117.38, + "step": 63530, + "train_speed(iter/s)": 1.637151 + }, + { + "acc": 0.65771389, + "epoch": 1.6117453069507865, + "grad_norm": 6.84375, + "learning_rate": 9.954036294340425e-07, + "loss": 1.64918041, + "memory(GiB)": 117.38, + "step": 63535, + "train_speed(iter/s)": 1.637165 + }, + { + "acc": 0.66634417, + "epoch": 1.6118721461187215, + "grad_norm": 7.0625, + "learning_rate": 9.947758306533101e-07, + "loss": 1.5305583, + "memory(GiB)": 117.38, + "step": 63540, + "train_speed(iter/s)": 1.637179 + }, + { + "acc": 0.66381764, + "epoch": 1.6119989852866565, + "grad_norm": 5.15625, + "learning_rate": 9.941482080402177e-07, + "loss": 1.57927227, + "memory(GiB)": 117.38, + "step": 63545, + "train_speed(iter/s)": 1.637193 + }, + { + "acc": 0.6571826, + "epoch": 1.6121258244545915, + "grad_norm": 6.375, + "learning_rate": 9.935207616223741e-07, + "loss": 1.58145609, + "memory(GiB)": 117.38, + "step": 63550, + "train_speed(iter/s)": 1.637207 + }, + { + "acc": 0.66311655, + "epoch": 1.6122526636225265, + "grad_norm": 6.09375, + "learning_rate": 9.928934914273735e-07, + "loss": 1.57143135, + "memory(GiB)": 117.38, + "step": 63555, + "train_speed(iter/s)": 1.63722 + }, + { + "acc": 0.66612053, + "epoch": 1.6123795027904617, + "grad_norm": 5.03125, + "learning_rate": 9.922663974828066e-07, + "loss": 1.60198345, + "memory(GiB)": 117.38, + "step": 63560, + "train_speed(iter/s)": 1.637234 + }, + { + "acc": 0.64748149, + "epoch": 1.6125063419583967, + "grad_norm": 5.25, + "learning_rate": 9.916394798162582e-07, + "loss": 1.60756569, + "memory(GiB)": 117.38, + "step": 63565, + "train_speed(iter/s)": 1.637248 + }, + { + "acc": 0.65283551, + "epoch": 1.612633181126332, + "grad_norm": 5.6875, + "learning_rate": 9.91012738455303e-07, + "loss": 1.59576664, + "memory(GiB)": 117.38, + "step": 63570, + "train_speed(iter/s)": 1.637262 + }, + { + "acc": 0.6569778, + "epoch": 1.612760020294267, + "grad_norm": 5.84375, + "learning_rate": 9.903861734275032e-07, + "loss": 1.53625526, + "memory(GiB)": 117.38, + "step": 63575, + "train_speed(iter/s)": 1.637276 + }, + { + "acc": 0.67263918, + "epoch": 1.612886859462202, + "grad_norm": 5.8125, + "learning_rate": 9.897597847604228e-07, + "loss": 1.58514309, + "memory(GiB)": 117.38, + "step": 63580, + "train_speed(iter/s)": 1.63729 + }, + { + "acc": 0.63764133, + "epoch": 1.6130136986301369, + "grad_norm": 8.0625, + "learning_rate": 9.89133572481612e-07, + "loss": 1.66299706, + "memory(GiB)": 117.38, + "step": 63585, + "train_speed(iter/s)": 1.637304 + }, + { + "acc": 0.65980377, + "epoch": 1.613140537798072, + "grad_norm": 5.6875, + "learning_rate": 9.885075366186148e-07, + "loss": 1.59518061, + "memory(GiB)": 117.38, + "step": 63590, + "train_speed(iter/s)": 1.637318 + }, + { + "acc": 0.6663126, + "epoch": 1.613267376966007, + "grad_norm": 4.40625, + "learning_rate": 9.87881677198963e-07, + "loss": 1.5243556, + "memory(GiB)": 117.38, + "step": 63595, + "train_speed(iter/s)": 1.637332 + }, + { + "acc": 0.64048948, + "epoch": 1.6133942161339423, + "grad_norm": 5.71875, + "learning_rate": 9.872559942501897e-07, + "loss": 1.71440315, + "memory(GiB)": 117.38, + "step": 63600, + "train_speed(iter/s)": 1.637347 + }, + { + "acc": 0.63816881, + "epoch": 1.6135210553018773, + "grad_norm": 5.125, + "learning_rate": 9.866304877998134e-07, + "loss": 1.66364098, + "memory(GiB)": 117.38, + "step": 63605, + "train_speed(iter/s)": 1.637362 + }, + { + "acc": 0.65931482, + "epoch": 1.6136478944698123, + "grad_norm": 6.09375, + "learning_rate": 9.860051578753466e-07, + "loss": 1.62982101, + "memory(GiB)": 117.38, + "step": 63610, + "train_speed(iter/s)": 1.637375 + }, + { + "acc": 0.65058303, + "epoch": 1.6137747336377473, + "grad_norm": 6.03125, + "learning_rate": 9.853800045042938e-07, + "loss": 1.60727596, + "memory(GiB)": 117.38, + "step": 63615, + "train_speed(iter/s)": 1.637388 + }, + { + "acc": 0.65808096, + "epoch": 1.6139015728056822, + "grad_norm": 4.9375, + "learning_rate": 9.847550277141526e-07, + "loss": 1.59751825, + "memory(GiB)": 117.38, + "step": 63620, + "train_speed(iter/s)": 1.637402 + }, + { + "acc": 0.65432973, + "epoch": 1.6140284119736175, + "grad_norm": 6.25, + "learning_rate": 9.841302275324128e-07, + "loss": 1.57043667, + "memory(GiB)": 117.38, + "step": 63625, + "train_speed(iter/s)": 1.637415 + }, + { + "acc": 0.65543289, + "epoch": 1.6141552511415527, + "grad_norm": 6.59375, + "learning_rate": 9.835056039865542e-07, + "loss": 1.61725349, + "memory(GiB)": 117.38, + "step": 63630, + "train_speed(iter/s)": 1.637429 + }, + { + "acc": 0.64806819, + "epoch": 1.6142820903094877, + "grad_norm": 9.375, + "learning_rate": 9.82881157104052e-07, + "loss": 1.66217003, + "memory(GiB)": 117.38, + "step": 63635, + "train_speed(iter/s)": 1.637443 + }, + { + "acc": 0.64420433, + "epoch": 1.6144089294774226, + "grad_norm": 4.9375, + "learning_rate": 9.822568869123712e-07, + "loss": 1.69623299, + "memory(GiB)": 117.38, + "step": 63640, + "train_speed(iter/s)": 1.637456 + }, + { + "acc": 0.65110359, + "epoch": 1.6145357686453576, + "grad_norm": 4.78125, + "learning_rate": 9.816327934389707e-07, + "loss": 1.58794594, + "memory(GiB)": 117.38, + "step": 63645, + "train_speed(iter/s)": 1.63747 + }, + { + "acc": 0.66972895, + "epoch": 1.6146626078132926, + "grad_norm": 5.09375, + "learning_rate": 9.810088767113008e-07, + "loss": 1.52417698, + "memory(GiB)": 117.38, + "step": 63650, + "train_speed(iter/s)": 1.637483 + }, + { + "acc": 0.64999018, + "epoch": 1.6147894469812278, + "grad_norm": 5.8125, + "learning_rate": 9.80385136756804e-07, + "loss": 1.64205666, + "memory(GiB)": 117.38, + "step": 63655, + "train_speed(iter/s)": 1.637496 + }, + { + "acc": 0.65315814, + "epoch": 1.6149162861491628, + "grad_norm": 6.40625, + "learning_rate": 9.797615736029148e-07, + "loss": 1.62650452, + "memory(GiB)": 117.38, + "step": 63660, + "train_speed(iter/s)": 1.637511 + }, + { + "acc": 0.66172247, + "epoch": 1.615043125317098, + "grad_norm": 5.0625, + "learning_rate": 9.791381872770594e-07, + "loss": 1.60049706, + "memory(GiB)": 117.38, + "step": 63665, + "train_speed(iter/s)": 1.637524 + }, + { + "acc": 0.65203691, + "epoch": 1.615169964485033, + "grad_norm": 5.8125, + "learning_rate": 9.785149778066615e-07, + "loss": 1.64893551, + "memory(GiB)": 117.38, + "step": 63670, + "train_speed(iter/s)": 1.637537 + }, + { + "acc": 0.66618047, + "epoch": 1.615296803652968, + "grad_norm": 6.25, + "learning_rate": 9.778919452191277e-07, + "loss": 1.58618259, + "memory(GiB)": 117.38, + "step": 63675, + "train_speed(iter/s)": 1.637551 + }, + { + "acc": 0.68306036, + "epoch": 1.615423642820903, + "grad_norm": 7.0, + "learning_rate": 9.77269089541864e-07, + "loss": 1.52129574, + "memory(GiB)": 117.38, + "step": 63680, + "train_speed(iter/s)": 1.637565 + }, + { + "acc": 0.65918469, + "epoch": 1.615550481988838, + "grad_norm": 5.4375, + "learning_rate": 9.766464108022644e-07, + "loss": 1.66668892, + "memory(GiB)": 117.38, + "step": 63685, + "train_speed(iter/s)": 1.637579 + }, + { + "acc": 0.66751914, + "epoch": 1.6156773211567732, + "grad_norm": 5.90625, + "learning_rate": 9.760239090277213e-07, + "loss": 1.57237225, + "memory(GiB)": 117.38, + "step": 63690, + "train_speed(iter/s)": 1.637592 + }, + { + "acc": 0.67409949, + "epoch": 1.6158041603247084, + "grad_norm": 6.5, + "learning_rate": 9.75401584245611e-07, + "loss": 1.55902081, + "memory(GiB)": 117.38, + "step": 63695, + "train_speed(iter/s)": 1.637606 + }, + { + "acc": 0.6373436, + "epoch": 1.6159309994926434, + "grad_norm": 6.0625, + "learning_rate": 9.747794364833063e-07, + "loss": 1.71150436, + "memory(GiB)": 117.38, + "step": 63700, + "train_speed(iter/s)": 1.63762 + }, + { + "acc": 0.64090705, + "epoch": 1.6160578386605784, + "grad_norm": 5.3125, + "learning_rate": 9.741574657681747e-07, + "loss": 1.65247822, + "memory(GiB)": 117.38, + "step": 63705, + "train_speed(iter/s)": 1.637633 + }, + { + "acc": 0.64660087, + "epoch": 1.6161846778285134, + "grad_norm": 5.75, + "learning_rate": 9.735356721275734e-07, + "loss": 1.60407181, + "memory(GiB)": 117.38, + "step": 63710, + "train_speed(iter/s)": 1.637646 + }, + { + "acc": 0.65684347, + "epoch": 1.6163115169964484, + "grad_norm": 6.46875, + "learning_rate": 9.729140555888483e-07, + "loss": 1.4954215, + "memory(GiB)": 117.38, + "step": 63715, + "train_speed(iter/s)": 1.63766 + }, + { + "acc": 0.66543336, + "epoch": 1.6164383561643836, + "grad_norm": 6.5, + "learning_rate": 9.722926161793417e-07, + "loss": 1.5744091, + "memory(GiB)": 117.38, + "step": 63720, + "train_speed(iter/s)": 1.637673 + }, + { + "acc": 0.65732365, + "epoch": 1.6165651953323186, + "grad_norm": 5.78125, + "learning_rate": 9.716713539263895e-07, + "loss": 1.63254242, + "memory(GiB)": 117.38, + "step": 63725, + "train_speed(iter/s)": 1.637686 + }, + { + "acc": 0.65619011, + "epoch": 1.6166920345002538, + "grad_norm": 5.34375, + "learning_rate": 9.710502688573175e-07, + "loss": 1.60967789, + "memory(GiB)": 117.38, + "step": 63730, + "train_speed(iter/s)": 1.6377 + }, + { + "acc": 0.65877409, + "epoch": 1.6168188736681888, + "grad_norm": 5.15625, + "learning_rate": 9.704293609994403e-07, + "loss": 1.61505661, + "memory(GiB)": 117.38, + "step": 63735, + "train_speed(iter/s)": 1.637713 + }, + { + "acc": 0.6580019, + "epoch": 1.6169457128361238, + "grad_norm": 5.40625, + "learning_rate": 9.69808630380072e-07, + "loss": 1.63126183, + "memory(GiB)": 117.38, + "step": 63740, + "train_speed(iter/s)": 1.637726 + }, + { + "acc": 0.66299114, + "epoch": 1.6170725520040587, + "grad_norm": 5.875, + "learning_rate": 9.691880770265132e-07, + "loss": 1.5350893, + "memory(GiB)": 117.38, + "step": 63745, + "train_speed(iter/s)": 1.637738 + }, + { + "acc": 0.6549655, + "epoch": 1.617199391171994, + "grad_norm": 6.375, + "learning_rate": 9.685677009660587e-07, + "loss": 1.64639626, + "memory(GiB)": 117.38, + "step": 63750, + "train_speed(iter/s)": 1.637751 + }, + { + "acc": 0.64077358, + "epoch": 1.617326230339929, + "grad_norm": 7.59375, + "learning_rate": 9.679475022259965e-07, + "loss": 1.69925995, + "memory(GiB)": 117.38, + "step": 63755, + "train_speed(iter/s)": 1.637765 + }, + { + "acc": 0.65708489, + "epoch": 1.6174530695078642, + "grad_norm": 6.25, + "learning_rate": 9.673274808336047e-07, + "loss": 1.63301392, + "memory(GiB)": 117.38, + "step": 63760, + "train_speed(iter/s)": 1.637778 + }, + { + "acc": 0.65188012, + "epoch": 1.6175799086757991, + "grad_norm": 5.90625, + "learning_rate": 9.66707636816155e-07, + "loss": 1.69917145, + "memory(GiB)": 117.38, + "step": 63765, + "train_speed(iter/s)": 1.637791 + }, + { + "acc": 0.65312948, + "epoch": 1.6177067478437341, + "grad_norm": 6.28125, + "learning_rate": 9.660879702009106e-07, + "loss": 1.68750992, + "memory(GiB)": 117.38, + "step": 63770, + "train_speed(iter/s)": 1.637805 + }, + { + "acc": 0.63581977, + "epoch": 1.6178335870116691, + "grad_norm": 7.15625, + "learning_rate": 9.654684810151276e-07, + "loss": 1.69862938, + "memory(GiB)": 117.38, + "step": 63775, + "train_speed(iter/s)": 1.637818 + }, + { + "acc": 0.65914712, + "epoch": 1.6179604261796041, + "grad_norm": 5.875, + "learning_rate": 9.648491692860534e-07, + "loss": 1.5766984, + "memory(GiB)": 117.38, + "step": 63780, + "train_speed(iter/s)": 1.637833 + }, + { + "acc": 0.65791836, + "epoch": 1.6180872653475393, + "grad_norm": 5.78125, + "learning_rate": 9.642300350409289e-07, + "loss": 1.59953127, + "memory(GiB)": 117.38, + "step": 63785, + "train_speed(iter/s)": 1.637846 + }, + { + "acc": 0.64600964, + "epoch": 1.6182141045154745, + "grad_norm": 6.96875, + "learning_rate": 9.636110783069852e-07, + "loss": 1.70841484, + "memory(GiB)": 117.38, + "step": 63790, + "train_speed(iter/s)": 1.637858 + }, + { + "acc": 0.64777722, + "epoch": 1.6183409436834095, + "grad_norm": 5.59375, + "learning_rate": 9.629922991114482e-07, + "loss": 1.67116699, + "memory(GiB)": 117.38, + "step": 63795, + "train_speed(iter/s)": 1.637872 + }, + { + "acc": 0.64868355, + "epoch": 1.6184677828513445, + "grad_norm": 5.375, + "learning_rate": 9.623736974815334e-07, + "loss": 1.60910454, + "memory(GiB)": 117.38, + "step": 63800, + "train_speed(iter/s)": 1.637885 + }, + { + "acc": 0.67498946, + "epoch": 1.6185946220192795, + "grad_norm": 6.3125, + "learning_rate": 9.617552734444502e-07, + "loss": 1.49038763, + "memory(GiB)": 117.38, + "step": 63805, + "train_speed(iter/s)": 1.637899 + }, + { + "acc": 0.66613836, + "epoch": 1.6187214611872145, + "grad_norm": 4.625, + "learning_rate": 9.611370270273996e-07, + "loss": 1.50617428, + "memory(GiB)": 117.38, + "step": 63810, + "train_speed(iter/s)": 1.637912 + }, + { + "acc": 0.65473881, + "epoch": 1.6188483003551497, + "grad_norm": 5.09375, + "learning_rate": 9.605189582575741e-07, + "loss": 1.57273035, + "memory(GiB)": 117.38, + "step": 63815, + "train_speed(iter/s)": 1.637926 + }, + { + "acc": 0.65938063, + "epoch": 1.6189751395230847, + "grad_norm": 5.96875, + "learning_rate": 9.599010671621605e-07, + "loss": 1.63436584, + "memory(GiB)": 117.38, + "step": 63820, + "train_speed(iter/s)": 1.637938 + }, + { + "acc": 0.64908562, + "epoch": 1.61910197869102, + "grad_norm": 5.65625, + "learning_rate": 9.592833537683344e-07, + "loss": 1.63700027, + "memory(GiB)": 117.38, + "step": 63825, + "train_speed(iter/s)": 1.637952 + }, + { + "acc": 0.64169598, + "epoch": 1.619228817858955, + "grad_norm": 6.28125, + "learning_rate": 9.586658181032693e-07, + "loss": 1.65708389, + "memory(GiB)": 117.38, + "step": 63830, + "train_speed(iter/s)": 1.637966 + }, + { + "acc": 0.65639119, + "epoch": 1.6193556570268899, + "grad_norm": 4.96875, + "learning_rate": 9.580484601941237e-07, + "loss": 1.55072823, + "memory(GiB)": 117.38, + "step": 63835, + "train_speed(iter/s)": 1.637978 + }, + { + "acc": 0.65030546, + "epoch": 1.6194824961948249, + "grad_norm": 6.53125, + "learning_rate": 9.574312800680514e-07, + "loss": 1.65216236, + "memory(GiB)": 117.38, + "step": 63840, + "train_speed(iter/s)": 1.637991 + }, + { + "acc": 0.65782022, + "epoch": 1.6196093353627599, + "grad_norm": 6.125, + "learning_rate": 9.56814277752201e-07, + "loss": 1.57568588, + "memory(GiB)": 117.38, + "step": 63845, + "train_speed(iter/s)": 1.638004 + }, + { + "acc": 0.64032516, + "epoch": 1.619736174530695, + "grad_norm": 8.0625, + "learning_rate": 9.561974532737124e-07, + "loss": 1.68418541, + "memory(GiB)": 117.38, + "step": 63850, + "train_speed(iter/s)": 1.638018 + }, + { + "acc": 0.64176078, + "epoch": 1.6198630136986303, + "grad_norm": 5.0625, + "learning_rate": 9.555808066597123e-07, + "loss": 1.56404171, + "memory(GiB)": 117.38, + "step": 63855, + "train_speed(iter/s)": 1.638032 + }, + { + "acc": 0.66833396, + "epoch": 1.6199898528665653, + "grad_norm": 6.03125, + "learning_rate": 9.549643379373236e-07, + "loss": 1.56283779, + "memory(GiB)": 117.38, + "step": 63860, + "train_speed(iter/s)": 1.638046 + }, + { + "acc": 0.65467529, + "epoch": 1.6201166920345003, + "grad_norm": 6.46875, + "learning_rate": 9.543480471336652e-07, + "loss": 1.58498173, + "memory(GiB)": 117.38, + "step": 63865, + "train_speed(iter/s)": 1.638059 + }, + { + "acc": 0.68168626, + "epoch": 1.6202435312024352, + "grad_norm": 5.40625, + "learning_rate": 9.537319342758434e-07, + "loss": 1.51265812, + "memory(GiB)": 117.38, + "step": 63870, + "train_speed(iter/s)": 1.638074 + }, + { + "acc": 0.64589157, + "epoch": 1.6203703703703702, + "grad_norm": 7.65625, + "learning_rate": 9.531159993909533e-07, + "loss": 1.59957457, + "memory(GiB)": 117.38, + "step": 63875, + "train_speed(iter/s)": 1.638088 + }, + { + "acc": 0.67579846, + "epoch": 1.6204972095383054, + "grad_norm": 5.8125, + "learning_rate": 9.525002425060914e-07, + "loss": 1.48189831, + "memory(GiB)": 117.38, + "step": 63880, + "train_speed(iter/s)": 1.638101 + }, + { + "acc": 0.6530838, + "epoch": 1.6206240487062404, + "grad_norm": 5.40625, + "learning_rate": 9.518846636483392e-07, + "loss": 1.62513905, + "memory(GiB)": 117.38, + "step": 63885, + "train_speed(iter/s)": 1.638114 + }, + { + "acc": 0.63869891, + "epoch": 1.6207508878741756, + "grad_norm": 4.84375, + "learning_rate": 9.512692628447745e-07, + "loss": 1.68012791, + "memory(GiB)": 117.38, + "step": 63890, + "train_speed(iter/s)": 1.638126 + }, + { + "acc": 0.65494466, + "epoch": 1.6208777270421106, + "grad_norm": 6.03125, + "learning_rate": 9.506540401224612e-07, + "loss": 1.58157272, + "memory(GiB)": 117.38, + "step": 63895, + "train_speed(iter/s)": 1.638139 + }, + { + "acc": 0.65738754, + "epoch": 1.6210045662100456, + "grad_norm": 5.03125, + "learning_rate": 9.500389955084638e-07, + "loss": 1.58201752, + "memory(GiB)": 117.38, + "step": 63900, + "train_speed(iter/s)": 1.638152 + }, + { + "acc": 0.66312809, + "epoch": 1.6211314053779806, + "grad_norm": 7.4375, + "learning_rate": 9.494241290298334e-07, + "loss": 1.60181408, + "memory(GiB)": 117.38, + "step": 63905, + "train_speed(iter/s)": 1.638166 + }, + { + "acc": 0.66506815, + "epoch": 1.6212582445459158, + "grad_norm": 6.375, + "learning_rate": 9.488094407136139e-07, + "loss": 1.6024065, + "memory(GiB)": 117.38, + "step": 63910, + "train_speed(iter/s)": 1.63818 + }, + { + "acc": 0.65851274, + "epoch": 1.6213850837138508, + "grad_norm": 6.71875, + "learning_rate": 9.481949305868421e-07, + "loss": 1.60496998, + "memory(GiB)": 117.38, + "step": 63915, + "train_speed(iter/s)": 1.638193 + }, + { + "acc": 0.64477277, + "epoch": 1.621511922881786, + "grad_norm": 7.1875, + "learning_rate": 9.475805986765479e-07, + "loss": 1.67787819, + "memory(GiB)": 117.38, + "step": 63920, + "train_speed(iter/s)": 1.638207 + }, + { + "acc": 0.64880447, + "epoch": 1.621638762049721, + "grad_norm": 4.9375, + "learning_rate": 9.469664450097515e-07, + "loss": 1.56779327, + "memory(GiB)": 117.38, + "step": 63925, + "train_speed(iter/s)": 1.63822 + }, + { + "acc": 0.6680727, + "epoch": 1.621765601217656, + "grad_norm": 5.65625, + "learning_rate": 9.463524696134663e-07, + "loss": 1.62146034, + "memory(GiB)": 117.38, + "step": 63930, + "train_speed(iter/s)": 1.638234 + }, + { + "acc": 0.65873756, + "epoch": 1.621892440385591, + "grad_norm": 5.875, + "learning_rate": 9.457386725146978e-07, + "loss": 1.57686338, + "memory(GiB)": 117.38, + "step": 63935, + "train_speed(iter/s)": 1.638246 + }, + { + "acc": 0.64717083, + "epoch": 1.622019279553526, + "grad_norm": 5.03125, + "learning_rate": 9.451250537404433e-07, + "loss": 1.63794975, + "memory(GiB)": 117.38, + "step": 63940, + "train_speed(iter/s)": 1.63826 + }, + { + "acc": 0.65129347, + "epoch": 1.6221461187214612, + "grad_norm": 5.03125, + "learning_rate": 9.44511613317693e-07, + "loss": 1.61691475, + "memory(GiB)": 117.38, + "step": 63945, + "train_speed(iter/s)": 1.638273 + }, + { + "acc": 0.65615168, + "epoch": 1.6222729578893964, + "grad_norm": 7.34375, + "learning_rate": 9.438983512734279e-07, + "loss": 1.60385094, + "memory(GiB)": 117.38, + "step": 63950, + "train_speed(iter/s)": 1.638286 + }, + { + "acc": 0.66852179, + "epoch": 1.6223997970573314, + "grad_norm": 5.90625, + "learning_rate": 9.432852676346233e-07, + "loss": 1.56938505, + "memory(GiB)": 117.38, + "step": 63955, + "train_speed(iter/s)": 1.638301 + }, + { + "acc": 0.65511212, + "epoch": 1.6225266362252664, + "grad_norm": 6.875, + "learning_rate": 9.426723624282436e-07, + "loss": 1.62275887, + "memory(GiB)": 117.38, + "step": 63960, + "train_speed(iter/s)": 1.638315 + }, + { + "acc": 0.6600563, + "epoch": 1.6226534753932014, + "grad_norm": 5.625, + "learning_rate": 9.420596356812473e-07, + "loss": 1.59218225, + "memory(GiB)": 117.38, + "step": 63965, + "train_speed(iter/s)": 1.638328 + }, + { + "acc": 0.6611042, + "epoch": 1.6227803145611364, + "grad_norm": 5.78125, + "learning_rate": 9.414470874205883e-07, + "loss": 1.52277365, + "memory(GiB)": 117.38, + "step": 63970, + "train_speed(iter/s)": 1.638342 + }, + { + "acc": 0.65541153, + "epoch": 1.6229071537290716, + "grad_norm": 7.28125, + "learning_rate": 9.408347176732053e-07, + "loss": 1.62493229, + "memory(GiB)": 117.38, + "step": 63975, + "train_speed(iter/s)": 1.638356 + }, + { + "acc": 0.67996044, + "epoch": 1.6230339928970066, + "grad_norm": 5.3125, + "learning_rate": 9.402225264660336e-07, + "loss": 1.4980999, + "memory(GiB)": 117.38, + "step": 63980, + "train_speed(iter/s)": 1.638369 + }, + { + "acc": 0.6695806, + "epoch": 1.6231608320649418, + "grad_norm": 6.78125, + "learning_rate": 9.396105138259997e-07, + "loss": 1.54065456, + "memory(GiB)": 117.38, + "step": 63985, + "train_speed(iter/s)": 1.638383 + }, + { + "acc": 0.66249571, + "epoch": 1.6232876712328768, + "grad_norm": 6.46875, + "learning_rate": 9.389986797800271e-07, + "loss": 1.56858435, + "memory(GiB)": 117.38, + "step": 63990, + "train_speed(iter/s)": 1.638396 + }, + { + "acc": 0.65884171, + "epoch": 1.6234145104008117, + "grad_norm": 5.59375, + "learning_rate": 9.383870243550214e-07, + "loss": 1.57668962, + "memory(GiB)": 117.38, + "step": 63995, + "train_speed(iter/s)": 1.63841 + }, + { + "acc": 0.65744395, + "epoch": 1.6235413495687467, + "grad_norm": 4.8125, + "learning_rate": 9.377755475778877e-07, + "loss": 1.5917057, + "memory(GiB)": 117.38, + "step": 64000, + "train_speed(iter/s)": 1.638425 + }, + { + "epoch": 1.6235413495687467, + "eval_acc": 0.646299417153764, + "eval_loss": 1.5733387470245361, + "eval_runtime": 58.7285, + "eval_samples_per_second": 108.465, + "eval_steps_per_second": 27.125, + "step": 64000 + }, + { + "acc": 0.65668488, + "epoch": 1.6236681887366817, + "grad_norm": 4.53125, + "learning_rate": 9.371642494755223e-07, + "loss": 1.5850708, + "memory(GiB)": 117.38, + "step": 64005, + "train_speed(iter/s)": 1.635799 + }, + { + "acc": 0.66200113, + "epoch": 1.623795027904617, + "grad_norm": 5.21875, + "learning_rate": 9.365531300748143e-07, + "loss": 1.56423283, + "memory(GiB)": 117.38, + "step": 64010, + "train_speed(iter/s)": 1.635812 + }, + { + "acc": 0.66863675, + "epoch": 1.6239218670725521, + "grad_norm": 5.78125, + "learning_rate": 9.359421894026394e-07, + "loss": 1.50921106, + "memory(GiB)": 117.38, + "step": 64015, + "train_speed(iter/s)": 1.635825 + }, + { + "acc": 0.62581825, + "epoch": 1.6240487062404871, + "grad_norm": 5.8125, + "learning_rate": 9.353314274858726e-07, + "loss": 1.69167366, + "memory(GiB)": 117.38, + "step": 64020, + "train_speed(iter/s)": 1.635839 + }, + { + "acc": 0.64224763, + "epoch": 1.6241755454084221, + "grad_norm": 6.0, + "learning_rate": 9.347208443513773e-07, + "loss": 1.61048775, + "memory(GiB)": 117.38, + "step": 64025, + "train_speed(iter/s)": 1.635852 + }, + { + "acc": 0.65394907, + "epoch": 1.6243023845763571, + "grad_norm": 5.28125, + "learning_rate": 9.341104400260103e-07, + "loss": 1.63138447, + "memory(GiB)": 117.38, + "step": 64030, + "train_speed(iter/s)": 1.635865 + }, + { + "acc": 0.65322199, + "epoch": 1.624429223744292, + "grad_norm": 5.4375, + "learning_rate": 9.335002145366167e-07, + "loss": 1.64252357, + "memory(GiB)": 117.38, + "step": 64035, + "train_speed(iter/s)": 1.635879 + }, + { + "acc": 0.65276918, + "epoch": 1.6245560629122273, + "grad_norm": 6.21875, + "learning_rate": 9.328901679100405e-07, + "loss": 1.59390326, + "memory(GiB)": 117.38, + "step": 64040, + "train_speed(iter/s)": 1.635894 + }, + { + "acc": 0.63086853, + "epoch": 1.6246829020801623, + "grad_norm": 4.875, + "learning_rate": 9.322803001731128e-07, + "loss": 1.66026917, + "memory(GiB)": 117.38, + "step": 64045, + "train_speed(iter/s)": 1.635907 + }, + { + "acc": 0.64947605, + "epoch": 1.6248097412480975, + "grad_norm": 7.03125, + "learning_rate": 9.316706113526591e-07, + "loss": 1.62306824, + "memory(GiB)": 117.38, + "step": 64050, + "train_speed(iter/s)": 1.635921 + }, + { + "acc": 0.6578341, + "epoch": 1.6249365804160325, + "grad_norm": 5.28125, + "learning_rate": 9.310611014754956e-07, + "loss": 1.64150639, + "memory(GiB)": 117.38, + "step": 64055, + "train_speed(iter/s)": 1.635935 + }, + { + "acc": 0.63564014, + "epoch": 1.6250634195839675, + "grad_norm": 5.8125, + "learning_rate": 9.304517705684308e-07, + "loss": 1.65794983, + "memory(GiB)": 117.38, + "step": 64060, + "train_speed(iter/s)": 1.635948 + }, + { + "acc": 0.65869732, + "epoch": 1.6251902587519025, + "grad_norm": 6.65625, + "learning_rate": 9.298426186582671e-07, + "loss": 1.6023632, + "memory(GiB)": 117.38, + "step": 64065, + "train_speed(iter/s)": 1.635962 + }, + { + "acc": 0.64586973, + "epoch": 1.6253170979198377, + "grad_norm": 5.5625, + "learning_rate": 9.292336457717965e-07, + "loss": 1.63211555, + "memory(GiB)": 117.38, + "step": 64070, + "train_speed(iter/s)": 1.635976 + }, + { + "acc": 0.64693317, + "epoch": 1.6254439370877727, + "grad_norm": 6.34375, + "learning_rate": 9.286248519358049e-07, + "loss": 1.65288639, + "memory(GiB)": 117.38, + "step": 64075, + "train_speed(iter/s)": 1.63599 + }, + { + "acc": 0.6616128, + "epoch": 1.625570776255708, + "grad_norm": 6.0, + "learning_rate": 9.280162371770696e-07, + "loss": 1.50730543, + "memory(GiB)": 117.38, + "step": 64080, + "train_speed(iter/s)": 1.636003 + }, + { + "acc": 0.66329699, + "epoch": 1.6256976154236429, + "grad_norm": 5.03125, + "learning_rate": 9.274078015223603e-07, + "loss": 1.58291759, + "memory(GiB)": 117.38, + "step": 64085, + "train_speed(iter/s)": 1.636016 + }, + { + "acc": 0.65693541, + "epoch": 1.6258244545915779, + "grad_norm": 5.84375, + "learning_rate": 9.26799544998439e-07, + "loss": 1.59871483, + "memory(GiB)": 117.38, + "step": 64090, + "train_speed(iter/s)": 1.63603 + }, + { + "acc": 0.67011118, + "epoch": 1.6259512937595129, + "grad_norm": 6.125, + "learning_rate": 9.261914676320594e-07, + "loss": 1.49998903, + "memory(GiB)": 117.38, + "step": 64095, + "train_speed(iter/s)": 1.636045 + }, + { + "acc": 0.65041304, + "epoch": 1.6260781329274479, + "grad_norm": 6.03125, + "learning_rate": 9.255835694499665e-07, + "loss": 1.5893405, + "memory(GiB)": 117.38, + "step": 64100, + "train_speed(iter/s)": 1.636058 + }, + { + "acc": 0.68258257, + "epoch": 1.626204972095383, + "grad_norm": 5.25, + "learning_rate": 9.249758504788986e-07, + "loss": 1.56823673, + "memory(GiB)": 117.38, + "step": 64105, + "train_speed(iter/s)": 1.636072 + }, + { + "acc": 0.65966063, + "epoch": 1.6263318112633183, + "grad_norm": 5.03125, + "learning_rate": 9.243683107455886e-07, + "loss": 1.56983013, + "memory(GiB)": 117.38, + "step": 64110, + "train_speed(iter/s)": 1.636086 + }, + { + "acc": 0.66664734, + "epoch": 1.6264586504312533, + "grad_norm": 5.71875, + "learning_rate": 9.237609502767558e-07, + "loss": 1.5638628, + "memory(GiB)": 117.38, + "step": 64115, + "train_speed(iter/s)": 1.636098 + }, + { + "acc": 0.64629068, + "epoch": 1.6265854895991883, + "grad_norm": 5.4375, + "learning_rate": 9.231537690991155e-07, + "loss": 1.62178955, + "memory(GiB)": 117.38, + "step": 64120, + "train_speed(iter/s)": 1.636112 + }, + { + "acc": 0.66311121, + "epoch": 1.6267123287671232, + "grad_norm": 6.9375, + "learning_rate": 9.225467672393729e-07, + "loss": 1.58112679, + "memory(GiB)": 117.38, + "step": 64125, + "train_speed(iter/s)": 1.636126 + }, + { + "acc": 0.66931963, + "epoch": 1.6268391679350582, + "grad_norm": 6.5, + "learning_rate": 9.2193994472423e-07, + "loss": 1.58022137, + "memory(GiB)": 117.38, + "step": 64130, + "train_speed(iter/s)": 1.636139 + }, + { + "acc": 0.64963408, + "epoch": 1.6269660071029934, + "grad_norm": 5.4375, + "learning_rate": 9.21333301580375e-07, + "loss": 1.61292439, + "memory(GiB)": 117.38, + "step": 64135, + "train_speed(iter/s)": 1.636153 + }, + { + "acc": 0.64832935, + "epoch": 1.6270928462709284, + "grad_norm": 5.0, + "learning_rate": 9.207268378344896e-07, + "loss": 1.66601448, + "memory(GiB)": 117.38, + "step": 64140, + "train_speed(iter/s)": 1.636166 + }, + { + "acc": 0.66399355, + "epoch": 1.6272196854388636, + "grad_norm": 6.0625, + "learning_rate": 9.201205535132523e-07, + "loss": 1.62433567, + "memory(GiB)": 117.38, + "step": 64145, + "train_speed(iter/s)": 1.636179 + }, + { + "acc": 0.64774213, + "epoch": 1.6273465246067986, + "grad_norm": 5.71875, + "learning_rate": 9.195144486433294e-07, + "loss": 1.63149204, + "memory(GiB)": 117.38, + "step": 64150, + "train_speed(iter/s)": 1.636193 + }, + { + "acc": 0.68049345, + "epoch": 1.6274733637747336, + "grad_norm": 5.25, + "learning_rate": 9.189085232513773e-07, + "loss": 1.52996807, + "memory(GiB)": 117.38, + "step": 64155, + "train_speed(iter/s)": 1.636207 + }, + { + "acc": 0.65328145, + "epoch": 1.6276002029426686, + "grad_norm": 5.53125, + "learning_rate": 9.183027773640485e-07, + "loss": 1.66289978, + "memory(GiB)": 117.38, + "step": 64160, + "train_speed(iter/s)": 1.63622 + }, + { + "acc": 0.65506711, + "epoch": 1.6277270421106036, + "grad_norm": 4.46875, + "learning_rate": 9.176972110079879e-07, + "loss": 1.48536968, + "memory(GiB)": 117.38, + "step": 64165, + "train_speed(iter/s)": 1.636234 + }, + { + "acc": 0.66621208, + "epoch": 1.6278538812785388, + "grad_norm": 6.0, + "learning_rate": 9.17091824209832e-07, + "loss": 1.58151608, + "memory(GiB)": 117.38, + "step": 64170, + "train_speed(iter/s)": 1.636247 + }, + { + "acc": 0.66377249, + "epoch": 1.627980720446474, + "grad_norm": 5.34375, + "learning_rate": 9.164866169962034e-07, + "loss": 1.62506218, + "memory(GiB)": 117.38, + "step": 64175, + "train_speed(iter/s)": 1.636261 + }, + { + "acc": 0.67016363, + "epoch": 1.628107559614409, + "grad_norm": 7.21875, + "learning_rate": 9.158815893937268e-07, + "loss": 1.61708374, + "memory(GiB)": 117.38, + "step": 64180, + "train_speed(iter/s)": 1.636275 + }, + { + "acc": 0.66039228, + "epoch": 1.628234398782344, + "grad_norm": 6.875, + "learning_rate": 9.152767414290115e-07, + "loss": 1.6024477, + "memory(GiB)": 117.38, + "step": 64185, + "train_speed(iter/s)": 1.63629 + }, + { + "acc": 0.66371355, + "epoch": 1.628361237950279, + "grad_norm": 5.875, + "learning_rate": 9.146720731286623e-07, + "loss": 1.56843948, + "memory(GiB)": 117.38, + "step": 64190, + "train_speed(iter/s)": 1.636303 + }, + { + "acc": 0.65737667, + "epoch": 1.628488077118214, + "grad_norm": 6.65625, + "learning_rate": 9.140675845192754e-07, + "loss": 1.64151211, + "memory(GiB)": 117.38, + "step": 64195, + "train_speed(iter/s)": 1.636107 + }, + { + "acc": 0.65039558, + "epoch": 1.6286149162861492, + "grad_norm": 5.5, + "learning_rate": 9.134632756274381e-07, + "loss": 1.55718536, + "memory(GiB)": 117.38, + "step": 64200, + "train_speed(iter/s)": 1.636121 + }, + { + "acc": 0.66105261, + "epoch": 1.6287417554540842, + "grad_norm": 5.96875, + "learning_rate": 9.12859146479731e-07, + "loss": 1.63983612, + "memory(GiB)": 117.38, + "step": 64205, + "train_speed(iter/s)": 1.636134 + }, + { + "acc": 0.66798487, + "epoch": 1.6288685946220194, + "grad_norm": 5.9375, + "learning_rate": 9.122551971027266e-07, + "loss": 1.56808758, + "memory(GiB)": 117.38, + "step": 64210, + "train_speed(iter/s)": 1.636148 + }, + { + "acc": 0.64251666, + "epoch": 1.6289954337899544, + "grad_norm": 5.8125, + "learning_rate": 9.116514275229892e-07, + "loss": 1.65047455, + "memory(GiB)": 117.38, + "step": 64215, + "train_speed(iter/s)": 1.636162 + }, + { + "acc": 0.67602401, + "epoch": 1.6291222729578894, + "grad_norm": 5.25, + "learning_rate": 9.110478377670751e-07, + "loss": 1.54542303, + "memory(GiB)": 117.38, + "step": 64220, + "train_speed(iter/s)": 1.636176 + }, + { + "acc": 0.68549647, + "epoch": 1.6292491121258244, + "grad_norm": 6.21875, + "learning_rate": 9.104444278615327e-07, + "loss": 1.52364578, + "memory(GiB)": 117.38, + "step": 64225, + "train_speed(iter/s)": 1.63619 + }, + { + "acc": 0.66653528, + "epoch": 1.6293759512937596, + "grad_norm": 5.875, + "learning_rate": 9.098411978329031e-07, + "loss": 1.58440819, + "memory(GiB)": 117.38, + "step": 64230, + "train_speed(iter/s)": 1.636205 + }, + { + "acc": 0.68131003, + "epoch": 1.6295027904616946, + "grad_norm": 4.90625, + "learning_rate": 9.092381477077189e-07, + "loss": 1.53496552, + "memory(GiB)": 117.38, + "step": 64235, + "train_speed(iter/s)": 1.636218 + }, + { + "acc": 0.65149441, + "epoch": 1.6296296296296298, + "grad_norm": 5.125, + "learning_rate": 9.086352775125046e-07, + "loss": 1.63113022, + "memory(GiB)": 117.38, + "step": 64240, + "train_speed(iter/s)": 1.636232 + }, + { + "acc": 0.65774584, + "epoch": 1.6297564687975648, + "grad_norm": 5.34375, + "learning_rate": 9.080325872737766e-07, + "loss": 1.63655815, + "memory(GiB)": 117.38, + "step": 64245, + "train_speed(iter/s)": 1.636243 + }, + { + "acc": 0.67013774, + "epoch": 1.6298833079654997, + "grad_norm": 5.875, + "learning_rate": 9.074300770180472e-07, + "loss": 1.59241638, + "memory(GiB)": 117.38, + "step": 64250, + "train_speed(iter/s)": 1.636257 + }, + { + "acc": 0.66312466, + "epoch": 1.6300101471334347, + "grad_norm": 5.65625, + "learning_rate": 9.068277467718134e-07, + "loss": 1.5373848, + "memory(GiB)": 117.38, + "step": 64255, + "train_speed(iter/s)": 1.636271 + }, + { + "acc": 0.66634731, + "epoch": 1.6301369863013697, + "grad_norm": 6.0, + "learning_rate": 9.062255965615701e-07, + "loss": 1.57371206, + "memory(GiB)": 117.38, + "step": 64260, + "train_speed(iter/s)": 1.636284 + }, + { + "acc": 0.65798616, + "epoch": 1.630263825469305, + "grad_norm": 6.46875, + "learning_rate": 9.056236264138013e-07, + "loss": 1.5157733, + "memory(GiB)": 117.38, + "step": 64265, + "train_speed(iter/s)": 1.636298 + }, + { + "acc": 0.67357807, + "epoch": 1.6303906646372401, + "grad_norm": 6.03125, + "learning_rate": 9.050218363549885e-07, + "loss": 1.57546177, + "memory(GiB)": 117.38, + "step": 64270, + "train_speed(iter/s)": 1.636311 + }, + { + "acc": 0.65279284, + "epoch": 1.6305175038051751, + "grad_norm": 5.9375, + "learning_rate": 9.044202264115958e-07, + "loss": 1.58305664, + "memory(GiB)": 117.38, + "step": 64275, + "train_speed(iter/s)": 1.636325 + }, + { + "acc": 0.66002922, + "epoch": 1.6306443429731101, + "grad_norm": 5.71875, + "learning_rate": 9.038187966100864e-07, + "loss": 1.54904785, + "memory(GiB)": 117.38, + "step": 64280, + "train_speed(iter/s)": 1.636338 + }, + { + "acc": 0.65746746, + "epoch": 1.630771182141045, + "grad_norm": 5.8125, + "learning_rate": 9.032175469769155e-07, + "loss": 1.61362495, + "memory(GiB)": 117.38, + "step": 64285, + "train_speed(iter/s)": 1.636351 + }, + { + "acc": 0.65153384, + "epoch": 1.63089802130898, + "grad_norm": 6.375, + "learning_rate": 9.026164775385294e-07, + "loss": 1.53693304, + "memory(GiB)": 117.38, + "step": 64290, + "train_speed(iter/s)": 1.636365 + }, + { + "acc": 0.65331411, + "epoch": 1.6310248604769153, + "grad_norm": 5.625, + "learning_rate": 9.020155883213627e-07, + "loss": 1.55656967, + "memory(GiB)": 117.38, + "step": 64295, + "train_speed(iter/s)": 1.636378 + }, + { + "acc": 0.65635805, + "epoch": 1.6311516996448503, + "grad_norm": 5.5, + "learning_rate": 9.014148793518451e-07, + "loss": 1.55267391, + "memory(GiB)": 117.38, + "step": 64300, + "train_speed(iter/s)": 1.636392 + }, + { + "acc": 0.65309649, + "epoch": 1.6312785388127855, + "grad_norm": 5.40625, + "learning_rate": 9.00814350656401e-07, + "loss": 1.61679459, + "memory(GiB)": 117.38, + "step": 64305, + "train_speed(iter/s)": 1.636405 + }, + { + "acc": 0.63942447, + "epoch": 1.6314053779807205, + "grad_norm": 6.0, + "learning_rate": 9.002140022614452e-07, + "loss": 1.65091991, + "memory(GiB)": 117.38, + "step": 64310, + "train_speed(iter/s)": 1.636418 + }, + { + "acc": 0.6526722, + "epoch": 1.6315322171486555, + "grad_norm": 5.84375, + "learning_rate": 8.996138341933786e-07, + "loss": 1.6415432, + "memory(GiB)": 117.38, + "step": 64315, + "train_speed(iter/s)": 1.636431 + }, + { + "acc": 0.65593309, + "epoch": 1.6316590563165905, + "grad_norm": 5.90625, + "learning_rate": 8.990138464786041e-07, + "loss": 1.65720711, + "memory(GiB)": 117.38, + "step": 64320, + "train_speed(iter/s)": 1.636444 + }, + { + "acc": 0.66309729, + "epoch": 1.6317858954845255, + "grad_norm": 5.0, + "learning_rate": 8.984140391435103e-07, + "loss": 1.5448595, + "memory(GiB)": 117.38, + "step": 64325, + "train_speed(iter/s)": 1.636457 + }, + { + "acc": 0.64228854, + "epoch": 1.6319127346524607, + "grad_norm": 6.0625, + "learning_rate": 8.978144122144805e-07, + "loss": 1.61040192, + "memory(GiB)": 117.38, + "step": 64330, + "train_speed(iter/s)": 1.63647 + }, + { + "acc": 0.64958572, + "epoch": 1.6320395738203959, + "grad_norm": 5.125, + "learning_rate": 8.972149657178852e-07, + "loss": 1.57734375, + "memory(GiB)": 117.38, + "step": 64335, + "train_speed(iter/s)": 1.636483 + }, + { + "acc": 0.63760729, + "epoch": 1.6321664129883309, + "grad_norm": 7.71875, + "learning_rate": 8.966156996800951e-07, + "loss": 1.68630714, + "memory(GiB)": 117.38, + "step": 64340, + "train_speed(iter/s)": 1.636496 + }, + { + "acc": 0.65831456, + "epoch": 1.6322932521562659, + "grad_norm": 6.0, + "learning_rate": 8.960166141274662e-07, + "loss": 1.61654854, + "memory(GiB)": 117.38, + "step": 64345, + "train_speed(iter/s)": 1.636511 + }, + { + "acc": 0.65944462, + "epoch": 1.6324200913242009, + "grad_norm": 6.3125, + "learning_rate": 8.954177090863497e-07, + "loss": 1.5868371, + "memory(GiB)": 117.38, + "step": 64350, + "train_speed(iter/s)": 1.636524 + }, + { + "acc": 0.64768248, + "epoch": 1.6325469304921358, + "grad_norm": 6.96875, + "learning_rate": 8.948189845830879e-07, + "loss": 1.61492577, + "memory(GiB)": 117.38, + "step": 64355, + "train_speed(iter/s)": 1.636537 + }, + { + "acc": 0.65644655, + "epoch": 1.632673769660071, + "grad_norm": 6.125, + "learning_rate": 8.942204406440159e-07, + "loss": 1.58328247, + "memory(GiB)": 117.38, + "step": 64360, + "train_speed(iter/s)": 1.636551 + }, + { + "acc": 0.65405083, + "epoch": 1.632800608828006, + "grad_norm": 6.1875, + "learning_rate": 8.936220772954595e-07, + "loss": 1.65491486, + "memory(GiB)": 117.38, + "step": 64365, + "train_speed(iter/s)": 1.636565 + }, + { + "acc": 0.66227741, + "epoch": 1.6329274479959413, + "grad_norm": 5.0, + "learning_rate": 8.930238945637381e-07, + "loss": 1.63686008, + "memory(GiB)": 117.38, + "step": 64370, + "train_speed(iter/s)": 1.636579 + }, + { + "acc": 0.66622677, + "epoch": 1.6330542871638762, + "grad_norm": 7.65625, + "learning_rate": 8.924258924751617e-07, + "loss": 1.53165569, + "memory(GiB)": 117.38, + "step": 64375, + "train_speed(iter/s)": 1.636593 + }, + { + "acc": 0.66397347, + "epoch": 1.6331811263318112, + "grad_norm": 5.40625, + "learning_rate": 8.918280710560339e-07, + "loss": 1.51180067, + "memory(GiB)": 117.38, + "step": 64380, + "train_speed(iter/s)": 1.636606 + }, + { + "acc": 0.65786042, + "epoch": 1.6333079654997462, + "grad_norm": 5.6875, + "learning_rate": 8.912304303326491e-07, + "loss": 1.56389637, + "memory(GiB)": 117.38, + "step": 64385, + "train_speed(iter/s)": 1.63662 + }, + { + "acc": 0.65657158, + "epoch": 1.6334348046676814, + "grad_norm": 5.46875, + "learning_rate": 8.906329703312943e-07, + "loss": 1.61333218, + "memory(GiB)": 117.38, + "step": 64390, + "train_speed(iter/s)": 1.636634 + }, + { + "acc": 0.65686479, + "epoch": 1.6335616438356164, + "grad_norm": 5.53125, + "learning_rate": 8.900356910782487e-07, + "loss": 1.58123417, + "memory(GiB)": 117.38, + "step": 64395, + "train_speed(iter/s)": 1.636647 + }, + { + "acc": 0.6510046, + "epoch": 1.6336884830035516, + "grad_norm": 6.15625, + "learning_rate": 8.894385925997828e-07, + "loss": 1.63838387, + "memory(GiB)": 117.38, + "step": 64400, + "train_speed(iter/s)": 1.636659 + }, + { + "acc": 0.66055756, + "epoch": 1.6338153221714866, + "grad_norm": 5.0625, + "learning_rate": 8.88841674922159e-07, + "loss": 1.55805283, + "memory(GiB)": 117.38, + "step": 64405, + "train_speed(iter/s)": 1.636673 + }, + { + "acc": 0.65006514, + "epoch": 1.6339421613394216, + "grad_norm": 5.21875, + "learning_rate": 8.882449380716351e-07, + "loss": 1.63462143, + "memory(GiB)": 117.38, + "step": 64410, + "train_speed(iter/s)": 1.636685 + }, + { + "acc": 0.67003059, + "epoch": 1.6340690005073566, + "grad_norm": 5.84375, + "learning_rate": 8.876483820744558e-07, + "loss": 1.53705578, + "memory(GiB)": 117.38, + "step": 64415, + "train_speed(iter/s)": 1.636697 + }, + { + "acc": 0.6776444, + "epoch": 1.6341958396752916, + "grad_norm": 6.53125, + "learning_rate": 8.8705200695686e-07, + "loss": 1.50939484, + "memory(GiB)": 117.38, + "step": 64420, + "train_speed(iter/s)": 1.63671 + }, + { + "acc": 0.65658298, + "epoch": 1.6343226788432268, + "grad_norm": 5.46875, + "learning_rate": 8.864558127450807e-07, + "loss": 1.64592018, + "memory(GiB)": 117.38, + "step": 64425, + "train_speed(iter/s)": 1.636724 + }, + { + "acc": 0.67021289, + "epoch": 1.634449518011162, + "grad_norm": 4.875, + "learning_rate": 8.858597994653417e-07, + "loss": 1.56698513, + "memory(GiB)": 117.38, + "step": 64430, + "train_speed(iter/s)": 1.636737 + }, + { + "acc": 0.6748867, + "epoch": 1.634576357179097, + "grad_norm": 5.90625, + "learning_rate": 8.852639671438562e-07, + "loss": 1.50503607, + "memory(GiB)": 117.38, + "step": 64435, + "train_speed(iter/s)": 1.636749 + }, + { + "acc": 0.65975094, + "epoch": 1.634703196347032, + "grad_norm": 4.6875, + "learning_rate": 8.846683158068309e-07, + "loss": 1.53684912, + "memory(GiB)": 117.38, + "step": 64440, + "train_speed(iter/s)": 1.636762 + }, + { + "acc": 0.65763922, + "epoch": 1.634830035514967, + "grad_norm": 7.65625, + "learning_rate": 8.840728454804676e-07, + "loss": 1.57818031, + "memory(GiB)": 117.38, + "step": 64445, + "train_speed(iter/s)": 1.636774 + }, + { + "acc": 0.66745706, + "epoch": 1.634956874682902, + "grad_norm": 5.625, + "learning_rate": 8.834775561909587e-07, + "loss": 1.56720648, + "memory(GiB)": 117.38, + "step": 64450, + "train_speed(iter/s)": 1.636787 + }, + { + "acc": 0.64632545, + "epoch": 1.6350837138508372, + "grad_norm": 5.1875, + "learning_rate": 8.828824479644827e-07, + "loss": 1.7071949, + "memory(GiB)": 117.38, + "step": 64455, + "train_speed(iter/s)": 1.636799 + }, + { + "acc": 0.64682975, + "epoch": 1.6352105530187722, + "grad_norm": 6.9375, + "learning_rate": 8.822875208272203e-07, + "loss": 1.57792473, + "memory(GiB)": 117.38, + "step": 64460, + "train_speed(iter/s)": 1.636813 + }, + { + "acc": 0.64925599, + "epoch": 1.6353373921867074, + "grad_norm": 7.0625, + "learning_rate": 8.816927748053361e-07, + "loss": 1.63652573, + "memory(GiB)": 117.38, + "step": 64465, + "train_speed(iter/s)": 1.636826 + }, + { + "acc": 0.64419012, + "epoch": 1.6354642313546424, + "grad_norm": 6.03125, + "learning_rate": 8.810982099249926e-07, + "loss": 1.66783848, + "memory(GiB)": 117.38, + "step": 64470, + "train_speed(iter/s)": 1.63684 + }, + { + "acc": 0.66090884, + "epoch": 1.6355910705225774, + "grad_norm": 5.96875, + "learning_rate": 8.805038262123361e-07, + "loss": 1.61365662, + "memory(GiB)": 117.38, + "step": 64475, + "train_speed(iter/s)": 1.636853 + }, + { + "acc": 0.6645216, + "epoch": 1.6357179096905123, + "grad_norm": 5.9375, + "learning_rate": 8.799096236935156e-07, + "loss": 1.60325813, + "memory(GiB)": 117.38, + "step": 64480, + "train_speed(iter/s)": 1.636865 + }, + { + "acc": 0.65705695, + "epoch": 1.6358447488584473, + "grad_norm": 5.625, + "learning_rate": 8.793156023946641e-07, + "loss": 1.60500793, + "memory(GiB)": 117.38, + "step": 64485, + "train_speed(iter/s)": 1.636879 + }, + { + "acc": 0.65052996, + "epoch": 1.6359715880263825, + "grad_norm": 5.9375, + "learning_rate": 8.787217623419104e-07, + "loss": 1.64182587, + "memory(GiB)": 117.38, + "step": 64490, + "train_speed(iter/s)": 1.636892 + }, + { + "acc": 0.66665125, + "epoch": 1.6360984271943178, + "grad_norm": 6.03125, + "learning_rate": 8.781281035613737e-07, + "loss": 1.56315975, + "memory(GiB)": 117.38, + "step": 64495, + "train_speed(iter/s)": 1.636906 + }, + { + "acc": 0.65590496, + "epoch": 1.6362252663622527, + "grad_norm": 5.53125, + "learning_rate": 8.775346260791656e-07, + "loss": 1.6054451, + "memory(GiB)": 117.38, + "step": 64500, + "train_speed(iter/s)": 1.636919 + }, + { + "acc": 0.6460043, + "epoch": 1.6363521055301877, + "grad_norm": 6.59375, + "learning_rate": 8.769413299213903e-07, + "loss": 1.64523582, + "memory(GiB)": 117.38, + "step": 64505, + "train_speed(iter/s)": 1.636931 + }, + { + "acc": 0.65839362, + "epoch": 1.6364789446981227, + "grad_norm": 7.15625, + "learning_rate": 8.763482151141434e-07, + "loss": 1.60169888, + "memory(GiB)": 117.38, + "step": 64510, + "train_speed(iter/s)": 1.636944 + }, + { + "acc": 0.65998778, + "epoch": 1.6366057838660577, + "grad_norm": 5.625, + "learning_rate": 8.75755281683513e-07, + "loss": 1.59584618, + "memory(GiB)": 117.38, + "step": 64515, + "train_speed(iter/s)": 1.636957 + }, + { + "acc": 0.66168942, + "epoch": 1.636732623033993, + "grad_norm": 6.21875, + "learning_rate": 8.751625296555782e-07, + "loss": 1.57391157, + "memory(GiB)": 117.38, + "step": 64520, + "train_speed(iter/s)": 1.63697 + }, + { + "acc": 0.65584211, + "epoch": 1.636859462201928, + "grad_norm": 5.6875, + "learning_rate": 8.745699590564122e-07, + "loss": 1.59686718, + "memory(GiB)": 117.38, + "step": 64525, + "train_speed(iter/s)": 1.636982 + }, + { + "acc": 0.66042361, + "epoch": 1.6369863013698631, + "grad_norm": 7.0, + "learning_rate": 8.739775699120773e-07, + "loss": 1.54451761, + "memory(GiB)": 117.38, + "step": 64530, + "train_speed(iter/s)": 1.636996 + }, + { + "acc": 0.65598269, + "epoch": 1.637113140537798, + "grad_norm": 6.53125, + "learning_rate": 8.733853622486305e-07, + "loss": 1.57635136, + "memory(GiB)": 117.38, + "step": 64535, + "train_speed(iter/s)": 1.637009 + }, + { + "acc": 0.66244283, + "epoch": 1.637239979705733, + "grad_norm": 5.8125, + "learning_rate": 8.727933360921198e-07, + "loss": 1.54611378, + "memory(GiB)": 117.38, + "step": 64540, + "train_speed(iter/s)": 1.637022 + }, + { + "acc": 0.6711031, + "epoch": 1.637366818873668, + "grad_norm": 5.09375, + "learning_rate": 8.722014914685834e-07, + "loss": 1.55091829, + "memory(GiB)": 117.38, + "step": 64545, + "train_speed(iter/s)": 1.637035 + }, + { + "acc": 0.64969845, + "epoch": 1.6374936580416033, + "grad_norm": 8.25, + "learning_rate": 8.71609828404057e-07, + "loss": 1.70320358, + "memory(GiB)": 117.38, + "step": 64550, + "train_speed(iter/s)": 1.637047 + }, + { + "acc": 0.65629444, + "epoch": 1.6376204972095383, + "grad_norm": 5.1875, + "learning_rate": 8.710183469245614e-07, + "loss": 1.61533871, + "memory(GiB)": 117.38, + "step": 64555, + "train_speed(iter/s)": 1.63706 + }, + { + "acc": 0.66461658, + "epoch": 1.6377473363774735, + "grad_norm": 5.28125, + "learning_rate": 8.704270470561132e-07, + "loss": 1.52329006, + "memory(GiB)": 117.38, + "step": 64560, + "train_speed(iter/s)": 1.637073 + }, + { + "acc": 0.65518188, + "epoch": 1.6378741755454085, + "grad_norm": 5.71875, + "learning_rate": 8.698359288247194e-07, + "loss": 1.57330151, + "memory(GiB)": 117.38, + "step": 64565, + "train_speed(iter/s)": 1.637087 + }, + { + "acc": 0.65063562, + "epoch": 1.6380010147133435, + "grad_norm": 7.0625, + "learning_rate": 8.692449922563839e-07, + "loss": 1.63121166, + "memory(GiB)": 117.38, + "step": 64570, + "train_speed(iter/s)": 1.6371 + }, + { + "acc": 0.65189147, + "epoch": 1.6381278538812785, + "grad_norm": 6.5, + "learning_rate": 8.686542373770951e-07, + "loss": 1.63193474, + "memory(GiB)": 117.38, + "step": 64575, + "train_speed(iter/s)": 1.637113 + }, + { + "acc": 0.67223182, + "epoch": 1.6382546930492135, + "grad_norm": 5.125, + "learning_rate": 8.680636642128365e-07, + "loss": 1.58498354, + "memory(GiB)": 117.38, + "step": 64580, + "train_speed(iter/s)": 1.637127 + }, + { + "acc": 0.65705099, + "epoch": 1.6383815322171487, + "grad_norm": 5.21875, + "learning_rate": 8.674732727895874e-07, + "loss": 1.54217386, + "memory(GiB)": 117.38, + "step": 64585, + "train_speed(iter/s)": 1.63714 + }, + { + "acc": 0.64737816, + "epoch": 1.6385083713850839, + "grad_norm": 5.59375, + "learning_rate": 8.668830631333147e-07, + "loss": 1.75328445, + "memory(GiB)": 117.38, + "step": 64590, + "train_speed(iter/s)": 1.637155 + }, + { + "acc": 0.65661049, + "epoch": 1.6386352105530189, + "grad_norm": 5.84375, + "learning_rate": 8.662930352699761e-07, + "loss": 1.60997772, + "memory(GiB)": 117.38, + "step": 64595, + "train_speed(iter/s)": 1.637167 + }, + { + "acc": 0.67823992, + "epoch": 1.6387620497209539, + "grad_norm": 5.90625, + "learning_rate": 8.657031892255263e-07, + "loss": 1.54094582, + "memory(GiB)": 117.38, + "step": 64600, + "train_speed(iter/s)": 1.637181 + }, + { + "acc": 0.67424746, + "epoch": 1.6388888888888888, + "grad_norm": 6.0, + "learning_rate": 8.651135250259091e-07, + "loss": 1.50796165, + "memory(GiB)": 117.38, + "step": 64605, + "train_speed(iter/s)": 1.637194 + }, + { + "acc": 0.64996595, + "epoch": 1.6390157280568238, + "grad_norm": 7.375, + "learning_rate": 8.645240426970608e-07, + "loss": 1.60432434, + "memory(GiB)": 117.38, + "step": 64610, + "train_speed(iter/s)": 1.637208 + }, + { + "acc": 0.64492421, + "epoch": 1.639142567224759, + "grad_norm": 5.59375, + "learning_rate": 8.639347422649058e-07, + "loss": 1.59048653, + "memory(GiB)": 117.38, + "step": 64615, + "train_speed(iter/s)": 1.637221 + }, + { + "acc": 0.66685057, + "epoch": 1.639269406392694, + "grad_norm": 5.40625, + "learning_rate": 8.633456237553689e-07, + "loss": 1.52912235, + "memory(GiB)": 117.38, + "step": 64620, + "train_speed(iter/s)": 1.637235 + }, + { + "acc": 0.65340443, + "epoch": 1.6393962455606292, + "grad_norm": 5.71875, + "learning_rate": 8.627566871943604e-07, + "loss": 1.60479507, + "memory(GiB)": 117.38, + "step": 64625, + "train_speed(iter/s)": 1.637248 + }, + { + "acc": 0.65466242, + "epoch": 1.6395230847285642, + "grad_norm": 5.3125, + "learning_rate": 8.621679326077836e-07, + "loss": 1.56817703, + "memory(GiB)": 117.38, + "step": 64630, + "train_speed(iter/s)": 1.637262 + }, + { + "acc": 0.67594848, + "epoch": 1.6396499238964992, + "grad_norm": 6.5, + "learning_rate": 8.61579360021536e-07, + "loss": 1.51457767, + "memory(GiB)": 117.38, + "step": 64635, + "train_speed(iter/s)": 1.637275 + }, + { + "acc": 0.66544476, + "epoch": 1.6397767630644342, + "grad_norm": 5.71875, + "learning_rate": 8.609909694615043e-07, + "loss": 1.60106907, + "memory(GiB)": 117.38, + "step": 64640, + "train_speed(iter/s)": 1.637288 + }, + { + "acc": 0.65676861, + "epoch": 1.6399036022323692, + "grad_norm": 5.875, + "learning_rate": 8.604027609535693e-07, + "loss": 1.56086674, + "memory(GiB)": 117.38, + "step": 64645, + "train_speed(iter/s)": 1.637301 + }, + { + "acc": 0.6629344, + "epoch": 1.6400304414003044, + "grad_norm": 5.875, + "learning_rate": 8.598147345236029e-07, + "loss": 1.55487366, + "memory(GiB)": 117.38, + "step": 64650, + "train_speed(iter/s)": 1.637315 + }, + { + "acc": 0.64780807, + "epoch": 1.6401572805682396, + "grad_norm": 5.78125, + "learning_rate": 8.592268901974688e-07, + "loss": 1.58347492, + "memory(GiB)": 117.38, + "step": 64655, + "train_speed(iter/s)": 1.637329 + }, + { + "acc": 0.65043926, + "epoch": 1.6402841197361746, + "grad_norm": 7.75, + "learning_rate": 8.586392280010237e-07, + "loss": 1.66943111, + "memory(GiB)": 117.38, + "step": 64660, + "train_speed(iter/s)": 1.637342 + }, + { + "acc": 0.63919687, + "epoch": 1.6404109589041096, + "grad_norm": 6.75, + "learning_rate": 8.580517479601147e-07, + "loss": 1.67320137, + "memory(GiB)": 117.38, + "step": 64665, + "train_speed(iter/s)": 1.637356 + }, + { + "acc": 0.66788187, + "epoch": 1.6405377980720446, + "grad_norm": 7.3125, + "learning_rate": 8.57464450100583e-07, + "loss": 1.5314394, + "memory(GiB)": 117.38, + "step": 64670, + "train_speed(iter/s)": 1.637369 + }, + { + "acc": 0.66296434, + "epoch": 1.6406646372399796, + "grad_norm": 8.5, + "learning_rate": 8.568773344482595e-07, + "loss": 1.58386822, + "memory(GiB)": 117.38, + "step": 64675, + "train_speed(iter/s)": 1.637382 + }, + { + "acc": 0.66180706, + "epoch": 1.6407914764079148, + "grad_norm": 6.0625, + "learning_rate": 8.562904010289685e-07, + "loss": 1.59790993, + "memory(GiB)": 117.38, + "step": 64680, + "train_speed(iter/s)": 1.637395 + }, + { + "acc": 0.65477934, + "epoch": 1.6409183155758498, + "grad_norm": 6.0, + "learning_rate": 8.557036498685245e-07, + "loss": 1.60289841, + "memory(GiB)": 117.38, + "step": 64685, + "train_speed(iter/s)": 1.637409 + }, + { + "acc": 0.65912018, + "epoch": 1.641045154743785, + "grad_norm": 5.625, + "learning_rate": 8.551170809927394e-07, + "loss": 1.62870331, + "memory(GiB)": 117.38, + "step": 64690, + "train_speed(iter/s)": 1.637424 + }, + { + "acc": 0.6732944, + "epoch": 1.64117199391172, + "grad_norm": 6.40625, + "learning_rate": 8.545306944274101e-07, + "loss": 1.53669538, + "memory(GiB)": 117.38, + "step": 64695, + "train_speed(iter/s)": 1.637437 + }, + { + "acc": 0.66373663, + "epoch": 1.641298833079655, + "grad_norm": 5.28125, + "learning_rate": 8.539444901983284e-07, + "loss": 1.55918283, + "memory(GiB)": 117.38, + "step": 64700, + "train_speed(iter/s)": 1.63745 + }, + { + "acc": 0.66447511, + "epoch": 1.64142567224759, + "grad_norm": 6.90625, + "learning_rate": 8.533584683312779e-07, + "loss": 1.60827999, + "memory(GiB)": 117.38, + "step": 64705, + "train_speed(iter/s)": 1.637464 + }, + { + "acc": 0.64863806, + "epoch": 1.6415525114155252, + "grad_norm": 7.03125, + "learning_rate": 8.527726288520377e-07, + "loss": 1.64336967, + "memory(GiB)": 117.38, + "step": 64710, + "train_speed(iter/s)": 1.637479 + }, + { + "acc": 0.65318751, + "epoch": 1.6416793505834602, + "grad_norm": 5.65625, + "learning_rate": 8.521869717863723e-07, + "loss": 1.61412544, + "memory(GiB)": 117.38, + "step": 64715, + "train_speed(iter/s)": 1.637492 + }, + { + "acc": 0.65827713, + "epoch": 1.6418061897513954, + "grad_norm": 5.78125, + "learning_rate": 8.516014971600411e-07, + "loss": 1.61229076, + "memory(GiB)": 117.38, + "step": 64720, + "train_speed(iter/s)": 1.637506 + }, + { + "acc": 0.64206033, + "epoch": 1.6419330289193304, + "grad_norm": 6.8125, + "learning_rate": 8.510162049987986e-07, + "loss": 1.64799881, + "memory(GiB)": 117.38, + "step": 64725, + "train_speed(iter/s)": 1.63752 + }, + { + "acc": 0.6657445, + "epoch": 1.6420598680872653, + "grad_norm": 5.375, + "learning_rate": 8.504310953283884e-07, + "loss": 1.59640121, + "memory(GiB)": 117.38, + "step": 64730, + "train_speed(iter/s)": 1.637534 + }, + { + "acc": 0.65431252, + "epoch": 1.6421867072552003, + "grad_norm": 6.4375, + "learning_rate": 8.49846168174544e-07, + "loss": 1.60152779, + "memory(GiB)": 117.38, + "step": 64735, + "train_speed(iter/s)": 1.637547 + }, + { + "acc": 0.66547089, + "epoch": 1.6423135464231353, + "grad_norm": 5.8125, + "learning_rate": 8.492614235629932e-07, + "loss": 1.61319695, + "memory(GiB)": 117.38, + "step": 64740, + "train_speed(iter/s)": 1.637561 + }, + { + "acc": 0.66679211, + "epoch": 1.6424403855910705, + "grad_norm": 6.40625, + "learning_rate": 8.486768615194579e-07, + "loss": 1.65451031, + "memory(GiB)": 117.38, + "step": 64745, + "train_speed(iter/s)": 1.637574 + }, + { + "acc": 0.67732229, + "epoch": 1.6425672247590057, + "grad_norm": 5.53125, + "learning_rate": 8.480924820696495e-07, + "loss": 1.49379568, + "memory(GiB)": 117.38, + "step": 64750, + "train_speed(iter/s)": 1.637588 + }, + { + "acc": 0.65555902, + "epoch": 1.6426940639269407, + "grad_norm": 5.28125, + "learning_rate": 8.475082852392685e-07, + "loss": 1.63650131, + "memory(GiB)": 117.38, + "step": 64755, + "train_speed(iter/s)": 1.637601 + }, + { + "acc": 0.66189566, + "epoch": 1.6428209030948757, + "grad_norm": 5.21875, + "learning_rate": 8.469242710540138e-07, + "loss": 1.57348881, + "memory(GiB)": 117.38, + "step": 64760, + "train_speed(iter/s)": 1.637615 + }, + { + "acc": 0.64581356, + "epoch": 1.6429477422628107, + "grad_norm": 6.0, + "learning_rate": 8.463404395395713e-07, + "loss": 1.68014965, + "memory(GiB)": 117.38, + "step": 64765, + "train_speed(iter/s)": 1.637629 + }, + { + "acc": 0.65202537, + "epoch": 1.6430745814307457, + "grad_norm": 5.875, + "learning_rate": 8.45756790721623e-07, + "loss": 1.56607771, + "memory(GiB)": 117.38, + "step": 64770, + "train_speed(iter/s)": 1.637642 + }, + { + "acc": 0.64969139, + "epoch": 1.643201420598681, + "grad_norm": 6.3125, + "learning_rate": 8.451733246258354e-07, + "loss": 1.63686638, + "memory(GiB)": 117.38, + "step": 64775, + "train_speed(iter/s)": 1.637656 + }, + { + "acc": 0.6547502, + "epoch": 1.643328259766616, + "grad_norm": 5.84375, + "learning_rate": 8.445900412778768e-07, + "loss": 1.65765114, + "memory(GiB)": 117.38, + "step": 64780, + "train_speed(iter/s)": 1.63767 + }, + { + "acc": 0.6582253, + "epoch": 1.643455098934551, + "grad_norm": 4.75, + "learning_rate": 8.440069407034002e-07, + "loss": 1.54100533, + "memory(GiB)": 117.38, + "step": 64785, + "train_speed(iter/s)": 1.637684 + }, + { + "acc": 0.67139196, + "epoch": 1.643581938102486, + "grad_norm": 6.9375, + "learning_rate": 8.434240229280538e-07, + "loss": 1.55577374, + "memory(GiB)": 117.38, + "step": 64790, + "train_speed(iter/s)": 1.637697 + }, + { + "acc": 0.66159654, + "epoch": 1.643708777270421, + "grad_norm": 5.59375, + "learning_rate": 8.428412879774767e-07, + "loss": 1.65142803, + "memory(GiB)": 117.38, + "step": 64795, + "train_speed(iter/s)": 1.637711 + }, + { + "acc": 0.66612983, + "epoch": 1.643835616438356, + "grad_norm": 5.5, + "learning_rate": 8.422587358772999e-07, + "loss": 1.57532177, + "memory(GiB)": 128.51, + "step": 64800, + "train_speed(iter/s)": 1.637723 + }, + { + "acc": 0.65604439, + "epoch": 1.643962455606291, + "grad_norm": 6.375, + "learning_rate": 8.416763666531468e-07, + "loss": 1.61523132, + "memory(GiB)": 128.51, + "step": 64805, + "train_speed(iter/s)": 1.637736 + }, + { + "acc": 0.65885863, + "epoch": 1.6440892947742263, + "grad_norm": 6.15625, + "learning_rate": 8.410941803306327e-07, + "loss": 1.52201481, + "memory(GiB)": 128.51, + "step": 64810, + "train_speed(iter/s)": 1.63775 + }, + { + "acc": 0.65644484, + "epoch": 1.6442161339421615, + "grad_norm": 6.09375, + "learning_rate": 8.405121769353647e-07, + "loss": 1.64983139, + "memory(GiB)": 128.51, + "step": 64815, + "train_speed(iter/s)": 1.637764 + }, + { + "acc": 0.6522748, + "epoch": 1.6443429731100965, + "grad_norm": 5.375, + "learning_rate": 8.399303564929423e-07, + "loss": 1.57061501, + "memory(GiB)": 128.51, + "step": 64820, + "train_speed(iter/s)": 1.637777 + }, + { + "acc": 0.65681, + "epoch": 1.6444698122780315, + "grad_norm": 6.5, + "learning_rate": 8.393487190289562e-07, + "loss": 1.55727634, + "memory(GiB)": 128.51, + "step": 64825, + "train_speed(iter/s)": 1.637791 + }, + { + "acc": 0.64951587, + "epoch": 1.6445966514459665, + "grad_norm": 5.25, + "learning_rate": 8.387672645689887e-07, + "loss": 1.62064781, + "memory(GiB)": 128.51, + "step": 64830, + "train_speed(iter/s)": 1.637805 + }, + { + "acc": 0.66965523, + "epoch": 1.6447234906139014, + "grad_norm": 6.125, + "learning_rate": 8.381859931386161e-07, + "loss": 1.56142445, + "memory(GiB)": 128.51, + "step": 64835, + "train_speed(iter/s)": 1.637818 + }, + { + "acc": 0.63712821, + "epoch": 1.6448503297818367, + "grad_norm": 5.9375, + "learning_rate": 8.37604904763405e-07, + "loss": 1.5931572, + "memory(GiB)": 128.51, + "step": 64840, + "train_speed(iter/s)": 1.637832 + }, + { + "acc": 0.64802895, + "epoch": 1.6449771689497716, + "grad_norm": 5.65625, + "learning_rate": 8.370239994689123e-07, + "loss": 1.58122149, + "memory(GiB)": 128.51, + "step": 64845, + "train_speed(iter/s)": 1.637846 + }, + { + "acc": 0.6716279, + "epoch": 1.6451040081177069, + "grad_norm": 5.78125, + "learning_rate": 8.364432772806924e-07, + "loss": 1.55457172, + "memory(GiB)": 128.51, + "step": 64850, + "train_speed(iter/s)": 1.637636 + }, + { + "acc": 0.66971855, + "epoch": 1.6452308472856418, + "grad_norm": 5.03125, + "learning_rate": 8.358627382242857e-07, + "loss": 1.5141942, + "memory(GiB)": 128.51, + "step": 64855, + "train_speed(iter/s)": 1.63765 + }, + { + "acc": 0.6456254, + "epoch": 1.6453576864535768, + "grad_norm": 8.25, + "learning_rate": 8.352823823252254e-07, + "loss": 1.65165844, + "memory(GiB)": 128.51, + "step": 64860, + "train_speed(iter/s)": 1.637663 + }, + { + "acc": 0.63329515, + "epoch": 1.6454845256215118, + "grad_norm": 5.34375, + "learning_rate": 8.347022096090418e-07, + "loss": 1.69572487, + "memory(GiB)": 128.51, + "step": 64865, + "train_speed(iter/s)": 1.637676 + }, + { + "acc": 0.66516323, + "epoch": 1.645611364789447, + "grad_norm": 6.96875, + "learning_rate": 8.341222201012527e-07, + "loss": 1.55688972, + "memory(GiB)": 128.51, + "step": 64870, + "train_speed(iter/s)": 1.637689 + }, + { + "acc": 0.65153027, + "epoch": 1.645738203957382, + "grad_norm": 6.21875, + "learning_rate": 8.335424138273668e-07, + "loss": 1.64163017, + "memory(GiB)": 128.51, + "step": 64875, + "train_speed(iter/s)": 1.637702 + }, + { + "acc": 0.65290432, + "epoch": 1.6458650431253172, + "grad_norm": 6.8125, + "learning_rate": 8.329627908128857e-07, + "loss": 1.60109043, + "memory(GiB)": 128.51, + "step": 64880, + "train_speed(iter/s)": 1.637715 + }, + { + "acc": 0.66773653, + "epoch": 1.6459918822932522, + "grad_norm": 6.4375, + "learning_rate": 8.323833510833068e-07, + "loss": 1.55382442, + "memory(GiB)": 128.51, + "step": 64885, + "train_speed(iter/s)": 1.637729 + }, + { + "acc": 0.65851893, + "epoch": 1.6461187214611872, + "grad_norm": 5.5, + "learning_rate": 8.318040946641171e-07, + "loss": 1.64109154, + "memory(GiB)": 128.51, + "step": 64890, + "train_speed(iter/s)": 1.637743 + }, + { + "acc": 0.66323276, + "epoch": 1.6462455606291222, + "grad_norm": 5.90625, + "learning_rate": 8.312250215807898e-07, + "loss": 1.6147213, + "memory(GiB)": 128.51, + "step": 64895, + "train_speed(iter/s)": 1.637756 + }, + { + "acc": 0.66610441, + "epoch": 1.6463723997970572, + "grad_norm": 5.90625, + "learning_rate": 8.306461318587999e-07, + "loss": 1.56968899, + "memory(GiB)": 128.51, + "step": 64900, + "train_speed(iter/s)": 1.63777 + }, + { + "acc": 0.63668189, + "epoch": 1.6464992389649924, + "grad_norm": 5.75, + "learning_rate": 8.300674255236074e-07, + "loss": 1.66660309, + "memory(GiB)": 128.51, + "step": 64905, + "train_speed(iter/s)": 1.637784 + }, + { + "acc": 0.6446743, + "epoch": 1.6466260781329276, + "grad_norm": 5.40625, + "learning_rate": 8.294889026006686e-07, + "loss": 1.64032745, + "memory(GiB)": 128.51, + "step": 64910, + "train_speed(iter/s)": 1.637798 + }, + { + "acc": 0.62644181, + "epoch": 1.6467529173008626, + "grad_norm": 6.1875, + "learning_rate": 8.289105631154254e-07, + "loss": 1.71175785, + "memory(GiB)": 128.51, + "step": 64915, + "train_speed(iter/s)": 1.637812 + }, + { + "acc": 0.65595312, + "epoch": 1.6468797564687976, + "grad_norm": 4.84375, + "learning_rate": 8.283324070933196e-07, + "loss": 1.59741316, + "memory(GiB)": 128.51, + "step": 64920, + "train_speed(iter/s)": 1.637824 + }, + { + "acc": 0.64044676, + "epoch": 1.6470065956367326, + "grad_norm": 5.4375, + "learning_rate": 8.277544345597793e-07, + "loss": 1.68474922, + "memory(GiB)": 128.51, + "step": 64925, + "train_speed(iter/s)": 1.637838 + }, + { + "acc": 0.63849506, + "epoch": 1.6471334348046676, + "grad_norm": 5.71875, + "learning_rate": 8.271766455402269e-07, + "loss": 1.68146992, + "memory(GiB)": 128.51, + "step": 64930, + "train_speed(iter/s)": 1.63785 + }, + { + "acc": 0.65692158, + "epoch": 1.6472602739726028, + "grad_norm": 5.4375, + "learning_rate": 8.26599040060076e-07, + "loss": 1.58570471, + "memory(GiB)": 128.51, + "step": 64935, + "train_speed(iter/s)": 1.637863 + }, + { + "acc": 0.65198889, + "epoch": 1.6473871131405378, + "grad_norm": 6.3125, + "learning_rate": 8.260216181447323e-07, + "loss": 1.57542915, + "memory(GiB)": 128.51, + "step": 64940, + "train_speed(iter/s)": 1.637877 + }, + { + "acc": 0.66240969, + "epoch": 1.647513952308473, + "grad_norm": 6.34375, + "learning_rate": 8.254443798195932e-07, + "loss": 1.52223663, + "memory(GiB)": 128.51, + "step": 64945, + "train_speed(iter/s)": 1.637889 + }, + { + "acc": 0.6646276, + "epoch": 1.647640791476408, + "grad_norm": 6.6875, + "learning_rate": 8.248673251100481e-07, + "loss": 1.61537323, + "memory(GiB)": 128.51, + "step": 64950, + "train_speed(iter/s)": 1.637903 + }, + { + "acc": 0.65987582, + "epoch": 1.647767630644343, + "grad_norm": 6.125, + "learning_rate": 8.242904540414787e-07, + "loss": 1.61644516, + "memory(GiB)": 128.51, + "step": 64955, + "train_speed(iter/s)": 1.637916 + }, + { + "acc": 0.65645065, + "epoch": 1.647894469812278, + "grad_norm": 6.75, + "learning_rate": 8.237137666392586e-07, + "loss": 1.5946147, + "memory(GiB)": 128.51, + "step": 64960, + "train_speed(iter/s)": 1.637931 + }, + { + "acc": 0.64823518, + "epoch": 1.648021308980213, + "grad_norm": 5.3125, + "learning_rate": 8.231372629287526e-07, + "loss": 1.65992336, + "memory(GiB)": 128.51, + "step": 64965, + "train_speed(iter/s)": 1.637944 + }, + { + "acc": 0.6629631, + "epoch": 1.6481481481481481, + "grad_norm": 5.9375, + "learning_rate": 8.225609429353187e-07, + "loss": 1.60226593, + "memory(GiB)": 128.51, + "step": 64970, + "train_speed(iter/s)": 1.637958 + }, + { + "acc": 0.67679634, + "epoch": 1.6482749873160834, + "grad_norm": 6.46875, + "learning_rate": 8.219848066843051e-07, + "loss": 1.52095451, + "memory(GiB)": 128.51, + "step": 64975, + "train_speed(iter/s)": 1.637971 + }, + { + "acc": 0.64920592, + "epoch": 1.6484018264840183, + "grad_norm": 5.40625, + "learning_rate": 8.214088542010529e-07, + "loss": 1.61804352, + "memory(GiB)": 128.51, + "step": 64980, + "train_speed(iter/s)": 1.637985 + }, + { + "acc": 0.64906683, + "epoch": 1.6485286656519533, + "grad_norm": 5.65625, + "learning_rate": 8.208330855108943e-07, + "loss": 1.61961708, + "memory(GiB)": 128.51, + "step": 64985, + "train_speed(iter/s)": 1.637999 + }, + { + "acc": 0.65655632, + "epoch": 1.6486555048198883, + "grad_norm": 6.40625, + "learning_rate": 8.202575006391577e-07, + "loss": 1.60711651, + "memory(GiB)": 128.51, + "step": 64990, + "train_speed(iter/s)": 1.638012 + }, + { + "acc": 0.65767536, + "epoch": 1.6487823439878233, + "grad_norm": 6.1875, + "learning_rate": 8.19682099611156e-07, + "loss": 1.51675682, + "memory(GiB)": 128.51, + "step": 64995, + "train_speed(iter/s)": 1.638026 + }, + { + "acc": 0.64426832, + "epoch": 1.6489091831557585, + "grad_norm": 5.375, + "learning_rate": 8.191068824521998e-07, + "loss": 1.64707069, + "memory(GiB)": 128.51, + "step": 65000, + "train_speed(iter/s)": 1.638039 + }, + { + "epoch": 1.6489091831557585, + "eval_acc": 0.6462789517538639, + "eval_loss": 1.5731784105300903, + "eval_runtime": 58.7741, + "eval_samples_per_second": 108.381, + "eval_steps_per_second": 27.104, + "step": 65000 + }, + { + "acc": 0.64181662, + "epoch": 1.6490360223236935, + "grad_norm": 6.75, + "learning_rate": 8.185318491875876e-07, + "loss": 1.64678955, + "memory(GiB)": 128.51, + "step": 65005, + "train_speed(iter/s)": 1.635451 + }, + { + "acc": 0.67020946, + "epoch": 1.6491628614916287, + "grad_norm": 6.28125, + "learning_rate": 8.179569998426162e-07, + "loss": 1.56151409, + "memory(GiB)": 128.51, + "step": 65010, + "train_speed(iter/s)": 1.635463 + }, + { + "acc": 0.65577993, + "epoch": 1.6492897006595637, + "grad_norm": 6.40625, + "learning_rate": 8.173823344425663e-07, + "loss": 1.56394186, + "memory(GiB)": 128.51, + "step": 65015, + "train_speed(iter/s)": 1.635476 + }, + { + "acc": 0.66167994, + "epoch": 1.6494165398274987, + "grad_norm": 6.28125, + "learning_rate": 8.168078530127138e-07, + "loss": 1.5656702, + "memory(GiB)": 128.51, + "step": 65020, + "train_speed(iter/s)": 1.635488 + }, + { + "acc": 0.6601758, + "epoch": 1.6495433789954337, + "grad_norm": 4.875, + "learning_rate": 8.162335555783301e-07, + "loss": 1.59683399, + "memory(GiB)": 128.51, + "step": 65025, + "train_speed(iter/s)": 1.6355 + }, + { + "acc": 0.64978333, + "epoch": 1.649670218163369, + "grad_norm": 5.34375, + "learning_rate": 8.156594421646752e-07, + "loss": 1.59758263, + "memory(GiB)": 128.51, + "step": 65030, + "train_speed(iter/s)": 1.635513 + }, + { + "acc": 0.66675954, + "epoch": 1.6497970573313039, + "grad_norm": 5.59375, + "learning_rate": 8.15085512796997e-07, + "loss": 1.54028206, + "memory(GiB)": 128.51, + "step": 65035, + "train_speed(iter/s)": 1.635525 + }, + { + "acc": 0.6516674, + "epoch": 1.649923896499239, + "grad_norm": 5.25, + "learning_rate": 8.145117675005431e-07, + "loss": 1.57562218, + "memory(GiB)": 128.51, + "step": 65040, + "train_speed(iter/s)": 1.635539 + }, + { + "acc": 0.64809451, + "epoch": 1.650050735667174, + "grad_norm": 7.71875, + "learning_rate": 8.13938206300549e-07, + "loss": 1.66687222, + "memory(GiB)": 128.51, + "step": 65045, + "train_speed(iter/s)": 1.635552 + }, + { + "acc": 0.64147387, + "epoch": 1.650177574835109, + "grad_norm": 5.53125, + "learning_rate": 8.133648292222435e-07, + "loss": 1.62831974, + "memory(GiB)": 128.51, + "step": 65050, + "train_speed(iter/s)": 1.635564 + }, + { + "acc": 0.65581045, + "epoch": 1.650304414003044, + "grad_norm": 6.0, + "learning_rate": 8.127916362908422e-07, + "loss": 1.62946377, + "memory(GiB)": 128.51, + "step": 65055, + "train_speed(iter/s)": 1.635577 + }, + { + "acc": 0.64711485, + "epoch": 1.650431253170979, + "grad_norm": 5.84375, + "learning_rate": 8.122186275315602e-07, + "loss": 1.65491161, + "memory(GiB)": 128.51, + "step": 65060, + "train_speed(iter/s)": 1.63559 + }, + { + "acc": 0.64933577, + "epoch": 1.6505580923389143, + "grad_norm": 6.0625, + "learning_rate": 8.116458029696e-07, + "loss": 1.65905991, + "memory(GiB)": 128.51, + "step": 65065, + "train_speed(iter/s)": 1.635602 + }, + { + "acc": 0.65868735, + "epoch": 1.6506849315068495, + "grad_norm": 5.90625, + "learning_rate": 8.110731626301577e-07, + "loss": 1.59663143, + "memory(GiB)": 128.51, + "step": 65070, + "train_speed(iter/s)": 1.635615 + }, + { + "acc": 0.65231023, + "epoch": 1.6508117706747845, + "grad_norm": 4.875, + "learning_rate": 8.105007065384191e-07, + "loss": 1.57585659, + "memory(GiB)": 128.51, + "step": 65075, + "train_speed(iter/s)": 1.635628 + }, + { + "acc": 0.66797714, + "epoch": 1.6509386098427195, + "grad_norm": 5.28125, + "learning_rate": 8.099284347195646e-07, + "loss": 1.55437717, + "memory(GiB)": 128.51, + "step": 65080, + "train_speed(iter/s)": 1.635641 + }, + { + "acc": 0.67555838, + "epoch": 1.6510654490106544, + "grad_norm": 8.125, + "learning_rate": 8.093563471987647e-07, + "loss": 1.55635719, + "memory(GiB)": 128.51, + "step": 65085, + "train_speed(iter/s)": 1.635653 + }, + { + "acc": 0.67573853, + "epoch": 1.6511922881785894, + "grad_norm": 5.9375, + "learning_rate": 8.087844440011828e-07, + "loss": 1.50783558, + "memory(GiB)": 128.51, + "step": 65090, + "train_speed(iter/s)": 1.635665 + }, + { + "acc": 0.66688285, + "epoch": 1.6513191273465246, + "grad_norm": 4.875, + "learning_rate": 8.082127251519733e-07, + "loss": 1.57410526, + "memory(GiB)": 128.51, + "step": 65095, + "train_speed(iter/s)": 1.635676 + }, + { + "acc": 0.66919785, + "epoch": 1.6514459665144596, + "grad_norm": 5.59375, + "learning_rate": 8.076411906762832e-07, + "loss": 1.56066799, + "memory(GiB)": 128.51, + "step": 65100, + "train_speed(iter/s)": 1.635689 + }, + { + "acc": 0.66150475, + "epoch": 1.6515728056823948, + "grad_norm": 6.09375, + "learning_rate": 8.070698405992511e-07, + "loss": 1.65026131, + "memory(GiB)": 128.51, + "step": 65105, + "train_speed(iter/s)": 1.635702 + }, + { + "acc": 0.65168858, + "epoch": 1.6516996448503298, + "grad_norm": 6.1875, + "learning_rate": 8.064986749460075e-07, + "loss": 1.6122757, + "memory(GiB)": 128.51, + "step": 65110, + "train_speed(iter/s)": 1.635714 + }, + { + "acc": 0.65405087, + "epoch": 1.6518264840182648, + "grad_norm": 7.34375, + "learning_rate": 8.059276937416744e-07, + "loss": 1.63427658, + "memory(GiB)": 128.51, + "step": 65115, + "train_speed(iter/s)": 1.635727 + }, + { + "acc": 0.65276527, + "epoch": 1.6519533231861998, + "grad_norm": 6.625, + "learning_rate": 8.053568970113667e-07, + "loss": 1.66405487, + "memory(GiB)": 128.51, + "step": 65120, + "train_speed(iter/s)": 1.63574 + }, + { + "acc": 0.63322868, + "epoch": 1.6520801623541348, + "grad_norm": 4.875, + "learning_rate": 8.047862847801896e-07, + "loss": 1.70209656, + "memory(GiB)": 128.51, + "step": 65125, + "train_speed(iter/s)": 1.635749 + }, + { + "acc": 0.66581726, + "epoch": 1.65220700152207, + "grad_norm": 5.71875, + "learning_rate": 8.042158570732444e-07, + "loss": 1.52808619, + "memory(GiB)": 128.51, + "step": 65130, + "train_speed(iter/s)": 1.635761 + }, + { + "acc": 0.64536476, + "epoch": 1.6523338406900052, + "grad_norm": 6.59375, + "learning_rate": 8.036456139156168e-07, + "loss": 1.66580391, + "memory(GiB)": 128.51, + "step": 65135, + "train_speed(iter/s)": 1.635774 + }, + { + "acc": 0.66046152, + "epoch": 1.6524606798579402, + "grad_norm": 5.875, + "learning_rate": 8.03075555332391e-07, + "loss": 1.70430145, + "memory(GiB)": 128.51, + "step": 65140, + "train_speed(iter/s)": 1.635787 + }, + { + "acc": 0.67178168, + "epoch": 1.6525875190258752, + "grad_norm": 9.625, + "learning_rate": 8.025056813486387e-07, + "loss": 1.59192867, + "memory(GiB)": 128.51, + "step": 65145, + "train_speed(iter/s)": 1.635799 + }, + { + "acc": 0.64705992, + "epoch": 1.6527143581938102, + "grad_norm": 6.0, + "learning_rate": 8.019359919894299e-07, + "loss": 1.64672737, + "memory(GiB)": 128.51, + "step": 65150, + "train_speed(iter/s)": 1.635812 + }, + { + "acc": 0.65914907, + "epoch": 1.6528411973617452, + "grad_norm": 7.375, + "learning_rate": 8.013664872798172e-07, + "loss": 1.56427727, + "memory(GiB)": 128.51, + "step": 65155, + "train_speed(iter/s)": 1.635824 + }, + { + "acc": 0.64570169, + "epoch": 1.6529680365296804, + "grad_norm": 6.3125, + "learning_rate": 8.007971672448511e-07, + "loss": 1.61004753, + "memory(GiB)": 128.51, + "step": 65160, + "train_speed(iter/s)": 1.635837 + }, + { + "acc": 0.65401773, + "epoch": 1.6530948756976154, + "grad_norm": 6.34375, + "learning_rate": 8.002280319095751e-07, + "loss": 1.63278847, + "memory(GiB)": 128.51, + "step": 65165, + "train_speed(iter/s)": 1.63585 + }, + { + "acc": 0.65552754, + "epoch": 1.6532217148655506, + "grad_norm": 5.3125, + "learning_rate": 7.996590812990219e-07, + "loss": 1.58666821, + "memory(GiB)": 128.51, + "step": 65170, + "train_speed(iter/s)": 1.635863 + }, + { + "acc": 0.67017293, + "epoch": 1.6533485540334856, + "grad_norm": 4.6875, + "learning_rate": 7.990903154382145e-07, + "loss": 1.52340727, + "memory(GiB)": 128.51, + "step": 65175, + "train_speed(iter/s)": 1.635875 + }, + { + "acc": 0.65655279, + "epoch": 1.6534753932014206, + "grad_norm": 5.90625, + "learning_rate": 7.985217343521695e-07, + "loss": 1.56786976, + "memory(GiB)": 128.51, + "step": 65180, + "train_speed(iter/s)": 1.635888 + }, + { + "acc": 0.65701056, + "epoch": 1.6536022323693556, + "grad_norm": 4.96875, + "learning_rate": 7.979533380658977e-07, + "loss": 1.61558609, + "memory(GiB)": 128.51, + "step": 65185, + "train_speed(iter/s)": 1.635901 + }, + { + "acc": 0.67105217, + "epoch": 1.6537290715372908, + "grad_norm": 7.1875, + "learning_rate": 7.973851266044003e-07, + "loss": 1.5201436, + "memory(GiB)": 128.51, + "step": 65190, + "train_speed(iter/s)": 1.635916 + }, + { + "acc": 0.6506609, + "epoch": 1.6538559107052258, + "grad_norm": 9.75, + "learning_rate": 7.968170999926661e-07, + "loss": 1.69856205, + "memory(GiB)": 128.51, + "step": 65195, + "train_speed(iter/s)": 1.63593 + }, + { + "acc": 0.66018848, + "epoch": 1.653982749873161, + "grad_norm": 6.03125, + "learning_rate": 7.962492582556825e-07, + "loss": 1.58916445, + "memory(GiB)": 128.51, + "step": 65200, + "train_speed(iter/s)": 1.635943 + }, + { + "acc": 0.64859557, + "epoch": 1.654109589041096, + "grad_norm": 5.78125, + "learning_rate": 7.956816014184254e-07, + "loss": 1.62006416, + "memory(GiB)": 128.51, + "step": 65205, + "train_speed(iter/s)": 1.635956 + }, + { + "acc": 0.66832409, + "epoch": 1.654236428209031, + "grad_norm": 6.875, + "learning_rate": 7.951141295058618e-07, + "loss": 1.54471521, + "memory(GiB)": 128.51, + "step": 65210, + "train_speed(iter/s)": 1.63597 + }, + { + "acc": 0.64623723, + "epoch": 1.654363267376966, + "grad_norm": 6.125, + "learning_rate": 7.945468425429525e-07, + "loss": 1.64616013, + "memory(GiB)": 128.51, + "step": 65215, + "train_speed(iter/s)": 1.635983 + }, + { + "acc": 0.63586564, + "epoch": 1.654490106544901, + "grad_norm": 5.78125, + "learning_rate": 7.939797405546496e-07, + "loss": 1.67903404, + "memory(GiB)": 128.51, + "step": 65220, + "train_speed(iter/s)": 1.635997 + }, + { + "acc": 0.66955652, + "epoch": 1.6546169457128361, + "grad_norm": 7.65625, + "learning_rate": 7.934128235658955e-07, + "loss": 1.58389778, + "memory(GiB)": 128.51, + "step": 65225, + "train_speed(iter/s)": 1.63601 + }, + { + "acc": 0.6645081, + "epoch": 1.6547437848807713, + "grad_norm": 4.9375, + "learning_rate": 7.928460916016272e-07, + "loss": 1.56747551, + "memory(GiB)": 128.51, + "step": 65230, + "train_speed(iter/s)": 1.636022 + }, + { + "acc": 0.64479227, + "epoch": 1.6548706240487063, + "grad_norm": 5.59375, + "learning_rate": 7.92279544686771e-07, + "loss": 1.67095509, + "memory(GiB)": 128.51, + "step": 65235, + "train_speed(iter/s)": 1.636035 + }, + { + "acc": 0.64896307, + "epoch": 1.6549974632166413, + "grad_norm": 5.6875, + "learning_rate": 7.917131828462465e-07, + "loss": 1.58562851, + "memory(GiB)": 128.51, + "step": 65240, + "train_speed(iter/s)": 1.636047 + }, + { + "acc": 0.65538549, + "epoch": 1.6551243023845763, + "grad_norm": 5.96875, + "learning_rate": 7.911470061049647e-07, + "loss": 1.59046879, + "memory(GiB)": 128.51, + "step": 65245, + "train_speed(iter/s)": 1.63606 + }, + { + "acc": 0.64248624, + "epoch": 1.6552511415525113, + "grad_norm": 6.25, + "learning_rate": 7.90581014487829e-07, + "loss": 1.62341557, + "memory(GiB)": 128.51, + "step": 65250, + "train_speed(iter/s)": 1.636073 + }, + { + "acc": 0.65540371, + "epoch": 1.6553779807204465, + "grad_norm": 7.09375, + "learning_rate": 7.900152080197337e-07, + "loss": 1.68097935, + "memory(GiB)": 128.51, + "step": 65255, + "train_speed(iter/s)": 1.636086 + }, + { + "acc": 0.6656146, + "epoch": 1.6555048198883815, + "grad_norm": 5.8125, + "learning_rate": 7.894495867255664e-07, + "loss": 1.60883026, + "memory(GiB)": 128.51, + "step": 65260, + "train_speed(iter/s)": 1.6361 + }, + { + "acc": 0.65339518, + "epoch": 1.6556316590563167, + "grad_norm": 6.28125, + "learning_rate": 7.888841506302048e-07, + "loss": 1.6685379, + "memory(GiB)": 128.51, + "step": 65265, + "train_speed(iter/s)": 1.636113 + }, + { + "acc": 0.67134261, + "epoch": 1.6557584982242517, + "grad_norm": 5.3125, + "learning_rate": 7.883188997585189e-07, + "loss": 1.4518425, + "memory(GiB)": 128.51, + "step": 65270, + "train_speed(iter/s)": 1.636126 + }, + { + "acc": 0.64956789, + "epoch": 1.6558853373921867, + "grad_norm": 5.34375, + "learning_rate": 7.877538341353724e-07, + "loss": 1.66238289, + "memory(GiB)": 128.51, + "step": 65275, + "train_speed(iter/s)": 1.63614 + }, + { + "acc": 0.67035651, + "epoch": 1.6560121765601217, + "grad_norm": 5.09375, + "learning_rate": 7.871889537856181e-07, + "loss": 1.58492785, + "memory(GiB)": 128.51, + "step": 65280, + "train_speed(iter/s)": 1.636152 + }, + { + "acc": 0.64411297, + "epoch": 1.6561390157280567, + "grad_norm": 6.875, + "learning_rate": 7.866242587341017e-07, + "loss": 1.61599236, + "memory(GiB)": 128.51, + "step": 65285, + "train_speed(iter/s)": 1.636164 + }, + { + "acc": 0.65285897, + "epoch": 1.6562658548959919, + "grad_norm": 6.40625, + "learning_rate": 7.860597490056638e-07, + "loss": 1.59369917, + "memory(GiB)": 128.51, + "step": 65290, + "train_speed(iter/s)": 1.636178 + }, + { + "acc": 0.66462326, + "epoch": 1.656392694063927, + "grad_norm": 5.15625, + "learning_rate": 7.854954246251306e-07, + "loss": 1.5429512, + "memory(GiB)": 128.51, + "step": 65295, + "train_speed(iter/s)": 1.636191 + }, + { + "acc": 0.66352501, + "epoch": 1.656519533231862, + "grad_norm": 6.0625, + "learning_rate": 7.849312856173242e-07, + "loss": 1.57256432, + "memory(GiB)": 128.51, + "step": 65300, + "train_speed(iter/s)": 1.636205 + }, + { + "acc": 0.65696092, + "epoch": 1.656646372399797, + "grad_norm": 5.21875, + "learning_rate": 7.8436733200706e-07, + "loss": 1.60008869, + "memory(GiB)": 128.51, + "step": 65305, + "train_speed(iter/s)": 1.636218 + }, + { + "acc": 0.64650974, + "epoch": 1.656773211567732, + "grad_norm": 5.90625, + "learning_rate": 7.838035638191432e-07, + "loss": 1.64540615, + "memory(GiB)": 128.51, + "step": 65310, + "train_speed(iter/s)": 1.63623 + }, + { + "acc": 0.65073004, + "epoch": 1.656900050735667, + "grad_norm": 6.25, + "learning_rate": 7.832399810783686e-07, + "loss": 1.62034454, + "memory(GiB)": 128.51, + "step": 65315, + "train_speed(iter/s)": 1.636244 + }, + { + "acc": 0.64747396, + "epoch": 1.6570268899036023, + "grad_norm": 4.90625, + "learning_rate": 7.826765838095246e-07, + "loss": 1.58713512, + "memory(GiB)": 128.51, + "step": 65320, + "train_speed(iter/s)": 1.636257 + }, + { + "acc": 0.67192745, + "epoch": 1.6571537290715372, + "grad_norm": 6.21875, + "learning_rate": 7.821133720373947e-07, + "loss": 1.60663223, + "memory(GiB)": 128.51, + "step": 65325, + "train_speed(iter/s)": 1.636271 + }, + { + "acc": 0.66326227, + "epoch": 1.6572805682394725, + "grad_norm": 5.875, + "learning_rate": 7.815503457867512e-07, + "loss": 1.6469099, + "memory(GiB)": 128.51, + "step": 65330, + "train_speed(iter/s)": 1.636284 + }, + { + "acc": 0.66206222, + "epoch": 1.6574074074074074, + "grad_norm": 6.03125, + "learning_rate": 7.809875050823556e-07, + "loss": 1.54736423, + "memory(GiB)": 128.51, + "step": 65335, + "train_speed(iter/s)": 1.636297 + }, + { + "acc": 0.64818468, + "epoch": 1.6575342465753424, + "grad_norm": 5.375, + "learning_rate": 7.804248499489669e-07, + "loss": 1.65572701, + "memory(GiB)": 128.51, + "step": 65340, + "train_speed(iter/s)": 1.636311 + }, + { + "acc": 0.66980877, + "epoch": 1.6576610857432774, + "grad_norm": 5.375, + "learning_rate": 7.798623804113326e-07, + "loss": 1.59281693, + "memory(GiB)": 128.51, + "step": 65345, + "train_speed(iter/s)": 1.636322 + }, + { + "acc": 0.65084968, + "epoch": 1.6577879249112126, + "grad_norm": 4.6875, + "learning_rate": 7.793000964941932e-07, + "loss": 1.59598341, + "memory(GiB)": 128.51, + "step": 65350, + "train_speed(iter/s)": 1.636335 + }, + { + "acc": 0.65423284, + "epoch": 1.6579147640791476, + "grad_norm": 5.53125, + "learning_rate": 7.787379982222776e-07, + "loss": 1.60357094, + "memory(GiB)": 128.51, + "step": 65355, + "train_speed(iter/s)": 1.636347 + }, + { + "acc": 0.65282068, + "epoch": 1.6580416032470828, + "grad_norm": 5.28125, + "learning_rate": 7.781760856203124e-07, + "loss": 1.56206036, + "memory(GiB)": 128.51, + "step": 65360, + "train_speed(iter/s)": 1.636359 + }, + { + "acc": 0.68612952, + "epoch": 1.6581684424150178, + "grad_norm": 5.5625, + "learning_rate": 7.77614358713012e-07, + "loss": 1.52576618, + "memory(GiB)": 128.51, + "step": 65365, + "train_speed(iter/s)": 1.636373 + }, + { + "acc": 0.66813354, + "epoch": 1.6582952815829528, + "grad_norm": 6.125, + "learning_rate": 7.770528175250835e-07, + "loss": 1.57223053, + "memory(GiB)": 128.51, + "step": 65370, + "train_speed(iter/s)": 1.636385 + }, + { + "acc": 0.64530592, + "epoch": 1.6584221207508878, + "grad_norm": 6.25, + "learning_rate": 7.764914620812269e-07, + "loss": 1.65743713, + "memory(GiB)": 128.51, + "step": 65375, + "train_speed(iter/s)": 1.636397 + }, + { + "acc": 0.64771333, + "epoch": 1.6585489599188228, + "grad_norm": 7.125, + "learning_rate": 7.759302924061318e-07, + "loss": 1.60482826, + "memory(GiB)": 128.51, + "step": 65380, + "train_speed(iter/s)": 1.63641 + }, + { + "acc": 0.64264994, + "epoch": 1.658675799086758, + "grad_norm": 5.34375, + "learning_rate": 7.753693085244818e-07, + "loss": 1.64855995, + "memory(GiB)": 128.51, + "step": 65385, + "train_speed(iter/s)": 1.636423 + }, + { + "acc": 0.65659723, + "epoch": 1.6588026382546932, + "grad_norm": 5.28125, + "learning_rate": 7.748085104609509e-07, + "loss": 1.66198273, + "memory(GiB)": 128.51, + "step": 65390, + "train_speed(iter/s)": 1.636436 + }, + { + "acc": 0.66503048, + "epoch": 1.6589294774226282, + "grad_norm": 7.0625, + "learning_rate": 7.742478982402063e-07, + "loss": 1.55654812, + "memory(GiB)": 128.51, + "step": 65395, + "train_speed(iter/s)": 1.636449 + }, + { + "acc": 0.65070424, + "epoch": 1.6590563165905632, + "grad_norm": 4.78125, + "learning_rate": 7.736874718869053e-07, + "loss": 1.66279659, + "memory(GiB)": 128.51, + "step": 65400, + "train_speed(iter/s)": 1.636462 + }, + { + "acc": 0.65407863, + "epoch": 1.6591831557584982, + "grad_norm": 6.28125, + "learning_rate": 7.731272314256988e-07, + "loss": 1.6149765, + "memory(GiB)": 128.51, + "step": 65405, + "train_speed(iter/s)": 1.636476 + }, + { + "acc": 0.65497122, + "epoch": 1.6593099949264332, + "grad_norm": 8.125, + "learning_rate": 7.725671768812282e-07, + "loss": 1.58684559, + "memory(GiB)": 128.51, + "step": 65410, + "train_speed(iter/s)": 1.636489 + }, + { + "acc": 0.64559765, + "epoch": 1.6594368340943684, + "grad_norm": 6.0625, + "learning_rate": 7.720073082781271e-07, + "loss": 1.61308193, + "memory(GiB)": 128.51, + "step": 65415, + "train_speed(iter/s)": 1.636503 + }, + { + "acc": 0.66281614, + "epoch": 1.6595636732623034, + "grad_norm": 4.6875, + "learning_rate": 7.714476256410214e-07, + "loss": 1.59173279, + "memory(GiB)": 128.51, + "step": 65420, + "train_speed(iter/s)": 1.636516 + }, + { + "acc": 0.6603837, + "epoch": 1.6596905124302386, + "grad_norm": 5.3125, + "learning_rate": 7.708881289945275e-07, + "loss": 1.58160429, + "memory(GiB)": 128.51, + "step": 65425, + "train_speed(iter/s)": 1.636529 + }, + { + "acc": 0.66036453, + "epoch": 1.6598173515981736, + "grad_norm": 5.15625, + "learning_rate": 7.703288183632567e-07, + "loss": 1.58553047, + "memory(GiB)": 128.51, + "step": 65430, + "train_speed(iter/s)": 1.636542 + }, + { + "acc": 0.64074917, + "epoch": 1.6599441907661086, + "grad_norm": 5.84375, + "learning_rate": 7.697696937718079e-07, + "loss": 1.70121021, + "memory(GiB)": 128.51, + "step": 65435, + "train_speed(iter/s)": 1.636554 + }, + { + "acc": 0.65881038, + "epoch": 1.6600710299340435, + "grad_norm": 6.34375, + "learning_rate": 7.692107552447748e-07, + "loss": 1.58281765, + "memory(GiB)": 128.51, + "step": 65440, + "train_speed(iter/s)": 1.636566 + }, + { + "acc": 0.67516508, + "epoch": 1.6601978691019785, + "grad_norm": 6.0, + "learning_rate": 7.686520028067406e-07, + "loss": 1.54906082, + "memory(GiB)": 128.51, + "step": 65445, + "train_speed(iter/s)": 1.63658 + }, + { + "acc": 0.65797296, + "epoch": 1.6603247082699137, + "grad_norm": 7.0, + "learning_rate": 7.680934364822851e-07, + "loss": 1.68625107, + "memory(GiB)": 128.51, + "step": 65450, + "train_speed(iter/s)": 1.636595 + }, + { + "acc": 0.66376686, + "epoch": 1.660451547437849, + "grad_norm": 6.125, + "learning_rate": 7.675350562959733e-07, + "loss": 1.60438538, + "memory(GiB)": 128.51, + "step": 65455, + "train_speed(iter/s)": 1.636609 + }, + { + "acc": 0.66321735, + "epoch": 1.660578386605784, + "grad_norm": 5.5, + "learning_rate": 7.66976862272365e-07, + "loss": 1.55235806, + "memory(GiB)": 128.51, + "step": 65460, + "train_speed(iter/s)": 1.636624 + }, + { + "acc": 0.6486279, + "epoch": 1.660705225773719, + "grad_norm": 5.6875, + "learning_rate": 7.664188544360146e-07, + "loss": 1.61041298, + "memory(GiB)": 128.51, + "step": 65465, + "train_speed(iter/s)": 1.636637 + }, + { + "acc": 0.67162523, + "epoch": 1.660832064941654, + "grad_norm": 7.3125, + "learning_rate": 7.658610328114658e-07, + "loss": 1.52537556, + "memory(GiB)": 128.51, + "step": 65470, + "train_speed(iter/s)": 1.63665 + }, + { + "acc": 0.65377564, + "epoch": 1.660958904109589, + "grad_norm": 6.03125, + "learning_rate": 7.653033974232504e-07, + "loss": 1.65487843, + "memory(GiB)": 128.51, + "step": 65475, + "train_speed(iter/s)": 1.636664 + }, + { + "acc": 0.65147114, + "epoch": 1.6610857432775241, + "grad_norm": 5.28125, + "learning_rate": 7.647459482958991e-07, + "loss": 1.6890976, + "memory(GiB)": 128.51, + "step": 65480, + "train_speed(iter/s)": 1.636676 + }, + { + "acc": 0.65547643, + "epoch": 1.661212582445459, + "grad_norm": 5.03125, + "learning_rate": 7.641886854539304e-07, + "loss": 1.61011543, + "memory(GiB)": 128.51, + "step": 65485, + "train_speed(iter/s)": 1.63669 + }, + { + "acc": 0.6762126, + "epoch": 1.6613394216133943, + "grad_norm": 4.9375, + "learning_rate": 7.63631608921856e-07, + "loss": 1.5608264, + "memory(GiB)": 128.51, + "step": 65490, + "train_speed(iter/s)": 1.636704 + }, + { + "acc": 0.65644288, + "epoch": 1.6614662607813293, + "grad_norm": 5.1875, + "learning_rate": 7.63074718724175e-07, + "loss": 1.57908649, + "memory(GiB)": 128.51, + "step": 65495, + "train_speed(iter/s)": 1.636717 + }, + { + "acc": 0.65521169, + "epoch": 1.6615930999492643, + "grad_norm": 5.125, + "learning_rate": 7.625180148853856e-07, + "loss": 1.5827076, + "memory(GiB)": 128.51, + "step": 65500, + "train_speed(iter/s)": 1.636731 + }, + { + "acc": 0.65709448, + "epoch": 1.6617199391171993, + "grad_norm": 6.25, + "learning_rate": 7.619614974299727e-07, + "loss": 1.59858389, + "memory(GiB)": 128.51, + "step": 65505, + "train_speed(iter/s)": 1.636745 + }, + { + "acc": 0.67462406, + "epoch": 1.6618467782851345, + "grad_norm": 6.25, + "learning_rate": 7.614051663824152e-07, + "loss": 1.50673265, + "memory(GiB)": 128.51, + "step": 65510, + "train_speed(iter/s)": 1.636758 + }, + { + "acc": 0.67008948, + "epoch": 1.6619736174530695, + "grad_norm": 5.75, + "learning_rate": 7.608490217671821e-07, + "loss": 1.52118893, + "memory(GiB)": 128.51, + "step": 65515, + "train_speed(iter/s)": 1.636771 + }, + { + "acc": 0.64825025, + "epoch": 1.6621004566210047, + "grad_norm": 6.03125, + "learning_rate": 7.602930636087352e-07, + "loss": 1.63918686, + "memory(GiB)": 128.51, + "step": 65520, + "train_speed(iter/s)": 1.636785 + }, + { + "acc": 0.66683917, + "epoch": 1.6622272957889397, + "grad_norm": 5.125, + "learning_rate": 7.597372919315288e-07, + "loss": 1.55445423, + "memory(GiB)": 128.51, + "step": 65525, + "train_speed(iter/s)": 1.636797 + }, + { + "acc": 0.65495896, + "epoch": 1.6623541349568747, + "grad_norm": 6.03125, + "learning_rate": 7.591817067600071e-07, + "loss": 1.63891392, + "memory(GiB)": 128.51, + "step": 65530, + "train_speed(iter/s)": 1.636811 + }, + { + "acc": 0.63826151, + "epoch": 1.6624809741248097, + "grad_norm": 6.03125, + "learning_rate": 7.586263081186085e-07, + "loss": 1.69268265, + "memory(GiB)": 128.51, + "step": 65535, + "train_speed(iter/s)": 1.636824 + }, + { + "acc": 0.6428556, + "epoch": 1.6626078132927447, + "grad_norm": 5.40625, + "learning_rate": 7.580710960317605e-07, + "loss": 1.65391922, + "memory(GiB)": 128.51, + "step": 65540, + "train_speed(iter/s)": 1.636838 + }, + { + "acc": 0.65940886, + "epoch": 1.6627346524606799, + "grad_norm": 6.25, + "learning_rate": 7.575160705238854e-07, + "loss": 1.59410992, + "memory(GiB)": 128.51, + "step": 65545, + "train_speed(iter/s)": 1.636852 + }, + { + "acc": 0.65870733, + "epoch": 1.662861491628615, + "grad_norm": 5.0, + "learning_rate": 7.569612316193942e-07, + "loss": 1.60653038, + "memory(GiB)": 128.51, + "step": 65550, + "train_speed(iter/s)": 1.636865 + }, + { + "acc": 0.65938873, + "epoch": 1.66298833079655, + "grad_norm": 5.5625, + "learning_rate": 7.564065793426923e-07, + "loss": 1.58472986, + "memory(GiB)": 128.51, + "step": 65555, + "train_speed(iter/s)": 1.636879 + }, + { + "acc": 0.65987911, + "epoch": 1.663115169964485, + "grad_norm": 5.4375, + "learning_rate": 7.558521137181752e-07, + "loss": 1.61114902, + "memory(GiB)": 128.51, + "step": 65560, + "train_speed(iter/s)": 1.636893 + }, + { + "acc": 0.66059217, + "epoch": 1.66324200913242, + "grad_norm": 7.625, + "learning_rate": 7.552978347702295e-07, + "loss": 1.53199158, + "memory(GiB)": 128.51, + "step": 65565, + "train_speed(iter/s)": 1.636906 + }, + { + "acc": 0.6578804, + "epoch": 1.663368848300355, + "grad_norm": 6.375, + "learning_rate": 7.547437425232384e-07, + "loss": 1.56972036, + "memory(GiB)": 128.51, + "step": 65570, + "train_speed(iter/s)": 1.636919 + }, + { + "acc": 0.65196123, + "epoch": 1.6634956874682902, + "grad_norm": 5.6875, + "learning_rate": 7.541898370015704e-07, + "loss": 1.67188797, + "memory(GiB)": 128.51, + "step": 65575, + "train_speed(iter/s)": 1.636933 + }, + { + "acc": 0.6683013, + "epoch": 1.6636225266362252, + "grad_norm": 5.84375, + "learning_rate": 7.536361182295893e-07, + "loss": 1.5742116, + "memory(GiB)": 128.51, + "step": 65580, + "train_speed(iter/s)": 1.636946 + }, + { + "acc": 0.65599442, + "epoch": 1.6637493658041604, + "grad_norm": 5.03125, + "learning_rate": 7.530825862316493e-07, + "loss": 1.6119175, + "memory(GiB)": 128.51, + "step": 65585, + "train_speed(iter/s)": 1.636959 + }, + { + "acc": 0.66936278, + "epoch": 1.6638762049720954, + "grad_norm": 5.1875, + "learning_rate": 7.525292410321011e-07, + "loss": 1.50604992, + "memory(GiB)": 128.51, + "step": 65590, + "train_speed(iter/s)": 1.636971 + }, + { + "acc": 0.6632875, + "epoch": 1.6640030441400304, + "grad_norm": 5.46875, + "learning_rate": 7.519760826552786e-07, + "loss": 1.60509377, + "memory(GiB)": 128.51, + "step": 65595, + "train_speed(iter/s)": 1.636984 + }, + { + "acc": 0.65209823, + "epoch": 1.6641298833079654, + "grad_norm": 5.375, + "learning_rate": 7.514231111255132e-07, + "loss": 1.57483101, + "memory(GiB)": 128.51, + "step": 65600, + "train_speed(iter/s)": 1.636996 + }, + { + "acc": 0.66512256, + "epoch": 1.6642567224759004, + "grad_norm": 5.15625, + "learning_rate": 7.508703264671291e-07, + "loss": 1.53045959, + "memory(GiB)": 128.51, + "step": 65605, + "train_speed(iter/s)": 1.637008 + }, + { + "acc": 0.64667196, + "epoch": 1.6643835616438356, + "grad_norm": 6.09375, + "learning_rate": 7.5031772870444e-07, + "loss": 1.62335701, + "memory(GiB)": 128.51, + "step": 65610, + "train_speed(iter/s)": 1.637021 + }, + { + "acc": 0.65590401, + "epoch": 1.6645104008117708, + "grad_norm": 5.5, + "learning_rate": 7.497653178617498e-07, + "loss": 1.62596111, + "memory(GiB)": 128.51, + "step": 65615, + "train_speed(iter/s)": 1.637034 + }, + { + "acc": 0.64788008, + "epoch": 1.6646372399797058, + "grad_norm": 5.0625, + "learning_rate": 7.492130939633557e-07, + "loss": 1.6896162, + "memory(GiB)": 128.51, + "step": 65620, + "train_speed(iter/s)": 1.637047 + }, + { + "acc": 0.63460608, + "epoch": 1.6647640791476408, + "grad_norm": 5.3125, + "learning_rate": 7.48661057033549e-07, + "loss": 1.6710701, + "memory(GiB)": 128.51, + "step": 65625, + "train_speed(iter/s)": 1.63706 + }, + { + "acc": 0.65772924, + "epoch": 1.6648909183155758, + "grad_norm": 4.8125, + "learning_rate": 7.481092070966111e-07, + "loss": 1.64268894, + "memory(GiB)": 128.51, + "step": 65630, + "train_speed(iter/s)": 1.637072 + }, + { + "acc": 0.65640893, + "epoch": 1.6650177574835108, + "grad_norm": 6.5625, + "learning_rate": 7.475575441768112e-07, + "loss": 1.60860653, + "memory(GiB)": 128.51, + "step": 65635, + "train_speed(iter/s)": 1.637085 + }, + { + "acc": 0.65400248, + "epoch": 1.665144596651446, + "grad_norm": 5.75, + "learning_rate": 7.470060682984176e-07, + "loss": 1.56483631, + "memory(GiB)": 128.51, + "step": 65640, + "train_speed(iter/s)": 1.637097 + }, + { + "acc": 0.65104542, + "epoch": 1.665271435819381, + "grad_norm": 5.4375, + "learning_rate": 7.464547794856858e-07, + "loss": 1.58781214, + "memory(GiB)": 128.51, + "step": 65645, + "train_speed(iter/s)": 1.637109 + }, + { + "acc": 0.65274248, + "epoch": 1.6653982749873162, + "grad_norm": 6.03125, + "learning_rate": 7.459036777628631e-07, + "loss": 1.55972958, + "memory(GiB)": 128.51, + "step": 65650, + "train_speed(iter/s)": 1.637122 + }, + { + "acc": 0.6407217, + "epoch": 1.6655251141552512, + "grad_norm": 6.375, + "learning_rate": 7.453527631541896e-07, + "loss": 1.61590977, + "memory(GiB)": 128.51, + "step": 65655, + "train_speed(iter/s)": 1.637135 + }, + { + "acc": 0.66780691, + "epoch": 1.6656519533231862, + "grad_norm": 6.28125, + "learning_rate": 7.448020356838975e-07, + "loss": 1.58678398, + "memory(GiB)": 128.51, + "step": 65660, + "train_speed(iter/s)": 1.637148 + }, + { + "acc": 0.65969954, + "epoch": 1.6657787924911212, + "grad_norm": 5.46875, + "learning_rate": 7.442514953762098e-07, + "loss": 1.58708534, + "memory(GiB)": 128.51, + "step": 65665, + "train_speed(iter/s)": 1.63716 + }, + { + "acc": 0.64949865, + "epoch": 1.6659056316590564, + "grad_norm": 5.125, + "learning_rate": 7.43701142255342e-07, + "loss": 1.61301956, + "memory(GiB)": 128.51, + "step": 65670, + "train_speed(iter/s)": 1.637173 + }, + { + "acc": 0.6605875, + "epoch": 1.6660324708269914, + "grad_norm": 5.75, + "learning_rate": 7.431509763455008e-07, + "loss": 1.53994608, + "memory(GiB)": 128.51, + "step": 65675, + "train_speed(iter/s)": 1.637186 + }, + { + "acc": 0.65858688, + "epoch": 1.6661593099949266, + "grad_norm": 6.46875, + "learning_rate": 7.426009976708854e-07, + "loss": 1.56550102, + "memory(GiB)": 128.51, + "step": 65680, + "train_speed(iter/s)": 1.637198 + }, + { + "acc": 0.64349213, + "epoch": 1.6662861491628616, + "grad_norm": 5.09375, + "learning_rate": 7.420512062556856e-07, + "loss": 1.65120335, + "memory(GiB)": 128.51, + "step": 65685, + "train_speed(iter/s)": 1.637211 + }, + { + "acc": 0.66316233, + "epoch": 1.6664129883307965, + "grad_norm": 6.34375, + "learning_rate": 7.415016021240845e-07, + "loss": 1.55864029, + "memory(GiB)": 128.51, + "step": 65690, + "train_speed(iter/s)": 1.637223 + }, + { + "acc": 0.67226272, + "epoch": 1.6665398274987315, + "grad_norm": 6.03125, + "learning_rate": 7.40952185300256e-07, + "loss": 1.56930151, + "memory(GiB)": 128.51, + "step": 65695, + "train_speed(iter/s)": 1.637236 + }, + { + "acc": 0.67184854, + "epoch": 1.6666666666666665, + "grad_norm": 5.0625, + "learning_rate": 7.404029558083653e-07, + "loss": 1.53423843, + "memory(GiB)": 128.51, + "step": 65700, + "train_speed(iter/s)": 1.637249 + }, + { + "acc": 0.6415741, + "epoch": 1.6667935058346017, + "grad_norm": 6.8125, + "learning_rate": 7.398539136725702e-07, + "loss": 1.65635262, + "memory(GiB)": 128.51, + "step": 65705, + "train_speed(iter/s)": 1.637262 + }, + { + "acc": 0.64650521, + "epoch": 1.666920345002537, + "grad_norm": 5.5625, + "learning_rate": 7.393050589170203e-07, + "loss": 1.61602917, + "memory(GiB)": 128.51, + "step": 65710, + "train_speed(iter/s)": 1.637275 + }, + { + "acc": 0.65448236, + "epoch": 1.667047184170472, + "grad_norm": 4.9375, + "learning_rate": 7.387563915658569e-07, + "loss": 1.57407494, + "memory(GiB)": 128.51, + "step": 65715, + "train_speed(iter/s)": 1.637287 + }, + { + "acc": 0.64957504, + "epoch": 1.667174023338407, + "grad_norm": 5.28125, + "learning_rate": 7.382079116432117e-07, + "loss": 1.61673355, + "memory(GiB)": 128.51, + "step": 65720, + "train_speed(iter/s)": 1.637299 + }, + { + "acc": 0.65638967, + "epoch": 1.667300862506342, + "grad_norm": 5.6875, + "learning_rate": 7.376596191732093e-07, + "loss": 1.58749752, + "memory(GiB)": 128.51, + "step": 65725, + "train_speed(iter/s)": 1.637313 + }, + { + "acc": 0.63733754, + "epoch": 1.667427701674277, + "grad_norm": 5.96875, + "learning_rate": 7.371115141799695e-07, + "loss": 1.65910378, + "memory(GiB)": 128.51, + "step": 65730, + "train_speed(iter/s)": 1.637327 + }, + { + "acc": 0.66136904, + "epoch": 1.667554540842212, + "grad_norm": 5.8125, + "learning_rate": 7.365635966875961e-07, + "loss": 1.60378284, + "memory(GiB)": 128.51, + "step": 65735, + "train_speed(iter/s)": 1.63734 + }, + { + "acc": 0.65051584, + "epoch": 1.667681380010147, + "grad_norm": 5.375, + "learning_rate": 7.3601586672019e-07, + "loss": 1.60557194, + "memory(GiB)": 128.51, + "step": 65740, + "train_speed(iter/s)": 1.637354 + }, + { + "acc": 0.66332998, + "epoch": 1.6678082191780823, + "grad_norm": 7.28125, + "learning_rate": 7.35468324301844e-07, + "loss": 1.58791981, + "memory(GiB)": 128.51, + "step": 65745, + "train_speed(iter/s)": 1.637366 + }, + { + "acc": 0.66530094, + "epoch": 1.6679350583460173, + "grad_norm": 7.5625, + "learning_rate": 7.349209694566422e-07, + "loss": 1.56388416, + "memory(GiB)": 128.51, + "step": 65750, + "train_speed(iter/s)": 1.63738 + }, + { + "acc": 0.66008701, + "epoch": 1.6680618975139523, + "grad_norm": 5.71875, + "learning_rate": 7.343738022086572e-07, + "loss": 1.58604755, + "memory(GiB)": 128.51, + "step": 65755, + "train_speed(iter/s)": 1.637393 + }, + { + "acc": 0.65272512, + "epoch": 1.6681887366818873, + "grad_norm": 5.59375, + "learning_rate": 7.338268225819562e-07, + "loss": 1.64771614, + "memory(GiB)": 128.51, + "step": 65760, + "train_speed(iter/s)": 1.637407 + }, + { + "acc": 0.63742986, + "epoch": 1.6683155758498223, + "grad_norm": 5.65625, + "learning_rate": 7.332800306005994e-07, + "loss": 1.6484024, + "memory(GiB)": 128.51, + "step": 65765, + "train_speed(iter/s)": 1.63742 + }, + { + "acc": 0.65031991, + "epoch": 1.6684424150177575, + "grad_norm": 5.375, + "learning_rate": 7.327334262886376e-07, + "loss": 1.68305264, + "memory(GiB)": 128.51, + "step": 65770, + "train_speed(iter/s)": 1.637434 + }, + { + "acc": 0.66134796, + "epoch": 1.6685692541856927, + "grad_norm": 5.59375, + "learning_rate": 7.321870096701095e-07, + "loss": 1.61196709, + "memory(GiB)": 128.51, + "step": 65775, + "train_speed(iter/s)": 1.637447 + }, + { + "acc": 0.66320267, + "epoch": 1.6686960933536277, + "grad_norm": 5.625, + "learning_rate": 7.316407807690523e-07, + "loss": 1.51767149, + "memory(GiB)": 128.51, + "step": 65780, + "train_speed(iter/s)": 1.63746 + }, + { + "acc": 0.6484231, + "epoch": 1.6688229325215627, + "grad_norm": 7.0625, + "learning_rate": 7.310947396094903e-07, + "loss": 1.65481453, + "memory(GiB)": 128.51, + "step": 65785, + "train_speed(iter/s)": 1.637473 + }, + { + "acc": 0.65878487, + "epoch": 1.6689497716894977, + "grad_norm": 5.8125, + "learning_rate": 7.30548886215442e-07, + "loss": 1.62875786, + "memory(GiB)": 128.51, + "step": 65790, + "train_speed(iter/s)": 1.637486 + }, + { + "acc": 0.66949759, + "epoch": 1.6690766108574326, + "grad_norm": 5.53125, + "learning_rate": 7.300032206109131e-07, + "loss": 1.53504972, + "memory(GiB)": 128.51, + "step": 65795, + "train_speed(iter/s)": 1.6375 + }, + { + "acc": 0.66699028, + "epoch": 1.6692034500253679, + "grad_norm": 5.34375, + "learning_rate": 7.294577428199084e-07, + "loss": 1.54227524, + "memory(GiB)": 128.51, + "step": 65800, + "train_speed(iter/s)": 1.637514 + }, + { + "acc": 0.64892025, + "epoch": 1.6693302891933028, + "grad_norm": 5.15625, + "learning_rate": 7.289124528664182e-07, + "loss": 1.70395527, + "memory(GiB)": 128.51, + "step": 65805, + "train_speed(iter/s)": 1.637528 + }, + { + "acc": 0.6476965, + "epoch": 1.669457128361238, + "grad_norm": 5.125, + "learning_rate": 7.283673507744276e-07, + "loss": 1.64203892, + "memory(GiB)": 128.51, + "step": 65810, + "train_speed(iter/s)": 1.637541 + }, + { + "acc": 0.64554396, + "epoch": 1.669583967529173, + "grad_norm": 7.0625, + "learning_rate": 7.278224365679121e-07, + "loss": 1.59112759, + "memory(GiB)": 128.51, + "step": 65815, + "train_speed(iter/s)": 1.637553 + }, + { + "acc": 0.6584939, + "epoch": 1.669710806697108, + "grad_norm": 7.09375, + "learning_rate": 7.2727771027084e-07, + "loss": 1.61457024, + "memory(GiB)": 128.51, + "step": 65820, + "train_speed(iter/s)": 1.637567 + }, + { + "acc": 0.65968752, + "epoch": 1.669837645865043, + "grad_norm": 5.59375, + "learning_rate": 7.267331719071707e-07, + "loss": 1.6631279, + "memory(GiB)": 128.51, + "step": 65825, + "train_speed(iter/s)": 1.637581 + }, + { + "acc": 0.66390309, + "epoch": 1.6699644850329782, + "grad_norm": 5.6875, + "learning_rate": 7.261888215008551e-07, + "loss": 1.56354103, + "memory(GiB)": 128.51, + "step": 65830, + "train_speed(iter/s)": 1.637595 + }, + { + "acc": 0.64683018, + "epoch": 1.6700913242009132, + "grad_norm": 5.25, + "learning_rate": 7.256446590758359e-07, + "loss": 1.67115707, + "memory(GiB)": 128.51, + "step": 65835, + "train_speed(iter/s)": 1.637608 + }, + { + "acc": 0.67179337, + "epoch": 1.6702181633688484, + "grad_norm": 7.625, + "learning_rate": 7.251006846560487e-07, + "loss": 1.51348686, + "memory(GiB)": 128.51, + "step": 65840, + "train_speed(iter/s)": 1.637621 + }, + { + "acc": 0.64702387, + "epoch": 1.6703450025367834, + "grad_norm": 5.875, + "learning_rate": 7.245568982654194e-07, + "loss": 1.64454041, + "memory(GiB)": 128.51, + "step": 65845, + "train_speed(iter/s)": 1.637634 + }, + { + "acc": 0.6678637, + "epoch": 1.6704718417047184, + "grad_norm": 6.15625, + "learning_rate": 7.24013299927866e-07, + "loss": 1.63014526, + "memory(GiB)": 128.51, + "step": 65850, + "train_speed(iter/s)": 1.637647 + }, + { + "acc": 0.6566741, + "epoch": 1.6705986808726534, + "grad_norm": 6.71875, + "learning_rate": 7.234698896672987e-07, + "loss": 1.64499893, + "memory(GiB)": 128.51, + "step": 65855, + "train_speed(iter/s)": 1.63766 + }, + { + "acc": 0.65980058, + "epoch": 1.6707255200405884, + "grad_norm": 5.34375, + "learning_rate": 7.229266675076191e-07, + "loss": 1.59843178, + "memory(GiB)": 128.51, + "step": 65860, + "train_speed(iter/s)": 1.637672 + }, + { + "acc": 0.66773119, + "epoch": 1.6708523592085236, + "grad_norm": 6.03125, + "learning_rate": 7.223836334727191e-07, + "loss": 1.51826258, + "memory(GiB)": 128.51, + "step": 65865, + "train_speed(iter/s)": 1.637685 + }, + { + "acc": 0.65239058, + "epoch": 1.6709791983764588, + "grad_norm": 7.8125, + "learning_rate": 7.218407875864875e-07, + "loss": 1.591014, + "memory(GiB)": 128.51, + "step": 65870, + "train_speed(iter/s)": 1.637697 + }, + { + "acc": 0.65236731, + "epoch": 1.6711060375443938, + "grad_norm": 6.375, + "learning_rate": 7.212981298727972e-07, + "loss": 1.6539753, + "memory(GiB)": 128.51, + "step": 65875, + "train_speed(iter/s)": 1.63771 + }, + { + "acc": 0.64987974, + "epoch": 1.6712328767123288, + "grad_norm": 5.8125, + "learning_rate": 7.207556603555188e-07, + "loss": 1.66034451, + "memory(GiB)": 128.51, + "step": 65880, + "train_speed(iter/s)": 1.637724 + }, + { + "acc": 0.63892794, + "epoch": 1.6713597158802638, + "grad_norm": 5.84375, + "learning_rate": 7.202133790585103e-07, + "loss": 1.62107201, + "memory(GiB)": 128.51, + "step": 65885, + "train_speed(iter/s)": 1.637737 + }, + { + "acc": 0.65837064, + "epoch": 1.6714865550481988, + "grad_norm": 7.09375, + "learning_rate": 7.196712860056277e-07, + "loss": 1.58579884, + "memory(GiB)": 128.51, + "step": 65890, + "train_speed(iter/s)": 1.637749 + }, + { + "acc": 0.65704942, + "epoch": 1.671613394216134, + "grad_norm": 4.53125, + "learning_rate": 7.191293812207111e-07, + "loss": 1.63663578, + "memory(GiB)": 128.51, + "step": 65895, + "train_speed(iter/s)": 1.637761 + }, + { + "acc": 0.6690927, + "epoch": 1.671740233384069, + "grad_norm": 5.46875, + "learning_rate": 7.185876647275958e-07, + "loss": 1.53855295, + "memory(GiB)": 128.51, + "step": 65900, + "train_speed(iter/s)": 1.637773 + }, + { + "acc": 0.65507488, + "epoch": 1.6718670725520042, + "grad_norm": 6.6875, + "learning_rate": 7.180461365501113e-07, + "loss": 1.59777317, + "memory(GiB)": 128.51, + "step": 65905, + "train_speed(iter/s)": 1.637785 + }, + { + "acc": 0.64536839, + "epoch": 1.6719939117199392, + "grad_norm": 5.65625, + "learning_rate": 7.175047967120763e-07, + "loss": 1.5690485, + "memory(GiB)": 128.51, + "step": 65910, + "train_speed(iter/s)": 1.637798 + }, + { + "acc": 0.66415539, + "epoch": 1.6721207508878742, + "grad_norm": 5.875, + "learning_rate": 7.16963645237298e-07, + "loss": 1.56981068, + "memory(GiB)": 128.51, + "step": 65915, + "train_speed(iter/s)": 1.637811 + }, + { + "acc": 0.67045584, + "epoch": 1.6722475900558091, + "grad_norm": 7.0625, + "learning_rate": 7.164226821495824e-07, + "loss": 1.49785137, + "memory(GiB)": 128.51, + "step": 65920, + "train_speed(iter/s)": 1.637824 + }, + { + "acc": 0.65235615, + "epoch": 1.6723744292237441, + "grad_norm": 5.46875, + "learning_rate": 7.158819074727219e-07, + "loss": 1.63071136, + "memory(GiB)": 128.51, + "step": 65925, + "train_speed(iter/s)": 1.637837 + }, + { + "acc": 0.65252352, + "epoch": 1.6725012683916793, + "grad_norm": 9.0625, + "learning_rate": 7.153413212305032e-07, + "loss": 1.59601116, + "memory(GiB)": 128.51, + "step": 65930, + "train_speed(iter/s)": 1.637849 + }, + { + "acc": 0.64923925, + "epoch": 1.6726281075596146, + "grad_norm": 7.5, + "learning_rate": 7.148009234467007e-07, + "loss": 1.63145771, + "memory(GiB)": 128.51, + "step": 65935, + "train_speed(iter/s)": 1.637862 + }, + { + "acc": 0.64023819, + "epoch": 1.6727549467275495, + "grad_norm": 5.6875, + "learning_rate": 7.142607141450869e-07, + "loss": 1.64850807, + "memory(GiB)": 128.51, + "step": 65940, + "train_speed(iter/s)": 1.637875 + }, + { + "acc": 0.6492919, + "epoch": 1.6728817858954845, + "grad_norm": 6.875, + "learning_rate": 7.137206933494211e-07, + "loss": 1.61210518, + "memory(GiB)": 128.51, + "step": 65945, + "train_speed(iter/s)": 1.637888 + }, + { + "acc": 0.6444777, + "epoch": 1.6730086250634195, + "grad_norm": 5.59375, + "learning_rate": 7.131808610834567e-07, + "loss": 1.63769684, + "memory(GiB)": 128.51, + "step": 65950, + "train_speed(iter/s)": 1.637899 + }, + { + "acc": 0.65460792, + "epoch": 1.6731354642313545, + "grad_norm": 5.28125, + "learning_rate": 7.126412173709369e-07, + "loss": 1.57896786, + "memory(GiB)": 128.51, + "step": 65955, + "train_speed(iter/s)": 1.637912 + }, + { + "acc": 0.65432692, + "epoch": 1.6732623033992897, + "grad_norm": 5.59375, + "learning_rate": 7.12101762235598e-07, + "loss": 1.6123373, + "memory(GiB)": 128.51, + "step": 65960, + "train_speed(iter/s)": 1.637924 + }, + { + "acc": 0.65945477, + "epoch": 1.6733891425672247, + "grad_norm": 6.09375, + "learning_rate": 7.115624957011674e-07, + "loss": 1.55556021, + "memory(GiB)": 128.51, + "step": 65965, + "train_speed(iter/s)": 1.637937 + }, + { + "acc": 0.65260043, + "epoch": 1.67351598173516, + "grad_norm": 5.96875, + "learning_rate": 7.11023417791365e-07, + "loss": 1.55330524, + "memory(GiB)": 128.51, + "step": 65970, + "train_speed(iter/s)": 1.637949 + }, + { + "acc": 0.65940013, + "epoch": 1.673642820903095, + "grad_norm": 4.75, + "learning_rate": 7.104845285299017e-07, + "loss": 1.6362566, + "memory(GiB)": 128.51, + "step": 65975, + "train_speed(iter/s)": 1.637961 + }, + { + "acc": 0.65371928, + "epoch": 1.67376966007103, + "grad_norm": 5.625, + "learning_rate": 7.099458279404797e-07, + "loss": 1.6317297, + "memory(GiB)": 128.51, + "step": 65980, + "train_speed(iter/s)": 1.637974 + }, + { + "acc": 0.66153316, + "epoch": 1.6738964992389649, + "grad_norm": 5.375, + "learning_rate": 7.094073160467945e-07, + "loss": 1.57646141, + "memory(GiB)": 128.51, + "step": 65985, + "train_speed(iter/s)": 1.637985 + }, + { + "acc": 0.6506083, + "epoch": 1.6740233384069, + "grad_norm": 5.5, + "learning_rate": 7.088689928725311e-07, + "loss": 1.61677818, + "memory(GiB)": 128.51, + "step": 65990, + "train_speed(iter/s)": 1.637997 + }, + { + "acc": 0.65889263, + "epoch": 1.674150177574835, + "grad_norm": 5.5, + "learning_rate": 7.083308584413684e-07, + "loss": 1.63257027, + "memory(GiB)": 128.51, + "step": 65995, + "train_speed(iter/s)": 1.63801 + }, + { + "acc": 0.65007911, + "epoch": 1.6742770167427703, + "grad_norm": 5.59375, + "learning_rate": 7.077929127769756e-07, + "loss": 1.58507204, + "memory(GiB)": 128.51, + "step": 66000, + "train_speed(iter/s)": 1.638023 + }, + { + "epoch": 1.6742770167427703, + "eval_acc": 0.6463461952106787, + "eval_loss": 1.5732182264328003, + "eval_runtime": 58.7129, + "eval_samples_per_second": 108.494, + "eval_steps_per_second": 27.132, + "step": 66000 + }, + { + "acc": 0.65895338, + "epoch": 1.6744038559107053, + "grad_norm": 6.46875, + "learning_rate": 7.072551559030122e-07, + "loss": 1.56474609, + "memory(GiB)": 128.51, + "step": 66005, + "train_speed(iter/s)": 1.635478 + }, + { + "acc": 0.65693903, + "epoch": 1.6745306950786403, + "grad_norm": 5.65625, + "learning_rate": 7.067175878431353e-07, + "loss": 1.54142103, + "memory(GiB)": 128.51, + "step": 66010, + "train_speed(iter/s)": 1.635487 + }, + { + "acc": 0.65787177, + "epoch": 1.6746575342465753, + "grad_norm": 5.6875, + "learning_rate": 7.061802086209857e-07, + "loss": 1.61344299, + "memory(GiB)": 128.51, + "step": 66015, + "train_speed(iter/s)": 1.6355 + }, + { + "acc": 0.64370451, + "epoch": 1.6747843734145103, + "grad_norm": 4.75, + "learning_rate": 7.056430182602008e-07, + "loss": 1.6362175, + "memory(GiB)": 128.51, + "step": 66020, + "train_speed(iter/s)": 1.635513 + }, + { + "acc": 0.66851707, + "epoch": 1.6749112125824455, + "grad_norm": 5.9375, + "learning_rate": 7.051060167844081e-07, + "loss": 1.57702732, + "memory(GiB)": 128.51, + "step": 66025, + "train_speed(iter/s)": 1.635526 + }, + { + "acc": 0.66495824, + "epoch": 1.6750380517503807, + "grad_norm": 6.90625, + "learning_rate": 7.045692042172309e-07, + "loss": 1.59467278, + "memory(GiB)": 128.51, + "step": 66030, + "train_speed(iter/s)": 1.635541 + }, + { + "acc": 0.6661654, + "epoch": 1.6751648909183157, + "grad_norm": 6.34375, + "learning_rate": 7.040325805822756e-07, + "loss": 1.54114189, + "memory(GiB)": 128.51, + "step": 66035, + "train_speed(iter/s)": 1.635554 + }, + { + "acc": 0.64201546, + "epoch": 1.6752917300862507, + "grad_norm": 4.875, + "learning_rate": 7.03496145903147e-07, + "loss": 1.6417347, + "memory(GiB)": 128.51, + "step": 66040, + "train_speed(iter/s)": 1.635568 + }, + { + "acc": 0.66140008, + "epoch": 1.6754185692541856, + "grad_norm": 5.15625, + "learning_rate": 7.029599002034415e-07, + "loss": 1.5959465, + "memory(GiB)": 128.51, + "step": 66045, + "train_speed(iter/s)": 1.635581 + }, + { + "acc": 0.653299, + "epoch": 1.6755454084221206, + "grad_norm": 5.8125, + "learning_rate": 7.02423843506746e-07, + "loss": 1.59394512, + "memory(GiB)": 128.51, + "step": 66050, + "train_speed(iter/s)": 1.635595 + }, + { + "acc": 0.65036874, + "epoch": 1.6756722475900558, + "grad_norm": 7.25, + "learning_rate": 7.018879758366354e-07, + "loss": 1.59594193, + "memory(GiB)": 128.51, + "step": 66055, + "train_speed(iter/s)": 1.635608 + }, + { + "acc": 0.66116576, + "epoch": 1.6757990867579908, + "grad_norm": 6.03125, + "learning_rate": 7.013522972166803e-07, + "loss": 1.56445589, + "memory(GiB)": 128.51, + "step": 66060, + "train_speed(iter/s)": 1.635621 + }, + { + "acc": 0.65793242, + "epoch": 1.675925925925926, + "grad_norm": 8.125, + "learning_rate": 7.008168076704447e-07, + "loss": 1.57527142, + "memory(GiB)": 128.51, + "step": 66065, + "train_speed(iter/s)": 1.635633 + }, + { + "acc": 0.65877395, + "epoch": 1.676052765093861, + "grad_norm": 6.21875, + "learning_rate": 7.002815072214814e-07, + "loss": 1.59430256, + "memory(GiB)": 128.51, + "step": 66070, + "train_speed(iter/s)": 1.635646 + }, + { + "acc": 0.66328082, + "epoch": 1.676179604261796, + "grad_norm": 4.96875, + "learning_rate": 6.997463958933315e-07, + "loss": 1.56871204, + "memory(GiB)": 128.51, + "step": 66075, + "train_speed(iter/s)": 1.63566 + }, + { + "acc": 0.66457644, + "epoch": 1.676306443429731, + "grad_norm": 4.84375, + "learning_rate": 6.992114737095362e-07, + "loss": 1.58344975, + "memory(GiB)": 128.51, + "step": 66080, + "train_speed(iter/s)": 1.635674 + }, + { + "acc": 0.65482426, + "epoch": 1.676433282597666, + "grad_norm": 5.4375, + "learning_rate": 6.986767406936212e-07, + "loss": 1.5526042, + "memory(GiB)": 128.51, + "step": 66085, + "train_speed(iter/s)": 1.635687 + }, + { + "acc": 0.65471187, + "epoch": 1.6765601217656012, + "grad_norm": 5.65625, + "learning_rate": 6.981421968691077e-07, + "loss": 1.62837772, + "memory(GiB)": 128.51, + "step": 66090, + "train_speed(iter/s)": 1.635702 + }, + { + "acc": 0.64523411, + "epoch": 1.6766869609335364, + "grad_norm": 6.875, + "learning_rate": 6.976078422595067e-07, + "loss": 1.65894051, + "memory(GiB)": 128.51, + "step": 66095, + "train_speed(iter/s)": 1.635715 + }, + { + "acc": 0.66164365, + "epoch": 1.6768138001014714, + "grad_norm": 4.6875, + "learning_rate": 6.970736768883219e-07, + "loss": 1.59820232, + "memory(GiB)": 128.51, + "step": 66100, + "train_speed(iter/s)": 1.635729 + }, + { + "acc": 0.66650925, + "epoch": 1.6769406392694064, + "grad_norm": 5.4375, + "learning_rate": 6.965397007790476e-07, + "loss": 1.59362192, + "memory(GiB)": 128.51, + "step": 66105, + "train_speed(iter/s)": 1.635742 + }, + { + "acc": 0.68512206, + "epoch": 1.6770674784373414, + "grad_norm": 5.375, + "learning_rate": 6.960059139551706e-07, + "loss": 1.49706717, + "memory(GiB)": 128.51, + "step": 66110, + "train_speed(iter/s)": 1.635755 + }, + { + "acc": 0.65982838, + "epoch": 1.6771943176052764, + "grad_norm": 7.21875, + "learning_rate": 6.9547231644017e-07, + "loss": 1.62524433, + "memory(GiB)": 128.51, + "step": 66115, + "train_speed(iter/s)": 1.635769 + }, + { + "acc": 0.64518566, + "epoch": 1.6773211567732116, + "grad_norm": 6.59375, + "learning_rate": 6.949389082575148e-07, + "loss": 1.63537827, + "memory(GiB)": 128.51, + "step": 66120, + "train_speed(iter/s)": 1.635783 + }, + { + "acc": 0.65603218, + "epoch": 1.6774479959411466, + "grad_norm": 6.84375, + "learning_rate": 6.944056894306672e-07, + "loss": 1.57258205, + "memory(GiB)": 128.51, + "step": 66125, + "train_speed(iter/s)": 1.635796 + }, + { + "acc": 0.6519145, + "epoch": 1.6775748351090818, + "grad_norm": 4.78125, + "learning_rate": 6.938726599830808e-07, + "loss": 1.70210648, + "memory(GiB)": 128.51, + "step": 66130, + "train_speed(iter/s)": 1.635808 + }, + { + "acc": 0.64451842, + "epoch": 1.6777016742770168, + "grad_norm": 5.8125, + "learning_rate": 6.933398199382002e-07, + "loss": 1.6237957, + "memory(GiB)": 128.51, + "step": 66135, + "train_speed(iter/s)": 1.635821 + }, + { + "acc": 0.66308241, + "epoch": 1.6778285134449518, + "grad_norm": 8.1875, + "learning_rate": 6.928071693194616e-07, + "loss": 1.62530499, + "memory(GiB)": 128.51, + "step": 66140, + "train_speed(iter/s)": 1.635833 + }, + { + "acc": 0.65866814, + "epoch": 1.6779553526128868, + "grad_norm": 4.96875, + "learning_rate": 6.922747081502945e-07, + "loss": 1.60301971, + "memory(GiB)": 128.51, + "step": 66145, + "train_speed(iter/s)": 1.635845 + }, + { + "acc": 0.67551317, + "epoch": 1.678082191780822, + "grad_norm": 5.8125, + "learning_rate": 6.91742436454118e-07, + "loss": 1.5376049, + "memory(GiB)": 128.51, + "step": 66150, + "train_speed(iter/s)": 1.635859 + }, + { + "acc": 0.66165857, + "epoch": 1.678209030948757, + "grad_norm": 5.5, + "learning_rate": 6.912103542543446e-07, + "loss": 1.6255394, + "memory(GiB)": 128.51, + "step": 66155, + "train_speed(iter/s)": 1.635872 + }, + { + "acc": 0.6597002, + "epoch": 1.6783358701166922, + "grad_norm": 5.71875, + "learning_rate": 6.906784615743772e-07, + "loss": 1.5963603, + "memory(GiB)": 128.51, + "step": 66160, + "train_speed(iter/s)": 1.635885 + }, + { + "acc": 0.66195235, + "epoch": 1.6784627092846272, + "grad_norm": 6.0625, + "learning_rate": 6.901467584376093e-07, + "loss": 1.5107254, + "memory(GiB)": 128.51, + "step": 66165, + "train_speed(iter/s)": 1.635898 + }, + { + "acc": 0.64265833, + "epoch": 1.6785895484525621, + "grad_norm": 5.75, + "learning_rate": 6.896152448674315e-07, + "loss": 1.59157715, + "memory(GiB)": 128.51, + "step": 66170, + "train_speed(iter/s)": 1.635911 + }, + { + "acc": 0.6388649, + "epoch": 1.6787163876204971, + "grad_norm": 5.96875, + "learning_rate": 6.890839208872185e-07, + "loss": 1.64452457, + "memory(GiB)": 128.51, + "step": 66175, + "train_speed(iter/s)": 1.635923 + }, + { + "acc": 0.67246189, + "epoch": 1.6788432267884321, + "grad_norm": 6.75, + "learning_rate": 6.885527865203401e-07, + "loss": 1.55935669, + "memory(GiB)": 128.51, + "step": 66180, + "train_speed(iter/s)": 1.635936 + }, + { + "acc": 0.66789808, + "epoch": 1.6789700659563673, + "grad_norm": 5.34375, + "learning_rate": 6.880218417901608e-07, + "loss": 1.59915428, + "memory(GiB)": 128.51, + "step": 66185, + "train_speed(iter/s)": 1.635949 + }, + { + "acc": 0.6600852, + "epoch": 1.6790969051243025, + "grad_norm": 5.8125, + "learning_rate": 6.874910867200341e-07, + "loss": 1.62683716, + "memory(GiB)": 128.51, + "step": 66190, + "train_speed(iter/s)": 1.635962 + }, + { + "acc": 0.68290262, + "epoch": 1.6792237442922375, + "grad_norm": 5.78125, + "learning_rate": 6.869605213333014e-07, + "loss": 1.51938848, + "memory(GiB)": 128.51, + "step": 66195, + "train_speed(iter/s)": 1.635973 + }, + { + "acc": 0.66223049, + "epoch": 1.6793505834601725, + "grad_norm": 4.71875, + "learning_rate": 6.864301456533007e-07, + "loss": 1.58650331, + "memory(GiB)": 128.51, + "step": 66200, + "train_speed(iter/s)": 1.635986 + }, + { + "acc": 0.64770937, + "epoch": 1.6794774226281075, + "grad_norm": 5.875, + "learning_rate": 6.858999597033617e-07, + "loss": 1.65537148, + "memory(GiB)": 128.51, + "step": 66205, + "train_speed(iter/s)": 1.635999 + }, + { + "acc": 0.64754248, + "epoch": 1.6796042617960425, + "grad_norm": 5.21875, + "learning_rate": 6.85369963506805e-07, + "loss": 1.69379807, + "memory(GiB)": 128.51, + "step": 66210, + "train_speed(iter/s)": 1.636012 + }, + { + "acc": 0.63429809, + "epoch": 1.6797311009639777, + "grad_norm": 5.0625, + "learning_rate": 6.848401570869384e-07, + "loss": 1.69971447, + "memory(GiB)": 128.51, + "step": 66215, + "train_speed(iter/s)": 1.636025 + }, + { + "acc": 0.65975738, + "epoch": 1.6798579401319127, + "grad_norm": 5.875, + "learning_rate": 6.843105404670685e-07, + "loss": 1.54582596, + "memory(GiB)": 128.51, + "step": 66220, + "train_speed(iter/s)": 1.636038 + }, + { + "acc": 0.64358845, + "epoch": 1.679984779299848, + "grad_norm": 6.09375, + "learning_rate": 6.837811136704892e-07, + "loss": 1.63169899, + "memory(GiB)": 128.51, + "step": 66225, + "train_speed(iter/s)": 1.63605 + }, + { + "acc": 0.64815269, + "epoch": 1.680111618467783, + "grad_norm": 4.9375, + "learning_rate": 6.832518767204882e-07, + "loss": 1.58104324, + "memory(GiB)": 128.51, + "step": 66230, + "train_speed(iter/s)": 1.636062 + }, + { + "acc": 0.64621162, + "epoch": 1.6802384576357179, + "grad_norm": 5.3125, + "learning_rate": 6.827228296403405e-07, + "loss": 1.67636414, + "memory(GiB)": 128.51, + "step": 66235, + "train_speed(iter/s)": 1.636076 + }, + { + "acc": 0.65409412, + "epoch": 1.6803652968036529, + "grad_norm": 6.34375, + "learning_rate": 6.821939724533189e-07, + "loss": 1.63334789, + "memory(GiB)": 128.51, + "step": 66240, + "train_speed(iter/s)": 1.636088 + }, + { + "acc": 0.67309103, + "epoch": 1.6804921359715879, + "grad_norm": 6.0, + "learning_rate": 6.816653051826838e-07, + "loss": 1.57870569, + "memory(GiB)": 128.51, + "step": 66245, + "train_speed(iter/s)": 1.636102 + }, + { + "acc": 0.66775198, + "epoch": 1.680618975139523, + "grad_norm": 5.6875, + "learning_rate": 6.811368278516889e-07, + "loss": 1.58345413, + "memory(GiB)": 128.51, + "step": 66250, + "train_speed(iter/s)": 1.636115 + }, + { + "acc": 0.66788878, + "epoch": 1.6807458143074583, + "grad_norm": 8.75, + "learning_rate": 6.806085404835788e-07, + "loss": 1.59038954, + "memory(GiB)": 128.51, + "step": 66255, + "train_speed(iter/s)": 1.63613 + }, + { + "acc": 0.6525846, + "epoch": 1.6808726534753933, + "grad_norm": 8.1875, + "learning_rate": 6.800804431015895e-07, + "loss": 1.6459919, + "memory(GiB)": 128.51, + "step": 66260, + "train_speed(iter/s)": 1.636143 + }, + { + "acc": 0.6666266, + "epoch": 1.6809994926433283, + "grad_norm": 6.4375, + "learning_rate": 6.795525357289496e-07, + "loss": 1.5648243, + "memory(GiB)": 128.51, + "step": 66265, + "train_speed(iter/s)": 1.636156 + }, + { + "acc": 0.64701176, + "epoch": 1.6811263318112633, + "grad_norm": 5.34375, + "learning_rate": 6.790248183888781e-07, + "loss": 1.63595924, + "memory(GiB)": 128.51, + "step": 66270, + "train_speed(iter/s)": 1.636169 + }, + { + "acc": 0.66199794, + "epoch": 1.6812531709791982, + "grad_norm": 5.15625, + "learning_rate": 6.784972911045872e-07, + "loss": 1.61182404, + "memory(GiB)": 128.51, + "step": 66275, + "train_speed(iter/s)": 1.636182 + }, + { + "acc": 0.65428448, + "epoch": 1.6813800101471335, + "grad_norm": 5.90625, + "learning_rate": 6.779699538992796e-07, + "loss": 1.5914463, + "memory(GiB)": 128.51, + "step": 66280, + "train_speed(iter/s)": 1.636195 + }, + { + "acc": 0.65789757, + "epoch": 1.6815068493150684, + "grad_norm": 6.4375, + "learning_rate": 6.774428067961502e-07, + "loss": 1.65470734, + "memory(GiB)": 128.51, + "step": 66285, + "train_speed(iter/s)": 1.636208 + }, + { + "acc": 0.64024372, + "epoch": 1.6816336884830037, + "grad_norm": 6.09375, + "learning_rate": 6.769158498183842e-07, + "loss": 1.68790245, + "memory(GiB)": 128.51, + "step": 66290, + "train_speed(iter/s)": 1.636222 + }, + { + "acc": 0.67159791, + "epoch": 1.6817605276509386, + "grad_norm": 5.21875, + "learning_rate": 6.763890829891611e-07, + "loss": 1.58680639, + "memory(GiB)": 128.51, + "step": 66295, + "train_speed(iter/s)": 1.636234 + }, + { + "acc": 0.64143753, + "epoch": 1.6818873668188736, + "grad_norm": 5.28125, + "learning_rate": 6.758625063316493e-07, + "loss": 1.63990402, + "memory(GiB)": 128.51, + "step": 66300, + "train_speed(iter/s)": 1.636246 + }, + { + "acc": 0.67320423, + "epoch": 1.6820142059868086, + "grad_norm": 6.09375, + "learning_rate": 6.75336119869009e-07, + "loss": 1.5223978, + "memory(GiB)": 128.51, + "step": 66305, + "train_speed(iter/s)": 1.636258 + }, + { + "acc": 0.64519243, + "epoch": 1.6821410451547438, + "grad_norm": 5.75, + "learning_rate": 6.748099236243971e-07, + "loss": 1.6092411, + "memory(GiB)": 128.51, + "step": 66310, + "train_speed(iter/s)": 1.63627 + }, + { + "acc": 0.64731922, + "epoch": 1.6822678843226788, + "grad_norm": 6.0, + "learning_rate": 6.742839176209537e-07, + "loss": 1.64334335, + "memory(GiB)": 128.51, + "step": 66315, + "train_speed(iter/s)": 1.636283 + }, + { + "acc": 0.64213443, + "epoch": 1.682394723490614, + "grad_norm": 5.875, + "learning_rate": 6.737581018818167e-07, + "loss": 1.62842293, + "memory(GiB)": 128.51, + "step": 66320, + "train_speed(iter/s)": 1.636296 + }, + { + "acc": 0.65887785, + "epoch": 1.682521562658549, + "grad_norm": 7.78125, + "learning_rate": 6.732324764301129e-07, + "loss": 1.62973042, + "memory(GiB)": 128.51, + "step": 66325, + "train_speed(iter/s)": 1.636308 + }, + { + "acc": 0.65739026, + "epoch": 1.682648401826484, + "grad_norm": 5.15625, + "learning_rate": 6.727070412889647e-07, + "loss": 1.61480217, + "memory(GiB)": 128.51, + "step": 66330, + "train_speed(iter/s)": 1.636321 + }, + { + "acc": 0.66327405, + "epoch": 1.682775240994419, + "grad_norm": 6.15625, + "learning_rate": 6.721817964814792e-07, + "loss": 1.53326387, + "memory(GiB)": 128.51, + "step": 66335, + "train_speed(iter/s)": 1.636334 + }, + { + "acc": 0.67155638, + "epoch": 1.682902080162354, + "grad_norm": 5.65625, + "learning_rate": 6.716567420307596e-07, + "loss": 1.54714222, + "memory(GiB)": 128.51, + "step": 66340, + "train_speed(iter/s)": 1.636348 + }, + { + "acc": 0.65686874, + "epoch": 1.6830289193302892, + "grad_norm": 5.25, + "learning_rate": 6.711318779599025e-07, + "loss": 1.50884895, + "memory(GiB)": 128.51, + "step": 66345, + "train_speed(iter/s)": 1.63636 + }, + { + "acc": 0.67943659, + "epoch": 1.6831557584982244, + "grad_norm": 7.8125, + "learning_rate": 6.706072042919931e-07, + "loss": 1.48498478, + "memory(GiB)": 128.51, + "step": 66350, + "train_speed(iter/s)": 1.636373 + }, + { + "acc": 0.64314775, + "epoch": 1.6832825976661594, + "grad_norm": 5.9375, + "learning_rate": 6.700827210501065e-07, + "loss": 1.63353691, + "memory(GiB)": 128.51, + "step": 66355, + "train_speed(iter/s)": 1.636387 + }, + { + "acc": 0.66073294, + "epoch": 1.6834094368340944, + "grad_norm": 5.625, + "learning_rate": 6.695584282573142e-07, + "loss": 1.59654312, + "memory(GiB)": 128.51, + "step": 66360, + "train_speed(iter/s)": 1.636399 + }, + { + "acc": 0.65005159, + "epoch": 1.6835362760020294, + "grad_norm": 5.34375, + "learning_rate": 6.690343259366766e-07, + "loss": 1.5954752, + "memory(GiB)": 128.51, + "step": 66365, + "train_speed(iter/s)": 1.636413 + }, + { + "acc": 0.64390888, + "epoch": 1.6836631151699644, + "grad_norm": 4.96875, + "learning_rate": 6.685104141112464e-07, + "loss": 1.68616505, + "memory(GiB)": 128.51, + "step": 66370, + "train_speed(iter/s)": 1.636426 + }, + { + "acc": 0.67171893, + "epoch": 1.6837899543378996, + "grad_norm": 5.75, + "learning_rate": 6.679866928040651e-07, + "loss": 1.53451977, + "memory(GiB)": 128.51, + "step": 66375, + "train_speed(iter/s)": 1.636438 + }, + { + "acc": 0.6437346, + "epoch": 1.6839167935058346, + "grad_norm": 6.5, + "learning_rate": 6.674631620381711e-07, + "loss": 1.68510971, + "memory(GiB)": 128.51, + "step": 66380, + "train_speed(iter/s)": 1.636451 + }, + { + "acc": 0.6613471, + "epoch": 1.6840436326737698, + "grad_norm": 5.5, + "learning_rate": 6.669398218365902e-07, + "loss": 1.52771311, + "memory(GiB)": 128.51, + "step": 66385, + "train_speed(iter/s)": 1.636464 + }, + { + "acc": 0.66040182, + "epoch": 1.6841704718417048, + "grad_norm": 5.71875, + "learning_rate": 6.664166722223426e-07, + "loss": 1.57890501, + "memory(GiB)": 128.51, + "step": 66390, + "train_speed(iter/s)": 1.636477 + }, + { + "acc": 0.65939522, + "epoch": 1.6842973110096398, + "grad_norm": 4.8125, + "learning_rate": 6.658937132184368e-07, + "loss": 1.61428986, + "memory(GiB)": 128.51, + "step": 66395, + "train_speed(iter/s)": 1.63649 + }, + { + "acc": 0.66360259, + "epoch": 1.6844241501775747, + "grad_norm": 6.09375, + "learning_rate": 6.653709448478762e-07, + "loss": 1.56844444, + "memory(GiB)": 128.51, + "step": 66400, + "train_speed(iter/s)": 1.636504 + }, + { + "acc": 0.65322342, + "epoch": 1.6845509893455097, + "grad_norm": 6.46875, + "learning_rate": 6.648483671336548e-07, + "loss": 1.62249107, + "memory(GiB)": 128.51, + "step": 66405, + "train_speed(iter/s)": 1.636519 + }, + { + "acc": 0.63693781, + "epoch": 1.684677828513445, + "grad_norm": 6.4375, + "learning_rate": 6.643259800987567e-07, + "loss": 1.66031399, + "memory(GiB)": 128.51, + "step": 66410, + "train_speed(iter/s)": 1.636531 + }, + { + "acc": 0.65126677, + "epoch": 1.6848046676813802, + "grad_norm": 6.90625, + "learning_rate": 6.638037837661593e-07, + "loss": 1.58176994, + "memory(GiB)": 128.51, + "step": 66415, + "train_speed(iter/s)": 1.636546 + }, + { + "acc": 0.65425711, + "epoch": 1.6849315068493151, + "grad_norm": 4.6875, + "learning_rate": 6.632817781588313e-07, + "loss": 1.68862343, + "memory(GiB)": 128.51, + "step": 66420, + "train_speed(iter/s)": 1.636559 + }, + { + "acc": 0.64426546, + "epoch": 1.6850583460172501, + "grad_norm": 5.90625, + "learning_rate": 6.627599632997328e-07, + "loss": 1.66668797, + "memory(GiB)": 128.51, + "step": 66425, + "train_speed(iter/s)": 1.636572 + }, + { + "acc": 0.67221508, + "epoch": 1.6851851851851851, + "grad_norm": 5.75, + "learning_rate": 6.622383392118153e-07, + "loss": 1.60266933, + "memory(GiB)": 128.51, + "step": 66430, + "train_speed(iter/s)": 1.636584 + }, + { + "acc": 0.6361062, + "epoch": 1.68531202435312, + "grad_norm": 4.96875, + "learning_rate": 6.617169059180229e-07, + "loss": 1.64124107, + "memory(GiB)": 128.51, + "step": 66435, + "train_speed(iter/s)": 1.636598 + }, + { + "acc": 0.65117702, + "epoch": 1.6854388635210553, + "grad_norm": 5.34375, + "learning_rate": 6.611956634412897e-07, + "loss": 1.60670547, + "memory(GiB)": 128.51, + "step": 66440, + "train_speed(iter/s)": 1.636612 + }, + { + "acc": 0.6443337, + "epoch": 1.6855657026889903, + "grad_norm": 5.59375, + "learning_rate": 6.606746118045415e-07, + "loss": 1.5744648, + "memory(GiB)": 128.51, + "step": 66445, + "train_speed(iter/s)": 1.636624 + }, + { + "acc": 0.66201048, + "epoch": 1.6856925418569255, + "grad_norm": 7.34375, + "learning_rate": 6.601537510306999e-07, + "loss": 1.58101501, + "memory(GiB)": 128.51, + "step": 66450, + "train_speed(iter/s)": 1.636638 + }, + { + "acc": 0.63887186, + "epoch": 1.6858193810248605, + "grad_norm": 5.9375, + "learning_rate": 6.59633081142671e-07, + "loss": 1.68242455, + "memory(GiB)": 128.51, + "step": 66455, + "train_speed(iter/s)": 1.636652 + }, + { + "acc": 0.659795, + "epoch": 1.6859462201927955, + "grad_norm": 7.0, + "learning_rate": 6.591126021633575e-07, + "loss": 1.55775766, + "memory(GiB)": 128.51, + "step": 66460, + "train_speed(iter/s)": 1.636666 + }, + { + "acc": 0.65130558, + "epoch": 1.6860730593607305, + "grad_norm": 5.71875, + "learning_rate": 6.585923141156513e-07, + "loss": 1.60600815, + "memory(GiB)": 128.51, + "step": 66465, + "train_speed(iter/s)": 1.63668 + }, + { + "acc": 0.6596117, + "epoch": 1.6861998985286657, + "grad_norm": 5.84375, + "learning_rate": 6.580722170224408e-07, + "loss": 1.55361948, + "memory(GiB)": 128.51, + "step": 66470, + "train_speed(iter/s)": 1.636694 + }, + { + "acc": 0.65276661, + "epoch": 1.6863267376966007, + "grad_norm": 6.28125, + "learning_rate": 6.575523109065979e-07, + "loss": 1.62090073, + "memory(GiB)": 128.51, + "step": 66475, + "train_speed(iter/s)": 1.636707 + }, + { + "acc": 0.65026875, + "epoch": 1.686453576864536, + "grad_norm": 5.0625, + "learning_rate": 6.570325957909912e-07, + "loss": 1.63086548, + "memory(GiB)": 128.51, + "step": 66480, + "train_speed(iter/s)": 1.63672 + }, + { + "acc": 0.66136537, + "epoch": 1.6865804160324709, + "grad_norm": 4.84375, + "learning_rate": 6.565130716984819e-07, + "loss": 1.57797918, + "memory(GiB)": 128.51, + "step": 66485, + "train_speed(iter/s)": 1.636734 + }, + { + "acc": 0.65649347, + "epoch": 1.6867072552004059, + "grad_norm": 4.90625, + "learning_rate": 6.55993738651921e-07, + "loss": 1.58404427, + "memory(GiB)": 128.51, + "step": 66490, + "train_speed(iter/s)": 1.636747 + }, + { + "acc": 0.65040445, + "epoch": 1.6868340943683409, + "grad_norm": 6.09375, + "learning_rate": 6.554745966741488e-07, + "loss": 1.6970562, + "memory(GiB)": 128.51, + "step": 66495, + "train_speed(iter/s)": 1.63676 + }, + { + "acc": 0.64716797, + "epoch": 1.6869609335362759, + "grad_norm": 6.875, + "learning_rate": 6.549556457879996e-07, + "loss": 1.61366329, + "memory(GiB)": 128.51, + "step": 66500, + "train_speed(iter/s)": 1.636774 + }, + { + "acc": 0.65494671, + "epoch": 1.687087772704211, + "grad_norm": 5.53125, + "learning_rate": 6.54436886016302e-07, + "loss": 1.60575371, + "memory(GiB)": 128.51, + "step": 66505, + "train_speed(iter/s)": 1.636787 + }, + { + "acc": 0.64323206, + "epoch": 1.6872146118721463, + "grad_norm": 6.375, + "learning_rate": 6.539183173818725e-07, + "loss": 1.60546474, + "memory(GiB)": 128.51, + "step": 66510, + "train_speed(iter/s)": 1.6368 + }, + { + "acc": 0.65925994, + "epoch": 1.6873414510400813, + "grad_norm": 5.1875, + "learning_rate": 6.53399939907517e-07, + "loss": 1.58527508, + "memory(GiB)": 128.51, + "step": 66515, + "train_speed(iter/s)": 1.636813 + }, + { + "acc": 0.67269168, + "epoch": 1.6874682902080163, + "grad_norm": 6.3125, + "learning_rate": 6.528817536160392e-07, + "loss": 1.5747426, + "memory(GiB)": 128.51, + "step": 66520, + "train_speed(iter/s)": 1.636826 + }, + { + "acc": 0.66061754, + "epoch": 1.6875951293759512, + "grad_norm": 7.03125, + "learning_rate": 6.523637585302311e-07, + "loss": 1.65887413, + "memory(GiB)": 128.51, + "step": 66525, + "train_speed(iter/s)": 1.636841 + }, + { + "acc": 0.65177183, + "epoch": 1.6877219685438862, + "grad_norm": 4.90625, + "learning_rate": 6.51845954672875e-07, + "loss": 1.55776482, + "memory(GiB)": 128.51, + "step": 66530, + "train_speed(iter/s)": 1.636855 + }, + { + "acc": 0.6554522, + "epoch": 1.6878488077118214, + "grad_norm": 8.25, + "learning_rate": 6.513283420667471e-07, + "loss": 1.69159813, + "memory(GiB)": 128.51, + "step": 66535, + "train_speed(iter/s)": 1.636868 + }, + { + "acc": 0.6526967, + "epoch": 1.6879756468797564, + "grad_norm": 6.6875, + "learning_rate": 6.508109207346142e-07, + "loss": 1.63218689, + "memory(GiB)": 128.51, + "step": 66540, + "train_speed(iter/s)": 1.636882 + }, + { + "acc": 0.6490098, + "epoch": 1.6881024860476916, + "grad_norm": 6.0, + "learning_rate": 6.502936906992346e-07, + "loss": 1.61840134, + "memory(GiB)": 128.51, + "step": 66545, + "train_speed(iter/s)": 1.636894 + }, + { + "acc": 0.64785519, + "epoch": 1.6882293252156266, + "grad_norm": 4.90625, + "learning_rate": 6.497766519833587e-07, + "loss": 1.52472229, + "memory(GiB)": 128.51, + "step": 66550, + "train_speed(iter/s)": 1.636908 + }, + { + "acc": 0.66280384, + "epoch": 1.6883561643835616, + "grad_norm": 5.0, + "learning_rate": 6.492598046097282e-07, + "loss": 1.57390785, + "memory(GiB)": 128.51, + "step": 66555, + "train_speed(iter/s)": 1.636921 + }, + { + "acc": 0.65174108, + "epoch": 1.6884830035514966, + "grad_norm": 5.65625, + "learning_rate": 6.487431486010759e-07, + "loss": 1.63463612, + "memory(GiB)": 128.51, + "step": 66560, + "train_speed(iter/s)": 1.636936 + }, + { + "acc": 0.64512362, + "epoch": 1.6886098427194316, + "grad_norm": 6.78125, + "learning_rate": 6.482266839801265e-07, + "loss": 1.62295952, + "memory(GiB)": 128.51, + "step": 66565, + "train_speed(iter/s)": 1.636949 + }, + { + "acc": 0.66724973, + "epoch": 1.6887366818873668, + "grad_norm": 4.5, + "learning_rate": 6.477104107695975e-07, + "loss": 1.58875866, + "memory(GiB)": 128.51, + "step": 66570, + "train_speed(iter/s)": 1.636963 + }, + { + "acc": 0.65869265, + "epoch": 1.688863521055302, + "grad_norm": 6.28125, + "learning_rate": 6.471943289921955e-07, + "loss": 1.5639576, + "memory(GiB)": 128.51, + "step": 66575, + "train_speed(iter/s)": 1.636977 + }, + { + "acc": 0.66198511, + "epoch": 1.688990360223237, + "grad_norm": 5.6875, + "learning_rate": 6.466784386706215e-07, + "loss": 1.52178745, + "memory(GiB)": 128.51, + "step": 66580, + "train_speed(iter/s)": 1.63699 + }, + { + "acc": 0.65528412, + "epoch": 1.689117199391172, + "grad_norm": 4.6875, + "learning_rate": 6.461627398275655e-07, + "loss": 1.59580669, + "memory(GiB)": 128.51, + "step": 66585, + "train_speed(iter/s)": 1.637004 + }, + { + "acc": 0.67058916, + "epoch": 1.689244038559107, + "grad_norm": 5.53125, + "learning_rate": 6.456472324857111e-07, + "loss": 1.66829681, + "memory(GiB)": 128.51, + "step": 66590, + "train_speed(iter/s)": 1.637016 + }, + { + "acc": 0.64319115, + "epoch": 1.689370877727042, + "grad_norm": 8.25, + "learning_rate": 6.451319166677317e-07, + "loss": 1.64337044, + "memory(GiB)": 128.51, + "step": 66595, + "train_speed(iter/s)": 1.637029 + }, + { + "acc": 0.66640234, + "epoch": 1.6894977168949772, + "grad_norm": 6.1875, + "learning_rate": 6.446167923962943e-07, + "loss": 1.53427582, + "memory(GiB)": 128.51, + "step": 66600, + "train_speed(iter/s)": 1.637043 + }, + { + "acc": 0.65677981, + "epoch": 1.6896245560629122, + "grad_norm": 5.8125, + "learning_rate": 6.44101859694054e-07, + "loss": 1.63551998, + "memory(GiB)": 128.51, + "step": 66605, + "train_speed(iter/s)": 1.637056 + }, + { + "acc": 0.65459723, + "epoch": 1.6897513952308474, + "grad_norm": 5.125, + "learning_rate": 6.43587118583664e-07, + "loss": 1.55850086, + "memory(GiB)": 128.51, + "step": 66610, + "train_speed(iter/s)": 1.63707 + }, + { + "acc": 0.6696631, + "epoch": 1.6898782343987824, + "grad_norm": 5.46875, + "learning_rate": 6.430725690877615e-07, + "loss": 1.53732719, + "memory(GiB)": 128.51, + "step": 66615, + "train_speed(iter/s)": 1.637083 + }, + { + "acc": 0.65037088, + "epoch": 1.6900050735667174, + "grad_norm": 5.28125, + "learning_rate": 6.425582112289785e-07, + "loss": 1.59629421, + "memory(GiB)": 128.51, + "step": 66620, + "train_speed(iter/s)": 1.637097 + }, + { + "acc": 0.6654757, + "epoch": 1.6901319127346524, + "grad_norm": 5.09375, + "learning_rate": 6.420440450299414e-07, + "loss": 1.47901516, + "memory(GiB)": 128.51, + "step": 66625, + "train_speed(iter/s)": 1.63711 + }, + { + "acc": 0.67324791, + "epoch": 1.6902587519025876, + "grad_norm": 6.375, + "learning_rate": 6.415300705132648e-07, + "loss": 1.49997272, + "memory(GiB)": 128.51, + "step": 66630, + "train_speed(iter/s)": 1.637123 + }, + { + "acc": 0.65585203, + "epoch": 1.6903855910705226, + "grad_norm": 5.90625, + "learning_rate": 6.410162877015535e-07, + "loss": 1.57841148, + "memory(GiB)": 128.51, + "step": 66635, + "train_speed(iter/s)": 1.637137 + }, + { + "acc": 0.66604357, + "epoch": 1.6905124302384578, + "grad_norm": 5.65625, + "learning_rate": 6.405026966174066e-07, + "loss": 1.58571539, + "memory(GiB)": 128.51, + "step": 66640, + "train_speed(iter/s)": 1.637151 + }, + { + "acc": 0.66064062, + "epoch": 1.6906392694063928, + "grad_norm": 5.46875, + "learning_rate": 6.39989297283416e-07, + "loss": 1.52937279, + "memory(GiB)": 128.51, + "step": 66645, + "train_speed(iter/s)": 1.637164 + }, + { + "acc": 0.64689898, + "epoch": 1.6907661085743277, + "grad_norm": 4.875, + "learning_rate": 6.394760897221636e-07, + "loss": 1.63860416, + "memory(GiB)": 128.51, + "step": 66650, + "train_speed(iter/s)": 1.637177 + }, + { + "acc": 0.62799459, + "epoch": 1.6908929477422627, + "grad_norm": 5.96875, + "learning_rate": 6.389630739562186e-07, + "loss": 1.69764309, + "memory(GiB)": 128.51, + "step": 66655, + "train_speed(iter/s)": 1.63719 + }, + { + "acc": 0.65750885, + "epoch": 1.6910197869101977, + "grad_norm": 5.25, + "learning_rate": 6.38450250008149e-07, + "loss": 1.56338425, + "memory(GiB)": 128.51, + "step": 66660, + "train_speed(iter/s)": 1.637203 + }, + { + "acc": 0.66203971, + "epoch": 1.691146626078133, + "grad_norm": 5.15625, + "learning_rate": 6.379376179005103e-07, + "loss": 1.56963091, + "memory(GiB)": 128.51, + "step": 66665, + "train_speed(iter/s)": 1.637217 + }, + { + "acc": 0.66247883, + "epoch": 1.6912734652460681, + "grad_norm": 6.6875, + "learning_rate": 6.374251776558521e-07, + "loss": 1.57038927, + "memory(GiB)": 128.51, + "step": 66670, + "train_speed(iter/s)": 1.637231 + }, + { + "acc": 0.65231047, + "epoch": 1.6914003044140031, + "grad_norm": 5.0625, + "learning_rate": 6.369129292967097e-07, + "loss": 1.55709057, + "memory(GiB)": 128.51, + "step": 66675, + "train_speed(iter/s)": 1.637243 + }, + { + "acc": 0.65538011, + "epoch": 1.6915271435819381, + "grad_norm": 5.4375, + "learning_rate": 6.364008728456173e-07, + "loss": 1.53696947, + "memory(GiB)": 128.51, + "step": 66680, + "train_speed(iter/s)": 1.637257 + }, + { + "acc": 0.65603461, + "epoch": 1.691653982749873, + "grad_norm": 5.8125, + "learning_rate": 6.358890083250963e-07, + "loss": 1.61610146, + "memory(GiB)": 128.51, + "step": 66685, + "train_speed(iter/s)": 1.63727 + }, + { + "acc": 0.6634779, + "epoch": 1.691780821917808, + "grad_norm": 5.8125, + "learning_rate": 6.353773357576615e-07, + "loss": 1.5710845, + "memory(GiB)": 128.51, + "step": 66690, + "train_speed(iter/s)": 1.637283 + }, + { + "acc": 0.6633091, + "epoch": 1.6919076610857433, + "grad_norm": 5.4375, + "learning_rate": 6.348658551658182e-07, + "loss": 1.53256826, + "memory(GiB)": 128.51, + "step": 66695, + "train_speed(iter/s)": 1.637296 + }, + { + "acc": 0.65155902, + "epoch": 1.6920345002536783, + "grad_norm": 6.9375, + "learning_rate": 6.343545665720636e-07, + "loss": 1.66027565, + "memory(GiB)": 128.51, + "step": 66700, + "train_speed(iter/s)": 1.637308 + }, + { + "acc": 0.64160042, + "epoch": 1.6921613394216135, + "grad_norm": 6.6875, + "learning_rate": 6.33843469998886e-07, + "loss": 1.68734779, + "memory(GiB)": 128.51, + "step": 66705, + "train_speed(iter/s)": 1.637321 + }, + { + "acc": 0.65719976, + "epoch": 1.6922881785895485, + "grad_norm": 5.34375, + "learning_rate": 6.33332565468766e-07, + "loss": 1.57576942, + "memory(GiB)": 128.51, + "step": 66710, + "train_speed(iter/s)": 1.637334 + }, + { + "acc": 0.64508729, + "epoch": 1.6924150177574835, + "grad_norm": 6.1875, + "learning_rate": 6.328218530041757e-07, + "loss": 1.67298508, + "memory(GiB)": 128.51, + "step": 66715, + "train_speed(iter/s)": 1.637347 + }, + { + "acc": 0.64120893, + "epoch": 1.6925418569254185, + "grad_norm": 5.34375, + "learning_rate": 6.323113326275781e-07, + "loss": 1.62973709, + "memory(GiB)": 128.51, + "step": 66720, + "train_speed(iter/s)": 1.63736 + }, + { + "acc": 0.63750439, + "epoch": 1.6926686960933535, + "grad_norm": 5.75, + "learning_rate": 6.318010043614292e-07, + "loss": 1.69637928, + "memory(GiB)": 128.51, + "step": 66725, + "train_speed(iter/s)": 1.637372 + }, + { + "acc": 0.66092091, + "epoch": 1.6927955352612887, + "grad_norm": 5.21875, + "learning_rate": 6.312908682281743e-07, + "loss": 1.547859, + "memory(GiB)": 128.51, + "step": 66730, + "train_speed(iter/s)": 1.637385 + }, + { + "acc": 0.68325677, + "epoch": 1.6929223744292239, + "grad_norm": 5.53125, + "learning_rate": 6.307809242502517e-07, + "loss": 1.51669903, + "memory(GiB)": 128.51, + "step": 66735, + "train_speed(iter/s)": 1.637398 + }, + { + "acc": 0.65210657, + "epoch": 1.6930492135971589, + "grad_norm": 6.03125, + "learning_rate": 6.302711724500909e-07, + "loss": 1.61665249, + "memory(GiB)": 128.51, + "step": 66740, + "train_speed(iter/s)": 1.63741 + }, + { + "acc": 0.66810455, + "epoch": 1.6931760527650939, + "grad_norm": 5.84375, + "learning_rate": 6.297616128501133e-07, + "loss": 1.57994995, + "memory(GiB)": 128.51, + "step": 66745, + "train_speed(iter/s)": 1.637423 + }, + { + "acc": 0.65783567, + "epoch": 1.6933028919330289, + "grad_norm": 10.0, + "learning_rate": 6.292522454727329e-07, + "loss": 1.61713924, + "memory(GiB)": 128.51, + "step": 66750, + "train_speed(iter/s)": 1.637437 + }, + { + "acc": 0.64224997, + "epoch": 1.6934297311009638, + "grad_norm": 7.0625, + "learning_rate": 6.287430703403524e-07, + "loss": 1.60276871, + "memory(GiB)": 128.51, + "step": 66755, + "train_speed(iter/s)": 1.63745 + }, + { + "acc": 0.6657033, + "epoch": 1.693556570268899, + "grad_norm": 6.34375, + "learning_rate": 6.282340874753673e-07, + "loss": 1.55315561, + "memory(GiB)": 128.51, + "step": 66760, + "train_speed(iter/s)": 1.637463 + }, + { + "acc": 0.65935316, + "epoch": 1.693683409436834, + "grad_norm": 5.15625, + "learning_rate": 6.277252969001646e-07, + "loss": 1.577668, + "memory(GiB)": 128.51, + "step": 66765, + "train_speed(iter/s)": 1.637477 + }, + { + "acc": 0.68309288, + "epoch": 1.6938102486047693, + "grad_norm": 6.09375, + "learning_rate": 6.272166986371264e-07, + "loss": 1.56526871, + "memory(GiB)": 128.51, + "step": 66770, + "train_speed(iter/s)": 1.637489 + }, + { + "acc": 0.66563978, + "epoch": 1.6939370877727042, + "grad_norm": 6.25, + "learning_rate": 6.267082927086199e-07, + "loss": 1.56169863, + "memory(GiB)": 128.51, + "step": 66775, + "train_speed(iter/s)": 1.637503 + }, + { + "acc": 0.64557629, + "epoch": 1.6940639269406392, + "grad_norm": 5.03125, + "learning_rate": 6.262000791370071e-07, + "loss": 1.61343136, + "memory(GiB)": 128.51, + "step": 66780, + "train_speed(iter/s)": 1.637515 + }, + { + "acc": 0.65218968, + "epoch": 1.6941907661085742, + "grad_norm": 4.8125, + "learning_rate": 6.256920579446429e-07, + "loss": 1.61522121, + "memory(GiB)": 128.51, + "step": 66785, + "train_speed(iter/s)": 1.637527 + }, + { + "acc": 0.63828354, + "epoch": 1.6943176052765094, + "grad_norm": 5.96875, + "learning_rate": 6.251842291538734e-07, + "loss": 1.69617958, + "memory(GiB)": 128.51, + "step": 66790, + "train_speed(iter/s)": 1.637541 + }, + { + "acc": 0.66302691, + "epoch": 1.6944444444444444, + "grad_norm": 5.90625, + "learning_rate": 6.246765927870313e-07, + "loss": 1.59541874, + "memory(GiB)": 128.51, + "step": 66795, + "train_speed(iter/s)": 1.637554 + }, + { + "acc": 0.65665331, + "epoch": 1.6945712836123796, + "grad_norm": 5.96875, + "learning_rate": 6.241691488664486e-07, + "loss": 1.62578659, + "memory(GiB)": 128.51, + "step": 66800, + "train_speed(iter/s)": 1.637567 + }, + { + "acc": 0.65172, + "epoch": 1.6946981227803146, + "grad_norm": 5.71875, + "learning_rate": 6.236618974144432e-07, + "loss": 1.63236542, + "memory(GiB)": 128.51, + "step": 66805, + "train_speed(iter/s)": 1.637579 + }, + { + "acc": 0.6651423, + "epoch": 1.6948249619482496, + "grad_norm": 6.0625, + "learning_rate": 6.23154838453327e-07, + "loss": 1.53436604, + "memory(GiB)": 128.51, + "step": 66810, + "train_speed(iter/s)": 1.637591 + }, + { + "acc": 0.65798717, + "epoch": 1.6949518011161846, + "grad_norm": 6.0625, + "learning_rate": 6.226479720054007e-07, + "loss": 1.55678043, + "memory(GiB)": 128.51, + "step": 66815, + "train_speed(iter/s)": 1.637605 + }, + { + "acc": 0.6566298, + "epoch": 1.6950786402841196, + "grad_norm": 8.3125, + "learning_rate": 6.221412980929608e-07, + "loss": 1.64250984, + "memory(GiB)": 128.51, + "step": 66820, + "train_speed(iter/s)": 1.637618 + }, + { + "acc": 0.64589119, + "epoch": 1.6952054794520548, + "grad_norm": 8.75, + "learning_rate": 6.21634816738293e-07, + "loss": 1.57805367, + "memory(GiB)": 128.51, + "step": 66825, + "train_speed(iter/s)": 1.637631 + }, + { + "acc": 0.65637679, + "epoch": 1.69533231861999, + "grad_norm": 5.09375, + "learning_rate": 6.211285279636731e-07, + "loss": 1.55608921, + "memory(GiB)": 128.51, + "step": 66830, + "train_speed(iter/s)": 1.637644 + }, + { + "acc": 0.64408159, + "epoch": 1.695459157787925, + "grad_norm": 6.53125, + "learning_rate": 6.206224317913711e-07, + "loss": 1.64528217, + "memory(GiB)": 128.51, + "step": 66835, + "train_speed(iter/s)": 1.637658 + }, + { + "acc": 0.65190306, + "epoch": 1.69558599695586, + "grad_norm": 5.53125, + "learning_rate": 6.201165282436473e-07, + "loss": 1.58617287, + "memory(GiB)": 128.51, + "step": 66840, + "train_speed(iter/s)": 1.637671 + }, + { + "acc": 0.66689792, + "epoch": 1.695712836123795, + "grad_norm": 5.96875, + "learning_rate": 6.196108173427529e-07, + "loss": 1.55203533, + "memory(GiB)": 128.51, + "step": 66845, + "train_speed(iter/s)": 1.637684 + }, + { + "acc": 0.65467238, + "epoch": 1.69583967529173, + "grad_norm": 5.71875, + "learning_rate": 6.19105299110932e-07, + "loss": 1.69402924, + "memory(GiB)": 128.51, + "step": 66850, + "train_speed(iter/s)": 1.637697 + }, + { + "acc": 0.65164242, + "epoch": 1.6959665144596652, + "grad_norm": 4.84375, + "learning_rate": 6.185999735704195e-07, + "loss": 1.56395731, + "memory(GiB)": 128.51, + "step": 66855, + "train_speed(iter/s)": 1.637709 + }, + { + "acc": 0.66126356, + "epoch": 1.6960933536276002, + "grad_norm": 6.4375, + "learning_rate": 6.180948407434417e-07, + "loss": 1.50182371, + "memory(GiB)": 128.51, + "step": 66860, + "train_speed(iter/s)": 1.637722 + }, + { + "acc": 0.66517563, + "epoch": 1.6962201927955354, + "grad_norm": 8.25, + "learning_rate": 6.175899006522162e-07, + "loss": 1.56674652, + "memory(GiB)": 128.51, + "step": 66865, + "train_speed(iter/s)": 1.637736 + }, + { + "acc": 0.64143372, + "epoch": 1.6963470319634704, + "grad_norm": 5.5625, + "learning_rate": 6.170851533189537e-07, + "loss": 1.6080389, + "memory(GiB)": 128.51, + "step": 66870, + "train_speed(iter/s)": 1.637748 + }, + { + "acc": 0.67202463, + "epoch": 1.6964738711314054, + "grad_norm": 5.625, + "learning_rate": 6.165805987658535e-07, + "loss": 1.52700891, + "memory(GiB)": 128.51, + "step": 66875, + "train_speed(iter/s)": 1.637762 + }, + { + "acc": 0.66692219, + "epoch": 1.6966007102993403, + "grad_norm": 4.96875, + "learning_rate": 6.160762370151097e-07, + "loss": 1.60886707, + "memory(GiB)": 128.51, + "step": 66880, + "train_speed(iter/s)": 1.637774 + }, + { + "acc": 0.6600338, + "epoch": 1.6967275494672753, + "grad_norm": 6.5, + "learning_rate": 6.155720680889049e-07, + "loss": 1.6563942, + "memory(GiB)": 128.51, + "step": 66885, + "train_speed(iter/s)": 1.637576 + }, + { + "acc": 0.6666203, + "epoch": 1.6968543886352105, + "grad_norm": 6.03125, + "learning_rate": 6.150680920094171e-07, + "loss": 1.6286375, + "memory(GiB)": 128.51, + "step": 66890, + "train_speed(iter/s)": 1.637589 + }, + { + "acc": 0.64762788, + "epoch": 1.6969812278031458, + "grad_norm": 5.4375, + "learning_rate": 6.145643087988113e-07, + "loss": 1.62744045, + "memory(GiB)": 128.51, + "step": 66895, + "train_speed(iter/s)": 1.637384 + }, + { + "acc": 0.6326457, + "epoch": 1.6971080669710807, + "grad_norm": 7.6875, + "learning_rate": 6.140607184792469e-07, + "loss": 1.67915802, + "memory(GiB)": 128.51, + "step": 66900, + "train_speed(iter/s)": 1.637398 + }, + { + "acc": 0.66274881, + "epoch": 1.6972349061390157, + "grad_norm": 7.28125, + "learning_rate": 6.135573210728724e-07, + "loss": 1.61145687, + "memory(GiB)": 128.51, + "step": 66905, + "train_speed(iter/s)": 1.63741 + }, + { + "acc": 0.66126814, + "epoch": 1.6973617453069507, + "grad_norm": 5.59375, + "learning_rate": 6.130541166018333e-07, + "loss": 1.50673847, + "memory(GiB)": 128.51, + "step": 66910, + "train_speed(iter/s)": 1.637424 + }, + { + "acc": 0.64857712, + "epoch": 1.6974885844748857, + "grad_norm": 4.8125, + "learning_rate": 6.125511050882598e-07, + "loss": 1.62452621, + "memory(GiB)": 128.51, + "step": 66915, + "train_speed(iter/s)": 1.637436 + }, + { + "acc": 0.65366311, + "epoch": 1.697615423642821, + "grad_norm": 6.625, + "learning_rate": 6.12048286554276e-07, + "loss": 1.56389093, + "memory(GiB)": 128.51, + "step": 66920, + "train_speed(iter/s)": 1.63745 + }, + { + "acc": 0.66152525, + "epoch": 1.697742262810756, + "grad_norm": 6.65625, + "learning_rate": 6.11545661022e-07, + "loss": 1.59166803, + "memory(GiB)": 128.51, + "step": 66925, + "train_speed(iter/s)": 1.637462 + }, + { + "acc": 0.64788628, + "epoch": 1.6978691019786911, + "grad_norm": 4.9375, + "learning_rate": 6.110432285135404e-07, + "loss": 1.604673, + "memory(GiB)": 128.51, + "step": 66930, + "train_speed(iter/s)": 1.637475 + }, + { + "acc": 0.66452541, + "epoch": 1.697995941146626, + "grad_norm": 5.09375, + "learning_rate": 6.105409890509933e-07, + "loss": 1.5599659, + "memory(GiB)": 128.51, + "step": 66935, + "train_speed(iter/s)": 1.637489 + }, + { + "acc": 0.64601746, + "epoch": 1.698122780314561, + "grad_norm": 5.15625, + "learning_rate": 6.100389426564501e-07, + "loss": 1.61214828, + "memory(GiB)": 128.51, + "step": 66940, + "train_speed(iter/s)": 1.637502 + }, + { + "acc": 0.65639439, + "epoch": 1.698249619482496, + "grad_norm": 5.21875, + "learning_rate": 6.095370893519953e-07, + "loss": 1.6274086, + "memory(GiB)": 128.51, + "step": 66945, + "train_speed(iter/s)": 1.637514 + }, + { + "acc": 0.66981788, + "epoch": 1.6983764586504313, + "grad_norm": 5.40625, + "learning_rate": 6.090354291597022e-07, + "loss": 1.5581728, + "memory(GiB)": 128.51, + "step": 66950, + "train_speed(iter/s)": 1.637526 + }, + { + "acc": 0.65510864, + "epoch": 1.6985032978183663, + "grad_norm": 6.5625, + "learning_rate": 6.085339621016334e-07, + "loss": 1.58307333, + "memory(GiB)": 128.51, + "step": 66955, + "train_speed(iter/s)": 1.637538 + }, + { + "acc": 0.66451969, + "epoch": 1.6986301369863015, + "grad_norm": 5.625, + "learning_rate": 6.080326881998482e-07, + "loss": 1.49869213, + "memory(GiB)": 128.51, + "step": 66960, + "train_speed(iter/s)": 1.63755 + }, + { + "acc": 0.66328754, + "epoch": 1.6987569761542365, + "grad_norm": 5.96875, + "learning_rate": 6.075316074763943e-07, + "loss": 1.57312756, + "memory(GiB)": 128.51, + "step": 66965, + "train_speed(iter/s)": 1.637562 + }, + { + "acc": 0.65397367, + "epoch": 1.6988838153221715, + "grad_norm": 5.0625, + "learning_rate": 6.070307199533109e-07, + "loss": 1.61045895, + "memory(GiB)": 128.51, + "step": 66970, + "train_speed(iter/s)": 1.637575 + }, + { + "acc": 0.6604332, + "epoch": 1.6990106544901065, + "grad_norm": 5.6875, + "learning_rate": 6.065300256526297e-07, + "loss": 1.59720001, + "memory(GiB)": 128.51, + "step": 66975, + "train_speed(iter/s)": 1.637587 + }, + { + "acc": 0.65703821, + "epoch": 1.6991374936580415, + "grad_norm": 7.125, + "learning_rate": 6.060295245963737e-07, + "loss": 1.61264229, + "memory(GiB)": 128.51, + "step": 66980, + "train_speed(iter/s)": 1.637601 + }, + { + "acc": 0.65314856, + "epoch": 1.6992643328259767, + "grad_norm": 5.625, + "learning_rate": 6.055292168065563e-07, + "loss": 1.53722334, + "memory(GiB)": 128.51, + "step": 66985, + "train_speed(iter/s)": 1.637613 + }, + { + "acc": 0.65193248, + "epoch": 1.6993911719939119, + "grad_norm": 6.53125, + "learning_rate": 6.050291023051846e-07, + "loss": 1.62349396, + "memory(GiB)": 128.51, + "step": 66990, + "train_speed(iter/s)": 1.637626 + }, + { + "acc": 0.6468997, + "epoch": 1.6995180111618469, + "grad_norm": 7.71875, + "learning_rate": 6.045291811142545e-07, + "loss": 1.61090851, + "memory(GiB)": 128.51, + "step": 66995, + "train_speed(iter/s)": 1.637639 + }, + { + "acc": 0.66547103, + "epoch": 1.6996448503297819, + "grad_norm": 6.0, + "learning_rate": 6.040294532557555e-07, + "loss": 1.57564735, + "memory(GiB)": 128.51, + "step": 67000, + "train_speed(iter/s)": 1.637652 + }, + { + "epoch": 1.6996448503297819, + "eval_acc": 0.6463044290884336, + "eval_loss": 1.5730608701705933, + "eval_runtime": 58.275, + "eval_samples_per_second": 109.309, + "eval_steps_per_second": 27.336, + "step": 67000 + }, + { + "acc": 0.67243071, + "epoch": 1.6997716894977168, + "grad_norm": 5.1875, + "learning_rate": 6.03529918751668e-07, + "loss": 1.56185894, + "memory(GiB)": 128.51, + "step": 67005, + "train_speed(iter/s)": 1.63516 + }, + { + "acc": 0.65515356, + "epoch": 1.6998985286656518, + "grad_norm": 5.8125, + "learning_rate": 6.030305776239637e-07, + "loss": 1.61485138, + "memory(GiB)": 128.51, + "step": 67010, + "train_speed(iter/s)": 1.635173 + }, + { + "acc": 0.66119003, + "epoch": 1.700025367833587, + "grad_norm": 5.625, + "learning_rate": 6.02531429894605e-07, + "loss": 1.62632561, + "memory(GiB)": 128.51, + "step": 67015, + "train_speed(iter/s)": 1.635186 + }, + { + "acc": 0.65236435, + "epoch": 1.700152207001522, + "grad_norm": 5.71875, + "learning_rate": 6.020324755855478e-07, + "loss": 1.6575634, + "memory(GiB)": 128.51, + "step": 67020, + "train_speed(iter/s)": 1.635199 + }, + { + "acc": 0.66233978, + "epoch": 1.7002790461694572, + "grad_norm": 7.53125, + "learning_rate": 6.015337147187378e-07, + "loss": 1.5628746, + "memory(GiB)": 128.51, + "step": 67025, + "train_speed(iter/s)": 1.635212 + }, + { + "acc": 0.64645081, + "epoch": 1.7004058853373922, + "grad_norm": 5.625, + "learning_rate": 6.010351473161124e-07, + "loss": 1.62819366, + "memory(GiB)": 128.51, + "step": 67030, + "train_speed(iter/s)": 1.635224 + }, + { + "acc": 0.65663452, + "epoch": 1.7005327245053272, + "grad_norm": 4.78125, + "learning_rate": 6.005367733996014e-07, + "loss": 1.59264259, + "memory(GiB)": 128.51, + "step": 67035, + "train_speed(iter/s)": 1.635237 + }, + { + "acc": 0.67016044, + "epoch": 1.7006595636732622, + "grad_norm": 5.625, + "learning_rate": 6.000385929911251e-07, + "loss": 1.55682907, + "memory(GiB)": 128.51, + "step": 67040, + "train_speed(iter/s)": 1.63525 + }, + { + "acc": 0.67255087, + "epoch": 1.7007864028411972, + "grad_norm": 5.375, + "learning_rate": 5.995406061125952e-07, + "loss": 1.5465332, + "memory(GiB)": 128.51, + "step": 67045, + "train_speed(iter/s)": 1.635262 + }, + { + "acc": 0.64551592, + "epoch": 1.7009132420091324, + "grad_norm": 7.21875, + "learning_rate": 5.990428127859182e-07, + "loss": 1.63492146, + "memory(GiB)": 128.51, + "step": 67050, + "train_speed(iter/s)": 1.635276 + }, + { + "acc": 0.65595264, + "epoch": 1.7010400811770676, + "grad_norm": 5.6875, + "learning_rate": 5.985452130329855e-07, + "loss": 1.62923775, + "memory(GiB)": 128.51, + "step": 67055, + "train_speed(iter/s)": 1.635289 + }, + { + "acc": 0.6516037, + "epoch": 1.7011669203450026, + "grad_norm": 6.5625, + "learning_rate": 5.980478068756851e-07, + "loss": 1.65989189, + "memory(GiB)": 128.51, + "step": 67060, + "train_speed(iter/s)": 1.635301 + }, + { + "acc": 0.64937429, + "epoch": 1.7012937595129376, + "grad_norm": 6.71875, + "learning_rate": 5.975505943358961e-07, + "loss": 1.65548401, + "memory(GiB)": 128.51, + "step": 67065, + "train_speed(iter/s)": 1.635313 + }, + { + "acc": 0.6575778, + "epoch": 1.7014205986808726, + "grad_norm": 7.1875, + "learning_rate": 5.970535754354889e-07, + "loss": 1.57781773, + "memory(GiB)": 128.51, + "step": 67070, + "train_speed(iter/s)": 1.635324 + }, + { + "acc": 0.64836845, + "epoch": 1.7015474378488076, + "grad_norm": 4.875, + "learning_rate": 5.965567501963215e-07, + "loss": 1.57942142, + "memory(GiB)": 128.51, + "step": 67075, + "train_speed(iter/s)": 1.635337 + }, + { + "acc": 0.66571617, + "epoch": 1.7016742770167428, + "grad_norm": 7.40625, + "learning_rate": 5.960601186402476e-07, + "loss": 1.54892979, + "memory(GiB)": 128.51, + "step": 67080, + "train_speed(iter/s)": 1.635349 + }, + { + "acc": 0.65545506, + "epoch": 1.7018011161846778, + "grad_norm": 5.46875, + "learning_rate": 5.95563680789113e-07, + "loss": 1.57882843, + "memory(GiB)": 128.51, + "step": 67085, + "train_speed(iter/s)": 1.635362 + }, + { + "acc": 0.65290594, + "epoch": 1.701927955352613, + "grad_norm": 6.0625, + "learning_rate": 5.95067436664753e-07, + "loss": 1.61973362, + "memory(GiB)": 128.51, + "step": 67090, + "train_speed(iter/s)": 1.635374 + }, + { + "acc": 0.67909002, + "epoch": 1.702054794520548, + "grad_norm": 6.96875, + "learning_rate": 5.945713862889918e-07, + "loss": 1.53332214, + "memory(GiB)": 128.51, + "step": 67095, + "train_speed(iter/s)": 1.635387 + }, + { + "acc": 0.65608969, + "epoch": 1.702181633688483, + "grad_norm": 5.625, + "learning_rate": 5.940755296836504e-07, + "loss": 1.63927383, + "memory(GiB)": 128.51, + "step": 67100, + "train_speed(iter/s)": 1.635401 + }, + { + "acc": 0.6650589, + "epoch": 1.702308472856418, + "grad_norm": 5.4375, + "learning_rate": 5.935798668705389e-07, + "loss": 1.61723461, + "memory(GiB)": 128.51, + "step": 67105, + "train_speed(iter/s)": 1.635414 + }, + { + "acc": 0.66094608, + "epoch": 1.7024353120243532, + "grad_norm": 5.34375, + "learning_rate": 5.930843978714584e-07, + "loss": 1.58662138, + "memory(GiB)": 128.51, + "step": 67110, + "train_speed(iter/s)": 1.635427 + }, + { + "acc": 0.6680903, + "epoch": 1.7025621511922882, + "grad_norm": 5.34375, + "learning_rate": 5.925891227081998e-07, + "loss": 1.5670907, + "memory(GiB)": 128.51, + "step": 67115, + "train_speed(iter/s)": 1.635438 + }, + { + "acc": 0.66787357, + "epoch": 1.7026889903602234, + "grad_norm": 8.0, + "learning_rate": 5.920940414025506e-07, + "loss": 1.56318722, + "memory(GiB)": 128.51, + "step": 67120, + "train_speed(iter/s)": 1.635452 + }, + { + "acc": 0.66724529, + "epoch": 1.7028158295281584, + "grad_norm": 6.375, + "learning_rate": 5.91599153976285e-07, + "loss": 1.51742411, + "memory(GiB)": 128.51, + "step": 67125, + "train_speed(iter/s)": 1.635464 + }, + { + "acc": 0.65388689, + "epoch": 1.7029426686960933, + "grad_norm": 4.9375, + "learning_rate": 5.911044604511712e-07, + "loss": 1.62871475, + "memory(GiB)": 128.51, + "step": 67130, + "train_speed(iter/s)": 1.635478 + }, + { + "acc": 0.65295963, + "epoch": 1.7030695078640283, + "grad_norm": 5.28125, + "learning_rate": 5.906099608489669e-07, + "loss": 1.64161377, + "memory(GiB)": 128.51, + "step": 67135, + "train_speed(iter/s)": 1.635489 + }, + { + "acc": 0.65288839, + "epoch": 1.7031963470319633, + "grad_norm": 4.9375, + "learning_rate": 5.901156551914233e-07, + "loss": 1.59084301, + "memory(GiB)": 128.51, + "step": 67140, + "train_speed(iter/s)": 1.635501 + }, + { + "acc": 0.64179258, + "epoch": 1.7033231861998985, + "grad_norm": 6.0625, + "learning_rate": 5.896215435002816e-07, + "loss": 1.63305626, + "memory(GiB)": 128.51, + "step": 67145, + "train_speed(iter/s)": 1.635509 + }, + { + "acc": 0.65226526, + "epoch": 1.7034500253678337, + "grad_norm": 5.1875, + "learning_rate": 5.891276257972762e-07, + "loss": 1.57875299, + "memory(GiB)": 128.51, + "step": 67150, + "train_speed(iter/s)": 1.635521 + }, + { + "acc": 0.65882301, + "epoch": 1.7035768645357687, + "grad_norm": 5.15625, + "learning_rate": 5.886339021041304e-07, + "loss": 1.57733374, + "memory(GiB)": 128.51, + "step": 67155, + "train_speed(iter/s)": 1.635533 + }, + { + "acc": 0.65831118, + "epoch": 1.7037037037037037, + "grad_norm": 5.46875, + "learning_rate": 5.881403724425605e-07, + "loss": 1.64175663, + "memory(GiB)": 128.51, + "step": 67160, + "train_speed(iter/s)": 1.635547 + }, + { + "acc": 0.65446987, + "epoch": 1.7038305428716387, + "grad_norm": 6.21875, + "learning_rate": 5.876470368342752e-07, + "loss": 1.57208385, + "memory(GiB)": 128.51, + "step": 67165, + "train_speed(iter/s)": 1.635559 + }, + { + "acc": 0.66269779, + "epoch": 1.7039573820395737, + "grad_norm": 4.96875, + "learning_rate": 5.871538953009725e-07, + "loss": 1.51582699, + "memory(GiB)": 128.51, + "step": 67170, + "train_speed(iter/s)": 1.635572 + }, + { + "acc": 0.66614838, + "epoch": 1.704084221207509, + "grad_norm": 5.53125, + "learning_rate": 5.866609478643437e-07, + "loss": 1.53599463, + "memory(GiB)": 128.51, + "step": 67175, + "train_speed(iter/s)": 1.635583 + }, + { + "acc": 0.64267693, + "epoch": 1.704211060375444, + "grad_norm": 5.78125, + "learning_rate": 5.861681945460706e-07, + "loss": 1.64508476, + "memory(GiB)": 128.51, + "step": 67180, + "train_speed(iter/s)": 1.635596 + }, + { + "acc": 0.63538084, + "epoch": 1.704337899543379, + "grad_norm": 6.46875, + "learning_rate": 5.856756353678255e-07, + "loss": 1.65825348, + "memory(GiB)": 128.51, + "step": 67185, + "train_speed(iter/s)": 1.635608 + }, + { + "acc": 0.65387354, + "epoch": 1.704464738711314, + "grad_norm": 5.34375, + "learning_rate": 5.851832703512766e-07, + "loss": 1.57906122, + "memory(GiB)": 128.51, + "step": 67190, + "train_speed(iter/s)": 1.635621 + }, + { + "acc": 0.65083556, + "epoch": 1.704591577879249, + "grad_norm": 5.125, + "learning_rate": 5.846910995180771e-07, + "loss": 1.56133184, + "memory(GiB)": 128.51, + "step": 67195, + "train_speed(iter/s)": 1.635634 + }, + { + "acc": 0.64118786, + "epoch": 1.704718417047184, + "grad_norm": 4.71875, + "learning_rate": 5.841991228898758e-07, + "loss": 1.60160198, + "memory(GiB)": 128.51, + "step": 67200, + "train_speed(iter/s)": 1.635648 + }, + { + "acc": 0.67618542, + "epoch": 1.704845256215119, + "grad_norm": 6.40625, + "learning_rate": 5.837073404883115e-07, + "loss": 1.54821815, + "memory(GiB)": 128.51, + "step": 67205, + "train_speed(iter/s)": 1.635661 + }, + { + "acc": 0.67320957, + "epoch": 1.7049720953830543, + "grad_norm": 5.875, + "learning_rate": 5.832157523350179e-07, + "loss": 1.5800415, + "memory(GiB)": 128.51, + "step": 67210, + "train_speed(iter/s)": 1.635674 + }, + { + "acc": 0.64668779, + "epoch": 1.7050989345509895, + "grad_norm": 5.9375, + "learning_rate": 5.827243584516135e-07, + "loss": 1.61795712, + "memory(GiB)": 128.51, + "step": 67215, + "train_speed(iter/s)": 1.635688 + }, + { + "acc": 0.65144224, + "epoch": 1.7052257737189245, + "grad_norm": 5.40625, + "learning_rate": 5.822331588597129e-07, + "loss": 1.62538376, + "memory(GiB)": 128.51, + "step": 67220, + "train_speed(iter/s)": 1.635701 + }, + { + "acc": 0.66135716, + "epoch": 1.7053526128868595, + "grad_norm": 6.59375, + "learning_rate": 5.817421535809226e-07, + "loss": 1.58652143, + "memory(GiB)": 128.51, + "step": 67225, + "train_speed(iter/s)": 1.635714 + }, + { + "acc": 0.6567132, + "epoch": 1.7054794520547945, + "grad_norm": 5.71875, + "learning_rate": 5.812513426368399e-07, + "loss": 1.59401484, + "memory(GiB)": 128.51, + "step": 67230, + "train_speed(iter/s)": 1.635727 + }, + { + "acc": 0.66141739, + "epoch": 1.7056062912227294, + "grad_norm": 5.3125, + "learning_rate": 5.807607260490489e-07, + "loss": 1.60627651, + "memory(GiB)": 128.51, + "step": 67235, + "train_speed(iter/s)": 1.635739 + }, + { + "acc": 0.66541567, + "epoch": 1.7057331303906647, + "grad_norm": 5.375, + "learning_rate": 5.802703038391333e-07, + "loss": 1.54779749, + "memory(GiB)": 128.51, + "step": 67240, + "train_speed(iter/s)": 1.635753 + }, + { + "acc": 0.64876361, + "epoch": 1.7058599695585996, + "grad_norm": 5.40625, + "learning_rate": 5.797800760286621e-07, + "loss": 1.61090012, + "memory(GiB)": 128.51, + "step": 67245, + "train_speed(iter/s)": 1.635766 + }, + { + "acc": 0.65214796, + "epoch": 1.7059868087265349, + "grad_norm": 6.34375, + "learning_rate": 5.79290042639199e-07, + "loss": 1.57541056, + "memory(GiB)": 128.51, + "step": 67250, + "train_speed(iter/s)": 1.635778 + }, + { + "acc": 0.67495232, + "epoch": 1.7061136478944698, + "grad_norm": 5.46875, + "learning_rate": 5.788002036922946e-07, + "loss": 1.56865396, + "memory(GiB)": 128.51, + "step": 67255, + "train_speed(iter/s)": 1.635792 + }, + { + "acc": 0.65226698, + "epoch": 1.7062404870624048, + "grad_norm": 6.34375, + "learning_rate": 5.783105592094978e-07, + "loss": 1.60335197, + "memory(GiB)": 128.51, + "step": 67260, + "train_speed(iter/s)": 1.635806 + }, + { + "acc": 0.66577034, + "epoch": 1.7063673262303398, + "grad_norm": 5.84375, + "learning_rate": 5.778211092123437e-07, + "loss": 1.6350111, + "memory(GiB)": 128.51, + "step": 67265, + "train_speed(iter/s)": 1.635819 + }, + { + "acc": 0.66672201, + "epoch": 1.706494165398275, + "grad_norm": 6.59375, + "learning_rate": 5.77331853722361e-07, + "loss": 1.61463795, + "memory(GiB)": 128.51, + "step": 67270, + "train_speed(iter/s)": 1.635833 + }, + { + "acc": 0.65614357, + "epoch": 1.70662100456621, + "grad_norm": 5.28125, + "learning_rate": 5.768427927610698e-07, + "loss": 1.68490486, + "memory(GiB)": 128.51, + "step": 67275, + "train_speed(iter/s)": 1.635845 + }, + { + "acc": 0.66344318, + "epoch": 1.7067478437341452, + "grad_norm": 8.8125, + "learning_rate": 5.763539263499796e-07, + "loss": 1.53171568, + "memory(GiB)": 128.51, + "step": 67280, + "train_speed(iter/s)": 1.635858 + }, + { + "acc": 0.66403871, + "epoch": 1.7068746829020802, + "grad_norm": 5.28125, + "learning_rate": 5.758652545105941e-07, + "loss": 1.55713673, + "memory(GiB)": 128.51, + "step": 67285, + "train_speed(iter/s)": 1.63587 + }, + { + "acc": 0.64168429, + "epoch": 1.7070015220700152, + "grad_norm": 6.0, + "learning_rate": 5.753767772644075e-07, + "loss": 1.62596092, + "memory(GiB)": 128.51, + "step": 67290, + "train_speed(iter/s)": 1.635882 + }, + { + "acc": 0.66364594, + "epoch": 1.7071283612379502, + "grad_norm": 6.34375, + "learning_rate": 5.748884946329048e-07, + "loss": 1.54136763, + "memory(GiB)": 128.51, + "step": 67295, + "train_speed(iter/s)": 1.635895 + }, + { + "acc": 0.66366282, + "epoch": 1.7072552004058852, + "grad_norm": 6.4375, + "learning_rate": 5.744004066375625e-07, + "loss": 1.59002447, + "memory(GiB)": 128.51, + "step": 67300, + "train_speed(iter/s)": 1.635907 + }, + { + "acc": 0.68015742, + "epoch": 1.7073820395738204, + "grad_norm": 5.15625, + "learning_rate": 5.739125132998491e-07, + "loss": 1.54480391, + "memory(GiB)": 128.51, + "step": 67305, + "train_speed(iter/s)": 1.63592 + }, + { + "acc": 0.65216713, + "epoch": 1.7075088787417556, + "grad_norm": 5.78125, + "learning_rate": 5.734248146412242e-07, + "loss": 1.61866341, + "memory(GiB)": 128.51, + "step": 67310, + "train_speed(iter/s)": 1.635931 + }, + { + "acc": 0.64957247, + "epoch": 1.7076357179096906, + "grad_norm": 7.0625, + "learning_rate": 5.729373106831399e-07, + "loss": 1.63667259, + "memory(GiB)": 128.51, + "step": 67315, + "train_speed(iter/s)": 1.635943 + }, + { + "acc": 0.6638835, + "epoch": 1.7077625570776256, + "grad_norm": 11.25, + "learning_rate": 5.724500014470374e-07, + "loss": 1.6611311, + "memory(GiB)": 128.51, + "step": 67320, + "train_speed(iter/s)": 1.635955 + }, + { + "acc": 0.64782643, + "epoch": 1.7078893962455606, + "grad_norm": 5.6875, + "learning_rate": 5.719628869543509e-07, + "loss": 1.61336021, + "memory(GiB)": 128.51, + "step": 67325, + "train_speed(iter/s)": 1.635968 + }, + { + "acc": 0.6541482, + "epoch": 1.7080162354134956, + "grad_norm": 6.3125, + "learning_rate": 5.71475967226508e-07, + "loss": 1.5971468, + "memory(GiB)": 128.51, + "step": 67330, + "train_speed(iter/s)": 1.635979 + }, + { + "acc": 0.67253056, + "epoch": 1.7081430745814308, + "grad_norm": 7.0, + "learning_rate": 5.709892422849233e-07, + "loss": 1.54063168, + "memory(GiB)": 128.51, + "step": 67335, + "train_speed(iter/s)": 1.635992 + }, + { + "acc": 0.64050703, + "epoch": 1.7082699137493658, + "grad_norm": 6.96875, + "learning_rate": 5.705027121510059e-07, + "loss": 1.65965195, + "memory(GiB)": 128.51, + "step": 67340, + "train_speed(iter/s)": 1.636003 + }, + { + "acc": 0.65276108, + "epoch": 1.708396752917301, + "grad_norm": 6.25, + "learning_rate": 5.700163768461542e-07, + "loss": 1.57592535, + "memory(GiB)": 128.51, + "step": 67345, + "train_speed(iter/s)": 1.636016 + }, + { + "acc": 0.63521709, + "epoch": 1.708523592085236, + "grad_norm": 5.65625, + "learning_rate": 5.695302363917626e-07, + "loss": 1.6635664, + "memory(GiB)": 128.51, + "step": 67350, + "train_speed(iter/s)": 1.636029 + }, + { + "acc": 0.66569362, + "epoch": 1.708650431253171, + "grad_norm": 7.875, + "learning_rate": 5.690442908092115e-07, + "loss": 1.55869045, + "memory(GiB)": 128.51, + "step": 67355, + "train_speed(iter/s)": 1.636041 + }, + { + "acc": 0.64959393, + "epoch": 1.708777270421106, + "grad_norm": 8.3125, + "learning_rate": 5.685585401198735e-07, + "loss": 1.62649384, + "memory(GiB)": 128.51, + "step": 67360, + "train_speed(iter/s)": 1.636052 + }, + { + "acc": 0.65645819, + "epoch": 1.708904109589041, + "grad_norm": 7.90625, + "learning_rate": 5.680729843451172e-07, + "loss": 1.59811745, + "memory(GiB)": 128.51, + "step": 67365, + "train_speed(iter/s)": 1.636064 + }, + { + "acc": 0.63494329, + "epoch": 1.7090309487569761, + "grad_norm": 4.65625, + "learning_rate": 5.675876235062994e-07, + "loss": 1.69742622, + "memory(GiB)": 128.51, + "step": 67370, + "train_speed(iter/s)": 1.636076 + }, + { + "acc": 0.65067978, + "epoch": 1.7091577879249114, + "grad_norm": 5.84375, + "learning_rate": 5.671024576247657e-07, + "loss": 1.56292629, + "memory(GiB)": 128.51, + "step": 67375, + "train_speed(iter/s)": 1.636088 + }, + { + "acc": 0.66673517, + "epoch": 1.7092846270928463, + "grad_norm": 4.5625, + "learning_rate": 5.666174867218572e-07, + "loss": 1.54882193, + "memory(GiB)": 128.51, + "step": 67380, + "train_speed(iter/s)": 1.636099 + }, + { + "acc": 0.66848035, + "epoch": 1.7094114662607813, + "grad_norm": 6.21875, + "learning_rate": 5.661327108189058e-07, + "loss": 1.52031937, + "memory(GiB)": 128.51, + "step": 67385, + "train_speed(iter/s)": 1.636111 + }, + { + "acc": 0.66235709, + "epoch": 1.7095383054287163, + "grad_norm": 5.65625, + "learning_rate": 5.65648129937234e-07, + "loss": 1.54351664, + "memory(GiB)": 128.51, + "step": 67390, + "train_speed(iter/s)": 1.636124 + }, + { + "acc": 0.65491085, + "epoch": 1.7096651445966513, + "grad_norm": 6.40625, + "learning_rate": 5.651637440981545e-07, + "loss": 1.61414585, + "memory(GiB)": 128.51, + "step": 67395, + "train_speed(iter/s)": 1.636137 + }, + { + "acc": 0.65579281, + "epoch": 1.7097919837645865, + "grad_norm": 6.1875, + "learning_rate": 5.646795533229738e-07, + "loss": 1.65341644, + "memory(GiB)": 128.51, + "step": 67400, + "train_speed(iter/s)": 1.636149 + }, + { + "acc": 0.65887375, + "epoch": 1.7099188229325215, + "grad_norm": 6.625, + "learning_rate": 5.641955576329888e-07, + "loss": 1.597859, + "memory(GiB)": 128.51, + "step": 67405, + "train_speed(iter/s)": 1.636163 + }, + { + "acc": 0.67939582, + "epoch": 1.7100456621004567, + "grad_norm": 5.1875, + "learning_rate": 5.637117570494877e-07, + "loss": 1.52714405, + "memory(GiB)": 128.51, + "step": 67410, + "train_speed(iter/s)": 1.636175 + }, + { + "acc": 0.64111986, + "epoch": 1.7101725012683917, + "grad_norm": 4.96875, + "learning_rate": 5.632281515937504e-07, + "loss": 1.63542213, + "memory(GiB)": 128.51, + "step": 67415, + "train_speed(iter/s)": 1.636188 + }, + { + "acc": 0.65833836, + "epoch": 1.7102993404363267, + "grad_norm": 5.125, + "learning_rate": 5.627447412870474e-07, + "loss": 1.57399855, + "memory(GiB)": 128.51, + "step": 67420, + "train_speed(iter/s)": 1.6362 + }, + { + "acc": 0.65467939, + "epoch": 1.7104261796042617, + "grad_norm": 6.09375, + "learning_rate": 5.62261526150642e-07, + "loss": 1.62727623, + "memory(GiB)": 128.51, + "step": 67425, + "train_speed(iter/s)": 1.636211 + }, + { + "acc": 0.6677947, + "epoch": 1.710553018772197, + "grad_norm": 6.46875, + "learning_rate": 5.617785062057873e-07, + "loss": 1.59308891, + "memory(GiB)": 128.51, + "step": 67430, + "train_speed(iter/s)": 1.636223 + }, + { + "acc": 0.64166551, + "epoch": 1.7106798579401319, + "grad_norm": 5.28125, + "learning_rate": 5.612956814737291e-07, + "loss": 1.59966021, + "memory(GiB)": 128.51, + "step": 67435, + "train_speed(iter/s)": 1.636236 + }, + { + "acc": 0.6683826, + "epoch": 1.710806697108067, + "grad_norm": 6.65625, + "learning_rate": 5.608130519757044e-07, + "loss": 1.50156555, + "memory(GiB)": 128.51, + "step": 67440, + "train_speed(iter/s)": 1.636249 + }, + { + "acc": 0.66201143, + "epoch": 1.710933536276002, + "grad_norm": 5.96875, + "learning_rate": 5.603306177329415e-07, + "loss": 1.56475639, + "memory(GiB)": 128.51, + "step": 67445, + "train_speed(iter/s)": 1.636264 + }, + { + "acc": 0.65424442, + "epoch": 1.711060375443937, + "grad_norm": 5.125, + "learning_rate": 5.598483787666592e-07, + "loss": 1.55552483, + "memory(GiB)": 128.51, + "step": 67450, + "train_speed(iter/s)": 1.636276 + }, + { + "acc": 0.65675545, + "epoch": 1.711187214611872, + "grad_norm": 6.5, + "learning_rate": 5.593663350980693e-07, + "loss": 1.65335522, + "memory(GiB)": 128.51, + "step": 67455, + "train_speed(iter/s)": 1.636289 + }, + { + "acc": 0.67183008, + "epoch": 1.711314053779807, + "grad_norm": 6.28125, + "learning_rate": 5.588844867483739e-07, + "loss": 1.56383057, + "memory(GiB)": 128.51, + "step": 67460, + "train_speed(iter/s)": 1.636301 + }, + { + "acc": 0.66191173, + "epoch": 1.7114408929477423, + "grad_norm": 5.6875, + "learning_rate": 5.58402833738767e-07, + "loss": 1.51929502, + "memory(GiB)": 128.51, + "step": 67465, + "train_speed(iter/s)": 1.636315 + }, + { + "acc": 0.63522797, + "epoch": 1.7115677321156775, + "grad_norm": 4.59375, + "learning_rate": 5.579213760904339e-07, + "loss": 1.6719841, + "memory(GiB)": 128.51, + "step": 67470, + "train_speed(iter/s)": 1.636328 + }, + { + "acc": 0.65678344, + "epoch": 1.7116945712836125, + "grad_norm": 6.125, + "learning_rate": 5.574401138245511e-07, + "loss": 1.57955208, + "memory(GiB)": 128.51, + "step": 67475, + "train_speed(iter/s)": 1.636341 + }, + { + "acc": 0.65569644, + "epoch": 1.7118214104515475, + "grad_norm": 5.46875, + "learning_rate": 5.569590469622865e-07, + "loss": 1.55289574, + "memory(GiB)": 128.51, + "step": 67480, + "train_speed(iter/s)": 1.636354 + }, + { + "acc": 0.68059773, + "epoch": 1.7119482496194824, + "grad_norm": 5.625, + "learning_rate": 5.564781755247989e-07, + "loss": 1.52871294, + "memory(GiB)": 128.51, + "step": 67485, + "train_speed(iter/s)": 1.636365 + }, + { + "acc": 0.65363941, + "epoch": 1.7120750887874174, + "grad_norm": 6.9375, + "learning_rate": 5.559974995332424e-07, + "loss": 1.60149441, + "memory(GiB)": 128.51, + "step": 67490, + "train_speed(iter/s)": 1.636378 + }, + { + "acc": 0.6431273, + "epoch": 1.7122019279553526, + "grad_norm": 5.25, + "learning_rate": 5.555170190087556e-07, + "loss": 1.58752108, + "memory(GiB)": 128.51, + "step": 67495, + "train_speed(iter/s)": 1.636391 + }, + { + "acc": 0.64971752, + "epoch": 1.7123287671232876, + "grad_norm": 6.34375, + "learning_rate": 5.550367339724721e-07, + "loss": 1.62518997, + "memory(GiB)": 128.51, + "step": 67500, + "train_speed(iter/s)": 1.636404 + }, + { + "acc": 0.66027694, + "epoch": 1.7124556062912228, + "grad_norm": 5.875, + "learning_rate": 5.545566444455198e-07, + "loss": 1.58222637, + "memory(GiB)": 128.51, + "step": 67505, + "train_speed(iter/s)": 1.636417 + }, + { + "acc": 0.66492968, + "epoch": 1.7125824454591578, + "grad_norm": 5.96875, + "learning_rate": 5.540767504490147e-07, + "loss": 1.64748383, + "memory(GiB)": 128.51, + "step": 67510, + "train_speed(iter/s)": 1.63643 + }, + { + "acc": 0.66160536, + "epoch": 1.7127092846270928, + "grad_norm": 5.21875, + "learning_rate": 5.535970520040629e-07, + "loss": 1.52424345, + "memory(GiB)": 128.51, + "step": 67515, + "train_speed(iter/s)": 1.636443 + }, + { + "acc": 0.66720095, + "epoch": 1.7128361237950278, + "grad_norm": 4.65625, + "learning_rate": 5.53117549131763e-07, + "loss": 1.54493313, + "memory(GiB)": 128.51, + "step": 67520, + "train_speed(iter/s)": 1.636456 + }, + { + "acc": 0.66251659, + "epoch": 1.7129629629629628, + "grad_norm": 6.1875, + "learning_rate": 5.526382418532089e-07, + "loss": 1.56596413, + "memory(GiB)": 128.51, + "step": 67525, + "train_speed(iter/s)": 1.636469 + }, + { + "acc": 0.66081324, + "epoch": 1.713089802130898, + "grad_norm": 7.59375, + "learning_rate": 5.521591301894813e-07, + "loss": 1.58382988, + "memory(GiB)": 128.51, + "step": 67530, + "train_speed(iter/s)": 1.636482 + }, + { + "acc": 0.66171446, + "epoch": 1.7132166412988332, + "grad_norm": 5.15625, + "learning_rate": 5.516802141616517e-07, + "loss": 1.59539871, + "memory(GiB)": 128.51, + "step": 67535, + "train_speed(iter/s)": 1.636496 + }, + { + "acc": 0.65765657, + "epoch": 1.7133434804667682, + "grad_norm": 4.875, + "learning_rate": 5.512014937907872e-07, + "loss": 1.594205, + "memory(GiB)": 128.51, + "step": 67540, + "train_speed(iter/s)": 1.636508 + }, + { + "acc": 0.65852518, + "epoch": 1.7134703196347032, + "grad_norm": 5.53125, + "learning_rate": 5.507229690979438e-07, + "loss": 1.53196783, + "memory(GiB)": 128.51, + "step": 67545, + "train_speed(iter/s)": 1.63652 + }, + { + "acc": 0.64753685, + "epoch": 1.7135971588026382, + "grad_norm": 6.3125, + "learning_rate": 5.5024464010417e-07, + "loss": 1.63069267, + "memory(GiB)": 128.51, + "step": 67550, + "train_speed(iter/s)": 1.636534 + }, + { + "acc": 0.67058296, + "epoch": 1.7137239979705732, + "grad_norm": 5.9375, + "learning_rate": 5.497665068305014e-07, + "loss": 1.55970478, + "memory(GiB)": 128.51, + "step": 67555, + "train_speed(iter/s)": 1.636547 + }, + { + "acc": 0.64686761, + "epoch": 1.7138508371385084, + "grad_norm": 6.1875, + "learning_rate": 5.492885692979716e-07, + "loss": 1.63346825, + "memory(GiB)": 128.51, + "step": 67560, + "train_speed(iter/s)": 1.636559 + }, + { + "acc": 0.65042729, + "epoch": 1.7139776763064434, + "grad_norm": 6.09375, + "learning_rate": 5.488108275276016e-07, + "loss": 1.55949078, + "memory(GiB)": 128.51, + "step": 67565, + "train_speed(iter/s)": 1.636572 + }, + { + "acc": 0.66530113, + "epoch": 1.7141045154743786, + "grad_norm": 6.125, + "learning_rate": 5.483332815404046e-07, + "loss": 1.57052898, + "memory(GiB)": 128.51, + "step": 67570, + "train_speed(iter/s)": 1.636585 + }, + { + "acc": 0.65633655, + "epoch": 1.7142313546423136, + "grad_norm": 5.5625, + "learning_rate": 5.478559313573855e-07, + "loss": 1.61563721, + "memory(GiB)": 128.51, + "step": 67575, + "train_speed(iter/s)": 1.636597 + }, + { + "acc": 0.66037369, + "epoch": 1.7143581938102486, + "grad_norm": 6.0625, + "learning_rate": 5.473787769995392e-07, + "loss": 1.59547806, + "memory(GiB)": 128.51, + "step": 67580, + "train_speed(iter/s)": 1.63661 + }, + { + "acc": 0.63931317, + "epoch": 1.7144850329781836, + "grad_norm": 5.96875, + "learning_rate": 5.469018184878544e-07, + "loss": 1.63119316, + "memory(GiB)": 128.51, + "step": 67585, + "train_speed(iter/s)": 1.636622 + }, + { + "acc": 0.6508801, + "epoch": 1.7146118721461188, + "grad_norm": 5.1875, + "learning_rate": 5.464250558433088e-07, + "loss": 1.58278198, + "memory(GiB)": 128.51, + "step": 67590, + "train_speed(iter/s)": 1.636635 + }, + { + "acc": 0.64154372, + "epoch": 1.7147387113140538, + "grad_norm": 5.21875, + "learning_rate": 5.459484890868732e-07, + "loss": 1.66356468, + "memory(GiB)": 128.51, + "step": 67595, + "train_speed(iter/s)": 1.636648 + }, + { + "acc": 0.65137215, + "epoch": 1.714865550481989, + "grad_norm": 6.65625, + "learning_rate": 5.454721182395096e-07, + "loss": 1.64805431, + "memory(GiB)": 128.51, + "step": 67600, + "train_speed(iter/s)": 1.63666 + }, + { + "acc": 0.66730328, + "epoch": 1.714992389649924, + "grad_norm": 5.625, + "learning_rate": 5.449959433221697e-07, + "loss": 1.5982955, + "memory(GiB)": 128.51, + "step": 67605, + "train_speed(iter/s)": 1.636672 + }, + { + "acc": 0.65276961, + "epoch": 1.715119228817859, + "grad_norm": 5.28125, + "learning_rate": 5.44519964355798e-07, + "loss": 1.64188347, + "memory(GiB)": 128.51, + "step": 67610, + "train_speed(iter/s)": 1.636684 + }, + { + "acc": 0.64667115, + "epoch": 1.715246067985794, + "grad_norm": 6.3125, + "learning_rate": 5.440441813613312e-07, + "loss": 1.6179575, + "memory(GiB)": 128.51, + "step": 67615, + "train_speed(iter/s)": 1.636698 + }, + { + "acc": 0.65816946, + "epoch": 1.715372907153729, + "grad_norm": 5.4375, + "learning_rate": 5.435685943596953e-07, + "loss": 1.62623596, + "memory(GiB)": 128.51, + "step": 67620, + "train_speed(iter/s)": 1.636709 + }, + { + "acc": 0.66269526, + "epoch": 1.7154997463216641, + "grad_norm": 5.5, + "learning_rate": 5.430932033718083e-07, + "loss": 1.54525394, + "memory(GiB)": 128.51, + "step": 67625, + "train_speed(iter/s)": 1.636721 + }, + { + "acc": 0.66235828, + "epoch": 1.7156265854895993, + "grad_norm": 5.21875, + "learning_rate": 5.426180084185828e-07, + "loss": 1.60276299, + "memory(GiB)": 128.51, + "step": 67630, + "train_speed(iter/s)": 1.636733 + }, + { + "acc": 0.66971216, + "epoch": 1.7157534246575343, + "grad_norm": 7.125, + "learning_rate": 5.421430095209173e-07, + "loss": 1.60329056, + "memory(GiB)": 128.51, + "step": 67635, + "train_speed(iter/s)": 1.636746 + }, + { + "acc": 0.64574385, + "epoch": 1.7158802638254693, + "grad_norm": 5.46875, + "learning_rate": 5.416682066997048e-07, + "loss": 1.62317638, + "memory(GiB)": 128.51, + "step": 67640, + "train_speed(iter/s)": 1.636758 + }, + { + "acc": 0.66928387, + "epoch": 1.7160071029934043, + "grad_norm": 5.375, + "learning_rate": 5.411935999758288e-07, + "loss": 1.60135002, + "memory(GiB)": 128.51, + "step": 67645, + "train_speed(iter/s)": 1.63677 + }, + { + "acc": 0.65422292, + "epoch": 1.7161339421613393, + "grad_norm": 6.9375, + "learning_rate": 5.407191893701674e-07, + "loss": 1.57047758, + "memory(GiB)": 128.51, + "step": 67650, + "train_speed(iter/s)": 1.636783 + }, + { + "acc": 0.65626907, + "epoch": 1.7162607813292745, + "grad_norm": 5.90625, + "learning_rate": 5.402449749035843e-07, + "loss": 1.59983549, + "memory(GiB)": 128.51, + "step": 67655, + "train_speed(iter/s)": 1.636794 + }, + { + "acc": 0.65422726, + "epoch": 1.7163876204972095, + "grad_norm": 6.40625, + "learning_rate": 5.397709565969378e-07, + "loss": 1.56725245, + "memory(GiB)": 128.51, + "step": 67660, + "train_speed(iter/s)": 1.636806 + }, + { + "acc": 0.65677891, + "epoch": 1.7165144596651447, + "grad_norm": 6.8125, + "learning_rate": 5.392971344710785e-07, + "loss": 1.64886169, + "memory(GiB)": 128.51, + "step": 67665, + "train_speed(iter/s)": 1.636818 + }, + { + "acc": 0.66282663, + "epoch": 1.7166412988330797, + "grad_norm": 6.1875, + "learning_rate": 5.388235085468485e-07, + "loss": 1.62454166, + "memory(GiB)": 128.51, + "step": 67670, + "train_speed(iter/s)": 1.63683 + }, + { + "acc": 0.66598711, + "epoch": 1.7167681380010147, + "grad_norm": 5.96875, + "learning_rate": 5.383500788450757e-07, + "loss": 1.5794363, + "memory(GiB)": 128.51, + "step": 67675, + "train_speed(iter/s)": 1.636843 + }, + { + "acc": 0.65941963, + "epoch": 1.7168949771689497, + "grad_norm": 7.0, + "learning_rate": 5.378768453865879e-07, + "loss": 1.64889221, + "memory(GiB)": 128.51, + "step": 67680, + "train_speed(iter/s)": 1.636855 + }, + { + "acc": 0.65301218, + "epoch": 1.7170218163368847, + "grad_norm": 6.8125, + "learning_rate": 5.374038081921978e-07, + "loss": 1.61743774, + "memory(GiB)": 128.51, + "step": 67685, + "train_speed(iter/s)": 1.636866 + }, + { + "acc": 0.65661535, + "epoch": 1.7171486555048199, + "grad_norm": 6.59375, + "learning_rate": 5.369309672827139e-07, + "loss": 1.62625866, + "memory(GiB)": 128.51, + "step": 67690, + "train_speed(iter/s)": 1.636878 + }, + { + "acc": 0.66227503, + "epoch": 1.717275494672755, + "grad_norm": 6.9375, + "learning_rate": 5.364583226789299e-07, + "loss": 1.57511816, + "memory(GiB)": 128.51, + "step": 67695, + "train_speed(iter/s)": 1.636889 + }, + { + "acc": 0.66967993, + "epoch": 1.71740233384069, + "grad_norm": 5.15625, + "learning_rate": 5.359858744016378e-07, + "loss": 1.59977551, + "memory(GiB)": 128.51, + "step": 67700, + "train_speed(iter/s)": 1.636901 + }, + { + "acc": 0.64543953, + "epoch": 1.717529173008625, + "grad_norm": 4.96875, + "learning_rate": 5.355136224716179e-07, + "loss": 1.58033867, + "memory(GiB)": 128.51, + "step": 67705, + "train_speed(iter/s)": 1.636913 + }, + { + "acc": 0.6615799, + "epoch": 1.71765601217656, + "grad_norm": 5.25, + "learning_rate": 5.350415669096409e-07, + "loss": 1.54568233, + "memory(GiB)": 128.51, + "step": 67710, + "train_speed(iter/s)": 1.636925 + }, + { + "acc": 0.66160154, + "epoch": 1.717782851344495, + "grad_norm": 7.1875, + "learning_rate": 5.345697077364708e-07, + "loss": 1.61753101, + "memory(GiB)": 128.51, + "step": 67715, + "train_speed(iter/s)": 1.636938 + }, + { + "acc": 0.63484812, + "epoch": 1.7179096905124303, + "grad_norm": 5.0, + "learning_rate": 5.340980449728617e-07, + "loss": 1.67677536, + "memory(GiB)": 128.51, + "step": 67720, + "train_speed(iter/s)": 1.63695 + }, + { + "acc": 0.66475959, + "epoch": 1.7180365296803652, + "grad_norm": 5.625, + "learning_rate": 5.336265786395589e-07, + "loss": 1.50078735, + "memory(GiB)": 128.51, + "step": 67725, + "train_speed(iter/s)": 1.636962 + }, + { + "acc": 0.66264863, + "epoch": 1.7181633688483005, + "grad_norm": 6.78125, + "learning_rate": 5.331553087573005e-07, + "loss": 1.58849535, + "memory(GiB)": 128.51, + "step": 67730, + "train_speed(iter/s)": 1.636974 + }, + { + "acc": 0.6571393, + "epoch": 1.7182902080162354, + "grad_norm": 5.03125, + "learning_rate": 5.326842353468148e-07, + "loss": 1.64206944, + "memory(GiB)": 128.51, + "step": 67735, + "train_speed(iter/s)": 1.636986 + }, + { + "acc": 0.66942153, + "epoch": 1.7184170471841704, + "grad_norm": 7.3125, + "learning_rate": 5.322133584288214e-07, + "loss": 1.55967379, + "memory(GiB)": 128.51, + "step": 67740, + "train_speed(iter/s)": 1.636999 + }, + { + "acc": 0.65423899, + "epoch": 1.7185438863521054, + "grad_norm": 5.9375, + "learning_rate": 5.317426780240314e-07, + "loss": 1.59145966, + "memory(GiB)": 128.51, + "step": 67745, + "train_speed(iter/s)": 1.637011 + }, + { + "acc": 0.66266003, + "epoch": 1.7186707255200406, + "grad_norm": 5.21875, + "learning_rate": 5.312721941531479e-07, + "loss": 1.57306519, + "memory(GiB)": 128.51, + "step": 67750, + "train_speed(iter/s)": 1.637023 + }, + { + "acc": 0.65121617, + "epoch": 1.7187975646879756, + "grad_norm": 5.96875, + "learning_rate": 5.308019068368647e-07, + "loss": 1.62663727, + "memory(GiB)": 128.51, + "step": 67755, + "train_speed(iter/s)": 1.637035 + }, + { + "acc": 0.64435482, + "epoch": 1.7189244038559108, + "grad_norm": 5.75, + "learning_rate": 5.303318160958675e-07, + "loss": 1.65798759, + "memory(GiB)": 128.51, + "step": 67760, + "train_speed(iter/s)": 1.637047 + }, + { + "acc": 0.66324158, + "epoch": 1.7190512430238458, + "grad_norm": 6.3125, + "learning_rate": 5.298619219508317e-07, + "loss": 1.56744471, + "memory(GiB)": 128.51, + "step": 67765, + "train_speed(iter/s)": 1.637058 + }, + { + "acc": 0.64399962, + "epoch": 1.7191780821917808, + "grad_norm": 5.15625, + "learning_rate": 5.293922244224275e-07, + "loss": 1.64972839, + "memory(GiB)": 128.51, + "step": 67770, + "train_speed(iter/s)": 1.63707 + }, + { + "acc": 0.65744452, + "epoch": 1.7193049213597158, + "grad_norm": 5.8125, + "learning_rate": 5.289227235313133e-07, + "loss": 1.66066113, + "memory(GiB)": 128.51, + "step": 67775, + "train_speed(iter/s)": 1.637082 + }, + { + "acc": 0.65435376, + "epoch": 1.7194317605276508, + "grad_norm": 7.375, + "learning_rate": 5.284534192981389e-07, + "loss": 1.64509163, + "memory(GiB)": 128.51, + "step": 67780, + "train_speed(iter/s)": 1.637094 + }, + { + "acc": 0.65661602, + "epoch": 1.719558599695586, + "grad_norm": 6.125, + "learning_rate": 5.279843117435463e-07, + "loss": 1.5995266, + "memory(GiB)": 128.51, + "step": 67785, + "train_speed(iter/s)": 1.637106 + }, + { + "acc": 0.66487389, + "epoch": 1.7196854388635212, + "grad_norm": 7.375, + "learning_rate": 5.275154008881716e-07, + "loss": 1.56443815, + "memory(GiB)": 128.51, + "step": 67790, + "train_speed(iter/s)": 1.637119 + }, + { + "acc": 0.65768166, + "epoch": 1.7198122780314562, + "grad_norm": 6.25, + "learning_rate": 5.270466867526369e-07, + "loss": 1.59378281, + "memory(GiB)": 128.51, + "step": 67795, + "train_speed(iter/s)": 1.637132 + }, + { + "acc": 0.66073885, + "epoch": 1.7199391171993912, + "grad_norm": 5.71875, + "learning_rate": 5.265781693575578e-07, + "loss": 1.60156002, + "memory(GiB)": 128.51, + "step": 67800, + "train_speed(iter/s)": 1.637143 + }, + { + "acc": 0.64835563, + "epoch": 1.7200659563673262, + "grad_norm": 5.3125, + "learning_rate": 5.261098487235449e-07, + "loss": 1.59911194, + "memory(GiB)": 128.51, + "step": 67805, + "train_speed(iter/s)": 1.637155 + }, + { + "acc": 0.65387764, + "epoch": 1.7201927955352612, + "grad_norm": 5.75, + "learning_rate": 5.256417248711959e-07, + "loss": 1.60365086, + "memory(GiB)": 128.51, + "step": 67810, + "train_speed(iter/s)": 1.637168 + }, + { + "acc": 0.66301355, + "epoch": 1.7203196347031964, + "grad_norm": 5.28125, + "learning_rate": 5.251737978210997e-07, + "loss": 1.57626514, + "memory(GiB)": 128.51, + "step": 67815, + "train_speed(iter/s)": 1.637179 + }, + { + "acc": 0.63855886, + "epoch": 1.7204464738711314, + "grad_norm": 5.0, + "learning_rate": 5.247060675938376e-07, + "loss": 1.66888084, + "memory(GiB)": 128.51, + "step": 67820, + "train_speed(iter/s)": 1.63719 + }, + { + "acc": 0.6596756, + "epoch": 1.7205733130390666, + "grad_norm": 5.84375, + "learning_rate": 5.242385342099842e-07, + "loss": 1.63618908, + "memory(GiB)": 128.51, + "step": 67825, + "train_speed(iter/s)": 1.637203 + }, + { + "acc": 0.6773294, + "epoch": 1.7207001522070016, + "grad_norm": 6.21875, + "learning_rate": 5.237711976901039e-07, + "loss": 1.53798943, + "memory(GiB)": 128.51, + "step": 67830, + "train_speed(iter/s)": 1.637215 + }, + { + "acc": 0.65338316, + "epoch": 1.7208269913749366, + "grad_norm": 5.59375, + "learning_rate": 5.233040580547489e-07, + "loss": 1.60261517, + "memory(GiB)": 128.51, + "step": 67835, + "train_speed(iter/s)": 1.637226 + }, + { + "acc": 0.64087706, + "epoch": 1.7209538305428715, + "grad_norm": 5.46875, + "learning_rate": 5.228371153244699e-07, + "loss": 1.61807747, + "memory(GiB)": 128.51, + "step": 67840, + "train_speed(iter/s)": 1.637238 + }, + { + "acc": 0.64424658, + "epoch": 1.7210806697108065, + "grad_norm": 7.28125, + "learning_rate": 5.223703695198034e-07, + "loss": 1.6849762, + "memory(GiB)": 128.51, + "step": 67845, + "train_speed(iter/s)": 1.63725 + }, + { + "acc": 0.65990434, + "epoch": 1.7212075088787417, + "grad_norm": 6.625, + "learning_rate": 5.219038206612798e-07, + "loss": 1.57745113, + "memory(GiB)": 128.51, + "step": 67850, + "train_speed(iter/s)": 1.637262 + }, + { + "acc": 0.65871782, + "epoch": 1.721334348046677, + "grad_norm": 8.3125, + "learning_rate": 5.214374687694191e-07, + "loss": 1.61559029, + "memory(GiB)": 128.51, + "step": 67855, + "train_speed(iter/s)": 1.637273 + }, + { + "acc": 0.65611963, + "epoch": 1.721461187214612, + "grad_norm": 7.625, + "learning_rate": 5.209713138647332e-07, + "loss": 1.54990177, + "memory(GiB)": 128.51, + "step": 67860, + "train_speed(iter/s)": 1.637285 + }, + { + "acc": 0.64143906, + "epoch": 1.721588026382547, + "grad_norm": 6.15625, + "learning_rate": 5.20505355967727e-07, + "loss": 1.70312443, + "memory(GiB)": 128.51, + "step": 67865, + "train_speed(iter/s)": 1.637297 + }, + { + "acc": 0.68299685, + "epoch": 1.721714865550482, + "grad_norm": 5.96875, + "learning_rate": 5.200395950988945e-07, + "loss": 1.52948208, + "memory(GiB)": 128.51, + "step": 67870, + "train_speed(iter/s)": 1.637309 + }, + { + "acc": 0.65109282, + "epoch": 1.721841704718417, + "grad_norm": 7.375, + "learning_rate": 5.195740312787229e-07, + "loss": 1.68921089, + "memory(GiB)": 128.51, + "step": 67875, + "train_speed(iter/s)": 1.637322 + }, + { + "acc": 0.6647438, + "epoch": 1.7219685438863521, + "grad_norm": 6.8125, + "learning_rate": 5.191086645276883e-07, + "loss": 1.56310205, + "memory(GiB)": 128.51, + "step": 67880, + "train_speed(iter/s)": 1.637334 + }, + { + "acc": 0.6477344, + "epoch": 1.722095383054287, + "grad_norm": 6.625, + "learning_rate": 5.186434948662605e-07, + "loss": 1.58649569, + "memory(GiB)": 128.51, + "step": 67885, + "train_speed(iter/s)": 1.637346 + }, + { + "acc": 0.64870615, + "epoch": 1.7222222222222223, + "grad_norm": 7.15625, + "learning_rate": 5.181785223148999e-07, + "loss": 1.61224403, + "memory(GiB)": 128.51, + "step": 67890, + "train_speed(iter/s)": 1.637358 + }, + { + "acc": 0.65779119, + "epoch": 1.7223490613901573, + "grad_norm": 5.4375, + "learning_rate": 5.177137468940574e-07, + "loss": 1.54895439, + "memory(GiB)": 128.51, + "step": 67895, + "train_speed(iter/s)": 1.63737 + }, + { + "acc": 0.63718061, + "epoch": 1.7224759005580923, + "grad_norm": 5.625, + "learning_rate": 5.172491686241765e-07, + "loss": 1.74033813, + "memory(GiB)": 128.51, + "step": 67900, + "train_speed(iter/s)": 1.637383 + }, + { + "acc": 0.65718231, + "epoch": 1.7226027397260273, + "grad_norm": 5.78125, + "learning_rate": 5.167847875256904e-07, + "loss": 1.60256824, + "memory(GiB)": 128.51, + "step": 67905, + "train_speed(iter/s)": 1.637395 + }, + { + "acc": 0.65998745, + "epoch": 1.7227295788939625, + "grad_norm": 5.5625, + "learning_rate": 5.163206036190272e-07, + "loss": 1.61081467, + "memory(GiB)": 128.51, + "step": 67910, + "train_speed(iter/s)": 1.637407 + }, + { + "acc": 0.67601619, + "epoch": 1.7228564180618975, + "grad_norm": 6.40625, + "learning_rate": 5.15856616924601e-07, + "loss": 1.51871977, + "memory(GiB)": 128.51, + "step": 67915, + "train_speed(iter/s)": 1.637419 + }, + { + "acc": 0.65791874, + "epoch": 1.7229832572298327, + "grad_norm": 5.9375, + "learning_rate": 5.153928274628211e-07, + "loss": 1.61719761, + "memory(GiB)": 128.51, + "step": 67920, + "train_speed(iter/s)": 1.637431 + }, + { + "acc": 0.65185332, + "epoch": 1.7231100963977677, + "grad_norm": 7.03125, + "learning_rate": 5.149292352540857e-07, + "loss": 1.65299034, + "memory(GiB)": 128.51, + "step": 67925, + "train_speed(iter/s)": 1.637443 + }, + { + "acc": 0.66740294, + "epoch": 1.7232369355657027, + "grad_norm": 6.4375, + "learning_rate": 5.144658403187896e-07, + "loss": 1.60723743, + "memory(GiB)": 128.51, + "step": 67930, + "train_speed(iter/s)": 1.637456 + }, + { + "acc": 0.64974055, + "epoch": 1.7233637747336377, + "grad_norm": 5.40625, + "learning_rate": 5.14002642677311e-07, + "loss": 1.59239941, + "memory(GiB)": 128.51, + "step": 67935, + "train_speed(iter/s)": 1.637469 + }, + { + "acc": 0.64810677, + "epoch": 1.7234906139015727, + "grad_norm": 5.53125, + "learning_rate": 5.135396423500233e-07, + "loss": 1.60865936, + "memory(GiB)": 128.51, + "step": 67940, + "train_speed(iter/s)": 1.637481 + }, + { + "acc": 0.63797407, + "epoch": 1.7236174530695079, + "grad_norm": 5.03125, + "learning_rate": 5.130768393572943e-07, + "loss": 1.69640751, + "memory(GiB)": 128.51, + "step": 67945, + "train_speed(iter/s)": 1.637493 + }, + { + "acc": 0.64058113, + "epoch": 1.723744292237443, + "grad_norm": 5.25, + "learning_rate": 5.126142337194795e-07, + "loss": 1.64647846, + "memory(GiB)": 128.51, + "step": 67950, + "train_speed(iter/s)": 1.637505 + }, + { + "acc": 0.67002277, + "epoch": 1.723871131405378, + "grad_norm": 6.625, + "learning_rate": 5.121518254569241e-07, + "loss": 1.50479069, + "memory(GiB)": 128.51, + "step": 67955, + "train_speed(iter/s)": 1.637517 + }, + { + "acc": 0.66783686, + "epoch": 1.723997970573313, + "grad_norm": 5.0625, + "learning_rate": 5.116896145899675e-07, + "loss": 1.60261726, + "memory(GiB)": 128.51, + "step": 67960, + "train_speed(iter/s)": 1.63753 + }, + { + "acc": 0.64873695, + "epoch": 1.724124809741248, + "grad_norm": 5.65625, + "learning_rate": 5.112276011389416e-07, + "loss": 1.66556511, + "memory(GiB)": 128.51, + "step": 67965, + "train_speed(iter/s)": 1.63754 + }, + { + "acc": 0.64816608, + "epoch": 1.724251648909183, + "grad_norm": 5.875, + "learning_rate": 5.107657851241671e-07, + "loss": 1.62191849, + "memory(GiB)": 128.51, + "step": 67970, + "train_speed(iter/s)": 1.637552 + }, + { + "acc": 0.67163992, + "epoch": 1.7243784880771182, + "grad_norm": 6.84375, + "learning_rate": 5.103041665659553e-07, + "loss": 1.53661757, + "memory(GiB)": 128.51, + "step": 67975, + "train_speed(iter/s)": 1.637564 + }, + { + "acc": 0.65030098, + "epoch": 1.7245053272450532, + "grad_norm": 5.0625, + "learning_rate": 5.098427454846116e-07, + "loss": 1.63800507, + "memory(GiB)": 128.51, + "step": 67980, + "train_speed(iter/s)": 1.637576 + }, + { + "acc": 0.67146072, + "epoch": 1.7246321664129884, + "grad_norm": 6.09375, + "learning_rate": 5.093815219004311e-07, + "loss": 1.53376112, + "memory(GiB)": 128.51, + "step": 67985, + "train_speed(iter/s)": 1.637589 + }, + { + "acc": 0.65836477, + "epoch": 1.7247590055809234, + "grad_norm": 5.65625, + "learning_rate": 5.089204958337018e-07, + "loss": 1.5691987, + "memory(GiB)": 128.51, + "step": 67990, + "train_speed(iter/s)": 1.637602 + }, + { + "acc": 0.63850951, + "epoch": 1.7248858447488584, + "grad_norm": 5.5, + "learning_rate": 5.084596673046982e-07, + "loss": 1.68628883, + "memory(GiB)": 128.51, + "step": 67995, + "train_speed(iter/s)": 1.637614 + }, + { + "acc": 0.66782904, + "epoch": 1.7250126839167934, + "grad_norm": 6.25, + "learning_rate": 5.079990363336928e-07, + "loss": 1.56277695, + "memory(GiB)": 128.51, + "step": 68000, + "train_speed(iter/s)": 1.637626 + }, + { + "epoch": 1.7250126839167934, + "eval_acc": 0.646264333611078, + "eval_loss": 1.5730524063110352, + "eval_runtime": 58.6761, + "eval_samples_per_second": 108.562, + "eval_steps_per_second": 27.149, + "step": 68000 + } + ], + "logging_steps": 5, + "max_steps": 78840, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3396563922916475e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}