{ "best_metric": 0.96944082, "best_model_checkpoint": "/home/ubuntu/swift2/output2/qwen2_5-32b/v0-20250210-012932/checkpoint-62000", "epoch": 1.5728056823947236, "eval_steps": 1000, "global_step": 62000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.72258151, "epoch": 2.536783358701167e-05, "grad_norm": 2.78125, "learning_rate": 2.536783358701167e-09, "loss": 1.22116041, "memory(GiB)": 367.04, "step": 1, "train_speed(iter/s)": 0.032799 }, { "acc": 0.69900775, "epoch": 0.00012683916793505834, "grad_norm": 2.765625, "learning_rate": 1.2683916793505834e-08, "loss": 1.26773286, "memory(GiB)": 367.13, "step": 5, "train_speed(iter/s)": 0.10514 }, { "acc": 0.68508959, "epoch": 0.0002536783358701167, "grad_norm": 2.46875, "learning_rate": 2.536783358701167e-08, "loss": 1.35106306, "memory(GiB)": 367.13, "step": 10, "train_speed(iter/s)": 0.147598 }, { "acc": 0.66965218, "epoch": 0.000380517503805175, "grad_norm": 2.75, "learning_rate": 3.80517503805175e-08, "loss": 1.43235245, "memory(GiB)": 368.59, "step": 15, "train_speed(iter/s)": 0.170096 }, { "acc": 0.68901067, "epoch": 0.0005073566717402334, "grad_norm": 2.75, "learning_rate": 5.073566717402334e-08, "loss": 1.31296177, "memory(GiB)": 368.59, "step": 20, "train_speed(iter/s)": 0.180139 }, { "acc": 0.67792463, "epoch": 0.0006341958396752917, "grad_norm": 2.671875, "learning_rate": 6.341958396752917e-08, "loss": 1.37621107, "memory(GiB)": 368.59, "step": 25, "train_speed(iter/s)": 0.191405 }, { "acc": 0.68527193, "epoch": 0.00076103500761035, "grad_norm": 3.546875, "learning_rate": 7.6103500761035e-08, "loss": 1.35504694, "memory(GiB)": 368.59, "step": 30, "train_speed(iter/s)": 0.197893 }, { "acc": 0.68640947, "epoch": 0.0008878741755454084, "grad_norm": 2.65625, "learning_rate": 8.878741755454084e-08, "loss": 1.38612432, "memory(GiB)": 368.59, "step": 35, "train_speed(iter/s)": 0.20333 }, { "acc": 0.69748793, "epoch": 0.0010147133434804667, "grad_norm": 2.890625, "learning_rate": 1.0147133434804667e-07, "loss": 1.32476912, "memory(GiB)": 368.59, "step": 40, "train_speed(iter/s)": 0.205729 }, { "acc": 0.69311705, "epoch": 0.001141552511415525, "grad_norm": 2.84375, "learning_rate": 1.1415525114155251e-07, "loss": 1.34483833, "memory(GiB)": 368.59, "step": 45, "train_speed(iter/s)": 0.209813 }, { "acc": 0.69359598, "epoch": 0.0012683916793505834, "grad_norm": 2.65625, "learning_rate": 1.2683916793505834e-07, "loss": 1.31629105, "memory(GiB)": 368.59, "step": 50, "train_speed(iter/s)": 0.21271 }, { "acc": 0.6894146, "epoch": 0.0013952308472856417, "grad_norm": 2.734375, "learning_rate": 1.3952308472856418e-07, "loss": 1.44201794, "memory(GiB)": 368.59, "step": 55, "train_speed(iter/s)": 0.215115 }, { "acc": 0.66725492, "epoch": 0.0015220700152207, "grad_norm": 2.9375, "learning_rate": 1.5220700152207e-07, "loss": 1.40911312, "memory(GiB)": 368.59, "step": 60, "train_speed(iter/s)": 0.216591 }, { "acc": 0.68062468, "epoch": 0.0016489091831557584, "grad_norm": 2.75, "learning_rate": 1.6489091831557585e-07, "loss": 1.37944727, "memory(GiB)": 368.59, "step": 65, "train_speed(iter/s)": 0.2181 }, { "acc": 0.69591818, "epoch": 0.0017757483510908167, "grad_norm": 2.578125, "learning_rate": 1.7757483510908168e-07, "loss": 1.34101734, "memory(GiB)": 368.59, "step": 70, "train_speed(iter/s)": 0.220332 }, { "acc": 0.67872515, "epoch": 0.001902587519025875, "grad_norm": 2.53125, "learning_rate": 1.9025875190258752e-07, "loss": 1.37323837, "memory(GiB)": 368.59, "step": 75, "train_speed(iter/s)": 0.220572 }, { "acc": 0.69941545, "epoch": 0.0020294266869609334, "grad_norm": 2.671875, "learning_rate": 2.0294266869609335e-07, "loss": 1.27727337, "memory(GiB)": 368.59, "step": 80, "train_speed(iter/s)": 0.221093 }, { "acc": 0.67554841, "epoch": 0.0021562658548959918, "grad_norm": 2.796875, "learning_rate": 2.1562658548959918e-07, "loss": 1.43144722, "memory(GiB)": 368.61, "step": 85, "train_speed(iter/s)": 0.222072 }, { "acc": 0.68067541, "epoch": 0.00228310502283105, "grad_norm": 2.703125, "learning_rate": 2.2831050228310502e-07, "loss": 1.33494492, "memory(GiB)": 368.61, "step": 90, "train_speed(iter/s)": 0.223124 }, { "acc": 0.67702923, "epoch": 0.0024099441907661084, "grad_norm": 2.5625, "learning_rate": 2.409944190766109e-07, "loss": 1.454076, "memory(GiB)": 368.61, "step": 95, "train_speed(iter/s)": 0.223623 }, { "acc": 0.68957691, "epoch": 0.0025367833587011668, "grad_norm": 2.671875, "learning_rate": 2.536783358701167e-07, "loss": 1.33546581, "memory(GiB)": 368.61, "step": 100, "train_speed(iter/s)": 0.224214 }, { "acc": 0.67355747, "epoch": 0.002663622526636225, "grad_norm": 2.5625, "learning_rate": 2.6636225266362255e-07, "loss": 1.37022858, "memory(GiB)": 368.61, "step": 105, "train_speed(iter/s)": 0.225483 }, { "acc": 0.70293808, "epoch": 0.0027904616945712835, "grad_norm": 2.125, "learning_rate": 2.7904616945712836e-07, "loss": 1.29456387, "memory(GiB)": 368.61, "step": 110, "train_speed(iter/s)": 0.225784 }, { "acc": 0.69164877, "epoch": 0.002917300862506342, "grad_norm": 2.625, "learning_rate": 2.917300862506342e-07, "loss": 1.38045902, "memory(GiB)": 368.61, "step": 115, "train_speed(iter/s)": 0.22694 }, { "acc": 0.68978939, "epoch": 0.0030441400304414, "grad_norm": 2.421875, "learning_rate": 3.0441400304414e-07, "loss": 1.34489698, "memory(GiB)": 368.61, "step": 120, "train_speed(iter/s)": 0.226685 }, { "acc": 0.6820365, "epoch": 0.0031709791983764585, "grad_norm": 2.390625, "learning_rate": 3.170979198376459e-07, "loss": 1.28958292, "memory(GiB)": 368.61, "step": 125, "train_speed(iter/s)": 0.226925 }, { "acc": 0.6815877, "epoch": 0.003297818366311517, "grad_norm": 2.78125, "learning_rate": 3.297818366311517e-07, "loss": 1.36315727, "memory(GiB)": 368.61, "step": 130, "train_speed(iter/s)": 0.227095 }, { "acc": 0.69810781, "epoch": 0.003424657534246575, "grad_norm": 2.46875, "learning_rate": 3.4246575342465755e-07, "loss": 1.29944029, "memory(GiB)": 368.61, "step": 135, "train_speed(iter/s)": 0.227555 }, { "acc": 0.68844109, "epoch": 0.0035514967021816335, "grad_norm": 3.171875, "learning_rate": 3.5514967021816336e-07, "loss": 1.32682533, "memory(GiB)": 368.61, "step": 140, "train_speed(iter/s)": 0.22823 }, { "acc": 0.68884706, "epoch": 0.003678335870116692, "grad_norm": 2.828125, "learning_rate": 3.678335870116692e-07, "loss": 1.33707294, "memory(GiB)": 368.61, "step": 145, "train_speed(iter/s)": 0.22829 }, { "acc": 0.69978833, "epoch": 0.00380517503805175, "grad_norm": 3.15625, "learning_rate": 3.8051750380517503e-07, "loss": 1.32232199, "memory(GiB)": 368.61, "step": 150, "train_speed(iter/s)": 0.228245 }, { "acc": 0.68999271, "epoch": 0.0039320142059868085, "grad_norm": 3.09375, "learning_rate": 3.932014205986809e-07, "loss": 1.32313528, "memory(GiB)": 368.61, "step": 155, "train_speed(iter/s)": 0.229192 }, { "acc": 0.6925375, "epoch": 0.004058853373921867, "grad_norm": 2.484375, "learning_rate": 4.058853373921867e-07, "loss": 1.39565353, "memory(GiB)": 368.61, "step": 160, "train_speed(iter/s)": 0.229485 }, { "acc": 0.68201027, "epoch": 0.004185692541856925, "grad_norm": 3.328125, "learning_rate": 4.1856925418569256e-07, "loss": 1.3672802, "memory(GiB)": 368.61, "step": 165, "train_speed(iter/s)": 0.229366 }, { "acc": 0.67476382, "epoch": 0.0043125317097919835, "grad_norm": 2.578125, "learning_rate": 4.3125317097919837e-07, "loss": 1.37982359, "memory(GiB)": 368.61, "step": 170, "train_speed(iter/s)": 0.229186 }, { "acc": 0.67629614, "epoch": 0.004439370877727042, "grad_norm": 3.203125, "learning_rate": 4.4393708777270423e-07, "loss": 1.38686256, "memory(GiB)": 368.61, "step": 175, "train_speed(iter/s)": 0.229658 }, { "acc": 0.67296748, "epoch": 0.0045662100456621, "grad_norm": 2.90625, "learning_rate": 4.5662100456621004e-07, "loss": 1.46069517, "memory(GiB)": 368.61, "step": 180, "train_speed(iter/s)": 0.230072 }, { "acc": 0.69871254, "epoch": 0.0046930492135971585, "grad_norm": 3.1875, "learning_rate": 4.693049213597159e-07, "loss": 1.32986813, "memory(GiB)": 368.61, "step": 185, "train_speed(iter/s)": 0.230554 }, { "acc": 0.68908038, "epoch": 0.004819888381532217, "grad_norm": 2.203125, "learning_rate": 4.819888381532218e-07, "loss": 1.32280064, "memory(GiB)": 368.61, "step": 190, "train_speed(iter/s)": 0.23047 }, { "acc": 0.69476423, "epoch": 0.004946727549467275, "grad_norm": 2.625, "learning_rate": 4.946727549467275e-07, "loss": 1.38044786, "memory(GiB)": 368.61, "step": 195, "train_speed(iter/s)": 0.23064 }, { "acc": 0.68373308, "epoch": 0.0050735667174023336, "grad_norm": 2.671875, "learning_rate": 5.073566717402334e-07, "loss": 1.36221714, "memory(GiB)": 368.61, "step": 200, "train_speed(iter/s)": 0.230236 }, { "acc": 0.68018503, "epoch": 0.005200405885337392, "grad_norm": 3.078125, "learning_rate": 5.200405885337392e-07, "loss": 1.38976746, "memory(GiB)": 368.61, "step": 205, "train_speed(iter/s)": 0.230125 }, { "acc": 0.68763971, "epoch": 0.00532724505327245, "grad_norm": 2.703125, "learning_rate": 5.327245053272451e-07, "loss": 1.37556639, "memory(GiB)": 368.61, "step": 210, "train_speed(iter/s)": 0.230508 }, { "acc": 0.67785921, "epoch": 0.005454084221207509, "grad_norm": 2.78125, "learning_rate": 5.454084221207509e-07, "loss": 1.36991215, "memory(GiB)": 368.61, "step": 215, "train_speed(iter/s)": 0.230498 }, { "acc": 0.69295216, "epoch": 0.005580923389142567, "grad_norm": 2.515625, "learning_rate": 5.580923389142567e-07, "loss": 1.28559837, "memory(GiB)": 368.61, "step": 220, "train_speed(iter/s)": 0.230417 }, { "acc": 0.69019418, "epoch": 0.005707762557077625, "grad_norm": 2.421875, "learning_rate": 5.707762557077626e-07, "loss": 1.29329929, "memory(GiB)": 368.61, "step": 225, "train_speed(iter/s)": 0.230319 }, { "acc": 0.68573942, "epoch": 0.005834601725012684, "grad_norm": 2.5625, "learning_rate": 5.834601725012684e-07, "loss": 1.36514225, "memory(GiB)": 368.61, "step": 230, "train_speed(iter/s)": 0.230371 }, { "acc": 0.69488649, "epoch": 0.005961440892947742, "grad_norm": 2.328125, "learning_rate": 5.961440892947743e-07, "loss": 1.29173689, "memory(GiB)": 368.61, "step": 235, "train_speed(iter/s)": 0.23011 }, { "acc": 0.67788444, "epoch": 0.0060882800608828, "grad_norm": 2.578125, "learning_rate": 6.0882800608828e-07, "loss": 1.40850382, "memory(GiB)": 368.61, "step": 240, "train_speed(iter/s)": 0.230218 }, { "acc": 0.69308128, "epoch": 0.006215119228817859, "grad_norm": 2.78125, "learning_rate": 6.215119228817859e-07, "loss": 1.32065563, "memory(GiB)": 368.61, "step": 245, "train_speed(iter/s)": 0.230205 }, { "acc": 0.6787281, "epoch": 0.006341958396752917, "grad_norm": 2.953125, "learning_rate": 6.341958396752918e-07, "loss": 1.44482059, "memory(GiB)": 368.61, "step": 250, "train_speed(iter/s)": 0.230331 }, { "acc": 0.66357412, "epoch": 0.006468797564687975, "grad_norm": 2.59375, "learning_rate": 6.468797564687976e-07, "loss": 1.41406517, "memory(GiB)": 368.61, "step": 255, "train_speed(iter/s)": 0.230594 }, { "acc": 0.6854682, "epoch": 0.006595636732623034, "grad_norm": 2.828125, "learning_rate": 6.595636732623034e-07, "loss": 1.34347534, "memory(GiB)": 368.61, "step": 260, "train_speed(iter/s)": 0.230864 }, { "acc": 0.69844513, "epoch": 0.006722475900558092, "grad_norm": 2.421875, "learning_rate": 6.722475900558092e-07, "loss": 1.31414871, "memory(GiB)": 368.61, "step": 265, "train_speed(iter/s)": 0.230857 }, { "acc": 0.69269924, "epoch": 0.00684931506849315, "grad_norm": 3.15625, "learning_rate": 6.849315068493151e-07, "loss": 1.31900482, "memory(GiB)": 368.61, "step": 270, "train_speed(iter/s)": 0.230919 }, { "acc": 0.68208141, "epoch": 0.006976154236428209, "grad_norm": 2.75, "learning_rate": 6.97615423642821e-07, "loss": 1.3514082, "memory(GiB)": 368.61, "step": 275, "train_speed(iter/s)": 0.231061 }, { "acc": 0.68196726, "epoch": 0.007102993404363267, "grad_norm": 2.6875, "learning_rate": 7.102993404363267e-07, "loss": 1.38447351, "memory(GiB)": 368.61, "step": 280, "train_speed(iter/s)": 0.231185 }, { "acc": 0.69876342, "epoch": 0.007229832572298325, "grad_norm": 2.78125, "learning_rate": 7.229832572298326e-07, "loss": 1.28989477, "memory(GiB)": 368.61, "step": 285, "train_speed(iter/s)": 0.231051 }, { "acc": 0.70783377, "epoch": 0.007356671740233384, "grad_norm": 2.984375, "learning_rate": 7.356671740233384e-07, "loss": 1.31450615, "memory(GiB)": 368.61, "step": 290, "train_speed(iter/s)": 0.231198 }, { "acc": 0.68970871, "epoch": 0.007483510908168442, "grad_norm": 2.796875, "learning_rate": 7.483510908168443e-07, "loss": 1.34218788, "memory(GiB)": 368.61, "step": 295, "train_speed(iter/s)": 0.231607 }, { "acc": 0.69317012, "epoch": 0.0076103500761035, "grad_norm": 2.4375, "learning_rate": 7.610350076103501e-07, "loss": 1.30271292, "memory(GiB)": 368.61, "step": 300, "train_speed(iter/s)": 0.231149 }, { "acc": 0.69178343, "epoch": 0.007737189244038559, "grad_norm": 3.078125, "learning_rate": 7.737189244038559e-07, "loss": 1.36446228, "memory(GiB)": 368.61, "step": 305, "train_speed(iter/s)": 0.231174 }, { "acc": 0.69743881, "epoch": 0.007864028411973617, "grad_norm": 2.4375, "learning_rate": 7.864028411973618e-07, "loss": 1.26317663, "memory(GiB)": 368.61, "step": 310, "train_speed(iter/s)": 0.231271 }, { "acc": 0.68340206, "epoch": 0.007990867579908675, "grad_norm": 2.703125, "learning_rate": 7.990867579908676e-07, "loss": 1.39926243, "memory(GiB)": 368.61, "step": 315, "train_speed(iter/s)": 0.231268 }, { "acc": 0.69635286, "epoch": 0.008117706747843734, "grad_norm": 2.75, "learning_rate": 8.117706747843734e-07, "loss": 1.31512003, "memory(GiB)": 368.61, "step": 320, "train_speed(iter/s)": 0.231108 }, { "acc": 0.66709213, "epoch": 0.008244545915778792, "grad_norm": 2.65625, "learning_rate": 8.244545915778793e-07, "loss": 1.39215488, "memory(GiB)": 368.61, "step": 325, "train_speed(iter/s)": 0.230821 }, { "acc": 0.67933731, "epoch": 0.00837138508371385, "grad_norm": 2.671875, "learning_rate": 8.371385083713851e-07, "loss": 1.38740253, "memory(GiB)": 368.61, "step": 330, "train_speed(iter/s)": 0.231051 }, { "acc": 0.68233171, "epoch": 0.008498224251648909, "grad_norm": 3.265625, "learning_rate": 8.49822425164891e-07, "loss": 1.36244974, "memory(GiB)": 368.61, "step": 335, "train_speed(iter/s)": 0.231344 }, { "acc": 0.69122267, "epoch": 0.008625063419583967, "grad_norm": 2.4375, "learning_rate": 8.625063419583967e-07, "loss": 1.33053207, "memory(GiB)": 368.61, "step": 340, "train_speed(iter/s)": 0.231408 }, { "acc": 0.68220596, "epoch": 0.008751902587519025, "grad_norm": 2.4375, "learning_rate": 8.751902587519026e-07, "loss": 1.36557808, "memory(GiB)": 368.61, "step": 345, "train_speed(iter/s)": 0.231372 }, { "acc": 0.68177319, "epoch": 0.008878741755454084, "grad_norm": 2.6875, "learning_rate": 8.878741755454085e-07, "loss": 1.3370615, "memory(GiB)": 368.61, "step": 350, "train_speed(iter/s)": 0.231153 }, { "acc": 0.6851028, "epoch": 0.009005580923389142, "grad_norm": 2.890625, "learning_rate": 9.005580923389143e-07, "loss": 1.35413151, "memory(GiB)": 368.61, "step": 355, "train_speed(iter/s)": 0.231207 }, { "acc": 0.69210343, "epoch": 0.0091324200913242, "grad_norm": 2.65625, "learning_rate": 9.132420091324201e-07, "loss": 1.26611891, "memory(GiB)": 368.61, "step": 360, "train_speed(iter/s)": 0.231047 }, { "acc": 0.6942359, "epoch": 0.009259259259259259, "grad_norm": 2.90625, "learning_rate": 9.259259259259259e-07, "loss": 1.33149929, "memory(GiB)": 368.61, "step": 365, "train_speed(iter/s)": 0.23125 }, { "acc": 0.68027267, "epoch": 0.009386098427194317, "grad_norm": 3.015625, "learning_rate": 9.386098427194318e-07, "loss": 1.37637234, "memory(GiB)": 368.61, "step": 370, "train_speed(iter/s)": 0.231551 }, { "acc": 0.68231955, "epoch": 0.009512937595129375, "grad_norm": 2.703125, "learning_rate": 9.512937595129377e-07, "loss": 1.41887856, "memory(GiB)": 368.61, "step": 375, "train_speed(iter/s)": 0.231681 }, { "acc": 0.69369812, "epoch": 0.009639776763064434, "grad_norm": 2.71875, "learning_rate": 9.639776763064435e-07, "loss": 1.31884565, "memory(GiB)": 368.61, "step": 380, "train_speed(iter/s)": 0.231412 }, { "acc": 0.68560605, "epoch": 0.009766615930999492, "grad_norm": 3.375, "learning_rate": 9.766615930999493e-07, "loss": 1.32670078, "memory(GiB)": 368.61, "step": 385, "train_speed(iter/s)": 0.231154 }, { "acc": 0.68793001, "epoch": 0.00989345509893455, "grad_norm": 2.5, "learning_rate": 9.89345509893455e-07, "loss": 1.3339716, "memory(GiB)": 368.61, "step": 390, "train_speed(iter/s)": 0.231351 }, { "acc": 0.69094672, "epoch": 0.010020294266869609, "grad_norm": 2.734375, "learning_rate": 1.002029426686961e-06, "loss": 1.38724575, "memory(GiB)": 368.61, "step": 395, "train_speed(iter/s)": 0.231389 }, { "acc": 0.68088579, "epoch": 0.010147133434804667, "grad_norm": 2.828125, "learning_rate": 1.0147133434804667e-06, "loss": 1.3817338, "memory(GiB)": 368.61, "step": 400, "train_speed(iter/s)": 0.231624 }, { "acc": 0.68227673, "epoch": 0.010273972602739725, "grad_norm": 2.640625, "learning_rate": 1.0273972602739727e-06, "loss": 1.36219254, "memory(GiB)": 368.61, "step": 405, "train_speed(iter/s)": 0.231589 }, { "acc": 0.68181658, "epoch": 0.010400811770674784, "grad_norm": 3.46875, "learning_rate": 1.0400811770674785e-06, "loss": 1.39222164, "memory(GiB)": 368.61, "step": 410, "train_speed(iter/s)": 0.231267 }, { "acc": 0.70052872, "epoch": 0.010527650938609842, "grad_norm": 2.734375, "learning_rate": 1.0527650938609842e-06, "loss": 1.29408321, "memory(GiB)": 368.61, "step": 415, "train_speed(iter/s)": 0.23137 }, { "acc": 0.69214716, "epoch": 0.0106544901065449, "grad_norm": 3.328125, "learning_rate": 1.0654490106544902e-06, "loss": 1.33794737, "memory(GiB)": 368.61, "step": 420, "train_speed(iter/s)": 0.231206 }, { "acc": 0.68946495, "epoch": 0.010781329274479959, "grad_norm": 2.484375, "learning_rate": 1.078132927447996e-06, "loss": 1.30758438, "memory(GiB)": 368.61, "step": 425, "train_speed(iter/s)": 0.231202 }, { "acc": 0.68787594, "epoch": 0.010908168442415017, "grad_norm": 2.71875, "learning_rate": 1.0908168442415017e-06, "loss": 1.34961596, "memory(GiB)": 368.61, "step": 430, "train_speed(iter/s)": 0.231254 }, { "acc": 0.67905159, "epoch": 0.011035007610350075, "grad_norm": 2.828125, "learning_rate": 1.1035007610350077e-06, "loss": 1.36250381, "memory(GiB)": 368.61, "step": 435, "train_speed(iter/s)": 0.231378 }, { "acc": 0.69467573, "epoch": 0.011161846778285134, "grad_norm": 2.734375, "learning_rate": 1.1161846778285134e-06, "loss": 1.29877462, "memory(GiB)": 368.61, "step": 440, "train_speed(iter/s)": 0.231617 }, { "acc": 0.69808369, "epoch": 0.011288685946220192, "grad_norm": 2.375, "learning_rate": 1.1288685946220194e-06, "loss": 1.29690628, "memory(GiB)": 368.61, "step": 445, "train_speed(iter/s)": 0.23173 }, { "acc": 0.68661213, "epoch": 0.01141552511415525, "grad_norm": 2.53125, "learning_rate": 1.1415525114155251e-06, "loss": 1.33353872, "memory(GiB)": 368.61, "step": 450, "train_speed(iter/s)": 0.231726 }, { "acc": 0.68887868, "epoch": 0.011542364282090309, "grad_norm": 2.734375, "learning_rate": 1.154236428209031e-06, "loss": 1.38428135, "memory(GiB)": 368.61, "step": 455, "train_speed(iter/s)": 0.231923 }, { "acc": 0.69335179, "epoch": 0.011669203450025367, "grad_norm": 2.421875, "learning_rate": 1.1669203450025369e-06, "loss": 1.32404289, "memory(GiB)": 368.61, "step": 460, "train_speed(iter/s)": 0.231716 }, { "acc": 0.68602152, "epoch": 0.011796042617960426, "grad_norm": 2.765625, "learning_rate": 1.1796042617960426e-06, "loss": 1.388307, "memory(GiB)": 368.61, "step": 465, "train_speed(iter/s)": 0.231764 }, { "acc": 0.67668114, "epoch": 0.011922881785895484, "grad_norm": 2.734375, "learning_rate": 1.1922881785895486e-06, "loss": 1.45500956, "memory(GiB)": 368.61, "step": 470, "train_speed(iter/s)": 0.231953 }, { "acc": 0.70207701, "epoch": 0.012049720953830542, "grad_norm": 2.140625, "learning_rate": 1.2049720953830543e-06, "loss": 1.28134022, "memory(GiB)": 368.61, "step": 475, "train_speed(iter/s)": 0.231973 }, { "acc": 0.68534999, "epoch": 0.0121765601217656, "grad_norm": 2.828125, "learning_rate": 1.21765601217656e-06, "loss": 1.34807358, "memory(GiB)": 368.61, "step": 480, "train_speed(iter/s)": 0.231935 }, { "acc": 0.68371849, "epoch": 0.012303399289700659, "grad_norm": 2.03125, "learning_rate": 1.230339928970066e-06, "loss": 1.31797543, "memory(GiB)": 368.61, "step": 485, "train_speed(iter/s)": 0.231951 }, { "acc": 0.70335903, "epoch": 0.012430238457635717, "grad_norm": 2.171875, "learning_rate": 1.2430238457635718e-06, "loss": 1.2871645, "memory(GiB)": 368.61, "step": 490, "train_speed(iter/s)": 0.231958 }, { "acc": 0.69549046, "epoch": 0.012557077625570776, "grad_norm": 3.21875, "learning_rate": 1.2557077625570776e-06, "loss": 1.36881657, "memory(GiB)": 368.61, "step": 495, "train_speed(iter/s)": 0.231973 }, { "acc": 0.68523798, "epoch": 0.012683916793505834, "grad_norm": 3.0, "learning_rate": 1.2683916793505835e-06, "loss": 1.32310028, "memory(GiB)": 368.61, "step": 500, "train_speed(iter/s)": 0.232102 }, { "acc": 0.67850146, "epoch": 0.012810755961440892, "grad_norm": 2.90625, "learning_rate": 1.2810755961440893e-06, "loss": 1.35342245, "memory(GiB)": 368.61, "step": 505, "train_speed(iter/s)": 0.232126 }, { "acc": 0.69367218, "epoch": 0.01293759512937595, "grad_norm": 2.71875, "learning_rate": 1.2937595129375953e-06, "loss": 1.33931713, "memory(GiB)": 368.61, "step": 510, "train_speed(iter/s)": 0.231982 }, { "acc": 0.67943907, "epoch": 0.013064434297311009, "grad_norm": 2.703125, "learning_rate": 1.306443429731101e-06, "loss": 1.40809498, "memory(GiB)": 368.61, "step": 515, "train_speed(iter/s)": 0.231912 }, { "acc": 0.70291443, "epoch": 0.013191273465246067, "grad_norm": 2.0625, "learning_rate": 1.3191273465246068e-06, "loss": 1.27885742, "memory(GiB)": 368.61, "step": 520, "train_speed(iter/s)": 0.231861 }, { "acc": 0.68690996, "epoch": 0.013318112633181126, "grad_norm": 2.734375, "learning_rate": 1.3318112633181127e-06, "loss": 1.30699387, "memory(GiB)": 368.61, "step": 525, "train_speed(iter/s)": 0.231828 }, { "acc": 0.71086907, "epoch": 0.013444951801116184, "grad_norm": 2.640625, "learning_rate": 1.3444951801116185e-06, "loss": 1.25877895, "memory(GiB)": 368.61, "step": 530, "train_speed(iter/s)": 0.231654 }, { "acc": 0.69785409, "epoch": 0.013571790969051242, "grad_norm": 2.65625, "learning_rate": 1.3571790969051243e-06, "loss": 1.27086124, "memory(GiB)": 368.61, "step": 535, "train_speed(iter/s)": 0.231796 }, { "acc": 0.69089527, "epoch": 0.0136986301369863, "grad_norm": 2.609375, "learning_rate": 1.3698630136986302e-06, "loss": 1.37519722, "memory(GiB)": 368.61, "step": 540, "train_speed(iter/s)": 0.231875 }, { "acc": 0.68066568, "epoch": 0.013825469304921359, "grad_norm": 2.5625, "learning_rate": 1.382546930492136e-06, "loss": 1.36353474, "memory(GiB)": 368.61, "step": 545, "train_speed(iter/s)": 0.231994 }, { "acc": 0.6873168, "epoch": 0.013952308472856417, "grad_norm": 3.140625, "learning_rate": 1.395230847285642e-06, "loss": 1.42179489, "memory(GiB)": 368.61, "step": 550, "train_speed(iter/s)": 0.231905 }, { "acc": 0.69332685, "epoch": 0.014079147640791476, "grad_norm": 2.5625, "learning_rate": 1.4079147640791477e-06, "loss": 1.31028576, "memory(GiB)": 368.61, "step": 555, "train_speed(iter/s)": 0.231923 }, { "acc": 0.69818935, "epoch": 0.014205986808726534, "grad_norm": 2.703125, "learning_rate": 1.4205986808726534e-06, "loss": 1.26530418, "memory(GiB)": 368.61, "step": 560, "train_speed(iter/s)": 0.232004 }, { "acc": 0.69409885, "epoch": 0.014332825976661592, "grad_norm": 3.125, "learning_rate": 1.4332825976661594e-06, "loss": 1.34725981, "memory(GiB)": 368.61, "step": 565, "train_speed(iter/s)": 0.232155 }, { "acc": 0.69217014, "epoch": 0.01445966514459665, "grad_norm": 3.40625, "learning_rate": 1.4459665144596652e-06, "loss": 1.29356203, "memory(GiB)": 368.61, "step": 570, "train_speed(iter/s)": 0.232093 }, { "acc": 0.70825119, "epoch": 0.014586504312531709, "grad_norm": 2.65625, "learning_rate": 1.458650431253171e-06, "loss": 1.24919415, "memory(GiB)": 368.61, "step": 575, "train_speed(iter/s)": 0.231636 }, { "acc": 0.68474727, "epoch": 0.014713343480466767, "grad_norm": 2.828125, "learning_rate": 1.4713343480466769e-06, "loss": 1.40724335, "memory(GiB)": 368.61, "step": 580, "train_speed(iter/s)": 0.231736 }, { "acc": 0.68723154, "epoch": 0.014840182648401826, "grad_norm": 2.9375, "learning_rate": 1.4840182648401826e-06, "loss": 1.3849514, "memory(GiB)": 368.61, "step": 585, "train_speed(iter/s)": 0.231968 }, { "acc": 0.67858372, "epoch": 0.014967021816336884, "grad_norm": 2.3125, "learning_rate": 1.4967021816336886e-06, "loss": 1.29665174, "memory(GiB)": 368.61, "step": 590, "train_speed(iter/s)": 0.231872 }, { "acc": 0.68471775, "epoch": 0.015093860984271942, "grad_norm": 2.40625, "learning_rate": 1.5093860984271944e-06, "loss": 1.33779335, "memory(GiB)": 368.61, "step": 595, "train_speed(iter/s)": 0.231938 }, { "acc": 0.70354004, "epoch": 0.015220700152207, "grad_norm": 2.265625, "learning_rate": 1.5220700152207001e-06, "loss": 1.25389671, "memory(GiB)": 368.61, "step": 600, "train_speed(iter/s)": 0.231978 }, { "acc": 0.70415344, "epoch": 0.015347539320142059, "grad_norm": 2.546875, "learning_rate": 1.534753932014206e-06, "loss": 1.26755962, "memory(GiB)": 368.61, "step": 605, "train_speed(iter/s)": 0.232036 }, { "acc": 0.69151945, "epoch": 0.015474378488077117, "grad_norm": 2.3125, "learning_rate": 1.5474378488077118e-06, "loss": 1.31266422, "memory(GiB)": 368.61, "step": 610, "train_speed(iter/s)": 0.232114 }, { "acc": 0.69452524, "epoch": 0.015601217656012176, "grad_norm": 2.65625, "learning_rate": 1.5601217656012176e-06, "loss": 1.2707077, "memory(GiB)": 368.61, "step": 615, "train_speed(iter/s)": 0.232159 }, { "acc": 0.69659638, "epoch": 0.015728056823947234, "grad_norm": 2.3125, "learning_rate": 1.5728056823947236e-06, "loss": 1.26892471, "memory(GiB)": 368.61, "step": 620, "train_speed(iter/s)": 0.232159 }, { "acc": 0.69280224, "epoch": 0.015854895991882292, "grad_norm": 2.421875, "learning_rate": 1.5854895991882293e-06, "loss": 1.33262529, "memory(GiB)": 368.61, "step": 625, "train_speed(iter/s)": 0.23227 }, { "acc": 0.68903265, "epoch": 0.01598173515981735, "grad_norm": 2.609375, "learning_rate": 1.5981735159817353e-06, "loss": 1.33474522, "memory(GiB)": 368.61, "step": 630, "train_speed(iter/s)": 0.232387 }, { "acc": 0.6938283, "epoch": 0.01610857432775241, "grad_norm": 2.8125, "learning_rate": 1.610857432775241e-06, "loss": 1.33078556, "memory(GiB)": 368.61, "step": 635, "train_speed(iter/s)": 0.232515 }, { "acc": 0.68049469, "epoch": 0.016235413495687467, "grad_norm": 2.6875, "learning_rate": 1.6235413495687468e-06, "loss": 1.33125362, "memory(GiB)": 368.61, "step": 640, "train_speed(iter/s)": 0.23272 }, { "acc": 0.69723735, "epoch": 0.016362252663622526, "grad_norm": 2.421875, "learning_rate": 1.6362252663622528e-06, "loss": 1.32692118, "memory(GiB)": 368.61, "step": 645, "train_speed(iter/s)": 0.232594 }, { "acc": 0.68464165, "epoch": 0.016489091831557584, "grad_norm": 2.40625, "learning_rate": 1.6489091831557585e-06, "loss": 1.32219048, "memory(GiB)": 368.61, "step": 650, "train_speed(iter/s)": 0.232618 }, { "acc": 0.68868089, "epoch": 0.016615930999492642, "grad_norm": 2.28125, "learning_rate": 1.6615930999492643e-06, "loss": 1.3427824, "memory(GiB)": 368.61, "step": 655, "train_speed(iter/s)": 0.232623 }, { "acc": 0.68767109, "epoch": 0.0167427701674277, "grad_norm": 2.515625, "learning_rate": 1.6742770167427702e-06, "loss": 1.28951149, "memory(GiB)": 368.61, "step": 660, "train_speed(iter/s)": 0.232751 }, { "acc": 0.69273419, "epoch": 0.01686960933536276, "grad_norm": 2.3125, "learning_rate": 1.686960933536276e-06, "loss": 1.28929291, "memory(GiB)": 368.61, "step": 665, "train_speed(iter/s)": 0.232719 }, { "acc": 0.70647039, "epoch": 0.016996448503297817, "grad_norm": 2.453125, "learning_rate": 1.699644850329782e-06, "loss": 1.30026598, "memory(GiB)": 368.61, "step": 670, "train_speed(iter/s)": 0.232577 }, { "acc": 0.70746546, "epoch": 0.017123287671232876, "grad_norm": 2.703125, "learning_rate": 1.7123287671232877e-06, "loss": 1.26077938, "memory(GiB)": 368.61, "step": 675, "train_speed(iter/s)": 0.232433 }, { "acc": 0.68412161, "epoch": 0.017250126839167934, "grad_norm": 2.578125, "learning_rate": 1.7250126839167935e-06, "loss": 1.37765427, "memory(GiB)": 368.61, "step": 680, "train_speed(iter/s)": 0.232423 }, { "acc": 0.69389043, "epoch": 0.017376966007102992, "grad_norm": 2.421875, "learning_rate": 1.7376966007102994e-06, "loss": 1.31172314, "memory(GiB)": 368.61, "step": 685, "train_speed(iter/s)": 0.232337 }, { "acc": 0.69760876, "epoch": 0.01750380517503805, "grad_norm": 2.328125, "learning_rate": 1.7503805175038052e-06, "loss": 1.33980904, "memory(GiB)": 368.61, "step": 690, "train_speed(iter/s)": 0.232398 }, { "acc": 0.6931869, "epoch": 0.01763064434297311, "grad_norm": 2.953125, "learning_rate": 1.7630644342973112e-06, "loss": 1.28292713, "memory(GiB)": 368.61, "step": 695, "train_speed(iter/s)": 0.232483 }, { "acc": 0.6979476, "epoch": 0.017757483510908167, "grad_norm": 2.296875, "learning_rate": 1.775748351090817e-06, "loss": 1.28128901, "memory(GiB)": 368.61, "step": 700, "train_speed(iter/s)": 0.232508 }, { "acc": 0.68921604, "epoch": 0.017884322678843226, "grad_norm": 2.359375, "learning_rate": 1.7884322678843227e-06, "loss": 1.25999851, "memory(GiB)": 368.61, "step": 705, "train_speed(iter/s)": 0.232592 }, { "acc": 0.68730011, "epoch": 0.018011161846778284, "grad_norm": 2.25, "learning_rate": 1.8011161846778286e-06, "loss": 1.31758442, "memory(GiB)": 368.61, "step": 710, "train_speed(iter/s)": 0.232453 }, { "acc": 0.67609444, "epoch": 0.018138001014713342, "grad_norm": 2.171875, "learning_rate": 1.8138001014713344e-06, "loss": 1.36603365, "memory(GiB)": 368.61, "step": 715, "train_speed(iter/s)": 0.232546 }, { "acc": 0.69120331, "epoch": 0.0182648401826484, "grad_norm": 2.53125, "learning_rate": 1.8264840182648401e-06, "loss": 1.29363375, "memory(GiB)": 368.61, "step": 720, "train_speed(iter/s)": 0.232685 }, { "acc": 0.68683414, "epoch": 0.01839167935058346, "grad_norm": 2.28125, "learning_rate": 1.8391679350583461e-06, "loss": 1.33888683, "memory(GiB)": 368.61, "step": 725, "train_speed(iter/s)": 0.232732 }, { "acc": 0.70235224, "epoch": 0.018518518518518517, "grad_norm": 2.4375, "learning_rate": 1.8518518518518519e-06, "loss": 1.22214575, "memory(GiB)": 368.61, "step": 730, "train_speed(iter/s)": 0.23281 }, { "acc": 0.67859249, "epoch": 0.018645357686453576, "grad_norm": 2.515625, "learning_rate": 1.8645357686453578e-06, "loss": 1.3327776, "memory(GiB)": 368.61, "step": 735, "train_speed(iter/s)": 0.232954 }, { "acc": 0.69706764, "epoch": 0.018772196854388634, "grad_norm": 2.6875, "learning_rate": 1.8772196854388636e-06, "loss": 1.29788561, "memory(GiB)": 368.61, "step": 740, "train_speed(iter/s)": 0.232985 }, { "acc": 0.69531698, "epoch": 0.018899036022323693, "grad_norm": 2.234375, "learning_rate": 1.8899036022323693e-06, "loss": 1.25877705, "memory(GiB)": 368.61, "step": 745, "train_speed(iter/s)": 0.23266 }, { "acc": 0.69724526, "epoch": 0.01902587519025875, "grad_norm": 2.453125, "learning_rate": 1.9025875190258753e-06, "loss": 1.25514536, "memory(GiB)": 368.61, "step": 750, "train_speed(iter/s)": 0.232597 }, { "acc": 0.69890642, "epoch": 0.01915271435819381, "grad_norm": 2.171875, "learning_rate": 1.915271435819381e-06, "loss": 1.31525879, "memory(GiB)": 368.61, "step": 755, "train_speed(iter/s)": 0.232595 }, { "acc": 0.68396273, "epoch": 0.019279553526128868, "grad_norm": 2.296875, "learning_rate": 1.927955352612887e-06, "loss": 1.28517437, "memory(GiB)": 368.61, "step": 760, "train_speed(iter/s)": 0.232557 }, { "acc": 0.7021451, "epoch": 0.019406392694063926, "grad_norm": 2.34375, "learning_rate": 1.9406392694063926e-06, "loss": 1.3149889, "memory(GiB)": 368.61, "step": 765, "train_speed(iter/s)": 0.23257 }, { "acc": 0.70087299, "epoch": 0.019533231861998984, "grad_norm": 2.609375, "learning_rate": 1.9533231861998985e-06, "loss": 1.29243927, "memory(GiB)": 368.61, "step": 770, "train_speed(iter/s)": 0.232723 }, { "acc": 0.69362717, "epoch": 0.019660071029934043, "grad_norm": 2.515625, "learning_rate": 1.9660071029934045e-06, "loss": 1.30669241, "memory(GiB)": 368.61, "step": 775, "train_speed(iter/s)": 0.232539 }, { "acc": 0.6712357, "epoch": 0.0197869101978691, "grad_norm": 2.703125, "learning_rate": 1.97869101978691e-06, "loss": 1.37696171, "memory(GiB)": 368.61, "step": 780, "train_speed(iter/s)": 0.232527 }, { "acc": 0.71143351, "epoch": 0.01991374936580416, "grad_norm": 2.4375, "learning_rate": 1.991374936580416e-06, "loss": 1.19541235, "memory(GiB)": 368.61, "step": 785, "train_speed(iter/s)": 0.232493 }, { "acc": 0.69512777, "epoch": 0.020040588533739218, "grad_norm": 2.6875, "learning_rate": 2.004058853373922e-06, "loss": 1.28759794, "memory(GiB)": 368.61, "step": 790, "train_speed(iter/s)": 0.232531 }, { "acc": 0.69569101, "epoch": 0.020167427701674276, "grad_norm": 2.953125, "learning_rate": 2.016742770167428e-06, "loss": 1.24013748, "memory(GiB)": 368.61, "step": 795, "train_speed(iter/s)": 0.232601 }, { "acc": 0.69176626, "epoch": 0.020294266869609334, "grad_norm": 2.640625, "learning_rate": 2.0294266869609335e-06, "loss": 1.21449461, "memory(GiB)": 368.61, "step": 800, "train_speed(iter/s)": 0.232591 }, { "acc": 0.69038477, "epoch": 0.020421106037544393, "grad_norm": 2.6875, "learning_rate": 2.0421106037544395e-06, "loss": 1.32981243, "memory(GiB)": 368.61, "step": 805, "train_speed(iter/s)": 0.232706 }, { "acc": 0.70778461, "epoch": 0.02054794520547945, "grad_norm": 2.53125, "learning_rate": 2.0547945205479454e-06, "loss": 1.2008213, "memory(GiB)": 368.61, "step": 810, "train_speed(iter/s)": 0.232745 }, { "acc": 0.70905633, "epoch": 0.02067478437341451, "grad_norm": 2.859375, "learning_rate": 2.067478437341451e-06, "loss": 1.31931925, "memory(GiB)": 368.61, "step": 815, "train_speed(iter/s)": 0.232863 }, { "acc": 0.70005217, "epoch": 0.020801623541349568, "grad_norm": 2.578125, "learning_rate": 2.080162354134957e-06, "loss": 1.25738316, "memory(GiB)": 368.61, "step": 820, "train_speed(iter/s)": 0.232752 }, { "acc": 0.67996464, "epoch": 0.020928462709284626, "grad_norm": 2.140625, "learning_rate": 2.092846270928463e-06, "loss": 1.30845404, "memory(GiB)": 368.61, "step": 825, "train_speed(iter/s)": 0.232696 }, { "acc": 0.67891979, "epoch": 0.021055301877219684, "grad_norm": 2.484375, "learning_rate": 2.1055301877219685e-06, "loss": 1.33170471, "memory(GiB)": 368.61, "step": 830, "train_speed(iter/s)": 0.232799 }, { "acc": 0.69740653, "epoch": 0.021182141045154743, "grad_norm": 1.9375, "learning_rate": 2.1182141045154744e-06, "loss": 1.25628376, "memory(GiB)": 368.61, "step": 835, "train_speed(iter/s)": 0.232857 }, { "acc": 0.69880581, "epoch": 0.0213089802130898, "grad_norm": 2.875, "learning_rate": 2.1308980213089804e-06, "loss": 1.22247028, "memory(GiB)": 368.61, "step": 840, "train_speed(iter/s)": 0.232723 }, { "acc": 0.69614782, "epoch": 0.02143581938102486, "grad_norm": 2.234375, "learning_rate": 2.143581938102486e-06, "loss": 1.22408028, "memory(GiB)": 368.61, "step": 845, "train_speed(iter/s)": 0.232721 }, { "acc": 0.71626825, "epoch": 0.021562658548959918, "grad_norm": 2.390625, "learning_rate": 2.156265854895992e-06, "loss": 1.25379143, "memory(GiB)": 368.61, "step": 850, "train_speed(iter/s)": 0.232795 }, { "acc": 0.67569246, "epoch": 0.021689497716894976, "grad_norm": 2.671875, "learning_rate": 2.168949771689498e-06, "loss": 1.32362099, "memory(GiB)": 368.61, "step": 855, "train_speed(iter/s)": 0.23281 }, { "acc": 0.70386605, "epoch": 0.021816336884830034, "grad_norm": 2.40625, "learning_rate": 2.1816336884830034e-06, "loss": 1.28031616, "memory(GiB)": 368.61, "step": 860, "train_speed(iter/s)": 0.23283 }, { "acc": 0.68288746, "epoch": 0.021943176052765093, "grad_norm": 3.03125, "learning_rate": 2.1943176052765094e-06, "loss": 1.26909618, "memory(GiB)": 368.61, "step": 865, "train_speed(iter/s)": 0.23293 }, { "acc": 0.68917279, "epoch": 0.02207001522070015, "grad_norm": 2.71875, "learning_rate": 2.2070015220700153e-06, "loss": 1.33650732, "memory(GiB)": 368.61, "step": 870, "train_speed(iter/s)": 0.233008 }, { "acc": 0.68990235, "epoch": 0.02219685438863521, "grad_norm": 2.75, "learning_rate": 2.2196854388635213e-06, "loss": 1.30084038, "memory(GiB)": 368.61, "step": 875, "train_speed(iter/s)": 0.233088 }, { "acc": 0.70526581, "epoch": 0.022323693556570268, "grad_norm": 3.328125, "learning_rate": 2.232369355657027e-06, "loss": 1.22841501, "memory(GiB)": 368.61, "step": 880, "train_speed(iter/s)": 0.23301 }, { "acc": 0.69007673, "epoch": 0.022450532724505326, "grad_norm": 2.65625, "learning_rate": 2.245053272450533e-06, "loss": 1.34288054, "memory(GiB)": 368.61, "step": 885, "train_speed(iter/s)": 0.232971 }, { "acc": 0.6910737, "epoch": 0.022577371892440384, "grad_norm": 2.4375, "learning_rate": 2.2577371892440388e-06, "loss": 1.28788948, "memory(GiB)": 368.61, "step": 890, "train_speed(iter/s)": 0.233054 }, { "acc": 0.71145663, "epoch": 0.022704211060375443, "grad_norm": 1.9375, "learning_rate": 2.2704211060375443e-06, "loss": 1.26794434, "memory(GiB)": 368.61, "step": 895, "train_speed(iter/s)": 0.233048 }, { "acc": 0.70632811, "epoch": 0.0228310502283105, "grad_norm": 2.59375, "learning_rate": 2.2831050228310503e-06, "loss": 1.29238281, "memory(GiB)": 368.61, "step": 900, "train_speed(iter/s)": 0.233087 }, { "acc": 0.69763489, "epoch": 0.02295788939624556, "grad_norm": 2.34375, "learning_rate": 2.2957889396245563e-06, "loss": 1.26473637, "memory(GiB)": 368.61, "step": 905, "train_speed(iter/s)": 0.23265 }, { "acc": 0.69633074, "epoch": 0.023084728564180618, "grad_norm": 2.4375, "learning_rate": 2.308472856418062e-06, "loss": 1.25594387, "memory(GiB)": 368.61, "step": 910, "train_speed(iter/s)": 0.232726 }, { "acc": 0.7010201, "epoch": 0.023211567732115676, "grad_norm": 1.9921875, "learning_rate": 2.3211567732115678e-06, "loss": 1.23549805, "memory(GiB)": 368.61, "step": 915, "train_speed(iter/s)": 0.232655 }, { "acc": 0.72183642, "epoch": 0.023338406900050734, "grad_norm": 2.03125, "learning_rate": 2.3338406900050737e-06, "loss": 1.18305168, "memory(GiB)": 368.61, "step": 920, "train_speed(iter/s)": 0.232729 }, { "acc": 0.694981, "epoch": 0.023465246067985793, "grad_norm": 2.265625, "learning_rate": 2.3465246067985793e-06, "loss": 1.28907042, "memory(GiB)": 368.61, "step": 925, "train_speed(iter/s)": 0.232799 }, { "acc": 0.70338583, "epoch": 0.02359208523592085, "grad_norm": 3.265625, "learning_rate": 2.3592085235920852e-06, "loss": 1.2647377, "memory(GiB)": 368.61, "step": 930, "train_speed(iter/s)": 0.232942 }, { "acc": 0.70444903, "epoch": 0.02371892440385591, "grad_norm": 2.453125, "learning_rate": 2.371892440385591e-06, "loss": 1.22272148, "memory(GiB)": 368.61, "step": 935, "train_speed(iter/s)": 0.232928 }, { "acc": 0.7029171, "epoch": 0.023845763571790968, "grad_norm": 2.4375, "learning_rate": 2.384576357179097e-06, "loss": 1.24798546, "memory(GiB)": 368.61, "step": 940, "train_speed(iter/s)": 0.23304 }, { "acc": 0.70238094, "epoch": 0.023972602739726026, "grad_norm": 2.5, "learning_rate": 2.3972602739726027e-06, "loss": 1.22255116, "memory(GiB)": 368.61, "step": 945, "train_speed(iter/s)": 0.23314 }, { "acc": 0.68831744, "epoch": 0.024099441907661084, "grad_norm": 2.96875, "learning_rate": 2.4099441907661087e-06, "loss": 1.31269703, "memory(GiB)": 368.61, "step": 950, "train_speed(iter/s)": 0.233221 }, { "acc": 0.71749873, "epoch": 0.024226281075596143, "grad_norm": 2.90625, "learning_rate": 2.4226281075596147e-06, "loss": 1.29138193, "memory(GiB)": 368.61, "step": 955, "train_speed(iter/s)": 0.233207 }, { "acc": 0.69821215, "epoch": 0.0243531202435312, "grad_norm": 3.0, "learning_rate": 2.43531202435312e-06, "loss": 1.29011431, "memory(GiB)": 368.61, "step": 960, "train_speed(iter/s)": 0.233239 }, { "acc": 0.69093399, "epoch": 0.02447995941146626, "grad_norm": 2.421875, "learning_rate": 2.447995941146626e-06, "loss": 1.33248634, "memory(GiB)": 368.61, "step": 965, "train_speed(iter/s)": 0.2333 }, { "acc": 0.69656291, "epoch": 0.024606798579401318, "grad_norm": 2.203125, "learning_rate": 2.460679857940132e-06, "loss": 1.29032917, "memory(GiB)": 368.61, "step": 970, "train_speed(iter/s)": 0.233312 }, { "acc": 0.69390492, "epoch": 0.024733637747336376, "grad_norm": 2.328125, "learning_rate": 2.4733637747336377e-06, "loss": 1.29867287, "memory(GiB)": 368.61, "step": 975, "train_speed(iter/s)": 0.233397 }, { "acc": 0.71941633, "epoch": 0.024860476915271434, "grad_norm": 2.359375, "learning_rate": 2.4860476915271436e-06, "loss": 1.20646372, "memory(GiB)": 368.61, "step": 980, "train_speed(iter/s)": 0.23338 }, { "acc": 0.69875121, "epoch": 0.024987316083206493, "grad_norm": 2.390625, "learning_rate": 2.4987316083206496e-06, "loss": 1.28059807, "memory(GiB)": 368.61, "step": 985, "train_speed(iter/s)": 0.23337 }, { "acc": 0.69065843, "epoch": 0.02511415525114155, "grad_norm": 2.453125, "learning_rate": 2.511415525114155e-06, "loss": 1.31402168, "memory(GiB)": 368.61, "step": 990, "train_speed(iter/s)": 0.233422 }, { "acc": 0.69150953, "epoch": 0.02524099441907661, "grad_norm": 2.25, "learning_rate": 2.5240994419076615e-06, "loss": 1.2509819, "memory(GiB)": 368.61, "step": 995, "train_speed(iter/s)": 0.233488 }, { "acc": 0.71255536, "epoch": 0.025367833587011668, "grad_norm": 2.953125, "learning_rate": 2.536783358701167e-06, "loss": 1.19677992, "memory(GiB)": 368.61, "step": 1000, "train_speed(iter/s)": 0.233528 }, { "epoch": 0.025367833587011668, "eval_acc": 0.6928361679541559, "eval_loss": 1.271693229675293, "eval_runtime": 385.4806, "eval_samples_per_second": 16.525, "eval_steps_per_second": 8.262, "step": 1000 }, { "acc": 0.69311595, "epoch": 0.025494672754946726, "grad_norm": 2.71875, "learning_rate": 2.549467275494673e-06, "loss": 1.25625019, "memory(GiB)": 368.61, "step": 1005, "train_speed(iter/s)": 0.201603 }, { "acc": 0.69689379, "epoch": 0.025621511922881784, "grad_norm": 2.640625, "learning_rate": 2.5621511922881786e-06, "loss": 1.33825855, "memory(GiB)": 368.61, "step": 1010, "train_speed(iter/s)": 0.201761 }, { "acc": 0.69145355, "epoch": 0.025748351090816843, "grad_norm": 2.71875, "learning_rate": 2.5748351090816846e-06, "loss": 1.29609146, "memory(GiB)": 368.61, "step": 1015, "train_speed(iter/s)": 0.20185 }, { "acc": 0.70653386, "epoch": 0.0258751902587519, "grad_norm": 2.796875, "learning_rate": 2.5875190258751905e-06, "loss": 1.24921703, "memory(GiB)": 368.61, "step": 1020, "train_speed(iter/s)": 0.202017 }, { "acc": 0.7116869, "epoch": 0.02600202942668696, "grad_norm": 2.234375, "learning_rate": 2.6002029426686965e-06, "loss": 1.26201763, "memory(GiB)": 368.61, "step": 1025, "train_speed(iter/s)": 0.202082 }, { "acc": 0.70310216, "epoch": 0.026128868594622018, "grad_norm": 2.078125, "learning_rate": 2.612886859462202e-06, "loss": 1.23814259, "memory(GiB)": 368.61, "step": 1030, "train_speed(iter/s)": 0.202245 }, { "acc": 0.7082099, "epoch": 0.026255707762557076, "grad_norm": 2.375, "learning_rate": 2.625570776255708e-06, "loss": 1.27855244, "memory(GiB)": 368.61, "step": 1035, "train_speed(iter/s)": 0.202274 }, { "acc": 0.69157925, "epoch": 0.026382546930492135, "grad_norm": 3.171875, "learning_rate": 2.6382546930492135e-06, "loss": 1.26497602, "memory(GiB)": 368.61, "step": 1040, "train_speed(iter/s)": 0.202443 }, { "acc": 0.7000762, "epoch": 0.026509386098427193, "grad_norm": 2.296875, "learning_rate": 2.65093860984272e-06, "loss": 1.30621471, "memory(GiB)": 368.61, "step": 1045, "train_speed(iter/s)": 0.202489 }, { "acc": 0.70767269, "epoch": 0.02663622526636225, "grad_norm": 3.46875, "learning_rate": 2.6636225266362255e-06, "loss": 1.28214035, "memory(GiB)": 368.61, "step": 1050, "train_speed(iter/s)": 0.202642 }, { "acc": 0.70433469, "epoch": 0.02676306443429731, "grad_norm": 2.734375, "learning_rate": 2.6763064434297314e-06, "loss": 1.24030571, "memory(GiB)": 368.61, "step": 1055, "train_speed(iter/s)": 0.202816 }, { "acc": 0.69845381, "epoch": 0.026889903602232368, "grad_norm": 2.09375, "learning_rate": 2.688990360223237e-06, "loss": 1.25559635, "memory(GiB)": 368.61, "step": 1060, "train_speed(iter/s)": 0.202928 }, { "acc": 0.69944725, "epoch": 0.027016742770167426, "grad_norm": 2.15625, "learning_rate": 2.701674277016743e-06, "loss": 1.23517704, "memory(GiB)": 368.61, "step": 1065, "train_speed(iter/s)": 0.203121 }, { "acc": 0.69922285, "epoch": 0.027143581938102485, "grad_norm": 2.578125, "learning_rate": 2.7143581938102485e-06, "loss": 1.29283562, "memory(GiB)": 368.61, "step": 1070, "train_speed(iter/s)": 0.203165 }, { "acc": 0.69631195, "epoch": 0.027270421106037543, "grad_norm": 2.390625, "learning_rate": 2.727042110603755e-06, "loss": 1.26046772, "memory(GiB)": 368.61, "step": 1075, "train_speed(iter/s)": 0.203281 }, { "acc": 0.68495526, "epoch": 0.0273972602739726, "grad_norm": 2.296875, "learning_rate": 2.7397260273972604e-06, "loss": 1.34147863, "memory(GiB)": 368.61, "step": 1080, "train_speed(iter/s)": 0.203405 }, { "acc": 0.71348872, "epoch": 0.02752409944190766, "grad_norm": 2.1875, "learning_rate": 2.7524099441907664e-06, "loss": 1.26164036, "memory(GiB)": 368.61, "step": 1085, "train_speed(iter/s)": 0.203575 }, { "acc": 0.71379051, "epoch": 0.027650938609842718, "grad_norm": 2.03125, "learning_rate": 2.765093860984272e-06, "loss": 1.21263933, "memory(GiB)": 368.61, "step": 1090, "train_speed(iter/s)": 0.203733 }, { "acc": 0.69404078, "epoch": 0.027777777777777776, "grad_norm": 2.453125, "learning_rate": 2.7777777777777783e-06, "loss": 1.25947161, "memory(GiB)": 368.61, "step": 1095, "train_speed(iter/s)": 0.203884 }, { "acc": 0.70873685, "epoch": 0.027904616945712835, "grad_norm": 2.71875, "learning_rate": 2.790461694571284e-06, "loss": 1.18990803, "memory(GiB)": 368.61, "step": 1100, "train_speed(iter/s)": 0.203981 }, { "acc": 0.70823264, "epoch": 0.028031456113647893, "grad_norm": 2.59375, "learning_rate": 2.80314561136479e-06, "loss": 1.22174034, "memory(GiB)": 368.61, "step": 1105, "train_speed(iter/s)": 0.204094 }, { "acc": 0.70553484, "epoch": 0.02815829528158295, "grad_norm": 2.484375, "learning_rate": 2.8158295281582954e-06, "loss": 1.25095654, "memory(GiB)": 368.61, "step": 1110, "train_speed(iter/s)": 0.204231 }, { "acc": 0.69563894, "epoch": 0.02828513444951801, "grad_norm": 2.3125, "learning_rate": 2.8285134449518014e-06, "loss": 1.28689594, "memory(GiB)": 368.61, "step": 1115, "train_speed(iter/s)": 0.204392 }, { "acc": 0.7030479, "epoch": 0.028411973617453068, "grad_norm": 2.3125, "learning_rate": 2.841197361745307e-06, "loss": 1.25615368, "memory(GiB)": 368.61, "step": 1120, "train_speed(iter/s)": 0.204504 }, { "acc": 0.69638753, "epoch": 0.028538812785388126, "grad_norm": 2.171875, "learning_rate": 2.8538812785388133e-06, "loss": 1.21460381, "memory(GiB)": 368.61, "step": 1125, "train_speed(iter/s)": 0.204629 }, { "acc": 0.70991416, "epoch": 0.028665651953323185, "grad_norm": 2.484375, "learning_rate": 2.866565195332319e-06, "loss": 1.21105528, "memory(GiB)": 368.61, "step": 1130, "train_speed(iter/s)": 0.204767 }, { "acc": 0.70021954, "epoch": 0.028792491121258243, "grad_norm": 2.0625, "learning_rate": 2.879249112125825e-06, "loss": 1.24010296, "memory(GiB)": 368.61, "step": 1135, "train_speed(iter/s)": 0.204821 }, { "acc": 0.69841137, "epoch": 0.0289193302891933, "grad_norm": 2.296875, "learning_rate": 2.8919330289193303e-06, "loss": 1.27836781, "memory(GiB)": 368.61, "step": 1140, "train_speed(iter/s)": 0.204924 }, { "acc": 0.69533949, "epoch": 0.02904616945712836, "grad_norm": 2.0625, "learning_rate": 2.9046169457128363e-06, "loss": 1.23384266, "memory(GiB)": 368.61, "step": 1145, "train_speed(iter/s)": 0.205054 }, { "acc": 0.70363207, "epoch": 0.029173008625063418, "grad_norm": 2.5625, "learning_rate": 2.917300862506342e-06, "loss": 1.24817371, "memory(GiB)": 368.61, "step": 1150, "train_speed(iter/s)": 0.205202 }, { "acc": 0.71027336, "epoch": 0.029299847792998476, "grad_norm": 2.421875, "learning_rate": 2.9299847792998482e-06, "loss": 1.26078968, "memory(GiB)": 368.61, "step": 1155, "train_speed(iter/s)": 0.205348 }, { "acc": 0.71297121, "epoch": 0.029426686960933535, "grad_norm": 2.515625, "learning_rate": 2.9426686960933538e-06, "loss": 1.2629673, "memory(GiB)": 368.61, "step": 1160, "train_speed(iter/s)": 0.205521 }, { "acc": 0.69952965, "epoch": 0.029553526128868593, "grad_norm": 2.8125, "learning_rate": 2.9553526128868598e-06, "loss": 1.24001408, "memory(GiB)": 368.61, "step": 1165, "train_speed(iter/s)": 0.205587 }, { "acc": 0.69408112, "epoch": 0.02968036529680365, "grad_norm": 2.296875, "learning_rate": 2.9680365296803653e-06, "loss": 1.2834713, "memory(GiB)": 368.61, "step": 1170, "train_speed(iter/s)": 0.205691 }, { "acc": 0.70758123, "epoch": 0.02980720446473871, "grad_norm": 2.265625, "learning_rate": 2.9807204464738717e-06, "loss": 1.26487188, "memory(GiB)": 368.61, "step": 1175, "train_speed(iter/s)": 0.205781 }, { "acc": 0.71206236, "epoch": 0.029934043632673768, "grad_norm": 2.25, "learning_rate": 2.9934043632673772e-06, "loss": 1.17348967, "memory(GiB)": 368.61, "step": 1180, "train_speed(iter/s)": 0.205869 }, { "acc": 0.69065561, "epoch": 0.030060882800608826, "grad_norm": 2.109375, "learning_rate": 3.006088280060883e-06, "loss": 1.24965935, "memory(GiB)": 368.61, "step": 1185, "train_speed(iter/s)": 0.205953 }, { "acc": 0.69633484, "epoch": 0.030187721968543885, "grad_norm": 2.484375, "learning_rate": 3.0187721968543887e-06, "loss": 1.26966057, "memory(GiB)": 368.61, "step": 1190, "train_speed(iter/s)": 0.206005 }, { "acc": 0.69028339, "epoch": 0.030314561136478943, "grad_norm": 2.59375, "learning_rate": 3.0314561136478947e-06, "loss": 1.30558071, "memory(GiB)": 368.61, "step": 1195, "train_speed(iter/s)": 0.206087 }, { "acc": 0.72871375, "epoch": 0.030441400304414, "grad_norm": 2.5, "learning_rate": 3.0441400304414002e-06, "loss": 1.20455647, "memory(GiB)": 368.61, "step": 1200, "train_speed(iter/s)": 0.206206 }, { "acc": 0.69297495, "epoch": 0.03056823947234906, "grad_norm": 2.71875, "learning_rate": 3.0568239472349066e-06, "loss": 1.31520462, "memory(GiB)": 368.61, "step": 1205, "train_speed(iter/s)": 0.206347 }, { "acc": 0.69144545, "epoch": 0.030695078640284118, "grad_norm": 2.9375, "learning_rate": 3.069507864028412e-06, "loss": 1.22424049, "memory(GiB)": 368.61, "step": 1210, "train_speed(iter/s)": 0.206381 }, { "acc": 0.70138321, "epoch": 0.030821917808219176, "grad_norm": 2.1875, "learning_rate": 3.082191780821918e-06, "loss": 1.26048412, "memory(GiB)": 368.61, "step": 1215, "train_speed(iter/s)": 0.206488 }, { "acc": 0.69968262, "epoch": 0.030948756976154235, "grad_norm": 2.640625, "learning_rate": 3.0948756976154237e-06, "loss": 1.24404526, "memory(GiB)": 368.61, "step": 1220, "train_speed(iter/s)": 0.206546 }, { "acc": 0.71165953, "epoch": 0.031075596144089293, "grad_norm": 2.21875, "learning_rate": 3.1075596144089297e-06, "loss": 1.23251724, "memory(GiB)": 368.61, "step": 1225, "train_speed(iter/s)": 0.206616 }, { "acc": 0.71368427, "epoch": 0.03120243531202435, "grad_norm": 2.40625, "learning_rate": 3.120243531202435e-06, "loss": 1.23651581, "memory(GiB)": 368.61, "step": 1230, "train_speed(iter/s)": 0.206663 }, { "acc": 0.69554605, "epoch": 0.03132927447995941, "grad_norm": 2.578125, "learning_rate": 3.1329274479959416e-06, "loss": 1.28354635, "memory(GiB)": 368.61, "step": 1235, "train_speed(iter/s)": 0.206787 }, { "acc": 0.70657616, "epoch": 0.03145611364789447, "grad_norm": 2.40625, "learning_rate": 3.145611364789447e-06, "loss": 1.25124464, "memory(GiB)": 368.61, "step": 1240, "train_speed(iter/s)": 0.206888 }, { "acc": 0.70396166, "epoch": 0.031582952815829526, "grad_norm": 2.390625, "learning_rate": 3.158295281582953e-06, "loss": 1.30820618, "memory(GiB)": 368.61, "step": 1245, "train_speed(iter/s)": 0.206965 }, { "acc": 0.7054306, "epoch": 0.031709791983764585, "grad_norm": 2.421875, "learning_rate": 3.1709791983764586e-06, "loss": 1.20780325, "memory(GiB)": 368.61, "step": 1250, "train_speed(iter/s)": 0.207075 }, { "acc": 0.70681648, "epoch": 0.03183663115169964, "grad_norm": 2.0625, "learning_rate": 3.183663115169965e-06, "loss": 1.25089149, "memory(GiB)": 368.61, "step": 1255, "train_speed(iter/s)": 0.207178 }, { "acc": 0.71195364, "epoch": 0.0319634703196347, "grad_norm": 1.8984375, "learning_rate": 3.1963470319634706e-06, "loss": 1.24388618, "memory(GiB)": 368.61, "step": 1260, "train_speed(iter/s)": 0.207286 }, { "acc": 0.7052422, "epoch": 0.03209030948756976, "grad_norm": 1.984375, "learning_rate": 3.2090309487569765e-06, "loss": 1.23871946, "memory(GiB)": 368.61, "step": 1265, "train_speed(iter/s)": 0.207387 }, { "acc": 0.70433035, "epoch": 0.03221714865550482, "grad_norm": 2.421875, "learning_rate": 3.221714865550482e-06, "loss": 1.2436595, "memory(GiB)": 368.61, "step": 1270, "train_speed(iter/s)": 0.207461 }, { "acc": 0.69908738, "epoch": 0.032343987823439876, "grad_norm": 2.28125, "learning_rate": 3.234398782343988e-06, "loss": 1.31297207, "memory(GiB)": 368.61, "step": 1275, "train_speed(iter/s)": 0.207491 }, { "acc": 0.7088583, "epoch": 0.032470826991374935, "grad_norm": 2.09375, "learning_rate": 3.2470826991374936e-06, "loss": 1.21550293, "memory(GiB)": 368.61, "step": 1280, "train_speed(iter/s)": 0.207647 }, { "acc": 0.70628901, "epoch": 0.03259766615930999, "grad_norm": 2.21875, "learning_rate": 3.259766615931e-06, "loss": 1.2446826, "memory(GiB)": 368.61, "step": 1285, "train_speed(iter/s)": 0.207746 }, { "acc": 0.71173048, "epoch": 0.03272450532724505, "grad_norm": 2.1875, "learning_rate": 3.2724505327245055e-06, "loss": 1.23548527, "memory(GiB)": 368.61, "step": 1290, "train_speed(iter/s)": 0.207829 }, { "acc": 0.69723587, "epoch": 0.03285134449518011, "grad_norm": 2.265625, "learning_rate": 3.2851344495180115e-06, "loss": 1.28510799, "memory(GiB)": 368.61, "step": 1295, "train_speed(iter/s)": 0.207917 }, { "acc": 0.70709038, "epoch": 0.03297818366311517, "grad_norm": 2.140625, "learning_rate": 3.297818366311517e-06, "loss": 1.28729029, "memory(GiB)": 368.61, "step": 1300, "train_speed(iter/s)": 0.207949 }, { "acc": 0.71231217, "epoch": 0.033105022831050226, "grad_norm": 3.25, "learning_rate": 3.310502283105023e-06, "loss": 1.22225208, "memory(GiB)": 368.61, "step": 1305, "train_speed(iter/s)": 0.208021 }, { "acc": 0.70198984, "epoch": 0.033231861998985285, "grad_norm": 2.15625, "learning_rate": 3.3231861998985286e-06, "loss": 1.23049698, "memory(GiB)": 368.61, "step": 1310, "train_speed(iter/s)": 0.20815 }, { "acc": 0.72374501, "epoch": 0.03335870116692034, "grad_norm": 2.1875, "learning_rate": 3.335870116692035e-06, "loss": 1.14125824, "memory(GiB)": 368.61, "step": 1315, "train_speed(iter/s)": 0.208289 }, { "acc": 0.69872208, "epoch": 0.0334855403348554, "grad_norm": 2.5, "learning_rate": 3.3485540334855405e-06, "loss": 1.25974712, "memory(GiB)": 368.61, "step": 1320, "train_speed(iter/s)": 0.208368 }, { "acc": 0.71242743, "epoch": 0.03361237950279046, "grad_norm": 2.59375, "learning_rate": 3.3612379502790465e-06, "loss": 1.16494961, "memory(GiB)": 368.61, "step": 1325, "train_speed(iter/s)": 0.208445 }, { "acc": 0.71464148, "epoch": 0.03373921867072552, "grad_norm": 2.875, "learning_rate": 3.373921867072552e-06, "loss": 1.23755989, "memory(GiB)": 368.61, "step": 1330, "train_speed(iter/s)": 0.208511 }, { "acc": 0.71028633, "epoch": 0.033866057838660577, "grad_norm": 2.265625, "learning_rate": 3.3866057838660584e-06, "loss": 1.26862888, "memory(GiB)": 368.61, "step": 1335, "train_speed(iter/s)": 0.208561 }, { "acc": 0.71381092, "epoch": 0.033992897006595635, "grad_norm": 2.171875, "learning_rate": 3.399289700659564e-06, "loss": 1.20646534, "memory(GiB)": 368.61, "step": 1340, "train_speed(iter/s)": 0.208642 }, { "acc": 0.72625837, "epoch": 0.03411973617453069, "grad_norm": 2.03125, "learning_rate": 3.41197361745307e-06, "loss": 1.1996666, "memory(GiB)": 368.61, "step": 1345, "train_speed(iter/s)": 0.20872 }, { "acc": 0.70150347, "epoch": 0.03424657534246575, "grad_norm": 1.9921875, "learning_rate": 3.4246575342465754e-06, "loss": 1.21793213, "memory(GiB)": 368.61, "step": 1350, "train_speed(iter/s)": 0.208771 }, { "acc": 0.72662449, "epoch": 0.03437341451040081, "grad_norm": 2.640625, "learning_rate": 3.4373414510400814e-06, "loss": 1.16567612, "memory(GiB)": 368.61, "step": 1355, "train_speed(iter/s)": 0.208891 }, { "acc": 0.7078619, "epoch": 0.03450025367833587, "grad_norm": 2.5625, "learning_rate": 3.450025367833587e-06, "loss": 1.19322653, "memory(GiB)": 368.61, "step": 1360, "train_speed(iter/s)": 0.20895 }, { "acc": 0.69958334, "epoch": 0.03462709284627093, "grad_norm": 2.359375, "learning_rate": 3.4627092846270933e-06, "loss": 1.22846832, "memory(GiB)": 368.61, "step": 1365, "train_speed(iter/s)": 0.209063 }, { "acc": 0.69846411, "epoch": 0.034753932014205985, "grad_norm": 2.296875, "learning_rate": 3.475393201420599e-06, "loss": 1.2452713, "memory(GiB)": 368.61, "step": 1370, "train_speed(iter/s)": 0.209134 }, { "acc": 0.70711012, "epoch": 0.03488077118214104, "grad_norm": 2.3125, "learning_rate": 3.488077118214105e-06, "loss": 1.21954155, "memory(GiB)": 368.61, "step": 1375, "train_speed(iter/s)": 0.209211 }, { "acc": 0.71297307, "epoch": 0.0350076103500761, "grad_norm": 2.171875, "learning_rate": 3.5007610350076104e-06, "loss": 1.21458492, "memory(GiB)": 368.61, "step": 1380, "train_speed(iter/s)": 0.209282 }, { "acc": 0.72078681, "epoch": 0.03513444951801116, "grad_norm": 2.390625, "learning_rate": 3.5134449518011164e-06, "loss": 1.21841898, "memory(GiB)": 368.61, "step": 1385, "train_speed(iter/s)": 0.209379 }, { "acc": 0.70294619, "epoch": 0.03526128868594622, "grad_norm": 2.265625, "learning_rate": 3.5261288685946223e-06, "loss": 1.24314766, "memory(GiB)": 368.61, "step": 1390, "train_speed(iter/s)": 0.209469 }, { "acc": 0.69220495, "epoch": 0.03538812785388128, "grad_norm": 2.21875, "learning_rate": 3.5388127853881283e-06, "loss": 1.30040503, "memory(GiB)": 368.61, "step": 1395, "train_speed(iter/s)": 0.209479 }, { "acc": 0.70446892, "epoch": 0.035514967021816335, "grad_norm": 2.09375, "learning_rate": 3.551496702181634e-06, "loss": 1.18788843, "memory(GiB)": 368.61, "step": 1400, "train_speed(iter/s)": 0.209608 }, { "acc": 0.70480223, "epoch": 0.03564180618975139, "grad_norm": 2.640625, "learning_rate": 3.56418061897514e-06, "loss": 1.28891125, "memory(GiB)": 368.61, "step": 1405, "train_speed(iter/s)": 0.209693 }, { "acc": 0.6972261, "epoch": 0.03576864535768645, "grad_norm": 2.375, "learning_rate": 3.5768645357686453e-06, "loss": 1.32768593, "memory(GiB)": 368.61, "step": 1410, "train_speed(iter/s)": 0.209819 }, { "acc": 0.7117331, "epoch": 0.03589548452562151, "grad_norm": 2.171875, "learning_rate": 3.5895484525621517e-06, "loss": 1.25111427, "memory(GiB)": 368.61, "step": 1415, "train_speed(iter/s)": 0.209861 }, { "acc": 0.72346716, "epoch": 0.03602232369355657, "grad_norm": 2.3125, "learning_rate": 3.6022323693556573e-06, "loss": 1.18725624, "memory(GiB)": 368.61, "step": 1420, "train_speed(iter/s)": 0.209907 }, { "acc": 0.7029603, "epoch": 0.03614916286149163, "grad_norm": 2.46875, "learning_rate": 3.6149162861491632e-06, "loss": 1.24830618, "memory(GiB)": 368.61, "step": 1425, "train_speed(iter/s)": 0.209999 }, { "acc": 0.7220314, "epoch": 0.036276002029426685, "grad_norm": 2.15625, "learning_rate": 3.6276002029426688e-06, "loss": 1.15794888, "memory(GiB)": 368.61, "step": 1430, "train_speed(iter/s)": 0.210028 }, { "acc": 0.69663162, "epoch": 0.03640284119736174, "grad_norm": 2.21875, "learning_rate": 3.6402841197361748e-06, "loss": 1.23983889, "memory(GiB)": 368.61, "step": 1435, "train_speed(iter/s)": 0.210117 }, { "acc": 0.70513582, "epoch": 0.0365296803652968, "grad_norm": 2.78125, "learning_rate": 3.6529680365296803e-06, "loss": 1.25718088, "memory(GiB)": 368.61, "step": 1440, "train_speed(iter/s)": 0.21021 }, { "acc": 0.72540836, "epoch": 0.03665651953323186, "grad_norm": 2.3125, "learning_rate": 3.6656519533231867e-06, "loss": 1.20485907, "memory(GiB)": 368.61, "step": 1445, "train_speed(iter/s)": 0.210234 }, { "acc": 0.69956207, "epoch": 0.03678335870116692, "grad_norm": 2.203125, "learning_rate": 3.6783358701166922e-06, "loss": 1.29173794, "memory(GiB)": 368.61, "step": 1450, "train_speed(iter/s)": 0.210332 }, { "acc": 0.71431313, "epoch": 0.03691019786910198, "grad_norm": 2.1875, "learning_rate": 3.691019786910198e-06, "loss": 1.24841299, "memory(GiB)": 368.61, "step": 1455, "train_speed(iter/s)": 0.210417 }, { "acc": 0.71956697, "epoch": 0.037037037037037035, "grad_norm": 2.265625, "learning_rate": 3.7037037037037037e-06, "loss": 1.13407707, "memory(GiB)": 368.61, "step": 1460, "train_speed(iter/s)": 0.210507 }, { "acc": 0.70406733, "epoch": 0.03716387620497209, "grad_norm": 2.390625, "learning_rate": 3.7163876204972097e-06, "loss": 1.2584362, "memory(GiB)": 368.61, "step": 1465, "train_speed(iter/s)": 0.210566 }, { "acc": 0.70269251, "epoch": 0.03729071537290715, "grad_norm": 1.8984375, "learning_rate": 3.7290715372907157e-06, "loss": 1.23106098, "memory(GiB)": 368.61, "step": 1470, "train_speed(iter/s)": 0.210618 }, { "acc": 0.70128088, "epoch": 0.03741755454084221, "grad_norm": 2.296875, "learning_rate": 3.7417554540842216e-06, "loss": 1.21602917, "memory(GiB)": 368.61, "step": 1475, "train_speed(iter/s)": 0.210716 }, { "acc": 0.72336569, "epoch": 0.03754439370877727, "grad_norm": 2.03125, "learning_rate": 3.754439370877727e-06, "loss": 1.19279442, "memory(GiB)": 368.61, "step": 1480, "train_speed(iter/s)": 0.210803 }, { "acc": 0.70541291, "epoch": 0.03767123287671233, "grad_norm": 2.359375, "learning_rate": 3.767123287671233e-06, "loss": 1.16452332, "memory(GiB)": 368.61, "step": 1485, "train_speed(iter/s)": 0.210815 }, { "acc": 0.69803829, "epoch": 0.037798072044647385, "grad_norm": 2.5, "learning_rate": 3.7798072044647387e-06, "loss": 1.27253609, "memory(GiB)": 368.61, "step": 1490, "train_speed(iter/s)": 0.210871 }, { "acc": 0.68931093, "epoch": 0.03792491121258244, "grad_norm": 2.53125, "learning_rate": 3.792491121258245e-06, "loss": 1.32908115, "memory(GiB)": 368.61, "step": 1495, "train_speed(iter/s)": 0.210931 }, { "acc": 0.69941411, "epoch": 0.0380517503805175, "grad_norm": 2.171875, "learning_rate": 3.8051750380517506e-06, "loss": 1.26151962, "memory(GiB)": 368.61, "step": 1500, "train_speed(iter/s)": 0.211003 }, { "acc": 0.7060987, "epoch": 0.03817858954845256, "grad_norm": 1.984375, "learning_rate": 3.817858954845256e-06, "loss": 1.21855145, "memory(GiB)": 368.61, "step": 1505, "train_speed(iter/s)": 0.211069 }, { "acc": 0.70071278, "epoch": 0.03830542871638762, "grad_norm": 2.65625, "learning_rate": 3.830542871638762e-06, "loss": 1.21921215, "memory(GiB)": 368.61, "step": 1510, "train_speed(iter/s)": 0.211151 }, { "acc": 0.71930833, "epoch": 0.03843226788432268, "grad_norm": 2.375, "learning_rate": 3.843226788432268e-06, "loss": 1.18444099, "memory(GiB)": 368.61, "step": 1515, "train_speed(iter/s)": 0.211129 }, { "acc": 0.71606212, "epoch": 0.038559107052257735, "grad_norm": 2.46875, "learning_rate": 3.855910705225774e-06, "loss": 1.21894054, "memory(GiB)": 368.61, "step": 1520, "train_speed(iter/s)": 0.211196 }, { "acc": 0.70253611, "epoch": 0.03868594622019279, "grad_norm": 2.171875, "learning_rate": 3.86859462201928e-06, "loss": 1.23634224, "memory(GiB)": 368.61, "step": 1525, "train_speed(iter/s)": 0.211217 }, { "acc": 0.69997745, "epoch": 0.03881278538812785, "grad_norm": 2.203125, "learning_rate": 3.881278538812785e-06, "loss": 1.25708618, "memory(GiB)": 368.61, "step": 1530, "train_speed(iter/s)": 0.211262 }, { "acc": 0.71489706, "epoch": 0.03893962455606291, "grad_norm": 2.234375, "learning_rate": 3.893962455606292e-06, "loss": 1.20200424, "memory(GiB)": 368.61, "step": 1535, "train_speed(iter/s)": 0.211306 }, { "acc": 0.71914968, "epoch": 0.03906646372399797, "grad_norm": 2.5625, "learning_rate": 3.906646372399797e-06, "loss": 1.16475506, "memory(GiB)": 368.61, "step": 1540, "train_speed(iter/s)": 0.211352 }, { "acc": 0.69565382, "epoch": 0.03919330289193303, "grad_norm": 2.28125, "learning_rate": 3.919330289193303e-06, "loss": 1.24201202, "memory(GiB)": 368.61, "step": 1545, "train_speed(iter/s)": 0.211386 }, { "acc": 0.69225197, "epoch": 0.039320142059868085, "grad_norm": 2.53125, "learning_rate": 3.932014205986809e-06, "loss": 1.21871204, "memory(GiB)": 368.61, "step": 1550, "train_speed(iter/s)": 0.211516 }, { "acc": 0.72537565, "epoch": 0.03944698122780314, "grad_norm": 2.359375, "learning_rate": 3.944698122780315e-06, "loss": 1.18966408, "memory(GiB)": 368.61, "step": 1555, "train_speed(iter/s)": 0.211598 }, { "acc": 0.70735493, "epoch": 0.0395738203957382, "grad_norm": 3.59375, "learning_rate": 3.95738203957382e-06, "loss": 1.25340185, "memory(GiB)": 368.61, "step": 1560, "train_speed(iter/s)": 0.211658 }, { "acc": 0.71954675, "epoch": 0.03970065956367326, "grad_norm": 2.703125, "learning_rate": 3.970065956367327e-06, "loss": 1.23075323, "memory(GiB)": 368.61, "step": 1565, "train_speed(iter/s)": 0.211712 }, { "acc": 0.70653305, "epoch": 0.03982749873160832, "grad_norm": 2.296875, "learning_rate": 3.982749873160832e-06, "loss": 1.24468765, "memory(GiB)": 368.61, "step": 1570, "train_speed(iter/s)": 0.21182 }, { "acc": 0.6985673, "epoch": 0.03995433789954338, "grad_norm": 3.0625, "learning_rate": 3.995433789954338e-06, "loss": 1.27648888, "memory(GiB)": 368.61, "step": 1575, "train_speed(iter/s)": 0.211934 }, { "acc": 0.71385527, "epoch": 0.040081177067478435, "grad_norm": 1.9296875, "learning_rate": 4.008117706747844e-06, "loss": 1.24669886, "memory(GiB)": 368.61, "step": 1580, "train_speed(iter/s)": 0.211955 }, { "acc": 0.71240096, "epoch": 0.04020801623541349, "grad_norm": 2.75, "learning_rate": 4.02080162354135e-06, "loss": 1.18408031, "memory(GiB)": 368.61, "step": 1585, "train_speed(iter/s)": 0.212076 }, { "acc": 0.71752872, "epoch": 0.04033485540334855, "grad_norm": 2.375, "learning_rate": 4.033485540334856e-06, "loss": 1.19783077, "memory(GiB)": 368.61, "step": 1590, "train_speed(iter/s)": 0.212125 }, { "acc": 0.70621443, "epoch": 0.04046169457128361, "grad_norm": 1.96875, "learning_rate": 4.046169457128362e-06, "loss": 1.22866688, "memory(GiB)": 368.61, "step": 1595, "train_speed(iter/s)": 0.212177 }, { "acc": 0.70002546, "epoch": 0.04058853373921867, "grad_norm": 2.4375, "learning_rate": 4.058853373921867e-06, "loss": 1.28540878, "memory(GiB)": 368.61, "step": 1600, "train_speed(iter/s)": 0.21228 }, { "acc": 0.72262897, "epoch": 0.04071537290715373, "grad_norm": 2.171875, "learning_rate": 4.071537290715373e-06, "loss": 1.12066488, "memory(GiB)": 368.61, "step": 1605, "train_speed(iter/s)": 0.212344 }, { "acc": 0.70683312, "epoch": 0.040842212075088785, "grad_norm": 2.671875, "learning_rate": 4.084221207508879e-06, "loss": 1.22210197, "memory(GiB)": 368.61, "step": 1610, "train_speed(iter/s)": 0.212339 }, { "acc": 0.70306635, "epoch": 0.040969051243023843, "grad_norm": 2.421875, "learning_rate": 4.096905124302385e-06, "loss": 1.24115181, "memory(GiB)": 368.61, "step": 1615, "train_speed(iter/s)": 0.21243 }, { "acc": 0.71820469, "epoch": 0.0410958904109589, "grad_norm": 2.1875, "learning_rate": 4.109589041095891e-06, "loss": 1.15622663, "memory(GiB)": 368.61, "step": 1620, "train_speed(iter/s)": 0.212461 }, { "acc": 0.70159254, "epoch": 0.04122272957889396, "grad_norm": 2.4375, "learning_rate": 4.122272957889397e-06, "loss": 1.30148239, "memory(GiB)": 368.61, "step": 1625, "train_speed(iter/s)": 0.212492 }, { "acc": 0.70703411, "epoch": 0.04134956874682902, "grad_norm": 3.0625, "learning_rate": 4.134956874682902e-06, "loss": 1.20278301, "memory(GiB)": 368.61, "step": 1630, "train_speed(iter/s)": 0.21242 }, { "acc": 0.72401466, "epoch": 0.04147640791476408, "grad_norm": 2.015625, "learning_rate": 4.147640791476408e-06, "loss": 1.16009808, "memory(GiB)": 368.61, "step": 1635, "train_speed(iter/s)": 0.212484 }, { "acc": 0.70614481, "epoch": 0.041603247082699135, "grad_norm": 2.109375, "learning_rate": 4.160324708269914e-06, "loss": 1.27027855, "memory(GiB)": 368.61, "step": 1640, "train_speed(iter/s)": 0.21254 }, { "acc": 0.72243695, "epoch": 0.041730086250634194, "grad_norm": 1.953125, "learning_rate": 4.17300862506342e-06, "loss": 1.1396842, "memory(GiB)": 368.61, "step": 1645, "train_speed(iter/s)": 0.212577 }, { "acc": 0.71268206, "epoch": 0.04185692541856925, "grad_norm": 2.25, "learning_rate": 4.185692541856926e-06, "loss": 1.15543385, "memory(GiB)": 368.61, "step": 1650, "train_speed(iter/s)": 0.212581 }, { "acc": 0.71207747, "epoch": 0.04198376458650431, "grad_norm": 2.046875, "learning_rate": 4.198376458650432e-06, "loss": 1.18781605, "memory(GiB)": 368.61, "step": 1655, "train_speed(iter/s)": 0.212659 }, { "acc": 0.71515827, "epoch": 0.04211060375443937, "grad_norm": 2.640625, "learning_rate": 4.211060375443937e-06, "loss": 1.16350718, "memory(GiB)": 368.61, "step": 1660, "train_speed(iter/s)": 0.212768 }, { "acc": 0.69751368, "epoch": 0.04223744292237443, "grad_norm": 2.234375, "learning_rate": 4.223744292237444e-06, "loss": 1.30449085, "memory(GiB)": 368.61, "step": 1665, "train_speed(iter/s)": 0.212689 }, { "acc": 0.70890684, "epoch": 0.042364282090309485, "grad_norm": 2.546875, "learning_rate": 4.236428209030949e-06, "loss": 1.24865055, "memory(GiB)": 368.61, "step": 1670, "train_speed(iter/s)": 0.212734 }, { "acc": 0.69972019, "epoch": 0.042491121258244544, "grad_norm": 2.40625, "learning_rate": 4.249112125824455e-06, "loss": 1.24370031, "memory(GiB)": 368.61, "step": 1675, "train_speed(iter/s)": 0.212816 }, { "acc": 0.71093278, "epoch": 0.0426179604261796, "grad_norm": 2.609375, "learning_rate": 4.261796042617961e-06, "loss": 1.19533176, "memory(GiB)": 368.61, "step": 1680, "train_speed(iter/s)": 0.212891 }, { "acc": 0.72650585, "epoch": 0.04274479959411466, "grad_norm": 2.40625, "learning_rate": 4.274479959411467e-06, "loss": 1.15750351, "memory(GiB)": 368.61, "step": 1685, "train_speed(iter/s)": 0.212977 }, { "acc": 0.72726192, "epoch": 0.04287163876204972, "grad_norm": 2.21875, "learning_rate": 4.287163876204972e-06, "loss": 1.12178736, "memory(GiB)": 368.61, "step": 1690, "train_speed(iter/s)": 0.213071 }, { "acc": 0.70805802, "epoch": 0.04299847792998478, "grad_norm": 2.15625, "learning_rate": 4.299847792998479e-06, "loss": 1.1490243, "memory(GiB)": 368.61, "step": 1695, "train_speed(iter/s)": 0.213176 }, { "acc": 0.71232944, "epoch": 0.043125317097919835, "grad_norm": 2.625, "learning_rate": 4.312531709791984e-06, "loss": 1.2079505, "memory(GiB)": 368.61, "step": 1700, "train_speed(iter/s)": 0.213203 }, { "acc": 0.70138226, "epoch": 0.043252156265854894, "grad_norm": 2.984375, "learning_rate": 4.32521562658549e-06, "loss": 1.20759716, "memory(GiB)": 368.61, "step": 1705, "train_speed(iter/s)": 0.213264 }, { "acc": 0.71627789, "epoch": 0.04337899543378995, "grad_norm": 2.53125, "learning_rate": 4.337899543378996e-06, "loss": 1.20351553, "memory(GiB)": 368.61, "step": 1710, "train_speed(iter/s)": 0.213348 }, { "acc": 0.69190683, "epoch": 0.04350583460172501, "grad_norm": 2.015625, "learning_rate": 4.350583460172502e-06, "loss": 1.28105478, "memory(GiB)": 368.61, "step": 1715, "train_speed(iter/s)": 0.213417 }, { "acc": 0.72676544, "epoch": 0.04363267376966007, "grad_norm": 2.34375, "learning_rate": 4.363267376966007e-06, "loss": 1.14118938, "memory(GiB)": 368.61, "step": 1720, "train_speed(iter/s)": 0.213481 }, { "acc": 0.70680008, "epoch": 0.04375951293759513, "grad_norm": 2.171875, "learning_rate": 4.375951293759514e-06, "loss": 1.24821901, "memory(GiB)": 368.61, "step": 1725, "train_speed(iter/s)": 0.213545 }, { "acc": 0.716325, "epoch": 0.043886352105530185, "grad_norm": 2.515625, "learning_rate": 4.388635210553019e-06, "loss": 1.1932024, "memory(GiB)": 368.61, "step": 1730, "train_speed(iter/s)": 0.213578 }, { "acc": 0.71823931, "epoch": 0.044013191273465244, "grad_norm": 2.09375, "learning_rate": 4.401319127346525e-06, "loss": 1.18616409, "memory(GiB)": 368.61, "step": 1735, "train_speed(iter/s)": 0.213603 }, { "acc": 0.7121748, "epoch": 0.0441400304414003, "grad_norm": 2.671875, "learning_rate": 4.414003044140031e-06, "loss": 1.17575121, "memory(GiB)": 368.61, "step": 1740, "train_speed(iter/s)": 0.213657 }, { "acc": 0.72417321, "epoch": 0.04426686960933536, "grad_norm": 2.578125, "learning_rate": 4.426686960933537e-06, "loss": 1.19618349, "memory(GiB)": 368.61, "step": 1745, "train_speed(iter/s)": 0.213631 }, { "acc": 0.71272984, "epoch": 0.04439370877727042, "grad_norm": 2.015625, "learning_rate": 4.439370877727043e-06, "loss": 1.24028263, "memory(GiB)": 368.61, "step": 1750, "train_speed(iter/s)": 0.213707 }, { "acc": 0.70653648, "epoch": 0.04452054794520548, "grad_norm": 2.6875, "learning_rate": 4.4520547945205486e-06, "loss": 1.20298996, "memory(GiB)": 368.61, "step": 1755, "train_speed(iter/s)": 0.213725 }, { "acc": 0.73113375, "epoch": 0.044647387113140535, "grad_norm": 2.3125, "learning_rate": 4.464738711314054e-06, "loss": 1.1559763, "memory(GiB)": 368.61, "step": 1760, "train_speed(iter/s)": 0.213759 }, { "acc": 0.7150588, "epoch": 0.044774226281075594, "grad_norm": 2.171875, "learning_rate": 4.47742262810756e-06, "loss": 1.22648363, "memory(GiB)": 368.61, "step": 1765, "train_speed(iter/s)": 0.213815 }, { "acc": 0.71587, "epoch": 0.04490106544901065, "grad_norm": 1.96875, "learning_rate": 4.490106544901066e-06, "loss": 1.1519558, "memory(GiB)": 368.61, "step": 1770, "train_speed(iter/s)": 0.213857 }, { "acc": 0.70585794, "epoch": 0.04502790461694571, "grad_norm": 3.0, "learning_rate": 4.502790461694572e-06, "loss": 1.26226416, "memory(GiB)": 368.61, "step": 1775, "train_speed(iter/s)": 0.213914 }, { "acc": 0.71313887, "epoch": 0.04515474378488077, "grad_norm": 2.421875, "learning_rate": 4.5154743784880776e-06, "loss": 1.25774689, "memory(GiB)": 368.61, "step": 1780, "train_speed(iter/s)": 0.214005 }, { "acc": 0.71457701, "epoch": 0.04528158295281583, "grad_norm": 2.03125, "learning_rate": 4.5281582952815835e-06, "loss": 1.19510107, "memory(GiB)": 368.61, "step": 1785, "train_speed(iter/s)": 0.214028 }, { "acc": 0.71161518, "epoch": 0.045408422120750885, "grad_norm": 3.15625, "learning_rate": 4.540842212075089e-06, "loss": 1.19052258, "memory(GiB)": 368.61, "step": 1790, "train_speed(iter/s)": 0.21406 }, { "acc": 0.71418066, "epoch": 0.045535261288685944, "grad_norm": 2.65625, "learning_rate": 4.553526128868595e-06, "loss": 1.21041765, "memory(GiB)": 368.61, "step": 1795, "train_speed(iter/s)": 0.21414 }, { "acc": 0.70951414, "epoch": 0.045662100456621, "grad_norm": 1.671875, "learning_rate": 4.566210045662101e-06, "loss": 1.20413208, "memory(GiB)": 368.61, "step": 1800, "train_speed(iter/s)": 0.21414 }, { "acc": 0.73836546, "epoch": 0.04578893962455606, "grad_norm": 2.453125, "learning_rate": 4.5788939624556065e-06, "loss": 1.08171883, "memory(GiB)": 368.61, "step": 1805, "train_speed(iter/s)": 0.214203 }, { "acc": 0.71318913, "epoch": 0.04591577879249112, "grad_norm": 2.1875, "learning_rate": 4.5915778792491125e-06, "loss": 1.20642776, "memory(GiB)": 368.61, "step": 1810, "train_speed(iter/s)": 0.21424 }, { "acc": 0.68736038, "epoch": 0.04604261796042618, "grad_norm": 2.859375, "learning_rate": 4.6042617960426185e-06, "loss": 1.31945629, "memory(GiB)": 368.61, "step": 1815, "train_speed(iter/s)": 0.21435 }, { "acc": 0.70922689, "epoch": 0.046169457128361235, "grad_norm": 2.375, "learning_rate": 4.616945712836124e-06, "loss": 1.19198933, "memory(GiB)": 368.61, "step": 1820, "train_speed(iter/s)": 0.214445 }, { "acc": 0.719174, "epoch": 0.046296296296296294, "grad_norm": 2.421875, "learning_rate": 4.62962962962963e-06, "loss": 1.21845455, "memory(GiB)": 368.61, "step": 1825, "train_speed(iter/s)": 0.21445 }, { "acc": 0.71450276, "epoch": 0.04642313546423135, "grad_norm": 2.796875, "learning_rate": 4.6423135464231355e-06, "loss": 1.26803169, "memory(GiB)": 368.61, "step": 1830, "train_speed(iter/s)": 0.214502 }, { "acc": 0.72662721, "epoch": 0.04654997463216641, "grad_norm": 3.046875, "learning_rate": 4.6549974632166415e-06, "loss": 1.16515732, "memory(GiB)": 368.61, "step": 1835, "train_speed(iter/s)": 0.214569 }, { "acc": 0.7353663, "epoch": 0.04667681380010147, "grad_norm": 2.296875, "learning_rate": 4.6676813800101475e-06, "loss": 1.13579741, "memory(GiB)": 368.61, "step": 1840, "train_speed(iter/s)": 0.214615 }, { "acc": 0.70937881, "epoch": 0.04680365296803653, "grad_norm": 2.65625, "learning_rate": 4.6803652968036534e-06, "loss": 1.227248, "memory(GiB)": 368.61, "step": 1845, "train_speed(iter/s)": 0.214696 }, { "acc": 0.7222621, "epoch": 0.046930492135971585, "grad_norm": 2.203125, "learning_rate": 4.6930492135971586e-06, "loss": 1.1796917, "memory(GiB)": 368.61, "step": 1850, "train_speed(iter/s)": 0.21472 }, { "acc": 0.71726837, "epoch": 0.047057331303906644, "grad_norm": 2.203125, "learning_rate": 4.705733130390665e-06, "loss": 1.16343346, "memory(GiB)": 368.61, "step": 1855, "train_speed(iter/s)": 0.214798 }, { "acc": 0.69328208, "epoch": 0.0471841704718417, "grad_norm": 1.7578125, "learning_rate": 4.7184170471841705e-06, "loss": 1.2150425, "memory(GiB)": 368.61, "step": 1860, "train_speed(iter/s)": 0.214825 }, { "acc": 0.71067066, "epoch": 0.04731100963977676, "grad_norm": 2.265625, "learning_rate": 4.7311009639776765e-06, "loss": 1.22681208, "memory(GiB)": 368.61, "step": 1865, "train_speed(iter/s)": 0.214828 }, { "acc": 0.69871387, "epoch": 0.04743784880771182, "grad_norm": 2.484375, "learning_rate": 4.743784880771182e-06, "loss": 1.23960752, "memory(GiB)": 368.61, "step": 1870, "train_speed(iter/s)": 0.214894 }, { "acc": 0.71352377, "epoch": 0.04756468797564688, "grad_norm": 2.3125, "learning_rate": 4.756468797564688e-06, "loss": 1.17368212, "memory(GiB)": 368.61, "step": 1875, "train_speed(iter/s)": 0.214975 }, { "acc": 0.71735239, "epoch": 0.047691527143581935, "grad_norm": 2.390625, "learning_rate": 4.769152714358194e-06, "loss": 1.25607719, "memory(GiB)": 368.61, "step": 1880, "train_speed(iter/s)": 0.214992 }, { "acc": 0.7238553, "epoch": 0.047818366311516994, "grad_norm": 1.875, "learning_rate": 4.7818366311517e-06, "loss": 1.19626551, "memory(GiB)": 368.61, "step": 1885, "train_speed(iter/s)": 0.215023 }, { "acc": 0.7171484, "epoch": 0.04794520547945205, "grad_norm": 2.03125, "learning_rate": 4.7945205479452054e-06, "loss": 1.20751362, "memory(GiB)": 368.61, "step": 1890, "train_speed(iter/s)": 0.215103 }, { "acc": 0.7239717, "epoch": 0.04807204464738711, "grad_norm": 2.515625, "learning_rate": 4.807204464738711e-06, "loss": 1.16265554, "memory(GiB)": 368.61, "step": 1895, "train_speed(iter/s)": 0.21512 }, { "acc": 0.7145359, "epoch": 0.04819888381532217, "grad_norm": 3.625, "learning_rate": 4.819888381532217e-06, "loss": 1.18276482, "memory(GiB)": 368.61, "step": 1900, "train_speed(iter/s)": 0.215154 }, { "acc": 0.71529121, "epoch": 0.04832572298325723, "grad_norm": 2.90625, "learning_rate": 4.832572298325723e-06, "loss": 1.1730238, "memory(GiB)": 368.61, "step": 1905, "train_speed(iter/s)": 0.215146 }, { "acc": 0.71257401, "epoch": 0.048452562151192285, "grad_norm": 2.65625, "learning_rate": 4.845256215119229e-06, "loss": 1.17314777, "memory(GiB)": 368.61, "step": 1910, "train_speed(iter/s)": 0.215138 }, { "acc": 0.71851349, "epoch": 0.048579401319127344, "grad_norm": 2.15625, "learning_rate": 4.857940131912735e-06, "loss": 1.18030472, "memory(GiB)": 368.61, "step": 1915, "train_speed(iter/s)": 0.215158 }, { "acc": 0.72114267, "epoch": 0.0487062404870624, "grad_norm": 2.1875, "learning_rate": 4.87062404870624e-06, "loss": 1.1065196, "memory(GiB)": 368.61, "step": 1920, "train_speed(iter/s)": 0.215162 }, { "acc": 0.70176163, "epoch": 0.04883307965499746, "grad_norm": 2.28125, "learning_rate": 4.883307965499746e-06, "loss": 1.21968546, "memory(GiB)": 368.61, "step": 1925, "train_speed(iter/s)": 0.215217 }, { "acc": 0.70226002, "epoch": 0.04895991882293252, "grad_norm": 2.15625, "learning_rate": 4.895991882293252e-06, "loss": 1.25086126, "memory(GiB)": 368.61, "step": 1930, "train_speed(iter/s)": 0.215264 }, { "acc": 0.71436086, "epoch": 0.04908675799086758, "grad_norm": 2.296875, "learning_rate": 4.908675799086758e-06, "loss": 1.14739237, "memory(GiB)": 368.61, "step": 1935, "train_speed(iter/s)": 0.215305 }, { "acc": 0.71588769, "epoch": 0.049213597158802636, "grad_norm": 2.1875, "learning_rate": 4.921359715880264e-06, "loss": 1.21228409, "memory(GiB)": 368.61, "step": 1940, "train_speed(iter/s)": 0.215357 }, { "acc": 0.71544075, "epoch": 0.049340436326737694, "grad_norm": 2.15625, "learning_rate": 4.93404363267377e-06, "loss": 1.18400126, "memory(GiB)": 368.61, "step": 1945, "train_speed(iter/s)": 0.215378 }, { "acc": 0.73573608, "epoch": 0.04946727549467275, "grad_norm": 1.828125, "learning_rate": 4.946727549467275e-06, "loss": 1.13237867, "memory(GiB)": 368.61, "step": 1950, "train_speed(iter/s)": 0.215444 }, { "acc": 0.72503214, "epoch": 0.04959411466260781, "grad_norm": 2.21875, "learning_rate": 4.959411466260781e-06, "loss": 1.109375, "memory(GiB)": 368.61, "step": 1955, "train_speed(iter/s)": 0.215517 }, { "acc": 0.72331491, "epoch": 0.04972095383054287, "grad_norm": 2.4375, "learning_rate": 4.972095383054287e-06, "loss": 1.21074104, "memory(GiB)": 368.61, "step": 1960, "train_speed(iter/s)": 0.215599 }, { "acc": 0.7225297, "epoch": 0.04984779299847793, "grad_norm": 2.40625, "learning_rate": 4.984779299847793e-06, "loss": 1.12964401, "memory(GiB)": 368.61, "step": 1965, "train_speed(iter/s)": 0.215643 }, { "acc": 0.71861229, "epoch": 0.049974632166412986, "grad_norm": 2.859375, "learning_rate": 4.997463216641299e-06, "loss": 1.21870165, "memory(GiB)": 368.61, "step": 1970, "train_speed(iter/s)": 0.215713 }, { "acc": 0.71985264, "epoch": 0.050101471334348044, "grad_norm": 2.3125, "learning_rate": 5.010147133434805e-06, "loss": 1.18171082, "memory(GiB)": 368.61, "step": 1975, "train_speed(iter/s)": 0.215773 }, { "acc": 0.72492809, "epoch": 0.0502283105022831, "grad_norm": 2.515625, "learning_rate": 5.02283105022831e-06, "loss": 1.13915653, "memory(GiB)": 368.61, "step": 1980, "train_speed(iter/s)": 0.215796 }, { "acc": 0.70064459, "epoch": 0.05035514967021816, "grad_norm": 2.765625, "learning_rate": 5.035514967021817e-06, "loss": 1.27141895, "memory(GiB)": 368.61, "step": 1985, "train_speed(iter/s)": 0.215844 }, { "acc": 0.72150631, "epoch": 0.05048198883815322, "grad_norm": 2.015625, "learning_rate": 5.048198883815323e-06, "loss": 1.16398497, "memory(GiB)": 368.61, "step": 1990, "train_speed(iter/s)": 0.215906 }, { "acc": 0.70610275, "epoch": 0.05060882800608828, "grad_norm": 2.484375, "learning_rate": 5.060882800608828e-06, "loss": 1.2341753, "memory(GiB)": 368.61, "step": 1995, "train_speed(iter/s)": 0.21597 }, { "acc": 0.7120717, "epoch": 0.050735667174023336, "grad_norm": 2.359375, "learning_rate": 5.073566717402334e-06, "loss": 1.19752502, "memory(GiB)": 368.61, "step": 2000, "train_speed(iter/s)": 0.216056 }, { "epoch": 0.050735667174023336, "eval_acc": 0.7043548482105146, "eval_loss": 1.1560105085372925, "eval_runtime": 384.9523, "eval_samples_per_second": 16.548, "eval_steps_per_second": 8.274, "step": 2000 }, { "acc": 0.71890116, "epoch": 0.050862506341958394, "grad_norm": 2.578125, "learning_rate": 5.086250634195841e-06, "loss": 1.1635643, "memory(GiB)": 368.61, "step": 2005, "train_speed(iter/s)": 0.201362 }, { "acc": 0.7069828, "epoch": 0.05098934550989345, "grad_norm": 2.25, "learning_rate": 5.098934550989346e-06, "loss": 1.17529182, "memory(GiB)": 368.61, "step": 2010, "train_speed(iter/s)": 0.201484 }, { "acc": 0.72755213, "epoch": 0.05111618467782851, "grad_norm": 2.765625, "learning_rate": 5.111618467782852e-06, "loss": 1.15269604, "memory(GiB)": 368.61, "step": 2015, "train_speed(iter/s)": 0.201556 }, { "acc": 0.71663647, "epoch": 0.05124302384576357, "grad_norm": 2.15625, "learning_rate": 5.124302384576357e-06, "loss": 1.16693144, "memory(GiB)": 368.61, "step": 2020, "train_speed(iter/s)": 0.201625 }, { "acc": 0.71790447, "epoch": 0.05136986301369863, "grad_norm": 2.234375, "learning_rate": 5.136986301369864e-06, "loss": 1.1630415, "memory(GiB)": 368.61, "step": 2025, "train_speed(iter/s)": 0.201682 }, { "acc": 0.71401701, "epoch": 0.051496702181633686, "grad_norm": 2.515625, "learning_rate": 5.149670218163369e-06, "loss": 1.12907391, "memory(GiB)": 368.61, "step": 2030, "train_speed(iter/s)": 0.20177 }, { "acc": 0.72034936, "epoch": 0.051623541349568744, "grad_norm": 2.765625, "learning_rate": 5.162354134956875e-06, "loss": 1.22061062, "memory(GiB)": 368.61, "step": 2035, "train_speed(iter/s)": 0.201849 }, { "acc": 0.72656493, "epoch": 0.0517503805175038, "grad_norm": 2.578125, "learning_rate": 5.175038051750381e-06, "loss": 1.14752302, "memory(GiB)": 368.61, "step": 2040, "train_speed(iter/s)": 0.201925 }, { "acc": 0.71860065, "epoch": 0.05187721968543886, "grad_norm": 3.25, "learning_rate": 5.187721968543887e-06, "loss": 1.19526348, "memory(GiB)": 368.61, "step": 2045, "train_speed(iter/s)": 0.202024 }, { "acc": 0.71781025, "epoch": 0.05200405885337392, "grad_norm": 2.328125, "learning_rate": 5.200405885337393e-06, "loss": 1.12509279, "memory(GiB)": 368.61, "step": 2050, "train_speed(iter/s)": 0.202069 }, { "acc": 0.7176774, "epoch": 0.05213089802130898, "grad_norm": 2.53125, "learning_rate": 5.213089802130898e-06, "loss": 1.15644932, "memory(GiB)": 368.61, "step": 2055, "train_speed(iter/s)": 0.202141 }, { "acc": 0.72167883, "epoch": 0.052257737189244036, "grad_norm": 2.015625, "learning_rate": 5.225773718924404e-06, "loss": 1.13719196, "memory(GiB)": 368.61, "step": 2060, "train_speed(iter/s)": 0.202176 }, { "acc": 0.69164829, "epoch": 0.052384576357179094, "grad_norm": 2.5, "learning_rate": 5.238457635717911e-06, "loss": 1.30127983, "memory(GiB)": 368.61, "step": 2065, "train_speed(iter/s)": 0.202245 }, { "acc": 0.72059307, "epoch": 0.05251141552511415, "grad_norm": 2.4375, "learning_rate": 5.251141552511416e-06, "loss": 1.19031296, "memory(GiB)": 368.61, "step": 2070, "train_speed(iter/s)": 0.202309 }, { "acc": 0.70207314, "epoch": 0.05263825469304921, "grad_norm": 2.203125, "learning_rate": 5.263825469304922e-06, "loss": 1.23805256, "memory(GiB)": 368.61, "step": 2075, "train_speed(iter/s)": 0.202382 }, { "acc": 0.69533229, "epoch": 0.05276509386098427, "grad_norm": 2.296875, "learning_rate": 5.276509386098427e-06, "loss": 1.24168034, "memory(GiB)": 368.61, "step": 2080, "train_speed(iter/s)": 0.20247 }, { "acc": 0.71109486, "epoch": 0.05289193302891933, "grad_norm": 2.390625, "learning_rate": 5.289193302891934e-06, "loss": 1.15872955, "memory(GiB)": 368.61, "step": 2085, "train_speed(iter/s)": 0.202525 }, { "acc": 0.72278967, "epoch": 0.053018772196854386, "grad_norm": 2.046875, "learning_rate": 5.30187721968544e-06, "loss": 1.14352875, "memory(GiB)": 368.61, "step": 2090, "train_speed(iter/s)": 0.202613 }, { "acc": 0.7074378, "epoch": 0.053145611364789444, "grad_norm": 2.046875, "learning_rate": 5.314561136478945e-06, "loss": 1.21018066, "memory(GiB)": 368.61, "step": 2095, "train_speed(iter/s)": 0.202661 }, { "acc": 0.71318331, "epoch": 0.0532724505327245, "grad_norm": 2.265625, "learning_rate": 5.327245053272451e-06, "loss": 1.1899353, "memory(GiB)": 368.61, "step": 2100, "train_speed(iter/s)": 0.202731 }, { "acc": 0.73359394, "epoch": 0.05339928970065956, "grad_norm": 2.625, "learning_rate": 5.339928970065957e-06, "loss": 1.10934029, "memory(GiB)": 368.61, "step": 2105, "train_speed(iter/s)": 0.202819 }, { "acc": 0.72820601, "epoch": 0.05352612886859462, "grad_norm": 3.078125, "learning_rate": 5.352612886859463e-06, "loss": 1.19632225, "memory(GiB)": 368.61, "step": 2110, "train_speed(iter/s)": 0.202892 }, { "acc": 0.71673074, "epoch": 0.05365296803652968, "grad_norm": 2.3125, "learning_rate": 5.365296803652969e-06, "loss": 1.15741949, "memory(GiB)": 368.61, "step": 2115, "train_speed(iter/s)": 0.202974 }, { "acc": 0.71168146, "epoch": 0.053779807204464736, "grad_norm": 2.328125, "learning_rate": 5.377980720446474e-06, "loss": 1.14071989, "memory(GiB)": 368.61, "step": 2120, "train_speed(iter/s)": 0.203005 }, { "acc": 0.70732088, "epoch": 0.053906646372399794, "grad_norm": 2.625, "learning_rate": 5.390664637239981e-06, "loss": 1.18536816, "memory(GiB)": 368.61, "step": 2125, "train_speed(iter/s)": 0.203077 }, { "acc": 0.71229, "epoch": 0.05403348554033485, "grad_norm": 2.34375, "learning_rate": 5.403348554033486e-06, "loss": 1.17797298, "memory(GiB)": 368.61, "step": 2130, "train_speed(iter/s)": 0.203169 }, { "acc": 0.7279676, "epoch": 0.05416032470826991, "grad_norm": 2.15625, "learning_rate": 5.416032470826992e-06, "loss": 1.11711845, "memory(GiB)": 368.61, "step": 2135, "train_speed(iter/s)": 0.203251 }, { "acc": 0.73158326, "epoch": 0.05428716387620497, "grad_norm": 2.171875, "learning_rate": 5.428716387620497e-06, "loss": 1.14432068, "memory(GiB)": 368.61, "step": 2140, "train_speed(iter/s)": 0.203303 }, { "acc": 0.71476722, "epoch": 0.05441400304414003, "grad_norm": 2.40625, "learning_rate": 5.441400304414004e-06, "loss": 1.18934612, "memory(GiB)": 368.61, "step": 2145, "train_speed(iter/s)": 0.203405 }, { "acc": 0.71937442, "epoch": 0.054540842212075086, "grad_norm": 4.875, "learning_rate": 5.45408422120751e-06, "loss": 1.18501949, "memory(GiB)": 368.61, "step": 2150, "train_speed(iter/s)": 0.203456 }, { "acc": 0.70774174, "epoch": 0.054667681380010144, "grad_norm": 2.09375, "learning_rate": 5.466768138001015e-06, "loss": 1.25616207, "memory(GiB)": 368.61, "step": 2155, "train_speed(iter/s)": 0.203532 }, { "acc": 0.72152176, "epoch": 0.0547945205479452, "grad_norm": 2.046875, "learning_rate": 5.479452054794521e-06, "loss": 1.19778566, "memory(GiB)": 368.61, "step": 2160, "train_speed(iter/s)": 0.203619 }, { "acc": 0.70694833, "epoch": 0.05492135971588026, "grad_norm": 2.90625, "learning_rate": 5.492135971588028e-06, "loss": 1.17392483, "memory(GiB)": 368.61, "step": 2165, "train_speed(iter/s)": 0.203711 }, { "acc": 0.69939289, "epoch": 0.05504819888381532, "grad_norm": 1.921875, "learning_rate": 5.504819888381533e-06, "loss": 1.26126318, "memory(GiB)": 368.61, "step": 2170, "train_speed(iter/s)": 0.203782 }, { "acc": 0.7172267, "epoch": 0.05517503805175038, "grad_norm": 2.1875, "learning_rate": 5.517503805175039e-06, "loss": 1.1652009, "memory(GiB)": 368.61, "step": 2175, "train_speed(iter/s)": 0.20382 }, { "acc": 0.71726351, "epoch": 0.055301877219685436, "grad_norm": 2.4375, "learning_rate": 5.530187721968544e-06, "loss": 1.16243095, "memory(GiB)": 368.61, "step": 2180, "train_speed(iter/s)": 0.203839 }, { "acc": 0.71075759, "epoch": 0.055428716387620494, "grad_norm": 2.5625, "learning_rate": 5.542871638762051e-06, "loss": 1.21801834, "memory(GiB)": 368.61, "step": 2185, "train_speed(iter/s)": 0.203879 }, { "acc": 0.70980539, "epoch": 0.05555555555555555, "grad_norm": 2.25, "learning_rate": 5.555555555555557e-06, "loss": 1.23910122, "memory(GiB)": 368.61, "step": 2190, "train_speed(iter/s)": 0.203924 }, { "acc": 0.72193117, "epoch": 0.05568239472349061, "grad_norm": 1.7890625, "learning_rate": 5.568239472349062e-06, "loss": 1.17492046, "memory(GiB)": 368.61, "step": 2195, "train_speed(iter/s)": 0.204 }, { "acc": 0.71564255, "epoch": 0.05580923389142567, "grad_norm": 3.09375, "learning_rate": 5.580923389142568e-06, "loss": 1.20739002, "memory(GiB)": 368.61, "step": 2200, "train_speed(iter/s)": 0.204075 }, { "acc": 0.71219196, "epoch": 0.05593607305936073, "grad_norm": 2.328125, "learning_rate": 5.593607305936074e-06, "loss": 1.19609261, "memory(GiB)": 368.61, "step": 2205, "train_speed(iter/s)": 0.204087 }, { "acc": 0.71430511, "epoch": 0.056062912227295786, "grad_norm": 2.109375, "learning_rate": 5.60629122272958e-06, "loss": 1.16380005, "memory(GiB)": 368.61, "step": 2210, "train_speed(iter/s)": 0.204153 }, { "acc": 0.71490288, "epoch": 0.056189751395230844, "grad_norm": 2.546875, "learning_rate": 5.618975139523085e-06, "loss": 1.23275061, "memory(GiB)": 368.61, "step": 2215, "train_speed(iter/s)": 0.20421 }, { "acc": 0.72055745, "epoch": 0.0563165905631659, "grad_norm": 2.09375, "learning_rate": 5.631659056316591e-06, "loss": 1.10931759, "memory(GiB)": 368.61, "step": 2220, "train_speed(iter/s)": 0.204278 }, { "acc": 0.71948996, "epoch": 0.05644342973110096, "grad_norm": 2.34375, "learning_rate": 5.644342973110098e-06, "loss": 1.16461544, "memory(GiB)": 368.61, "step": 2225, "train_speed(iter/s)": 0.204354 }, { "acc": 0.72391787, "epoch": 0.05657026889903602, "grad_norm": 2.46875, "learning_rate": 5.657026889903603e-06, "loss": 1.1592205, "memory(GiB)": 368.61, "step": 2230, "train_speed(iter/s)": 0.204381 }, { "acc": 0.71375637, "epoch": 0.05669710806697108, "grad_norm": 1.8828125, "learning_rate": 5.669710806697109e-06, "loss": 1.21173067, "memory(GiB)": 368.61, "step": 2235, "train_speed(iter/s)": 0.204442 }, { "acc": 0.71706305, "epoch": 0.056823947234906136, "grad_norm": 5.84375, "learning_rate": 5.682394723490614e-06, "loss": 1.20089169, "memory(GiB)": 368.61, "step": 2240, "train_speed(iter/s)": 0.204506 }, { "acc": 0.70849895, "epoch": 0.056950786402841194, "grad_norm": 2.0625, "learning_rate": 5.695078640284121e-06, "loss": 1.19618158, "memory(GiB)": 368.61, "step": 2245, "train_speed(iter/s)": 0.204596 }, { "acc": 0.72221699, "epoch": 0.05707762557077625, "grad_norm": 2.328125, "learning_rate": 5.7077625570776266e-06, "loss": 1.13436441, "memory(GiB)": 368.61, "step": 2250, "train_speed(iter/s)": 0.204684 }, { "acc": 0.71266088, "epoch": 0.05720446473871131, "grad_norm": 2.109375, "learning_rate": 5.720446473871132e-06, "loss": 1.21465921, "memory(GiB)": 368.61, "step": 2255, "train_speed(iter/s)": 0.204769 }, { "acc": 0.73061504, "epoch": 0.05733130390664637, "grad_norm": 2.34375, "learning_rate": 5.733130390664638e-06, "loss": 1.09400196, "memory(GiB)": 368.61, "step": 2260, "train_speed(iter/s)": 0.204838 }, { "acc": 0.71387711, "epoch": 0.05745814307458143, "grad_norm": 2.6875, "learning_rate": 5.7458143074581445e-06, "loss": 1.1826313, "memory(GiB)": 368.61, "step": 2265, "train_speed(iter/s)": 0.20491 }, { "acc": 0.71546655, "epoch": 0.057584982242516486, "grad_norm": 2.390625, "learning_rate": 5.75849822425165e-06, "loss": 1.19562092, "memory(GiB)": 368.61, "step": 2270, "train_speed(iter/s)": 0.204997 }, { "acc": 0.70478883, "epoch": 0.057711821410451544, "grad_norm": 2.34375, "learning_rate": 5.7711821410451556e-06, "loss": 1.24934177, "memory(GiB)": 368.61, "step": 2275, "train_speed(iter/s)": 0.205039 }, { "acc": 0.723385, "epoch": 0.0578386605783866, "grad_norm": 2.171875, "learning_rate": 5.783866057838661e-06, "loss": 1.13148174, "memory(GiB)": 368.61, "step": 2280, "train_speed(iter/s)": 0.205102 }, { "acc": 0.7013629, "epoch": 0.05796549974632166, "grad_norm": 2.265625, "learning_rate": 5.7965499746321675e-06, "loss": 1.24249363, "memory(GiB)": 368.61, "step": 2285, "train_speed(iter/s)": 0.205169 }, { "acc": 0.71498299, "epoch": 0.05809233891425672, "grad_norm": 2.515625, "learning_rate": 5.809233891425673e-06, "loss": 1.20102062, "memory(GiB)": 368.61, "step": 2290, "train_speed(iter/s)": 0.205199 }, { "acc": 0.72681522, "epoch": 0.05821917808219178, "grad_norm": 2.234375, "learning_rate": 5.821917808219179e-06, "loss": 1.13309422, "memory(GiB)": 368.61, "step": 2295, "train_speed(iter/s)": 0.205268 }, { "acc": 0.72278872, "epoch": 0.058346017250126836, "grad_norm": 2.234375, "learning_rate": 5.834601725012684e-06, "loss": 1.14178524, "memory(GiB)": 368.61, "step": 2300, "train_speed(iter/s)": 0.20529 }, { "acc": 0.71350398, "epoch": 0.058472856418061894, "grad_norm": 2.5625, "learning_rate": 5.8472856418061905e-06, "loss": 1.18933887, "memory(GiB)": 368.61, "step": 2305, "train_speed(iter/s)": 0.205339 }, { "acc": 0.71697283, "epoch": 0.05859969558599695, "grad_norm": 2.390625, "learning_rate": 5.8599695585996965e-06, "loss": 1.15378876, "memory(GiB)": 368.61, "step": 2310, "train_speed(iter/s)": 0.205399 }, { "acc": 0.72198954, "epoch": 0.05872653475393201, "grad_norm": 1.9921875, "learning_rate": 5.872653475393202e-06, "loss": 1.1266058, "memory(GiB)": 368.61, "step": 2315, "train_speed(iter/s)": 0.205455 }, { "acc": 0.7215312, "epoch": 0.05885337392186707, "grad_norm": 2.34375, "learning_rate": 5.8853373921867076e-06, "loss": 1.23985109, "memory(GiB)": 368.61, "step": 2320, "train_speed(iter/s)": 0.205506 }, { "acc": 0.73134651, "epoch": 0.05898021308980213, "grad_norm": 2.40625, "learning_rate": 5.898021308980214e-06, "loss": 1.1188056, "memory(GiB)": 368.61, "step": 2325, "train_speed(iter/s)": 0.20555 }, { "acc": 0.70272422, "epoch": 0.059107052257737186, "grad_norm": 2.578125, "learning_rate": 5.9107052257737195e-06, "loss": 1.28168879, "memory(GiB)": 368.61, "step": 2330, "train_speed(iter/s)": 0.205573 }, { "acc": 0.71831431, "epoch": 0.059233891425672244, "grad_norm": 2.140625, "learning_rate": 5.9233891425672255e-06, "loss": 1.23153753, "memory(GiB)": 368.61, "step": 2335, "train_speed(iter/s)": 0.205637 }, { "acc": 0.73288779, "epoch": 0.0593607305936073, "grad_norm": 2.34375, "learning_rate": 5.936073059360731e-06, "loss": 1.10426483, "memory(GiB)": 368.61, "step": 2340, "train_speed(iter/s)": 0.205718 }, { "acc": 0.72807922, "epoch": 0.05948756976154236, "grad_norm": 2.234375, "learning_rate": 5.948756976154237e-06, "loss": 1.16011105, "memory(GiB)": 368.61, "step": 2345, "train_speed(iter/s)": 0.205768 }, { "acc": 0.72729702, "epoch": 0.05961440892947742, "grad_norm": 2.234375, "learning_rate": 5.961440892947743e-06, "loss": 1.17491999, "memory(GiB)": 368.61, "step": 2350, "train_speed(iter/s)": 0.205823 }, { "acc": 0.72781162, "epoch": 0.05974124809741248, "grad_norm": 2.34375, "learning_rate": 5.9741248097412485e-06, "loss": 1.11419735, "memory(GiB)": 368.61, "step": 2355, "train_speed(iter/s)": 0.205893 }, { "acc": 0.72597222, "epoch": 0.059868087265347536, "grad_norm": 2.09375, "learning_rate": 5.9868087265347545e-06, "loss": 1.15922832, "memory(GiB)": 368.61, "step": 2360, "train_speed(iter/s)": 0.205963 }, { "acc": 0.72740717, "epoch": 0.059994926433282594, "grad_norm": 2.390625, "learning_rate": 5.99949264332826e-06, "loss": 1.13273296, "memory(GiB)": 368.61, "step": 2365, "train_speed(iter/s)": 0.205936 }, { "acc": 0.71413913, "epoch": 0.06012176560121765, "grad_norm": 2.015625, "learning_rate": 6.012176560121766e-06, "loss": 1.16828451, "memory(GiB)": 368.61, "step": 2370, "train_speed(iter/s)": 0.205976 }, { "acc": 0.72816858, "epoch": 0.06024860476915271, "grad_norm": 2.3125, "learning_rate": 6.0248604769152715e-06, "loss": 1.17847824, "memory(GiB)": 368.61, "step": 2375, "train_speed(iter/s)": 0.206028 }, { "acc": 0.73078432, "epoch": 0.06037544393708777, "grad_norm": 2.765625, "learning_rate": 6.0375443937087775e-06, "loss": 1.09621315, "memory(GiB)": 368.61, "step": 2380, "train_speed(iter/s)": 0.206074 }, { "acc": 0.72156239, "epoch": 0.06050228310502283, "grad_norm": 2.09375, "learning_rate": 6.050228310502284e-06, "loss": 1.24162006, "memory(GiB)": 368.61, "step": 2385, "train_speed(iter/s)": 0.206107 }, { "acc": 0.72205791, "epoch": 0.060629122272957886, "grad_norm": 2.109375, "learning_rate": 6.062912227295789e-06, "loss": 1.15279827, "memory(GiB)": 368.61, "step": 2390, "train_speed(iter/s)": 0.206163 }, { "acc": 0.71550217, "epoch": 0.060755961440892944, "grad_norm": 2.171875, "learning_rate": 6.075596144089295e-06, "loss": 1.12107115, "memory(GiB)": 368.61, "step": 2395, "train_speed(iter/s)": 0.20616 }, { "acc": 0.71892586, "epoch": 0.060882800608828, "grad_norm": 2.40625, "learning_rate": 6.0882800608828005e-06, "loss": 1.21744032, "memory(GiB)": 368.61, "step": 2400, "train_speed(iter/s)": 0.206233 }, { "acc": 0.70745955, "epoch": 0.06100963977676306, "grad_norm": 2.015625, "learning_rate": 6.100963977676307e-06, "loss": 1.21105804, "memory(GiB)": 368.61, "step": 2405, "train_speed(iter/s)": 0.20627 }, { "acc": 0.71549311, "epoch": 0.06113647894469812, "grad_norm": 2.109375, "learning_rate": 6.113647894469813e-06, "loss": 1.16728354, "memory(GiB)": 368.61, "step": 2410, "train_speed(iter/s)": 0.206333 }, { "acc": 0.73254848, "epoch": 0.06126331811263318, "grad_norm": 2.0625, "learning_rate": 6.126331811263318e-06, "loss": 1.11597424, "memory(GiB)": 368.61, "step": 2415, "train_speed(iter/s)": 0.206356 }, { "acc": 0.73595867, "epoch": 0.061390157280568236, "grad_norm": 2.265625, "learning_rate": 6.139015728056824e-06, "loss": 1.12051773, "memory(GiB)": 368.61, "step": 2420, "train_speed(iter/s)": 0.206422 }, { "acc": 0.71729565, "epoch": 0.061516996448503294, "grad_norm": 2.109375, "learning_rate": 6.151699644850331e-06, "loss": 1.17029037, "memory(GiB)": 368.61, "step": 2425, "train_speed(iter/s)": 0.20647 }, { "acc": 0.72923269, "epoch": 0.06164383561643835, "grad_norm": 1.9765625, "learning_rate": 6.164383561643836e-06, "loss": 1.11848583, "memory(GiB)": 368.61, "step": 2430, "train_speed(iter/s)": 0.206519 }, { "acc": 0.7136055, "epoch": 0.06177067478437341, "grad_norm": 2.15625, "learning_rate": 6.177067478437342e-06, "loss": 1.15528316, "memory(GiB)": 368.61, "step": 2435, "train_speed(iter/s)": 0.206534 }, { "acc": 0.70832682, "epoch": 0.06189751395230847, "grad_norm": 2.21875, "learning_rate": 6.189751395230847e-06, "loss": 1.17865028, "memory(GiB)": 368.61, "step": 2440, "train_speed(iter/s)": 0.206575 }, { "acc": 0.71370149, "epoch": 0.06202435312024353, "grad_norm": 3.109375, "learning_rate": 6.202435312024354e-06, "loss": 1.21943893, "memory(GiB)": 368.61, "step": 2445, "train_speed(iter/s)": 0.206645 }, { "acc": 0.74225636, "epoch": 0.062151192288178586, "grad_norm": 2.3125, "learning_rate": 6.215119228817859e-06, "loss": 1.09130135, "memory(GiB)": 368.61, "step": 2450, "train_speed(iter/s)": 0.206709 }, { "acc": 0.70767851, "epoch": 0.062278031456113644, "grad_norm": 2.140625, "learning_rate": 6.227803145611365e-06, "loss": 1.22830849, "memory(GiB)": 368.61, "step": 2455, "train_speed(iter/s)": 0.20677 }, { "acc": 0.70428419, "epoch": 0.0624048706240487, "grad_norm": 2.21875, "learning_rate": 6.24048706240487e-06, "loss": 1.20928841, "memory(GiB)": 368.61, "step": 2460, "train_speed(iter/s)": 0.206852 }, { "acc": 0.72572961, "epoch": 0.06253170979198376, "grad_norm": 2.859375, "learning_rate": 6.253170979198377e-06, "loss": 1.13781376, "memory(GiB)": 368.61, "step": 2465, "train_speed(iter/s)": 0.206885 }, { "acc": 0.7126749, "epoch": 0.06265854895991882, "grad_norm": 2.1875, "learning_rate": 6.265854895991883e-06, "loss": 1.19686584, "memory(GiB)": 368.61, "step": 2470, "train_speed(iter/s)": 0.206911 }, { "acc": 0.72915945, "epoch": 0.06278538812785388, "grad_norm": 2.359375, "learning_rate": 6.278538812785388e-06, "loss": 1.15547056, "memory(GiB)": 368.61, "step": 2475, "train_speed(iter/s)": 0.206974 }, { "acc": 0.72068825, "epoch": 0.06291222729578894, "grad_norm": 2.234375, "learning_rate": 6.291222729578894e-06, "loss": 1.17749367, "memory(GiB)": 368.61, "step": 2480, "train_speed(iter/s)": 0.20699 }, { "acc": 0.71706486, "epoch": 0.063039066463724, "grad_norm": 2.03125, "learning_rate": 6.303906646372401e-06, "loss": 1.20591259, "memory(GiB)": 368.61, "step": 2485, "train_speed(iter/s)": 0.207043 }, { "acc": 0.72788143, "epoch": 0.06316590563165905, "grad_norm": 2.25, "learning_rate": 6.316590563165906e-06, "loss": 1.11569948, "memory(GiB)": 368.61, "step": 2490, "train_speed(iter/s)": 0.207093 }, { "acc": 0.71660585, "epoch": 0.06329274479959411, "grad_norm": 2.453125, "learning_rate": 6.329274479959412e-06, "loss": 1.19179325, "memory(GiB)": 368.61, "step": 2495, "train_speed(iter/s)": 0.207176 }, { "acc": 0.71758194, "epoch": 0.06341958396752917, "grad_norm": 2.078125, "learning_rate": 6.341958396752917e-06, "loss": 1.20597458, "memory(GiB)": 368.61, "step": 2500, "train_speed(iter/s)": 0.207231 }, { "acc": 0.72265959, "epoch": 0.06354642313546423, "grad_norm": 2.59375, "learning_rate": 6.354642313546424e-06, "loss": 1.19365883, "memory(GiB)": 368.61, "step": 2505, "train_speed(iter/s)": 0.207228 }, { "acc": 0.71927528, "epoch": 0.06367326230339929, "grad_norm": 1.8671875, "learning_rate": 6.36732623033993e-06, "loss": 1.21218166, "memory(GiB)": 368.61, "step": 2510, "train_speed(iter/s)": 0.20726 }, { "acc": 0.72662921, "epoch": 0.06380010147133434, "grad_norm": 2.078125, "learning_rate": 6.380010147133435e-06, "loss": 1.09274683, "memory(GiB)": 368.61, "step": 2515, "train_speed(iter/s)": 0.207279 }, { "acc": 0.72653575, "epoch": 0.0639269406392694, "grad_norm": 2.15625, "learning_rate": 6.392694063926941e-06, "loss": 1.14963064, "memory(GiB)": 368.61, "step": 2520, "train_speed(iter/s)": 0.20734 }, { "acc": 0.71106076, "epoch": 0.06405377980720446, "grad_norm": 2.34375, "learning_rate": 6.405377980720447e-06, "loss": 1.1981389, "memory(GiB)": 368.61, "step": 2525, "train_speed(iter/s)": 0.207342 }, { "acc": 0.73632197, "epoch": 0.06418061897513952, "grad_norm": 2.640625, "learning_rate": 6.418061897513953e-06, "loss": 1.07006664, "memory(GiB)": 368.61, "step": 2530, "train_speed(iter/s)": 0.207374 }, { "acc": 0.71981983, "epoch": 0.06430745814307458, "grad_norm": 2.796875, "learning_rate": 6.430745814307458e-06, "loss": 1.13979692, "memory(GiB)": 368.61, "step": 2535, "train_speed(iter/s)": 0.207438 }, { "acc": 0.7165185, "epoch": 0.06443429731100964, "grad_norm": 2.078125, "learning_rate": 6.443429731100964e-06, "loss": 1.17061491, "memory(GiB)": 368.61, "step": 2540, "train_speed(iter/s)": 0.207491 }, { "acc": 0.74141483, "epoch": 0.0645611364789447, "grad_norm": 2.21875, "learning_rate": 6.456113647894471e-06, "loss": 1.12227573, "memory(GiB)": 368.61, "step": 2545, "train_speed(iter/s)": 0.207552 }, { "acc": 0.72301884, "epoch": 0.06468797564687975, "grad_norm": 1.734375, "learning_rate": 6.468797564687976e-06, "loss": 1.09901772, "memory(GiB)": 368.61, "step": 2550, "train_speed(iter/s)": 0.207571 }, { "acc": 0.73839626, "epoch": 0.06481481481481481, "grad_norm": 2.578125, "learning_rate": 6.481481481481482e-06, "loss": 1.05975552, "memory(GiB)": 368.61, "step": 2555, "train_speed(iter/s)": 0.207636 }, { "acc": 0.72273254, "epoch": 0.06494165398274987, "grad_norm": 2.125, "learning_rate": 6.494165398274987e-06, "loss": 1.15259867, "memory(GiB)": 368.61, "step": 2560, "train_speed(iter/s)": 0.207686 }, { "acc": 0.70492592, "epoch": 0.06506849315068493, "grad_norm": 2.21875, "learning_rate": 6.506849315068494e-06, "loss": 1.19082603, "memory(GiB)": 368.61, "step": 2565, "train_speed(iter/s)": 0.207728 }, { "acc": 0.7167285, "epoch": 0.06519533231861999, "grad_norm": 2.3125, "learning_rate": 6.519533231862e-06, "loss": 1.18185778, "memory(GiB)": 368.61, "step": 2570, "train_speed(iter/s)": 0.207791 }, { "acc": 0.72905297, "epoch": 0.06532217148655504, "grad_norm": 2.09375, "learning_rate": 6.532217148655505e-06, "loss": 1.09680376, "memory(GiB)": 368.61, "step": 2575, "train_speed(iter/s)": 0.207831 }, { "acc": 0.71726704, "epoch": 0.0654490106544901, "grad_norm": 2.0625, "learning_rate": 6.544901065449011e-06, "loss": 1.26496067, "memory(GiB)": 368.61, "step": 2580, "train_speed(iter/s)": 0.207885 }, { "acc": 0.71249876, "epoch": 0.06557584982242516, "grad_norm": 3.203125, "learning_rate": 6.557584982242518e-06, "loss": 1.24171057, "memory(GiB)": 368.61, "step": 2585, "train_speed(iter/s)": 0.207952 }, { "acc": 0.71954827, "epoch": 0.06570268899036022, "grad_norm": 2.0, "learning_rate": 6.570268899036023e-06, "loss": 1.16396427, "memory(GiB)": 368.61, "step": 2590, "train_speed(iter/s)": 0.208006 }, { "acc": 0.7224617, "epoch": 0.06582952815829528, "grad_norm": 2.046875, "learning_rate": 6.582952815829529e-06, "loss": 1.18274841, "memory(GiB)": 368.61, "step": 2595, "train_speed(iter/s)": 0.208061 }, { "acc": 0.71182189, "epoch": 0.06595636732623034, "grad_norm": 2.578125, "learning_rate": 6.595636732623034e-06, "loss": 1.23221989, "memory(GiB)": 368.61, "step": 2600, "train_speed(iter/s)": 0.208103 }, { "acc": 0.72690878, "epoch": 0.0660832064941654, "grad_norm": 2.625, "learning_rate": 6.608320649416541e-06, "loss": 1.09808311, "memory(GiB)": 368.61, "step": 2605, "train_speed(iter/s)": 0.208142 }, { "acc": 0.71896267, "epoch": 0.06621004566210045, "grad_norm": 2.6875, "learning_rate": 6.621004566210046e-06, "loss": 1.17133999, "memory(GiB)": 368.61, "step": 2610, "train_speed(iter/s)": 0.208209 }, { "acc": 0.721137, "epoch": 0.06633688483003551, "grad_norm": 2.515625, "learning_rate": 6.633688483003552e-06, "loss": 1.13451347, "memory(GiB)": 368.61, "step": 2615, "train_speed(iter/s)": 0.208207 }, { "acc": 0.71738038, "epoch": 0.06646372399797057, "grad_norm": 2.46875, "learning_rate": 6.646372399797057e-06, "loss": 1.15962582, "memory(GiB)": 368.61, "step": 2620, "train_speed(iter/s)": 0.208214 }, { "acc": 0.71824465, "epoch": 0.06659056316590563, "grad_norm": 2.640625, "learning_rate": 6.659056316590564e-06, "loss": 1.17503862, "memory(GiB)": 368.61, "step": 2625, "train_speed(iter/s)": 0.208275 }, { "acc": 0.71848874, "epoch": 0.06671740233384069, "grad_norm": 2.71875, "learning_rate": 6.67174023338407e-06, "loss": 1.19625893, "memory(GiB)": 368.61, "step": 2630, "train_speed(iter/s)": 0.208346 }, { "acc": 0.7186779, "epoch": 0.06684424150177574, "grad_norm": 2.890625, "learning_rate": 6.684424150177575e-06, "loss": 1.17397633, "memory(GiB)": 368.61, "step": 2635, "train_speed(iter/s)": 0.208382 }, { "acc": 0.71862993, "epoch": 0.0669710806697108, "grad_norm": 2.578125, "learning_rate": 6.697108066971081e-06, "loss": 1.21394825, "memory(GiB)": 368.61, "step": 2640, "train_speed(iter/s)": 0.208433 }, { "acc": 0.72498665, "epoch": 0.06709791983764586, "grad_norm": 2.328125, "learning_rate": 6.709791983764588e-06, "loss": 1.1284709, "memory(GiB)": 368.61, "step": 2645, "train_speed(iter/s)": 0.20848 }, { "acc": 0.71790142, "epoch": 0.06722475900558092, "grad_norm": 2.359375, "learning_rate": 6.722475900558093e-06, "loss": 1.16624546, "memory(GiB)": 368.61, "step": 2650, "train_speed(iter/s)": 0.208533 }, { "acc": 0.73839092, "epoch": 0.06735159817351598, "grad_norm": 2.65625, "learning_rate": 6.735159817351599e-06, "loss": 1.13955889, "memory(GiB)": 368.61, "step": 2655, "train_speed(iter/s)": 0.208556 }, { "acc": 0.72703962, "epoch": 0.06747843734145104, "grad_norm": 2.703125, "learning_rate": 6.747843734145104e-06, "loss": 1.12405281, "memory(GiB)": 368.61, "step": 2660, "train_speed(iter/s)": 0.208604 }, { "acc": 0.71873884, "epoch": 0.0676052765093861, "grad_norm": 2.015625, "learning_rate": 6.760527650938611e-06, "loss": 1.09866543, "memory(GiB)": 368.61, "step": 2665, "train_speed(iter/s)": 0.208588 }, { "acc": 0.71990647, "epoch": 0.06773211567732115, "grad_norm": 2.046875, "learning_rate": 6.773211567732117e-06, "loss": 1.15371265, "memory(GiB)": 368.61, "step": 2670, "train_speed(iter/s)": 0.208655 }, { "acc": 0.72037668, "epoch": 0.06785895484525621, "grad_norm": 2.0625, "learning_rate": 6.785895484525622e-06, "loss": 1.21964455, "memory(GiB)": 368.61, "step": 2675, "train_speed(iter/s)": 0.208707 }, { "acc": 0.72741766, "epoch": 0.06798579401319127, "grad_norm": 2.28125, "learning_rate": 6.798579401319128e-06, "loss": 1.12304125, "memory(GiB)": 368.61, "step": 2680, "train_speed(iter/s)": 0.208763 }, { "acc": 0.72086811, "epoch": 0.06811263318112633, "grad_norm": 2.109375, "learning_rate": 6.811263318112634e-06, "loss": 1.19156189, "memory(GiB)": 368.61, "step": 2685, "train_speed(iter/s)": 0.208825 }, { "acc": 0.70383511, "epoch": 0.06823947234906139, "grad_norm": 2.25, "learning_rate": 6.82394723490614e-06, "loss": 1.16693726, "memory(GiB)": 368.61, "step": 2690, "train_speed(iter/s)": 0.208908 }, { "acc": 0.72662239, "epoch": 0.06836631151699644, "grad_norm": 2.1875, "learning_rate": 6.836631151699645e-06, "loss": 1.18048153, "memory(GiB)": 368.61, "step": 2695, "train_speed(iter/s)": 0.20893 }, { "acc": 0.70188026, "epoch": 0.0684931506849315, "grad_norm": 2.203125, "learning_rate": 6.849315068493151e-06, "loss": 1.22683516, "memory(GiB)": 368.61, "step": 2700, "train_speed(iter/s)": 0.208962 }, { "acc": 0.7142633, "epoch": 0.06861998985286656, "grad_norm": 2.28125, "learning_rate": 6.861998985286658e-06, "loss": 1.199646, "memory(GiB)": 368.61, "step": 2705, "train_speed(iter/s)": 0.209017 }, { "acc": 0.72845631, "epoch": 0.06874682902080162, "grad_norm": 2.609375, "learning_rate": 6.874682902080163e-06, "loss": 1.26053743, "memory(GiB)": 368.61, "step": 2710, "train_speed(iter/s)": 0.209083 }, { "acc": 0.72393169, "epoch": 0.06887366818873668, "grad_norm": 2.5625, "learning_rate": 6.887366818873669e-06, "loss": 1.12433195, "memory(GiB)": 368.61, "step": 2715, "train_speed(iter/s)": 0.209118 }, { "acc": 0.72216048, "epoch": 0.06900050735667174, "grad_norm": 2.296875, "learning_rate": 6.900050735667174e-06, "loss": 1.13927479, "memory(GiB)": 368.61, "step": 2720, "train_speed(iter/s)": 0.209189 }, { "acc": 0.72816529, "epoch": 0.0691273465246068, "grad_norm": 2.03125, "learning_rate": 6.912734652460681e-06, "loss": 1.14771023, "memory(GiB)": 368.61, "step": 2725, "train_speed(iter/s)": 0.209255 }, { "acc": 0.72273922, "epoch": 0.06925418569254185, "grad_norm": 2.828125, "learning_rate": 6.925418569254187e-06, "loss": 1.09932556, "memory(GiB)": 368.61, "step": 2730, "train_speed(iter/s)": 0.209322 }, { "acc": 0.72468796, "epoch": 0.06938102486047691, "grad_norm": 2.109375, "learning_rate": 6.938102486047692e-06, "loss": 1.10793037, "memory(GiB)": 368.61, "step": 2735, "train_speed(iter/s)": 0.209342 }, { "acc": 0.72352772, "epoch": 0.06950786402841197, "grad_norm": 2.078125, "learning_rate": 6.950786402841198e-06, "loss": 1.12655945, "memory(GiB)": 368.61, "step": 2740, "train_speed(iter/s)": 0.209391 }, { "acc": 0.73475609, "epoch": 0.06963470319634703, "grad_norm": 2.4375, "learning_rate": 6.9634703196347046e-06, "loss": 1.10450487, "memory(GiB)": 368.61, "step": 2745, "train_speed(iter/s)": 0.209451 }, { "acc": 0.72485456, "epoch": 0.06976154236428209, "grad_norm": 2.234375, "learning_rate": 6.97615423642821e-06, "loss": 1.18209877, "memory(GiB)": 368.61, "step": 2750, "train_speed(iter/s)": 0.209523 }, { "acc": 0.71865606, "epoch": 0.06988838153221714, "grad_norm": 2.34375, "learning_rate": 6.988838153221716e-06, "loss": 1.17235727, "memory(GiB)": 368.61, "step": 2755, "train_speed(iter/s)": 0.209559 }, { "acc": 0.71556826, "epoch": 0.0700152207001522, "grad_norm": 2.09375, "learning_rate": 7.001522070015221e-06, "loss": 1.16298742, "memory(GiB)": 368.61, "step": 2760, "train_speed(iter/s)": 0.20955 }, { "acc": 0.72116385, "epoch": 0.07014205986808726, "grad_norm": 2.25, "learning_rate": 7.014205986808728e-06, "loss": 1.21962528, "memory(GiB)": 368.61, "step": 2765, "train_speed(iter/s)": 0.209584 }, { "acc": 0.72256756, "epoch": 0.07026889903602232, "grad_norm": 2.09375, "learning_rate": 7.026889903602233e-06, "loss": 1.1417942, "memory(GiB)": 368.61, "step": 2770, "train_speed(iter/s)": 0.209646 }, { "acc": 0.72853389, "epoch": 0.07039573820395738, "grad_norm": 2.578125, "learning_rate": 7.039573820395739e-06, "loss": 1.19196749, "memory(GiB)": 368.61, "step": 2775, "train_speed(iter/s)": 0.209681 }, { "acc": 0.70771332, "epoch": 0.07052257737189244, "grad_norm": 2.28125, "learning_rate": 7.052257737189245e-06, "loss": 1.19528971, "memory(GiB)": 368.61, "step": 2780, "train_speed(iter/s)": 0.209745 }, { "acc": 0.71793146, "epoch": 0.0706494165398275, "grad_norm": 2.59375, "learning_rate": 7.064941653982751e-06, "loss": 1.18634329, "memory(GiB)": 368.61, "step": 2785, "train_speed(iter/s)": 0.209743 }, { "acc": 0.71725545, "epoch": 0.07077625570776255, "grad_norm": 2.203125, "learning_rate": 7.077625570776257e-06, "loss": 1.21552181, "memory(GiB)": 368.61, "step": 2790, "train_speed(iter/s)": 0.209773 }, { "acc": 0.72844801, "epoch": 0.07090309487569761, "grad_norm": 2.578125, "learning_rate": 7.090309487569762e-06, "loss": 1.13328667, "memory(GiB)": 368.61, "step": 2795, "train_speed(iter/s)": 0.209795 }, { "acc": 0.7493434, "epoch": 0.07102993404363267, "grad_norm": 2.421875, "learning_rate": 7.102993404363268e-06, "loss": 1.13221903, "memory(GiB)": 368.61, "step": 2800, "train_speed(iter/s)": 0.209823 }, { "acc": 0.73095722, "epoch": 0.07115677321156773, "grad_norm": 2.375, "learning_rate": 7.1156773211567745e-06, "loss": 1.12052822, "memory(GiB)": 368.61, "step": 2805, "train_speed(iter/s)": 0.209833 }, { "acc": 0.73033047, "epoch": 0.07128361237950279, "grad_norm": 2.390625, "learning_rate": 7.12836123795028e-06, "loss": 1.18001118, "memory(GiB)": 368.61, "step": 2810, "train_speed(iter/s)": 0.209869 }, { "acc": 0.70786481, "epoch": 0.07141045154743784, "grad_norm": 2.078125, "learning_rate": 7.1410451547437856e-06, "loss": 1.17374153, "memory(GiB)": 368.61, "step": 2815, "train_speed(iter/s)": 0.209908 }, { "acc": 0.72788315, "epoch": 0.0715372907153729, "grad_norm": 2.3125, "learning_rate": 7.153729071537291e-06, "loss": 1.09745865, "memory(GiB)": 368.61, "step": 2820, "train_speed(iter/s)": 0.209942 }, { "acc": 0.73677101, "epoch": 0.07166412988330796, "grad_norm": 2.015625, "learning_rate": 7.1664129883307975e-06, "loss": 1.10566645, "memory(GiB)": 368.61, "step": 2825, "train_speed(iter/s)": 0.209989 }, { "acc": 0.72880902, "epoch": 0.07179096905124302, "grad_norm": 2.265625, "learning_rate": 7.1790969051243035e-06, "loss": 1.11580276, "memory(GiB)": 368.61, "step": 2830, "train_speed(iter/s)": 0.210038 }, { "acc": 0.7084878, "epoch": 0.07191780821917808, "grad_norm": 2.84375, "learning_rate": 7.191780821917809e-06, "loss": 1.21638699, "memory(GiB)": 368.61, "step": 2835, "train_speed(iter/s)": 0.210088 }, { "acc": 0.716959, "epoch": 0.07204464738711314, "grad_norm": 2.375, "learning_rate": 7.2044647387113146e-06, "loss": 1.1755044, "memory(GiB)": 368.61, "step": 2840, "train_speed(iter/s)": 0.210137 }, { "acc": 0.7278872, "epoch": 0.0721714865550482, "grad_norm": 2.0625, "learning_rate": 7.2171486555048205e-06, "loss": 1.11242771, "memory(GiB)": 368.61, "step": 2845, "train_speed(iter/s)": 0.210156 }, { "acc": 0.72557354, "epoch": 0.07229832572298325, "grad_norm": 2.109375, "learning_rate": 7.2298325722983265e-06, "loss": 1.12404518, "memory(GiB)": 368.61, "step": 2850, "train_speed(iter/s)": 0.210207 }, { "acc": 0.73053212, "epoch": 0.07242516489091831, "grad_norm": 2.5, "learning_rate": 7.242516489091832e-06, "loss": 1.16707878, "memory(GiB)": 368.61, "step": 2855, "train_speed(iter/s)": 0.210266 }, { "acc": 0.72590437, "epoch": 0.07255200405885337, "grad_norm": 2.5, "learning_rate": 7.2552004058853376e-06, "loss": 1.12596197, "memory(GiB)": 368.61, "step": 2860, "train_speed(iter/s)": 0.210269 }, { "acc": 0.72456059, "epoch": 0.07267884322678843, "grad_norm": 2.265625, "learning_rate": 7.267884322678844e-06, "loss": 1.22058201, "memory(GiB)": 368.61, "step": 2865, "train_speed(iter/s)": 0.210324 }, { "acc": 0.72465506, "epoch": 0.07280568239472349, "grad_norm": 2.5, "learning_rate": 7.2805682394723495e-06, "loss": 1.18537884, "memory(GiB)": 368.61, "step": 2870, "train_speed(iter/s)": 0.210357 }, { "acc": 0.71512213, "epoch": 0.07293252156265854, "grad_norm": 2.203125, "learning_rate": 7.2932521562658555e-06, "loss": 1.15820007, "memory(GiB)": 368.61, "step": 2875, "train_speed(iter/s)": 0.210413 }, { "acc": 0.7225853, "epoch": 0.0730593607305936, "grad_norm": 1.65625, "learning_rate": 7.305936073059361e-06, "loss": 1.15053024, "memory(GiB)": 368.61, "step": 2880, "train_speed(iter/s)": 0.210448 }, { "acc": 0.73679719, "epoch": 0.07318619989852866, "grad_norm": 1.8671875, "learning_rate": 7.318619989852867e-06, "loss": 1.16524258, "memory(GiB)": 368.61, "step": 2885, "train_speed(iter/s)": 0.210479 }, { "acc": 0.71929359, "epoch": 0.07331303906646372, "grad_norm": 2.78125, "learning_rate": 7.331303906646373e-06, "loss": 1.15426807, "memory(GiB)": 368.61, "step": 2890, "train_speed(iter/s)": 0.210527 }, { "acc": 0.7303791, "epoch": 0.07343987823439878, "grad_norm": 2.203125, "learning_rate": 7.3439878234398785e-06, "loss": 1.12018719, "memory(GiB)": 368.61, "step": 2895, "train_speed(iter/s)": 0.210563 }, { "acc": 0.70798769, "epoch": 0.07356671740233384, "grad_norm": 2.625, "learning_rate": 7.3566717402333845e-06, "loss": 1.2441082, "memory(GiB)": 368.61, "step": 2900, "train_speed(iter/s)": 0.210607 }, { "acc": 0.72010593, "epoch": 0.0736935565702689, "grad_norm": 2.46875, "learning_rate": 7.369355657026891e-06, "loss": 1.16225452, "memory(GiB)": 368.61, "step": 2905, "train_speed(iter/s)": 0.210639 }, { "acc": 0.70528612, "epoch": 0.07382039573820395, "grad_norm": 1.9765625, "learning_rate": 7.382039573820396e-06, "loss": 1.23909445, "memory(GiB)": 368.61, "step": 2910, "train_speed(iter/s)": 0.210693 }, { "acc": 0.71976547, "epoch": 0.07394723490613901, "grad_norm": 2.703125, "learning_rate": 7.394723490613902e-06, "loss": 1.15227318, "memory(GiB)": 368.61, "step": 2915, "train_speed(iter/s)": 0.210723 }, { "acc": 0.71680737, "epoch": 0.07407407407407407, "grad_norm": 2.0625, "learning_rate": 7.4074074074074075e-06, "loss": 1.17376108, "memory(GiB)": 368.61, "step": 2920, "train_speed(iter/s)": 0.210747 }, { "acc": 0.70962915, "epoch": 0.07420091324200913, "grad_norm": 2.375, "learning_rate": 7.420091324200914e-06, "loss": 1.18564978, "memory(GiB)": 368.61, "step": 2925, "train_speed(iter/s)": 0.210798 }, { "acc": 0.71204619, "epoch": 0.07432775240994419, "grad_norm": 2.890625, "learning_rate": 7.432775240994419e-06, "loss": 1.2098465, "memory(GiB)": 368.61, "step": 2930, "train_speed(iter/s)": 0.210853 }, { "acc": 0.73058844, "epoch": 0.07445459157787924, "grad_norm": 2.390625, "learning_rate": 7.445459157787925e-06, "loss": 1.11308279, "memory(GiB)": 368.61, "step": 2935, "train_speed(iter/s)": 0.210888 }, { "acc": 0.72384887, "epoch": 0.0745814307458143, "grad_norm": 2.171875, "learning_rate": 7.458143074581431e-06, "loss": 1.12216377, "memory(GiB)": 368.61, "step": 2940, "train_speed(iter/s)": 0.210925 }, { "acc": 0.73835526, "epoch": 0.07470826991374936, "grad_norm": 2.40625, "learning_rate": 7.470826991374937e-06, "loss": 1.0761013, "memory(GiB)": 368.61, "step": 2945, "train_speed(iter/s)": 0.210931 }, { "acc": 0.70512199, "epoch": 0.07483510908168442, "grad_norm": 2.5625, "learning_rate": 7.483510908168443e-06, "loss": 1.22455502, "memory(GiB)": 368.61, "step": 2950, "train_speed(iter/s)": 0.210978 }, { "acc": 0.73466072, "epoch": 0.07496194824961948, "grad_norm": 1.96875, "learning_rate": 7.496194824961948e-06, "loss": 1.0705555, "memory(GiB)": 368.61, "step": 2955, "train_speed(iter/s)": 0.210956 }, { "acc": 0.72890759, "epoch": 0.07508878741755454, "grad_norm": 2.15625, "learning_rate": 7.508878741755454e-06, "loss": 1.1268712, "memory(GiB)": 368.61, "step": 2960, "train_speed(iter/s)": 0.210969 }, { "acc": 0.71797218, "epoch": 0.0752156265854896, "grad_norm": 2.484375, "learning_rate": 7.521562658548961e-06, "loss": 1.12504854, "memory(GiB)": 368.61, "step": 2965, "train_speed(iter/s)": 0.211007 }, { "acc": 0.73015108, "epoch": 0.07534246575342465, "grad_norm": 2.25, "learning_rate": 7.534246575342466e-06, "loss": 1.10806427, "memory(GiB)": 368.61, "step": 2970, "train_speed(iter/s)": 0.211051 }, { "acc": 0.72890215, "epoch": 0.07546930492135971, "grad_norm": 1.9921875, "learning_rate": 7.546930492135972e-06, "loss": 1.10325966, "memory(GiB)": 368.61, "step": 2975, "train_speed(iter/s)": 0.2111 }, { "acc": 0.73269224, "epoch": 0.07559614408929477, "grad_norm": 2.75, "learning_rate": 7.559614408929477e-06, "loss": 1.09259348, "memory(GiB)": 368.61, "step": 2980, "train_speed(iter/s)": 0.211135 }, { "acc": 0.72427187, "epoch": 0.07572298325722983, "grad_norm": 2.265625, "learning_rate": 7.572298325722984e-06, "loss": 1.14609089, "memory(GiB)": 368.61, "step": 2985, "train_speed(iter/s)": 0.211192 }, { "acc": 0.73482995, "epoch": 0.07584982242516489, "grad_norm": 2.21875, "learning_rate": 7.58498224251649e-06, "loss": 1.11421795, "memory(GiB)": 368.61, "step": 2990, "train_speed(iter/s)": 0.211222 }, { "acc": 0.71588039, "epoch": 0.07597666159309995, "grad_norm": 2.25, "learning_rate": 7.597666159309995e-06, "loss": 1.1905098, "memory(GiB)": 368.61, "step": 2995, "train_speed(iter/s)": 0.211266 }, { "acc": 0.72275629, "epoch": 0.076103500761035, "grad_norm": 2.359375, "learning_rate": 7.610350076103501e-06, "loss": 1.17481308, "memory(GiB)": 368.61, "step": 3000, "train_speed(iter/s)": 0.211291 }, { "epoch": 0.076103500761035, "eval_acc": 0.7117639514437412, "eval_loss": 1.1119107007980347, "eval_runtime": 384.327, "eval_samples_per_second": 16.574, "eval_steps_per_second": 8.287, "step": 3000 }, { "acc": 0.74370871, "epoch": 0.07623033992897006, "grad_norm": 1.7265625, "learning_rate": 7.623033992897007e-06, "loss": 1.06896801, "memory(GiB)": 368.61, "step": 3005, "train_speed(iter/s)": 0.201264 }, { "acc": 0.73060851, "epoch": 0.07635717909690512, "grad_norm": 2.328125, "learning_rate": 7.635717909690512e-06, "loss": 1.14235249, "memory(GiB)": 368.61, "step": 3010, "train_speed(iter/s)": 0.201319 }, { "acc": 0.72894039, "epoch": 0.07648401826484018, "grad_norm": 2.328125, "learning_rate": 7.648401826484018e-06, "loss": 1.17112541, "memory(GiB)": 368.61, "step": 3015, "train_speed(iter/s)": 0.201394 }, { "acc": 0.73756609, "epoch": 0.07661085743277524, "grad_norm": 2.328125, "learning_rate": 7.661085743277524e-06, "loss": 1.10149899, "memory(GiB)": 368.61, "step": 3020, "train_speed(iter/s)": 0.201441 }, { "acc": 0.72538385, "epoch": 0.0767376966007103, "grad_norm": 1.8984375, "learning_rate": 7.67376966007103e-06, "loss": 1.13378639, "memory(GiB)": 368.61, "step": 3025, "train_speed(iter/s)": 0.201511 }, { "acc": 0.73482432, "epoch": 0.07686453576864535, "grad_norm": 2.015625, "learning_rate": 7.686453576864536e-06, "loss": 1.06845207, "memory(GiB)": 368.61, "step": 3030, "train_speed(iter/s)": 0.201524 }, { "acc": 0.72148829, "epoch": 0.07699137493658041, "grad_norm": 2.40625, "learning_rate": 7.699137493658042e-06, "loss": 1.15066538, "memory(GiB)": 368.61, "step": 3035, "train_speed(iter/s)": 0.201568 }, { "acc": 0.72669778, "epoch": 0.07711821410451547, "grad_norm": 1.9765625, "learning_rate": 7.711821410451548e-06, "loss": 1.07626238, "memory(GiB)": 368.61, "step": 3040, "train_speed(iter/s)": 0.201606 }, { "acc": 0.72066212, "epoch": 0.07724505327245053, "grad_norm": 2.171875, "learning_rate": 7.724505327245054e-06, "loss": 1.12335644, "memory(GiB)": 368.61, "step": 3045, "train_speed(iter/s)": 0.20167 }, { "acc": 0.72587452, "epoch": 0.07737189244038559, "grad_norm": 2.640625, "learning_rate": 7.73718924403856e-06, "loss": 1.13457298, "memory(GiB)": 368.61, "step": 3050, "train_speed(iter/s)": 0.201721 }, { "acc": 0.72546544, "epoch": 0.07749873160832065, "grad_norm": 1.984375, "learning_rate": 7.749873160832066e-06, "loss": 1.15849991, "memory(GiB)": 368.61, "step": 3055, "train_speed(iter/s)": 0.201769 }, { "acc": 0.73082471, "epoch": 0.0776255707762557, "grad_norm": 1.90625, "learning_rate": 7.76255707762557e-06, "loss": 1.07406712, "memory(GiB)": 368.61, "step": 3060, "train_speed(iter/s)": 0.20181 }, { "acc": 0.73051395, "epoch": 0.07775240994419076, "grad_norm": 2.234375, "learning_rate": 7.775240994419078e-06, "loss": 1.10509605, "memory(GiB)": 368.61, "step": 3065, "train_speed(iter/s)": 0.201855 }, { "acc": 0.72271366, "epoch": 0.07787924911212582, "grad_norm": 2.265625, "learning_rate": 7.787924911212584e-06, "loss": 1.13872204, "memory(GiB)": 368.61, "step": 3070, "train_speed(iter/s)": 0.201886 }, { "acc": 0.7263763, "epoch": 0.07800608828006088, "grad_norm": 2.625, "learning_rate": 7.800608828006088e-06, "loss": 1.15668812, "memory(GiB)": 368.61, "step": 3075, "train_speed(iter/s)": 0.201962 }, { "acc": 0.71835842, "epoch": 0.07813292744799594, "grad_norm": 2.078125, "learning_rate": 7.813292744799594e-06, "loss": 1.14768162, "memory(GiB)": 368.61, "step": 3080, "train_speed(iter/s)": 0.202021 }, { "acc": 0.71796422, "epoch": 0.078259766615931, "grad_norm": 2.296875, "learning_rate": 7.8259766615931e-06, "loss": 1.11255999, "memory(GiB)": 368.61, "step": 3085, "train_speed(iter/s)": 0.202012 }, { "acc": 0.71738834, "epoch": 0.07838660578386605, "grad_norm": 2.421875, "learning_rate": 7.838660578386606e-06, "loss": 1.20496998, "memory(GiB)": 368.61, "step": 3090, "train_speed(iter/s)": 0.202074 }, { "acc": 0.7217926, "epoch": 0.07851344495180111, "grad_norm": 1.921875, "learning_rate": 7.851344495180112e-06, "loss": 1.15487118, "memory(GiB)": 368.61, "step": 3095, "train_speed(iter/s)": 0.202108 }, { "acc": 0.72168045, "epoch": 0.07864028411973617, "grad_norm": 1.984375, "learning_rate": 7.864028411973618e-06, "loss": 1.13894501, "memory(GiB)": 368.61, "step": 3100, "train_speed(iter/s)": 0.202133 }, { "acc": 0.71548233, "epoch": 0.07876712328767123, "grad_norm": 2.21875, "learning_rate": 7.876712328767124e-06, "loss": 1.16783943, "memory(GiB)": 368.61, "step": 3105, "train_speed(iter/s)": 0.202182 }, { "acc": 0.72016077, "epoch": 0.07889396245560629, "grad_norm": 2.265625, "learning_rate": 7.88939624556063e-06, "loss": 1.17770157, "memory(GiB)": 368.61, "step": 3110, "train_speed(iter/s)": 0.202222 }, { "acc": 0.73581414, "epoch": 0.07902080162354135, "grad_norm": 2.28125, "learning_rate": 7.902080162354136e-06, "loss": 1.07776299, "memory(GiB)": 368.61, "step": 3115, "train_speed(iter/s)": 0.202271 }, { "acc": 0.74546461, "epoch": 0.0791476407914764, "grad_norm": 2.359375, "learning_rate": 7.91476407914764e-06, "loss": 1.05505438, "memory(GiB)": 368.61, "step": 3120, "train_speed(iter/s)": 0.202312 }, { "acc": 0.73265333, "epoch": 0.07927447995941146, "grad_norm": 2.25, "learning_rate": 7.927447995941148e-06, "loss": 1.10431433, "memory(GiB)": 368.61, "step": 3125, "train_speed(iter/s)": 0.202289 }, { "acc": 0.71531534, "epoch": 0.07940131912734652, "grad_norm": 2.359375, "learning_rate": 7.940131912734654e-06, "loss": 1.15220165, "memory(GiB)": 368.61, "step": 3130, "train_speed(iter/s)": 0.20236 }, { "acc": 0.74289188, "epoch": 0.07952815829528158, "grad_norm": 2.15625, "learning_rate": 7.952815829528158e-06, "loss": 1.08279991, "memory(GiB)": 368.61, "step": 3135, "train_speed(iter/s)": 0.202404 }, { "acc": 0.72835779, "epoch": 0.07965499746321664, "grad_norm": 2.328125, "learning_rate": 7.965499746321664e-06, "loss": 1.15169945, "memory(GiB)": 368.61, "step": 3140, "train_speed(iter/s)": 0.202423 }, { "acc": 0.72171102, "epoch": 0.0797818366311517, "grad_norm": 2.125, "learning_rate": 7.978183663115172e-06, "loss": 1.22988682, "memory(GiB)": 368.61, "step": 3145, "train_speed(iter/s)": 0.202488 }, { "acc": 0.7214036, "epoch": 0.07990867579908675, "grad_norm": 1.7421875, "learning_rate": 7.990867579908676e-06, "loss": 1.16157627, "memory(GiB)": 368.61, "step": 3150, "train_speed(iter/s)": 0.202527 }, { "acc": 0.72012663, "epoch": 0.08003551496702181, "grad_norm": 1.8984375, "learning_rate": 8.003551496702182e-06, "loss": 1.17374592, "memory(GiB)": 368.61, "step": 3155, "train_speed(iter/s)": 0.202574 }, { "acc": 0.71749797, "epoch": 0.08016235413495687, "grad_norm": 2.109375, "learning_rate": 8.016235413495688e-06, "loss": 1.14840107, "memory(GiB)": 368.61, "step": 3160, "train_speed(iter/s)": 0.202628 }, { "acc": 0.72449684, "epoch": 0.08028919330289193, "grad_norm": 2.796875, "learning_rate": 8.028919330289194e-06, "loss": 1.17737074, "memory(GiB)": 368.61, "step": 3165, "train_speed(iter/s)": 0.20264 }, { "acc": 0.73651223, "epoch": 0.08041603247082699, "grad_norm": 1.9140625, "learning_rate": 8.0416032470827e-06, "loss": 1.10215511, "memory(GiB)": 368.61, "step": 3170, "train_speed(iter/s)": 0.202693 }, { "acc": 0.71754103, "epoch": 0.08054287163876205, "grad_norm": 2.21875, "learning_rate": 8.054287163876206e-06, "loss": 1.15752258, "memory(GiB)": 368.61, "step": 3175, "train_speed(iter/s)": 0.20271 }, { "acc": 0.7291925, "epoch": 0.0806697108066971, "grad_norm": 2.171875, "learning_rate": 8.066971080669712e-06, "loss": 1.09472065, "memory(GiB)": 368.61, "step": 3180, "train_speed(iter/s)": 0.202778 }, { "acc": 0.71313624, "epoch": 0.08079654997463216, "grad_norm": 2.875, "learning_rate": 8.079654997463218e-06, "loss": 1.21061573, "memory(GiB)": 368.61, "step": 3185, "train_speed(iter/s)": 0.202839 }, { "acc": 0.72776546, "epoch": 0.08092338914256722, "grad_norm": 2.375, "learning_rate": 8.092338914256724e-06, "loss": 1.0922184, "memory(GiB)": 368.61, "step": 3190, "train_speed(iter/s)": 0.202881 }, { "acc": 0.71120362, "epoch": 0.08105022831050228, "grad_norm": 2.78125, "learning_rate": 8.105022831050228e-06, "loss": 1.13267384, "memory(GiB)": 368.61, "step": 3195, "train_speed(iter/s)": 0.202922 }, { "acc": 0.72779083, "epoch": 0.08117706747843734, "grad_norm": 2.125, "learning_rate": 8.117706747843734e-06, "loss": 1.12375422, "memory(GiB)": 368.61, "step": 3200, "train_speed(iter/s)": 0.202962 }, { "acc": 0.72090397, "epoch": 0.0813039066463724, "grad_norm": 2.203125, "learning_rate": 8.130390664637242e-06, "loss": 1.13892117, "memory(GiB)": 368.61, "step": 3205, "train_speed(iter/s)": 0.202995 }, { "acc": 0.73687749, "epoch": 0.08143074581430745, "grad_norm": 2.203125, "learning_rate": 8.143074581430746e-06, "loss": 1.07082462, "memory(GiB)": 368.61, "step": 3210, "train_speed(iter/s)": 0.203022 }, { "acc": 0.7337234, "epoch": 0.08155758498224251, "grad_norm": 2.140625, "learning_rate": 8.155758498224252e-06, "loss": 1.14193201, "memory(GiB)": 368.61, "step": 3215, "train_speed(iter/s)": 0.203069 }, { "acc": 0.73363571, "epoch": 0.08168442415017757, "grad_norm": 2.515625, "learning_rate": 8.168442415017758e-06, "loss": 1.13227701, "memory(GiB)": 368.61, "step": 3220, "train_speed(iter/s)": 0.203116 }, { "acc": 0.7130064, "epoch": 0.08181126331811263, "grad_norm": 2.484375, "learning_rate": 8.181126331811264e-06, "loss": 1.1680521, "memory(GiB)": 368.61, "step": 3225, "train_speed(iter/s)": 0.203168 }, { "acc": 0.72522068, "epoch": 0.08193810248604769, "grad_norm": 1.9609375, "learning_rate": 8.19381024860477e-06, "loss": 1.09955149, "memory(GiB)": 368.61, "step": 3230, "train_speed(iter/s)": 0.203233 }, { "acc": 0.72503119, "epoch": 0.08206494165398275, "grad_norm": 2.328125, "learning_rate": 8.206494165398276e-06, "loss": 1.0792511, "memory(GiB)": 368.61, "step": 3235, "train_speed(iter/s)": 0.203294 }, { "acc": 0.72304277, "epoch": 0.0821917808219178, "grad_norm": 2.4375, "learning_rate": 8.219178082191782e-06, "loss": 1.12151031, "memory(GiB)": 368.61, "step": 3240, "train_speed(iter/s)": 0.203335 }, { "acc": 0.72817693, "epoch": 0.08231861998985286, "grad_norm": 2.21875, "learning_rate": 8.231861998985288e-06, "loss": 1.09771929, "memory(GiB)": 368.61, "step": 3245, "train_speed(iter/s)": 0.203386 }, { "acc": 0.74339304, "epoch": 0.08244545915778792, "grad_norm": 2.15625, "learning_rate": 8.244545915778794e-06, "loss": 1.05150642, "memory(GiB)": 368.61, "step": 3250, "train_speed(iter/s)": 0.203432 }, { "acc": 0.72137213, "epoch": 0.08257229832572298, "grad_norm": 1.9140625, "learning_rate": 8.2572298325723e-06, "loss": 1.12713375, "memory(GiB)": 368.61, "step": 3255, "train_speed(iter/s)": 0.203458 }, { "acc": 0.72368641, "epoch": 0.08269913749365804, "grad_norm": 2.140625, "learning_rate": 8.269913749365804e-06, "loss": 1.07657261, "memory(GiB)": 368.61, "step": 3260, "train_speed(iter/s)": 0.203502 }, { "acc": 0.72049179, "epoch": 0.0828259766615931, "grad_norm": 2.5625, "learning_rate": 8.282597666159312e-06, "loss": 1.11671286, "memory(GiB)": 368.61, "step": 3265, "train_speed(iter/s)": 0.203547 }, { "acc": 0.71845226, "epoch": 0.08295281582952815, "grad_norm": 2.296875, "learning_rate": 8.295281582952816e-06, "loss": 1.16160469, "memory(GiB)": 368.61, "step": 3270, "train_speed(iter/s)": 0.203599 }, { "acc": 0.72607188, "epoch": 0.08307965499746321, "grad_norm": 2.34375, "learning_rate": 8.307965499746322e-06, "loss": 1.11756411, "memory(GiB)": 368.61, "step": 3275, "train_speed(iter/s)": 0.203586 }, { "acc": 0.72084446, "epoch": 0.08320649416539827, "grad_norm": 2.015625, "learning_rate": 8.320649416539828e-06, "loss": 1.183533, "memory(GiB)": 368.61, "step": 3280, "train_speed(iter/s)": 0.203653 }, { "acc": 0.73857923, "epoch": 0.08333333333333333, "grad_norm": 1.96875, "learning_rate": 8.333333333333334e-06, "loss": 1.05644875, "memory(GiB)": 368.61, "step": 3285, "train_speed(iter/s)": 0.2037 }, { "acc": 0.73066692, "epoch": 0.08346017250126839, "grad_norm": 2.5, "learning_rate": 8.34601725012684e-06, "loss": 1.06456184, "memory(GiB)": 368.61, "step": 3290, "train_speed(iter/s)": 0.203754 }, { "acc": 0.7303184, "epoch": 0.08358701166920345, "grad_norm": 3.28125, "learning_rate": 8.358701166920346e-06, "loss": 1.15032339, "memory(GiB)": 368.61, "step": 3295, "train_speed(iter/s)": 0.203789 }, { "acc": 0.70762253, "epoch": 0.0837138508371385, "grad_norm": 2.109375, "learning_rate": 8.371385083713852e-06, "loss": 1.2443634, "memory(GiB)": 368.61, "step": 3300, "train_speed(iter/s)": 0.203835 }, { "acc": 0.72022672, "epoch": 0.08384069000507356, "grad_norm": 2.609375, "learning_rate": 8.384069000507358e-06, "loss": 1.16829891, "memory(GiB)": 368.61, "step": 3305, "train_speed(iter/s)": 0.203887 }, { "acc": 0.71406817, "epoch": 0.08396752917300862, "grad_norm": 2.015625, "learning_rate": 8.396752917300864e-06, "loss": 1.14861641, "memory(GiB)": 368.61, "step": 3310, "train_speed(iter/s)": 0.203941 }, { "acc": 0.72284374, "epoch": 0.08409436834094368, "grad_norm": 2.09375, "learning_rate": 8.40943683409437e-06, "loss": 1.13667336, "memory(GiB)": 368.61, "step": 3315, "train_speed(iter/s)": 0.203916 }, { "acc": 0.72349548, "epoch": 0.08422120750887874, "grad_norm": 2.1875, "learning_rate": 8.422120750887874e-06, "loss": 1.15704155, "memory(GiB)": 368.61, "step": 3320, "train_speed(iter/s)": 0.203913 }, { "acc": 0.72273464, "epoch": 0.0843480466768138, "grad_norm": 1.9765625, "learning_rate": 8.434804667681381e-06, "loss": 1.11980171, "memory(GiB)": 368.61, "step": 3325, "train_speed(iter/s)": 0.20396 }, { "acc": 0.72831221, "epoch": 0.08447488584474885, "grad_norm": 2.125, "learning_rate": 8.447488584474887e-06, "loss": 1.13206825, "memory(GiB)": 368.61, "step": 3330, "train_speed(iter/s)": 0.203979 }, { "acc": 0.73017421, "epoch": 0.08460172501268391, "grad_norm": 2.484375, "learning_rate": 8.460172501268392e-06, "loss": 1.14203768, "memory(GiB)": 368.61, "step": 3335, "train_speed(iter/s)": 0.204013 }, { "acc": 0.72546525, "epoch": 0.08472856418061897, "grad_norm": 2.046875, "learning_rate": 8.472856418061898e-06, "loss": 1.11604881, "memory(GiB)": 368.61, "step": 3340, "train_speed(iter/s)": 0.204053 }, { "acc": 0.73602409, "epoch": 0.08485540334855403, "grad_norm": 2.328125, "learning_rate": 8.485540334855404e-06, "loss": 1.12112179, "memory(GiB)": 368.61, "step": 3345, "train_speed(iter/s)": 0.204062 }, { "acc": 0.73787813, "epoch": 0.08498224251648909, "grad_norm": 1.796875, "learning_rate": 8.49822425164891e-06, "loss": 1.0986558, "memory(GiB)": 368.61, "step": 3350, "train_speed(iter/s)": 0.204094 }, { "acc": 0.72018185, "epoch": 0.08510908168442415, "grad_norm": 2.015625, "learning_rate": 8.510908168442416e-06, "loss": 1.12560711, "memory(GiB)": 368.61, "step": 3355, "train_speed(iter/s)": 0.204124 }, { "acc": 0.71957188, "epoch": 0.0852359208523592, "grad_norm": 2.34375, "learning_rate": 8.523592085235922e-06, "loss": 1.13627014, "memory(GiB)": 368.61, "step": 3360, "train_speed(iter/s)": 0.204192 }, { "acc": 0.71451883, "epoch": 0.08536276002029426, "grad_norm": 2.28125, "learning_rate": 8.536276002029428e-06, "loss": 1.12709942, "memory(GiB)": 368.61, "step": 3365, "train_speed(iter/s)": 0.204235 }, { "acc": 0.72148218, "epoch": 0.08548959918822932, "grad_norm": 2.640625, "learning_rate": 8.548959918822933e-06, "loss": 1.16043625, "memory(GiB)": 368.61, "step": 3370, "train_speed(iter/s)": 0.204285 }, { "acc": 0.71781511, "epoch": 0.08561643835616438, "grad_norm": 2.140625, "learning_rate": 8.56164383561644e-06, "loss": 1.15676498, "memory(GiB)": 368.61, "step": 3375, "train_speed(iter/s)": 0.204317 }, { "acc": 0.73780336, "epoch": 0.08574327752409944, "grad_norm": 2.765625, "learning_rate": 8.574327752409944e-06, "loss": 1.09157591, "memory(GiB)": 368.61, "step": 3380, "train_speed(iter/s)": 0.204336 }, { "acc": 0.72095184, "epoch": 0.0858701166920345, "grad_norm": 2.3125, "learning_rate": 8.587011669203451e-06, "loss": 1.12162561, "memory(GiB)": 368.61, "step": 3385, "train_speed(iter/s)": 0.204373 }, { "acc": 0.74219832, "epoch": 0.08599695585996955, "grad_norm": 2.25, "learning_rate": 8.599695585996957e-06, "loss": 1.08954496, "memory(GiB)": 368.61, "step": 3390, "train_speed(iter/s)": 0.204424 }, { "acc": 0.7178834, "epoch": 0.08612379502790461, "grad_norm": 2.8125, "learning_rate": 8.612379502790462e-06, "loss": 1.20100193, "memory(GiB)": 368.61, "step": 3395, "train_speed(iter/s)": 0.204454 }, { "acc": 0.73277626, "epoch": 0.08625063419583967, "grad_norm": 2.3125, "learning_rate": 8.625063419583968e-06, "loss": 1.11559029, "memory(GiB)": 368.61, "step": 3400, "train_speed(iter/s)": 0.204496 }, { "acc": 0.72498059, "epoch": 0.08637747336377473, "grad_norm": 2.171875, "learning_rate": 8.637747336377475e-06, "loss": 1.14360704, "memory(GiB)": 368.61, "step": 3405, "train_speed(iter/s)": 0.204545 }, { "acc": 0.73290224, "epoch": 0.08650431253170979, "grad_norm": 2.578125, "learning_rate": 8.65043125317098e-06, "loss": 1.18445969, "memory(GiB)": 368.61, "step": 3410, "train_speed(iter/s)": 0.204615 }, { "acc": 0.71608944, "epoch": 0.08663115169964485, "grad_norm": 2.390625, "learning_rate": 8.663115169964485e-06, "loss": 1.09736071, "memory(GiB)": 368.61, "step": 3415, "train_speed(iter/s)": 0.204668 }, { "acc": 0.70777283, "epoch": 0.0867579908675799, "grad_norm": 2.65625, "learning_rate": 8.675799086757991e-06, "loss": 1.1968586, "memory(GiB)": 368.61, "step": 3420, "train_speed(iter/s)": 0.204703 }, { "acc": 0.72528772, "epoch": 0.08688483003551496, "grad_norm": 2.34375, "learning_rate": 8.688483003551497e-06, "loss": 1.17035484, "memory(GiB)": 368.61, "step": 3425, "train_speed(iter/s)": 0.204729 }, { "acc": 0.72935967, "epoch": 0.08701166920345002, "grad_norm": 2.125, "learning_rate": 8.701166920345003e-06, "loss": 1.13192196, "memory(GiB)": 368.61, "step": 3430, "train_speed(iter/s)": 0.204749 }, { "acc": 0.73097372, "epoch": 0.08713850837138508, "grad_norm": 2.5, "learning_rate": 8.71385083713851e-06, "loss": 1.15616016, "memory(GiB)": 368.61, "step": 3435, "train_speed(iter/s)": 0.204773 }, { "acc": 0.70717053, "epoch": 0.08726534753932014, "grad_norm": 2.484375, "learning_rate": 8.726534753932014e-06, "loss": 1.25362186, "memory(GiB)": 368.61, "step": 3440, "train_speed(iter/s)": 0.204825 }, { "acc": 0.72490549, "epoch": 0.0873921867072552, "grad_norm": 2.515625, "learning_rate": 8.739218670725521e-06, "loss": 1.11535807, "memory(GiB)": 368.61, "step": 3445, "train_speed(iter/s)": 0.204861 }, { "acc": 0.72495422, "epoch": 0.08751902587519025, "grad_norm": 2.15625, "learning_rate": 8.751902587519027e-06, "loss": 1.09333801, "memory(GiB)": 368.61, "step": 3450, "train_speed(iter/s)": 0.204907 }, { "acc": 0.71785173, "epoch": 0.08764586504312531, "grad_norm": 2.578125, "learning_rate": 8.764586504312532e-06, "loss": 1.19687443, "memory(GiB)": 368.61, "step": 3455, "train_speed(iter/s)": 0.204947 }, { "acc": 0.72998629, "epoch": 0.08777270421106037, "grad_norm": 2.0625, "learning_rate": 8.777270421106037e-06, "loss": 1.17056971, "memory(GiB)": 368.61, "step": 3460, "train_speed(iter/s)": 0.204954 }, { "acc": 0.73495731, "epoch": 0.08789954337899543, "grad_norm": 2.0625, "learning_rate": 8.789954337899545e-06, "loss": 1.10605125, "memory(GiB)": 368.61, "step": 3465, "train_speed(iter/s)": 0.204995 }, { "acc": 0.72184029, "epoch": 0.08802638254693049, "grad_norm": 2.4375, "learning_rate": 8.80263825469305e-06, "loss": 1.14728889, "memory(GiB)": 368.61, "step": 3470, "train_speed(iter/s)": 0.205042 }, { "acc": 0.73719268, "epoch": 0.08815322171486555, "grad_norm": 2.09375, "learning_rate": 8.815322171486555e-06, "loss": 1.06549177, "memory(GiB)": 368.61, "step": 3475, "train_speed(iter/s)": 0.205082 }, { "acc": 0.72771664, "epoch": 0.0882800608828006, "grad_norm": 2.265625, "learning_rate": 8.828006088280061e-06, "loss": 1.13415394, "memory(GiB)": 368.61, "step": 3480, "train_speed(iter/s)": 0.20514 }, { "acc": 0.72635336, "epoch": 0.08840690005073566, "grad_norm": 2.171875, "learning_rate": 8.840690005073567e-06, "loss": 1.09682999, "memory(GiB)": 368.61, "step": 3485, "train_speed(iter/s)": 0.205167 }, { "acc": 0.71841073, "epoch": 0.08853373921867072, "grad_norm": 2.296875, "learning_rate": 8.853373921867073e-06, "loss": 1.12403603, "memory(GiB)": 368.61, "step": 3490, "train_speed(iter/s)": 0.205206 }, { "acc": 0.72518139, "epoch": 0.08866057838660578, "grad_norm": 2.078125, "learning_rate": 8.86605783866058e-06, "loss": 1.14766264, "memory(GiB)": 368.61, "step": 3495, "train_speed(iter/s)": 0.205239 }, { "acc": 0.73673654, "epoch": 0.08878741755454084, "grad_norm": 1.8828125, "learning_rate": 8.878741755454085e-06, "loss": 1.05873604, "memory(GiB)": 368.61, "step": 3500, "train_speed(iter/s)": 0.205275 }, { "acc": 0.73256893, "epoch": 0.0889142567224759, "grad_norm": 1.953125, "learning_rate": 8.891425672247591e-06, "loss": 1.07816353, "memory(GiB)": 368.61, "step": 3505, "train_speed(iter/s)": 0.205316 }, { "acc": 0.72354908, "epoch": 0.08904109589041095, "grad_norm": 1.9765625, "learning_rate": 8.904109589041097e-06, "loss": 1.11936798, "memory(GiB)": 368.61, "step": 3510, "train_speed(iter/s)": 0.205349 }, { "acc": 0.71392279, "epoch": 0.08916793505834601, "grad_norm": 3.0, "learning_rate": 8.916793505834601e-06, "loss": 1.17762947, "memory(GiB)": 368.61, "step": 3515, "train_speed(iter/s)": 0.205402 }, { "acc": 0.73163834, "epoch": 0.08929477422628107, "grad_norm": 2.015625, "learning_rate": 8.929477422628107e-06, "loss": 1.11637249, "memory(GiB)": 368.61, "step": 3520, "train_speed(iter/s)": 0.205453 }, { "acc": 0.72946048, "epoch": 0.08942161339421613, "grad_norm": 2.609375, "learning_rate": 8.942161339421615e-06, "loss": 1.15212479, "memory(GiB)": 368.61, "step": 3525, "train_speed(iter/s)": 0.205484 }, { "acc": 0.73037691, "epoch": 0.08954845256215119, "grad_norm": 2.859375, "learning_rate": 8.95484525621512e-06, "loss": 1.13369331, "memory(GiB)": 368.61, "step": 3530, "train_speed(iter/s)": 0.205508 }, { "acc": 0.72091894, "epoch": 0.08967529173008625, "grad_norm": 2.234375, "learning_rate": 8.967529173008625e-06, "loss": 1.1251646, "memory(GiB)": 368.61, "step": 3535, "train_speed(iter/s)": 0.205533 }, { "acc": 0.73042512, "epoch": 0.0898021308980213, "grad_norm": 1.8515625, "learning_rate": 8.980213089802131e-06, "loss": 1.07540607, "memory(GiB)": 368.61, "step": 3540, "train_speed(iter/s)": 0.20557 }, { "acc": 0.73845892, "epoch": 0.08992897006595636, "grad_norm": 2.21875, "learning_rate": 8.992897006595637e-06, "loss": 1.08342505, "memory(GiB)": 368.61, "step": 3545, "train_speed(iter/s)": 0.205603 }, { "acc": 0.72430182, "epoch": 0.09005580923389142, "grad_norm": 2.09375, "learning_rate": 9.005580923389143e-06, "loss": 1.1130846, "memory(GiB)": 368.61, "step": 3550, "train_speed(iter/s)": 0.205627 }, { "acc": 0.7183393, "epoch": 0.09018264840182648, "grad_norm": 2.0, "learning_rate": 9.01826484018265e-06, "loss": 1.13767443, "memory(GiB)": 368.61, "step": 3555, "train_speed(iter/s)": 0.205677 }, { "acc": 0.71553316, "epoch": 0.09030948756976154, "grad_norm": 2.71875, "learning_rate": 9.030948756976155e-06, "loss": 1.1932703, "memory(GiB)": 368.61, "step": 3560, "train_speed(iter/s)": 0.205732 }, { "acc": 0.72598391, "epoch": 0.0904363267376966, "grad_norm": 2.09375, "learning_rate": 9.043632673769661e-06, "loss": 1.15891075, "memory(GiB)": 368.61, "step": 3565, "train_speed(iter/s)": 0.205774 }, { "acc": 0.72454023, "epoch": 0.09056316590563165, "grad_norm": 2.359375, "learning_rate": 9.056316590563167e-06, "loss": 1.11181355, "memory(GiB)": 368.61, "step": 3570, "train_speed(iter/s)": 0.205829 }, { "acc": 0.72770066, "epoch": 0.09069000507356671, "grad_norm": 1.9140625, "learning_rate": 9.069000507356673e-06, "loss": 1.0908947, "memory(GiB)": 368.61, "step": 3575, "train_speed(iter/s)": 0.205879 }, { "acc": 0.72762852, "epoch": 0.09081684424150177, "grad_norm": 1.9765625, "learning_rate": 9.081684424150177e-06, "loss": 1.13032551, "memory(GiB)": 368.61, "step": 3580, "train_speed(iter/s)": 0.205931 }, { "acc": 0.72599134, "epoch": 0.09094368340943683, "grad_norm": 2.140625, "learning_rate": 9.094368340943685e-06, "loss": 1.09127579, "memory(GiB)": 368.61, "step": 3585, "train_speed(iter/s)": 0.205978 }, { "acc": 0.72994776, "epoch": 0.09107052257737189, "grad_norm": 2.609375, "learning_rate": 9.10705225773719e-06, "loss": 1.03895931, "memory(GiB)": 368.61, "step": 3590, "train_speed(iter/s)": 0.206042 }, { "acc": 0.72962885, "epoch": 0.09119736174530695, "grad_norm": 2.21875, "learning_rate": 9.119736174530695e-06, "loss": 1.17838297, "memory(GiB)": 368.61, "step": 3595, "train_speed(iter/s)": 0.206103 }, { "acc": 0.72467833, "epoch": 0.091324200913242, "grad_norm": 2.09375, "learning_rate": 9.132420091324201e-06, "loss": 1.11369667, "memory(GiB)": 368.61, "step": 3600, "train_speed(iter/s)": 0.2061 }, { "acc": 0.74249291, "epoch": 0.09145104008117706, "grad_norm": 2.1875, "learning_rate": 9.145104008117707e-06, "loss": 1.07630043, "memory(GiB)": 368.61, "step": 3605, "train_speed(iter/s)": 0.206113 }, { "acc": 0.71808777, "epoch": 0.09157787924911212, "grad_norm": 1.859375, "learning_rate": 9.157787924911213e-06, "loss": 1.166119, "memory(GiB)": 368.61, "step": 3610, "train_speed(iter/s)": 0.206166 }, { "acc": 0.72407665, "epoch": 0.09170471841704718, "grad_norm": 2.015625, "learning_rate": 9.170471841704719e-06, "loss": 1.14694386, "memory(GiB)": 368.61, "step": 3615, "train_speed(iter/s)": 0.206198 }, { "acc": 0.72264705, "epoch": 0.09183155758498224, "grad_norm": 2.59375, "learning_rate": 9.183155758498225e-06, "loss": 1.13055668, "memory(GiB)": 368.61, "step": 3620, "train_speed(iter/s)": 0.206223 }, { "acc": 0.71632953, "epoch": 0.0919583967529173, "grad_norm": 2.015625, "learning_rate": 9.195839675291731e-06, "loss": 1.16368055, "memory(GiB)": 368.61, "step": 3625, "train_speed(iter/s)": 0.206269 }, { "acc": 0.72592096, "epoch": 0.09208523592085235, "grad_norm": 2.3125, "learning_rate": 9.208523592085237e-06, "loss": 1.09890575, "memory(GiB)": 368.61, "step": 3630, "train_speed(iter/s)": 0.206301 }, { "acc": 0.73946042, "epoch": 0.09221207508878741, "grad_norm": 2.40625, "learning_rate": 9.221207508878743e-06, "loss": 1.05304136, "memory(GiB)": 368.61, "step": 3635, "train_speed(iter/s)": 0.206341 }, { "acc": 0.72053304, "epoch": 0.09233891425672247, "grad_norm": 2.421875, "learning_rate": 9.233891425672247e-06, "loss": 1.15729351, "memory(GiB)": 368.61, "step": 3640, "train_speed(iter/s)": 0.206376 }, { "acc": 0.72303247, "epoch": 0.09246575342465753, "grad_norm": 2.484375, "learning_rate": 9.246575342465755e-06, "loss": 1.16149979, "memory(GiB)": 368.61, "step": 3645, "train_speed(iter/s)": 0.206407 }, { "acc": 0.72564631, "epoch": 0.09259259259259259, "grad_norm": 2.265625, "learning_rate": 9.25925925925926e-06, "loss": 1.13661861, "memory(GiB)": 368.61, "step": 3650, "train_speed(iter/s)": 0.206417 }, { "acc": 0.7354847, "epoch": 0.09271943176052765, "grad_norm": 2.265625, "learning_rate": 9.271943176052765e-06, "loss": 1.07002287, "memory(GiB)": 368.61, "step": 3655, "train_speed(iter/s)": 0.20645 }, { "acc": 0.74948673, "epoch": 0.0928462709284627, "grad_norm": 2.46875, "learning_rate": 9.284627092846271e-06, "loss": 1.05266171, "memory(GiB)": 368.61, "step": 3660, "train_speed(iter/s)": 0.206504 }, { "acc": 0.74574823, "epoch": 0.09297311009639776, "grad_norm": 2.328125, "learning_rate": 9.297311009639777e-06, "loss": 1.15428219, "memory(GiB)": 368.61, "step": 3665, "train_speed(iter/s)": 0.206549 }, { "acc": 0.72579975, "epoch": 0.09309994926433282, "grad_norm": 2.3125, "learning_rate": 9.309994926433283e-06, "loss": 1.18047762, "memory(GiB)": 368.61, "step": 3670, "train_speed(iter/s)": 0.206584 }, { "acc": 0.71104226, "epoch": 0.09322678843226788, "grad_norm": 2.53125, "learning_rate": 9.322678843226789e-06, "loss": 1.20039768, "memory(GiB)": 368.61, "step": 3675, "train_speed(iter/s)": 0.20664 }, { "acc": 0.72869053, "epoch": 0.09335362760020294, "grad_norm": 3.21875, "learning_rate": 9.335362760020295e-06, "loss": 1.15582047, "memory(GiB)": 368.61, "step": 3680, "train_speed(iter/s)": 0.206677 }, { "acc": 0.7258522, "epoch": 0.093480466768138, "grad_norm": 2.359375, "learning_rate": 9.348046676813801e-06, "loss": 1.11243734, "memory(GiB)": 368.61, "step": 3685, "train_speed(iter/s)": 0.206711 }, { "acc": 0.72078819, "epoch": 0.09360730593607305, "grad_norm": 1.8515625, "learning_rate": 9.360730593607307e-06, "loss": 1.14300003, "memory(GiB)": 368.61, "step": 3690, "train_speed(iter/s)": 0.206753 }, { "acc": 0.74343405, "epoch": 0.09373414510400811, "grad_norm": 2.234375, "learning_rate": 9.373414510400813e-06, "loss": 1.04543581, "memory(GiB)": 368.61, "step": 3695, "train_speed(iter/s)": 0.206779 }, { "acc": 0.72064962, "epoch": 0.09386098427194317, "grad_norm": 2.890625, "learning_rate": 9.386098427194317e-06, "loss": 1.20689917, "memory(GiB)": 368.61, "step": 3700, "train_speed(iter/s)": 0.206825 }, { "acc": 0.71619296, "epoch": 0.09398782343987823, "grad_norm": 2.171875, "learning_rate": 9.398782343987825e-06, "loss": 1.20538292, "memory(GiB)": 368.61, "step": 3705, "train_speed(iter/s)": 0.206857 }, { "acc": 0.73265162, "epoch": 0.09411466260781329, "grad_norm": 2.296875, "learning_rate": 9.41146626078133e-06, "loss": 1.0555419, "memory(GiB)": 368.61, "step": 3710, "train_speed(iter/s)": 0.206887 }, { "acc": 0.73894243, "epoch": 0.09424150177574835, "grad_norm": 2.765625, "learning_rate": 9.424150177574835e-06, "loss": 1.12749157, "memory(GiB)": 368.61, "step": 3715, "train_speed(iter/s)": 0.206921 }, { "acc": 0.72554951, "epoch": 0.0943683409436834, "grad_norm": 2.6875, "learning_rate": 9.436834094368341e-06, "loss": 1.11247559, "memory(GiB)": 368.61, "step": 3720, "train_speed(iter/s)": 0.206957 }, { "acc": 0.72963152, "epoch": 0.09449518011161846, "grad_norm": 2.828125, "learning_rate": 9.449518011161849e-06, "loss": 1.104006, "memory(GiB)": 368.61, "step": 3725, "train_speed(iter/s)": 0.206984 }, { "acc": 0.74242835, "epoch": 0.09462201927955352, "grad_norm": 2.234375, "learning_rate": 9.462201927955353e-06, "loss": 1.06233931, "memory(GiB)": 368.61, "step": 3730, "train_speed(iter/s)": 0.207031 }, { "acc": 0.72046456, "epoch": 0.09474885844748858, "grad_norm": 2.0625, "learning_rate": 9.474885844748859e-06, "loss": 1.13523922, "memory(GiB)": 368.61, "step": 3735, "train_speed(iter/s)": 0.207075 }, { "acc": 0.72745371, "epoch": 0.09487569761542364, "grad_norm": 2.046875, "learning_rate": 9.487569761542365e-06, "loss": 1.09650583, "memory(GiB)": 368.61, "step": 3740, "train_speed(iter/s)": 0.207086 }, { "acc": 0.73304415, "epoch": 0.0950025367833587, "grad_norm": 2.015625, "learning_rate": 9.50025367833587e-06, "loss": 1.10785027, "memory(GiB)": 368.61, "step": 3745, "train_speed(iter/s)": 0.207107 }, { "acc": 0.71610904, "epoch": 0.09512937595129375, "grad_norm": 2.109375, "learning_rate": 9.512937595129377e-06, "loss": 1.15224953, "memory(GiB)": 368.61, "step": 3750, "train_speed(iter/s)": 0.207157 }, { "acc": 0.72756691, "epoch": 0.09525621511922881, "grad_norm": 2.53125, "learning_rate": 9.525621511922883e-06, "loss": 1.18286591, "memory(GiB)": 368.61, "step": 3755, "train_speed(iter/s)": 0.207193 }, { "acc": 0.71843328, "epoch": 0.09538305428716387, "grad_norm": 2.078125, "learning_rate": 9.538305428716389e-06, "loss": 1.11059551, "memory(GiB)": 368.61, "step": 3760, "train_speed(iter/s)": 0.207223 }, { "acc": 0.72565374, "epoch": 0.09550989345509893, "grad_norm": 2.5625, "learning_rate": 9.550989345509895e-06, "loss": 1.17086277, "memory(GiB)": 368.61, "step": 3765, "train_speed(iter/s)": 0.207261 }, { "acc": 0.72139635, "epoch": 0.09563673262303399, "grad_norm": 2.234375, "learning_rate": 9.5636732623034e-06, "loss": 1.14281101, "memory(GiB)": 368.61, "step": 3770, "train_speed(iter/s)": 0.207286 }, { "acc": 0.73156919, "epoch": 0.09576357179096905, "grad_norm": 2.234375, "learning_rate": 9.576357179096905e-06, "loss": 1.09604092, "memory(GiB)": 368.61, "step": 3775, "train_speed(iter/s)": 0.2073 }, { "acc": 0.71833076, "epoch": 0.0958904109589041, "grad_norm": 2.21875, "learning_rate": 9.589041095890411e-06, "loss": 1.15307951, "memory(GiB)": 368.61, "step": 3780, "train_speed(iter/s)": 0.207305 }, { "acc": 0.71240215, "epoch": 0.09601725012683916, "grad_norm": 2.09375, "learning_rate": 9.601725012683919e-06, "loss": 1.16257582, "memory(GiB)": 368.61, "step": 3785, "train_speed(iter/s)": 0.207339 }, { "acc": 0.72154679, "epoch": 0.09614408929477422, "grad_norm": 2.125, "learning_rate": 9.614408929477423e-06, "loss": 1.15451374, "memory(GiB)": 368.61, "step": 3790, "train_speed(iter/s)": 0.207383 }, { "acc": 0.7117815, "epoch": 0.09627092846270928, "grad_norm": 1.7578125, "learning_rate": 9.627092846270929e-06, "loss": 1.17853947, "memory(GiB)": 368.61, "step": 3795, "train_speed(iter/s)": 0.207421 }, { "acc": 0.71945543, "epoch": 0.09639776763064434, "grad_norm": 2.34375, "learning_rate": 9.639776763064435e-06, "loss": 1.13866615, "memory(GiB)": 368.61, "step": 3800, "train_speed(iter/s)": 0.207469 }, { "acc": 0.7228241, "epoch": 0.0965246067985794, "grad_norm": 1.890625, "learning_rate": 9.65246067985794e-06, "loss": 1.14753962, "memory(GiB)": 368.61, "step": 3805, "train_speed(iter/s)": 0.207491 }, { "acc": 0.72271547, "epoch": 0.09665144596651445, "grad_norm": 2.1875, "learning_rate": 9.665144596651447e-06, "loss": 1.12169657, "memory(GiB)": 368.61, "step": 3810, "train_speed(iter/s)": 0.207511 }, { "acc": 0.69849081, "epoch": 0.09677828513444951, "grad_norm": 3.03125, "learning_rate": 9.677828513444953e-06, "loss": 1.22858591, "memory(GiB)": 368.61, "step": 3815, "train_speed(iter/s)": 0.207575 }, { "acc": 0.70891495, "epoch": 0.09690512430238457, "grad_norm": 2.71875, "learning_rate": 9.690512430238459e-06, "loss": 1.22356052, "memory(GiB)": 368.61, "step": 3820, "train_speed(iter/s)": 0.207607 }, { "acc": 0.74672794, "epoch": 0.09703196347031963, "grad_norm": 2.015625, "learning_rate": 9.703196347031965e-06, "loss": 1.02637835, "memory(GiB)": 368.61, "step": 3825, "train_speed(iter/s)": 0.207605 }, { "acc": 0.73932981, "epoch": 0.09715880263825469, "grad_norm": 2.125, "learning_rate": 9.71588026382547e-06, "loss": 1.06295757, "memory(GiB)": 368.61, "step": 3830, "train_speed(iter/s)": 0.207609 }, { "acc": 0.71140919, "epoch": 0.09728564180618975, "grad_norm": 2.1875, "learning_rate": 9.728564180618977e-06, "loss": 1.16966009, "memory(GiB)": 368.61, "step": 3835, "train_speed(iter/s)": 0.207655 }, { "acc": 0.7183979, "epoch": 0.0974124809741248, "grad_norm": 2.140625, "learning_rate": 9.74124809741248e-06, "loss": 1.15571709, "memory(GiB)": 368.61, "step": 3840, "train_speed(iter/s)": 0.207678 }, { "acc": 0.72390752, "epoch": 0.09753932014205986, "grad_norm": 2.140625, "learning_rate": 9.753932014205988e-06, "loss": 1.14200726, "memory(GiB)": 368.61, "step": 3845, "train_speed(iter/s)": 0.207714 }, { "acc": 0.73451781, "epoch": 0.09766615930999492, "grad_norm": 2.0625, "learning_rate": 9.766615930999493e-06, "loss": 1.08548489, "memory(GiB)": 368.61, "step": 3850, "train_speed(iter/s)": 0.207738 }, { "acc": 0.72844839, "epoch": 0.09779299847792998, "grad_norm": 2.125, "learning_rate": 9.779299847792999e-06, "loss": 1.10300579, "memory(GiB)": 368.61, "step": 3855, "train_speed(iter/s)": 0.207738 }, { "acc": 0.73319054, "epoch": 0.09791983764586504, "grad_norm": 2.625, "learning_rate": 9.791983764586505e-06, "loss": 1.07280445, "memory(GiB)": 368.61, "step": 3860, "train_speed(iter/s)": 0.20776 }, { "acc": 0.73449535, "epoch": 0.0980466768138001, "grad_norm": 2.09375, "learning_rate": 9.80466768138001e-06, "loss": 1.13303289, "memory(GiB)": 368.61, "step": 3865, "train_speed(iter/s)": 0.207801 }, { "acc": 0.71911716, "epoch": 0.09817351598173515, "grad_norm": 2.265625, "learning_rate": 9.817351598173517e-06, "loss": 1.15660839, "memory(GiB)": 368.61, "step": 3870, "train_speed(iter/s)": 0.207798 }, { "acc": 0.72427192, "epoch": 0.09830035514967021, "grad_norm": 1.859375, "learning_rate": 9.830035514967023e-06, "loss": 1.17714634, "memory(GiB)": 368.61, "step": 3875, "train_speed(iter/s)": 0.207818 }, { "acc": 0.71657672, "epoch": 0.09842719431760527, "grad_norm": 2.328125, "learning_rate": 9.842719431760529e-06, "loss": 1.20942764, "memory(GiB)": 368.61, "step": 3880, "train_speed(iter/s)": 0.207852 }, { "acc": 0.73161025, "epoch": 0.09855403348554033, "grad_norm": 2.421875, "learning_rate": 9.855403348554034e-06, "loss": 1.09776306, "memory(GiB)": 368.61, "step": 3885, "train_speed(iter/s)": 0.207889 }, { "acc": 0.70724621, "epoch": 0.09868087265347539, "grad_norm": 2.375, "learning_rate": 9.86808726534754e-06, "loss": 1.15629044, "memory(GiB)": 368.61, "step": 3890, "train_speed(iter/s)": 0.207939 }, { "acc": 0.718472, "epoch": 0.09880771182141045, "grad_norm": 2.21875, "learning_rate": 9.880771182141046e-06, "loss": 1.21797066, "memory(GiB)": 368.61, "step": 3895, "train_speed(iter/s)": 0.207982 }, { "acc": 0.73062916, "epoch": 0.0989345509893455, "grad_norm": 2.171875, "learning_rate": 9.89345509893455e-06, "loss": 1.12904835, "memory(GiB)": 368.61, "step": 3900, "train_speed(iter/s)": 0.208016 }, { "acc": 0.73008966, "epoch": 0.09906139015728056, "grad_norm": 2.296875, "learning_rate": 9.906139015728058e-06, "loss": 1.14670868, "memory(GiB)": 368.61, "step": 3905, "train_speed(iter/s)": 0.208048 }, { "acc": 0.71996622, "epoch": 0.09918822932521562, "grad_norm": 2.046875, "learning_rate": 9.918822932521563e-06, "loss": 1.16990833, "memory(GiB)": 368.61, "step": 3910, "train_speed(iter/s)": 0.208049 }, { "acc": 0.7248909, "epoch": 0.09931506849315068, "grad_norm": 2.140625, "learning_rate": 9.931506849315069e-06, "loss": 1.12887154, "memory(GiB)": 368.61, "step": 3915, "train_speed(iter/s)": 0.208094 }, { "acc": 0.72297258, "epoch": 0.09944190766108574, "grad_norm": 2.046875, "learning_rate": 9.944190766108575e-06, "loss": 1.1374321, "memory(GiB)": 368.61, "step": 3920, "train_speed(iter/s)": 0.208133 }, { "acc": 0.73094349, "epoch": 0.0995687468290208, "grad_norm": 2.421875, "learning_rate": 9.95687468290208e-06, "loss": 1.0607029, "memory(GiB)": 368.61, "step": 3925, "train_speed(iter/s)": 0.208165 }, { "acc": 0.72563038, "epoch": 0.09969558599695585, "grad_norm": 2.390625, "learning_rate": 9.969558599695586e-06, "loss": 1.14585686, "memory(GiB)": 368.61, "step": 3930, "train_speed(iter/s)": 0.208176 }, { "acc": 0.71943402, "epoch": 0.09982242516489091, "grad_norm": 2.28125, "learning_rate": 9.982242516489092e-06, "loss": 1.15335836, "memory(GiB)": 368.61, "step": 3935, "train_speed(iter/s)": 0.208207 }, { "acc": 0.73232598, "epoch": 0.09994926433282597, "grad_norm": 2.21875, "learning_rate": 9.994926433282598e-06, "loss": 1.0936141, "memory(GiB)": 368.61, "step": 3940, "train_speed(iter/s)": 0.20823 }, { "acc": 0.74575052, "epoch": 0.10007610350076103, "grad_norm": 2.125, "learning_rate": 9.999999960413982e-06, "loss": 1.06946163, "memory(GiB)": 368.61, "step": 3945, "train_speed(iter/s)": 0.208257 }, { "acc": 0.71907654, "epoch": 0.10020294266869609, "grad_norm": 3.109375, "learning_rate": 9.99999971849943e-06, "loss": 1.14854164, "memory(GiB)": 368.61, "step": 3950, "train_speed(iter/s)": 0.208284 }, { "acc": 0.72422757, "epoch": 0.10032978183663115, "grad_norm": 2.15625, "learning_rate": 9.999999256662563e-06, "loss": 1.14833174, "memory(GiB)": 368.61, "step": 3955, "train_speed(iter/s)": 0.208332 }, { "acc": 0.73814011, "epoch": 0.1004566210045662, "grad_norm": 2.46875, "learning_rate": 9.999998574903408e-06, "loss": 1.07857704, "memory(GiB)": 368.61, "step": 3960, "train_speed(iter/s)": 0.208372 }, { "acc": 0.72978587, "epoch": 0.10058346017250126, "grad_norm": 2.546875, "learning_rate": 9.99999767322199e-06, "loss": 1.13076181, "memory(GiB)": 368.61, "step": 3965, "train_speed(iter/s)": 0.20839 }, { "acc": 0.72294054, "epoch": 0.10071029934043632, "grad_norm": 2.03125, "learning_rate": 9.999996551618353e-06, "loss": 1.09649601, "memory(GiB)": 368.61, "step": 3970, "train_speed(iter/s)": 0.208427 }, { "acc": 0.72119374, "epoch": 0.10083713850837138, "grad_norm": 2.375, "learning_rate": 9.999995210092545e-06, "loss": 1.132864, "memory(GiB)": 368.61, "step": 3975, "train_speed(iter/s)": 0.208469 }, { "acc": 0.72773199, "epoch": 0.10096397767630644, "grad_norm": 2.78125, "learning_rate": 9.999993648644622e-06, "loss": 1.09960518, "memory(GiB)": 368.61, "step": 3980, "train_speed(iter/s)": 0.208511 }, { "acc": 0.72919416, "epoch": 0.1010908168442415, "grad_norm": 2.640625, "learning_rate": 9.999991867274656e-06, "loss": 1.13299809, "memory(GiB)": 368.61, "step": 3985, "train_speed(iter/s)": 0.208537 }, { "acc": 0.73355103, "epoch": 0.10121765601217655, "grad_norm": 2.4375, "learning_rate": 9.999989865982725e-06, "loss": 1.04888554, "memory(GiB)": 368.61, "step": 3990, "train_speed(iter/s)": 0.208574 }, { "acc": 0.71150265, "epoch": 0.10134449518011161, "grad_norm": 2.28125, "learning_rate": 9.999987644768917e-06, "loss": 1.10893517, "memory(GiB)": 368.61, "step": 3995, "train_speed(iter/s)": 0.208596 }, { "acc": 0.72601113, "epoch": 0.10147133434804667, "grad_norm": 2.296875, "learning_rate": 9.999985203633327e-06, "loss": 1.12000761, "memory(GiB)": 368.61, "step": 4000, "train_speed(iter/s)": 0.208632 }, { "epoch": 0.10147133434804667, "eval_acc": 0.7175834500644824, "eval_loss": 1.0784469842910767, "eval_runtime": 384.9738, "eval_samples_per_second": 16.547, "eval_steps_per_second": 8.273, "step": 4000 }, { "acc": 0.72615023, "epoch": 0.10159817351598173, "grad_norm": 2.734375, "learning_rate": 9.999982542576065e-06, "loss": 1.07933903, "memory(GiB)": 368.61, "step": 4005, "train_speed(iter/s)": 0.201201 }, { "acc": 0.73489099, "epoch": 0.10172501268391679, "grad_norm": 2.390625, "learning_rate": 9.999979661597247e-06, "loss": 1.12802525, "memory(GiB)": 368.61, "step": 4010, "train_speed(iter/s)": 0.201226 }, { "acc": 0.73010082, "epoch": 0.10185185185185185, "grad_norm": 2.140625, "learning_rate": 9.999976560697002e-06, "loss": 1.08718491, "memory(GiB)": 368.61, "step": 4015, "train_speed(iter/s)": 0.201259 }, { "acc": 0.73530121, "epoch": 0.1019786910197869, "grad_norm": 2.4375, "learning_rate": 9.999973239875462e-06, "loss": 1.13823023, "memory(GiB)": 368.61, "step": 4020, "train_speed(iter/s)": 0.20129 }, { "acc": 0.73300304, "epoch": 0.10210553018772196, "grad_norm": 2.46875, "learning_rate": 9.999969699132776e-06, "loss": 1.08100109, "memory(GiB)": 368.61, "step": 4025, "train_speed(iter/s)": 0.201335 }, { "acc": 0.71455493, "epoch": 0.10223236935565702, "grad_norm": 1.921875, "learning_rate": 9.999965938469102e-06, "loss": 1.14935837, "memory(GiB)": 368.61, "step": 4030, "train_speed(iter/s)": 0.201379 }, { "acc": 0.72439108, "epoch": 0.10235920852359208, "grad_norm": 2.359375, "learning_rate": 9.9999619578846e-06, "loss": 1.12579231, "memory(GiB)": 368.61, "step": 4035, "train_speed(iter/s)": 0.201421 }, { "acc": 0.73615437, "epoch": 0.10248604769152714, "grad_norm": 2.703125, "learning_rate": 9.999957757379451e-06, "loss": 1.12910175, "memory(GiB)": 368.61, "step": 4040, "train_speed(iter/s)": 0.201445 }, { "acc": 0.74248333, "epoch": 0.1026128868594622, "grad_norm": 2.84375, "learning_rate": 9.999953336953834e-06, "loss": 1.06426411, "memory(GiB)": 368.61, "step": 4045, "train_speed(iter/s)": 0.201454 }, { "acc": 0.72005062, "epoch": 0.10273972602739725, "grad_norm": 2.21875, "learning_rate": 9.999948696607946e-06, "loss": 1.13753414, "memory(GiB)": 368.61, "step": 4050, "train_speed(iter/s)": 0.201507 }, { "acc": 0.7211575, "epoch": 0.10286656519533231, "grad_norm": 2.03125, "learning_rate": 9.999943836341992e-06, "loss": 1.12276173, "memory(GiB)": 368.61, "step": 4055, "train_speed(iter/s)": 0.201548 }, { "acc": 0.75319786, "epoch": 0.10299340436326737, "grad_norm": 2.40625, "learning_rate": 9.999938756156185e-06, "loss": 1.0409399, "memory(GiB)": 368.61, "step": 4060, "train_speed(iter/s)": 0.201565 }, { "acc": 0.73089991, "epoch": 0.10312024353120243, "grad_norm": 2.296875, "learning_rate": 9.999933456050747e-06, "loss": 1.15138798, "memory(GiB)": 368.61, "step": 4065, "train_speed(iter/s)": 0.201589 }, { "acc": 0.71593895, "epoch": 0.10324708269913749, "grad_norm": 2.15625, "learning_rate": 9.999927936025914e-06, "loss": 1.17818222, "memory(GiB)": 368.61, "step": 4070, "train_speed(iter/s)": 0.20162 }, { "acc": 0.72774053, "epoch": 0.10337392186707255, "grad_norm": 2.28125, "learning_rate": 9.999922196081928e-06, "loss": 1.06743088, "memory(GiB)": 368.61, "step": 4075, "train_speed(iter/s)": 0.201604 }, { "acc": 0.73214121, "epoch": 0.1035007610350076, "grad_norm": 2.09375, "learning_rate": 9.99991623621904e-06, "loss": 1.07967348, "memory(GiB)": 368.61, "step": 4080, "train_speed(iter/s)": 0.201623 }, { "acc": 0.72565269, "epoch": 0.10362760020294266, "grad_norm": 2.515625, "learning_rate": 9.999910056437512e-06, "loss": 1.15279636, "memory(GiB)": 368.61, "step": 4085, "train_speed(iter/s)": 0.201681 }, { "acc": 0.73479567, "epoch": 0.10375443937087772, "grad_norm": 1.890625, "learning_rate": 9.999903656737618e-06, "loss": 1.11676807, "memory(GiB)": 368.61, "step": 4090, "train_speed(iter/s)": 0.201722 }, { "acc": 0.73043327, "epoch": 0.10388127853881278, "grad_norm": 2.25, "learning_rate": 9.999897037119637e-06, "loss": 1.06439323, "memory(GiB)": 368.61, "step": 4095, "train_speed(iter/s)": 0.20176 }, { "acc": 0.72920628, "epoch": 0.10400811770674784, "grad_norm": 2.140625, "learning_rate": 9.999890197583862e-06, "loss": 1.16613636, "memory(GiB)": 368.61, "step": 4100, "train_speed(iter/s)": 0.201802 }, { "acc": 0.71723757, "epoch": 0.1041349568746829, "grad_norm": 2.875, "learning_rate": 9.999883138130593e-06, "loss": 1.14026718, "memory(GiB)": 368.61, "step": 4105, "train_speed(iter/s)": 0.201847 }, { "acc": 0.73431988, "epoch": 0.10426179604261795, "grad_norm": 2.28125, "learning_rate": 9.999875858760143e-06, "loss": 1.10393744, "memory(GiB)": 368.61, "step": 4110, "train_speed(iter/s)": 0.201894 }, { "acc": 0.72533541, "epoch": 0.10438863521055301, "grad_norm": 2.328125, "learning_rate": 9.999868359472826e-06, "loss": 1.18306475, "memory(GiB)": 368.61, "step": 4115, "train_speed(iter/s)": 0.201931 }, { "acc": 0.73074913, "epoch": 0.10451547437848807, "grad_norm": 2.0625, "learning_rate": 9.999860640268977e-06, "loss": 1.1386095, "memory(GiB)": 368.61, "step": 4120, "train_speed(iter/s)": 0.201974 }, { "acc": 0.73715639, "epoch": 0.10464231354642313, "grad_norm": 2.171875, "learning_rate": 9.999852701148935e-06, "loss": 1.11335583, "memory(GiB)": 368.61, "step": 4125, "train_speed(iter/s)": 0.202006 }, { "acc": 0.73897505, "epoch": 0.10476915271435819, "grad_norm": 2.53125, "learning_rate": 9.999844542113049e-06, "loss": 1.12533932, "memory(GiB)": 368.61, "step": 4130, "train_speed(iter/s)": 0.202049 }, { "acc": 0.71897283, "epoch": 0.10489599188229325, "grad_norm": 1.9375, "learning_rate": 9.999836163161675e-06, "loss": 1.14538965, "memory(GiB)": 368.61, "step": 4135, "train_speed(iter/s)": 0.202074 }, { "acc": 0.75014458, "epoch": 0.1050228310502283, "grad_norm": 2.71875, "learning_rate": 9.999827564295187e-06, "loss": 1.01045284, "memory(GiB)": 368.61, "step": 4140, "train_speed(iter/s)": 0.202101 }, { "acc": 0.7351728, "epoch": 0.10514967021816336, "grad_norm": 2.15625, "learning_rate": 9.999818745513958e-06, "loss": 1.08135662, "memory(GiB)": 368.61, "step": 4145, "train_speed(iter/s)": 0.202132 }, { "acc": 0.72742052, "epoch": 0.10527650938609842, "grad_norm": 2.265625, "learning_rate": 9.99980970681838e-06, "loss": 1.13863125, "memory(GiB)": 368.61, "step": 4150, "train_speed(iter/s)": 0.202159 }, { "acc": 0.74400134, "epoch": 0.10540334855403348, "grad_norm": 2.421875, "learning_rate": 9.999800448208846e-06, "loss": 1.07212753, "memory(GiB)": 368.61, "step": 4155, "train_speed(iter/s)": 0.202211 }, { "acc": 0.72932186, "epoch": 0.10553018772196854, "grad_norm": 2.015625, "learning_rate": 9.999790969685767e-06, "loss": 1.06633015, "memory(GiB)": 368.61, "step": 4160, "train_speed(iter/s)": 0.202243 }, { "acc": 0.73130674, "epoch": 0.1056570268899036, "grad_norm": 2.828125, "learning_rate": 9.999781271249559e-06, "loss": 1.12501364, "memory(GiB)": 368.61, "step": 4165, "train_speed(iter/s)": 0.202285 }, { "acc": 0.73168449, "epoch": 0.10578386605783865, "grad_norm": 2.140625, "learning_rate": 9.999771352900647e-06, "loss": 1.08586292, "memory(GiB)": 368.61, "step": 4170, "train_speed(iter/s)": 0.202329 }, { "acc": 0.73958664, "epoch": 0.10591070522577371, "grad_norm": 2.109375, "learning_rate": 9.999761214639469e-06, "loss": 1.0075985, "memory(GiB)": 368.61, "step": 4175, "train_speed(iter/s)": 0.202347 }, { "acc": 0.73836336, "epoch": 0.10603754439370877, "grad_norm": 1.9140625, "learning_rate": 9.999750856466472e-06, "loss": 1.05192966, "memory(GiB)": 368.61, "step": 4180, "train_speed(iter/s)": 0.20238 }, { "acc": 0.73840532, "epoch": 0.10616438356164383, "grad_norm": 2.09375, "learning_rate": 9.99974027838211e-06, "loss": 1.03804703, "memory(GiB)": 368.61, "step": 4185, "train_speed(iter/s)": 0.202421 }, { "acc": 0.73273888, "epoch": 0.10629122272957889, "grad_norm": 1.6953125, "learning_rate": 9.999729480386846e-06, "loss": 1.10012093, "memory(GiB)": 368.61, "step": 4190, "train_speed(iter/s)": 0.202458 }, { "acc": 0.73902998, "epoch": 0.10641806189751395, "grad_norm": 2.171875, "learning_rate": 9.999718462481157e-06, "loss": 1.09421997, "memory(GiB)": 368.61, "step": 4195, "train_speed(iter/s)": 0.202493 }, { "acc": 0.74455967, "epoch": 0.106544901065449, "grad_norm": 2.0625, "learning_rate": 9.99970722466553e-06, "loss": 1.06415596, "memory(GiB)": 368.61, "step": 4200, "train_speed(iter/s)": 0.202536 }, { "acc": 0.73479996, "epoch": 0.10667174023338406, "grad_norm": 2.359375, "learning_rate": 9.999695766940458e-06, "loss": 1.06842318, "memory(GiB)": 368.61, "step": 4205, "train_speed(iter/s)": 0.202586 }, { "acc": 0.72158933, "epoch": 0.10679857940131912, "grad_norm": 1.921875, "learning_rate": 9.999684089306442e-06, "loss": 1.10685272, "memory(GiB)": 368.61, "step": 4210, "train_speed(iter/s)": 0.20263 }, { "acc": 0.72949247, "epoch": 0.10692541856925418, "grad_norm": 2.015625, "learning_rate": 9.999672191763999e-06, "loss": 1.11073589, "memory(GiB)": 368.61, "step": 4215, "train_speed(iter/s)": 0.20266 }, { "acc": 0.73196182, "epoch": 0.10705225773718924, "grad_norm": 2.21875, "learning_rate": 9.99966007431365e-06, "loss": 1.03503838, "memory(GiB)": 368.61, "step": 4220, "train_speed(iter/s)": 0.202669 }, { "acc": 0.73152995, "epoch": 0.1071790969051243, "grad_norm": 1.7265625, "learning_rate": 9.99964773695593e-06, "loss": 1.12993774, "memory(GiB)": 368.61, "step": 4225, "train_speed(iter/s)": 0.202693 }, { "acc": 0.74756784, "epoch": 0.10730593607305935, "grad_norm": 2.265625, "learning_rate": 9.999635179691381e-06, "loss": 1.08492727, "memory(GiB)": 368.61, "step": 4230, "train_speed(iter/s)": 0.202732 }, { "acc": 0.72928267, "epoch": 0.10743277524099441, "grad_norm": 2.046875, "learning_rate": 9.999622402520553e-06, "loss": 1.15069923, "memory(GiB)": 368.61, "step": 4235, "train_speed(iter/s)": 0.202776 }, { "acc": 0.72716722, "epoch": 0.10755961440892947, "grad_norm": 2.8125, "learning_rate": 9.999609405444012e-06, "loss": 1.10768471, "memory(GiB)": 368.61, "step": 4240, "train_speed(iter/s)": 0.202813 }, { "acc": 0.73785796, "epoch": 0.10768645357686453, "grad_norm": 2.265625, "learning_rate": 9.999596188462328e-06, "loss": 1.0645071, "memory(GiB)": 368.61, "step": 4245, "train_speed(iter/s)": 0.202848 }, { "acc": 0.73726802, "epoch": 0.10781329274479959, "grad_norm": 1.7578125, "learning_rate": 9.99958275157608e-06, "loss": 1.07647991, "memory(GiB)": 368.61, "step": 4250, "train_speed(iter/s)": 0.202876 }, { "acc": 0.73394074, "epoch": 0.10794013191273465, "grad_norm": 2.28125, "learning_rate": 9.999569094785862e-06, "loss": 1.16249142, "memory(GiB)": 368.61, "step": 4255, "train_speed(iter/s)": 0.20292 }, { "acc": 0.74482145, "epoch": 0.1080669710806697, "grad_norm": 2.046875, "learning_rate": 9.999555218092273e-06, "loss": 1.08304749, "memory(GiB)": 368.61, "step": 4260, "train_speed(iter/s)": 0.202949 }, { "acc": 0.73658199, "epoch": 0.10819381024860476, "grad_norm": 2.1875, "learning_rate": 9.999541121495926e-06, "loss": 1.09394016, "memory(GiB)": 368.61, "step": 4265, "train_speed(iter/s)": 0.202997 }, { "acc": 0.72492924, "epoch": 0.10832064941653982, "grad_norm": 2.546875, "learning_rate": 9.999526804997439e-06, "loss": 1.13653965, "memory(GiB)": 368.61, "step": 4270, "train_speed(iter/s)": 0.203034 }, { "acc": 0.72380414, "epoch": 0.10844748858447488, "grad_norm": 2.078125, "learning_rate": 9.99951226859744e-06, "loss": 1.14571848, "memory(GiB)": 368.61, "step": 4275, "train_speed(iter/s)": 0.203052 }, { "acc": 0.73573523, "epoch": 0.10857432775240994, "grad_norm": 10.1875, "learning_rate": 9.999497512296572e-06, "loss": 1.11834764, "memory(GiB)": 368.61, "step": 4280, "train_speed(iter/s)": 0.203093 }, { "acc": 0.74182081, "epoch": 0.108701166920345, "grad_norm": 3.109375, "learning_rate": 9.999482536095483e-06, "loss": 1.1052084, "memory(GiB)": 368.61, "step": 4285, "train_speed(iter/s)": 0.203135 }, { "acc": 0.73300514, "epoch": 0.10882800608828005, "grad_norm": 1.8671875, "learning_rate": 9.999467339994827e-06, "loss": 1.06736603, "memory(GiB)": 368.61, "step": 4290, "train_speed(iter/s)": 0.203167 }, { "acc": 0.74279714, "epoch": 0.10895484525621511, "grad_norm": 2.265625, "learning_rate": 9.99945192399528e-06, "loss": 1.07806911, "memory(GiB)": 368.61, "step": 4295, "train_speed(iter/s)": 0.203196 }, { "acc": 0.73072348, "epoch": 0.10908168442415017, "grad_norm": 2.21875, "learning_rate": 9.999436288097515e-06, "loss": 1.08448505, "memory(GiB)": 368.61, "step": 4300, "train_speed(iter/s)": 0.203203 }, { "acc": 0.72443891, "epoch": 0.10920852359208523, "grad_norm": 2.515625, "learning_rate": 9.99942043230222e-06, "loss": 1.12276554, "memory(GiB)": 368.61, "step": 4305, "train_speed(iter/s)": 0.203226 }, { "acc": 0.73188295, "epoch": 0.10933536276002029, "grad_norm": 2.234375, "learning_rate": 9.999404356610095e-06, "loss": 1.04763737, "memory(GiB)": 368.61, "step": 4310, "train_speed(iter/s)": 0.203251 }, { "acc": 0.73594446, "epoch": 0.10946220192795535, "grad_norm": 2.625, "learning_rate": 9.999388061021846e-06, "loss": 1.14673843, "memory(GiB)": 368.61, "step": 4315, "train_speed(iter/s)": 0.20328 }, { "acc": 0.72766814, "epoch": 0.1095890410958904, "grad_norm": 2.328125, "learning_rate": 9.99937154553819e-06, "loss": 1.12093315, "memory(GiB)": 368.61, "step": 4320, "train_speed(iter/s)": 0.203332 }, { "acc": 0.71638746, "epoch": 0.10971588026382546, "grad_norm": 2.296875, "learning_rate": 9.999354810159852e-06, "loss": 1.11633253, "memory(GiB)": 368.61, "step": 4325, "train_speed(iter/s)": 0.20336 }, { "acc": 0.72178698, "epoch": 0.10984271943176052, "grad_norm": 2.46875, "learning_rate": 9.999337854887567e-06, "loss": 1.10521441, "memory(GiB)": 368.61, "step": 4330, "train_speed(iter/s)": 0.203392 }, { "acc": 0.73915243, "epoch": 0.10996955859969558, "grad_norm": 2.109375, "learning_rate": 9.999320679722086e-06, "loss": 1.07961521, "memory(GiB)": 368.61, "step": 4335, "train_speed(iter/s)": 0.203439 }, { "acc": 0.73213058, "epoch": 0.11009639776763064, "grad_norm": 2.03125, "learning_rate": 9.999303284664159e-06, "loss": 1.06685486, "memory(GiB)": 368.61, "step": 4340, "train_speed(iter/s)": 0.203478 }, { "acc": 0.74784279, "epoch": 0.1102232369355657, "grad_norm": 1.953125, "learning_rate": 9.999285669714555e-06, "loss": 1.07549686, "memory(GiB)": 368.61, "step": 4345, "train_speed(iter/s)": 0.203512 }, { "acc": 0.74648781, "epoch": 0.11035007610350075, "grad_norm": 2.046875, "learning_rate": 9.999267834874044e-06, "loss": 1.01258955, "memory(GiB)": 368.61, "step": 4350, "train_speed(iter/s)": 0.203554 }, { "acc": 0.730651, "epoch": 0.11047691527143581, "grad_norm": 2.46875, "learning_rate": 9.999249780143416e-06, "loss": 1.06727161, "memory(GiB)": 368.61, "step": 4355, "train_speed(iter/s)": 0.203572 }, { "acc": 0.73295298, "epoch": 0.11060375443937087, "grad_norm": 1.9453125, "learning_rate": 9.999231505523463e-06, "loss": 1.09005566, "memory(GiB)": 368.61, "step": 4360, "train_speed(iter/s)": 0.203599 }, { "acc": 0.71701088, "epoch": 0.11073059360730593, "grad_norm": 2.25, "learning_rate": 9.999213011014987e-06, "loss": 1.13907661, "memory(GiB)": 368.61, "step": 4365, "train_speed(iter/s)": 0.203624 }, { "acc": 0.72829995, "epoch": 0.11085743277524099, "grad_norm": 1.90625, "learning_rate": 9.999194296618805e-06, "loss": 1.10476789, "memory(GiB)": 368.61, "step": 4370, "train_speed(iter/s)": 0.203648 }, { "acc": 0.73527622, "epoch": 0.11098427194317605, "grad_norm": 2.796875, "learning_rate": 9.999175362335735e-06, "loss": 1.08773289, "memory(GiB)": 368.61, "step": 4375, "train_speed(iter/s)": 0.203693 }, { "acc": 0.73727493, "epoch": 0.1111111111111111, "grad_norm": 2.234375, "learning_rate": 9.999156208166614e-06, "loss": 1.09952059, "memory(GiB)": 368.61, "step": 4380, "train_speed(iter/s)": 0.203733 }, { "acc": 0.73393559, "epoch": 0.11123795027904616, "grad_norm": 2.34375, "learning_rate": 9.999136834112284e-06, "loss": 1.0781642, "memory(GiB)": 368.61, "step": 4385, "train_speed(iter/s)": 0.203768 }, { "acc": 0.72576594, "epoch": 0.11136478944698122, "grad_norm": 1.84375, "learning_rate": 9.999117240173597e-06, "loss": 1.13465786, "memory(GiB)": 368.61, "step": 4390, "train_speed(iter/s)": 0.203799 }, { "acc": 0.71982355, "epoch": 0.11149162861491628, "grad_norm": 2.4375, "learning_rate": 9.999097426351412e-06, "loss": 1.2001997, "memory(GiB)": 368.61, "step": 4395, "train_speed(iter/s)": 0.20385 }, { "acc": 0.73362098, "epoch": 0.11161846778285134, "grad_norm": 2.234375, "learning_rate": 9.999077392646606e-06, "loss": 1.09673872, "memory(GiB)": 368.61, "step": 4400, "train_speed(iter/s)": 0.203883 }, { "acc": 0.73949733, "epoch": 0.1117453069507864, "grad_norm": 2.28125, "learning_rate": 9.999057139060055e-06, "loss": 1.09516411, "memory(GiB)": 368.61, "step": 4405, "train_speed(iter/s)": 0.203916 }, { "acc": 0.73522491, "epoch": 0.11187214611872145, "grad_norm": 2.28125, "learning_rate": 9.999036665592653e-06, "loss": 1.08742676, "memory(GiB)": 368.61, "step": 4410, "train_speed(iter/s)": 0.203949 }, { "acc": 0.74120073, "epoch": 0.11199898528665651, "grad_norm": 2.09375, "learning_rate": 9.999015972245298e-06, "loss": 1.07966957, "memory(GiB)": 368.61, "step": 4415, "train_speed(iter/s)": 0.203966 }, { "acc": 0.72910409, "epoch": 0.11212582445459157, "grad_norm": 2.359375, "learning_rate": 9.998995059018901e-06, "loss": 1.14139767, "memory(GiB)": 368.61, "step": 4420, "train_speed(iter/s)": 0.203981 }, { "acc": 0.7221396, "epoch": 0.11225266362252663, "grad_norm": 2.46875, "learning_rate": 9.998973925914384e-06, "loss": 1.09901857, "memory(GiB)": 368.61, "step": 4425, "train_speed(iter/s)": 0.203975 }, { "acc": 0.72419214, "epoch": 0.11237950279046169, "grad_norm": 2.296875, "learning_rate": 9.998952572932675e-06, "loss": 1.10821667, "memory(GiB)": 368.61, "step": 4430, "train_speed(iter/s)": 0.204012 }, { "acc": 0.71324444, "epoch": 0.11250634195839675, "grad_norm": 2.28125, "learning_rate": 9.998931000074712e-06, "loss": 1.19466419, "memory(GiB)": 368.61, "step": 4435, "train_speed(iter/s)": 0.20406 }, { "acc": 0.7276597, "epoch": 0.1126331811263318, "grad_norm": 2.25, "learning_rate": 9.998909207341446e-06, "loss": 1.08697777, "memory(GiB)": 368.61, "step": 4440, "train_speed(iter/s)": 0.204105 }, { "acc": 0.72167816, "epoch": 0.11276002029426686, "grad_norm": 2.203125, "learning_rate": 9.998887194733833e-06, "loss": 1.17359447, "memory(GiB)": 368.61, "step": 4445, "train_speed(iter/s)": 0.204143 }, { "acc": 0.74027472, "epoch": 0.11288685946220192, "grad_norm": 2.140625, "learning_rate": 9.998864962252843e-06, "loss": 1.08167515, "memory(GiB)": 368.61, "step": 4450, "train_speed(iter/s)": 0.204144 }, { "acc": 0.72272854, "epoch": 0.11301369863013698, "grad_norm": 2.140625, "learning_rate": 9.998842509899456e-06, "loss": 1.14766121, "memory(GiB)": 368.61, "step": 4455, "train_speed(iter/s)": 0.204174 }, { "acc": 0.73261261, "epoch": 0.11314053779807204, "grad_norm": 2.703125, "learning_rate": 9.998819837674655e-06, "loss": 1.13349581, "memory(GiB)": 368.61, "step": 4460, "train_speed(iter/s)": 0.204208 }, { "acc": 0.74261642, "epoch": 0.1132673769660071, "grad_norm": 3.0625, "learning_rate": 9.99879694557944e-06, "loss": 1.08936958, "memory(GiB)": 368.61, "step": 4465, "train_speed(iter/s)": 0.204227 }, { "acc": 0.73690381, "epoch": 0.11339421613394216, "grad_norm": 2.171875, "learning_rate": 9.998773833614816e-06, "loss": 1.06881618, "memory(GiB)": 368.61, "step": 4470, "train_speed(iter/s)": 0.204259 }, { "acc": 0.74091244, "epoch": 0.11352105530187721, "grad_norm": 2.078125, "learning_rate": 9.998750501781803e-06, "loss": 1.03233252, "memory(GiB)": 368.61, "step": 4475, "train_speed(iter/s)": 0.204283 }, { "acc": 0.74290533, "epoch": 0.11364789446981227, "grad_norm": 2.328125, "learning_rate": 9.998726950081425e-06, "loss": 1.126684, "memory(GiB)": 368.61, "step": 4480, "train_speed(iter/s)": 0.204313 }, { "acc": 0.73230448, "epoch": 0.11377473363774733, "grad_norm": 2.484375, "learning_rate": 9.998703178514717e-06, "loss": 1.14807644, "memory(GiB)": 368.61, "step": 4485, "train_speed(iter/s)": 0.204349 }, { "acc": 0.72563591, "epoch": 0.11390157280568239, "grad_norm": 2.71875, "learning_rate": 9.998679187082724e-06, "loss": 1.10023937, "memory(GiB)": 368.61, "step": 4490, "train_speed(iter/s)": 0.204374 }, { "acc": 0.72738743, "epoch": 0.11402841197361745, "grad_norm": 1.9453125, "learning_rate": 9.998654975786506e-06, "loss": 1.08317318, "memory(GiB)": 368.61, "step": 4495, "train_speed(iter/s)": 0.204415 }, { "acc": 0.73488178, "epoch": 0.1141552511415525, "grad_norm": 2.359375, "learning_rate": 9.998630544627123e-06, "loss": 1.08726339, "memory(GiB)": 368.61, "step": 4500, "train_speed(iter/s)": 0.204461 }, { "acc": 0.7387538, "epoch": 0.11428209030948756, "grad_norm": 1.9140625, "learning_rate": 9.998605893605653e-06, "loss": 1.05813627, "memory(GiB)": 368.61, "step": 4505, "train_speed(iter/s)": 0.20449 }, { "acc": 0.73413391, "epoch": 0.11440892947742262, "grad_norm": 2.15625, "learning_rate": 9.998581022723178e-06, "loss": 1.11287193, "memory(GiB)": 368.61, "step": 4510, "train_speed(iter/s)": 0.204519 }, { "acc": 0.72237558, "epoch": 0.11453576864535768, "grad_norm": 2.28125, "learning_rate": 9.998555931980792e-06, "loss": 1.15495415, "memory(GiB)": 368.61, "step": 4515, "train_speed(iter/s)": 0.204539 }, { "acc": 0.72623529, "epoch": 0.11466260781329274, "grad_norm": 1.953125, "learning_rate": 9.998530621379599e-06, "loss": 1.09559889, "memory(GiB)": 368.61, "step": 4520, "train_speed(iter/s)": 0.204553 }, { "acc": 0.73735375, "epoch": 0.1147894469812278, "grad_norm": 1.8828125, "learning_rate": 9.998505090920713e-06, "loss": 1.09472332, "memory(GiB)": 368.61, "step": 4525, "train_speed(iter/s)": 0.204583 }, { "acc": 0.7465867, "epoch": 0.11491628614916286, "grad_norm": 2.15625, "learning_rate": 9.998479340605257e-06, "loss": 1.06760836, "memory(GiB)": 368.61, "step": 4530, "train_speed(iter/s)": 0.204621 }, { "acc": 0.73735104, "epoch": 0.11504312531709791, "grad_norm": 1.84375, "learning_rate": 9.99845337043436e-06, "loss": 1.04920616, "memory(GiB)": 368.61, "step": 4535, "train_speed(iter/s)": 0.204664 }, { "acc": 0.72176361, "epoch": 0.11516996448503297, "grad_norm": 2.796875, "learning_rate": 9.998427180409171e-06, "loss": 1.19836979, "memory(GiB)": 368.61, "step": 4540, "train_speed(iter/s)": 0.204661 }, { "acc": 0.72314157, "epoch": 0.11529680365296803, "grad_norm": 2.6875, "learning_rate": 9.998400770530836e-06, "loss": 1.12695131, "memory(GiB)": 368.61, "step": 4545, "train_speed(iter/s)": 0.204693 }, { "acc": 0.72723179, "epoch": 0.11542364282090309, "grad_norm": 2.140625, "learning_rate": 9.99837414080052e-06, "loss": 1.09243298, "memory(GiB)": 368.61, "step": 4550, "train_speed(iter/s)": 0.204734 }, { "acc": 0.71838446, "epoch": 0.11555048198883815, "grad_norm": 2.125, "learning_rate": 9.998347291219393e-06, "loss": 1.11409817, "memory(GiB)": 368.61, "step": 4555, "train_speed(iter/s)": 0.20477 }, { "acc": 0.74161329, "epoch": 0.1156773211567732, "grad_norm": 2.15625, "learning_rate": 9.998320221788635e-06, "loss": 1.09495687, "memory(GiB)": 368.61, "step": 4560, "train_speed(iter/s)": 0.2048 }, { "acc": 0.70339413, "epoch": 0.11580416032470826, "grad_norm": 2.734375, "learning_rate": 9.998292932509438e-06, "loss": 1.24353237, "memory(GiB)": 368.61, "step": 4565, "train_speed(iter/s)": 0.204843 }, { "acc": 0.71885376, "epoch": 0.11593099949264332, "grad_norm": 2.15625, "learning_rate": 9.998265423383003e-06, "loss": 1.16275454, "memory(GiB)": 368.61, "step": 4570, "train_speed(iter/s)": 0.204875 }, { "acc": 0.73622074, "epoch": 0.11605783866057838, "grad_norm": 2.6875, "learning_rate": 9.998237694410537e-06, "loss": 1.10113573, "memory(GiB)": 368.61, "step": 4575, "train_speed(iter/s)": 0.204911 }, { "acc": 0.74263153, "epoch": 0.11618467782851344, "grad_norm": 2.609375, "learning_rate": 9.998209745593264e-06, "loss": 1.04387999, "memory(GiB)": 368.61, "step": 4580, "train_speed(iter/s)": 0.204947 }, { "acc": 0.7303153, "epoch": 0.1163115169964485, "grad_norm": 2.046875, "learning_rate": 9.99818157693241e-06, "loss": 1.11537914, "memory(GiB)": 368.61, "step": 4585, "train_speed(iter/s)": 0.204983 }, { "acc": 0.74241514, "epoch": 0.11643835616438356, "grad_norm": 2.046875, "learning_rate": 9.998153188429216e-06, "loss": 0.99940262, "memory(GiB)": 368.61, "step": 4590, "train_speed(iter/s)": 0.205013 }, { "acc": 0.73466053, "epoch": 0.11656519533231861, "grad_norm": 2.125, "learning_rate": 9.99812458008493e-06, "loss": 1.06780243, "memory(GiB)": 368.61, "step": 4595, "train_speed(iter/s)": 0.205043 }, { "acc": 0.73946342, "epoch": 0.11669203450025367, "grad_norm": 1.78125, "learning_rate": 9.998095751900806e-06, "loss": 1.11986847, "memory(GiB)": 368.61, "step": 4600, "train_speed(iter/s)": 0.205076 }, { "acc": 0.72113538, "epoch": 0.11681887366818873, "grad_norm": 1.78125, "learning_rate": 9.99806670387812e-06, "loss": 1.10547886, "memory(GiB)": 368.61, "step": 4605, "train_speed(iter/s)": 0.205117 }, { "acc": 0.73250046, "epoch": 0.11694571283612379, "grad_norm": 2.453125, "learning_rate": 9.998037436018144e-06, "loss": 1.11229687, "memory(GiB)": 368.61, "step": 4610, "train_speed(iter/s)": 0.205146 }, { "acc": 0.730931, "epoch": 0.11707255200405885, "grad_norm": 2.21875, "learning_rate": 9.998007948322168e-06, "loss": 1.05645771, "memory(GiB)": 368.61, "step": 4615, "train_speed(iter/s)": 0.205159 }, { "acc": 0.73199759, "epoch": 0.1171993911719939, "grad_norm": 2.0625, "learning_rate": 9.997978240791487e-06, "loss": 1.09828739, "memory(GiB)": 368.61, "step": 4620, "train_speed(iter/s)": 0.205204 }, { "acc": 0.72364154, "epoch": 0.11732623033992896, "grad_norm": 2.671875, "learning_rate": 9.99794831342741e-06, "loss": 1.14161224, "memory(GiB)": 368.61, "step": 4625, "train_speed(iter/s)": 0.205236 }, { "acc": 0.73192129, "epoch": 0.11745306950786402, "grad_norm": 2.15625, "learning_rate": 9.99791816623125e-06, "loss": 1.13332157, "memory(GiB)": 368.61, "step": 4630, "train_speed(iter/s)": 0.205229 }, { "acc": 0.7294136, "epoch": 0.11757990867579908, "grad_norm": 1.9609375, "learning_rate": 9.997887799204335e-06, "loss": 1.07731295, "memory(GiB)": 368.61, "step": 4635, "train_speed(iter/s)": 0.205252 }, { "acc": 0.72085962, "epoch": 0.11770674784373414, "grad_norm": 2.03125, "learning_rate": 9.997857212348e-06, "loss": 1.10160913, "memory(GiB)": 368.61, "step": 4640, "train_speed(iter/s)": 0.205279 }, { "acc": 0.73619533, "epoch": 0.1178335870116692, "grad_norm": 1.8515625, "learning_rate": 9.997826405663593e-06, "loss": 1.09799013, "memory(GiB)": 368.61, "step": 4645, "train_speed(iter/s)": 0.205315 }, { "acc": 0.74938459, "epoch": 0.11796042617960426, "grad_norm": 2.203125, "learning_rate": 9.997795379152468e-06, "loss": 0.97700768, "memory(GiB)": 368.61, "step": 4650, "train_speed(iter/s)": 0.205334 }, { "acc": 0.70654278, "epoch": 0.11808726534753931, "grad_norm": 2.234375, "learning_rate": 9.997764132815985e-06, "loss": 1.17970467, "memory(GiB)": 368.61, "step": 4655, "train_speed(iter/s)": 0.20537 }, { "acc": 0.72907562, "epoch": 0.11821410451547437, "grad_norm": 2.375, "learning_rate": 9.997732666655524e-06, "loss": 1.15389357, "memory(GiB)": 368.61, "step": 4660, "train_speed(iter/s)": 0.205411 }, { "acc": 0.72025166, "epoch": 0.11834094368340943, "grad_norm": 2.203125, "learning_rate": 9.997700980672469e-06, "loss": 1.12308693, "memory(GiB)": 368.61, "step": 4665, "train_speed(iter/s)": 0.20546 }, { "acc": 0.739182, "epoch": 0.11846778285134449, "grad_norm": 2.15625, "learning_rate": 9.997669074868208e-06, "loss": 1.05897512, "memory(GiB)": 368.61, "step": 4670, "train_speed(iter/s)": 0.20549 }, { "acc": 0.73353171, "epoch": 0.11859462201927955, "grad_norm": 2.234375, "learning_rate": 9.997636949244151e-06, "loss": 1.11480618, "memory(GiB)": 368.61, "step": 4675, "train_speed(iter/s)": 0.205506 }, { "acc": 0.72743583, "epoch": 0.1187214611872146, "grad_norm": 2.203125, "learning_rate": 9.997604603801707e-06, "loss": 1.1475811, "memory(GiB)": 368.61, "step": 4680, "train_speed(iter/s)": 0.205545 }, { "acc": 0.71727109, "epoch": 0.11884830035514966, "grad_norm": 2.109375, "learning_rate": 9.9975720385423e-06, "loss": 1.18074265, "memory(GiB)": 368.61, "step": 4685, "train_speed(iter/s)": 0.205571 }, { "acc": 0.73539944, "epoch": 0.11897513952308472, "grad_norm": 2.546875, "learning_rate": 9.997539253467361e-06, "loss": 1.12869453, "memory(GiB)": 368.61, "step": 4690, "train_speed(iter/s)": 0.205592 }, { "acc": 0.72821569, "epoch": 0.11910197869101978, "grad_norm": 2.046875, "learning_rate": 9.997506248578334e-06, "loss": 1.10099325, "memory(GiB)": 368.61, "step": 4695, "train_speed(iter/s)": 0.20563 }, { "acc": 0.74204822, "epoch": 0.11922881785895484, "grad_norm": 2.34375, "learning_rate": 9.997473023876671e-06, "loss": 1.1389658, "memory(GiB)": 368.61, "step": 4700, "train_speed(iter/s)": 0.205667 }, { "acc": 0.72636604, "epoch": 0.1193556570268899, "grad_norm": 2.03125, "learning_rate": 9.997439579363831e-06, "loss": 1.16030073, "memory(GiB)": 368.61, "step": 4705, "train_speed(iter/s)": 0.20568 }, { "acc": 0.72652774, "epoch": 0.11948249619482496, "grad_norm": 2.078125, "learning_rate": 9.997405915041288e-06, "loss": 1.10917273, "memory(GiB)": 368.61, "step": 4710, "train_speed(iter/s)": 0.205719 }, { "acc": 0.74062185, "epoch": 0.11960933536276001, "grad_norm": 2.421875, "learning_rate": 9.99737203091052e-06, "loss": 1.05365734, "memory(GiB)": 368.61, "step": 4715, "train_speed(iter/s)": 0.205733 }, { "acc": 0.75091448, "epoch": 0.11973617453069507, "grad_norm": 2.03125, "learning_rate": 9.997337926973018e-06, "loss": 1.03722591, "memory(GiB)": 368.61, "step": 4720, "train_speed(iter/s)": 0.205763 }, { "acc": 0.72495894, "epoch": 0.11986301369863013, "grad_norm": 2.265625, "learning_rate": 9.997303603230282e-06, "loss": 1.10702133, "memory(GiB)": 368.61, "step": 4725, "train_speed(iter/s)": 0.205772 }, { "acc": 0.74773641, "epoch": 0.11998985286656519, "grad_norm": 2.265625, "learning_rate": 9.997269059683822e-06, "loss": 1.04578934, "memory(GiB)": 368.61, "step": 4730, "train_speed(iter/s)": 0.205794 }, { "acc": 0.73316746, "epoch": 0.12011669203450025, "grad_norm": 2.4375, "learning_rate": 9.997234296335159e-06, "loss": 1.13724632, "memory(GiB)": 368.61, "step": 4735, "train_speed(iter/s)": 0.205835 }, { "acc": 0.73592157, "epoch": 0.1202435312024353, "grad_norm": 1.90625, "learning_rate": 9.997199313185821e-06, "loss": 1.09396601, "memory(GiB)": 368.61, "step": 4740, "train_speed(iter/s)": 0.205852 }, { "acc": 0.74874482, "epoch": 0.12037037037037036, "grad_norm": 2.1875, "learning_rate": 9.997164110237345e-06, "loss": 0.9985774, "memory(GiB)": 368.61, "step": 4745, "train_speed(iter/s)": 0.205886 }, { "acc": 0.73845844, "epoch": 0.12049720953830542, "grad_norm": 2.59375, "learning_rate": 9.99712868749128e-06, "loss": 1.07653809, "memory(GiB)": 368.61, "step": 4750, "train_speed(iter/s)": 0.205912 }, { "acc": 0.75049324, "epoch": 0.12062404870624048, "grad_norm": 2.328125, "learning_rate": 9.997093044949186e-06, "loss": 1.08564758, "memory(GiB)": 368.61, "step": 4755, "train_speed(iter/s)": 0.205936 }, { "acc": 0.73999634, "epoch": 0.12075088787417554, "grad_norm": 2.203125, "learning_rate": 9.997057182612631e-06, "loss": 1.05616512, "memory(GiB)": 368.61, "step": 4760, "train_speed(iter/s)": 0.205957 }, { "acc": 0.73449068, "epoch": 0.1208777270421106, "grad_norm": 2.625, "learning_rate": 9.997021100483188e-06, "loss": 1.0858614, "memory(GiB)": 368.61, "step": 4765, "train_speed(iter/s)": 0.20599 }, { "acc": 0.73784256, "epoch": 0.12100456621004566, "grad_norm": 2.5, "learning_rate": 9.996984798562448e-06, "loss": 1.04819126, "memory(GiB)": 368.61, "step": 4770, "train_speed(iter/s)": 0.206013 }, { "acc": 0.72916946, "epoch": 0.12113140537798071, "grad_norm": 2.40625, "learning_rate": 9.996948276852008e-06, "loss": 1.10625143, "memory(GiB)": 368.61, "step": 4775, "train_speed(iter/s)": 0.206049 }, { "acc": 0.7325325, "epoch": 0.12125824454591577, "grad_norm": 2.125, "learning_rate": 9.99691153535347e-06, "loss": 1.14289112, "memory(GiB)": 368.61, "step": 4780, "train_speed(iter/s)": 0.20608 }, { "acc": 0.73488216, "epoch": 0.12138508371385083, "grad_norm": 2.171875, "learning_rate": 9.996874574068457e-06, "loss": 1.10477343, "memory(GiB)": 368.61, "step": 4785, "train_speed(iter/s)": 0.206098 }, { "acc": 0.73328724, "epoch": 0.12151192288178589, "grad_norm": 1.71875, "learning_rate": 9.996837392998586e-06, "loss": 1.09795494, "memory(GiB)": 368.61, "step": 4790, "train_speed(iter/s)": 0.206132 }, { "acc": 0.7379818, "epoch": 0.12163876204972095, "grad_norm": 2.484375, "learning_rate": 9.996799992145501e-06, "loss": 1.09061737, "memory(GiB)": 368.61, "step": 4795, "train_speed(iter/s)": 0.206163 }, { "acc": 0.74224205, "epoch": 0.121765601217656, "grad_norm": 2.171875, "learning_rate": 9.996762371510843e-06, "loss": 1.12616749, "memory(GiB)": 368.61, "step": 4800, "train_speed(iter/s)": 0.206204 }, { "acc": 0.73736839, "epoch": 0.12189244038559106, "grad_norm": 2.0, "learning_rate": 9.996724531096264e-06, "loss": 1.06043673, "memory(GiB)": 368.61, "step": 4805, "train_speed(iter/s)": 0.206237 }, { "acc": 0.73543396, "epoch": 0.12201927955352612, "grad_norm": 2.5625, "learning_rate": 9.996686470903434e-06, "loss": 1.09089069, "memory(GiB)": 368.61, "step": 4810, "train_speed(iter/s)": 0.206271 }, { "acc": 0.73606772, "epoch": 0.12214611872146118, "grad_norm": 2.0625, "learning_rate": 9.996648190934025e-06, "loss": 1.03652515, "memory(GiB)": 368.61, "step": 4815, "train_speed(iter/s)": 0.206311 }, { "acc": 0.73752975, "epoch": 0.12227295788939624, "grad_norm": 2.453125, "learning_rate": 9.996609691189718e-06, "loss": 1.09746094, "memory(GiB)": 368.61, "step": 4820, "train_speed(iter/s)": 0.2063 }, { "acc": 0.74759569, "epoch": 0.1223997970573313, "grad_norm": 2.328125, "learning_rate": 9.996570971672209e-06, "loss": 1.0781312, "memory(GiB)": 368.61, "step": 4825, "train_speed(iter/s)": 0.206332 }, { "acc": 0.71448674, "epoch": 0.12252663622526636, "grad_norm": 2.578125, "learning_rate": 9.996532032383202e-06, "loss": 1.16606874, "memory(GiB)": 368.61, "step": 4830, "train_speed(iter/s)": 0.206358 }, { "acc": 0.73486176, "epoch": 0.12265347539320141, "grad_norm": 2.40625, "learning_rate": 9.996492873324406e-06, "loss": 1.13605614, "memory(GiB)": 368.61, "step": 4835, "train_speed(iter/s)": 0.206382 }, { "acc": 0.73742075, "epoch": 0.12278031456113647, "grad_norm": 2.359375, "learning_rate": 9.996453494497546e-06, "loss": 1.06985912, "memory(GiB)": 368.61, "step": 4840, "train_speed(iter/s)": 0.206395 }, { "acc": 0.73339396, "epoch": 0.12290715372907153, "grad_norm": 2.78125, "learning_rate": 9.996413895904355e-06, "loss": 1.11756716, "memory(GiB)": 368.61, "step": 4845, "train_speed(iter/s)": 0.206426 }, { "acc": 0.72968969, "epoch": 0.12303399289700659, "grad_norm": 2.3125, "learning_rate": 9.996374077546573e-06, "loss": 1.07643394, "memory(GiB)": 368.61, "step": 4850, "train_speed(iter/s)": 0.20645 }, { "acc": 0.74239631, "epoch": 0.12316083206494165, "grad_norm": 2.21875, "learning_rate": 9.996334039425952e-06, "loss": 1.03351526, "memory(GiB)": 368.61, "step": 4855, "train_speed(iter/s)": 0.206479 }, { "acc": 0.74572573, "epoch": 0.1232876712328767, "grad_norm": 2.671875, "learning_rate": 9.996293781544255e-06, "loss": 1.02431793, "memory(GiB)": 368.61, "step": 4860, "train_speed(iter/s)": 0.20651 }, { "acc": 0.73357611, "epoch": 0.12341451040081176, "grad_norm": 2.21875, "learning_rate": 9.996253303903247e-06, "loss": 1.06011791, "memory(GiB)": 368.61, "step": 4865, "train_speed(iter/s)": 0.206549 }, { "acc": 0.73574548, "epoch": 0.12354134956874682, "grad_norm": 1.8359375, "learning_rate": 9.996212606504713e-06, "loss": 1.05312471, "memory(GiB)": 368.61, "step": 4870, "train_speed(iter/s)": 0.20658 }, { "acc": 0.74694891, "epoch": 0.12366818873668188, "grad_norm": 2.140625, "learning_rate": 9.996171689350444e-06, "loss": 1.11474571, "memory(GiB)": 368.61, "step": 4875, "train_speed(iter/s)": 0.206597 }, { "acc": 0.74361162, "epoch": 0.12379502790461694, "grad_norm": 2.0, "learning_rate": 9.996130552442237e-06, "loss": 1.04596367, "memory(GiB)": 368.61, "step": 4880, "train_speed(iter/s)": 0.206629 }, { "acc": 0.73144774, "epoch": 0.123921867072552, "grad_norm": 2.078125, "learning_rate": 9.996089195781902e-06, "loss": 1.02942114, "memory(GiB)": 368.61, "step": 4885, "train_speed(iter/s)": 0.206638 }, { "acc": 0.72643023, "epoch": 0.12404870624048706, "grad_norm": 2.265625, "learning_rate": 9.996047619371256e-06, "loss": 1.09169321, "memory(GiB)": 368.61, "step": 4890, "train_speed(iter/s)": 0.206655 }, { "acc": 0.73255768, "epoch": 0.12417554540842211, "grad_norm": 2.5625, "learning_rate": 9.996005823212132e-06, "loss": 1.06882858, "memory(GiB)": 368.61, "step": 4895, "train_speed(iter/s)": 0.206686 }, { "acc": 0.73362923, "epoch": 0.12430238457635717, "grad_norm": 2.765625, "learning_rate": 9.995963807306368e-06, "loss": 1.07882328, "memory(GiB)": 368.61, "step": 4900, "train_speed(iter/s)": 0.206723 }, { "acc": 0.73367071, "epoch": 0.12442922374429223, "grad_norm": 2.109375, "learning_rate": 9.995921571655808e-06, "loss": 1.07583847, "memory(GiB)": 368.61, "step": 4905, "train_speed(iter/s)": 0.20674 }, { "acc": 0.72105899, "epoch": 0.12455606291222729, "grad_norm": 2.578125, "learning_rate": 9.995879116262312e-06, "loss": 1.12521248, "memory(GiB)": 368.61, "step": 4910, "train_speed(iter/s)": 0.206775 }, { "acc": 0.74954991, "epoch": 0.12468290208016235, "grad_norm": 2.0625, "learning_rate": 9.995836441127749e-06, "loss": 1.01059141, "memory(GiB)": 368.61, "step": 4915, "train_speed(iter/s)": 0.206792 }, { "acc": 0.74116688, "epoch": 0.1248097412480974, "grad_norm": 2.03125, "learning_rate": 9.995793546253993e-06, "loss": 1.01083717, "memory(GiB)": 368.61, "step": 4920, "train_speed(iter/s)": 0.20682 }, { "acc": 0.73786387, "epoch": 0.12493658041603246, "grad_norm": 2.59375, "learning_rate": 9.995750431642933e-06, "loss": 1.08943453, "memory(GiB)": 368.61, "step": 4925, "train_speed(iter/s)": 0.206854 }, { "acc": 0.73638697, "epoch": 0.12506341958396752, "grad_norm": 2.6875, "learning_rate": 9.995707097296465e-06, "loss": 1.09876003, "memory(GiB)": 368.61, "step": 4930, "train_speed(iter/s)": 0.206887 }, { "acc": 0.73142467, "epoch": 0.1251902587519026, "grad_norm": 2.15625, "learning_rate": 9.995663543216493e-06, "loss": 1.09359198, "memory(GiB)": 368.61, "step": 4935, "train_speed(iter/s)": 0.206916 }, { "acc": 0.74579763, "epoch": 0.12531709791983764, "grad_norm": 2.171875, "learning_rate": 9.995619769404936e-06, "loss": 1.07290249, "memory(GiB)": 368.61, "step": 4940, "train_speed(iter/s)": 0.206941 }, { "acc": 0.72314234, "epoch": 0.1254439370877727, "grad_norm": 2.65625, "learning_rate": 9.995575775863717e-06, "loss": 1.13215933, "memory(GiB)": 368.61, "step": 4945, "train_speed(iter/s)": 0.206967 }, { "acc": 0.74907579, "epoch": 0.12557077625570776, "grad_norm": 2.46875, "learning_rate": 9.995531562594773e-06, "loss": 1.08404055, "memory(GiB)": 368.61, "step": 4950, "train_speed(iter/s)": 0.206965 }, { "acc": 0.74484444, "epoch": 0.12569761542364283, "grad_norm": 2.46875, "learning_rate": 9.995487129600046e-06, "loss": 1.05183678, "memory(GiB)": 368.61, "step": 4955, "train_speed(iter/s)": 0.206982 }, { "acc": 0.74490404, "epoch": 0.12582445459157787, "grad_norm": 2.125, "learning_rate": 9.995442476881491e-06, "loss": 1.07052021, "memory(GiB)": 368.61, "step": 4960, "train_speed(iter/s)": 0.207002 }, { "acc": 0.73371267, "epoch": 0.12595129375951294, "grad_norm": 1.9609375, "learning_rate": 9.995397604441076e-06, "loss": 1.05381908, "memory(GiB)": 368.61, "step": 4965, "train_speed(iter/s)": 0.207034 }, { "acc": 0.73564396, "epoch": 0.126078132927448, "grad_norm": 2.34375, "learning_rate": 9.995352512280767e-06, "loss": 1.12134399, "memory(GiB)": 368.61, "step": 4970, "train_speed(iter/s)": 0.207049 }, { "acc": 0.74131756, "epoch": 0.12620497209538306, "grad_norm": 2.828125, "learning_rate": 9.995307200402555e-06, "loss": 1.08391132, "memory(GiB)": 368.61, "step": 4975, "train_speed(iter/s)": 0.207073 }, { "acc": 0.73804545, "epoch": 0.1263318112633181, "grad_norm": 2.3125, "learning_rate": 9.995261668808429e-06, "loss": 1.01543541, "memory(GiB)": 368.61, "step": 4980, "train_speed(iter/s)": 0.207095 }, { "acc": 0.74626675, "epoch": 0.12645865043125318, "grad_norm": 2.234375, "learning_rate": 9.995215917500395e-06, "loss": 1.04763823, "memory(GiB)": 368.61, "step": 4985, "train_speed(iter/s)": 0.207114 }, { "acc": 0.72040882, "epoch": 0.12658548959918822, "grad_norm": 2.046875, "learning_rate": 9.995169946480459e-06, "loss": 1.08493061, "memory(GiB)": 368.61, "step": 4990, "train_speed(iter/s)": 0.207135 }, { "acc": 0.73347292, "epoch": 0.1267123287671233, "grad_norm": 2.46875, "learning_rate": 9.99512375575065e-06, "loss": 1.07559166, "memory(GiB)": 368.61, "step": 4995, "train_speed(iter/s)": 0.207159 }, { "acc": 0.740557, "epoch": 0.12683916793505834, "grad_norm": 2.484375, "learning_rate": 9.995077345312994e-06, "loss": 1.08530922, "memory(GiB)": 368.61, "step": 5000, "train_speed(iter/s)": 0.207186 }, { "epoch": 0.12683916793505834, "eval_acc": 0.7216257665507919, "eval_loss": 1.0562916994094849, "eval_runtime": 384.4596, "eval_samples_per_second": 16.569, "eval_steps_per_second": 8.284, "step": 5000 }, { "acc": 0.72824459, "epoch": 0.1269660071029934, "grad_norm": 2.375, "learning_rate": 9.995030715169535e-06, "loss": 1.12412777, "memory(GiB)": 368.61, "step": 5005, "train_speed(iter/s)": 0.201269 }, { "acc": 0.72367201, "epoch": 0.12709284627092846, "grad_norm": 2.015625, "learning_rate": 9.994983865322327e-06, "loss": 1.11835194, "memory(GiB)": 368.61, "step": 5010, "train_speed(iter/s)": 0.201301 }, { "acc": 0.73322611, "epoch": 0.12721968543886353, "grad_norm": 2.59375, "learning_rate": 9.994936795773424e-06, "loss": 1.08331318, "memory(GiB)": 368.61, "step": 5015, "train_speed(iter/s)": 0.20132 }, { "acc": 0.73807034, "epoch": 0.12734652460679857, "grad_norm": 1.8046875, "learning_rate": 9.994889506524903e-06, "loss": 1.06603317, "memory(GiB)": 368.61, "step": 5020, "train_speed(iter/s)": 0.201337 }, { "acc": 0.74327025, "epoch": 0.12747336377473364, "grad_norm": 2.265625, "learning_rate": 9.994841997578839e-06, "loss": 1.10782146, "memory(GiB)": 368.61, "step": 5025, "train_speed(iter/s)": 0.201363 }, { "acc": 0.73777027, "epoch": 0.1276002029426687, "grad_norm": 1.8046875, "learning_rate": 9.994794268937325e-06, "loss": 1.10203714, "memory(GiB)": 368.61, "step": 5030, "train_speed(iter/s)": 0.201364 }, { "acc": 0.72727509, "epoch": 0.12772704211060376, "grad_norm": 2.796875, "learning_rate": 9.994746320602457e-06, "loss": 1.12249403, "memory(GiB)": 368.61, "step": 5035, "train_speed(iter/s)": 0.201405 }, { "acc": 0.73437042, "epoch": 0.1278538812785388, "grad_norm": 1.953125, "learning_rate": 9.994698152576347e-06, "loss": 1.12131834, "memory(GiB)": 368.61, "step": 5040, "train_speed(iter/s)": 0.201441 }, { "acc": 0.73977976, "epoch": 0.12798072044647388, "grad_norm": 2.46875, "learning_rate": 9.994649764861114e-06, "loss": 1.05356274, "memory(GiB)": 368.61, "step": 5045, "train_speed(iter/s)": 0.201475 }, { "acc": 0.73122902, "epoch": 0.12810755961440892, "grad_norm": 2.234375, "learning_rate": 9.994601157458882e-06, "loss": 1.08431683, "memory(GiB)": 368.61, "step": 5050, "train_speed(iter/s)": 0.20151 }, { "acc": 0.71923437, "epoch": 0.128234398782344, "grad_norm": 2.015625, "learning_rate": 9.994552330371792e-06, "loss": 1.12386713, "memory(GiB)": 368.61, "step": 5055, "train_speed(iter/s)": 0.201545 }, { "acc": 0.72549477, "epoch": 0.12836123795027904, "grad_norm": 1.8046875, "learning_rate": 9.994503283601993e-06, "loss": 1.08051529, "memory(GiB)": 368.61, "step": 5060, "train_speed(iter/s)": 0.201569 }, { "acc": 0.7184411, "epoch": 0.1284880771182141, "grad_norm": 1.8359375, "learning_rate": 9.99445401715164e-06, "loss": 1.13278637, "memory(GiB)": 368.61, "step": 5065, "train_speed(iter/s)": 0.201583 }, { "acc": 0.73607502, "epoch": 0.12861491628614916, "grad_norm": 2.015625, "learning_rate": 9.994404531022901e-06, "loss": 1.06623478, "memory(GiB)": 368.61, "step": 5070, "train_speed(iter/s)": 0.201605 }, { "acc": 0.74576325, "epoch": 0.12874175545408423, "grad_norm": 2.671875, "learning_rate": 9.994354825217954e-06, "loss": 1.04341793, "memory(GiB)": 368.61, "step": 5075, "train_speed(iter/s)": 0.20164 }, { "acc": 0.72704086, "epoch": 0.12886859462201927, "grad_norm": 2.40625, "learning_rate": 9.99430489973898e-06, "loss": 1.10207262, "memory(GiB)": 368.61, "step": 5080, "train_speed(iter/s)": 0.201668 }, { "acc": 0.72334528, "epoch": 0.12899543378995434, "grad_norm": 3.375, "learning_rate": 9.994254754588182e-06, "loss": 1.15221004, "memory(GiB)": 368.61, "step": 5085, "train_speed(iter/s)": 0.201708 }, { "acc": 0.73719816, "epoch": 0.1291222729578894, "grad_norm": 1.9609375, "learning_rate": 9.99420438976776e-06, "loss": 1.05338459, "memory(GiB)": 368.61, "step": 5090, "train_speed(iter/s)": 0.201742 }, { "acc": 0.73267632, "epoch": 0.12924911212582446, "grad_norm": 2.421875, "learning_rate": 9.994153805279932e-06, "loss": 1.08468952, "memory(GiB)": 368.61, "step": 5095, "train_speed(iter/s)": 0.20178 }, { "acc": 0.73816757, "epoch": 0.1293759512937595, "grad_norm": 2.1875, "learning_rate": 9.994103001126923e-06, "loss": 1.01606159, "memory(GiB)": 368.61, "step": 5100, "train_speed(iter/s)": 0.201804 }, { "acc": 0.73447962, "epoch": 0.12950279046169458, "grad_norm": 2.6875, "learning_rate": 9.994051977310966e-06, "loss": 1.06735992, "memory(GiB)": 368.61, "step": 5105, "train_speed(iter/s)": 0.201847 }, { "acc": 0.71175413, "epoch": 0.12962962962962962, "grad_norm": 2.15625, "learning_rate": 9.994000733834307e-06, "loss": 1.17684975, "memory(GiB)": 368.61, "step": 5110, "train_speed(iter/s)": 0.201875 }, { "acc": 0.73795366, "epoch": 0.1297564687975647, "grad_norm": 1.8125, "learning_rate": 9.993949270699197e-06, "loss": 1.070327, "memory(GiB)": 368.61, "step": 5115, "train_speed(iter/s)": 0.201898 }, { "acc": 0.75203419, "epoch": 0.12988330796549974, "grad_norm": 2.125, "learning_rate": 9.993897587907904e-06, "loss": 1.02014542, "memory(GiB)": 368.61, "step": 5120, "train_speed(iter/s)": 0.201936 }, { "acc": 0.7342639, "epoch": 0.1300101471334348, "grad_norm": 2.09375, "learning_rate": 9.993845685462697e-06, "loss": 1.07588177, "memory(GiB)": 368.61, "step": 5125, "train_speed(iter/s)": 0.201958 }, { "acc": 0.71048422, "epoch": 0.13013698630136986, "grad_norm": 2.640625, "learning_rate": 9.993793563365864e-06, "loss": 1.12894049, "memory(GiB)": 368.61, "step": 5130, "train_speed(iter/s)": 0.201987 }, { "acc": 0.73050251, "epoch": 0.13026382546930493, "grad_norm": 2.03125, "learning_rate": 9.993741221619692e-06, "loss": 1.07797136, "memory(GiB)": 368.61, "step": 5135, "train_speed(iter/s)": 0.202018 }, { "acc": 0.72876673, "epoch": 0.13039066463723997, "grad_norm": 2.1875, "learning_rate": 9.993688660226486e-06, "loss": 1.08963242, "memory(GiB)": 368.61, "step": 5140, "train_speed(iter/s)": 0.202036 }, { "acc": 0.75479741, "epoch": 0.13051750380517504, "grad_norm": 2.609375, "learning_rate": 9.993635879188557e-06, "loss": 1.01997032, "memory(GiB)": 368.61, "step": 5145, "train_speed(iter/s)": 0.202034 }, { "acc": 0.73627357, "epoch": 0.1306443429731101, "grad_norm": 2.5, "learning_rate": 9.993582878508229e-06, "loss": 1.05918007, "memory(GiB)": 368.61, "step": 5150, "train_speed(iter/s)": 0.20206 }, { "acc": 0.72078791, "epoch": 0.13077118214104516, "grad_norm": 2.1875, "learning_rate": 9.993529658187829e-06, "loss": 1.10357323, "memory(GiB)": 368.61, "step": 5155, "train_speed(iter/s)": 0.202095 }, { "acc": 0.74228806, "epoch": 0.1308980213089802, "grad_norm": 2.328125, "learning_rate": 9.9934762182297e-06, "loss": 1.09633694, "memory(GiB)": 368.61, "step": 5160, "train_speed(iter/s)": 0.202127 }, { "acc": 0.74245152, "epoch": 0.13102486047691528, "grad_norm": 2.34375, "learning_rate": 9.993422558636194e-06, "loss": 1.07402945, "memory(GiB)": 368.61, "step": 5165, "train_speed(iter/s)": 0.202168 }, { "acc": 0.73509121, "epoch": 0.13115169964485032, "grad_norm": 1.96875, "learning_rate": 9.99336867940967e-06, "loss": 1.06787243, "memory(GiB)": 368.61, "step": 5170, "train_speed(iter/s)": 0.202178 }, { "acc": 0.74187522, "epoch": 0.1312785388127854, "grad_norm": 2.125, "learning_rate": 9.993314580552497e-06, "loss": 1.04444609, "memory(GiB)": 368.61, "step": 5175, "train_speed(iter/s)": 0.202202 }, { "acc": 0.73716736, "epoch": 0.13140537798072044, "grad_norm": 3.09375, "learning_rate": 9.993260262067054e-06, "loss": 1.05475731, "memory(GiB)": 368.61, "step": 5180, "train_speed(iter/s)": 0.202206 }, { "acc": 0.72331524, "epoch": 0.1315322171486555, "grad_norm": 2.59375, "learning_rate": 9.993205723955734e-06, "loss": 1.11614952, "memory(GiB)": 368.61, "step": 5185, "train_speed(iter/s)": 0.202236 }, { "acc": 0.74276218, "epoch": 0.13165905631659056, "grad_norm": 2.84375, "learning_rate": 9.993150966220933e-06, "loss": 1.0203434, "memory(GiB)": 368.61, "step": 5190, "train_speed(iter/s)": 0.20227 }, { "acc": 0.72057447, "epoch": 0.13178589548452563, "grad_norm": 2.265625, "learning_rate": 9.993095988865057e-06, "loss": 1.09866066, "memory(GiB)": 368.61, "step": 5195, "train_speed(iter/s)": 0.2023 }, { "acc": 0.72695518, "epoch": 0.13191273465246067, "grad_norm": 2.078125, "learning_rate": 9.99304079189053e-06, "loss": 1.12650051, "memory(GiB)": 368.61, "step": 5200, "train_speed(iter/s)": 0.202326 }, { "acc": 0.74184036, "epoch": 0.13203957382039574, "grad_norm": 2.9375, "learning_rate": 9.992985375299775e-06, "loss": 1.10648079, "memory(GiB)": 368.61, "step": 5205, "train_speed(iter/s)": 0.202335 }, { "acc": 0.72942271, "epoch": 0.1321664129883308, "grad_norm": 2.640625, "learning_rate": 9.992929739095232e-06, "loss": 1.07439613, "memory(GiB)": 368.61, "step": 5210, "train_speed(iter/s)": 0.202376 }, { "acc": 0.7304738, "epoch": 0.13229325215626586, "grad_norm": 2.421875, "learning_rate": 9.992873883279345e-06, "loss": 1.10954971, "memory(GiB)": 368.61, "step": 5215, "train_speed(iter/s)": 0.2024 }, { "acc": 0.7193428, "epoch": 0.1324200913242009, "grad_norm": 1.8828125, "learning_rate": 9.992817807854575e-06, "loss": 1.09677238, "memory(GiB)": 368.61, "step": 5220, "train_speed(iter/s)": 0.202435 }, { "acc": 0.73426552, "epoch": 0.13254693049213598, "grad_norm": 2.21875, "learning_rate": 9.992761512823386e-06, "loss": 1.1095108, "memory(GiB)": 368.61, "step": 5225, "train_speed(iter/s)": 0.202473 }, { "acc": 0.73511038, "epoch": 0.13267376966007102, "grad_norm": 2.734375, "learning_rate": 9.992704998188255e-06, "loss": 1.06814575, "memory(GiB)": 368.61, "step": 5230, "train_speed(iter/s)": 0.202491 }, { "acc": 0.71491041, "epoch": 0.1328006088280061, "grad_norm": 2.40625, "learning_rate": 9.992648263951668e-06, "loss": 1.13755341, "memory(GiB)": 368.61, "step": 5235, "train_speed(iter/s)": 0.202526 }, { "acc": 0.72461305, "epoch": 0.13292744799594114, "grad_norm": 2.046875, "learning_rate": 9.992591310116118e-06, "loss": 1.14645519, "memory(GiB)": 368.61, "step": 5240, "train_speed(iter/s)": 0.202555 }, { "acc": 0.7297473, "epoch": 0.1330542871638762, "grad_norm": 2.203125, "learning_rate": 9.992534136684112e-06, "loss": 1.05822468, "memory(GiB)": 368.61, "step": 5245, "train_speed(iter/s)": 0.202584 }, { "acc": 0.732828, "epoch": 0.13318112633181126, "grad_norm": 2.171875, "learning_rate": 9.992476743658165e-06, "loss": 1.07250195, "memory(GiB)": 368.61, "step": 5250, "train_speed(iter/s)": 0.202596 }, { "acc": 0.75162206, "epoch": 0.13330796549974633, "grad_norm": 2.03125, "learning_rate": 9.992419131040803e-06, "loss": 1.02664728, "memory(GiB)": 368.61, "step": 5255, "train_speed(iter/s)": 0.202613 }, { "acc": 0.74871926, "epoch": 0.13343480466768137, "grad_norm": 2.234375, "learning_rate": 9.992361298834555e-06, "loss": 1.00895786, "memory(GiB)": 368.61, "step": 5260, "train_speed(iter/s)": 0.202645 }, { "acc": 0.73138847, "epoch": 0.13356164383561644, "grad_norm": 2.390625, "learning_rate": 9.99230324704197e-06, "loss": 1.11082649, "memory(GiB)": 368.61, "step": 5265, "train_speed(iter/s)": 0.202667 }, { "acc": 0.74052291, "epoch": 0.1336884830035515, "grad_norm": 2.46875, "learning_rate": 9.992244975665598e-06, "loss": 1.05461597, "memory(GiB)": 368.61, "step": 5270, "train_speed(iter/s)": 0.202694 }, { "acc": 0.73484125, "epoch": 0.13381532217148656, "grad_norm": 2.046875, "learning_rate": 9.992186484708003e-06, "loss": 1.09189148, "memory(GiB)": 368.61, "step": 5275, "train_speed(iter/s)": 0.202722 }, { "acc": 0.74196315, "epoch": 0.1339421613394216, "grad_norm": 2.5625, "learning_rate": 9.992127774171759e-06, "loss": 1.03337631, "memory(GiB)": 368.61, "step": 5280, "train_speed(iter/s)": 0.20275 }, { "acc": 0.72085552, "epoch": 0.13406900050735668, "grad_norm": 2.328125, "learning_rate": 9.992068844059446e-06, "loss": 1.20064726, "memory(GiB)": 368.61, "step": 5285, "train_speed(iter/s)": 0.202782 }, { "acc": 0.72362814, "epoch": 0.13419583967529172, "grad_norm": 2.859375, "learning_rate": 9.992009694373658e-06, "loss": 1.12500772, "memory(GiB)": 368.61, "step": 5290, "train_speed(iter/s)": 0.202803 }, { "acc": 0.73421135, "epoch": 0.1343226788432268, "grad_norm": 2.9375, "learning_rate": 9.991950325116995e-06, "loss": 1.13016109, "memory(GiB)": 368.61, "step": 5295, "train_speed(iter/s)": 0.202819 }, { "acc": 0.73061638, "epoch": 0.13444951801116184, "grad_norm": 2.21875, "learning_rate": 9.99189073629207e-06, "loss": 1.05364666, "memory(GiB)": 368.61, "step": 5300, "train_speed(iter/s)": 0.202831 }, { "acc": 0.72792649, "epoch": 0.1345763571790969, "grad_norm": 2.625, "learning_rate": 9.991830927901505e-06, "loss": 1.1445714, "memory(GiB)": 368.61, "step": 5305, "train_speed(iter/s)": 0.20285 }, { "acc": 0.73917365, "epoch": 0.13470319634703196, "grad_norm": 2.203125, "learning_rate": 9.991770899947925e-06, "loss": 1.03481369, "memory(GiB)": 368.61, "step": 5310, "train_speed(iter/s)": 0.202874 }, { "acc": 0.73898277, "epoch": 0.13483003551496703, "grad_norm": 2.875, "learning_rate": 9.991710652433977e-06, "loss": 1.08651066, "memory(GiB)": 368.61, "step": 5315, "train_speed(iter/s)": 0.202911 }, { "acc": 0.72752023, "epoch": 0.13495687468290207, "grad_norm": 2.109375, "learning_rate": 9.991650185362308e-06, "loss": 1.09908867, "memory(GiB)": 368.61, "step": 5320, "train_speed(iter/s)": 0.202943 }, { "acc": 0.73789387, "epoch": 0.13508371385083714, "grad_norm": 2.34375, "learning_rate": 9.991589498735577e-06, "loss": 1.0346941, "memory(GiB)": 368.61, "step": 5325, "train_speed(iter/s)": 0.202971 }, { "acc": 0.74281635, "epoch": 0.1352105530187722, "grad_norm": 2.5625, "learning_rate": 9.991528592556454e-06, "loss": 1.06013489, "memory(GiB)": 368.61, "step": 5330, "train_speed(iter/s)": 0.203002 }, { "acc": 0.72102442, "epoch": 0.13533739218670726, "grad_norm": 2.53125, "learning_rate": 9.991467466827618e-06, "loss": 1.15959692, "memory(GiB)": 368.61, "step": 5335, "train_speed(iter/s)": 0.20303 }, { "acc": 0.73357706, "epoch": 0.1354642313546423, "grad_norm": 2.5, "learning_rate": 9.99140612155176e-06, "loss": 1.07843914, "memory(GiB)": 368.61, "step": 5340, "train_speed(iter/s)": 0.203057 }, { "acc": 0.72266226, "epoch": 0.13559107052257738, "grad_norm": 2.484375, "learning_rate": 9.991344556731572e-06, "loss": 1.10120163, "memory(GiB)": 368.61, "step": 5345, "train_speed(iter/s)": 0.203098 }, { "acc": 0.74451418, "epoch": 0.13571790969051242, "grad_norm": 2.0625, "learning_rate": 9.991282772369766e-06, "loss": 1.03823805, "memory(GiB)": 368.61, "step": 5350, "train_speed(iter/s)": 0.20313 }, { "acc": 0.73434906, "epoch": 0.1358447488584475, "grad_norm": 2.46875, "learning_rate": 9.99122076846906e-06, "loss": 1.06131124, "memory(GiB)": 368.61, "step": 5355, "train_speed(iter/s)": 0.203164 }, { "acc": 0.7441915, "epoch": 0.13597158802638254, "grad_norm": 2.1875, "learning_rate": 9.991158545032181e-06, "loss": 1.07264805, "memory(GiB)": 368.61, "step": 5360, "train_speed(iter/s)": 0.203202 }, { "acc": 0.74688339, "epoch": 0.1360984271943176, "grad_norm": 2.1875, "learning_rate": 9.991096102061865e-06, "loss": 1.06796455, "memory(GiB)": 368.61, "step": 5365, "train_speed(iter/s)": 0.20324 }, { "acc": 0.73786564, "epoch": 0.13622526636225266, "grad_norm": 2.765625, "learning_rate": 9.991033439560858e-06, "loss": 1.068363, "memory(GiB)": 368.61, "step": 5370, "train_speed(iter/s)": 0.203278 }, { "acc": 0.72452121, "epoch": 0.13635210553018773, "grad_norm": 2.0625, "learning_rate": 9.990970557531918e-06, "loss": 1.12915716, "memory(GiB)": 368.61, "step": 5375, "train_speed(iter/s)": 0.203306 }, { "acc": 0.74911337, "epoch": 0.13647894469812277, "grad_norm": 2.375, "learning_rate": 9.990907455977809e-06, "loss": 1.05254078, "memory(GiB)": 368.61, "step": 5380, "train_speed(iter/s)": 0.203329 }, { "acc": 0.72420082, "epoch": 0.13660578386605784, "grad_norm": 1.890625, "learning_rate": 9.990844134901308e-06, "loss": 1.11539879, "memory(GiB)": 368.61, "step": 5385, "train_speed(iter/s)": 0.203353 }, { "acc": 0.74961476, "epoch": 0.1367326230339929, "grad_norm": 2.125, "learning_rate": 9.9907805943052e-06, "loss": 1.05759678, "memory(GiB)": 368.61, "step": 5390, "train_speed(iter/s)": 0.20337 }, { "acc": 0.73516817, "epoch": 0.13685946220192796, "grad_norm": 1.9375, "learning_rate": 9.990716834192278e-06, "loss": 1.01977043, "memory(GiB)": 368.61, "step": 5395, "train_speed(iter/s)": 0.203388 }, { "acc": 0.72944136, "epoch": 0.136986301369863, "grad_norm": 2.21875, "learning_rate": 9.990652854565348e-06, "loss": 1.08127432, "memory(GiB)": 368.61, "step": 5400, "train_speed(iter/s)": 0.203421 }, { "acc": 0.72651949, "epoch": 0.13711314053779808, "grad_norm": 2.109375, "learning_rate": 9.990588655427225e-06, "loss": 1.12899361, "memory(GiB)": 368.61, "step": 5405, "train_speed(iter/s)": 0.203446 }, { "acc": 0.74128647, "epoch": 0.13723997970573312, "grad_norm": 2.46875, "learning_rate": 9.99052423678073e-06, "loss": 1.07728167, "memory(GiB)": 368.61, "step": 5410, "train_speed(iter/s)": 0.203481 }, { "acc": 0.74640064, "epoch": 0.1373668188736682, "grad_norm": 2.28125, "learning_rate": 9.990459598628697e-06, "loss": 1.01153097, "memory(GiB)": 368.61, "step": 5415, "train_speed(iter/s)": 0.203523 }, { "acc": 0.73067722, "epoch": 0.13749365804160324, "grad_norm": 1.875, "learning_rate": 9.990394740973972e-06, "loss": 1.00768337, "memory(GiB)": 368.61, "step": 5420, "train_speed(iter/s)": 0.203522 }, { "acc": 0.7339304, "epoch": 0.1376204972095383, "grad_norm": 2.375, "learning_rate": 9.990329663819405e-06, "loss": 1.11423969, "memory(GiB)": 368.61, "step": 5425, "train_speed(iter/s)": 0.203553 }, { "acc": 0.74262538, "epoch": 0.13774733637747336, "grad_norm": 2.25, "learning_rate": 9.99026436716786e-06, "loss": 1.02871933, "memory(GiB)": 368.61, "step": 5430, "train_speed(iter/s)": 0.203573 }, { "acc": 0.73128834, "epoch": 0.13787417554540843, "grad_norm": 1.8046875, "learning_rate": 9.990198851022207e-06, "loss": 1.08732986, "memory(GiB)": 368.61, "step": 5435, "train_speed(iter/s)": 0.203601 }, { "acc": 0.73805676, "epoch": 0.13800101471334347, "grad_norm": 2.15625, "learning_rate": 9.99013311538533e-06, "loss": 1.04493847, "memory(GiB)": 368.61, "step": 5440, "train_speed(iter/s)": 0.203629 }, { "acc": 0.73905258, "epoch": 0.13812785388127855, "grad_norm": 2.40625, "learning_rate": 9.99006716026012e-06, "loss": 1.10283976, "memory(GiB)": 368.61, "step": 5445, "train_speed(iter/s)": 0.203641 }, { "acc": 0.73302913, "epoch": 0.1382546930492136, "grad_norm": 1.90625, "learning_rate": 9.990000985649475e-06, "loss": 1.10595551, "memory(GiB)": 368.61, "step": 5450, "train_speed(iter/s)": 0.203665 }, { "acc": 0.72930002, "epoch": 0.13838153221714866, "grad_norm": 2.09375, "learning_rate": 9.989934591556308e-06, "loss": 1.09892101, "memory(GiB)": 368.61, "step": 5455, "train_speed(iter/s)": 0.203695 }, { "acc": 0.71855536, "epoch": 0.1385083713850837, "grad_norm": 2.59375, "learning_rate": 9.98986797798354e-06, "loss": 1.15458393, "memory(GiB)": 368.61, "step": 5460, "train_speed(iter/s)": 0.203715 }, { "acc": 0.73860407, "epoch": 0.13863521055301878, "grad_norm": 2.484375, "learning_rate": 9.989801144934102e-06, "loss": 1.05976524, "memory(GiB)": 368.61, "step": 5465, "train_speed(iter/s)": 0.203737 }, { "acc": 0.74032497, "epoch": 0.13876204972095382, "grad_norm": 2.40625, "learning_rate": 9.98973409241093e-06, "loss": 1.10245399, "memory(GiB)": 368.61, "step": 5470, "train_speed(iter/s)": 0.203772 }, { "acc": 0.72685146, "epoch": 0.1388888888888889, "grad_norm": 2.171875, "learning_rate": 9.989666820416974e-06, "loss": 1.09536095, "memory(GiB)": 368.61, "step": 5475, "train_speed(iter/s)": 0.203811 }, { "acc": 0.73340201, "epoch": 0.13901572805682394, "grad_norm": 2.875, "learning_rate": 9.989599328955195e-06, "loss": 1.1098671, "memory(GiB)": 368.61, "step": 5480, "train_speed(iter/s)": 0.203829 }, { "acc": 0.73820057, "epoch": 0.139142567224759, "grad_norm": 1.828125, "learning_rate": 9.98953161802856e-06, "loss": 1.05588436, "memory(GiB)": 368.61, "step": 5485, "train_speed(iter/s)": 0.203843 }, { "acc": 0.74829283, "epoch": 0.13926940639269406, "grad_norm": 2.578125, "learning_rate": 9.98946368764005e-06, "loss": 0.99326572, "memory(GiB)": 368.61, "step": 5490, "train_speed(iter/s)": 0.203863 }, { "acc": 0.75197544, "epoch": 0.13939624556062913, "grad_norm": 2.484375, "learning_rate": 9.989395537792647e-06, "loss": 1.05588503, "memory(GiB)": 368.61, "step": 5495, "train_speed(iter/s)": 0.203897 }, { "acc": 0.73112726, "epoch": 0.13952308472856417, "grad_norm": 2.03125, "learning_rate": 9.989327168489356e-06, "loss": 1.08248816, "memory(GiB)": 368.61, "step": 5500, "train_speed(iter/s)": 0.203912 }, { "acc": 0.73163419, "epoch": 0.13964992389649925, "grad_norm": 2.109375, "learning_rate": 9.989258579733179e-06, "loss": 1.11124077, "memory(GiB)": 368.61, "step": 5505, "train_speed(iter/s)": 0.203928 }, { "acc": 0.73279495, "epoch": 0.1397767630644343, "grad_norm": 2.015625, "learning_rate": 9.989189771527133e-06, "loss": 1.03788033, "memory(GiB)": 368.61, "step": 5510, "train_speed(iter/s)": 0.203947 }, { "acc": 0.73604221, "epoch": 0.13990360223236936, "grad_norm": 2.234375, "learning_rate": 9.989120743874248e-06, "loss": 1.07855816, "memory(GiB)": 368.61, "step": 5515, "train_speed(iter/s)": 0.203972 }, { "acc": 0.72785316, "epoch": 0.1400304414003044, "grad_norm": 3.125, "learning_rate": 9.989051496777556e-06, "loss": 1.17630148, "memory(GiB)": 368.61, "step": 5520, "train_speed(iter/s)": 0.204013 }, { "acc": 0.722616, "epoch": 0.14015728056823948, "grad_norm": 2.453125, "learning_rate": 9.988982030240104e-06, "loss": 1.11574764, "memory(GiB)": 368.61, "step": 5525, "train_speed(iter/s)": 0.204024 }, { "acc": 0.72898965, "epoch": 0.14028411973617452, "grad_norm": 2.421875, "learning_rate": 9.988912344264949e-06, "loss": 1.12347507, "memory(GiB)": 368.61, "step": 5530, "train_speed(iter/s)": 0.20402 }, { "acc": 0.7381628, "epoch": 0.1404109589041096, "grad_norm": 1.953125, "learning_rate": 9.988842438855156e-06, "loss": 1.04669895, "memory(GiB)": 368.61, "step": 5535, "train_speed(iter/s)": 0.204042 }, { "acc": 0.72813015, "epoch": 0.14053779807204464, "grad_norm": 2.53125, "learning_rate": 9.988772314013799e-06, "loss": 1.09414101, "memory(GiB)": 368.61, "step": 5540, "train_speed(iter/s)": 0.204065 }, { "acc": 0.73794842, "epoch": 0.1406646372399797, "grad_norm": 2.265625, "learning_rate": 9.988701969743961e-06, "loss": 1.07644701, "memory(GiB)": 368.61, "step": 5545, "train_speed(iter/s)": 0.204105 }, { "acc": 0.73608751, "epoch": 0.14079147640791476, "grad_norm": 1.75, "learning_rate": 9.98863140604874e-06, "loss": 1.06742373, "memory(GiB)": 368.61, "step": 5550, "train_speed(iter/s)": 0.204133 }, { "acc": 0.73570743, "epoch": 0.14091831557584983, "grad_norm": 2.046875, "learning_rate": 9.988560622931233e-06, "loss": 1.06472759, "memory(GiB)": 368.61, "step": 5555, "train_speed(iter/s)": 0.204156 }, { "acc": 0.7465097, "epoch": 0.14104515474378487, "grad_norm": 1.96875, "learning_rate": 9.988489620394562e-06, "loss": 1.06010571, "memory(GiB)": 368.61, "step": 5560, "train_speed(iter/s)": 0.204184 }, { "acc": 0.75340643, "epoch": 0.14117199391171995, "grad_norm": 1.9921875, "learning_rate": 9.988418398441842e-06, "loss": 1.02390423, "memory(GiB)": 368.61, "step": 5565, "train_speed(iter/s)": 0.204223 }, { "acc": 0.71866126, "epoch": 0.141298833079655, "grad_norm": 2.484375, "learning_rate": 9.98834695707621e-06, "loss": 1.18506222, "memory(GiB)": 368.61, "step": 5570, "train_speed(iter/s)": 0.204261 }, { "acc": 0.73444052, "epoch": 0.14142567224759006, "grad_norm": 1.921875, "learning_rate": 9.98827529630081e-06, "loss": 1.08799572, "memory(GiB)": 368.61, "step": 5575, "train_speed(iter/s)": 0.204299 }, { "acc": 0.74279194, "epoch": 0.1415525114155251, "grad_norm": 1.9765625, "learning_rate": 9.988203416118788e-06, "loss": 1.00565786, "memory(GiB)": 368.61, "step": 5580, "train_speed(iter/s)": 0.204316 }, { "acc": 0.73796301, "epoch": 0.14167935058346018, "grad_norm": 2.09375, "learning_rate": 9.98813131653331e-06, "loss": 1.07064629, "memory(GiB)": 368.61, "step": 5585, "train_speed(iter/s)": 0.20435 }, { "acc": 0.7289855, "epoch": 0.14180618975139522, "grad_norm": 2.0, "learning_rate": 9.988058997547548e-06, "loss": 1.07333899, "memory(GiB)": 368.61, "step": 5590, "train_speed(iter/s)": 0.204359 }, { "acc": 0.73357487, "epoch": 0.1419330289193303, "grad_norm": 2.609375, "learning_rate": 9.987986459164678e-06, "loss": 1.14836655, "memory(GiB)": 368.61, "step": 5595, "train_speed(iter/s)": 0.204395 }, { "acc": 0.72988825, "epoch": 0.14205986808726534, "grad_norm": 2.65625, "learning_rate": 9.987913701387897e-06, "loss": 1.1034193, "memory(GiB)": 368.61, "step": 5600, "train_speed(iter/s)": 0.20438 }, { "acc": 0.7280591, "epoch": 0.1421867072552004, "grad_norm": 2.046875, "learning_rate": 9.9878407242204e-06, "loss": 1.10228386, "memory(GiB)": 368.61, "step": 5605, "train_speed(iter/s)": 0.204408 }, { "acc": 0.73018088, "epoch": 0.14231354642313546, "grad_norm": 2.296875, "learning_rate": 9.9877675276654e-06, "loss": 1.05373116, "memory(GiB)": 368.61, "step": 5610, "train_speed(iter/s)": 0.204423 }, { "acc": 0.73741207, "epoch": 0.14244038559107053, "grad_norm": 2.046875, "learning_rate": 9.987694111726114e-06, "loss": 1.07452736, "memory(GiB)": 368.61, "step": 5615, "train_speed(iter/s)": 0.204446 }, { "acc": 0.750211, "epoch": 0.14256722475900557, "grad_norm": 1.96875, "learning_rate": 9.987620476405774e-06, "loss": 1.05202789, "memory(GiB)": 368.61, "step": 5620, "train_speed(iter/s)": 0.204478 }, { "acc": 0.74349618, "epoch": 0.14269406392694065, "grad_norm": 2.09375, "learning_rate": 9.987546621707616e-06, "loss": 1.02799511, "memory(GiB)": 368.61, "step": 5625, "train_speed(iter/s)": 0.204502 }, { "acc": 0.72994328, "epoch": 0.1428209030948757, "grad_norm": 2.328125, "learning_rate": 9.98747254763489e-06, "loss": 1.10899487, "memory(GiB)": 368.61, "step": 5630, "train_speed(iter/s)": 0.204537 }, { "acc": 0.72683721, "epoch": 0.14294774226281076, "grad_norm": 2.46875, "learning_rate": 9.987398254190855e-06, "loss": 1.11955566, "memory(GiB)": 368.61, "step": 5635, "train_speed(iter/s)": 0.204564 }, { "acc": 0.73415709, "epoch": 0.1430745814307458, "grad_norm": 2.25, "learning_rate": 9.987323741378777e-06, "loss": 1.11011372, "memory(GiB)": 368.61, "step": 5640, "train_speed(iter/s)": 0.204593 }, { "acc": 0.74953318, "epoch": 0.14320142059868088, "grad_norm": 2.546875, "learning_rate": 9.987249009201934e-06, "loss": 1.000525, "memory(GiB)": 368.61, "step": 5645, "train_speed(iter/s)": 0.204618 }, { "acc": 0.73807755, "epoch": 0.14332825976661592, "grad_norm": 2.5625, "learning_rate": 9.987174057663613e-06, "loss": 1.05090714, "memory(GiB)": 368.61, "step": 5650, "train_speed(iter/s)": 0.204641 }, { "acc": 0.72988439, "epoch": 0.143455098934551, "grad_norm": 2.421875, "learning_rate": 9.987098886767111e-06, "loss": 1.0758357, "memory(GiB)": 368.61, "step": 5655, "train_speed(iter/s)": 0.20467 }, { "acc": 0.72039204, "epoch": 0.14358193810248604, "grad_norm": 2.296875, "learning_rate": 9.987023496515734e-06, "loss": 1.13254051, "memory(GiB)": 368.61, "step": 5660, "train_speed(iter/s)": 0.204675 }, { "acc": 0.73670263, "epoch": 0.1437087772704211, "grad_norm": 2.828125, "learning_rate": 9.9869478869128e-06, "loss": 1.1095171, "memory(GiB)": 368.61, "step": 5665, "train_speed(iter/s)": 0.204712 }, { "acc": 0.72893519, "epoch": 0.14383561643835616, "grad_norm": 2.171875, "learning_rate": 9.98687205796163e-06, "loss": 1.12208014, "memory(GiB)": 368.61, "step": 5670, "train_speed(iter/s)": 0.204699 }, { "acc": 0.73782334, "epoch": 0.14396245560629123, "grad_norm": 2.296875, "learning_rate": 9.986796009665562e-06, "loss": 1.02920895, "memory(GiB)": 368.61, "step": 5675, "train_speed(iter/s)": 0.204726 }, { "acc": 0.74092083, "epoch": 0.14408929477422627, "grad_norm": 1.8359375, "learning_rate": 9.986719742027944e-06, "loss": 1.00523567, "memory(GiB)": 368.61, "step": 5680, "train_speed(iter/s)": 0.204756 }, { "acc": 0.73869209, "epoch": 0.14421613394216135, "grad_norm": 2.25, "learning_rate": 9.986643255052125e-06, "loss": 1.06701908, "memory(GiB)": 368.61, "step": 5685, "train_speed(iter/s)": 0.204794 }, { "acc": 0.73085222, "epoch": 0.1443429731100964, "grad_norm": 2.0, "learning_rate": 9.986566548741473e-06, "loss": 1.07387352, "memory(GiB)": 368.61, "step": 5690, "train_speed(iter/s)": 0.204829 }, { "acc": 0.74542732, "epoch": 0.14446981227803146, "grad_norm": 2.28125, "learning_rate": 9.98648962309936e-06, "loss": 1.05299339, "memory(GiB)": 368.61, "step": 5695, "train_speed(iter/s)": 0.204832 }, { "acc": 0.74126873, "epoch": 0.1445966514459665, "grad_norm": 2.265625, "learning_rate": 9.986412478129171e-06, "loss": 1.01359282, "memory(GiB)": 368.61, "step": 5700, "train_speed(iter/s)": 0.204863 }, { "acc": 0.74206028, "epoch": 0.14472349061390158, "grad_norm": 2.859375, "learning_rate": 9.9863351138343e-06, "loss": 1.06893349, "memory(GiB)": 368.61, "step": 5705, "train_speed(iter/s)": 0.204888 }, { "acc": 0.74345236, "epoch": 0.14485032978183662, "grad_norm": 2.078125, "learning_rate": 9.986257530218146e-06, "loss": 1.0215085, "memory(GiB)": 368.61, "step": 5710, "train_speed(iter/s)": 0.20492 }, { "acc": 0.72952957, "epoch": 0.1449771689497717, "grad_norm": 2.21875, "learning_rate": 9.986179727284124e-06, "loss": 1.0448595, "memory(GiB)": 368.61, "step": 5715, "train_speed(iter/s)": 0.204947 }, { "acc": 0.72056532, "epoch": 0.14510400811770674, "grad_norm": 2.765625, "learning_rate": 9.986101705035656e-06, "loss": 1.13139839, "memory(GiB)": 368.61, "step": 5720, "train_speed(iter/s)": 0.204958 }, { "acc": 0.7266839, "epoch": 0.1452308472856418, "grad_norm": 2.109375, "learning_rate": 9.986023463476175e-06, "loss": 1.12605095, "memory(GiB)": 368.61, "step": 5725, "train_speed(iter/s)": 0.204966 }, { "acc": 0.73994579, "epoch": 0.14535768645357686, "grad_norm": 2.078125, "learning_rate": 9.985945002609119e-06, "loss": 1.0409626, "memory(GiB)": 368.61, "step": 5730, "train_speed(iter/s)": 0.204995 }, { "acc": 0.73484435, "epoch": 0.14548452562151193, "grad_norm": 3.34375, "learning_rate": 9.985866322437942e-06, "loss": 1.11764536, "memory(GiB)": 368.61, "step": 5735, "train_speed(iter/s)": 0.205022 }, { "acc": 0.73375711, "epoch": 0.14561136478944697, "grad_norm": 2.8125, "learning_rate": 9.985787422966105e-06, "loss": 1.08106155, "memory(GiB)": 368.61, "step": 5740, "train_speed(iter/s)": 0.205041 }, { "acc": 0.74465446, "epoch": 0.14573820395738205, "grad_norm": 2.71875, "learning_rate": 9.985708304197075e-06, "loss": 1.13923311, "memory(GiB)": 368.61, "step": 5745, "train_speed(iter/s)": 0.205067 }, { "acc": 0.73917637, "epoch": 0.1458650431253171, "grad_norm": 2.203125, "learning_rate": 9.985628966134336e-06, "loss": 1.09034958, "memory(GiB)": 368.61, "step": 5750, "train_speed(iter/s)": 0.205104 }, { "acc": 0.73785224, "epoch": 0.14599188229325216, "grad_norm": 2.734375, "learning_rate": 9.985549408781377e-06, "loss": 1.05894566, "memory(GiB)": 368.61, "step": 5755, "train_speed(iter/s)": 0.205105 }, { "acc": 0.72909441, "epoch": 0.1461187214611872, "grad_norm": 1.9765625, "learning_rate": 9.985469632141693e-06, "loss": 1.13491459, "memory(GiB)": 368.61, "step": 5760, "train_speed(iter/s)": 0.205129 }, { "acc": 0.72845688, "epoch": 0.14624556062912228, "grad_norm": 2.125, "learning_rate": 9.985389636218797e-06, "loss": 1.10054426, "memory(GiB)": 368.61, "step": 5765, "train_speed(iter/s)": 0.205147 }, { "acc": 0.73294945, "epoch": 0.14637239979705732, "grad_norm": 2.328125, "learning_rate": 9.985309421016207e-06, "loss": 1.06650181, "memory(GiB)": 368.61, "step": 5770, "train_speed(iter/s)": 0.205176 }, { "acc": 0.73993864, "epoch": 0.1464992389649924, "grad_norm": 2.25, "learning_rate": 9.985228986537451e-06, "loss": 1.08289394, "memory(GiB)": 368.61, "step": 5775, "train_speed(iter/s)": 0.205209 }, { "acc": 0.73399458, "epoch": 0.14662607813292744, "grad_norm": 2.15625, "learning_rate": 9.985148332786068e-06, "loss": 1.05821314, "memory(GiB)": 368.61, "step": 5780, "train_speed(iter/s)": 0.205233 }, { "acc": 0.75296926, "epoch": 0.1467529173008625, "grad_norm": 2.5, "learning_rate": 9.985067459765603e-06, "loss": 1.058708, "memory(GiB)": 368.61, "step": 5785, "train_speed(iter/s)": 0.205263 }, { "acc": 0.73656878, "epoch": 0.14687975646879756, "grad_norm": 2.1875, "learning_rate": 9.984986367479615e-06, "loss": 1.08167648, "memory(GiB)": 368.61, "step": 5790, "train_speed(iter/s)": 0.205283 }, { "acc": 0.7425457, "epoch": 0.14700659563673263, "grad_norm": 2.21875, "learning_rate": 9.984905055931668e-06, "loss": 1.04830685, "memory(GiB)": 368.61, "step": 5795, "train_speed(iter/s)": 0.205316 }, { "acc": 0.71194277, "epoch": 0.14713343480466767, "grad_norm": 2.28125, "learning_rate": 9.984823525125342e-06, "loss": 1.14045944, "memory(GiB)": 368.61, "step": 5800, "train_speed(iter/s)": 0.205354 }, { "acc": 0.72828484, "epoch": 0.14726027397260275, "grad_norm": 2.078125, "learning_rate": 9.984741775064222e-06, "loss": 1.11615248, "memory(GiB)": 368.61, "step": 5805, "train_speed(iter/s)": 0.205382 }, { "acc": 0.71543131, "epoch": 0.1473871131405378, "grad_norm": 2.0, "learning_rate": 9.984659805751904e-06, "loss": 1.12537022, "memory(GiB)": 368.61, "step": 5810, "train_speed(iter/s)": 0.205406 }, { "acc": 0.73992634, "epoch": 0.14751395230847286, "grad_norm": 2.078125, "learning_rate": 9.984577617191993e-06, "loss": 1.093011, "memory(GiB)": 368.61, "step": 5815, "train_speed(iter/s)": 0.205424 }, { "acc": 0.72705803, "epoch": 0.1476407914764079, "grad_norm": 2.546875, "learning_rate": 9.984495209388102e-06, "loss": 1.12771778, "memory(GiB)": 368.61, "step": 5820, "train_speed(iter/s)": 0.205458 }, { "acc": 0.73185544, "epoch": 0.14776763064434298, "grad_norm": 2.796875, "learning_rate": 9.984412582343859e-06, "loss": 1.11665211, "memory(GiB)": 368.61, "step": 5825, "train_speed(iter/s)": 0.205489 }, { "acc": 0.73624039, "epoch": 0.14789446981227802, "grad_norm": 2.46875, "learning_rate": 9.984329736062896e-06, "loss": 1.02688274, "memory(GiB)": 368.61, "step": 5830, "train_speed(iter/s)": 0.205516 }, { "acc": 0.72525539, "epoch": 0.1480213089802131, "grad_norm": 2.1875, "learning_rate": 9.984246670548858e-06, "loss": 1.09424744, "memory(GiB)": 368.61, "step": 5835, "train_speed(iter/s)": 0.205537 }, { "acc": 0.74052401, "epoch": 0.14814814814814814, "grad_norm": 2.1875, "learning_rate": 9.984163385805398e-06, "loss": 1.10474052, "memory(GiB)": 368.61, "step": 5840, "train_speed(iter/s)": 0.20556 }, { "acc": 0.73790846, "epoch": 0.1482749873160832, "grad_norm": 2.90625, "learning_rate": 9.984079881836182e-06, "loss": 1.09409952, "memory(GiB)": 368.61, "step": 5845, "train_speed(iter/s)": 0.205598 }, { "acc": 0.73598933, "epoch": 0.14840182648401826, "grad_norm": 2.46875, "learning_rate": 9.983996158644877e-06, "loss": 1.0527626, "memory(GiB)": 368.61, "step": 5850, "train_speed(iter/s)": 0.205619 }, { "acc": 0.7396822, "epoch": 0.14852866565195333, "grad_norm": 1.8125, "learning_rate": 9.983912216235172e-06, "loss": 0.98610058, "memory(GiB)": 368.61, "step": 5855, "train_speed(iter/s)": 0.205642 }, { "acc": 0.73419123, "epoch": 0.14865550481988837, "grad_norm": 2.734375, "learning_rate": 9.983828054610754e-06, "loss": 1.13580275, "memory(GiB)": 368.61, "step": 5860, "train_speed(iter/s)": 0.205666 }, { "acc": 0.7401701, "epoch": 0.14878234398782345, "grad_norm": 2.421875, "learning_rate": 9.983743673775328e-06, "loss": 1.10592651, "memory(GiB)": 368.61, "step": 5865, "train_speed(iter/s)": 0.205673 }, { "acc": 0.74357824, "epoch": 0.1489091831557585, "grad_norm": 2.625, "learning_rate": 9.983659073732604e-06, "loss": 1.06428127, "memory(GiB)": 368.61, "step": 5870, "train_speed(iter/s)": 0.205713 }, { "acc": 0.73415527, "epoch": 0.14903602232369356, "grad_norm": 2.765625, "learning_rate": 9.983574254486303e-06, "loss": 1.09815693, "memory(GiB)": 368.61, "step": 5875, "train_speed(iter/s)": 0.205742 }, { "acc": 0.73954086, "epoch": 0.1491628614916286, "grad_norm": 1.96875, "learning_rate": 9.983489216040158e-06, "loss": 1.0727067, "memory(GiB)": 368.61, "step": 5880, "train_speed(iter/s)": 0.20577 }, { "acc": 0.74192667, "epoch": 0.14928970065956368, "grad_norm": 1.984375, "learning_rate": 9.983403958397907e-06, "loss": 1.06694603, "memory(GiB)": 368.61, "step": 5885, "train_speed(iter/s)": 0.205808 }, { "acc": 0.72736273, "epoch": 0.14941653982749872, "grad_norm": 2.34375, "learning_rate": 9.9833184815633e-06, "loss": 1.20730762, "memory(GiB)": 368.61, "step": 5890, "train_speed(iter/s)": 0.205831 }, { "acc": 0.73350801, "epoch": 0.1495433789954338, "grad_norm": 2.09375, "learning_rate": 9.983232785540097e-06, "loss": 1.11818027, "memory(GiB)": 368.61, "step": 5895, "train_speed(iter/s)": 0.20585 }, { "acc": 0.7258317, "epoch": 0.14967021816336884, "grad_norm": 2.5625, "learning_rate": 9.983146870332068e-06, "loss": 1.09937382, "memory(GiB)": 368.61, "step": 5900, "train_speed(iter/s)": 0.205877 }, { "acc": 0.7318141, "epoch": 0.1497970573313039, "grad_norm": 3.0625, "learning_rate": 9.98306073594299e-06, "loss": 1.1325655, "memory(GiB)": 368.61, "step": 5905, "train_speed(iter/s)": 0.205909 }, { "acc": 0.74840131, "epoch": 0.14992389649923896, "grad_norm": 2.5625, "learning_rate": 9.982974382376656e-06, "loss": 1.04089775, "memory(GiB)": 368.61, "step": 5910, "train_speed(iter/s)": 0.205929 }, { "acc": 0.7428587, "epoch": 0.15005073566717403, "grad_norm": 2.421875, "learning_rate": 9.98288780963686e-06, "loss": 1.06921835, "memory(GiB)": 368.61, "step": 5915, "train_speed(iter/s)": 0.205945 }, { "acc": 0.73227997, "epoch": 0.15017757483510907, "grad_norm": 1.9609375, "learning_rate": 9.98280101772741e-06, "loss": 1.11730356, "memory(GiB)": 368.61, "step": 5920, "train_speed(iter/s)": 0.20597 }, { "acc": 0.72484326, "epoch": 0.15030441400304415, "grad_norm": 2.046875, "learning_rate": 9.982714006652126e-06, "loss": 1.10188103, "memory(GiB)": 368.61, "step": 5925, "train_speed(iter/s)": 0.205986 }, { "acc": 0.73602977, "epoch": 0.1504312531709792, "grad_norm": 1.9921875, "learning_rate": 9.982626776414834e-06, "loss": 1.07228003, "memory(GiB)": 368.61, "step": 5930, "train_speed(iter/s)": 0.206004 }, { "acc": 0.72441287, "epoch": 0.15055809233891426, "grad_norm": 1.9765625, "learning_rate": 9.98253932701937e-06, "loss": 1.08609047, "memory(GiB)": 368.61, "step": 5935, "train_speed(iter/s)": 0.206021 }, { "acc": 0.73211861, "epoch": 0.1506849315068493, "grad_norm": 2.875, "learning_rate": 9.98245165846958e-06, "loss": 1.15057774, "memory(GiB)": 368.61, "step": 5940, "train_speed(iter/s)": 0.206048 }, { "acc": 0.7456058, "epoch": 0.15081177067478438, "grad_norm": 2.359375, "learning_rate": 9.982363770769323e-06, "loss": 1.08182364, "memory(GiB)": 368.61, "step": 5945, "train_speed(iter/s)": 0.206068 }, { "acc": 0.73035922, "epoch": 0.15093860984271942, "grad_norm": 2.265625, "learning_rate": 9.98227566392246e-06, "loss": 1.10162277, "memory(GiB)": 368.61, "step": 5950, "train_speed(iter/s)": 0.206077 }, { "acc": 0.73627548, "epoch": 0.1510654490106545, "grad_norm": 2.5, "learning_rate": 9.982187337932871e-06, "loss": 1.05095549, "memory(GiB)": 368.61, "step": 5955, "train_speed(iter/s)": 0.206108 }, { "acc": 0.72125406, "epoch": 0.15119228817858954, "grad_norm": 1.96875, "learning_rate": 9.98209879280444e-06, "loss": 1.12195358, "memory(GiB)": 368.61, "step": 5960, "train_speed(iter/s)": 0.20614 }, { "acc": 0.72953339, "epoch": 0.1513191273465246, "grad_norm": 2.671875, "learning_rate": 9.982010028541057e-06, "loss": 1.11204147, "memory(GiB)": 368.61, "step": 5965, "train_speed(iter/s)": 0.206165 }, { "acc": 0.73201346, "epoch": 0.15144596651445966, "grad_norm": 2.484375, "learning_rate": 9.981921045146633e-06, "loss": 1.03141193, "memory(GiB)": 368.61, "step": 5970, "train_speed(iter/s)": 0.206175 }, { "acc": 0.73809071, "epoch": 0.15157280568239473, "grad_norm": 2.171875, "learning_rate": 9.981831842625079e-06, "loss": 1.0463377, "memory(GiB)": 368.61, "step": 5975, "train_speed(iter/s)": 0.206209 }, { "acc": 0.72709665, "epoch": 0.15169964485032977, "grad_norm": 2.109375, "learning_rate": 9.981742420980316e-06, "loss": 1.11958027, "memory(GiB)": 368.61, "step": 5980, "train_speed(iter/s)": 0.206235 }, { "acc": 0.73694701, "epoch": 0.15182648401826485, "grad_norm": 2.125, "learning_rate": 9.981652780216281e-06, "loss": 1.03667603, "memory(GiB)": 368.61, "step": 5985, "train_speed(iter/s)": 0.206273 }, { "acc": 0.74966345, "epoch": 0.1519533231861999, "grad_norm": 2.671875, "learning_rate": 9.981562920336915e-06, "loss": 1.02287102, "memory(GiB)": 368.61, "step": 5990, "train_speed(iter/s)": 0.206299 }, { "acc": 0.71881781, "epoch": 0.15208016235413496, "grad_norm": 2.296875, "learning_rate": 9.98147284134617e-06, "loss": 1.13086834, "memory(GiB)": 368.61, "step": 5995, "train_speed(iter/s)": 0.206312 }, { "acc": 0.72761126, "epoch": 0.15220700152207, "grad_norm": 2.40625, "learning_rate": 9.981382543248011e-06, "loss": 1.13217449, "memory(GiB)": 368.61, "step": 6000, "train_speed(iter/s)": 0.206319 }, { "epoch": 0.15220700152207, "eval_acc": 0.7241244328286565, "eval_loss": 1.0425529479980469, "eval_runtime": 384.1732, "eval_samples_per_second": 16.581, "eval_steps_per_second": 8.291, "step": 6000 }, { "acc": 0.72478704, "epoch": 0.15233384069000508, "grad_norm": 2.515625, "learning_rate": 9.981292026046406e-06, "loss": 1.10239201, "memory(GiB)": 368.61, "step": 6005, "train_speed(iter/s)": 0.201421 }, { "acc": 0.73023119, "epoch": 0.15246067985794012, "grad_norm": 2.359375, "learning_rate": 9.981201289745337e-06, "loss": 1.08803968, "memory(GiB)": 368.61, "step": 6010, "train_speed(iter/s)": 0.201448 }, { "acc": 0.73504906, "epoch": 0.1525875190258752, "grad_norm": 2.203125, "learning_rate": 9.981110334348796e-06, "loss": 1.10517864, "memory(GiB)": 368.61, "step": 6015, "train_speed(iter/s)": 0.201482 }, { "acc": 0.74785981, "epoch": 0.15271435819381024, "grad_norm": 2.40625, "learning_rate": 9.981019159860782e-06, "loss": 1.04900742, "memory(GiB)": 368.61, "step": 6020, "train_speed(iter/s)": 0.201522 }, { "acc": 0.73935008, "epoch": 0.1528411973617453, "grad_norm": 2.09375, "learning_rate": 9.98092776628531e-06, "loss": 1.036376, "memory(GiB)": 368.61, "step": 6025, "train_speed(iter/s)": 0.201547 }, { "acc": 0.74961944, "epoch": 0.15296803652968036, "grad_norm": 2.03125, "learning_rate": 9.980836153626396e-06, "loss": 1.01315165, "memory(GiB)": 368.61, "step": 6030, "train_speed(iter/s)": 0.201574 }, { "acc": 0.74103851, "epoch": 0.15309487569761543, "grad_norm": 2.4375, "learning_rate": 9.980744321888068e-06, "loss": 1.0633358, "memory(GiB)": 368.61, "step": 6035, "train_speed(iter/s)": 0.201591 }, { "acc": 0.73017406, "epoch": 0.15322171486555047, "grad_norm": 2.078125, "learning_rate": 9.98065227107437e-06, "loss": 1.15877037, "memory(GiB)": 368.61, "step": 6040, "train_speed(iter/s)": 0.201615 }, { "acc": 0.73966637, "epoch": 0.15334855403348555, "grad_norm": 1.96875, "learning_rate": 9.980560001189346e-06, "loss": 1.088062, "memory(GiB)": 368.61, "step": 6045, "train_speed(iter/s)": 0.201632 }, { "acc": 0.71956758, "epoch": 0.1534753932014206, "grad_norm": 2.15625, "learning_rate": 9.980467512237058e-06, "loss": 1.13949709, "memory(GiB)": 368.61, "step": 6050, "train_speed(iter/s)": 0.201664 }, { "acc": 0.71833425, "epoch": 0.15360223236935566, "grad_norm": 2.5625, "learning_rate": 9.98037480422157e-06, "loss": 1.06782999, "memory(GiB)": 368.61, "step": 6055, "train_speed(iter/s)": 0.201685 }, { "acc": 0.73539033, "epoch": 0.1537290715372907, "grad_norm": 2.578125, "learning_rate": 9.980281877146964e-06, "loss": 1.10189762, "memory(GiB)": 368.61, "step": 6060, "train_speed(iter/s)": 0.20171 }, { "acc": 0.73447218, "epoch": 0.15385591070522578, "grad_norm": 2.09375, "learning_rate": 9.980188731017327e-06, "loss": 1.11537924, "memory(GiB)": 368.61, "step": 6065, "train_speed(iter/s)": 0.20174 }, { "acc": 0.74046121, "epoch": 0.15398274987316082, "grad_norm": 2.3125, "learning_rate": 9.980095365836753e-06, "loss": 1.02936077, "memory(GiB)": 368.61, "step": 6070, "train_speed(iter/s)": 0.201763 }, { "acc": 0.73510046, "epoch": 0.1541095890410959, "grad_norm": 2.453125, "learning_rate": 9.980001781609353e-06, "loss": 1.09446926, "memory(GiB)": 368.61, "step": 6075, "train_speed(iter/s)": 0.201783 }, { "acc": 0.72929783, "epoch": 0.15423642820903094, "grad_norm": 2.3125, "learning_rate": 9.979907978339236e-06, "loss": 1.08201618, "memory(GiB)": 368.61, "step": 6080, "train_speed(iter/s)": 0.20182 }, { "acc": 0.73835869, "epoch": 0.154363267376966, "grad_norm": 2.359375, "learning_rate": 9.979813956030535e-06, "loss": 1.04879246, "memory(GiB)": 368.61, "step": 6085, "train_speed(iter/s)": 0.201847 }, { "acc": 0.72995033, "epoch": 0.15449010654490106, "grad_norm": 2.25, "learning_rate": 9.979719714687384e-06, "loss": 1.03752718, "memory(GiB)": 368.61, "step": 6090, "train_speed(iter/s)": 0.201871 }, { "acc": 0.72877073, "epoch": 0.15461694571283613, "grad_norm": 2.015625, "learning_rate": 9.979625254313924e-06, "loss": 1.14433756, "memory(GiB)": 368.61, "step": 6095, "train_speed(iter/s)": 0.201892 }, { "acc": 0.72883644, "epoch": 0.15474378488077117, "grad_norm": 2.703125, "learning_rate": 9.979530574914316e-06, "loss": 1.13410149, "memory(GiB)": 368.61, "step": 6100, "train_speed(iter/s)": 0.201921 }, { "acc": 0.73672075, "epoch": 0.15487062404870625, "grad_norm": 2.421875, "learning_rate": 9.97943567649272e-06, "loss": 1.01763325, "memory(GiB)": 368.61, "step": 6105, "train_speed(iter/s)": 0.201932 }, { "acc": 0.73324528, "epoch": 0.1549974632166413, "grad_norm": 2.1875, "learning_rate": 9.979340559053311e-06, "loss": 1.03487167, "memory(GiB)": 368.61, "step": 6110, "train_speed(iter/s)": 0.201941 }, { "acc": 0.72808914, "epoch": 0.15512430238457636, "grad_norm": 1.9140625, "learning_rate": 9.979245222600273e-06, "loss": 1.09221573, "memory(GiB)": 368.61, "step": 6115, "train_speed(iter/s)": 0.20197 }, { "acc": 0.72994604, "epoch": 0.1552511415525114, "grad_norm": 2.09375, "learning_rate": 9.979149667137801e-06, "loss": 1.1121067, "memory(GiB)": 368.61, "step": 6120, "train_speed(iter/s)": 0.201999 }, { "acc": 0.72452941, "epoch": 0.15537798072044648, "grad_norm": 2.359375, "learning_rate": 9.979053892670094e-06, "loss": 1.10398159, "memory(GiB)": 368.61, "step": 6125, "train_speed(iter/s)": 0.202027 }, { "acc": 0.72929454, "epoch": 0.15550481988838152, "grad_norm": 2.0, "learning_rate": 9.978957899201369e-06, "loss": 1.13890743, "memory(GiB)": 368.61, "step": 6130, "train_speed(iter/s)": 0.202056 }, { "acc": 0.73939466, "epoch": 0.1556316590563166, "grad_norm": 1.96875, "learning_rate": 9.978861686735845e-06, "loss": 1.04979153, "memory(GiB)": 368.61, "step": 6135, "train_speed(iter/s)": 0.202081 }, { "acc": 0.73421025, "epoch": 0.15575849822425164, "grad_norm": 2.71875, "learning_rate": 9.978765255277756e-06, "loss": 1.07307167, "memory(GiB)": 368.61, "step": 6140, "train_speed(iter/s)": 0.202097 }, { "acc": 0.74383016, "epoch": 0.1558853373921867, "grad_norm": 2.0625, "learning_rate": 9.97866860483134e-06, "loss": 1.06742573, "memory(GiB)": 368.61, "step": 6145, "train_speed(iter/s)": 0.202128 }, { "acc": 0.74561663, "epoch": 0.15601217656012176, "grad_norm": 2.453125, "learning_rate": 9.978571735400853e-06, "loss": 1.06769123, "memory(GiB)": 368.61, "step": 6150, "train_speed(iter/s)": 0.202148 }, { "acc": 0.7247664, "epoch": 0.15613901572805683, "grad_norm": 2.359375, "learning_rate": 9.978474646990552e-06, "loss": 1.05741186, "memory(GiB)": 368.61, "step": 6155, "train_speed(iter/s)": 0.202166 }, { "acc": 0.7342145, "epoch": 0.15626585489599187, "grad_norm": 2.296875, "learning_rate": 9.97837733960471e-06, "loss": 1.07168579, "memory(GiB)": 368.61, "step": 6160, "train_speed(iter/s)": 0.202158 }, { "acc": 0.75577207, "epoch": 0.15639269406392695, "grad_norm": 2.546875, "learning_rate": 9.978279813247605e-06, "loss": 1.0419755, "memory(GiB)": 368.61, "step": 6165, "train_speed(iter/s)": 0.202188 }, { "acc": 0.75119886, "epoch": 0.156519533231862, "grad_norm": 2.09375, "learning_rate": 9.978182067923528e-06, "loss": 1.03686285, "memory(GiB)": 368.61, "step": 6170, "train_speed(iter/s)": 0.202209 }, { "acc": 0.72218218, "epoch": 0.15664637239979706, "grad_norm": 1.765625, "learning_rate": 9.978084103636778e-06, "loss": 1.16029682, "memory(GiB)": 368.61, "step": 6175, "train_speed(iter/s)": 0.202224 }, { "acc": 0.7411437, "epoch": 0.1567732115677321, "grad_norm": 2.46875, "learning_rate": 9.977985920391661e-06, "loss": 1.04030972, "memory(GiB)": 368.61, "step": 6180, "train_speed(iter/s)": 0.202255 }, { "acc": 0.73137884, "epoch": 0.15690005073566718, "grad_norm": 2.25, "learning_rate": 9.977887518192501e-06, "loss": 1.10289288, "memory(GiB)": 368.61, "step": 6185, "train_speed(iter/s)": 0.202277 }, { "acc": 0.71499681, "epoch": 0.15702688990360222, "grad_norm": 1.84375, "learning_rate": 9.977788897043622e-06, "loss": 1.15405159, "memory(GiB)": 368.61, "step": 6190, "train_speed(iter/s)": 0.202307 }, { "acc": 0.73384895, "epoch": 0.1571537290715373, "grad_norm": 2.296875, "learning_rate": 9.977690056949363e-06, "loss": 1.08978653, "memory(GiB)": 368.61, "step": 6195, "train_speed(iter/s)": 0.202324 }, { "acc": 0.73025045, "epoch": 0.15728056823947234, "grad_norm": 2.703125, "learning_rate": 9.977590997914072e-06, "loss": 1.07197285, "memory(GiB)": 368.61, "step": 6200, "train_speed(iter/s)": 0.202335 }, { "acc": 0.73196063, "epoch": 0.1574074074074074, "grad_norm": 2.203125, "learning_rate": 9.977491719942106e-06, "loss": 1.0692831, "memory(GiB)": 368.61, "step": 6205, "train_speed(iter/s)": 0.202348 }, { "acc": 0.73433332, "epoch": 0.15753424657534246, "grad_norm": 2.296875, "learning_rate": 9.97739222303783e-06, "loss": 1.08904428, "memory(GiB)": 368.61, "step": 6210, "train_speed(iter/s)": 0.202373 }, { "acc": 0.73780327, "epoch": 0.15766108574327753, "grad_norm": 2.15625, "learning_rate": 9.977292507205623e-06, "loss": 1.03608589, "memory(GiB)": 368.61, "step": 6215, "train_speed(iter/s)": 0.202397 }, { "acc": 0.71910839, "epoch": 0.15778792491121257, "grad_norm": 2.453125, "learning_rate": 9.977192572449868e-06, "loss": 1.1476861, "memory(GiB)": 368.61, "step": 6220, "train_speed(iter/s)": 0.202425 }, { "acc": 0.73878436, "epoch": 0.15791476407914765, "grad_norm": 2.515625, "learning_rate": 9.977092418774962e-06, "loss": 1.09867954, "memory(GiB)": 368.61, "step": 6225, "train_speed(iter/s)": 0.20246 }, { "acc": 0.73428521, "epoch": 0.1580416032470827, "grad_norm": 2.53125, "learning_rate": 9.976992046185313e-06, "loss": 1.11018848, "memory(GiB)": 368.61, "step": 6230, "train_speed(iter/s)": 0.202471 }, { "acc": 0.73637023, "epoch": 0.15816844241501776, "grad_norm": 1.9765625, "learning_rate": 9.97689145468533e-06, "loss": 1.06389742, "memory(GiB)": 368.61, "step": 6235, "train_speed(iter/s)": 0.202505 }, { "acc": 0.75117416, "epoch": 0.1582952815829528, "grad_norm": 2.265625, "learning_rate": 9.976790644279442e-06, "loss": 1.02124157, "memory(GiB)": 368.61, "step": 6240, "train_speed(iter/s)": 0.202531 }, { "acc": 0.74290457, "epoch": 0.15842212075088788, "grad_norm": 2.203125, "learning_rate": 9.976689614972082e-06, "loss": 1.02084732, "memory(GiB)": 368.61, "step": 6245, "train_speed(iter/s)": 0.20254 }, { "acc": 0.73146248, "epoch": 0.15854895991882292, "grad_norm": 3.1875, "learning_rate": 9.976588366767693e-06, "loss": 1.12418232, "memory(GiB)": 368.61, "step": 6250, "train_speed(iter/s)": 0.202569 }, { "acc": 0.73670826, "epoch": 0.158675799086758, "grad_norm": 2.46875, "learning_rate": 9.976486899670729e-06, "loss": 1.06113586, "memory(GiB)": 368.61, "step": 6255, "train_speed(iter/s)": 0.202583 }, { "acc": 0.74015255, "epoch": 0.15880263825469304, "grad_norm": 2.171875, "learning_rate": 9.976385213685652e-06, "loss": 1.05721378, "memory(GiB)": 368.61, "step": 6260, "train_speed(iter/s)": 0.20261 }, { "acc": 0.73819327, "epoch": 0.1589294774226281, "grad_norm": 2.3125, "learning_rate": 9.976283308816937e-06, "loss": 1.10813246, "memory(GiB)": 368.61, "step": 6265, "train_speed(iter/s)": 0.202641 }, { "acc": 0.74191008, "epoch": 0.15905631659056316, "grad_norm": 2.546875, "learning_rate": 9.976181185069063e-06, "loss": 1.07443933, "memory(GiB)": 368.61, "step": 6270, "train_speed(iter/s)": 0.202671 }, { "acc": 0.7422514, "epoch": 0.15918315575849823, "grad_norm": 1.765625, "learning_rate": 9.976078842446522e-06, "loss": 1.06363573, "memory(GiB)": 368.61, "step": 6275, "train_speed(iter/s)": 0.202692 }, { "acc": 0.72868223, "epoch": 0.15930999492643327, "grad_norm": 2.03125, "learning_rate": 9.97597628095382e-06, "loss": 1.20037832, "memory(GiB)": 368.61, "step": 6280, "train_speed(iter/s)": 0.20269 }, { "acc": 0.7284965, "epoch": 0.15943683409436835, "grad_norm": 2.265625, "learning_rate": 9.975873500595464e-06, "loss": 1.12769852, "memory(GiB)": 368.61, "step": 6285, "train_speed(iter/s)": 0.202714 }, { "acc": 0.73488173, "epoch": 0.1595636732623034, "grad_norm": 2.515625, "learning_rate": 9.975770501375974e-06, "loss": 1.11406155, "memory(GiB)": 368.61, "step": 6290, "train_speed(iter/s)": 0.202737 }, { "acc": 0.7392067, "epoch": 0.15969051243023846, "grad_norm": 2.25, "learning_rate": 9.975667283299884e-06, "loss": 1.0314476, "memory(GiB)": 368.61, "step": 6295, "train_speed(iter/s)": 0.202761 }, { "acc": 0.73591275, "epoch": 0.1598173515981735, "grad_norm": 2.59375, "learning_rate": 9.975563846371732e-06, "loss": 1.09228477, "memory(GiB)": 368.61, "step": 6300, "train_speed(iter/s)": 0.202784 }, { "acc": 0.73919225, "epoch": 0.15994419076610858, "grad_norm": 2.109375, "learning_rate": 9.975460190596068e-06, "loss": 1.04242992, "memory(GiB)": 368.61, "step": 6305, "train_speed(iter/s)": 0.2028 }, { "acc": 0.73177137, "epoch": 0.16007102993404362, "grad_norm": 3.109375, "learning_rate": 9.975356315977451e-06, "loss": 1.09168091, "memory(GiB)": 368.61, "step": 6310, "train_speed(iter/s)": 0.202826 }, { "acc": 0.73177862, "epoch": 0.1601978691019787, "grad_norm": 2.1875, "learning_rate": 9.975252222520449e-06, "loss": 1.07477827, "memory(GiB)": 368.61, "step": 6315, "train_speed(iter/s)": 0.202831 }, { "acc": 0.73077955, "epoch": 0.16032470826991374, "grad_norm": 1.96875, "learning_rate": 9.97514791022964e-06, "loss": 1.07231693, "memory(GiB)": 368.61, "step": 6320, "train_speed(iter/s)": 0.202856 }, { "acc": 0.74156051, "epoch": 0.1604515474378488, "grad_norm": 2.3125, "learning_rate": 9.975043379109617e-06, "loss": 1.06660671, "memory(GiB)": 368.61, "step": 6325, "train_speed(iter/s)": 0.202847 }, { "acc": 0.74560814, "epoch": 0.16057838660578386, "grad_norm": 2.09375, "learning_rate": 9.974938629164973e-06, "loss": 1.02607117, "memory(GiB)": 368.61, "step": 6330, "train_speed(iter/s)": 0.202875 }, { "acc": 0.74282498, "epoch": 0.16070522577371893, "grad_norm": 1.890625, "learning_rate": 9.974833660400315e-06, "loss": 1.09469223, "memory(GiB)": 368.61, "step": 6335, "train_speed(iter/s)": 0.202905 }, { "acc": 0.7369092, "epoch": 0.16083206494165397, "grad_norm": 2.625, "learning_rate": 9.974728472820264e-06, "loss": 1.02950878, "memory(GiB)": 368.61, "step": 6340, "train_speed(iter/s)": 0.202931 }, { "acc": 0.73263278, "epoch": 0.16095890410958905, "grad_norm": 1.8984375, "learning_rate": 9.97462306642944e-06, "loss": 1.09403915, "memory(GiB)": 368.61, "step": 6345, "train_speed(iter/s)": 0.202942 }, { "acc": 0.74616256, "epoch": 0.1610857432775241, "grad_norm": 2.015625, "learning_rate": 9.974517441232487e-06, "loss": 0.99876719, "memory(GiB)": 368.61, "step": 6350, "train_speed(iter/s)": 0.202972 }, { "acc": 0.73201952, "epoch": 0.16121258244545916, "grad_norm": 2.28125, "learning_rate": 9.974411597234046e-06, "loss": 1.09038467, "memory(GiB)": 368.61, "step": 6355, "train_speed(iter/s)": 0.20299 }, { "acc": 0.72936134, "epoch": 0.1613394216133942, "grad_norm": 2.515625, "learning_rate": 9.974305534438774e-06, "loss": 1.0733943, "memory(GiB)": 368.61, "step": 6360, "train_speed(iter/s)": 0.203006 }, { "acc": 0.74202375, "epoch": 0.16146626078132928, "grad_norm": 2.125, "learning_rate": 9.974199252851338e-06, "loss": 1.06816168, "memory(GiB)": 368.61, "step": 6365, "train_speed(iter/s)": 0.203028 }, { "acc": 0.73887296, "epoch": 0.16159309994926432, "grad_norm": 2.59375, "learning_rate": 9.974092752476408e-06, "loss": 1.04476681, "memory(GiB)": 368.61, "step": 6370, "train_speed(iter/s)": 0.20305 }, { "acc": 0.74495525, "epoch": 0.1617199391171994, "grad_norm": 2.171875, "learning_rate": 9.973986033318673e-06, "loss": 1.06991844, "memory(GiB)": 368.61, "step": 6375, "train_speed(iter/s)": 0.203063 }, { "acc": 0.74796305, "epoch": 0.16184677828513444, "grad_norm": 3.03125, "learning_rate": 9.973879095382824e-06, "loss": 1.02222919, "memory(GiB)": 368.61, "step": 6380, "train_speed(iter/s)": 0.203066 }, { "acc": 0.74011197, "epoch": 0.1619736174530695, "grad_norm": 2.703125, "learning_rate": 9.973771938673564e-06, "loss": 1.07438574, "memory(GiB)": 368.61, "step": 6385, "train_speed(iter/s)": 0.203093 }, { "acc": 0.72389622, "epoch": 0.16210045662100456, "grad_norm": 2.578125, "learning_rate": 9.973664563195609e-06, "loss": 1.13200388, "memory(GiB)": 368.61, "step": 6390, "train_speed(iter/s)": 0.203107 }, { "acc": 0.73270073, "epoch": 0.16222729578893963, "grad_norm": 2.515625, "learning_rate": 9.973556968953682e-06, "loss": 1.11792755, "memory(GiB)": 368.61, "step": 6395, "train_speed(iter/s)": 0.203136 }, { "acc": 0.73339624, "epoch": 0.16235413495687467, "grad_norm": 2.234375, "learning_rate": 9.973449155952512e-06, "loss": 1.07669144, "memory(GiB)": 368.61, "step": 6400, "train_speed(iter/s)": 0.203157 }, { "acc": 0.74656324, "epoch": 0.16248097412480975, "grad_norm": 2.171875, "learning_rate": 9.973341124196847e-06, "loss": 1.04600601, "memory(GiB)": 368.61, "step": 6405, "train_speed(iter/s)": 0.203187 }, { "acc": 0.7475646, "epoch": 0.1626078132927448, "grad_norm": 2.015625, "learning_rate": 9.973232873691431e-06, "loss": 0.99941063, "memory(GiB)": 368.61, "step": 6410, "train_speed(iter/s)": 0.203213 }, { "acc": 0.7334352, "epoch": 0.16273465246067986, "grad_norm": 2.046875, "learning_rate": 9.973124404441031e-06, "loss": 1.09353247, "memory(GiB)": 368.61, "step": 6415, "train_speed(iter/s)": 0.203204 }, { "acc": 0.73318615, "epoch": 0.1628614916286149, "grad_norm": 2.265625, "learning_rate": 9.973015716450416e-06, "loss": 1.14584332, "memory(GiB)": 368.61, "step": 6420, "train_speed(iter/s)": 0.203211 }, { "acc": 0.74204559, "epoch": 0.16298833079654998, "grad_norm": 2.46875, "learning_rate": 9.972906809724367e-06, "loss": 1.04461155, "memory(GiB)": 368.61, "step": 6425, "train_speed(iter/s)": 0.203229 }, { "acc": 0.73892097, "epoch": 0.16311516996448502, "grad_norm": 2.28125, "learning_rate": 9.972797684267674e-06, "loss": 1.05232391, "memory(GiB)": 368.61, "step": 6430, "train_speed(iter/s)": 0.203251 }, { "acc": 0.7436985, "epoch": 0.1632420091324201, "grad_norm": 2.046875, "learning_rate": 9.972688340085137e-06, "loss": 1.02400036, "memory(GiB)": 368.61, "step": 6435, "train_speed(iter/s)": 0.203263 }, { "acc": 0.73981524, "epoch": 0.16336884830035514, "grad_norm": 2.71875, "learning_rate": 9.972578777181565e-06, "loss": 1.08512154, "memory(GiB)": 368.61, "step": 6440, "train_speed(iter/s)": 0.203279 }, { "acc": 0.73808126, "epoch": 0.1634956874682902, "grad_norm": 2.28125, "learning_rate": 9.972468995561778e-06, "loss": 1.03090096, "memory(GiB)": 368.61, "step": 6445, "train_speed(iter/s)": 0.203299 }, { "acc": 0.73674088, "epoch": 0.16362252663622526, "grad_norm": 2.328125, "learning_rate": 9.972358995230604e-06, "loss": 1.02426777, "memory(GiB)": 368.61, "step": 6450, "train_speed(iter/s)": 0.203317 }, { "acc": 0.74590154, "epoch": 0.16374936580416033, "grad_norm": 2.75, "learning_rate": 9.97224877619288e-06, "loss": 1.08760853, "memory(GiB)": 368.61, "step": 6455, "train_speed(iter/s)": 0.203346 }, { "acc": 0.72564397, "epoch": 0.16387620497209537, "grad_norm": 2.3125, "learning_rate": 9.972138338453457e-06, "loss": 1.12686329, "memory(GiB)": 368.61, "step": 6460, "train_speed(iter/s)": 0.203363 }, { "acc": 0.74126348, "epoch": 0.16400304414003045, "grad_norm": 1.984375, "learning_rate": 9.972027682017191e-06, "loss": 1.06954594, "memory(GiB)": 368.61, "step": 6465, "train_speed(iter/s)": 0.203389 }, { "acc": 0.72215815, "epoch": 0.1641298833079655, "grad_norm": 1.8671875, "learning_rate": 9.971916806888948e-06, "loss": 1.16434879, "memory(GiB)": 368.61, "step": 6470, "train_speed(iter/s)": 0.203406 }, { "acc": 0.7304882, "epoch": 0.16425672247590056, "grad_norm": 2.46875, "learning_rate": 9.971805713073606e-06, "loss": 1.09913063, "memory(GiB)": 368.61, "step": 6475, "train_speed(iter/s)": 0.203426 }, { "acc": 0.73857136, "epoch": 0.1643835616438356, "grad_norm": 1.9140625, "learning_rate": 9.971694400576053e-06, "loss": 1.02550488, "memory(GiB)": 368.61, "step": 6480, "train_speed(iter/s)": 0.203428 }, { "acc": 0.73774781, "epoch": 0.16451040081177068, "grad_norm": 1.90625, "learning_rate": 9.971582869401182e-06, "loss": 1.07636623, "memory(GiB)": 368.61, "step": 6485, "train_speed(iter/s)": 0.203439 }, { "acc": 0.74292555, "epoch": 0.16463723997970572, "grad_norm": 1.9375, "learning_rate": 9.9714711195539e-06, "loss": 1.06132107, "memory(GiB)": 368.61, "step": 6490, "train_speed(iter/s)": 0.20347 }, { "acc": 0.73531895, "epoch": 0.1647640791476408, "grad_norm": 2.875, "learning_rate": 9.97135915103912e-06, "loss": 1.04619541, "memory(GiB)": 368.61, "step": 6495, "train_speed(iter/s)": 0.203503 }, { "acc": 0.73774004, "epoch": 0.16489091831557584, "grad_norm": 2.265625, "learning_rate": 9.971246963861772e-06, "loss": 1.04610214, "memory(GiB)": 368.61, "step": 6500, "train_speed(iter/s)": 0.20351 }, { "acc": 0.73871994, "epoch": 0.1650177574835109, "grad_norm": 2.0, "learning_rate": 9.971134558026786e-06, "loss": 1.06102057, "memory(GiB)": 368.61, "step": 6505, "train_speed(iter/s)": 0.20354 }, { "acc": 0.74119706, "epoch": 0.16514459665144596, "grad_norm": 2.125, "learning_rate": 9.971021933539108e-06, "loss": 1.04180365, "memory(GiB)": 368.61, "step": 6510, "train_speed(iter/s)": 0.203562 }, { "acc": 0.74771204, "epoch": 0.16527143581938103, "grad_norm": 2.40625, "learning_rate": 9.97090909040369e-06, "loss": 1.01983566, "memory(GiB)": 368.61, "step": 6515, "train_speed(iter/s)": 0.203592 }, { "acc": 0.72673779, "epoch": 0.16539827498731607, "grad_norm": 2.0625, "learning_rate": 9.970796028625499e-06, "loss": 1.10761547, "memory(GiB)": 368.61, "step": 6520, "train_speed(iter/s)": 0.203611 }, { "acc": 0.74263105, "epoch": 0.16552511415525115, "grad_norm": 2.125, "learning_rate": 9.970682748209505e-06, "loss": 1.06231365, "memory(GiB)": 368.61, "step": 6525, "train_speed(iter/s)": 0.203594 }, { "acc": 0.74316702, "epoch": 0.1656519533231862, "grad_norm": 1.890625, "learning_rate": 9.97056924916069e-06, "loss": 1.03566799, "memory(GiB)": 368.61, "step": 6530, "train_speed(iter/s)": 0.203612 }, { "acc": 0.75118952, "epoch": 0.16577879249112126, "grad_norm": 2.03125, "learning_rate": 9.970455531484049e-06, "loss": 1.01868305, "memory(GiB)": 368.61, "step": 6535, "train_speed(iter/s)": 0.203629 }, { "acc": 0.74098191, "epoch": 0.1659056316590563, "grad_norm": 2.390625, "learning_rate": 9.97034159518458e-06, "loss": 1.0494256, "memory(GiB)": 368.61, "step": 6540, "train_speed(iter/s)": 0.203648 }, { "acc": 0.7379777, "epoch": 0.16603247082699138, "grad_norm": 2.5, "learning_rate": 9.9702274402673e-06, "loss": 1.07974186, "memory(GiB)": 368.61, "step": 6545, "train_speed(iter/s)": 0.203678 }, { "acc": 0.74415774, "epoch": 0.16615930999492642, "grad_norm": 2.046875, "learning_rate": 9.970113066737223e-06, "loss": 1.0808054, "memory(GiB)": 368.61, "step": 6550, "train_speed(iter/s)": 0.203695 }, { "acc": 0.73226328, "epoch": 0.1662861491628615, "grad_norm": 2.296875, "learning_rate": 9.969998474599386e-06, "loss": 1.07872105, "memory(GiB)": 368.61, "step": 6555, "train_speed(iter/s)": 0.203709 }, { "acc": 0.74988208, "epoch": 0.16641298833079654, "grad_norm": 2.59375, "learning_rate": 9.969883663858826e-06, "loss": 1.02502003, "memory(GiB)": 368.61, "step": 6560, "train_speed(iter/s)": 0.203724 }, { "acc": 0.74942503, "epoch": 0.1665398274987316, "grad_norm": 2.46875, "learning_rate": 9.969768634520593e-06, "loss": 1.0311511, "memory(GiB)": 368.61, "step": 6565, "train_speed(iter/s)": 0.203754 }, { "acc": 0.75640612, "epoch": 0.16666666666666666, "grad_norm": 1.875, "learning_rate": 9.969653386589749e-06, "loss": 1.00019779, "memory(GiB)": 368.61, "step": 6570, "train_speed(iter/s)": 0.203777 }, { "acc": 0.74886484, "epoch": 0.16679350583460173, "grad_norm": 2.078125, "learning_rate": 9.96953792007136e-06, "loss": 1.07712955, "memory(GiB)": 368.61, "step": 6575, "train_speed(iter/s)": 0.203801 }, { "acc": 0.72923999, "epoch": 0.16692034500253677, "grad_norm": 2.09375, "learning_rate": 9.969422234970506e-06, "loss": 1.11407194, "memory(GiB)": 368.61, "step": 6580, "train_speed(iter/s)": 0.203815 }, { "acc": 0.75329256, "epoch": 0.16704718417047185, "grad_norm": 1.9296875, "learning_rate": 9.969306331292273e-06, "loss": 0.99737148, "memory(GiB)": 368.61, "step": 6585, "train_speed(iter/s)": 0.203825 }, { "acc": 0.72726746, "epoch": 0.1671740233384069, "grad_norm": 2.546875, "learning_rate": 9.969190209041764e-06, "loss": 1.13362389, "memory(GiB)": 368.61, "step": 6590, "train_speed(iter/s)": 0.203853 }, { "acc": 0.72932549, "epoch": 0.16730086250634196, "grad_norm": 2.359375, "learning_rate": 9.969073868224082e-06, "loss": 1.11179743, "memory(GiB)": 368.61, "step": 6595, "train_speed(iter/s)": 0.203876 }, { "acc": 0.72640629, "epoch": 0.167427701674277, "grad_norm": 1.984375, "learning_rate": 9.968957308844346e-06, "loss": 1.15411053, "memory(GiB)": 368.61, "step": 6600, "train_speed(iter/s)": 0.203904 }, { "acc": 0.72726841, "epoch": 0.16755454084221208, "grad_norm": 2.546875, "learning_rate": 9.968840530907684e-06, "loss": 1.13336258, "memory(GiB)": 368.61, "step": 6605, "train_speed(iter/s)": 0.203932 }, { "acc": 0.72825437, "epoch": 0.16768138001014712, "grad_norm": 2.90625, "learning_rate": 9.96872353441923e-06, "loss": 1.09290504, "memory(GiB)": 368.61, "step": 6610, "train_speed(iter/s)": 0.203949 }, { "acc": 0.73157816, "epoch": 0.1678082191780822, "grad_norm": 2.609375, "learning_rate": 9.968606319384131e-06, "loss": 1.11966095, "memory(GiB)": 368.61, "step": 6615, "train_speed(iter/s)": 0.203961 }, { "acc": 0.72731848, "epoch": 0.16793505834601724, "grad_norm": 2.890625, "learning_rate": 9.968488885807544e-06, "loss": 1.13772621, "memory(GiB)": 368.61, "step": 6620, "train_speed(iter/s)": 0.203976 }, { "acc": 0.7406106, "epoch": 0.1680618975139523, "grad_norm": 1.8984375, "learning_rate": 9.968371233694633e-06, "loss": 1.05731525, "memory(GiB)": 368.61, "step": 6625, "train_speed(iter/s)": 0.204005 }, { "acc": 0.74216242, "epoch": 0.16818873668188736, "grad_norm": 2.234375, "learning_rate": 9.968253363050573e-06, "loss": 1.07268162, "memory(GiB)": 368.61, "step": 6630, "train_speed(iter/s)": 0.204021 }, { "acc": 0.7385654, "epoch": 0.16831557584982243, "grad_norm": 1.7421875, "learning_rate": 9.968135273880547e-06, "loss": 0.99552603, "memory(GiB)": 368.61, "step": 6635, "train_speed(iter/s)": 0.204029 }, { "acc": 0.72423334, "epoch": 0.16844241501775747, "grad_norm": 2.40625, "learning_rate": 9.968016966189753e-06, "loss": 1.05824623, "memory(GiB)": 368.61, "step": 6640, "train_speed(iter/s)": 0.204051 }, { "acc": 0.74605274, "epoch": 0.16856925418569255, "grad_norm": 2.375, "learning_rate": 9.96789843998339e-06, "loss": 1.00549679, "memory(GiB)": 368.61, "step": 6645, "train_speed(iter/s)": 0.204069 }, { "acc": 0.7339819, "epoch": 0.1686960933536276, "grad_norm": 2.453125, "learning_rate": 9.967779695266675e-06, "loss": 1.1000227, "memory(GiB)": 368.61, "step": 6650, "train_speed(iter/s)": 0.204081 }, { "acc": 0.73641768, "epoch": 0.16882293252156266, "grad_norm": 2.21875, "learning_rate": 9.967660732044828e-06, "loss": 1.07653456, "memory(GiB)": 368.61, "step": 6655, "train_speed(iter/s)": 0.204103 }, { "acc": 0.73228598, "epoch": 0.1689497716894977, "grad_norm": 2.109375, "learning_rate": 9.967541550323085e-06, "loss": 1.07373581, "memory(GiB)": 368.61, "step": 6660, "train_speed(iter/s)": 0.204126 }, { "acc": 0.74622869, "epoch": 0.16907661085743278, "grad_norm": 2.296875, "learning_rate": 9.967422150106685e-06, "loss": 1.05335474, "memory(GiB)": 368.61, "step": 6665, "train_speed(iter/s)": 0.204157 }, { "acc": 0.73370919, "epoch": 0.16920345002536782, "grad_norm": 1.9921875, "learning_rate": 9.96730253140088e-06, "loss": 1.0629014, "memory(GiB)": 368.61, "step": 6670, "train_speed(iter/s)": 0.204182 }, { "acc": 0.73752484, "epoch": 0.1693302891933029, "grad_norm": 2.1875, "learning_rate": 9.967182694210933e-06, "loss": 1.07202492, "memory(GiB)": 368.61, "step": 6675, "train_speed(iter/s)": 0.204202 }, { "acc": 0.74540458, "epoch": 0.16945712836123794, "grad_norm": 1.9765625, "learning_rate": 9.967062638542116e-06, "loss": 1.0494668, "memory(GiB)": 368.61, "step": 6680, "train_speed(iter/s)": 0.204217 }, { "acc": 0.7392055, "epoch": 0.169583967529173, "grad_norm": 2.46875, "learning_rate": 9.966942364399706e-06, "loss": 1.06103172, "memory(GiB)": 368.61, "step": 6685, "train_speed(iter/s)": 0.20424 }, { "acc": 0.73315935, "epoch": 0.16971080669710806, "grad_norm": 2.15625, "learning_rate": 9.966821871788995e-06, "loss": 1.07505493, "memory(GiB)": 368.61, "step": 6690, "train_speed(iter/s)": 0.204266 }, { "acc": 0.7433938, "epoch": 0.16983764586504313, "grad_norm": 2.390625, "learning_rate": 9.966701160715283e-06, "loss": 1.07396469, "memory(GiB)": 368.61, "step": 6695, "train_speed(iter/s)": 0.204279 }, { "acc": 0.74779468, "epoch": 0.16996448503297817, "grad_norm": 2.328125, "learning_rate": 9.96658023118388e-06, "loss": 1.06497669, "memory(GiB)": 368.61, "step": 6700, "train_speed(iter/s)": 0.204299 }, { "acc": 0.74119005, "epoch": 0.17009132420091325, "grad_norm": 2.625, "learning_rate": 9.966459083200102e-06, "loss": 1.07681866, "memory(GiB)": 368.61, "step": 6705, "train_speed(iter/s)": 0.204319 }, { "acc": 0.72296224, "epoch": 0.1702181633688483, "grad_norm": 2.109375, "learning_rate": 9.966337716769283e-06, "loss": 1.12063761, "memory(GiB)": 368.61, "step": 6710, "train_speed(iter/s)": 0.204347 }, { "acc": 0.72800188, "epoch": 0.17034500253678336, "grad_norm": 2.4375, "learning_rate": 9.966216131896755e-06, "loss": 1.11765041, "memory(GiB)": 368.61, "step": 6715, "train_speed(iter/s)": 0.20437 }, { "acc": 0.72685442, "epoch": 0.1704718417047184, "grad_norm": 2.375, "learning_rate": 9.966094328587871e-06, "loss": 1.08447342, "memory(GiB)": 368.61, "step": 6720, "train_speed(iter/s)": 0.20439 }, { "acc": 0.72865477, "epoch": 0.17059868087265348, "grad_norm": 2.1875, "learning_rate": 9.965972306847986e-06, "loss": 1.14573488, "memory(GiB)": 368.61, "step": 6725, "train_speed(iter/s)": 0.204403 }, { "acc": 0.75789804, "epoch": 0.17072552004058852, "grad_norm": 2.453125, "learning_rate": 9.965850066682468e-06, "loss": 0.9766325, "memory(GiB)": 368.61, "step": 6730, "train_speed(iter/s)": 0.204433 }, { "acc": 0.72786012, "epoch": 0.1708523592085236, "grad_norm": 2.046875, "learning_rate": 9.965727608096692e-06, "loss": 1.06045799, "memory(GiB)": 368.61, "step": 6735, "train_speed(iter/s)": 0.204465 }, { "acc": 0.74278116, "epoch": 0.17097919837645864, "grad_norm": 1.875, "learning_rate": 9.965604931096045e-06, "loss": 1.11301003, "memory(GiB)": 368.61, "step": 6740, "train_speed(iter/s)": 0.204485 }, { "acc": 0.73249092, "epoch": 0.1711060375443937, "grad_norm": 2.21875, "learning_rate": 9.965482035685925e-06, "loss": 1.0716362, "memory(GiB)": 368.61, "step": 6745, "train_speed(iter/s)": 0.204505 }, { "acc": 0.74601345, "epoch": 0.17123287671232876, "grad_norm": 2.234375, "learning_rate": 9.965358921871735e-06, "loss": 1.08809834, "memory(GiB)": 368.61, "step": 6750, "train_speed(iter/s)": 0.204533 }, { "acc": 0.75424471, "epoch": 0.17135971588026383, "grad_norm": 2.15625, "learning_rate": 9.965235589658891e-06, "loss": 1.04829245, "memory(GiB)": 368.61, "step": 6755, "train_speed(iter/s)": 0.204549 }, { "acc": 0.74677176, "epoch": 0.17148655504819887, "grad_norm": 1.6484375, "learning_rate": 9.965112039052817e-06, "loss": 1.07279892, "memory(GiB)": 368.61, "step": 6760, "train_speed(iter/s)": 0.204566 }, { "acc": 0.74298534, "epoch": 0.17161339421613395, "grad_norm": 2.5, "learning_rate": 9.964988270058948e-06, "loss": 1.10165176, "memory(GiB)": 368.61, "step": 6765, "train_speed(iter/s)": 0.204579 }, { "acc": 0.73362913, "epoch": 0.171740233384069, "grad_norm": 2.03125, "learning_rate": 9.96486428268273e-06, "loss": 1.05016232, "memory(GiB)": 368.61, "step": 6770, "train_speed(iter/s)": 0.204594 }, { "acc": 0.73966055, "epoch": 0.17186707255200406, "grad_norm": 2.328125, "learning_rate": 9.964740076929612e-06, "loss": 1.06027184, "memory(GiB)": 368.61, "step": 6775, "train_speed(iter/s)": 0.204609 }, { "acc": 0.73173466, "epoch": 0.1719939117199391, "grad_norm": 2.4375, "learning_rate": 9.964615652805059e-06, "loss": 1.12025814, "memory(GiB)": 368.61, "step": 6780, "train_speed(iter/s)": 0.204628 }, { "acc": 0.74492035, "epoch": 0.17212075088787418, "grad_norm": 2.46875, "learning_rate": 9.964491010314545e-06, "loss": 1.03887386, "memory(GiB)": 368.61, "step": 6785, "train_speed(iter/s)": 0.204648 }, { "acc": 0.73890815, "epoch": 0.17224759005580922, "grad_norm": 2.390625, "learning_rate": 9.964366149463552e-06, "loss": 1.07993069, "memory(GiB)": 368.61, "step": 6790, "train_speed(iter/s)": 0.204665 }, { "acc": 0.75213084, "epoch": 0.1723744292237443, "grad_norm": 1.9296875, "learning_rate": 9.96424107025757e-06, "loss": 1.01932783, "memory(GiB)": 368.61, "step": 6795, "train_speed(iter/s)": 0.204684 }, { "acc": 0.74347539, "epoch": 0.17250126839167934, "grad_norm": 1.875, "learning_rate": 9.964115772702104e-06, "loss": 1.06548386, "memory(GiB)": 368.61, "step": 6800, "train_speed(iter/s)": 0.204715 }, { "acc": 0.73800621, "epoch": 0.1726281075596144, "grad_norm": 1.9921875, "learning_rate": 9.963990256802662e-06, "loss": 1.08166046, "memory(GiB)": 368.61, "step": 6805, "train_speed(iter/s)": 0.204737 }, { "acc": 0.74639673, "epoch": 0.17275494672754946, "grad_norm": 2.671875, "learning_rate": 9.963864522564765e-06, "loss": 1.03670826, "memory(GiB)": 368.61, "step": 6810, "train_speed(iter/s)": 0.204761 }, { "acc": 0.73965464, "epoch": 0.17288178589548453, "grad_norm": 1.9765625, "learning_rate": 9.963738569993945e-06, "loss": 1.09077625, "memory(GiB)": 368.61, "step": 6815, "train_speed(iter/s)": 0.204781 }, { "acc": 0.74477682, "epoch": 0.17300862506341957, "grad_norm": 2.65625, "learning_rate": 9.963612399095743e-06, "loss": 1.09133606, "memory(GiB)": 368.61, "step": 6820, "train_speed(iter/s)": 0.20479 }, { "acc": 0.71959314, "epoch": 0.17313546423135465, "grad_norm": 2.453125, "learning_rate": 9.963486009875705e-06, "loss": 1.1229147, "memory(GiB)": 368.61, "step": 6825, "train_speed(iter/s)": 0.204822 }, { "acc": 0.74049201, "epoch": 0.1732623033992897, "grad_norm": 2.265625, "learning_rate": 9.963359402339393e-06, "loss": 1.06109848, "memory(GiB)": 368.61, "step": 6830, "train_speed(iter/s)": 0.204844 }, { "acc": 0.73399744, "epoch": 0.17338914256722476, "grad_norm": 2.71875, "learning_rate": 9.963232576492373e-06, "loss": 1.11903801, "memory(GiB)": 368.61, "step": 6835, "train_speed(iter/s)": 0.20487 }, { "acc": 0.72643633, "epoch": 0.1735159817351598, "grad_norm": 2.125, "learning_rate": 9.963105532340226e-06, "loss": 1.13132629, "memory(GiB)": 368.61, "step": 6840, "train_speed(iter/s)": 0.204881 }, { "acc": 0.73092594, "epoch": 0.17364282090309488, "grad_norm": 2.578125, "learning_rate": 9.962978269888538e-06, "loss": 1.08780365, "memory(GiB)": 368.61, "step": 6845, "train_speed(iter/s)": 0.204899 }, { "acc": 0.73397036, "epoch": 0.17376966007102992, "grad_norm": 1.8203125, "learning_rate": 9.96285078914291e-06, "loss": 1.04199162, "memory(GiB)": 368.61, "step": 6850, "train_speed(iter/s)": 0.204919 }, { "acc": 0.73425283, "epoch": 0.173896499238965, "grad_norm": 2.015625, "learning_rate": 9.962723090108944e-06, "loss": 1.03591919, "memory(GiB)": 368.61, "step": 6855, "train_speed(iter/s)": 0.204947 }, { "acc": 0.73546391, "epoch": 0.17402333840690004, "grad_norm": 2.1875, "learning_rate": 9.962595172792261e-06, "loss": 1.07579823, "memory(GiB)": 368.61, "step": 6860, "train_speed(iter/s)": 0.204969 }, { "acc": 0.74454193, "epoch": 0.1741501775748351, "grad_norm": 2.09375, "learning_rate": 9.962467037198487e-06, "loss": 0.96992321, "memory(GiB)": 368.61, "step": 6865, "train_speed(iter/s)": 0.20498 }, { "acc": 0.73687119, "epoch": 0.17427701674277016, "grad_norm": 2.015625, "learning_rate": 9.962338683333254e-06, "loss": 1.05794334, "memory(GiB)": 368.61, "step": 6870, "train_speed(iter/s)": 0.205001 }, { "acc": 0.73339157, "epoch": 0.17440385591070523, "grad_norm": 1.96875, "learning_rate": 9.962210111202212e-06, "loss": 1.1341094, "memory(GiB)": 368.61, "step": 6875, "train_speed(iter/s)": 0.205032 }, { "acc": 0.73348174, "epoch": 0.17453069507864027, "grad_norm": 2.765625, "learning_rate": 9.962081320811015e-06, "loss": 1.08524904, "memory(GiB)": 368.61, "step": 6880, "train_speed(iter/s)": 0.205041 }, { "acc": 0.74637318, "epoch": 0.17465753424657535, "grad_norm": 2.4375, "learning_rate": 9.961952312165327e-06, "loss": 1.03012142, "memory(GiB)": 368.61, "step": 6885, "train_speed(iter/s)": 0.205049 }, { "acc": 0.74292173, "epoch": 0.1747843734145104, "grad_norm": 1.984375, "learning_rate": 9.961823085270823e-06, "loss": 1.0386858, "memory(GiB)": 368.61, "step": 6890, "train_speed(iter/s)": 0.20506 }, { "acc": 0.74401045, "epoch": 0.17491121258244546, "grad_norm": 1.984375, "learning_rate": 9.961693640133187e-06, "loss": 0.99030867, "memory(GiB)": 368.61, "step": 6895, "train_speed(iter/s)": 0.205082 }, { "acc": 0.73634806, "epoch": 0.1750380517503805, "grad_norm": 2.109375, "learning_rate": 9.961563976758112e-06, "loss": 1.09452667, "memory(GiB)": 368.61, "step": 6900, "train_speed(iter/s)": 0.205096 }, { "acc": 0.73812523, "epoch": 0.17516489091831558, "grad_norm": 2.296875, "learning_rate": 9.961434095151301e-06, "loss": 1.08954287, "memory(GiB)": 368.61, "step": 6905, "train_speed(iter/s)": 0.205126 }, { "acc": 0.7396708, "epoch": 0.17529173008625062, "grad_norm": 1.71875, "learning_rate": 9.961303995318467e-06, "loss": 1.02438002, "memory(GiB)": 368.61, "step": 6910, "train_speed(iter/s)": 0.205136 }, { "acc": 0.74333005, "epoch": 0.1754185692541857, "grad_norm": 2.328125, "learning_rate": 9.961173677265334e-06, "loss": 1.04597645, "memory(GiB)": 368.61, "step": 6915, "train_speed(iter/s)": 0.205155 }, { "acc": 0.73269129, "epoch": 0.17554540842212074, "grad_norm": 1.9609375, "learning_rate": 9.961043140997632e-06, "loss": 1.04300671, "memory(GiB)": 368.61, "step": 6920, "train_speed(iter/s)": 0.205174 }, { "acc": 0.72249622, "epoch": 0.1756722475900558, "grad_norm": 2.4375, "learning_rate": 9.960912386521104e-06, "loss": 1.11844273, "memory(GiB)": 368.61, "step": 6925, "train_speed(iter/s)": 0.205173 }, { "acc": 0.73272367, "epoch": 0.17579908675799086, "grad_norm": 2.15625, "learning_rate": 9.9607814138415e-06, "loss": 1.03488197, "memory(GiB)": 368.61, "step": 6930, "train_speed(iter/s)": 0.205194 }, { "acc": 0.73511949, "epoch": 0.17592592592592593, "grad_norm": 2.71875, "learning_rate": 9.96065022296458e-06, "loss": 1.08442936, "memory(GiB)": 368.61, "step": 6935, "train_speed(iter/s)": 0.205217 }, { "acc": 0.74903202, "epoch": 0.17605276509386097, "grad_norm": 2.484375, "learning_rate": 9.960518813896117e-06, "loss": 1.01526737, "memory(GiB)": 368.61, "step": 6940, "train_speed(iter/s)": 0.205224 }, { "acc": 0.73137283, "epoch": 0.17617960426179605, "grad_norm": 2.234375, "learning_rate": 9.960387186641887e-06, "loss": 1.07501926, "memory(GiB)": 368.61, "step": 6945, "train_speed(iter/s)": 0.205246 }, { "acc": 0.73833766, "epoch": 0.1763064434297311, "grad_norm": 2.125, "learning_rate": 9.960255341207686e-06, "loss": 1.02673359, "memory(GiB)": 368.61, "step": 6950, "train_speed(iter/s)": 0.205253 }, { "acc": 0.75767183, "epoch": 0.17643328259766616, "grad_norm": 1.8671875, "learning_rate": 9.960123277599305e-06, "loss": 1.0386797, "memory(GiB)": 368.61, "step": 6955, "train_speed(iter/s)": 0.205277 }, { "acc": 0.72183762, "epoch": 0.1765601217656012, "grad_norm": 2.71875, "learning_rate": 9.959990995822559e-06, "loss": 1.07587128, "memory(GiB)": 368.61, "step": 6960, "train_speed(iter/s)": 0.205293 }, { "acc": 0.74333458, "epoch": 0.17668696093353628, "grad_norm": 1.890625, "learning_rate": 9.959858495883263e-06, "loss": 1.04697371, "memory(GiB)": 368.61, "step": 6965, "train_speed(iter/s)": 0.205314 }, { "acc": 0.74575162, "epoch": 0.17681380010147132, "grad_norm": 1.7734375, "learning_rate": 9.959725777787249e-06, "loss": 1.03231945, "memory(GiB)": 368.61, "step": 6970, "train_speed(iter/s)": 0.205336 }, { "acc": 0.74170094, "epoch": 0.1769406392694064, "grad_norm": 2.515625, "learning_rate": 9.959592841540349e-06, "loss": 1.11927052, "memory(GiB)": 368.61, "step": 6975, "train_speed(iter/s)": 0.20535 }, { "acc": 0.72287054, "epoch": 0.17706747843734144, "grad_norm": 2.140625, "learning_rate": 9.959459687148414e-06, "loss": 1.10802994, "memory(GiB)": 368.61, "step": 6980, "train_speed(iter/s)": 0.205372 }, { "acc": 0.7276618, "epoch": 0.1771943176052765, "grad_norm": 2.328125, "learning_rate": 9.959326314617299e-06, "loss": 1.08866434, "memory(GiB)": 368.61, "step": 6985, "train_speed(iter/s)": 0.205389 }, { "acc": 0.73414078, "epoch": 0.17732115677321156, "grad_norm": 2.59375, "learning_rate": 9.95919272395287e-06, "loss": 1.07738762, "memory(GiB)": 368.61, "step": 6990, "train_speed(iter/s)": 0.205406 }, { "acc": 0.72644258, "epoch": 0.17744799594114663, "grad_norm": 2.21875, "learning_rate": 9.959058915161006e-06, "loss": 1.16719551, "memory(GiB)": 368.61, "step": 6995, "train_speed(iter/s)": 0.20543 }, { "acc": 0.72166891, "epoch": 0.17757483510908167, "grad_norm": 2.34375, "learning_rate": 9.95892488824759e-06, "loss": 1.11475506, "memory(GiB)": 368.61, "step": 7000, "train_speed(iter/s)": 0.205445 }, { "epoch": 0.17757483510908167, "eval_acc": 0.7260390604078997, "eval_loss": 1.032478928565979, "eval_runtime": 384.1569, "eval_samples_per_second": 16.582, "eval_steps_per_second": 8.291, "step": 7000 }, { "acc": 0.74958572, "epoch": 0.17770167427701675, "grad_norm": 2.3125, "learning_rate": 9.958790643218515e-06, "loss": 0.97049932, "memory(GiB)": 368.61, "step": 7005, "train_speed(iter/s)": 0.20125 }, { "acc": 0.73144288, "epoch": 0.1778285134449518, "grad_norm": 1.8203125, "learning_rate": 9.95865618007969e-06, "loss": 1.0345314, "memory(GiB)": 368.61, "step": 7010, "train_speed(iter/s)": 0.201255 }, { "acc": 0.74315891, "epoch": 0.17795535261288686, "grad_norm": 2.046875, "learning_rate": 9.958521498837029e-06, "loss": 1.04580135, "memory(GiB)": 368.61, "step": 7015, "train_speed(iter/s)": 0.201285 }, { "acc": 0.73691454, "epoch": 0.1780821917808219, "grad_norm": 2.28125, "learning_rate": 9.95838659949645e-06, "loss": 1.09393463, "memory(GiB)": 368.61, "step": 7020, "train_speed(iter/s)": 0.201302 }, { "acc": 0.75360498, "epoch": 0.17820903094875698, "grad_norm": 2.1875, "learning_rate": 9.958251482063894e-06, "loss": 1.00593863, "memory(GiB)": 368.61, "step": 7025, "train_speed(iter/s)": 0.201323 }, { "acc": 0.72813444, "epoch": 0.17833587011669202, "grad_norm": 2.140625, "learning_rate": 9.9581161465453e-06, "loss": 1.12013836, "memory(GiB)": 368.61, "step": 7030, "train_speed(iter/s)": 0.201341 }, { "acc": 0.73183231, "epoch": 0.1784627092846271, "grad_norm": 2.265625, "learning_rate": 9.957980592946621e-06, "loss": 1.09410028, "memory(GiB)": 368.61, "step": 7035, "train_speed(iter/s)": 0.201369 }, { "acc": 0.74932461, "epoch": 0.17858954845256214, "grad_norm": 2.1875, "learning_rate": 9.957844821273822e-06, "loss": 0.99877472, "memory(GiB)": 368.61, "step": 7040, "train_speed(iter/s)": 0.201387 }, { "acc": 0.75175896, "epoch": 0.1787163876204972, "grad_norm": 1.984375, "learning_rate": 9.95770883153287e-06, "loss": 0.96522694, "memory(GiB)": 368.61, "step": 7045, "train_speed(iter/s)": 0.201412 }, { "acc": 0.73876219, "epoch": 0.17884322678843226, "grad_norm": 2.34375, "learning_rate": 9.957572623729749e-06, "loss": 1.13957386, "memory(GiB)": 368.61, "step": 7050, "train_speed(iter/s)": 0.201434 }, { "acc": 0.72954512, "epoch": 0.17897006595636733, "grad_norm": 2.515625, "learning_rate": 9.957436197870451e-06, "loss": 1.08292751, "memory(GiB)": 368.61, "step": 7055, "train_speed(iter/s)": 0.20146 }, { "acc": 0.73831549, "epoch": 0.17909690512430237, "grad_norm": 2.515625, "learning_rate": 9.957299553960975e-06, "loss": 1.08457508, "memory(GiB)": 368.61, "step": 7060, "train_speed(iter/s)": 0.201488 }, { "acc": 0.74629498, "epoch": 0.17922374429223745, "grad_norm": 2.5, "learning_rate": 9.957162692007334e-06, "loss": 1.01553812, "memory(GiB)": 368.61, "step": 7065, "train_speed(iter/s)": 0.201504 }, { "acc": 0.74929624, "epoch": 0.1793505834601725, "grad_norm": 2.421875, "learning_rate": 9.957025612015543e-06, "loss": 1.03547649, "memory(GiB)": 368.61, "step": 7070, "train_speed(iter/s)": 0.201519 }, { "acc": 0.74647646, "epoch": 0.17947742262810756, "grad_norm": 1.96875, "learning_rate": 9.956888313991636e-06, "loss": 1.01459694, "memory(GiB)": 368.61, "step": 7075, "train_speed(iter/s)": 0.201536 }, { "acc": 0.73734303, "epoch": 0.1796042617960426, "grad_norm": 2.3125, "learning_rate": 9.956750797941648e-06, "loss": 1.06683092, "memory(GiB)": 368.61, "step": 7080, "train_speed(iter/s)": 0.201563 }, { "acc": 0.73319073, "epoch": 0.17973110096397768, "grad_norm": 1.875, "learning_rate": 9.95661306387163e-06, "loss": 1.11999083, "memory(GiB)": 368.61, "step": 7085, "train_speed(iter/s)": 0.201585 }, { "acc": 0.73619642, "epoch": 0.17985794013191272, "grad_norm": 3.09375, "learning_rate": 9.95647511178764e-06, "loss": 1.01085987, "memory(GiB)": 368.61, "step": 7090, "train_speed(iter/s)": 0.201601 }, { "acc": 0.73877153, "epoch": 0.1799847792998478, "grad_norm": 2.609375, "learning_rate": 9.956336941695747e-06, "loss": 1.06953058, "memory(GiB)": 368.61, "step": 7095, "train_speed(iter/s)": 0.201621 }, { "acc": 0.74253902, "epoch": 0.18011161846778284, "grad_norm": 2.203125, "learning_rate": 9.956198553602026e-06, "loss": 1.00938339, "memory(GiB)": 368.61, "step": 7100, "train_speed(iter/s)": 0.201636 }, { "acc": 0.71464596, "epoch": 0.1802384576357179, "grad_norm": 2.046875, "learning_rate": 9.956059947512563e-06, "loss": 1.13385792, "memory(GiB)": 368.61, "step": 7105, "train_speed(iter/s)": 0.201654 }, { "acc": 0.73473549, "epoch": 0.18036529680365296, "grad_norm": 2.09375, "learning_rate": 9.95592112343346e-06, "loss": 1.04764328, "memory(GiB)": 368.61, "step": 7110, "train_speed(iter/s)": 0.201686 }, { "acc": 0.74022865, "epoch": 0.18049213597158803, "grad_norm": 2.421875, "learning_rate": 9.955782081370818e-06, "loss": 1.0324995, "memory(GiB)": 368.61, "step": 7115, "train_speed(iter/s)": 0.201695 }, { "acc": 0.74505949, "epoch": 0.18061897513952307, "grad_norm": 2.046875, "learning_rate": 9.955642821330752e-06, "loss": 1.0460268, "memory(GiB)": 368.61, "step": 7120, "train_speed(iter/s)": 0.201713 }, { "acc": 0.75348368, "epoch": 0.18074581430745815, "grad_norm": 2.0, "learning_rate": 9.95550334331939e-06, "loss": 1.00037975, "memory(GiB)": 368.61, "step": 7125, "train_speed(iter/s)": 0.201722 }, { "acc": 0.74253559, "epoch": 0.1808726534753932, "grad_norm": 2.203125, "learning_rate": 9.955363647342868e-06, "loss": 1.02346973, "memory(GiB)": 368.61, "step": 7130, "train_speed(iter/s)": 0.201726 }, { "acc": 0.72887702, "epoch": 0.18099949264332826, "grad_norm": 2.71875, "learning_rate": 9.955223733407327e-06, "loss": 1.10731564, "memory(GiB)": 368.61, "step": 7135, "train_speed(iter/s)": 0.201735 }, { "acc": 0.74058323, "epoch": 0.1811263318112633, "grad_norm": 2.078125, "learning_rate": 9.955083601518924e-06, "loss": 1.06391335, "memory(GiB)": 368.61, "step": 7140, "train_speed(iter/s)": 0.20175 }, { "acc": 0.71672888, "epoch": 0.18125317097919838, "grad_norm": 2.3125, "learning_rate": 9.95494325168382e-06, "loss": 1.13591337, "memory(GiB)": 368.61, "step": 7145, "train_speed(iter/s)": 0.201772 }, { "acc": 0.73358526, "epoch": 0.18138001014713342, "grad_norm": 2.015625, "learning_rate": 9.954802683908192e-06, "loss": 1.09932899, "memory(GiB)": 368.61, "step": 7150, "train_speed(iter/s)": 0.201789 }, { "acc": 0.74566813, "epoch": 0.1815068493150685, "grad_norm": 2.03125, "learning_rate": 9.954661898198216e-06, "loss": 1.02300949, "memory(GiB)": 368.61, "step": 7155, "train_speed(iter/s)": 0.20181 }, { "acc": 0.73096805, "epoch": 0.18163368848300354, "grad_norm": 2.34375, "learning_rate": 9.954520894560092e-06, "loss": 1.07978582, "memory(GiB)": 368.61, "step": 7160, "train_speed(iter/s)": 0.201839 }, { "acc": 0.73260021, "epoch": 0.1817605276509386, "grad_norm": 1.96875, "learning_rate": 9.954379673000018e-06, "loss": 1.086059, "memory(GiB)": 368.61, "step": 7165, "train_speed(iter/s)": 0.201854 }, { "acc": 0.72697515, "epoch": 0.18188736681887366, "grad_norm": 2.1875, "learning_rate": 9.954238233524208e-06, "loss": 1.10135517, "memory(GiB)": 368.61, "step": 7170, "train_speed(iter/s)": 0.20187 }, { "acc": 0.73953757, "epoch": 0.18201420598680873, "grad_norm": 2.40625, "learning_rate": 9.954096576138879e-06, "loss": 1.07607994, "memory(GiB)": 368.61, "step": 7175, "train_speed(iter/s)": 0.201881 }, { "acc": 0.73539104, "epoch": 0.18214104515474377, "grad_norm": 2.1875, "learning_rate": 9.953954700850264e-06, "loss": 1.101264, "memory(GiB)": 368.61, "step": 7180, "train_speed(iter/s)": 0.201902 }, { "acc": 0.7341691, "epoch": 0.18226788432267885, "grad_norm": 1.890625, "learning_rate": 9.953812607664607e-06, "loss": 1.09754219, "memory(GiB)": 368.61, "step": 7185, "train_speed(iter/s)": 0.201919 }, { "acc": 0.75175838, "epoch": 0.1823947234906139, "grad_norm": 2.234375, "learning_rate": 9.95367029658815e-06, "loss": 1.08350401, "memory(GiB)": 368.61, "step": 7190, "train_speed(iter/s)": 0.201947 }, { "acc": 0.72302527, "epoch": 0.18252156265854896, "grad_norm": 2.515625, "learning_rate": 9.953527767627159e-06, "loss": 1.06786556, "memory(GiB)": 368.61, "step": 7195, "train_speed(iter/s)": 0.20196 }, { "acc": 0.75441771, "epoch": 0.182648401826484, "grad_norm": 1.953125, "learning_rate": 9.9533850207879e-06, "loss": 0.9808506, "memory(GiB)": 368.61, "step": 7200, "train_speed(iter/s)": 0.201971 }, { "acc": 0.74707508, "epoch": 0.18277524099441908, "grad_norm": 2.296875, "learning_rate": 9.953242056076652e-06, "loss": 1.06108131, "memory(GiB)": 368.61, "step": 7205, "train_speed(iter/s)": 0.201994 }, { "acc": 0.72663884, "epoch": 0.18290208016235412, "grad_norm": 2.46875, "learning_rate": 9.953098873499705e-06, "loss": 1.09596424, "memory(GiB)": 368.61, "step": 7210, "train_speed(iter/s)": 0.202024 }, { "acc": 0.74128089, "epoch": 0.1830289193302892, "grad_norm": 2.015625, "learning_rate": 9.952955473063356e-06, "loss": 1.01560097, "memory(GiB)": 368.61, "step": 7215, "train_speed(iter/s)": 0.20204 }, { "acc": 0.72674541, "epoch": 0.18315575849822424, "grad_norm": 2.234375, "learning_rate": 9.952811854773911e-06, "loss": 1.12993097, "memory(GiB)": 368.61, "step": 7220, "train_speed(iter/s)": 0.202066 }, { "acc": 0.72463131, "epoch": 0.1832825976661593, "grad_norm": 2.640625, "learning_rate": 9.952668018637687e-06, "loss": 1.10342607, "memory(GiB)": 368.61, "step": 7225, "train_speed(iter/s)": 0.202081 }, { "acc": 0.7383956, "epoch": 0.18340943683409436, "grad_norm": 2.421875, "learning_rate": 9.952523964661014e-06, "loss": 1.05513401, "memory(GiB)": 368.61, "step": 7230, "train_speed(iter/s)": 0.202111 }, { "acc": 0.73332548, "epoch": 0.18353627600202943, "grad_norm": 2.203125, "learning_rate": 9.952379692850222e-06, "loss": 1.07109871, "memory(GiB)": 368.61, "step": 7235, "train_speed(iter/s)": 0.202134 }, { "acc": 0.74106259, "epoch": 0.18366311516996447, "grad_norm": 2.796875, "learning_rate": 9.952235203211663e-06, "loss": 1.01990891, "memory(GiB)": 368.61, "step": 7240, "train_speed(iter/s)": 0.202124 }, { "acc": 0.74849162, "epoch": 0.18378995433789955, "grad_norm": 2.171875, "learning_rate": 9.952090495751689e-06, "loss": 1.05412846, "memory(GiB)": 368.61, "step": 7245, "train_speed(iter/s)": 0.202144 }, { "acc": 0.7379293, "epoch": 0.1839167935058346, "grad_norm": 2.28125, "learning_rate": 9.951945570476666e-06, "loss": 1.0684268, "memory(GiB)": 368.61, "step": 7250, "train_speed(iter/s)": 0.202172 }, { "acc": 0.72688546, "epoch": 0.18404363267376966, "grad_norm": 2.390625, "learning_rate": 9.951800427392968e-06, "loss": 1.11978569, "memory(GiB)": 368.61, "step": 7255, "train_speed(iter/s)": 0.20219 }, { "acc": 0.73304024, "epoch": 0.1841704718417047, "grad_norm": 2.375, "learning_rate": 9.951655066506977e-06, "loss": 1.1106513, "memory(GiB)": 368.61, "step": 7260, "train_speed(iter/s)": 0.202216 }, { "acc": 0.72991295, "epoch": 0.18429731100963978, "grad_norm": 2.078125, "learning_rate": 9.951509487825091e-06, "loss": 1.09993572, "memory(GiB)": 368.61, "step": 7265, "train_speed(iter/s)": 0.202234 }, { "acc": 0.73011723, "epoch": 0.18442415017757482, "grad_norm": 2.40625, "learning_rate": 9.95136369135371e-06, "loss": 1.09661884, "memory(GiB)": 368.61, "step": 7270, "train_speed(iter/s)": 0.202246 }, { "acc": 0.74305015, "epoch": 0.1845509893455099, "grad_norm": 2.125, "learning_rate": 9.951217677099248e-06, "loss": 1.08406382, "memory(GiB)": 368.61, "step": 7275, "train_speed(iter/s)": 0.202274 }, { "acc": 0.73562961, "epoch": 0.18467782851344494, "grad_norm": 2.109375, "learning_rate": 9.951071445068125e-06, "loss": 1.04280014, "memory(GiB)": 368.61, "step": 7280, "train_speed(iter/s)": 0.202296 }, { "acc": 0.73486509, "epoch": 0.18480466768138, "grad_norm": 2.21875, "learning_rate": 9.950924995266778e-06, "loss": 1.07232609, "memory(GiB)": 368.61, "step": 7285, "train_speed(iter/s)": 0.202312 }, { "acc": 0.73950415, "epoch": 0.18493150684931506, "grad_norm": 2.15625, "learning_rate": 9.950778327701643e-06, "loss": 1.08280439, "memory(GiB)": 368.61, "step": 7290, "train_speed(iter/s)": 0.202334 }, { "acc": 0.75298247, "epoch": 0.18505834601725013, "grad_norm": 1.90625, "learning_rate": 9.950631442379175e-06, "loss": 1.03587799, "memory(GiB)": 368.61, "step": 7295, "train_speed(iter/s)": 0.202359 }, { "acc": 0.73914847, "epoch": 0.18518518518518517, "grad_norm": 2.59375, "learning_rate": 9.950484339305832e-06, "loss": 1.04084187, "memory(GiB)": 368.61, "step": 7300, "train_speed(iter/s)": 0.202366 }, { "acc": 0.74043007, "epoch": 0.18531202435312025, "grad_norm": 2.3125, "learning_rate": 9.950337018488086e-06, "loss": 1.07902308, "memory(GiB)": 368.61, "step": 7305, "train_speed(iter/s)": 0.202385 }, { "acc": 0.73238115, "epoch": 0.1854388635210553, "grad_norm": 2.015625, "learning_rate": 9.950189479932417e-06, "loss": 1.07820024, "memory(GiB)": 368.61, "step": 7310, "train_speed(iter/s)": 0.202407 }, { "acc": 0.74081268, "epoch": 0.18556570268899036, "grad_norm": 2.0625, "learning_rate": 9.950041723645312e-06, "loss": 1.07857361, "memory(GiB)": 368.61, "step": 7315, "train_speed(iter/s)": 0.202414 }, { "acc": 0.73511872, "epoch": 0.1856925418569254, "grad_norm": 1.9765625, "learning_rate": 9.949893749633273e-06, "loss": 1.0719779, "memory(GiB)": 368.61, "step": 7320, "train_speed(iter/s)": 0.20243 }, { "acc": 0.73779483, "epoch": 0.18581938102486048, "grad_norm": 2.140625, "learning_rate": 9.949745557902806e-06, "loss": 1.12587271, "memory(GiB)": 368.61, "step": 7325, "train_speed(iter/s)": 0.202455 }, { "acc": 0.72937961, "epoch": 0.18594622019279552, "grad_norm": 1.859375, "learning_rate": 9.949597148460433e-06, "loss": 1.10185785, "memory(GiB)": 368.61, "step": 7330, "train_speed(iter/s)": 0.202473 }, { "acc": 0.73504219, "epoch": 0.1860730593607306, "grad_norm": 2.375, "learning_rate": 9.949448521312676e-06, "loss": 1.12442722, "memory(GiB)": 368.61, "step": 7335, "train_speed(iter/s)": 0.202468 }, { "acc": 0.73645687, "epoch": 0.18619989852866564, "grad_norm": 2.75, "learning_rate": 9.949299676466077e-06, "loss": 1.08462029, "memory(GiB)": 368.61, "step": 7340, "train_speed(iter/s)": 0.20249 }, { "acc": 0.74195147, "epoch": 0.1863267376966007, "grad_norm": 2.34375, "learning_rate": 9.94915061392718e-06, "loss": 1.01908388, "memory(GiB)": 368.61, "step": 7345, "train_speed(iter/s)": 0.202505 }, { "acc": 0.74993987, "epoch": 0.18645357686453576, "grad_norm": 2.296875, "learning_rate": 9.949001333702543e-06, "loss": 1.01290884, "memory(GiB)": 368.61, "step": 7350, "train_speed(iter/s)": 0.202529 }, { "acc": 0.72713661, "epoch": 0.18658041603247083, "grad_norm": 2.46875, "learning_rate": 9.948851835798732e-06, "loss": 1.17365913, "memory(GiB)": 368.61, "step": 7355, "train_speed(iter/s)": 0.202556 }, { "acc": 0.72747498, "epoch": 0.18670725520040587, "grad_norm": 2.1875, "learning_rate": 9.948702120222323e-06, "loss": 1.09662018, "memory(GiB)": 368.61, "step": 7360, "train_speed(iter/s)": 0.202577 }, { "acc": 0.75277786, "epoch": 0.18683409436834095, "grad_norm": 2.140625, "learning_rate": 9.9485521869799e-06, "loss": 1.0121151, "memory(GiB)": 368.61, "step": 7365, "train_speed(iter/s)": 0.202584 }, { "acc": 0.75506058, "epoch": 0.186960933536276, "grad_norm": 2.015625, "learning_rate": 9.948402036078057e-06, "loss": 1.00780849, "memory(GiB)": 368.61, "step": 7370, "train_speed(iter/s)": 0.2026 }, { "acc": 0.72812967, "epoch": 0.18708777270421106, "grad_norm": 2.15625, "learning_rate": 9.948251667523401e-06, "loss": 1.11597099, "memory(GiB)": 368.61, "step": 7375, "train_speed(iter/s)": 0.202621 }, { "acc": 0.73096933, "epoch": 0.1872146118721461, "grad_norm": 2.03125, "learning_rate": 9.948101081322544e-06, "loss": 1.04211912, "memory(GiB)": 368.61, "step": 7380, "train_speed(iter/s)": 0.202637 }, { "acc": 0.74027038, "epoch": 0.18734145104008118, "grad_norm": 1.9609375, "learning_rate": 9.947950277482109e-06, "loss": 1.03890142, "memory(GiB)": 368.61, "step": 7385, "train_speed(iter/s)": 0.202664 }, { "acc": 0.73945107, "epoch": 0.18746829020801623, "grad_norm": 2.078125, "learning_rate": 9.94779925600873e-06, "loss": 1.0905653, "memory(GiB)": 368.61, "step": 7390, "train_speed(iter/s)": 0.202688 }, { "acc": 0.74880219, "epoch": 0.1875951293759513, "grad_norm": 2.046875, "learning_rate": 9.947648016909048e-06, "loss": 1.04554758, "memory(GiB)": 368.61, "step": 7395, "train_speed(iter/s)": 0.20272 }, { "acc": 0.74070215, "epoch": 0.18772196854388634, "grad_norm": 2.21875, "learning_rate": 9.947496560189717e-06, "loss": 1.08187122, "memory(GiB)": 368.61, "step": 7400, "train_speed(iter/s)": 0.202738 }, { "acc": 0.73150721, "epoch": 0.18784880771182141, "grad_norm": 2.0, "learning_rate": 9.9473448858574e-06, "loss": 1.08858099, "memory(GiB)": 368.61, "step": 7405, "train_speed(iter/s)": 0.202761 }, { "acc": 0.72204332, "epoch": 0.18797564687975646, "grad_norm": 2.4375, "learning_rate": 9.947192993918765e-06, "loss": 1.12560186, "memory(GiB)": 368.61, "step": 7410, "train_speed(iter/s)": 0.202778 }, { "acc": 0.73017545, "epoch": 0.18810248604769153, "grad_norm": 2.53125, "learning_rate": 9.947040884380496e-06, "loss": 1.08032608, "memory(GiB)": 368.61, "step": 7415, "train_speed(iter/s)": 0.202791 }, { "acc": 0.74636707, "epoch": 0.18822932521562658, "grad_norm": 2.328125, "learning_rate": 9.946888557249281e-06, "loss": 1.03786373, "memory(GiB)": 368.61, "step": 7420, "train_speed(iter/s)": 0.202812 }, { "acc": 0.73984356, "epoch": 0.18835616438356165, "grad_norm": 2.03125, "learning_rate": 9.946736012531821e-06, "loss": 1.03640976, "memory(GiB)": 368.61, "step": 7425, "train_speed(iter/s)": 0.202838 }, { "acc": 0.74628181, "epoch": 0.1884830035514967, "grad_norm": 2.078125, "learning_rate": 9.946583250234826e-06, "loss": 1.05456181, "memory(GiB)": 368.61, "step": 7430, "train_speed(iter/s)": 0.202863 }, { "acc": 0.74217167, "epoch": 0.18860984271943176, "grad_norm": 2.046875, "learning_rate": 9.946430270365015e-06, "loss": 1.01388683, "memory(GiB)": 368.61, "step": 7435, "train_speed(iter/s)": 0.202885 }, { "acc": 0.74590178, "epoch": 0.1887366818873668, "grad_norm": 2.203125, "learning_rate": 9.946277072929115e-06, "loss": 1.05300007, "memory(GiB)": 368.61, "step": 7440, "train_speed(iter/s)": 0.202896 }, { "acc": 0.74052162, "epoch": 0.18886352105530188, "grad_norm": 1.9765625, "learning_rate": 9.946123657933867e-06, "loss": 1.04892368, "memory(GiB)": 368.61, "step": 7445, "train_speed(iter/s)": 0.202896 }, { "acc": 0.73084764, "epoch": 0.18899036022323693, "grad_norm": 2.234375, "learning_rate": 9.945970025386018e-06, "loss": 1.03792362, "memory(GiB)": 368.61, "step": 7450, "train_speed(iter/s)": 0.202914 }, { "acc": 0.7391387, "epoch": 0.189117199391172, "grad_norm": 2.6875, "learning_rate": 9.945816175292326e-06, "loss": 1.03101673, "memory(GiB)": 368.61, "step": 7455, "train_speed(iter/s)": 0.202925 }, { "acc": 0.72829857, "epoch": 0.18924403855910704, "grad_norm": 2.171875, "learning_rate": 9.945662107659554e-06, "loss": 1.11133137, "memory(GiB)": 368.61, "step": 7460, "train_speed(iter/s)": 0.202937 }, { "acc": 0.74490595, "epoch": 0.18937087772704211, "grad_norm": 2.421875, "learning_rate": 9.945507822494485e-06, "loss": 1.05077209, "memory(GiB)": 368.61, "step": 7465, "train_speed(iter/s)": 0.202956 }, { "acc": 0.7406354, "epoch": 0.18949771689497716, "grad_norm": 2.625, "learning_rate": 9.9453533198039e-06, "loss": 1.04026642, "memory(GiB)": 368.61, "step": 7470, "train_speed(iter/s)": 0.202971 }, { "acc": 0.72728672, "epoch": 0.18962455606291223, "grad_norm": 2.15625, "learning_rate": 9.945198599594598e-06, "loss": 1.06321621, "memory(GiB)": 368.61, "step": 7475, "train_speed(iter/s)": 0.202996 }, { "acc": 0.72514048, "epoch": 0.18975139523084728, "grad_norm": 2.125, "learning_rate": 9.945043661873381e-06, "loss": 1.11334782, "memory(GiB)": 368.61, "step": 7480, "train_speed(iter/s)": 0.203011 }, { "acc": 0.74027748, "epoch": 0.18987823439878235, "grad_norm": 1.9453125, "learning_rate": 9.944888506647066e-06, "loss": 1.05033588, "memory(GiB)": 368.61, "step": 7485, "train_speed(iter/s)": 0.203013 }, { "acc": 0.74355307, "epoch": 0.1900050735667174, "grad_norm": 2.34375, "learning_rate": 9.944733133922479e-06, "loss": 1.07819233, "memory(GiB)": 368.61, "step": 7490, "train_speed(iter/s)": 0.203034 }, { "acc": 0.74185591, "epoch": 0.19013191273465246, "grad_norm": 2.34375, "learning_rate": 9.944577543706451e-06, "loss": 1.04711056, "memory(GiB)": 368.61, "step": 7495, "train_speed(iter/s)": 0.20305 }, { "acc": 0.73267813, "epoch": 0.1902587519025875, "grad_norm": 2.3125, "learning_rate": 9.944421736005825e-06, "loss": 1.05258026, "memory(GiB)": 368.61, "step": 7500, "train_speed(iter/s)": 0.203079 }, { "acc": 0.72408943, "epoch": 0.19038559107052258, "grad_norm": 2.09375, "learning_rate": 9.944265710827459e-06, "loss": 1.14096203, "memory(GiB)": 368.61, "step": 7505, "train_speed(iter/s)": 0.203104 }, { "acc": 0.7417985, "epoch": 0.19051243023845763, "grad_norm": 2.0625, "learning_rate": 9.944109468178208e-06, "loss": 1.03365746, "memory(GiB)": 368.61, "step": 7510, "train_speed(iter/s)": 0.203118 }, { "acc": 0.73999653, "epoch": 0.1906392694063927, "grad_norm": 1.8046875, "learning_rate": 9.943953008064953e-06, "loss": 1.05270844, "memory(GiB)": 368.61, "step": 7515, "train_speed(iter/s)": 0.203135 }, { "acc": 0.74177599, "epoch": 0.19076610857432774, "grad_norm": 2.5625, "learning_rate": 9.94379633049457e-06, "loss": 1.05415869, "memory(GiB)": 368.61, "step": 7520, "train_speed(iter/s)": 0.203146 }, { "acc": 0.7461246, "epoch": 0.19089294774226281, "grad_norm": 2.015625, "learning_rate": 9.943639435473952e-06, "loss": 1.04645433, "memory(GiB)": 368.61, "step": 7525, "train_speed(iter/s)": 0.20317 }, { "acc": 0.74612832, "epoch": 0.19101978691019786, "grad_norm": 2.328125, "learning_rate": 9.94348232301e-06, "loss": 1.05182743, "memory(GiB)": 368.61, "step": 7530, "train_speed(iter/s)": 0.203188 }, { "acc": 0.72840366, "epoch": 0.19114662607813293, "grad_norm": 2.421875, "learning_rate": 9.943324993109624e-06, "loss": 1.07835684, "memory(GiB)": 368.61, "step": 7535, "train_speed(iter/s)": 0.203209 }, { "acc": 0.74634266, "epoch": 0.19127346524606798, "grad_norm": 2.28125, "learning_rate": 9.943167445779745e-06, "loss": 1.02826252, "memory(GiB)": 368.61, "step": 7540, "train_speed(iter/s)": 0.203225 }, { "acc": 0.75064068, "epoch": 0.19140030441400305, "grad_norm": 2.296875, "learning_rate": 9.94300968102729e-06, "loss": 1.02140875, "memory(GiB)": 368.61, "step": 7545, "train_speed(iter/s)": 0.203244 }, { "acc": 0.74970427, "epoch": 0.1915271435819381, "grad_norm": 2.0625, "learning_rate": 9.942851698859204e-06, "loss": 1.05646038, "memory(GiB)": 368.61, "step": 7550, "train_speed(iter/s)": 0.203267 }, { "acc": 0.75336809, "epoch": 0.19165398274987316, "grad_norm": 2.25, "learning_rate": 9.94269349928243e-06, "loss": 0.99458694, "memory(GiB)": 368.61, "step": 7555, "train_speed(iter/s)": 0.203279 }, { "acc": 0.75104904, "epoch": 0.1917808219178082, "grad_norm": 2.0625, "learning_rate": 9.942535082303927e-06, "loss": 1.01717939, "memory(GiB)": 368.61, "step": 7560, "train_speed(iter/s)": 0.203284 }, { "acc": 0.731248, "epoch": 0.19190766108574328, "grad_norm": 2.09375, "learning_rate": 9.942376447930666e-06, "loss": 1.05929279, "memory(GiB)": 368.61, "step": 7565, "train_speed(iter/s)": 0.203298 }, { "acc": 0.75526962, "epoch": 0.19203450025367833, "grad_norm": 2.03125, "learning_rate": 9.942217596169623e-06, "loss": 0.96555233, "memory(GiB)": 368.61, "step": 7570, "train_speed(iter/s)": 0.203315 }, { "acc": 0.7234045, "epoch": 0.1921613394216134, "grad_norm": 1.9609375, "learning_rate": 9.942058527027785e-06, "loss": 1.09000149, "memory(GiB)": 368.61, "step": 7575, "train_speed(iter/s)": 0.203329 }, { "acc": 0.72499733, "epoch": 0.19228817858954844, "grad_norm": 2.125, "learning_rate": 9.941899240512147e-06, "loss": 1.0643384, "memory(GiB)": 368.61, "step": 7580, "train_speed(iter/s)": 0.203345 }, { "acc": 0.7272294, "epoch": 0.19241501775748351, "grad_norm": 2.484375, "learning_rate": 9.941739736629716e-06, "loss": 1.08887701, "memory(GiB)": 368.61, "step": 7585, "train_speed(iter/s)": 0.203366 }, { "acc": 0.74458904, "epoch": 0.19254185692541856, "grad_norm": 2.265625, "learning_rate": 9.941580015387509e-06, "loss": 1.01491108, "memory(GiB)": 368.61, "step": 7590, "train_speed(iter/s)": 0.20337 }, { "acc": 0.71552525, "epoch": 0.19266869609335363, "grad_norm": 2.96875, "learning_rate": 9.94142007679255e-06, "loss": 1.14177017, "memory(GiB)": 368.61, "step": 7595, "train_speed(iter/s)": 0.203399 }, { "acc": 0.75215721, "epoch": 0.19279553526128868, "grad_norm": 2.515625, "learning_rate": 9.941259920851874e-06, "loss": 0.99483509, "memory(GiB)": 368.61, "step": 7600, "train_speed(iter/s)": 0.20342 }, { "acc": 0.74100533, "epoch": 0.19292237442922375, "grad_norm": 3.21875, "learning_rate": 9.941099547572527e-06, "loss": 1.07972612, "memory(GiB)": 368.61, "step": 7605, "train_speed(iter/s)": 0.203447 }, { "acc": 0.73801222, "epoch": 0.1930492135971588, "grad_norm": 1.9765625, "learning_rate": 9.94093895696156e-06, "loss": 1.05415497, "memory(GiB)": 368.61, "step": 7610, "train_speed(iter/s)": 0.203455 }, { "acc": 0.732583, "epoch": 0.19317605276509386, "grad_norm": 2.359375, "learning_rate": 9.940778149026038e-06, "loss": 1.07361317, "memory(GiB)": 368.61, "step": 7615, "train_speed(iter/s)": 0.20347 }, { "acc": 0.73484507, "epoch": 0.1933028919330289, "grad_norm": 2.109375, "learning_rate": 9.940617123773036e-06, "loss": 1.06005898, "memory(GiB)": 368.61, "step": 7620, "train_speed(iter/s)": 0.203491 }, { "acc": 0.75035095, "epoch": 0.19342973110096398, "grad_norm": 2.390625, "learning_rate": 9.940455881209632e-06, "loss": 0.97193785, "memory(GiB)": 368.61, "step": 7625, "train_speed(iter/s)": 0.203516 }, { "acc": 0.7423152, "epoch": 0.19355657026889903, "grad_norm": 2.171875, "learning_rate": 9.940294421342922e-06, "loss": 0.99105663, "memory(GiB)": 368.61, "step": 7630, "train_speed(iter/s)": 0.203523 }, { "acc": 0.72413521, "epoch": 0.1936834094368341, "grad_norm": 2.375, "learning_rate": 9.940132744180007e-06, "loss": 1.12153664, "memory(GiB)": 368.61, "step": 7635, "train_speed(iter/s)": 0.203536 }, { "acc": 0.74596453, "epoch": 0.19381024860476914, "grad_norm": 2.109375, "learning_rate": 9.939970849727995e-06, "loss": 0.99884968, "memory(GiB)": 368.61, "step": 7640, "train_speed(iter/s)": 0.203549 }, { "acc": 0.72430234, "epoch": 0.19393708777270421, "grad_norm": 2.3125, "learning_rate": 9.939808737994013e-06, "loss": 1.15649948, "memory(GiB)": 368.61, "step": 7645, "train_speed(iter/s)": 0.203569 }, { "acc": 0.74080925, "epoch": 0.19406392694063926, "grad_norm": 2.0625, "learning_rate": 9.939646408985186e-06, "loss": 1.02432337, "memory(GiB)": 368.61, "step": 7650, "train_speed(iter/s)": 0.203591 }, { "acc": 0.74553499, "epoch": 0.19419076610857433, "grad_norm": 2.421875, "learning_rate": 9.939483862708658e-06, "loss": 1.01980934, "memory(GiB)": 368.61, "step": 7655, "train_speed(iter/s)": 0.203609 }, { "acc": 0.73492432, "epoch": 0.19431760527650938, "grad_norm": 2.234375, "learning_rate": 9.939321099171575e-06, "loss": 1.03268127, "memory(GiB)": 368.61, "step": 7660, "train_speed(iter/s)": 0.203621 }, { "acc": 0.73197517, "epoch": 0.19444444444444445, "grad_norm": 1.984375, "learning_rate": 9.939158118381097e-06, "loss": 1.07259102, "memory(GiB)": 368.61, "step": 7665, "train_speed(iter/s)": 0.203631 }, { "acc": 0.73771963, "epoch": 0.1945712836123795, "grad_norm": 2.3125, "learning_rate": 9.938994920344395e-06, "loss": 1.05658627, "memory(GiB)": 368.61, "step": 7670, "train_speed(iter/s)": 0.203645 }, { "acc": 0.7472672, "epoch": 0.19469812278031456, "grad_norm": 2.046875, "learning_rate": 9.938831505068645e-06, "loss": 1.00339527, "memory(GiB)": 368.61, "step": 7675, "train_speed(iter/s)": 0.203675 }, { "acc": 0.73549643, "epoch": 0.1948249619482496, "grad_norm": 2.25, "learning_rate": 9.938667872561035e-06, "loss": 1.02236977, "memory(GiB)": 368.61, "step": 7680, "train_speed(iter/s)": 0.203678 }, { "acc": 0.7538763, "epoch": 0.19495180111618468, "grad_norm": 2.078125, "learning_rate": 9.938504022828762e-06, "loss": 1.00232639, "memory(GiB)": 368.61, "step": 7685, "train_speed(iter/s)": 0.203692 }, { "acc": 0.73408561, "epoch": 0.19507864028411973, "grad_norm": 1.8125, "learning_rate": 9.938339955879033e-06, "loss": 1.06536369, "memory(GiB)": 368.61, "step": 7690, "train_speed(iter/s)": 0.203697 }, { "acc": 0.7268714, "epoch": 0.1952054794520548, "grad_norm": 2.03125, "learning_rate": 9.938175671719064e-06, "loss": 1.10999508, "memory(GiB)": 368.61, "step": 7695, "train_speed(iter/s)": 0.203714 }, { "acc": 0.72697062, "epoch": 0.19533231861998984, "grad_norm": 2.078125, "learning_rate": 9.938011170356083e-06, "loss": 1.04032974, "memory(GiB)": 368.61, "step": 7700, "train_speed(iter/s)": 0.203734 }, { "acc": 0.73324399, "epoch": 0.19545915778792491, "grad_norm": 3.390625, "learning_rate": 9.937846451797324e-06, "loss": 1.06809826, "memory(GiB)": 368.61, "step": 7705, "train_speed(iter/s)": 0.203743 }, { "acc": 0.74828453, "epoch": 0.19558599695585996, "grad_norm": 2.859375, "learning_rate": 9.93768151605003e-06, "loss": 1.01693249, "memory(GiB)": 368.61, "step": 7710, "train_speed(iter/s)": 0.203754 }, { "acc": 0.74603066, "epoch": 0.19571283612379503, "grad_norm": 2.390625, "learning_rate": 9.93751636312146e-06, "loss": 1.01469383, "memory(GiB)": 368.61, "step": 7715, "train_speed(iter/s)": 0.20377 }, { "acc": 0.72379127, "epoch": 0.19583967529173008, "grad_norm": 2.625, "learning_rate": 9.937350993018875e-06, "loss": 1.11140051, "memory(GiB)": 368.61, "step": 7720, "train_speed(iter/s)": 0.203787 }, { "acc": 0.74114113, "epoch": 0.19596651445966515, "grad_norm": 2.015625, "learning_rate": 9.93718540574955e-06, "loss": 1.04791794, "memory(GiB)": 368.61, "step": 7725, "train_speed(iter/s)": 0.203808 }, { "acc": 0.72533302, "epoch": 0.1960933536276002, "grad_norm": 2.015625, "learning_rate": 9.937019601320768e-06, "loss": 1.11380329, "memory(GiB)": 368.61, "step": 7730, "train_speed(iter/s)": 0.203827 }, { "acc": 0.72360449, "epoch": 0.19622019279553526, "grad_norm": 2.296875, "learning_rate": 9.936853579739823e-06, "loss": 1.06785622, "memory(GiB)": 368.61, "step": 7735, "train_speed(iter/s)": 0.203842 }, { "acc": 0.73539972, "epoch": 0.1963470319634703, "grad_norm": 2.6875, "learning_rate": 9.936687341014015e-06, "loss": 1.0935461, "memory(GiB)": 368.61, "step": 7740, "train_speed(iter/s)": 0.203859 }, { "acc": 0.74174266, "epoch": 0.19647387113140538, "grad_norm": 2.078125, "learning_rate": 9.936520885150655e-06, "loss": 1.01211739, "memory(GiB)": 368.61, "step": 7745, "train_speed(iter/s)": 0.203876 }, { "acc": 0.73229256, "epoch": 0.19660071029934043, "grad_norm": 2.125, "learning_rate": 9.936354212157068e-06, "loss": 1.12805929, "memory(GiB)": 368.61, "step": 7750, "train_speed(iter/s)": 0.203889 }, { "acc": 0.7312758, "epoch": 0.1967275494672755, "grad_norm": 1.8828125, "learning_rate": 9.936187322040584e-06, "loss": 1.09999924, "memory(GiB)": 368.61, "step": 7755, "train_speed(iter/s)": 0.203902 }, { "acc": 0.72799311, "epoch": 0.19685438863521054, "grad_norm": 2.03125, "learning_rate": 9.936020214808544e-06, "loss": 1.09908581, "memory(GiB)": 368.61, "step": 7760, "train_speed(iter/s)": 0.203918 }, { "acc": 0.74430766, "epoch": 0.19698122780314561, "grad_norm": 2.140625, "learning_rate": 9.935852890468297e-06, "loss": 1.05290833, "memory(GiB)": 368.61, "step": 7765, "train_speed(iter/s)": 0.203938 }, { "acc": 0.72967048, "epoch": 0.19710806697108066, "grad_norm": 2.859375, "learning_rate": 9.935685349027201e-06, "loss": 1.05484562, "memory(GiB)": 368.61, "step": 7770, "train_speed(iter/s)": 0.203959 }, { "acc": 0.73263092, "epoch": 0.19723490613901573, "grad_norm": 2.125, "learning_rate": 9.935517590492627e-06, "loss": 1.05446892, "memory(GiB)": 368.61, "step": 7775, "train_speed(iter/s)": 0.203978 }, { "acc": 0.74590778, "epoch": 0.19736174530695078, "grad_norm": 2.15625, "learning_rate": 9.935349614871957e-06, "loss": 1.0232502, "memory(GiB)": 368.61, "step": 7780, "train_speed(iter/s)": 0.203987 }, { "acc": 0.74418645, "epoch": 0.19748858447488585, "grad_norm": 2.234375, "learning_rate": 9.935181422172574e-06, "loss": 1.05656729, "memory(GiB)": 368.61, "step": 7785, "train_speed(iter/s)": 0.20401 }, { "acc": 0.73084288, "epoch": 0.1976154236428209, "grad_norm": 2.546875, "learning_rate": 9.935013012401878e-06, "loss": 1.118993, "memory(GiB)": 368.61, "step": 7790, "train_speed(iter/s)": 0.204019 }, { "acc": 0.74380503, "epoch": 0.19774226281075596, "grad_norm": 2.203125, "learning_rate": 9.934844385567275e-06, "loss": 1.01826639, "memory(GiB)": 368.61, "step": 7795, "train_speed(iter/s)": 0.204042 }, { "acc": 0.7379343, "epoch": 0.197869101978691, "grad_norm": 2.453125, "learning_rate": 9.934675541676186e-06, "loss": 1.02244816, "memory(GiB)": 368.61, "step": 7800, "train_speed(iter/s)": 0.204064 }, { "acc": 0.74984312, "epoch": 0.19799594114662608, "grad_norm": 2.546875, "learning_rate": 9.934506480736034e-06, "loss": 1.00490017, "memory(GiB)": 368.61, "step": 7805, "train_speed(iter/s)": 0.204076 }, { "acc": 0.74664955, "epoch": 0.19812278031456113, "grad_norm": 2.234375, "learning_rate": 9.934337202754257e-06, "loss": 1.07197647, "memory(GiB)": 368.61, "step": 7810, "train_speed(iter/s)": 0.204097 }, { "acc": 0.74057035, "epoch": 0.1982496194824962, "grad_norm": 2.46875, "learning_rate": 9.934167707738298e-06, "loss": 1.07698517, "memory(GiB)": 368.61, "step": 7815, "train_speed(iter/s)": 0.204123 }, { "acc": 0.73835239, "epoch": 0.19837645865043124, "grad_norm": 2.4375, "learning_rate": 9.933997995695615e-06, "loss": 1.05317554, "memory(GiB)": 368.61, "step": 7820, "train_speed(iter/s)": 0.204132 }, { "acc": 0.7345952, "epoch": 0.19850329781836631, "grad_norm": 2.28125, "learning_rate": 9.93382806663367e-06, "loss": 1.12739859, "memory(GiB)": 368.61, "step": 7825, "train_speed(iter/s)": 0.204151 }, { "acc": 0.7402669, "epoch": 0.19863013698630136, "grad_norm": 2.09375, "learning_rate": 9.933657920559939e-06, "loss": 1.03918209, "memory(GiB)": 368.61, "step": 7830, "train_speed(iter/s)": 0.204169 }, { "acc": 0.73546214, "epoch": 0.19875697615423643, "grad_norm": 2.03125, "learning_rate": 9.933487557481905e-06, "loss": 1.0697114, "memory(GiB)": 368.61, "step": 7835, "train_speed(iter/s)": 0.204188 }, { "acc": 0.74237475, "epoch": 0.19888381532217148, "grad_norm": 2.15625, "learning_rate": 9.933316977407063e-06, "loss": 1.13945084, "memory(GiB)": 368.61, "step": 7840, "train_speed(iter/s)": 0.204208 }, { "acc": 0.7589776, "epoch": 0.19901065449010655, "grad_norm": 2.015625, "learning_rate": 9.933146180342914e-06, "loss": 0.96657524, "memory(GiB)": 368.61, "step": 7845, "train_speed(iter/s)": 0.204217 }, { "acc": 0.74442167, "epoch": 0.1991374936580416, "grad_norm": 1.9296875, "learning_rate": 9.932975166296972e-06, "loss": 1.06559849, "memory(GiB)": 368.61, "step": 7850, "train_speed(iter/s)": 0.204232 }, { "acc": 0.73970494, "epoch": 0.19926433282597666, "grad_norm": 2.390625, "learning_rate": 9.932803935276757e-06, "loss": 1.0272295, "memory(GiB)": 368.61, "step": 7855, "train_speed(iter/s)": 0.20424 }, { "acc": 0.73216076, "epoch": 0.1993911719939117, "grad_norm": 2.671875, "learning_rate": 9.932632487289802e-06, "loss": 1.10660849, "memory(GiB)": 368.61, "step": 7860, "train_speed(iter/s)": 0.204257 }, { "acc": 0.73391342, "epoch": 0.19951801116184678, "grad_norm": 2.203125, "learning_rate": 9.932460822343649e-06, "loss": 1.04677219, "memory(GiB)": 368.61, "step": 7865, "train_speed(iter/s)": 0.204263 }, { "acc": 0.72961063, "epoch": 0.19964485032978183, "grad_norm": 2.296875, "learning_rate": 9.932288940445845e-06, "loss": 1.16442251, "memory(GiB)": 368.61, "step": 7870, "train_speed(iter/s)": 0.204287 }, { "acc": 0.73574195, "epoch": 0.1997716894977169, "grad_norm": 2.359375, "learning_rate": 9.932116841603954e-06, "loss": 1.02913666, "memory(GiB)": 368.61, "step": 7875, "train_speed(iter/s)": 0.204308 }, { "acc": 0.74415898, "epoch": 0.19989852866565194, "grad_norm": 3.03125, "learning_rate": 9.931944525825542e-06, "loss": 1.07301855, "memory(GiB)": 368.61, "step": 7880, "train_speed(iter/s)": 0.204324 }, { "acc": 0.73766694, "epoch": 0.20002536783358701, "grad_norm": 2.46875, "learning_rate": 9.931771993118191e-06, "loss": 0.98732738, "memory(GiB)": 368.61, "step": 7885, "train_speed(iter/s)": 0.204324 }, { "acc": 0.73742828, "epoch": 0.20015220700152206, "grad_norm": 1.7265625, "learning_rate": 9.931599243489489e-06, "loss": 1.08116436, "memory(GiB)": 368.61, "step": 7890, "train_speed(iter/s)": 0.204348 }, { "acc": 0.73524456, "epoch": 0.20027904616945713, "grad_norm": 2.5, "learning_rate": 9.931426276947037e-06, "loss": 1.03848324, "memory(GiB)": 368.61, "step": 7895, "train_speed(iter/s)": 0.204371 }, { "acc": 0.74055867, "epoch": 0.20040588533739218, "grad_norm": 2.046875, "learning_rate": 9.931253093498437e-06, "loss": 1.05212421, "memory(GiB)": 368.61, "step": 7900, "train_speed(iter/s)": 0.204387 }, { "acc": 0.73436651, "epoch": 0.20053272450532725, "grad_norm": 1.6328125, "learning_rate": 9.93107969315131e-06, "loss": 1.03284588, "memory(GiB)": 368.61, "step": 7905, "train_speed(iter/s)": 0.204407 }, { "acc": 0.75149336, "epoch": 0.2006595636732623, "grad_norm": 1.875, "learning_rate": 9.930906075913281e-06, "loss": 1.05239792, "memory(GiB)": 368.61, "step": 7910, "train_speed(iter/s)": 0.204424 }, { "acc": 0.75455484, "epoch": 0.20078640284119736, "grad_norm": 1.828125, "learning_rate": 9.93073224179199e-06, "loss": 1.00038147, "memory(GiB)": 368.61, "step": 7915, "train_speed(iter/s)": 0.204433 }, { "acc": 0.73104997, "epoch": 0.2009132420091324, "grad_norm": 2.5, "learning_rate": 9.93055819079508e-06, "loss": 1.11053219, "memory(GiB)": 368.61, "step": 7920, "train_speed(iter/s)": 0.204449 }, { "acc": 0.73997798, "epoch": 0.20104008117706748, "grad_norm": 1.90625, "learning_rate": 9.930383922930207e-06, "loss": 1.12212601, "memory(GiB)": 368.61, "step": 7925, "train_speed(iter/s)": 0.204468 }, { "acc": 0.73011503, "epoch": 0.20116692034500253, "grad_norm": 2.375, "learning_rate": 9.930209438205038e-06, "loss": 1.09065399, "memory(GiB)": 368.61, "step": 7930, "train_speed(iter/s)": 0.204485 }, { "acc": 0.73975239, "epoch": 0.2012937595129376, "grad_norm": 2.015625, "learning_rate": 9.930034736627245e-06, "loss": 1.08008633, "memory(GiB)": 368.61, "step": 7935, "train_speed(iter/s)": 0.20451 }, { "acc": 0.73974905, "epoch": 0.20142059868087264, "grad_norm": 2.140625, "learning_rate": 9.929859818204514e-06, "loss": 1.03885489, "memory(GiB)": 368.61, "step": 7940, "train_speed(iter/s)": 0.204529 }, { "acc": 0.74504786, "epoch": 0.20154743784880771, "grad_norm": 2.03125, "learning_rate": 9.929684682944538e-06, "loss": 0.97312603, "memory(GiB)": 368.61, "step": 7945, "train_speed(iter/s)": 0.204533 }, { "acc": 0.73175311, "epoch": 0.20167427701674276, "grad_norm": 2.171875, "learning_rate": 9.929509330855018e-06, "loss": 1.09335823, "memory(GiB)": 368.61, "step": 7950, "train_speed(iter/s)": 0.204551 }, { "acc": 0.73347111, "epoch": 0.20180111618467783, "grad_norm": 1.9765625, "learning_rate": 9.929333761943672e-06, "loss": 1.09811106, "memory(GiB)": 368.61, "step": 7955, "train_speed(iter/s)": 0.204575 }, { "acc": 0.72816544, "epoch": 0.20192795535261288, "grad_norm": 2.5, "learning_rate": 9.929157976218218e-06, "loss": 1.10462208, "memory(GiB)": 368.61, "step": 7960, "train_speed(iter/s)": 0.204599 }, { "acc": 0.73388767, "epoch": 0.20205479452054795, "grad_norm": 2.453125, "learning_rate": 9.928981973686388e-06, "loss": 1.09889565, "memory(GiB)": 368.61, "step": 7965, "train_speed(iter/s)": 0.204617 }, { "acc": 0.75002298, "epoch": 0.202181633688483, "grad_norm": 1.96875, "learning_rate": 9.928805754355926e-06, "loss": 0.99405422, "memory(GiB)": 368.61, "step": 7970, "train_speed(iter/s)": 0.204632 }, { "acc": 0.73836737, "epoch": 0.20230847285641806, "grad_norm": 2.4375, "learning_rate": 9.92862931823458e-06, "loss": 1.02586575, "memory(GiB)": 368.61, "step": 7975, "train_speed(iter/s)": 0.204654 }, { "acc": 0.74087706, "epoch": 0.2024353120243531, "grad_norm": 2.359375, "learning_rate": 9.928452665330113e-06, "loss": 1.08718567, "memory(GiB)": 368.61, "step": 7980, "train_speed(iter/s)": 0.204638 }, { "acc": 0.72791576, "epoch": 0.20256215119228818, "grad_norm": 2.109375, "learning_rate": 9.928275795650293e-06, "loss": 1.11481876, "memory(GiB)": 368.61, "step": 7985, "train_speed(iter/s)": 0.204646 }, { "acc": 0.75757236, "epoch": 0.20268899036022323, "grad_norm": 2.140625, "learning_rate": 9.928098709202901e-06, "loss": 1.02973213, "memory(GiB)": 368.61, "step": 7990, "train_speed(iter/s)": 0.204659 }, { "acc": 0.73862848, "epoch": 0.2028158295281583, "grad_norm": 2.828125, "learning_rate": 9.927921405995727e-06, "loss": 1.08868418, "memory(GiB)": 368.61, "step": 7995, "train_speed(iter/s)": 0.204676 }, { "acc": 0.74650097, "epoch": 0.20294266869609334, "grad_norm": 2.21875, "learning_rate": 9.927743886036566e-06, "loss": 1.08008976, "memory(GiB)": 368.61, "step": 8000, "train_speed(iter/s)": 0.204683 }, { "epoch": 0.20294266869609334, "eval_acc": 0.7275572268104677, "eval_loss": 1.0250566005706787, "eval_runtime": 384.1639, "eval_samples_per_second": 16.581, "eval_steps_per_second": 8.291, "step": 8000 }, { "acc": 0.75121603, "epoch": 0.20306950786402841, "grad_norm": 2.0625, "learning_rate": 9.927566149333228e-06, "loss": 1.02341576, "memory(GiB)": 368.61, "step": 8005, "train_speed(iter/s)": 0.20103 }, { "acc": 0.73391623, "epoch": 0.20319634703196346, "grad_norm": 2.3125, "learning_rate": 9.92738819589353e-06, "loss": 1.04912872, "memory(GiB)": 368.61, "step": 8010, "train_speed(iter/s)": 0.201046 }, { "acc": 0.73907423, "epoch": 0.20332318619989853, "grad_norm": 2.4375, "learning_rate": 9.927210025725301e-06, "loss": 1.04395809, "memory(GiB)": 368.61, "step": 8015, "train_speed(iter/s)": 0.201062 }, { "acc": 0.73824015, "epoch": 0.20345002536783358, "grad_norm": 2.390625, "learning_rate": 9.927031638836377e-06, "loss": 1.09001026, "memory(GiB)": 368.61, "step": 8020, "train_speed(iter/s)": 0.201079 }, { "acc": 0.73831339, "epoch": 0.20357686453576865, "grad_norm": 2.640625, "learning_rate": 9.926853035234603e-06, "loss": 1.05752716, "memory(GiB)": 368.61, "step": 8025, "train_speed(iter/s)": 0.201098 }, { "acc": 0.74540958, "epoch": 0.2037037037037037, "grad_norm": 2.5625, "learning_rate": 9.926674214927836e-06, "loss": 1.10951424, "memory(GiB)": 368.61, "step": 8030, "train_speed(iter/s)": 0.201109 }, { "acc": 0.73806286, "epoch": 0.20383054287163876, "grad_norm": 1.921875, "learning_rate": 9.926495177923941e-06, "loss": 1.02816515, "memory(GiB)": 368.61, "step": 8035, "train_speed(iter/s)": 0.201132 }, { "acc": 0.73828888, "epoch": 0.2039573820395738, "grad_norm": 2.390625, "learning_rate": 9.926315924230794e-06, "loss": 1.07243938, "memory(GiB)": 368.61, "step": 8040, "train_speed(iter/s)": 0.201149 }, { "acc": 0.73385949, "epoch": 0.20408422120750888, "grad_norm": 2.09375, "learning_rate": 9.926136453856277e-06, "loss": 1.0789196, "memory(GiB)": 368.61, "step": 8045, "train_speed(iter/s)": 0.201164 }, { "acc": 0.7496593, "epoch": 0.20421106037544393, "grad_norm": 2.015625, "learning_rate": 9.925956766808286e-06, "loss": 1.03662777, "memory(GiB)": 368.61, "step": 8050, "train_speed(iter/s)": 0.201186 }, { "acc": 0.72975416, "epoch": 0.204337899543379, "grad_norm": 2.03125, "learning_rate": 9.925776863094723e-06, "loss": 1.04749775, "memory(GiB)": 368.61, "step": 8055, "train_speed(iter/s)": 0.201208 }, { "acc": 0.7323422, "epoch": 0.20446473871131404, "grad_norm": 2.015625, "learning_rate": 9.9255967427235e-06, "loss": 1.08673916, "memory(GiB)": 368.61, "step": 8060, "train_speed(iter/s)": 0.201231 }, { "acc": 0.74931264, "epoch": 0.20459157787924911, "grad_norm": 2.109375, "learning_rate": 9.925416405702544e-06, "loss": 1.02774029, "memory(GiB)": 368.61, "step": 8065, "train_speed(iter/s)": 0.201253 }, { "acc": 0.74066076, "epoch": 0.20471841704718416, "grad_norm": 2.21875, "learning_rate": 9.925235852039783e-06, "loss": 1.06779289, "memory(GiB)": 368.61, "step": 8070, "train_speed(iter/s)": 0.201271 }, { "acc": 0.73530903, "epoch": 0.20484525621511923, "grad_norm": 2.21875, "learning_rate": 9.92505508174316e-06, "loss": 1.11241093, "memory(GiB)": 368.61, "step": 8075, "train_speed(iter/s)": 0.201291 }, { "acc": 0.74471655, "epoch": 0.20497209538305428, "grad_norm": 2.75, "learning_rate": 9.924874094820625e-06, "loss": 1.03051224, "memory(GiB)": 368.61, "step": 8080, "train_speed(iter/s)": 0.2013 }, { "acc": 0.74442396, "epoch": 0.20509893455098935, "grad_norm": 2.640625, "learning_rate": 9.924692891280139e-06, "loss": 1.02789955, "memory(GiB)": 368.61, "step": 8085, "train_speed(iter/s)": 0.201321 }, { "acc": 0.74406681, "epoch": 0.2052257737189244, "grad_norm": 2.078125, "learning_rate": 9.924511471129673e-06, "loss": 0.99239178, "memory(GiB)": 368.61, "step": 8090, "train_speed(iter/s)": 0.201339 }, { "acc": 0.73222857, "epoch": 0.20535261288685946, "grad_norm": 2.34375, "learning_rate": 9.924329834377206e-06, "loss": 1.04082012, "memory(GiB)": 368.61, "step": 8095, "train_speed(iter/s)": 0.201356 }, { "acc": 0.73985624, "epoch": 0.2054794520547945, "grad_norm": 2.34375, "learning_rate": 9.924147981030728e-06, "loss": 1.04897976, "memory(GiB)": 368.61, "step": 8100, "train_speed(iter/s)": 0.201347 }, { "acc": 0.74918242, "epoch": 0.20560629122272958, "grad_norm": 2.09375, "learning_rate": 9.923965911098235e-06, "loss": 1.02993183, "memory(GiB)": 368.61, "step": 8105, "train_speed(iter/s)": 0.201352 }, { "acc": 0.72024183, "epoch": 0.20573313039066463, "grad_norm": 2.90625, "learning_rate": 9.92378362458774e-06, "loss": 1.14436359, "memory(GiB)": 368.61, "step": 8110, "train_speed(iter/s)": 0.201374 }, { "acc": 0.74182281, "epoch": 0.2058599695585997, "grad_norm": 2.09375, "learning_rate": 9.923601121507256e-06, "loss": 1.05683851, "memory(GiB)": 368.61, "step": 8115, "train_speed(iter/s)": 0.20139 }, { "acc": 0.71999874, "epoch": 0.20598680872653474, "grad_norm": 2.109375, "learning_rate": 9.923418401864812e-06, "loss": 1.08299961, "memory(GiB)": 368.61, "step": 8120, "train_speed(iter/s)": 0.201402 }, { "acc": 0.73334994, "epoch": 0.20611364789446981, "grad_norm": 2.28125, "learning_rate": 9.923235465668447e-06, "loss": 1.02415218, "memory(GiB)": 368.61, "step": 8125, "train_speed(iter/s)": 0.201423 }, { "acc": 0.74312682, "epoch": 0.20624048706240486, "grad_norm": 2.390625, "learning_rate": 9.923052312926204e-06, "loss": 1.03524513, "memory(GiB)": 368.61, "step": 8130, "train_speed(iter/s)": 0.201434 }, { "acc": 0.73753223, "epoch": 0.20636732623033993, "grad_norm": 1.90625, "learning_rate": 9.922868943646142e-06, "loss": 1.05146675, "memory(GiB)": 368.61, "step": 8135, "train_speed(iter/s)": 0.20145 }, { "acc": 0.73781233, "epoch": 0.20649416539827498, "grad_norm": 2.3125, "learning_rate": 9.922685357836324e-06, "loss": 1.02964745, "memory(GiB)": 368.61, "step": 8140, "train_speed(iter/s)": 0.201467 }, { "acc": 0.73696222, "epoch": 0.20662100456621005, "grad_norm": 2.375, "learning_rate": 9.922501555504827e-06, "loss": 1.06589088, "memory(GiB)": 368.61, "step": 8145, "train_speed(iter/s)": 0.201488 }, { "acc": 0.73433371, "epoch": 0.2067478437341451, "grad_norm": 1.9609375, "learning_rate": 9.922317536659733e-06, "loss": 1.03891792, "memory(GiB)": 368.61, "step": 8150, "train_speed(iter/s)": 0.201501 }, { "acc": 0.74632082, "epoch": 0.20687468290208016, "grad_norm": 2.1875, "learning_rate": 9.922133301309136e-06, "loss": 1.03249226, "memory(GiB)": 368.61, "step": 8155, "train_speed(iter/s)": 0.201526 }, { "acc": 0.73562489, "epoch": 0.2070015220700152, "grad_norm": 2.90625, "learning_rate": 9.921948849461142e-06, "loss": 1.0912775, "memory(GiB)": 368.61, "step": 8160, "train_speed(iter/s)": 0.201549 }, { "acc": 0.75481691, "epoch": 0.20712836123795028, "grad_norm": 2.328125, "learning_rate": 9.921764181123864e-06, "loss": 0.9989872, "memory(GiB)": 368.61, "step": 8165, "train_speed(iter/s)": 0.201574 }, { "acc": 0.74803529, "epoch": 0.20725520040588533, "grad_norm": 2.71875, "learning_rate": 9.921579296305421e-06, "loss": 1.04732189, "memory(GiB)": 368.61, "step": 8170, "train_speed(iter/s)": 0.201573 }, { "acc": 0.72974381, "epoch": 0.2073820395738204, "grad_norm": 2.390625, "learning_rate": 9.921394195013949e-06, "loss": 1.06762753, "memory(GiB)": 368.61, "step": 8175, "train_speed(iter/s)": 0.201598 }, { "acc": 0.73811345, "epoch": 0.20750887874175544, "grad_norm": 2.0625, "learning_rate": 9.921208877257586e-06, "loss": 1.06379166, "memory(GiB)": 368.61, "step": 8180, "train_speed(iter/s)": 0.201604 }, { "acc": 0.73966422, "epoch": 0.20763571790969051, "grad_norm": 2.625, "learning_rate": 9.921023343044486e-06, "loss": 1.0396306, "memory(GiB)": 368.61, "step": 8185, "train_speed(iter/s)": 0.201607 }, { "acc": 0.75027466, "epoch": 0.20776255707762556, "grad_norm": 3.0625, "learning_rate": 9.92083759238281e-06, "loss": 1.00760307, "memory(GiB)": 368.61, "step": 8190, "train_speed(iter/s)": 0.201624 }, { "acc": 0.74426689, "epoch": 0.20788939624556063, "grad_norm": 2.0625, "learning_rate": 9.920651625280725e-06, "loss": 1.08980198, "memory(GiB)": 368.61, "step": 8195, "train_speed(iter/s)": 0.201646 }, { "acc": 0.73595362, "epoch": 0.20801623541349568, "grad_norm": 2.609375, "learning_rate": 9.920465441746412e-06, "loss": 1.06844597, "memory(GiB)": 368.61, "step": 8200, "train_speed(iter/s)": 0.201675 }, { "acc": 0.73519764, "epoch": 0.20814307458143075, "grad_norm": 1.9453125, "learning_rate": 9.920279041788062e-06, "loss": 1.07318478, "memory(GiB)": 368.61, "step": 8205, "train_speed(iter/s)": 0.201696 }, { "acc": 0.74677496, "epoch": 0.2082699137493658, "grad_norm": 2.25, "learning_rate": 9.920092425413871e-06, "loss": 1.05457802, "memory(GiB)": 368.61, "step": 8210, "train_speed(iter/s)": 0.201717 }, { "acc": 0.74161301, "epoch": 0.20839675291730086, "grad_norm": 2.3125, "learning_rate": 9.919905592632048e-06, "loss": 1.0359581, "memory(GiB)": 368.61, "step": 8215, "train_speed(iter/s)": 0.201724 }, { "acc": 0.73597555, "epoch": 0.2085235920852359, "grad_norm": 2.515625, "learning_rate": 9.919718543450813e-06, "loss": 1.02878056, "memory(GiB)": 368.61, "step": 8220, "train_speed(iter/s)": 0.201742 }, { "acc": 0.72935762, "epoch": 0.20865043125317098, "grad_norm": 2.1875, "learning_rate": 9.919531277878391e-06, "loss": 1.09727631, "memory(GiB)": 368.61, "step": 8225, "train_speed(iter/s)": 0.201759 }, { "acc": 0.74033313, "epoch": 0.20877727042110603, "grad_norm": 3.0625, "learning_rate": 9.91934379592302e-06, "loss": 0.97963858, "memory(GiB)": 368.61, "step": 8230, "train_speed(iter/s)": 0.201775 }, { "acc": 0.7514308, "epoch": 0.2089041095890411, "grad_norm": 1.828125, "learning_rate": 9.919156097592944e-06, "loss": 0.98603315, "memory(GiB)": 368.61, "step": 8235, "train_speed(iter/s)": 0.201795 }, { "acc": 0.74889402, "epoch": 0.20903094875697614, "grad_norm": 2.40625, "learning_rate": 9.91896818289642e-06, "loss": 1.01113205, "memory(GiB)": 368.61, "step": 8240, "train_speed(iter/s)": 0.201806 }, { "acc": 0.74344206, "epoch": 0.20915778792491121, "grad_norm": 2.015625, "learning_rate": 9.918780051841716e-06, "loss": 1.06806469, "memory(GiB)": 368.61, "step": 8245, "train_speed(iter/s)": 0.201825 }, { "acc": 0.73663387, "epoch": 0.20928462709284626, "grad_norm": 2.078125, "learning_rate": 9.918591704437103e-06, "loss": 1.03337803, "memory(GiB)": 368.61, "step": 8250, "train_speed(iter/s)": 0.20183 }, { "acc": 0.73502383, "epoch": 0.20941146626078133, "grad_norm": 2.28125, "learning_rate": 9.918403140690866e-06, "loss": 1.04705391, "memory(GiB)": 368.61, "step": 8255, "train_speed(iter/s)": 0.201846 }, { "acc": 0.7379982, "epoch": 0.20953830542871638, "grad_norm": 1.9921875, "learning_rate": 9.918214360611302e-06, "loss": 1.05102043, "memory(GiB)": 368.61, "step": 8260, "train_speed(iter/s)": 0.201865 }, { "acc": 0.73916512, "epoch": 0.20966514459665145, "grad_norm": 2.609375, "learning_rate": 9.918025364206712e-06, "loss": 1.03941326, "memory(GiB)": 368.61, "step": 8265, "train_speed(iter/s)": 0.201871 }, { "acc": 0.74346256, "epoch": 0.2097919837645865, "grad_norm": 2.0, "learning_rate": 9.917836151485407e-06, "loss": 1.04231224, "memory(GiB)": 368.61, "step": 8270, "train_speed(iter/s)": 0.201893 }, { "acc": 0.74564862, "epoch": 0.20991882293252156, "grad_norm": 1.9453125, "learning_rate": 9.917646722455713e-06, "loss": 1.06478691, "memory(GiB)": 368.61, "step": 8275, "train_speed(iter/s)": 0.201908 }, { "acc": 0.73621016, "epoch": 0.2100456621004566, "grad_norm": 2.3125, "learning_rate": 9.91745707712596e-06, "loss": 1.05525274, "memory(GiB)": 368.61, "step": 8280, "train_speed(iter/s)": 0.201924 }, { "acc": 0.73551998, "epoch": 0.21017250126839168, "grad_norm": 2.03125, "learning_rate": 9.91726721550449e-06, "loss": 1.06792336, "memory(GiB)": 368.61, "step": 8285, "train_speed(iter/s)": 0.201945 }, { "acc": 0.73734093, "epoch": 0.21029934043632673, "grad_norm": 1.7890625, "learning_rate": 9.917077137599653e-06, "loss": 1.01512146, "memory(GiB)": 368.61, "step": 8290, "train_speed(iter/s)": 0.201958 }, { "acc": 0.73411207, "epoch": 0.2104261796042618, "grad_norm": 2.046875, "learning_rate": 9.916886843419811e-06, "loss": 1.05682135, "memory(GiB)": 368.61, "step": 8295, "train_speed(iter/s)": 0.201979 }, { "acc": 0.74867873, "epoch": 0.21055301877219684, "grad_norm": 2.265625, "learning_rate": 9.916696332973334e-06, "loss": 1.07308931, "memory(GiB)": 368.61, "step": 8300, "train_speed(iter/s)": 0.201986 }, { "acc": 0.73583183, "epoch": 0.21067985794013191, "grad_norm": 2.265625, "learning_rate": 9.9165056062686e-06, "loss": 1.01456585, "memory(GiB)": 368.61, "step": 8305, "train_speed(iter/s)": 0.202006 }, { "acc": 0.73354964, "epoch": 0.21080669710806696, "grad_norm": 2.390625, "learning_rate": 9.916314663314e-06, "loss": 1.06153374, "memory(GiB)": 368.61, "step": 8310, "train_speed(iter/s)": 0.202027 }, { "acc": 0.74830856, "epoch": 0.21093353627600203, "grad_norm": 2.046875, "learning_rate": 9.91612350411793e-06, "loss": 1.0886591, "memory(GiB)": 368.61, "step": 8315, "train_speed(iter/s)": 0.202045 }, { "acc": 0.74608212, "epoch": 0.21106037544393708, "grad_norm": 3.046875, "learning_rate": 9.9159321286888e-06, "loss": 1.00647659, "memory(GiB)": 368.61, "step": 8320, "train_speed(iter/s)": 0.202058 }, { "acc": 0.73692284, "epoch": 0.21118721461187215, "grad_norm": 2.03125, "learning_rate": 9.915740537035026e-06, "loss": 1.08366756, "memory(GiB)": 368.61, "step": 8325, "train_speed(iter/s)": 0.202078 }, { "acc": 0.7370153, "epoch": 0.2113140537798072, "grad_norm": 2.421875, "learning_rate": 9.915548729165036e-06, "loss": 1.08827915, "memory(GiB)": 368.61, "step": 8330, "train_speed(iter/s)": 0.202098 }, { "acc": 0.7585906, "epoch": 0.21144089294774226, "grad_norm": 2.28125, "learning_rate": 9.915356705087269e-06, "loss": 0.99928198, "memory(GiB)": 368.61, "step": 8335, "train_speed(iter/s)": 0.202109 }, { "acc": 0.73407145, "epoch": 0.2115677321156773, "grad_norm": 2.40625, "learning_rate": 9.915164464810166e-06, "loss": 1.05487461, "memory(GiB)": 368.61, "step": 8340, "train_speed(iter/s)": 0.202121 }, { "acc": 0.75094194, "epoch": 0.21169457128361238, "grad_norm": 2.453125, "learning_rate": 9.914972008342186e-06, "loss": 1.03992825, "memory(GiB)": 368.61, "step": 8345, "train_speed(iter/s)": 0.202122 }, { "acc": 0.72991619, "epoch": 0.21182141045154743, "grad_norm": 2.3125, "learning_rate": 9.914779335691793e-06, "loss": 1.10572128, "memory(GiB)": 368.61, "step": 8350, "train_speed(iter/s)": 0.202138 }, { "acc": 0.72628155, "epoch": 0.2119482496194825, "grad_norm": 2.09375, "learning_rate": 9.914586446867463e-06, "loss": 1.11610861, "memory(GiB)": 368.61, "step": 8355, "train_speed(iter/s)": 0.202153 }, { "acc": 0.74378567, "epoch": 0.21207508878741754, "grad_norm": 2.171875, "learning_rate": 9.914393341877678e-06, "loss": 1.06996689, "memory(GiB)": 368.61, "step": 8360, "train_speed(iter/s)": 0.202173 }, { "acc": 0.73667231, "epoch": 0.21220192795535261, "grad_norm": 2.3125, "learning_rate": 9.914200020730932e-06, "loss": 1.05779915, "memory(GiB)": 368.61, "step": 8365, "train_speed(iter/s)": 0.202187 }, { "acc": 0.74043026, "epoch": 0.21232876712328766, "grad_norm": 2.1875, "learning_rate": 9.914006483435732e-06, "loss": 1.02092543, "memory(GiB)": 368.61, "step": 8370, "train_speed(iter/s)": 0.202211 }, { "acc": 0.73499789, "epoch": 0.21245560629122273, "grad_norm": 2.078125, "learning_rate": 9.913812730000585e-06, "loss": 1.07168655, "memory(GiB)": 368.61, "step": 8375, "train_speed(iter/s)": 0.202219 }, { "acc": 0.74636889, "epoch": 0.21258244545915778, "grad_norm": 2.703125, "learning_rate": 9.913618760434015e-06, "loss": 1.06523685, "memory(GiB)": 368.61, "step": 8380, "train_speed(iter/s)": 0.202245 }, { "acc": 0.74052153, "epoch": 0.21270928462709285, "grad_norm": 2.328125, "learning_rate": 9.913424574744555e-06, "loss": 0.99539061, "memory(GiB)": 368.61, "step": 8385, "train_speed(iter/s)": 0.202264 }, { "acc": 0.74525437, "epoch": 0.2128361237950279, "grad_norm": 2.359375, "learning_rate": 9.913230172940744e-06, "loss": 1.08226271, "memory(GiB)": 368.61, "step": 8390, "train_speed(iter/s)": 0.202274 }, { "acc": 0.71793556, "epoch": 0.21296296296296297, "grad_norm": 2.21875, "learning_rate": 9.913035555031136e-06, "loss": 1.08902283, "memory(GiB)": 368.61, "step": 8395, "train_speed(iter/s)": 0.202294 }, { "acc": 0.73909459, "epoch": 0.213089802130898, "grad_norm": 2.09375, "learning_rate": 9.912840721024288e-06, "loss": 1.00856133, "memory(GiB)": 368.61, "step": 8400, "train_speed(iter/s)": 0.20231 }, { "acc": 0.75783119, "epoch": 0.21321664129883308, "grad_norm": 2.046875, "learning_rate": 9.91264567092877e-06, "loss": 0.97847481, "memory(GiB)": 368.61, "step": 8405, "train_speed(iter/s)": 0.202331 }, { "acc": 0.72745094, "epoch": 0.21334348046676813, "grad_norm": 2.734375, "learning_rate": 9.912450404753164e-06, "loss": 1.11381416, "memory(GiB)": 368.61, "step": 8410, "train_speed(iter/s)": 0.202357 }, { "acc": 0.74416723, "epoch": 0.2134703196347032, "grad_norm": 2.015625, "learning_rate": 9.912254922506057e-06, "loss": 1.02761488, "memory(GiB)": 368.61, "step": 8415, "train_speed(iter/s)": 0.20238 }, { "acc": 0.73819828, "epoch": 0.21359715880263824, "grad_norm": 2.28125, "learning_rate": 9.912059224196044e-06, "loss": 1.08265457, "memory(GiB)": 368.61, "step": 8420, "train_speed(iter/s)": 0.202401 }, { "acc": 0.73985906, "epoch": 0.21372399797057332, "grad_norm": 2.125, "learning_rate": 9.911863309831738e-06, "loss": 1.03566208, "memory(GiB)": 368.61, "step": 8425, "train_speed(iter/s)": 0.202419 }, { "acc": 0.72742229, "epoch": 0.21385083713850836, "grad_norm": 2.046875, "learning_rate": 9.911667179421753e-06, "loss": 1.11393948, "memory(GiB)": 368.61, "step": 8430, "train_speed(iter/s)": 0.202443 }, { "acc": 0.75128756, "epoch": 0.21397767630644343, "grad_norm": 2.0, "learning_rate": 9.911470832974717e-06, "loss": 0.99111519, "memory(GiB)": 368.61, "step": 8435, "train_speed(iter/s)": 0.202459 }, { "acc": 0.73241048, "epoch": 0.21410451547437848, "grad_norm": 2.140625, "learning_rate": 9.911274270499265e-06, "loss": 1.02316599, "memory(GiB)": 368.61, "step": 8440, "train_speed(iter/s)": 0.202477 }, { "acc": 0.75303879, "epoch": 0.21423135464231355, "grad_norm": 2.3125, "learning_rate": 9.911077492004044e-06, "loss": 1.04246387, "memory(GiB)": 368.61, "step": 8445, "train_speed(iter/s)": 0.202486 }, { "acc": 0.73887348, "epoch": 0.2143581938102486, "grad_norm": 2.234375, "learning_rate": 9.910880497497707e-06, "loss": 1.01790466, "memory(GiB)": 368.61, "step": 8450, "train_speed(iter/s)": 0.202504 }, { "acc": 0.74190102, "epoch": 0.21448503297818367, "grad_norm": 2.140625, "learning_rate": 9.910683286988922e-06, "loss": 1.03071165, "memory(GiB)": 368.61, "step": 8455, "train_speed(iter/s)": 0.202529 }, { "acc": 0.73297462, "epoch": 0.2146118721461187, "grad_norm": 2.21875, "learning_rate": 9.910485860486361e-06, "loss": 1.06550217, "memory(GiB)": 368.61, "step": 8460, "train_speed(iter/s)": 0.202554 }, { "acc": 0.73277321, "epoch": 0.21473871131405378, "grad_norm": 2.296875, "learning_rate": 9.910288217998707e-06, "loss": 1.09637594, "memory(GiB)": 368.61, "step": 8465, "train_speed(iter/s)": 0.202574 }, { "acc": 0.73592591, "epoch": 0.21486555048198883, "grad_norm": 2.21875, "learning_rate": 9.910090359534654e-06, "loss": 1.08347893, "memory(GiB)": 368.61, "step": 8470, "train_speed(iter/s)": 0.202587 }, { "acc": 0.74685917, "epoch": 0.2149923896499239, "grad_norm": 2.421875, "learning_rate": 9.909892285102907e-06, "loss": 1.04987621, "memory(GiB)": 368.61, "step": 8475, "train_speed(iter/s)": 0.202612 }, { "acc": 0.74429674, "epoch": 0.21511922881785894, "grad_norm": 2.40625, "learning_rate": 9.909693994712174e-06, "loss": 1.05190182, "memory(GiB)": 368.61, "step": 8480, "train_speed(iter/s)": 0.202624 }, { "acc": 0.74956746, "epoch": 0.21524606798579402, "grad_norm": 2.21875, "learning_rate": 9.909495488371181e-06, "loss": 0.98744678, "memory(GiB)": 368.61, "step": 8485, "train_speed(iter/s)": 0.202628 }, { "acc": 0.74131775, "epoch": 0.21537290715372906, "grad_norm": 2.046875, "learning_rate": 9.909296766088657e-06, "loss": 1.00711727, "memory(GiB)": 368.61, "step": 8490, "train_speed(iter/s)": 0.202649 }, { "acc": 0.75189943, "epoch": 0.21549974632166413, "grad_norm": 2.671875, "learning_rate": 9.909097827873341e-06, "loss": 1.01069908, "memory(GiB)": 368.61, "step": 8495, "train_speed(iter/s)": 0.202662 }, { "acc": 0.75034933, "epoch": 0.21562658548959918, "grad_norm": 2.40625, "learning_rate": 9.908898673733986e-06, "loss": 1.06024084, "memory(GiB)": 368.61, "step": 8500, "train_speed(iter/s)": 0.202679 }, { "acc": 0.74671564, "epoch": 0.21575342465753425, "grad_norm": 2.21875, "learning_rate": 9.90869930367935e-06, "loss": 1.05303469, "memory(GiB)": 368.61, "step": 8505, "train_speed(iter/s)": 0.202696 }, { "acc": 0.7240551, "epoch": 0.2158802638254693, "grad_norm": 2.625, "learning_rate": 9.908499717718203e-06, "loss": 1.11497498, "memory(GiB)": 368.61, "step": 8510, "train_speed(iter/s)": 0.202718 }, { "acc": 0.74768171, "epoch": 0.21600710299340437, "grad_norm": 2.453125, "learning_rate": 9.908299915859325e-06, "loss": 1.06124992, "memory(GiB)": 368.61, "step": 8515, "train_speed(iter/s)": 0.20273 }, { "acc": 0.73898277, "epoch": 0.2161339421613394, "grad_norm": 2.078125, "learning_rate": 9.908099898111502e-06, "loss": 1.06458931, "memory(GiB)": 368.61, "step": 8520, "train_speed(iter/s)": 0.202745 }, { "acc": 0.73693361, "epoch": 0.21626078132927448, "grad_norm": 2.359375, "learning_rate": 9.907899664483533e-06, "loss": 1.04519148, "memory(GiB)": 368.61, "step": 8525, "train_speed(iter/s)": 0.202759 }, { "acc": 0.72827101, "epoch": 0.21638762049720953, "grad_norm": 1.9296875, "learning_rate": 9.907699214984223e-06, "loss": 1.07815723, "memory(GiB)": 368.61, "step": 8530, "train_speed(iter/s)": 0.202769 }, { "acc": 0.73575373, "epoch": 0.2165144596651446, "grad_norm": 2.296875, "learning_rate": 9.90749854962239e-06, "loss": 1.04765339, "memory(GiB)": 368.61, "step": 8535, "train_speed(iter/s)": 0.20278 }, { "acc": 0.74530268, "epoch": 0.21664129883307964, "grad_norm": 2.5625, "learning_rate": 9.907297668406863e-06, "loss": 1.04047985, "memory(GiB)": 368.61, "step": 8540, "train_speed(iter/s)": 0.202797 }, { "acc": 0.73395991, "epoch": 0.21676813800101472, "grad_norm": 2.09375, "learning_rate": 9.907096571346474e-06, "loss": 1.05633717, "memory(GiB)": 368.61, "step": 8545, "train_speed(iter/s)": 0.202809 }, { "acc": 0.74018335, "epoch": 0.21689497716894976, "grad_norm": 1.8984375, "learning_rate": 9.906895258450067e-06, "loss": 1.00518284, "memory(GiB)": 368.61, "step": 8550, "train_speed(iter/s)": 0.202828 }, { "acc": 0.74854898, "epoch": 0.21702181633688483, "grad_norm": 1.96875, "learning_rate": 9.9066937297265e-06, "loss": 1.00583067, "memory(GiB)": 368.61, "step": 8555, "train_speed(iter/s)": 0.202837 }, { "acc": 0.74458838, "epoch": 0.21714865550481988, "grad_norm": 2.125, "learning_rate": 9.906491985184637e-06, "loss": 1.01839962, "memory(GiB)": 368.61, "step": 8560, "train_speed(iter/s)": 0.202856 }, { "acc": 0.72273178, "epoch": 0.21727549467275495, "grad_norm": 2.578125, "learning_rate": 9.906290024833349e-06, "loss": 1.08166122, "memory(GiB)": 368.61, "step": 8565, "train_speed(iter/s)": 0.202865 }, { "acc": 0.7389185, "epoch": 0.21740233384069, "grad_norm": 2.171875, "learning_rate": 9.906087848681523e-06, "loss": 1.06195812, "memory(GiB)": 368.61, "step": 8570, "train_speed(iter/s)": 0.202889 }, { "acc": 0.73741741, "epoch": 0.21752917300862507, "grad_norm": 2.15625, "learning_rate": 9.905885456738046e-06, "loss": 1.07127056, "memory(GiB)": 368.61, "step": 8575, "train_speed(iter/s)": 0.2029 }, { "acc": 0.74048615, "epoch": 0.2176560121765601, "grad_norm": 2.375, "learning_rate": 9.905682849011826e-06, "loss": 1.05588942, "memory(GiB)": 368.61, "step": 8580, "train_speed(iter/s)": 0.202918 }, { "acc": 0.73618331, "epoch": 0.21778285134449518, "grad_norm": 2.328125, "learning_rate": 9.905480025511772e-06, "loss": 1.07909756, "memory(GiB)": 368.61, "step": 8585, "train_speed(iter/s)": 0.202938 }, { "acc": 0.73091941, "epoch": 0.21790969051243023, "grad_norm": 2.203125, "learning_rate": 9.905276986246804e-06, "loss": 1.0789053, "memory(GiB)": 368.61, "step": 8590, "train_speed(iter/s)": 0.202958 }, { "acc": 0.73872781, "epoch": 0.2180365296803653, "grad_norm": 2.0625, "learning_rate": 9.905073731225854e-06, "loss": 1.00294218, "memory(GiB)": 368.61, "step": 8595, "train_speed(iter/s)": 0.202978 }, { "acc": 0.72451658, "epoch": 0.21816336884830034, "grad_norm": 2.484375, "learning_rate": 9.904870260457861e-06, "loss": 1.06755228, "memory(GiB)": 368.61, "step": 8600, "train_speed(iter/s)": 0.202997 }, { "acc": 0.73286371, "epoch": 0.21829020801623542, "grad_norm": 2.703125, "learning_rate": 9.904666573951777e-06, "loss": 1.05181599, "memory(GiB)": 368.61, "step": 8605, "train_speed(iter/s)": 0.203013 }, { "acc": 0.72618432, "epoch": 0.21841704718417046, "grad_norm": 2.453125, "learning_rate": 9.904462671716559e-06, "loss": 1.10476942, "memory(GiB)": 368.61, "step": 8610, "train_speed(iter/s)": 0.203022 }, { "acc": 0.72940421, "epoch": 0.21854388635210553, "grad_norm": 2.21875, "learning_rate": 9.904258553761175e-06, "loss": 1.13473864, "memory(GiB)": 368.61, "step": 8615, "train_speed(iter/s)": 0.203039 }, { "acc": 0.73809681, "epoch": 0.21867072552004058, "grad_norm": 1.9609375, "learning_rate": 9.904054220094603e-06, "loss": 1.06272058, "memory(GiB)": 368.61, "step": 8620, "train_speed(iter/s)": 0.203057 }, { "acc": 0.73890882, "epoch": 0.21879756468797565, "grad_norm": 2.296875, "learning_rate": 9.903849670725833e-06, "loss": 1.05454617, "memory(GiB)": 368.61, "step": 8625, "train_speed(iter/s)": 0.203074 }, { "acc": 0.73794289, "epoch": 0.2189244038559107, "grad_norm": 2.25, "learning_rate": 9.903644905663861e-06, "loss": 1.02655125, "memory(GiB)": 368.61, "step": 8630, "train_speed(iter/s)": 0.203097 }, { "acc": 0.72909513, "epoch": 0.21905124302384577, "grad_norm": 1.96875, "learning_rate": 9.90343992491769e-06, "loss": 1.03842649, "memory(GiB)": 368.61, "step": 8635, "train_speed(iter/s)": 0.203095 }, { "acc": 0.7354682, "epoch": 0.2191780821917808, "grad_norm": 2.015625, "learning_rate": 9.903234728496341e-06, "loss": 1.05515881, "memory(GiB)": 368.61, "step": 8640, "train_speed(iter/s)": 0.203102 }, { "acc": 0.72598166, "epoch": 0.21930492135971588, "grad_norm": 2.46875, "learning_rate": 9.903029316408838e-06, "loss": 1.12901611, "memory(GiB)": 368.61, "step": 8645, "train_speed(iter/s)": 0.203124 }, { "acc": 0.75196099, "epoch": 0.21943176052765093, "grad_norm": 2.03125, "learning_rate": 9.902823688664214e-06, "loss": 0.99303684, "memory(GiB)": 368.61, "step": 8650, "train_speed(iter/s)": 0.203142 }, { "acc": 0.74546757, "epoch": 0.219558599695586, "grad_norm": 2.09375, "learning_rate": 9.902617845271514e-06, "loss": 0.99964962, "memory(GiB)": 368.61, "step": 8655, "train_speed(iter/s)": 0.203157 }, { "acc": 0.74787683, "epoch": 0.21968543886352104, "grad_norm": 1.9453125, "learning_rate": 9.902411786239794e-06, "loss": 1.008535, "memory(GiB)": 368.61, "step": 8660, "train_speed(iter/s)": 0.203177 }, { "acc": 0.7418911, "epoch": 0.21981227803145612, "grad_norm": 1.9921875, "learning_rate": 9.902205511578114e-06, "loss": 1.03545723, "memory(GiB)": 368.61, "step": 8665, "train_speed(iter/s)": 0.203193 }, { "acc": 0.7303565, "epoch": 0.21993911719939116, "grad_norm": 2.390625, "learning_rate": 9.90199902129555e-06, "loss": 1.08395882, "memory(GiB)": 368.61, "step": 8670, "train_speed(iter/s)": 0.203216 }, { "acc": 0.73078232, "epoch": 0.22006595636732623, "grad_norm": 2.390625, "learning_rate": 9.901792315401184e-06, "loss": 1.06080322, "memory(GiB)": 368.61, "step": 8675, "train_speed(iter/s)": 0.203231 }, { "acc": 0.74617434, "epoch": 0.22019279553526128, "grad_norm": 1.9765625, "learning_rate": 9.901585393904104e-06, "loss": 0.97589741, "memory(GiB)": 368.61, "step": 8680, "train_speed(iter/s)": 0.203238 }, { "acc": 0.73268399, "epoch": 0.22031963470319635, "grad_norm": 2.109375, "learning_rate": 9.901378256813418e-06, "loss": 1.10088043, "memory(GiB)": 368.61, "step": 8685, "train_speed(iter/s)": 0.203257 }, { "acc": 0.75106368, "epoch": 0.2204464738711314, "grad_norm": 2.296875, "learning_rate": 9.901170904138232e-06, "loss": 1.01509743, "memory(GiB)": 368.61, "step": 8690, "train_speed(iter/s)": 0.203268 }, { "acc": 0.72555342, "epoch": 0.22057331303906647, "grad_norm": 2.03125, "learning_rate": 9.900963335887667e-06, "loss": 1.12704945, "memory(GiB)": 368.61, "step": 8695, "train_speed(iter/s)": 0.203286 }, { "acc": 0.74256177, "epoch": 0.2207001522070015, "grad_norm": 2.328125, "learning_rate": 9.900755552070852e-06, "loss": 1.06479864, "memory(GiB)": 368.61, "step": 8700, "train_speed(iter/s)": 0.2033 }, { "acc": 0.75582228, "epoch": 0.22082699137493658, "grad_norm": 2.234375, "learning_rate": 9.900547552696931e-06, "loss": 0.99108963, "memory(GiB)": 368.61, "step": 8705, "train_speed(iter/s)": 0.203312 }, { "acc": 0.74261818, "epoch": 0.22095383054287163, "grad_norm": 2.015625, "learning_rate": 9.900339337775046e-06, "loss": 1.08371229, "memory(GiB)": 368.61, "step": 8710, "train_speed(iter/s)": 0.20333 }, { "acc": 0.74018297, "epoch": 0.2210806697108067, "grad_norm": 2.390625, "learning_rate": 9.90013090731436e-06, "loss": 1.09160004, "memory(GiB)": 368.61, "step": 8715, "train_speed(iter/s)": 0.203346 }, { "acc": 0.72956724, "epoch": 0.22120750887874174, "grad_norm": 2.53125, "learning_rate": 9.89992226132404e-06, "loss": 1.11988049, "memory(GiB)": 368.61, "step": 8720, "train_speed(iter/s)": 0.203362 }, { "acc": 0.74091654, "epoch": 0.22133434804667682, "grad_norm": 1.9296875, "learning_rate": 9.899713399813261e-06, "loss": 0.99686394, "memory(GiB)": 368.61, "step": 8725, "train_speed(iter/s)": 0.203357 }, { "acc": 0.74613295, "epoch": 0.22146118721461186, "grad_norm": 2.375, "learning_rate": 9.899504322791212e-06, "loss": 1.04888744, "memory(GiB)": 368.61, "step": 8730, "train_speed(iter/s)": 0.203376 }, { "acc": 0.73434296, "epoch": 0.22158802638254693, "grad_norm": 1.984375, "learning_rate": 9.899295030267086e-06, "loss": 1.10872612, "memory(GiB)": 368.61, "step": 8735, "train_speed(iter/s)": 0.203387 }, { "acc": 0.73445449, "epoch": 0.22171486555048198, "grad_norm": 2.125, "learning_rate": 9.899085522250094e-06, "loss": 1.04331341, "memory(GiB)": 368.61, "step": 8740, "train_speed(iter/s)": 0.203403 }, { "acc": 0.74951067, "epoch": 0.22184170471841705, "grad_norm": 2.03125, "learning_rate": 9.898875798749446e-06, "loss": 1.03704548, "memory(GiB)": 368.61, "step": 8745, "train_speed(iter/s)": 0.203422 }, { "acc": 0.7338325, "epoch": 0.2219685438863521, "grad_norm": 2.421875, "learning_rate": 9.898665859774367e-06, "loss": 1.08752842, "memory(GiB)": 368.61, "step": 8750, "train_speed(iter/s)": 0.203442 }, { "acc": 0.74232359, "epoch": 0.22209538305428717, "grad_norm": 2.65625, "learning_rate": 9.898455705334095e-06, "loss": 1.08771296, "memory(GiB)": 368.61, "step": 8755, "train_speed(iter/s)": 0.203463 }, { "acc": 0.73680897, "epoch": 0.2222222222222222, "grad_norm": 1.9296875, "learning_rate": 9.89824533543787e-06, "loss": 1.07224598, "memory(GiB)": 368.61, "step": 8760, "train_speed(iter/s)": 0.203472 }, { "acc": 0.72217493, "epoch": 0.22234906139015728, "grad_norm": 2.171875, "learning_rate": 9.898034750094946e-06, "loss": 1.07581139, "memory(GiB)": 368.61, "step": 8765, "train_speed(iter/s)": 0.203488 }, { "acc": 0.73591409, "epoch": 0.22247590055809233, "grad_norm": 2.390625, "learning_rate": 9.897823949314586e-06, "loss": 1.10489292, "memory(GiB)": 368.61, "step": 8770, "train_speed(iter/s)": 0.203509 }, { "acc": 0.73495255, "epoch": 0.2226027397260274, "grad_norm": 2.375, "learning_rate": 9.897612933106061e-06, "loss": 1.11963854, "memory(GiB)": 368.61, "step": 8775, "train_speed(iter/s)": 0.203531 }, { "acc": 0.74199009, "epoch": 0.22272957889396244, "grad_norm": 2.0625, "learning_rate": 9.897401701478654e-06, "loss": 1.03986807, "memory(GiB)": 368.61, "step": 8780, "train_speed(iter/s)": 0.203543 }, { "acc": 0.74095383, "epoch": 0.22285641806189752, "grad_norm": 2.59375, "learning_rate": 9.897190254441653e-06, "loss": 1.06943455, "memory(GiB)": 368.61, "step": 8785, "train_speed(iter/s)": 0.203548 }, { "acc": 0.72721214, "epoch": 0.22298325722983256, "grad_norm": 2.0, "learning_rate": 9.896978592004363e-06, "loss": 1.07385979, "memory(GiB)": 368.61, "step": 8790, "train_speed(iter/s)": 0.203564 }, { "acc": 0.73124347, "epoch": 0.22311009639776763, "grad_norm": 2.390625, "learning_rate": 9.896766714176089e-06, "loss": 1.0744545, "memory(GiB)": 368.61, "step": 8795, "train_speed(iter/s)": 0.203584 }, { "acc": 0.73581743, "epoch": 0.22323693556570268, "grad_norm": 2.5, "learning_rate": 9.896554620966152e-06, "loss": 1.07411442, "memory(GiB)": 368.61, "step": 8800, "train_speed(iter/s)": 0.203604 }, { "acc": 0.74593754, "epoch": 0.22336377473363775, "grad_norm": 2.46875, "learning_rate": 9.896342312383883e-06, "loss": 1.079284, "memory(GiB)": 368.61, "step": 8805, "train_speed(iter/s)": 0.203612 }, { "acc": 0.74785604, "epoch": 0.2234906139015728, "grad_norm": 2.8125, "learning_rate": 9.896129788438617e-06, "loss": 1.05355015, "memory(GiB)": 368.61, "step": 8810, "train_speed(iter/s)": 0.20363 }, { "acc": 0.7471077, "epoch": 0.22361745306950787, "grad_norm": 2.03125, "learning_rate": 9.895917049139704e-06, "loss": 1.04013157, "memory(GiB)": 368.61, "step": 8815, "train_speed(iter/s)": 0.203649 }, { "acc": 0.72719707, "epoch": 0.2237442922374429, "grad_norm": 2.96875, "learning_rate": 9.895704094496502e-06, "loss": 1.0877944, "memory(GiB)": 368.61, "step": 8820, "train_speed(iter/s)": 0.203662 }, { "acc": 0.7509088, "epoch": 0.22387113140537798, "grad_norm": 1.921875, "learning_rate": 9.895490924518372e-06, "loss": 1.00894642, "memory(GiB)": 368.61, "step": 8825, "train_speed(iter/s)": 0.203674 }, { "acc": 0.74731331, "epoch": 0.22399797057331303, "grad_norm": 2.078125, "learning_rate": 9.895277539214698e-06, "loss": 0.96919603, "memory(GiB)": 368.61, "step": 8830, "train_speed(iter/s)": 0.203678 }, { "acc": 0.72975073, "epoch": 0.2241248097412481, "grad_norm": 2.25, "learning_rate": 9.895063938594859e-06, "loss": 1.10814304, "memory(GiB)": 368.61, "step": 8835, "train_speed(iter/s)": 0.203692 }, { "acc": 0.73009682, "epoch": 0.22425164890918314, "grad_norm": 1.7109375, "learning_rate": 9.894850122668256e-06, "loss": 1.07448645, "memory(GiB)": 368.61, "step": 8840, "train_speed(iter/s)": 0.203702 }, { "acc": 0.74364605, "epoch": 0.22437848807711822, "grad_norm": 1.9375, "learning_rate": 9.89463609144429e-06, "loss": 1.02741175, "memory(GiB)": 368.61, "step": 8845, "train_speed(iter/s)": 0.203688 }, { "acc": 0.73886576, "epoch": 0.22450532724505326, "grad_norm": 2.390625, "learning_rate": 9.894421844932375e-06, "loss": 1.05654926, "memory(GiB)": 368.61, "step": 8850, "train_speed(iter/s)": 0.203695 }, { "acc": 0.76396122, "epoch": 0.22463216641298833, "grad_norm": 2.4375, "learning_rate": 9.894207383141937e-06, "loss": 0.97060051, "memory(GiB)": 368.61, "step": 8855, "train_speed(iter/s)": 0.203713 }, { "acc": 0.73527422, "epoch": 0.22475900558092338, "grad_norm": 2.46875, "learning_rate": 9.893992706082405e-06, "loss": 1.04055119, "memory(GiB)": 368.61, "step": 8860, "train_speed(iter/s)": 0.203726 }, { "acc": 0.74310923, "epoch": 0.22488584474885845, "grad_norm": 2.046875, "learning_rate": 9.893777813763223e-06, "loss": 1.07943192, "memory(GiB)": 368.61, "step": 8865, "train_speed(iter/s)": 0.203741 }, { "acc": 0.73656988, "epoch": 0.2250126839167935, "grad_norm": 3.09375, "learning_rate": 9.893562706193847e-06, "loss": 1.09236889, "memory(GiB)": 368.61, "step": 8870, "train_speed(iter/s)": 0.203756 }, { "acc": 0.74780178, "epoch": 0.22513952308472857, "grad_norm": 1.953125, "learning_rate": 9.893347383383732e-06, "loss": 1.02933674, "memory(GiB)": 368.61, "step": 8875, "train_speed(iter/s)": 0.203778 }, { "acc": 0.75668583, "epoch": 0.2252663622526636, "grad_norm": 2.28125, "learning_rate": 9.893131845342352e-06, "loss": 1.03931446, "memory(GiB)": 368.61, "step": 8880, "train_speed(iter/s)": 0.203799 }, { "acc": 0.7451551, "epoch": 0.22539320142059868, "grad_norm": 2.046875, "learning_rate": 9.892916092079188e-06, "loss": 1.06591873, "memory(GiB)": 368.61, "step": 8885, "train_speed(iter/s)": 0.203807 }, { "acc": 0.74615498, "epoch": 0.22552004058853373, "grad_norm": 2.078125, "learning_rate": 9.89270012360373e-06, "loss": 1.0622961, "memory(GiB)": 368.61, "step": 8890, "train_speed(iter/s)": 0.203821 }, { "acc": 0.74700537, "epoch": 0.2256468797564688, "grad_norm": 2.09375, "learning_rate": 9.892483939925476e-06, "loss": 1.0357007, "memory(GiB)": 368.61, "step": 8895, "train_speed(iter/s)": 0.203835 }, { "acc": 0.73438082, "epoch": 0.22577371892440384, "grad_norm": 2.5625, "learning_rate": 9.892267541053933e-06, "loss": 1.04228592, "memory(GiB)": 368.61, "step": 8900, "train_speed(iter/s)": 0.203844 }, { "acc": 0.7456521, "epoch": 0.22590055809233892, "grad_norm": 3.171875, "learning_rate": 9.892050926998624e-06, "loss": 1.02470846, "memory(GiB)": 368.61, "step": 8905, "train_speed(iter/s)": 0.203845 }, { "acc": 0.734093, "epoch": 0.22602739726027396, "grad_norm": 2.078125, "learning_rate": 9.891834097769071e-06, "loss": 1.06820564, "memory(GiB)": 368.61, "step": 8910, "train_speed(iter/s)": 0.203862 }, { "acc": 0.72590952, "epoch": 0.22615423642820903, "grad_norm": 2.03125, "learning_rate": 9.891617053374816e-06, "loss": 1.10627747, "memory(GiB)": 368.61, "step": 8915, "train_speed(iter/s)": 0.20387 }, { "acc": 0.73214426, "epoch": 0.22628107559614408, "grad_norm": 2.375, "learning_rate": 9.891399793825403e-06, "loss": 1.05891991, "memory(GiB)": 368.61, "step": 8920, "train_speed(iter/s)": 0.203889 }, { "acc": 0.73954315, "epoch": 0.22640791476407915, "grad_norm": 2.09375, "learning_rate": 9.891182319130387e-06, "loss": 1.12286205, "memory(GiB)": 368.61, "step": 8925, "train_speed(iter/s)": 0.20391 }, { "acc": 0.74182496, "epoch": 0.2265347539320142, "grad_norm": 2.34375, "learning_rate": 9.890964629299336e-06, "loss": 1.09314337, "memory(GiB)": 368.61, "step": 8930, "train_speed(iter/s)": 0.203921 }, { "acc": 0.7369503, "epoch": 0.22666159309994927, "grad_norm": 2.046875, "learning_rate": 9.890746724341825e-06, "loss": 1.08978672, "memory(GiB)": 368.61, "step": 8935, "train_speed(iter/s)": 0.203933 }, { "acc": 0.73616467, "epoch": 0.2267884322678843, "grad_norm": 2.15625, "learning_rate": 9.890528604267436e-06, "loss": 1.00603514, "memory(GiB)": 368.61, "step": 8940, "train_speed(iter/s)": 0.203947 }, { "acc": 0.74353704, "epoch": 0.22691527143581938, "grad_norm": 2.078125, "learning_rate": 9.890310269085765e-06, "loss": 1.03549652, "memory(GiB)": 368.61, "step": 8945, "train_speed(iter/s)": 0.203966 }, { "acc": 0.71808834, "epoch": 0.22704211060375443, "grad_norm": 2.078125, "learning_rate": 9.890091718806414e-06, "loss": 1.11906109, "memory(GiB)": 368.61, "step": 8950, "train_speed(iter/s)": 0.203978 }, { "acc": 0.74304471, "epoch": 0.2271689497716895, "grad_norm": 2.390625, "learning_rate": 9.889872953438996e-06, "loss": 1.06192875, "memory(GiB)": 368.61, "step": 8955, "train_speed(iter/s)": 0.203993 }, { "acc": 0.7387156, "epoch": 0.22729578893962454, "grad_norm": 2.546875, "learning_rate": 9.889653972993136e-06, "loss": 1.0544548, "memory(GiB)": 368.61, "step": 8960, "train_speed(iter/s)": 0.204015 }, { "acc": 0.74035196, "epoch": 0.22742262810755962, "grad_norm": 2.046875, "learning_rate": 9.889434777478464e-06, "loss": 1.04770966, "memory(GiB)": 368.61, "step": 8965, "train_speed(iter/s)": 0.204035 }, { "acc": 0.73981557, "epoch": 0.22754946727549466, "grad_norm": 2.015625, "learning_rate": 9.88921536690462e-06, "loss": 1.06771765, "memory(GiB)": 368.61, "step": 8970, "train_speed(iter/s)": 0.204044 }, { "acc": 0.72850413, "epoch": 0.22767630644342973, "grad_norm": 2.34375, "learning_rate": 9.888995741281252e-06, "loss": 1.11124916, "memory(GiB)": 368.61, "step": 8975, "train_speed(iter/s)": 0.204057 }, { "acc": 0.74679656, "epoch": 0.22780314561136478, "grad_norm": 2.34375, "learning_rate": 9.888775900618028e-06, "loss": 0.99466972, "memory(GiB)": 368.61, "step": 8980, "train_speed(iter/s)": 0.204079 }, { "acc": 0.73824701, "epoch": 0.22792998477929985, "grad_norm": 2.59375, "learning_rate": 9.88855584492461e-06, "loss": 1.08527451, "memory(GiB)": 368.61, "step": 8985, "train_speed(iter/s)": 0.204096 }, { "acc": 0.72893753, "epoch": 0.2280568239472349, "grad_norm": 2.140625, "learning_rate": 9.888335574210681e-06, "loss": 1.07308016, "memory(GiB)": 368.61, "step": 8990, "train_speed(iter/s)": 0.204104 }, { "acc": 0.7406383, "epoch": 0.22818366311516997, "grad_norm": 2.25, "learning_rate": 9.888115088485931e-06, "loss": 0.98559217, "memory(GiB)": 368.61, "step": 8995, "train_speed(iter/s)": 0.204119 }, { "acc": 0.73608208, "epoch": 0.228310502283105, "grad_norm": 2.234375, "learning_rate": 9.887894387760053e-06, "loss": 1.07405615, "memory(GiB)": 368.61, "step": 9000, "train_speed(iter/s)": 0.204135 }, { "epoch": 0.228310502283105, "eval_acc": 0.7287365839250659, "eval_loss": 1.0184811353683472, "eval_runtime": 385.033, "eval_samples_per_second": 16.544, "eval_steps_per_second": 8.272, "step": 9000 }, { "acc": 0.73696532, "epoch": 0.22843734145104008, "grad_norm": 2.71875, "learning_rate": 9.887673472042757e-06, "loss": 1.0669589, "memory(GiB)": 368.61, "step": 9005, "train_speed(iter/s)": 0.200894 }, { "acc": 0.74301491, "epoch": 0.22856418061897513, "grad_norm": 2.140625, "learning_rate": 9.88745234134376e-06, "loss": 1.07069464, "memory(GiB)": 368.61, "step": 9010, "train_speed(iter/s)": 0.200906 }, { "acc": 0.73164368, "epoch": 0.2286910197869102, "grad_norm": 2.125, "learning_rate": 9.887230995672789e-06, "loss": 1.07874813, "memory(GiB)": 368.61, "step": 9015, "train_speed(iter/s)": 0.200915 }, { "acc": 0.7426569, "epoch": 0.22881785895484524, "grad_norm": 2.203125, "learning_rate": 9.887009435039578e-06, "loss": 0.99642124, "memory(GiB)": 368.61, "step": 9020, "train_speed(iter/s)": 0.200926 }, { "acc": 0.73803854, "epoch": 0.22894469812278032, "grad_norm": 2.359375, "learning_rate": 9.886787659453873e-06, "loss": 1.01123409, "memory(GiB)": 368.61, "step": 9025, "train_speed(iter/s)": 0.200931 }, { "acc": 0.73247681, "epoch": 0.22907153729071536, "grad_norm": 2.09375, "learning_rate": 9.886565668925429e-06, "loss": 1.05096045, "memory(GiB)": 368.61, "step": 9030, "train_speed(iter/s)": 0.200945 }, { "acc": 0.74917068, "epoch": 0.22919837645865043, "grad_norm": 2.5, "learning_rate": 9.88634346346401e-06, "loss": 1.04894428, "memory(GiB)": 368.61, "step": 9035, "train_speed(iter/s)": 0.200961 }, { "acc": 0.74220252, "epoch": 0.22932521562658548, "grad_norm": 2.109375, "learning_rate": 9.88612104307939e-06, "loss": 1.04631338, "memory(GiB)": 368.61, "step": 9040, "train_speed(iter/s)": 0.200977 }, { "acc": 0.74531732, "epoch": 0.22945205479452055, "grad_norm": 2.09375, "learning_rate": 9.885898407781352e-06, "loss": 1.02478571, "memory(GiB)": 368.61, "step": 9045, "train_speed(iter/s)": 0.200992 }, { "acc": 0.74290314, "epoch": 0.2295788939624556, "grad_norm": 2.3125, "learning_rate": 9.885675557579686e-06, "loss": 1.11040325, "memory(GiB)": 368.61, "step": 9050, "train_speed(iter/s)": 0.20101 }, { "acc": 0.74601936, "epoch": 0.22970573313039067, "grad_norm": 2.265625, "learning_rate": 9.885452492484198e-06, "loss": 0.98953152, "memory(GiB)": 368.61, "step": 9055, "train_speed(iter/s)": 0.201035 }, { "acc": 0.73779192, "epoch": 0.2298325722983257, "grad_norm": 2.375, "learning_rate": 9.885229212504697e-06, "loss": 1.04386358, "memory(GiB)": 368.61, "step": 9060, "train_speed(iter/s)": 0.201052 }, { "acc": 0.74109797, "epoch": 0.22995941146626078, "grad_norm": 1.9140625, "learning_rate": 9.885005717651002e-06, "loss": 1.02159977, "memory(GiB)": 368.61, "step": 9065, "train_speed(iter/s)": 0.201061 }, { "acc": 0.73861179, "epoch": 0.23008625063419583, "grad_norm": 2.578125, "learning_rate": 9.88478200793295e-06, "loss": 1.00006695, "memory(GiB)": 368.61, "step": 9070, "train_speed(iter/s)": 0.20106 }, { "acc": 0.74329538, "epoch": 0.2302130898021309, "grad_norm": 2.125, "learning_rate": 9.884558083360372e-06, "loss": 1.01710434, "memory(GiB)": 368.61, "step": 9075, "train_speed(iter/s)": 0.201079 }, { "acc": 0.75780168, "epoch": 0.23033992897006594, "grad_norm": 2.046875, "learning_rate": 9.884333943943123e-06, "loss": 0.97641191, "memory(GiB)": 368.61, "step": 9080, "train_speed(iter/s)": 0.201092 }, { "acc": 0.7397615, "epoch": 0.23046676813800102, "grad_norm": 2.171875, "learning_rate": 9.884109589691062e-06, "loss": 1.05865898, "memory(GiB)": 368.61, "step": 9085, "train_speed(iter/s)": 0.201118 }, { "acc": 0.73952751, "epoch": 0.23059360730593606, "grad_norm": 2.1875, "learning_rate": 9.883885020614052e-06, "loss": 1.10702019, "memory(GiB)": 368.61, "step": 9090, "train_speed(iter/s)": 0.201142 }, { "acc": 0.75309143, "epoch": 0.23072044647387113, "grad_norm": 2.0, "learning_rate": 9.883660236721977e-06, "loss": 0.96470966, "memory(GiB)": 368.61, "step": 9095, "train_speed(iter/s)": 0.201159 }, { "acc": 0.73411264, "epoch": 0.23084728564180618, "grad_norm": 2.140625, "learning_rate": 9.883435238024718e-06, "loss": 1.05959339, "memory(GiB)": 368.61, "step": 9100, "train_speed(iter/s)": 0.201179 }, { "acc": 0.75672441, "epoch": 0.23097412480974125, "grad_norm": 2.078125, "learning_rate": 9.883210024532176e-06, "loss": 1.00438728, "memory(GiB)": 368.61, "step": 9105, "train_speed(iter/s)": 0.201192 }, { "acc": 0.74725046, "epoch": 0.2311009639776763, "grad_norm": 2.296875, "learning_rate": 9.882984596254255e-06, "loss": 1.06241055, "memory(GiB)": 368.61, "step": 9110, "train_speed(iter/s)": 0.201212 }, { "acc": 0.74309015, "epoch": 0.23122780314561137, "grad_norm": 2.140625, "learning_rate": 9.88275895320087e-06, "loss": 1.07438393, "memory(GiB)": 368.61, "step": 9115, "train_speed(iter/s)": 0.201219 }, { "acc": 0.740905, "epoch": 0.2313546423135464, "grad_norm": 2.25, "learning_rate": 9.882533095381947e-06, "loss": 1.0207962, "memory(GiB)": 368.61, "step": 9120, "train_speed(iter/s)": 0.201232 }, { "acc": 0.72589598, "epoch": 0.23148148148148148, "grad_norm": 2.90625, "learning_rate": 9.882307022807419e-06, "loss": 1.02092514, "memory(GiB)": 368.61, "step": 9125, "train_speed(iter/s)": 0.201254 }, { "acc": 0.7354064, "epoch": 0.23160832064941653, "grad_norm": 1.9140625, "learning_rate": 9.88208073548723e-06, "loss": 1.12894211, "memory(GiB)": 368.61, "step": 9130, "train_speed(iter/s)": 0.201273 }, { "acc": 0.74509382, "epoch": 0.2317351598173516, "grad_norm": 1.9921875, "learning_rate": 9.881854233431333e-06, "loss": 1.04537296, "memory(GiB)": 368.61, "step": 9135, "train_speed(iter/s)": 0.201288 }, { "acc": 0.74126053, "epoch": 0.23186199898528664, "grad_norm": 1.9921875, "learning_rate": 9.881627516649692e-06, "loss": 1.02460918, "memory(GiB)": 368.61, "step": 9140, "train_speed(iter/s)": 0.201307 }, { "acc": 0.72577467, "epoch": 0.23198883815322172, "grad_norm": 2.140625, "learning_rate": 9.881400585152278e-06, "loss": 1.13600798, "memory(GiB)": 368.61, "step": 9145, "train_speed(iter/s)": 0.201328 }, { "acc": 0.74328604, "epoch": 0.23211567732115676, "grad_norm": 2.25, "learning_rate": 9.881173438949072e-06, "loss": 1.05114517, "memory(GiB)": 368.61, "step": 9150, "train_speed(iter/s)": 0.201348 }, { "acc": 0.7502552, "epoch": 0.23224251648909183, "grad_norm": 1.8671875, "learning_rate": 9.880946078050064e-06, "loss": 0.98201408, "memory(GiB)": 368.61, "step": 9155, "train_speed(iter/s)": 0.201368 }, { "acc": 0.72345314, "epoch": 0.23236935565702688, "grad_norm": 2.015625, "learning_rate": 9.880718502465258e-06, "loss": 1.12966518, "memory(GiB)": 368.61, "step": 9160, "train_speed(iter/s)": 0.201381 }, { "acc": 0.73115664, "epoch": 0.23249619482496195, "grad_norm": 2.046875, "learning_rate": 9.88049071220466e-06, "loss": 1.06607265, "memory(GiB)": 368.61, "step": 9165, "train_speed(iter/s)": 0.201396 }, { "acc": 0.73495278, "epoch": 0.232623033992897, "grad_norm": 2.0625, "learning_rate": 9.88026270727829e-06, "loss": 1.04712238, "memory(GiB)": 368.61, "step": 9170, "train_speed(iter/s)": 0.201411 }, { "acc": 0.72841539, "epoch": 0.23274987316083207, "grad_norm": 2.078125, "learning_rate": 9.880034487696179e-06, "loss": 1.06150856, "memory(GiB)": 368.61, "step": 9175, "train_speed(iter/s)": 0.20142 }, { "acc": 0.73524899, "epoch": 0.2328767123287671, "grad_norm": 2.84375, "learning_rate": 9.879806053468361e-06, "loss": 1.05141525, "memory(GiB)": 368.61, "step": 9180, "train_speed(iter/s)": 0.201433 }, { "acc": 0.74179869, "epoch": 0.23300355149670218, "grad_norm": 1.9453125, "learning_rate": 9.879577404604889e-06, "loss": 1.0633276, "memory(GiB)": 368.61, "step": 9185, "train_speed(iter/s)": 0.201451 }, { "acc": 0.72927294, "epoch": 0.23313039066463723, "grad_norm": 2.171875, "learning_rate": 9.879348541115816e-06, "loss": 1.06274643, "memory(GiB)": 368.61, "step": 9190, "train_speed(iter/s)": 0.201471 }, { "acc": 0.73282976, "epoch": 0.2332572298325723, "grad_norm": 1.7265625, "learning_rate": 9.879119463011208e-06, "loss": 1.04638214, "memory(GiB)": 368.61, "step": 9195, "train_speed(iter/s)": 0.201492 }, { "acc": 0.75106392, "epoch": 0.23338406900050734, "grad_norm": 2.1875, "learning_rate": 9.878890170301143e-06, "loss": 1.01685848, "memory(GiB)": 368.61, "step": 9200, "train_speed(iter/s)": 0.201506 }, { "acc": 0.75009918, "epoch": 0.23351090816844242, "grad_norm": 1.9375, "learning_rate": 9.878660662995706e-06, "loss": 1.0323637, "memory(GiB)": 368.61, "step": 9205, "train_speed(iter/s)": 0.201517 }, { "acc": 0.74008894, "epoch": 0.23363774733637746, "grad_norm": 2.828125, "learning_rate": 9.878430941104991e-06, "loss": 1.04200306, "memory(GiB)": 368.61, "step": 9210, "train_speed(iter/s)": 0.201539 }, { "acc": 0.73428898, "epoch": 0.23376458650431253, "grad_norm": 2.140625, "learning_rate": 9.878201004639104e-06, "loss": 1.08552256, "memory(GiB)": 368.61, "step": 9215, "train_speed(iter/s)": 0.201561 }, { "acc": 0.75769968, "epoch": 0.23389142567224758, "grad_norm": 2.359375, "learning_rate": 9.877970853608156e-06, "loss": 1.01656818, "memory(GiB)": 368.61, "step": 9220, "train_speed(iter/s)": 0.201577 }, { "acc": 0.73735018, "epoch": 0.23401826484018265, "grad_norm": 2.484375, "learning_rate": 9.87774048802227e-06, "loss": 1.05094633, "memory(GiB)": 368.61, "step": 9225, "train_speed(iter/s)": 0.201576 }, { "acc": 0.73817644, "epoch": 0.2341451040081177, "grad_norm": 2.1875, "learning_rate": 9.877509907891583e-06, "loss": 0.98836079, "memory(GiB)": 368.61, "step": 9230, "train_speed(iter/s)": 0.201598 }, { "acc": 0.75248499, "epoch": 0.23427194317605277, "grad_norm": 2.171875, "learning_rate": 9.877279113226232e-06, "loss": 0.96385384, "memory(GiB)": 368.61, "step": 9235, "train_speed(iter/s)": 0.201608 }, { "acc": 0.74420562, "epoch": 0.2343987823439878, "grad_norm": 2.171875, "learning_rate": 9.87704810403637e-06, "loss": 0.99790401, "memory(GiB)": 368.61, "step": 9240, "train_speed(iter/s)": 0.201618 }, { "acc": 0.7375999, "epoch": 0.23452562151192288, "grad_norm": 2.171875, "learning_rate": 9.876816880332157e-06, "loss": 1.06414051, "memory(GiB)": 368.61, "step": 9245, "train_speed(iter/s)": 0.201622 }, { "acc": 0.73862581, "epoch": 0.23465246067985793, "grad_norm": 1.7265625, "learning_rate": 9.876585442123765e-06, "loss": 1.04752455, "memory(GiB)": 368.61, "step": 9250, "train_speed(iter/s)": 0.201635 }, { "acc": 0.74161987, "epoch": 0.234779299847793, "grad_norm": 2.125, "learning_rate": 9.876353789421373e-06, "loss": 1.08028316, "memory(GiB)": 368.61, "step": 9255, "train_speed(iter/s)": 0.201643 }, { "acc": 0.73050766, "epoch": 0.23490613901572804, "grad_norm": 2.609375, "learning_rate": 9.876121922235171e-06, "loss": 1.11987162, "memory(GiB)": 368.61, "step": 9260, "train_speed(iter/s)": 0.201661 }, { "acc": 0.73952951, "epoch": 0.23503297818366312, "grad_norm": 2.171875, "learning_rate": 9.875889840575356e-06, "loss": 1.0996439, "memory(GiB)": 368.61, "step": 9265, "train_speed(iter/s)": 0.201684 }, { "acc": 0.73511381, "epoch": 0.23515981735159816, "grad_norm": 1.921875, "learning_rate": 9.875657544452135e-06, "loss": 1.05610294, "memory(GiB)": 368.61, "step": 9270, "train_speed(iter/s)": 0.201691 }, { "acc": 0.73084455, "epoch": 0.23528665651953323, "grad_norm": 2.21875, "learning_rate": 9.875425033875728e-06, "loss": 1.03936176, "memory(GiB)": 368.61, "step": 9275, "train_speed(iter/s)": 0.201709 }, { "acc": 0.75215502, "epoch": 0.23541349568746828, "grad_norm": 2.203125, "learning_rate": 9.875192308856363e-06, "loss": 1.03550072, "memory(GiB)": 368.61, "step": 9280, "train_speed(iter/s)": 0.201719 }, { "acc": 0.75238862, "epoch": 0.23554033485540335, "grad_norm": 2.421875, "learning_rate": 9.87495936940427e-06, "loss": 0.99948502, "memory(GiB)": 368.61, "step": 9285, "train_speed(iter/s)": 0.201733 }, { "acc": 0.72860579, "epoch": 0.2356671740233384, "grad_norm": 2.21875, "learning_rate": 9.874726215529702e-06, "loss": 1.07468166, "memory(GiB)": 368.61, "step": 9290, "train_speed(iter/s)": 0.201756 }, { "acc": 0.74595389, "epoch": 0.23579401319127347, "grad_norm": 2.625, "learning_rate": 9.87449284724291e-06, "loss": 0.99457664, "memory(GiB)": 368.61, "step": 9295, "train_speed(iter/s)": 0.201775 }, { "acc": 0.74540682, "epoch": 0.2359208523592085, "grad_norm": 2.0625, "learning_rate": 9.874259264554159e-06, "loss": 1.07426262, "memory(GiB)": 368.61, "step": 9300, "train_speed(iter/s)": 0.201789 }, { "acc": 0.74612188, "epoch": 0.23604769152714358, "grad_norm": 1.8359375, "learning_rate": 9.874025467473722e-06, "loss": 0.98606701, "memory(GiB)": 368.61, "step": 9305, "train_speed(iter/s)": 0.201808 }, { "acc": 0.72713242, "epoch": 0.23617453069507863, "grad_norm": 2.234375, "learning_rate": 9.873791456011887e-06, "loss": 1.06059914, "memory(GiB)": 368.61, "step": 9310, "train_speed(iter/s)": 0.201818 }, { "acc": 0.73870754, "epoch": 0.2363013698630137, "grad_norm": 2.578125, "learning_rate": 9.873557230178942e-06, "loss": 1.03250828, "memory(GiB)": 368.61, "step": 9315, "train_speed(iter/s)": 0.201834 }, { "acc": 0.74122968, "epoch": 0.23642820903094874, "grad_norm": 2.171875, "learning_rate": 9.873322789985191e-06, "loss": 1.0237236, "memory(GiB)": 368.61, "step": 9320, "train_speed(iter/s)": 0.201846 }, { "acc": 0.73718748, "epoch": 0.23655504819888382, "grad_norm": 2.453125, "learning_rate": 9.873088135440949e-06, "loss": 1.06025343, "memory(GiB)": 368.61, "step": 9325, "train_speed(iter/s)": 0.201855 }, { "acc": 0.71952162, "epoch": 0.23668188736681886, "grad_norm": 2.15625, "learning_rate": 9.87285326655653e-06, "loss": 1.11408672, "memory(GiB)": 368.61, "step": 9330, "train_speed(iter/s)": 0.20186 }, { "acc": 0.74154158, "epoch": 0.23680872653475393, "grad_norm": 2.171875, "learning_rate": 9.87261818334227e-06, "loss": 1.03191214, "memory(GiB)": 368.61, "step": 9335, "train_speed(iter/s)": 0.201876 }, { "acc": 0.71535673, "epoch": 0.23693556570268898, "grad_norm": 2.34375, "learning_rate": 9.87238288580851e-06, "loss": 1.11353016, "memory(GiB)": 368.61, "step": 9340, "train_speed(iter/s)": 0.20189 }, { "acc": 0.73455944, "epoch": 0.23706240487062405, "grad_norm": 2.078125, "learning_rate": 9.872147373965594e-06, "loss": 1.08979425, "memory(GiB)": 368.61, "step": 9345, "train_speed(iter/s)": 0.201908 }, { "acc": 0.73503637, "epoch": 0.2371892440385591, "grad_norm": 2.1875, "learning_rate": 9.871911647823884e-06, "loss": 1.09835968, "memory(GiB)": 368.61, "step": 9350, "train_speed(iter/s)": 0.20193 }, { "acc": 0.74491692, "epoch": 0.23731608320649417, "grad_norm": 1.7265625, "learning_rate": 9.871675707393749e-06, "loss": 1.04251308, "memory(GiB)": 368.61, "step": 9355, "train_speed(iter/s)": 0.201945 }, { "acc": 0.74138145, "epoch": 0.2374429223744292, "grad_norm": 2.40625, "learning_rate": 9.871439552685566e-06, "loss": 1.05416517, "memory(GiB)": 368.61, "step": 9360, "train_speed(iter/s)": 0.201961 }, { "acc": 0.73865633, "epoch": 0.23756976154236428, "grad_norm": 1.9375, "learning_rate": 9.871203183709723e-06, "loss": 1.05855598, "memory(GiB)": 368.61, "step": 9365, "train_speed(iter/s)": 0.201977 }, { "acc": 0.75840816, "epoch": 0.23769660071029933, "grad_norm": 2.421875, "learning_rate": 9.870966600476614e-06, "loss": 1.03481674, "memory(GiB)": 368.61, "step": 9370, "train_speed(iter/s)": 0.201999 }, { "acc": 0.75572705, "epoch": 0.2378234398782344, "grad_norm": 2.328125, "learning_rate": 9.870729802996647e-06, "loss": 1.03144512, "memory(GiB)": 368.61, "step": 9375, "train_speed(iter/s)": 0.202013 }, { "acc": 0.73629041, "epoch": 0.23795027904616944, "grad_norm": 2.5, "learning_rate": 9.870492791280239e-06, "loss": 1.01358147, "memory(GiB)": 368.61, "step": 9380, "train_speed(iter/s)": 0.202031 }, { "acc": 0.74271946, "epoch": 0.23807711821410452, "grad_norm": 3.328125, "learning_rate": 9.87025556533781e-06, "loss": 1.00435114, "memory(GiB)": 368.61, "step": 9385, "train_speed(iter/s)": 0.202045 }, { "acc": 0.75820098, "epoch": 0.23820395738203956, "grad_norm": 2.390625, "learning_rate": 9.870018125179799e-06, "loss": 0.99861984, "memory(GiB)": 368.61, "step": 9390, "train_speed(iter/s)": 0.202062 }, { "acc": 0.75410666, "epoch": 0.23833079654997463, "grad_norm": 2.21875, "learning_rate": 9.869780470816647e-06, "loss": 0.97799492, "memory(GiB)": 368.61, "step": 9395, "train_speed(iter/s)": 0.202077 }, { "acc": 0.75190582, "epoch": 0.23845763571790968, "grad_norm": 1.78125, "learning_rate": 9.869542602258809e-06, "loss": 1.00904884, "memory(GiB)": 368.61, "step": 9400, "train_speed(iter/s)": 0.202087 }, { "acc": 0.73703709, "epoch": 0.23858447488584475, "grad_norm": 2.59375, "learning_rate": 9.869304519516745e-06, "loss": 1.11225471, "memory(GiB)": 368.61, "step": 9405, "train_speed(iter/s)": 0.202102 }, { "acc": 0.74145336, "epoch": 0.2387113140537798, "grad_norm": 1.921875, "learning_rate": 9.869066222600928e-06, "loss": 1.08858528, "memory(GiB)": 368.61, "step": 9410, "train_speed(iter/s)": 0.202115 }, { "acc": 0.74155931, "epoch": 0.23883815322171487, "grad_norm": 1.796875, "learning_rate": 9.86882771152184e-06, "loss": 1.03226385, "memory(GiB)": 368.61, "step": 9415, "train_speed(iter/s)": 0.202123 }, { "acc": 0.7244483, "epoch": 0.2389649923896499, "grad_norm": 2.1875, "learning_rate": 9.868588986289973e-06, "loss": 1.14626274, "memory(GiB)": 368.61, "step": 9420, "train_speed(iter/s)": 0.20214 }, { "acc": 0.72730255, "epoch": 0.23909183155758498, "grad_norm": 2.15625, "learning_rate": 9.868350046915825e-06, "loss": 1.13815441, "memory(GiB)": 368.61, "step": 9425, "train_speed(iter/s)": 0.202159 }, { "acc": 0.73922739, "epoch": 0.23921867072552003, "grad_norm": 2.625, "learning_rate": 9.868110893409906e-06, "loss": 1.07255898, "memory(GiB)": 368.61, "step": 9430, "train_speed(iter/s)": 0.202175 }, { "acc": 0.76600657, "epoch": 0.2393455098934551, "grad_norm": 2.5, "learning_rate": 9.867871525782735e-06, "loss": 0.99697971, "memory(GiB)": 368.61, "step": 9435, "train_speed(iter/s)": 0.202194 }, { "acc": 0.74549327, "epoch": 0.23947234906139014, "grad_norm": 2.15625, "learning_rate": 9.86763194404484e-06, "loss": 1.03475008, "memory(GiB)": 368.61, "step": 9440, "train_speed(iter/s)": 0.202215 }, { "acc": 0.73304472, "epoch": 0.23959918822932522, "grad_norm": 2.125, "learning_rate": 9.867392148206762e-06, "loss": 1.05033836, "memory(GiB)": 368.61, "step": 9445, "train_speed(iter/s)": 0.202234 }, { "acc": 0.74135437, "epoch": 0.23972602739726026, "grad_norm": 2.515625, "learning_rate": 9.867152138279043e-06, "loss": 1.05266199, "memory(GiB)": 368.61, "step": 9450, "train_speed(iter/s)": 0.202254 }, { "acc": 0.74516897, "epoch": 0.23985286656519533, "grad_norm": 1.9765625, "learning_rate": 9.866911914272246e-06, "loss": 1.02036467, "memory(GiB)": 368.61, "step": 9455, "train_speed(iter/s)": 0.202267 }, { "acc": 0.74373393, "epoch": 0.23997970573313038, "grad_norm": 2.28125, "learning_rate": 9.866671476196931e-06, "loss": 1.02060795, "memory(GiB)": 368.61, "step": 9460, "train_speed(iter/s)": 0.202283 }, { "acc": 0.75289931, "epoch": 0.24010654490106545, "grad_norm": 2.046875, "learning_rate": 9.866430824063678e-06, "loss": 1.00445385, "memory(GiB)": 368.61, "step": 9465, "train_speed(iter/s)": 0.2023 }, { "acc": 0.75292501, "epoch": 0.2402333840690005, "grad_norm": 2.1875, "learning_rate": 9.86618995788307e-06, "loss": 0.98839521, "memory(GiB)": 368.61, "step": 9470, "train_speed(iter/s)": 0.202307 }, { "acc": 0.72736359, "epoch": 0.24036022323693557, "grad_norm": 4.53125, "learning_rate": 9.865948877665702e-06, "loss": 1.05962563, "memory(GiB)": 368.61, "step": 9475, "train_speed(iter/s)": 0.202326 }, { "acc": 0.73871646, "epoch": 0.2404870624048706, "grad_norm": 2.4375, "learning_rate": 9.865707583422178e-06, "loss": 1.05702467, "memory(GiB)": 368.61, "step": 9480, "train_speed(iter/s)": 0.202347 }, { "acc": 0.74553256, "epoch": 0.24061390157280568, "grad_norm": 2.34375, "learning_rate": 9.865466075163108e-06, "loss": 1.06003008, "memory(GiB)": 368.61, "step": 9485, "train_speed(iter/s)": 0.202356 }, { "acc": 0.7566503, "epoch": 0.24074074074074073, "grad_norm": 2.015625, "learning_rate": 9.86522435289912e-06, "loss": 0.98907986, "memory(GiB)": 368.61, "step": 9490, "train_speed(iter/s)": 0.202373 }, { "acc": 0.73005991, "epoch": 0.2408675799086758, "grad_norm": 2.25, "learning_rate": 9.864982416640843e-06, "loss": 1.10924263, "memory(GiB)": 368.61, "step": 9495, "train_speed(iter/s)": 0.202389 }, { "acc": 0.74287395, "epoch": 0.24099441907661084, "grad_norm": 2.171875, "learning_rate": 9.864740266398918e-06, "loss": 1.0470212, "memory(GiB)": 368.61, "step": 9500, "train_speed(iter/s)": 0.202399 }, { "acc": 0.74197464, "epoch": 0.24112125824454592, "grad_norm": 2.078125, "learning_rate": 9.864497902183996e-06, "loss": 1.0222806, "memory(GiB)": 368.61, "step": 9505, "train_speed(iter/s)": 0.202412 }, { "acc": 0.75131302, "epoch": 0.24124809741248096, "grad_norm": 2.34375, "learning_rate": 9.864255324006738e-06, "loss": 1.01609831, "memory(GiB)": 368.61, "step": 9510, "train_speed(iter/s)": 0.20243 }, { "acc": 0.73558693, "epoch": 0.24137493658041603, "grad_norm": 2.265625, "learning_rate": 9.864012531877814e-06, "loss": 1.08391399, "memory(GiB)": 368.61, "step": 9515, "train_speed(iter/s)": 0.202452 }, { "acc": 0.72947044, "epoch": 0.24150177574835108, "grad_norm": 2.03125, "learning_rate": 9.863769525807903e-06, "loss": 1.0611372, "memory(GiB)": 368.61, "step": 9520, "train_speed(iter/s)": 0.202469 }, { "acc": 0.76227503, "epoch": 0.24162861491628615, "grad_norm": 2.125, "learning_rate": 9.863526305807694e-06, "loss": 0.98603115, "memory(GiB)": 368.61, "step": 9525, "train_speed(iter/s)": 0.202487 }, { "acc": 0.73553104, "epoch": 0.2417554540842212, "grad_norm": 3.109375, "learning_rate": 9.863282871887882e-06, "loss": 1.13135109, "memory(GiB)": 368.61, "step": 9530, "train_speed(iter/s)": 0.202508 }, { "acc": 0.74170971, "epoch": 0.24188229325215627, "grad_norm": 2.09375, "learning_rate": 9.863039224059177e-06, "loss": 1.04981308, "memory(GiB)": 368.61, "step": 9535, "train_speed(iter/s)": 0.202523 }, { "acc": 0.74168749, "epoch": 0.2420091324200913, "grad_norm": 2.171875, "learning_rate": 9.862795362332293e-06, "loss": 1.00522861, "memory(GiB)": 368.61, "step": 9540, "train_speed(iter/s)": 0.202536 }, { "acc": 0.75892696, "epoch": 0.24213597158802638, "grad_norm": 2.0625, "learning_rate": 9.862551286717961e-06, "loss": 0.99774513, "memory(GiB)": 368.61, "step": 9545, "train_speed(iter/s)": 0.202547 }, { "acc": 0.74252944, "epoch": 0.24226281075596143, "grad_norm": 1.9140625, "learning_rate": 9.862306997226914e-06, "loss": 0.99597092, "memory(GiB)": 368.61, "step": 9550, "train_speed(iter/s)": 0.202558 }, { "acc": 0.73198757, "epoch": 0.2423896499238965, "grad_norm": 2.609375, "learning_rate": 9.862062493869895e-06, "loss": 1.08796043, "memory(GiB)": 368.61, "step": 9555, "train_speed(iter/s)": 0.202579 }, { "acc": 0.74612131, "epoch": 0.24251648909183154, "grad_norm": 2.328125, "learning_rate": 9.861817776657661e-06, "loss": 0.99332047, "memory(GiB)": 368.61, "step": 9560, "train_speed(iter/s)": 0.202599 }, { "acc": 0.72837448, "epoch": 0.24264332825976662, "grad_norm": 2.703125, "learning_rate": 9.861572845600973e-06, "loss": 1.06988792, "memory(GiB)": 368.61, "step": 9565, "train_speed(iter/s)": 0.202624 }, { "acc": 0.74227371, "epoch": 0.24277016742770166, "grad_norm": 1.9921875, "learning_rate": 9.861327700710608e-06, "loss": 1.04134836, "memory(GiB)": 368.61, "step": 9570, "train_speed(iter/s)": 0.202636 }, { "acc": 0.73422976, "epoch": 0.24289700659563673, "grad_norm": 2.234375, "learning_rate": 9.861082341997345e-06, "loss": 1.08577261, "memory(GiB)": 368.61, "step": 9575, "train_speed(iter/s)": 0.202656 }, { "acc": 0.74406095, "epoch": 0.24302384576357178, "grad_norm": 1.8046875, "learning_rate": 9.860836769471977e-06, "loss": 1.02871437, "memory(GiB)": 368.61, "step": 9580, "train_speed(iter/s)": 0.202671 }, { "acc": 0.74128885, "epoch": 0.24315068493150685, "grad_norm": 2.109375, "learning_rate": 9.860590983145307e-06, "loss": 1.07427959, "memory(GiB)": 368.61, "step": 9585, "train_speed(iter/s)": 0.202684 }, { "acc": 0.74127636, "epoch": 0.2432775240994419, "grad_norm": 2.3125, "learning_rate": 9.860344983028146e-06, "loss": 1.05031528, "memory(GiB)": 368.61, "step": 9590, "train_speed(iter/s)": 0.202683 }, { "acc": 0.73975239, "epoch": 0.24340436326737697, "grad_norm": 2.09375, "learning_rate": 9.86009876913131e-06, "loss": 1.08819637, "memory(GiB)": 368.61, "step": 9595, "train_speed(iter/s)": 0.202698 }, { "acc": 0.73571992, "epoch": 0.243531202435312, "grad_norm": 1.6796875, "learning_rate": 9.859852341465633e-06, "loss": 1.09783478, "memory(GiB)": 368.61, "step": 9600, "train_speed(iter/s)": 0.202714 }, { "acc": 0.74063482, "epoch": 0.24365804160324708, "grad_norm": 2.25, "learning_rate": 9.859605700041951e-06, "loss": 1.08587332, "memory(GiB)": 368.61, "step": 9605, "train_speed(iter/s)": 0.202722 }, { "acc": 0.76402526, "epoch": 0.24378488077118213, "grad_norm": 2.328125, "learning_rate": 9.859358844871113e-06, "loss": 0.97599564, "memory(GiB)": 368.61, "step": 9610, "train_speed(iter/s)": 0.202741 }, { "acc": 0.7404356, "epoch": 0.2439117199391172, "grad_norm": 2.234375, "learning_rate": 9.859111775963981e-06, "loss": 1.02161865, "memory(GiB)": 368.61, "step": 9615, "train_speed(iter/s)": 0.202759 }, { "acc": 0.74415512, "epoch": 0.24403855910705224, "grad_norm": 2.296875, "learning_rate": 9.858864493331417e-06, "loss": 1.05508633, "memory(GiB)": 368.61, "step": 9620, "train_speed(iter/s)": 0.202778 }, { "acc": 0.7264678, "epoch": 0.24416539827498732, "grad_norm": 2.46875, "learning_rate": 9.858616996984297e-06, "loss": 1.10120392, "memory(GiB)": 368.61, "step": 9625, "train_speed(iter/s)": 0.202801 }, { "acc": 0.73313031, "epoch": 0.24429223744292236, "grad_norm": 2.296875, "learning_rate": 9.858369286933513e-06, "loss": 1.07365541, "memory(GiB)": 368.61, "step": 9630, "train_speed(iter/s)": 0.202821 }, { "acc": 0.74582896, "epoch": 0.24441907661085743, "grad_norm": 2.109375, "learning_rate": 9.858121363189954e-06, "loss": 1.04701061, "memory(GiB)": 368.61, "step": 9635, "train_speed(iter/s)": 0.202834 }, { "acc": 0.73679628, "epoch": 0.24454591577879248, "grad_norm": 2.3125, "learning_rate": 9.85787322576453e-06, "loss": 1.11666317, "memory(GiB)": 368.61, "step": 9640, "train_speed(iter/s)": 0.202834 }, { "acc": 0.73340349, "epoch": 0.24467275494672755, "grad_norm": 1.765625, "learning_rate": 9.85762487466815e-06, "loss": 1.03891954, "memory(GiB)": 368.61, "step": 9645, "train_speed(iter/s)": 0.202844 }, { "acc": 0.7509738, "epoch": 0.2447995941146626, "grad_norm": 2.203125, "learning_rate": 9.857376309911741e-06, "loss": 0.94749241, "memory(GiB)": 368.61, "step": 9650, "train_speed(iter/s)": 0.202854 }, { "acc": 0.74389329, "epoch": 0.24492643328259767, "grad_norm": 2.0625, "learning_rate": 9.857127531506237e-06, "loss": 1.08312712, "memory(GiB)": 368.61, "step": 9655, "train_speed(iter/s)": 0.202867 }, { "acc": 0.73578773, "epoch": 0.2450532724505327, "grad_norm": 2.296875, "learning_rate": 9.856878539462577e-06, "loss": 1.03982449, "memory(GiB)": 368.61, "step": 9660, "train_speed(iter/s)": 0.202876 }, { "acc": 0.75364275, "epoch": 0.24518011161846778, "grad_norm": 2.140625, "learning_rate": 9.856629333791716e-06, "loss": 0.99728117, "memory(GiB)": 368.61, "step": 9665, "train_speed(iter/s)": 0.202883 }, { "acc": 0.74030485, "epoch": 0.24530695078640283, "grad_norm": 1.96875, "learning_rate": 9.856379914504612e-06, "loss": 1.05124903, "memory(GiB)": 368.61, "step": 9670, "train_speed(iter/s)": 0.202897 }, { "acc": 0.73856678, "epoch": 0.2454337899543379, "grad_norm": 1.890625, "learning_rate": 9.856130281612237e-06, "loss": 1.05655088, "memory(GiB)": 368.61, "step": 9675, "train_speed(iter/s)": 0.202915 }, { "acc": 0.73314295, "epoch": 0.24556062912227294, "grad_norm": 2.09375, "learning_rate": 9.855880435125572e-06, "loss": 1.01467695, "memory(GiB)": 368.61, "step": 9680, "train_speed(iter/s)": 0.202938 }, { "acc": 0.73917542, "epoch": 0.24568746829020802, "grad_norm": 2.265625, "learning_rate": 9.855630375055604e-06, "loss": 1.03570461, "memory(GiB)": 368.61, "step": 9685, "train_speed(iter/s)": 0.202954 }, { "acc": 0.73574371, "epoch": 0.24581430745814306, "grad_norm": 2.03125, "learning_rate": 9.855380101413336e-06, "loss": 1.07352219, "memory(GiB)": 368.61, "step": 9690, "train_speed(iter/s)": 0.202968 }, { "acc": 0.74163022, "epoch": 0.24594114662607813, "grad_norm": 2.0625, "learning_rate": 9.855129614209771e-06, "loss": 1.03289642, "memory(GiB)": 368.61, "step": 9695, "train_speed(iter/s)": 0.202985 }, { "acc": 0.74101343, "epoch": 0.24606798579401318, "grad_norm": 2.515625, "learning_rate": 9.85487891345593e-06, "loss": 1.09811964, "memory(GiB)": 368.61, "step": 9700, "train_speed(iter/s)": 0.203007 }, { "acc": 0.72718306, "epoch": 0.24619482496194825, "grad_norm": 2.703125, "learning_rate": 9.85462799916284e-06, "loss": 1.11433315, "memory(GiB)": 368.61, "step": 9705, "train_speed(iter/s)": 0.203027 }, { "acc": 0.73313422, "epoch": 0.2463216641298833, "grad_norm": 2.21875, "learning_rate": 9.854376871341535e-06, "loss": 1.08877506, "memory(GiB)": 368.61, "step": 9710, "train_speed(iter/s)": 0.203046 }, { "acc": 0.73142338, "epoch": 0.24644850329781837, "grad_norm": 2.703125, "learning_rate": 9.854125530003063e-06, "loss": 1.08904419, "memory(GiB)": 368.61, "step": 9715, "train_speed(iter/s)": 0.203066 }, { "acc": 0.74216452, "epoch": 0.2465753424657534, "grad_norm": 2.46875, "learning_rate": 9.853873975158476e-06, "loss": 1.03389378, "memory(GiB)": 368.61, "step": 9720, "train_speed(iter/s)": 0.203082 }, { "acc": 0.73465996, "epoch": 0.24670218163368848, "grad_norm": 2.21875, "learning_rate": 9.853622206818842e-06, "loss": 1.10083065, "memory(GiB)": 368.61, "step": 9725, "train_speed(iter/s)": 0.203095 }, { "acc": 0.73907957, "epoch": 0.24682902080162353, "grad_norm": 1.984375, "learning_rate": 9.853370224995233e-06, "loss": 1.02831898, "memory(GiB)": 368.61, "step": 9730, "train_speed(iter/s)": 0.203107 }, { "acc": 0.75611801, "epoch": 0.2469558599695586, "grad_norm": 2.09375, "learning_rate": 9.853118029698733e-06, "loss": 0.98173084, "memory(GiB)": 368.61, "step": 9735, "train_speed(iter/s)": 0.203103 }, { "acc": 0.74910946, "epoch": 0.24708269913749364, "grad_norm": 2.5, "learning_rate": 9.852865620940436e-06, "loss": 1.0764514, "memory(GiB)": 368.61, "step": 9740, "train_speed(iter/s)": 0.203119 }, { "acc": 0.73476009, "epoch": 0.24720953830542872, "grad_norm": 1.8046875, "learning_rate": 9.85261299873144e-06, "loss": 1.05936623, "memory(GiB)": 368.61, "step": 9745, "train_speed(iter/s)": 0.203135 }, { "acc": 0.75461798, "epoch": 0.24733637747336376, "grad_norm": 2.203125, "learning_rate": 9.85236016308286e-06, "loss": 1.03592834, "memory(GiB)": 368.61, "step": 9750, "train_speed(iter/s)": 0.203153 }, { "acc": 0.73853636, "epoch": 0.24746321664129883, "grad_norm": 2.03125, "learning_rate": 9.852107114005816e-06, "loss": 1.04997587, "memory(GiB)": 368.61, "step": 9755, "train_speed(iter/s)": 0.203167 }, { "acc": 0.75599108, "epoch": 0.24759005580923388, "grad_norm": 2.015625, "learning_rate": 9.851853851511437e-06, "loss": 1.00309086, "memory(GiB)": 368.61, "step": 9760, "train_speed(iter/s)": 0.203182 }, { "acc": 0.74531217, "epoch": 0.24771689497716895, "grad_norm": 2.53125, "learning_rate": 9.851600375610864e-06, "loss": 1.01119003, "memory(GiB)": 368.61, "step": 9765, "train_speed(iter/s)": 0.203204 }, { "acc": 0.74145083, "epoch": 0.247843734145104, "grad_norm": 2.34375, "learning_rate": 9.851346686315246e-06, "loss": 1.04244957, "memory(GiB)": 368.61, "step": 9770, "train_speed(iter/s)": 0.203218 }, { "acc": 0.73580065, "epoch": 0.24797057331303907, "grad_norm": 1.75, "learning_rate": 9.851092783635742e-06, "loss": 1.07453842, "memory(GiB)": 368.61, "step": 9775, "train_speed(iter/s)": 0.203237 }, { "acc": 0.74957061, "epoch": 0.2480974124809741, "grad_norm": 2.4375, "learning_rate": 9.850838667583518e-06, "loss": 1.0475462, "memory(GiB)": 368.61, "step": 9780, "train_speed(iter/s)": 0.203257 }, { "acc": 0.74933996, "epoch": 0.24822425164890918, "grad_norm": 2.3125, "learning_rate": 9.850584338169752e-06, "loss": 1.02206306, "memory(GiB)": 368.61, "step": 9785, "train_speed(iter/s)": 0.203276 }, { "acc": 0.73291616, "epoch": 0.24835109081684423, "grad_norm": 2.09375, "learning_rate": 9.85032979540563e-06, "loss": 1.0562171, "memory(GiB)": 368.61, "step": 9790, "train_speed(iter/s)": 0.203291 }, { "acc": 0.74146686, "epoch": 0.2484779299847793, "grad_norm": 2.015625, "learning_rate": 9.85007503930235e-06, "loss": 1.05388422, "memory(GiB)": 368.61, "step": 9795, "train_speed(iter/s)": 0.203311 }, { "acc": 0.74767437, "epoch": 0.24860476915271434, "grad_norm": 2.140625, "learning_rate": 9.849820069871114e-06, "loss": 1.05779686, "memory(GiB)": 368.61, "step": 9800, "train_speed(iter/s)": 0.203326 }, { "acc": 0.73726649, "epoch": 0.24873160832064942, "grad_norm": 2.8125, "learning_rate": 9.849564887123138e-06, "loss": 1.07530327, "memory(GiB)": 368.61, "step": 9805, "train_speed(iter/s)": 0.20334 }, { "acc": 0.73086615, "epoch": 0.24885844748858446, "grad_norm": 2.0625, "learning_rate": 9.849309491069647e-06, "loss": 1.06164007, "memory(GiB)": 368.61, "step": 9810, "train_speed(iter/s)": 0.203354 }, { "acc": 0.74037523, "epoch": 0.24898528665651953, "grad_norm": 2.703125, "learning_rate": 9.849053881721876e-06, "loss": 1.04182968, "memory(GiB)": 368.61, "step": 9815, "train_speed(iter/s)": 0.203366 }, { "acc": 0.7425487, "epoch": 0.24911212582445458, "grad_norm": 1.953125, "learning_rate": 9.848798059091064e-06, "loss": 1.08162231, "memory(GiB)": 368.61, "step": 9820, "train_speed(iter/s)": 0.203389 }, { "acc": 0.74579201, "epoch": 0.24923896499238965, "grad_norm": 2.203125, "learning_rate": 9.848542023188466e-06, "loss": 1.05753632, "memory(GiB)": 368.61, "step": 9825, "train_speed(iter/s)": 0.203398 }, { "acc": 0.75904903, "epoch": 0.2493658041603247, "grad_norm": 2.140625, "learning_rate": 9.848285774025342e-06, "loss": 0.96569653, "memory(GiB)": 368.61, "step": 9830, "train_speed(iter/s)": 0.203407 }, { "acc": 0.73657341, "epoch": 0.24949264332825977, "grad_norm": 2.390625, "learning_rate": 9.848029311612963e-06, "loss": 1.05797596, "memory(GiB)": 368.61, "step": 9835, "train_speed(iter/s)": 0.203418 }, { "acc": 0.74709406, "epoch": 0.2496194824961948, "grad_norm": 2.53125, "learning_rate": 9.84777263596261e-06, "loss": 1.02490034, "memory(GiB)": 368.61, "step": 9840, "train_speed(iter/s)": 0.20343 }, { "acc": 0.73682308, "epoch": 0.24974632166412988, "grad_norm": 2.046875, "learning_rate": 9.847515747085573e-06, "loss": 1.06823196, "memory(GiB)": 368.61, "step": 9845, "train_speed(iter/s)": 0.203445 }, { "acc": 0.74068351, "epoch": 0.24987316083206493, "grad_norm": 2.109375, "learning_rate": 9.847258644993151e-06, "loss": 1.07476482, "memory(GiB)": 368.61, "step": 9850, "train_speed(iter/s)": 0.203454 }, { "acc": 0.72194796, "epoch": 0.25, "grad_norm": 1.9921875, "learning_rate": 9.847001329696653e-06, "loss": 1.0886879, "memory(GiB)": 368.61, "step": 9855, "train_speed(iter/s)": 0.203469 }, { "acc": 0.74522314, "epoch": 0.25012683916793504, "grad_norm": 2.03125, "learning_rate": 9.846743801207395e-06, "loss": 1.00839872, "memory(GiB)": 368.61, "step": 9860, "train_speed(iter/s)": 0.203485 }, { "acc": 0.73205256, "epoch": 0.25025367833587014, "grad_norm": 2.265625, "learning_rate": 9.846486059536706e-06, "loss": 1.08664799, "memory(GiB)": 368.61, "step": 9865, "train_speed(iter/s)": 0.203488 }, { "acc": 0.73844924, "epoch": 0.2503805175038052, "grad_norm": 1.8828125, "learning_rate": 9.846228104695922e-06, "loss": 1.0310173, "memory(GiB)": 368.61, "step": 9870, "train_speed(iter/s)": 0.203503 }, { "acc": 0.72697868, "epoch": 0.25050735667174023, "grad_norm": 2.296875, "learning_rate": 9.84596993669639e-06, "loss": 1.13920364, "memory(GiB)": 368.61, "step": 9875, "train_speed(iter/s)": 0.203518 }, { "acc": 0.74173675, "epoch": 0.2506341958396753, "grad_norm": 3.40625, "learning_rate": 9.845711555549464e-06, "loss": 1.06615582, "memory(GiB)": 368.61, "step": 9880, "train_speed(iter/s)": 0.203526 }, { "acc": 0.74279304, "epoch": 0.2507610350076104, "grad_norm": 2.046875, "learning_rate": 9.845452961266509e-06, "loss": 1.01195831, "memory(GiB)": 368.61, "step": 9885, "train_speed(iter/s)": 0.203538 }, { "acc": 0.74493723, "epoch": 0.2508878741755454, "grad_norm": 2.15625, "learning_rate": 9.845194153858899e-06, "loss": 1.03076248, "memory(GiB)": 368.61, "step": 9890, "train_speed(iter/s)": 0.203554 }, { "acc": 0.7366292, "epoch": 0.25101471334348047, "grad_norm": 2.140625, "learning_rate": 9.844935133338018e-06, "loss": 1.03896103, "memory(GiB)": 368.61, "step": 9895, "train_speed(iter/s)": 0.203567 }, { "acc": 0.72196484, "epoch": 0.2511415525114155, "grad_norm": 2.046875, "learning_rate": 9.84467589971526e-06, "loss": 1.1377533, "memory(GiB)": 368.61, "step": 9900, "train_speed(iter/s)": 0.203573 }, { "acc": 0.72984171, "epoch": 0.2512683916793506, "grad_norm": 2.328125, "learning_rate": 9.844416453002027e-06, "loss": 1.07620735, "memory(GiB)": 368.61, "step": 9905, "train_speed(iter/s)": 0.203577 }, { "acc": 0.73969374, "epoch": 0.25139523084728566, "grad_norm": 2.125, "learning_rate": 9.844156793209725e-06, "loss": 1.02285957, "memory(GiB)": 368.61, "step": 9910, "train_speed(iter/s)": 0.203595 }, { "acc": 0.74609032, "epoch": 0.2515220700152207, "grad_norm": 2.234375, "learning_rate": 9.843896920349783e-06, "loss": 1.02846031, "memory(GiB)": 368.61, "step": 9915, "train_speed(iter/s)": 0.203606 }, { "acc": 0.74171534, "epoch": 0.25164890918315574, "grad_norm": 1.9296875, "learning_rate": 9.843636834433627e-06, "loss": 1.04563122, "memory(GiB)": 368.61, "step": 9920, "train_speed(iter/s)": 0.203625 }, { "acc": 0.74187913, "epoch": 0.25177574835109084, "grad_norm": 2.203125, "learning_rate": 9.843376535472698e-06, "loss": 1.02129078, "memory(GiB)": 368.61, "step": 9925, "train_speed(iter/s)": 0.20364 }, { "acc": 0.75276461, "epoch": 0.2519025875190259, "grad_norm": 1.7421875, "learning_rate": 9.843116023478445e-06, "loss": 0.97208633, "memory(GiB)": 368.61, "step": 9930, "train_speed(iter/s)": 0.203658 }, { "acc": 0.73748832, "epoch": 0.25202942668696093, "grad_norm": 2.046875, "learning_rate": 9.842855298462327e-06, "loss": 1.04450283, "memory(GiB)": 368.61, "step": 9935, "train_speed(iter/s)": 0.203674 }, { "acc": 0.73464231, "epoch": 0.252156265854896, "grad_norm": 2.5625, "learning_rate": 9.84259436043581e-06, "loss": 1.11152382, "memory(GiB)": 368.61, "step": 9940, "train_speed(iter/s)": 0.203692 }, { "acc": 0.75037055, "epoch": 0.2522831050228311, "grad_norm": 2.390625, "learning_rate": 9.842333209410372e-06, "loss": 1.0109127, "memory(GiB)": 368.61, "step": 9945, "train_speed(iter/s)": 0.203699 }, { "acc": 0.74865408, "epoch": 0.2524099441907661, "grad_norm": 2.0625, "learning_rate": 9.842071845397502e-06, "loss": 0.97456207, "memory(GiB)": 368.61, "step": 9950, "train_speed(iter/s)": 0.203715 }, { "acc": 0.73347034, "epoch": 0.25253678335870117, "grad_norm": 2.109375, "learning_rate": 9.841810268408692e-06, "loss": 1.07873793, "memory(GiB)": 368.61, "step": 9955, "train_speed(iter/s)": 0.203727 }, { "acc": 0.74374228, "epoch": 0.2526636225266362, "grad_norm": 2.0625, "learning_rate": 9.841548478455451e-06, "loss": 1.05704365, "memory(GiB)": 368.61, "step": 9960, "train_speed(iter/s)": 0.203736 }, { "acc": 0.73867922, "epoch": 0.2527904616945713, "grad_norm": 2.203125, "learning_rate": 9.841286475549291e-06, "loss": 1.05822144, "memory(GiB)": 368.61, "step": 9965, "train_speed(iter/s)": 0.203752 }, { "acc": 0.74317656, "epoch": 0.25291730086250636, "grad_norm": 2.265625, "learning_rate": 9.841024259701737e-06, "loss": 1.01518631, "memory(GiB)": 368.61, "step": 9970, "train_speed(iter/s)": 0.203761 }, { "acc": 0.74578748, "epoch": 0.2530441400304414, "grad_norm": 2.09375, "learning_rate": 9.840761830924323e-06, "loss": 1.03486233, "memory(GiB)": 368.61, "step": 9975, "train_speed(iter/s)": 0.203783 }, { "acc": 0.7311235, "epoch": 0.25317097919837644, "grad_norm": 2.8125, "learning_rate": 9.84049918922859e-06, "loss": 1.06526527, "memory(GiB)": 368.61, "step": 9980, "train_speed(iter/s)": 0.203803 }, { "acc": 0.73672681, "epoch": 0.25329781836631154, "grad_norm": 2.203125, "learning_rate": 9.840236334626091e-06, "loss": 1.0311058, "memory(GiB)": 368.61, "step": 9985, "train_speed(iter/s)": 0.203812 }, { "acc": 0.73819642, "epoch": 0.2534246575342466, "grad_norm": 1.7890625, "learning_rate": 9.83997326712839e-06, "loss": 1.02926826, "memory(GiB)": 368.61, "step": 9990, "train_speed(iter/s)": 0.203827 }, { "acc": 0.75835061, "epoch": 0.25355149670218163, "grad_norm": 1.890625, "learning_rate": 9.839709986747054e-06, "loss": 1.01053066, "memory(GiB)": 368.61, "step": 9995, "train_speed(iter/s)": 0.203845 }, { "acc": 0.74989471, "epoch": 0.2536783358701167, "grad_norm": 1.890625, "learning_rate": 9.839446493493667e-06, "loss": 1.00320168, "memory(GiB)": 368.61, "step": 10000, "train_speed(iter/s)": 0.203858 }, { "epoch": 0.2536783358701167, "eval_acc": 0.7296803202771301, "eval_loss": 1.0127180814743042, "eval_runtime": 384.3233, "eval_samples_per_second": 16.575, "eval_steps_per_second": 8.287, "step": 10000 }, { "acc": 0.74177256, "epoch": 0.2538051750380518, "grad_norm": 2.078125, "learning_rate": 9.839182787379815e-06, "loss": 0.99234543, "memory(GiB)": 368.61, "step": 10005, "train_speed(iter/s)": 0.200951 }, { "acc": 0.74423146, "epoch": 0.2539320142059868, "grad_norm": 2.265625, "learning_rate": 9.8389188684171e-06, "loss": 0.98791199, "memory(GiB)": 368.61, "step": 10010, "train_speed(iter/s)": 0.200963 }, { "acc": 0.74693289, "epoch": 0.25405885337392187, "grad_norm": 2.125, "learning_rate": 9.838654736617128e-06, "loss": 0.98997211, "memory(GiB)": 368.61, "step": 10015, "train_speed(iter/s)": 0.20098 }, { "acc": 0.74784489, "epoch": 0.2541856925418569, "grad_norm": 2.234375, "learning_rate": 9.838390391991517e-06, "loss": 1.02433033, "memory(GiB)": 368.61, "step": 10020, "train_speed(iter/s)": 0.201002 }, { "acc": 0.73650904, "epoch": 0.254312531709792, "grad_norm": 2.375, "learning_rate": 9.838125834551895e-06, "loss": 1.08018093, "memory(GiB)": 368.61, "step": 10025, "train_speed(iter/s)": 0.201013 }, { "acc": 0.7445734, "epoch": 0.25443937087772706, "grad_norm": 1.9921875, "learning_rate": 9.837861064309899e-06, "loss": 1.06311626, "memory(GiB)": 368.61, "step": 10030, "train_speed(iter/s)": 0.20103 }, { "acc": 0.74086914, "epoch": 0.2545662100456621, "grad_norm": 2.21875, "learning_rate": 9.837596081277173e-06, "loss": 1.00475197, "memory(GiB)": 368.61, "step": 10035, "train_speed(iter/s)": 0.20103 }, { "acc": 0.7452579, "epoch": 0.25469304921359714, "grad_norm": 2.328125, "learning_rate": 9.837330885465373e-06, "loss": 1.04115314, "memory(GiB)": 368.61, "step": 10040, "train_speed(iter/s)": 0.201048 }, { "acc": 0.73913803, "epoch": 0.25481988838153224, "grad_norm": 2.171875, "learning_rate": 9.837065476886163e-06, "loss": 1.08892517, "memory(GiB)": 368.61, "step": 10045, "train_speed(iter/s)": 0.201062 }, { "acc": 0.73405299, "epoch": 0.2549467275494673, "grad_norm": 2.296875, "learning_rate": 9.83679985555122e-06, "loss": 1.04936714, "memory(GiB)": 368.61, "step": 10050, "train_speed(iter/s)": 0.201077 }, { "acc": 0.73953996, "epoch": 0.25507356671740233, "grad_norm": 2.203125, "learning_rate": 9.836534021472222e-06, "loss": 1.02921219, "memory(GiB)": 368.61, "step": 10055, "train_speed(iter/s)": 0.201097 }, { "acc": 0.74027395, "epoch": 0.2552004058853374, "grad_norm": 2.109375, "learning_rate": 9.836267974660866e-06, "loss": 1.03705006, "memory(GiB)": 368.61, "step": 10060, "train_speed(iter/s)": 0.201111 }, { "acc": 0.73598032, "epoch": 0.2553272450532725, "grad_norm": 2.078125, "learning_rate": 9.836001715128851e-06, "loss": 1.09526052, "memory(GiB)": 368.61, "step": 10065, "train_speed(iter/s)": 0.201122 }, { "acc": 0.7394002, "epoch": 0.2554540842212075, "grad_norm": 2.375, "learning_rate": 9.835735242887889e-06, "loss": 1.01746569, "memory(GiB)": 368.61, "step": 10070, "train_speed(iter/s)": 0.20114 }, { "acc": 0.73827963, "epoch": 0.25558092338914257, "grad_norm": 1.984375, "learning_rate": 9.835468557949701e-06, "loss": 1.03954372, "memory(GiB)": 368.61, "step": 10075, "train_speed(iter/s)": 0.201152 }, { "acc": 0.73729296, "epoch": 0.2557077625570776, "grad_norm": 2.125, "learning_rate": 9.83520166032602e-06, "loss": 1.07665081, "memory(GiB)": 368.61, "step": 10080, "train_speed(iter/s)": 0.201168 }, { "acc": 0.7327435, "epoch": 0.2558346017250127, "grad_norm": 2.40625, "learning_rate": 9.834934550028579e-06, "loss": 1.03667297, "memory(GiB)": 368.61, "step": 10085, "train_speed(iter/s)": 0.201188 }, { "acc": 0.73932452, "epoch": 0.25596144089294776, "grad_norm": 2.25, "learning_rate": 9.83466722706913e-06, "loss": 1.07072144, "memory(GiB)": 368.61, "step": 10090, "train_speed(iter/s)": 0.201204 }, { "acc": 0.74110088, "epoch": 0.2560882800608828, "grad_norm": 2.21875, "learning_rate": 9.834399691459433e-06, "loss": 1.03512573, "memory(GiB)": 368.61, "step": 10095, "train_speed(iter/s)": 0.20122 }, { "acc": 0.74097185, "epoch": 0.25621511922881784, "grad_norm": 2.375, "learning_rate": 9.83413194321125e-06, "loss": 1.02702122, "memory(GiB)": 368.61, "step": 10100, "train_speed(iter/s)": 0.201232 }, { "acc": 0.74468689, "epoch": 0.25634195839675294, "grad_norm": 2.28125, "learning_rate": 9.833863982336365e-06, "loss": 0.9997323, "memory(GiB)": 368.61, "step": 10105, "train_speed(iter/s)": 0.201245 }, { "acc": 0.74865398, "epoch": 0.256468797564688, "grad_norm": 2.0, "learning_rate": 9.83359580884656e-06, "loss": 1.03092022, "memory(GiB)": 368.61, "step": 10110, "train_speed(iter/s)": 0.201267 }, { "acc": 0.74539461, "epoch": 0.25659563673262303, "grad_norm": 2.15625, "learning_rate": 9.83332742275363e-06, "loss": 1.05389347, "memory(GiB)": 368.61, "step": 10115, "train_speed(iter/s)": 0.201272 }, { "acc": 0.72800231, "epoch": 0.2567224759005581, "grad_norm": 2.375, "learning_rate": 9.833058824069382e-06, "loss": 1.07010746, "memory(GiB)": 368.61, "step": 10120, "train_speed(iter/s)": 0.201281 }, { "acc": 0.74328456, "epoch": 0.2568493150684932, "grad_norm": 2.5625, "learning_rate": 9.832790012805626e-06, "loss": 1.03763638, "memory(GiB)": 368.61, "step": 10125, "train_speed(iter/s)": 0.20129 }, { "acc": 0.74804735, "epoch": 0.2569761542364282, "grad_norm": 2.28125, "learning_rate": 9.832520988974191e-06, "loss": 1.00333691, "memory(GiB)": 368.61, "step": 10130, "train_speed(iter/s)": 0.201308 }, { "acc": 0.75622854, "epoch": 0.25710299340436327, "grad_norm": 2.109375, "learning_rate": 9.832251752586907e-06, "loss": 0.97704973, "memory(GiB)": 368.61, "step": 10135, "train_speed(iter/s)": 0.201317 }, { "acc": 0.74454684, "epoch": 0.2572298325722983, "grad_norm": 1.953125, "learning_rate": 9.831982303655617e-06, "loss": 1.00075188, "memory(GiB)": 368.61, "step": 10140, "train_speed(iter/s)": 0.201336 }, { "acc": 0.74328594, "epoch": 0.2573566717402334, "grad_norm": 2.15625, "learning_rate": 9.83171264219217e-06, "loss": 1.00573978, "memory(GiB)": 368.61, "step": 10145, "train_speed(iter/s)": 0.201348 }, { "acc": 0.76793022, "epoch": 0.25748351090816846, "grad_norm": 2.234375, "learning_rate": 9.831442768208429e-06, "loss": 0.92903366, "memory(GiB)": 368.61, "step": 10150, "train_speed(iter/s)": 0.201345 }, { "acc": 0.75711141, "epoch": 0.2576103500761035, "grad_norm": 2.046875, "learning_rate": 9.831172681716265e-06, "loss": 1.02166958, "memory(GiB)": 368.61, "step": 10155, "train_speed(iter/s)": 0.201345 }, { "acc": 0.75653629, "epoch": 0.25773718924403854, "grad_norm": 2.078125, "learning_rate": 9.830902382727556e-06, "loss": 1.00990601, "memory(GiB)": 368.61, "step": 10160, "train_speed(iter/s)": 0.201359 }, { "acc": 0.75159798, "epoch": 0.25786402841197364, "grad_norm": 1.984375, "learning_rate": 9.830631871254193e-06, "loss": 0.99651327, "memory(GiB)": 368.61, "step": 10165, "train_speed(iter/s)": 0.201373 }, { "acc": 0.75020604, "epoch": 0.2579908675799087, "grad_norm": 2.15625, "learning_rate": 9.830361147308074e-06, "loss": 1.01959667, "memory(GiB)": 368.61, "step": 10170, "train_speed(iter/s)": 0.201393 }, { "acc": 0.72871437, "epoch": 0.25811770674784373, "grad_norm": 1.9140625, "learning_rate": 9.830090210901104e-06, "loss": 1.05478535, "memory(GiB)": 368.61, "step": 10175, "train_speed(iter/s)": 0.201411 }, { "acc": 0.74672642, "epoch": 0.2582445459157788, "grad_norm": 2.359375, "learning_rate": 9.829819062045203e-06, "loss": 1.03836994, "memory(GiB)": 368.61, "step": 10180, "train_speed(iter/s)": 0.201421 }, { "acc": 0.75322809, "epoch": 0.2583713850837139, "grad_norm": 2.09375, "learning_rate": 9.829547700752295e-06, "loss": 0.97074604, "memory(GiB)": 368.61, "step": 10185, "train_speed(iter/s)": 0.201435 }, { "acc": 0.72366343, "epoch": 0.2584982242516489, "grad_norm": 2.28125, "learning_rate": 9.829276127034315e-06, "loss": 1.09677305, "memory(GiB)": 368.61, "step": 10190, "train_speed(iter/s)": 0.201444 }, { "acc": 0.73360653, "epoch": 0.25862506341958397, "grad_norm": 2.453125, "learning_rate": 9.829004340903214e-06, "loss": 1.06665468, "memory(GiB)": 368.61, "step": 10195, "train_speed(iter/s)": 0.201463 }, { "acc": 0.73701181, "epoch": 0.258751902587519, "grad_norm": 1.7890625, "learning_rate": 9.82873234237094e-06, "loss": 1.09553699, "memory(GiB)": 368.61, "step": 10200, "train_speed(iter/s)": 0.201482 }, { "acc": 0.72836962, "epoch": 0.2588787417554541, "grad_norm": 2.796875, "learning_rate": 9.828460131449457e-06, "loss": 1.09376945, "memory(GiB)": 368.61, "step": 10205, "train_speed(iter/s)": 0.2015 }, { "acc": 0.72715292, "epoch": 0.25900558092338916, "grad_norm": 2.203125, "learning_rate": 9.828187708150743e-06, "loss": 1.06984587, "memory(GiB)": 368.61, "step": 10210, "train_speed(iter/s)": 0.201514 }, { "acc": 0.75371404, "epoch": 0.2591324200913242, "grad_norm": 2.078125, "learning_rate": 9.827915072486776e-06, "loss": 0.99922752, "memory(GiB)": 368.61, "step": 10215, "train_speed(iter/s)": 0.201526 }, { "acc": 0.74566059, "epoch": 0.25925925925925924, "grad_norm": 2.375, "learning_rate": 9.827642224469547e-06, "loss": 1.02944193, "memory(GiB)": 368.61, "step": 10220, "train_speed(iter/s)": 0.201536 }, { "acc": 0.73569975, "epoch": 0.25938609842719434, "grad_norm": 2.53125, "learning_rate": 9.827369164111062e-06, "loss": 1.09774151, "memory(GiB)": 368.61, "step": 10225, "train_speed(iter/s)": 0.201548 }, { "acc": 0.73059702, "epoch": 0.2595129375951294, "grad_norm": 2.53125, "learning_rate": 9.827095891423328e-06, "loss": 1.10837603, "memory(GiB)": 368.61, "step": 10230, "train_speed(iter/s)": 0.201568 }, { "acc": 0.75517845, "epoch": 0.25963977676306443, "grad_norm": 2.359375, "learning_rate": 9.826822406418366e-06, "loss": 0.96015034, "memory(GiB)": 368.61, "step": 10235, "train_speed(iter/s)": 0.201582 }, { "acc": 0.74848099, "epoch": 0.2597666159309995, "grad_norm": 1.96875, "learning_rate": 9.826548709108202e-06, "loss": 0.96588993, "memory(GiB)": 368.61, "step": 10240, "train_speed(iter/s)": 0.201594 }, { "acc": 0.74201326, "epoch": 0.2598934550989346, "grad_norm": 2.453125, "learning_rate": 9.826274799504878e-06, "loss": 1.00334854, "memory(GiB)": 368.61, "step": 10245, "train_speed(iter/s)": 0.201608 }, { "acc": 0.73829203, "epoch": 0.2600202942668696, "grad_norm": 1.9921875, "learning_rate": 9.82600067762044e-06, "loss": 1.06618729, "memory(GiB)": 368.61, "step": 10250, "train_speed(iter/s)": 0.201625 }, { "acc": 0.74862652, "epoch": 0.26014713343480467, "grad_norm": 2.265625, "learning_rate": 9.825726343466947e-06, "loss": 1.04481087, "memory(GiB)": 368.61, "step": 10255, "train_speed(iter/s)": 0.201637 }, { "acc": 0.74460297, "epoch": 0.2602739726027397, "grad_norm": 2.25, "learning_rate": 9.825451797056462e-06, "loss": 1.02102165, "memory(GiB)": 368.61, "step": 10260, "train_speed(iter/s)": 0.201649 }, { "acc": 0.7367383, "epoch": 0.2604008117706748, "grad_norm": 2.40625, "learning_rate": 9.825177038401064e-06, "loss": 1.03936071, "memory(GiB)": 368.61, "step": 10265, "train_speed(iter/s)": 0.201664 }, { "acc": 0.73235197, "epoch": 0.26052765093860986, "grad_norm": 2.0625, "learning_rate": 9.824902067512838e-06, "loss": 1.08809814, "memory(GiB)": 368.61, "step": 10270, "train_speed(iter/s)": 0.201678 }, { "acc": 0.74805164, "epoch": 0.2606544901065449, "grad_norm": 2.296875, "learning_rate": 9.824626884403877e-06, "loss": 1.05575466, "memory(GiB)": 368.61, "step": 10275, "train_speed(iter/s)": 0.201697 }, { "acc": 0.73723984, "epoch": 0.26078132927447994, "grad_norm": 2.0625, "learning_rate": 9.824351489086283e-06, "loss": 1.02605915, "memory(GiB)": 368.61, "step": 10280, "train_speed(iter/s)": 0.201702 }, { "acc": 0.75076399, "epoch": 0.26090816844241504, "grad_norm": 2.4375, "learning_rate": 9.824075881572176e-06, "loss": 1.01974754, "memory(GiB)": 368.61, "step": 10285, "train_speed(iter/s)": 0.201719 }, { "acc": 0.75412498, "epoch": 0.2610350076103501, "grad_norm": 2.34375, "learning_rate": 9.823800061873669e-06, "loss": 1.0474515, "memory(GiB)": 368.61, "step": 10290, "train_speed(iter/s)": 0.201729 }, { "acc": 0.7481102, "epoch": 0.26116184677828513, "grad_norm": 2.796875, "learning_rate": 9.8235240300029e-06, "loss": 1.0694046, "memory(GiB)": 368.61, "step": 10295, "train_speed(iter/s)": 0.201744 }, { "acc": 0.73944912, "epoch": 0.2612886859462202, "grad_norm": 2.0625, "learning_rate": 9.82324778597201e-06, "loss": 1.02461758, "memory(GiB)": 368.61, "step": 10300, "train_speed(iter/s)": 0.201761 }, { "acc": 0.74145761, "epoch": 0.2614155251141553, "grad_norm": 2.28125, "learning_rate": 9.822971329793147e-06, "loss": 1.04376354, "memory(GiB)": 368.61, "step": 10305, "train_speed(iter/s)": 0.20178 }, { "acc": 0.74994226, "epoch": 0.2615423642820903, "grad_norm": 2.03125, "learning_rate": 9.822694661478471e-06, "loss": 0.9983284, "memory(GiB)": 368.61, "step": 10310, "train_speed(iter/s)": 0.201789 }, { "acc": 0.73987379, "epoch": 0.26166920345002537, "grad_norm": 2.3125, "learning_rate": 9.822417781040154e-06, "loss": 1.05726929, "memory(GiB)": 368.61, "step": 10315, "train_speed(iter/s)": 0.201799 }, { "acc": 0.73475008, "epoch": 0.2617960426179604, "grad_norm": 2.046875, "learning_rate": 9.822140688490372e-06, "loss": 1.07606153, "memory(GiB)": 368.61, "step": 10320, "train_speed(iter/s)": 0.201809 }, { "acc": 0.74287443, "epoch": 0.2619228817858955, "grad_norm": 2.265625, "learning_rate": 9.821863383841312e-06, "loss": 1.03136463, "memory(GiB)": 368.61, "step": 10325, "train_speed(iter/s)": 0.201825 }, { "acc": 0.74979835, "epoch": 0.26204972095383056, "grad_norm": 2.09375, "learning_rate": 9.821585867105173e-06, "loss": 1.07053137, "memory(GiB)": 368.61, "step": 10330, "train_speed(iter/s)": 0.201834 }, { "acc": 0.73335724, "epoch": 0.2621765601217656, "grad_norm": 1.9609375, "learning_rate": 9.821308138294162e-06, "loss": 1.07024593, "memory(GiB)": 368.61, "step": 10335, "train_speed(iter/s)": 0.201849 }, { "acc": 0.7390048, "epoch": 0.26230339928970065, "grad_norm": 2.234375, "learning_rate": 9.821030197420492e-06, "loss": 1.05520325, "memory(GiB)": 368.61, "step": 10340, "train_speed(iter/s)": 0.201863 }, { "acc": 0.74405527, "epoch": 0.26243023845763574, "grad_norm": 2.171875, "learning_rate": 9.820752044496389e-06, "loss": 1.03100433, "memory(GiB)": 368.61, "step": 10345, "train_speed(iter/s)": 0.201874 }, { "acc": 0.73630567, "epoch": 0.2625570776255708, "grad_norm": 2.21875, "learning_rate": 9.82047367953409e-06, "loss": 1.0086009, "memory(GiB)": 368.61, "step": 10350, "train_speed(iter/s)": 0.201873 }, { "acc": 0.75530558, "epoch": 0.26268391679350583, "grad_norm": 2.53125, "learning_rate": 9.820195102545835e-06, "loss": 1.01460762, "memory(GiB)": 368.61, "step": 10355, "train_speed(iter/s)": 0.201885 }, { "acc": 0.73040504, "epoch": 0.2628107559614409, "grad_norm": 1.953125, "learning_rate": 9.81991631354388e-06, "loss": 1.02518806, "memory(GiB)": 368.61, "step": 10360, "train_speed(iter/s)": 0.201903 }, { "acc": 0.73351569, "epoch": 0.262937595129376, "grad_norm": 2.390625, "learning_rate": 9.819637312540485e-06, "loss": 1.08640814, "memory(GiB)": 368.61, "step": 10365, "train_speed(iter/s)": 0.201917 }, { "acc": 0.73713174, "epoch": 0.263064434297311, "grad_norm": 2.0625, "learning_rate": 9.819358099547923e-06, "loss": 0.98187733, "memory(GiB)": 368.61, "step": 10370, "train_speed(iter/s)": 0.20192 }, { "acc": 0.72066684, "epoch": 0.26319127346524607, "grad_norm": 2.25, "learning_rate": 9.819078674578474e-06, "loss": 1.13354168, "memory(GiB)": 368.61, "step": 10375, "train_speed(iter/s)": 0.201939 }, { "acc": 0.72802863, "epoch": 0.2633181126331811, "grad_norm": 2.21875, "learning_rate": 9.818799037644432e-06, "loss": 1.10673218, "memory(GiB)": 368.61, "step": 10380, "train_speed(iter/s)": 0.201955 }, { "acc": 0.74497561, "epoch": 0.2634449518011162, "grad_norm": 1.8046875, "learning_rate": 9.818519188758092e-06, "loss": 1.02567148, "memory(GiB)": 368.61, "step": 10385, "train_speed(iter/s)": 0.201968 }, { "acc": 0.74254732, "epoch": 0.26357179096905126, "grad_norm": 2.171875, "learning_rate": 9.818239127931765e-06, "loss": 1.04659662, "memory(GiB)": 368.61, "step": 10390, "train_speed(iter/s)": 0.201985 }, { "acc": 0.7370657, "epoch": 0.2636986301369863, "grad_norm": 2.078125, "learning_rate": 9.81795885517777e-06, "loss": 1.07875614, "memory(GiB)": 368.61, "step": 10395, "train_speed(iter/s)": 0.202 }, { "acc": 0.7392261, "epoch": 0.26382546930492135, "grad_norm": 1.828125, "learning_rate": 9.817678370508434e-06, "loss": 1.07854576, "memory(GiB)": 368.61, "step": 10400, "train_speed(iter/s)": 0.202013 }, { "acc": 0.73466339, "epoch": 0.26395230847285645, "grad_norm": 2.234375, "learning_rate": 9.817397673936093e-06, "loss": 1.11253262, "memory(GiB)": 368.61, "step": 10405, "train_speed(iter/s)": 0.20203 }, { "acc": 0.72541313, "epoch": 0.2640791476407915, "grad_norm": 2.484375, "learning_rate": 9.817116765473095e-06, "loss": 1.09376411, "memory(GiB)": 368.61, "step": 10410, "train_speed(iter/s)": 0.20204 }, { "acc": 0.74940276, "epoch": 0.26420598680872653, "grad_norm": 2.1875, "learning_rate": 9.816835645131795e-06, "loss": 0.96830339, "memory(GiB)": 368.61, "step": 10415, "train_speed(iter/s)": 0.202061 }, { "acc": 0.74541759, "epoch": 0.2643328259766616, "grad_norm": 2.375, "learning_rate": 9.816554312924555e-06, "loss": 1.02350368, "memory(GiB)": 368.61, "step": 10420, "train_speed(iter/s)": 0.202052 }, { "acc": 0.74561572, "epoch": 0.2644596651445967, "grad_norm": 2.3125, "learning_rate": 9.816272768863756e-06, "loss": 1.03651295, "memory(GiB)": 368.61, "step": 10425, "train_speed(iter/s)": 0.202064 }, { "acc": 0.74868746, "epoch": 0.2645865043125317, "grad_norm": 2.1875, "learning_rate": 9.815991012961773e-06, "loss": 1.03384666, "memory(GiB)": 368.61, "step": 10430, "train_speed(iter/s)": 0.202076 }, { "acc": 0.73939652, "epoch": 0.26471334348046677, "grad_norm": 2.078125, "learning_rate": 9.815709045231008e-06, "loss": 1.11578884, "memory(GiB)": 368.61, "step": 10435, "train_speed(iter/s)": 0.202094 }, { "acc": 0.732061, "epoch": 0.2648401826484018, "grad_norm": 1.9140625, "learning_rate": 9.815426865683858e-06, "loss": 1.09217796, "memory(GiB)": 368.61, "step": 10440, "train_speed(iter/s)": 0.202109 }, { "acc": 0.74909582, "epoch": 0.2649670218163369, "grad_norm": 2.09375, "learning_rate": 9.815144474332732e-06, "loss": 0.98801174, "memory(GiB)": 368.61, "step": 10445, "train_speed(iter/s)": 0.202125 }, { "acc": 0.7459094, "epoch": 0.26509386098427196, "grad_norm": 2.0625, "learning_rate": 9.814861871190056e-06, "loss": 0.9936142, "memory(GiB)": 368.61, "step": 10450, "train_speed(iter/s)": 0.202137 }, { "acc": 0.74480381, "epoch": 0.265220700152207, "grad_norm": 2.34375, "learning_rate": 9.814579056268256e-06, "loss": 0.98455658, "memory(GiB)": 368.61, "step": 10455, "train_speed(iter/s)": 0.202153 }, { "acc": 0.74288092, "epoch": 0.26534753932014205, "grad_norm": 2.21875, "learning_rate": 9.814296029579776e-06, "loss": 1.04196091, "memory(GiB)": 368.61, "step": 10460, "train_speed(iter/s)": 0.202168 }, { "acc": 0.73159971, "epoch": 0.26547437848807715, "grad_norm": 2.09375, "learning_rate": 9.814012791137063e-06, "loss": 1.09068565, "memory(GiB)": 368.61, "step": 10465, "train_speed(iter/s)": 0.202182 }, { "acc": 0.73481302, "epoch": 0.2656012176560122, "grad_norm": 2.421875, "learning_rate": 9.81372934095257e-06, "loss": 1.04517841, "memory(GiB)": 368.61, "step": 10470, "train_speed(iter/s)": 0.202173 }, { "acc": 0.72382488, "epoch": 0.26572805682394723, "grad_norm": 2.171875, "learning_rate": 9.813445679038773e-06, "loss": 1.09735909, "memory(GiB)": 368.61, "step": 10475, "train_speed(iter/s)": 0.202192 }, { "acc": 0.73220301, "epoch": 0.2658548959918823, "grad_norm": 2.6875, "learning_rate": 9.813161805408145e-06, "loss": 1.06310482, "memory(GiB)": 368.61, "step": 10480, "train_speed(iter/s)": 0.202204 }, { "acc": 0.74410439, "epoch": 0.2659817351598174, "grad_norm": 2.28125, "learning_rate": 9.812877720073169e-06, "loss": 1.06366796, "memory(GiB)": 368.61, "step": 10485, "train_speed(iter/s)": 0.202215 }, { "acc": 0.73405437, "epoch": 0.2661085743277524, "grad_norm": 2.765625, "learning_rate": 9.812593423046344e-06, "loss": 1.09524174, "memory(GiB)": 368.61, "step": 10490, "train_speed(iter/s)": 0.202229 }, { "acc": 0.72738914, "epoch": 0.26623541349568747, "grad_norm": 1.9453125, "learning_rate": 9.812308914340174e-06, "loss": 1.10650673, "memory(GiB)": 368.61, "step": 10495, "train_speed(iter/s)": 0.202243 }, { "acc": 0.7375514, "epoch": 0.2663622526636225, "grad_norm": 2.375, "learning_rate": 9.812024193967171e-06, "loss": 1.0912981, "memory(GiB)": 368.61, "step": 10500, "train_speed(iter/s)": 0.202258 }, { "acc": 0.73650064, "epoch": 0.2664890918315576, "grad_norm": 2.328125, "learning_rate": 9.811739261939861e-06, "loss": 1.0465189, "memory(GiB)": 368.61, "step": 10505, "train_speed(iter/s)": 0.202273 }, { "acc": 0.74477658, "epoch": 0.26661593099949266, "grad_norm": 1.875, "learning_rate": 9.811454118270775e-06, "loss": 1.04363365, "memory(GiB)": 368.61, "step": 10510, "train_speed(iter/s)": 0.202283 }, { "acc": 0.73216758, "epoch": 0.2667427701674277, "grad_norm": 2.203125, "learning_rate": 9.811168762972457e-06, "loss": 1.06074572, "memory(GiB)": 368.61, "step": 10515, "train_speed(iter/s)": 0.202293 }, { "acc": 0.73497338, "epoch": 0.26686960933536275, "grad_norm": 2.390625, "learning_rate": 9.810883196057454e-06, "loss": 1.08132915, "memory(GiB)": 368.61, "step": 10520, "train_speed(iter/s)": 0.202301 }, { "acc": 0.74781122, "epoch": 0.26699644850329785, "grad_norm": 2.140625, "learning_rate": 9.81059741753833e-06, "loss": 0.99688749, "memory(GiB)": 368.61, "step": 10525, "train_speed(iter/s)": 0.202311 }, { "acc": 0.73433223, "epoch": 0.2671232876712329, "grad_norm": 2.078125, "learning_rate": 9.810311427427653e-06, "loss": 1.06823883, "memory(GiB)": 368.61, "step": 10530, "train_speed(iter/s)": 0.202325 }, { "acc": 0.75128584, "epoch": 0.26725012683916793, "grad_norm": 2.1875, "learning_rate": 9.810025225738005e-06, "loss": 1.04449635, "memory(GiB)": 368.61, "step": 10535, "train_speed(iter/s)": 0.202341 }, { "acc": 0.71479216, "epoch": 0.267376966007103, "grad_norm": 2.703125, "learning_rate": 9.809738812481971e-06, "loss": 1.14536438, "memory(GiB)": 368.61, "step": 10540, "train_speed(iter/s)": 0.202352 }, { "acc": 0.74070425, "epoch": 0.2675038051750381, "grad_norm": 2.1875, "learning_rate": 9.809452187672149e-06, "loss": 1.0361002, "memory(GiB)": 368.61, "step": 10545, "train_speed(iter/s)": 0.202368 }, { "acc": 0.73555098, "epoch": 0.2676306443429731, "grad_norm": 1.9375, "learning_rate": 9.809165351321149e-06, "loss": 1.06779232, "memory(GiB)": 368.61, "step": 10550, "train_speed(iter/s)": 0.202384 }, { "acc": 0.75737638, "epoch": 0.26775748351090817, "grad_norm": 2.296875, "learning_rate": 9.808878303441585e-06, "loss": 1.02124071, "memory(GiB)": 368.61, "step": 10555, "train_speed(iter/s)": 0.202398 }, { "acc": 0.74197292, "epoch": 0.2678843226788432, "grad_norm": 2.671875, "learning_rate": 9.808591044046083e-06, "loss": 1.05330639, "memory(GiB)": 368.61, "step": 10560, "train_speed(iter/s)": 0.202408 }, { "acc": 0.7382741, "epoch": 0.2680111618467783, "grad_norm": 2.234375, "learning_rate": 9.808303573147277e-06, "loss": 1.06581478, "memory(GiB)": 368.61, "step": 10565, "train_speed(iter/s)": 0.202411 }, { "acc": 0.74330359, "epoch": 0.26813800101471336, "grad_norm": 2.6875, "learning_rate": 9.808015890757812e-06, "loss": 1.0663312, "memory(GiB)": 368.61, "step": 10570, "train_speed(iter/s)": 0.202429 }, { "acc": 0.74429255, "epoch": 0.2682648401826484, "grad_norm": 2.03125, "learning_rate": 9.807727996890343e-06, "loss": 1.01889114, "memory(GiB)": 368.61, "step": 10575, "train_speed(iter/s)": 0.202443 }, { "acc": 0.7395957, "epoch": 0.26839167935058345, "grad_norm": 2.390625, "learning_rate": 9.807439891557533e-06, "loss": 1.05447559, "memory(GiB)": 368.61, "step": 10580, "train_speed(iter/s)": 0.202457 }, { "acc": 0.73306417, "epoch": 0.26851851851851855, "grad_norm": 2.3125, "learning_rate": 9.80715157477205e-06, "loss": 1.08778591, "memory(GiB)": 368.61, "step": 10585, "train_speed(iter/s)": 0.202469 }, { "acc": 0.73702402, "epoch": 0.2686453576864536, "grad_norm": 2.046875, "learning_rate": 9.806863046546581e-06, "loss": 1.10270138, "memory(GiB)": 368.61, "step": 10590, "train_speed(iter/s)": 0.202481 }, { "acc": 0.74261069, "epoch": 0.26877219685438863, "grad_norm": 1.8203125, "learning_rate": 9.806574306893814e-06, "loss": 1.07679539, "memory(GiB)": 368.61, "step": 10595, "train_speed(iter/s)": 0.202492 }, { "acc": 0.74724989, "epoch": 0.2688990360223237, "grad_norm": 2.421875, "learning_rate": 9.806285355826447e-06, "loss": 1.03557186, "memory(GiB)": 368.61, "step": 10600, "train_speed(iter/s)": 0.202505 }, { "acc": 0.7530673, "epoch": 0.2690258751902588, "grad_norm": 2.25, "learning_rate": 9.805996193357194e-06, "loss": 1.00206909, "memory(GiB)": 368.61, "step": 10605, "train_speed(iter/s)": 0.202521 }, { "acc": 0.73038964, "epoch": 0.2691527143581938, "grad_norm": 2.140625, "learning_rate": 9.80570681949877e-06, "loss": 1.05477238, "memory(GiB)": 368.61, "step": 10610, "train_speed(iter/s)": 0.202534 }, { "acc": 0.73319473, "epoch": 0.26927955352612887, "grad_norm": 2.015625, "learning_rate": 9.805417234263905e-06, "loss": 1.06110077, "memory(GiB)": 368.61, "step": 10615, "train_speed(iter/s)": 0.202548 }, { "acc": 0.73446674, "epoch": 0.2694063926940639, "grad_norm": 2.03125, "learning_rate": 9.805127437665333e-06, "loss": 1.08642559, "memory(GiB)": 368.61, "step": 10620, "train_speed(iter/s)": 0.202561 }, { "acc": 0.72833138, "epoch": 0.269533231861999, "grad_norm": 1.921875, "learning_rate": 9.804837429715805e-06, "loss": 1.06623926, "memory(GiB)": 368.61, "step": 10625, "train_speed(iter/s)": 0.20257 }, { "acc": 0.7502955, "epoch": 0.26966007102993406, "grad_norm": 2.0, "learning_rate": 9.804547210428074e-06, "loss": 1.04424229, "memory(GiB)": 368.61, "step": 10630, "train_speed(iter/s)": 0.202578 }, { "acc": 0.74075956, "epoch": 0.2697869101978691, "grad_norm": 2.21875, "learning_rate": 9.804256779814906e-06, "loss": 1.05154839, "memory(GiB)": 368.61, "step": 10635, "train_speed(iter/s)": 0.202587 }, { "acc": 0.73658772, "epoch": 0.26991374936580415, "grad_norm": 2.203125, "learning_rate": 9.803966137889076e-06, "loss": 0.99498997, "memory(GiB)": 368.61, "step": 10640, "train_speed(iter/s)": 0.202593 }, { "acc": 0.72789178, "epoch": 0.27004058853373925, "grad_norm": 2.171875, "learning_rate": 9.803675284663368e-06, "loss": 1.110497, "memory(GiB)": 368.61, "step": 10645, "train_speed(iter/s)": 0.202608 }, { "acc": 0.73871818, "epoch": 0.2701674277016743, "grad_norm": 2.734375, "learning_rate": 9.803384220150571e-06, "loss": 1.08543129, "memory(GiB)": 368.61, "step": 10650, "train_speed(iter/s)": 0.202623 }, { "acc": 0.74027672, "epoch": 0.27029426686960933, "grad_norm": 2.53125, "learning_rate": 9.803092944363493e-06, "loss": 1.0714241, "memory(GiB)": 368.61, "step": 10655, "train_speed(iter/s)": 0.202647 }, { "acc": 0.75487919, "epoch": 0.2704211060375444, "grad_norm": 2.453125, "learning_rate": 9.802801457314943e-06, "loss": 1.02373219, "memory(GiB)": 368.61, "step": 10660, "train_speed(iter/s)": 0.202665 }, { "acc": 0.74355168, "epoch": 0.2705479452054795, "grad_norm": 3.015625, "learning_rate": 9.802509759017741e-06, "loss": 1.01476564, "memory(GiB)": 368.61, "step": 10665, "train_speed(iter/s)": 0.202667 }, { "acc": 0.72916684, "epoch": 0.2706747843734145, "grad_norm": 1.703125, "learning_rate": 9.802217849484719e-06, "loss": 1.10833683, "memory(GiB)": 368.61, "step": 10670, "train_speed(iter/s)": 0.202685 }, { "acc": 0.73637543, "epoch": 0.27080162354134957, "grad_norm": 2.484375, "learning_rate": 9.801925728728715e-06, "loss": 1.03007565, "memory(GiB)": 368.61, "step": 10675, "train_speed(iter/s)": 0.2027 }, { "acc": 0.74089985, "epoch": 0.2709284627092846, "grad_norm": 2.046875, "learning_rate": 9.801633396762577e-06, "loss": 1.06168823, "memory(GiB)": 368.61, "step": 10680, "train_speed(iter/s)": 0.202708 }, { "acc": 0.74809265, "epoch": 0.2710553018772197, "grad_norm": 2.40625, "learning_rate": 9.801340853599167e-06, "loss": 1.06390753, "memory(GiB)": 368.61, "step": 10685, "train_speed(iter/s)": 0.202723 }, { "acc": 0.76122055, "epoch": 0.27118214104515476, "grad_norm": 3.03125, "learning_rate": 9.801048099251348e-06, "loss": 0.99939423, "memory(GiB)": 368.61, "step": 10690, "train_speed(iter/s)": 0.202739 }, { "acc": 0.73028836, "epoch": 0.2713089802130898, "grad_norm": 2.078125, "learning_rate": 9.800755133731999e-06, "loss": 1.04913082, "memory(GiB)": 368.61, "step": 10695, "train_speed(iter/s)": 0.202753 }, { "acc": 0.74182129, "epoch": 0.27143581938102485, "grad_norm": 2.171875, "learning_rate": 9.800461957054006e-06, "loss": 1.05461388, "memory(GiB)": 368.61, "step": 10700, "train_speed(iter/s)": 0.202761 }, { "acc": 0.74689827, "epoch": 0.27156265854895995, "grad_norm": 2.34375, "learning_rate": 9.800168569230261e-06, "loss": 1.02153788, "memory(GiB)": 368.61, "step": 10705, "train_speed(iter/s)": 0.202775 }, { "acc": 0.75071526, "epoch": 0.271689497716895, "grad_norm": 2.078125, "learning_rate": 9.799874970273674e-06, "loss": 1.02346344, "memory(GiB)": 368.61, "step": 10710, "train_speed(iter/s)": 0.202794 }, { "acc": 0.74590726, "epoch": 0.27181633688483003, "grad_norm": 2.015625, "learning_rate": 9.799581160197156e-06, "loss": 1.04090147, "memory(GiB)": 368.61, "step": 10715, "train_speed(iter/s)": 0.202809 }, { "acc": 0.74859114, "epoch": 0.2719431760527651, "grad_norm": 1.796875, "learning_rate": 9.799287139013628e-06, "loss": 1.00433664, "memory(GiB)": 368.61, "step": 10720, "train_speed(iter/s)": 0.202817 }, { "acc": 0.74956942, "epoch": 0.2720700152207002, "grad_norm": 2.296875, "learning_rate": 9.798992906736028e-06, "loss": 0.98336563, "memory(GiB)": 368.61, "step": 10725, "train_speed(iter/s)": 0.202828 }, { "acc": 0.73798332, "epoch": 0.2721968543886352, "grad_norm": 2.171875, "learning_rate": 9.79869846337729e-06, "loss": 1.07160511, "memory(GiB)": 368.61, "step": 10730, "train_speed(iter/s)": 0.202831 }, { "acc": 0.73917093, "epoch": 0.27232369355657027, "grad_norm": 2.59375, "learning_rate": 9.79840380895037e-06, "loss": 1.06155014, "memory(GiB)": 368.61, "step": 10735, "train_speed(iter/s)": 0.202834 }, { "acc": 0.74812794, "epoch": 0.2724505327245053, "grad_norm": 2.078125, "learning_rate": 9.798108943468228e-06, "loss": 0.96765594, "memory(GiB)": 368.61, "step": 10740, "train_speed(iter/s)": 0.202843 }, { "acc": 0.74096117, "epoch": 0.2725773718924404, "grad_norm": 2.015625, "learning_rate": 9.797813866943832e-06, "loss": 0.99119511, "memory(GiB)": 368.61, "step": 10745, "train_speed(iter/s)": 0.202858 }, { "acc": 0.75426474, "epoch": 0.27270421106037546, "grad_norm": 2.1875, "learning_rate": 9.797518579390162e-06, "loss": 0.94296131, "memory(GiB)": 368.61, "step": 10750, "train_speed(iter/s)": 0.202873 }, { "acc": 0.74227171, "epoch": 0.2728310502283105, "grad_norm": 1.71875, "learning_rate": 9.797223080820204e-06, "loss": 1.01737309, "memory(GiB)": 368.61, "step": 10755, "train_speed(iter/s)": 0.202884 }, { "acc": 0.73874741, "epoch": 0.27295788939624555, "grad_norm": 1.9296875, "learning_rate": 9.796927371246958e-06, "loss": 1.03918133, "memory(GiB)": 368.61, "step": 10760, "train_speed(iter/s)": 0.202902 }, { "acc": 0.74467373, "epoch": 0.27308472856418065, "grad_norm": 2.140625, "learning_rate": 9.796631450683431e-06, "loss": 1.01751194, "memory(GiB)": 368.61, "step": 10765, "train_speed(iter/s)": 0.202913 }, { "acc": 0.72464862, "epoch": 0.2732115677321157, "grad_norm": 2.34375, "learning_rate": 9.796335319142637e-06, "loss": 1.02419357, "memory(GiB)": 368.61, "step": 10770, "train_speed(iter/s)": 0.202923 }, { "acc": 0.74303026, "epoch": 0.27333840690005073, "grad_norm": 2.4375, "learning_rate": 9.796038976637599e-06, "loss": 1.03835754, "memory(GiB)": 368.61, "step": 10775, "train_speed(iter/s)": 0.202935 }, { "acc": 0.74250603, "epoch": 0.2734652460679858, "grad_norm": 2.140625, "learning_rate": 9.795742423181355e-06, "loss": 1.04821758, "memory(GiB)": 368.61, "step": 10780, "train_speed(iter/s)": 0.202948 }, { "acc": 0.7209126, "epoch": 0.2735920852359209, "grad_norm": 1.9375, "learning_rate": 9.795445658786948e-06, "loss": 1.13291159, "memory(GiB)": 368.61, "step": 10785, "train_speed(iter/s)": 0.202959 }, { "acc": 0.736904, "epoch": 0.2737189244038559, "grad_norm": 1.9921875, "learning_rate": 9.795148683467431e-06, "loss": 1.06104622, "memory(GiB)": 368.61, "step": 10790, "train_speed(iter/s)": 0.202974 }, { "acc": 0.73351293, "epoch": 0.27384576357179097, "grad_norm": 2.203125, "learning_rate": 9.794851497235866e-06, "loss": 1.03558655, "memory(GiB)": 368.61, "step": 10795, "train_speed(iter/s)": 0.20299 }, { "acc": 0.74156237, "epoch": 0.273972602739726, "grad_norm": 2.28125, "learning_rate": 9.794554100105325e-06, "loss": 1.01357822, "memory(GiB)": 368.61, "step": 10800, "train_speed(iter/s)": 0.203005 }, { "acc": 0.74097443, "epoch": 0.2740994419076611, "grad_norm": 1.90625, "learning_rate": 9.794256492088888e-06, "loss": 1.00958633, "memory(GiB)": 368.61, "step": 10805, "train_speed(iter/s)": 0.20302 }, { "acc": 0.74195948, "epoch": 0.27422628107559616, "grad_norm": 2.328125, "learning_rate": 9.793958673199647e-06, "loss": 1.08746605, "memory(GiB)": 368.61, "step": 10810, "train_speed(iter/s)": 0.203035 }, { "acc": 0.74668026, "epoch": 0.2743531202435312, "grad_norm": 1.875, "learning_rate": 9.793660643450697e-06, "loss": 1.02595978, "memory(GiB)": 368.61, "step": 10815, "train_speed(iter/s)": 0.203044 }, { "acc": 0.73643064, "epoch": 0.27447995941146625, "grad_norm": 2.0625, "learning_rate": 9.793362402855152e-06, "loss": 1.04381313, "memory(GiB)": 368.61, "step": 10820, "train_speed(iter/s)": 0.203056 }, { "acc": 0.73703594, "epoch": 0.27460679857940135, "grad_norm": 2.171875, "learning_rate": 9.79306395142613e-06, "loss": 1.03071785, "memory(GiB)": 368.61, "step": 10825, "train_speed(iter/s)": 0.203069 }, { "acc": 0.74759951, "epoch": 0.2747336377473364, "grad_norm": 1.6953125, "learning_rate": 9.792765289176751e-06, "loss": 0.98941822, "memory(GiB)": 368.61, "step": 10830, "train_speed(iter/s)": 0.203075 }, { "acc": 0.7389544, "epoch": 0.27486047691527143, "grad_norm": 2.046875, "learning_rate": 9.79246641612016e-06, "loss": 1.04939356, "memory(GiB)": 368.61, "step": 10835, "train_speed(iter/s)": 0.203093 }, { "acc": 0.74358044, "epoch": 0.2749873160832065, "grad_norm": 2.03125, "learning_rate": 9.792167332269498e-06, "loss": 1.02518959, "memory(GiB)": 368.61, "step": 10840, "train_speed(iter/s)": 0.203107 }, { "acc": 0.75188494, "epoch": 0.2751141552511416, "grad_norm": 1.9609375, "learning_rate": 9.791868037637922e-06, "loss": 1.02937355, "memory(GiB)": 368.61, "step": 10845, "train_speed(iter/s)": 0.203122 }, { "acc": 0.73779116, "epoch": 0.2752409944190766, "grad_norm": 2.484375, "learning_rate": 9.791568532238594e-06, "loss": 1.08301678, "memory(GiB)": 368.61, "step": 10850, "train_speed(iter/s)": 0.203139 }, { "acc": 0.7433898, "epoch": 0.27536783358701167, "grad_norm": 2.234375, "learning_rate": 9.79126881608469e-06, "loss": 1.03382959, "memory(GiB)": 368.61, "step": 10855, "train_speed(iter/s)": 0.203149 }, { "acc": 0.74170675, "epoch": 0.2754946727549467, "grad_norm": 2.390625, "learning_rate": 9.790968889189392e-06, "loss": 1.03473949, "memory(GiB)": 368.61, "step": 10860, "train_speed(iter/s)": 0.203166 }, { "acc": 0.74993277, "epoch": 0.2756215119228818, "grad_norm": 2.46875, "learning_rate": 9.790668751565893e-06, "loss": 1.04451332, "memory(GiB)": 368.61, "step": 10865, "train_speed(iter/s)": 0.203182 }, { "acc": 0.74067812, "epoch": 0.27574835109081686, "grad_norm": 2.015625, "learning_rate": 9.790368403227391e-06, "loss": 1.10869904, "memory(GiB)": 368.61, "step": 10870, "train_speed(iter/s)": 0.203198 }, { "acc": 0.73339262, "epoch": 0.2758751902587519, "grad_norm": 1.7734375, "learning_rate": 9.7900678441871e-06, "loss": 0.99461784, "memory(GiB)": 368.61, "step": 10875, "train_speed(iter/s)": 0.2032 }, { "acc": 0.74174013, "epoch": 0.27600202942668695, "grad_norm": 2.109375, "learning_rate": 9.78976707445824e-06, "loss": 0.99721012, "memory(GiB)": 368.61, "step": 10880, "train_speed(iter/s)": 0.203204 }, { "acc": 0.74653392, "epoch": 0.27612886859462205, "grad_norm": 2.3125, "learning_rate": 9.78946609405404e-06, "loss": 1.05249739, "memory(GiB)": 368.61, "step": 10885, "train_speed(iter/s)": 0.203211 }, { "acc": 0.75095263, "epoch": 0.2762557077625571, "grad_norm": 2.890625, "learning_rate": 9.789164902987738e-06, "loss": 1.00573502, "memory(GiB)": 368.61, "step": 10890, "train_speed(iter/s)": 0.203226 }, { "acc": 0.7514966, "epoch": 0.27638254693049213, "grad_norm": 2.03125, "learning_rate": 9.78886350127258e-06, "loss": 0.99418602, "memory(GiB)": 368.61, "step": 10895, "train_speed(iter/s)": 0.203235 }, { "acc": 0.73971863, "epoch": 0.2765093860984272, "grad_norm": 2.171875, "learning_rate": 9.788561888921825e-06, "loss": 1.06868954, "memory(GiB)": 368.61, "step": 10900, "train_speed(iter/s)": 0.203249 }, { "acc": 0.74653268, "epoch": 0.2766362252663623, "grad_norm": 2.421875, "learning_rate": 9.788260065948738e-06, "loss": 1.08226624, "memory(GiB)": 368.61, "step": 10905, "train_speed(iter/s)": 0.203261 }, { "acc": 0.73405213, "epoch": 0.2767630644342973, "grad_norm": 2.34375, "learning_rate": 9.787958032366596e-06, "loss": 1.09137745, "memory(GiB)": 368.61, "step": 10910, "train_speed(iter/s)": 0.203271 }, { "acc": 0.74659262, "epoch": 0.27688990360223237, "grad_norm": 2.046875, "learning_rate": 9.787655788188684e-06, "loss": 0.99842758, "memory(GiB)": 368.61, "step": 10915, "train_speed(iter/s)": 0.203285 }, { "acc": 0.75439072, "epoch": 0.2770167427701674, "grad_norm": 2.265625, "learning_rate": 9.787353333428293e-06, "loss": 0.97382946, "memory(GiB)": 368.61, "step": 10920, "train_speed(iter/s)": 0.203301 }, { "acc": 0.73893881, "epoch": 0.2771435819381025, "grad_norm": 2.421875, "learning_rate": 9.78705066809873e-06, "loss": 1.08204117, "memory(GiB)": 368.61, "step": 10925, "train_speed(iter/s)": 0.203312 }, { "acc": 0.72857003, "epoch": 0.27727042110603756, "grad_norm": 2.09375, "learning_rate": 9.786747792213304e-06, "loss": 1.08521566, "memory(GiB)": 368.61, "step": 10930, "train_speed(iter/s)": 0.203323 }, { "acc": 0.75533724, "epoch": 0.2773972602739726, "grad_norm": 2.390625, "learning_rate": 9.78644470578534e-06, "loss": 1.01521883, "memory(GiB)": 368.61, "step": 10935, "train_speed(iter/s)": 0.203338 }, { "acc": 0.72565308, "epoch": 0.27752409944190765, "grad_norm": 2.15625, "learning_rate": 9.78614140882817e-06, "loss": 1.10458221, "memory(GiB)": 368.61, "step": 10940, "train_speed(iter/s)": 0.203348 }, { "acc": 0.7599514, "epoch": 0.27765093860984275, "grad_norm": 2.0, "learning_rate": 9.78583790135513e-06, "loss": 0.98292885, "memory(GiB)": 368.61, "step": 10945, "train_speed(iter/s)": 0.203359 }, { "acc": 0.74264765, "epoch": 0.2777777777777778, "grad_norm": 2.1875, "learning_rate": 9.785534183379571e-06, "loss": 1.0344985, "memory(GiB)": 368.61, "step": 10950, "train_speed(iter/s)": 0.203372 }, { "acc": 0.74249668, "epoch": 0.27790461694571283, "grad_norm": 2.09375, "learning_rate": 9.785230254914855e-06, "loss": 1.04484043, "memory(GiB)": 368.61, "step": 10955, "train_speed(iter/s)": 0.203375 }, { "acc": 0.73793182, "epoch": 0.2780314561136479, "grad_norm": 1.9296875, "learning_rate": 9.784926115974346e-06, "loss": 1.05033407, "memory(GiB)": 368.61, "step": 10960, "train_speed(iter/s)": 0.203391 }, { "acc": 0.75515437, "epoch": 0.278158295281583, "grad_norm": 1.9296875, "learning_rate": 9.784621766571424e-06, "loss": 1.02230682, "memory(GiB)": 368.61, "step": 10965, "train_speed(iter/s)": 0.203401 }, { "acc": 0.73566184, "epoch": 0.278285134449518, "grad_norm": 2.09375, "learning_rate": 9.784317206719475e-06, "loss": 1.09520912, "memory(GiB)": 368.61, "step": 10970, "train_speed(iter/s)": 0.203413 }, { "acc": 0.73460827, "epoch": 0.27841197361745307, "grad_norm": 1.8359375, "learning_rate": 9.784012436431896e-06, "loss": 1.0540966, "memory(GiB)": 368.61, "step": 10975, "train_speed(iter/s)": 0.203418 }, { "acc": 0.74286046, "epoch": 0.2785388127853881, "grad_norm": 1.984375, "learning_rate": 9.78370745572209e-06, "loss": 1.0621067, "memory(GiB)": 368.61, "step": 10980, "train_speed(iter/s)": 0.203431 }, { "acc": 0.73997564, "epoch": 0.2786656519533232, "grad_norm": 2.703125, "learning_rate": 9.783402264603471e-06, "loss": 0.97437286, "memory(GiB)": 368.61, "step": 10985, "train_speed(iter/s)": 0.203438 }, { "acc": 0.73181057, "epoch": 0.27879249112125826, "grad_norm": 1.984375, "learning_rate": 9.783096863089465e-06, "loss": 1.04829254, "memory(GiB)": 368.61, "step": 10990, "train_speed(iter/s)": 0.203451 }, { "acc": 0.7606658, "epoch": 0.2789193302891933, "grad_norm": 2.34375, "learning_rate": 9.782791251193505e-06, "loss": 0.90660381, "memory(GiB)": 368.61, "step": 10995, "train_speed(iter/s)": 0.203465 }, { "acc": 0.74278431, "epoch": 0.27904616945712835, "grad_norm": 1.7734375, "learning_rate": 9.782485428929032e-06, "loss": 1.01054745, "memory(GiB)": 368.61, "step": 11000, "train_speed(iter/s)": 0.203481 }, { "epoch": 0.27904616945712835, "eval_acc": 0.7306253099311227, "eval_loss": 1.0085450410842896, "eval_runtime": 384.9412, "eval_samples_per_second": 16.548, "eval_steps_per_second": 8.274, "step": 11000 }, { "acc": 0.75703979, "epoch": 0.27917300862506345, "grad_norm": 2.140625, "learning_rate": 9.782179396309496e-06, "loss": 1.05946274, "memory(GiB)": 368.61, "step": 11005, "train_speed(iter/s)": 0.200835 }, { "acc": 0.73149514, "epoch": 0.2792998477929985, "grad_norm": 1.984375, "learning_rate": 9.78187315334836e-06, "loss": 1.048454, "memory(GiB)": 368.61, "step": 11010, "train_speed(iter/s)": 0.200851 }, { "acc": 0.74149475, "epoch": 0.27942668696093353, "grad_norm": 1.984375, "learning_rate": 9.781566700059094e-06, "loss": 0.99879999, "memory(GiB)": 368.61, "step": 11015, "train_speed(iter/s)": 0.200866 }, { "acc": 0.73819342, "epoch": 0.2795535261288686, "grad_norm": 2.625, "learning_rate": 9.781260036455176e-06, "loss": 1.07098198, "memory(GiB)": 368.61, "step": 11020, "train_speed(iter/s)": 0.200875 }, { "acc": 0.75138702, "epoch": 0.2796803652968037, "grad_norm": 1.796875, "learning_rate": 9.780953162550093e-06, "loss": 1.01427097, "memory(GiB)": 368.61, "step": 11025, "train_speed(iter/s)": 0.200888 }, { "acc": 0.73522348, "epoch": 0.2798072044647387, "grad_norm": 2.28125, "learning_rate": 9.780646078357346e-06, "loss": 1.05452347, "memory(GiB)": 368.61, "step": 11030, "train_speed(iter/s)": 0.200902 }, { "acc": 0.74384699, "epoch": 0.27993404363267377, "grad_norm": 2.1875, "learning_rate": 9.78033878389044e-06, "loss": 1.06644287, "memory(GiB)": 368.61, "step": 11035, "train_speed(iter/s)": 0.200917 }, { "acc": 0.7355011, "epoch": 0.2800608828006088, "grad_norm": 2.515625, "learning_rate": 9.780031279162892e-06, "loss": 1.07278042, "memory(GiB)": 368.61, "step": 11040, "train_speed(iter/s)": 0.200928 }, { "acc": 0.74954257, "epoch": 0.2801877219685439, "grad_norm": 3.03125, "learning_rate": 9.779723564188228e-06, "loss": 1.08009529, "memory(GiB)": 368.61, "step": 11045, "train_speed(iter/s)": 0.200948 }, { "acc": 0.73513002, "epoch": 0.28031456113647896, "grad_norm": 2.40625, "learning_rate": 9.77941563897998e-06, "loss": 1.06684561, "memory(GiB)": 368.61, "step": 11050, "train_speed(iter/s)": 0.200966 }, { "acc": 0.74524417, "epoch": 0.280441400304414, "grad_norm": 2.21875, "learning_rate": 9.779107503551695e-06, "loss": 1.05109959, "memory(GiB)": 368.61, "step": 11055, "train_speed(iter/s)": 0.200978 }, { "acc": 0.75612164, "epoch": 0.28056823947234905, "grad_norm": 2.25, "learning_rate": 9.778799157916926e-06, "loss": 0.99730244, "memory(GiB)": 368.61, "step": 11060, "train_speed(iter/s)": 0.200989 }, { "acc": 0.73577528, "epoch": 0.28069507864028415, "grad_norm": 1.8984375, "learning_rate": 9.77849060208923e-06, "loss": 1.07477522, "memory(GiB)": 368.61, "step": 11065, "train_speed(iter/s)": 0.200992 }, { "acc": 0.72614136, "epoch": 0.2808219178082192, "grad_norm": 1.9375, "learning_rate": 9.778181836082185e-06, "loss": 1.06237879, "memory(GiB)": 368.61, "step": 11070, "train_speed(iter/s)": 0.201004 }, { "acc": 0.74855008, "epoch": 0.28094875697615423, "grad_norm": 2.578125, "learning_rate": 9.777872859909373e-06, "loss": 1.03873987, "memory(GiB)": 368.61, "step": 11075, "train_speed(iter/s)": 0.201014 }, { "acc": 0.75953245, "epoch": 0.2810755961440893, "grad_norm": 2.140625, "learning_rate": 9.777563673584376e-06, "loss": 0.96269569, "memory(GiB)": 368.61, "step": 11080, "train_speed(iter/s)": 0.201032 }, { "acc": 0.74706359, "epoch": 0.2812024353120244, "grad_norm": 2.390625, "learning_rate": 9.777254277120801e-06, "loss": 1.02793274, "memory(GiB)": 368.61, "step": 11085, "train_speed(iter/s)": 0.201043 }, { "acc": 0.75808287, "epoch": 0.2813292744799594, "grad_norm": 2.046875, "learning_rate": 9.776944670532253e-06, "loss": 1.01401062, "memory(GiB)": 368.61, "step": 11090, "train_speed(iter/s)": 0.201047 }, { "acc": 0.74606504, "epoch": 0.28145611364789447, "grad_norm": 2.40625, "learning_rate": 9.776634853832352e-06, "loss": 1.01243906, "memory(GiB)": 368.61, "step": 11095, "train_speed(iter/s)": 0.201064 }, { "acc": 0.74385753, "epoch": 0.2815829528158295, "grad_norm": 2.046875, "learning_rate": 9.776324827034724e-06, "loss": 1.06149254, "memory(GiB)": 368.61, "step": 11100, "train_speed(iter/s)": 0.201071 }, { "acc": 0.73731165, "epoch": 0.2817097919837646, "grad_norm": 2.0, "learning_rate": 9.776014590153005e-06, "loss": 1.05018187, "memory(GiB)": 368.61, "step": 11105, "train_speed(iter/s)": 0.201084 }, { "acc": 0.74439483, "epoch": 0.28183663115169966, "grad_norm": 2.078125, "learning_rate": 9.77570414320084e-06, "loss": 1.00656948, "memory(GiB)": 368.61, "step": 11110, "train_speed(iter/s)": 0.201095 }, { "acc": 0.74518328, "epoch": 0.2819634703196347, "grad_norm": 2.234375, "learning_rate": 9.775393486191884e-06, "loss": 1.01257591, "memory(GiB)": 368.61, "step": 11115, "train_speed(iter/s)": 0.201109 }, { "acc": 0.73487444, "epoch": 0.28209030948756975, "grad_norm": 2.515625, "learning_rate": 9.775082619139805e-06, "loss": 1.04103374, "memory(GiB)": 368.61, "step": 11120, "train_speed(iter/s)": 0.201126 }, { "acc": 0.75732059, "epoch": 0.28221714865550485, "grad_norm": 2.4375, "learning_rate": 9.77477154205827e-06, "loss": 0.99644575, "memory(GiB)": 368.61, "step": 11125, "train_speed(iter/s)": 0.201142 }, { "acc": 0.7342948, "epoch": 0.2823439878234399, "grad_norm": 2.140625, "learning_rate": 9.774460254960968e-06, "loss": 1.04753504, "memory(GiB)": 368.61, "step": 11130, "train_speed(iter/s)": 0.201153 }, { "acc": 0.74604006, "epoch": 0.28247082699137493, "grad_norm": 2.28125, "learning_rate": 9.774148757861584e-06, "loss": 1.01027775, "memory(GiB)": 368.61, "step": 11135, "train_speed(iter/s)": 0.201159 }, { "acc": 0.73673201, "epoch": 0.28259766615931, "grad_norm": 2.4375, "learning_rate": 9.773837050773824e-06, "loss": 1.09014244, "memory(GiB)": 368.61, "step": 11140, "train_speed(iter/s)": 0.201172 }, { "acc": 0.73269224, "epoch": 0.2827245053272451, "grad_norm": 2.3125, "learning_rate": 9.773525133711399e-06, "loss": 1.10068398, "memory(GiB)": 368.61, "step": 11145, "train_speed(iter/s)": 0.201187 }, { "acc": 0.73559732, "epoch": 0.2828513444951801, "grad_norm": 1.84375, "learning_rate": 9.773213006688024e-06, "loss": 1.06745253, "memory(GiB)": 368.61, "step": 11150, "train_speed(iter/s)": 0.201202 }, { "acc": 0.72882576, "epoch": 0.28297818366311517, "grad_norm": 2.1875, "learning_rate": 9.77290066971743e-06, "loss": 1.10510101, "memory(GiB)": 368.61, "step": 11155, "train_speed(iter/s)": 0.20122 }, { "acc": 0.74298382, "epoch": 0.2831050228310502, "grad_norm": 2.15625, "learning_rate": 9.772588122813358e-06, "loss": 1.02716274, "memory(GiB)": 368.61, "step": 11160, "train_speed(iter/s)": 0.201235 }, { "acc": 0.7525321, "epoch": 0.2832318619989853, "grad_norm": 2.203125, "learning_rate": 9.772275365989548e-06, "loss": 1.05582542, "memory(GiB)": 368.61, "step": 11165, "train_speed(iter/s)": 0.201232 }, { "acc": 0.75251617, "epoch": 0.28335870116692036, "grad_norm": 2.078125, "learning_rate": 9.771962399259764e-06, "loss": 1.02531157, "memory(GiB)": 368.61, "step": 11170, "train_speed(iter/s)": 0.201245 }, { "acc": 0.75330453, "epoch": 0.2834855403348554, "grad_norm": 2.15625, "learning_rate": 9.771649222637767e-06, "loss": 0.95045624, "memory(GiB)": 368.61, "step": 11175, "train_speed(iter/s)": 0.201253 }, { "acc": 0.72427635, "epoch": 0.28361237950279045, "grad_norm": 2.375, "learning_rate": 9.771335836137332e-06, "loss": 1.06611996, "memory(GiB)": 368.61, "step": 11180, "train_speed(iter/s)": 0.201268 }, { "acc": 0.74966726, "epoch": 0.28373921867072555, "grad_norm": 2.1875, "learning_rate": 9.771022239772248e-06, "loss": 0.98977671, "memory(GiB)": 368.61, "step": 11185, "train_speed(iter/s)": 0.201276 }, { "acc": 0.73853025, "epoch": 0.2838660578386606, "grad_norm": 2.09375, "learning_rate": 9.770708433556302e-06, "loss": 1.07847004, "memory(GiB)": 368.61, "step": 11190, "train_speed(iter/s)": 0.201291 }, { "acc": 0.72840629, "epoch": 0.28399289700659563, "grad_norm": 2.234375, "learning_rate": 9.7703944175033e-06, "loss": 1.07345619, "memory(GiB)": 368.61, "step": 11195, "train_speed(iter/s)": 0.201306 }, { "acc": 0.75124903, "epoch": 0.2841197361745307, "grad_norm": 2.125, "learning_rate": 9.770080191627054e-06, "loss": 0.98018627, "memory(GiB)": 368.61, "step": 11200, "train_speed(iter/s)": 0.201322 }, { "acc": 0.74627209, "epoch": 0.2842465753424658, "grad_norm": 1.8203125, "learning_rate": 9.769765755941383e-06, "loss": 1.01201134, "memory(GiB)": 368.61, "step": 11205, "train_speed(iter/s)": 0.201338 }, { "acc": 0.74515867, "epoch": 0.2843734145104008, "grad_norm": 2.25, "learning_rate": 9.76945111046012e-06, "loss": 1.02043581, "memory(GiB)": 368.61, "step": 11210, "train_speed(iter/s)": 0.201356 }, { "acc": 0.73218708, "epoch": 0.28450025367833587, "grad_norm": 2.390625, "learning_rate": 9.769136255197103e-06, "loss": 1.09457207, "memory(GiB)": 368.61, "step": 11215, "train_speed(iter/s)": 0.201371 }, { "acc": 0.73439336, "epoch": 0.2846270928462709, "grad_norm": 2.421875, "learning_rate": 9.768821190166179e-06, "loss": 1.03607521, "memory(GiB)": 368.61, "step": 11220, "train_speed(iter/s)": 0.201391 }, { "acc": 0.74446993, "epoch": 0.284753932014206, "grad_norm": 2.15625, "learning_rate": 9.76850591538121e-06, "loss": 0.99753857, "memory(GiB)": 368.61, "step": 11225, "train_speed(iter/s)": 0.201405 }, { "acc": 0.735537, "epoch": 0.28488077118214106, "grad_norm": 2.265625, "learning_rate": 9.76819043085606e-06, "loss": 1.03155441, "memory(GiB)": 368.61, "step": 11230, "train_speed(iter/s)": 0.201415 }, { "acc": 0.75350094, "epoch": 0.2850076103500761, "grad_norm": 1.8515625, "learning_rate": 9.767874736604605e-06, "loss": 0.99318428, "memory(GiB)": 368.61, "step": 11235, "train_speed(iter/s)": 0.201424 }, { "acc": 0.74832001, "epoch": 0.28513444951801115, "grad_norm": 2.21875, "learning_rate": 9.767558832640734e-06, "loss": 1.00259018, "memory(GiB)": 368.61, "step": 11240, "train_speed(iter/s)": 0.201439 }, { "acc": 0.73759012, "epoch": 0.28526128868594625, "grad_norm": 2.234375, "learning_rate": 9.76724271897834e-06, "loss": 0.99809608, "memory(GiB)": 368.61, "step": 11245, "train_speed(iter/s)": 0.201451 }, { "acc": 0.73372288, "epoch": 0.2853881278538813, "grad_norm": 2.375, "learning_rate": 9.766926395631326e-06, "loss": 1.0931447, "memory(GiB)": 368.61, "step": 11250, "train_speed(iter/s)": 0.201469 }, { "acc": 0.76177025, "epoch": 0.28551496702181633, "grad_norm": 2.5, "learning_rate": 9.766609862613607e-06, "loss": 0.95476437, "memory(GiB)": 368.61, "step": 11255, "train_speed(iter/s)": 0.201479 }, { "acc": 0.74367638, "epoch": 0.2856418061897514, "grad_norm": 2.34375, "learning_rate": 9.766293119939104e-06, "loss": 1.05812521, "memory(GiB)": 368.61, "step": 11260, "train_speed(iter/s)": 0.201492 }, { "acc": 0.74676905, "epoch": 0.2857686453576865, "grad_norm": 2.0625, "learning_rate": 9.76597616762175e-06, "loss": 1.04995842, "memory(GiB)": 368.61, "step": 11265, "train_speed(iter/s)": 0.201504 }, { "acc": 0.73807669, "epoch": 0.2858954845256215, "grad_norm": 1.734375, "learning_rate": 9.765659005675488e-06, "loss": 1.00955982, "memory(GiB)": 368.61, "step": 11270, "train_speed(iter/s)": 0.201515 }, { "acc": 0.72713356, "epoch": 0.28602232369355657, "grad_norm": 2.0, "learning_rate": 9.765341634114263e-06, "loss": 1.07335424, "memory(GiB)": 368.61, "step": 11275, "train_speed(iter/s)": 0.201527 }, { "acc": 0.73723125, "epoch": 0.2861491628614916, "grad_norm": 2.1875, "learning_rate": 9.765024052952037e-06, "loss": 1.05208559, "memory(GiB)": 368.61, "step": 11280, "train_speed(iter/s)": 0.201545 }, { "acc": 0.7379653, "epoch": 0.2862760020294267, "grad_norm": 2.109375, "learning_rate": 9.76470626220278e-06, "loss": 1.05100355, "memory(GiB)": 368.61, "step": 11285, "train_speed(iter/s)": 0.201558 }, { "acc": 0.74259109, "epoch": 0.28640284119736176, "grad_norm": 2.28125, "learning_rate": 9.76438826188047e-06, "loss": 1.02367802, "memory(GiB)": 368.61, "step": 11290, "train_speed(iter/s)": 0.201569 }, { "acc": 0.73262224, "epoch": 0.2865296803652968, "grad_norm": 2.578125, "learning_rate": 9.76407005199909e-06, "loss": 1.05033207, "memory(GiB)": 368.61, "step": 11295, "train_speed(iter/s)": 0.201577 }, { "acc": 0.73545299, "epoch": 0.28665651953323185, "grad_norm": 1.6640625, "learning_rate": 9.76375163257264e-06, "loss": 1.03729954, "memory(GiB)": 368.61, "step": 11300, "train_speed(iter/s)": 0.201589 }, { "acc": 0.74997616, "epoch": 0.28678335870116695, "grad_norm": 2.328125, "learning_rate": 9.763433003615124e-06, "loss": 1.02053337, "memory(GiB)": 368.61, "step": 11305, "train_speed(iter/s)": 0.2016 }, { "acc": 0.73969069, "epoch": 0.286910197869102, "grad_norm": 2.328125, "learning_rate": 9.763114165140559e-06, "loss": 1.10044289, "memory(GiB)": 368.61, "step": 11310, "train_speed(iter/s)": 0.201611 }, { "acc": 0.73471861, "epoch": 0.28703703703703703, "grad_norm": 2.046875, "learning_rate": 9.762795117162967e-06, "loss": 1.06960278, "memory(GiB)": 368.61, "step": 11315, "train_speed(iter/s)": 0.201622 }, { "acc": 0.75227108, "epoch": 0.2871638762049721, "grad_norm": 2.25, "learning_rate": 9.76247585969638e-06, "loss": 1.02155533, "memory(GiB)": 368.61, "step": 11320, "train_speed(iter/s)": 0.201631 }, { "acc": 0.7449254, "epoch": 0.2872907153729072, "grad_norm": 2.03125, "learning_rate": 9.762156392754842e-06, "loss": 0.98798828, "memory(GiB)": 368.61, "step": 11325, "train_speed(iter/s)": 0.201641 }, { "acc": 0.7199131, "epoch": 0.2874175545408422, "grad_norm": 2.234375, "learning_rate": 9.761836716352405e-06, "loss": 1.06514874, "memory(GiB)": 368.61, "step": 11330, "train_speed(iter/s)": 0.201647 }, { "acc": 0.73123631, "epoch": 0.28754439370877727, "grad_norm": 2.125, "learning_rate": 9.761516830503128e-06, "loss": 1.0679987, "memory(GiB)": 368.61, "step": 11335, "train_speed(iter/s)": 0.201668 }, { "acc": 0.74868622, "epoch": 0.2876712328767123, "grad_norm": 2.25, "learning_rate": 9.761196735221083e-06, "loss": 1.04511738, "memory(GiB)": 368.61, "step": 11340, "train_speed(iter/s)": 0.201685 }, { "acc": 0.74486876, "epoch": 0.2877980720446474, "grad_norm": 2.171875, "learning_rate": 9.76087643052035e-06, "loss": 1.04228573, "memory(GiB)": 368.61, "step": 11345, "train_speed(iter/s)": 0.201699 }, { "acc": 0.74404182, "epoch": 0.28792491121258246, "grad_norm": 2.125, "learning_rate": 9.760555916415015e-06, "loss": 1.03346958, "memory(GiB)": 368.61, "step": 11350, "train_speed(iter/s)": 0.201707 }, { "acc": 0.74645767, "epoch": 0.2880517503805175, "grad_norm": 1.984375, "learning_rate": 9.760235192919175e-06, "loss": 1.02979984, "memory(GiB)": 368.61, "step": 11355, "train_speed(iter/s)": 0.201724 }, { "acc": 0.73324766, "epoch": 0.28817858954845255, "grad_norm": 2.328125, "learning_rate": 9.75991426004694e-06, "loss": 1.058288, "memory(GiB)": 368.61, "step": 11360, "train_speed(iter/s)": 0.201738 }, { "acc": 0.7430922, "epoch": 0.28830542871638765, "grad_norm": 2.015625, "learning_rate": 9.759593117812423e-06, "loss": 1.04075966, "memory(GiB)": 368.61, "step": 11365, "train_speed(iter/s)": 0.201748 }, { "acc": 0.74775548, "epoch": 0.2884322678843227, "grad_norm": 2.265625, "learning_rate": 9.75927176622975e-06, "loss": 1.03459263, "memory(GiB)": 368.61, "step": 11370, "train_speed(iter/s)": 0.201759 }, { "acc": 0.7307682, "epoch": 0.28855910705225774, "grad_norm": 2.171875, "learning_rate": 9.758950205313057e-06, "loss": 1.07675743, "memory(GiB)": 368.61, "step": 11375, "train_speed(iter/s)": 0.201766 }, { "acc": 0.75266209, "epoch": 0.2886859462201928, "grad_norm": 1.8671875, "learning_rate": 9.758628435076488e-06, "loss": 0.97514763, "memory(GiB)": 368.61, "step": 11380, "train_speed(iter/s)": 0.201764 }, { "acc": 0.73616581, "epoch": 0.2888127853881279, "grad_norm": 2.234375, "learning_rate": 9.758306455534193e-06, "loss": 1.05899506, "memory(GiB)": 368.61, "step": 11385, "train_speed(iter/s)": 0.201774 }, { "acc": 0.73310947, "epoch": 0.2889396245560629, "grad_norm": 2.234375, "learning_rate": 9.757984266700336e-06, "loss": 1.03921518, "memory(GiB)": 368.61, "step": 11390, "train_speed(iter/s)": 0.20179 }, { "acc": 0.74580784, "epoch": 0.28906646372399797, "grad_norm": 2.375, "learning_rate": 9.75766186858909e-06, "loss": 0.99506273, "memory(GiB)": 368.61, "step": 11395, "train_speed(iter/s)": 0.201805 }, { "acc": 0.73922, "epoch": 0.289193302891933, "grad_norm": 2.25, "learning_rate": 9.757339261214631e-06, "loss": 1.08105087, "memory(GiB)": 368.61, "step": 11400, "train_speed(iter/s)": 0.201826 }, { "acc": 0.73293519, "epoch": 0.2893201420598681, "grad_norm": 1.9375, "learning_rate": 9.757016444591152e-06, "loss": 1.0504426, "memory(GiB)": 368.61, "step": 11405, "train_speed(iter/s)": 0.201835 }, { "acc": 0.73782043, "epoch": 0.28944698122780316, "grad_norm": 1.9140625, "learning_rate": 9.756693418732852e-06, "loss": 1.0526926, "memory(GiB)": 368.61, "step": 11410, "train_speed(iter/s)": 0.201851 }, { "acc": 0.74042492, "epoch": 0.2895738203957382, "grad_norm": 2.1875, "learning_rate": 9.756370183653938e-06, "loss": 1.09948606, "memory(GiB)": 368.61, "step": 11415, "train_speed(iter/s)": 0.201862 }, { "acc": 0.7492043, "epoch": 0.28970065956367325, "grad_norm": 2.5, "learning_rate": 9.756046739368628e-06, "loss": 0.98540554, "memory(GiB)": 368.61, "step": 11420, "train_speed(iter/s)": 0.201874 }, { "acc": 0.73563905, "epoch": 0.28982749873160835, "grad_norm": 2.171875, "learning_rate": 9.755723085891147e-06, "loss": 1.09876862, "memory(GiB)": 368.61, "step": 11425, "train_speed(iter/s)": 0.201887 }, { "acc": 0.7525959, "epoch": 0.2899543378995434, "grad_norm": 2.3125, "learning_rate": 9.755399223235734e-06, "loss": 0.98123837, "memory(GiB)": 368.61, "step": 11430, "train_speed(iter/s)": 0.201897 }, { "acc": 0.74136868, "epoch": 0.29008117706747844, "grad_norm": 2.5625, "learning_rate": 9.75507515141663e-06, "loss": 1.01826534, "memory(GiB)": 368.61, "step": 11435, "train_speed(iter/s)": 0.201913 }, { "acc": 0.74607401, "epoch": 0.2902080162354135, "grad_norm": 2.3125, "learning_rate": 9.75475087044809e-06, "loss": 1.02709713, "memory(GiB)": 368.61, "step": 11440, "train_speed(iter/s)": 0.20192 }, { "acc": 0.73906708, "epoch": 0.2903348554033486, "grad_norm": 2.234375, "learning_rate": 9.754426380344382e-06, "loss": 1.05570183, "memory(GiB)": 368.61, "step": 11445, "train_speed(iter/s)": 0.201932 }, { "acc": 0.73738346, "epoch": 0.2904616945712836, "grad_norm": 2.203125, "learning_rate": 9.754101681119772e-06, "loss": 1.06360283, "memory(GiB)": 368.61, "step": 11450, "train_speed(iter/s)": 0.20195 }, { "acc": 0.72609801, "epoch": 0.29058853373921867, "grad_norm": 2.359375, "learning_rate": 9.753776772788545e-06, "loss": 1.06171923, "memory(GiB)": 368.61, "step": 11455, "train_speed(iter/s)": 0.201965 }, { "acc": 0.74091978, "epoch": 0.2907153729071537, "grad_norm": 2.0, "learning_rate": 9.753451655364992e-06, "loss": 1.02952003, "memory(GiB)": 368.61, "step": 11460, "train_speed(iter/s)": 0.201982 }, { "acc": 0.74209337, "epoch": 0.2908422120750888, "grad_norm": 2.0625, "learning_rate": 9.75312632886341e-06, "loss": 1.07847805, "memory(GiB)": 368.61, "step": 11465, "train_speed(iter/s)": 0.201997 }, { "acc": 0.76478291, "epoch": 0.29096905124302386, "grad_norm": 1.96875, "learning_rate": 9.752800793298113e-06, "loss": 0.97937279, "memory(GiB)": 368.61, "step": 11470, "train_speed(iter/s)": 0.202012 }, { "acc": 0.74322538, "epoch": 0.2910958904109589, "grad_norm": 2.234375, "learning_rate": 9.752475048683419e-06, "loss": 1.01258221, "memory(GiB)": 368.61, "step": 11475, "train_speed(iter/s)": 0.202022 }, { "acc": 0.74444847, "epoch": 0.29122272957889395, "grad_norm": 1.8671875, "learning_rate": 9.752149095033651e-06, "loss": 0.99958305, "memory(GiB)": 368.61, "step": 11480, "train_speed(iter/s)": 0.202036 }, { "acc": 0.75357866, "epoch": 0.29134956874682905, "grad_norm": 2.46875, "learning_rate": 9.75182293236315e-06, "loss": 0.97968712, "memory(GiB)": 368.61, "step": 11485, "train_speed(iter/s)": 0.202049 }, { "acc": 0.74148564, "epoch": 0.2914764079147641, "grad_norm": 2.546875, "learning_rate": 9.751496560686262e-06, "loss": 0.95232821, "memory(GiB)": 368.61, "step": 11490, "train_speed(iter/s)": 0.202064 }, { "acc": 0.73680248, "epoch": 0.29160324708269914, "grad_norm": 2.09375, "learning_rate": 9.751169980017341e-06, "loss": 1.0028059, "memory(GiB)": 368.61, "step": 11495, "train_speed(iter/s)": 0.202078 }, { "acc": 0.74449115, "epoch": 0.2917300862506342, "grad_norm": 1.921875, "learning_rate": 9.750843190370752e-06, "loss": 1.00633583, "memory(GiB)": 368.61, "step": 11500, "train_speed(iter/s)": 0.202095 }, { "acc": 0.74885712, "epoch": 0.2918569254185693, "grad_norm": 1.6328125, "learning_rate": 9.750516191760868e-06, "loss": 1.01937199, "memory(GiB)": 368.61, "step": 11505, "train_speed(iter/s)": 0.202106 }, { "acc": 0.74517107, "epoch": 0.2919837645865043, "grad_norm": 1.671875, "learning_rate": 9.750188984202073e-06, "loss": 0.99736423, "memory(GiB)": 368.61, "step": 11510, "train_speed(iter/s)": 0.202115 }, { "acc": 0.7373961, "epoch": 0.29211060375443937, "grad_norm": 2.375, "learning_rate": 9.749861567708759e-06, "loss": 1.08723059, "memory(GiB)": 368.61, "step": 11515, "train_speed(iter/s)": 0.202129 }, { "acc": 0.72764997, "epoch": 0.2922374429223744, "grad_norm": 2.3125, "learning_rate": 9.749533942295323e-06, "loss": 1.09884834, "memory(GiB)": 368.61, "step": 11520, "train_speed(iter/s)": 0.202145 }, { "acc": 0.74290113, "epoch": 0.2923642820903095, "grad_norm": 2.65625, "learning_rate": 9.749206107976183e-06, "loss": 1.00366039, "memory(GiB)": 368.61, "step": 11525, "train_speed(iter/s)": 0.202165 }, { "acc": 0.72552338, "epoch": 0.29249112125824456, "grad_norm": 2.53125, "learning_rate": 9.748878064765753e-06, "loss": 1.09335003, "memory(GiB)": 368.61, "step": 11530, "train_speed(iter/s)": 0.202178 }, { "acc": 0.73707838, "epoch": 0.2926179604261796, "grad_norm": 2.46875, "learning_rate": 9.748549812678466e-06, "loss": 1.08805523, "memory(GiB)": 368.61, "step": 11535, "train_speed(iter/s)": 0.202194 }, { "acc": 0.74491825, "epoch": 0.29274479959411465, "grad_norm": 2.09375, "learning_rate": 9.748221351728754e-06, "loss": 1.05634584, "memory(GiB)": 368.61, "step": 11540, "train_speed(iter/s)": 0.202211 }, { "acc": 0.73912678, "epoch": 0.29287163876204975, "grad_norm": 2.359375, "learning_rate": 9.747892681931067e-06, "loss": 1.01501312, "memory(GiB)": 368.61, "step": 11545, "train_speed(iter/s)": 0.202225 }, { "acc": 0.74246254, "epoch": 0.2929984779299848, "grad_norm": 2.28125, "learning_rate": 9.747563803299865e-06, "loss": 1.05151024, "memory(GiB)": 368.61, "step": 11550, "train_speed(iter/s)": 0.202242 }, { "acc": 0.74282546, "epoch": 0.29312531709791984, "grad_norm": 2.171875, "learning_rate": 9.74723471584961e-06, "loss": 1.00483932, "memory(GiB)": 368.61, "step": 11555, "train_speed(iter/s)": 0.202251 }, { "acc": 0.7394969, "epoch": 0.2932521562658549, "grad_norm": 2.09375, "learning_rate": 9.746905419594777e-06, "loss": 1.01453781, "memory(GiB)": 368.61, "step": 11560, "train_speed(iter/s)": 0.202266 }, { "acc": 0.73205585, "epoch": 0.29337899543379, "grad_norm": 2.25, "learning_rate": 9.746575914549851e-06, "loss": 1.06521921, "memory(GiB)": 368.61, "step": 11565, "train_speed(iter/s)": 0.202281 }, { "acc": 0.73209696, "epoch": 0.293505834601725, "grad_norm": 2.25, "learning_rate": 9.746246200729323e-06, "loss": 1.09627075, "memory(GiB)": 368.61, "step": 11570, "train_speed(iter/s)": 0.202294 }, { "acc": 0.74932518, "epoch": 0.29363267376966007, "grad_norm": 1.828125, "learning_rate": 9.745916278147696e-06, "loss": 0.98917341, "memory(GiB)": 368.61, "step": 11575, "train_speed(iter/s)": 0.202305 }, { "acc": 0.74393272, "epoch": 0.2937595129375951, "grad_norm": 2.953125, "learning_rate": 9.745586146819484e-06, "loss": 1.04295454, "memory(GiB)": 368.61, "step": 11580, "train_speed(iter/s)": 0.202313 }, { "acc": 0.74392433, "epoch": 0.2938863521055302, "grad_norm": 2.0625, "learning_rate": 9.745255806759205e-06, "loss": 1.11278324, "memory(GiB)": 368.61, "step": 11585, "train_speed(iter/s)": 0.202328 }, { "acc": 0.73842945, "epoch": 0.29401319127346526, "grad_norm": 2.21875, "learning_rate": 9.74492525798139e-06, "loss": 1.0681448, "memory(GiB)": 368.61, "step": 11590, "train_speed(iter/s)": 0.20234 }, { "acc": 0.73248172, "epoch": 0.2941400304414003, "grad_norm": 2.09375, "learning_rate": 9.744594500500578e-06, "loss": 1.04514561, "memory(GiB)": 368.61, "step": 11595, "train_speed(iter/s)": 0.202352 }, { "acc": 0.73737655, "epoch": 0.29426686960933535, "grad_norm": 2.125, "learning_rate": 9.744263534331315e-06, "loss": 1.06463375, "memory(GiB)": 368.61, "step": 11600, "train_speed(iter/s)": 0.202364 }, { "acc": 0.74936399, "epoch": 0.29439370877727045, "grad_norm": 2.28125, "learning_rate": 9.743932359488161e-06, "loss": 1.01069241, "memory(GiB)": 368.61, "step": 11605, "train_speed(iter/s)": 0.202378 }, { "acc": 0.73003612, "epoch": 0.2945205479452055, "grad_norm": 1.9375, "learning_rate": 9.743600975985681e-06, "loss": 1.06587048, "memory(GiB)": 368.61, "step": 11610, "train_speed(iter/s)": 0.202391 }, { "acc": 0.74953022, "epoch": 0.29464738711314054, "grad_norm": 2.125, "learning_rate": 9.743269383838452e-06, "loss": 1.05921307, "memory(GiB)": 368.61, "step": 11615, "train_speed(iter/s)": 0.202407 }, { "acc": 0.74757338, "epoch": 0.2947742262810756, "grad_norm": 2.5, "learning_rate": 9.74293758306106e-06, "loss": 1.0070178, "memory(GiB)": 368.61, "step": 11620, "train_speed(iter/s)": 0.202419 }, { "acc": 0.73061705, "epoch": 0.2949010654490107, "grad_norm": 2.734375, "learning_rate": 9.742605573668096e-06, "loss": 1.02904034, "memory(GiB)": 368.61, "step": 11625, "train_speed(iter/s)": 0.20243 }, { "acc": 0.74721384, "epoch": 0.2950279046169457, "grad_norm": 2.671875, "learning_rate": 9.742273355674164e-06, "loss": 1.0983283, "memory(GiB)": 368.61, "step": 11630, "train_speed(iter/s)": 0.202445 }, { "acc": 0.75154381, "epoch": 0.29515474378488077, "grad_norm": 1.9921875, "learning_rate": 9.741940929093879e-06, "loss": 1.05910826, "memory(GiB)": 368.61, "step": 11635, "train_speed(iter/s)": 0.202456 }, { "acc": 0.75306649, "epoch": 0.2952815829528158, "grad_norm": 1.921875, "learning_rate": 9.741608293941858e-06, "loss": 0.96722565, "memory(GiB)": 368.61, "step": 11640, "train_speed(iter/s)": 0.202461 }, { "acc": 0.74494939, "epoch": 0.2954084221207509, "grad_norm": 2.15625, "learning_rate": 9.741275450232736e-06, "loss": 1.05114784, "memory(GiB)": 368.61, "step": 11645, "train_speed(iter/s)": 0.20247 }, { "acc": 0.75120192, "epoch": 0.29553526128868596, "grad_norm": 2.15625, "learning_rate": 9.740942397981151e-06, "loss": 0.96019449, "memory(GiB)": 368.61, "step": 11650, "train_speed(iter/s)": 0.202479 }, { "acc": 0.74067287, "epoch": 0.295662100456621, "grad_norm": 2.125, "learning_rate": 9.740609137201752e-06, "loss": 1.03900204, "memory(GiB)": 368.61, "step": 11655, "train_speed(iter/s)": 0.20249 }, { "acc": 0.74535441, "epoch": 0.29578893962455605, "grad_norm": 2.3125, "learning_rate": 9.7402756679092e-06, "loss": 1.03117676, "memory(GiB)": 368.61, "step": 11660, "train_speed(iter/s)": 0.2025 }, { "acc": 0.74667139, "epoch": 0.29591577879249115, "grad_norm": 2.328125, "learning_rate": 9.739941990118157e-06, "loss": 0.99032192, "memory(GiB)": 368.61, "step": 11665, "train_speed(iter/s)": 0.202512 }, { "acc": 0.74817729, "epoch": 0.2960426179604262, "grad_norm": 1.8984375, "learning_rate": 9.739608103843306e-06, "loss": 1.01214046, "memory(GiB)": 368.61, "step": 11670, "train_speed(iter/s)": 0.202511 }, { "acc": 0.73924875, "epoch": 0.29616945712836124, "grad_norm": 1.9140625, "learning_rate": 9.739274009099328e-06, "loss": 1.05162811, "memory(GiB)": 368.61, "step": 11675, "train_speed(iter/s)": 0.202523 }, { "acc": 0.7468421, "epoch": 0.2962962962962963, "grad_norm": 2.15625, "learning_rate": 9.738939705900922e-06, "loss": 1.03213167, "memory(GiB)": 368.61, "step": 11680, "train_speed(iter/s)": 0.20254 }, { "acc": 0.75023251, "epoch": 0.2964231354642314, "grad_norm": 1.7421875, "learning_rate": 9.738605194262787e-06, "loss": 1.01048746, "memory(GiB)": 368.61, "step": 11685, "train_speed(iter/s)": 0.202555 }, { "acc": 0.74160066, "epoch": 0.2965499746321664, "grad_norm": 1.890625, "learning_rate": 9.738270474199641e-06, "loss": 1.05949249, "memory(GiB)": 368.61, "step": 11690, "train_speed(iter/s)": 0.202568 }, { "acc": 0.73413515, "epoch": 0.29667681380010147, "grad_norm": 2.625, "learning_rate": 9.737935545726205e-06, "loss": 1.07186832, "memory(GiB)": 368.61, "step": 11695, "train_speed(iter/s)": 0.202581 }, { "acc": 0.74038987, "epoch": 0.2968036529680365, "grad_norm": 2.1875, "learning_rate": 9.737600408857208e-06, "loss": 1.03317413, "memory(GiB)": 368.61, "step": 11700, "train_speed(iter/s)": 0.202596 }, { "acc": 0.73995838, "epoch": 0.2969304921359716, "grad_norm": 2.296875, "learning_rate": 9.737265063607395e-06, "loss": 1.04908657, "memory(GiB)": 368.61, "step": 11705, "train_speed(iter/s)": 0.20261 }, { "acc": 0.74489689, "epoch": 0.29705733130390666, "grad_norm": 2.375, "learning_rate": 9.736929509991515e-06, "loss": 1.03436317, "memory(GiB)": 368.61, "step": 11710, "train_speed(iter/s)": 0.202619 }, { "acc": 0.7508039, "epoch": 0.2971841704718417, "grad_norm": 2.390625, "learning_rate": 9.736593748024325e-06, "loss": 1.04204531, "memory(GiB)": 368.61, "step": 11715, "train_speed(iter/s)": 0.202637 }, { "acc": 0.75307827, "epoch": 0.29731100963977675, "grad_norm": 1.9140625, "learning_rate": 9.736257777720595e-06, "loss": 1.01586151, "memory(GiB)": 368.61, "step": 11720, "train_speed(iter/s)": 0.202653 }, { "acc": 0.75209694, "epoch": 0.29743784880771185, "grad_norm": 2.5625, "learning_rate": 9.735921599095101e-06, "loss": 1.06126261, "memory(GiB)": 368.61, "step": 11725, "train_speed(iter/s)": 0.202668 }, { "acc": 0.74971323, "epoch": 0.2975646879756469, "grad_norm": 2.0625, "learning_rate": 9.735585212162633e-06, "loss": 1.02071991, "memory(GiB)": 368.61, "step": 11730, "train_speed(iter/s)": 0.202685 }, { "acc": 0.7408618, "epoch": 0.29769152714358194, "grad_norm": 2.515625, "learning_rate": 9.735248616937983e-06, "loss": 1.05712986, "memory(GiB)": 368.61, "step": 11735, "train_speed(iter/s)": 0.202699 }, { "acc": 0.74035282, "epoch": 0.297818366311517, "grad_norm": 2.015625, "learning_rate": 9.734911813435957e-06, "loss": 1.04072552, "memory(GiB)": 368.61, "step": 11740, "train_speed(iter/s)": 0.202706 }, { "acc": 0.73707514, "epoch": 0.2979452054794521, "grad_norm": 2.015625, "learning_rate": 9.73457480167137e-06, "loss": 1.13941221, "memory(GiB)": 368.61, "step": 11745, "train_speed(iter/s)": 0.202723 }, { "acc": 0.74752808, "epoch": 0.2980720446473871, "grad_norm": 2.328125, "learning_rate": 9.734237581659045e-06, "loss": 1.04767227, "memory(GiB)": 368.61, "step": 11750, "train_speed(iter/s)": 0.202738 }, { "acc": 0.74648104, "epoch": 0.29819888381532217, "grad_norm": 2.140625, "learning_rate": 9.733900153413813e-06, "loss": 0.97350559, "memory(GiB)": 368.61, "step": 11755, "train_speed(iter/s)": 0.202743 }, { "acc": 0.74012728, "epoch": 0.2983257229832572, "grad_norm": 2.375, "learning_rate": 9.733562516950519e-06, "loss": 1.03666859, "memory(GiB)": 368.61, "step": 11760, "train_speed(iter/s)": 0.202753 }, { "acc": 0.73602018, "epoch": 0.2984525621511923, "grad_norm": 2.421875, "learning_rate": 9.73322467228401e-06, "loss": 1.02354975, "memory(GiB)": 368.61, "step": 11765, "train_speed(iter/s)": 0.202763 }, { "acc": 0.74549727, "epoch": 0.29857940131912736, "grad_norm": 2.78125, "learning_rate": 9.73288661942915e-06, "loss": 0.99959431, "memory(GiB)": 368.61, "step": 11770, "train_speed(iter/s)": 0.202773 }, { "acc": 0.75084133, "epoch": 0.2987062404870624, "grad_norm": 2.390625, "learning_rate": 9.732548358400802e-06, "loss": 0.99353876, "memory(GiB)": 368.61, "step": 11775, "train_speed(iter/s)": 0.202776 }, { "acc": 0.74389143, "epoch": 0.29883307965499745, "grad_norm": 1.859375, "learning_rate": 9.73220988921385e-06, "loss": 0.97706451, "memory(GiB)": 368.61, "step": 11780, "train_speed(iter/s)": 0.202777 }, { "acc": 0.74268432, "epoch": 0.29895991882293255, "grad_norm": 2.09375, "learning_rate": 9.73187121188318e-06, "loss": 1.0041666, "memory(GiB)": 368.61, "step": 11785, "train_speed(iter/s)": 0.202794 }, { "acc": 0.75618181, "epoch": 0.2990867579908676, "grad_norm": 2.609375, "learning_rate": 9.731532326423686e-06, "loss": 1.01919556, "memory(GiB)": 368.61, "step": 11790, "train_speed(iter/s)": 0.20281 }, { "acc": 0.75641499, "epoch": 0.29921359715880264, "grad_norm": 1.9921875, "learning_rate": 9.731193232850277e-06, "loss": 0.99652367, "memory(GiB)": 368.61, "step": 11795, "train_speed(iter/s)": 0.20282 }, { "acc": 0.73858175, "epoch": 0.2993404363267377, "grad_norm": 1.8046875, "learning_rate": 9.730853931177866e-06, "loss": 1.0259819, "memory(GiB)": 368.61, "step": 11800, "train_speed(iter/s)": 0.202833 }, { "acc": 0.74554935, "epoch": 0.2994672754946728, "grad_norm": 2.4375, "learning_rate": 9.730514421421378e-06, "loss": 1.03262711, "memory(GiB)": 368.61, "step": 11805, "train_speed(iter/s)": 0.202845 }, { "acc": 0.74116535, "epoch": 0.2995941146626078, "grad_norm": 1.8828125, "learning_rate": 9.730174703595745e-06, "loss": 1.03538761, "memory(GiB)": 368.61, "step": 11810, "train_speed(iter/s)": 0.202849 }, { "acc": 0.74581308, "epoch": 0.29972095383054287, "grad_norm": 1.9921875, "learning_rate": 9.72983477771591e-06, "loss": 0.98349667, "memory(GiB)": 368.61, "step": 11815, "train_speed(iter/s)": 0.202857 }, { "acc": 0.73513393, "epoch": 0.2998477929984779, "grad_norm": 1.828125, "learning_rate": 9.729494643796823e-06, "loss": 1.10805349, "memory(GiB)": 368.61, "step": 11820, "train_speed(iter/s)": 0.202873 }, { "acc": 0.73883076, "epoch": 0.299974632166413, "grad_norm": 2.15625, "learning_rate": 9.729154301853448e-06, "loss": 1.03671989, "memory(GiB)": 368.61, "step": 11825, "train_speed(iter/s)": 0.202887 }, { "acc": 0.7377317, "epoch": 0.30010147133434806, "grad_norm": 2.953125, "learning_rate": 9.72881375190075e-06, "loss": 1.08760052, "memory(GiB)": 368.61, "step": 11830, "train_speed(iter/s)": 0.202892 }, { "acc": 0.75760913, "epoch": 0.3002283105022831, "grad_norm": 1.8671875, "learning_rate": 9.728472993953712e-06, "loss": 0.9783493, "memory(GiB)": 368.61, "step": 11835, "train_speed(iter/s)": 0.202899 }, { "acc": 0.74249954, "epoch": 0.30035514967021815, "grad_norm": 1.9375, "learning_rate": 9.728132028027323e-06, "loss": 0.95591145, "memory(GiB)": 368.61, "step": 11840, "train_speed(iter/s)": 0.202912 }, { "acc": 0.75377731, "epoch": 0.30048198883815325, "grad_norm": 2.0, "learning_rate": 9.727790854136573e-06, "loss": 1.02691479, "memory(GiB)": 368.61, "step": 11845, "train_speed(iter/s)": 0.202923 }, { "acc": 0.75271926, "epoch": 0.3006088280060883, "grad_norm": 1.8359375, "learning_rate": 9.727449472296476e-06, "loss": 0.98577824, "memory(GiB)": 368.61, "step": 11850, "train_speed(iter/s)": 0.202928 }, { "acc": 0.74081802, "epoch": 0.30073566717402334, "grad_norm": 2.484375, "learning_rate": 9.727107882522045e-06, "loss": 1.08783245, "memory(GiB)": 368.61, "step": 11855, "train_speed(iter/s)": 0.202945 }, { "acc": 0.74238811, "epoch": 0.3008625063419584, "grad_norm": 2.359375, "learning_rate": 9.726766084828303e-06, "loss": 1.00902815, "memory(GiB)": 368.61, "step": 11860, "train_speed(iter/s)": 0.202955 }, { "acc": 0.73810472, "epoch": 0.3009893455098935, "grad_norm": 2.078125, "learning_rate": 9.726424079230286e-06, "loss": 1.01635094, "memory(GiB)": 368.61, "step": 11865, "train_speed(iter/s)": 0.20296 }, { "acc": 0.74224777, "epoch": 0.3011161846778285, "grad_norm": 2.0625, "learning_rate": 9.726081865743036e-06, "loss": 1.02132244, "memory(GiB)": 368.61, "step": 11870, "train_speed(iter/s)": 0.202976 }, { "acc": 0.76244674, "epoch": 0.30124302384576357, "grad_norm": 1.953125, "learning_rate": 9.725739444381603e-06, "loss": 1.00475559, "memory(GiB)": 368.61, "step": 11875, "train_speed(iter/s)": 0.20299 }, { "acc": 0.74091854, "epoch": 0.3013698630136986, "grad_norm": 1.9921875, "learning_rate": 9.725396815161053e-06, "loss": 1.06604633, "memory(GiB)": 368.61, "step": 11880, "train_speed(iter/s)": 0.203001 }, { "acc": 0.73840461, "epoch": 0.3014967021816337, "grad_norm": 2.140625, "learning_rate": 9.725053978096453e-06, "loss": 1.09086094, "memory(GiB)": 368.61, "step": 11885, "train_speed(iter/s)": 0.203012 }, { "acc": 0.73310051, "epoch": 0.30162354134956876, "grad_norm": 2.015625, "learning_rate": 9.724710933202884e-06, "loss": 1.08324184, "memory(GiB)": 368.61, "step": 11890, "train_speed(iter/s)": 0.20302 }, { "acc": 0.74822397, "epoch": 0.3017503805175038, "grad_norm": 2.0625, "learning_rate": 9.724367680495432e-06, "loss": 1.03268566, "memory(GiB)": 368.61, "step": 11895, "train_speed(iter/s)": 0.203034 }, { "acc": 0.75791292, "epoch": 0.30187721968543885, "grad_norm": 2.015625, "learning_rate": 9.724024219989198e-06, "loss": 0.95891943, "memory(GiB)": 368.61, "step": 11900, "train_speed(iter/s)": 0.203041 }, { "acc": 0.73665948, "epoch": 0.30200405885337395, "grad_norm": 2.546875, "learning_rate": 9.723680551699286e-06, "loss": 1.0252224, "memory(GiB)": 368.61, "step": 11905, "train_speed(iter/s)": 0.203044 }, { "acc": 0.73782215, "epoch": 0.302130898021309, "grad_norm": 2.484375, "learning_rate": 9.723336675640815e-06, "loss": 1.06029186, "memory(GiB)": 368.61, "step": 11910, "train_speed(iter/s)": 0.203053 }, { "acc": 0.74287405, "epoch": 0.30225773718924404, "grad_norm": 2.0625, "learning_rate": 9.722992591828908e-06, "loss": 1.03502254, "memory(GiB)": 368.61, "step": 11915, "train_speed(iter/s)": 0.203062 }, { "acc": 0.75261183, "epoch": 0.3023845763571791, "grad_norm": 2.28125, "learning_rate": 9.722648300278701e-06, "loss": 0.94986095, "memory(GiB)": 368.61, "step": 11920, "train_speed(iter/s)": 0.203069 }, { "acc": 0.74882288, "epoch": 0.3025114155251142, "grad_norm": 2.265625, "learning_rate": 9.722303801005338e-06, "loss": 1.04705477, "memory(GiB)": 368.61, "step": 11925, "train_speed(iter/s)": 0.203077 }, { "acc": 0.74111481, "epoch": 0.3026382546930492, "grad_norm": 2.453125, "learning_rate": 9.721959094023968e-06, "loss": 1.07254696, "memory(GiB)": 368.61, "step": 11930, "train_speed(iter/s)": 0.203081 }, { "acc": 0.76543159, "epoch": 0.30276509386098427, "grad_norm": 2.515625, "learning_rate": 9.721614179349754e-06, "loss": 0.96276741, "memory(GiB)": 368.61, "step": 11935, "train_speed(iter/s)": 0.203097 }, { "acc": 0.75506926, "epoch": 0.3028919330289193, "grad_norm": 2.359375, "learning_rate": 9.72126905699787e-06, "loss": 1.05141239, "memory(GiB)": 368.61, "step": 11940, "train_speed(iter/s)": 0.203114 }, { "acc": 0.73638802, "epoch": 0.3030187721968544, "grad_norm": 1.8515625, "learning_rate": 9.720923726983493e-06, "loss": 0.99619942, "memory(GiB)": 368.61, "step": 11945, "train_speed(iter/s)": 0.203123 }, { "acc": 0.72888279, "epoch": 0.30314561136478946, "grad_norm": 1.8203125, "learning_rate": 9.720578189321814e-06, "loss": 1.05271492, "memory(GiB)": 368.61, "step": 11950, "train_speed(iter/s)": 0.203136 }, { "acc": 0.75050902, "epoch": 0.3032724505327245, "grad_norm": 2.5625, "learning_rate": 9.72023244402803e-06, "loss": 1.03441687, "memory(GiB)": 368.61, "step": 11955, "train_speed(iter/s)": 0.203141 }, { "acc": 0.74564571, "epoch": 0.30339928970065955, "grad_norm": 2.15625, "learning_rate": 9.719886491117348e-06, "loss": 1.03334522, "memory(GiB)": 368.61, "step": 11960, "train_speed(iter/s)": 0.203154 }, { "acc": 0.74933734, "epoch": 0.30352612886859465, "grad_norm": 2.359375, "learning_rate": 9.719540330604986e-06, "loss": 0.98771858, "memory(GiB)": 368.61, "step": 11965, "train_speed(iter/s)": 0.203163 }, { "acc": 0.748593, "epoch": 0.3036529680365297, "grad_norm": 2.265625, "learning_rate": 9.71919396250617e-06, "loss": 1.06156502, "memory(GiB)": 368.61, "step": 11970, "train_speed(iter/s)": 0.203178 }, { "acc": 0.72736058, "epoch": 0.30377980720446474, "grad_norm": 2.109375, "learning_rate": 9.718847386836131e-06, "loss": 1.09228649, "memory(GiB)": 368.61, "step": 11975, "train_speed(iter/s)": 0.203187 }, { "acc": 0.7400219, "epoch": 0.3039066463723998, "grad_norm": 2.234375, "learning_rate": 9.718500603610119e-06, "loss": 1.06428928, "memory(GiB)": 368.61, "step": 11980, "train_speed(iter/s)": 0.203202 }, { "acc": 0.72974644, "epoch": 0.3040334855403349, "grad_norm": 2.703125, "learning_rate": 9.718153612843382e-06, "loss": 1.03328142, "memory(GiB)": 368.61, "step": 11985, "train_speed(iter/s)": 0.203218 }, { "acc": 0.72957478, "epoch": 0.3041603247082699, "grad_norm": 1.9296875, "learning_rate": 9.717806414551186e-06, "loss": 1.08258266, "memory(GiB)": 368.61, "step": 11990, "train_speed(iter/s)": 0.203216 }, { "acc": 0.74426641, "epoch": 0.30428716387620497, "grad_norm": 2.953125, "learning_rate": 9.717459008748798e-06, "loss": 1.05001774, "memory(GiB)": 368.61, "step": 11995, "train_speed(iter/s)": 0.203228 }, { "acc": 0.73968673, "epoch": 0.30441400304414, "grad_norm": 2.34375, "learning_rate": 9.717111395451501e-06, "loss": 0.99608803, "memory(GiB)": 368.61, "step": 12000, "train_speed(iter/s)": 0.203245 }, { "epoch": 0.30441400304414, "eval_acc": 0.7311826115219389, "eval_loss": 1.0046793222427368, "eval_runtime": 384.4882, "eval_samples_per_second": 16.567, "eval_steps_per_second": 8.284, "step": 12000 }, { "acc": 0.72915745, "epoch": 0.3045408422120751, "grad_norm": 1.9921875, "learning_rate": 9.716763574674586e-06, "loss": 1.10215607, "memory(GiB)": 368.61, "step": 12005, "train_speed(iter/s)": 0.200839 }, { "acc": 0.7546114, "epoch": 0.30466768138001016, "grad_norm": 2.078125, "learning_rate": 9.71641554643335e-06, "loss": 0.9679184, "memory(GiB)": 368.61, "step": 12010, "train_speed(iter/s)": 0.200848 }, { "acc": 0.74297366, "epoch": 0.3047945205479452, "grad_norm": 2.046875, "learning_rate": 9.7160673107431e-06, "loss": 0.97850895, "memory(GiB)": 368.61, "step": 12015, "train_speed(iter/s)": 0.200861 }, { "acc": 0.7201581, "epoch": 0.30492135971588025, "grad_norm": 2.625, "learning_rate": 9.715718867619155e-06, "loss": 1.10855236, "memory(GiB)": 368.61, "step": 12020, "train_speed(iter/s)": 0.200867 }, { "acc": 0.74254131, "epoch": 0.30504819888381535, "grad_norm": 1.9921875, "learning_rate": 9.715370217076838e-06, "loss": 1.00877228, "memory(GiB)": 368.61, "step": 12025, "train_speed(iter/s)": 0.200876 }, { "acc": 0.73817186, "epoch": 0.3051750380517504, "grad_norm": 2.171875, "learning_rate": 9.715021359131489e-06, "loss": 1.06307755, "memory(GiB)": 368.61, "step": 12030, "train_speed(iter/s)": 0.200889 }, { "acc": 0.73998928, "epoch": 0.30530187721968544, "grad_norm": 2.3125, "learning_rate": 9.714672293798449e-06, "loss": 1.06279964, "memory(GiB)": 368.61, "step": 12035, "train_speed(iter/s)": 0.2009 }, { "acc": 0.73171358, "epoch": 0.3054287163876205, "grad_norm": 2.03125, "learning_rate": 9.71432302109307e-06, "loss": 1.0673439, "memory(GiB)": 368.61, "step": 12040, "train_speed(iter/s)": 0.200912 }, { "acc": 0.73607349, "epoch": 0.3055555555555556, "grad_norm": 2.4375, "learning_rate": 9.713973541030716e-06, "loss": 1.00378036, "memory(GiB)": 368.61, "step": 12045, "train_speed(iter/s)": 0.200908 }, { "acc": 0.74690223, "epoch": 0.3056823947234906, "grad_norm": 2.171875, "learning_rate": 9.713623853626763e-06, "loss": 0.99272118, "memory(GiB)": 368.61, "step": 12050, "train_speed(iter/s)": 0.200924 }, { "acc": 0.7480967, "epoch": 0.30580923389142567, "grad_norm": 2.234375, "learning_rate": 9.713273958896586e-06, "loss": 1.02720356, "memory(GiB)": 368.61, "step": 12055, "train_speed(iter/s)": 0.200936 }, { "acc": 0.75161171, "epoch": 0.3059360730593607, "grad_norm": 1.8828125, "learning_rate": 9.712923856855578e-06, "loss": 0.98730278, "memory(GiB)": 368.61, "step": 12060, "train_speed(iter/s)": 0.20095 }, { "acc": 0.73553991, "epoch": 0.3060629122272958, "grad_norm": 2.46875, "learning_rate": 9.712573547519134e-06, "loss": 1.05032005, "memory(GiB)": 368.61, "step": 12065, "train_speed(iter/s)": 0.200965 }, { "acc": 0.72572441, "epoch": 0.30618975139523086, "grad_norm": 2.640625, "learning_rate": 9.712223030902668e-06, "loss": 1.1106884, "memory(GiB)": 368.61, "step": 12070, "train_speed(iter/s)": 0.200978 }, { "acc": 0.74918928, "epoch": 0.3063165905631659, "grad_norm": 2.40625, "learning_rate": 9.711872307021594e-06, "loss": 1.02449284, "memory(GiB)": 368.61, "step": 12075, "train_speed(iter/s)": 0.200988 }, { "acc": 0.74202042, "epoch": 0.30644342973110095, "grad_norm": 2.59375, "learning_rate": 9.711521375891339e-06, "loss": 1.06104851, "memory(GiB)": 368.61, "step": 12080, "train_speed(iter/s)": 0.200999 }, { "acc": 0.74228649, "epoch": 0.30657026889903605, "grad_norm": 2.40625, "learning_rate": 9.71117023752734e-06, "loss": 1.05541039, "memory(GiB)": 368.61, "step": 12085, "train_speed(iter/s)": 0.201014 }, { "acc": 0.73576641, "epoch": 0.3066971080669711, "grad_norm": 2.203125, "learning_rate": 9.710818891945037e-06, "loss": 1.01382322, "memory(GiB)": 368.61, "step": 12090, "train_speed(iter/s)": 0.200997 }, { "acc": 0.74816484, "epoch": 0.30682394723490614, "grad_norm": 2.171875, "learning_rate": 9.71046733915989e-06, "loss": 1.03115282, "memory(GiB)": 368.61, "step": 12095, "train_speed(iter/s)": 0.201004 }, { "acc": 0.74141774, "epoch": 0.3069507864028412, "grad_norm": 2.171875, "learning_rate": 9.710115579187356e-06, "loss": 1.03782349, "memory(GiB)": 368.61, "step": 12100, "train_speed(iter/s)": 0.201018 }, { "acc": 0.73578844, "epoch": 0.3070776255707763, "grad_norm": 2.5, "learning_rate": 9.709763612042911e-06, "loss": 1.08434811, "memory(GiB)": 368.61, "step": 12105, "train_speed(iter/s)": 0.201032 }, { "acc": 0.74506717, "epoch": 0.3072044647387113, "grad_norm": 2.09375, "learning_rate": 9.709411437742035e-06, "loss": 0.95879192, "memory(GiB)": 368.61, "step": 12110, "train_speed(iter/s)": 0.201034 }, { "acc": 0.75554128, "epoch": 0.30733130390664637, "grad_norm": 2.734375, "learning_rate": 9.709059056300218e-06, "loss": 0.96359682, "memory(GiB)": 368.61, "step": 12115, "train_speed(iter/s)": 0.201033 }, { "acc": 0.7416811, "epoch": 0.3074581430745814, "grad_norm": 2.203125, "learning_rate": 9.708706467732958e-06, "loss": 1.04480381, "memory(GiB)": 368.61, "step": 12120, "train_speed(iter/s)": 0.201028 }, { "acc": 0.73514495, "epoch": 0.3075849822425165, "grad_norm": 2.1875, "learning_rate": 9.708353672055766e-06, "loss": 1.08851547, "memory(GiB)": 368.61, "step": 12125, "train_speed(iter/s)": 0.201041 }, { "acc": 0.74163208, "epoch": 0.30771182141045156, "grad_norm": 2.46875, "learning_rate": 9.708000669284158e-06, "loss": 1.08586283, "memory(GiB)": 368.61, "step": 12130, "train_speed(iter/s)": 0.201057 }, { "acc": 0.74365025, "epoch": 0.3078386605783866, "grad_norm": 2.390625, "learning_rate": 9.707647459433661e-06, "loss": 1.06313791, "memory(GiB)": 368.61, "step": 12135, "train_speed(iter/s)": 0.201076 }, { "acc": 0.74174156, "epoch": 0.30796549974632165, "grad_norm": 2.25, "learning_rate": 9.707294042519808e-06, "loss": 1.01591558, "memory(GiB)": 368.61, "step": 12140, "train_speed(iter/s)": 0.201087 }, { "acc": 0.74654036, "epoch": 0.30809233891425675, "grad_norm": 2.140625, "learning_rate": 9.70694041855815e-06, "loss": 1.09124203, "memory(GiB)": 368.61, "step": 12145, "train_speed(iter/s)": 0.201095 }, { "acc": 0.73298912, "epoch": 0.3082191780821918, "grad_norm": 1.9140625, "learning_rate": 9.706586587564236e-06, "loss": 1.08899841, "memory(GiB)": 368.61, "step": 12150, "train_speed(iter/s)": 0.201107 }, { "acc": 0.73756533, "epoch": 0.30834601725012684, "grad_norm": 2.328125, "learning_rate": 9.70623254955363e-06, "loss": 1.04270592, "memory(GiB)": 368.61, "step": 12155, "train_speed(iter/s)": 0.201116 }, { "acc": 0.76472392, "epoch": 0.3084728564180619, "grad_norm": 3.25, "learning_rate": 9.705878304541905e-06, "loss": 0.98217449, "memory(GiB)": 368.61, "step": 12160, "train_speed(iter/s)": 0.201131 }, { "acc": 0.74390092, "epoch": 0.308599695585997, "grad_norm": 2.046875, "learning_rate": 9.705523852544643e-06, "loss": 1.02480755, "memory(GiB)": 368.61, "step": 12165, "train_speed(iter/s)": 0.201143 }, { "acc": 0.73323212, "epoch": 0.308726534753932, "grad_norm": 2.703125, "learning_rate": 9.705169193577434e-06, "loss": 1.10353336, "memory(GiB)": 368.61, "step": 12170, "train_speed(iter/s)": 0.201148 }, { "acc": 0.75863719, "epoch": 0.30885337392186707, "grad_norm": 2.375, "learning_rate": 9.704814327655874e-06, "loss": 0.98420401, "memory(GiB)": 368.61, "step": 12175, "train_speed(iter/s)": 0.20116 }, { "acc": 0.75186148, "epoch": 0.3089802130898021, "grad_norm": 2.25, "learning_rate": 9.704459254795575e-06, "loss": 0.99418087, "memory(GiB)": 368.61, "step": 12180, "train_speed(iter/s)": 0.201169 }, { "acc": 0.73386335, "epoch": 0.3091070522577372, "grad_norm": 1.90625, "learning_rate": 9.704103975012155e-06, "loss": 1.04547167, "memory(GiB)": 368.61, "step": 12185, "train_speed(iter/s)": 0.201181 }, { "acc": 0.74225883, "epoch": 0.30923389142567226, "grad_norm": 2.1875, "learning_rate": 9.703748488321238e-06, "loss": 0.97502174, "memory(GiB)": 368.61, "step": 12190, "train_speed(iter/s)": 0.201194 }, { "acc": 0.76106396, "epoch": 0.3093607305936073, "grad_norm": 2.109375, "learning_rate": 9.703392794738464e-06, "loss": 0.98013992, "memory(GiB)": 368.61, "step": 12195, "train_speed(iter/s)": 0.201202 }, { "acc": 0.74240489, "epoch": 0.30948756976154235, "grad_norm": 2.296875, "learning_rate": 9.703036894279476e-06, "loss": 1.06364679, "memory(GiB)": 368.61, "step": 12200, "train_speed(iter/s)": 0.201216 }, { "acc": 0.76128168, "epoch": 0.30961440892947745, "grad_norm": 2.484375, "learning_rate": 9.702680786959925e-06, "loss": 0.98354263, "memory(GiB)": 368.61, "step": 12205, "train_speed(iter/s)": 0.201225 }, { "acc": 0.74546385, "epoch": 0.3097412480974125, "grad_norm": 2.046875, "learning_rate": 9.702324472795481e-06, "loss": 1.037115, "memory(GiB)": 368.61, "step": 12210, "train_speed(iter/s)": 0.201234 }, { "acc": 0.73909907, "epoch": 0.30986808726534754, "grad_norm": 2.140625, "learning_rate": 9.70196795180181e-06, "loss": 1.03076134, "memory(GiB)": 368.61, "step": 12215, "train_speed(iter/s)": 0.201247 }, { "acc": 0.75152998, "epoch": 0.3099949264332826, "grad_norm": 2.359375, "learning_rate": 9.701611223994596e-06, "loss": 1.03261127, "memory(GiB)": 368.61, "step": 12220, "train_speed(iter/s)": 0.20126 }, { "acc": 0.7359498, "epoch": 0.3101217656012177, "grad_norm": 2.09375, "learning_rate": 9.701254289389529e-06, "loss": 0.99564171, "memory(GiB)": 368.61, "step": 12225, "train_speed(iter/s)": 0.201273 }, { "acc": 0.75102663, "epoch": 0.3102486047691527, "grad_norm": 2.078125, "learning_rate": 9.700897148002308e-06, "loss": 1.04806557, "memory(GiB)": 368.61, "step": 12230, "train_speed(iter/s)": 0.201282 }, { "acc": 0.74403543, "epoch": 0.31037544393708777, "grad_norm": 2.171875, "learning_rate": 9.700539799848645e-06, "loss": 1.07735691, "memory(GiB)": 368.61, "step": 12235, "train_speed(iter/s)": 0.201296 }, { "acc": 0.74169364, "epoch": 0.3105022831050228, "grad_norm": 2.03125, "learning_rate": 9.700182244944252e-06, "loss": 1.02615299, "memory(GiB)": 368.61, "step": 12240, "train_speed(iter/s)": 0.201309 }, { "acc": 0.73061056, "epoch": 0.3106291222729579, "grad_norm": 2.171875, "learning_rate": 9.69982448330486e-06, "loss": 1.07951202, "memory(GiB)": 368.61, "step": 12245, "train_speed(iter/s)": 0.201325 }, { "acc": 0.73320608, "epoch": 0.31075596144089296, "grad_norm": 2.359375, "learning_rate": 9.699466514946206e-06, "loss": 1.12621326, "memory(GiB)": 368.61, "step": 12250, "train_speed(iter/s)": 0.201334 }, { "acc": 0.75305815, "epoch": 0.310882800608828, "grad_norm": 2.453125, "learning_rate": 9.699108339884032e-06, "loss": 0.9763133, "memory(GiB)": 368.61, "step": 12255, "train_speed(iter/s)": 0.201342 }, { "acc": 0.72815208, "epoch": 0.31100963977676305, "grad_norm": 1.9765625, "learning_rate": 9.698749958134093e-06, "loss": 1.08226929, "memory(GiB)": 368.61, "step": 12260, "train_speed(iter/s)": 0.201356 }, { "acc": 0.73480024, "epoch": 0.31113647894469815, "grad_norm": 2.328125, "learning_rate": 9.698391369712152e-06, "loss": 1.1034605, "memory(GiB)": 368.61, "step": 12265, "train_speed(iter/s)": 0.20137 }, { "acc": 0.74526052, "epoch": 0.3112633181126332, "grad_norm": 2.203125, "learning_rate": 9.698032574633982e-06, "loss": 1.02850809, "memory(GiB)": 368.61, "step": 12270, "train_speed(iter/s)": 0.201385 }, { "acc": 0.74261637, "epoch": 0.31139015728056824, "grad_norm": 1.90625, "learning_rate": 9.697673572915364e-06, "loss": 1.02492428, "memory(GiB)": 368.61, "step": 12275, "train_speed(iter/s)": 0.201403 }, { "acc": 0.73347683, "epoch": 0.3115169964485033, "grad_norm": 2.34375, "learning_rate": 9.697314364572087e-06, "loss": 1.06730642, "memory(GiB)": 368.61, "step": 12280, "train_speed(iter/s)": 0.201416 }, { "acc": 0.74387016, "epoch": 0.3116438356164384, "grad_norm": 2.203125, "learning_rate": 9.696954949619955e-06, "loss": 1.05752277, "memory(GiB)": 368.61, "step": 12285, "train_speed(iter/s)": 0.20143 }, { "acc": 0.74379587, "epoch": 0.3117706747843734, "grad_norm": 1.921875, "learning_rate": 9.696595328074774e-06, "loss": 1.02569551, "memory(GiB)": 368.61, "step": 12290, "train_speed(iter/s)": 0.201441 }, { "acc": 0.74676886, "epoch": 0.31189751395230847, "grad_norm": 2.25, "learning_rate": 9.69623549995236e-06, "loss": 1.05847044, "memory(GiB)": 368.61, "step": 12295, "train_speed(iter/s)": 0.20146 }, { "acc": 0.73567114, "epoch": 0.3120243531202435, "grad_norm": 2.328125, "learning_rate": 9.695875465268543e-06, "loss": 1.0761755, "memory(GiB)": 368.61, "step": 12300, "train_speed(iter/s)": 0.201478 }, { "acc": 0.7526144, "epoch": 0.3121511922881786, "grad_norm": 2.59375, "learning_rate": 9.695515224039156e-06, "loss": 1.03322926, "memory(GiB)": 368.61, "step": 12305, "train_speed(iter/s)": 0.201489 }, { "acc": 0.74471121, "epoch": 0.31227803145611366, "grad_norm": 2.40625, "learning_rate": 9.695154776280047e-06, "loss": 1.02185249, "memory(GiB)": 368.61, "step": 12310, "train_speed(iter/s)": 0.201504 }, { "acc": 0.73514071, "epoch": 0.3124048706240487, "grad_norm": 2.28125, "learning_rate": 9.694794122007067e-06, "loss": 1.06493292, "memory(GiB)": 368.61, "step": 12315, "train_speed(iter/s)": 0.201515 }, { "acc": 0.74278803, "epoch": 0.31253170979198375, "grad_norm": 2.140625, "learning_rate": 9.694433261236083e-06, "loss": 1.0250248, "memory(GiB)": 368.61, "step": 12320, "train_speed(iter/s)": 0.20153 }, { "acc": 0.73962727, "epoch": 0.31265854895991885, "grad_norm": 2.265625, "learning_rate": 9.694072193982962e-06, "loss": 1.07790785, "memory(GiB)": 368.61, "step": 12325, "train_speed(iter/s)": 0.20154 }, { "acc": 0.74732494, "epoch": 0.3127853881278539, "grad_norm": 2.328125, "learning_rate": 9.69371092026359e-06, "loss": 1.05409241, "memory(GiB)": 368.61, "step": 12330, "train_speed(iter/s)": 0.201554 }, { "acc": 0.75859275, "epoch": 0.31291222729578894, "grad_norm": 2.1875, "learning_rate": 9.693349440093855e-06, "loss": 0.99201536, "memory(GiB)": 368.61, "step": 12335, "train_speed(iter/s)": 0.201562 }, { "acc": 0.75276155, "epoch": 0.313039066463724, "grad_norm": 2.0625, "learning_rate": 9.69298775348966e-06, "loss": 0.97867336, "memory(GiB)": 368.61, "step": 12340, "train_speed(iter/s)": 0.20157 }, { "acc": 0.74567647, "epoch": 0.3131659056316591, "grad_norm": 2.34375, "learning_rate": 9.69262586046691e-06, "loss": 1.03896685, "memory(GiB)": 368.61, "step": 12345, "train_speed(iter/s)": 0.201582 }, { "acc": 0.74963703, "epoch": 0.3132927447995941, "grad_norm": 2.28125, "learning_rate": 9.692263761041521e-06, "loss": 1.03209782, "memory(GiB)": 368.61, "step": 12350, "train_speed(iter/s)": 0.201597 }, { "acc": 0.74322581, "epoch": 0.31341958396752917, "grad_norm": 1.765625, "learning_rate": 9.691901455229425e-06, "loss": 1.01594887, "memory(GiB)": 368.61, "step": 12355, "train_speed(iter/s)": 0.201609 }, { "acc": 0.7363328, "epoch": 0.3135464231354642, "grad_norm": 2.03125, "learning_rate": 9.691538943046552e-06, "loss": 1.05196457, "memory(GiB)": 368.61, "step": 12360, "train_speed(iter/s)": 0.201627 }, { "acc": 0.74094958, "epoch": 0.3136732623033993, "grad_norm": 1.9921875, "learning_rate": 9.691176224508853e-06, "loss": 1.06456976, "memory(GiB)": 368.61, "step": 12365, "train_speed(iter/s)": 0.201636 }, { "acc": 0.74187689, "epoch": 0.31380010147133436, "grad_norm": 2.21875, "learning_rate": 9.690813299632278e-06, "loss": 1.01403828, "memory(GiB)": 368.61, "step": 12370, "train_speed(iter/s)": 0.201652 }, { "acc": 0.74393053, "epoch": 0.3139269406392694, "grad_norm": 2.046875, "learning_rate": 9.690450168432793e-06, "loss": 1.0438364, "memory(GiB)": 368.61, "step": 12375, "train_speed(iter/s)": 0.201666 }, { "acc": 0.73368273, "epoch": 0.31405377980720445, "grad_norm": 1.9921875, "learning_rate": 9.690086830926366e-06, "loss": 1.03836784, "memory(GiB)": 368.61, "step": 12380, "train_speed(iter/s)": 0.201682 }, { "acc": 0.72420244, "epoch": 0.31418061897513955, "grad_norm": 2.03125, "learning_rate": 9.689723287128981e-06, "loss": 1.06700621, "memory(GiB)": 368.61, "step": 12385, "train_speed(iter/s)": 0.201692 }, { "acc": 0.74849272, "epoch": 0.3143074581430746, "grad_norm": 2.265625, "learning_rate": 9.689359537056628e-06, "loss": 1.02271767, "memory(GiB)": 368.61, "step": 12390, "train_speed(iter/s)": 0.201707 }, { "acc": 0.74734669, "epoch": 0.31443429731100964, "grad_norm": 2.140625, "learning_rate": 9.688995580725304e-06, "loss": 1.00708523, "memory(GiB)": 368.61, "step": 12395, "train_speed(iter/s)": 0.201723 }, { "acc": 0.75381656, "epoch": 0.3145611364789447, "grad_norm": 3.109375, "learning_rate": 9.688631418151022e-06, "loss": 0.96705914, "memory(GiB)": 368.61, "step": 12400, "train_speed(iter/s)": 0.201736 }, { "acc": 0.74873829, "epoch": 0.3146879756468798, "grad_norm": 2.375, "learning_rate": 9.688267049349796e-06, "loss": 0.99723034, "memory(GiB)": 368.61, "step": 12405, "train_speed(iter/s)": 0.201749 }, { "acc": 0.73830233, "epoch": 0.3148148148148148, "grad_norm": 2.046875, "learning_rate": 9.687902474337654e-06, "loss": 1.09916172, "memory(GiB)": 368.61, "step": 12410, "train_speed(iter/s)": 0.201763 }, { "acc": 0.74115853, "epoch": 0.31494165398274987, "grad_norm": 2.015625, "learning_rate": 9.687537693130631e-06, "loss": 1.00360699, "memory(GiB)": 368.61, "step": 12415, "train_speed(iter/s)": 0.20178 }, { "acc": 0.73380666, "epoch": 0.3150684931506849, "grad_norm": 2.28125, "learning_rate": 9.687172705744773e-06, "loss": 1.05038261, "memory(GiB)": 368.61, "step": 12420, "train_speed(iter/s)": 0.201786 }, { "acc": 0.71675301, "epoch": 0.31519533231862, "grad_norm": 2.15625, "learning_rate": 9.686807512196132e-06, "loss": 1.09422741, "memory(GiB)": 368.61, "step": 12425, "train_speed(iter/s)": 0.201794 }, { "acc": 0.73898726, "epoch": 0.31532217148655506, "grad_norm": 2.390625, "learning_rate": 9.68644211250077e-06, "loss": 1.01985321, "memory(GiB)": 368.61, "step": 12430, "train_speed(iter/s)": 0.201808 }, { "acc": 0.72590685, "epoch": 0.3154490106544901, "grad_norm": 2.171875, "learning_rate": 9.686076506674761e-06, "loss": 1.05372372, "memory(GiB)": 368.61, "step": 12435, "train_speed(iter/s)": 0.201823 }, { "acc": 0.73917618, "epoch": 0.31557584982242515, "grad_norm": 1.984375, "learning_rate": 9.685710694734187e-06, "loss": 1.02920465, "memory(GiB)": 368.61, "step": 12440, "train_speed(iter/s)": 0.201832 }, { "acc": 0.7419282, "epoch": 0.31570268899036025, "grad_norm": 2.296875, "learning_rate": 9.685344676695135e-06, "loss": 1.01048031, "memory(GiB)": 368.61, "step": 12445, "train_speed(iter/s)": 0.201847 }, { "acc": 0.74018297, "epoch": 0.3158295281582953, "grad_norm": 2.34375, "learning_rate": 9.684978452573706e-06, "loss": 1.05546513, "memory(GiB)": 368.61, "step": 12450, "train_speed(iter/s)": 0.201859 }, { "acc": 0.74742866, "epoch": 0.31595636732623034, "grad_norm": 2.3125, "learning_rate": 9.684612022386008e-06, "loss": 0.99383812, "memory(GiB)": 368.61, "step": 12455, "train_speed(iter/s)": 0.201856 }, { "acc": 0.73351178, "epoch": 0.3160832064941654, "grad_norm": 1.96875, "learning_rate": 9.68424538614816e-06, "loss": 1.04361401, "memory(GiB)": 368.61, "step": 12460, "train_speed(iter/s)": 0.201871 }, { "acc": 0.74372139, "epoch": 0.3162100456621005, "grad_norm": 2.28125, "learning_rate": 9.683878543876281e-06, "loss": 1.04302883, "memory(GiB)": 368.61, "step": 12465, "train_speed(iter/s)": 0.201887 }, { "acc": 0.74789505, "epoch": 0.3163368848300355, "grad_norm": 2.359375, "learning_rate": 9.683511495586516e-06, "loss": 1.02000933, "memory(GiB)": 368.61, "step": 12470, "train_speed(iter/s)": 0.201904 }, { "acc": 0.74414291, "epoch": 0.31646372399797057, "grad_norm": 2.21875, "learning_rate": 9.683144241295003e-06, "loss": 0.9895771, "memory(GiB)": 368.61, "step": 12475, "train_speed(iter/s)": 0.201905 }, { "acc": 0.74147973, "epoch": 0.3165905631659056, "grad_norm": 2.59375, "learning_rate": 9.682776781017899e-06, "loss": 1.03061523, "memory(GiB)": 368.61, "step": 12480, "train_speed(iter/s)": 0.201917 }, { "acc": 0.73105221, "epoch": 0.3167174023338407, "grad_norm": 2.3125, "learning_rate": 9.682409114771364e-06, "loss": 1.04265327, "memory(GiB)": 368.61, "step": 12485, "train_speed(iter/s)": 0.201926 }, { "acc": 0.72562838, "epoch": 0.31684424150177576, "grad_norm": 1.8984375, "learning_rate": 9.682041242571571e-06, "loss": 1.02623463, "memory(GiB)": 368.61, "step": 12490, "train_speed(iter/s)": 0.201938 }, { "acc": 0.74190469, "epoch": 0.3169710806697108, "grad_norm": 2.5625, "learning_rate": 9.681673164434701e-06, "loss": 1.04514542, "memory(GiB)": 368.61, "step": 12495, "train_speed(iter/s)": 0.201955 }, { "acc": 0.75462503, "epoch": 0.31709791983764585, "grad_norm": 1.96875, "learning_rate": 9.681304880376942e-06, "loss": 0.98751259, "memory(GiB)": 368.61, "step": 12500, "train_speed(iter/s)": 0.201966 }, { "acc": 0.73534083, "epoch": 0.31722475900558095, "grad_norm": 2.296875, "learning_rate": 9.680936390414495e-06, "loss": 1.11555214, "memory(GiB)": 368.61, "step": 12505, "train_speed(iter/s)": 0.201982 }, { "acc": 0.74444628, "epoch": 0.317351598173516, "grad_norm": 2.484375, "learning_rate": 9.680567694563566e-06, "loss": 1.05412483, "memory(GiB)": 368.61, "step": 12510, "train_speed(iter/s)": 0.201989 }, { "acc": 0.74479613, "epoch": 0.31747843734145104, "grad_norm": 2.34375, "learning_rate": 9.680198792840371e-06, "loss": 1.04851255, "memory(GiB)": 368.61, "step": 12515, "train_speed(iter/s)": 0.201994 }, { "acc": 0.73538628, "epoch": 0.3176052765093861, "grad_norm": 2.265625, "learning_rate": 9.67982968526114e-06, "loss": 1.02518864, "memory(GiB)": 368.61, "step": 12520, "train_speed(iter/s)": 0.202003 }, { "acc": 0.75606747, "epoch": 0.3177321156773212, "grad_norm": 2.125, "learning_rate": 9.679460371842104e-06, "loss": 1.02099028, "memory(GiB)": 368.61, "step": 12525, "train_speed(iter/s)": 0.202009 }, { "acc": 0.73065534, "epoch": 0.3178589548452562, "grad_norm": 2.125, "learning_rate": 9.679090852599508e-06, "loss": 1.06017818, "memory(GiB)": 368.61, "step": 12530, "train_speed(iter/s)": 0.202021 }, { "acc": 0.73153586, "epoch": 0.31798579401319127, "grad_norm": 2.421875, "learning_rate": 9.678721127549608e-06, "loss": 1.05897713, "memory(GiB)": 368.61, "step": 12535, "train_speed(iter/s)": 0.202035 }, { "acc": 0.72919044, "epoch": 0.3181126331811263, "grad_norm": 2.671875, "learning_rate": 9.678351196708662e-06, "loss": 1.10896378, "memory(GiB)": 368.61, "step": 12540, "train_speed(iter/s)": 0.202047 }, { "acc": 0.74481001, "epoch": 0.3182394723490614, "grad_norm": 2.421875, "learning_rate": 9.677981060092943e-06, "loss": 1.07084217, "memory(GiB)": 368.61, "step": 12545, "train_speed(iter/s)": 0.202058 }, { "acc": 0.72348089, "epoch": 0.31836631151699646, "grad_norm": 1.8359375, "learning_rate": 9.677610717718732e-06, "loss": 1.0929163, "memory(GiB)": 368.61, "step": 12550, "train_speed(iter/s)": 0.202064 }, { "acc": 0.7187665, "epoch": 0.3184931506849315, "grad_norm": 2.6875, "learning_rate": 9.677240169602317e-06, "loss": 1.12490482, "memory(GiB)": 368.61, "step": 12555, "train_speed(iter/s)": 0.202078 }, { "acc": 0.74213295, "epoch": 0.31861998985286655, "grad_norm": 2.640625, "learning_rate": 9.676869415759999e-06, "loss": 1.05806446, "memory(GiB)": 368.61, "step": 12560, "train_speed(iter/s)": 0.20208 }, { "acc": 0.73694992, "epoch": 0.31874682902080165, "grad_norm": 2.21875, "learning_rate": 9.67649845620808e-06, "loss": 1.10861931, "memory(GiB)": 368.61, "step": 12565, "train_speed(iter/s)": 0.202095 }, { "acc": 0.74733915, "epoch": 0.3188736681887367, "grad_norm": 1.9765625, "learning_rate": 9.676127290962883e-06, "loss": 1.05710087, "memory(GiB)": 368.61, "step": 12570, "train_speed(iter/s)": 0.202106 }, { "acc": 0.73032713, "epoch": 0.31900050735667174, "grad_norm": 2.359375, "learning_rate": 9.675755920040728e-06, "loss": 1.06624832, "memory(GiB)": 368.61, "step": 12575, "train_speed(iter/s)": 0.202115 }, { "acc": 0.75262928, "epoch": 0.3191273465246068, "grad_norm": 2.140625, "learning_rate": 9.675384343457954e-06, "loss": 1.00965977, "memory(GiB)": 368.61, "step": 12580, "train_speed(iter/s)": 0.202116 }, { "acc": 0.73864107, "epoch": 0.3192541856925419, "grad_norm": 2.1875, "learning_rate": 9.675012561230901e-06, "loss": 1.056073, "memory(GiB)": 368.61, "step": 12585, "train_speed(iter/s)": 0.202131 }, { "acc": 0.74312158, "epoch": 0.3193810248604769, "grad_norm": 2.296875, "learning_rate": 9.674640573375924e-06, "loss": 1.07502823, "memory(GiB)": 368.61, "step": 12590, "train_speed(iter/s)": 0.202145 }, { "acc": 0.74314108, "epoch": 0.31950786402841197, "grad_norm": 2.59375, "learning_rate": 9.674268379909383e-06, "loss": 1.09215679, "memory(GiB)": 368.61, "step": 12595, "train_speed(iter/s)": 0.202153 }, { "acc": 0.74464788, "epoch": 0.319634703196347, "grad_norm": 2.75, "learning_rate": 9.67389598084765e-06, "loss": 1.02694645, "memory(GiB)": 368.61, "step": 12600, "train_speed(iter/s)": 0.202155 }, { "acc": 0.75049076, "epoch": 0.3197615423642821, "grad_norm": 2.625, "learning_rate": 9.673523376207103e-06, "loss": 1.01544018, "memory(GiB)": 368.61, "step": 12605, "train_speed(iter/s)": 0.202168 }, { "acc": 0.7368278, "epoch": 0.31988838153221716, "grad_norm": 2.703125, "learning_rate": 9.673150566004135e-06, "loss": 1.03020287, "memory(GiB)": 368.61, "step": 12610, "train_speed(iter/s)": 0.202182 }, { "acc": 0.7315958, "epoch": 0.3200152207001522, "grad_norm": 2.09375, "learning_rate": 9.672777550255137e-06, "loss": 1.09859638, "memory(GiB)": 368.61, "step": 12615, "train_speed(iter/s)": 0.202193 }, { "acc": 0.75032005, "epoch": 0.32014205986808725, "grad_norm": 2.109375, "learning_rate": 9.672404328976523e-06, "loss": 1.03413372, "memory(GiB)": 368.61, "step": 12620, "train_speed(iter/s)": 0.202202 }, { "acc": 0.75275927, "epoch": 0.32026889903602235, "grad_norm": 2.21875, "learning_rate": 9.672030902184706e-06, "loss": 0.99757843, "memory(GiB)": 368.61, "step": 12625, "train_speed(iter/s)": 0.202211 }, { "acc": 0.73730268, "epoch": 0.3203957382039574, "grad_norm": 2.1875, "learning_rate": 9.671657269896108e-06, "loss": 1.12963257, "memory(GiB)": 368.61, "step": 12630, "train_speed(iter/s)": 0.202218 }, { "acc": 0.75450692, "epoch": 0.32052257737189244, "grad_norm": 2.0, "learning_rate": 9.671283432127169e-06, "loss": 1.03368149, "memory(GiB)": 368.61, "step": 12635, "train_speed(iter/s)": 0.20223 }, { "acc": 0.74564586, "epoch": 0.3206494165398275, "grad_norm": 2.03125, "learning_rate": 9.670909388894328e-06, "loss": 1.02003107, "memory(GiB)": 368.61, "step": 12640, "train_speed(iter/s)": 0.202246 }, { "acc": 0.73762388, "epoch": 0.3207762557077626, "grad_norm": 1.875, "learning_rate": 9.670535140214037e-06, "loss": 1.0650116, "memory(GiB)": 368.61, "step": 12645, "train_speed(iter/s)": 0.202253 }, { "acc": 0.7467783, "epoch": 0.3209030948756976, "grad_norm": 2.296875, "learning_rate": 9.670160686102759e-06, "loss": 1.04099913, "memory(GiB)": 368.61, "step": 12650, "train_speed(iter/s)": 0.202266 }, { "acc": 0.7360857, "epoch": 0.32102993404363267, "grad_norm": 2.125, "learning_rate": 9.669786026576962e-06, "loss": 1.05786991, "memory(GiB)": 368.61, "step": 12655, "train_speed(iter/s)": 0.202269 }, { "acc": 0.74691496, "epoch": 0.3211567732115677, "grad_norm": 2.15625, "learning_rate": 9.669411161653127e-06, "loss": 1.02304039, "memory(GiB)": 368.61, "step": 12660, "train_speed(iter/s)": 0.20228 }, { "acc": 0.73938112, "epoch": 0.3212836123795028, "grad_norm": 2.03125, "learning_rate": 9.669036091347742e-06, "loss": 1.01718121, "memory(GiB)": 368.61, "step": 12665, "train_speed(iter/s)": 0.202295 }, { "acc": 0.74663677, "epoch": 0.32141045154743786, "grad_norm": 2.40625, "learning_rate": 9.668660815677304e-06, "loss": 0.98897038, "memory(GiB)": 368.61, "step": 12670, "train_speed(iter/s)": 0.202305 }, { "acc": 0.75990353, "epoch": 0.3215372907153729, "grad_norm": 2.5625, "learning_rate": 9.668285334658319e-06, "loss": 1.00961018, "memory(GiB)": 368.61, "step": 12675, "train_speed(iter/s)": 0.202311 }, { "acc": 0.73065925, "epoch": 0.32166412988330795, "grad_norm": 2.296875, "learning_rate": 9.667909648307302e-06, "loss": 1.07780209, "memory(GiB)": 368.61, "step": 12680, "train_speed(iter/s)": 0.202322 }, { "acc": 0.76082401, "epoch": 0.32179096905124305, "grad_norm": 2.765625, "learning_rate": 9.66753375664078e-06, "loss": 0.94223003, "memory(GiB)": 368.61, "step": 12685, "train_speed(iter/s)": 0.202332 }, { "acc": 0.73812017, "epoch": 0.3219178082191781, "grad_norm": 2.484375, "learning_rate": 9.667157659675284e-06, "loss": 1.04341278, "memory(GiB)": 368.61, "step": 12690, "train_speed(iter/s)": 0.202344 }, { "acc": 0.74171543, "epoch": 0.32204464738711314, "grad_norm": 2.34375, "learning_rate": 9.666781357427355e-06, "loss": 1.05163193, "memory(GiB)": 368.61, "step": 12695, "train_speed(iter/s)": 0.202355 }, { "acc": 0.7432869, "epoch": 0.3221714865550482, "grad_norm": 2.0625, "learning_rate": 9.666404849913546e-06, "loss": 1.018011, "memory(GiB)": 368.61, "step": 12700, "train_speed(iter/s)": 0.202366 }, { "acc": 0.74732685, "epoch": 0.3222983257229833, "grad_norm": 2.484375, "learning_rate": 9.66602813715042e-06, "loss": 1.00745611, "memory(GiB)": 368.61, "step": 12705, "train_speed(iter/s)": 0.202374 }, { "acc": 0.75170937, "epoch": 0.3224251648909183, "grad_norm": 2.15625, "learning_rate": 9.665651219154543e-06, "loss": 1.07002735, "memory(GiB)": 368.61, "step": 12710, "train_speed(iter/s)": 0.202385 }, { "acc": 0.75345416, "epoch": 0.32255200405885337, "grad_norm": 2.046875, "learning_rate": 9.665274095942495e-06, "loss": 1.08125057, "memory(GiB)": 368.61, "step": 12715, "train_speed(iter/s)": 0.202399 }, { "acc": 0.74425116, "epoch": 0.3226788432267884, "grad_norm": 2.09375, "learning_rate": 9.664896767530862e-06, "loss": 1.06817617, "memory(GiB)": 368.61, "step": 12720, "train_speed(iter/s)": 0.20241 }, { "acc": 0.73739958, "epoch": 0.3228056823947235, "grad_norm": 1.9375, "learning_rate": 9.664519233936242e-06, "loss": 1.06561527, "memory(GiB)": 368.61, "step": 12725, "train_speed(iter/s)": 0.20242 }, { "acc": 0.74345474, "epoch": 0.32293252156265856, "grad_norm": 1.984375, "learning_rate": 9.664141495175242e-06, "loss": 0.98250046, "memory(GiB)": 368.61, "step": 12730, "train_speed(iter/s)": 0.202421 }, { "acc": 0.74882612, "epoch": 0.3230593607305936, "grad_norm": 1.9453125, "learning_rate": 9.663763551264476e-06, "loss": 1.04265709, "memory(GiB)": 368.61, "step": 12735, "train_speed(iter/s)": 0.20243 }, { "acc": 0.74589214, "epoch": 0.32318619989852865, "grad_norm": 2.203125, "learning_rate": 9.663385402220565e-06, "loss": 1.03196535, "memory(GiB)": 368.61, "step": 12740, "train_speed(iter/s)": 0.202439 }, { "acc": 0.74828405, "epoch": 0.32331303906646375, "grad_norm": 2.75, "learning_rate": 9.663007048060144e-06, "loss": 1.1001194, "memory(GiB)": 368.61, "step": 12745, "train_speed(iter/s)": 0.202453 }, { "acc": 0.75262117, "epoch": 0.3234398782343988, "grad_norm": 1.9921875, "learning_rate": 9.662628488799854e-06, "loss": 0.95484333, "memory(GiB)": 368.61, "step": 12750, "train_speed(iter/s)": 0.202465 }, { "acc": 0.73621268, "epoch": 0.32356671740233384, "grad_norm": 2.328125, "learning_rate": 9.662249724456346e-06, "loss": 1.02392321, "memory(GiB)": 368.61, "step": 12755, "train_speed(iter/s)": 0.20248 }, { "acc": 0.74725537, "epoch": 0.3236935565702689, "grad_norm": 2.515625, "learning_rate": 9.661870755046278e-06, "loss": 1.02428694, "memory(GiB)": 368.61, "step": 12760, "train_speed(iter/s)": 0.202493 }, { "acc": 0.7377933, "epoch": 0.323820395738204, "grad_norm": 2.015625, "learning_rate": 9.661491580586322e-06, "loss": 0.99493256, "memory(GiB)": 368.61, "step": 12765, "train_speed(iter/s)": 0.202499 }, { "acc": 0.73497558, "epoch": 0.323947234906139, "grad_norm": 2.484375, "learning_rate": 9.661112201093156e-06, "loss": 1.02276096, "memory(GiB)": 368.61, "step": 12770, "train_speed(iter/s)": 0.202511 }, { "acc": 0.7335598, "epoch": 0.32407407407407407, "grad_norm": 2.390625, "learning_rate": 9.660732616583463e-06, "loss": 1.0770071, "memory(GiB)": 368.61, "step": 12775, "train_speed(iter/s)": 0.202518 }, { "acc": 0.7520844, "epoch": 0.3242009132420091, "grad_norm": 2.265625, "learning_rate": 9.660352827073941e-06, "loss": 1.01068316, "memory(GiB)": 368.61, "step": 12780, "train_speed(iter/s)": 0.202527 }, { "acc": 0.72502241, "epoch": 0.3243277524099442, "grad_norm": 2.265625, "learning_rate": 9.659972832581295e-06, "loss": 1.06932173, "memory(GiB)": 368.61, "step": 12785, "train_speed(iter/s)": 0.202539 }, { "acc": 0.74227328, "epoch": 0.32445459157787926, "grad_norm": 2.015625, "learning_rate": 9.659592633122238e-06, "loss": 1.00463085, "memory(GiB)": 369.4, "step": 12790, "train_speed(iter/s)": 0.202553 }, { "acc": 0.73810406, "epoch": 0.3245814307458143, "grad_norm": 2.578125, "learning_rate": 9.659212228713495e-06, "loss": 1.07858524, "memory(GiB)": 369.4, "step": 12795, "train_speed(iter/s)": 0.202563 }, { "acc": 0.74347649, "epoch": 0.32470826991374935, "grad_norm": 2.25, "learning_rate": 9.658831619371793e-06, "loss": 1.0630394, "memory(GiB)": 369.4, "step": 12800, "train_speed(iter/s)": 0.202572 }, { "acc": 0.73795967, "epoch": 0.32483510908168445, "grad_norm": 2.4375, "learning_rate": 9.658450805113879e-06, "loss": 1.06088619, "memory(GiB)": 369.4, "step": 12805, "train_speed(iter/s)": 0.202588 }, { "acc": 0.7459527, "epoch": 0.3249619482496195, "grad_norm": 1.9609375, "learning_rate": 9.6580697859565e-06, "loss": 1.00645351, "memory(GiB)": 369.4, "step": 12810, "train_speed(iter/s)": 0.202594 }, { "acc": 0.76105371, "epoch": 0.32508878741755454, "grad_norm": 1.9609375, "learning_rate": 9.657688561916414e-06, "loss": 0.93242397, "memory(GiB)": 369.4, "step": 12815, "train_speed(iter/s)": 0.202602 }, { "acc": 0.75091133, "epoch": 0.3252156265854896, "grad_norm": 2.484375, "learning_rate": 9.65730713301039e-06, "loss": 1.00546322, "memory(GiB)": 369.4, "step": 12820, "train_speed(iter/s)": 0.202613 }, { "acc": 0.74219365, "epoch": 0.3253424657534247, "grad_norm": 2.0625, "learning_rate": 9.656925499255206e-06, "loss": 1.03627663, "memory(GiB)": 369.4, "step": 12825, "train_speed(iter/s)": 0.202631 }, { "acc": 0.75110254, "epoch": 0.3254693049213597, "grad_norm": 1.890625, "learning_rate": 9.656543660667646e-06, "loss": 0.99170227, "memory(GiB)": 369.4, "step": 12830, "train_speed(iter/s)": 0.202644 }, { "acc": 0.75130243, "epoch": 0.32559614408929477, "grad_norm": 2.1875, "learning_rate": 9.656161617264507e-06, "loss": 0.98466034, "memory(GiB)": 369.4, "step": 12835, "train_speed(iter/s)": 0.202631 }, { "acc": 0.72985315, "epoch": 0.3257229832572298, "grad_norm": 2.578125, "learning_rate": 9.65577936906259e-06, "loss": 1.02045975, "memory(GiB)": 369.4, "step": 12840, "train_speed(iter/s)": 0.202634 }, { "acc": 0.7281846, "epoch": 0.3258498224251649, "grad_norm": 2.65625, "learning_rate": 9.65539691607871e-06, "loss": 1.05779858, "memory(GiB)": 369.4, "step": 12845, "train_speed(iter/s)": 0.202645 }, { "acc": 0.73493176, "epoch": 0.32597666159309996, "grad_norm": 2.03125, "learning_rate": 9.65501425832969e-06, "loss": 1.08209114, "memory(GiB)": 369.4, "step": 12850, "train_speed(iter/s)": 0.202654 }, { "acc": 0.73928204, "epoch": 0.326103500761035, "grad_norm": 1.9921875, "learning_rate": 9.65463139583236e-06, "loss": 1.03572292, "memory(GiB)": 369.4, "step": 12855, "train_speed(iter/s)": 0.202664 }, { "acc": 0.74014468, "epoch": 0.32623033992897005, "grad_norm": 2.21875, "learning_rate": 9.65424832860356e-06, "loss": 1.03295326, "memory(GiB)": 369.4, "step": 12860, "train_speed(iter/s)": 0.20268 }, { "acc": 0.73994794, "epoch": 0.32635717909690515, "grad_norm": 2.078125, "learning_rate": 9.653865056660136e-06, "loss": 1.0733696, "memory(GiB)": 369.4, "step": 12865, "train_speed(iter/s)": 0.202688 }, { "acc": 0.73973823, "epoch": 0.3264840182648402, "grad_norm": 2.265625, "learning_rate": 9.653481580018951e-06, "loss": 1.05366716, "memory(GiB)": 369.4, "step": 12870, "train_speed(iter/s)": 0.202697 }, { "acc": 0.73778367, "epoch": 0.32661085743277524, "grad_norm": 2.515625, "learning_rate": 9.653097898696869e-06, "loss": 1.08134508, "memory(GiB)": 369.4, "step": 12875, "train_speed(iter/s)": 0.202706 }, { "acc": 0.74132705, "epoch": 0.3267376966007103, "grad_norm": 2.453125, "learning_rate": 9.652714012710766e-06, "loss": 1.07240248, "memory(GiB)": 369.4, "step": 12880, "train_speed(iter/s)": 0.202715 }, { "acc": 0.73632603, "epoch": 0.3268645357686454, "grad_norm": 2.46875, "learning_rate": 9.65232992207753e-06, "loss": 1.08835258, "memory(GiB)": 369.4, "step": 12885, "train_speed(iter/s)": 0.202726 }, { "acc": 0.74165592, "epoch": 0.3269913749365804, "grad_norm": 1.9375, "learning_rate": 9.651945626814052e-06, "loss": 1.06828909, "memory(GiB)": 369.4, "step": 12890, "train_speed(iter/s)": 0.202742 }, { "acc": 0.74806175, "epoch": 0.32711821410451547, "grad_norm": 2.28125, "learning_rate": 9.651561126937236e-06, "loss": 1.00778723, "memory(GiB)": 369.4, "step": 12895, "train_speed(iter/s)": 0.20275 }, { "acc": 0.73483343, "epoch": 0.3272450532724505, "grad_norm": 2.40625, "learning_rate": 9.651176422463994e-06, "loss": 1.09927311, "memory(GiB)": 369.4, "step": 12900, "train_speed(iter/s)": 0.202765 }, { "acc": 0.74979939, "epoch": 0.3273718924403856, "grad_norm": 1.9375, "learning_rate": 9.650791513411246e-06, "loss": 0.97296638, "memory(GiB)": 369.4, "step": 12905, "train_speed(iter/s)": 0.202774 }, { "acc": 0.72967129, "epoch": 0.32749873160832066, "grad_norm": 1.9140625, "learning_rate": 9.650406399795924e-06, "loss": 1.06252632, "memory(GiB)": 369.4, "step": 12910, "train_speed(iter/s)": 0.202781 }, { "acc": 0.73961883, "epoch": 0.3276255707762557, "grad_norm": 2.515625, "learning_rate": 9.650021081634965e-06, "loss": 1.04931984, "memory(GiB)": 369.4, "step": 12915, "train_speed(iter/s)": 0.202794 }, { "acc": 0.7330472, "epoch": 0.32775240994419075, "grad_norm": 2.265625, "learning_rate": 9.649635558945318e-06, "loss": 1.03340454, "memory(GiB)": 369.4, "step": 12920, "train_speed(iter/s)": 0.202808 }, { "acc": 0.73589954, "epoch": 0.32787924911212585, "grad_norm": 1.8984375, "learning_rate": 9.649249831743941e-06, "loss": 1.0489563, "memory(GiB)": 369.4, "step": 12925, "train_speed(iter/s)": 0.202816 }, { "acc": 0.73667593, "epoch": 0.3280060882800609, "grad_norm": 2.015625, "learning_rate": 9.6488639000478e-06, "loss": 1.07184591, "memory(GiB)": 369.4, "step": 12930, "train_speed(iter/s)": 0.202828 }, { "acc": 0.7538723, "epoch": 0.32813292744799594, "grad_norm": 2.1875, "learning_rate": 9.648477763873868e-06, "loss": 1.01965256, "memory(GiB)": 369.4, "step": 12935, "train_speed(iter/s)": 0.202841 }, { "acc": 0.75380015, "epoch": 0.328259766615931, "grad_norm": 2.328125, "learning_rate": 9.648091423239128e-06, "loss": 0.98501549, "memory(GiB)": 369.4, "step": 12940, "train_speed(iter/s)": 0.202853 }, { "acc": 0.74126496, "epoch": 0.3283866057838661, "grad_norm": 2.3125, "learning_rate": 9.647704878160576e-06, "loss": 1.02027664, "memory(GiB)": 369.4, "step": 12945, "train_speed(iter/s)": 0.202863 }, { "acc": 0.72426891, "epoch": 0.3285134449518011, "grad_norm": 2.25, "learning_rate": 9.647318128655213e-06, "loss": 1.05228682, "memory(GiB)": 369.4, "step": 12950, "train_speed(iter/s)": 0.202873 }, { "acc": 0.75349932, "epoch": 0.32864028411973617, "grad_norm": 1.9375, "learning_rate": 9.64693117474005e-06, "loss": 0.97998333, "memory(GiB)": 369.4, "step": 12955, "train_speed(iter/s)": 0.202876 }, { "acc": 0.74775057, "epoch": 0.3287671232876712, "grad_norm": 2.15625, "learning_rate": 9.646544016432109e-06, "loss": 1.0007719, "memory(GiB)": 369.4, "step": 12960, "train_speed(iter/s)": 0.20288 }, { "acc": 0.75373344, "epoch": 0.3288939624556063, "grad_norm": 2.40625, "learning_rate": 9.646156653748415e-06, "loss": 0.99307251, "memory(GiB)": 369.4, "step": 12965, "train_speed(iter/s)": 0.202892 }, { "acc": 0.74880919, "epoch": 0.32902080162354136, "grad_norm": 2.515625, "learning_rate": 9.645769086706008e-06, "loss": 1.05529165, "memory(GiB)": 369.4, "step": 12970, "train_speed(iter/s)": 0.202899 }, { "acc": 0.73773136, "epoch": 0.3291476407914764, "grad_norm": 1.78125, "learning_rate": 9.645381315321934e-06, "loss": 1.05913668, "memory(GiB)": 369.4, "step": 12975, "train_speed(iter/s)": 0.202908 }, { "acc": 0.73935952, "epoch": 0.32927447995941145, "grad_norm": 1.96875, "learning_rate": 9.64499333961325e-06, "loss": 1.07339153, "memory(GiB)": 369.4, "step": 12980, "train_speed(iter/s)": 0.202922 }, { "acc": 0.75220203, "epoch": 0.32940131912734655, "grad_norm": 2.09375, "learning_rate": 9.64460515959702e-06, "loss": 0.95625916, "memory(GiB)": 369.4, "step": 12985, "train_speed(iter/s)": 0.202937 }, { "acc": 0.73565087, "epoch": 0.3295281582952816, "grad_norm": 2.3125, "learning_rate": 9.64421677529032e-06, "loss": 1.04300318, "memory(GiB)": 369.4, "step": 12990, "train_speed(iter/s)": 0.202943 }, { "acc": 0.75625229, "epoch": 0.32965499746321664, "grad_norm": 1.8359375, "learning_rate": 9.64382818671023e-06, "loss": 0.99317646, "memory(GiB)": 369.4, "step": 12995, "train_speed(iter/s)": 0.202954 }, { "acc": 0.7491519, "epoch": 0.3297818366311517, "grad_norm": 2.5, "learning_rate": 9.643439393873844e-06, "loss": 0.99412365, "memory(GiB)": 369.4, "step": 13000, "train_speed(iter/s)": 0.202961 }, { "epoch": 0.3297818366311517, "eval_acc": 0.7318013249072453, "eval_loss": 1.0012459754943848, "eval_runtime": 384.8939, "eval_samples_per_second": 16.55, "eval_steps_per_second": 8.275, "step": 13000 }, { "acc": 0.75399055, "epoch": 0.3299086757990868, "grad_norm": 2.078125, "learning_rate": 9.643050396798262e-06, "loss": 1.02980862, "memory(GiB)": 369.4, "step": 13005, "train_speed(iter/s)": 0.200743 }, { "acc": 0.74870372, "epoch": 0.3300355149670218, "grad_norm": 2.015625, "learning_rate": 9.642661195500593e-06, "loss": 0.96113291, "memory(GiB)": 369.4, "step": 13010, "train_speed(iter/s)": 0.200749 }, { "acc": 0.74085436, "epoch": 0.33016235413495687, "grad_norm": 2.4375, "learning_rate": 9.642271789997956e-06, "loss": 1.01846123, "memory(GiB)": 369.4, "step": 13015, "train_speed(iter/s)": 0.200758 }, { "acc": 0.7528161, "epoch": 0.3302891933028919, "grad_norm": 1.8046875, "learning_rate": 9.64188218030748e-06, "loss": 0.95753841, "memory(GiB)": 369.4, "step": 13020, "train_speed(iter/s)": 0.200755 }, { "acc": 0.74610815, "epoch": 0.330416032470827, "grad_norm": 1.9375, "learning_rate": 9.641492366446301e-06, "loss": 1.00802402, "memory(GiB)": 369.4, "step": 13025, "train_speed(iter/s)": 0.200765 }, { "acc": 0.73725982, "epoch": 0.33054287163876206, "grad_norm": 1.9140625, "learning_rate": 9.641102348431565e-06, "loss": 1.02632532, "memory(GiB)": 369.4, "step": 13030, "train_speed(iter/s)": 0.200776 }, { "acc": 0.74424973, "epoch": 0.3306697108066971, "grad_norm": 2.0, "learning_rate": 9.640712126280429e-06, "loss": 1.01680317, "memory(GiB)": 369.4, "step": 13035, "train_speed(iter/s)": 0.200786 }, { "acc": 0.74751434, "epoch": 0.33079654997463215, "grad_norm": 2.0, "learning_rate": 9.640321700010053e-06, "loss": 0.99278011, "memory(GiB)": 369.4, "step": 13040, "train_speed(iter/s)": 0.200795 }, { "acc": 0.74817557, "epoch": 0.33092338914256725, "grad_norm": 2.1875, "learning_rate": 9.63993106963761e-06, "loss": 1.01123524, "memory(GiB)": 369.4, "step": 13045, "train_speed(iter/s)": 0.20081 }, { "acc": 0.73758883, "epoch": 0.3310502283105023, "grad_norm": 1.8828125, "learning_rate": 9.639540235180283e-06, "loss": 1.04976025, "memory(GiB)": 369.4, "step": 13050, "train_speed(iter/s)": 0.200821 }, { "acc": 0.75305333, "epoch": 0.33117706747843734, "grad_norm": 2.484375, "learning_rate": 9.639149196655263e-06, "loss": 0.98647785, "memory(GiB)": 369.4, "step": 13055, "train_speed(iter/s)": 0.200834 }, { "acc": 0.73670917, "epoch": 0.3313039066463724, "grad_norm": 2.15625, "learning_rate": 9.638757954079749e-06, "loss": 1.01488333, "memory(GiB)": 369.4, "step": 13060, "train_speed(iter/s)": 0.200846 }, { "acc": 0.75023632, "epoch": 0.3314307458143075, "grad_norm": 2.5, "learning_rate": 9.638366507470948e-06, "loss": 0.99917545, "memory(GiB)": 369.4, "step": 13065, "train_speed(iter/s)": 0.200857 }, { "acc": 0.75229168, "epoch": 0.3315575849822425, "grad_norm": 2.3125, "learning_rate": 9.637974856846082e-06, "loss": 1.01511908, "memory(GiB)": 369.4, "step": 13070, "train_speed(iter/s)": 0.200866 }, { "acc": 0.75617828, "epoch": 0.33168442415017757, "grad_norm": 2.296875, "learning_rate": 9.637583002222373e-06, "loss": 0.96552601, "memory(GiB)": 369.4, "step": 13075, "train_speed(iter/s)": 0.200873 }, { "acc": 0.74509144, "epoch": 0.3318112633181126, "grad_norm": 2.03125, "learning_rate": 9.637190943617059e-06, "loss": 1.01759281, "memory(GiB)": 369.4, "step": 13080, "train_speed(iter/s)": 0.200887 }, { "acc": 0.73882322, "epoch": 0.3319381024860477, "grad_norm": 2.171875, "learning_rate": 9.636798681047383e-06, "loss": 1.0136694, "memory(GiB)": 369.4, "step": 13085, "train_speed(iter/s)": 0.200897 }, { "acc": 0.75856085, "epoch": 0.33206494165398276, "grad_norm": 2.359375, "learning_rate": 9.6364062145306e-06, "loss": 1.02948771, "memory(GiB)": 369.4, "step": 13090, "train_speed(iter/s)": 0.200911 }, { "acc": 0.74901972, "epoch": 0.3321917808219178, "grad_norm": 1.8828125, "learning_rate": 9.636013544083971e-06, "loss": 0.99238071, "memory(GiB)": 369.4, "step": 13095, "train_speed(iter/s)": 0.200909 }, { "acc": 0.7473073, "epoch": 0.33231861998985285, "grad_norm": 2.015625, "learning_rate": 9.635620669724768e-06, "loss": 1.01971989, "memory(GiB)": 369.4, "step": 13100, "train_speed(iter/s)": 0.200921 }, { "acc": 0.74558172, "epoch": 0.33244545915778795, "grad_norm": 2.3125, "learning_rate": 9.635227591470272e-06, "loss": 1.07274227, "memory(GiB)": 369.4, "step": 13105, "train_speed(iter/s)": 0.200933 }, { "acc": 0.74525852, "epoch": 0.332572298325723, "grad_norm": 2.09375, "learning_rate": 9.63483430933777e-06, "loss": 1.00645123, "memory(GiB)": 369.4, "step": 13110, "train_speed(iter/s)": 0.200942 }, { "acc": 0.74763484, "epoch": 0.33269913749365804, "grad_norm": 2.078125, "learning_rate": 9.634440823344565e-06, "loss": 1.06925373, "memory(GiB)": 369.4, "step": 13115, "train_speed(iter/s)": 0.200957 }, { "acc": 0.75564618, "epoch": 0.3328259766615931, "grad_norm": 2.140625, "learning_rate": 9.634047133507959e-06, "loss": 0.97530031, "memory(GiB)": 369.4, "step": 13120, "train_speed(iter/s)": 0.200971 }, { "acc": 0.75605145, "epoch": 0.3329528158295282, "grad_norm": 2.03125, "learning_rate": 9.63365323984527e-06, "loss": 0.97917519, "memory(GiB)": 369.4, "step": 13125, "train_speed(iter/s)": 0.200984 }, { "acc": 0.75331917, "epoch": 0.3330796549974632, "grad_norm": 2.0625, "learning_rate": 9.633259142373825e-06, "loss": 0.96274672, "memory(GiB)": 369.4, "step": 13130, "train_speed(iter/s)": 0.200992 }, { "acc": 0.73162451, "epoch": 0.33320649416539827, "grad_norm": 2.03125, "learning_rate": 9.632864841110957e-06, "loss": 1.06403809, "memory(GiB)": 369.4, "step": 13135, "train_speed(iter/s)": 0.200994 }, { "acc": 0.74060307, "epoch": 0.3333333333333333, "grad_norm": 2.84375, "learning_rate": 9.632470336074009e-06, "loss": 1.06755238, "memory(GiB)": 369.4, "step": 13140, "train_speed(iter/s)": 0.201011 }, { "acc": 0.75577106, "epoch": 0.3334601725012684, "grad_norm": 2.34375, "learning_rate": 9.632075627280333e-06, "loss": 0.95737457, "memory(GiB)": 369.4, "step": 13145, "train_speed(iter/s)": 0.201026 }, { "acc": 0.73915658, "epoch": 0.33358701166920346, "grad_norm": 2.09375, "learning_rate": 9.631680714747292e-06, "loss": 1.02166939, "memory(GiB)": 369.4, "step": 13150, "train_speed(iter/s)": 0.201036 }, { "acc": 0.74547224, "epoch": 0.3337138508371385, "grad_norm": 3.125, "learning_rate": 9.63128559849225e-06, "loss": 0.99621668, "memory(GiB)": 369.4, "step": 13155, "train_speed(iter/s)": 0.20105 }, { "acc": 0.74303827, "epoch": 0.33384069000507355, "grad_norm": 2.625, "learning_rate": 9.630890278532594e-06, "loss": 1.00069427, "memory(GiB)": 369.4, "step": 13160, "train_speed(iter/s)": 0.201065 }, { "acc": 0.71811123, "epoch": 0.33396752917300865, "grad_norm": 2.03125, "learning_rate": 9.630494754885706e-06, "loss": 1.1020134, "memory(GiB)": 369.4, "step": 13165, "train_speed(iter/s)": 0.201072 }, { "acc": 0.74162927, "epoch": 0.3340943683409437, "grad_norm": 2.25, "learning_rate": 9.630099027568986e-06, "loss": 1.0134491, "memory(GiB)": 369.4, "step": 13170, "train_speed(iter/s)": 0.201083 }, { "acc": 0.75164928, "epoch": 0.33422120750887874, "grad_norm": 2.21875, "learning_rate": 9.629703096599839e-06, "loss": 1.04275055, "memory(GiB)": 369.4, "step": 13175, "train_speed(iter/s)": 0.201099 }, { "acc": 0.74262152, "epoch": 0.3343480466768138, "grad_norm": 2.125, "learning_rate": 9.629306961995678e-06, "loss": 1.02850456, "memory(GiB)": 369.4, "step": 13180, "train_speed(iter/s)": 0.201105 }, { "acc": 0.72625208, "epoch": 0.3344748858447489, "grad_norm": 2.109375, "learning_rate": 9.62891062377393e-06, "loss": 1.0575078, "memory(GiB)": 369.4, "step": 13185, "train_speed(iter/s)": 0.201117 }, { "acc": 0.74563169, "epoch": 0.3346017250126839, "grad_norm": 2.265625, "learning_rate": 9.628514081952026e-06, "loss": 1.02355547, "memory(GiB)": 369.4, "step": 13190, "train_speed(iter/s)": 0.201119 }, { "acc": 0.72022839, "epoch": 0.33472856418061897, "grad_norm": 2.5625, "learning_rate": 9.628117336547408e-06, "loss": 1.13143549, "memory(GiB)": 369.4, "step": 13195, "train_speed(iter/s)": 0.201133 }, { "acc": 0.73016319, "epoch": 0.334855403348554, "grad_norm": 1.875, "learning_rate": 9.627720387577525e-06, "loss": 1.01030064, "memory(GiB)": 369.4, "step": 13200, "train_speed(iter/s)": 0.201141 }, { "acc": 0.74346428, "epoch": 0.3349822425164891, "grad_norm": 2.34375, "learning_rate": 9.62732323505984e-06, "loss": 1.06707706, "memory(GiB)": 369.4, "step": 13205, "train_speed(iter/s)": 0.201159 }, { "acc": 0.74514112, "epoch": 0.33510908168442416, "grad_norm": 2.078125, "learning_rate": 9.62692587901182e-06, "loss": 0.99921083, "memory(GiB)": 369.4, "step": 13210, "train_speed(iter/s)": 0.201164 }, { "acc": 0.73729773, "epoch": 0.3352359208523592, "grad_norm": 2.09375, "learning_rate": 9.62652831945094e-06, "loss": 1.04766903, "memory(GiB)": 369.4, "step": 13215, "train_speed(iter/s)": 0.201174 }, { "acc": 0.75352707, "epoch": 0.33536276002029425, "grad_norm": 2.296875, "learning_rate": 9.626130556394689e-06, "loss": 1.05202274, "memory(GiB)": 369.4, "step": 13220, "train_speed(iter/s)": 0.201183 }, { "acc": 0.74175711, "epoch": 0.33548959918822935, "grad_norm": 2.25, "learning_rate": 9.625732589860562e-06, "loss": 1.07814846, "memory(GiB)": 369.4, "step": 13225, "train_speed(iter/s)": 0.201197 }, { "acc": 0.73214712, "epoch": 0.3356164383561644, "grad_norm": 2.265625, "learning_rate": 9.625334419866064e-06, "loss": 1.06082821, "memory(GiB)": 369.4, "step": 13230, "train_speed(iter/s)": 0.201212 }, { "acc": 0.74552841, "epoch": 0.33574327752409944, "grad_norm": 2.1875, "learning_rate": 9.624936046428708e-06, "loss": 1.01172047, "memory(GiB)": 369.4, "step": 13235, "train_speed(iter/s)": 0.201219 }, { "acc": 0.75086889, "epoch": 0.3358701166920345, "grad_norm": 1.9921875, "learning_rate": 9.624537469566015e-06, "loss": 1.0114563, "memory(GiB)": 369.4, "step": 13240, "train_speed(iter/s)": 0.20123 }, { "acc": 0.74943533, "epoch": 0.3359969558599696, "grad_norm": 2.421875, "learning_rate": 9.624138689295516e-06, "loss": 0.96037827, "memory(GiB)": 369.4, "step": 13245, "train_speed(iter/s)": 0.201241 }, { "acc": 0.7441679, "epoch": 0.3361237950279046, "grad_norm": 2.25, "learning_rate": 9.623739705634753e-06, "loss": 0.99737034, "memory(GiB)": 369.4, "step": 13250, "train_speed(iter/s)": 0.201258 }, { "acc": 0.74680414, "epoch": 0.33625063419583967, "grad_norm": 2.390625, "learning_rate": 9.623340518601274e-06, "loss": 1.01947975, "memory(GiB)": 369.4, "step": 13255, "train_speed(iter/s)": 0.201266 }, { "acc": 0.73372765, "epoch": 0.3363774733637747, "grad_norm": 3.125, "learning_rate": 9.622941128212639e-06, "loss": 1.12561731, "memory(GiB)": 369.4, "step": 13260, "train_speed(iter/s)": 0.201281 }, { "acc": 0.74606886, "epoch": 0.3365043125317098, "grad_norm": 2.390625, "learning_rate": 9.622541534486411e-06, "loss": 1.06892958, "memory(GiB)": 369.4, "step": 13265, "train_speed(iter/s)": 0.201302 }, { "acc": 0.74129353, "epoch": 0.33663115169964486, "grad_norm": 2.40625, "learning_rate": 9.62214173744017e-06, "loss": 1.04529209, "memory(GiB)": 369.4, "step": 13270, "train_speed(iter/s)": 0.201313 }, { "acc": 0.73455467, "epoch": 0.3367579908675799, "grad_norm": 2.125, "learning_rate": 9.6217417370915e-06, "loss": 1.01043158, "memory(GiB)": 369.4, "step": 13275, "train_speed(iter/s)": 0.201324 }, { "acc": 0.7488534, "epoch": 0.33688483003551495, "grad_norm": 2.1875, "learning_rate": 9.62134153345799e-06, "loss": 1.00336208, "memory(GiB)": 369.4, "step": 13280, "train_speed(iter/s)": 0.201332 }, { "acc": 0.7428967, "epoch": 0.33701166920345005, "grad_norm": 2.171875, "learning_rate": 9.620941126557248e-06, "loss": 0.99522448, "memory(GiB)": 369.4, "step": 13285, "train_speed(iter/s)": 0.201343 }, { "acc": 0.75541005, "epoch": 0.3371385083713851, "grad_norm": 2.046875, "learning_rate": 9.620540516406885e-06, "loss": 0.94538116, "memory(GiB)": 369.4, "step": 13290, "train_speed(iter/s)": 0.201351 }, { "acc": 0.73685474, "epoch": 0.33726534753932014, "grad_norm": 1.9453125, "learning_rate": 9.620139703024522e-06, "loss": 1.03809776, "memory(GiB)": 369.4, "step": 13295, "train_speed(iter/s)": 0.201366 }, { "acc": 0.72562809, "epoch": 0.3373921867072552, "grad_norm": 2.453125, "learning_rate": 9.619738686427785e-06, "loss": 1.11344604, "memory(GiB)": 369.4, "step": 13300, "train_speed(iter/s)": 0.201382 }, { "acc": 0.7418149, "epoch": 0.3375190258751903, "grad_norm": 1.921875, "learning_rate": 9.619337466634317e-06, "loss": 1.0174612, "memory(GiB)": 369.4, "step": 13305, "train_speed(iter/s)": 0.201396 }, { "acc": 0.75071287, "epoch": 0.3376458650431253, "grad_norm": 2.21875, "learning_rate": 9.618936043661762e-06, "loss": 0.98309078, "memory(GiB)": 369.4, "step": 13310, "train_speed(iter/s)": 0.201409 }, { "acc": 0.73189297, "epoch": 0.33777270421106037, "grad_norm": 2.421875, "learning_rate": 9.618534417527779e-06, "loss": 1.04821854, "memory(GiB)": 369.4, "step": 13315, "train_speed(iter/s)": 0.201424 }, { "acc": 0.74682364, "epoch": 0.3378995433789954, "grad_norm": 2.46875, "learning_rate": 9.61813258825003e-06, "loss": 1.00869703, "memory(GiB)": 369.4, "step": 13320, "train_speed(iter/s)": 0.201435 }, { "acc": 0.74229746, "epoch": 0.3380263825469305, "grad_norm": 2.234375, "learning_rate": 9.617730555846191e-06, "loss": 1.01988554, "memory(GiB)": 369.4, "step": 13325, "train_speed(iter/s)": 0.201441 }, { "acc": 0.73701639, "epoch": 0.33815322171486556, "grad_norm": 2.203125, "learning_rate": 9.617328320333947e-06, "loss": 1.01662788, "memory(GiB)": 369.4, "step": 13330, "train_speed(iter/s)": 0.201454 }, { "acc": 0.76093001, "epoch": 0.3382800608828006, "grad_norm": 2.25, "learning_rate": 9.616925881730989e-06, "loss": 0.93839931, "memory(GiB)": 369.4, "step": 13335, "train_speed(iter/s)": 0.201456 }, { "acc": 0.74869442, "epoch": 0.33840690005073565, "grad_norm": 2.25, "learning_rate": 9.616523240055017e-06, "loss": 1.02693501, "memory(GiB)": 369.4, "step": 13340, "train_speed(iter/s)": 0.201461 }, { "acc": 0.72587481, "epoch": 0.33853373921867075, "grad_norm": 2.15625, "learning_rate": 9.616120395323743e-06, "loss": 1.07663174, "memory(GiB)": 369.4, "step": 13345, "train_speed(iter/s)": 0.201469 }, { "acc": 0.74461021, "epoch": 0.3386605783866058, "grad_norm": 2.0625, "learning_rate": 9.615717347554882e-06, "loss": 1.0252018, "memory(GiB)": 369.4, "step": 13350, "train_speed(iter/s)": 0.201482 }, { "acc": 0.73421841, "epoch": 0.33878741755454084, "grad_norm": 1.9765625, "learning_rate": 9.615314096766166e-06, "loss": 1.01228046, "memory(GiB)": 369.4, "step": 13355, "train_speed(iter/s)": 0.201493 }, { "acc": 0.73651133, "epoch": 0.3389142567224759, "grad_norm": 2.578125, "learning_rate": 9.61491064297533e-06, "loss": 1.08423395, "memory(GiB)": 369.4, "step": 13360, "train_speed(iter/s)": 0.201501 }, { "acc": 0.73516197, "epoch": 0.339041095890411, "grad_norm": 2.234375, "learning_rate": 9.614506986200119e-06, "loss": 1.09430733, "memory(GiB)": 369.4, "step": 13365, "train_speed(iter/s)": 0.2015 }, { "acc": 0.73953509, "epoch": 0.339167935058346, "grad_norm": 2.765625, "learning_rate": 9.61410312645829e-06, "loss": 1.05261936, "memory(GiB)": 369.4, "step": 13370, "train_speed(iter/s)": 0.201509 }, { "acc": 0.74173465, "epoch": 0.33929477422628107, "grad_norm": 2.421875, "learning_rate": 9.613699063767603e-06, "loss": 1.05190201, "memory(GiB)": 369.4, "step": 13375, "train_speed(iter/s)": 0.201512 }, { "acc": 0.73974075, "epoch": 0.3394216133942161, "grad_norm": 2.3125, "learning_rate": 9.613294798145833e-06, "loss": 1.05980711, "memory(GiB)": 369.4, "step": 13380, "train_speed(iter/s)": 0.201524 }, { "acc": 0.75189986, "epoch": 0.3395484525621512, "grad_norm": 2.1875, "learning_rate": 9.612890329610762e-06, "loss": 0.99732838, "memory(GiB)": 369.4, "step": 13385, "train_speed(iter/s)": 0.201531 }, { "acc": 0.74606571, "epoch": 0.33967529173008626, "grad_norm": 2.234375, "learning_rate": 9.612485658180178e-06, "loss": 0.99165726, "memory(GiB)": 369.4, "step": 13390, "train_speed(iter/s)": 0.201543 }, { "acc": 0.74585676, "epoch": 0.3398021308980213, "grad_norm": 2.28125, "learning_rate": 9.612080783871882e-06, "loss": 0.98921413, "memory(GiB)": 369.4, "step": 13395, "train_speed(iter/s)": 0.20155 }, { "acc": 0.74194918, "epoch": 0.33992897006595635, "grad_norm": 2.078125, "learning_rate": 9.611675706703682e-06, "loss": 1.00197105, "memory(GiB)": 369.4, "step": 13400, "train_speed(iter/s)": 0.201561 }, { "acc": 0.736064, "epoch": 0.34005580923389145, "grad_norm": 2.859375, "learning_rate": 9.611270426693395e-06, "loss": 1.07388039, "memory(GiB)": 369.4, "step": 13405, "train_speed(iter/s)": 0.201571 }, { "acc": 0.72371941, "epoch": 0.3401826484018265, "grad_norm": 2.65625, "learning_rate": 9.610864943858847e-06, "loss": 1.03722076, "memory(GiB)": 369.4, "step": 13410, "train_speed(iter/s)": 0.201585 }, { "acc": 0.74876375, "epoch": 0.34030948756976154, "grad_norm": 2.4375, "learning_rate": 9.61045925821787e-06, "loss": 0.9928154, "memory(GiB)": 369.4, "step": 13415, "train_speed(iter/s)": 0.201596 }, { "acc": 0.74266815, "epoch": 0.3404363267376966, "grad_norm": 2.046875, "learning_rate": 9.610053369788314e-06, "loss": 1.0299881, "memory(GiB)": 369.4, "step": 13420, "train_speed(iter/s)": 0.201602 }, { "acc": 0.74153366, "epoch": 0.3405631659056317, "grad_norm": 2.234375, "learning_rate": 9.609647278588027e-06, "loss": 1.11857052, "memory(GiB)": 369.4, "step": 13425, "train_speed(iter/s)": 0.201616 }, { "acc": 0.74203053, "epoch": 0.3406900050735667, "grad_norm": 1.71875, "learning_rate": 9.609240984634871e-06, "loss": 0.98995914, "memory(GiB)": 369.4, "step": 13430, "train_speed(iter/s)": 0.201628 }, { "acc": 0.74898286, "epoch": 0.34081684424150177, "grad_norm": 1.96875, "learning_rate": 9.608834487946719e-06, "loss": 1.04399281, "memory(GiB)": 369.4, "step": 13435, "train_speed(iter/s)": 0.201628 }, { "acc": 0.73108006, "epoch": 0.3409436834094368, "grad_norm": 2.203125, "learning_rate": 9.60842778854145e-06, "loss": 1.04200459, "memory(GiB)": 369.4, "step": 13440, "train_speed(iter/s)": 0.20164 }, { "acc": 0.74969511, "epoch": 0.3410705225773719, "grad_norm": 2.0625, "learning_rate": 9.60802088643695e-06, "loss": 1.01076565, "memory(GiB)": 369.4, "step": 13445, "train_speed(iter/s)": 0.20165 }, { "acc": 0.73250856, "epoch": 0.34119736174530696, "grad_norm": 1.921875, "learning_rate": 9.60761378165112e-06, "loss": 1.0081953, "memory(GiB)": 369.4, "step": 13450, "train_speed(iter/s)": 0.20166 }, { "acc": 0.7580092, "epoch": 0.341324200913242, "grad_norm": 2.1875, "learning_rate": 9.607206474201863e-06, "loss": 1.01400995, "memory(GiB)": 369.4, "step": 13455, "train_speed(iter/s)": 0.201668 }, { "acc": 0.74769497, "epoch": 0.34145104008117705, "grad_norm": 2.375, "learning_rate": 9.606798964107096e-06, "loss": 1.09516916, "memory(GiB)": 369.4, "step": 13460, "train_speed(iter/s)": 0.201678 }, { "acc": 0.75269775, "epoch": 0.34157787924911215, "grad_norm": 2.28125, "learning_rate": 9.60639125138474e-06, "loss": 0.95421743, "memory(GiB)": 369.4, "step": 13465, "train_speed(iter/s)": 0.201692 }, { "acc": 0.74930162, "epoch": 0.3417047184170472, "grad_norm": 2.296875, "learning_rate": 9.605983336052735e-06, "loss": 1.03072491, "memory(GiB)": 369.4, "step": 13470, "train_speed(iter/s)": 0.201707 }, { "acc": 0.72789645, "epoch": 0.34183155758498224, "grad_norm": 2.125, "learning_rate": 9.605575218129017e-06, "loss": 1.03375778, "memory(GiB)": 369.4, "step": 13475, "train_speed(iter/s)": 0.201717 }, { "acc": 0.73064251, "epoch": 0.3419583967529173, "grad_norm": 2.21875, "learning_rate": 9.605166897631539e-06, "loss": 1.1146657, "memory(GiB)": 369.4, "step": 13480, "train_speed(iter/s)": 0.201726 }, { "acc": 0.74322782, "epoch": 0.3420852359208524, "grad_norm": 2.25, "learning_rate": 9.604758374578259e-06, "loss": 1.05786448, "memory(GiB)": 369.4, "step": 13485, "train_speed(iter/s)": 0.201733 }, { "acc": 0.73598242, "epoch": 0.3422120750887874, "grad_norm": 2.0625, "learning_rate": 9.604349648987148e-06, "loss": 1.07078657, "memory(GiB)": 369.4, "step": 13490, "train_speed(iter/s)": 0.201741 }, { "acc": 0.74586182, "epoch": 0.34233891425672247, "grad_norm": 2.3125, "learning_rate": 9.603940720876181e-06, "loss": 1.02462883, "memory(GiB)": 369.4, "step": 13495, "train_speed(iter/s)": 0.201749 }, { "acc": 0.74029112, "epoch": 0.3424657534246575, "grad_norm": 2.203125, "learning_rate": 9.603531590263348e-06, "loss": 1.02764626, "memory(GiB)": 369.4, "step": 13500, "train_speed(iter/s)": 0.201762 }, { "acc": 0.74461069, "epoch": 0.3425925925925926, "grad_norm": 1.8671875, "learning_rate": 9.603122257166641e-06, "loss": 1.03286572, "memory(GiB)": 369.4, "step": 13505, "train_speed(iter/s)": 0.201769 }, { "acc": 0.75486178, "epoch": 0.34271943176052766, "grad_norm": 2.03125, "learning_rate": 9.602712721604066e-06, "loss": 0.98458614, "memory(GiB)": 369.4, "step": 13510, "train_speed(iter/s)": 0.201781 }, { "acc": 0.74007201, "epoch": 0.3428462709284627, "grad_norm": 2.53125, "learning_rate": 9.602302983593637e-06, "loss": 1.05159988, "memory(GiB)": 369.4, "step": 13515, "train_speed(iter/s)": 0.201796 }, { "acc": 0.74289522, "epoch": 0.34297311009639775, "grad_norm": 2.484375, "learning_rate": 9.601893043153372e-06, "loss": 1.00602055, "memory(GiB)": 369.4, "step": 13520, "train_speed(iter/s)": 0.2018 }, { "acc": 0.73641171, "epoch": 0.34309994926433285, "grad_norm": 2.171875, "learning_rate": 9.601482900301308e-06, "loss": 1.01880627, "memory(GiB)": 369.4, "step": 13525, "train_speed(iter/s)": 0.201811 }, { "acc": 0.74286747, "epoch": 0.3432267884322679, "grad_norm": 2.125, "learning_rate": 9.60107255505548e-06, "loss": 1.04507942, "memory(GiB)": 369.4, "step": 13530, "train_speed(iter/s)": 0.201824 }, { "acc": 0.7301919, "epoch": 0.34335362760020294, "grad_norm": 2.09375, "learning_rate": 9.60066200743394e-06, "loss": 1.13613243, "memory(GiB)": 369.4, "step": 13535, "train_speed(iter/s)": 0.201841 }, { "acc": 0.74884844, "epoch": 0.343480466768138, "grad_norm": 2.5625, "learning_rate": 9.600251257454744e-06, "loss": 1.01311779, "memory(GiB)": 369.4, "step": 13540, "train_speed(iter/s)": 0.201858 }, { "acc": 0.72944212, "epoch": 0.3436073059360731, "grad_norm": 2.0625, "learning_rate": 9.599840305135959e-06, "loss": 1.0640872, "memory(GiB)": 369.4, "step": 13545, "train_speed(iter/s)": 0.201871 }, { "acc": 0.75065222, "epoch": 0.3437341451040081, "grad_norm": 2.140625, "learning_rate": 9.59942915049566e-06, "loss": 0.97817698, "memory(GiB)": 369.4, "step": 13550, "train_speed(iter/s)": 0.201884 }, { "acc": 0.72598257, "epoch": 0.34386098427194317, "grad_norm": 2.109375, "learning_rate": 9.599017793551933e-06, "loss": 1.052812, "memory(GiB)": 369.4, "step": 13555, "train_speed(iter/s)": 0.201897 }, { "acc": 0.7232501, "epoch": 0.3439878234398782, "grad_norm": 2.21875, "learning_rate": 9.598606234322869e-06, "loss": 1.12432699, "memory(GiB)": 369.4, "step": 13560, "train_speed(iter/s)": 0.201904 }, { "acc": 0.75810356, "epoch": 0.3441146626078133, "grad_norm": 2.125, "learning_rate": 9.598194472826574e-06, "loss": 1.00460052, "memory(GiB)": 369.4, "step": 13565, "train_speed(iter/s)": 0.201916 }, { "acc": 0.74277353, "epoch": 0.34424150177574836, "grad_norm": 2.546875, "learning_rate": 9.597782509081154e-06, "loss": 1.05493755, "memory(GiB)": 369.4, "step": 13570, "train_speed(iter/s)": 0.201926 }, { "acc": 0.74717155, "epoch": 0.3443683409436834, "grad_norm": 2.0, "learning_rate": 9.597370343104733e-06, "loss": 1.00966463, "memory(GiB)": 369.4, "step": 13575, "train_speed(iter/s)": 0.20194 }, { "acc": 0.7556057, "epoch": 0.34449518011161845, "grad_norm": 2.15625, "learning_rate": 9.596957974915438e-06, "loss": 0.98291531, "memory(GiB)": 369.4, "step": 13580, "train_speed(iter/s)": 0.201944 }, { "acc": 0.73722849, "epoch": 0.34462201927955355, "grad_norm": 1.703125, "learning_rate": 9.596545404531408e-06, "loss": 1.06322145, "memory(GiB)": 369.4, "step": 13585, "train_speed(iter/s)": 0.201956 }, { "acc": 0.73649349, "epoch": 0.3447488584474886, "grad_norm": 1.84375, "learning_rate": 9.596132631970788e-06, "loss": 1.00462666, "memory(GiB)": 369.4, "step": 13590, "train_speed(iter/s)": 0.201967 }, { "acc": 0.76466408, "epoch": 0.34487569761542364, "grad_norm": 2.21875, "learning_rate": 9.595719657251735e-06, "loss": 0.98287354, "memory(GiB)": 369.4, "step": 13595, "train_speed(iter/s)": 0.201981 }, { "acc": 0.75316281, "epoch": 0.3450025367833587, "grad_norm": 2.25, "learning_rate": 9.595306480392413e-06, "loss": 1.0201375, "memory(GiB)": 369.4, "step": 13600, "train_speed(iter/s)": 0.201989 }, { "acc": 0.74218459, "epoch": 0.3451293759512938, "grad_norm": 2.5625, "learning_rate": 9.594893101410995e-06, "loss": 1.05068703, "memory(GiB)": 369.4, "step": 13605, "train_speed(iter/s)": 0.201999 }, { "acc": 0.738521, "epoch": 0.3452562151192288, "grad_norm": 2.125, "learning_rate": 9.594479520325665e-06, "loss": 1.04204597, "memory(GiB)": 369.4, "step": 13610, "train_speed(iter/s)": 0.202006 }, { "acc": 0.73309679, "epoch": 0.34538305428716387, "grad_norm": 2.453125, "learning_rate": 9.594065737154611e-06, "loss": 1.02122984, "memory(GiB)": 369.4, "step": 13615, "train_speed(iter/s)": 0.20202 }, { "acc": 0.7492806, "epoch": 0.3455098934550989, "grad_norm": 2.390625, "learning_rate": 9.593651751916037e-06, "loss": 0.98779488, "memory(GiB)": 369.4, "step": 13620, "train_speed(iter/s)": 0.202027 }, { "acc": 0.75891562, "epoch": 0.345636732623034, "grad_norm": 2.09375, "learning_rate": 9.593237564628149e-06, "loss": 0.93048983, "memory(GiB)": 369.4, "step": 13625, "train_speed(iter/s)": 0.202041 }, { "acc": 0.74880619, "epoch": 0.34576357179096906, "grad_norm": 1.7109375, "learning_rate": 9.592823175309164e-06, "loss": 0.99158192, "memory(GiB)": 369.4, "step": 13630, "train_speed(iter/s)": 0.202058 }, { "acc": 0.74707909, "epoch": 0.3458904109589041, "grad_norm": 1.890625, "learning_rate": 9.592408583977311e-06, "loss": 0.9889864, "memory(GiB)": 369.4, "step": 13635, "train_speed(iter/s)": 0.202059 }, { "acc": 0.73396173, "epoch": 0.34601725012683915, "grad_norm": 2.203125, "learning_rate": 9.591993790650826e-06, "loss": 0.99519253, "memory(GiB)": 369.4, "step": 13640, "train_speed(iter/s)": 0.202074 }, { "acc": 0.7520545, "epoch": 0.34614408929477425, "grad_norm": 2.359375, "learning_rate": 9.591578795347952e-06, "loss": 1.04749928, "memory(GiB)": 369.4, "step": 13645, "train_speed(iter/s)": 0.202085 }, { "acc": 0.75280342, "epoch": 0.3462709284627093, "grad_norm": 2.453125, "learning_rate": 9.591163598086943e-06, "loss": 1.03637362, "memory(GiB)": 369.4, "step": 13650, "train_speed(iter/s)": 0.202096 }, { "acc": 0.74366522, "epoch": 0.34639776763064434, "grad_norm": 1.9453125, "learning_rate": 9.59074819888606e-06, "loss": 1.05086899, "memory(GiB)": 369.4, "step": 13655, "train_speed(iter/s)": 0.202109 }, { "acc": 0.74796329, "epoch": 0.3465246067985794, "grad_norm": 2.125, "learning_rate": 9.590332597763575e-06, "loss": 1.00909472, "memory(GiB)": 369.4, "step": 13660, "train_speed(iter/s)": 0.202121 }, { "acc": 0.74918213, "epoch": 0.3466514459665145, "grad_norm": 2.1875, "learning_rate": 9.589916794737768e-06, "loss": 1.03432655, "memory(GiB)": 369.4, "step": 13665, "train_speed(iter/s)": 0.202135 }, { "acc": 0.73743038, "epoch": 0.3467782851344495, "grad_norm": 2.28125, "learning_rate": 9.589500789826927e-06, "loss": 1.10428877, "memory(GiB)": 369.4, "step": 13670, "train_speed(iter/s)": 0.202142 }, { "acc": 0.73123598, "epoch": 0.34690512430238457, "grad_norm": 2.1875, "learning_rate": 9.589084583049353e-06, "loss": 1.11636925, "memory(GiB)": 369.4, "step": 13675, "train_speed(iter/s)": 0.20215 }, { "acc": 0.74724264, "epoch": 0.3470319634703196, "grad_norm": 2.484375, "learning_rate": 9.588668174423348e-06, "loss": 1.03409901, "memory(GiB)": 369.4, "step": 13680, "train_speed(iter/s)": 0.202161 }, { "acc": 0.73350391, "epoch": 0.3471588026382547, "grad_norm": 2.09375, "learning_rate": 9.588251563967232e-06, "loss": 1.06925125, "memory(GiB)": 369.4, "step": 13685, "train_speed(iter/s)": 0.202172 }, { "acc": 0.73482914, "epoch": 0.34728564180618976, "grad_norm": 1.9609375, "learning_rate": 9.587834751699326e-06, "loss": 1.07934704, "memory(GiB)": 369.4, "step": 13690, "train_speed(iter/s)": 0.20218 }, { "acc": 0.73964229, "epoch": 0.3474124809741248, "grad_norm": 2.609375, "learning_rate": 9.587417737637963e-06, "loss": 1.0651722, "memory(GiB)": 369.4, "step": 13695, "train_speed(iter/s)": 0.202193 }, { "acc": 0.73589954, "epoch": 0.34753932014205985, "grad_norm": 2.375, "learning_rate": 9.587000521801488e-06, "loss": 1.01810751, "memory(GiB)": 369.4, "step": 13700, "train_speed(iter/s)": 0.202195 }, { "acc": 0.74906387, "epoch": 0.34766615930999495, "grad_norm": 2.453125, "learning_rate": 9.58658310420825e-06, "loss": 1.04291201, "memory(GiB)": 369.4, "step": 13705, "train_speed(iter/s)": 0.202204 }, { "acc": 0.74551187, "epoch": 0.34779299847793, "grad_norm": 2.171875, "learning_rate": 9.58616548487661e-06, "loss": 1.01267958, "memory(GiB)": 369.4, "step": 13710, "train_speed(iter/s)": 0.202216 }, { "acc": 0.74134054, "epoch": 0.34791983764586504, "grad_norm": 2.40625, "learning_rate": 9.585747663824936e-06, "loss": 1.07676783, "memory(GiB)": 369.4, "step": 13715, "train_speed(iter/s)": 0.202225 }, { "acc": 0.74669881, "epoch": 0.3480466768138001, "grad_norm": 2.28125, "learning_rate": 9.585329641071606e-06, "loss": 1.01701727, "memory(GiB)": 369.4, "step": 13720, "train_speed(iter/s)": 0.202238 }, { "acc": 0.73836412, "epoch": 0.3481735159817352, "grad_norm": 2.46875, "learning_rate": 9.584911416635007e-06, "loss": 1.05576878, "memory(GiB)": 369.4, "step": 13725, "train_speed(iter/s)": 0.20225 }, { "acc": 0.74239182, "epoch": 0.3483003551496702, "grad_norm": 2.265625, "learning_rate": 9.584492990533533e-06, "loss": 1.03475609, "memory(GiB)": 369.4, "step": 13730, "train_speed(iter/s)": 0.202258 }, { "acc": 0.75055218, "epoch": 0.34842719431760527, "grad_norm": 2.28125, "learning_rate": 9.58407436278559e-06, "loss": 0.94383755, "memory(GiB)": 369.4, "step": 13735, "train_speed(iter/s)": 0.202265 }, { "acc": 0.73709555, "epoch": 0.3485540334855403, "grad_norm": 1.96875, "learning_rate": 9.583655533409588e-06, "loss": 1.03973751, "memory(GiB)": 369.4, "step": 13740, "train_speed(iter/s)": 0.202275 }, { "acc": 0.74497962, "epoch": 0.3486808726534754, "grad_norm": 1.8203125, "learning_rate": 9.583236502423952e-06, "loss": 1.04252062, "memory(GiB)": 369.4, "step": 13745, "train_speed(iter/s)": 0.202283 }, { "acc": 0.74925656, "epoch": 0.34880771182141046, "grad_norm": 2.125, "learning_rate": 9.582817269847112e-06, "loss": 1.06646996, "memory(GiB)": 369.4, "step": 13750, "train_speed(iter/s)": 0.202291 }, { "acc": 0.74761925, "epoch": 0.3489345509893455, "grad_norm": 2.015625, "learning_rate": 9.582397835697509e-06, "loss": 0.99236822, "memory(GiB)": 369.4, "step": 13755, "train_speed(iter/s)": 0.202297 }, { "acc": 0.7450295, "epoch": 0.34906139015728055, "grad_norm": 2.0625, "learning_rate": 9.581978199993587e-06, "loss": 1.02228336, "memory(GiB)": 369.4, "step": 13760, "train_speed(iter/s)": 0.202292 }, { "acc": 0.73289623, "epoch": 0.34918822932521565, "grad_norm": 1.8671875, "learning_rate": 9.58155836275381e-06, "loss": 1.02995262, "memory(GiB)": 369.4, "step": 13765, "train_speed(iter/s)": 0.202302 }, { "acc": 0.73907056, "epoch": 0.3493150684931507, "grad_norm": 1.9140625, "learning_rate": 9.581138323996639e-06, "loss": 1.067309, "memory(GiB)": 369.4, "step": 13770, "train_speed(iter/s)": 0.202315 }, { "acc": 0.75204978, "epoch": 0.34944190766108574, "grad_norm": 2.0625, "learning_rate": 9.580718083740553e-06, "loss": 1.0556797, "memory(GiB)": 369.4, "step": 13775, "train_speed(iter/s)": 0.202322 }, { "acc": 0.7634408, "epoch": 0.3495687468290208, "grad_norm": 2.25, "learning_rate": 9.580297642004032e-06, "loss": 0.96024361, "memory(GiB)": 369.4, "step": 13780, "train_speed(iter/s)": 0.202328 }, { "acc": 0.74874001, "epoch": 0.3496955859969559, "grad_norm": 2.03125, "learning_rate": 9.579876998805573e-06, "loss": 1.06257935, "memory(GiB)": 369.4, "step": 13785, "train_speed(iter/s)": 0.20234 }, { "acc": 0.74669008, "epoch": 0.3498224251648909, "grad_norm": 2.3125, "learning_rate": 9.579456154163676e-06, "loss": 1.03203278, "memory(GiB)": 369.4, "step": 13790, "train_speed(iter/s)": 0.202355 }, { "acc": 0.73711495, "epoch": 0.34994926433282597, "grad_norm": 2.171875, "learning_rate": 9.57903510809685e-06, "loss": 1.03965244, "memory(GiB)": 369.4, "step": 13795, "train_speed(iter/s)": 0.202362 }, { "acc": 0.7439106, "epoch": 0.350076103500761, "grad_norm": 2.453125, "learning_rate": 9.578613860623617e-06, "loss": 1.01384983, "memory(GiB)": 369.4, "step": 13800, "train_speed(iter/s)": 0.202371 }, { "acc": 0.75108242, "epoch": 0.3502029426686961, "grad_norm": 2.234375, "learning_rate": 9.578192411762503e-06, "loss": 0.94582806, "memory(GiB)": 369.4, "step": 13805, "train_speed(iter/s)": 0.202377 }, { "acc": 0.72781692, "epoch": 0.35032978183663116, "grad_norm": 1.8515625, "learning_rate": 9.577770761532049e-06, "loss": 1.10371113, "memory(GiB)": 369.4, "step": 13810, "train_speed(iter/s)": 0.202384 }, { "acc": 0.74619551, "epoch": 0.3504566210045662, "grad_norm": 2.203125, "learning_rate": 9.577348909950797e-06, "loss": 1.03243904, "memory(GiB)": 369.4, "step": 13815, "train_speed(iter/s)": 0.202393 }, { "acc": 0.74983864, "epoch": 0.35058346017250125, "grad_norm": 2.140625, "learning_rate": 9.576926857037303e-06, "loss": 0.90642014, "memory(GiB)": 369.4, "step": 13820, "train_speed(iter/s)": 0.202404 }, { "acc": 0.7553535, "epoch": 0.35071029934043635, "grad_norm": 2.21875, "learning_rate": 9.576504602810133e-06, "loss": 1.01168585, "memory(GiB)": 369.4, "step": 13825, "train_speed(iter/s)": 0.202419 }, { "acc": 0.7590518, "epoch": 0.3508371385083714, "grad_norm": 2.015625, "learning_rate": 9.576082147287858e-06, "loss": 0.97913313, "memory(GiB)": 369.4, "step": 13830, "train_speed(iter/s)": 0.202427 }, { "acc": 0.74095955, "epoch": 0.35096397767630644, "grad_norm": 2.296875, "learning_rate": 9.575659490489058e-06, "loss": 0.98642092, "memory(GiB)": 369.4, "step": 13835, "train_speed(iter/s)": 0.202437 }, { "acc": 0.7447268, "epoch": 0.3510908168442415, "grad_norm": 2.0625, "learning_rate": 9.575236632432325e-06, "loss": 1.01597519, "memory(GiB)": 369.4, "step": 13840, "train_speed(iter/s)": 0.20245 }, { "acc": 0.74449458, "epoch": 0.3512176560121766, "grad_norm": 2.390625, "learning_rate": 9.574813573136259e-06, "loss": 1.04744015, "memory(GiB)": 369.4, "step": 13845, "train_speed(iter/s)": 0.20246 }, { "acc": 0.74766526, "epoch": 0.3513444951801116, "grad_norm": 1.8203125, "learning_rate": 9.574390312619466e-06, "loss": 1.03104439, "memory(GiB)": 369.4, "step": 13850, "train_speed(iter/s)": 0.202471 }, { "acc": 0.74337988, "epoch": 0.35147133434804667, "grad_norm": 2.03125, "learning_rate": 9.573966850900565e-06, "loss": 1.0286727, "memory(GiB)": 369.4, "step": 13855, "train_speed(iter/s)": 0.202474 }, { "acc": 0.7414609, "epoch": 0.3515981735159817, "grad_norm": 2.25, "learning_rate": 9.57354318799818e-06, "loss": 1.02350521, "memory(GiB)": 369.4, "step": 13860, "train_speed(iter/s)": 0.202483 }, { "acc": 0.7473793, "epoch": 0.3517250126839168, "grad_norm": 2.109375, "learning_rate": 9.573119323930946e-06, "loss": 1.0094099, "memory(GiB)": 369.4, "step": 13865, "train_speed(iter/s)": 0.202494 }, { "acc": 0.74529924, "epoch": 0.35185185185185186, "grad_norm": 1.9453125, "learning_rate": 9.572695258717507e-06, "loss": 1.00639534, "memory(GiB)": 369.4, "step": 13870, "train_speed(iter/s)": 0.202505 }, { "acc": 0.74200144, "epoch": 0.3519786910197869, "grad_norm": 2.015625, "learning_rate": 9.572270992376513e-06, "loss": 1.02907066, "memory(GiB)": 369.4, "step": 13875, "train_speed(iter/s)": 0.202515 }, { "acc": 0.74360619, "epoch": 0.35210553018772195, "grad_norm": 1.890625, "learning_rate": 9.571846524926629e-06, "loss": 1.0315979, "memory(GiB)": 369.4, "step": 13880, "train_speed(iter/s)": 0.202528 }, { "acc": 0.75074358, "epoch": 0.35223236935565705, "grad_norm": 2.0, "learning_rate": 9.571421856386522e-06, "loss": 1.02857361, "memory(GiB)": 369.4, "step": 13885, "train_speed(iter/s)": 0.202542 }, { "acc": 0.74177809, "epoch": 0.3523592085235921, "grad_norm": 2.125, "learning_rate": 9.570996986774872e-06, "loss": 1.02950888, "memory(GiB)": 369.4, "step": 13890, "train_speed(iter/s)": 0.202548 }, { "acc": 0.74159431, "epoch": 0.35248604769152714, "grad_norm": 2.59375, "learning_rate": 9.570571916110366e-06, "loss": 1.05921125, "memory(GiB)": 369.4, "step": 13895, "train_speed(iter/s)": 0.202548 }, { "acc": 0.74955368, "epoch": 0.3526128868594622, "grad_norm": 2.046875, "learning_rate": 9.570146644411705e-06, "loss": 1.022188, "memory(GiB)": 369.4, "step": 13900, "train_speed(iter/s)": 0.202557 }, { "acc": 0.74570785, "epoch": 0.3527397260273973, "grad_norm": 2.328125, "learning_rate": 9.569721171697587e-06, "loss": 1.03951693, "memory(GiB)": 369.4, "step": 13905, "train_speed(iter/s)": 0.202568 }, { "acc": 0.7388731, "epoch": 0.3528665651953323, "grad_norm": 1.9453125, "learning_rate": 9.569295497986727e-06, "loss": 1.07981892, "memory(GiB)": 369.4, "step": 13910, "train_speed(iter/s)": 0.202582 }, { "acc": 0.76057234, "epoch": 0.35299340436326737, "grad_norm": 2.375, "learning_rate": 9.568869623297855e-06, "loss": 1.00417595, "memory(GiB)": 369.4, "step": 13915, "train_speed(iter/s)": 0.202595 }, { "acc": 0.72809973, "epoch": 0.3531202435312024, "grad_norm": 2.03125, "learning_rate": 9.568443547649697e-06, "loss": 1.1235487, "memory(GiB)": 369.4, "step": 13920, "train_speed(iter/s)": 0.202605 }, { "acc": 0.74137411, "epoch": 0.3532470826991375, "grad_norm": 2.28125, "learning_rate": 9.568017271060994e-06, "loss": 1.05530968, "memory(GiB)": 369.4, "step": 13925, "train_speed(iter/s)": 0.202609 }, { "acc": 0.7449789, "epoch": 0.35337392186707256, "grad_norm": 2.8125, "learning_rate": 9.567590793550498e-06, "loss": 1.0019309, "memory(GiB)": 369.4, "step": 13930, "train_speed(iter/s)": 0.202614 }, { "acc": 0.74072003, "epoch": 0.3535007610350076, "grad_norm": 2.28125, "learning_rate": 9.567164115136965e-06, "loss": 1.06714115, "memory(GiB)": 369.4, "step": 13935, "train_speed(iter/s)": 0.202613 }, { "acc": 0.75090628, "epoch": 0.35362760020294265, "grad_norm": 2.078125, "learning_rate": 9.566737235839166e-06, "loss": 1.04270802, "memory(GiB)": 369.4, "step": 13940, "train_speed(iter/s)": 0.202626 }, { "acc": 0.73601599, "epoch": 0.35375443937087775, "grad_norm": 2.328125, "learning_rate": 9.566310155675871e-06, "loss": 1.05982571, "memory(GiB)": 369.4, "step": 13945, "train_speed(iter/s)": 0.202635 }, { "acc": 0.7323, "epoch": 0.3538812785388128, "grad_norm": 1.6171875, "learning_rate": 9.56588287466587e-06, "loss": 1.00480576, "memory(GiB)": 369.4, "step": 13950, "train_speed(iter/s)": 0.202643 }, { "acc": 0.75257902, "epoch": 0.35400811770674784, "grad_norm": 2.203125, "learning_rate": 9.565455392827954e-06, "loss": 0.99866409, "memory(GiB)": 369.4, "step": 13955, "train_speed(iter/s)": 0.20265 }, { "acc": 0.73584414, "epoch": 0.3541349568746829, "grad_norm": 2.109375, "learning_rate": 9.565027710180927e-06, "loss": 1.04326572, "memory(GiB)": 369.4, "step": 13960, "train_speed(iter/s)": 0.202661 }, { "acc": 0.75814829, "epoch": 0.354261796042618, "grad_norm": 2.234375, "learning_rate": 9.5645998267436e-06, "loss": 0.93887873, "memory(GiB)": 369.4, "step": 13965, "train_speed(iter/s)": 0.202674 }, { "acc": 0.73026676, "epoch": 0.354388635210553, "grad_norm": 1.9140625, "learning_rate": 9.564171742534794e-06, "loss": 1.04192371, "memory(GiB)": 369.4, "step": 13970, "train_speed(iter/s)": 0.202685 }, { "acc": 0.73772216, "epoch": 0.35451547437848807, "grad_norm": 2.15625, "learning_rate": 9.563743457573336e-06, "loss": 1.05043812, "memory(GiB)": 369.4, "step": 13975, "train_speed(iter/s)": 0.20269 }, { "acc": 0.73182058, "epoch": 0.3546423135464231, "grad_norm": 2.015625, "learning_rate": 9.563314971878065e-06, "loss": 1.07031994, "memory(GiB)": 369.4, "step": 13980, "train_speed(iter/s)": 0.202694 }, { "acc": 0.73993721, "epoch": 0.3547691527143582, "grad_norm": 2.140625, "learning_rate": 9.562886285467828e-06, "loss": 1.02791004, "memory(GiB)": 369.4, "step": 13985, "train_speed(iter/s)": 0.202703 }, { "acc": 0.73891277, "epoch": 0.35489599188229326, "grad_norm": 2.359375, "learning_rate": 9.56245739836148e-06, "loss": 1.05129509, "memory(GiB)": 369.4, "step": 13990, "train_speed(iter/s)": 0.202714 }, { "acc": 0.74101319, "epoch": 0.3550228310502283, "grad_norm": 2.0625, "learning_rate": 9.562028310577887e-06, "loss": 1.01268616, "memory(GiB)": 369.4, "step": 13995, "train_speed(iter/s)": 0.20272 }, { "acc": 0.74057713, "epoch": 0.35514967021816335, "grad_norm": 2.234375, "learning_rate": 9.56159902213592e-06, "loss": 1.0660408, "memory(GiB)": 369.4, "step": 14000, "train_speed(iter/s)": 0.202734 }, { "epoch": 0.35514967021816335, "eval_acc": 0.7323933011847463, "eval_loss": 0.9985984563827515, "eval_runtime": 384.5743, "eval_samples_per_second": 16.564, "eval_steps_per_second": 8.282, "step": 14000 }, { "acc": 0.73818455, "epoch": 0.35527650938609845, "grad_norm": 2.3125, "learning_rate": 9.561169533054462e-06, "loss": 1.01044369, "memory(GiB)": 369.4, "step": 14005, "train_speed(iter/s)": 0.200682 }, { "acc": 0.74317031, "epoch": 0.3554033485540335, "grad_norm": 2.75, "learning_rate": 9.560739843352404e-06, "loss": 1.08822803, "memory(GiB)": 369.4, "step": 14010, "train_speed(iter/s)": 0.200692 }, { "acc": 0.73619146, "epoch": 0.35553018772196854, "grad_norm": 2.21875, "learning_rate": 9.560309953048645e-06, "loss": 1.06717033, "memory(GiB)": 369.4, "step": 14015, "train_speed(iter/s)": 0.200702 }, { "acc": 0.73251944, "epoch": 0.3556570268899036, "grad_norm": 1.9765625, "learning_rate": 9.559879862162095e-06, "loss": 1.05906296, "memory(GiB)": 369.4, "step": 14020, "train_speed(iter/s)": 0.200709 }, { "acc": 0.73263211, "epoch": 0.3557838660578387, "grad_norm": 2.25, "learning_rate": 9.55944957071167e-06, "loss": 1.10226927, "memory(GiB)": 369.4, "step": 14025, "train_speed(iter/s)": 0.200721 }, { "acc": 0.73939514, "epoch": 0.3559107052257737, "grad_norm": 2.375, "learning_rate": 9.559019078716295e-06, "loss": 1.04430408, "memory(GiB)": 369.4, "step": 14030, "train_speed(iter/s)": 0.200738 }, { "acc": 0.73703456, "epoch": 0.35603754439370877, "grad_norm": 2.5625, "learning_rate": 9.558588386194907e-06, "loss": 1.10308151, "memory(GiB)": 369.4, "step": 14035, "train_speed(iter/s)": 0.20075 }, { "acc": 0.74117966, "epoch": 0.3561643835616438, "grad_norm": 2.609375, "learning_rate": 9.55815749316645e-06, "loss": 0.9923667, "memory(GiB)": 369.4, "step": 14040, "train_speed(iter/s)": 0.200757 }, { "acc": 0.73477683, "epoch": 0.3562912227295789, "grad_norm": 2.09375, "learning_rate": 9.557726399649875e-06, "loss": 1.04405212, "memory(GiB)": 369.4, "step": 14045, "train_speed(iter/s)": 0.200768 }, { "acc": 0.74282084, "epoch": 0.35641806189751396, "grad_norm": 2.375, "learning_rate": 9.557295105664144e-06, "loss": 1.03417664, "memory(GiB)": 369.4, "step": 14050, "train_speed(iter/s)": 0.20078 }, { "acc": 0.73888311, "epoch": 0.356544901065449, "grad_norm": 2.265625, "learning_rate": 9.556863611228228e-06, "loss": 1.06092033, "memory(GiB)": 369.4, "step": 14055, "train_speed(iter/s)": 0.200781 }, { "acc": 0.74323807, "epoch": 0.35667174023338405, "grad_norm": 2.328125, "learning_rate": 9.556431916361105e-06, "loss": 1.02444839, "memory(GiB)": 369.4, "step": 14060, "train_speed(iter/s)": 0.200791 }, { "acc": 0.73502054, "epoch": 0.35679857940131915, "grad_norm": 2.234375, "learning_rate": 9.556000021081764e-06, "loss": 1.01972399, "memory(GiB)": 369.4, "step": 14065, "train_speed(iter/s)": 0.200801 }, { "acc": 0.75871429, "epoch": 0.3569254185692542, "grad_norm": 2.375, "learning_rate": 9.5555679254092e-06, "loss": 0.96709805, "memory(GiB)": 369.4, "step": 14070, "train_speed(iter/s)": 0.200811 }, { "acc": 0.73584342, "epoch": 0.35705225773718924, "grad_norm": 2.40625, "learning_rate": 9.55513562936242e-06, "loss": 1.09474773, "memory(GiB)": 369.4, "step": 14075, "train_speed(iter/s)": 0.200818 }, { "acc": 0.72646456, "epoch": 0.3571790969051243, "grad_norm": 2.515625, "learning_rate": 9.554703132960437e-06, "loss": 1.12751274, "memory(GiB)": 369.4, "step": 14080, "train_speed(iter/s)": 0.200824 }, { "acc": 0.75922933, "epoch": 0.3573059360730594, "grad_norm": 1.859375, "learning_rate": 9.554270436222277e-06, "loss": 0.94721899, "memory(GiB)": 369.4, "step": 14085, "train_speed(iter/s)": 0.200828 }, { "acc": 0.74219971, "epoch": 0.3574327752409944, "grad_norm": 2.1875, "learning_rate": 9.553837539166969e-06, "loss": 1.02442408, "memory(GiB)": 369.4, "step": 14090, "train_speed(iter/s)": 0.200841 }, { "acc": 0.73343382, "epoch": 0.35755961440892947, "grad_norm": 2.203125, "learning_rate": 9.553404441813554e-06, "loss": 1.06475163, "memory(GiB)": 369.4, "step": 14095, "train_speed(iter/s)": 0.200847 }, { "acc": 0.74411707, "epoch": 0.3576864535768645, "grad_norm": 2.90625, "learning_rate": 9.552971144181083e-06, "loss": 1.01084414, "memory(GiB)": 369.4, "step": 14100, "train_speed(iter/s)": 0.200853 }, { "acc": 0.74530263, "epoch": 0.3578132927447996, "grad_norm": 2.015625, "learning_rate": 9.552537646288612e-06, "loss": 1.03773785, "memory(GiB)": 369.4, "step": 14105, "train_speed(iter/s)": 0.200864 }, { "acc": 0.75726337, "epoch": 0.35794013191273466, "grad_norm": 2.21875, "learning_rate": 9.552103948155211e-06, "loss": 1.00344906, "memory(GiB)": 369.4, "step": 14110, "train_speed(iter/s)": 0.200878 }, { "acc": 0.7582942, "epoch": 0.3580669710806697, "grad_norm": 2.609375, "learning_rate": 9.551670049799954e-06, "loss": 1.01588163, "memory(GiB)": 369.4, "step": 14115, "train_speed(iter/s)": 0.200891 }, { "acc": 0.73274965, "epoch": 0.35819381024860475, "grad_norm": 2.09375, "learning_rate": 9.551235951241927e-06, "loss": 1.0163969, "memory(GiB)": 369.4, "step": 14120, "train_speed(iter/s)": 0.2009 }, { "acc": 0.73030577, "epoch": 0.35832064941653985, "grad_norm": 2.25, "learning_rate": 9.550801652500223e-06, "loss": 1.1110363, "memory(GiB)": 369.4, "step": 14125, "train_speed(iter/s)": 0.200906 }, { "acc": 0.74929762, "epoch": 0.3584474885844749, "grad_norm": 2.3125, "learning_rate": 9.550367153593944e-06, "loss": 1.02229414, "memory(GiB)": 369.4, "step": 14130, "train_speed(iter/s)": 0.200916 }, { "acc": 0.74360495, "epoch": 0.35857432775240994, "grad_norm": 1.9765625, "learning_rate": 9.549932454542202e-06, "loss": 1.02093182, "memory(GiB)": 369.4, "step": 14135, "train_speed(iter/s)": 0.200922 }, { "acc": 0.75655112, "epoch": 0.358701166920345, "grad_norm": 3.265625, "learning_rate": 9.549497555364115e-06, "loss": 0.9632, "memory(GiB)": 369.4, "step": 14140, "train_speed(iter/s)": 0.200938 }, { "acc": 0.7405612, "epoch": 0.3588280060882801, "grad_norm": 2.734375, "learning_rate": 9.549062456078816e-06, "loss": 1.01715889, "memory(GiB)": 369.4, "step": 14145, "train_speed(iter/s)": 0.200945 }, { "acc": 0.74265647, "epoch": 0.3589548452562151, "grad_norm": 2.09375, "learning_rate": 9.54862715670544e-06, "loss": 1.00363026, "memory(GiB)": 369.4, "step": 14150, "train_speed(iter/s)": 0.200956 }, { "acc": 0.74384232, "epoch": 0.35908168442415017, "grad_norm": 1.9453125, "learning_rate": 9.548191657263132e-06, "loss": 1.03920689, "memory(GiB)": 369.4, "step": 14155, "train_speed(iter/s)": 0.200972 }, { "acc": 0.74988689, "epoch": 0.3592085235920852, "grad_norm": 2.09375, "learning_rate": 9.547755957771049e-06, "loss": 1.00422535, "memory(GiB)": 369.4, "step": 14160, "train_speed(iter/s)": 0.20098 }, { "acc": 0.74552178, "epoch": 0.3593353627600203, "grad_norm": 2.0, "learning_rate": 9.547320058248356e-06, "loss": 1.00951786, "memory(GiB)": 369.4, "step": 14165, "train_speed(iter/s)": 0.200992 }, { "acc": 0.7501658, "epoch": 0.35946220192795536, "grad_norm": 2.390625, "learning_rate": 9.546883958714223e-06, "loss": 1.05314827, "memory(GiB)": 369.4, "step": 14170, "train_speed(iter/s)": 0.201007 }, { "acc": 0.71910157, "epoch": 0.3595890410958904, "grad_norm": 2.140625, "learning_rate": 9.546447659187834e-06, "loss": 1.09013319, "memory(GiB)": 369.4, "step": 14175, "train_speed(iter/s)": 0.201017 }, { "acc": 0.75165892, "epoch": 0.35971588026382545, "grad_norm": 1.9921875, "learning_rate": 9.546011159688377e-06, "loss": 0.99663124, "memory(GiB)": 369.4, "step": 14180, "train_speed(iter/s)": 0.201028 }, { "acc": 0.74156194, "epoch": 0.35984271943176055, "grad_norm": 2.265625, "learning_rate": 9.545574460235055e-06, "loss": 1.06377983, "memory(GiB)": 369.4, "step": 14185, "train_speed(iter/s)": 0.201038 }, { "acc": 0.74745378, "epoch": 0.3599695585996956, "grad_norm": 1.96875, "learning_rate": 9.545137560847071e-06, "loss": 0.97367191, "memory(GiB)": 369.4, "step": 14190, "train_speed(iter/s)": 0.201049 }, { "acc": 0.73918657, "epoch": 0.36009639776763064, "grad_norm": 2.28125, "learning_rate": 9.544700461543647e-06, "loss": 1.02213669, "memory(GiB)": 369.4, "step": 14195, "train_speed(iter/s)": 0.201065 }, { "acc": 0.75479889, "epoch": 0.3602232369355657, "grad_norm": 1.9921875, "learning_rate": 9.544263162344005e-06, "loss": 1.00856838, "memory(GiB)": 369.4, "step": 14200, "train_speed(iter/s)": 0.201077 }, { "acc": 0.73618155, "epoch": 0.3603500761035008, "grad_norm": 2.171875, "learning_rate": 9.54382566326738e-06, "loss": 1.07212257, "memory(GiB)": 369.4, "step": 14205, "train_speed(iter/s)": 0.201088 }, { "acc": 0.74173708, "epoch": 0.3604769152714358, "grad_norm": 2.03125, "learning_rate": 9.543387964333018e-06, "loss": 1.02214394, "memory(GiB)": 369.4, "step": 14210, "train_speed(iter/s)": 0.201098 }, { "acc": 0.72487602, "epoch": 0.36060375443937087, "grad_norm": 2.3125, "learning_rate": 9.542950065560165e-06, "loss": 1.04494677, "memory(GiB)": 369.4, "step": 14215, "train_speed(iter/s)": 0.201111 }, { "acc": 0.72783575, "epoch": 0.3607305936073059, "grad_norm": 2.0625, "learning_rate": 9.542511966968087e-06, "loss": 1.10202599, "memory(GiB)": 369.4, "step": 14220, "train_speed(iter/s)": 0.201121 }, { "acc": 0.7541924, "epoch": 0.360857432775241, "grad_norm": 2.234375, "learning_rate": 9.542073668576052e-06, "loss": 0.96531935, "memory(GiB)": 369.4, "step": 14225, "train_speed(iter/s)": 0.201132 }, { "acc": 0.7508503, "epoch": 0.36098427194317606, "grad_norm": 1.8203125, "learning_rate": 9.541635170403338e-06, "loss": 0.98727207, "memory(GiB)": 369.4, "step": 14230, "train_speed(iter/s)": 0.201142 }, { "acc": 0.73835993, "epoch": 0.3611111111111111, "grad_norm": 1.984375, "learning_rate": 9.541196472469234e-06, "loss": 1.00003319, "memory(GiB)": 369.4, "step": 14235, "train_speed(iter/s)": 0.201143 }, { "acc": 0.73580217, "epoch": 0.36123795027904615, "grad_norm": 2.59375, "learning_rate": 9.540757574793032e-06, "loss": 1.05364513, "memory(GiB)": 369.4, "step": 14240, "train_speed(iter/s)": 0.201155 }, { "acc": 0.73376083, "epoch": 0.36136478944698125, "grad_norm": 2.09375, "learning_rate": 9.540318477394039e-06, "loss": 1.04122934, "memory(GiB)": 369.4, "step": 14245, "train_speed(iter/s)": 0.20117 }, { "acc": 0.75930309, "epoch": 0.3614916286149163, "grad_norm": 1.8671875, "learning_rate": 9.539879180291568e-06, "loss": 0.98416224, "memory(GiB)": 369.4, "step": 14250, "train_speed(iter/s)": 0.201179 }, { "acc": 0.74496117, "epoch": 0.36161846778285134, "grad_norm": 1.9921875, "learning_rate": 9.539439683504943e-06, "loss": 1.03717861, "memory(GiB)": 369.4, "step": 14255, "train_speed(iter/s)": 0.201189 }, { "acc": 0.72893724, "epoch": 0.3617453069507864, "grad_norm": 2.21875, "learning_rate": 9.538999987053492e-06, "loss": 1.13467064, "memory(GiB)": 369.4, "step": 14260, "train_speed(iter/s)": 0.201205 }, { "acc": 0.73742981, "epoch": 0.3618721461187215, "grad_norm": 2.0, "learning_rate": 9.538560090956557e-06, "loss": 1.05232353, "memory(GiB)": 369.4, "step": 14265, "train_speed(iter/s)": 0.201214 }, { "acc": 0.74828606, "epoch": 0.3619989852866565, "grad_norm": 2.234375, "learning_rate": 9.538119995233485e-06, "loss": 1.00494871, "memory(GiB)": 369.4, "step": 14270, "train_speed(iter/s)": 0.201224 }, { "acc": 0.750243, "epoch": 0.36212582445459157, "grad_norm": 1.90625, "learning_rate": 9.537679699903637e-06, "loss": 1.02329502, "memory(GiB)": 369.4, "step": 14275, "train_speed(iter/s)": 0.201232 }, { "acc": 0.75626135, "epoch": 0.3622526636225266, "grad_norm": 2.484375, "learning_rate": 9.537239204986375e-06, "loss": 0.94071226, "memory(GiB)": 369.4, "step": 14280, "train_speed(iter/s)": 0.201244 }, { "acc": 0.72977147, "epoch": 0.3623795027904617, "grad_norm": 2.078125, "learning_rate": 9.536798510501075e-06, "loss": 1.02994366, "memory(GiB)": 369.4, "step": 14285, "train_speed(iter/s)": 0.201256 }, { "acc": 0.75197048, "epoch": 0.36250634195839676, "grad_norm": 2.859375, "learning_rate": 9.536357616467123e-06, "loss": 1.01974401, "memory(GiB)": 369.4, "step": 14290, "train_speed(iter/s)": 0.201272 }, { "acc": 0.74944792, "epoch": 0.3626331811263318, "grad_norm": 2.0, "learning_rate": 9.535916522903908e-06, "loss": 0.98737488, "memory(GiB)": 369.4, "step": 14295, "train_speed(iter/s)": 0.201282 }, { "acc": 0.72799129, "epoch": 0.36276002029426685, "grad_norm": 2.640625, "learning_rate": 9.535475229830832e-06, "loss": 1.09364243, "memory(GiB)": 369.4, "step": 14300, "train_speed(iter/s)": 0.20129 }, { "acc": 0.74179373, "epoch": 0.36288685946220195, "grad_norm": 1.9921875, "learning_rate": 9.535033737267308e-06, "loss": 0.98539066, "memory(GiB)": 369.4, "step": 14305, "train_speed(iter/s)": 0.2013 }, { "acc": 0.74609623, "epoch": 0.363013698630137, "grad_norm": 2.234375, "learning_rate": 9.534592045232752e-06, "loss": 0.95813255, "memory(GiB)": 369.4, "step": 14310, "train_speed(iter/s)": 0.201306 }, { "acc": 0.73006315, "epoch": 0.36314053779807204, "grad_norm": 2.375, "learning_rate": 9.534150153746591e-06, "loss": 1.12533321, "memory(GiB)": 369.4, "step": 14315, "train_speed(iter/s)": 0.20131 }, { "acc": 0.73643513, "epoch": 0.3632673769660071, "grad_norm": 2.25, "learning_rate": 9.533708062828264e-06, "loss": 1.03341484, "memory(GiB)": 369.4, "step": 14320, "train_speed(iter/s)": 0.20132 }, { "acc": 0.73333588, "epoch": 0.3633942161339422, "grad_norm": 2.109375, "learning_rate": 9.533265772497216e-06, "loss": 1.0990634, "memory(GiB)": 369.4, "step": 14325, "train_speed(iter/s)": 0.201331 }, { "acc": 0.74965091, "epoch": 0.3635210553018772, "grad_norm": 2.296875, "learning_rate": 9.532823282772899e-06, "loss": 1.0032299, "memory(GiB)": 369.4, "step": 14330, "train_speed(iter/s)": 0.201346 }, { "acc": 0.73922729, "epoch": 0.36364789446981227, "grad_norm": 2.34375, "learning_rate": 9.532380593674775e-06, "loss": 1.0399559, "memory(GiB)": 369.4, "step": 14335, "train_speed(iter/s)": 0.201357 }, { "acc": 0.73947392, "epoch": 0.3637747336377473, "grad_norm": 2.71875, "learning_rate": 9.531937705222319e-06, "loss": 1.05637264, "memory(GiB)": 369.4, "step": 14340, "train_speed(iter/s)": 0.20137 }, { "acc": 0.73396211, "epoch": 0.3639015728056824, "grad_norm": 2.03125, "learning_rate": 9.531494617435006e-06, "loss": 1.02552814, "memory(GiB)": 369.4, "step": 14345, "train_speed(iter/s)": 0.20138 }, { "acc": 0.74489183, "epoch": 0.36402841197361746, "grad_norm": 2.96875, "learning_rate": 9.531051330332331e-06, "loss": 1.00613842, "memory(GiB)": 369.4, "step": 14350, "train_speed(iter/s)": 0.201392 }, { "acc": 0.72988186, "epoch": 0.3641552511415525, "grad_norm": 2.125, "learning_rate": 9.530607843933788e-06, "loss": 1.05310249, "memory(GiB)": 369.4, "step": 14355, "train_speed(iter/s)": 0.201402 }, { "acc": 0.74456186, "epoch": 0.36428209030948755, "grad_norm": 2.09375, "learning_rate": 9.530164158258883e-06, "loss": 0.98942719, "memory(GiB)": 369.4, "step": 14360, "train_speed(iter/s)": 0.201412 }, { "acc": 0.7545557, "epoch": 0.36440892947742265, "grad_norm": 2.265625, "learning_rate": 9.529720273327135e-06, "loss": 1.00433416, "memory(GiB)": 369.4, "step": 14365, "train_speed(iter/s)": 0.201424 }, { "acc": 0.74242706, "epoch": 0.3645357686453577, "grad_norm": 2.15625, "learning_rate": 9.529276189158063e-06, "loss": 1.07028465, "memory(GiB)": 369.4, "step": 14370, "train_speed(iter/s)": 0.201437 }, { "acc": 0.74715567, "epoch": 0.36466260781329274, "grad_norm": 1.96875, "learning_rate": 9.528831905771205e-06, "loss": 1.02060595, "memory(GiB)": 369.4, "step": 14375, "train_speed(iter/s)": 0.20144 }, { "acc": 0.74189148, "epoch": 0.3647894469812278, "grad_norm": 2.3125, "learning_rate": 9.528387423186098e-06, "loss": 1.0425271, "memory(GiB)": 369.4, "step": 14380, "train_speed(iter/s)": 0.201448 }, { "acc": 0.74100676, "epoch": 0.3649162861491629, "grad_norm": 2.03125, "learning_rate": 9.527942741422297e-06, "loss": 1.04477606, "memory(GiB)": 369.4, "step": 14385, "train_speed(iter/s)": 0.201462 }, { "acc": 0.73231592, "epoch": 0.3650431253170979, "grad_norm": 2.296875, "learning_rate": 9.527497860499355e-06, "loss": 1.12373028, "memory(GiB)": 369.4, "step": 14390, "train_speed(iter/s)": 0.201469 }, { "acc": 0.75220747, "epoch": 0.36516996448503297, "grad_norm": 2.34375, "learning_rate": 9.527052780436845e-06, "loss": 0.93231001, "memory(GiB)": 369.4, "step": 14395, "train_speed(iter/s)": 0.201482 }, { "acc": 0.73961253, "epoch": 0.365296803652968, "grad_norm": 2.140625, "learning_rate": 9.52660750125434e-06, "loss": 1.02449236, "memory(GiB)": 369.4, "step": 14400, "train_speed(iter/s)": 0.201488 }, { "acc": 0.7368598, "epoch": 0.3654236428209031, "grad_norm": 1.9296875, "learning_rate": 9.52616202297143e-06, "loss": 1.00447636, "memory(GiB)": 369.4, "step": 14405, "train_speed(iter/s)": 0.2015 }, { "acc": 0.73518996, "epoch": 0.36555048198883816, "grad_norm": 1.875, "learning_rate": 9.525716345607706e-06, "loss": 1.03538599, "memory(GiB)": 369.4, "step": 14410, "train_speed(iter/s)": 0.201509 }, { "acc": 0.72927108, "epoch": 0.3656773211567732, "grad_norm": 2.0625, "learning_rate": 9.52527046918277e-06, "loss": 1.08582859, "memory(GiB)": 369.4, "step": 14415, "train_speed(iter/s)": 0.201521 }, { "acc": 0.73019118, "epoch": 0.36580416032470825, "grad_norm": 1.9375, "learning_rate": 9.524824393716235e-06, "loss": 1.07680292, "memory(GiB)": 369.4, "step": 14420, "train_speed(iter/s)": 0.201528 }, { "acc": 0.74114523, "epoch": 0.36593099949264335, "grad_norm": 2.234375, "learning_rate": 9.524378119227722e-06, "loss": 1.01264343, "memory(GiB)": 369.4, "step": 14425, "train_speed(iter/s)": 0.20154 }, { "acc": 0.74838052, "epoch": 0.3660578386605784, "grad_norm": 2.296875, "learning_rate": 9.523931645736858e-06, "loss": 1.04928226, "memory(GiB)": 369.4, "step": 14430, "train_speed(iter/s)": 0.201553 }, { "acc": 0.74733205, "epoch": 0.36618467782851344, "grad_norm": 2.03125, "learning_rate": 9.523484973263283e-06, "loss": 1.03697338, "memory(GiB)": 369.4, "step": 14435, "train_speed(iter/s)": 0.201563 }, { "acc": 0.7550149, "epoch": 0.3663115169964485, "grad_norm": 2.59375, "learning_rate": 9.523038101826644e-06, "loss": 0.99836464, "memory(GiB)": 369.4, "step": 14440, "train_speed(iter/s)": 0.201577 }, { "acc": 0.72722521, "epoch": 0.3664383561643836, "grad_norm": 2.34375, "learning_rate": 9.522591031446596e-06, "loss": 1.10204372, "memory(GiB)": 369.4, "step": 14445, "train_speed(iter/s)": 0.201588 }, { "acc": 0.74870005, "epoch": 0.3665651953323186, "grad_norm": 2.265625, "learning_rate": 9.522143762142801e-06, "loss": 0.99085732, "memory(GiB)": 369.4, "step": 14450, "train_speed(iter/s)": 0.201596 }, { "acc": 0.76188526, "epoch": 0.36669203450025367, "grad_norm": 2.234375, "learning_rate": 9.521696293934934e-06, "loss": 0.98427639, "memory(GiB)": 369.4, "step": 14455, "train_speed(iter/s)": 0.201606 }, { "acc": 0.74202967, "epoch": 0.3668188736681887, "grad_norm": 1.9921875, "learning_rate": 9.521248626842676e-06, "loss": 1.05902348, "memory(GiB)": 369.4, "step": 14460, "train_speed(iter/s)": 0.201617 }, { "acc": 0.74257498, "epoch": 0.3669457128361238, "grad_norm": 2.03125, "learning_rate": 9.520800760885716e-06, "loss": 1.05372276, "memory(GiB)": 369.4, "step": 14465, "train_speed(iter/s)": 0.201628 }, { "acc": 0.74055128, "epoch": 0.36707255200405886, "grad_norm": 2.15625, "learning_rate": 9.520352696083756e-06, "loss": 0.98856602, "memory(GiB)": 369.4, "step": 14470, "train_speed(iter/s)": 0.201634 }, { "acc": 0.74912634, "epoch": 0.3671993911719939, "grad_norm": 2.25, "learning_rate": 9.519904432456504e-06, "loss": 0.95653648, "memory(GiB)": 369.4, "step": 14475, "train_speed(iter/s)": 0.201648 }, { "acc": 0.75407143, "epoch": 0.36732623033992895, "grad_norm": 2.15625, "learning_rate": 9.519455970023672e-06, "loss": 0.9573801, "memory(GiB)": 369.4, "step": 14480, "train_speed(iter/s)": 0.201661 }, { "acc": 0.74979019, "epoch": 0.36745306950786405, "grad_norm": 2.171875, "learning_rate": 9.519007308804991e-06, "loss": 1.00977631, "memory(GiB)": 369.4, "step": 14485, "train_speed(iter/s)": 0.201669 }, { "acc": 0.73744392, "epoch": 0.3675799086757991, "grad_norm": 1.8359375, "learning_rate": 9.518558448820193e-06, "loss": 0.99327888, "memory(GiB)": 369.4, "step": 14490, "train_speed(iter/s)": 0.201678 }, { "acc": 0.74625988, "epoch": 0.36770674784373414, "grad_norm": 2.453125, "learning_rate": 9.518109390089017e-06, "loss": 1.02008495, "memory(GiB)": 369.4, "step": 14495, "train_speed(iter/s)": 0.201685 }, { "acc": 0.73730211, "epoch": 0.3678335870116692, "grad_norm": 2.90625, "learning_rate": 9.517660132631222e-06, "loss": 1.07416534, "memory(GiB)": 369.4, "step": 14500, "train_speed(iter/s)": 0.201692 }, { "acc": 0.76135211, "epoch": 0.3679604261796043, "grad_norm": 2.046875, "learning_rate": 9.517210676466561e-06, "loss": 0.99493942, "memory(GiB)": 369.4, "step": 14505, "train_speed(iter/s)": 0.201685 }, { "acc": 0.74237452, "epoch": 0.3680872653475393, "grad_norm": 1.7265625, "learning_rate": 9.516761021614809e-06, "loss": 0.99398794, "memory(GiB)": 369.4, "step": 14510, "train_speed(iter/s)": 0.201692 }, { "acc": 0.74953198, "epoch": 0.36821410451547437, "grad_norm": 1.9453125, "learning_rate": 9.51631116809574e-06, "loss": 1.01647348, "memory(GiB)": 369.4, "step": 14515, "train_speed(iter/s)": 0.201701 }, { "acc": 0.73651304, "epoch": 0.3683409436834094, "grad_norm": 2.234375, "learning_rate": 9.515861115929144e-06, "loss": 1.10210743, "memory(GiB)": 369.4, "step": 14520, "train_speed(iter/s)": 0.201713 }, { "acc": 0.73297358, "epoch": 0.3684677828513445, "grad_norm": 3.015625, "learning_rate": 9.515410865134812e-06, "loss": 1.04865475, "memory(GiB)": 369.4, "step": 14525, "train_speed(iter/s)": 0.20172 }, { "acc": 0.75688152, "epoch": 0.36859462201927956, "grad_norm": 2.34375, "learning_rate": 9.514960415732551e-06, "loss": 0.96310749, "memory(GiB)": 369.4, "step": 14530, "train_speed(iter/s)": 0.201732 }, { "acc": 0.74203186, "epoch": 0.3687214611872146, "grad_norm": 2.53125, "learning_rate": 9.514509767742172e-06, "loss": 1.06866608, "memory(GiB)": 369.4, "step": 14535, "train_speed(iter/s)": 0.201746 }, { "acc": 0.74502988, "epoch": 0.36884830035514965, "grad_norm": 2.890625, "learning_rate": 9.5140589211835e-06, "loss": 1.02396965, "memory(GiB)": 369.4, "step": 14540, "train_speed(iter/s)": 0.201759 }, { "acc": 0.74773035, "epoch": 0.36897513952308475, "grad_norm": 2.375, "learning_rate": 9.513607876076363e-06, "loss": 1.08648853, "memory(GiB)": 369.4, "step": 14545, "train_speed(iter/s)": 0.201771 }, { "acc": 0.75011826, "epoch": 0.3691019786910198, "grad_norm": 2.78125, "learning_rate": 9.513156632440598e-06, "loss": 0.99457474, "memory(GiB)": 369.4, "step": 14550, "train_speed(iter/s)": 0.201785 }, { "acc": 0.74232049, "epoch": 0.36922881785895484, "grad_norm": 1.9765625, "learning_rate": 9.512705190296055e-06, "loss": 1.04345036, "memory(GiB)": 369.4, "step": 14555, "train_speed(iter/s)": 0.201792 }, { "acc": 0.74815087, "epoch": 0.3693556570268899, "grad_norm": 2.0625, "learning_rate": 9.512253549662588e-06, "loss": 0.9673214, "memory(GiB)": 369.4, "step": 14560, "train_speed(iter/s)": 0.201799 }, { "acc": 0.7413476, "epoch": 0.369482496194825, "grad_norm": 2.359375, "learning_rate": 9.511801710560066e-06, "loss": 1.1321806, "memory(GiB)": 369.4, "step": 14565, "train_speed(iter/s)": 0.201808 }, { "acc": 0.75019445, "epoch": 0.36960933536276, "grad_norm": 2.09375, "learning_rate": 9.511349673008364e-06, "loss": 0.97380104, "memory(GiB)": 369.4, "step": 14570, "train_speed(iter/s)": 0.201817 }, { "acc": 0.73970156, "epoch": 0.36973617453069507, "grad_norm": 2.046875, "learning_rate": 9.510897437027358e-06, "loss": 1.00947094, "memory(GiB)": 369.4, "step": 14575, "train_speed(iter/s)": 0.201824 }, { "acc": 0.75262675, "epoch": 0.3698630136986301, "grad_norm": 2.3125, "learning_rate": 9.510445002636943e-06, "loss": 1.04188986, "memory(GiB)": 369.4, "step": 14580, "train_speed(iter/s)": 0.201835 }, { "acc": 0.75592442, "epoch": 0.3699898528665652, "grad_norm": 2.375, "learning_rate": 9.50999236985702e-06, "loss": 1.01739712, "memory(GiB)": 369.4, "step": 14585, "train_speed(iter/s)": 0.201847 }, { "acc": 0.74866385, "epoch": 0.37011669203450026, "grad_norm": 2.21875, "learning_rate": 9.509539538707497e-06, "loss": 1.07991447, "memory(GiB)": 369.4, "step": 14590, "train_speed(iter/s)": 0.201855 }, { "acc": 0.74017792, "epoch": 0.3702435312024353, "grad_norm": 2.5625, "learning_rate": 9.50908650920829e-06, "loss": 1.01681004, "memory(GiB)": 369.4, "step": 14595, "train_speed(iter/s)": 0.201865 }, { "acc": 0.73310122, "epoch": 0.37037037037037035, "grad_norm": 1.9375, "learning_rate": 9.50863328137933e-06, "loss": 1.0259119, "memory(GiB)": 369.4, "step": 14600, "train_speed(iter/s)": 0.201872 }, { "acc": 0.74996939, "epoch": 0.37049720953830545, "grad_norm": 1.9609375, "learning_rate": 9.508179855240545e-06, "loss": 1.0165493, "memory(GiB)": 369.4, "step": 14605, "train_speed(iter/s)": 0.201881 }, { "acc": 0.74013166, "epoch": 0.3706240487062405, "grad_norm": 2.1875, "learning_rate": 9.507726230811884e-06, "loss": 1.06903305, "memory(GiB)": 369.4, "step": 14610, "train_speed(iter/s)": 0.201891 }, { "acc": 0.74057522, "epoch": 0.37075088787417554, "grad_norm": 2.359375, "learning_rate": 9.507272408113298e-06, "loss": 1.01388721, "memory(GiB)": 369.4, "step": 14615, "train_speed(iter/s)": 0.201892 }, { "acc": 0.7237237, "epoch": 0.3708777270421106, "grad_norm": 1.953125, "learning_rate": 9.506818387164748e-06, "loss": 1.07750702, "memory(GiB)": 369.4, "step": 14620, "train_speed(iter/s)": 0.201897 }, { "acc": 0.7371767, "epoch": 0.3710045662100457, "grad_norm": 2.09375, "learning_rate": 9.506364167986204e-06, "loss": 1.06960678, "memory(GiB)": 369.4, "step": 14625, "train_speed(iter/s)": 0.2019 }, { "acc": 0.74968214, "epoch": 0.3711314053779807, "grad_norm": 2.515625, "learning_rate": 9.505909750597644e-06, "loss": 1.00305958, "memory(GiB)": 369.4, "step": 14630, "train_speed(iter/s)": 0.201912 }, { "acc": 0.73867893, "epoch": 0.37125824454591577, "grad_norm": 2.140625, "learning_rate": 9.505455135019055e-06, "loss": 0.98572607, "memory(GiB)": 369.4, "step": 14635, "train_speed(iter/s)": 0.201922 }, { "acc": 0.74544744, "epoch": 0.3713850837138508, "grad_norm": 2.421875, "learning_rate": 9.505000321270435e-06, "loss": 0.99452991, "memory(GiB)": 369.4, "step": 14640, "train_speed(iter/s)": 0.201936 }, { "acc": 0.74936337, "epoch": 0.3715119228817859, "grad_norm": 2.484375, "learning_rate": 9.504545309371786e-06, "loss": 1.02852669, "memory(GiB)": 369.4, "step": 14645, "train_speed(iter/s)": 0.201945 }, { "acc": 0.7338912, "epoch": 0.37163876204972096, "grad_norm": 1.921875, "learning_rate": 9.504090099343125e-06, "loss": 1.05368519, "memory(GiB)": 369.4, "step": 14650, "train_speed(iter/s)": 0.201959 }, { "acc": 0.76818428, "epoch": 0.371765601217656, "grad_norm": 2.28125, "learning_rate": 9.50363469120447e-06, "loss": 0.94246006, "memory(GiB)": 369.4, "step": 14655, "train_speed(iter/s)": 0.201959 }, { "acc": 0.72808809, "epoch": 0.37189244038559105, "grad_norm": 2.140625, "learning_rate": 9.503179084975855e-06, "loss": 1.08514442, "memory(GiB)": 369.4, "step": 14660, "train_speed(iter/s)": 0.201972 }, { "acc": 0.73321891, "epoch": 0.37201927955352615, "grad_norm": 2.03125, "learning_rate": 9.502723280677319e-06, "loss": 1.03470592, "memory(GiB)": 369.4, "step": 14665, "train_speed(iter/s)": 0.201968 }, { "acc": 0.74337926, "epoch": 0.3721461187214612, "grad_norm": 2.140625, "learning_rate": 9.50226727832891e-06, "loss": 1.04292908, "memory(GiB)": 369.4, "step": 14670, "train_speed(iter/s)": 0.20198 }, { "acc": 0.73371973, "epoch": 0.37227295788939624, "grad_norm": 2.375, "learning_rate": 9.501811077950685e-06, "loss": 1.05712032, "memory(GiB)": 369.4, "step": 14675, "train_speed(iter/s)": 0.201987 }, { "acc": 0.7491056, "epoch": 0.3723997970573313, "grad_norm": 2.34375, "learning_rate": 9.501354679562708e-06, "loss": 1.00460796, "memory(GiB)": 369.4, "step": 14680, "train_speed(iter/s)": 0.201997 }, { "acc": 0.73451614, "epoch": 0.3725266362252664, "grad_norm": 2.4375, "learning_rate": 9.500898083185058e-06, "loss": 1.07605295, "memory(GiB)": 369.4, "step": 14685, "train_speed(iter/s)": 0.202007 }, { "acc": 0.73600807, "epoch": 0.3726534753932014, "grad_norm": 2.1875, "learning_rate": 9.500441288837812e-06, "loss": 0.99768143, "memory(GiB)": 369.4, "step": 14690, "train_speed(iter/s)": 0.202001 }, { "acc": 0.73696394, "epoch": 0.37278031456113647, "grad_norm": 2.046875, "learning_rate": 9.499984296541066e-06, "loss": 1.09554329, "memory(GiB)": 369.4, "step": 14695, "train_speed(iter/s)": 0.202012 }, { "acc": 0.72743464, "epoch": 0.3729071537290715, "grad_norm": 2.296875, "learning_rate": 9.49952710631492e-06, "loss": 1.11731291, "memory(GiB)": 369.4, "step": 14700, "train_speed(iter/s)": 0.202023 }, { "acc": 0.75080452, "epoch": 0.3730339928970066, "grad_norm": 2.125, "learning_rate": 9.499069718179484e-06, "loss": 0.97217731, "memory(GiB)": 369.4, "step": 14705, "train_speed(iter/s)": 0.202031 }, { "acc": 0.74775257, "epoch": 0.37316083206494166, "grad_norm": 2.21875, "learning_rate": 9.498612132154874e-06, "loss": 0.98042202, "memory(GiB)": 369.4, "step": 14710, "train_speed(iter/s)": 0.202042 }, { "acc": 0.74431114, "epoch": 0.3732876712328767, "grad_norm": 1.6953125, "learning_rate": 9.498154348261217e-06, "loss": 1.02681828, "memory(GiB)": 369.4, "step": 14715, "train_speed(iter/s)": 0.202043 }, { "acc": 0.75452738, "epoch": 0.37341451040081175, "grad_norm": 2.171875, "learning_rate": 9.497696366518649e-06, "loss": 1.00601778, "memory(GiB)": 369.4, "step": 14720, "train_speed(iter/s)": 0.202055 }, { "acc": 0.740165, "epoch": 0.37354134956874685, "grad_norm": 1.8515625, "learning_rate": 9.497238186947315e-06, "loss": 1.08209114, "memory(GiB)": 369.4, "step": 14725, "train_speed(iter/s)": 0.202061 }, { "acc": 0.73377256, "epoch": 0.3736681887366819, "grad_norm": 2.015625, "learning_rate": 9.496779809567367e-06, "loss": 1.08282833, "memory(GiB)": 369.4, "step": 14730, "train_speed(iter/s)": 0.202073 }, { "acc": 0.74249372, "epoch": 0.37379502790461694, "grad_norm": 1.890625, "learning_rate": 9.496321234398967e-06, "loss": 0.99159851, "memory(GiB)": 369.4, "step": 14735, "train_speed(iter/s)": 0.202083 }, { "acc": 0.74305563, "epoch": 0.373921867072552, "grad_norm": 1.78125, "learning_rate": 9.495862461462282e-06, "loss": 1.02378407, "memory(GiB)": 369.4, "step": 14740, "train_speed(iter/s)": 0.202085 }, { "acc": 0.73893185, "epoch": 0.3740487062404871, "grad_norm": 2.015625, "learning_rate": 9.495403490777495e-06, "loss": 1.04910841, "memory(GiB)": 369.4, "step": 14745, "train_speed(iter/s)": 0.202091 }, { "acc": 0.7590889, "epoch": 0.3741755454084221, "grad_norm": 2.1875, "learning_rate": 9.49494432236479e-06, "loss": 0.98336906, "memory(GiB)": 369.4, "step": 14750, "train_speed(iter/s)": 0.202098 }, { "acc": 0.74648509, "epoch": 0.37430238457635717, "grad_norm": 1.984375, "learning_rate": 9.494484956244368e-06, "loss": 1.00328941, "memory(GiB)": 369.4, "step": 14755, "train_speed(iter/s)": 0.202109 }, { "acc": 0.74678202, "epoch": 0.3744292237442922, "grad_norm": 2.59375, "learning_rate": 9.49402539243643e-06, "loss": 1.09164362, "memory(GiB)": 369.4, "step": 14760, "train_speed(iter/s)": 0.202121 }, { "acc": 0.7504427, "epoch": 0.3745560629122273, "grad_norm": 2.078125, "learning_rate": 9.49356563096119e-06, "loss": 1.01573744, "memory(GiB)": 369.4, "step": 14765, "train_speed(iter/s)": 0.202131 }, { "acc": 0.75215354, "epoch": 0.37468290208016236, "grad_norm": 1.8203125, "learning_rate": 9.49310567183887e-06, "loss": 1.00059319, "memory(GiB)": 369.4, "step": 14770, "train_speed(iter/s)": 0.202144 }, { "acc": 0.74639874, "epoch": 0.3748097412480974, "grad_norm": 2.34375, "learning_rate": 9.492645515089706e-06, "loss": 1.02521114, "memory(GiB)": 369.4, "step": 14775, "train_speed(iter/s)": 0.202157 }, { "acc": 0.74175501, "epoch": 0.37493658041603245, "grad_norm": 2.53125, "learning_rate": 9.492185160733934e-06, "loss": 1.00725756, "memory(GiB)": 369.4, "step": 14780, "train_speed(iter/s)": 0.202167 }, { "acc": 0.73667936, "epoch": 0.37506341958396755, "grad_norm": 2.96875, "learning_rate": 9.491724608791798e-06, "loss": 1.04493036, "memory(GiB)": 369.4, "step": 14785, "train_speed(iter/s)": 0.202171 }, { "acc": 0.73242364, "epoch": 0.3751902587519026, "grad_norm": 2.265625, "learning_rate": 9.491263859283563e-06, "loss": 1.05236759, "memory(GiB)": 369.4, "step": 14790, "train_speed(iter/s)": 0.202174 }, { "acc": 0.74892335, "epoch": 0.37531709791983764, "grad_norm": 1.78125, "learning_rate": 9.490802912229491e-06, "loss": 1.029076, "memory(GiB)": 369.4, "step": 14795, "train_speed(iter/s)": 0.202183 }, { "acc": 0.73583975, "epoch": 0.3754439370877727, "grad_norm": 1.8046875, "learning_rate": 9.490341767649858e-06, "loss": 1.09912682, "memory(GiB)": 369.4, "step": 14800, "train_speed(iter/s)": 0.20219 }, { "acc": 0.75684919, "epoch": 0.3755707762557078, "grad_norm": 2.1875, "learning_rate": 9.489880425564944e-06, "loss": 0.98686829, "memory(GiB)": 369.4, "step": 14805, "train_speed(iter/s)": 0.202198 }, { "acc": 0.73992181, "epoch": 0.37569761542364283, "grad_norm": 1.8359375, "learning_rate": 9.489418885995043e-06, "loss": 1.0025548, "memory(GiB)": 369.4, "step": 14810, "train_speed(iter/s)": 0.202208 }, { "acc": 0.73153114, "epoch": 0.3758244545915779, "grad_norm": 2.171875, "learning_rate": 9.488957148960457e-06, "loss": 1.07587137, "memory(GiB)": 369.4, "step": 14815, "train_speed(iter/s)": 0.202216 }, { "acc": 0.73437681, "epoch": 0.3759512937595129, "grad_norm": 2.15625, "learning_rate": 9.488495214481494e-06, "loss": 1.00884247, "memory(GiB)": 369.4, "step": 14820, "train_speed(iter/s)": 0.202225 }, { "acc": 0.75173616, "epoch": 0.376078132927448, "grad_norm": 2.15625, "learning_rate": 9.48803308257847e-06, "loss": 1.02686977, "memory(GiB)": 369.4, "step": 14825, "train_speed(iter/s)": 0.202232 }, { "acc": 0.75227671, "epoch": 0.37620497209538306, "grad_norm": 2.28125, "learning_rate": 9.487570753271716e-06, "loss": 1.02512445, "memory(GiB)": 369.4, "step": 14830, "train_speed(iter/s)": 0.202244 }, { "acc": 0.73586068, "epoch": 0.3763318112633181, "grad_norm": 2.5, "learning_rate": 9.487108226581564e-06, "loss": 1.06805153, "memory(GiB)": 369.4, "step": 14835, "train_speed(iter/s)": 0.202257 }, { "acc": 0.74960427, "epoch": 0.37645865043125315, "grad_norm": 1.8984375, "learning_rate": 9.486645502528355e-06, "loss": 0.98024826, "memory(GiB)": 369.4, "step": 14840, "train_speed(iter/s)": 0.202266 }, { "acc": 0.74236016, "epoch": 0.37658548959918825, "grad_norm": 1.9609375, "learning_rate": 9.486182581132449e-06, "loss": 0.98499498, "memory(GiB)": 369.4, "step": 14845, "train_speed(iter/s)": 0.202276 }, { "acc": 0.7503499, "epoch": 0.3767123287671233, "grad_norm": 2.40625, "learning_rate": 9.485719462414202e-06, "loss": 0.96335802, "memory(GiB)": 369.4, "step": 14850, "train_speed(iter/s)": 0.202287 }, { "acc": 0.75109444, "epoch": 0.37683916793505834, "grad_norm": 2.34375, "learning_rate": 9.485256146393987e-06, "loss": 1.0457056, "memory(GiB)": 369.4, "step": 14855, "train_speed(iter/s)": 0.2023 }, { "acc": 0.74860497, "epoch": 0.3769660071029934, "grad_norm": 2.453125, "learning_rate": 9.484792633092182e-06, "loss": 1.04843454, "memory(GiB)": 369.4, "step": 14860, "train_speed(iter/s)": 0.202308 }, { "acc": 0.75075331, "epoch": 0.3770928462709285, "grad_norm": 2.109375, "learning_rate": 9.484328922529172e-06, "loss": 0.96908112, "memory(GiB)": 369.4, "step": 14865, "train_speed(iter/s)": 0.202322 }, { "acc": 0.73602734, "epoch": 0.37721968543886353, "grad_norm": 1.9296875, "learning_rate": 9.483865014725356e-06, "loss": 1.00567207, "memory(GiB)": 369.4, "step": 14870, "train_speed(iter/s)": 0.202326 }, { "acc": 0.74514332, "epoch": 0.3773465246067986, "grad_norm": 2.5, "learning_rate": 9.483400909701139e-06, "loss": 1.03673649, "memory(GiB)": 369.4, "step": 14875, "train_speed(iter/s)": 0.202334 }, { "acc": 0.73637094, "epoch": 0.3774733637747336, "grad_norm": 1.9375, "learning_rate": 9.482936607476931e-06, "loss": 1.04570789, "memory(GiB)": 369.4, "step": 14880, "train_speed(iter/s)": 0.202344 }, { "acc": 0.73519373, "epoch": 0.3776002029426687, "grad_norm": 2.421875, "learning_rate": 9.482472108073157e-06, "loss": 1.07043037, "memory(GiB)": 369.4, "step": 14885, "train_speed(iter/s)": 0.202356 }, { "acc": 0.74056993, "epoch": 0.37772704211060376, "grad_norm": 2.359375, "learning_rate": 9.482007411510245e-06, "loss": 1.03568592, "memory(GiB)": 369.4, "step": 14890, "train_speed(iter/s)": 0.202363 }, { "acc": 0.75107985, "epoch": 0.3778538812785388, "grad_norm": 2.203125, "learning_rate": 9.48154251780864e-06, "loss": 0.96499977, "memory(GiB)": 369.4, "step": 14895, "train_speed(iter/s)": 0.202369 }, { "acc": 0.74281211, "epoch": 0.37798072044647385, "grad_norm": 2.1875, "learning_rate": 9.481077426988782e-06, "loss": 1.03564396, "memory(GiB)": 369.4, "step": 14900, "train_speed(iter/s)": 0.20238 }, { "acc": 0.73113108, "epoch": 0.37810755961440895, "grad_norm": 2.40625, "learning_rate": 9.480612139071134e-06, "loss": 1.06302147, "memory(GiB)": 369.4, "step": 14905, "train_speed(iter/s)": 0.202388 }, { "acc": 0.73050313, "epoch": 0.378234398782344, "grad_norm": 2.375, "learning_rate": 9.48014665407616e-06, "loss": 1.05575447, "memory(GiB)": 369.4, "step": 14910, "train_speed(iter/s)": 0.202397 }, { "acc": 0.75610614, "epoch": 0.37836123795027904, "grad_norm": 2.125, "learning_rate": 9.479680972024334e-06, "loss": 1.01096449, "memory(GiB)": 369.4, "step": 14915, "train_speed(iter/s)": 0.202409 }, { "acc": 0.7546361, "epoch": 0.3784880771182141, "grad_norm": 2.546875, "learning_rate": 9.47921509293614e-06, "loss": 0.96232758, "memory(GiB)": 369.4, "step": 14920, "train_speed(iter/s)": 0.20242 }, { "acc": 0.74606218, "epoch": 0.3786149162861492, "grad_norm": 2.296875, "learning_rate": 9.478749016832066e-06, "loss": 1.05905685, "memory(GiB)": 369.4, "step": 14925, "train_speed(iter/s)": 0.202427 }, { "acc": 0.74565144, "epoch": 0.37874175545408423, "grad_norm": 1.8359375, "learning_rate": 9.478282743732613e-06, "loss": 1.00304289, "memory(GiB)": 369.4, "step": 14930, "train_speed(iter/s)": 0.202432 }, { "acc": 0.7455874, "epoch": 0.3788685946220193, "grad_norm": 2.25, "learning_rate": 9.477816273658293e-06, "loss": 1.04754219, "memory(GiB)": 369.4, "step": 14935, "train_speed(iter/s)": 0.20244 }, { "acc": 0.73778152, "epoch": 0.3789954337899543, "grad_norm": 2.09375, "learning_rate": 9.47734960662962e-06, "loss": 1.06123257, "memory(GiB)": 369.4, "step": 14940, "train_speed(iter/s)": 0.202442 }, { "acc": 0.74162951, "epoch": 0.3791222729578894, "grad_norm": 1.8359375, "learning_rate": 9.476882742667122e-06, "loss": 1.03058453, "memory(GiB)": 369.4, "step": 14945, "train_speed(iter/s)": 0.202454 }, { "acc": 0.74587512, "epoch": 0.37924911212582446, "grad_norm": 2.078125, "learning_rate": 9.476415681791333e-06, "loss": 0.96984386, "memory(GiB)": 369.4, "step": 14950, "train_speed(iter/s)": 0.202456 }, { "acc": 0.74789391, "epoch": 0.3793759512937595, "grad_norm": 2.15625, "learning_rate": 9.475948424022798e-06, "loss": 1.01898956, "memory(GiB)": 369.4, "step": 14955, "train_speed(iter/s)": 0.202466 }, { "acc": 0.75246286, "epoch": 0.37950279046169455, "grad_norm": 2.0625, "learning_rate": 9.475480969382065e-06, "loss": 0.99643116, "memory(GiB)": 369.4, "step": 14960, "train_speed(iter/s)": 0.202472 }, { "acc": 0.75081234, "epoch": 0.37962962962962965, "grad_norm": 1.9765625, "learning_rate": 9.475013317889699e-06, "loss": 1.00374317, "memory(GiB)": 369.4, "step": 14965, "train_speed(iter/s)": 0.202482 }, { "acc": 0.75180511, "epoch": 0.3797564687975647, "grad_norm": 2.390625, "learning_rate": 9.474545469566267e-06, "loss": 0.98282442, "memory(GiB)": 369.4, "step": 14970, "train_speed(iter/s)": 0.202492 }, { "acc": 0.74312057, "epoch": 0.37988330796549974, "grad_norm": 2.0, "learning_rate": 9.474077424432348e-06, "loss": 0.99079685, "memory(GiB)": 369.4, "step": 14975, "train_speed(iter/s)": 0.2025 }, { "acc": 0.73391724, "epoch": 0.3800101471334348, "grad_norm": 2.5, "learning_rate": 9.47360918250853e-06, "loss": 1.0550807, "memory(GiB)": 369.4, "step": 14980, "train_speed(iter/s)": 0.202507 }, { "acc": 0.73782225, "epoch": 0.3801369863013699, "grad_norm": 2.671875, "learning_rate": 9.473140743815405e-06, "loss": 1.09423752, "memory(GiB)": 369.4, "step": 14985, "train_speed(iter/s)": 0.202522 }, { "acc": 0.75970302, "epoch": 0.38026382546930493, "grad_norm": 1.96875, "learning_rate": 9.47267210837358e-06, "loss": 0.99052343, "memory(GiB)": 369.4, "step": 14990, "train_speed(iter/s)": 0.202533 }, { "acc": 0.74840899, "epoch": 0.38039066463724, "grad_norm": 2.421875, "learning_rate": 9.472203276203667e-06, "loss": 1.00915928, "memory(GiB)": 369.4, "step": 14995, "train_speed(iter/s)": 0.202545 }, { "acc": 0.7469728, "epoch": 0.380517503805175, "grad_norm": 2.203125, "learning_rate": 9.471734247326284e-06, "loss": 1.01270809, "memory(GiB)": 369.4, "step": 15000, "train_speed(iter/s)": 0.202552 }, { "epoch": 0.380517503805175, "eval_acc": 0.7328373878346995, "eval_loss": 0.9958781003952026, "eval_runtime": 384.5548, "eval_samples_per_second": 16.565, "eval_steps_per_second": 8.282, "step": 15000 }, { "acc": 0.7472887, "epoch": 0.3806443429731101, "grad_norm": 2.40625, "learning_rate": 9.471265021762067e-06, "loss": 1.02143326, "memory(GiB)": 369.4, "step": 15005, "train_speed(iter/s)": 0.200632 }, { "acc": 0.76133966, "epoch": 0.38077118214104516, "grad_norm": 2.234375, "learning_rate": 9.47079559953165e-06, "loss": 0.92808762, "memory(GiB)": 369.4, "step": 15010, "train_speed(iter/s)": 0.200645 }, { "acc": 0.73162613, "epoch": 0.3808980213089802, "grad_norm": 2.3125, "learning_rate": 9.470325980655683e-06, "loss": 1.04020061, "memory(GiB)": 369.4, "step": 15015, "train_speed(iter/s)": 0.200659 }, { "acc": 0.74132628, "epoch": 0.38102486047691525, "grad_norm": 3.171875, "learning_rate": 9.46985616515482e-06, "loss": 1.06910419, "memory(GiB)": 369.4, "step": 15020, "train_speed(iter/s)": 0.200671 }, { "acc": 0.74319463, "epoch": 0.38115169964485035, "grad_norm": 2.296875, "learning_rate": 9.469386153049727e-06, "loss": 1.01300716, "memory(GiB)": 369.4, "step": 15025, "train_speed(iter/s)": 0.200676 }, { "acc": 0.74420805, "epoch": 0.3812785388127854, "grad_norm": 2.234375, "learning_rate": 9.468915944361076e-06, "loss": 0.96550846, "memory(GiB)": 369.4, "step": 15030, "train_speed(iter/s)": 0.200687 }, { "acc": 0.74804168, "epoch": 0.38140537798072044, "grad_norm": 1.796875, "learning_rate": 9.468445539109551e-06, "loss": 1.02958364, "memory(GiB)": 369.4, "step": 15035, "train_speed(iter/s)": 0.200694 }, { "acc": 0.7292963, "epoch": 0.3815322171486555, "grad_norm": 2.15625, "learning_rate": 9.46797493731584e-06, "loss": 1.08086319, "memory(GiB)": 369.4, "step": 15040, "train_speed(iter/s)": 0.200704 }, { "acc": 0.74996367, "epoch": 0.3816590563165906, "grad_norm": 1.84375, "learning_rate": 9.467504139000642e-06, "loss": 1.02345629, "memory(GiB)": 369.4, "step": 15045, "train_speed(iter/s)": 0.200713 }, { "acc": 0.7425539, "epoch": 0.38178589548452563, "grad_norm": 2.0, "learning_rate": 9.467033144184667e-06, "loss": 1.06179838, "memory(GiB)": 369.4, "step": 15050, "train_speed(iter/s)": 0.200717 }, { "acc": 0.74600878, "epoch": 0.3819127346524607, "grad_norm": 2.46875, "learning_rate": 9.466561952888632e-06, "loss": 1.01609869, "memory(GiB)": 369.4, "step": 15055, "train_speed(iter/s)": 0.200724 }, { "acc": 0.741605, "epoch": 0.3820395738203957, "grad_norm": 2.28125, "learning_rate": 9.466090565133259e-06, "loss": 1.02100544, "memory(GiB)": 369.4, "step": 15060, "train_speed(iter/s)": 0.200736 }, { "acc": 0.73868322, "epoch": 0.3821664129883308, "grad_norm": 1.9765625, "learning_rate": 9.465618980939284e-06, "loss": 1.04387341, "memory(GiB)": 369.4, "step": 15065, "train_speed(iter/s)": 0.200747 }, { "acc": 0.75380774, "epoch": 0.38229325215626586, "grad_norm": 1.9375, "learning_rate": 9.465147200327446e-06, "loss": 1.02549067, "memory(GiB)": 369.4, "step": 15070, "train_speed(iter/s)": 0.200752 }, { "acc": 0.7543222, "epoch": 0.3824200913242009, "grad_norm": 2.46875, "learning_rate": 9.464675223318503e-06, "loss": 0.97986412, "memory(GiB)": 369.4, "step": 15075, "train_speed(iter/s)": 0.200764 }, { "acc": 0.76262059, "epoch": 0.38254693049213595, "grad_norm": 2.3125, "learning_rate": 9.464203049933207e-06, "loss": 0.98882332, "memory(GiB)": 369.4, "step": 15080, "train_speed(iter/s)": 0.200769 }, { "acc": 0.75760832, "epoch": 0.38267376966007105, "grad_norm": 2.390625, "learning_rate": 9.463730680192332e-06, "loss": 0.98782768, "memory(GiB)": 369.4, "step": 15085, "train_speed(iter/s)": 0.20078 }, { "acc": 0.74411178, "epoch": 0.3828006088280061, "grad_norm": 2.125, "learning_rate": 9.46325811411665e-06, "loss": 1.06684399, "memory(GiB)": 369.4, "step": 15090, "train_speed(iter/s)": 0.200788 }, { "acc": 0.74720511, "epoch": 0.38292744799594114, "grad_norm": 2.375, "learning_rate": 9.462785351726951e-06, "loss": 0.99840097, "memory(GiB)": 369.4, "step": 15095, "train_speed(iter/s)": 0.200798 }, { "acc": 0.74255815, "epoch": 0.3830542871638762, "grad_norm": 2.0625, "learning_rate": 9.462312393044027e-06, "loss": 1.02049198, "memory(GiB)": 369.4, "step": 15100, "train_speed(iter/s)": 0.200802 }, { "acc": 0.76587811, "epoch": 0.3831811263318113, "grad_norm": 2.1875, "learning_rate": 9.46183923808868e-06, "loss": 0.97782145, "memory(GiB)": 369.4, "step": 15105, "train_speed(iter/s)": 0.200808 }, { "acc": 0.74667196, "epoch": 0.38330796549974633, "grad_norm": 2.109375, "learning_rate": 9.461365886881724e-06, "loss": 1.02836857, "memory(GiB)": 369.4, "step": 15110, "train_speed(iter/s)": 0.200812 }, { "acc": 0.74969273, "epoch": 0.3834348046676814, "grad_norm": 2.1875, "learning_rate": 9.460892339443977e-06, "loss": 1.05465469, "memory(GiB)": 369.4, "step": 15115, "train_speed(iter/s)": 0.200821 }, { "acc": 0.74799237, "epoch": 0.3835616438356164, "grad_norm": 2.609375, "learning_rate": 9.460418595796268e-06, "loss": 0.9843277, "memory(GiB)": 369.4, "step": 15120, "train_speed(iter/s)": 0.200832 }, { "acc": 0.74034109, "epoch": 0.3836884830035515, "grad_norm": 1.9453125, "learning_rate": 9.459944655959437e-06, "loss": 1.04092455, "memory(GiB)": 369.4, "step": 15125, "train_speed(iter/s)": 0.200842 }, { "acc": 0.75782299, "epoch": 0.38381532217148656, "grad_norm": 2.203125, "learning_rate": 9.459470519954325e-06, "loss": 0.96923542, "memory(GiB)": 369.4, "step": 15130, "train_speed(iter/s)": 0.200854 }, { "acc": 0.75216875, "epoch": 0.3839421613394216, "grad_norm": 2.53125, "learning_rate": 9.458996187801791e-06, "loss": 0.99021549, "memory(GiB)": 369.4, "step": 15135, "train_speed(iter/s)": 0.200865 }, { "acc": 0.73611922, "epoch": 0.38406900050735665, "grad_norm": 2.359375, "learning_rate": 9.458521659522697e-06, "loss": 1.06460924, "memory(GiB)": 369.4, "step": 15140, "train_speed(iter/s)": 0.20088 }, { "acc": 0.73275557, "epoch": 0.38419583967529175, "grad_norm": 2.140625, "learning_rate": 9.458046935137913e-06, "loss": 1.05387325, "memory(GiB)": 369.4, "step": 15145, "train_speed(iter/s)": 0.200889 }, { "acc": 0.74701252, "epoch": 0.3843226788432268, "grad_norm": 2.140625, "learning_rate": 9.457572014668323e-06, "loss": 1.0425539, "memory(GiB)": 369.4, "step": 15150, "train_speed(iter/s)": 0.200902 }, { "acc": 0.74086323, "epoch": 0.38444951801116184, "grad_norm": 2.0625, "learning_rate": 9.457096898134813e-06, "loss": 1.02122746, "memory(GiB)": 369.4, "step": 15155, "train_speed(iter/s)": 0.200912 }, { "acc": 0.7519577, "epoch": 0.3845763571790969, "grad_norm": 2.765625, "learning_rate": 9.45662158555828e-06, "loss": 1.03715916, "memory(GiB)": 369.4, "step": 15160, "train_speed(iter/s)": 0.200923 }, { "acc": 0.74683299, "epoch": 0.384703196347032, "grad_norm": 3.0625, "learning_rate": 9.456146076959636e-06, "loss": 1.01083937, "memory(GiB)": 369.4, "step": 15165, "train_speed(iter/s)": 0.200935 }, { "acc": 0.74235868, "epoch": 0.38483003551496703, "grad_norm": 2.1875, "learning_rate": 9.455670372359791e-06, "loss": 1.05860558, "memory(GiB)": 369.4, "step": 15170, "train_speed(iter/s)": 0.200944 }, { "acc": 0.74265552, "epoch": 0.3849568746829021, "grad_norm": 2.171875, "learning_rate": 9.45519447177967e-06, "loss": 1.01753159, "memory(GiB)": 369.4, "step": 15175, "train_speed(iter/s)": 0.200953 }, { "acc": 0.74208288, "epoch": 0.3850837138508371, "grad_norm": 2.171875, "learning_rate": 9.454718375240204e-06, "loss": 1.10027189, "memory(GiB)": 369.4, "step": 15180, "train_speed(iter/s)": 0.200964 }, { "acc": 0.75291395, "epoch": 0.3852105530187722, "grad_norm": 2.765625, "learning_rate": 9.454242082762336e-06, "loss": 0.98010826, "memory(GiB)": 369.4, "step": 15185, "train_speed(iter/s)": 0.200978 }, { "acc": 0.7419282, "epoch": 0.38533739218670726, "grad_norm": 3.140625, "learning_rate": 9.453765594367014e-06, "loss": 1.05278606, "memory(GiB)": 369.4, "step": 15190, "train_speed(iter/s)": 0.200989 }, { "acc": 0.7489769, "epoch": 0.3854642313546423, "grad_norm": 2.140625, "learning_rate": 9.453288910075196e-06, "loss": 1.01535301, "memory(GiB)": 369.4, "step": 15195, "train_speed(iter/s)": 0.201005 }, { "acc": 0.74247713, "epoch": 0.38559107052257735, "grad_norm": 2.578125, "learning_rate": 9.452812029907849e-06, "loss": 1.03641777, "memory(GiB)": 369.4, "step": 15200, "train_speed(iter/s)": 0.201015 }, { "acc": 0.74414768, "epoch": 0.38571790969051245, "grad_norm": 1.953125, "learning_rate": 9.452334953885951e-06, "loss": 1.00445957, "memory(GiB)": 369.4, "step": 15205, "train_speed(iter/s)": 0.201025 }, { "acc": 0.74367514, "epoch": 0.3858447488584475, "grad_norm": 2.296875, "learning_rate": 9.451857682030481e-06, "loss": 1.05200138, "memory(GiB)": 369.4, "step": 15210, "train_speed(iter/s)": 0.201034 }, { "acc": 0.75344906, "epoch": 0.38597158802638254, "grad_norm": 2.359375, "learning_rate": 9.451380214362436e-06, "loss": 1.00381012, "memory(GiB)": 369.4, "step": 15215, "train_speed(iter/s)": 0.201047 }, { "acc": 0.74899874, "epoch": 0.3860984271943176, "grad_norm": 2.03125, "learning_rate": 9.450902550902814e-06, "loss": 1.01335001, "memory(GiB)": 369.4, "step": 15220, "train_speed(iter/s)": 0.201054 }, { "acc": 0.73170552, "epoch": 0.3862252663622527, "grad_norm": 2.171875, "learning_rate": 9.450424691672626e-06, "loss": 1.08657417, "memory(GiB)": 369.4, "step": 15225, "train_speed(iter/s)": 0.201066 }, { "acc": 0.7479569, "epoch": 0.38635210553018773, "grad_norm": 2.265625, "learning_rate": 9.449946636692891e-06, "loss": 1.00589695, "memory(GiB)": 369.4, "step": 15230, "train_speed(iter/s)": 0.201071 }, { "acc": 0.75515685, "epoch": 0.3864789446981228, "grad_norm": 2.421875, "learning_rate": 9.449468385984634e-06, "loss": 0.96826115, "memory(GiB)": 369.4, "step": 15235, "train_speed(iter/s)": 0.201084 }, { "acc": 0.75027056, "epoch": 0.3866057838660578, "grad_norm": 2.28125, "learning_rate": 9.448989939568892e-06, "loss": 1.00469437, "memory(GiB)": 369.4, "step": 15240, "train_speed(iter/s)": 0.201094 }, { "acc": 0.72806759, "epoch": 0.3867326230339929, "grad_norm": 1.90625, "learning_rate": 9.448511297466708e-06, "loss": 1.08013191, "memory(GiB)": 369.4, "step": 15245, "train_speed(iter/s)": 0.201101 }, { "acc": 0.73969812, "epoch": 0.38685946220192796, "grad_norm": 2.328125, "learning_rate": 9.448032459699139e-06, "loss": 1.02734909, "memory(GiB)": 369.4, "step": 15250, "train_speed(iter/s)": 0.201109 }, { "acc": 0.7281127, "epoch": 0.386986301369863, "grad_norm": 2.25, "learning_rate": 9.447553426287244e-06, "loss": 1.10953264, "memory(GiB)": 369.4, "step": 15255, "train_speed(iter/s)": 0.201116 }, { "acc": 0.74462247, "epoch": 0.38711314053779805, "grad_norm": 1.953125, "learning_rate": 9.44707419725209e-06, "loss": 1.0209589, "memory(GiB)": 369.4, "step": 15260, "train_speed(iter/s)": 0.201128 }, { "acc": 0.73573484, "epoch": 0.38723997970573315, "grad_norm": 2.328125, "learning_rate": 9.446594772614759e-06, "loss": 1.076478, "memory(GiB)": 369.4, "step": 15265, "train_speed(iter/s)": 0.20114 }, { "acc": 0.74212503, "epoch": 0.3873668188736682, "grad_norm": 2.234375, "learning_rate": 9.446115152396335e-06, "loss": 1.01198626, "memory(GiB)": 369.4, "step": 15270, "train_speed(iter/s)": 0.201146 }, { "acc": 0.76424856, "epoch": 0.38749365804160324, "grad_norm": 2.359375, "learning_rate": 9.445635336617919e-06, "loss": 0.94995136, "memory(GiB)": 369.4, "step": 15275, "train_speed(iter/s)": 0.201154 }, { "acc": 0.74105444, "epoch": 0.3876204972095383, "grad_norm": 1.65625, "learning_rate": 9.445155325300612e-06, "loss": 1.07382174, "memory(GiB)": 369.4, "step": 15280, "train_speed(iter/s)": 0.201163 }, { "acc": 0.74627743, "epoch": 0.3877473363774734, "grad_norm": 2.203125, "learning_rate": 9.444675118465528e-06, "loss": 1.039151, "memory(GiB)": 369.4, "step": 15285, "train_speed(iter/s)": 0.201171 }, { "acc": 0.74472599, "epoch": 0.38787417554540843, "grad_norm": 2.0625, "learning_rate": 9.444194716133785e-06, "loss": 1.02745056, "memory(GiB)": 369.4, "step": 15290, "train_speed(iter/s)": 0.201179 }, { "acc": 0.75098343, "epoch": 0.3880010147133435, "grad_norm": 2.15625, "learning_rate": 9.44371411832652e-06, "loss": 0.95040731, "memory(GiB)": 369.4, "step": 15295, "train_speed(iter/s)": 0.201191 }, { "acc": 0.73265796, "epoch": 0.3881278538812785, "grad_norm": 2.234375, "learning_rate": 9.443233325064867e-06, "loss": 1.07659569, "memory(GiB)": 369.4, "step": 15300, "train_speed(iter/s)": 0.201198 }, { "acc": 0.73174248, "epoch": 0.3882546930492136, "grad_norm": 1.984375, "learning_rate": 9.442752336369976e-06, "loss": 1.0357439, "memory(GiB)": 369.4, "step": 15305, "train_speed(iter/s)": 0.201205 }, { "acc": 0.76202011, "epoch": 0.38838153221714866, "grad_norm": 1.9375, "learning_rate": 9.442271152263e-06, "loss": 0.95416374, "memory(GiB)": 369.4, "step": 15310, "train_speed(iter/s)": 0.201204 }, { "acc": 0.74060564, "epoch": 0.3885083713850837, "grad_norm": 2.03125, "learning_rate": 9.441789772765107e-06, "loss": 1.05062428, "memory(GiB)": 369.4, "step": 15315, "train_speed(iter/s)": 0.201216 }, { "acc": 0.73412466, "epoch": 0.38863521055301875, "grad_norm": 2.078125, "learning_rate": 9.441308197897467e-06, "loss": 1.05806923, "memory(GiB)": 369.4, "step": 15320, "train_speed(iter/s)": 0.201224 }, { "acc": 0.74628029, "epoch": 0.38876204972095385, "grad_norm": 2.4375, "learning_rate": 9.440826427681264e-06, "loss": 1.09089842, "memory(GiB)": 369.4, "step": 15325, "train_speed(iter/s)": 0.201228 }, { "acc": 0.75116158, "epoch": 0.3888888888888889, "grad_norm": 2.40625, "learning_rate": 9.44034446213769e-06, "loss": 1.00683784, "memory(GiB)": 369.4, "step": 15330, "train_speed(iter/s)": 0.201238 }, { "acc": 0.74335995, "epoch": 0.38901572805682394, "grad_norm": 2.078125, "learning_rate": 9.439862301287939e-06, "loss": 1.01208248, "memory(GiB)": 369.4, "step": 15335, "train_speed(iter/s)": 0.201253 }, { "acc": 0.74304366, "epoch": 0.389142567224759, "grad_norm": 2.3125, "learning_rate": 9.439379945153223e-06, "loss": 1.06253719, "memory(GiB)": 369.4, "step": 15340, "train_speed(iter/s)": 0.201267 }, { "acc": 0.7556879, "epoch": 0.3892694063926941, "grad_norm": 2.28125, "learning_rate": 9.438897393754755e-06, "loss": 1.00678291, "memory(GiB)": 369.4, "step": 15345, "train_speed(iter/s)": 0.201279 }, { "acc": 0.74501944, "epoch": 0.38939624556062913, "grad_norm": 2.109375, "learning_rate": 9.438414647113762e-06, "loss": 1.02884092, "memory(GiB)": 369.4, "step": 15350, "train_speed(iter/s)": 0.201286 }, { "acc": 0.74790664, "epoch": 0.3895230847285642, "grad_norm": 2.046875, "learning_rate": 9.437931705251478e-06, "loss": 1.02542057, "memory(GiB)": 369.4, "step": 15355, "train_speed(iter/s)": 0.201296 }, { "acc": 0.74786615, "epoch": 0.3896499238964992, "grad_norm": 1.9609375, "learning_rate": 9.437448568189142e-06, "loss": 0.9731286, "memory(GiB)": 369.4, "step": 15360, "train_speed(iter/s)": 0.201305 }, { "acc": 0.74535155, "epoch": 0.3897767630644343, "grad_norm": 1.9921875, "learning_rate": 9.436965235948008e-06, "loss": 0.99518261, "memory(GiB)": 369.4, "step": 15365, "train_speed(iter/s)": 0.201316 }, { "acc": 0.74460239, "epoch": 0.38990360223236936, "grad_norm": 2.0, "learning_rate": 9.436481708549332e-06, "loss": 1.03132057, "memory(GiB)": 369.4, "step": 15370, "train_speed(iter/s)": 0.201328 }, { "acc": 0.73836889, "epoch": 0.3900304414003044, "grad_norm": 2.125, "learning_rate": 9.435997986014382e-06, "loss": 1.08161612, "memory(GiB)": 369.4, "step": 15375, "train_speed(iter/s)": 0.201341 }, { "acc": 0.73138752, "epoch": 0.39015728056823945, "grad_norm": 2.3125, "learning_rate": 9.435514068364437e-06, "loss": 1.1368578, "memory(GiB)": 369.4, "step": 15380, "train_speed(iter/s)": 0.20135 }, { "acc": 0.74251938, "epoch": 0.39028411973617455, "grad_norm": 2.25, "learning_rate": 9.43502995562078e-06, "loss": 1.02739248, "memory(GiB)": 369.4, "step": 15385, "train_speed(iter/s)": 0.201355 }, { "acc": 0.74086742, "epoch": 0.3904109589041096, "grad_norm": 2.78125, "learning_rate": 9.434545647804703e-06, "loss": 1.06645527, "memory(GiB)": 369.4, "step": 15390, "train_speed(iter/s)": 0.201369 }, { "acc": 0.75703001, "epoch": 0.39053779807204464, "grad_norm": 2.28125, "learning_rate": 9.434061144937512e-06, "loss": 1.00315437, "memory(GiB)": 369.4, "step": 15395, "train_speed(iter/s)": 0.201374 }, { "acc": 0.75326371, "epoch": 0.3906646372399797, "grad_norm": 2.265625, "learning_rate": 9.433576447040513e-06, "loss": 0.97736511, "memory(GiB)": 369.4, "step": 15400, "train_speed(iter/s)": 0.201384 }, { "acc": 0.75080738, "epoch": 0.3907914764079148, "grad_norm": 1.9453125, "learning_rate": 9.433091554135029e-06, "loss": 1.05541859, "memory(GiB)": 369.4, "step": 15405, "train_speed(iter/s)": 0.201397 }, { "acc": 0.74254556, "epoch": 0.39091831557584983, "grad_norm": 2.28125, "learning_rate": 9.432606466242384e-06, "loss": 1.06799335, "memory(GiB)": 369.4, "step": 15410, "train_speed(iter/s)": 0.201407 }, { "acc": 0.73625431, "epoch": 0.3910451547437849, "grad_norm": 2.046875, "learning_rate": 9.43212118338392e-06, "loss": 1.00914688, "memory(GiB)": 369.4, "step": 15415, "train_speed(iter/s)": 0.201421 }, { "acc": 0.74789391, "epoch": 0.3911719939117199, "grad_norm": 2.015625, "learning_rate": 9.431635705580975e-06, "loss": 1.01748028, "memory(GiB)": 369.4, "step": 15420, "train_speed(iter/s)": 0.201433 }, { "acc": 0.74534097, "epoch": 0.391298833079655, "grad_norm": 2.421875, "learning_rate": 9.431150032854907e-06, "loss": 1.02641487, "memory(GiB)": 369.4, "step": 15425, "train_speed(iter/s)": 0.201428 }, { "acc": 0.74602928, "epoch": 0.39142567224759006, "grad_norm": 2.328125, "learning_rate": 9.430664165227077e-06, "loss": 0.98749123, "memory(GiB)": 369.4, "step": 15430, "train_speed(iter/s)": 0.201436 }, { "acc": 0.74519444, "epoch": 0.3915525114155251, "grad_norm": 1.953125, "learning_rate": 9.430178102718857e-06, "loss": 1.0405591, "memory(GiB)": 369.4, "step": 15435, "train_speed(iter/s)": 0.201446 }, { "acc": 0.75719824, "epoch": 0.39167935058346015, "grad_norm": 2.140625, "learning_rate": 9.429691845351623e-06, "loss": 0.96858978, "memory(GiB)": 369.4, "step": 15440, "train_speed(iter/s)": 0.201451 }, { "acc": 0.74926262, "epoch": 0.39180618975139525, "grad_norm": 2.03125, "learning_rate": 9.429205393146763e-06, "loss": 0.96452475, "memory(GiB)": 369.4, "step": 15445, "train_speed(iter/s)": 0.201462 }, { "acc": 0.74200001, "epoch": 0.3919330289193303, "grad_norm": 2.171875, "learning_rate": 9.428718746125678e-06, "loss": 1.06173143, "memory(GiB)": 369.4, "step": 15450, "train_speed(iter/s)": 0.201472 }, { "acc": 0.74722919, "epoch": 0.39205986808726534, "grad_norm": 2.375, "learning_rate": 9.428231904309768e-06, "loss": 1.08439159, "memory(GiB)": 369.4, "step": 15455, "train_speed(iter/s)": 0.201475 }, { "acc": 0.72923536, "epoch": 0.3921867072552004, "grad_norm": 2.28125, "learning_rate": 9.427744867720448e-06, "loss": 1.08433933, "memory(GiB)": 369.4, "step": 15460, "train_speed(iter/s)": 0.201487 }, { "acc": 0.7391644, "epoch": 0.3923135464231355, "grad_norm": 2.34375, "learning_rate": 9.42725763637914e-06, "loss": 1.03217497, "memory(GiB)": 369.4, "step": 15465, "train_speed(iter/s)": 0.201496 }, { "acc": 0.75416746, "epoch": 0.39244038559107053, "grad_norm": 2.21875, "learning_rate": 9.426770210307277e-06, "loss": 0.99033909, "memory(GiB)": 369.4, "step": 15470, "train_speed(iter/s)": 0.201502 }, { "acc": 0.7434062, "epoch": 0.3925672247590056, "grad_norm": 2.125, "learning_rate": 9.426282589526294e-06, "loss": 1.02743187, "memory(GiB)": 369.4, "step": 15475, "train_speed(iter/s)": 0.201512 }, { "acc": 0.75015411, "epoch": 0.3926940639269406, "grad_norm": 1.890625, "learning_rate": 9.425794774057641e-06, "loss": 1.00213928, "memory(GiB)": 369.4, "step": 15480, "train_speed(iter/s)": 0.201523 }, { "acc": 0.74284658, "epoch": 0.3928209030948757, "grad_norm": 2.09375, "learning_rate": 9.425306763922775e-06, "loss": 1.04920578, "memory(GiB)": 369.4, "step": 15485, "train_speed(iter/s)": 0.201536 }, { "acc": 0.72845116, "epoch": 0.39294774226281076, "grad_norm": 2.390625, "learning_rate": 9.42481855914316e-06, "loss": 1.08196697, "memory(GiB)": 369.4, "step": 15490, "train_speed(iter/s)": 0.201547 }, { "acc": 0.73017635, "epoch": 0.3930745814307458, "grad_norm": 2.40625, "learning_rate": 9.424330159740269e-06, "loss": 1.03209457, "memory(GiB)": 369.4, "step": 15495, "train_speed(iter/s)": 0.20156 }, { "acc": 0.75963216, "epoch": 0.39320142059868085, "grad_norm": 1.8203125, "learning_rate": 9.423841565735582e-06, "loss": 0.95576868, "memory(GiB)": 369.4, "step": 15500, "train_speed(iter/s)": 0.201567 }, { "acc": 0.74482813, "epoch": 0.39332825976661595, "grad_norm": 2.203125, "learning_rate": 9.423352777150597e-06, "loss": 1.0596385, "memory(GiB)": 369.4, "step": 15505, "train_speed(iter/s)": 0.201582 }, { "acc": 0.75392232, "epoch": 0.393455098934551, "grad_norm": 2.515625, "learning_rate": 9.422863794006804e-06, "loss": 1.03849335, "memory(GiB)": 369.4, "step": 15510, "train_speed(iter/s)": 0.201596 }, { "acc": 0.74146585, "epoch": 0.39358193810248604, "grad_norm": 1.984375, "learning_rate": 9.422374616325716e-06, "loss": 1.00839758, "memory(GiB)": 369.4, "step": 15515, "train_speed(iter/s)": 0.201609 }, { "acc": 0.74311466, "epoch": 0.3937087772704211, "grad_norm": 2.234375, "learning_rate": 9.421885244128847e-06, "loss": 1.02679749, "memory(GiB)": 369.4, "step": 15520, "train_speed(iter/s)": 0.201617 }, { "acc": 0.74305868, "epoch": 0.3938356164383562, "grad_norm": 2.328125, "learning_rate": 9.421395677437724e-06, "loss": 1.00592527, "memory(GiB)": 369.4, "step": 15525, "train_speed(iter/s)": 0.201627 }, { "acc": 0.75216022, "epoch": 0.39396245560629123, "grad_norm": 2.46875, "learning_rate": 9.42090591627388e-06, "loss": 1.00682335, "memory(GiB)": 369.4, "step": 15530, "train_speed(iter/s)": 0.201636 }, { "acc": 0.73483877, "epoch": 0.3940892947742263, "grad_norm": 2.625, "learning_rate": 9.420415960658853e-06, "loss": 1.00854235, "memory(GiB)": 369.4, "step": 15535, "train_speed(iter/s)": 0.201642 }, { "acc": 0.74995928, "epoch": 0.3942161339421613, "grad_norm": 2.265625, "learning_rate": 9.419925810614196e-06, "loss": 1.03436127, "memory(GiB)": 369.4, "step": 15540, "train_speed(iter/s)": 0.201652 }, { "acc": 0.73709316, "epoch": 0.3943429731100964, "grad_norm": 2.90625, "learning_rate": 9.419435466161471e-06, "loss": 1.05042953, "memory(GiB)": 369.4, "step": 15545, "train_speed(iter/s)": 0.201667 }, { "acc": 0.74314265, "epoch": 0.39446981227803146, "grad_norm": 2.25, "learning_rate": 9.418944927322242e-06, "loss": 1.04896564, "memory(GiB)": 369.4, "step": 15550, "train_speed(iter/s)": 0.201679 }, { "acc": 0.75189223, "epoch": 0.3945966514459665, "grad_norm": 2.21875, "learning_rate": 9.418454194118085e-06, "loss": 0.97154045, "memory(GiB)": 369.4, "step": 15555, "train_speed(iter/s)": 0.201691 }, { "acc": 0.74939771, "epoch": 0.39472349061390155, "grad_norm": 2.25, "learning_rate": 9.417963266570587e-06, "loss": 1.04821701, "memory(GiB)": 369.4, "step": 15560, "train_speed(iter/s)": 0.201701 }, { "acc": 0.74203715, "epoch": 0.39485032978183665, "grad_norm": 1.8828125, "learning_rate": 9.417472144701338e-06, "loss": 1.01389599, "memory(GiB)": 369.4, "step": 15565, "train_speed(iter/s)": 0.201714 }, { "acc": 0.75609379, "epoch": 0.3949771689497717, "grad_norm": 2.015625, "learning_rate": 9.416980828531944e-06, "loss": 0.95613785, "memory(GiB)": 369.4, "step": 15570, "train_speed(iter/s)": 0.201729 }, { "acc": 0.73730078, "epoch": 0.39510400811770674, "grad_norm": 2.21875, "learning_rate": 9.41648931808401e-06, "loss": 1.04541931, "memory(GiB)": 369.4, "step": 15575, "train_speed(iter/s)": 0.201743 }, { "acc": 0.74263391, "epoch": 0.3952308472856418, "grad_norm": 2.28125, "learning_rate": 9.415997613379159e-06, "loss": 1.00427303, "memory(GiB)": 369.4, "step": 15580, "train_speed(iter/s)": 0.201754 }, { "acc": 0.74859209, "epoch": 0.3953576864535769, "grad_norm": 2.265625, "learning_rate": 9.415505714439016e-06, "loss": 0.99069843, "memory(GiB)": 369.4, "step": 15585, "train_speed(iter/s)": 0.201764 }, { "acc": 0.73997316, "epoch": 0.39548452562151193, "grad_norm": 1.90625, "learning_rate": 9.415013621285219e-06, "loss": 1.03484583, "memory(GiB)": 369.4, "step": 15590, "train_speed(iter/s)": 0.201767 }, { "acc": 0.75247755, "epoch": 0.395611364789447, "grad_norm": 2.28125, "learning_rate": 9.41452133393941e-06, "loss": 0.9685895, "memory(GiB)": 369.4, "step": 15595, "train_speed(iter/s)": 0.201778 }, { "acc": 0.75715322, "epoch": 0.395738203957382, "grad_norm": 1.8984375, "learning_rate": 9.414028852423245e-06, "loss": 0.99997368, "memory(GiB)": 369.4, "step": 15600, "train_speed(iter/s)": 0.201784 }, { "acc": 0.75444937, "epoch": 0.3958650431253171, "grad_norm": 2.84375, "learning_rate": 9.413536176758384e-06, "loss": 0.99234257, "memory(GiB)": 369.4, "step": 15605, "train_speed(iter/s)": 0.20178 }, { "acc": 0.73229613, "epoch": 0.39599188229325216, "grad_norm": 2.328125, "learning_rate": 9.413043306966496e-06, "loss": 1.02907314, "memory(GiB)": 369.4, "step": 15610, "train_speed(iter/s)": 0.201792 }, { "acc": 0.75216036, "epoch": 0.3961187214611872, "grad_norm": 2.65625, "learning_rate": 9.41255024306926e-06, "loss": 0.97812366, "memory(GiB)": 369.4, "step": 15615, "train_speed(iter/s)": 0.201799 }, { "acc": 0.76016974, "epoch": 0.39624556062912225, "grad_norm": 2.375, "learning_rate": 9.412056985088364e-06, "loss": 1.02685537, "memory(GiB)": 369.4, "step": 15620, "train_speed(iter/s)": 0.201802 }, { "acc": 0.74724598, "epoch": 0.39637239979705735, "grad_norm": 2.546875, "learning_rate": 9.411563533045505e-06, "loss": 1.03710327, "memory(GiB)": 369.4, "step": 15625, "train_speed(iter/s)": 0.201812 }, { "acc": 0.73282461, "epoch": 0.3964992389649924, "grad_norm": 2.5, "learning_rate": 9.411069886962383e-06, "loss": 1.07110062, "memory(GiB)": 369.4, "step": 15630, "train_speed(iter/s)": 0.20182 }, { "acc": 0.73657589, "epoch": 0.39662607813292744, "grad_norm": 3.0, "learning_rate": 9.410576046860716e-06, "loss": 1.058846, "memory(GiB)": 369.4, "step": 15635, "train_speed(iter/s)": 0.20183 }, { "acc": 0.75751991, "epoch": 0.3967529173008625, "grad_norm": 2.015625, "learning_rate": 9.41008201276222e-06, "loss": 0.98488808, "memory(GiB)": 369.4, "step": 15640, "train_speed(iter/s)": 0.201841 }, { "acc": 0.74820604, "epoch": 0.3968797564687976, "grad_norm": 2.0, "learning_rate": 9.409587784688629e-06, "loss": 1.05534592, "memory(GiB)": 369.4, "step": 15645, "train_speed(iter/s)": 0.201847 }, { "acc": 0.7296752, "epoch": 0.39700659563673263, "grad_norm": 2.140625, "learning_rate": 9.409093362661678e-06, "loss": 1.10109787, "memory(GiB)": 369.4, "step": 15650, "train_speed(iter/s)": 0.201858 }, { "acc": 0.74944973, "epoch": 0.3971334348046677, "grad_norm": 2.421875, "learning_rate": 9.408598746703119e-06, "loss": 0.99318037, "memory(GiB)": 369.4, "step": 15655, "train_speed(iter/s)": 0.201871 }, { "acc": 0.75372763, "epoch": 0.3972602739726027, "grad_norm": 2.34375, "learning_rate": 9.408103936834703e-06, "loss": 1.04416647, "memory(GiB)": 369.4, "step": 15660, "train_speed(iter/s)": 0.201877 }, { "acc": 0.74308305, "epoch": 0.3973871131405378, "grad_norm": 2.25, "learning_rate": 9.407608933078194e-06, "loss": 1.032687, "memory(GiB)": 369.4, "step": 15665, "train_speed(iter/s)": 0.201882 }, { "acc": 0.75407033, "epoch": 0.39751395230847286, "grad_norm": 2.0, "learning_rate": 9.407113735455366e-06, "loss": 1.01721277, "memory(GiB)": 369.4, "step": 15670, "train_speed(iter/s)": 0.201889 }, { "acc": 0.75984459, "epoch": 0.3976407914764079, "grad_norm": 2.3125, "learning_rate": 9.406618343988e-06, "loss": 0.96998997, "memory(GiB)": 369.4, "step": 15675, "train_speed(iter/s)": 0.201902 }, { "acc": 0.74448462, "epoch": 0.39776763064434295, "grad_norm": 2.109375, "learning_rate": 9.406122758697885e-06, "loss": 1.03519888, "memory(GiB)": 369.4, "step": 15680, "train_speed(iter/s)": 0.201913 }, { "acc": 0.74092808, "epoch": 0.39789446981227805, "grad_norm": 1.96875, "learning_rate": 9.405626979606819e-06, "loss": 1.04957485, "memory(GiB)": 369.4, "step": 15685, "train_speed(iter/s)": 0.20192 }, { "acc": 0.74821811, "epoch": 0.3980213089802131, "grad_norm": 1.9453125, "learning_rate": 9.405131006736608e-06, "loss": 1.02705107, "memory(GiB)": 369.4, "step": 15690, "train_speed(iter/s)": 0.201928 }, { "acc": 0.75369034, "epoch": 0.39814814814814814, "grad_norm": 2.375, "learning_rate": 9.404634840109069e-06, "loss": 1.0155241, "memory(GiB)": 369.4, "step": 15695, "train_speed(iter/s)": 0.201931 }, { "acc": 0.7426321, "epoch": 0.3982749873160832, "grad_norm": 2.15625, "learning_rate": 9.404138479746022e-06, "loss": 1.01303463, "memory(GiB)": 369.4, "step": 15700, "train_speed(iter/s)": 0.201938 }, { "acc": 0.74803905, "epoch": 0.3984018264840183, "grad_norm": 2.140625, "learning_rate": 9.403641925669304e-06, "loss": 1.06551399, "memory(GiB)": 369.4, "step": 15705, "train_speed(iter/s)": 0.201944 }, { "acc": 0.73871946, "epoch": 0.39852866565195333, "grad_norm": 1.703125, "learning_rate": 9.403145177900752e-06, "loss": 1.05696421, "memory(GiB)": 369.4, "step": 15710, "train_speed(iter/s)": 0.20195 }, { "acc": 0.74189968, "epoch": 0.3986555048198884, "grad_norm": 2.40625, "learning_rate": 9.402648236462217e-06, "loss": 1.01788731, "memory(GiB)": 369.4, "step": 15715, "train_speed(iter/s)": 0.20196 }, { "acc": 0.74817019, "epoch": 0.3987823439878234, "grad_norm": 1.8203125, "learning_rate": 9.402151101375557e-06, "loss": 1.01790295, "memory(GiB)": 369.4, "step": 15720, "train_speed(iter/s)": 0.201964 }, { "acc": 0.73183918, "epoch": 0.3989091831557585, "grad_norm": 2.09375, "learning_rate": 9.401653772662638e-06, "loss": 1.04835758, "memory(GiB)": 369.4, "step": 15725, "train_speed(iter/s)": 0.201974 }, { "acc": 0.73628802, "epoch": 0.39903602232369356, "grad_norm": 2.03125, "learning_rate": 9.401156250345331e-06, "loss": 1.0441411, "memory(GiB)": 369.4, "step": 15730, "train_speed(iter/s)": 0.201983 }, { "acc": 0.74935446, "epoch": 0.3991628614916286, "grad_norm": 2.171875, "learning_rate": 9.400658534445524e-06, "loss": 0.97735157, "memory(GiB)": 369.4, "step": 15735, "train_speed(iter/s)": 0.20199 }, { "acc": 0.7413693, "epoch": 0.39928970065956365, "grad_norm": 2.328125, "learning_rate": 9.40016062498511e-06, "loss": 1.04539223, "memory(GiB)": 369.4, "step": 15740, "train_speed(iter/s)": 0.201997 }, { "acc": 0.74338007, "epoch": 0.39941653982749875, "grad_norm": 2.296875, "learning_rate": 9.399662521985982e-06, "loss": 1.03502827, "memory(GiB)": 369.4, "step": 15745, "train_speed(iter/s)": 0.202004 }, { "acc": 0.74935665, "epoch": 0.3995433789954338, "grad_norm": 2.359375, "learning_rate": 9.399164225470055e-06, "loss": 1.04552984, "memory(GiB)": 369.4, "step": 15750, "train_speed(iter/s)": 0.202012 }, { "acc": 0.74244003, "epoch": 0.39967021816336884, "grad_norm": 2.21875, "learning_rate": 9.398665735459245e-06, "loss": 1.04905701, "memory(GiB)": 369.4, "step": 15755, "train_speed(iter/s)": 0.202023 }, { "acc": 0.73178401, "epoch": 0.3997970573313039, "grad_norm": 1.6953125, "learning_rate": 9.398167051975475e-06, "loss": 1.0501049, "memory(GiB)": 369.4, "step": 15760, "train_speed(iter/s)": 0.20203 }, { "acc": 0.74052567, "epoch": 0.399923896499239, "grad_norm": 2.453125, "learning_rate": 9.397668175040684e-06, "loss": 1.00528336, "memory(GiB)": 369.4, "step": 15765, "train_speed(iter/s)": 0.202034 }, { "acc": 0.73390207, "epoch": 0.40005073566717403, "grad_norm": 2.140625, "learning_rate": 9.397169104676813e-06, "loss": 1.05242519, "memory(GiB)": 369.4, "step": 15770, "train_speed(iter/s)": 0.202045 }, { "acc": 0.73381538, "epoch": 0.4001775748351091, "grad_norm": 1.8515625, "learning_rate": 9.39666984090581e-06, "loss": 1.06326418, "memory(GiB)": 369.4, "step": 15775, "train_speed(iter/s)": 0.202051 }, { "acc": 0.75061207, "epoch": 0.4003044140030441, "grad_norm": 2.1875, "learning_rate": 9.396170383749642e-06, "loss": 1.01671066, "memory(GiB)": 369.4, "step": 15780, "train_speed(iter/s)": 0.202055 }, { "acc": 0.74583926, "epoch": 0.4004312531709792, "grad_norm": 1.90625, "learning_rate": 9.39567073323027e-06, "loss": 1.00101109, "memory(GiB)": 369.4, "step": 15785, "train_speed(iter/s)": 0.202067 }, { "acc": 0.73230677, "epoch": 0.40055809233891426, "grad_norm": 2.296875, "learning_rate": 9.395170889369674e-06, "loss": 1.07385521, "memory(GiB)": 369.4, "step": 15790, "train_speed(iter/s)": 0.202082 }, { "acc": 0.75273752, "epoch": 0.4006849315068493, "grad_norm": 2.03125, "learning_rate": 9.39467085218984e-06, "loss": 1.01438913, "memory(GiB)": 369.4, "step": 15795, "train_speed(iter/s)": 0.202083 }, { "acc": 0.74248486, "epoch": 0.40081177067478435, "grad_norm": 1.90625, "learning_rate": 9.39417062171276e-06, "loss": 1.02563515, "memory(GiB)": 369.4, "step": 15800, "train_speed(iter/s)": 0.202096 }, { "acc": 0.75330563, "epoch": 0.40093860984271945, "grad_norm": 2.234375, "learning_rate": 9.393670197960439e-06, "loss": 0.99728775, "memory(GiB)": 369.4, "step": 15805, "train_speed(iter/s)": 0.202104 }, { "acc": 0.73914642, "epoch": 0.4010654490106545, "grad_norm": 1.796875, "learning_rate": 9.393169580954884e-06, "loss": 1.01689644, "memory(GiB)": 369.4, "step": 15810, "train_speed(iter/s)": 0.202113 }, { "acc": 0.73506489, "epoch": 0.40119228817858954, "grad_norm": 1.8984375, "learning_rate": 9.392668770718118e-06, "loss": 1.08132553, "memory(GiB)": 369.4, "step": 15815, "train_speed(iter/s)": 0.202119 }, { "acc": 0.74870477, "epoch": 0.4013191273465246, "grad_norm": 2.15625, "learning_rate": 9.392167767272169e-06, "loss": 1.03410072, "memory(GiB)": 369.4, "step": 15820, "train_speed(iter/s)": 0.20213 }, { "acc": 0.7345521, "epoch": 0.4014459665144597, "grad_norm": 1.9609375, "learning_rate": 9.39166657063907e-06, "loss": 1.03682261, "memory(GiB)": 369.4, "step": 15825, "train_speed(iter/s)": 0.202136 }, { "acc": 0.73766837, "epoch": 0.40157280568239473, "grad_norm": 1.953125, "learning_rate": 9.391165180840869e-06, "loss": 1.0003912, "memory(GiB)": 369.4, "step": 15830, "train_speed(iter/s)": 0.202144 }, { "acc": 0.74425831, "epoch": 0.4016996448503298, "grad_norm": 2.421875, "learning_rate": 9.390663597899619e-06, "loss": 1.02737274, "memory(GiB)": 369.4, "step": 15835, "train_speed(iter/s)": 0.202152 }, { "acc": 0.75176249, "epoch": 0.4018264840182648, "grad_norm": 2.3125, "learning_rate": 9.39016182183738e-06, "loss": 0.95312452, "memory(GiB)": 369.4, "step": 15840, "train_speed(iter/s)": 0.202164 }, { "acc": 0.74204154, "epoch": 0.4019533231861999, "grad_norm": 2.25, "learning_rate": 9.389659852676223e-06, "loss": 1.01919317, "memory(GiB)": 369.4, "step": 15845, "train_speed(iter/s)": 0.202172 }, { "acc": 0.75683184, "epoch": 0.40208016235413496, "grad_norm": 2.515625, "learning_rate": 9.389157690438228e-06, "loss": 0.95102234, "memory(GiB)": 369.4, "step": 15850, "train_speed(iter/s)": 0.20218 }, { "acc": 0.74688635, "epoch": 0.40220700152207, "grad_norm": 2.078125, "learning_rate": 9.38865533514548e-06, "loss": 0.99255505, "memory(GiB)": 369.4, "step": 15855, "train_speed(iter/s)": 0.20219 }, { "acc": 0.76120863, "epoch": 0.40233384069000505, "grad_norm": 2.03125, "learning_rate": 9.388152786820078e-06, "loss": 0.93229485, "memory(GiB)": 369.4, "step": 15860, "train_speed(iter/s)": 0.202195 }, { "acc": 0.72955608, "epoch": 0.40246067985794015, "grad_norm": 2.25, "learning_rate": 9.387650045484124e-06, "loss": 1.12393951, "memory(GiB)": 369.4, "step": 15865, "train_speed(iter/s)": 0.202202 }, { "acc": 0.74289608, "epoch": 0.4025875190258752, "grad_norm": 2.140625, "learning_rate": 9.387147111159734e-06, "loss": 1.02175102, "memory(GiB)": 369.4, "step": 15870, "train_speed(iter/s)": 0.202214 }, { "acc": 0.7404213, "epoch": 0.40271435819381024, "grad_norm": 3.03125, "learning_rate": 9.386643983869025e-06, "loss": 1.01651745, "memory(GiB)": 369.4, "step": 15875, "train_speed(iter/s)": 0.202223 }, { "acc": 0.737956, "epoch": 0.4028411973617453, "grad_norm": 2.609375, "learning_rate": 9.386140663634127e-06, "loss": 1.12454681, "memory(GiB)": 369.4, "step": 15880, "train_speed(iter/s)": 0.202234 }, { "acc": 0.73279562, "epoch": 0.4029680365296804, "grad_norm": 2.203125, "learning_rate": 9.385637150477182e-06, "loss": 1.06567745, "memory(GiB)": 369.4, "step": 15885, "train_speed(iter/s)": 0.202243 }, { "acc": 0.73854108, "epoch": 0.40309487569761543, "grad_norm": 2.375, "learning_rate": 9.385133444420333e-06, "loss": 1.05808067, "memory(GiB)": 369.4, "step": 15890, "train_speed(iter/s)": 0.202256 }, { "acc": 0.75747623, "epoch": 0.4032217148655505, "grad_norm": 2.84375, "learning_rate": 9.384629545485738e-06, "loss": 0.96580257, "memory(GiB)": 369.4, "step": 15895, "train_speed(iter/s)": 0.202261 }, { "acc": 0.73564916, "epoch": 0.4033485540334855, "grad_norm": 2.0, "learning_rate": 9.38412545369556e-06, "loss": 1.0404459, "memory(GiB)": 369.4, "step": 15900, "train_speed(iter/s)": 0.202266 }, { "acc": 0.73634777, "epoch": 0.4034753932014206, "grad_norm": 2.75, "learning_rate": 9.383621169071971e-06, "loss": 1.00460529, "memory(GiB)": 369.4, "step": 15905, "train_speed(iter/s)": 0.20228 }, { "acc": 0.75330734, "epoch": 0.40360223236935566, "grad_norm": 1.9453125, "learning_rate": 9.38311669163715e-06, "loss": 1.0268177, "memory(GiB)": 369.4, "step": 15910, "train_speed(iter/s)": 0.202285 }, { "acc": 0.74822388, "epoch": 0.4037290715372907, "grad_norm": 2.78125, "learning_rate": 9.38261202141329e-06, "loss": 0.99948359, "memory(GiB)": 369.4, "step": 15915, "train_speed(iter/s)": 0.202295 }, { "acc": 0.74619956, "epoch": 0.40385591070522575, "grad_norm": 2.65625, "learning_rate": 9.382107158422585e-06, "loss": 1.00845423, "memory(GiB)": 369.4, "step": 15920, "train_speed(iter/s)": 0.202302 }, { "acc": 0.74148831, "epoch": 0.40398274987316085, "grad_norm": 2.546875, "learning_rate": 9.381602102687241e-06, "loss": 0.99661942, "memory(GiB)": 369.4, "step": 15925, "train_speed(iter/s)": 0.202307 }, { "acc": 0.7462851, "epoch": 0.4041095890410959, "grad_norm": 2.21875, "learning_rate": 9.381096854229476e-06, "loss": 1.02246799, "memory(GiB)": 369.4, "step": 15930, "train_speed(iter/s)": 0.202313 }, { "acc": 0.75032272, "epoch": 0.40423642820903094, "grad_norm": 2.046875, "learning_rate": 9.38059141307151e-06, "loss": 1.04194613, "memory(GiB)": 369.4, "step": 15935, "train_speed(iter/s)": 0.20232 }, { "acc": 0.74431572, "epoch": 0.404363267376966, "grad_norm": 1.9140625, "learning_rate": 9.380085779235577e-06, "loss": 1.00722446, "memory(GiB)": 369.4, "step": 15940, "train_speed(iter/s)": 0.202326 }, { "acc": 0.73594666, "epoch": 0.4044901065449011, "grad_norm": 1.90625, "learning_rate": 9.379579952743916e-06, "loss": 1.04409275, "memory(GiB)": 369.4, "step": 15945, "train_speed(iter/s)": 0.202335 }, { "acc": 0.74787326, "epoch": 0.40461694571283613, "grad_norm": 2.421875, "learning_rate": 9.379073933618774e-06, "loss": 1.00968647, "memory(GiB)": 369.4, "step": 15950, "train_speed(iter/s)": 0.202347 }, { "acc": 0.73984671, "epoch": 0.4047437848807712, "grad_norm": 1.9296875, "learning_rate": 9.37856772188241e-06, "loss": 1.03041773, "memory(GiB)": 369.4, "step": 15955, "train_speed(iter/s)": 0.202356 }, { "acc": 0.74537826, "epoch": 0.4048706240487062, "grad_norm": 1.9765625, "learning_rate": 9.378061317557088e-06, "loss": 0.98532085, "memory(GiB)": 369.4, "step": 15960, "train_speed(iter/s)": 0.202366 }, { "acc": 0.75560837, "epoch": 0.4049974632166413, "grad_norm": 2.40625, "learning_rate": 9.377554720665083e-06, "loss": 1.00118065, "memory(GiB)": 369.4, "step": 15965, "train_speed(iter/s)": 0.202378 }, { "acc": 0.74163284, "epoch": 0.40512430238457636, "grad_norm": 2.125, "learning_rate": 9.377047931228677e-06, "loss": 1.05362034, "memory(GiB)": 369.4, "step": 15970, "train_speed(iter/s)": 0.202386 }, { "acc": 0.75423584, "epoch": 0.4052511415525114, "grad_norm": 2.65625, "learning_rate": 9.376540949270161e-06, "loss": 0.98733387, "memory(GiB)": 369.4, "step": 15975, "train_speed(iter/s)": 0.2024 }, { "acc": 0.7434185, "epoch": 0.40537798072044645, "grad_norm": 2.078125, "learning_rate": 9.376033774811833e-06, "loss": 0.9838541, "memory(GiB)": 369.4, "step": 15980, "train_speed(iter/s)": 0.202412 }, { "acc": 0.7578011, "epoch": 0.40550481988838155, "grad_norm": 2.34375, "learning_rate": 9.375526407876003e-06, "loss": 0.99665279, "memory(GiB)": 369.4, "step": 15985, "train_speed(iter/s)": 0.202423 }, { "acc": 0.73311596, "epoch": 0.4056316590563166, "grad_norm": 2.359375, "learning_rate": 9.375018848484987e-06, "loss": 1.05727081, "memory(GiB)": 369.4, "step": 15990, "train_speed(iter/s)": 0.202431 }, { "acc": 0.74847894, "epoch": 0.40575849822425164, "grad_norm": 1.7890625, "learning_rate": 9.374511096661108e-06, "loss": 1.01878681, "memory(GiB)": 369.4, "step": 15995, "train_speed(iter/s)": 0.202435 }, { "acc": 0.74180632, "epoch": 0.4058853373921867, "grad_norm": 1.90625, "learning_rate": 9.374003152426701e-06, "loss": 0.99784174, "memory(GiB)": 369.4, "step": 16000, "train_speed(iter/s)": 0.202446 }, { "epoch": 0.4058853373921867, "eval_acc": 0.7334352128545331, "eval_loss": 0.9935078620910645, "eval_runtime": 385.1411, "eval_samples_per_second": 16.539, "eval_steps_per_second": 8.27, "step": 16000 }, { "acc": 0.73839359, "epoch": 0.4060121765601218, "grad_norm": 2.125, "learning_rate": 9.373495015804106e-06, "loss": 1.01007347, "memory(GiB)": 369.4, "step": 16005, "train_speed(iter/s)": 0.200646 }, { "acc": 0.73675947, "epoch": 0.40613901572805683, "grad_norm": 2.1875, "learning_rate": 9.372986686815674e-06, "loss": 1.07212086, "memory(GiB)": 369.4, "step": 16010, "train_speed(iter/s)": 0.200655 }, { "acc": 0.75057592, "epoch": 0.4062658548959919, "grad_norm": 2.46875, "learning_rate": 9.372478165483763e-06, "loss": 0.9780591, "memory(GiB)": 369.4, "step": 16015, "train_speed(iter/s)": 0.200666 }, { "acc": 0.75031781, "epoch": 0.4063926940639269, "grad_norm": 2.09375, "learning_rate": 9.371969451830743e-06, "loss": 1.03692398, "memory(GiB)": 369.4, "step": 16020, "train_speed(iter/s)": 0.200673 }, { "acc": 0.75359197, "epoch": 0.406519533231862, "grad_norm": 2.03125, "learning_rate": 9.371460545878986e-06, "loss": 0.98143234, "memory(GiB)": 369.4, "step": 16025, "train_speed(iter/s)": 0.200683 }, { "acc": 0.74544063, "epoch": 0.40664637239979706, "grad_norm": 2.359375, "learning_rate": 9.370951447650875e-06, "loss": 1.01993008, "memory(GiB)": 369.4, "step": 16030, "train_speed(iter/s)": 0.200692 }, { "acc": 0.72509193, "epoch": 0.4067732115677321, "grad_norm": 2.171875, "learning_rate": 9.370442157168806e-06, "loss": 1.05367413, "memory(GiB)": 369.4, "step": 16035, "train_speed(iter/s)": 0.200702 }, { "acc": 0.73933487, "epoch": 0.40690005073566715, "grad_norm": 1.890625, "learning_rate": 9.369932674455177e-06, "loss": 1.03527708, "memory(GiB)": 369.4, "step": 16040, "train_speed(iter/s)": 0.200714 }, { "acc": 0.75123243, "epoch": 0.40702688990360225, "grad_norm": 1.9453125, "learning_rate": 9.3694229995324e-06, "loss": 0.96230698, "memory(GiB)": 369.4, "step": 16045, "train_speed(iter/s)": 0.200723 }, { "acc": 0.75583558, "epoch": 0.4071537290715373, "grad_norm": 2.421875, "learning_rate": 9.368913132422891e-06, "loss": 1.02887936, "memory(GiB)": 369.4, "step": 16050, "train_speed(iter/s)": 0.200736 }, { "acc": 0.74400511, "epoch": 0.40728056823947234, "grad_norm": 2.203125, "learning_rate": 9.368403073149079e-06, "loss": 1.01158695, "memory(GiB)": 369.4, "step": 16055, "train_speed(iter/s)": 0.200748 }, { "acc": 0.74179316, "epoch": 0.4074074074074074, "grad_norm": 2.5625, "learning_rate": 9.367892821733393e-06, "loss": 1.00887775, "memory(GiB)": 369.4, "step": 16060, "train_speed(iter/s)": 0.200755 }, { "acc": 0.74489813, "epoch": 0.4075342465753425, "grad_norm": 2.1875, "learning_rate": 9.367382378198282e-06, "loss": 1.04085999, "memory(GiB)": 369.4, "step": 16065, "train_speed(iter/s)": 0.200766 }, { "acc": 0.74681211, "epoch": 0.40766108574327753, "grad_norm": 2.75, "learning_rate": 9.366871742566193e-06, "loss": 0.99840431, "memory(GiB)": 369.4, "step": 16070, "train_speed(iter/s)": 0.200776 }, { "acc": 0.73765593, "epoch": 0.4077879249112126, "grad_norm": 2.265625, "learning_rate": 9.36636091485959e-06, "loss": 1.03528099, "memory(GiB)": 369.4, "step": 16075, "train_speed(iter/s)": 0.200786 }, { "acc": 0.74506598, "epoch": 0.4079147640791476, "grad_norm": 2.265625, "learning_rate": 9.365849895100939e-06, "loss": 0.98499031, "memory(GiB)": 369.4, "step": 16080, "train_speed(iter/s)": 0.200793 }, { "acc": 0.74971819, "epoch": 0.4080416032470827, "grad_norm": 2.703125, "learning_rate": 9.365338683312714e-06, "loss": 0.99557076, "memory(GiB)": 369.4, "step": 16085, "train_speed(iter/s)": 0.200801 }, { "acc": 0.74712114, "epoch": 0.40816844241501776, "grad_norm": 2.015625, "learning_rate": 9.364827279517408e-06, "loss": 1.03934422, "memory(GiB)": 369.4, "step": 16090, "train_speed(iter/s)": 0.200809 }, { "acc": 0.73167534, "epoch": 0.4082952815829528, "grad_norm": 2.0, "learning_rate": 9.36431568373751e-06, "loss": 1.08160114, "memory(GiB)": 369.4, "step": 16095, "train_speed(iter/s)": 0.200819 }, { "acc": 0.74891844, "epoch": 0.40842212075088785, "grad_norm": 2.046875, "learning_rate": 9.363803895995522e-06, "loss": 0.98161755, "memory(GiB)": 369.4, "step": 16100, "train_speed(iter/s)": 0.200828 }, { "acc": 0.74974995, "epoch": 0.40854895991882295, "grad_norm": 2.40625, "learning_rate": 9.363291916313955e-06, "loss": 1.00658083, "memory(GiB)": 369.4, "step": 16105, "train_speed(iter/s)": 0.200839 }, { "acc": 0.74611177, "epoch": 0.408675799086758, "grad_norm": 1.9375, "learning_rate": 9.362779744715332e-06, "loss": 1.02125282, "memory(GiB)": 369.4, "step": 16110, "train_speed(iter/s)": 0.200843 }, { "acc": 0.74879045, "epoch": 0.40880263825469304, "grad_norm": 2.40625, "learning_rate": 9.362267381222174e-06, "loss": 1.03480062, "memory(GiB)": 369.4, "step": 16115, "train_speed(iter/s)": 0.200852 }, { "acc": 0.74820805, "epoch": 0.4089294774226281, "grad_norm": 2.046875, "learning_rate": 9.361754825857022e-06, "loss": 1.00455132, "memory(GiB)": 369.4, "step": 16120, "train_speed(iter/s)": 0.200857 }, { "acc": 0.74013786, "epoch": 0.4090563165905632, "grad_norm": 2.265625, "learning_rate": 9.36124207864242e-06, "loss": 1.03145657, "memory(GiB)": 369.4, "step": 16125, "train_speed(iter/s)": 0.200866 }, { "acc": 0.73587661, "epoch": 0.40918315575849823, "grad_norm": 2.265625, "learning_rate": 9.360729139600917e-06, "loss": 1.02892914, "memory(GiB)": 369.4, "step": 16130, "train_speed(iter/s)": 0.200879 }, { "acc": 0.74987202, "epoch": 0.4093099949264333, "grad_norm": 2.109375, "learning_rate": 9.36021600875508e-06, "loss": 1.03011503, "memory(GiB)": 369.4, "step": 16135, "train_speed(iter/s)": 0.200891 }, { "acc": 0.73526812, "epoch": 0.4094368340943683, "grad_norm": 2.109375, "learning_rate": 9.359702686127474e-06, "loss": 1.0478426, "memory(GiB)": 369.4, "step": 16140, "train_speed(iter/s)": 0.200902 }, { "acc": 0.75080786, "epoch": 0.4095636732623034, "grad_norm": 2.046875, "learning_rate": 9.359189171740679e-06, "loss": 1.04870033, "memory(GiB)": 369.4, "step": 16145, "train_speed(iter/s)": 0.200912 }, { "acc": 0.7391232, "epoch": 0.40969051243023846, "grad_norm": 2.0, "learning_rate": 9.358675465617283e-06, "loss": 1.0303195, "memory(GiB)": 369.4, "step": 16150, "train_speed(iter/s)": 0.200925 }, { "acc": 0.74563122, "epoch": 0.4098173515981735, "grad_norm": 2.3125, "learning_rate": 9.35816156777988e-06, "loss": 1.04413834, "memory(GiB)": 369.4, "step": 16155, "train_speed(iter/s)": 0.200936 }, { "acc": 0.73808956, "epoch": 0.40994419076610855, "grad_norm": 2.640625, "learning_rate": 9.357647478251072e-06, "loss": 1.02075987, "memory(GiB)": 369.4, "step": 16160, "train_speed(iter/s)": 0.200949 }, { "acc": 0.73614521, "epoch": 0.41007102993404365, "grad_norm": 2.421875, "learning_rate": 9.357133197053475e-06, "loss": 1.06327696, "memory(GiB)": 369.4, "step": 16165, "train_speed(iter/s)": 0.200963 }, { "acc": 0.74454718, "epoch": 0.4101978691019787, "grad_norm": 2.65625, "learning_rate": 9.356618724209704e-06, "loss": 1.03618469, "memory(GiB)": 369.4, "step": 16170, "train_speed(iter/s)": 0.200971 }, { "acc": 0.74558129, "epoch": 0.41032470826991374, "grad_norm": 1.9921875, "learning_rate": 9.356104059742392e-06, "loss": 0.96805611, "memory(GiB)": 369.4, "step": 16175, "train_speed(iter/s)": 0.200982 }, { "acc": 0.73925734, "epoch": 0.4104515474378488, "grad_norm": 2.015625, "learning_rate": 9.355589203674175e-06, "loss": 1.03768845, "memory(GiB)": 369.4, "step": 16180, "train_speed(iter/s)": 0.200992 }, { "acc": 0.7586853, "epoch": 0.4105783866057839, "grad_norm": 2.578125, "learning_rate": 9.355074156027699e-06, "loss": 0.97726631, "memory(GiB)": 369.4, "step": 16185, "train_speed(iter/s)": 0.201004 }, { "acc": 0.73938618, "epoch": 0.41070522577371893, "grad_norm": 2.046875, "learning_rate": 9.354558916825616e-06, "loss": 1.0464982, "memory(GiB)": 369.4, "step": 16190, "train_speed(iter/s)": 0.201015 }, { "acc": 0.75063219, "epoch": 0.410832064941654, "grad_norm": 2.0625, "learning_rate": 9.354043486090592e-06, "loss": 0.97359781, "memory(GiB)": 369.4, "step": 16195, "train_speed(iter/s)": 0.201022 }, { "acc": 0.73265777, "epoch": 0.410958904109589, "grad_norm": 2.09375, "learning_rate": 9.353527863845296e-06, "loss": 1.09790096, "memory(GiB)": 369.4, "step": 16200, "train_speed(iter/s)": 0.201035 }, { "acc": 0.75815983, "epoch": 0.4110857432775241, "grad_norm": 1.84375, "learning_rate": 9.353012050112405e-06, "loss": 0.98086424, "memory(GiB)": 369.4, "step": 16205, "train_speed(iter/s)": 0.201042 }, { "acc": 0.74424438, "epoch": 0.41121258244545916, "grad_norm": 2.0625, "learning_rate": 9.352496044914611e-06, "loss": 0.99777393, "memory(GiB)": 369.4, "step": 16210, "train_speed(iter/s)": 0.20105 }, { "acc": 0.75846705, "epoch": 0.4113394216133942, "grad_norm": 2.6875, "learning_rate": 9.351979848274608e-06, "loss": 0.9728199, "memory(GiB)": 369.4, "step": 16215, "train_speed(iter/s)": 0.201059 }, { "acc": 0.76370363, "epoch": 0.41146626078132925, "grad_norm": 2.03125, "learning_rate": 9.351463460215102e-06, "loss": 0.9313118, "memory(GiB)": 369.4, "step": 16220, "train_speed(iter/s)": 0.201069 }, { "acc": 0.75069695, "epoch": 0.41159309994926435, "grad_norm": 2.546875, "learning_rate": 9.350946880758804e-06, "loss": 1.00205402, "memory(GiB)": 369.4, "step": 16225, "train_speed(iter/s)": 0.20108 }, { "acc": 0.74602575, "epoch": 0.4117199391171994, "grad_norm": 2.078125, "learning_rate": 9.350430109928437e-06, "loss": 1.01740446, "memory(GiB)": 369.4, "step": 16230, "train_speed(iter/s)": 0.201089 }, { "acc": 0.74978743, "epoch": 0.41184677828513444, "grad_norm": 2.421875, "learning_rate": 9.349913147746731e-06, "loss": 1.05110607, "memory(GiB)": 369.4, "step": 16235, "train_speed(iter/s)": 0.2011 }, { "acc": 0.75076156, "epoch": 0.4119736174530695, "grad_norm": 1.8203125, "learning_rate": 9.349395994236423e-06, "loss": 0.9742691, "memory(GiB)": 369.4, "step": 16240, "train_speed(iter/s)": 0.201111 }, { "acc": 0.74232764, "epoch": 0.4121004566210046, "grad_norm": 2.171875, "learning_rate": 9.348878649420262e-06, "loss": 1.06046791, "memory(GiB)": 369.4, "step": 16245, "train_speed(iter/s)": 0.201124 }, { "acc": 0.7373682, "epoch": 0.41222729578893963, "grad_norm": 2.390625, "learning_rate": 9.348361113321e-06, "loss": 1.02679729, "memory(GiB)": 369.4, "step": 16250, "train_speed(iter/s)": 0.201128 }, { "acc": 0.74952807, "epoch": 0.4123541349568747, "grad_norm": 2.3125, "learning_rate": 9.347843385961403e-06, "loss": 1.03532829, "memory(GiB)": 369.4, "step": 16255, "train_speed(iter/s)": 0.201136 }, { "acc": 0.73414345, "epoch": 0.4124809741248097, "grad_norm": 2.1875, "learning_rate": 9.347325467364242e-06, "loss": 1.0238699, "memory(GiB)": 369.4, "step": 16260, "train_speed(iter/s)": 0.201149 }, { "acc": 0.75224824, "epoch": 0.4126078132927448, "grad_norm": 2.109375, "learning_rate": 9.346807357552296e-06, "loss": 1.01597252, "memory(GiB)": 369.4, "step": 16265, "train_speed(iter/s)": 0.201158 }, { "acc": 0.73782687, "epoch": 0.41273465246067986, "grad_norm": 2.03125, "learning_rate": 9.346289056548357e-06, "loss": 1.02302685, "memory(GiB)": 369.4, "step": 16270, "train_speed(iter/s)": 0.201169 }, { "acc": 0.75321054, "epoch": 0.4128614916286149, "grad_norm": 1.859375, "learning_rate": 9.345770564375221e-06, "loss": 0.98389854, "memory(GiB)": 369.4, "step": 16275, "train_speed(iter/s)": 0.201175 }, { "acc": 0.73251734, "epoch": 0.41298833079654995, "grad_norm": 2.015625, "learning_rate": 9.345251881055692e-06, "loss": 1.01387615, "memory(GiB)": 369.4, "step": 16280, "train_speed(iter/s)": 0.201183 }, { "acc": 0.7500227, "epoch": 0.41311516996448505, "grad_norm": 2.53125, "learning_rate": 9.344733006612585e-06, "loss": 1.04854088, "memory(GiB)": 369.4, "step": 16285, "train_speed(iter/s)": 0.201192 }, { "acc": 0.7459053, "epoch": 0.4132420091324201, "grad_norm": 2.34375, "learning_rate": 9.344213941068724e-06, "loss": 1.03207016, "memory(GiB)": 369.4, "step": 16290, "train_speed(iter/s)": 0.201199 }, { "acc": 0.73931675, "epoch": 0.41336884830035514, "grad_norm": 2.375, "learning_rate": 9.343694684446937e-06, "loss": 1.03360176, "memory(GiB)": 369.4, "step": 16295, "train_speed(iter/s)": 0.201202 }, { "acc": 0.74867907, "epoch": 0.4134956874682902, "grad_norm": 2.078125, "learning_rate": 9.343175236770065e-06, "loss": 0.99481401, "memory(GiB)": 369.4, "step": 16300, "train_speed(iter/s)": 0.201211 }, { "acc": 0.74790535, "epoch": 0.4136225266362253, "grad_norm": 1.8125, "learning_rate": 9.342655598060955e-06, "loss": 0.99789677, "memory(GiB)": 369.4, "step": 16305, "train_speed(iter/s)": 0.20122 }, { "acc": 0.74652958, "epoch": 0.41374936580416033, "grad_norm": 1.890625, "learning_rate": 9.342135768342464e-06, "loss": 0.98819122, "memory(GiB)": 369.4, "step": 16310, "train_speed(iter/s)": 0.201228 }, { "acc": 0.75198689, "epoch": 0.4138762049720954, "grad_norm": 2.15625, "learning_rate": 9.341615747637454e-06, "loss": 0.9936409, "memory(GiB)": 369.4, "step": 16315, "train_speed(iter/s)": 0.201241 }, { "acc": 0.7364934, "epoch": 0.4140030441400304, "grad_norm": 1.8984375, "learning_rate": 9.3410955359688e-06, "loss": 1.04692574, "memory(GiB)": 369.4, "step": 16320, "train_speed(iter/s)": 0.201252 }, { "acc": 0.74670863, "epoch": 0.4141298833079655, "grad_norm": 2.390625, "learning_rate": 9.340575133359385e-06, "loss": 1.00202923, "memory(GiB)": 369.4, "step": 16325, "train_speed(iter/s)": 0.201255 }, { "acc": 0.7439476, "epoch": 0.41425672247590056, "grad_norm": 2.03125, "learning_rate": 9.340054539832095e-06, "loss": 0.98834229, "memory(GiB)": 369.4, "step": 16330, "train_speed(iter/s)": 0.201261 }, { "acc": 0.7344758, "epoch": 0.4143835616438356, "grad_norm": 2.546875, "learning_rate": 9.339533755409828e-06, "loss": 1.07708721, "memory(GiB)": 369.4, "step": 16335, "train_speed(iter/s)": 0.201273 }, { "acc": 0.73711371, "epoch": 0.41451040081177065, "grad_norm": 2.140625, "learning_rate": 9.339012780115492e-06, "loss": 1.03245687, "memory(GiB)": 369.4, "step": 16340, "train_speed(iter/s)": 0.20128 }, { "acc": 0.74285703, "epoch": 0.41463723997970575, "grad_norm": 1.921875, "learning_rate": 9.338491613972002e-06, "loss": 0.99333477, "memory(GiB)": 369.4, "step": 16345, "train_speed(iter/s)": 0.201287 }, { "acc": 0.76223602, "epoch": 0.4147640791476408, "grad_norm": 2.125, "learning_rate": 9.337970257002282e-06, "loss": 1.02642021, "memory(GiB)": 369.4, "step": 16350, "train_speed(iter/s)": 0.201279 }, { "acc": 0.7432519, "epoch": 0.41489091831557584, "grad_norm": 2.546875, "learning_rate": 9.337448709229261e-06, "loss": 1.09971657, "memory(GiB)": 369.4, "step": 16355, "train_speed(iter/s)": 0.201289 }, { "acc": 0.74820814, "epoch": 0.4150177574835109, "grad_norm": 2.421875, "learning_rate": 9.336926970675883e-06, "loss": 1.03107014, "memory(GiB)": 369.4, "step": 16360, "train_speed(iter/s)": 0.201301 }, { "acc": 0.73713427, "epoch": 0.415144596651446, "grad_norm": 2.21875, "learning_rate": 9.33640504136509e-06, "loss": 1.05091057, "memory(GiB)": 369.4, "step": 16365, "train_speed(iter/s)": 0.201309 }, { "acc": 0.73871489, "epoch": 0.41527143581938103, "grad_norm": 2.25, "learning_rate": 9.335882921319845e-06, "loss": 1.01176777, "memory(GiB)": 369.4, "step": 16370, "train_speed(iter/s)": 0.20132 }, { "acc": 0.75374393, "epoch": 0.4153982749873161, "grad_norm": 1.9453125, "learning_rate": 9.335360610563111e-06, "loss": 1.02276039, "memory(GiB)": 369.4, "step": 16375, "train_speed(iter/s)": 0.201334 }, { "acc": 0.74542251, "epoch": 0.4155251141552511, "grad_norm": 2.171875, "learning_rate": 9.33483810911786e-06, "loss": 1.01217556, "memory(GiB)": 369.4, "step": 16380, "train_speed(iter/s)": 0.201343 }, { "acc": 0.71913481, "epoch": 0.4156519533231862, "grad_norm": 2.25, "learning_rate": 9.334315417007079e-06, "loss": 1.12010918, "memory(GiB)": 369.4, "step": 16385, "train_speed(iter/s)": 0.20135 }, { "acc": 0.73978615, "epoch": 0.41577879249112126, "grad_norm": 1.9453125, "learning_rate": 9.333792534253751e-06, "loss": 1.05163479, "memory(GiB)": 369.4, "step": 16390, "train_speed(iter/s)": 0.201362 }, { "acc": 0.74295602, "epoch": 0.4159056316590563, "grad_norm": 2.0625, "learning_rate": 9.333269460880879e-06, "loss": 1.03162575, "memory(GiB)": 369.4, "step": 16395, "train_speed(iter/s)": 0.201372 }, { "acc": 0.74903078, "epoch": 0.41603247082699135, "grad_norm": 2.421875, "learning_rate": 9.33274619691147e-06, "loss": 0.96998024, "memory(GiB)": 369.4, "step": 16400, "train_speed(iter/s)": 0.201383 }, { "acc": 0.74446115, "epoch": 0.41615930999492645, "grad_norm": 2.28125, "learning_rate": 9.332222742368537e-06, "loss": 1.00353575, "memory(GiB)": 369.4, "step": 16405, "train_speed(iter/s)": 0.201395 }, { "acc": 0.73947659, "epoch": 0.4162861491628615, "grad_norm": 1.9140625, "learning_rate": 9.331699097275108e-06, "loss": 1.02648125, "memory(GiB)": 369.4, "step": 16410, "train_speed(iter/s)": 0.201404 }, { "acc": 0.7472826, "epoch": 0.41641298833079654, "grad_norm": 3.125, "learning_rate": 9.331175261654213e-06, "loss": 1.04733782, "memory(GiB)": 369.4, "step": 16415, "train_speed(iter/s)": 0.201402 }, { "acc": 0.73973064, "epoch": 0.4165398274987316, "grad_norm": 2.171875, "learning_rate": 9.330651235528891e-06, "loss": 1.04494514, "memory(GiB)": 369.4, "step": 16420, "train_speed(iter/s)": 0.201411 }, { "acc": 0.73728132, "epoch": 0.4166666666666667, "grad_norm": 2.125, "learning_rate": 9.330127018922195e-06, "loss": 1.08577003, "memory(GiB)": 369.4, "step": 16425, "train_speed(iter/s)": 0.201421 }, { "acc": 0.75780048, "epoch": 0.41679350583460173, "grad_norm": 1.9765625, "learning_rate": 9.329602611857179e-06, "loss": 0.94894609, "memory(GiB)": 369.4, "step": 16430, "train_speed(iter/s)": 0.201429 }, { "acc": 0.73993864, "epoch": 0.4169203450025368, "grad_norm": 2.171875, "learning_rate": 9.329078014356909e-06, "loss": 0.99331799, "memory(GiB)": 369.4, "step": 16435, "train_speed(iter/s)": 0.201439 }, { "acc": 0.75795274, "epoch": 0.4170471841704718, "grad_norm": 2.1875, "learning_rate": 9.32855322644446e-06, "loss": 0.94207678, "memory(GiB)": 369.4, "step": 16440, "train_speed(iter/s)": 0.201444 }, { "acc": 0.74821715, "epoch": 0.4171740233384069, "grad_norm": 2.109375, "learning_rate": 9.328028248142916e-06, "loss": 0.97856941, "memory(GiB)": 369.4, "step": 16445, "train_speed(iter/s)": 0.201451 }, { "acc": 0.73508768, "epoch": 0.41730086250634196, "grad_norm": 1.9296875, "learning_rate": 9.327503079475365e-06, "loss": 1.05833406, "memory(GiB)": 369.4, "step": 16450, "train_speed(iter/s)": 0.20146 }, { "acc": 0.74723854, "epoch": 0.417427701674277, "grad_norm": 2.28125, "learning_rate": 9.326977720464908e-06, "loss": 0.99243698, "memory(GiB)": 369.4, "step": 16455, "train_speed(iter/s)": 0.201468 }, { "acc": 0.75028353, "epoch": 0.41755454084221205, "grad_norm": 2.65625, "learning_rate": 9.326452171134652e-06, "loss": 1.01441822, "memory(GiB)": 369.4, "step": 16460, "train_speed(iter/s)": 0.201476 }, { "acc": 0.74066067, "epoch": 0.41768138001014715, "grad_norm": 2.140625, "learning_rate": 9.325926431507714e-06, "loss": 1.01667328, "memory(GiB)": 369.4, "step": 16465, "train_speed(iter/s)": 0.201481 }, { "acc": 0.75166368, "epoch": 0.4178082191780822, "grad_norm": 2.171875, "learning_rate": 9.325400501607218e-06, "loss": 0.99153862, "memory(GiB)": 369.4, "step": 16470, "train_speed(iter/s)": 0.201491 }, { "acc": 0.73759303, "epoch": 0.41793505834601724, "grad_norm": 2.484375, "learning_rate": 9.324874381456295e-06, "loss": 1.06939678, "memory(GiB)": 369.4, "step": 16475, "train_speed(iter/s)": 0.201499 }, { "acc": 0.74674959, "epoch": 0.4180618975139523, "grad_norm": 2.40625, "learning_rate": 9.324348071078088e-06, "loss": 0.99686069, "memory(GiB)": 369.4, "step": 16480, "train_speed(iter/s)": 0.20151 }, { "acc": 0.72962084, "epoch": 0.4181887366818874, "grad_norm": 2.140625, "learning_rate": 9.323821570495748e-06, "loss": 1.05545149, "memory(GiB)": 369.4, "step": 16485, "train_speed(iter/s)": 0.201517 }, { "acc": 0.73761334, "epoch": 0.41831557584982243, "grad_norm": 2.234375, "learning_rate": 9.32329487973243e-06, "loss": 1.03663321, "memory(GiB)": 369.4, "step": 16490, "train_speed(iter/s)": 0.201522 }, { "acc": 0.74922361, "epoch": 0.4184424150177575, "grad_norm": 2.125, "learning_rate": 9.3227679988113e-06, "loss": 1.02111454, "memory(GiB)": 369.4, "step": 16495, "train_speed(iter/s)": 0.201524 }, { "acc": 0.74879775, "epoch": 0.4185692541856925, "grad_norm": 2.546875, "learning_rate": 9.322240927755534e-06, "loss": 1.00760441, "memory(GiB)": 369.4, "step": 16500, "train_speed(iter/s)": 0.201528 }, { "acc": 0.74587412, "epoch": 0.4186960933536276, "grad_norm": 2.53125, "learning_rate": 9.321713666588314e-06, "loss": 1.02396975, "memory(GiB)": 369.4, "step": 16505, "train_speed(iter/s)": 0.201538 }, { "acc": 0.74294405, "epoch": 0.41882293252156266, "grad_norm": 2.125, "learning_rate": 9.321186215332833e-06, "loss": 1.0553278, "memory(GiB)": 369.4, "step": 16510, "train_speed(iter/s)": 0.201547 }, { "acc": 0.74991894, "epoch": 0.4189497716894977, "grad_norm": 1.8203125, "learning_rate": 9.320658574012289e-06, "loss": 1.05651798, "memory(GiB)": 369.4, "step": 16515, "train_speed(iter/s)": 0.201558 }, { "acc": 0.74720592, "epoch": 0.41907661085743275, "grad_norm": 2.25, "learning_rate": 9.32013074264989e-06, "loss": 1.00344381, "memory(GiB)": 369.4, "step": 16520, "train_speed(iter/s)": 0.201565 }, { "acc": 0.7488369, "epoch": 0.41920345002536785, "grad_norm": 2.359375, "learning_rate": 9.319602721268853e-06, "loss": 1.00502586, "memory(GiB)": 369.4, "step": 16525, "train_speed(iter/s)": 0.201574 }, { "acc": 0.7605051, "epoch": 0.4193302891933029, "grad_norm": 1.9609375, "learning_rate": 9.319074509892403e-06, "loss": 0.96570787, "memory(GiB)": 369.4, "step": 16530, "train_speed(iter/s)": 0.201581 }, { "acc": 0.74153767, "epoch": 0.41945712836123794, "grad_norm": 1.953125, "learning_rate": 9.318546108543774e-06, "loss": 0.97862339, "memory(GiB)": 369.4, "step": 16535, "train_speed(iter/s)": 0.201592 }, { "acc": 0.72008953, "epoch": 0.419583967529173, "grad_norm": 2.21875, "learning_rate": 9.318017517246205e-06, "loss": 1.0985611, "memory(GiB)": 369.4, "step": 16540, "train_speed(iter/s)": 0.201601 }, { "acc": 0.72978506, "epoch": 0.4197108066971081, "grad_norm": 2.109375, "learning_rate": 9.317488736022948e-06, "loss": 1.0582283, "memory(GiB)": 369.4, "step": 16545, "train_speed(iter/s)": 0.201609 }, { "acc": 0.73697691, "epoch": 0.41983764586504313, "grad_norm": 2.40625, "learning_rate": 9.316959764897259e-06, "loss": 1.03445263, "memory(GiB)": 369.4, "step": 16550, "train_speed(iter/s)": 0.20162 }, { "acc": 0.73872943, "epoch": 0.4199644850329782, "grad_norm": 2.328125, "learning_rate": 9.316430603892406e-06, "loss": 0.98682365, "memory(GiB)": 369.4, "step": 16555, "train_speed(iter/s)": 0.201626 }, { "acc": 0.73484392, "epoch": 0.4200913242009132, "grad_norm": 1.9921875, "learning_rate": 9.315901253031663e-06, "loss": 1.02892637, "memory(GiB)": 369.4, "step": 16560, "train_speed(iter/s)": 0.201633 }, { "acc": 0.73723049, "epoch": 0.4202181633688483, "grad_norm": 2.265625, "learning_rate": 9.315371712338315e-06, "loss": 1.13422146, "memory(GiB)": 369.4, "step": 16565, "train_speed(iter/s)": 0.20164 }, { "acc": 0.74329405, "epoch": 0.42034500253678336, "grad_norm": 2.15625, "learning_rate": 9.314841981835652e-06, "loss": 1.01491623, "memory(GiB)": 369.4, "step": 16570, "train_speed(iter/s)": 0.201642 }, { "acc": 0.73131132, "epoch": 0.4204718417047184, "grad_norm": 2.234375, "learning_rate": 9.314312061546974e-06, "loss": 1.07000084, "memory(GiB)": 369.4, "step": 16575, "train_speed(iter/s)": 0.201652 }, { "acc": 0.7373908, "epoch": 0.42059868087265345, "grad_norm": 2.0, "learning_rate": 9.313781951495588e-06, "loss": 1.05369568, "memory(GiB)": 369.4, "step": 16580, "train_speed(iter/s)": 0.201662 }, { "acc": 0.73979745, "epoch": 0.42072552004058855, "grad_norm": 2.078125, "learning_rate": 9.313251651704816e-06, "loss": 1.1181776, "memory(GiB)": 369.4, "step": 16585, "train_speed(iter/s)": 0.201673 }, { "acc": 0.74952693, "epoch": 0.4208523592085236, "grad_norm": 2.015625, "learning_rate": 9.312721162197975e-06, "loss": 1.01939316, "memory(GiB)": 369.4, "step": 16590, "train_speed(iter/s)": 0.201677 }, { "acc": 0.76380877, "epoch": 0.42097919837645864, "grad_norm": 2.09375, "learning_rate": 9.312190482998405e-06, "loss": 0.98014402, "memory(GiB)": 369.4, "step": 16595, "train_speed(iter/s)": 0.20169 }, { "acc": 0.74945221, "epoch": 0.4211060375443937, "grad_norm": 2.109375, "learning_rate": 9.311659614129443e-06, "loss": 1.03792133, "memory(GiB)": 369.4, "step": 16600, "train_speed(iter/s)": 0.201698 }, { "acc": 0.74052086, "epoch": 0.4212328767123288, "grad_norm": 2.34375, "learning_rate": 9.311128555614443e-06, "loss": 1.06311626, "memory(GiB)": 369.4, "step": 16605, "train_speed(iter/s)": 0.201712 }, { "acc": 0.74743605, "epoch": 0.42135971588026383, "grad_norm": 1.828125, "learning_rate": 9.31059730747676e-06, "loss": 0.96844845, "memory(GiB)": 369.4, "step": 16610, "train_speed(iter/s)": 0.201721 }, { "acc": 0.75785375, "epoch": 0.4214865550481989, "grad_norm": 2.109375, "learning_rate": 9.310065869739763e-06, "loss": 1.00252619, "memory(GiB)": 369.4, "step": 16615, "train_speed(iter/s)": 0.201733 }, { "acc": 0.74205728, "epoch": 0.4216133942161339, "grad_norm": 2.015625, "learning_rate": 9.309534242426826e-06, "loss": 1.00792837, "memory(GiB)": 369.4, "step": 16620, "train_speed(iter/s)": 0.201743 }, { "acc": 0.75004706, "epoch": 0.421740233384069, "grad_norm": 1.8984375, "learning_rate": 9.30900242556133e-06, "loss": 0.98161068, "memory(GiB)": 369.4, "step": 16625, "train_speed(iter/s)": 0.201756 }, { "acc": 0.75188451, "epoch": 0.42186707255200406, "grad_norm": 2.28125, "learning_rate": 9.308470419166672e-06, "loss": 0.9416872, "memory(GiB)": 369.4, "step": 16630, "train_speed(iter/s)": 0.201762 }, { "acc": 0.73863101, "epoch": 0.4219939117199391, "grad_norm": 2.1875, "learning_rate": 9.307938223266247e-06, "loss": 0.99171238, "memory(GiB)": 369.4, "step": 16635, "train_speed(iter/s)": 0.201773 }, { "acc": 0.73441243, "epoch": 0.42212075088787415, "grad_norm": 1.8671875, "learning_rate": 9.307405837883467e-06, "loss": 1.06826344, "memory(GiB)": 369.4, "step": 16640, "train_speed(iter/s)": 0.201779 }, { "acc": 0.73810596, "epoch": 0.42224759005580925, "grad_norm": 1.5078125, "learning_rate": 9.306873263041745e-06, "loss": 1.01536427, "memory(GiB)": 369.4, "step": 16645, "train_speed(iter/s)": 0.20178 }, { "acc": 0.74365706, "epoch": 0.4223744292237443, "grad_norm": 2.46875, "learning_rate": 9.30634049876451e-06, "loss": 1.01900215, "memory(GiB)": 369.4, "step": 16650, "train_speed(iter/s)": 0.20179 }, { "acc": 0.74013925, "epoch": 0.42250126839167934, "grad_norm": 2.421875, "learning_rate": 9.305807545075194e-06, "loss": 1.01223173, "memory(GiB)": 369.4, "step": 16655, "train_speed(iter/s)": 0.2018 }, { "acc": 0.73919802, "epoch": 0.4226281075596144, "grad_norm": 1.8828125, "learning_rate": 9.305274401997237e-06, "loss": 1.03600273, "memory(GiB)": 369.4, "step": 16660, "train_speed(iter/s)": 0.20181 }, { "acc": 0.74529405, "epoch": 0.4227549467275495, "grad_norm": 2.140625, "learning_rate": 9.304741069554088e-06, "loss": 0.99203339, "memory(GiB)": 369.4, "step": 16665, "train_speed(iter/s)": 0.201813 }, { "acc": 0.75911236, "epoch": 0.42288178589548453, "grad_norm": 2.0, "learning_rate": 9.304207547769211e-06, "loss": 0.98116198, "memory(GiB)": 369.4, "step": 16670, "train_speed(iter/s)": 0.20182 }, { "acc": 0.75639553, "epoch": 0.4230086250634196, "grad_norm": 2.515625, "learning_rate": 9.303673836666067e-06, "loss": 0.99025383, "memory(GiB)": 369.4, "step": 16675, "train_speed(iter/s)": 0.20183 }, { "acc": 0.73926086, "epoch": 0.4231354642313546, "grad_norm": 1.953125, "learning_rate": 9.303139936268133e-06, "loss": 1.03222122, "memory(GiB)": 369.4, "step": 16680, "train_speed(iter/s)": 0.201844 }, { "acc": 0.73224716, "epoch": 0.4232623033992897, "grad_norm": 2.125, "learning_rate": 9.302605846598894e-06, "loss": 1.05833235, "memory(GiB)": 369.4, "step": 16685, "train_speed(iter/s)": 0.201854 }, { "acc": 0.74404812, "epoch": 0.42338914256722476, "grad_norm": 2.8125, "learning_rate": 9.30207156768184e-06, "loss": 1.03365269, "memory(GiB)": 369.4, "step": 16690, "train_speed(iter/s)": 0.201863 }, { "acc": 0.74851322, "epoch": 0.4235159817351598, "grad_norm": 2.296875, "learning_rate": 9.30153709954047e-06, "loss": 0.96835289, "memory(GiB)": 369.4, "step": 16695, "train_speed(iter/s)": 0.201868 }, { "acc": 0.75276628, "epoch": 0.42364282090309485, "grad_norm": 1.9765625, "learning_rate": 9.301002442198294e-06, "loss": 1.00009956, "memory(GiB)": 369.4, "step": 16700, "train_speed(iter/s)": 0.201873 }, { "acc": 0.74153037, "epoch": 0.42376966007102995, "grad_norm": 2.203125, "learning_rate": 9.300467595678829e-06, "loss": 1.02775221, "memory(GiB)": 369.4, "step": 16705, "train_speed(iter/s)": 0.20188 }, { "acc": 0.74663916, "epoch": 0.423896499238965, "grad_norm": 2.796875, "learning_rate": 9.299932560005596e-06, "loss": 1.02983818, "memory(GiB)": 369.4, "step": 16710, "train_speed(iter/s)": 0.201892 }, { "acc": 0.74689164, "epoch": 0.42402333840690004, "grad_norm": 2.203125, "learning_rate": 9.299397335202133e-06, "loss": 1.01739101, "memory(GiB)": 369.4, "step": 16715, "train_speed(iter/s)": 0.201899 }, { "acc": 0.75019979, "epoch": 0.4241501775748351, "grad_norm": 2.140625, "learning_rate": 9.29886192129198e-06, "loss": 1.00547104, "memory(GiB)": 369.4, "step": 16720, "train_speed(iter/s)": 0.2019 }, { "acc": 0.73923903, "epoch": 0.4242770167427702, "grad_norm": 2.203125, "learning_rate": 9.298326318298688e-06, "loss": 1.05492363, "memory(GiB)": 369.4, "step": 16725, "train_speed(iter/s)": 0.201903 }, { "acc": 0.74326754, "epoch": 0.42440385591070523, "grad_norm": 2.265625, "learning_rate": 9.29779052624581e-06, "loss": 1.00568047, "memory(GiB)": 369.4, "step": 16730, "train_speed(iter/s)": 0.201912 }, { "acc": 0.74914379, "epoch": 0.4245306950786403, "grad_norm": 2.28125, "learning_rate": 9.29725454515692e-06, "loss": 1.03223705, "memory(GiB)": 369.4, "step": 16735, "train_speed(iter/s)": 0.201923 }, { "acc": 0.74835844, "epoch": 0.4246575342465753, "grad_norm": 2.15625, "learning_rate": 9.296718375055587e-06, "loss": 0.97808914, "memory(GiB)": 369.4, "step": 16740, "train_speed(iter/s)": 0.201933 }, { "acc": 0.75203819, "epoch": 0.4247843734145104, "grad_norm": 2.125, "learning_rate": 9.296182015965399e-06, "loss": 1.02102318, "memory(GiB)": 369.4, "step": 16745, "train_speed(iter/s)": 0.201944 }, { "acc": 0.74790735, "epoch": 0.42491121258244546, "grad_norm": 2.96875, "learning_rate": 9.295645467909942e-06, "loss": 1.02933712, "memory(GiB)": 369.4, "step": 16750, "train_speed(iter/s)": 0.201951 }, { "acc": 0.74282007, "epoch": 0.4250380517503805, "grad_norm": 2.015625, "learning_rate": 9.29510873091282e-06, "loss": 1.00628595, "memory(GiB)": 369.4, "step": 16755, "train_speed(iter/s)": 0.201959 }, { "acc": 0.74401908, "epoch": 0.42516489091831555, "grad_norm": 2.515625, "learning_rate": 9.29457180499764e-06, "loss": 1.05102692, "memory(GiB)": 369.4, "step": 16760, "train_speed(iter/s)": 0.201969 }, { "acc": 0.7387063, "epoch": 0.42529173008625065, "grad_norm": 2.375, "learning_rate": 9.294034690188016e-06, "loss": 1.06343775, "memory(GiB)": 369.4, "step": 16765, "train_speed(iter/s)": 0.201975 }, { "acc": 0.73560314, "epoch": 0.4254185692541857, "grad_norm": 2.0, "learning_rate": 9.293497386507577e-06, "loss": 1.05301495, "memory(GiB)": 369.4, "step": 16770, "train_speed(iter/s)": 0.201979 }, { "acc": 0.75511785, "epoch": 0.42554540842212074, "grad_norm": 2.53125, "learning_rate": 9.292959893979953e-06, "loss": 0.99006739, "memory(GiB)": 369.4, "step": 16775, "train_speed(iter/s)": 0.201992 }, { "acc": 0.74327793, "epoch": 0.4256722475900558, "grad_norm": 2.078125, "learning_rate": 9.292422212628786e-06, "loss": 1.05610199, "memory(GiB)": 369.4, "step": 16780, "train_speed(iter/s)": 0.202001 }, { "acc": 0.74926748, "epoch": 0.4257990867579909, "grad_norm": 2.03125, "learning_rate": 9.291884342477728e-06, "loss": 1.00873814, "memory(GiB)": 369.4, "step": 16785, "train_speed(iter/s)": 0.202008 }, { "acc": 0.73361559, "epoch": 0.42592592592592593, "grad_norm": 2.109375, "learning_rate": 9.291346283550433e-06, "loss": 0.99383507, "memory(GiB)": 369.4, "step": 16790, "train_speed(iter/s)": 0.202019 }, { "acc": 0.74579802, "epoch": 0.426052765093861, "grad_norm": 2.3125, "learning_rate": 9.290808035870569e-06, "loss": 1.04583273, "memory(GiB)": 369.4, "step": 16795, "train_speed(iter/s)": 0.202026 }, { "acc": 0.74567003, "epoch": 0.426179604261796, "grad_norm": 2.234375, "learning_rate": 9.29026959946181e-06, "loss": 1.04648037, "memory(GiB)": 369.4, "step": 16800, "train_speed(iter/s)": 0.202032 }, { "acc": 0.7450098, "epoch": 0.4263064434297311, "grad_norm": 1.8828125, "learning_rate": 9.289730974347841e-06, "loss": 1.01023598, "memory(GiB)": 369.4, "step": 16805, "train_speed(iter/s)": 0.202042 }, { "acc": 0.76484137, "epoch": 0.42643328259766616, "grad_norm": 2.21875, "learning_rate": 9.28919216055235e-06, "loss": 1.01007175, "memory(GiB)": 369.4, "step": 16810, "train_speed(iter/s)": 0.202052 }, { "acc": 0.75827646, "epoch": 0.4265601217656012, "grad_norm": 2.515625, "learning_rate": 9.288653158099038e-06, "loss": 0.98521614, "memory(GiB)": 369.4, "step": 16815, "train_speed(iter/s)": 0.20206 }, { "acc": 0.75099535, "epoch": 0.42668696093353625, "grad_norm": 2.203125, "learning_rate": 9.288113967011612e-06, "loss": 1.02433167, "memory(GiB)": 369.4, "step": 16820, "train_speed(iter/s)": 0.202068 }, { "acc": 0.74895449, "epoch": 0.42681380010147135, "grad_norm": 2.21875, "learning_rate": 9.28757458731379e-06, "loss": 0.96833, "memory(GiB)": 369.4, "step": 16825, "train_speed(iter/s)": 0.202077 }, { "acc": 0.73958063, "epoch": 0.4269406392694064, "grad_norm": 2.203125, "learning_rate": 9.287035019029295e-06, "loss": 1.07043514, "memory(GiB)": 369.4, "step": 16830, "train_speed(iter/s)": 0.202085 }, { "acc": 0.74656997, "epoch": 0.42706747843734144, "grad_norm": 2.234375, "learning_rate": 9.286495262181859e-06, "loss": 0.96979752, "memory(GiB)": 369.4, "step": 16835, "train_speed(iter/s)": 0.202095 }, { "acc": 0.74174709, "epoch": 0.4271943176052765, "grad_norm": 3.203125, "learning_rate": 9.285955316795224e-06, "loss": 1.07030649, "memory(GiB)": 369.4, "step": 16840, "train_speed(iter/s)": 0.202104 }, { "acc": 0.74047294, "epoch": 0.4273211567732116, "grad_norm": 2.75, "learning_rate": 9.285415182893138e-06, "loss": 1.05215988, "memory(GiB)": 369.4, "step": 16845, "train_speed(iter/s)": 0.202112 }, { "acc": 0.73367019, "epoch": 0.42744799594114663, "grad_norm": 2.375, "learning_rate": 9.28487486049936e-06, "loss": 1.04555569, "memory(GiB)": 369.4, "step": 16850, "train_speed(iter/s)": 0.20212 }, { "acc": 0.74701266, "epoch": 0.4275748351090817, "grad_norm": 2.4375, "learning_rate": 9.284334349637655e-06, "loss": 1.06435385, "memory(GiB)": 369.4, "step": 16855, "train_speed(iter/s)": 0.20213 }, { "acc": 0.75827069, "epoch": 0.4277016742770167, "grad_norm": 2.140625, "learning_rate": 9.283793650331798e-06, "loss": 0.96698837, "memory(GiB)": 369.4, "step": 16860, "train_speed(iter/s)": 0.20214 }, { "acc": 0.75214829, "epoch": 0.4278285134449518, "grad_norm": 2.203125, "learning_rate": 9.283252762605568e-06, "loss": 1.00152454, "memory(GiB)": 369.4, "step": 16865, "train_speed(iter/s)": 0.20215 }, { "acc": 0.73948979, "epoch": 0.42795535261288686, "grad_norm": 2.15625, "learning_rate": 9.28271168648276e-06, "loss": 1.06528759, "memory(GiB)": 369.4, "step": 16870, "train_speed(iter/s)": 0.202154 }, { "acc": 0.74484539, "epoch": 0.4280821917808219, "grad_norm": 2.421875, "learning_rate": 9.282170421987171e-06, "loss": 1.0196907, "memory(GiB)": 369.4, "step": 16875, "train_speed(iter/s)": 0.202161 }, { "acc": 0.72984638, "epoch": 0.42820903094875695, "grad_norm": 2.671875, "learning_rate": 9.281628969142609e-06, "loss": 1.13062716, "memory(GiB)": 369.4, "step": 16880, "train_speed(iter/s)": 0.202173 }, { "acc": 0.73002667, "epoch": 0.42833587011669205, "grad_norm": 2.28125, "learning_rate": 9.281087327972886e-06, "loss": 1.09646568, "memory(GiB)": 369.4, "step": 16885, "train_speed(iter/s)": 0.202182 }, { "acc": 0.74586473, "epoch": 0.4284627092846271, "grad_norm": 2.203125, "learning_rate": 9.280545498501832e-06, "loss": 0.9678442, "memory(GiB)": 369.4, "step": 16890, "train_speed(iter/s)": 0.202192 }, { "acc": 0.74182658, "epoch": 0.42858954845256214, "grad_norm": 2.109375, "learning_rate": 9.280003480753274e-06, "loss": 1.07217112, "memory(GiB)": 369.4, "step": 16895, "train_speed(iter/s)": 0.202202 }, { "acc": 0.72634397, "epoch": 0.4287163876204972, "grad_norm": 2.171875, "learning_rate": 9.279461274751054e-06, "loss": 1.07862644, "memory(GiB)": 369.4, "step": 16900, "train_speed(iter/s)": 0.202213 }, { "acc": 0.73984098, "epoch": 0.4288432267884323, "grad_norm": 2.125, "learning_rate": 9.27891888051902e-06, "loss": 1.00906887, "memory(GiB)": 369.4, "step": 16905, "train_speed(iter/s)": 0.202217 }, { "acc": 0.74853983, "epoch": 0.42897006595636733, "grad_norm": 2.328125, "learning_rate": 9.278376298081032e-06, "loss": 1.05889473, "memory(GiB)": 369.4, "step": 16910, "train_speed(iter/s)": 0.20223 }, { "acc": 0.74524899, "epoch": 0.4290969051243024, "grad_norm": 2.265625, "learning_rate": 9.277833527460952e-06, "loss": 1.03242378, "memory(GiB)": 369.4, "step": 16915, "train_speed(iter/s)": 0.202245 }, { "acc": 0.75132771, "epoch": 0.4292237442922374, "grad_norm": 2.21875, "learning_rate": 9.277290568682653e-06, "loss": 1.04625168, "memory(GiB)": 369.4, "step": 16920, "train_speed(iter/s)": 0.202257 }, { "acc": 0.72362165, "epoch": 0.4293505834601725, "grad_norm": 2.21875, "learning_rate": 9.27674742177002e-06, "loss": 1.11019697, "memory(GiB)": 369.4, "step": 16925, "train_speed(iter/s)": 0.202267 }, { "acc": 0.73665934, "epoch": 0.42947742262810756, "grad_norm": 2.09375, "learning_rate": 9.27620408674694e-06, "loss": 1.00239353, "memory(GiB)": 369.4, "step": 16930, "train_speed(iter/s)": 0.202271 }, { "acc": 0.73810406, "epoch": 0.4296042617960426, "grad_norm": 1.8984375, "learning_rate": 9.275660563637313e-06, "loss": 1.03193188, "memory(GiB)": 369.4, "step": 16935, "train_speed(iter/s)": 0.202277 }, { "acc": 0.7484849, "epoch": 0.42973110096397765, "grad_norm": 2.3125, "learning_rate": 9.275116852465043e-06, "loss": 1.02340975, "memory(GiB)": 369.4, "step": 16940, "train_speed(iter/s)": 0.202288 }, { "acc": 0.74286261, "epoch": 0.42985794013191275, "grad_norm": 2.484375, "learning_rate": 9.274572953254048e-06, "loss": 1.00931807, "memory(GiB)": 369.4, "step": 16945, "train_speed(iter/s)": 0.202295 }, { "acc": 0.7530652, "epoch": 0.4299847792998478, "grad_norm": 2.625, "learning_rate": 9.27402886602825e-06, "loss": 0.95272522, "memory(GiB)": 369.4, "step": 16950, "train_speed(iter/s)": 0.202301 }, { "acc": 0.73072395, "epoch": 0.43011161846778284, "grad_norm": 2.328125, "learning_rate": 9.27348459081158e-06, "loss": 1.04488592, "memory(GiB)": 369.4, "step": 16955, "train_speed(iter/s)": 0.202312 }, { "acc": 0.74527454, "epoch": 0.4302384576357179, "grad_norm": 2.0625, "learning_rate": 9.272940127627979e-06, "loss": 0.99428616, "memory(GiB)": 369.4, "step": 16960, "train_speed(iter/s)": 0.202322 }, { "acc": 0.74989738, "epoch": 0.430365296803653, "grad_norm": 2.8125, "learning_rate": 9.272395476501392e-06, "loss": 0.99205627, "memory(GiB)": 369.4, "step": 16965, "train_speed(iter/s)": 0.202335 }, { "acc": 0.72905273, "epoch": 0.43049213597158803, "grad_norm": 1.859375, "learning_rate": 9.27185063745578e-06, "loss": 1.08462105, "memory(GiB)": 369.4, "step": 16970, "train_speed(iter/s)": 0.20234 }, { "acc": 0.75458627, "epoch": 0.4306189751395231, "grad_norm": 2.203125, "learning_rate": 9.271305610515103e-06, "loss": 0.98666534, "memory(GiB)": 369.4, "step": 16975, "train_speed(iter/s)": 0.202344 }, { "acc": 0.73678474, "epoch": 0.4307458143074581, "grad_norm": 1.8984375, "learning_rate": 9.270760395703334e-06, "loss": 1.03960028, "memory(GiB)": 369.4, "step": 16980, "train_speed(iter/s)": 0.202347 }, { "acc": 0.74394264, "epoch": 0.4308726534753932, "grad_norm": 2.015625, "learning_rate": 9.270214993044456e-06, "loss": 1.01649818, "memory(GiB)": 369.4, "step": 16985, "train_speed(iter/s)": 0.202352 }, { "acc": 0.73953118, "epoch": 0.43099949264332826, "grad_norm": 2.15625, "learning_rate": 9.269669402562458e-06, "loss": 1.02825966, "memory(GiB)": 369.4, "step": 16990, "train_speed(iter/s)": 0.202358 }, { "acc": 0.74703779, "epoch": 0.4311263318112633, "grad_norm": 2.171875, "learning_rate": 9.269123624281336e-06, "loss": 1.01023064, "memory(GiB)": 369.4, "step": 16995, "train_speed(iter/s)": 0.202364 }, { "acc": 0.75997362, "epoch": 0.43125317097919835, "grad_norm": 1.8671875, "learning_rate": 9.268577658225097e-06, "loss": 0.91075668, "memory(GiB)": 369.4, "step": 17000, "train_speed(iter/s)": 0.202372 }, { "epoch": 0.43125317097919835, "eval_acc": 0.7337105215114654, "eval_loss": 0.9914264678955078, "eval_runtime": 384.3903, "eval_samples_per_second": 16.572, "eval_steps_per_second": 8.286, "step": 17000 }, { "acc": 0.74407845, "epoch": 0.43138001014713345, "grad_norm": 2.375, "learning_rate": 9.268031504417756e-06, "loss": 1.00239906, "memory(GiB)": 369.4, "step": 17005, "train_speed(iter/s)": 0.200682 }, { "acc": 0.73763914, "epoch": 0.4315068493150685, "grad_norm": 2.3125, "learning_rate": 9.267485162883334e-06, "loss": 1.04625664, "memory(GiB)": 369.4, "step": 17010, "train_speed(iter/s)": 0.200687 }, { "acc": 0.72820587, "epoch": 0.43163368848300354, "grad_norm": 2.453125, "learning_rate": 9.266938633645861e-06, "loss": 1.09326553, "memory(GiB)": 369.4, "step": 17015, "train_speed(iter/s)": 0.2007 }, { "acc": 0.76273584, "epoch": 0.4317605276509386, "grad_norm": 2.5625, "learning_rate": 9.266391916729376e-06, "loss": 0.98227873, "memory(GiB)": 369.4, "step": 17020, "train_speed(iter/s)": 0.200711 }, { "acc": 0.73769217, "epoch": 0.4318873668188737, "grad_norm": 2.125, "learning_rate": 9.265845012157926e-06, "loss": 1.04354353, "memory(GiB)": 369.4, "step": 17025, "train_speed(iter/s)": 0.200722 }, { "acc": 0.73778076, "epoch": 0.43201420598680873, "grad_norm": 2.609375, "learning_rate": 9.265297919955566e-06, "loss": 1.02220058, "memory(GiB)": 369.4, "step": 17030, "train_speed(iter/s)": 0.200733 }, { "acc": 0.74410486, "epoch": 0.4321410451547438, "grad_norm": 2.28125, "learning_rate": 9.264750640146363e-06, "loss": 1.04985733, "memory(GiB)": 369.4, "step": 17035, "train_speed(iter/s)": 0.200744 }, { "acc": 0.74390793, "epoch": 0.4322678843226788, "grad_norm": 1.71875, "learning_rate": 9.264203172754384e-06, "loss": 1.01798639, "memory(GiB)": 369.4, "step": 17040, "train_speed(iter/s)": 0.200751 }, { "acc": 0.74560585, "epoch": 0.4323947234906139, "grad_norm": 2.1875, "learning_rate": 9.263655517803713e-06, "loss": 1.04632044, "memory(GiB)": 369.4, "step": 17045, "train_speed(iter/s)": 0.20076 }, { "acc": 0.7380641, "epoch": 0.43252156265854896, "grad_norm": 1.9609375, "learning_rate": 9.263107675318434e-06, "loss": 1.08395081, "memory(GiB)": 369.4, "step": 17050, "train_speed(iter/s)": 0.200761 }, { "acc": 0.75457191, "epoch": 0.432648401826484, "grad_norm": 2.03125, "learning_rate": 9.262559645322648e-06, "loss": 1.00672016, "memory(GiB)": 369.4, "step": 17055, "train_speed(iter/s)": 0.200771 }, { "acc": 0.74982986, "epoch": 0.43277524099441905, "grad_norm": 2.359375, "learning_rate": 9.262011427840459e-06, "loss": 1.01940784, "memory(GiB)": 369.4, "step": 17060, "train_speed(iter/s)": 0.200774 }, { "acc": 0.7520752, "epoch": 0.43290208016235415, "grad_norm": 2.109375, "learning_rate": 9.261463022895976e-06, "loss": 0.92931595, "memory(GiB)": 369.4, "step": 17065, "train_speed(iter/s)": 0.200786 }, { "acc": 0.74665928, "epoch": 0.4330289193302892, "grad_norm": 2.375, "learning_rate": 9.260914430513325e-06, "loss": 1.00900898, "memory(GiB)": 369.4, "step": 17070, "train_speed(iter/s)": 0.200791 }, { "acc": 0.73172998, "epoch": 0.43315575849822424, "grad_norm": 2.09375, "learning_rate": 9.260365650716632e-06, "loss": 1.08943462, "memory(GiB)": 369.4, "step": 17075, "train_speed(iter/s)": 0.200801 }, { "acc": 0.73983974, "epoch": 0.4332825976661593, "grad_norm": 2.515625, "learning_rate": 9.259816683530038e-06, "loss": 1.03836498, "memory(GiB)": 369.4, "step": 17080, "train_speed(iter/s)": 0.20081 }, { "acc": 0.75760341, "epoch": 0.4334094368340944, "grad_norm": 2.25, "learning_rate": 9.259267528977687e-06, "loss": 0.94070225, "memory(GiB)": 369.4, "step": 17085, "train_speed(iter/s)": 0.200818 }, { "acc": 0.74816313, "epoch": 0.43353627600202943, "grad_norm": 2.109375, "learning_rate": 9.258718187083734e-06, "loss": 1.0124918, "memory(GiB)": 369.4, "step": 17090, "train_speed(iter/s)": 0.200824 }, { "acc": 0.74546623, "epoch": 0.4336631151699645, "grad_norm": 2.421875, "learning_rate": 9.258168657872341e-06, "loss": 1.02792377, "memory(GiB)": 369.4, "step": 17095, "train_speed(iter/s)": 0.200833 }, { "acc": 0.747646, "epoch": 0.4337899543378995, "grad_norm": 2.140625, "learning_rate": 9.25761894136768e-06, "loss": 1.04447308, "memory(GiB)": 369.4, "step": 17100, "train_speed(iter/s)": 0.200845 }, { "acc": 0.73759222, "epoch": 0.4339167935058346, "grad_norm": 1.890625, "learning_rate": 9.25706903759393e-06, "loss": 1.04166327, "memory(GiB)": 369.4, "step": 17105, "train_speed(iter/s)": 0.200856 }, { "acc": 0.74794469, "epoch": 0.43404363267376966, "grad_norm": 2.265625, "learning_rate": 9.256518946575274e-06, "loss": 0.98306904, "memory(GiB)": 369.4, "step": 17110, "train_speed(iter/s)": 0.200861 }, { "acc": 0.74982014, "epoch": 0.4341704718417047, "grad_norm": 2.828125, "learning_rate": 9.255968668335912e-06, "loss": 1.0258606, "memory(GiB)": 369.4, "step": 17115, "train_speed(iter/s)": 0.200872 }, { "acc": 0.75065136, "epoch": 0.43429731100963975, "grad_norm": 2.421875, "learning_rate": 9.255418202900048e-06, "loss": 0.99412689, "memory(GiB)": 369.4, "step": 17120, "train_speed(iter/s)": 0.200877 }, { "acc": 0.7414855, "epoch": 0.43442415017757485, "grad_norm": 2.015625, "learning_rate": 9.254867550291891e-06, "loss": 1.02271061, "memory(GiB)": 369.4, "step": 17125, "train_speed(iter/s)": 0.200871 }, { "acc": 0.74793129, "epoch": 0.4345509893455099, "grad_norm": 2.34375, "learning_rate": 9.254316710535662e-06, "loss": 0.96994038, "memory(GiB)": 369.4, "step": 17130, "train_speed(iter/s)": 0.200882 }, { "acc": 0.76074762, "epoch": 0.43467782851344494, "grad_norm": 2.359375, "learning_rate": 9.253765683655591e-06, "loss": 0.98969736, "memory(GiB)": 369.4, "step": 17135, "train_speed(iter/s)": 0.200892 }, { "acc": 0.76301622, "epoch": 0.43480466768138, "grad_norm": 2.4375, "learning_rate": 9.253214469675913e-06, "loss": 1.00565481, "memory(GiB)": 369.4, "step": 17140, "train_speed(iter/s)": 0.200902 }, { "acc": 0.75213585, "epoch": 0.4349315068493151, "grad_norm": 2.234375, "learning_rate": 9.252663068620874e-06, "loss": 1.0171875, "memory(GiB)": 369.4, "step": 17145, "train_speed(iter/s)": 0.200909 }, { "acc": 0.73343616, "epoch": 0.43505834601725013, "grad_norm": 1.8515625, "learning_rate": 9.252111480514726e-06, "loss": 1.0750288, "memory(GiB)": 369.4, "step": 17150, "train_speed(iter/s)": 0.200903 }, { "acc": 0.74752417, "epoch": 0.4351851851851852, "grad_norm": 2.34375, "learning_rate": 9.251559705381731e-06, "loss": 1.03953772, "memory(GiB)": 369.4, "step": 17155, "train_speed(iter/s)": 0.200911 }, { "acc": 0.73044977, "epoch": 0.4353120243531202, "grad_norm": 2.609375, "learning_rate": 9.251007743246159e-06, "loss": 1.07126694, "memory(GiB)": 369.4, "step": 17160, "train_speed(iter/s)": 0.200919 }, { "acc": 0.75086799, "epoch": 0.4354388635210553, "grad_norm": 2.625, "learning_rate": 9.250455594132286e-06, "loss": 1.0080204, "memory(GiB)": 369.4, "step": 17165, "train_speed(iter/s)": 0.200932 }, { "acc": 0.74889221, "epoch": 0.43556570268899036, "grad_norm": 2.140625, "learning_rate": 9.249903258064399e-06, "loss": 1.06341972, "memory(GiB)": 369.4, "step": 17170, "train_speed(iter/s)": 0.200943 }, { "acc": 0.75851507, "epoch": 0.4356925418569254, "grad_norm": 2.3125, "learning_rate": 9.249350735066792e-06, "loss": 0.95759182, "memory(GiB)": 369.4, "step": 17175, "train_speed(iter/s)": 0.200953 }, { "acc": 0.73523426, "epoch": 0.43581938102486045, "grad_norm": 2.15625, "learning_rate": 9.248798025163768e-06, "loss": 1.00379124, "memory(GiB)": 369.4, "step": 17180, "train_speed(iter/s)": 0.200964 }, { "acc": 0.75078387, "epoch": 0.43594622019279555, "grad_norm": 2.53125, "learning_rate": 9.248245128379638e-06, "loss": 0.97327299, "memory(GiB)": 369.4, "step": 17185, "train_speed(iter/s)": 0.200972 }, { "acc": 0.7435822, "epoch": 0.4360730593607306, "grad_norm": 2.078125, "learning_rate": 9.24769204473872e-06, "loss": 1.01263371, "memory(GiB)": 369.4, "step": 17190, "train_speed(iter/s)": 0.20098 }, { "acc": 0.73975511, "epoch": 0.43619989852866564, "grad_norm": 2.078125, "learning_rate": 9.24713877426534e-06, "loss": 0.99163609, "memory(GiB)": 369.4, "step": 17195, "train_speed(iter/s)": 0.200987 }, { "acc": 0.75189753, "epoch": 0.4363267376966007, "grad_norm": 2.109375, "learning_rate": 9.246585316983837e-06, "loss": 1.0220108, "memory(GiB)": 369.4, "step": 17200, "train_speed(iter/s)": 0.200996 }, { "acc": 0.73060675, "epoch": 0.4364535768645358, "grad_norm": 2.03125, "learning_rate": 9.24603167291855e-06, "loss": 1.08551073, "memory(GiB)": 369.4, "step": 17205, "train_speed(iter/s)": 0.201003 }, { "acc": 0.74387946, "epoch": 0.43658041603247083, "grad_norm": 2.34375, "learning_rate": 9.245477842093832e-06, "loss": 1.00805302, "memory(GiB)": 369.4, "step": 17210, "train_speed(iter/s)": 0.20101 }, { "acc": 0.74943624, "epoch": 0.4367072552004059, "grad_norm": 2.265625, "learning_rate": 9.244923824534046e-06, "loss": 1.02259884, "memory(GiB)": 369.4, "step": 17215, "train_speed(iter/s)": 0.201022 }, { "acc": 0.73664222, "epoch": 0.4368340943683409, "grad_norm": 2.1875, "learning_rate": 9.244369620263558e-06, "loss": 1.03386726, "memory(GiB)": 369.4, "step": 17220, "train_speed(iter/s)": 0.20103 }, { "acc": 0.74556026, "epoch": 0.436960933536276, "grad_norm": 2.28125, "learning_rate": 9.243815229306746e-06, "loss": 0.98703651, "memory(GiB)": 369.4, "step": 17225, "train_speed(iter/s)": 0.201036 }, { "acc": 0.74466028, "epoch": 0.43708777270421106, "grad_norm": 2.46875, "learning_rate": 9.243260651687989e-06, "loss": 1.05282402, "memory(GiB)": 369.4, "step": 17230, "train_speed(iter/s)": 0.201046 }, { "acc": 0.73425684, "epoch": 0.4372146118721461, "grad_norm": 2.15625, "learning_rate": 9.242705887431685e-06, "loss": 1.03556347, "memory(GiB)": 369.4, "step": 17235, "train_speed(iter/s)": 0.201058 }, { "acc": 0.73839149, "epoch": 0.43734145104008115, "grad_norm": 2.5625, "learning_rate": 9.242150936562235e-06, "loss": 1.08887844, "memory(GiB)": 369.4, "step": 17240, "train_speed(iter/s)": 0.201066 }, { "acc": 0.74798536, "epoch": 0.43746829020801625, "grad_norm": 2.046875, "learning_rate": 9.241595799104046e-06, "loss": 0.96236687, "memory(GiB)": 369.4, "step": 17245, "train_speed(iter/s)": 0.201075 }, { "acc": 0.74590549, "epoch": 0.4375951293759513, "grad_norm": 2.046875, "learning_rate": 9.241040475081537e-06, "loss": 1.04670105, "memory(GiB)": 369.4, "step": 17250, "train_speed(iter/s)": 0.201083 }, { "acc": 0.73808503, "epoch": 0.43772196854388634, "grad_norm": 2.09375, "learning_rate": 9.240484964519131e-06, "loss": 1.04476624, "memory(GiB)": 369.4, "step": 17255, "train_speed(iter/s)": 0.20109 }, { "acc": 0.74846897, "epoch": 0.4378488077118214, "grad_norm": 2.59375, "learning_rate": 9.239929267441267e-06, "loss": 1.01479416, "memory(GiB)": 369.4, "step": 17260, "train_speed(iter/s)": 0.201098 }, { "acc": 0.75670567, "epoch": 0.4379756468797565, "grad_norm": 2.4375, "learning_rate": 9.239373383872382e-06, "loss": 1.06835918, "memory(GiB)": 369.4, "step": 17265, "train_speed(iter/s)": 0.201109 }, { "acc": 0.73590832, "epoch": 0.43810248604769153, "grad_norm": 2.046875, "learning_rate": 9.238817313836927e-06, "loss": 1.00608625, "memory(GiB)": 369.4, "step": 17270, "train_speed(iter/s)": 0.201117 }, { "acc": 0.74945278, "epoch": 0.4382293252156266, "grad_norm": 1.9609375, "learning_rate": 9.238261057359365e-06, "loss": 1.00640602, "memory(GiB)": 369.4, "step": 17275, "train_speed(iter/s)": 0.201123 }, { "acc": 0.74495554, "epoch": 0.4383561643835616, "grad_norm": 2.171875, "learning_rate": 9.237704614464157e-06, "loss": 1.03044004, "memory(GiB)": 369.4, "step": 17280, "train_speed(iter/s)": 0.201135 }, { "acc": 0.74033637, "epoch": 0.4384830035514967, "grad_norm": 1.890625, "learning_rate": 9.237147985175781e-06, "loss": 1.04095144, "memory(GiB)": 369.4, "step": 17285, "train_speed(iter/s)": 0.20114 }, { "acc": 0.74577913, "epoch": 0.43860984271943176, "grad_norm": 2.015625, "learning_rate": 9.236591169518717e-06, "loss": 1.02657557, "memory(GiB)": 369.4, "step": 17290, "train_speed(iter/s)": 0.201146 }, { "acc": 0.74400244, "epoch": 0.4387366818873668, "grad_norm": 2.390625, "learning_rate": 9.236034167517461e-06, "loss": 1.07089491, "memory(GiB)": 369.4, "step": 17295, "train_speed(iter/s)": 0.201155 }, { "acc": 0.75072184, "epoch": 0.43886352105530185, "grad_norm": 2.671875, "learning_rate": 9.235476979196507e-06, "loss": 0.99948587, "memory(GiB)": 369.4, "step": 17300, "train_speed(iter/s)": 0.201166 }, { "acc": 0.74643297, "epoch": 0.43899036022323695, "grad_norm": 2.84375, "learning_rate": 9.234919604580368e-06, "loss": 1.04251881, "memory(GiB)": 369.4, "step": 17305, "train_speed(iter/s)": 0.201176 }, { "acc": 0.75686736, "epoch": 0.439117199391172, "grad_norm": 2.265625, "learning_rate": 9.234362043693556e-06, "loss": 0.97385979, "memory(GiB)": 369.4, "step": 17310, "train_speed(iter/s)": 0.201184 }, { "acc": 0.72808433, "epoch": 0.43924403855910704, "grad_norm": 2.15625, "learning_rate": 9.233804296560596e-06, "loss": 1.0298563, "memory(GiB)": 369.4, "step": 17315, "train_speed(iter/s)": 0.201193 }, { "acc": 0.73853102, "epoch": 0.4393708777270421, "grad_norm": 2.125, "learning_rate": 9.233246363206021e-06, "loss": 1.06602135, "memory(GiB)": 369.4, "step": 17320, "train_speed(iter/s)": 0.201203 }, { "acc": 0.74446135, "epoch": 0.4394977168949772, "grad_norm": 2.1875, "learning_rate": 9.232688243654371e-06, "loss": 1.06449995, "memory(GiB)": 369.4, "step": 17325, "train_speed(iter/s)": 0.201211 }, { "acc": 0.73018003, "epoch": 0.43962455606291223, "grad_norm": 2.21875, "learning_rate": 9.232129937930194e-06, "loss": 1.07857666, "memory(GiB)": 369.4, "step": 17330, "train_speed(iter/s)": 0.201219 }, { "acc": 0.74930744, "epoch": 0.4397513952308473, "grad_norm": 1.8359375, "learning_rate": 9.231571446058047e-06, "loss": 1.05865164, "memory(GiB)": 369.4, "step": 17335, "train_speed(iter/s)": 0.201226 }, { "acc": 0.73969541, "epoch": 0.4398782343987823, "grad_norm": 1.9375, "learning_rate": 9.231012768062497e-06, "loss": 1.03974686, "memory(GiB)": 369.4, "step": 17340, "train_speed(iter/s)": 0.201234 }, { "acc": 0.73468113, "epoch": 0.4400050735667174, "grad_norm": 2.015625, "learning_rate": 9.230453903968112e-06, "loss": 1.05348234, "memory(GiB)": 369.4, "step": 17345, "train_speed(iter/s)": 0.20124 }, { "acc": 0.75256329, "epoch": 0.44013191273465246, "grad_norm": 2.296875, "learning_rate": 9.22989485379948e-06, "loss": 0.95489702, "memory(GiB)": 369.4, "step": 17350, "train_speed(iter/s)": 0.201251 }, { "acc": 0.74497252, "epoch": 0.4402587519025875, "grad_norm": 1.9296875, "learning_rate": 9.229335617581187e-06, "loss": 0.96565895, "memory(GiB)": 369.4, "step": 17355, "train_speed(iter/s)": 0.201259 }, { "acc": 0.75990353, "epoch": 0.44038559107052255, "grad_norm": 1.921875, "learning_rate": 9.22877619533783e-06, "loss": 0.94862099, "memory(GiB)": 369.4, "step": 17360, "train_speed(iter/s)": 0.201268 }, { "acc": 0.74972258, "epoch": 0.44051243023845765, "grad_norm": 1.890625, "learning_rate": 9.228216587094014e-06, "loss": 1.00141973, "memory(GiB)": 369.4, "step": 17365, "train_speed(iter/s)": 0.201277 }, { "acc": 0.76207628, "epoch": 0.4406392694063927, "grad_norm": 2.5, "learning_rate": 9.227656792874358e-06, "loss": 0.93520126, "memory(GiB)": 369.4, "step": 17370, "train_speed(iter/s)": 0.201284 }, { "acc": 0.74408054, "epoch": 0.44076610857432774, "grad_norm": 2.4375, "learning_rate": 9.227096812703479e-06, "loss": 1.06480732, "memory(GiB)": 369.4, "step": 17375, "train_speed(iter/s)": 0.20129 }, { "acc": 0.74842815, "epoch": 0.4408929477422628, "grad_norm": 2.390625, "learning_rate": 9.22653664660601e-06, "loss": 1.01262035, "memory(GiB)": 369.4, "step": 17380, "train_speed(iter/s)": 0.201302 }, { "acc": 0.74871693, "epoch": 0.4410197869101979, "grad_norm": 1.8515625, "learning_rate": 9.225976294606589e-06, "loss": 1.04699087, "memory(GiB)": 369.4, "step": 17385, "train_speed(iter/s)": 0.201311 }, { "acc": 0.7421402, "epoch": 0.44114662607813293, "grad_norm": 2.046875, "learning_rate": 9.225415756729863e-06, "loss": 0.99829206, "memory(GiB)": 369.4, "step": 17390, "train_speed(iter/s)": 0.20132 }, { "acc": 0.73628402, "epoch": 0.441273465246068, "grad_norm": 2.65625, "learning_rate": 9.224855033000489e-06, "loss": 1.04391146, "memory(GiB)": 369.4, "step": 17395, "train_speed(iter/s)": 0.201329 }, { "acc": 0.75138493, "epoch": 0.441400304414003, "grad_norm": 2.03125, "learning_rate": 9.224294123443125e-06, "loss": 1.01333847, "memory(GiB)": 369.4, "step": 17400, "train_speed(iter/s)": 0.201325 }, { "acc": 0.75512419, "epoch": 0.4415271435819381, "grad_norm": 1.953125, "learning_rate": 9.223733028082447e-06, "loss": 0.98365192, "memory(GiB)": 369.4, "step": 17405, "train_speed(iter/s)": 0.201333 }, { "acc": 0.74607964, "epoch": 0.44165398274987316, "grad_norm": 2.40625, "learning_rate": 9.223171746943132e-06, "loss": 1.00665846, "memory(GiB)": 369.4, "step": 17410, "train_speed(iter/s)": 0.201345 }, { "acc": 0.74941282, "epoch": 0.4417808219178082, "grad_norm": 2.0625, "learning_rate": 9.222610280049868e-06, "loss": 1.05214424, "memory(GiB)": 369.4, "step": 17415, "train_speed(iter/s)": 0.201352 }, { "acc": 0.73495712, "epoch": 0.44190766108574325, "grad_norm": 2.28125, "learning_rate": 9.222048627427352e-06, "loss": 1.05749741, "memory(GiB)": 369.4, "step": 17420, "train_speed(iter/s)": 0.201363 }, { "acc": 0.74489532, "epoch": 0.44203450025367835, "grad_norm": 2.5, "learning_rate": 9.221486789100288e-06, "loss": 1.04973106, "memory(GiB)": 369.4, "step": 17425, "train_speed(iter/s)": 0.201371 }, { "acc": 0.74373188, "epoch": 0.4421613394216134, "grad_norm": 2.640625, "learning_rate": 9.220924765093386e-06, "loss": 1.01561966, "memory(GiB)": 369.4, "step": 17430, "train_speed(iter/s)": 0.201376 }, { "acc": 0.73928509, "epoch": 0.44228817858954844, "grad_norm": 1.9609375, "learning_rate": 9.220362555431369e-06, "loss": 1.02770519, "memory(GiB)": 369.4, "step": 17435, "train_speed(iter/s)": 0.201387 }, { "acc": 0.73613758, "epoch": 0.4424150177574835, "grad_norm": 1.9609375, "learning_rate": 9.219800160138964e-06, "loss": 0.96730881, "memory(GiB)": 369.4, "step": 17440, "train_speed(iter/s)": 0.201395 }, { "acc": 0.73804078, "epoch": 0.4425418569254186, "grad_norm": 2.359375, "learning_rate": 9.219237579240907e-06, "loss": 1.04557438, "memory(GiB)": 369.4, "step": 17445, "train_speed(iter/s)": 0.2014 }, { "acc": 0.7547967, "epoch": 0.44266869609335363, "grad_norm": 2.15625, "learning_rate": 9.218674812761946e-06, "loss": 1.00423651, "memory(GiB)": 369.4, "step": 17450, "train_speed(iter/s)": 0.201406 }, { "acc": 0.7518693, "epoch": 0.4427955352612887, "grad_norm": 1.8203125, "learning_rate": 9.21811186072683e-06, "loss": 0.98945627, "memory(GiB)": 369.4, "step": 17455, "train_speed(iter/s)": 0.201411 }, { "acc": 0.75180635, "epoch": 0.4429223744292237, "grad_norm": 2.4375, "learning_rate": 9.21754872316032e-06, "loss": 0.9962862, "memory(GiB)": 369.4, "step": 17460, "train_speed(iter/s)": 0.201418 }, { "acc": 0.75894451, "epoch": 0.4430492135971588, "grad_norm": 2.625, "learning_rate": 9.21698540008719e-06, "loss": 0.98012629, "memory(GiB)": 369.4, "step": 17465, "train_speed(iter/s)": 0.201426 }, { "acc": 0.74002752, "epoch": 0.44317605276509386, "grad_norm": 2.21875, "learning_rate": 9.216421891532214e-06, "loss": 0.99437733, "memory(GiB)": 369.4, "step": 17470, "train_speed(iter/s)": 0.201427 }, { "acc": 0.76314516, "epoch": 0.4433028919330289, "grad_norm": 2.234375, "learning_rate": 9.215858197520178e-06, "loss": 0.98497124, "memory(GiB)": 369.4, "step": 17475, "train_speed(iter/s)": 0.201431 }, { "acc": 0.75389328, "epoch": 0.44342973110096395, "grad_norm": 2.21875, "learning_rate": 9.215294318075876e-06, "loss": 1.02279587, "memory(GiB)": 369.4, "step": 17480, "train_speed(iter/s)": 0.201441 }, { "acc": 0.75236015, "epoch": 0.44355657026889905, "grad_norm": 2.203125, "learning_rate": 9.21473025322411e-06, "loss": 1.01407566, "memory(GiB)": 369.4, "step": 17485, "train_speed(iter/s)": 0.201452 }, { "acc": 0.74452085, "epoch": 0.4436834094368341, "grad_norm": 2.0625, "learning_rate": 9.21416600298969e-06, "loss": 1.05197124, "memory(GiB)": 369.4, "step": 17490, "train_speed(iter/s)": 0.201462 }, { "acc": 0.75304756, "epoch": 0.44381024860476914, "grad_norm": 2.125, "learning_rate": 9.213601567397434e-06, "loss": 0.94705849, "memory(GiB)": 369.4, "step": 17495, "train_speed(iter/s)": 0.201462 }, { "acc": 0.73187027, "epoch": 0.4439370877727042, "grad_norm": 2.375, "learning_rate": 9.213036946472169e-06, "loss": 1.07970562, "memory(GiB)": 369.4, "step": 17500, "train_speed(iter/s)": 0.201471 }, { "acc": 0.73354235, "epoch": 0.4440639269406393, "grad_norm": 2.125, "learning_rate": 9.212472140238729e-06, "loss": 1.02469997, "memory(GiB)": 369.4, "step": 17505, "train_speed(iter/s)": 0.201482 }, { "acc": 0.7486043, "epoch": 0.44419076610857433, "grad_norm": 2.359375, "learning_rate": 9.211907148721958e-06, "loss": 1.01872597, "memory(GiB)": 369.4, "step": 17510, "train_speed(iter/s)": 0.201495 }, { "acc": 0.74074974, "epoch": 0.4443176052765094, "grad_norm": 1.8828125, "learning_rate": 9.211341971946705e-06, "loss": 1.02796841, "memory(GiB)": 369.4, "step": 17515, "train_speed(iter/s)": 0.201499 }, { "acc": 0.75427575, "epoch": 0.4444444444444444, "grad_norm": 2.046875, "learning_rate": 9.21077660993783e-06, "loss": 1.00446062, "memory(GiB)": 369.4, "step": 17520, "train_speed(iter/s)": 0.201508 }, { "acc": 0.74387898, "epoch": 0.4445712836123795, "grad_norm": 2.359375, "learning_rate": 9.210211062720198e-06, "loss": 1.05232067, "memory(GiB)": 369.4, "step": 17525, "train_speed(iter/s)": 0.201508 }, { "acc": 0.72803211, "epoch": 0.44469812278031456, "grad_norm": 1.9375, "learning_rate": 9.209645330318689e-06, "loss": 1.07042866, "memory(GiB)": 369.4, "step": 17530, "train_speed(iter/s)": 0.201514 }, { "acc": 0.74251318, "epoch": 0.4448249619482496, "grad_norm": 2.375, "learning_rate": 9.209079412758183e-06, "loss": 1.03818741, "memory(GiB)": 369.4, "step": 17535, "train_speed(iter/s)": 0.20152 }, { "acc": 0.74345551, "epoch": 0.44495180111618465, "grad_norm": 2.21875, "learning_rate": 9.208513310063572e-06, "loss": 1.01294117, "memory(GiB)": 369.4, "step": 17540, "train_speed(iter/s)": 0.201527 }, { "acc": 0.75645642, "epoch": 0.44507864028411975, "grad_norm": 2.65625, "learning_rate": 9.207947022259755e-06, "loss": 1.01468601, "memory(GiB)": 369.4, "step": 17545, "train_speed(iter/s)": 0.201539 }, { "acc": 0.7495223, "epoch": 0.4452054794520548, "grad_norm": 2.578125, "learning_rate": 9.207380549371642e-06, "loss": 0.99451294, "memory(GiB)": 369.4, "step": 17550, "train_speed(iter/s)": 0.201546 }, { "acc": 0.73662448, "epoch": 0.44533231861998984, "grad_norm": 2.40625, "learning_rate": 9.206813891424147e-06, "loss": 1.0573, "memory(GiB)": 369.4, "step": 17555, "train_speed(iter/s)": 0.201548 }, { "acc": 0.74277811, "epoch": 0.4454591577879249, "grad_norm": 2.09375, "learning_rate": 9.206247048442196e-06, "loss": 1.04212799, "memory(GiB)": 369.4, "step": 17560, "train_speed(iter/s)": 0.201555 }, { "acc": 0.75115504, "epoch": 0.44558599695586, "grad_norm": 2.078125, "learning_rate": 9.20568002045072e-06, "loss": 1.0272975, "memory(GiB)": 369.4, "step": 17565, "train_speed(iter/s)": 0.201564 }, { "acc": 0.73808341, "epoch": 0.44571283612379503, "grad_norm": 1.90625, "learning_rate": 9.20511280747466e-06, "loss": 1.01410637, "memory(GiB)": 369.4, "step": 17570, "train_speed(iter/s)": 0.201574 }, { "acc": 0.75342169, "epoch": 0.4458396752917301, "grad_norm": 1.6796875, "learning_rate": 9.204545409538962e-06, "loss": 0.93682728, "memory(GiB)": 369.4, "step": 17575, "train_speed(iter/s)": 0.201585 }, { "acc": 0.7302732, "epoch": 0.4459665144596651, "grad_norm": 1.953125, "learning_rate": 9.203977826668587e-06, "loss": 1.10469418, "memory(GiB)": 369.4, "step": 17580, "train_speed(iter/s)": 0.201594 }, { "acc": 0.76051388, "epoch": 0.4460933536276002, "grad_norm": 2.234375, "learning_rate": 9.203410058888498e-06, "loss": 0.97201471, "memory(GiB)": 369.4, "step": 17585, "train_speed(iter/s)": 0.2016 }, { "acc": 0.75024633, "epoch": 0.44622019279553526, "grad_norm": 2.140625, "learning_rate": 9.202842106223667e-06, "loss": 1.01515083, "memory(GiB)": 369.4, "step": 17590, "train_speed(iter/s)": 0.201607 }, { "acc": 0.73698301, "epoch": 0.4463470319634703, "grad_norm": 2.28125, "learning_rate": 9.202273968699075e-06, "loss": 1.07098389, "memory(GiB)": 369.4, "step": 17595, "train_speed(iter/s)": 0.201618 }, { "acc": 0.75365372, "epoch": 0.44647387113140535, "grad_norm": 2.640625, "learning_rate": 9.201705646339714e-06, "loss": 1.02034321, "memory(GiB)": 369.4, "step": 17600, "train_speed(iter/s)": 0.201628 }, { "acc": 0.74920959, "epoch": 0.44660071029934045, "grad_norm": 1.8828125, "learning_rate": 9.201137139170578e-06, "loss": 0.98317356, "memory(GiB)": 369.4, "step": 17605, "train_speed(iter/s)": 0.201639 }, { "acc": 0.75674171, "epoch": 0.4467275494672755, "grad_norm": 2.0, "learning_rate": 9.200568447216673e-06, "loss": 0.99442539, "memory(GiB)": 369.4, "step": 17610, "train_speed(iter/s)": 0.201648 }, { "acc": 0.74670782, "epoch": 0.44685438863521054, "grad_norm": 1.8984375, "learning_rate": 9.199999570503015e-06, "loss": 0.97269583, "memory(GiB)": 369.4, "step": 17615, "train_speed(iter/s)": 0.201655 }, { "acc": 0.71821995, "epoch": 0.4469812278031456, "grad_norm": 1.7265625, "learning_rate": 9.199430509054625e-06, "loss": 1.1292387, "memory(GiB)": 369.4, "step": 17620, "train_speed(iter/s)": 0.201664 }, { "acc": 0.74316244, "epoch": 0.4471080669710807, "grad_norm": 2.359375, "learning_rate": 9.19886126289653e-06, "loss": 1.02212286, "memory(GiB)": 369.4, "step": 17625, "train_speed(iter/s)": 0.201668 }, { "acc": 0.73983431, "epoch": 0.44723490613901573, "grad_norm": 2.203125, "learning_rate": 9.198291832053771e-06, "loss": 1.04138718, "memory(GiB)": 369.4, "step": 17630, "train_speed(iter/s)": 0.201675 }, { "acc": 0.74982023, "epoch": 0.4473617453069508, "grad_norm": 2.34375, "learning_rate": 9.197722216551395e-06, "loss": 1.03402424, "memory(GiB)": 369.4, "step": 17635, "train_speed(iter/s)": 0.201684 }, { "acc": 0.7580183, "epoch": 0.4474885844748858, "grad_norm": 1.671875, "learning_rate": 9.197152416414452e-06, "loss": 0.9450408, "memory(GiB)": 369.4, "step": 17640, "train_speed(iter/s)": 0.201691 }, { "acc": 0.74344339, "epoch": 0.4476154236428209, "grad_norm": 2.0, "learning_rate": 9.196582431668007e-06, "loss": 1.08838062, "memory(GiB)": 369.4, "step": 17645, "train_speed(iter/s)": 0.201696 }, { "acc": 0.74260559, "epoch": 0.44774226281075596, "grad_norm": 2.546875, "learning_rate": 9.196012262337131e-06, "loss": 0.97923164, "memory(GiB)": 369.4, "step": 17650, "train_speed(iter/s)": 0.201701 }, { "acc": 0.74017396, "epoch": 0.447869101978691, "grad_norm": 2.203125, "learning_rate": 9.195441908446902e-06, "loss": 1.07403374, "memory(GiB)": 369.4, "step": 17655, "train_speed(iter/s)": 0.201712 }, { "acc": 0.74194298, "epoch": 0.44799594114662605, "grad_norm": 2.28125, "learning_rate": 9.194871370022407e-06, "loss": 1.02744083, "memory(GiB)": 369.4, "step": 17660, "train_speed(iter/s)": 0.201724 }, { "acc": 0.75011759, "epoch": 0.44812278031456115, "grad_norm": 2.234375, "learning_rate": 9.194300647088739e-06, "loss": 1.01930599, "memory(GiB)": 369.4, "step": 17665, "train_speed(iter/s)": 0.201732 }, { "acc": 0.73174372, "epoch": 0.4482496194824962, "grad_norm": 2.03125, "learning_rate": 9.193729739671002e-06, "loss": 1.02854519, "memory(GiB)": 369.4, "step": 17670, "train_speed(iter/s)": 0.201741 }, { "acc": 0.74374433, "epoch": 0.44837645865043124, "grad_norm": 1.875, "learning_rate": 9.193158647794308e-06, "loss": 1.01801748, "memory(GiB)": 369.4, "step": 17675, "train_speed(iter/s)": 0.201747 }, { "acc": 0.73532324, "epoch": 0.4485032978183663, "grad_norm": 2.28125, "learning_rate": 9.192587371483777e-06, "loss": 1.06471348, "memory(GiB)": 369.4, "step": 17680, "train_speed(iter/s)": 0.201755 }, { "acc": 0.72834902, "epoch": 0.4486301369863014, "grad_norm": 2.34375, "learning_rate": 9.192015910764535e-06, "loss": 1.10776348, "memory(GiB)": 369.4, "step": 17685, "train_speed(iter/s)": 0.201767 }, { "acc": 0.74633636, "epoch": 0.44875697615423643, "grad_norm": 2.4375, "learning_rate": 9.191444265661715e-06, "loss": 1.04296494, "memory(GiB)": 369.4, "step": 17690, "train_speed(iter/s)": 0.201776 }, { "acc": 0.74536209, "epoch": 0.4488838153221715, "grad_norm": 1.9609375, "learning_rate": 9.190872436200464e-06, "loss": 1.02854033, "memory(GiB)": 369.4, "step": 17695, "train_speed(iter/s)": 0.201783 }, { "acc": 0.73646364, "epoch": 0.4490106544901065, "grad_norm": 2.25, "learning_rate": 9.190300422405933e-06, "loss": 1.02117052, "memory(GiB)": 369.4, "step": 17700, "train_speed(iter/s)": 0.201794 }, { "acc": 0.74075427, "epoch": 0.4491374936580416, "grad_norm": 2.15625, "learning_rate": 9.18972822430328e-06, "loss": 1.01075544, "memory(GiB)": 369.4, "step": 17705, "train_speed(iter/s)": 0.201803 }, { "acc": 0.74553733, "epoch": 0.44926433282597666, "grad_norm": 2.109375, "learning_rate": 9.189155841917675e-06, "loss": 0.96991711, "memory(GiB)": 369.4, "step": 17710, "train_speed(iter/s)": 0.20181 }, { "acc": 0.74645452, "epoch": 0.4493911719939117, "grad_norm": 2.296875, "learning_rate": 9.18858327527429e-06, "loss": 1.02086277, "memory(GiB)": 369.4, "step": 17715, "train_speed(iter/s)": 0.201823 }, { "acc": 0.73718424, "epoch": 0.44951801116184675, "grad_norm": 1.921875, "learning_rate": 9.188010524398314e-06, "loss": 1.04050198, "memory(GiB)": 369.4, "step": 17720, "train_speed(iter/s)": 0.201829 }, { "acc": 0.7673872, "epoch": 0.44964485032978185, "grad_norm": 2.40625, "learning_rate": 9.187437589314939e-06, "loss": 0.9283143, "memory(GiB)": 369.4, "step": 17725, "train_speed(iter/s)": 0.201835 }, { "acc": 0.7363945, "epoch": 0.4497716894977169, "grad_norm": 2.171875, "learning_rate": 9.186864470049358e-06, "loss": 1.01941681, "memory(GiB)": 369.4, "step": 17730, "train_speed(iter/s)": 0.201845 }, { "acc": 0.75690622, "epoch": 0.44989852866565194, "grad_norm": 1.8125, "learning_rate": 9.186291166626789e-06, "loss": 0.9808403, "memory(GiB)": 369.4, "step": 17735, "train_speed(iter/s)": 0.201855 }, { "acc": 0.74556808, "epoch": 0.450025367833587, "grad_norm": 2.21875, "learning_rate": 9.185717679072444e-06, "loss": 1.01032887, "memory(GiB)": 369.4, "step": 17740, "train_speed(iter/s)": 0.201864 }, { "acc": 0.74152021, "epoch": 0.4501522070015221, "grad_norm": 1.9296875, "learning_rate": 9.185144007411547e-06, "loss": 1.01042166, "memory(GiB)": 369.4, "step": 17745, "train_speed(iter/s)": 0.201871 }, { "acc": 0.72997952, "epoch": 0.45027904616945713, "grad_norm": 1.96875, "learning_rate": 9.18457015166933e-06, "loss": 1.0259573, "memory(GiB)": 369.4, "step": 17750, "train_speed(iter/s)": 0.201879 }, { "acc": 0.74847565, "epoch": 0.4504058853373922, "grad_norm": 2.65625, "learning_rate": 9.183996111871034e-06, "loss": 1.02460861, "memory(GiB)": 369.4, "step": 17755, "train_speed(iter/s)": 0.201886 }, { "acc": 0.74466124, "epoch": 0.4505327245053272, "grad_norm": 2.25, "learning_rate": 9.18342188804191e-06, "loss": 1.05888424, "memory(GiB)": 369.4, "step": 17760, "train_speed(iter/s)": 0.201894 }, { "acc": 0.73978944, "epoch": 0.4506595636732623, "grad_norm": 2.171875, "learning_rate": 9.182847480207215e-06, "loss": 1.00123682, "memory(GiB)": 369.4, "step": 17765, "train_speed(iter/s)": 0.201898 }, { "acc": 0.74871674, "epoch": 0.45078640284119736, "grad_norm": 1.9375, "learning_rate": 9.182272888392211e-06, "loss": 0.98247433, "memory(GiB)": 369.4, "step": 17770, "train_speed(iter/s)": 0.201906 }, { "acc": 0.73652177, "epoch": 0.4509132420091324, "grad_norm": 2.3125, "learning_rate": 9.181698112622175e-06, "loss": 1.01969547, "memory(GiB)": 369.4, "step": 17775, "train_speed(iter/s)": 0.201915 }, { "acc": 0.72153745, "epoch": 0.45104008117706745, "grad_norm": 2.03125, "learning_rate": 9.181123152922384e-06, "loss": 1.03437443, "memory(GiB)": 369.4, "step": 17780, "train_speed(iter/s)": 0.201922 }, { "acc": 0.73947682, "epoch": 0.45116692034500255, "grad_norm": 2.0, "learning_rate": 9.18054800931813e-06, "loss": 1.03232765, "memory(GiB)": 369.4, "step": 17785, "train_speed(iter/s)": 0.201931 }, { "acc": 0.74758563, "epoch": 0.4512937595129376, "grad_norm": 2.0625, "learning_rate": 9.17997268183471e-06, "loss": 0.98439407, "memory(GiB)": 369.4, "step": 17790, "train_speed(iter/s)": 0.201939 }, { "acc": 0.73698263, "epoch": 0.45142059868087264, "grad_norm": 2.21875, "learning_rate": 9.17939717049743e-06, "loss": 1.05883799, "memory(GiB)": 369.4, "step": 17795, "train_speed(iter/s)": 0.20195 }, { "acc": 0.73664875, "epoch": 0.4515474378488077, "grad_norm": 2.515625, "learning_rate": 9.1788214753316e-06, "loss": 1.05040169, "memory(GiB)": 369.4, "step": 17800, "train_speed(iter/s)": 0.201956 }, { "acc": 0.75258622, "epoch": 0.4516742770167428, "grad_norm": 2.25, "learning_rate": 9.178245596362546e-06, "loss": 1.01810551, "memory(GiB)": 369.4, "step": 17805, "train_speed(iter/s)": 0.201962 }, { "acc": 0.74351168, "epoch": 0.45180111618467783, "grad_norm": 2.40625, "learning_rate": 9.177669533615599e-06, "loss": 0.98839397, "memory(GiB)": 369.4, "step": 17810, "train_speed(iter/s)": 0.201972 }, { "acc": 0.74734817, "epoch": 0.4519279553526129, "grad_norm": 2.09375, "learning_rate": 9.17709328711609e-06, "loss": 0.97849216, "memory(GiB)": 369.4, "step": 17815, "train_speed(iter/s)": 0.201981 }, { "acc": 0.73763008, "epoch": 0.4520547945205479, "grad_norm": 2.140625, "learning_rate": 9.17651685688937e-06, "loss": 1.02706661, "memory(GiB)": 369.4, "step": 17820, "train_speed(iter/s)": 0.20199 }, { "acc": 0.74965568, "epoch": 0.452181633688483, "grad_norm": 2.203125, "learning_rate": 9.175940242960792e-06, "loss": 0.99834309, "memory(GiB)": 369.4, "step": 17825, "train_speed(iter/s)": 0.202003 }, { "acc": 0.7389596, "epoch": 0.45230847285641806, "grad_norm": 2.015625, "learning_rate": 9.175363445355718e-06, "loss": 1.05864296, "memory(GiB)": 369.4, "step": 17830, "train_speed(iter/s)": 0.202013 }, { "acc": 0.74928932, "epoch": 0.4524353120243531, "grad_norm": 2.171875, "learning_rate": 9.174786464099519e-06, "loss": 1.02561512, "memory(GiB)": 369.4, "step": 17835, "train_speed(iter/s)": 0.202021 }, { "acc": 0.73845687, "epoch": 0.45256215119228815, "grad_norm": 2.296875, "learning_rate": 9.17420929921757e-06, "loss": 1.04434834, "memory(GiB)": 369.4, "step": 17840, "train_speed(iter/s)": 0.202032 }, { "acc": 0.75545769, "epoch": 0.45268899036022325, "grad_norm": 2.171875, "learning_rate": 9.173631950735262e-06, "loss": 0.97618799, "memory(GiB)": 369.4, "step": 17845, "train_speed(iter/s)": 0.202038 }, { "acc": 0.75515079, "epoch": 0.4528158295281583, "grad_norm": 2.21875, "learning_rate": 9.173054418677985e-06, "loss": 1.03874111, "memory(GiB)": 369.4, "step": 17850, "train_speed(iter/s)": 0.202049 }, { "acc": 0.74751377, "epoch": 0.45294266869609334, "grad_norm": 2.46875, "learning_rate": 9.172476703071145e-06, "loss": 1.02088223, "memory(GiB)": 369.4, "step": 17855, "train_speed(iter/s)": 0.202055 }, { "acc": 0.74832358, "epoch": 0.4530695078640284, "grad_norm": 2.125, "learning_rate": 9.171898803940148e-06, "loss": 0.99074135, "memory(GiB)": 369.4, "step": 17860, "train_speed(iter/s)": 0.202063 }, { "acc": 0.75987263, "epoch": 0.4531963470319635, "grad_norm": 2.1875, "learning_rate": 9.17132072131042e-06, "loss": 0.95746012, "memory(GiB)": 369.4, "step": 17865, "train_speed(iter/s)": 0.202071 }, { "acc": 0.75347157, "epoch": 0.45332318619989853, "grad_norm": 2.390625, "learning_rate": 9.170742455207378e-06, "loss": 1.003438, "memory(GiB)": 369.4, "step": 17870, "train_speed(iter/s)": 0.202065 }, { "acc": 0.75301094, "epoch": 0.4534500253678336, "grad_norm": 2.4375, "learning_rate": 9.170164005656465e-06, "loss": 0.9961442, "memory(GiB)": 369.4, "step": 17875, "train_speed(iter/s)": 0.202071 }, { "acc": 0.76303406, "epoch": 0.4535768645357686, "grad_norm": 2.15625, "learning_rate": 9.169585372683118e-06, "loss": 0.93577347, "memory(GiB)": 369.4, "step": 17880, "train_speed(iter/s)": 0.202074 }, { "acc": 0.74699087, "epoch": 0.4537037037037037, "grad_norm": 2.1875, "learning_rate": 9.169006556312794e-06, "loss": 1.0051034, "memory(GiB)": 369.4, "step": 17885, "train_speed(iter/s)": 0.202081 }, { "acc": 0.73671579, "epoch": 0.45383054287163876, "grad_norm": 2.1875, "learning_rate": 9.168427556570946e-06, "loss": 1.05653305, "memory(GiB)": 369.4, "step": 17890, "train_speed(iter/s)": 0.202076 }, { "acc": 0.75194211, "epoch": 0.4539573820395738, "grad_norm": 2.203125, "learning_rate": 9.167848373483044e-06, "loss": 1.00304947, "memory(GiB)": 369.4, "step": 17895, "train_speed(iter/s)": 0.202086 }, { "acc": 0.73845911, "epoch": 0.45408422120750885, "grad_norm": 2.046875, "learning_rate": 9.167269007074561e-06, "loss": 1.03497896, "memory(GiB)": 369.4, "step": 17900, "train_speed(iter/s)": 0.202089 }, { "acc": 0.74795504, "epoch": 0.45421106037544395, "grad_norm": 1.734375, "learning_rate": 9.166689457370983e-06, "loss": 1.01098623, "memory(GiB)": 369.4, "step": 17905, "train_speed(iter/s)": 0.202099 }, { "acc": 0.7300601, "epoch": 0.454337899543379, "grad_norm": 2.1875, "learning_rate": 9.166109724397801e-06, "loss": 1.05195656, "memory(GiB)": 369.4, "step": 17910, "train_speed(iter/s)": 0.202107 }, { "acc": 0.74865026, "epoch": 0.45446473871131404, "grad_norm": 2.4375, "learning_rate": 9.165529808180511e-06, "loss": 1.00273991, "memory(GiB)": 369.4, "step": 17915, "train_speed(iter/s)": 0.202118 }, { "acc": 0.74625731, "epoch": 0.4545915778792491, "grad_norm": 1.9296875, "learning_rate": 9.164949708744622e-06, "loss": 1.02605133, "memory(GiB)": 369.4, "step": 17920, "train_speed(iter/s)": 0.202124 }, { "acc": 0.7459856, "epoch": 0.4547184170471842, "grad_norm": 2.953125, "learning_rate": 9.164369426115652e-06, "loss": 0.97921267, "memory(GiB)": 369.4, "step": 17925, "train_speed(iter/s)": 0.202131 }, { "acc": 0.73749609, "epoch": 0.45484525621511923, "grad_norm": 2.203125, "learning_rate": 9.16378896031912e-06, "loss": 1.0624239, "memory(GiB)": 369.4, "step": 17930, "train_speed(iter/s)": 0.202138 }, { "acc": 0.74533739, "epoch": 0.4549720953830543, "grad_norm": 2.234375, "learning_rate": 9.163208311380561e-06, "loss": 1.00259705, "memory(GiB)": 369.4, "step": 17935, "train_speed(iter/s)": 0.202147 }, { "acc": 0.74668674, "epoch": 0.4550989345509893, "grad_norm": 2.015625, "learning_rate": 9.162627479325511e-06, "loss": 0.99381886, "memory(GiB)": 369.4, "step": 17940, "train_speed(iter/s)": 0.202155 }, { "acc": 0.75724192, "epoch": 0.4552257737189244, "grad_norm": 2.328125, "learning_rate": 9.16204646417952e-06, "loss": 1.00283051, "memory(GiB)": 369.4, "step": 17945, "train_speed(iter/s)": 0.202162 }, { "acc": 0.74857779, "epoch": 0.45535261288685946, "grad_norm": 1.9375, "learning_rate": 9.161465265968145e-06, "loss": 0.96320095, "memory(GiB)": 369.4, "step": 17950, "train_speed(iter/s)": 0.202165 }, { "acc": 0.75910769, "epoch": 0.4554794520547945, "grad_norm": 2.515625, "learning_rate": 9.160883884716948e-06, "loss": 0.96844273, "memory(GiB)": 369.4, "step": 17955, "train_speed(iter/s)": 0.202174 }, { "acc": 0.74531116, "epoch": 0.45560629122272955, "grad_norm": 1.9609375, "learning_rate": 9.1603023204515e-06, "loss": 1.02774754, "memory(GiB)": 369.4, "step": 17960, "train_speed(iter/s)": 0.202184 }, { "acc": 0.72903686, "epoch": 0.45573313039066465, "grad_norm": 1.8203125, "learning_rate": 9.15972057319738e-06, "loss": 1.01113243, "memory(GiB)": 369.4, "step": 17965, "train_speed(iter/s)": 0.202192 }, { "acc": 0.74989791, "epoch": 0.4558599695585997, "grad_norm": 2.375, "learning_rate": 9.159138642980178e-06, "loss": 1.02440567, "memory(GiB)": 369.4, "step": 17970, "train_speed(iter/s)": 0.202201 }, { "acc": 0.74799881, "epoch": 0.45598680872653474, "grad_norm": 2.5625, "learning_rate": 9.15855652982549e-06, "loss": 1.04383144, "memory(GiB)": 369.4, "step": 17975, "train_speed(iter/s)": 0.202209 }, { "acc": 0.75298014, "epoch": 0.4561136478944698, "grad_norm": 2.953125, "learning_rate": 9.15797423375892e-06, "loss": 1.02381001, "memory(GiB)": 369.4, "step": 17980, "train_speed(iter/s)": 0.202215 }, { "acc": 0.74485617, "epoch": 0.4562404870624049, "grad_norm": 2.390625, "learning_rate": 9.157391754806079e-06, "loss": 1.01816597, "memory(GiB)": 369.4, "step": 17985, "train_speed(iter/s)": 0.20222 }, { "acc": 0.74530783, "epoch": 0.45636732623033993, "grad_norm": 2.34375, "learning_rate": 9.156809092992588e-06, "loss": 1.01202679, "memory(GiB)": 369.4, "step": 17990, "train_speed(iter/s)": 0.202228 }, { "acc": 0.74497728, "epoch": 0.456494165398275, "grad_norm": 2.390625, "learning_rate": 9.156226248344072e-06, "loss": 1.03061209, "memory(GiB)": 369.4, "step": 17995, "train_speed(iter/s)": 0.202234 }, { "acc": 0.74958439, "epoch": 0.45662100456621, "grad_norm": 2.34375, "learning_rate": 9.15564322088617e-06, "loss": 1.04801359, "memory(GiB)": 369.4, "step": 18000, "train_speed(iter/s)": 0.202237 }, { "epoch": 0.45662100456621, "eval_acc": 0.7340597749821718, "eval_loss": 0.9894003868103027, "eval_runtime": 384.6537, "eval_samples_per_second": 16.56, "eval_steps_per_second": 8.28, "step": 18000 }, { "acc": 0.74138498, "epoch": 0.4567478437341451, "grad_norm": 2.234375, "learning_rate": 9.155060010644525e-06, "loss": 1.04365826, "memory(GiB)": 369.4, "step": 18005, "train_speed(iter/s)": 0.20064 }, { "acc": 0.75534167, "epoch": 0.45687468290208016, "grad_norm": 2.359375, "learning_rate": 9.154476617644792e-06, "loss": 0.9776968, "memory(GiB)": 369.4, "step": 18010, "train_speed(iter/s)": 0.200648 }, { "acc": 0.74404125, "epoch": 0.4570015220700152, "grad_norm": 2.578125, "learning_rate": 9.153893041912627e-06, "loss": 0.96656513, "memory(GiB)": 369.4, "step": 18015, "train_speed(iter/s)": 0.200659 }, { "acc": 0.75821233, "epoch": 0.45712836123795025, "grad_norm": 2.234375, "learning_rate": 9.1533092834737e-06, "loss": 0.98463163, "memory(GiB)": 369.4, "step": 18020, "train_speed(iter/s)": 0.200669 }, { "acc": 0.7411499, "epoch": 0.45725520040588535, "grad_norm": 2.109375, "learning_rate": 9.152725342353688e-06, "loss": 1.0659585, "memory(GiB)": 369.4, "step": 18025, "train_speed(iter/s)": 0.200677 }, { "acc": 0.73205814, "epoch": 0.4573820395738204, "grad_norm": 2.625, "learning_rate": 9.152141218578276e-06, "loss": 1.05362949, "memory(GiB)": 369.4, "step": 18030, "train_speed(iter/s)": 0.200682 }, { "acc": 0.73219733, "epoch": 0.45750887874175544, "grad_norm": 2.203125, "learning_rate": 9.151556912173154e-06, "loss": 1.09473925, "memory(GiB)": 369.4, "step": 18035, "train_speed(iter/s)": 0.200685 }, { "acc": 0.74182577, "epoch": 0.4576357179096905, "grad_norm": 2.640625, "learning_rate": 9.150972423164024e-06, "loss": 1.04581089, "memory(GiB)": 369.4, "step": 18040, "train_speed(iter/s)": 0.200691 }, { "acc": 0.74273396, "epoch": 0.4577625570776256, "grad_norm": 2.578125, "learning_rate": 9.150387751576594e-06, "loss": 1.0552372, "memory(GiB)": 369.4, "step": 18045, "train_speed(iter/s)": 0.200698 }, { "acc": 0.73415737, "epoch": 0.45788939624556063, "grad_norm": 1.8359375, "learning_rate": 9.14980289743658e-06, "loss": 1.04551039, "memory(GiB)": 369.4, "step": 18050, "train_speed(iter/s)": 0.200702 }, { "acc": 0.73484373, "epoch": 0.4580162354134957, "grad_norm": 2.21875, "learning_rate": 9.149217860769708e-06, "loss": 1.05748177, "memory(GiB)": 369.4, "step": 18055, "train_speed(iter/s)": 0.20071 }, { "acc": 0.75279303, "epoch": 0.4581430745814307, "grad_norm": 2.359375, "learning_rate": 9.14863264160171e-06, "loss": 0.9912426, "memory(GiB)": 369.4, "step": 18060, "train_speed(iter/s)": 0.200719 }, { "acc": 0.73784642, "epoch": 0.4582699137493658, "grad_norm": 2.328125, "learning_rate": 9.148047239958325e-06, "loss": 1.0952507, "memory(GiB)": 369.4, "step": 18065, "train_speed(iter/s)": 0.20073 }, { "acc": 0.73860369, "epoch": 0.45839675291730086, "grad_norm": 2.484375, "learning_rate": 9.147461655865302e-06, "loss": 1.06915188, "memory(GiB)": 369.4, "step": 18070, "train_speed(iter/s)": 0.200738 }, { "acc": 0.75180755, "epoch": 0.4585235920852359, "grad_norm": 2.28125, "learning_rate": 9.1468758893484e-06, "loss": 1.05239239, "memory(GiB)": 369.4, "step": 18075, "train_speed(iter/s)": 0.200748 }, { "acc": 0.75622911, "epoch": 0.45865043125317095, "grad_norm": 1.9609375, "learning_rate": 9.14628994043338e-06, "loss": 1.05468731, "memory(GiB)": 369.4, "step": 18080, "train_speed(iter/s)": 0.200754 }, { "acc": 0.74619093, "epoch": 0.45877727042110605, "grad_norm": 2.0625, "learning_rate": 9.145703809146018e-06, "loss": 1.06425247, "memory(GiB)": 369.4, "step": 18085, "train_speed(iter/s)": 0.20076 }, { "acc": 0.73526816, "epoch": 0.4589041095890411, "grad_norm": 2.125, "learning_rate": 9.145117495512092e-06, "loss": 1.04578533, "memory(GiB)": 369.4, "step": 18090, "train_speed(iter/s)": 0.200773 }, { "acc": 0.74626245, "epoch": 0.45903094875697614, "grad_norm": 2.203125, "learning_rate": 9.144530999557393e-06, "loss": 1.07337265, "memory(GiB)": 369.4, "step": 18095, "train_speed(iter/s)": 0.200782 }, { "acc": 0.75093088, "epoch": 0.4591577879249112, "grad_norm": 2.59375, "learning_rate": 9.143944321307718e-06, "loss": 1.02109776, "memory(GiB)": 369.4, "step": 18100, "train_speed(iter/s)": 0.200793 }, { "acc": 0.73603597, "epoch": 0.4592846270928463, "grad_norm": 1.921875, "learning_rate": 9.143357460788868e-06, "loss": 1.0454483, "memory(GiB)": 369.4, "step": 18105, "train_speed(iter/s)": 0.200797 }, { "acc": 0.72700391, "epoch": 0.45941146626078133, "grad_norm": 2.375, "learning_rate": 9.14277041802666e-06, "loss": 1.12168484, "memory(GiB)": 369.4, "step": 18110, "train_speed(iter/s)": 0.200804 }, { "acc": 0.7416193, "epoch": 0.4595383054287164, "grad_norm": 2.375, "learning_rate": 9.142183193046912e-06, "loss": 1.03198662, "memory(GiB)": 369.4, "step": 18115, "train_speed(iter/s)": 0.200808 }, { "acc": 0.75261698, "epoch": 0.4596651445966514, "grad_norm": 2.09375, "learning_rate": 9.141595785875453e-06, "loss": 0.97236958, "memory(GiB)": 369.4, "step": 18120, "train_speed(iter/s)": 0.200816 }, { "acc": 0.75195522, "epoch": 0.4597919837645865, "grad_norm": 1.8203125, "learning_rate": 9.141008196538122e-06, "loss": 1.02564859, "memory(GiB)": 369.4, "step": 18125, "train_speed(iter/s)": 0.200826 }, { "acc": 0.74609032, "epoch": 0.45991882293252156, "grad_norm": 2.265625, "learning_rate": 9.14042042506076e-06, "loss": 0.98958864, "memory(GiB)": 369.4, "step": 18130, "train_speed(iter/s)": 0.200837 }, { "acc": 0.7449522, "epoch": 0.4600456621004566, "grad_norm": 2.453125, "learning_rate": 9.139832471469224e-06, "loss": 0.99943295, "memory(GiB)": 369.4, "step": 18135, "train_speed(iter/s)": 0.200843 }, { "acc": 0.75066152, "epoch": 0.46017250126839165, "grad_norm": 1.8671875, "learning_rate": 9.13924433578937e-06, "loss": 1.02682552, "memory(GiB)": 369.4, "step": 18140, "train_speed(iter/s)": 0.200848 }, { "acc": 0.74338999, "epoch": 0.46029934043632675, "grad_norm": 1.9140625, "learning_rate": 9.138656018047074e-06, "loss": 1.02882195, "memory(GiB)": 369.4, "step": 18145, "train_speed(iter/s)": 0.200855 }, { "acc": 0.7530694, "epoch": 0.4604261796042618, "grad_norm": 2.015625, "learning_rate": 9.138067518268206e-06, "loss": 1.00559196, "memory(GiB)": 369.4, "step": 18150, "train_speed(iter/s)": 0.200855 }, { "acc": 0.75272436, "epoch": 0.46055301877219684, "grad_norm": 2.03125, "learning_rate": 9.137478836478654e-06, "loss": 0.97718849, "memory(GiB)": 369.4, "step": 18155, "train_speed(iter/s)": 0.200867 }, { "acc": 0.75344868, "epoch": 0.4606798579401319, "grad_norm": 2.171875, "learning_rate": 9.13688997270431e-06, "loss": 1.00888987, "memory(GiB)": 369.4, "step": 18160, "train_speed(iter/s)": 0.200876 }, { "acc": 0.77454405, "epoch": 0.460806697108067, "grad_norm": 2.1875, "learning_rate": 9.136300926971076e-06, "loss": 0.94853344, "memory(GiB)": 369.4, "step": 18165, "train_speed(iter/s)": 0.200883 }, { "acc": 0.73737049, "epoch": 0.46093353627600203, "grad_norm": 1.96875, "learning_rate": 9.135711699304858e-06, "loss": 1.09618778, "memory(GiB)": 369.4, "step": 18170, "train_speed(iter/s)": 0.200893 }, { "acc": 0.74433088, "epoch": 0.4610603754439371, "grad_norm": 2.0625, "learning_rate": 9.135122289731575e-06, "loss": 1.01100979, "memory(GiB)": 369.4, "step": 18175, "train_speed(iter/s)": 0.200903 }, { "acc": 0.74349971, "epoch": 0.4611872146118721, "grad_norm": 2.4375, "learning_rate": 9.134532698277154e-06, "loss": 0.96857214, "memory(GiB)": 369.4, "step": 18180, "train_speed(iter/s)": 0.200913 }, { "acc": 0.74454579, "epoch": 0.4613140537798072, "grad_norm": 2.9375, "learning_rate": 9.133942924967524e-06, "loss": 1.04483891, "memory(GiB)": 369.4, "step": 18185, "train_speed(iter/s)": 0.200923 }, { "acc": 0.75815563, "epoch": 0.46144089294774226, "grad_norm": 2.84375, "learning_rate": 9.133352969828628e-06, "loss": 0.98471613, "memory(GiB)": 369.4, "step": 18190, "train_speed(iter/s)": 0.200932 }, { "acc": 0.74501786, "epoch": 0.4615677321156773, "grad_norm": 1.96875, "learning_rate": 9.132762832886416e-06, "loss": 0.98113403, "memory(GiB)": 369.4, "step": 18195, "train_speed(iter/s)": 0.20094 }, { "acc": 0.74272575, "epoch": 0.46169457128361235, "grad_norm": 2.15625, "learning_rate": 9.13217251416684e-06, "loss": 1.04615307, "memory(GiB)": 369.4, "step": 18200, "train_speed(iter/s)": 0.200948 }, { "acc": 0.75002785, "epoch": 0.46182141045154745, "grad_norm": 2.515625, "learning_rate": 9.131582013695867e-06, "loss": 1.00952892, "memory(GiB)": 369.4, "step": 18205, "train_speed(iter/s)": 0.200955 }, { "acc": 0.75288615, "epoch": 0.4619482496194825, "grad_norm": 2.421875, "learning_rate": 9.130991331499474e-06, "loss": 1.04158192, "memory(GiB)": 369.4, "step": 18210, "train_speed(iter/s)": 0.200963 }, { "acc": 0.76351953, "epoch": 0.46207508878741754, "grad_norm": 2.234375, "learning_rate": 9.130400467603637e-06, "loss": 0.94839516, "memory(GiB)": 369.4, "step": 18215, "train_speed(iter/s)": 0.200973 }, { "acc": 0.73385487, "epoch": 0.4622019279553526, "grad_norm": 2.546875, "learning_rate": 9.129809422034349e-06, "loss": 1.05464878, "memory(GiB)": 369.4, "step": 18220, "train_speed(iter/s)": 0.200984 }, { "acc": 0.75195284, "epoch": 0.4623287671232877, "grad_norm": 2.390625, "learning_rate": 9.129218194817601e-06, "loss": 1.05297489, "memory(GiB)": 369.4, "step": 18225, "train_speed(iter/s)": 0.200992 }, { "acc": 0.74465923, "epoch": 0.46245560629122273, "grad_norm": 2.109375, "learning_rate": 9.128626785979404e-06, "loss": 1.06179609, "memory(GiB)": 369.4, "step": 18230, "train_speed(iter/s)": 0.201001 }, { "acc": 0.74586716, "epoch": 0.4625824454591578, "grad_norm": 2.421875, "learning_rate": 9.128035195545766e-06, "loss": 1.03171902, "memory(GiB)": 369.4, "step": 18235, "train_speed(iter/s)": 0.201008 }, { "acc": 0.72525787, "epoch": 0.4627092846270928, "grad_norm": 2.1875, "learning_rate": 9.12744342354271e-06, "loss": 1.12117481, "memory(GiB)": 369.4, "step": 18240, "train_speed(iter/s)": 0.201014 }, { "acc": 0.74951391, "epoch": 0.4628361237950279, "grad_norm": 2.421875, "learning_rate": 9.126851469996265e-06, "loss": 0.99785452, "memory(GiB)": 369.4, "step": 18245, "train_speed(iter/s)": 0.201023 }, { "acc": 0.74745603, "epoch": 0.46296296296296297, "grad_norm": 2.34375, "learning_rate": 9.126259334932467e-06, "loss": 1.00995846, "memory(GiB)": 369.4, "step": 18250, "train_speed(iter/s)": 0.201033 }, { "acc": 0.7372407, "epoch": 0.463089802130898, "grad_norm": 2.734375, "learning_rate": 9.125667018377362e-06, "loss": 1.0724596, "memory(GiB)": 369.4, "step": 18255, "train_speed(iter/s)": 0.201044 }, { "acc": 0.74463177, "epoch": 0.46321664129883305, "grad_norm": 2.546875, "learning_rate": 9.125074520357002e-06, "loss": 1.01046619, "memory(GiB)": 369.4, "step": 18260, "train_speed(iter/s)": 0.201049 }, { "acc": 0.73934555, "epoch": 0.46334348046676815, "grad_norm": 2.078125, "learning_rate": 9.124481840897446e-06, "loss": 1.0076025, "memory(GiB)": 369.4, "step": 18265, "train_speed(iter/s)": 0.201058 }, { "acc": 0.73974133, "epoch": 0.4634703196347032, "grad_norm": 2.125, "learning_rate": 9.123888980024765e-06, "loss": 1.01058884, "memory(GiB)": 369.4, "step": 18270, "train_speed(iter/s)": 0.201058 }, { "acc": 0.75701022, "epoch": 0.46359715880263824, "grad_norm": 1.9609375, "learning_rate": 9.123295937765034e-06, "loss": 0.98573246, "memory(GiB)": 369.4, "step": 18275, "train_speed(iter/s)": 0.201067 }, { "acc": 0.7622201, "epoch": 0.4637239979705733, "grad_norm": 2.5, "learning_rate": 9.122702714144339e-06, "loss": 0.98266048, "memory(GiB)": 369.4, "step": 18280, "train_speed(iter/s)": 0.201074 }, { "acc": 0.74297676, "epoch": 0.4638508371385084, "grad_norm": 2.078125, "learning_rate": 9.122109309188772e-06, "loss": 1.02835407, "memory(GiB)": 369.4, "step": 18285, "train_speed(iter/s)": 0.201085 }, { "acc": 0.76578646, "epoch": 0.46397767630644343, "grad_norm": 2.796875, "learning_rate": 9.121515722924435e-06, "loss": 0.96913204, "memory(GiB)": 369.4, "step": 18290, "train_speed(iter/s)": 0.201094 }, { "acc": 0.75011253, "epoch": 0.4641045154743785, "grad_norm": 2.265625, "learning_rate": 9.120921955377433e-06, "loss": 0.98360729, "memory(GiB)": 369.4, "step": 18295, "train_speed(iter/s)": 0.201103 }, { "acc": 0.73592224, "epoch": 0.4642313546423135, "grad_norm": 2.0625, "learning_rate": 9.120328006573887e-06, "loss": 1.01829605, "memory(GiB)": 369.4, "step": 18300, "train_speed(iter/s)": 0.201107 }, { "acc": 0.74799905, "epoch": 0.4643581938102486, "grad_norm": 2.125, "learning_rate": 9.119733876539916e-06, "loss": 0.98453617, "memory(GiB)": 369.4, "step": 18305, "train_speed(iter/s)": 0.201114 }, { "acc": 0.74518313, "epoch": 0.46448503297818367, "grad_norm": 2.125, "learning_rate": 9.119139565301658e-06, "loss": 0.98177643, "memory(GiB)": 369.4, "step": 18310, "train_speed(iter/s)": 0.201126 }, { "acc": 0.74695148, "epoch": 0.4646118721461187, "grad_norm": 2.234375, "learning_rate": 9.118545072885253e-06, "loss": 1.0390625, "memory(GiB)": 369.4, "step": 18315, "train_speed(iter/s)": 0.201138 }, { "acc": 0.7556417, "epoch": 0.46473871131405375, "grad_norm": 2.578125, "learning_rate": 9.117950399316845e-06, "loss": 0.97182388, "memory(GiB)": 369.4, "step": 18320, "train_speed(iter/s)": 0.201148 }, { "acc": 0.75818868, "epoch": 0.46486555048198885, "grad_norm": 2.578125, "learning_rate": 9.117355544622595e-06, "loss": 0.99345875, "memory(GiB)": 369.4, "step": 18325, "train_speed(iter/s)": 0.201157 }, { "acc": 0.73358989, "epoch": 0.4649923896499239, "grad_norm": 2.09375, "learning_rate": 9.116760508828664e-06, "loss": 0.97802029, "memory(GiB)": 369.4, "step": 18330, "train_speed(iter/s)": 0.201166 }, { "acc": 0.7570075, "epoch": 0.46511922881785894, "grad_norm": 2.03125, "learning_rate": 9.116165291961225e-06, "loss": 1.01340771, "memory(GiB)": 369.4, "step": 18335, "train_speed(iter/s)": 0.201173 }, { "acc": 0.74844151, "epoch": 0.465246067985794, "grad_norm": 1.9765625, "learning_rate": 9.11556989404646e-06, "loss": 1.00710621, "memory(GiB)": 369.4, "step": 18340, "train_speed(iter/s)": 0.20118 }, { "acc": 0.73576231, "epoch": 0.4653729071537291, "grad_norm": 2.421875, "learning_rate": 9.114974315110558e-06, "loss": 1.09043875, "memory(GiB)": 369.4, "step": 18345, "train_speed(iter/s)": 0.201186 }, { "acc": 0.75288544, "epoch": 0.46549974632166413, "grad_norm": 2.421875, "learning_rate": 9.114378555179712e-06, "loss": 1.03799133, "memory(GiB)": 369.4, "step": 18350, "train_speed(iter/s)": 0.201191 }, { "acc": 0.74916487, "epoch": 0.4656265854895992, "grad_norm": 2.359375, "learning_rate": 9.11378261428013e-06, "loss": 1.01119041, "memory(GiB)": 369.4, "step": 18355, "train_speed(iter/s)": 0.201198 }, { "acc": 0.74752426, "epoch": 0.4657534246575342, "grad_norm": 2.1875, "learning_rate": 9.11318649243802e-06, "loss": 0.99589367, "memory(GiB)": 369.4, "step": 18360, "train_speed(iter/s)": 0.201205 }, { "acc": 0.76135292, "epoch": 0.4658802638254693, "grad_norm": 2.265625, "learning_rate": 9.112590189679604e-06, "loss": 0.97359715, "memory(GiB)": 369.4, "step": 18365, "train_speed(iter/s)": 0.201216 }, { "acc": 0.73806496, "epoch": 0.46600710299340437, "grad_norm": 1.8203125, "learning_rate": 9.111993706031109e-06, "loss": 1.03057432, "memory(GiB)": 369.4, "step": 18370, "train_speed(iter/s)": 0.201222 }, { "acc": 0.75228963, "epoch": 0.4661339421613394, "grad_norm": 2.0625, "learning_rate": 9.111397041518774e-06, "loss": 1.01085815, "memory(GiB)": 369.4, "step": 18375, "train_speed(iter/s)": 0.20123 }, { "acc": 0.74453793, "epoch": 0.46626078132927445, "grad_norm": 2.21875, "learning_rate": 9.11080019616884e-06, "loss": 0.97958469, "memory(GiB)": 369.4, "step": 18380, "train_speed(iter/s)": 0.201241 }, { "acc": 0.75512657, "epoch": 0.46638762049720955, "grad_norm": 2.375, "learning_rate": 9.11020317000756e-06, "loss": 1.02691898, "memory(GiB)": 369.4, "step": 18385, "train_speed(iter/s)": 0.201244 }, { "acc": 0.72034278, "epoch": 0.4665144596651446, "grad_norm": 2.28125, "learning_rate": 9.109605963061194e-06, "loss": 1.09429417, "memory(GiB)": 369.4, "step": 18390, "train_speed(iter/s)": 0.201246 }, { "acc": 0.74435277, "epoch": 0.46664129883307964, "grad_norm": 2.203125, "learning_rate": 9.10900857535601e-06, "loss": 1.02153473, "memory(GiB)": 369.4, "step": 18395, "train_speed(iter/s)": 0.201255 }, { "acc": 0.73054185, "epoch": 0.4667681380010147, "grad_norm": 2.359375, "learning_rate": 9.108411006918283e-06, "loss": 1.05190277, "memory(GiB)": 369.4, "step": 18400, "train_speed(iter/s)": 0.201263 }, { "acc": 0.75215216, "epoch": 0.4668949771689498, "grad_norm": 2.453125, "learning_rate": 9.107813257774298e-06, "loss": 1.0322751, "memory(GiB)": 369.4, "step": 18405, "train_speed(iter/s)": 0.20127 }, { "acc": 0.74067531, "epoch": 0.46702181633688483, "grad_norm": 2.265625, "learning_rate": 9.107215327950345e-06, "loss": 1.03533802, "memory(GiB)": 369.4, "step": 18410, "train_speed(iter/s)": 0.201277 }, { "acc": 0.74793835, "epoch": 0.4671486555048199, "grad_norm": 2.28125, "learning_rate": 9.106617217472724e-06, "loss": 1.00773687, "memory(GiB)": 369.4, "step": 18415, "train_speed(iter/s)": 0.201287 }, { "acc": 0.73359756, "epoch": 0.4672754946727549, "grad_norm": 2.15625, "learning_rate": 9.106018926367744e-06, "loss": 1.05531044, "memory(GiB)": 369.4, "step": 18420, "train_speed(iter/s)": 0.201298 }, { "acc": 0.7421175, "epoch": 0.46740233384069, "grad_norm": 2.21875, "learning_rate": 9.10542045466172e-06, "loss": 1.05715103, "memory(GiB)": 369.4, "step": 18425, "train_speed(iter/s)": 0.201301 }, { "acc": 0.74310932, "epoch": 0.46752917300862507, "grad_norm": 2.203125, "learning_rate": 9.104821802380974e-06, "loss": 1.05090151, "memory(GiB)": 369.4, "step": 18430, "train_speed(iter/s)": 0.201311 }, { "acc": 0.75564327, "epoch": 0.4676560121765601, "grad_norm": 1.9140625, "learning_rate": 9.104222969551838e-06, "loss": 0.97338514, "memory(GiB)": 369.4, "step": 18435, "train_speed(iter/s)": 0.201319 }, { "acc": 0.7460372, "epoch": 0.46778285134449515, "grad_norm": 2.28125, "learning_rate": 9.103623956200654e-06, "loss": 1.03682327, "memory(GiB)": 369.4, "step": 18440, "train_speed(iter/s)": 0.201327 }, { "acc": 0.73067536, "epoch": 0.46790969051243025, "grad_norm": 2.265625, "learning_rate": 9.103024762353766e-06, "loss": 1.08606596, "memory(GiB)": 369.4, "step": 18445, "train_speed(iter/s)": 0.20133 }, { "acc": 0.75299568, "epoch": 0.4680365296803653, "grad_norm": 1.984375, "learning_rate": 9.102425388037527e-06, "loss": 0.96563549, "memory(GiB)": 369.4, "step": 18450, "train_speed(iter/s)": 0.201335 }, { "acc": 0.74419122, "epoch": 0.46816336884830034, "grad_norm": 2.21875, "learning_rate": 9.101825833278308e-06, "loss": 1.02019224, "memory(GiB)": 369.4, "step": 18455, "train_speed(iter/s)": 0.201341 }, { "acc": 0.74968033, "epoch": 0.4682902080162354, "grad_norm": 2.015625, "learning_rate": 9.101226098102473e-06, "loss": 0.96824627, "memory(GiB)": 369.4, "step": 18460, "train_speed(iter/s)": 0.201351 }, { "acc": 0.75378695, "epoch": 0.4684170471841705, "grad_norm": 2.796875, "learning_rate": 9.100626182536405e-06, "loss": 1.03495474, "memory(GiB)": 369.4, "step": 18465, "train_speed(iter/s)": 0.201355 }, { "acc": 0.74845366, "epoch": 0.46854388635210553, "grad_norm": 2.234375, "learning_rate": 9.100026086606488e-06, "loss": 0.9960598, "memory(GiB)": 369.4, "step": 18470, "train_speed(iter/s)": 0.201366 }, { "acc": 0.76465788, "epoch": 0.4686707255200406, "grad_norm": 2.078125, "learning_rate": 9.09942581033912e-06, "loss": 0.96118689, "memory(GiB)": 369.4, "step": 18475, "train_speed(iter/s)": 0.201374 }, { "acc": 0.75912762, "epoch": 0.4687975646879756, "grad_norm": 2.234375, "learning_rate": 9.0988253537607e-06, "loss": 0.94209642, "memory(GiB)": 369.4, "step": 18480, "train_speed(iter/s)": 0.201382 }, { "acc": 0.74613481, "epoch": 0.4689244038559107, "grad_norm": 1.8671875, "learning_rate": 9.098224716897644e-06, "loss": 1.01738291, "memory(GiB)": 369.4, "step": 18485, "train_speed(iter/s)": 0.201386 }, { "acc": 0.74618745, "epoch": 0.46905124302384577, "grad_norm": 2.1875, "learning_rate": 9.097623899776366e-06, "loss": 0.99782791, "memory(GiB)": 369.4, "step": 18490, "train_speed(iter/s)": 0.201395 }, { "acc": 0.73871288, "epoch": 0.4691780821917808, "grad_norm": 2.34375, "learning_rate": 9.097022902423294e-06, "loss": 1.0008482, "memory(GiB)": 369.4, "step": 18495, "train_speed(iter/s)": 0.201406 }, { "acc": 0.74114981, "epoch": 0.46930492135971585, "grad_norm": 2.078125, "learning_rate": 9.096421724864864e-06, "loss": 1.01811295, "memory(GiB)": 369.4, "step": 18500, "train_speed(iter/s)": 0.201414 }, { "acc": 0.74485998, "epoch": 0.46943176052765095, "grad_norm": 2.015625, "learning_rate": 9.095820367127517e-06, "loss": 1.03733749, "memory(GiB)": 369.4, "step": 18505, "train_speed(iter/s)": 0.201418 }, { "acc": 0.74239149, "epoch": 0.469558599695586, "grad_norm": 2.046875, "learning_rate": 9.095218829237703e-06, "loss": 1.05519924, "memory(GiB)": 369.4, "step": 18510, "train_speed(iter/s)": 0.201428 }, { "acc": 0.72919111, "epoch": 0.46968543886352104, "grad_norm": 2.265625, "learning_rate": 9.094617111221881e-06, "loss": 1.08293209, "memory(GiB)": 369.4, "step": 18515, "train_speed(iter/s)": 0.201435 }, { "acc": 0.74336905, "epoch": 0.4698122780314561, "grad_norm": 1.8359375, "learning_rate": 9.09401521310652e-06, "loss": 0.97855988, "memory(GiB)": 369.4, "step": 18520, "train_speed(iter/s)": 0.201445 }, { "acc": 0.75735598, "epoch": 0.4699391171993912, "grad_norm": 2.046875, "learning_rate": 9.093413134918088e-06, "loss": 0.9752039, "memory(GiB)": 369.4, "step": 18525, "train_speed(iter/s)": 0.201457 }, { "acc": 0.75449429, "epoch": 0.47006595636732623, "grad_norm": 2.109375, "learning_rate": 9.09281087668307e-06, "loss": 1.03521996, "memory(GiB)": 369.4, "step": 18530, "train_speed(iter/s)": 0.201461 }, { "acc": 0.75950627, "epoch": 0.4701927955352613, "grad_norm": 2.109375, "learning_rate": 9.09220843842796e-06, "loss": 0.99279594, "memory(GiB)": 369.4, "step": 18535, "train_speed(iter/s)": 0.201469 }, { "acc": 0.74284973, "epoch": 0.4703196347031963, "grad_norm": 1.9921875, "learning_rate": 9.09160582017925e-06, "loss": 0.99825459, "memory(GiB)": 369.4, "step": 18540, "train_speed(iter/s)": 0.201479 }, { "acc": 0.72788281, "epoch": 0.4704464738711314, "grad_norm": 1.9453125, "learning_rate": 9.091003021963449e-06, "loss": 1.10194626, "memory(GiB)": 369.4, "step": 18545, "train_speed(iter/s)": 0.201491 }, { "acc": 0.74397573, "epoch": 0.47057331303906647, "grad_norm": 2.03125, "learning_rate": 9.09040004380707e-06, "loss": 1.03794537, "memory(GiB)": 369.4, "step": 18550, "train_speed(iter/s)": 0.201496 }, { "acc": 0.73747349, "epoch": 0.4707001522070015, "grad_norm": 2.203125, "learning_rate": 9.089796885736637e-06, "loss": 0.96685619, "memory(GiB)": 369.4, "step": 18555, "train_speed(iter/s)": 0.201504 }, { "acc": 0.74972258, "epoch": 0.47082699137493655, "grad_norm": 1.890625, "learning_rate": 9.089193547778674e-06, "loss": 0.99012318, "memory(GiB)": 369.4, "step": 18560, "train_speed(iter/s)": 0.201509 }, { "acc": 0.74817619, "epoch": 0.47095383054287165, "grad_norm": 2.09375, "learning_rate": 9.088590029959724e-06, "loss": 1.00624094, "memory(GiB)": 369.4, "step": 18565, "train_speed(iter/s)": 0.201517 }, { "acc": 0.72077961, "epoch": 0.4710806697108067, "grad_norm": 2.203125, "learning_rate": 9.08798633230633e-06, "loss": 1.11493988, "memory(GiB)": 369.4, "step": 18570, "train_speed(iter/s)": 0.201528 }, { "acc": 0.74492478, "epoch": 0.47120750887874174, "grad_norm": 1.9140625, "learning_rate": 9.087382454845044e-06, "loss": 1.05957279, "memory(GiB)": 369.4, "step": 18575, "train_speed(iter/s)": 0.201537 }, { "acc": 0.73371563, "epoch": 0.4713343480466768, "grad_norm": 1.9296875, "learning_rate": 9.08677839760243e-06, "loss": 1.06236496, "memory(GiB)": 369.4, "step": 18580, "train_speed(iter/s)": 0.201542 }, { "acc": 0.73251843, "epoch": 0.4714611872146119, "grad_norm": 2.140625, "learning_rate": 9.086174160605055e-06, "loss": 1.02490253, "memory(GiB)": 369.4, "step": 18585, "train_speed(iter/s)": 0.201553 }, { "acc": 0.73921342, "epoch": 0.47158802638254693, "grad_norm": 1.9375, "learning_rate": 9.085569743879498e-06, "loss": 1.05070477, "memory(GiB)": 369.4, "step": 18590, "train_speed(iter/s)": 0.20156 }, { "acc": 0.74499311, "epoch": 0.471714865550482, "grad_norm": 2.0625, "learning_rate": 9.084965147452342e-06, "loss": 0.99292154, "memory(GiB)": 369.4, "step": 18595, "train_speed(iter/s)": 0.201568 }, { "acc": 0.7376174, "epoch": 0.471841704718417, "grad_norm": 2.03125, "learning_rate": 9.084360371350182e-06, "loss": 1.03293762, "memory(GiB)": 369.4, "step": 18600, "train_speed(iter/s)": 0.201571 }, { "acc": 0.74104486, "epoch": 0.4719685438863521, "grad_norm": 2.046875, "learning_rate": 9.083755415599617e-06, "loss": 1.01719151, "memory(GiB)": 369.4, "step": 18605, "train_speed(iter/s)": 0.201579 }, { "acc": 0.75069818, "epoch": 0.47209538305428717, "grad_norm": 2.03125, "learning_rate": 9.083150280227255e-06, "loss": 0.99174986, "memory(GiB)": 369.4, "step": 18610, "train_speed(iter/s)": 0.201586 }, { "acc": 0.7459219, "epoch": 0.4722222222222222, "grad_norm": 2.03125, "learning_rate": 9.082544965259716e-06, "loss": 1.00707207, "memory(GiB)": 369.4, "step": 18615, "train_speed(iter/s)": 0.201579 }, { "acc": 0.75396585, "epoch": 0.47234906139015725, "grad_norm": 2.03125, "learning_rate": 9.081939470723619e-06, "loss": 0.9932827, "memory(GiB)": 369.4, "step": 18620, "train_speed(iter/s)": 0.201586 }, { "acc": 0.74039297, "epoch": 0.47247590055809235, "grad_norm": 2.609375, "learning_rate": 9.081333796645603e-06, "loss": 1.0448595, "memory(GiB)": 369.4, "step": 18625, "train_speed(iter/s)": 0.201596 }, { "acc": 0.75138474, "epoch": 0.4726027397260274, "grad_norm": 2.46875, "learning_rate": 9.080727943052304e-06, "loss": 0.96824827, "memory(GiB)": 369.4, "step": 18630, "train_speed(iter/s)": 0.201595 }, { "acc": 0.74020357, "epoch": 0.47272957889396244, "grad_norm": 2.171875, "learning_rate": 9.080121909970369e-06, "loss": 1.0402113, "memory(GiB)": 369.4, "step": 18635, "train_speed(iter/s)": 0.201601 }, { "acc": 0.75094118, "epoch": 0.4728564180618975, "grad_norm": 2.359375, "learning_rate": 9.079515697426457e-06, "loss": 0.9967104, "memory(GiB)": 369.4, "step": 18640, "train_speed(iter/s)": 0.20161 }, { "acc": 0.74127913, "epoch": 0.4729832572298326, "grad_norm": 2.109375, "learning_rate": 9.078909305447231e-06, "loss": 1.04163246, "memory(GiB)": 369.4, "step": 18645, "train_speed(iter/s)": 0.201617 }, { "acc": 0.75407581, "epoch": 0.47311009639776763, "grad_norm": 2.09375, "learning_rate": 9.078302734059363e-06, "loss": 0.93974447, "memory(GiB)": 369.4, "step": 18650, "train_speed(iter/s)": 0.201627 }, { "acc": 0.75285273, "epoch": 0.4732369355657027, "grad_norm": 2.296875, "learning_rate": 9.077695983289531e-06, "loss": 0.96603746, "memory(GiB)": 369.4, "step": 18655, "train_speed(iter/s)": 0.201633 }, { "acc": 0.73960667, "epoch": 0.4733637747336377, "grad_norm": 1.9375, "learning_rate": 9.077089053164426e-06, "loss": 1.02610149, "memory(GiB)": 369.4, "step": 18660, "train_speed(iter/s)": 0.201643 }, { "acc": 0.75593548, "epoch": 0.4734906139015728, "grad_norm": 2.046875, "learning_rate": 9.076481943710742e-06, "loss": 0.97033405, "memory(GiB)": 369.4, "step": 18665, "train_speed(iter/s)": 0.201649 }, { "acc": 0.74374518, "epoch": 0.47361745306950787, "grad_norm": 2.59375, "learning_rate": 9.075874654955178e-06, "loss": 1.01048279, "memory(GiB)": 369.4, "step": 18670, "train_speed(iter/s)": 0.201658 }, { "acc": 0.73603015, "epoch": 0.4737442922374429, "grad_norm": 1.8984375, "learning_rate": 9.075267186924453e-06, "loss": 1.03084412, "memory(GiB)": 369.4, "step": 18675, "train_speed(iter/s)": 0.201666 }, { "acc": 0.7384552, "epoch": 0.47387113140537795, "grad_norm": 2.28125, "learning_rate": 9.074659539645281e-06, "loss": 0.98799477, "memory(GiB)": 369.4, "step": 18680, "train_speed(iter/s)": 0.201673 }, { "acc": 0.73277097, "epoch": 0.47399797057331305, "grad_norm": 2.28125, "learning_rate": 9.074051713144392e-06, "loss": 1.0857439, "memory(GiB)": 369.4, "step": 18685, "train_speed(iter/s)": 0.201684 }, { "acc": 0.75202475, "epoch": 0.4741248097412481, "grad_norm": 2.25, "learning_rate": 9.073443707448517e-06, "loss": 1.02772379, "memory(GiB)": 369.4, "step": 18690, "train_speed(iter/s)": 0.201693 }, { "acc": 0.73196716, "epoch": 0.47425164890918314, "grad_norm": 2.390625, "learning_rate": 9.072835522584402e-06, "loss": 1.03646107, "memory(GiB)": 369.4, "step": 18695, "train_speed(iter/s)": 0.201701 }, { "acc": 0.74896746, "epoch": 0.4743784880771182, "grad_norm": 2.359375, "learning_rate": 9.072227158578798e-06, "loss": 0.98809719, "memory(GiB)": 369.4, "step": 18700, "train_speed(iter/s)": 0.201707 }, { "acc": 0.73584399, "epoch": 0.4745053272450533, "grad_norm": 2.40625, "learning_rate": 9.07161861545846e-06, "loss": 1.01601801, "memory(GiB)": 369.4, "step": 18705, "train_speed(iter/s)": 0.201711 }, { "acc": 0.74074278, "epoch": 0.47463216641298833, "grad_norm": 2.515625, "learning_rate": 9.071009893250158e-06, "loss": 1.05176468, "memory(GiB)": 369.4, "step": 18710, "train_speed(iter/s)": 0.201721 }, { "acc": 0.7397789, "epoch": 0.4747590055809234, "grad_norm": 2.328125, "learning_rate": 9.070400991980666e-06, "loss": 1.04555283, "memory(GiB)": 369.4, "step": 18715, "train_speed(iter/s)": 0.201728 }, { "acc": 0.75876322, "epoch": 0.4748858447488584, "grad_norm": 2.09375, "learning_rate": 9.069791911676765e-06, "loss": 1.01242218, "memory(GiB)": 369.4, "step": 18720, "train_speed(iter/s)": 0.201736 }, { "acc": 0.7591217, "epoch": 0.4750126839167935, "grad_norm": 2.140625, "learning_rate": 9.069182652365245e-06, "loss": 1.01703033, "memory(GiB)": 369.4, "step": 18725, "train_speed(iter/s)": 0.201746 }, { "acc": 0.74016953, "epoch": 0.47513952308472857, "grad_norm": 2.265625, "learning_rate": 9.068573214072904e-06, "loss": 1.12291517, "memory(GiB)": 369.4, "step": 18730, "train_speed(iter/s)": 0.201756 }, { "acc": 0.73800907, "epoch": 0.4752663622526636, "grad_norm": 1.8125, "learning_rate": 9.067963596826547e-06, "loss": 1.04881868, "memory(GiB)": 369.4, "step": 18735, "train_speed(iter/s)": 0.20176 }, { "acc": 0.75040588, "epoch": 0.47539320142059865, "grad_norm": 2.03125, "learning_rate": 9.067353800652991e-06, "loss": 1.01752872, "memory(GiB)": 369.4, "step": 18740, "train_speed(iter/s)": 0.201769 }, { "acc": 0.74495754, "epoch": 0.47552004058853375, "grad_norm": 2.265625, "learning_rate": 9.066743825579056e-06, "loss": 1.02004051, "memory(GiB)": 369.4, "step": 18745, "train_speed(iter/s)": 0.201779 }, { "acc": 0.75610228, "epoch": 0.4756468797564688, "grad_norm": 2.421875, "learning_rate": 9.06613367163157e-06, "loss": 0.96661243, "memory(GiB)": 369.4, "step": 18750, "train_speed(iter/s)": 0.201784 }, { "acc": 0.74345207, "epoch": 0.47577371892440384, "grad_norm": 2.140625, "learning_rate": 9.06552333883737e-06, "loss": 1.01322041, "memory(GiB)": 369.4, "step": 18755, "train_speed(iter/s)": 0.201793 }, { "acc": 0.7457201, "epoch": 0.4759005580923389, "grad_norm": 2.0625, "learning_rate": 9.064912827223303e-06, "loss": 1.03343849, "memory(GiB)": 369.4, "step": 18760, "train_speed(iter/s)": 0.201801 }, { "acc": 0.74563408, "epoch": 0.476027397260274, "grad_norm": 2.109375, "learning_rate": 9.06430213681622e-06, "loss": 0.9748064, "memory(GiB)": 369.4, "step": 18765, "train_speed(iter/s)": 0.201808 }, { "acc": 0.72773294, "epoch": 0.47615423642820903, "grad_norm": 2.25, "learning_rate": 9.063691267642987e-06, "loss": 1.09067163, "memory(GiB)": 369.4, "step": 18770, "train_speed(iter/s)": 0.201815 }, { "acc": 0.74684744, "epoch": 0.4762810755961441, "grad_norm": 2.265625, "learning_rate": 9.063080219730467e-06, "loss": 1.02429886, "memory(GiB)": 369.4, "step": 18775, "train_speed(iter/s)": 0.201824 }, { "acc": 0.74440451, "epoch": 0.4764079147640791, "grad_norm": 2.171875, "learning_rate": 9.062468993105538e-06, "loss": 1.0157052, "memory(GiB)": 369.4, "step": 18780, "train_speed(iter/s)": 0.201832 }, { "acc": 0.77109127, "epoch": 0.4765347539320142, "grad_norm": 2.203125, "learning_rate": 9.061857587795084e-06, "loss": 0.91647348, "memory(GiB)": 369.4, "step": 18785, "train_speed(iter/s)": 0.201843 }, { "acc": 0.74701867, "epoch": 0.47666159309994927, "grad_norm": 2.4375, "learning_rate": 9.061246003826e-06, "loss": 1.08287258, "memory(GiB)": 369.4, "step": 18790, "train_speed(iter/s)": 0.201846 }, { "acc": 0.7398138, "epoch": 0.4767884322678843, "grad_norm": 2.390625, "learning_rate": 9.060634241225184e-06, "loss": 1.04477406, "memory(GiB)": 369.4, "step": 18795, "train_speed(iter/s)": 0.201848 }, { "acc": 0.74300756, "epoch": 0.47691527143581935, "grad_norm": 2.203125, "learning_rate": 9.060022300019546e-06, "loss": 1.00625515, "memory(GiB)": 369.4, "step": 18800, "train_speed(iter/s)": 0.201858 }, { "acc": 0.75290909, "epoch": 0.47704211060375445, "grad_norm": 2.40625, "learning_rate": 9.059410180236e-06, "loss": 1.006213, "memory(GiB)": 369.4, "step": 18805, "train_speed(iter/s)": 0.201861 }, { "acc": 0.75478182, "epoch": 0.4771689497716895, "grad_norm": 1.890625, "learning_rate": 9.058797881901469e-06, "loss": 0.97661285, "memory(GiB)": 369.4, "step": 18810, "train_speed(iter/s)": 0.201865 }, { "acc": 0.76516933, "epoch": 0.47729578893962454, "grad_norm": 2.421875, "learning_rate": 9.058185405042886e-06, "loss": 0.94088974, "memory(GiB)": 369.4, "step": 18815, "train_speed(iter/s)": 0.201873 }, { "acc": 0.74562674, "epoch": 0.4774226281075596, "grad_norm": 2.078125, "learning_rate": 9.05757274968719e-06, "loss": 1.06543522, "memory(GiB)": 369.4, "step": 18820, "train_speed(iter/s)": 0.201876 }, { "acc": 0.74435282, "epoch": 0.4775494672754947, "grad_norm": 2.234375, "learning_rate": 9.056959915861331e-06, "loss": 0.98993549, "memory(GiB)": 369.4, "step": 18825, "train_speed(iter/s)": 0.201885 }, { "acc": 0.74456048, "epoch": 0.47767630644342973, "grad_norm": 2.4375, "learning_rate": 9.056346903592262e-06, "loss": 1.03246593, "memory(GiB)": 369.4, "step": 18830, "train_speed(iter/s)": 0.201892 }, { "acc": 0.73751173, "epoch": 0.4778031456113648, "grad_norm": 1.734375, "learning_rate": 9.055733712906943e-06, "loss": 1.03417063, "memory(GiB)": 369.4, "step": 18835, "train_speed(iter/s)": 0.201895 }, { "acc": 0.72990489, "epoch": 0.4779299847792998, "grad_norm": 2.09375, "learning_rate": 9.05512034383235e-06, "loss": 1.02644062, "memory(GiB)": 369.4, "step": 18840, "train_speed(iter/s)": 0.201902 }, { "acc": 0.74235411, "epoch": 0.4780568239472349, "grad_norm": 2.21875, "learning_rate": 9.054506796395458e-06, "loss": 1.00917511, "memory(GiB)": 369.4, "step": 18845, "train_speed(iter/s)": 0.201911 }, { "acc": 0.73832521, "epoch": 0.47818366311516997, "grad_norm": 2.1875, "learning_rate": 9.053893070623256e-06, "loss": 1.06988811, "memory(GiB)": 369.4, "step": 18850, "train_speed(iter/s)": 0.201918 }, { "acc": 0.75037956, "epoch": 0.478310502283105, "grad_norm": 2.328125, "learning_rate": 9.053279166542738e-06, "loss": 1.03391047, "memory(GiB)": 369.4, "step": 18855, "train_speed(iter/s)": 0.201929 }, { "acc": 0.76027851, "epoch": 0.47843734145104005, "grad_norm": 2.3125, "learning_rate": 9.052665084180906e-06, "loss": 1.00297585, "memory(GiB)": 369.4, "step": 18860, "train_speed(iter/s)": 0.201938 }, { "acc": 0.73866081, "epoch": 0.47856418061897515, "grad_norm": 2.046875, "learning_rate": 9.052050823564767e-06, "loss": 1.02931175, "memory(GiB)": 369.4, "step": 18865, "train_speed(iter/s)": 0.201948 }, { "acc": 0.75394034, "epoch": 0.4786910197869102, "grad_norm": 1.921875, "learning_rate": 9.051436384721344e-06, "loss": 1.02706795, "memory(GiB)": 369.4, "step": 18870, "train_speed(iter/s)": 0.201952 }, { "acc": 0.76800032, "epoch": 0.47881785895484524, "grad_norm": 2.109375, "learning_rate": 9.05082176767766e-06, "loss": 0.94348373, "memory(GiB)": 369.4, "step": 18875, "train_speed(iter/s)": 0.201961 }, { "acc": 0.74575257, "epoch": 0.4789446981227803, "grad_norm": 1.7109375, "learning_rate": 9.050206972460749e-06, "loss": 1.00846081, "memory(GiB)": 369.4, "step": 18880, "train_speed(iter/s)": 0.20197 }, { "acc": 0.74951625, "epoch": 0.4790715372907154, "grad_norm": 2.4375, "learning_rate": 9.049591999097651e-06, "loss": 1.02026796, "memory(GiB)": 369.4, "step": 18885, "train_speed(iter/s)": 0.201976 }, { "acc": 0.73677311, "epoch": 0.47919837645865043, "grad_norm": 2.328125, "learning_rate": 9.048976847615418e-06, "loss": 1.03134871, "memory(GiB)": 369.4, "step": 18890, "train_speed(iter/s)": 0.201984 }, { "acc": 0.74006233, "epoch": 0.4793252156265855, "grad_norm": 2.0625, "learning_rate": 9.048361518041107e-06, "loss": 1.04225655, "memory(GiB)": 369.4, "step": 18895, "train_speed(iter/s)": 0.20199 }, { "acc": 0.75306463, "epoch": 0.4794520547945205, "grad_norm": 1.9375, "learning_rate": 9.04774601040178e-06, "loss": 0.99375, "memory(GiB)": 369.4, "step": 18900, "train_speed(iter/s)": 0.202 }, { "acc": 0.74716654, "epoch": 0.4795788939624556, "grad_norm": 2.609375, "learning_rate": 9.047130324724513e-06, "loss": 1.006217, "memory(GiB)": 369.4, "step": 18905, "train_speed(iter/s)": 0.202004 }, { "acc": 0.74839487, "epoch": 0.47970573313039067, "grad_norm": 3.015625, "learning_rate": 9.046514461036385e-06, "loss": 1.01294498, "memory(GiB)": 369.4, "step": 18910, "train_speed(iter/s)": 0.20201 }, { "acc": 0.73906403, "epoch": 0.4798325722983257, "grad_norm": 1.90625, "learning_rate": 9.045898419364483e-06, "loss": 1.03512249, "memory(GiB)": 369.4, "step": 18915, "train_speed(iter/s)": 0.202019 }, { "acc": 0.75348892, "epoch": 0.47995941146626075, "grad_norm": 1.765625, "learning_rate": 9.045282199735906e-06, "loss": 0.96665945, "memory(GiB)": 369.4, "step": 18920, "train_speed(iter/s)": 0.202029 }, { "acc": 0.73405032, "epoch": 0.48008625063419585, "grad_norm": 2.421875, "learning_rate": 9.044665802177756e-06, "loss": 1.10212555, "memory(GiB)": 369.4, "step": 18925, "train_speed(iter/s)": 0.202038 }, { "acc": 0.74413428, "epoch": 0.4802130898021309, "grad_norm": 3.15625, "learning_rate": 9.044049226717148e-06, "loss": 1.00382652, "memory(GiB)": 369.4, "step": 18930, "train_speed(iter/s)": 0.202045 }, { "acc": 0.75623269, "epoch": 0.48033992897006594, "grad_norm": 1.9296875, "learning_rate": 9.043432473381198e-06, "loss": 1.01225452, "memory(GiB)": 369.4, "step": 18935, "train_speed(iter/s)": 0.202054 }, { "acc": 0.74482837, "epoch": 0.480466768138001, "grad_norm": 2.421875, "learning_rate": 9.042815542197037e-06, "loss": 1.05523081, "memory(GiB)": 369.4, "step": 18940, "train_speed(iter/s)": 0.202059 }, { "acc": 0.74103289, "epoch": 0.4805936073059361, "grad_norm": 2.265625, "learning_rate": 9.042198433191796e-06, "loss": 1.00112953, "memory(GiB)": 369.4, "step": 18945, "train_speed(iter/s)": 0.202063 }, { "acc": 0.74685965, "epoch": 0.48072044647387113, "grad_norm": 2.609375, "learning_rate": 9.041581146392621e-06, "loss": 1.0309721, "memory(GiB)": 369.4, "step": 18950, "train_speed(iter/s)": 0.202066 }, { "acc": 0.74332533, "epoch": 0.4808472856418062, "grad_norm": 2.46875, "learning_rate": 9.040963681826665e-06, "loss": 0.98984699, "memory(GiB)": 369.4, "step": 18955, "train_speed(iter/s)": 0.202078 }, { "acc": 0.74683962, "epoch": 0.4809741248097412, "grad_norm": 2.125, "learning_rate": 9.040346039521085e-06, "loss": 1.01896944, "memory(GiB)": 369.4, "step": 18960, "train_speed(iter/s)": 0.202083 }, { "acc": 0.75092611, "epoch": 0.4811009639776763, "grad_norm": 2.203125, "learning_rate": 9.039728219503044e-06, "loss": 1.01488457, "memory(GiB)": 369.4, "step": 18965, "train_speed(iter/s)": 0.202088 }, { "acc": 0.75593634, "epoch": 0.48122780314561137, "grad_norm": 2.015625, "learning_rate": 9.039110221799721e-06, "loss": 0.96180992, "memory(GiB)": 369.4, "step": 18970, "train_speed(iter/s)": 0.202094 }, { "acc": 0.74050379, "epoch": 0.4813546423135464, "grad_norm": 2.1875, "learning_rate": 9.038492046438298e-06, "loss": 1.03211517, "memory(GiB)": 369.4, "step": 18975, "train_speed(iter/s)": 0.202103 }, { "acc": 0.74836941, "epoch": 0.48148148148148145, "grad_norm": 2.328125, "learning_rate": 9.037873693445965e-06, "loss": 1.02733021, "memory(GiB)": 369.4, "step": 18980, "train_speed(iter/s)": 0.202108 }, { "acc": 0.75943785, "epoch": 0.48160832064941655, "grad_norm": 1.9375, "learning_rate": 9.037255162849918e-06, "loss": 0.98776054, "memory(GiB)": 369.4, "step": 18985, "train_speed(iter/s)": 0.202119 }, { "acc": 0.74520254, "epoch": 0.4817351598173516, "grad_norm": 2.1875, "learning_rate": 9.036636454677363e-06, "loss": 1.02386694, "memory(GiB)": 369.4, "step": 18990, "train_speed(iter/s)": 0.202126 }, { "acc": 0.739149, "epoch": 0.48186199898528664, "grad_norm": 2.0625, "learning_rate": 9.036017568955516e-06, "loss": 1.02820473, "memory(GiB)": 369.4, "step": 18995, "train_speed(iter/s)": 0.202131 }, { "acc": 0.74203005, "epoch": 0.4819888381532217, "grad_norm": 1.828125, "learning_rate": 9.035398505711597e-06, "loss": 1.04708271, "memory(GiB)": 369.4, "step": 19000, "train_speed(iter/s)": 0.202142 }, { "epoch": 0.4819888381532217, "eval_acc": 0.7344061040817119, "eval_loss": 0.9868600368499756, "eval_runtime": 384.5633, "eval_samples_per_second": 16.564, "eval_steps_per_second": 8.282, "step": 19000 }, { "acc": 0.74027781, "epoch": 0.4821156773211568, "grad_norm": 2.4375, "learning_rate": 9.034779264972834e-06, "loss": 1.03981371, "memory(GiB)": 369.4, "step": 19005, "train_speed(iter/s)": 0.200627 }, { "acc": 0.74313817, "epoch": 0.48224251648909183, "grad_norm": 1.8125, "learning_rate": 9.034159846766464e-06, "loss": 1.01767216, "memory(GiB)": 369.4, "step": 19010, "train_speed(iter/s)": 0.200635 }, { "acc": 0.7531487, "epoch": 0.4823693556570269, "grad_norm": 2.1875, "learning_rate": 9.033540251119734e-06, "loss": 0.92900515, "memory(GiB)": 369.4, "step": 19015, "train_speed(iter/s)": 0.200646 }, { "acc": 0.74743433, "epoch": 0.4824961948249619, "grad_norm": 1.9765625, "learning_rate": 9.032920478059897e-06, "loss": 0.99923496, "memory(GiB)": 369.4, "step": 19020, "train_speed(iter/s)": 0.200652 }, { "acc": 0.75054932, "epoch": 0.482623033992897, "grad_norm": 1.9921875, "learning_rate": 9.032300527614209e-06, "loss": 1.05054588, "memory(GiB)": 369.4, "step": 19025, "train_speed(iter/s)": 0.200661 }, { "acc": 0.73960085, "epoch": 0.48274987316083207, "grad_norm": 2.265625, "learning_rate": 9.031680399809941e-06, "loss": 1.04085855, "memory(GiB)": 369.4, "step": 19030, "train_speed(iter/s)": 0.20067 }, { "acc": 0.72885022, "epoch": 0.4828767123287671, "grad_norm": 2.03125, "learning_rate": 9.031060094674371e-06, "loss": 1.10720949, "memory(GiB)": 369.4, "step": 19035, "train_speed(iter/s)": 0.200678 }, { "acc": 0.73991599, "epoch": 0.48300355149670215, "grad_norm": 2.65625, "learning_rate": 9.030439612234778e-06, "loss": 1.00202398, "memory(GiB)": 369.4, "step": 19040, "train_speed(iter/s)": 0.200689 }, { "acc": 0.75121717, "epoch": 0.48313039066463725, "grad_norm": 1.9765625, "learning_rate": 9.029818952518458e-06, "loss": 0.99897146, "memory(GiB)": 369.4, "step": 19045, "train_speed(iter/s)": 0.200698 }, { "acc": 0.76560202, "epoch": 0.4832572298325723, "grad_norm": 2.09375, "learning_rate": 9.029198115552708e-06, "loss": 0.94483242, "memory(GiB)": 369.4, "step": 19050, "train_speed(iter/s)": 0.200705 }, { "acc": 0.74108853, "epoch": 0.48338406900050734, "grad_norm": 2.296875, "learning_rate": 9.028577101364837e-06, "loss": 0.993647, "memory(GiB)": 369.4, "step": 19055, "train_speed(iter/s)": 0.200708 }, { "acc": 0.75079498, "epoch": 0.4835109081684424, "grad_norm": 2.4375, "learning_rate": 9.027955909982157e-06, "loss": 0.99815998, "memory(GiB)": 369.4, "step": 19060, "train_speed(iter/s)": 0.200718 }, { "acc": 0.76567163, "epoch": 0.4836377473363775, "grad_norm": 1.921875, "learning_rate": 9.027334541431993e-06, "loss": 0.92073021, "memory(GiB)": 369.4, "step": 19065, "train_speed(iter/s)": 0.20072 }, { "acc": 0.7474328, "epoch": 0.48376458650431253, "grad_norm": 2.484375, "learning_rate": 9.026712995741676e-06, "loss": 0.96986961, "memory(GiB)": 369.4, "step": 19070, "train_speed(iter/s)": 0.200731 }, { "acc": 0.73041148, "epoch": 0.4838914256722476, "grad_norm": 2.171875, "learning_rate": 9.026091272938543e-06, "loss": 1.06299229, "memory(GiB)": 369.4, "step": 19075, "train_speed(iter/s)": 0.200737 }, { "acc": 0.74875121, "epoch": 0.4840182648401826, "grad_norm": 2.0625, "learning_rate": 9.02546937304994e-06, "loss": 1.01137495, "memory(GiB)": 369.4, "step": 19080, "train_speed(iter/s)": 0.200747 }, { "acc": 0.74774513, "epoch": 0.4841451040081177, "grad_norm": 2.25, "learning_rate": 9.02484729610322e-06, "loss": 1.03610573, "memory(GiB)": 369.4, "step": 19085, "train_speed(iter/s)": 0.200756 }, { "acc": 0.73598137, "epoch": 0.48427194317605277, "grad_norm": 1.96875, "learning_rate": 9.02422504212575e-06, "loss": 1.05454445, "memory(GiB)": 369.4, "step": 19090, "train_speed(iter/s)": 0.200763 }, { "acc": 0.7491312, "epoch": 0.4843987823439878, "grad_norm": 1.8828125, "learning_rate": 9.023602611144893e-06, "loss": 0.99350929, "memory(GiB)": 369.4, "step": 19095, "train_speed(iter/s)": 0.200772 }, { "acc": 0.72979212, "epoch": 0.48452562151192285, "grad_norm": 2.09375, "learning_rate": 9.02298000318803e-06, "loss": 1.00604315, "memory(GiB)": 369.4, "step": 19100, "train_speed(iter/s)": 0.200778 }, { "acc": 0.73503385, "epoch": 0.48465246067985795, "grad_norm": 2.078125, "learning_rate": 9.022357218282546e-06, "loss": 1.01588078, "memory(GiB)": 369.4, "step": 19105, "train_speed(iter/s)": 0.200786 }, { "acc": 0.74103332, "epoch": 0.484779299847793, "grad_norm": 2.296875, "learning_rate": 9.021734256455832e-06, "loss": 1.06712914, "memory(GiB)": 369.4, "step": 19110, "train_speed(iter/s)": 0.200797 }, { "acc": 0.76356678, "epoch": 0.48490613901572804, "grad_norm": 2.03125, "learning_rate": 9.02111111773529e-06, "loss": 0.98059855, "memory(GiB)": 369.4, "step": 19115, "train_speed(iter/s)": 0.200805 }, { "acc": 0.74179659, "epoch": 0.4850329781836631, "grad_norm": 1.953125, "learning_rate": 9.020487802148328e-06, "loss": 0.9636692, "memory(GiB)": 369.4, "step": 19120, "train_speed(iter/s)": 0.200816 }, { "acc": 0.7409822, "epoch": 0.4851598173515982, "grad_norm": 2.1875, "learning_rate": 9.019864309722362e-06, "loss": 1.02288227, "memory(GiB)": 369.4, "step": 19125, "train_speed(iter/s)": 0.200824 }, { "acc": 0.75738201, "epoch": 0.48528665651953323, "grad_norm": 1.7109375, "learning_rate": 9.019240640484816e-06, "loss": 0.96945038, "memory(GiB)": 369.4, "step": 19130, "train_speed(iter/s)": 0.200831 }, { "acc": 0.74603672, "epoch": 0.4854134956874683, "grad_norm": 2.359375, "learning_rate": 9.018616794463124e-06, "loss": 0.98573322, "memory(GiB)": 369.4, "step": 19135, "train_speed(iter/s)": 0.200842 }, { "acc": 0.73989797, "epoch": 0.4855403348554033, "grad_norm": 1.9453125, "learning_rate": 9.017992771684722e-06, "loss": 1.02947388, "memory(GiB)": 369.4, "step": 19140, "train_speed(iter/s)": 0.20085 }, { "acc": 0.74496055, "epoch": 0.4856671740233384, "grad_norm": 2.546875, "learning_rate": 9.017368572177058e-06, "loss": 1.01796494, "memory(GiB)": 369.4, "step": 19145, "train_speed(iter/s)": 0.200859 }, { "acc": 0.74519815, "epoch": 0.48579401319127347, "grad_norm": 2.625, "learning_rate": 9.016744195967588e-06, "loss": 1.00057163, "memory(GiB)": 369.4, "step": 19150, "train_speed(iter/s)": 0.200867 }, { "acc": 0.76026454, "epoch": 0.4859208523592085, "grad_norm": 2.390625, "learning_rate": 9.016119643083777e-06, "loss": 1.02454853, "memory(GiB)": 369.4, "step": 19155, "train_speed(iter/s)": 0.200872 }, { "acc": 0.74653425, "epoch": 0.48604769152714355, "grad_norm": 2.46875, "learning_rate": 9.015494913553091e-06, "loss": 1.02648048, "memory(GiB)": 369.4, "step": 19160, "train_speed(iter/s)": 0.200884 }, { "acc": 0.74573612, "epoch": 0.48617453069507865, "grad_norm": 2.078125, "learning_rate": 9.014870007403012e-06, "loss": 1.00338383, "memory(GiB)": 369.4, "step": 19165, "train_speed(iter/s)": 0.200897 }, { "acc": 0.73848405, "epoch": 0.4863013698630137, "grad_norm": 2.015625, "learning_rate": 9.014244924661026e-06, "loss": 0.99139805, "memory(GiB)": 369.4, "step": 19170, "train_speed(iter/s)": 0.200902 }, { "acc": 0.74019127, "epoch": 0.48642820903094874, "grad_norm": 2.140625, "learning_rate": 9.013619665354626e-06, "loss": 1.0261735, "memory(GiB)": 369.4, "step": 19175, "train_speed(iter/s)": 0.200904 }, { "acc": 0.7318346, "epoch": 0.4865550481988838, "grad_norm": 2.046875, "learning_rate": 9.01299422951131e-06, "loss": 1.07772179, "memory(GiB)": 369.4, "step": 19180, "train_speed(iter/s)": 0.200914 }, { "acc": 0.74358044, "epoch": 0.4866818873668189, "grad_norm": 2.21875, "learning_rate": 9.012368617158593e-06, "loss": 1.03338528, "memory(GiB)": 369.4, "step": 19185, "train_speed(iter/s)": 0.200923 }, { "acc": 0.73255215, "epoch": 0.48680872653475393, "grad_norm": 2.015625, "learning_rate": 9.01174282832399e-06, "loss": 1.05097294, "memory(GiB)": 369.4, "step": 19190, "train_speed(iter/s)": 0.200934 }, { "acc": 0.76048632, "epoch": 0.486935565702689, "grad_norm": 2.390625, "learning_rate": 9.011116863035027e-06, "loss": 0.93219433, "memory(GiB)": 369.4, "step": 19195, "train_speed(iter/s)": 0.200937 }, { "acc": 0.74162016, "epoch": 0.487062404870624, "grad_norm": 2.375, "learning_rate": 9.010490721319237e-06, "loss": 1.02769775, "memory(GiB)": 369.4, "step": 19200, "train_speed(iter/s)": 0.200947 }, { "acc": 0.74998226, "epoch": 0.4871892440385591, "grad_norm": 2.359375, "learning_rate": 9.009864403204157e-06, "loss": 0.98421173, "memory(GiB)": 369.4, "step": 19205, "train_speed(iter/s)": 0.200954 }, { "acc": 0.74242153, "epoch": 0.48731608320649417, "grad_norm": 1.921875, "learning_rate": 9.00923790871734e-06, "loss": 0.97702503, "memory(GiB)": 369.4, "step": 19210, "train_speed(iter/s)": 0.200964 }, { "acc": 0.73822651, "epoch": 0.4874429223744292, "grad_norm": 2.53125, "learning_rate": 9.008611237886339e-06, "loss": 1.07368307, "memory(GiB)": 369.4, "step": 19215, "train_speed(iter/s)": 0.200971 }, { "acc": 0.7428082, "epoch": 0.48756976154236426, "grad_norm": 2.25, "learning_rate": 9.00798439073872e-06, "loss": 1.01727257, "memory(GiB)": 369.4, "step": 19220, "train_speed(iter/s)": 0.200978 }, { "acc": 0.73669071, "epoch": 0.48769660071029935, "grad_norm": 2.984375, "learning_rate": 9.007357367302052e-06, "loss": 1.06455002, "memory(GiB)": 369.4, "step": 19225, "train_speed(iter/s)": 0.200988 }, { "acc": 0.74238868, "epoch": 0.4878234398782344, "grad_norm": 2.09375, "learning_rate": 9.006730167603914e-06, "loss": 1.06767216, "memory(GiB)": 369.4, "step": 19230, "train_speed(iter/s)": 0.200994 }, { "acc": 0.74433646, "epoch": 0.48795027904616944, "grad_norm": 2.265625, "learning_rate": 9.006102791671896e-06, "loss": 1.02974548, "memory(GiB)": 369.4, "step": 19235, "train_speed(iter/s)": 0.201002 }, { "acc": 0.74465804, "epoch": 0.4880771182141045, "grad_norm": 2.390625, "learning_rate": 9.005475239533591e-06, "loss": 1.0362361, "memory(GiB)": 369.4, "step": 19240, "train_speed(iter/s)": 0.201011 }, { "acc": 0.73782921, "epoch": 0.4882039573820396, "grad_norm": 2.34375, "learning_rate": 9.0048475112166e-06, "loss": 1.02297087, "memory(GiB)": 369.4, "step": 19245, "train_speed(iter/s)": 0.201016 }, { "acc": 0.73502212, "epoch": 0.48833079654997463, "grad_norm": 1.9296875, "learning_rate": 9.00421960674854e-06, "loss": 1.02142811, "memory(GiB)": 369.4, "step": 19250, "train_speed(iter/s)": 0.201024 }, { "acc": 0.73705316, "epoch": 0.4884576357179097, "grad_norm": 2.140625, "learning_rate": 9.003591526157021e-06, "loss": 1.03093977, "memory(GiB)": 369.4, "step": 19255, "train_speed(iter/s)": 0.201031 }, { "acc": 0.74142132, "epoch": 0.4885844748858447, "grad_norm": 2.359375, "learning_rate": 9.002963269469672e-06, "loss": 1.02764521, "memory(GiB)": 369.4, "step": 19260, "train_speed(iter/s)": 0.201039 }, { "acc": 0.75847034, "epoch": 0.4887113140537798, "grad_norm": 2.078125, "learning_rate": 9.002334836714126e-06, "loss": 0.96726885, "memory(GiB)": 369.4, "step": 19265, "train_speed(iter/s)": 0.201045 }, { "acc": 0.74485912, "epoch": 0.48883815322171487, "grad_norm": 1.953125, "learning_rate": 9.001706227918023e-06, "loss": 0.99493713, "memory(GiB)": 369.4, "step": 19270, "train_speed(iter/s)": 0.20105 }, { "acc": 0.7398325, "epoch": 0.4889649923896499, "grad_norm": 2.125, "learning_rate": 9.001077443109016e-06, "loss": 1.0345149, "memory(GiB)": 369.4, "step": 19275, "train_speed(iter/s)": 0.201059 }, { "acc": 0.73397274, "epoch": 0.48909183155758496, "grad_norm": 2.015625, "learning_rate": 9.00044848231476e-06, "loss": 1.0448164, "memory(GiB)": 369.4, "step": 19280, "train_speed(iter/s)": 0.201065 }, { "acc": 0.76054401, "epoch": 0.48921867072552006, "grad_norm": 1.8984375, "learning_rate": 8.999819345562919e-06, "loss": 0.97253742, "memory(GiB)": 369.4, "step": 19285, "train_speed(iter/s)": 0.201072 }, { "acc": 0.73609838, "epoch": 0.4893455098934551, "grad_norm": 2.171875, "learning_rate": 8.999190032881165e-06, "loss": 1.05071926, "memory(GiB)": 369.4, "step": 19290, "train_speed(iter/s)": 0.201079 }, { "acc": 0.75894284, "epoch": 0.48947234906139014, "grad_norm": 2.171875, "learning_rate": 8.998560544297176e-06, "loss": 0.96870899, "memory(GiB)": 369.4, "step": 19295, "train_speed(iter/s)": 0.201082 }, { "acc": 0.74831133, "epoch": 0.4895991882293252, "grad_norm": 2.0, "learning_rate": 8.997930879838646e-06, "loss": 0.97211533, "memory(GiB)": 369.4, "step": 19300, "train_speed(iter/s)": 0.201087 }, { "acc": 0.74166765, "epoch": 0.4897260273972603, "grad_norm": 2.046875, "learning_rate": 8.997301039533264e-06, "loss": 1.04326649, "memory(GiB)": 369.4, "step": 19305, "train_speed(iter/s)": 0.201095 }, { "acc": 0.74395719, "epoch": 0.48985286656519533, "grad_norm": 2.25, "learning_rate": 8.996671023408737e-06, "loss": 1.03160677, "memory(GiB)": 369.4, "step": 19310, "train_speed(iter/s)": 0.201101 }, { "acc": 0.74671164, "epoch": 0.4899797057331304, "grad_norm": 2.125, "learning_rate": 8.996040831492772e-06, "loss": 1.02324085, "memory(GiB)": 369.4, "step": 19315, "train_speed(iter/s)": 0.201106 }, { "acc": 0.72518225, "epoch": 0.4901065449010654, "grad_norm": 1.96875, "learning_rate": 8.995410463813093e-06, "loss": 1.00739937, "memory(GiB)": 369.4, "step": 19320, "train_speed(iter/s)": 0.201116 }, { "acc": 0.74383078, "epoch": 0.4902333840690005, "grad_norm": 1.8359375, "learning_rate": 8.99477992039742e-06, "loss": 0.97714272, "memory(GiB)": 369.4, "step": 19325, "train_speed(iter/s)": 0.201125 }, { "acc": 0.74613514, "epoch": 0.49036022323693557, "grad_norm": 2.6875, "learning_rate": 8.994149201273495e-06, "loss": 0.99876671, "memory(GiB)": 369.4, "step": 19330, "train_speed(iter/s)": 0.201137 }, { "acc": 0.74680853, "epoch": 0.4904870624048706, "grad_norm": 2.28125, "learning_rate": 8.993518306469052e-06, "loss": 0.98572044, "memory(GiB)": 369.4, "step": 19335, "train_speed(iter/s)": 0.201143 }, { "acc": 0.71719723, "epoch": 0.49061390157280566, "grad_norm": 2.5, "learning_rate": 8.992887236011847e-06, "loss": 1.11260471, "memory(GiB)": 369.4, "step": 19340, "train_speed(iter/s)": 0.20115 }, { "acc": 0.73788853, "epoch": 0.49074074074074076, "grad_norm": 2.125, "learning_rate": 8.992255989929632e-06, "loss": 1.055618, "memory(GiB)": 369.4, "step": 19345, "train_speed(iter/s)": 0.201161 }, { "acc": 0.75064206, "epoch": 0.4908675799086758, "grad_norm": 1.875, "learning_rate": 8.991624568250175e-06, "loss": 1.00391102, "memory(GiB)": 369.4, "step": 19350, "train_speed(iter/s)": 0.201163 }, { "acc": 0.74235592, "epoch": 0.49099441907661084, "grad_norm": 2.34375, "learning_rate": 8.99099297100125e-06, "loss": 1.07628098, "memory(GiB)": 369.4, "step": 19355, "train_speed(iter/s)": 0.201172 }, { "acc": 0.75926933, "epoch": 0.4911212582445459, "grad_norm": 2.75, "learning_rate": 8.990361198210634e-06, "loss": 0.99377899, "memory(GiB)": 369.4, "step": 19360, "train_speed(iter/s)": 0.201174 }, { "acc": 0.72574196, "epoch": 0.491248097412481, "grad_norm": 1.9140625, "learning_rate": 8.989729249906116e-06, "loss": 1.09609241, "memory(GiB)": 369.4, "step": 19365, "train_speed(iter/s)": 0.201181 }, { "acc": 0.74806576, "epoch": 0.49137493658041603, "grad_norm": 2.203125, "learning_rate": 8.989097126115493e-06, "loss": 1.0136302, "memory(GiB)": 369.4, "step": 19370, "train_speed(iter/s)": 0.201191 }, { "acc": 0.73687029, "epoch": 0.4915017757483511, "grad_norm": 2.046875, "learning_rate": 8.98846482686657e-06, "loss": 1.06619549, "memory(GiB)": 369.4, "step": 19375, "train_speed(iter/s)": 0.201199 }, { "acc": 0.74734707, "epoch": 0.4916286149162861, "grad_norm": 2.390625, "learning_rate": 8.987832352187156e-06, "loss": 1.02954254, "memory(GiB)": 369.4, "step": 19380, "train_speed(iter/s)": 0.201209 }, { "acc": 0.74034972, "epoch": 0.4917554540842212, "grad_norm": 2.21875, "learning_rate": 8.987199702105071e-06, "loss": 1.07212372, "memory(GiB)": 369.4, "step": 19385, "train_speed(iter/s)": 0.201215 }, { "acc": 0.74127712, "epoch": 0.49188229325215627, "grad_norm": 2.234375, "learning_rate": 8.986566876648141e-06, "loss": 1.05798664, "memory(GiB)": 369.4, "step": 19390, "train_speed(iter/s)": 0.201227 }, { "acc": 0.74313259, "epoch": 0.4920091324200913, "grad_norm": 2.015625, "learning_rate": 8.985933875844202e-06, "loss": 1.00562057, "memory(GiB)": 369.4, "step": 19395, "train_speed(iter/s)": 0.201235 }, { "acc": 0.76769438, "epoch": 0.49213597158802636, "grad_norm": 2.078125, "learning_rate": 8.985300699721094e-06, "loss": 0.9652277, "memory(GiB)": 369.4, "step": 19400, "train_speed(iter/s)": 0.201242 }, { "acc": 0.75763097, "epoch": 0.49226281075596146, "grad_norm": 2.171875, "learning_rate": 8.984667348306669e-06, "loss": 1.04257717, "memory(GiB)": 369.4, "step": 19405, "train_speed(iter/s)": 0.201249 }, { "acc": 0.75377717, "epoch": 0.4923896499238965, "grad_norm": 2.296875, "learning_rate": 8.984033821628782e-06, "loss": 0.98021393, "memory(GiB)": 369.4, "step": 19410, "train_speed(iter/s)": 0.201257 }, { "acc": 0.76190252, "epoch": 0.49251648909183154, "grad_norm": 2.234375, "learning_rate": 8.983400119715303e-06, "loss": 0.94738951, "memory(GiB)": 369.4, "step": 19415, "train_speed(iter/s)": 0.201268 }, { "acc": 0.75162334, "epoch": 0.4926433282597666, "grad_norm": 2.234375, "learning_rate": 8.982766242594099e-06, "loss": 0.97878342, "memory(GiB)": 369.4, "step": 19420, "train_speed(iter/s)": 0.201271 }, { "acc": 0.76167278, "epoch": 0.4927701674277017, "grad_norm": 2.515625, "learning_rate": 8.982132190293056e-06, "loss": 1.02472658, "memory(GiB)": 369.4, "step": 19425, "train_speed(iter/s)": 0.201279 }, { "acc": 0.76158447, "epoch": 0.49289700659563673, "grad_norm": 2.265625, "learning_rate": 8.98149796284006e-06, "loss": 0.97273998, "memory(GiB)": 369.4, "step": 19430, "train_speed(iter/s)": 0.201281 }, { "acc": 0.75526981, "epoch": 0.4930238457635718, "grad_norm": 2.4375, "learning_rate": 8.980863560263007e-06, "loss": 0.99548044, "memory(GiB)": 369.4, "step": 19435, "train_speed(iter/s)": 0.201279 }, { "acc": 0.73897963, "epoch": 0.4931506849315068, "grad_norm": 2.109375, "learning_rate": 8.980228982589802e-06, "loss": 1.02683678, "memory(GiB)": 369.4, "step": 19440, "train_speed(iter/s)": 0.201275 }, { "acc": 0.75811558, "epoch": 0.4932775240994419, "grad_norm": 2.015625, "learning_rate": 8.979594229848355e-06, "loss": 1.02442341, "memory(GiB)": 369.4, "step": 19445, "train_speed(iter/s)": 0.201284 }, { "acc": 0.74389305, "epoch": 0.49340436326737697, "grad_norm": 2.0, "learning_rate": 8.978959302066587e-06, "loss": 1.0242734, "memory(GiB)": 369.4, "step": 19450, "train_speed(iter/s)": 0.201296 }, { "acc": 0.74502139, "epoch": 0.493531202435312, "grad_norm": 1.6875, "learning_rate": 8.978324199272423e-06, "loss": 1.00695324, "memory(GiB)": 369.4, "step": 19455, "train_speed(iter/s)": 0.201303 }, { "acc": 0.75794163, "epoch": 0.49365804160324706, "grad_norm": 2.4375, "learning_rate": 8.977688921493799e-06, "loss": 0.96580067, "memory(GiB)": 369.4, "step": 19460, "train_speed(iter/s)": 0.201312 }, { "acc": 0.74226818, "epoch": 0.49378488077118216, "grad_norm": 2.046875, "learning_rate": 8.977053468758659e-06, "loss": 0.99902973, "memory(GiB)": 369.4, "step": 19465, "train_speed(iter/s)": 0.201322 }, { "acc": 0.74179115, "epoch": 0.4939117199391172, "grad_norm": 2.3125, "learning_rate": 8.976417841094949e-06, "loss": 1.03196154, "memory(GiB)": 369.4, "step": 19470, "train_speed(iter/s)": 0.201332 }, { "acc": 0.75131159, "epoch": 0.49403855910705224, "grad_norm": 2.40625, "learning_rate": 8.97578203853063e-06, "loss": 1.02179775, "memory(GiB)": 369.4, "step": 19475, "train_speed(iter/s)": 0.201339 }, { "acc": 0.74638853, "epoch": 0.4941653982749873, "grad_norm": 1.9765625, "learning_rate": 8.975146061093667e-06, "loss": 1.0046278, "memory(GiB)": 369.4, "step": 19480, "train_speed(iter/s)": 0.201348 }, { "acc": 0.73350639, "epoch": 0.4942922374429224, "grad_norm": 2.109375, "learning_rate": 8.97450990881203e-06, "loss": 1.09147377, "memory(GiB)": 369.4, "step": 19485, "train_speed(iter/s)": 0.201357 }, { "acc": 0.75864964, "epoch": 0.49441907661085743, "grad_norm": 2.59375, "learning_rate": 8.973873581713705e-06, "loss": 1.01719007, "memory(GiB)": 369.4, "step": 19490, "train_speed(iter/s)": 0.201366 }, { "acc": 0.75453558, "epoch": 0.4945459157787925, "grad_norm": 2.171875, "learning_rate": 8.973237079826676e-06, "loss": 1.04025421, "memory(GiB)": 369.4, "step": 19495, "train_speed(iter/s)": 0.201371 }, { "acc": 0.74886336, "epoch": 0.4946727549467275, "grad_norm": 2.59375, "learning_rate": 8.972600403178941e-06, "loss": 1.02762623, "memory(GiB)": 369.4, "step": 19500, "train_speed(iter/s)": 0.201376 }, { "acc": 0.7509798, "epoch": 0.4947995941146626, "grad_norm": 2.359375, "learning_rate": 8.971963551798506e-06, "loss": 0.96459093, "memory(GiB)": 369.4, "step": 19505, "train_speed(iter/s)": 0.201384 }, { "acc": 0.74859838, "epoch": 0.49492643328259767, "grad_norm": 2.078125, "learning_rate": 8.971326525713378e-06, "loss": 0.977318, "memory(GiB)": 369.4, "step": 19510, "train_speed(iter/s)": 0.201393 }, { "acc": 0.74521494, "epoch": 0.4950532724505327, "grad_norm": 2.203125, "learning_rate": 8.97068932495158e-06, "loss": 1.00267429, "memory(GiB)": 369.4, "step": 19515, "train_speed(iter/s)": 0.201399 }, { "acc": 0.74673147, "epoch": 0.49518011161846776, "grad_norm": 3.078125, "learning_rate": 8.970051949541137e-06, "loss": 1.00996609, "memory(GiB)": 369.4, "step": 19520, "train_speed(iter/s)": 0.201409 }, { "acc": 0.74319334, "epoch": 0.49530695078640286, "grad_norm": 2.125, "learning_rate": 8.969414399510085e-06, "loss": 1.01801929, "memory(GiB)": 369.4, "step": 19525, "train_speed(iter/s)": 0.201416 }, { "acc": 0.75536804, "epoch": 0.4954337899543379, "grad_norm": 1.9296875, "learning_rate": 8.968776674886466e-06, "loss": 1.03665123, "memory(GiB)": 369.4, "step": 19530, "train_speed(iter/s)": 0.201424 }, { "acc": 0.73556638, "epoch": 0.49556062912227294, "grad_norm": 3.046875, "learning_rate": 8.968138775698328e-06, "loss": 1.06296778, "memory(GiB)": 369.4, "step": 19535, "train_speed(iter/s)": 0.201437 }, { "acc": 0.75357227, "epoch": 0.495687468290208, "grad_norm": 2.171875, "learning_rate": 8.96750070197373e-06, "loss": 1.02485838, "memory(GiB)": 369.4, "step": 19540, "train_speed(iter/s)": 0.201434 }, { "acc": 0.73642845, "epoch": 0.4958143074581431, "grad_norm": 2.3125, "learning_rate": 8.966862453740738e-06, "loss": 1.02408648, "memory(GiB)": 369.4, "step": 19545, "train_speed(iter/s)": 0.201446 }, { "acc": 0.74667654, "epoch": 0.49594114662607813, "grad_norm": 1.8984375, "learning_rate": 8.966224031027426e-06, "loss": 1.04997702, "memory(GiB)": 369.4, "step": 19550, "train_speed(iter/s)": 0.201449 }, { "acc": 0.74777141, "epoch": 0.4960679857940132, "grad_norm": 2.125, "learning_rate": 8.965585433861871e-06, "loss": 1.01107006, "memory(GiB)": 369.4, "step": 19555, "train_speed(iter/s)": 0.20146 }, { "acc": 0.74610853, "epoch": 0.4961948249619482, "grad_norm": 2.34375, "learning_rate": 8.964946662272167e-06, "loss": 0.98384972, "memory(GiB)": 369.4, "step": 19560, "train_speed(iter/s)": 0.201468 }, { "acc": 0.7558732, "epoch": 0.4963216641298833, "grad_norm": 2.171875, "learning_rate": 8.964307716286404e-06, "loss": 0.9754446, "memory(GiB)": 369.4, "step": 19565, "train_speed(iter/s)": 0.201474 }, { "acc": 0.74938269, "epoch": 0.49644850329781837, "grad_norm": 2.328125, "learning_rate": 8.963668595932689e-06, "loss": 1.04472256, "memory(GiB)": 369.4, "step": 19570, "train_speed(iter/s)": 0.201481 }, { "acc": 0.73994455, "epoch": 0.4965753424657534, "grad_norm": 1.8125, "learning_rate": 8.96302930123913e-06, "loss": 1.03445177, "memory(GiB)": 369.4, "step": 19575, "train_speed(iter/s)": 0.201488 }, { "acc": 0.75487685, "epoch": 0.49670218163368846, "grad_norm": 2.0625, "learning_rate": 8.962389832233853e-06, "loss": 1.01619234, "memory(GiB)": 369.4, "step": 19580, "train_speed(iter/s)": 0.201497 }, { "acc": 0.74391208, "epoch": 0.49682902080162356, "grad_norm": 3.03125, "learning_rate": 8.961750188944978e-06, "loss": 1.06330824, "memory(GiB)": 369.4, "step": 19585, "train_speed(iter/s)": 0.201507 }, { "acc": 0.73735094, "epoch": 0.4969558599695586, "grad_norm": 2.28125, "learning_rate": 8.96111037140064e-06, "loss": 1.02536297, "memory(GiB)": 369.4, "step": 19590, "train_speed(iter/s)": 0.201509 }, { "acc": 0.75528383, "epoch": 0.49708269913749364, "grad_norm": 1.796875, "learning_rate": 8.960470379628986e-06, "loss": 0.96631317, "memory(GiB)": 369.4, "step": 19595, "train_speed(iter/s)": 0.201517 }, { "acc": 0.74465261, "epoch": 0.4972095383054287, "grad_norm": 2.359375, "learning_rate": 8.959830213658161e-06, "loss": 1.02311773, "memory(GiB)": 369.4, "step": 19600, "train_speed(iter/s)": 0.201522 }, { "acc": 0.74492569, "epoch": 0.4973363774733638, "grad_norm": 2.25, "learning_rate": 8.959189873516324e-06, "loss": 1.07774296, "memory(GiB)": 369.4, "step": 19605, "train_speed(iter/s)": 0.201527 }, { "acc": 0.73485708, "epoch": 0.49746321664129883, "grad_norm": 2.078125, "learning_rate": 8.95854935923164e-06, "loss": 1.07703876, "memory(GiB)": 369.4, "step": 19610, "train_speed(iter/s)": 0.201536 }, { "acc": 0.75047421, "epoch": 0.4975900558092339, "grad_norm": 2.59375, "learning_rate": 8.95790867083228e-06, "loss": 1.05897026, "memory(GiB)": 369.4, "step": 19615, "train_speed(iter/s)": 0.201543 }, { "acc": 0.73215618, "epoch": 0.4977168949771689, "grad_norm": 2.5, "learning_rate": 8.957267808346428e-06, "loss": 1.03619919, "memory(GiB)": 369.4, "step": 19620, "train_speed(iter/s)": 0.201544 }, { "acc": 0.74349499, "epoch": 0.497843734145104, "grad_norm": 2.0, "learning_rate": 8.95662677180227e-06, "loss": 1.03077679, "memory(GiB)": 369.4, "step": 19625, "train_speed(iter/s)": 0.201553 }, { "acc": 0.75927653, "epoch": 0.49797057331303907, "grad_norm": 2.265625, "learning_rate": 8.955985561228e-06, "loss": 0.98336506, "memory(GiB)": 369.4, "step": 19630, "train_speed(iter/s)": 0.201558 }, { "acc": 0.75317979, "epoch": 0.4980974124809741, "grad_norm": 3.21875, "learning_rate": 8.955344176651824e-06, "loss": 1.02084999, "memory(GiB)": 369.4, "step": 19635, "train_speed(iter/s)": 0.201564 }, { "acc": 0.75746365, "epoch": 0.49822425164890916, "grad_norm": 2.6875, "learning_rate": 8.954702618101952e-06, "loss": 1.02452078, "memory(GiB)": 369.4, "step": 19640, "train_speed(iter/s)": 0.201571 }, { "acc": 0.74021668, "epoch": 0.49835109081684426, "grad_norm": 2.765625, "learning_rate": 8.9540608856066e-06, "loss": 1.0158433, "memory(GiB)": 369.4, "step": 19645, "train_speed(iter/s)": 0.201577 }, { "acc": 0.73654108, "epoch": 0.4984779299847793, "grad_norm": 2.59375, "learning_rate": 8.953418979194e-06, "loss": 1.08960361, "memory(GiB)": 369.4, "step": 19650, "train_speed(iter/s)": 0.201581 }, { "acc": 0.74869099, "epoch": 0.49860476915271434, "grad_norm": 2.140625, "learning_rate": 8.95277689889238e-06, "loss": 0.99741697, "memory(GiB)": 369.4, "step": 19655, "train_speed(iter/s)": 0.20159 }, { "acc": 0.73895192, "epoch": 0.4987316083206494, "grad_norm": 2.578125, "learning_rate": 8.952134644729985e-06, "loss": 1.04501743, "memory(GiB)": 369.4, "step": 19660, "train_speed(iter/s)": 0.201593 }, { "acc": 0.74277153, "epoch": 0.4988584474885845, "grad_norm": 2.296875, "learning_rate": 8.951492216735062e-06, "loss": 1.0360817, "memory(GiB)": 369.4, "step": 19665, "train_speed(iter/s)": 0.201602 }, { "acc": 0.73146334, "epoch": 0.49898528665651953, "grad_norm": 2.03125, "learning_rate": 8.950849614935872e-06, "loss": 1.04930601, "memory(GiB)": 369.4, "step": 19670, "train_speed(iter/s)": 0.20161 }, { "acc": 0.7405354, "epoch": 0.4991121258244546, "grad_norm": 2.296875, "learning_rate": 8.950206839360674e-06, "loss": 1.06886559, "memory(GiB)": 369.4, "step": 19675, "train_speed(iter/s)": 0.201616 }, { "acc": 0.73962975, "epoch": 0.4992389649923896, "grad_norm": 1.8203125, "learning_rate": 8.949563890037745e-06, "loss": 1.04327135, "memory(GiB)": 369.4, "step": 19680, "train_speed(iter/s)": 0.201622 }, { "acc": 0.73406334, "epoch": 0.4993658041603247, "grad_norm": 2.234375, "learning_rate": 8.948920766995362e-06, "loss": 1.05083427, "memory(GiB)": 369.4, "step": 19685, "train_speed(iter/s)": 0.201633 }, { "acc": 0.74505472, "epoch": 0.49949264332825977, "grad_norm": 2.296875, "learning_rate": 8.948277470261812e-06, "loss": 1.03194656, "memory(GiB)": 369.4, "step": 19690, "train_speed(iter/s)": 0.201637 }, { "acc": 0.73025522, "epoch": 0.4996194824961948, "grad_norm": 2.234375, "learning_rate": 8.94763399986539e-06, "loss": 1.06651363, "memory(GiB)": 369.4, "step": 19695, "train_speed(iter/s)": 0.20164 }, { "acc": 0.74685159, "epoch": 0.49974632166412986, "grad_norm": 1.90625, "learning_rate": 8.946990355834401e-06, "loss": 0.98222446, "memory(GiB)": 369.4, "step": 19700, "train_speed(iter/s)": 0.20165 }, { "acc": 0.75385809, "epoch": 0.49987316083206496, "grad_norm": 2.046875, "learning_rate": 8.946346538197156e-06, "loss": 1.00655861, "memory(GiB)": 369.4, "step": 19705, "train_speed(iter/s)": 0.201658 }, { "acc": 0.74073181, "epoch": 0.5, "grad_norm": 2.171875, "learning_rate": 8.94570254698197e-06, "loss": 1.04789524, "memory(GiB)": 369.4, "step": 19710, "train_speed(iter/s)": 0.201664 }, { "acc": 0.7388792, "epoch": 0.5001268391679351, "grad_norm": 2.078125, "learning_rate": 8.945058382217168e-06, "loss": 1.03995686, "memory(GiB)": 369.4, "step": 19715, "train_speed(iter/s)": 0.201673 }, { "acc": 0.73675013, "epoch": 0.5002536783358701, "grad_norm": 2.1875, "learning_rate": 8.944414043931086e-06, "loss": 1.07859268, "memory(GiB)": 369.4, "step": 19720, "train_speed(iter/s)": 0.201682 }, { "acc": 0.74419394, "epoch": 0.5003805175038052, "grad_norm": 1.75, "learning_rate": 8.943769532152065e-06, "loss": 0.9974328, "memory(GiB)": 369.4, "step": 19725, "train_speed(iter/s)": 0.201685 }, { "acc": 0.73941441, "epoch": 0.5005073566717403, "grad_norm": 2.1875, "learning_rate": 8.94312484690845e-06, "loss": 1.04797926, "memory(GiB)": 369.4, "step": 19730, "train_speed(iter/s)": 0.201694 }, { "acc": 0.74120255, "epoch": 0.5006341958396753, "grad_norm": 2.3125, "learning_rate": 8.9424799882286e-06, "loss": 1.04423637, "memory(GiB)": 369.4, "step": 19735, "train_speed(iter/s)": 0.201702 }, { "acc": 0.75461555, "epoch": 0.5007610350076104, "grad_norm": 1.828125, "learning_rate": 8.94183495614088e-06, "loss": 1.00657425, "memory(GiB)": 369.4, "step": 19740, "train_speed(iter/s)": 0.201707 }, { "acc": 0.75170598, "epoch": 0.5008878741755454, "grad_norm": 2.140625, "learning_rate": 8.941189750673658e-06, "loss": 0.98351479, "memory(GiB)": 369.4, "step": 19745, "train_speed(iter/s)": 0.201712 }, { "acc": 0.74599314, "epoch": 0.5010147133434805, "grad_norm": 1.890625, "learning_rate": 8.940544371855315e-06, "loss": 1.01150761, "memory(GiB)": 369.4, "step": 19750, "train_speed(iter/s)": 0.201721 }, { "acc": 0.7510582, "epoch": 0.5011415525114156, "grad_norm": 2.078125, "learning_rate": 8.939898819714237e-06, "loss": 0.94034386, "memory(GiB)": 369.4, "step": 19755, "train_speed(iter/s)": 0.201727 }, { "acc": 0.74024467, "epoch": 0.5012683916793506, "grad_norm": 1.9453125, "learning_rate": 8.93925309427882e-06, "loss": 1.00872889, "memory(GiB)": 369.4, "step": 19760, "train_speed(iter/s)": 0.201737 }, { "acc": 0.73716192, "epoch": 0.5013952308472857, "grad_norm": 2.296875, "learning_rate": 8.938607195577462e-06, "loss": 1.05268688, "memory(GiB)": 369.4, "step": 19765, "train_speed(iter/s)": 0.201742 }, { "acc": 0.74003272, "epoch": 0.5015220700152208, "grad_norm": 2.078125, "learning_rate": 8.937961123638577e-06, "loss": 1.06651964, "memory(GiB)": 369.4, "step": 19770, "train_speed(iter/s)": 0.201752 }, { "acc": 0.74415874, "epoch": 0.5016489091831557, "grad_norm": 2.21875, "learning_rate": 8.93731487849058e-06, "loss": 1.02325087, "memory(GiB)": 369.4, "step": 19775, "train_speed(iter/s)": 0.201759 }, { "acc": 0.74005017, "epoch": 0.5017757483510908, "grad_norm": 2.53125, "learning_rate": 8.936668460161895e-06, "loss": 1.04543629, "memory(GiB)": 369.4, "step": 19780, "train_speed(iter/s)": 0.201769 }, { "acc": 0.74291, "epoch": 0.5019025875190258, "grad_norm": 2.5625, "learning_rate": 8.936021868680956e-06, "loss": 1.08705254, "memory(GiB)": 369.4, "step": 19785, "train_speed(iter/s)": 0.201766 }, { "acc": 0.74081345, "epoch": 0.5020294266869609, "grad_norm": 2.4375, "learning_rate": 8.935375104076201e-06, "loss": 1.07688522, "memory(GiB)": 369.4, "step": 19790, "train_speed(iter/s)": 0.201775 }, { "acc": 0.74730997, "epoch": 0.502156265854896, "grad_norm": 1.9609375, "learning_rate": 8.93472816637608e-06, "loss": 1.00722771, "memory(GiB)": 369.4, "step": 19795, "train_speed(iter/s)": 0.201784 }, { "acc": 0.7501298, "epoch": 0.502283105022831, "grad_norm": 2.203125, "learning_rate": 8.934081055609046e-06, "loss": 1.00284204, "memory(GiB)": 369.4, "step": 19800, "train_speed(iter/s)": 0.201792 }, { "acc": 0.75304451, "epoch": 0.5024099441907661, "grad_norm": 2.03125, "learning_rate": 8.933433771803562e-06, "loss": 0.98773966, "memory(GiB)": 369.4, "step": 19805, "train_speed(iter/s)": 0.201794 }, { "acc": 0.7465991, "epoch": 0.5025367833587012, "grad_norm": 2.078125, "learning_rate": 8.932786314988099e-06, "loss": 1.01587143, "memory(GiB)": 369.4, "step": 19810, "train_speed(iter/s)": 0.201799 }, { "acc": 0.75185361, "epoch": 0.5026636225266362, "grad_norm": 2.609375, "learning_rate": 8.932138685191136e-06, "loss": 1.00986176, "memory(GiB)": 369.4, "step": 19815, "train_speed(iter/s)": 0.201807 }, { "acc": 0.7532546, "epoch": 0.5027904616945713, "grad_norm": 2.390625, "learning_rate": 8.931490882441159e-06, "loss": 1.01746273, "memory(GiB)": 369.4, "step": 19820, "train_speed(iter/s)": 0.201813 }, { "acc": 0.73979955, "epoch": 0.5029173008625063, "grad_norm": 2.328125, "learning_rate": 8.930842906766659e-06, "loss": 1.05455055, "memory(GiB)": 369.4, "step": 19825, "train_speed(iter/s)": 0.201818 }, { "acc": 0.74498701, "epoch": 0.5030441400304414, "grad_norm": 2.21875, "learning_rate": 8.930194758196138e-06, "loss": 1.01821136, "memory(GiB)": 369.4, "step": 19830, "train_speed(iter/s)": 0.201821 }, { "acc": 0.73577952, "epoch": 0.5031709791983765, "grad_norm": 2.078125, "learning_rate": 8.929546436758105e-06, "loss": 1.05499592, "memory(GiB)": 369.4, "step": 19835, "train_speed(iter/s)": 0.201828 }, { "acc": 0.74702034, "epoch": 0.5032978183663115, "grad_norm": 1.9921875, "learning_rate": 8.928897942481075e-06, "loss": 0.99674902, "memory(GiB)": 369.4, "step": 19840, "train_speed(iter/s)": 0.201833 }, { "acc": 0.73993044, "epoch": 0.5034246575342466, "grad_norm": 2.765625, "learning_rate": 8.928249275393572e-06, "loss": 0.99041443, "memory(GiB)": 369.4, "step": 19845, "train_speed(iter/s)": 0.201842 }, { "acc": 0.7471518, "epoch": 0.5035514967021817, "grad_norm": 2.1875, "learning_rate": 8.927600435524129e-06, "loss": 0.98926029, "memory(GiB)": 369.4, "step": 19850, "train_speed(iter/s)": 0.201847 }, { "acc": 0.74674754, "epoch": 0.5036783358701167, "grad_norm": 2.328125, "learning_rate": 8.926951422901282e-06, "loss": 1.01334629, "memory(GiB)": 369.4, "step": 19855, "train_speed(iter/s)": 0.201855 }, { "acc": 0.75102549, "epoch": 0.5038051750380518, "grad_norm": 2.609375, "learning_rate": 8.926302237553578e-06, "loss": 0.98163223, "memory(GiB)": 369.4, "step": 19860, "train_speed(iter/s)": 0.20186 }, { "acc": 0.73895731, "epoch": 0.5039320142059868, "grad_norm": 2.5, "learning_rate": 8.925652879509575e-06, "loss": 1.00466805, "memory(GiB)": 369.4, "step": 19865, "train_speed(iter/s)": 0.201865 }, { "acc": 0.75934067, "epoch": 0.5040588533739219, "grad_norm": 2.328125, "learning_rate": 8.925003348797829e-06, "loss": 1.04353027, "memory(GiB)": 369.4, "step": 19870, "train_speed(iter/s)": 0.201875 }, { "acc": 0.74433866, "epoch": 0.504185692541857, "grad_norm": 2.140625, "learning_rate": 8.924353645446912e-06, "loss": 0.97766075, "memory(GiB)": 369.4, "step": 19875, "train_speed(iter/s)": 0.201882 }, { "acc": 0.74439173, "epoch": 0.504312531709792, "grad_norm": 2.1875, "learning_rate": 8.923703769485403e-06, "loss": 1.01323805, "memory(GiB)": 369.4, "step": 19880, "train_speed(iter/s)": 0.201892 }, { "acc": 0.74609165, "epoch": 0.5044393708777271, "grad_norm": 2.28125, "learning_rate": 8.92305372094188e-06, "loss": 1.03980465, "memory(GiB)": 369.4, "step": 19885, "train_speed(iter/s)": 0.2019 }, { "acc": 0.75372562, "epoch": 0.5045662100456622, "grad_norm": 2.296875, "learning_rate": 8.922403499844943e-06, "loss": 0.96536169, "memory(GiB)": 369.4, "step": 19890, "train_speed(iter/s)": 0.201906 }, { "acc": 0.74071779, "epoch": 0.5046930492135971, "grad_norm": 2.328125, "learning_rate": 8.921753106223186e-06, "loss": 1.00007153, "memory(GiB)": 369.4, "step": 19895, "train_speed(iter/s)": 0.201915 }, { "acc": 0.72256365, "epoch": 0.5048198883815322, "grad_norm": 2.3125, "learning_rate": 8.92110254010522e-06, "loss": 1.14291477, "memory(GiB)": 369.4, "step": 19900, "train_speed(iter/s)": 0.201923 }, { "acc": 0.7329546, "epoch": 0.5049467275494672, "grad_norm": 1.8203125, "learning_rate": 8.920451801519656e-06, "loss": 1.0813447, "memory(GiB)": 369.4, "step": 19905, "train_speed(iter/s)": 0.201931 }, { "acc": 0.74278812, "epoch": 0.5050735667174023, "grad_norm": 2.65625, "learning_rate": 8.919800890495118e-06, "loss": 0.99233551, "memory(GiB)": 369.4, "step": 19910, "train_speed(iter/s)": 0.201931 }, { "acc": 0.73184958, "epoch": 0.5052004058853374, "grad_norm": 2.109375, "learning_rate": 8.919149807060237e-06, "loss": 1.08698235, "memory(GiB)": 369.4, "step": 19915, "train_speed(iter/s)": 0.201936 }, { "acc": 0.7366344, "epoch": 0.5053272450532724, "grad_norm": 2.296875, "learning_rate": 8.918498551243649e-06, "loss": 1.05324211, "memory(GiB)": 369.4, "step": 19920, "train_speed(iter/s)": 0.201943 }, { "acc": 0.74346852, "epoch": 0.5054540842212075, "grad_norm": 1.9453125, "learning_rate": 8.917847123073999e-06, "loss": 1.00833511, "memory(GiB)": 369.4, "step": 19925, "train_speed(iter/s)": 0.201949 }, { "acc": 0.74608994, "epoch": 0.5055809233891426, "grad_norm": 2.65625, "learning_rate": 8.917195522579943e-06, "loss": 0.99347277, "memory(GiB)": 369.4, "step": 19930, "train_speed(iter/s)": 0.201956 }, { "acc": 0.75054145, "epoch": 0.5057077625570776, "grad_norm": 2.140625, "learning_rate": 8.916543749790139e-06, "loss": 1.00400648, "memory(GiB)": 369.4, "step": 19935, "train_speed(iter/s)": 0.201963 }, { "acc": 0.74367394, "epoch": 0.5058346017250127, "grad_norm": 2.34375, "learning_rate": 8.915891804733253e-06, "loss": 1.08093967, "memory(GiB)": 369.4, "step": 19940, "train_speed(iter/s)": 0.201974 }, { "acc": 0.74867315, "epoch": 0.5059614408929477, "grad_norm": 2.109375, "learning_rate": 8.915239687437963e-06, "loss": 0.99957752, "memory(GiB)": 369.4, "step": 19945, "train_speed(iter/s)": 0.201977 }, { "acc": 0.7453413, "epoch": 0.5060882800608828, "grad_norm": 1.9375, "learning_rate": 8.91458739793295e-06, "loss": 0.97956524, "memory(GiB)": 369.4, "step": 19950, "train_speed(iter/s)": 0.201983 }, { "acc": 0.73810568, "epoch": 0.5062151192288179, "grad_norm": 2.203125, "learning_rate": 8.913934936246907e-06, "loss": 1.04811678, "memory(GiB)": 369.4, "step": 19955, "train_speed(iter/s)": 0.201991 }, { "acc": 0.73112965, "epoch": 0.5063419583967529, "grad_norm": 2.234375, "learning_rate": 8.91328230240853e-06, "loss": 1.06053638, "memory(GiB)": 369.4, "step": 19960, "train_speed(iter/s)": 0.201997 }, { "acc": 0.7524065, "epoch": 0.506468797564688, "grad_norm": 2.203125, "learning_rate": 8.912629496446528e-06, "loss": 0.96568146, "memory(GiB)": 369.4, "step": 19965, "train_speed(iter/s)": 0.202006 }, { "acc": 0.74820786, "epoch": 0.5065956367326231, "grad_norm": 2.8125, "learning_rate": 8.911976518389612e-06, "loss": 0.96334639, "memory(GiB)": 369.4, "step": 19970, "train_speed(iter/s)": 0.202008 }, { "acc": 0.75831208, "epoch": 0.5067224759005581, "grad_norm": 2.28125, "learning_rate": 8.9113233682665e-06, "loss": 0.98190613, "memory(GiB)": 369.4, "step": 19975, "train_speed(iter/s)": 0.202017 }, { "acc": 0.74549074, "epoch": 0.5068493150684932, "grad_norm": 1.8359375, "learning_rate": 8.910670046105927e-06, "loss": 0.99796047, "memory(GiB)": 369.4, "step": 19980, "train_speed(iter/s)": 0.202024 }, { "acc": 0.74904509, "epoch": 0.5069761542364282, "grad_norm": 2.203125, "learning_rate": 8.910016551936623e-06, "loss": 1.03473444, "memory(GiB)": 369.4, "step": 19985, "train_speed(iter/s)": 0.202029 }, { "acc": 0.73789711, "epoch": 0.5071029934043633, "grad_norm": 2.125, "learning_rate": 8.909362885787333e-06, "loss": 1.03982964, "memory(GiB)": 369.4, "step": 19990, "train_speed(iter/s)": 0.202035 }, { "acc": 0.73738794, "epoch": 0.5072298325722984, "grad_norm": 2.109375, "learning_rate": 8.908709047686813e-06, "loss": 1.06647549, "memory(GiB)": 369.4, "step": 19995, "train_speed(iter/s)": 0.202041 }, { "acc": 0.75377951, "epoch": 0.5073566717402334, "grad_norm": 2.46875, "learning_rate": 8.908055037663815e-06, "loss": 0.99038372, "memory(GiB)": 369.4, "step": 20000, "train_speed(iter/s)": 0.20205 }, { "epoch": 0.5073566717402334, "eval_acc": 0.7347624595966791, "eval_loss": 0.9854836463928223, "eval_runtime": 384.2314, "eval_samples_per_second": 16.579, "eval_steps_per_second": 8.289, "step": 20000 }, { "acc": 0.7517518, "epoch": 0.5074835109081685, "grad_norm": 2.40625, "learning_rate": 8.907400855747111e-06, "loss": 1.04046707, "memory(GiB)": 369.4, "step": 20005, "train_speed(iter/s)": 0.200615 }, { "acc": 0.71491909, "epoch": 0.5076103500761036, "grad_norm": 1.9453125, "learning_rate": 8.906746501965468e-06, "loss": 1.09174709, "memory(GiB)": 369.4, "step": 20010, "train_speed(iter/s)": 0.200621 }, { "acc": 0.75931325, "epoch": 0.5077371892440385, "grad_norm": 2.5625, "learning_rate": 8.906091976347675e-06, "loss": 0.9637888, "memory(GiB)": 369.4, "step": 20015, "train_speed(iter/s)": 0.200627 }, { "acc": 0.73812237, "epoch": 0.5078640284119736, "grad_norm": 1.90625, "learning_rate": 8.905437278922516e-06, "loss": 1.01379299, "memory(GiB)": 369.4, "step": 20020, "train_speed(iter/s)": 0.200633 }, { "acc": 0.74644241, "epoch": 0.5079908675799086, "grad_norm": 2.390625, "learning_rate": 8.90478240971879e-06, "loss": 0.99342499, "memory(GiB)": 369.4, "step": 20025, "train_speed(iter/s)": 0.200644 }, { "acc": 0.74629779, "epoch": 0.5081177067478437, "grad_norm": 1.96875, "learning_rate": 8.904127368765298e-06, "loss": 1.01469765, "memory(GiB)": 369.4, "step": 20030, "train_speed(iter/s)": 0.200649 }, { "acc": 0.74017253, "epoch": 0.5082445459157788, "grad_norm": 2.65625, "learning_rate": 8.903472156090856e-06, "loss": 1.04000711, "memory(GiB)": 369.4, "step": 20035, "train_speed(iter/s)": 0.20066 }, { "acc": 0.75270395, "epoch": 0.5083713850837138, "grad_norm": 1.96875, "learning_rate": 8.902816771724279e-06, "loss": 0.98628349, "memory(GiB)": 369.4, "step": 20040, "train_speed(iter/s)": 0.200669 }, { "acc": 0.74953566, "epoch": 0.5084982242516489, "grad_norm": 1.9609375, "learning_rate": 8.902161215694396e-06, "loss": 0.98946533, "memory(GiB)": 369.4, "step": 20045, "train_speed(iter/s)": 0.200678 }, { "acc": 0.75168915, "epoch": 0.508625063419584, "grad_norm": 2.125, "learning_rate": 8.901505488030042e-06, "loss": 1.0175518, "memory(GiB)": 369.4, "step": 20050, "train_speed(iter/s)": 0.200686 }, { "acc": 0.72268181, "epoch": 0.508751902587519, "grad_norm": 2.125, "learning_rate": 8.900849588760057e-06, "loss": 1.02019978, "memory(GiB)": 369.4, "step": 20055, "train_speed(iter/s)": 0.200694 }, { "acc": 0.724933, "epoch": 0.5088787417554541, "grad_norm": 2.21875, "learning_rate": 8.90019351791329e-06, "loss": 1.0826931, "memory(GiB)": 369.4, "step": 20060, "train_speed(iter/s)": 0.200702 }, { "acc": 0.7559092, "epoch": 0.5090055809233891, "grad_norm": 2.34375, "learning_rate": 8.8995372755186e-06, "loss": 0.98818226, "memory(GiB)": 369.4, "step": 20065, "train_speed(iter/s)": 0.200709 }, { "acc": 0.73882656, "epoch": 0.5091324200913242, "grad_norm": 2.9375, "learning_rate": 8.89888086160485e-06, "loss": 0.98412371, "memory(GiB)": 369.4, "step": 20070, "train_speed(iter/s)": 0.200716 }, { "acc": 0.74109411, "epoch": 0.5092592592592593, "grad_norm": 2.203125, "learning_rate": 8.898224276200913e-06, "loss": 1.05688343, "memory(GiB)": 369.4, "step": 20075, "train_speed(iter/s)": 0.200726 }, { "acc": 0.74885244, "epoch": 0.5093860984271943, "grad_norm": 1.9375, "learning_rate": 8.897567519335669e-06, "loss": 1.00434856, "memory(GiB)": 369.4, "step": 20080, "train_speed(iter/s)": 0.200732 }, { "acc": 0.74681067, "epoch": 0.5095129375951294, "grad_norm": 2.25, "learning_rate": 8.896910591038002e-06, "loss": 0.97665215, "memory(GiB)": 369.4, "step": 20085, "train_speed(iter/s)": 0.20074 }, { "acc": 0.73120346, "epoch": 0.5096397767630645, "grad_norm": 2.5, "learning_rate": 8.896253491336809e-06, "loss": 1.06610374, "memory(GiB)": 369.4, "step": 20090, "train_speed(iter/s)": 0.200751 }, { "acc": 0.73224616, "epoch": 0.5097666159309995, "grad_norm": 2.28125, "learning_rate": 8.895596220260993e-06, "loss": 1.01474552, "memory(GiB)": 369.4, "step": 20095, "train_speed(iter/s)": 0.200755 }, { "acc": 0.74320736, "epoch": 0.5098934550989346, "grad_norm": 2.59375, "learning_rate": 8.894938777839462e-06, "loss": 1.05019855, "memory(GiB)": 369.4, "step": 20100, "train_speed(iter/s)": 0.20076 }, { "acc": 0.74544353, "epoch": 0.5100202942668696, "grad_norm": 2.25, "learning_rate": 8.894281164101134e-06, "loss": 1.01288395, "memory(GiB)": 369.4, "step": 20105, "train_speed(iter/s)": 0.200766 }, { "acc": 0.7268836, "epoch": 0.5101471334348047, "grad_norm": 1.8671875, "learning_rate": 8.893623379074934e-06, "loss": 1.00440121, "memory(GiB)": 369.4, "step": 20110, "train_speed(iter/s)": 0.20077 }, { "acc": 0.74816518, "epoch": 0.5102739726027398, "grad_norm": 2.109375, "learning_rate": 8.892965422789793e-06, "loss": 1.04959955, "memory(GiB)": 369.4, "step": 20115, "train_speed(iter/s)": 0.200779 }, { "acc": 0.75101328, "epoch": 0.5104008117706748, "grad_norm": 2.421875, "learning_rate": 8.892307295274654e-06, "loss": 1.01991882, "memory(GiB)": 369.4, "step": 20120, "train_speed(iter/s)": 0.200787 }, { "acc": 0.74629478, "epoch": 0.5105276509386099, "grad_norm": 2.453125, "learning_rate": 8.89164899655846e-06, "loss": 1.01910725, "memory(GiB)": 369.4, "step": 20125, "train_speed(iter/s)": 0.200797 }, { "acc": 0.7614666, "epoch": 0.510654490106545, "grad_norm": 2.421875, "learning_rate": 8.89099052667017e-06, "loss": 0.95639496, "memory(GiB)": 369.4, "step": 20130, "train_speed(iter/s)": 0.200802 }, { "acc": 0.74295034, "epoch": 0.51078132927448, "grad_norm": 1.875, "learning_rate": 8.890331885638744e-06, "loss": 1.01218195, "memory(GiB)": 369.4, "step": 20135, "train_speed(iter/s)": 0.200812 }, { "acc": 0.74341474, "epoch": 0.510908168442415, "grad_norm": 1.890625, "learning_rate": 8.889673073493151e-06, "loss": 1.01016455, "memory(GiB)": 369.4, "step": 20140, "train_speed(iter/s)": 0.200819 }, { "acc": 0.744593, "epoch": 0.51103500761035, "grad_norm": 2.28125, "learning_rate": 8.889014090262371e-06, "loss": 1.06743279, "memory(GiB)": 369.4, "step": 20145, "train_speed(iter/s)": 0.200822 }, { "acc": 0.74165916, "epoch": 0.5111618467782851, "grad_norm": 2.140625, "learning_rate": 8.888354935975388e-06, "loss": 0.97419395, "memory(GiB)": 369.4, "step": 20150, "train_speed(iter/s)": 0.200822 }, { "acc": 0.74500132, "epoch": 0.5112886859462202, "grad_norm": 1.75, "learning_rate": 8.887695610661196e-06, "loss": 0.99429531, "memory(GiB)": 369.4, "step": 20155, "train_speed(iter/s)": 0.200826 }, { "acc": 0.75086985, "epoch": 0.5114155251141552, "grad_norm": 2.09375, "learning_rate": 8.887036114348792e-06, "loss": 1.04610252, "memory(GiB)": 369.4, "step": 20160, "train_speed(iter/s)": 0.200834 }, { "acc": 0.75611935, "epoch": 0.5115423642820903, "grad_norm": 2.109375, "learning_rate": 8.886376447067186e-06, "loss": 1.02105055, "memory(GiB)": 369.4, "step": 20165, "train_speed(iter/s)": 0.200843 }, { "acc": 0.74525461, "epoch": 0.5116692034500254, "grad_norm": 2.59375, "learning_rate": 8.885716608845394e-06, "loss": 1.0083045, "memory(GiB)": 369.4, "step": 20170, "train_speed(iter/s)": 0.200851 }, { "acc": 0.75528297, "epoch": 0.5117960426179604, "grad_norm": 1.703125, "learning_rate": 8.885056599712436e-06, "loss": 0.98232651, "memory(GiB)": 369.4, "step": 20175, "train_speed(iter/s)": 0.200857 }, { "acc": 0.74257498, "epoch": 0.5119228817858955, "grad_norm": 2.25, "learning_rate": 8.884396419697343e-06, "loss": 1.03107777, "memory(GiB)": 369.4, "step": 20180, "train_speed(iter/s)": 0.200864 }, { "acc": 0.73842163, "epoch": 0.5120497209538305, "grad_norm": 2.296875, "learning_rate": 8.883736068829151e-06, "loss": 1.00698996, "memory(GiB)": 369.4, "step": 20185, "train_speed(iter/s)": 0.200874 }, { "acc": 0.74599495, "epoch": 0.5121765601217656, "grad_norm": 2.140625, "learning_rate": 8.88307554713691e-06, "loss": 1.02508287, "memory(GiB)": 369.4, "step": 20190, "train_speed(iter/s)": 0.20088 }, { "acc": 0.75583448, "epoch": 0.5123033992897007, "grad_norm": 2.125, "learning_rate": 8.882414854649667e-06, "loss": 0.98862076, "memory(GiB)": 369.4, "step": 20195, "train_speed(iter/s)": 0.200888 }, { "acc": 0.74789581, "epoch": 0.5124302384576357, "grad_norm": 2.078125, "learning_rate": 8.881753991396488e-06, "loss": 1.05301304, "memory(GiB)": 369.4, "step": 20200, "train_speed(iter/s)": 0.200895 }, { "acc": 0.74891529, "epoch": 0.5125570776255708, "grad_norm": 2.15625, "learning_rate": 8.881092957406436e-06, "loss": 1.0035677, "memory(GiB)": 369.4, "step": 20205, "train_speed(iter/s)": 0.200902 }, { "acc": 0.75108795, "epoch": 0.5126839167935059, "grad_norm": 2.359375, "learning_rate": 8.880431752708588e-06, "loss": 0.95845842, "memory(GiB)": 369.4, "step": 20210, "train_speed(iter/s)": 0.200901 }, { "acc": 0.76105156, "epoch": 0.5128107559614409, "grad_norm": 2.34375, "learning_rate": 8.879770377332026e-06, "loss": 0.95038261, "memory(GiB)": 369.4, "step": 20215, "train_speed(iter/s)": 0.200909 }, { "acc": 0.74537134, "epoch": 0.512937595129376, "grad_norm": 2.453125, "learning_rate": 8.879108831305842e-06, "loss": 1.00997829, "memory(GiB)": 369.4, "step": 20220, "train_speed(iter/s)": 0.200916 }, { "acc": 0.74132633, "epoch": 0.513064434297311, "grad_norm": 2.140625, "learning_rate": 8.878447114659131e-06, "loss": 1.02375469, "memory(GiB)": 369.4, "step": 20225, "train_speed(iter/s)": 0.200925 }, { "acc": 0.7598557, "epoch": 0.5131912734652461, "grad_norm": 2.671875, "learning_rate": 8.877785227421003e-06, "loss": 0.94819651, "memory(GiB)": 369.4, "step": 20230, "train_speed(iter/s)": 0.200935 }, { "acc": 0.75659719, "epoch": 0.5133181126331812, "grad_norm": 2.078125, "learning_rate": 8.877123169620565e-06, "loss": 1.00365772, "memory(GiB)": 369.4, "step": 20235, "train_speed(iter/s)": 0.200943 }, { "acc": 0.74451227, "epoch": 0.5134449518011162, "grad_norm": 1.921875, "learning_rate": 8.876460941286941e-06, "loss": 1.03168526, "memory(GiB)": 369.4, "step": 20240, "train_speed(iter/s)": 0.200951 }, { "acc": 0.73913779, "epoch": 0.5135717909690513, "grad_norm": 2.296875, "learning_rate": 8.875798542449256e-06, "loss": 1.06105347, "memory(GiB)": 369.4, "step": 20245, "train_speed(iter/s)": 0.200959 }, { "acc": 0.73698144, "epoch": 0.5136986301369864, "grad_norm": 2.109375, "learning_rate": 8.87513597313665e-06, "loss": 1.04111271, "memory(GiB)": 369.4, "step": 20250, "train_speed(iter/s)": 0.200965 }, { "acc": 0.73933806, "epoch": 0.5138254693049213, "grad_norm": 2.265625, "learning_rate": 8.87447323337826e-06, "loss": 1.04086018, "memory(GiB)": 369.4, "step": 20255, "train_speed(iter/s)": 0.200972 }, { "acc": 0.75930367, "epoch": 0.5139523084728564, "grad_norm": 2.15625, "learning_rate": 8.87381032320324e-06, "loss": 0.98005581, "memory(GiB)": 369.4, "step": 20260, "train_speed(iter/s)": 0.200977 }, { "acc": 0.7455801, "epoch": 0.5140791476407914, "grad_norm": 2.15625, "learning_rate": 8.873147242640746e-06, "loss": 0.98798847, "memory(GiB)": 369.4, "step": 20265, "train_speed(iter/s)": 0.200986 }, { "acc": 0.74164877, "epoch": 0.5142059868087265, "grad_norm": 1.796875, "learning_rate": 8.872483991719944e-06, "loss": 1.05411673, "memory(GiB)": 369.4, "step": 20270, "train_speed(iter/s)": 0.200994 }, { "acc": 0.74020052, "epoch": 0.5143328259766616, "grad_norm": 1.828125, "learning_rate": 8.871820570470009e-06, "loss": 1.0443285, "memory(GiB)": 369.4, "step": 20275, "train_speed(iter/s)": 0.201 }, { "acc": 0.74886851, "epoch": 0.5144596651445966, "grad_norm": 2.8125, "learning_rate": 8.871156978920116e-06, "loss": 1.00476456, "memory(GiB)": 369.4, "step": 20280, "train_speed(iter/s)": 0.20101 }, { "acc": 0.75133171, "epoch": 0.5145865043125317, "grad_norm": 2.078125, "learning_rate": 8.870493217099456e-06, "loss": 0.93247919, "memory(GiB)": 369.4, "step": 20285, "train_speed(iter/s)": 0.201013 }, { "acc": 0.74985733, "epoch": 0.5147133434804668, "grad_norm": 1.9921875, "learning_rate": 8.869829285037224e-06, "loss": 0.95380611, "memory(GiB)": 369.4, "step": 20290, "train_speed(iter/s)": 0.201021 }, { "acc": 0.75123835, "epoch": 0.5148401826484018, "grad_norm": 2.390625, "learning_rate": 8.869165182762623e-06, "loss": 1.01194134, "memory(GiB)": 369.4, "step": 20295, "train_speed(iter/s)": 0.20103 }, { "acc": 0.7435216, "epoch": 0.5149670218163369, "grad_norm": 2.203125, "learning_rate": 8.868500910304863e-06, "loss": 1.04490919, "memory(GiB)": 369.4, "step": 20300, "train_speed(iter/s)": 0.201038 }, { "acc": 0.75020924, "epoch": 0.5150938609842719, "grad_norm": 1.9375, "learning_rate": 8.86783646769316e-06, "loss": 0.97649441, "memory(GiB)": 369.4, "step": 20305, "train_speed(iter/s)": 0.201048 }, { "acc": 0.74501743, "epoch": 0.515220700152207, "grad_norm": 2.140625, "learning_rate": 8.867171854956742e-06, "loss": 1.01853189, "memory(GiB)": 369.4, "step": 20310, "train_speed(iter/s)": 0.201055 }, { "acc": 0.74437981, "epoch": 0.5153475393201421, "grad_norm": 2.578125, "learning_rate": 8.86650707212484e-06, "loss": 0.99648724, "memory(GiB)": 369.4, "step": 20315, "train_speed(iter/s)": 0.201062 }, { "acc": 0.74284086, "epoch": 0.5154743784880771, "grad_norm": 2.09375, "learning_rate": 8.865842119226693e-06, "loss": 1.01321249, "memory(GiB)": 369.4, "step": 20320, "train_speed(iter/s)": 0.201072 }, { "acc": 0.73456917, "epoch": 0.5156012176560122, "grad_norm": 2.453125, "learning_rate": 8.86517699629155e-06, "loss": 1.07117653, "memory(GiB)": 369.4, "step": 20325, "train_speed(iter/s)": 0.201081 }, { "acc": 0.74400148, "epoch": 0.5157280568239473, "grad_norm": 1.875, "learning_rate": 8.864511703348666e-06, "loss": 1.07152939, "memory(GiB)": 369.4, "step": 20330, "train_speed(iter/s)": 0.201086 }, { "acc": 0.73933887, "epoch": 0.5158548959918823, "grad_norm": 2.15625, "learning_rate": 8.863846240427306e-06, "loss": 1.04598579, "memory(GiB)": 369.4, "step": 20335, "train_speed(iter/s)": 0.201094 }, { "acc": 0.74893165, "epoch": 0.5159817351598174, "grad_norm": 2.109375, "learning_rate": 8.863180607556733e-06, "loss": 0.94964304, "memory(GiB)": 369.4, "step": 20340, "train_speed(iter/s)": 0.201088 }, { "acc": 0.75016174, "epoch": 0.5161085743277524, "grad_norm": 2.296875, "learning_rate": 8.862514804766234e-06, "loss": 0.98038464, "memory(GiB)": 369.4, "step": 20345, "train_speed(iter/s)": 0.201098 }, { "acc": 0.74168911, "epoch": 0.5162354134956875, "grad_norm": 2.203125, "learning_rate": 8.861848832085084e-06, "loss": 1.04507446, "memory(GiB)": 369.4, "step": 20350, "train_speed(iter/s)": 0.201098 }, { "acc": 0.74317799, "epoch": 0.5163622526636226, "grad_norm": 2.125, "learning_rate": 8.861182689542585e-06, "loss": 0.99489937, "memory(GiB)": 369.4, "step": 20355, "train_speed(iter/s)": 0.201101 }, { "acc": 0.75168877, "epoch": 0.5164890918315576, "grad_norm": 2.15625, "learning_rate": 8.86051637716803e-06, "loss": 0.95812683, "memory(GiB)": 369.4, "step": 20360, "train_speed(iter/s)": 0.201108 }, { "acc": 0.7411026, "epoch": 0.5166159309994927, "grad_norm": 2.125, "learning_rate": 8.859849894990728e-06, "loss": 1.05642509, "memory(GiB)": 369.4, "step": 20365, "train_speed(iter/s)": 0.201117 }, { "acc": 0.74344835, "epoch": 0.5167427701674278, "grad_norm": 2.234375, "learning_rate": 8.859183243039995e-06, "loss": 1.04597006, "memory(GiB)": 369.4, "step": 20370, "train_speed(iter/s)": 0.201123 }, { "acc": 0.72522278, "epoch": 0.5168696093353627, "grad_norm": 2.328125, "learning_rate": 8.858516421345154e-06, "loss": 1.05184422, "memory(GiB)": 369.4, "step": 20375, "train_speed(iter/s)": 0.201132 }, { "acc": 0.75239639, "epoch": 0.5169964485032978, "grad_norm": 2.1875, "learning_rate": 8.857849429935534e-06, "loss": 1.0243576, "memory(GiB)": 369.4, "step": 20380, "train_speed(iter/s)": 0.201138 }, { "acc": 0.74721909, "epoch": 0.5171232876712328, "grad_norm": 2.453125, "learning_rate": 8.857182268840471e-06, "loss": 1.02453194, "memory(GiB)": 369.4, "step": 20385, "train_speed(iter/s)": 0.201145 }, { "acc": 0.74950294, "epoch": 0.5172501268391679, "grad_norm": 2.015625, "learning_rate": 8.856514938089312e-06, "loss": 0.9886425, "memory(GiB)": 369.4, "step": 20390, "train_speed(iter/s)": 0.20115 }, { "acc": 0.73804569, "epoch": 0.517376966007103, "grad_norm": 2.21875, "learning_rate": 8.855847437711407e-06, "loss": 1.0715024, "memory(GiB)": 369.4, "step": 20395, "train_speed(iter/s)": 0.20116 }, { "acc": 0.74479184, "epoch": 0.517503805175038, "grad_norm": 2.546875, "learning_rate": 8.855179767736117e-06, "loss": 1.01634846, "memory(GiB)": 369.4, "step": 20400, "train_speed(iter/s)": 0.201166 }, { "acc": 0.74737053, "epoch": 0.5176306443429731, "grad_norm": 2.21875, "learning_rate": 8.854511928192808e-06, "loss": 1.00524483, "memory(GiB)": 369.4, "step": 20405, "train_speed(iter/s)": 0.201173 }, { "acc": 0.74439459, "epoch": 0.5177574835109082, "grad_norm": 2.34375, "learning_rate": 8.853843919110856e-06, "loss": 0.96034164, "memory(GiB)": 369.4, "step": 20410, "train_speed(iter/s)": 0.201176 }, { "acc": 0.7399313, "epoch": 0.5178843226788432, "grad_norm": 2.4375, "learning_rate": 8.853175740519642e-06, "loss": 1.07944164, "memory(GiB)": 369.4, "step": 20415, "train_speed(iter/s)": 0.201185 }, { "acc": 0.74695401, "epoch": 0.5180111618467783, "grad_norm": 2.1875, "learning_rate": 8.852507392448555e-06, "loss": 1.01167221, "memory(GiB)": 369.4, "step": 20420, "train_speed(iter/s)": 0.201192 }, { "acc": 0.74163675, "epoch": 0.5181380010147133, "grad_norm": 2.4375, "learning_rate": 8.851838874926994e-06, "loss": 1.02227135, "memory(GiB)": 369.4, "step": 20425, "train_speed(iter/s)": 0.201199 }, { "acc": 0.74397244, "epoch": 0.5182648401826484, "grad_norm": 2.578125, "learning_rate": 8.851170187984362e-06, "loss": 0.97979259, "memory(GiB)": 369.4, "step": 20430, "train_speed(iter/s)": 0.201204 }, { "acc": 0.74418697, "epoch": 0.5183916793505835, "grad_norm": 2.21875, "learning_rate": 8.850501331650069e-06, "loss": 0.99713049, "memory(GiB)": 369.4, "step": 20435, "train_speed(iter/s)": 0.201208 }, { "acc": 0.7450254, "epoch": 0.5185185185185185, "grad_norm": 2.1875, "learning_rate": 8.849832305953536e-06, "loss": 1.05379896, "memory(GiB)": 369.4, "step": 20440, "train_speed(iter/s)": 0.201216 }, { "acc": 0.74158754, "epoch": 0.5186453576864536, "grad_norm": 1.84375, "learning_rate": 8.849163110924193e-06, "loss": 1.02511635, "memory(GiB)": 369.4, "step": 20445, "train_speed(iter/s)": 0.201224 }, { "acc": 0.73419056, "epoch": 0.5187721968543887, "grad_norm": 2.078125, "learning_rate": 8.84849374659147e-06, "loss": 1.03830223, "memory(GiB)": 369.4, "step": 20450, "train_speed(iter/s)": 0.201223 }, { "acc": 0.75604305, "epoch": 0.5188990360223237, "grad_norm": 1.8125, "learning_rate": 8.847824212984807e-06, "loss": 0.93840342, "memory(GiB)": 369.4, "step": 20455, "train_speed(iter/s)": 0.201235 }, { "acc": 0.75313411, "epoch": 0.5190258751902588, "grad_norm": 2.0625, "learning_rate": 8.84715451013366e-06, "loss": 0.92491779, "memory(GiB)": 369.4, "step": 20460, "train_speed(iter/s)": 0.201238 }, { "acc": 0.7537796, "epoch": 0.5191527143581938, "grad_norm": 2.484375, "learning_rate": 8.846484638067478e-06, "loss": 1.01107788, "memory(GiB)": 369.4, "step": 20465, "train_speed(iter/s)": 0.201243 }, { "acc": 0.74318051, "epoch": 0.5192795535261289, "grad_norm": 2.0, "learning_rate": 8.84581459681573e-06, "loss": 1.03205395, "memory(GiB)": 369.4, "step": 20470, "train_speed(iter/s)": 0.201252 }, { "acc": 0.75221, "epoch": 0.519406392694064, "grad_norm": 2.578125, "learning_rate": 8.845144386407884e-06, "loss": 1.03394451, "memory(GiB)": 369.4, "step": 20475, "train_speed(iter/s)": 0.201257 }, { "acc": 0.75545201, "epoch": 0.519533231861999, "grad_norm": 2.015625, "learning_rate": 8.844474006873422e-06, "loss": 1.02902069, "memory(GiB)": 369.4, "step": 20480, "train_speed(iter/s)": 0.201264 }, { "acc": 0.75847149, "epoch": 0.5196600710299341, "grad_norm": 2.515625, "learning_rate": 8.84380345824183e-06, "loss": 0.95807953, "memory(GiB)": 369.4, "step": 20485, "train_speed(iter/s)": 0.201271 }, { "acc": 0.74370337, "epoch": 0.5197869101978692, "grad_norm": 2.0625, "learning_rate": 8.843132740542599e-06, "loss": 1.03920231, "memory(GiB)": 369.4, "step": 20490, "train_speed(iter/s)": 0.20128 }, { "acc": 0.75299473, "epoch": 0.5199137493658041, "grad_norm": 2.390625, "learning_rate": 8.842461853805232e-06, "loss": 0.95777893, "memory(GiB)": 369.4, "step": 20495, "train_speed(iter/s)": 0.201289 }, { "acc": 0.75175385, "epoch": 0.5200405885337392, "grad_norm": 2.09375, "learning_rate": 8.841790798059237e-06, "loss": 0.99995537, "memory(GiB)": 369.4, "step": 20500, "train_speed(iter/s)": 0.201298 }, { "acc": 0.76145287, "epoch": 0.5201674277016742, "grad_norm": 2.59375, "learning_rate": 8.84111957333413e-06, "loss": 0.94745483, "memory(GiB)": 369.4, "step": 20505, "train_speed(iter/s)": 0.201305 }, { "acc": 0.72738872, "epoch": 0.5202942668696093, "grad_norm": 2.265625, "learning_rate": 8.840448179659436e-06, "loss": 1.1256072, "memory(GiB)": 369.4, "step": 20510, "train_speed(iter/s)": 0.201311 }, { "acc": 0.73880978, "epoch": 0.5204211060375444, "grad_norm": 2.0, "learning_rate": 8.839776617064683e-06, "loss": 1.02100029, "memory(GiB)": 369.4, "step": 20515, "train_speed(iter/s)": 0.201316 }, { "acc": 0.73968921, "epoch": 0.5205479452054794, "grad_norm": 1.9609375, "learning_rate": 8.839104885579413e-06, "loss": 1.00496941, "memory(GiB)": 369.4, "step": 20520, "train_speed(iter/s)": 0.201327 }, { "acc": 0.74010472, "epoch": 0.5206747843734145, "grad_norm": 2.5625, "learning_rate": 8.83843298523317e-06, "loss": 1.05104427, "memory(GiB)": 369.4, "step": 20525, "train_speed(iter/s)": 0.201334 }, { "acc": 0.74186268, "epoch": 0.5208016235413496, "grad_norm": 2.34375, "learning_rate": 8.837760916055505e-06, "loss": 1.02570152, "memory(GiB)": 369.4, "step": 20530, "train_speed(iter/s)": 0.201341 }, { "acc": 0.73014412, "epoch": 0.5209284627092846, "grad_norm": 2.125, "learning_rate": 8.837088678075983e-06, "loss": 1.10374069, "memory(GiB)": 369.4, "step": 20535, "train_speed(iter/s)": 0.201348 }, { "acc": 0.74269671, "epoch": 0.5210553018772197, "grad_norm": 1.921875, "learning_rate": 8.836416271324166e-06, "loss": 1.05161896, "memory(GiB)": 369.4, "step": 20540, "train_speed(iter/s)": 0.201355 }, { "acc": 0.75490928, "epoch": 0.5211821410451547, "grad_norm": 1.8359375, "learning_rate": 8.835743695829635e-06, "loss": 0.97318439, "memory(GiB)": 369.4, "step": 20545, "train_speed(iter/s)": 0.201364 }, { "acc": 0.74272861, "epoch": 0.5213089802130898, "grad_norm": 2.546875, "learning_rate": 8.835070951621971e-06, "loss": 1.02659187, "memory(GiB)": 369.4, "step": 20550, "train_speed(iter/s)": 0.201374 }, { "acc": 0.73856521, "epoch": 0.5214358193810249, "grad_norm": 2.0, "learning_rate": 8.834398038730765e-06, "loss": 1.00815582, "memory(GiB)": 369.4, "step": 20555, "train_speed(iter/s)": 0.201381 }, { "acc": 0.77260237, "epoch": 0.5215626585489599, "grad_norm": 2.453125, "learning_rate": 8.833724957185612e-06, "loss": 0.89986229, "memory(GiB)": 369.4, "step": 20560, "train_speed(iter/s)": 0.201388 }, { "acc": 0.74024115, "epoch": 0.521689497716895, "grad_norm": 2.515625, "learning_rate": 8.83305170701612e-06, "loss": 1.07068081, "memory(GiB)": 369.4, "step": 20565, "train_speed(iter/s)": 0.201399 }, { "acc": 0.74947672, "epoch": 0.5218163368848301, "grad_norm": 2.234375, "learning_rate": 8.832378288251902e-06, "loss": 1.03115768, "memory(GiB)": 369.4, "step": 20570, "train_speed(iter/s)": 0.20141 }, { "acc": 0.74071875, "epoch": 0.5219431760527651, "grad_norm": 2.234375, "learning_rate": 8.831704700922574e-06, "loss": 1.02681656, "memory(GiB)": 369.4, "step": 20575, "train_speed(iter/s)": 0.201416 }, { "acc": 0.74849548, "epoch": 0.5220700152207002, "grad_norm": 2.25, "learning_rate": 8.831030945057767e-06, "loss": 1.02324944, "memory(GiB)": 369.4, "step": 20580, "train_speed(iter/s)": 0.201423 }, { "acc": 0.76357751, "epoch": 0.5221968543886352, "grad_norm": 1.9140625, "learning_rate": 8.830357020687115e-06, "loss": 0.93204756, "memory(GiB)": 369.4, "step": 20585, "train_speed(iter/s)": 0.201428 }, { "acc": 0.75702457, "epoch": 0.5223236935565703, "grad_norm": 2.09375, "learning_rate": 8.82968292784026e-06, "loss": 1.00496588, "memory(GiB)": 369.4, "step": 20590, "train_speed(iter/s)": 0.201436 }, { "acc": 0.74478827, "epoch": 0.5224505327245054, "grad_norm": 2.65625, "learning_rate": 8.82900866654685e-06, "loss": 1.03701, "memory(GiB)": 369.4, "step": 20595, "train_speed(iter/s)": 0.201445 }, { "acc": 0.74873142, "epoch": 0.5225773718924404, "grad_norm": 2.015625, "learning_rate": 8.828334236836546e-06, "loss": 1.03319912, "memory(GiB)": 369.4, "step": 20600, "train_speed(iter/s)": 0.201449 }, { "acc": 0.73905087, "epoch": 0.5227042110603755, "grad_norm": 2.15625, "learning_rate": 8.827659638739007e-06, "loss": 1.03802643, "memory(GiB)": 369.4, "step": 20605, "train_speed(iter/s)": 0.201456 }, { "acc": 0.7511539, "epoch": 0.5228310502283106, "grad_norm": 2.078125, "learning_rate": 8.82698487228391e-06, "loss": 0.99306107, "memory(GiB)": 369.4, "step": 20610, "train_speed(iter/s)": 0.201464 }, { "acc": 0.74002004, "epoch": 0.5229578893962455, "grad_norm": 2.03125, "learning_rate": 8.826309937500932e-06, "loss": 1.00814514, "memory(GiB)": 369.4, "step": 20615, "train_speed(iter/s)": 0.20147 }, { "acc": 0.73574934, "epoch": 0.5230847285641806, "grad_norm": 2.109375, "learning_rate": 8.825634834419758e-06, "loss": 1.07460899, "memory(GiB)": 369.4, "step": 20620, "train_speed(iter/s)": 0.201477 }, { "acc": 0.74833817, "epoch": 0.5232115677321156, "grad_norm": 2.71875, "learning_rate": 8.824959563070085e-06, "loss": 1.00682764, "memory(GiB)": 369.4, "step": 20625, "train_speed(iter/s)": 0.201486 }, { "acc": 0.75031519, "epoch": 0.5233384069000507, "grad_norm": 2.0625, "learning_rate": 8.824284123481614e-06, "loss": 0.95642929, "memory(GiB)": 369.4, "step": 20630, "train_speed(iter/s)": 0.201495 }, { "acc": 0.74396701, "epoch": 0.5234652460679858, "grad_norm": 2.234375, "learning_rate": 8.823608515684053e-06, "loss": 1.03864784, "memory(GiB)": 369.4, "step": 20635, "train_speed(iter/s)": 0.201502 }, { "acc": 0.73681135, "epoch": 0.5235920852359208, "grad_norm": 1.7890625, "learning_rate": 8.822932739707118e-06, "loss": 1.06943054, "memory(GiB)": 369.4, "step": 20640, "train_speed(iter/s)": 0.201511 }, { "acc": 0.74341168, "epoch": 0.5237189244038559, "grad_norm": 2.140625, "learning_rate": 8.822256795580532e-06, "loss": 1.04311867, "memory(GiB)": 369.4, "step": 20645, "train_speed(iter/s)": 0.201517 }, { "acc": 0.75483923, "epoch": 0.523845763571791, "grad_norm": 2.328125, "learning_rate": 8.821580683334027e-06, "loss": 1.00683603, "memory(GiB)": 369.4, "step": 20650, "train_speed(iter/s)": 0.201524 }, { "acc": 0.7426939, "epoch": 0.523972602739726, "grad_norm": 2.734375, "learning_rate": 8.820904402997343e-06, "loss": 0.98187199, "memory(GiB)": 369.4, "step": 20655, "train_speed(iter/s)": 0.201532 }, { "acc": 0.74709139, "epoch": 0.5240994419076611, "grad_norm": 2.0, "learning_rate": 8.820227954600222e-06, "loss": 0.98418236, "memory(GiB)": 369.4, "step": 20660, "train_speed(iter/s)": 0.201534 }, { "acc": 0.72886906, "epoch": 0.5242262810755961, "grad_norm": 2.140625, "learning_rate": 8.819551338172421e-06, "loss": 1.09851618, "memory(GiB)": 369.4, "step": 20665, "train_speed(iter/s)": 0.201537 }, { "acc": 0.74697323, "epoch": 0.5243531202435312, "grad_norm": 1.875, "learning_rate": 8.8188745537437e-06, "loss": 1.02470379, "memory(GiB)": 369.4, "step": 20670, "train_speed(iter/s)": 0.201541 }, { "acc": 0.74931669, "epoch": 0.5244799594114663, "grad_norm": 2.046875, "learning_rate": 8.818197601343822e-06, "loss": 0.98038187, "memory(GiB)": 369.4, "step": 20675, "train_speed(iter/s)": 0.201547 }, { "acc": 0.74354916, "epoch": 0.5246067985794013, "grad_norm": 2.734375, "learning_rate": 8.81752048100257e-06, "loss": 0.98766842, "memory(GiB)": 369.4, "step": 20680, "train_speed(iter/s)": 0.201555 }, { "acc": 0.72427316, "epoch": 0.5247336377473364, "grad_norm": 2.296875, "learning_rate": 8.816843192749724e-06, "loss": 1.0840807, "memory(GiB)": 369.4, "step": 20685, "train_speed(iter/s)": 0.201561 }, { "acc": 0.74355774, "epoch": 0.5248604769152715, "grad_norm": 2.34375, "learning_rate": 8.816165736615072e-06, "loss": 1.00473557, "memory(GiB)": 369.4, "step": 20690, "train_speed(iter/s)": 0.201567 }, { "acc": 0.75339723, "epoch": 0.5249873160832065, "grad_norm": 2.640625, "learning_rate": 8.815488112628412e-06, "loss": 1.00363426, "memory(GiB)": 369.4, "step": 20695, "train_speed(iter/s)": 0.201578 }, { "acc": 0.75966148, "epoch": 0.5251141552511416, "grad_norm": 2.296875, "learning_rate": 8.814810320819551e-06, "loss": 0.97035608, "memory(GiB)": 369.4, "step": 20700, "train_speed(iter/s)": 0.201584 }, { "acc": 0.74923935, "epoch": 0.5252409944190766, "grad_norm": 2.203125, "learning_rate": 8.8141323612183e-06, "loss": 1.00768347, "memory(GiB)": 369.4, "step": 20705, "train_speed(iter/s)": 0.201591 }, { "acc": 0.74302979, "epoch": 0.5253678335870117, "grad_norm": 2.578125, "learning_rate": 8.813454233854479e-06, "loss": 1.05633831, "memory(GiB)": 369.4, "step": 20710, "train_speed(iter/s)": 0.201596 }, { "acc": 0.74924555, "epoch": 0.5254946727549468, "grad_norm": 2.015625, "learning_rate": 8.812775938757914e-06, "loss": 0.98341141, "memory(GiB)": 369.4, "step": 20715, "train_speed(iter/s)": 0.201605 }, { "acc": 0.75758686, "epoch": 0.5256215119228818, "grad_norm": 2.40625, "learning_rate": 8.812097475958442e-06, "loss": 0.97803736, "memory(GiB)": 369.4, "step": 20720, "train_speed(iter/s)": 0.201612 }, { "acc": 0.73654575, "epoch": 0.5257483510908169, "grad_norm": 2.296875, "learning_rate": 8.811418845485902e-06, "loss": 1.04011478, "memory(GiB)": 369.4, "step": 20725, "train_speed(iter/s)": 0.201622 }, { "acc": 0.7551383, "epoch": 0.525875190258752, "grad_norm": 2.265625, "learning_rate": 8.810740047370146e-06, "loss": 1.02959747, "memory(GiB)": 369.4, "step": 20730, "train_speed(iter/s)": 0.201624 }, { "acc": 0.7561471, "epoch": 0.526002029426687, "grad_norm": 2.171875, "learning_rate": 8.810061081641026e-06, "loss": 1.0017312, "memory(GiB)": 369.4, "step": 20735, "train_speed(iter/s)": 0.201634 }, { "acc": 0.75058866, "epoch": 0.526128868594622, "grad_norm": 1.765625, "learning_rate": 8.809381948328412e-06, "loss": 1.00737648, "memory(GiB)": 369.4, "step": 20740, "train_speed(iter/s)": 0.201642 }, { "acc": 0.74639778, "epoch": 0.526255707762557, "grad_norm": 2.046875, "learning_rate": 8.80870264746217e-06, "loss": 1.03497295, "memory(GiB)": 369.4, "step": 20745, "train_speed(iter/s)": 0.20165 }, { "acc": 0.74341516, "epoch": 0.5263825469304921, "grad_norm": 1.9140625, "learning_rate": 8.808023179072183e-06, "loss": 1.00651207, "memory(GiB)": 369.4, "step": 20750, "train_speed(iter/s)": 0.201655 }, { "acc": 0.74319153, "epoch": 0.5265093860984272, "grad_norm": 2.078125, "learning_rate": 8.807343543188333e-06, "loss": 1.06218128, "memory(GiB)": 369.4, "step": 20755, "train_speed(iter/s)": 0.201663 }, { "acc": 0.75583696, "epoch": 0.5266362252663622, "grad_norm": 2.4375, "learning_rate": 8.806663739840515e-06, "loss": 0.98813429, "memory(GiB)": 369.4, "step": 20760, "train_speed(iter/s)": 0.201672 }, { "acc": 0.73981485, "epoch": 0.5267630644342973, "grad_norm": 2.3125, "learning_rate": 8.805983769058633e-06, "loss": 1.02614889, "memory(GiB)": 369.4, "step": 20765, "train_speed(iter/s)": 0.201675 }, { "acc": 0.75570803, "epoch": 0.5268899036022324, "grad_norm": 2.3125, "learning_rate": 8.80530363087259e-06, "loss": 0.98353291, "memory(GiB)": 369.4, "step": 20770, "train_speed(iter/s)": 0.20168 }, { "acc": 0.73574381, "epoch": 0.5270167427701674, "grad_norm": 1.9765625, "learning_rate": 8.804623325312305e-06, "loss": 1.07624798, "memory(GiB)": 369.4, "step": 20775, "train_speed(iter/s)": 0.201688 }, { "acc": 0.73864746, "epoch": 0.5271435819381025, "grad_norm": 2.046875, "learning_rate": 8.8039428524077e-06, "loss": 1.02784214, "memory(GiB)": 369.4, "step": 20780, "train_speed(iter/s)": 0.201694 }, { "acc": 0.74757538, "epoch": 0.5272704211060375, "grad_norm": 2.359375, "learning_rate": 8.803262212188703e-06, "loss": 1.09340992, "memory(GiB)": 369.4, "step": 20785, "train_speed(iter/s)": 0.201699 }, { "acc": 0.74639473, "epoch": 0.5273972602739726, "grad_norm": 2.15625, "learning_rate": 8.802581404685255e-06, "loss": 1.01162634, "memory(GiB)": 369.4, "step": 20790, "train_speed(iter/s)": 0.201707 }, { "acc": 0.72599869, "epoch": 0.5275240994419077, "grad_norm": 2.234375, "learning_rate": 8.8019004299273e-06, "loss": 1.05597458, "memory(GiB)": 369.4, "step": 20795, "train_speed(iter/s)": 0.201715 }, { "acc": 0.7407846, "epoch": 0.5276509386098427, "grad_norm": 2.53125, "learning_rate": 8.801219287944788e-06, "loss": 1.02706947, "memory(GiB)": 369.4, "step": 20800, "train_speed(iter/s)": 0.201721 }, { "acc": 0.74152575, "epoch": 0.5277777777777778, "grad_norm": 2.453125, "learning_rate": 8.800537978767683e-06, "loss": 1.00489464, "memory(GiB)": 369.4, "step": 20805, "train_speed(iter/s)": 0.201732 }, { "acc": 0.7566731, "epoch": 0.5279046169457129, "grad_norm": 2.515625, "learning_rate": 8.79985650242595e-06, "loss": 0.99703617, "memory(GiB)": 369.4, "step": 20810, "train_speed(iter/s)": 0.201739 }, { "acc": 0.73392606, "epoch": 0.5280314561136479, "grad_norm": 1.921875, "learning_rate": 8.79917485894956e-06, "loss": 1.04274807, "memory(GiB)": 369.4, "step": 20815, "train_speed(iter/s)": 0.201743 }, { "acc": 0.73962922, "epoch": 0.528158295281583, "grad_norm": 1.9609375, "learning_rate": 8.798493048368498e-06, "loss": 1.02699184, "memory(GiB)": 369.4, "step": 20820, "train_speed(iter/s)": 0.201752 }, { "acc": 0.74649334, "epoch": 0.528285134449518, "grad_norm": 1.8828125, "learning_rate": 8.797811070712754e-06, "loss": 0.99037952, "memory(GiB)": 369.4, "step": 20825, "train_speed(iter/s)": 0.201757 }, { "acc": 0.73986177, "epoch": 0.5284119736174531, "grad_norm": 2.046875, "learning_rate": 8.797128926012323e-06, "loss": 1.07075062, "memory(GiB)": 369.4, "step": 20830, "train_speed(iter/s)": 0.201765 }, { "acc": 0.73788023, "epoch": 0.5285388127853882, "grad_norm": 2.5625, "learning_rate": 8.796446614297208e-06, "loss": 0.9826128, "memory(GiB)": 369.4, "step": 20835, "train_speed(iter/s)": 0.201775 }, { "acc": 0.74199905, "epoch": 0.5286656519533232, "grad_norm": 2.328125, "learning_rate": 8.795764135597421e-06, "loss": 1.03029709, "memory(GiB)": 369.4, "step": 20840, "train_speed(iter/s)": 0.201781 }, { "acc": 0.75189829, "epoch": 0.5287924911212583, "grad_norm": 1.8984375, "learning_rate": 8.79508148994298e-06, "loss": 1.00599489, "memory(GiB)": 369.4, "step": 20845, "train_speed(iter/s)": 0.201788 }, { "acc": 0.74307957, "epoch": 0.5289193302891934, "grad_norm": 2.203125, "learning_rate": 8.794398677363913e-06, "loss": 1.01467323, "memory(GiB)": 369.4, "step": 20850, "train_speed(iter/s)": 0.201798 }, { "acc": 0.75748968, "epoch": 0.5290461694571283, "grad_norm": 1.9765625, "learning_rate": 8.79371569789025e-06, "loss": 0.98391724, "memory(GiB)": 369.4, "step": 20855, "train_speed(iter/s)": 0.201803 }, { "acc": 0.73922167, "epoch": 0.5291730086250634, "grad_norm": 2.5, "learning_rate": 8.793032551552034e-06, "loss": 1.02211628, "memory(GiB)": 369.4, "step": 20860, "train_speed(iter/s)": 0.201806 }, { "acc": 0.73193159, "epoch": 0.5292998477929984, "grad_norm": 2.125, "learning_rate": 8.792349238379311e-06, "loss": 1.06032658, "memory(GiB)": 369.4, "step": 20865, "train_speed(iter/s)": 0.201811 }, { "acc": 0.73293371, "epoch": 0.5294266869609335, "grad_norm": 2.21875, "learning_rate": 8.791665758402137e-06, "loss": 1.09486933, "memory(GiB)": 369.4, "step": 20870, "train_speed(iter/s)": 0.201817 }, { "acc": 0.75614257, "epoch": 0.5295535261288686, "grad_norm": 2.1875, "learning_rate": 8.790982111650574e-06, "loss": 1.01305065, "memory(GiB)": 369.4, "step": 20875, "train_speed(iter/s)": 0.201823 }, { "acc": 0.74007225, "epoch": 0.5296803652968036, "grad_norm": 2.125, "learning_rate": 8.790298298154694e-06, "loss": 1.0024004, "memory(GiB)": 369.4, "step": 20880, "train_speed(iter/s)": 0.201828 }, { "acc": 0.73621187, "epoch": 0.5298072044647387, "grad_norm": 1.9296875, "learning_rate": 8.78961431794457e-06, "loss": 1.05160809, "memory(GiB)": 369.4, "step": 20885, "train_speed(iter/s)": 0.201835 }, { "acc": 0.7468863, "epoch": 0.5299340436326738, "grad_norm": 2.078125, "learning_rate": 8.78893017105029e-06, "loss": 1.03284225, "memory(GiB)": 369.4, "step": 20890, "train_speed(iter/s)": 0.20184 }, { "acc": 0.75133185, "epoch": 0.5300608828006088, "grad_norm": 2.21875, "learning_rate": 8.788245857501944e-06, "loss": 1.00079432, "memory(GiB)": 369.4, "step": 20895, "train_speed(iter/s)": 0.201849 }, { "acc": 0.73288345, "epoch": 0.5301877219685439, "grad_norm": 2.140625, "learning_rate": 8.787561377329633e-06, "loss": 1.07512379, "memory(GiB)": 369.4, "step": 20900, "train_speed(iter/s)": 0.201855 }, { "acc": 0.75042791, "epoch": 0.5303145611364789, "grad_norm": 2.09375, "learning_rate": 8.786876730563462e-06, "loss": 1.01529093, "memory(GiB)": 369.4, "step": 20905, "train_speed(iter/s)": 0.201862 }, { "acc": 0.7445756, "epoch": 0.530441400304414, "grad_norm": 2.59375, "learning_rate": 8.786191917233545e-06, "loss": 0.99455624, "memory(GiB)": 369.4, "step": 20910, "train_speed(iter/s)": 0.201868 }, { "acc": 0.74867477, "epoch": 0.5305682394723491, "grad_norm": 2.28125, "learning_rate": 8.785506937370003e-06, "loss": 0.98369646, "memory(GiB)": 369.4, "step": 20915, "train_speed(iter/s)": 0.201873 }, { "acc": 0.75147424, "epoch": 0.5306950786402841, "grad_norm": 2.140625, "learning_rate": 8.784821791002965e-06, "loss": 1.0056118, "memory(GiB)": 369.4, "step": 20920, "train_speed(iter/s)": 0.201879 }, { "acc": 0.74356976, "epoch": 0.5308219178082192, "grad_norm": 2.171875, "learning_rate": 8.784136478162567e-06, "loss": 1.04782495, "memory(GiB)": 369.4, "step": 20925, "train_speed(iter/s)": 0.20189 }, { "acc": 0.75456047, "epoch": 0.5309487569761543, "grad_norm": 2.25, "learning_rate": 8.783450998878951e-06, "loss": 0.97943554, "memory(GiB)": 369.4, "step": 20930, "train_speed(iter/s)": 0.201897 }, { "acc": 0.74292812, "epoch": 0.5310755961440893, "grad_norm": 2.203125, "learning_rate": 8.78276535318227e-06, "loss": 1.01874886, "memory(GiB)": 369.4, "step": 20935, "train_speed(iter/s)": 0.201902 }, { "acc": 0.72776604, "epoch": 0.5312024353120244, "grad_norm": 2.484375, "learning_rate": 8.782079541102678e-06, "loss": 1.09766197, "memory(GiB)": 369.4, "step": 20940, "train_speed(iter/s)": 0.201908 }, { "acc": 0.75004358, "epoch": 0.5313292744799594, "grad_norm": 1.96875, "learning_rate": 8.781393562670342e-06, "loss": 0.99597092, "memory(GiB)": 369.4, "step": 20945, "train_speed(iter/s)": 0.201915 }, { "acc": 0.73208447, "epoch": 0.5314561136478945, "grad_norm": 2.03125, "learning_rate": 8.780707417915436e-06, "loss": 1.05183125, "memory(GiB)": 369.4, "step": 20950, "train_speed(iter/s)": 0.201923 }, { "acc": 0.75562134, "epoch": 0.5315829528158296, "grad_norm": 1.84375, "learning_rate": 8.780021106868138e-06, "loss": 0.91724377, "memory(GiB)": 369.4, "step": 20955, "train_speed(iter/s)": 0.201928 }, { "acc": 0.73819194, "epoch": 0.5317097919837646, "grad_norm": 2.359375, "learning_rate": 8.779334629558633e-06, "loss": 1.0528183, "memory(GiB)": 369.4, "step": 20960, "train_speed(iter/s)": 0.201933 }, { "acc": 0.74605823, "epoch": 0.5318366311516997, "grad_norm": 2.828125, "learning_rate": 8.77864798601712e-06, "loss": 1.02040348, "memory(GiB)": 369.4, "step": 20965, "train_speed(iter/s)": 0.201938 }, { "acc": 0.74332423, "epoch": 0.5319634703196348, "grad_norm": 2.203125, "learning_rate": 8.777961176273795e-06, "loss": 1.06013613, "memory(GiB)": 369.4, "step": 20970, "train_speed(iter/s)": 0.201944 }, { "acc": 0.74841838, "epoch": 0.5320903094875697, "grad_norm": 3.15625, "learning_rate": 8.777274200358873e-06, "loss": 1.02675667, "memory(GiB)": 369.4, "step": 20975, "train_speed(iter/s)": 0.201954 }, { "acc": 0.7556078, "epoch": 0.5322171486555048, "grad_norm": 2.25, "learning_rate": 8.776587058302566e-06, "loss": 1.00752878, "memory(GiB)": 369.4, "step": 20980, "train_speed(iter/s)": 0.201961 }, { "acc": 0.74665518, "epoch": 0.5323439878234398, "grad_norm": 1.8984375, "learning_rate": 8.7758997501351e-06, "loss": 0.9976078, "memory(GiB)": 369.4, "step": 20985, "train_speed(iter/s)": 0.201965 }, { "acc": 0.76384773, "epoch": 0.5324708269913749, "grad_norm": 2.25, "learning_rate": 8.775212275886705e-06, "loss": 0.97822075, "memory(GiB)": 369.4, "step": 20990, "train_speed(iter/s)": 0.201969 }, { "acc": 0.73055143, "epoch": 0.53259766615931, "grad_norm": 2.5, "learning_rate": 8.774524635587617e-06, "loss": 1.05275307, "memory(GiB)": 369.4, "step": 20995, "train_speed(iter/s)": 0.201975 }, { "acc": 0.74401617, "epoch": 0.532724505327245, "grad_norm": 2.125, "learning_rate": 8.773836829268084e-06, "loss": 0.99862556, "memory(GiB)": 369.4, "step": 21000, "train_speed(iter/s)": 0.201983 }, { "epoch": 0.532724505327245, "eval_acc": 0.735027324070875, "eval_loss": 0.9840785264968872, "eval_runtime": 384.7094, "eval_samples_per_second": 16.558, "eval_steps_per_second": 8.279, "step": 21000 }, { "acc": 0.74317107, "epoch": 0.5328513444951801, "grad_norm": 2.109375, "learning_rate": 8.77314885695836e-06, "loss": 1.00565357, "memory(GiB)": 369.4, "step": 21005, "train_speed(iter/s)": 0.200619 }, { "acc": 0.74845619, "epoch": 0.5329781836631152, "grad_norm": 2.125, "learning_rate": 8.772460718688702e-06, "loss": 0.9864768, "memory(GiB)": 369.4, "step": 21010, "train_speed(iter/s)": 0.20062 }, { "acc": 0.75301309, "epoch": 0.5331050228310502, "grad_norm": 2.4375, "learning_rate": 8.771772414489379e-06, "loss": 0.97919369, "memory(GiB)": 369.4, "step": 21015, "train_speed(iter/s)": 0.200629 }, { "acc": 0.73536692, "epoch": 0.5332318619989853, "grad_norm": 2.34375, "learning_rate": 8.771083944390665e-06, "loss": 1.06671371, "memory(GiB)": 369.4, "step": 21020, "train_speed(iter/s)": 0.200639 }, { "acc": 0.73180494, "epoch": 0.5333587011669203, "grad_norm": 1.9765625, "learning_rate": 8.770395308422842e-06, "loss": 1.05411081, "memory(GiB)": 369.4, "step": 21025, "train_speed(iter/s)": 0.200644 }, { "acc": 0.75502167, "epoch": 0.5334855403348554, "grad_norm": 2.40625, "learning_rate": 8.769706506616201e-06, "loss": 1.02299881, "memory(GiB)": 369.4, "step": 21030, "train_speed(iter/s)": 0.200653 }, { "acc": 0.75489035, "epoch": 0.5336123795027905, "grad_norm": 1.7421875, "learning_rate": 8.769017539001037e-06, "loss": 0.96504402, "memory(GiB)": 369.4, "step": 21035, "train_speed(iter/s)": 0.200661 }, { "acc": 0.75108128, "epoch": 0.5337392186707255, "grad_norm": 2.546875, "learning_rate": 8.768328405607655e-06, "loss": 1.04070435, "memory(GiB)": 369.4, "step": 21040, "train_speed(iter/s)": 0.20067 }, { "acc": 0.74229727, "epoch": 0.5338660578386606, "grad_norm": 2.15625, "learning_rate": 8.767639106466364e-06, "loss": 1.08547268, "memory(GiB)": 369.4, "step": 21045, "train_speed(iter/s)": 0.200678 }, { "acc": 0.74753847, "epoch": 0.5339928970065957, "grad_norm": 1.921875, "learning_rate": 8.766949641607484e-06, "loss": 1.01524277, "memory(GiB)": 369.4, "step": 21050, "train_speed(iter/s)": 0.200686 }, { "acc": 0.75365791, "epoch": 0.5341197361745307, "grad_norm": 2.40625, "learning_rate": 8.76626001106134e-06, "loss": 1.02136259, "memory(GiB)": 369.4, "step": 21055, "train_speed(iter/s)": 0.200689 }, { "acc": 0.75944624, "epoch": 0.5342465753424658, "grad_norm": 2.203125, "learning_rate": 8.765570214858268e-06, "loss": 0.92689533, "memory(GiB)": 369.4, "step": 21060, "train_speed(iter/s)": 0.200696 }, { "acc": 0.73928823, "epoch": 0.5343734145104008, "grad_norm": 2.078125, "learning_rate": 8.764880253028604e-06, "loss": 1.02434731, "memory(GiB)": 369.4, "step": 21065, "train_speed(iter/s)": 0.200702 }, { "acc": 0.75004482, "epoch": 0.5345002536783359, "grad_norm": 1.6875, "learning_rate": 8.764190125602698e-06, "loss": 0.9694767, "memory(GiB)": 369.4, "step": 21070, "train_speed(iter/s)": 0.200707 }, { "acc": 0.7546114, "epoch": 0.534627092846271, "grad_norm": 2.453125, "learning_rate": 8.763499832610904e-06, "loss": 1.01268463, "memory(GiB)": 369.4, "step": 21075, "train_speed(iter/s)": 0.200709 }, { "acc": 0.75430136, "epoch": 0.534753932014206, "grad_norm": 1.9453125, "learning_rate": 8.762809374083585e-06, "loss": 0.98011322, "memory(GiB)": 369.4, "step": 21080, "train_speed(iter/s)": 0.200709 }, { "acc": 0.73797436, "epoch": 0.5348807711821411, "grad_norm": 2.28125, "learning_rate": 8.76211875005111e-06, "loss": 1.05283546, "memory(GiB)": 369.4, "step": 21085, "train_speed(iter/s)": 0.200718 }, { "acc": 0.76914825, "epoch": 0.5350076103500762, "grad_norm": 2.75, "learning_rate": 8.761427960543854e-06, "loss": 0.9787077, "memory(GiB)": 369.4, "step": 21090, "train_speed(iter/s)": 0.200724 }, { "acc": 0.74231873, "epoch": 0.5351344495180111, "grad_norm": 2.0625, "learning_rate": 8.760737005592205e-06, "loss": 1.01590729, "memory(GiB)": 369.4, "step": 21095, "train_speed(iter/s)": 0.20073 }, { "acc": 0.77320385, "epoch": 0.5352612886859462, "grad_norm": 1.9296875, "learning_rate": 8.760045885226551e-06, "loss": 0.98366661, "memory(GiB)": 369.4, "step": 21100, "train_speed(iter/s)": 0.200728 }, { "acc": 0.75575857, "epoch": 0.5353881278538812, "grad_norm": 2.375, "learning_rate": 8.759354599477293e-06, "loss": 1.04389229, "memory(GiB)": 369.4, "step": 21105, "train_speed(iter/s)": 0.200737 }, { "acc": 0.75681167, "epoch": 0.5355149670218163, "grad_norm": 2.015625, "learning_rate": 8.758663148374833e-06, "loss": 0.95308247, "memory(GiB)": 369.4, "step": 21110, "train_speed(iter/s)": 0.200746 }, { "acc": 0.74963102, "epoch": 0.5356418061897514, "grad_norm": 2.203125, "learning_rate": 8.757971531949587e-06, "loss": 1.03142662, "memory(GiB)": 369.4, "step": 21115, "train_speed(iter/s)": 0.200752 }, { "acc": 0.7547801, "epoch": 0.5357686453576864, "grad_norm": 1.8828125, "learning_rate": 8.757279750231977e-06, "loss": 0.96952057, "memory(GiB)": 369.4, "step": 21120, "train_speed(iter/s)": 0.20076 }, { "acc": 0.76084266, "epoch": 0.5358954845256215, "grad_norm": 1.9765625, "learning_rate": 8.756587803252426e-06, "loss": 0.95839329, "memory(GiB)": 369.4, "step": 21125, "train_speed(iter/s)": 0.200769 }, { "acc": 0.74456396, "epoch": 0.5360223236935566, "grad_norm": 2.109375, "learning_rate": 8.755895691041373e-06, "loss": 1.00802088, "memory(GiB)": 369.4, "step": 21130, "train_speed(iter/s)": 0.200773 }, { "acc": 0.75472736, "epoch": 0.5361491628614916, "grad_norm": 2.015625, "learning_rate": 8.755203413629257e-06, "loss": 1.02207165, "memory(GiB)": 369.4, "step": 21135, "train_speed(iter/s)": 0.200778 }, { "acc": 0.75071487, "epoch": 0.5362760020294267, "grad_norm": 2.015625, "learning_rate": 8.75451097104653e-06, "loss": 0.94844933, "memory(GiB)": 369.4, "step": 21140, "train_speed(iter/s)": 0.200785 }, { "acc": 0.74864902, "epoch": 0.5364028411973617, "grad_norm": 2.0625, "learning_rate": 8.75381836332365e-06, "loss": 1.04975224, "memory(GiB)": 369.4, "step": 21145, "train_speed(iter/s)": 0.200789 }, { "acc": 0.73564749, "epoch": 0.5365296803652968, "grad_norm": 2.046875, "learning_rate": 8.753125590491077e-06, "loss": 1.02692022, "memory(GiB)": 369.4, "step": 21150, "train_speed(iter/s)": 0.200795 }, { "acc": 0.74650192, "epoch": 0.5366565195332319, "grad_norm": 1.90625, "learning_rate": 8.752432652579284e-06, "loss": 0.98286142, "memory(GiB)": 369.4, "step": 21155, "train_speed(iter/s)": 0.200802 }, { "acc": 0.73701143, "epoch": 0.5367833587011669, "grad_norm": 2.640625, "learning_rate": 8.751739549618749e-06, "loss": 1.04825783, "memory(GiB)": 369.4, "step": 21160, "train_speed(iter/s)": 0.200809 }, { "acc": 0.7437315, "epoch": 0.536910197869102, "grad_norm": 2.421875, "learning_rate": 8.751046281639958e-06, "loss": 0.98178749, "memory(GiB)": 369.4, "step": 21165, "train_speed(iter/s)": 0.200814 }, { "acc": 0.75551119, "epoch": 0.5370370370370371, "grad_norm": 1.875, "learning_rate": 8.750352848673405e-06, "loss": 1.01221676, "memory(GiB)": 369.4, "step": 21170, "train_speed(iter/s)": 0.200823 }, { "acc": 0.74180832, "epoch": 0.5371638762049721, "grad_norm": 2.34375, "learning_rate": 8.749659250749589e-06, "loss": 1.09036617, "memory(GiB)": 369.4, "step": 21175, "train_speed(iter/s)": 0.200832 }, { "acc": 0.75479097, "epoch": 0.5372907153729072, "grad_norm": 2.984375, "learning_rate": 8.748965487899019e-06, "loss": 1.00780649, "memory(GiB)": 369.4, "step": 21180, "train_speed(iter/s)": 0.200842 }, { "acc": 0.75405798, "epoch": 0.5374175545408422, "grad_norm": 2.453125, "learning_rate": 8.748271560152208e-06, "loss": 0.96617422, "memory(GiB)": 369.4, "step": 21185, "train_speed(iter/s)": 0.200849 }, { "acc": 0.74603219, "epoch": 0.5375443937087773, "grad_norm": 2.109375, "learning_rate": 8.74757746753968e-06, "loss": 1.02846336, "memory(GiB)": 369.4, "step": 21190, "train_speed(iter/s)": 0.200858 }, { "acc": 0.75303974, "epoch": 0.5376712328767124, "grad_norm": 2.390625, "learning_rate": 8.746883210091963e-06, "loss": 0.9735836, "memory(GiB)": 369.4, "step": 21195, "train_speed(iter/s)": 0.200865 }, { "acc": 0.76407218, "epoch": 0.5377980720446474, "grad_norm": 1.9609375, "learning_rate": 8.746188787839593e-06, "loss": 0.94359264, "memory(GiB)": 369.4, "step": 21200, "train_speed(iter/s)": 0.200872 }, { "acc": 0.74591346, "epoch": 0.5379249112125825, "grad_norm": 1.8984375, "learning_rate": 8.745494200813116e-06, "loss": 1.02461605, "memory(GiB)": 369.4, "step": 21205, "train_speed(iter/s)": 0.20088 }, { "acc": 0.74420567, "epoch": 0.5380517503805176, "grad_norm": 2.890625, "learning_rate": 8.74479944904308e-06, "loss": 1.05175076, "memory(GiB)": 369.4, "step": 21210, "train_speed(iter/s)": 0.200885 }, { "acc": 0.73944759, "epoch": 0.5381785895484525, "grad_norm": 2.125, "learning_rate": 8.744104532560047e-06, "loss": 1.02590828, "memory(GiB)": 369.4, "step": 21215, "train_speed(iter/s)": 0.20089 }, { "acc": 0.73623495, "epoch": 0.5383054287163876, "grad_norm": 2.1875, "learning_rate": 8.74340945139458e-06, "loss": 1.06764774, "memory(GiB)": 369.4, "step": 21220, "train_speed(iter/s)": 0.200897 }, { "acc": 0.74025702, "epoch": 0.5384322678843226, "grad_norm": 2.9375, "learning_rate": 8.742714205577251e-06, "loss": 0.98213329, "memory(GiB)": 369.4, "step": 21225, "train_speed(iter/s)": 0.200902 }, { "acc": 0.75638647, "epoch": 0.5385591070522577, "grad_norm": 1.90625, "learning_rate": 8.742018795138642e-06, "loss": 0.9450716, "memory(GiB)": 369.4, "step": 21230, "train_speed(iter/s)": 0.200909 }, { "acc": 0.74290047, "epoch": 0.5386859462201928, "grad_norm": 2.125, "learning_rate": 8.74132322010934e-06, "loss": 1.00822163, "memory(GiB)": 369.4, "step": 21235, "train_speed(iter/s)": 0.200913 }, { "acc": 0.74625263, "epoch": 0.5388127853881278, "grad_norm": 2.046875, "learning_rate": 8.740627480519937e-06, "loss": 0.98948545, "memory(GiB)": 369.4, "step": 21240, "train_speed(iter/s)": 0.200919 }, { "acc": 0.73905115, "epoch": 0.5389396245560629, "grad_norm": 2.390625, "learning_rate": 8.739931576401037e-06, "loss": 1.04097424, "memory(GiB)": 369.4, "step": 21245, "train_speed(iter/s)": 0.200924 }, { "acc": 0.74439449, "epoch": 0.539066463723998, "grad_norm": 2.140625, "learning_rate": 8.73923550778325e-06, "loss": 0.9735199, "memory(GiB)": 369.4, "step": 21250, "train_speed(iter/s)": 0.200932 }, { "acc": 0.7528254, "epoch": 0.539193302891933, "grad_norm": 2.1875, "learning_rate": 8.73853927469719e-06, "loss": 1.04610348, "memory(GiB)": 369.4, "step": 21255, "train_speed(iter/s)": 0.200936 }, { "acc": 0.74610653, "epoch": 0.5393201420598681, "grad_norm": 2.609375, "learning_rate": 8.73784287717348e-06, "loss": 0.99922009, "memory(GiB)": 369.4, "step": 21260, "train_speed(iter/s)": 0.200946 }, { "acc": 0.72190313, "epoch": 0.5394469812278031, "grad_norm": 2.0625, "learning_rate": 8.737146315242755e-06, "loss": 1.1162962, "memory(GiB)": 369.4, "step": 21265, "train_speed(iter/s)": 0.200952 }, { "acc": 0.74142423, "epoch": 0.5395738203957382, "grad_norm": 2.046875, "learning_rate": 8.73644958893565e-06, "loss": 1.01752491, "memory(GiB)": 369.4, "step": 21270, "train_speed(iter/s)": 0.200961 }, { "acc": 0.74863768, "epoch": 0.5397006595636733, "grad_norm": 1.8515625, "learning_rate": 8.735752698282807e-06, "loss": 0.98738346, "memory(GiB)": 369.4, "step": 21275, "train_speed(iter/s)": 0.200968 }, { "acc": 0.76068382, "epoch": 0.5398274987316083, "grad_norm": 1.859375, "learning_rate": 8.735055643314883e-06, "loss": 1.03110561, "memory(GiB)": 369.4, "step": 21280, "train_speed(iter/s)": 0.200977 }, { "acc": 0.74116583, "epoch": 0.5399543378995434, "grad_norm": 2.03125, "learning_rate": 8.734358424062536e-06, "loss": 1.00915642, "memory(GiB)": 369.4, "step": 21285, "train_speed(iter/s)": 0.200982 }, { "acc": 0.76097317, "epoch": 0.5400811770674785, "grad_norm": 1.953125, "learning_rate": 8.733661040556433e-06, "loss": 0.97127361, "memory(GiB)": 369.4, "step": 21290, "train_speed(iter/s)": 0.200988 }, { "acc": 0.73601966, "epoch": 0.5402080162354135, "grad_norm": 2.171875, "learning_rate": 8.732963492827248e-06, "loss": 1.0221981, "memory(GiB)": 369.4, "step": 21295, "train_speed(iter/s)": 0.200995 }, { "acc": 0.74751997, "epoch": 0.5403348554033486, "grad_norm": 2.1875, "learning_rate": 8.732265780905661e-06, "loss": 1.03250999, "memory(GiB)": 369.4, "step": 21300, "train_speed(iter/s)": 0.200999 }, { "acc": 0.73175726, "epoch": 0.5404616945712836, "grad_norm": 2.078125, "learning_rate": 8.731567904822362e-06, "loss": 1.05942211, "memory(GiB)": 369.4, "step": 21305, "train_speed(iter/s)": 0.201004 }, { "acc": 0.75470881, "epoch": 0.5405885337392187, "grad_norm": 1.9921875, "learning_rate": 8.730869864608047e-06, "loss": 0.94871998, "memory(GiB)": 369.4, "step": 21310, "train_speed(iter/s)": 0.201013 }, { "acc": 0.74192934, "epoch": 0.5407153729071538, "grad_norm": 2.34375, "learning_rate": 8.730171660293418e-06, "loss": 1.0544095, "memory(GiB)": 369.4, "step": 21315, "train_speed(iter/s)": 0.20102 }, { "acc": 0.75204954, "epoch": 0.5408422120750888, "grad_norm": 1.78125, "learning_rate": 8.729473291909185e-06, "loss": 0.98609076, "memory(GiB)": 369.4, "step": 21320, "train_speed(iter/s)": 0.201029 }, { "acc": 0.73608074, "epoch": 0.5409690512430239, "grad_norm": 2.296875, "learning_rate": 8.728774759486065e-06, "loss": 1.02041836, "memory(GiB)": 369.4, "step": 21325, "train_speed(iter/s)": 0.201038 }, { "acc": 0.73866301, "epoch": 0.541095890410959, "grad_norm": 2.046875, "learning_rate": 8.728076063054786e-06, "loss": 1.00984268, "memory(GiB)": 369.4, "step": 21330, "train_speed(iter/s)": 0.201045 }, { "acc": 0.72795792, "epoch": 0.541222729578894, "grad_norm": 2.234375, "learning_rate": 8.727377202646074e-06, "loss": 1.07727909, "memory(GiB)": 369.4, "step": 21335, "train_speed(iter/s)": 0.201056 }, { "acc": 0.7532536, "epoch": 0.541349568746829, "grad_norm": 2.75, "learning_rate": 8.726678178290673e-06, "loss": 1.05670891, "memory(GiB)": 369.4, "step": 21340, "train_speed(iter/s)": 0.201062 }, { "acc": 0.74222198, "epoch": 0.541476407914764, "grad_norm": 2.40625, "learning_rate": 8.725978990019326e-06, "loss": 0.99811859, "memory(GiB)": 369.4, "step": 21345, "train_speed(iter/s)": 0.201072 }, { "acc": 0.75046072, "epoch": 0.5416032470826991, "grad_norm": 1.8828125, "learning_rate": 8.72527963786279e-06, "loss": 0.99685526, "memory(GiB)": 369.4, "step": 21350, "train_speed(iter/s)": 0.201079 }, { "acc": 0.74275198, "epoch": 0.5417300862506342, "grad_norm": 1.9921875, "learning_rate": 8.72458012185182e-06, "loss": 1.05331659, "memory(GiB)": 369.4, "step": 21355, "train_speed(iter/s)": 0.201086 }, { "acc": 0.7480165, "epoch": 0.5418569254185692, "grad_norm": 2.015625, "learning_rate": 8.72388044201719e-06, "loss": 1.00320702, "memory(GiB)": 369.4, "step": 21360, "train_speed(iter/s)": 0.201094 }, { "acc": 0.75820756, "epoch": 0.5419837645865043, "grad_norm": 2.234375, "learning_rate": 8.723180598389671e-06, "loss": 0.92671518, "memory(GiB)": 369.4, "step": 21365, "train_speed(iter/s)": 0.201099 }, { "acc": 0.73695049, "epoch": 0.5421106037544394, "grad_norm": 1.6640625, "learning_rate": 8.722480591000046e-06, "loss": 1.06847963, "memory(GiB)": 369.4, "step": 21370, "train_speed(iter/s)": 0.201106 }, { "acc": 0.76028972, "epoch": 0.5422374429223744, "grad_norm": 2.8125, "learning_rate": 8.721780419879106e-06, "loss": 0.93847198, "memory(GiB)": 369.4, "step": 21375, "train_speed(iter/s)": 0.201114 }, { "acc": 0.74619513, "epoch": 0.5423642820903095, "grad_norm": 2.21875, "learning_rate": 8.721080085057646e-06, "loss": 1.05358219, "memory(GiB)": 369.4, "step": 21380, "train_speed(iter/s)": 0.20112 }, { "acc": 0.7438612, "epoch": 0.5424911212582445, "grad_norm": 2.90625, "learning_rate": 8.72037958656647e-06, "loss": 1.07156315, "memory(GiB)": 369.4, "step": 21385, "train_speed(iter/s)": 0.201128 }, { "acc": 0.7369709, "epoch": 0.5426179604261796, "grad_norm": 2.0625, "learning_rate": 8.71967892443639e-06, "loss": 1.05045395, "memory(GiB)": 369.4, "step": 21390, "train_speed(iter/s)": 0.201136 }, { "acc": 0.75143471, "epoch": 0.5427447995941147, "grad_norm": 2.609375, "learning_rate": 8.718978098698226e-06, "loss": 1.00133457, "memory(GiB)": 369.4, "step": 21395, "train_speed(iter/s)": 0.201138 }, { "acc": 0.74217682, "epoch": 0.5428716387620497, "grad_norm": 2.359375, "learning_rate": 8.718277109382799e-06, "loss": 1.04315252, "memory(GiB)": 369.4, "step": 21400, "train_speed(iter/s)": 0.201145 }, { "acc": 0.74537096, "epoch": 0.5429984779299848, "grad_norm": 2.4375, "learning_rate": 8.717575956520942e-06, "loss": 1.07459984, "memory(GiB)": 369.4, "step": 21405, "train_speed(iter/s)": 0.201152 }, { "acc": 0.74863386, "epoch": 0.5431253170979199, "grad_norm": 2.046875, "learning_rate": 8.716874640143498e-06, "loss": 0.99350672, "memory(GiB)": 369.4, "step": 21410, "train_speed(iter/s)": 0.201158 }, { "acc": 0.74734116, "epoch": 0.5432521562658549, "grad_norm": 1.9765625, "learning_rate": 8.716173160281315e-06, "loss": 1.05843077, "memory(GiB)": 369.4, "step": 21415, "train_speed(iter/s)": 0.201162 }, { "acc": 0.7237833, "epoch": 0.54337899543379, "grad_norm": 2.859375, "learning_rate": 8.715471516965242e-06, "loss": 1.09367008, "memory(GiB)": 369.4, "step": 21420, "train_speed(iter/s)": 0.201166 }, { "acc": 0.73934803, "epoch": 0.543505834601725, "grad_norm": 2.21875, "learning_rate": 8.714769710226144e-06, "loss": 1.04898672, "memory(GiB)": 369.4, "step": 21425, "train_speed(iter/s)": 0.201174 }, { "acc": 0.74990525, "epoch": 0.5436326737696601, "grad_norm": 2.296875, "learning_rate": 8.714067740094888e-06, "loss": 1.0200531, "memory(GiB)": 369.4, "step": 21430, "train_speed(iter/s)": 0.201181 }, { "acc": 0.75572176, "epoch": 0.5437595129375952, "grad_norm": 3.015625, "learning_rate": 8.713365606602353e-06, "loss": 1.00682182, "memory(GiB)": 369.4, "step": 21435, "train_speed(iter/s)": 0.201188 }, { "acc": 0.75995045, "epoch": 0.5438863521055302, "grad_norm": 2.46875, "learning_rate": 8.71266330977942e-06, "loss": 0.94666719, "memory(GiB)": 369.4, "step": 21440, "train_speed(iter/s)": 0.201191 }, { "acc": 0.75697031, "epoch": 0.5440131912734653, "grad_norm": 1.8125, "learning_rate": 8.711960849656975e-06, "loss": 0.96650648, "memory(GiB)": 369.4, "step": 21445, "train_speed(iter/s)": 0.201197 }, { "acc": 0.73883023, "epoch": 0.5441400304414004, "grad_norm": 2.671875, "learning_rate": 8.711258226265922e-06, "loss": 1.04405041, "memory(GiB)": 369.4, "step": 21450, "train_speed(iter/s)": 0.201202 }, { "acc": 0.74393578, "epoch": 0.5442668696093353, "grad_norm": 2.21875, "learning_rate": 8.710555439637163e-06, "loss": 1.04460716, "memory(GiB)": 369.4, "step": 21455, "train_speed(iter/s)": 0.201212 }, { "acc": 0.73828764, "epoch": 0.5443937087772704, "grad_norm": 2.578125, "learning_rate": 8.709852489801608e-06, "loss": 0.99142628, "memory(GiB)": 369.4, "step": 21460, "train_speed(iter/s)": 0.201221 }, { "acc": 0.73881202, "epoch": 0.5445205479452054, "grad_norm": 2.484375, "learning_rate": 8.709149376790177e-06, "loss": 1.01654205, "memory(GiB)": 369.4, "step": 21465, "train_speed(iter/s)": 0.20123 }, { "acc": 0.74004297, "epoch": 0.5446473871131405, "grad_norm": 2.40625, "learning_rate": 8.708446100633796e-06, "loss": 1.0079546, "memory(GiB)": 369.4, "step": 21470, "train_speed(iter/s)": 0.201233 }, { "acc": 0.77185373, "epoch": 0.5447742262810756, "grad_norm": 2.609375, "learning_rate": 8.707742661363401e-06, "loss": 0.93829269, "memory(GiB)": 369.4, "step": 21475, "train_speed(iter/s)": 0.201242 }, { "acc": 0.73244481, "epoch": 0.5449010654490106, "grad_norm": 2.5625, "learning_rate": 8.707039059009927e-06, "loss": 1.10391712, "memory(GiB)": 369.4, "step": 21480, "train_speed(iter/s)": 0.201246 }, { "acc": 0.75519047, "epoch": 0.5450279046169457, "grad_norm": 2.515625, "learning_rate": 8.706335293604326e-06, "loss": 1.00647726, "memory(GiB)": 369.4, "step": 21485, "train_speed(iter/s)": 0.201249 }, { "acc": 0.7362802, "epoch": 0.5451547437848808, "grad_norm": 2.03125, "learning_rate": 8.705631365177552e-06, "loss": 1.05045795, "memory(GiB)": 369.4, "step": 21490, "train_speed(iter/s)": 0.201258 }, { "acc": 0.74690862, "epoch": 0.5452815829528158, "grad_norm": 2.375, "learning_rate": 8.704927273760563e-06, "loss": 1.00204468, "memory(GiB)": 369.4, "step": 21495, "train_speed(iter/s)": 0.201263 }, { "acc": 0.74127259, "epoch": 0.5454084221207509, "grad_norm": 2.03125, "learning_rate": 8.704223019384334e-06, "loss": 1.02622128, "memory(GiB)": 369.4, "step": 21500, "train_speed(iter/s)": 0.20127 }, { "acc": 0.7572063, "epoch": 0.5455352612886859, "grad_norm": 2.359375, "learning_rate": 8.703518602079836e-06, "loss": 0.97402496, "memory(GiB)": 369.4, "step": 21505, "train_speed(iter/s)": 0.201278 }, { "acc": 0.75057387, "epoch": 0.545662100456621, "grad_norm": 2.0625, "learning_rate": 8.702814021878057e-06, "loss": 1.00738831, "memory(GiB)": 369.4, "step": 21510, "train_speed(iter/s)": 0.201284 }, { "acc": 0.74216022, "epoch": 0.5457889396245561, "grad_norm": 2.09375, "learning_rate": 8.702109278809985e-06, "loss": 1.05605412, "memory(GiB)": 369.4, "step": 21515, "train_speed(iter/s)": 0.201292 }, { "acc": 0.74327579, "epoch": 0.5459157787924911, "grad_norm": 1.78125, "learning_rate": 8.70140437290662e-06, "loss": 1.0359971, "memory(GiB)": 369.4, "step": 21520, "train_speed(iter/s)": 0.201298 }, { "acc": 0.73860207, "epoch": 0.5460426179604262, "grad_norm": 2.171875, "learning_rate": 8.700699304198963e-06, "loss": 1.09625683, "memory(GiB)": 369.4, "step": 21525, "train_speed(iter/s)": 0.201306 }, { "acc": 0.74121914, "epoch": 0.5461694571283613, "grad_norm": 2.203125, "learning_rate": 8.699994072718026e-06, "loss": 1.04925938, "memory(GiB)": 369.4, "step": 21530, "train_speed(iter/s)": 0.201306 }, { "acc": 0.76019373, "epoch": 0.5462962962962963, "grad_norm": 2.28125, "learning_rate": 8.699288678494833e-06, "loss": 0.96806698, "memory(GiB)": 369.4, "step": 21535, "train_speed(iter/s)": 0.201314 }, { "acc": 0.73733234, "epoch": 0.5464231354642314, "grad_norm": 2.046875, "learning_rate": 8.698583121560407e-06, "loss": 1.0584959, "memory(GiB)": 369.4, "step": 21540, "train_speed(iter/s)": 0.201322 }, { "acc": 0.73782058, "epoch": 0.5465499746321664, "grad_norm": 2.1875, "learning_rate": 8.697877401945784e-06, "loss": 1.02578697, "memory(GiB)": 369.4, "step": 21545, "train_speed(iter/s)": 0.201331 }, { "acc": 0.76561551, "epoch": 0.5466768138001015, "grad_norm": 2.03125, "learning_rate": 8.697171519682002e-06, "loss": 0.91767845, "memory(GiB)": 369.4, "step": 21550, "train_speed(iter/s)": 0.201337 }, { "acc": 0.74521513, "epoch": 0.5468036529680366, "grad_norm": 1.9453125, "learning_rate": 8.696465474800109e-06, "loss": 0.99499359, "memory(GiB)": 369.4, "step": 21555, "train_speed(iter/s)": 0.201344 }, { "acc": 0.74914527, "epoch": 0.5469304921359716, "grad_norm": 2.015625, "learning_rate": 8.695759267331162e-06, "loss": 1.02871571, "memory(GiB)": 369.4, "step": 21560, "train_speed(iter/s)": 0.201352 }, { "acc": 0.74066987, "epoch": 0.5470573313039067, "grad_norm": 1.796875, "learning_rate": 8.69505289730622e-06, "loss": 1.01833134, "memory(GiB)": 369.4, "step": 21565, "train_speed(iter/s)": 0.20136 }, { "acc": 0.74632282, "epoch": 0.5471841704718418, "grad_norm": 2.5, "learning_rate": 8.694346364756356e-06, "loss": 0.98534384, "memory(GiB)": 369.4, "step": 21570, "train_speed(iter/s)": 0.201369 }, { "acc": 0.75511374, "epoch": 0.5473110096397767, "grad_norm": 1.8984375, "learning_rate": 8.693639669712645e-06, "loss": 0.99883394, "memory(GiB)": 369.4, "step": 21575, "train_speed(iter/s)": 0.201374 }, { "acc": 0.75336523, "epoch": 0.5474378488077118, "grad_norm": 2.5625, "learning_rate": 8.692932812206171e-06, "loss": 0.98073883, "memory(GiB)": 369.4, "step": 21580, "train_speed(iter/s)": 0.20138 }, { "acc": 0.73697209, "epoch": 0.5475646879756468, "grad_norm": 2.171875, "learning_rate": 8.692225792268023e-06, "loss": 1.02416782, "memory(GiB)": 369.4, "step": 21585, "train_speed(iter/s)": 0.201386 }, { "acc": 0.74736285, "epoch": 0.5476915271435819, "grad_norm": 2.359375, "learning_rate": 8.691518609929302e-06, "loss": 1.03720837, "memory(GiB)": 369.4, "step": 21590, "train_speed(iter/s)": 0.201393 }, { "acc": 0.74568911, "epoch": 0.547818366311517, "grad_norm": 1.890625, "learning_rate": 8.690811265221108e-06, "loss": 0.99411716, "memory(GiB)": 369.4, "step": 21595, "train_speed(iter/s)": 0.2014 }, { "acc": 0.75048342, "epoch": 0.547945205479452, "grad_norm": 2.140625, "learning_rate": 8.690103758174558e-06, "loss": 1.02418861, "memory(GiB)": 369.4, "step": 21600, "train_speed(iter/s)": 0.201408 }, { "acc": 0.72456579, "epoch": 0.5480720446473871, "grad_norm": 2.4375, "learning_rate": 8.68939608882077e-06, "loss": 1.02348461, "memory(GiB)": 369.4, "step": 21605, "train_speed(iter/s)": 0.201415 }, { "acc": 0.73963141, "epoch": 0.5481988838153222, "grad_norm": 6.96875, "learning_rate": 8.688688257190869e-06, "loss": 1.04252205, "memory(GiB)": 369.4, "step": 21610, "train_speed(iter/s)": 0.201424 }, { "acc": 0.761376, "epoch": 0.5483257229832572, "grad_norm": 2.3125, "learning_rate": 8.68798026331599e-06, "loss": 0.95223446, "memory(GiB)": 369.4, "step": 21615, "train_speed(iter/s)": 0.201432 }, { "acc": 0.74108233, "epoch": 0.5484525621511923, "grad_norm": 1.9296875, "learning_rate": 8.687272107227274e-06, "loss": 1.06652517, "memory(GiB)": 369.4, "step": 21620, "train_speed(iter/s)": 0.201438 }, { "acc": 0.72998047, "epoch": 0.5485794013191273, "grad_norm": 1.7421875, "learning_rate": 8.686563788955867e-06, "loss": 1.03974199, "memory(GiB)": 369.4, "step": 21625, "train_speed(iter/s)": 0.201447 }, { "acc": 0.7493341, "epoch": 0.5487062404870624, "grad_norm": 2.15625, "learning_rate": 8.685855308532926e-06, "loss": 0.96954365, "memory(GiB)": 369.4, "step": 21630, "train_speed(iter/s)": 0.201453 }, { "acc": 0.75370407, "epoch": 0.5488330796549975, "grad_norm": 2.609375, "learning_rate": 8.685146665989613e-06, "loss": 1.01473351, "memory(GiB)": 369.4, "step": 21635, "train_speed(iter/s)": 0.201462 }, { "acc": 0.74619265, "epoch": 0.5489599188229325, "grad_norm": 2.171875, "learning_rate": 8.684437861357095e-06, "loss": 0.99521036, "memory(GiB)": 369.4, "step": 21640, "train_speed(iter/s)": 0.201471 }, { "acc": 0.74642129, "epoch": 0.5490867579908676, "grad_norm": 2.15625, "learning_rate": 8.683728894666551e-06, "loss": 0.95477657, "memory(GiB)": 369.4, "step": 21645, "train_speed(iter/s)": 0.201482 }, { "acc": 0.74617782, "epoch": 0.5492135971588027, "grad_norm": 2.21875, "learning_rate": 8.683019765949163e-06, "loss": 1.02647886, "memory(GiB)": 369.4, "step": 21650, "train_speed(iter/s)": 0.20149 }, { "acc": 0.7496583, "epoch": 0.5493404363267377, "grad_norm": 2.1875, "learning_rate": 8.682310475236123e-06, "loss": 0.99225931, "memory(GiB)": 369.4, "step": 21655, "train_speed(iter/s)": 0.201497 }, { "acc": 0.73744488, "epoch": 0.5494672754946728, "grad_norm": 2.234375, "learning_rate": 8.681601022558628e-06, "loss": 1.04213448, "memory(GiB)": 369.4, "step": 21660, "train_speed(iter/s)": 0.201505 }, { "acc": 0.74703569, "epoch": 0.5495941146626078, "grad_norm": 2.140625, "learning_rate": 8.680891407947882e-06, "loss": 1.00083466, "memory(GiB)": 369.4, "step": 21665, "train_speed(iter/s)": 0.201513 }, { "acc": 0.74578404, "epoch": 0.5497209538305429, "grad_norm": 2.25, "learning_rate": 8.680181631435098e-06, "loss": 1.10002565, "memory(GiB)": 369.4, "step": 21670, "train_speed(iter/s)": 0.20152 }, { "acc": 0.75105152, "epoch": 0.549847792998478, "grad_norm": 2.53125, "learning_rate": 8.679471693051495e-06, "loss": 0.99965286, "memory(GiB)": 369.4, "step": 21675, "train_speed(iter/s)": 0.201528 }, { "acc": 0.75104189, "epoch": 0.549974632166413, "grad_norm": 1.875, "learning_rate": 8.678761592828301e-06, "loss": 1.00859032, "memory(GiB)": 369.4, "step": 21680, "train_speed(iter/s)": 0.201534 }, { "acc": 0.7473217, "epoch": 0.5501014713343481, "grad_norm": 2.09375, "learning_rate": 8.678051330796746e-06, "loss": 0.99495487, "memory(GiB)": 369.4, "step": 21685, "train_speed(iter/s)": 0.201529 }, { "acc": 0.73024988, "epoch": 0.5502283105022832, "grad_norm": 2.09375, "learning_rate": 8.677340906988072e-06, "loss": 1.04295731, "memory(GiB)": 369.4, "step": 21690, "train_speed(iter/s)": 0.201537 }, { "acc": 0.75114079, "epoch": 0.5503551496702181, "grad_norm": 2.1875, "learning_rate": 8.676630321433528e-06, "loss": 0.95972061, "memory(GiB)": 369.4, "step": 21695, "train_speed(iter/s)": 0.201542 }, { "acc": 0.74657097, "epoch": 0.5504819888381532, "grad_norm": 2.078125, "learning_rate": 8.675919574164366e-06, "loss": 1.02746239, "memory(GiB)": 369.4, "step": 21700, "train_speed(iter/s)": 0.201545 }, { "acc": 0.74773312, "epoch": 0.5506088280060882, "grad_norm": 1.8046875, "learning_rate": 8.675208665211851e-06, "loss": 1.0138195, "memory(GiB)": 369.4, "step": 21705, "train_speed(iter/s)": 0.201551 }, { "acc": 0.74589219, "epoch": 0.5507356671740233, "grad_norm": 2.671875, "learning_rate": 8.674497594607249e-06, "loss": 1.03189316, "memory(GiB)": 369.4, "step": 21710, "train_speed(iter/s)": 0.201559 }, { "acc": 0.73792491, "epoch": 0.5508625063419584, "grad_norm": 2.296875, "learning_rate": 8.673786362381837e-06, "loss": 1.0329937, "memory(GiB)": 369.4, "step": 21715, "train_speed(iter/s)": 0.201567 }, { "acc": 0.74242687, "epoch": 0.5509893455098934, "grad_norm": 1.8515625, "learning_rate": 8.673074968566899e-06, "loss": 0.96778469, "memory(GiB)": 369.4, "step": 21720, "train_speed(iter/s)": 0.201573 }, { "acc": 0.73581448, "epoch": 0.5511161846778285, "grad_norm": 2.59375, "learning_rate": 8.672363413193724e-06, "loss": 1.06613102, "memory(GiB)": 369.4, "step": 21725, "train_speed(iter/s)": 0.201576 }, { "acc": 0.74474649, "epoch": 0.5512430238457636, "grad_norm": 1.6875, "learning_rate": 8.671651696293613e-06, "loss": 1.01558628, "memory(GiB)": 369.4, "step": 21730, "train_speed(iter/s)": 0.201582 }, { "acc": 0.73783426, "epoch": 0.5513698630136986, "grad_norm": 2.4375, "learning_rate": 8.670939817897865e-06, "loss": 1.08242588, "memory(GiB)": 369.4, "step": 21735, "train_speed(iter/s)": 0.201592 }, { "acc": 0.73881235, "epoch": 0.5514967021816337, "grad_norm": 1.8046875, "learning_rate": 8.670227778037796e-06, "loss": 1.03198872, "memory(GiB)": 369.4, "step": 21740, "train_speed(iter/s)": 0.201597 }, { "acc": 0.75231128, "epoch": 0.5516235413495687, "grad_norm": 2.21875, "learning_rate": 8.669515576744722e-06, "loss": 0.99001665, "memory(GiB)": 369.4, "step": 21745, "train_speed(iter/s)": 0.201602 }, { "acc": 0.73769035, "epoch": 0.5517503805175038, "grad_norm": 2.390625, "learning_rate": 8.66880321404997e-06, "loss": 1.04043303, "memory(GiB)": 369.4, "step": 21750, "train_speed(iter/s)": 0.201609 }, { "acc": 0.75525417, "epoch": 0.5518772196854389, "grad_norm": 2.890625, "learning_rate": 8.668090689984872e-06, "loss": 1.00437679, "memory(GiB)": 369.4, "step": 21755, "train_speed(iter/s)": 0.201617 }, { "acc": 0.73855734, "epoch": 0.5520040588533739, "grad_norm": 2.265625, "learning_rate": 8.667378004580769e-06, "loss": 0.9628233, "memory(GiB)": 369.4, "step": 21760, "train_speed(iter/s)": 0.201626 }, { "acc": 0.75381107, "epoch": 0.552130898021309, "grad_norm": 2.21875, "learning_rate": 8.666665157869007e-06, "loss": 0.96908607, "memory(GiB)": 369.4, "step": 21765, "train_speed(iter/s)": 0.201635 }, { "acc": 0.74257431, "epoch": 0.5522577371892441, "grad_norm": 2.09375, "learning_rate": 8.665952149880942e-06, "loss": 1.00686398, "memory(GiB)": 369.4, "step": 21770, "train_speed(iter/s)": 0.201642 }, { "acc": 0.75682068, "epoch": 0.5523845763571791, "grad_norm": 2.140625, "learning_rate": 8.665238980647934e-06, "loss": 0.94614334, "memory(GiB)": 369.4, "step": 21775, "train_speed(iter/s)": 0.201651 }, { "acc": 0.7412992, "epoch": 0.5525114155251142, "grad_norm": 2.28125, "learning_rate": 8.66452565020135e-06, "loss": 1.01731663, "memory(GiB)": 369.4, "step": 21780, "train_speed(iter/s)": 0.201655 }, { "acc": 0.7440578, "epoch": 0.5526382546930492, "grad_norm": 2.28125, "learning_rate": 8.663812158572568e-06, "loss": 1.0226038, "memory(GiB)": 369.4, "step": 21785, "train_speed(iter/s)": 0.201659 }, { "acc": 0.75962949, "epoch": 0.5527650938609843, "grad_norm": 2.59375, "learning_rate": 8.663098505792971e-06, "loss": 0.99670801, "memory(GiB)": 369.4, "step": 21790, "train_speed(iter/s)": 0.201668 }, { "acc": 0.73751111, "epoch": 0.5528919330289194, "grad_norm": 1.96875, "learning_rate": 8.662384691893947e-06, "loss": 0.9982132, "memory(GiB)": 369.4, "step": 21795, "train_speed(iter/s)": 0.201675 }, { "acc": 0.74708652, "epoch": 0.5530187721968544, "grad_norm": 2.0, "learning_rate": 8.661670716906889e-06, "loss": 0.9897397, "memory(GiB)": 369.4, "step": 21800, "train_speed(iter/s)": 0.201682 }, { "acc": 0.74231405, "epoch": 0.5531456113647895, "grad_norm": 2.015625, "learning_rate": 8.66095658086321e-06, "loss": 1.08600216, "memory(GiB)": 369.4, "step": 21805, "train_speed(iter/s)": 0.201691 }, { "acc": 0.73881679, "epoch": 0.5532724505327246, "grad_norm": 1.9609375, "learning_rate": 8.660242283794312e-06, "loss": 1.00336914, "memory(GiB)": 369.4, "step": 21810, "train_speed(iter/s)": 0.201696 }, { "acc": 0.73688798, "epoch": 0.5533992897006595, "grad_norm": 2.234375, "learning_rate": 8.659527825731617e-06, "loss": 1.0713954, "memory(GiB)": 369.4, "step": 21815, "train_speed(iter/s)": 0.201704 }, { "acc": 0.73727922, "epoch": 0.5535261288685946, "grad_norm": 2.515625, "learning_rate": 8.65881320670655e-06, "loss": 1.00694571, "memory(GiB)": 369.4, "step": 21820, "train_speed(iter/s)": 0.20171 }, { "acc": 0.74918404, "epoch": 0.5536529680365296, "grad_norm": 2.109375, "learning_rate": 8.658098426750543e-06, "loss": 1.05465565, "memory(GiB)": 369.4, "step": 21825, "train_speed(iter/s)": 0.201718 }, { "acc": 0.73817987, "epoch": 0.5537798072044647, "grad_norm": 2.078125, "learning_rate": 8.657383485895034e-06, "loss": 1.05653896, "memory(GiB)": 369.4, "step": 21830, "train_speed(iter/s)": 0.201725 }, { "acc": 0.7511796, "epoch": 0.5539066463723998, "grad_norm": 2.3125, "learning_rate": 8.656668384171472e-06, "loss": 1.03288355, "memory(GiB)": 369.4, "step": 21835, "train_speed(iter/s)": 0.201731 }, { "acc": 0.75963898, "epoch": 0.5540334855403348, "grad_norm": 2.71875, "learning_rate": 8.655953121611307e-06, "loss": 1.0220727, "memory(GiB)": 369.4, "step": 21840, "train_speed(iter/s)": 0.201738 }, { "acc": 0.73981762, "epoch": 0.5541603247082699, "grad_norm": 2.0, "learning_rate": 8.655237698246002e-06, "loss": 1.03687267, "memory(GiB)": 369.4, "step": 21845, "train_speed(iter/s)": 0.201744 }, { "acc": 0.75204468, "epoch": 0.554287163876205, "grad_norm": 2.109375, "learning_rate": 8.654522114107024e-06, "loss": 0.965798, "memory(GiB)": 369.4, "step": 21850, "train_speed(iter/s)": 0.201751 }, { "acc": 0.75370569, "epoch": 0.55441400304414, "grad_norm": 2.109375, "learning_rate": 8.653806369225846e-06, "loss": 0.98899078, "memory(GiB)": 369.4, "step": 21855, "train_speed(iter/s)": 0.201761 }, { "acc": 0.74163675, "epoch": 0.5545408422120751, "grad_norm": 2.234375, "learning_rate": 8.65309046363395e-06, "loss": 0.98458385, "memory(GiB)": 369.4, "step": 21860, "train_speed(iter/s)": 0.201765 }, { "acc": 0.74083061, "epoch": 0.5546676813800101, "grad_norm": 2.15625, "learning_rate": 8.652374397362828e-06, "loss": 0.98163261, "memory(GiB)": 369.4, "step": 21865, "train_speed(iter/s)": 0.201768 }, { "acc": 0.75314531, "epoch": 0.5547945205479452, "grad_norm": 1.84375, "learning_rate": 8.651658170443972e-06, "loss": 0.94562511, "memory(GiB)": 369.4, "step": 21870, "train_speed(iter/s)": 0.201773 }, { "acc": 0.75520926, "epoch": 0.5549213597158803, "grad_norm": 2.203125, "learning_rate": 8.650941782908886e-06, "loss": 0.96339169, "memory(GiB)": 369.4, "step": 21875, "train_speed(iter/s)": 0.201781 }, { "acc": 0.76781988, "epoch": 0.5550481988838153, "grad_norm": 1.6796875, "learning_rate": 8.65022523478908e-06, "loss": 0.96781864, "memory(GiB)": 369.4, "step": 21880, "train_speed(iter/s)": 0.201788 }, { "acc": 0.7494257, "epoch": 0.5551750380517504, "grad_norm": 2.140625, "learning_rate": 8.649508526116073e-06, "loss": 0.99046421, "memory(GiB)": 369.4, "step": 21885, "train_speed(iter/s)": 0.201793 }, { "acc": 0.73924203, "epoch": 0.5553018772196855, "grad_norm": 2.265625, "learning_rate": 8.648791656921384e-06, "loss": 1.00129604, "memory(GiB)": 369.4, "step": 21890, "train_speed(iter/s)": 0.201801 }, { "acc": 0.75653749, "epoch": 0.5554287163876205, "grad_norm": 1.921875, "learning_rate": 8.648074627236549e-06, "loss": 0.97118387, "memory(GiB)": 369.4, "step": 21895, "train_speed(iter/s)": 0.201805 }, { "acc": 0.75163889, "epoch": 0.5555555555555556, "grad_norm": 2.890625, "learning_rate": 8.647357437093104e-06, "loss": 1.01976976, "memory(GiB)": 369.4, "step": 21900, "train_speed(iter/s)": 0.201812 }, { "acc": 0.75168762, "epoch": 0.5556823947234906, "grad_norm": 2.140625, "learning_rate": 8.646640086522595e-06, "loss": 1.00263271, "memory(GiB)": 369.4, "step": 21905, "train_speed(iter/s)": 0.20182 }, { "acc": 0.74350634, "epoch": 0.5558092338914257, "grad_norm": 2.40625, "learning_rate": 8.645922575556575e-06, "loss": 1.04938993, "memory(GiB)": 369.4, "step": 21910, "train_speed(iter/s)": 0.201828 }, { "acc": 0.74097657, "epoch": 0.5559360730593608, "grad_norm": 2.03125, "learning_rate": 8.645204904226601e-06, "loss": 1.02146921, "memory(GiB)": 369.4, "step": 21915, "train_speed(iter/s)": 0.201832 }, { "acc": 0.75765986, "epoch": 0.5560629122272958, "grad_norm": 2.46875, "learning_rate": 8.64448707256424e-06, "loss": 0.95120335, "memory(GiB)": 369.4, "step": 21920, "train_speed(iter/s)": 0.201841 }, { "acc": 0.75099115, "epoch": 0.5561897513952309, "grad_norm": 1.8671875, "learning_rate": 8.643769080601067e-06, "loss": 0.98047466, "memory(GiB)": 369.4, "step": 21925, "train_speed(iter/s)": 0.20185 }, { "acc": 0.76044269, "epoch": 0.556316590563166, "grad_norm": 2.28125, "learning_rate": 8.643050928368661e-06, "loss": 0.91948214, "memory(GiB)": 369.4, "step": 21930, "train_speed(iter/s)": 0.201859 }, { "acc": 0.75800457, "epoch": 0.556443429731101, "grad_norm": 2.328125, "learning_rate": 8.642332615898611e-06, "loss": 1.00274296, "memory(GiB)": 369.4, "step": 21935, "train_speed(iter/s)": 0.201865 }, { "acc": 0.75225973, "epoch": 0.556570268899036, "grad_norm": 2.328125, "learning_rate": 8.64161414322251e-06, "loss": 0.94964457, "memory(GiB)": 369.4, "step": 21940, "train_speed(iter/s)": 0.201871 }, { "acc": 0.76350536, "epoch": 0.556697108066971, "grad_norm": 2.1875, "learning_rate": 8.64089551037196e-06, "loss": 0.95723553, "memory(GiB)": 369.4, "step": 21945, "train_speed(iter/s)": 0.201875 }, { "acc": 0.74027061, "epoch": 0.5568239472349061, "grad_norm": 1.8515625, "learning_rate": 8.640176717378573e-06, "loss": 1.0274435, "memory(GiB)": 369.4, "step": 21950, "train_speed(iter/s)": 0.20188 }, { "acc": 0.7491374, "epoch": 0.5569507864028412, "grad_norm": 2.359375, "learning_rate": 8.639457764273957e-06, "loss": 1.04944096, "memory(GiB)": 369.4, "step": 21955, "train_speed(iter/s)": 0.201885 }, { "acc": 0.73134899, "epoch": 0.5570776255707762, "grad_norm": 2.09375, "learning_rate": 8.638738651089744e-06, "loss": 1.02858868, "memory(GiB)": 369.4, "step": 21960, "train_speed(iter/s)": 0.201891 }, { "acc": 0.73523345, "epoch": 0.5572044647387113, "grad_norm": 2.109375, "learning_rate": 8.638019377857555e-06, "loss": 1.03705997, "memory(GiB)": 369.4, "step": 21965, "train_speed(iter/s)": 0.201894 }, { "acc": 0.74529867, "epoch": 0.5573313039066464, "grad_norm": 2.640625, "learning_rate": 8.637299944609034e-06, "loss": 1.03233852, "memory(GiB)": 369.4, "step": 21970, "train_speed(iter/s)": 0.201897 }, { "acc": 0.75137091, "epoch": 0.5574581430745814, "grad_norm": 2.609375, "learning_rate": 8.636580351375821e-06, "loss": 1.01790085, "memory(GiB)": 369.4, "step": 21975, "train_speed(iter/s)": 0.201902 }, { "acc": 0.7390255, "epoch": 0.5575849822425165, "grad_norm": 2.578125, "learning_rate": 8.635860598189569e-06, "loss": 1.06453342, "memory(GiB)": 369.4, "step": 21980, "train_speed(iter/s)": 0.201908 }, { "acc": 0.73981047, "epoch": 0.5577118214104515, "grad_norm": 1.8046875, "learning_rate": 8.635140685081936e-06, "loss": 1.04941902, "memory(GiB)": 369.4, "step": 21985, "train_speed(iter/s)": 0.201916 }, { "acc": 0.74507179, "epoch": 0.5578386605783866, "grad_norm": 2.140625, "learning_rate": 8.634420612084583e-06, "loss": 1.02731543, "memory(GiB)": 369.4, "step": 21990, "train_speed(iter/s)": 0.20191 }, { "acc": 0.73569031, "epoch": 0.5579654997463217, "grad_norm": 2.125, "learning_rate": 8.633700379229187e-06, "loss": 1.04925737, "memory(GiB)": 369.4, "step": 21995, "train_speed(iter/s)": 0.201915 }, { "acc": 0.74349251, "epoch": 0.5580923389142567, "grad_norm": 2.4375, "learning_rate": 8.632979986547423e-06, "loss": 1.05227203, "memory(GiB)": 369.4, "step": 22000, "train_speed(iter/s)": 0.201921 }, { "epoch": 0.5580923389142567, "eval_acc": 0.7353360541125641, "eval_loss": 0.9826530814170837, "eval_runtime": 384.6784, "eval_samples_per_second": 16.559, "eval_steps_per_second": 8.28, "step": 22000 }, { "acc": 0.74667706, "epoch": 0.5582191780821918, "grad_norm": 1.984375, "learning_rate": 8.632259434070982e-06, "loss": 1.01661634, "memory(GiB)": 369.4, "step": 22005, "train_speed(iter/s)": 0.200618 }, { "acc": 0.75966396, "epoch": 0.5583460172501269, "grad_norm": 2.140625, "learning_rate": 8.631538721831551e-06, "loss": 1.00490608, "memory(GiB)": 369.4, "step": 22010, "train_speed(iter/s)": 0.200627 }, { "acc": 0.74387603, "epoch": 0.5584728564180619, "grad_norm": 1.9609375, "learning_rate": 8.630817849860835e-06, "loss": 1.0679801, "memory(GiB)": 369.4, "step": 22015, "train_speed(iter/s)": 0.200634 }, { "acc": 0.75685234, "epoch": 0.558599695585997, "grad_norm": 2.390625, "learning_rate": 8.63009681819054e-06, "loss": 0.97745628, "memory(GiB)": 369.4, "step": 22020, "train_speed(iter/s)": 0.200632 }, { "acc": 0.74735355, "epoch": 0.558726534753932, "grad_norm": 3.734375, "learning_rate": 8.629375626852378e-06, "loss": 0.99501858, "memory(GiB)": 369.4, "step": 22025, "train_speed(iter/s)": 0.200639 }, { "acc": 0.73856211, "epoch": 0.5588533739218671, "grad_norm": 1.8984375, "learning_rate": 8.628654275878074e-06, "loss": 1.03129997, "memory(GiB)": 369.4, "step": 22030, "train_speed(iter/s)": 0.200646 }, { "acc": 0.75023203, "epoch": 0.5589802130898022, "grad_norm": 1.984375, "learning_rate": 8.627932765299353e-06, "loss": 0.98436337, "memory(GiB)": 369.4, "step": 22035, "train_speed(iter/s)": 0.200653 }, { "acc": 0.7549201, "epoch": 0.5591070522577372, "grad_norm": 2.046875, "learning_rate": 8.627211095147952e-06, "loss": 0.95212822, "memory(GiB)": 369.4, "step": 22040, "train_speed(iter/s)": 0.200658 }, { "acc": 0.74642353, "epoch": 0.5592338914256723, "grad_norm": 2.53125, "learning_rate": 8.626489265455614e-06, "loss": 1.09521961, "memory(GiB)": 369.4, "step": 22045, "train_speed(iter/s)": 0.200666 }, { "acc": 0.73957391, "epoch": 0.5593607305936074, "grad_norm": 2.046875, "learning_rate": 8.625767276254084e-06, "loss": 1.06354904, "memory(GiB)": 369.4, "step": 22050, "train_speed(iter/s)": 0.200671 }, { "acc": 0.74092846, "epoch": 0.5594875697615423, "grad_norm": 2.171875, "learning_rate": 8.625045127575123e-06, "loss": 1.06827812, "memory(GiB)": 369.4, "step": 22055, "train_speed(iter/s)": 0.200677 }, { "acc": 0.75591574, "epoch": 0.5596144089294774, "grad_norm": 3.15625, "learning_rate": 8.624322819450493e-06, "loss": 1.04819279, "memory(GiB)": 369.4, "step": 22060, "train_speed(iter/s)": 0.200685 }, { "acc": 0.76339655, "epoch": 0.5597412480974124, "grad_norm": 2.71875, "learning_rate": 8.623600351911962e-06, "loss": 0.95013771, "memory(GiB)": 369.4, "step": 22065, "train_speed(iter/s)": 0.200693 }, { "acc": 0.75829792, "epoch": 0.5598680872653475, "grad_norm": 1.96875, "learning_rate": 8.622877724991312e-06, "loss": 1.00951471, "memory(GiB)": 369.4, "step": 22070, "train_speed(iter/s)": 0.200697 }, { "acc": 0.7475903, "epoch": 0.5599949264332826, "grad_norm": 1.921875, "learning_rate": 8.622154938720323e-06, "loss": 0.99467735, "memory(GiB)": 369.4, "step": 22075, "train_speed(iter/s)": 0.200704 }, { "acc": 0.73367405, "epoch": 0.5601217656012176, "grad_norm": 2.0625, "learning_rate": 8.621431993130787e-06, "loss": 1.04150534, "memory(GiB)": 369.4, "step": 22080, "train_speed(iter/s)": 0.200708 }, { "acc": 0.74951868, "epoch": 0.5602486047691527, "grad_norm": 2.109375, "learning_rate": 8.620708888254506e-06, "loss": 1.01854067, "memory(GiB)": 369.4, "step": 22085, "train_speed(iter/s)": 0.200715 }, { "acc": 0.74908648, "epoch": 0.5603754439370878, "grad_norm": 2.0625, "learning_rate": 8.619985624123282e-06, "loss": 0.94285088, "memory(GiB)": 369.4, "step": 22090, "train_speed(iter/s)": 0.200723 }, { "acc": 0.7584691, "epoch": 0.5605022831050228, "grad_norm": 1.8671875, "learning_rate": 8.619262200768928e-06, "loss": 0.95259533, "memory(GiB)": 369.4, "step": 22095, "train_speed(iter/s)": 0.200727 }, { "acc": 0.7533565, "epoch": 0.5606291222729579, "grad_norm": 1.8828125, "learning_rate": 8.618538618223262e-06, "loss": 1.00292244, "memory(GiB)": 369.4, "step": 22100, "train_speed(iter/s)": 0.200732 }, { "acc": 0.75164986, "epoch": 0.5607559614408929, "grad_norm": 2.3125, "learning_rate": 8.617814876518114e-06, "loss": 0.99471169, "memory(GiB)": 369.4, "step": 22105, "train_speed(iter/s)": 0.200738 }, { "acc": 0.74962482, "epoch": 0.560882800608828, "grad_norm": 3.078125, "learning_rate": 8.617090975685314e-06, "loss": 1.02162876, "memory(GiB)": 369.4, "step": 22110, "train_speed(iter/s)": 0.200743 }, { "acc": 0.74109211, "epoch": 0.5610096397767631, "grad_norm": 2.109375, "learning_rate": 8.616366915756704e-06, "loss": 1.01267281, "memory(GiB)": 369.4, "step": 22115, "train_speed(iter/s)": 0.200751 }, { "acc": 0.74901648, "epoch": 0.5611364789446981, "grad_norm": 1.875, "learning_rate": 8.615642696764131e-06, "loss": 1.01220818, "memory(GiB)": 369.4, "step": 22120, "train_speed(iter/s)": 0.200762 }, { "acc": 0.7453064, "epoch": 0.5612633181126332, "grad_norm": 2.09375, "learning_rate": 8.614918318739452e-06, "loss": 1.02990475, "memory(GiB)": 369.4, "step": 22125, "train_speed(iter/s)": 0.200768 }, { "acc": 0.75032005, "epoch": 0.5613901572805683, "grad_norm": 2.25, "learning_rate": 8.614193781714522e-06, "loss": 1.00821257, "memory(GiB)": 369.4, "step": 22130, "train_speed(iter/s)": 0.200776 }, { "acc": 0.74129949, "epoch": 0.5615169964485033, "grad_norm": 2.125, "learning_rate": 8.613469085721215e-06, "loss": 1.01219187, "memory(GiB)": 369.4, "step": 22135, "train_speed(iter/s)": 0.200785 }, { "acc": 0.7482439, "epoch": 0.5616438356164384, "grad_norm": 2.34375, "learning_rate": 8.612744230791406e-06, "loss": 1.0376112, "memory(GiB)": 369.4, "step": 22140, "train_speed(iter/s)": 0.200787 }, { "acc": 0.75402117, "epoch": 0.5617706747843734, "grad_norm": 2.265625, "learning_rate": 8.612019216956975e-06, "loss": 0.97360058, "memory(GiB)": 369.4, "step": 22145, "train_speed(iter/s)": 0.200796 }, { "acc": 0.74184046, "epoch": 0.5618975139523085, "grad_norm": 2.015625, "learning_rate": 8.611294044249811e-06, "loss": 1.00389977, "memory(GiB)": 369.4, "step": 22150, "train_speed(iter/s)": 0.2008 }, { "acc": 0.74438515, "epoch": 0.5620243531202436, "grad_norm": 2.140625, "learning_rate": 8.610568712701814e-06, "loss": 1.00832558, "memory(GiB)": 369.4, "step": 22155, "train_speed(iter/s)": 0.200804 }, { "acc": 0.74742675, "epoch": 0.5621511922881786, "grad_norm": 2.0625, "learning_rate": 8.609843222344883e-06, "loss": 1.00454388, "memory(GiB)": 369.4, "step": 22160, "train_speed(iter/s)": 0.200813 }, { "acc": 0.75421257, "epoch": 0.5622780314561137, "grad_norm": 2.03125, "learning_rate": 8.609117573210931e-06, "loss": 0.90116978, "memory(GiB)": 369.4, "step": 22165, "train_speed(iter/s)": 0.20082 }, { "acc": 0.74654865, "epoch": 0.5624048706240488, "grad_norm": 2.046875, "learning_rate": 8.608391765331875e-06, "loss": 1.00925198, "memory(GiB)": 369.4, "step": 22170, "train_speed(iter/s)": 0.200831 }, { "acc": 0.75650873, "epoch": 0.5625317097919837, "grad_norm": 1.9609375, "learning_rate": 8.607665798739638e-06, "loss": 0.99359827, "memory(GiB)": 369.4, "step": 22175, "train_speed(iter/s)": 0.200838 }, { "acc": 0.75179963, "epoch": 0.5626585489599188, "grad_norm": 2.328125, "learning_rate": 8.606939673466153e-06, "loss": 0.99047012, "memory(GiB)": 369.4, "step": 22180, "train_speed(iter/s)": 0.200845 }, { "acc": 0.75154281, "epoch": 0.5627853881278538, "grad_norm": 2.5, "learning_rate": 8.606213389543356e-06, "loss": 1.00059147, "memory(GiB)": 369.4, "step": 22185, "train_speed(iter/s)": 0.20085 }, { "acc": 0.74908638, "epoch": 0.5629122272957889, "grad_norm": 2.46875, "learning_rate": 8.605486947003194e-06, "loss": 1.05688534, "memory(GiB)": 369.4, "step": 22190, "train_speed(iter/s)": 0.200857 }, { "acc": 0.74023142, "epoch": 0.563039066463724, "grad_norm": 2.015625, "learning_rate": 8.60476034587762e-06, "loss": 1.04641542, "memory(GiB)": 369.4, "step": 22195, "train_speed(iter/s)": 0.200861 }, { "acc": 0.75487041, "epoch": 0.563165905631659, "grad_norm": 2.140625, "learning_rate": 8.604033586198592e-06, "loss": 1.00482845, "memory(GiB)": 369.4, "step": 22200, "train_speed(iter/s)": 0.20087 }, { "acc": 0.74179425, "epoch": 0.5632927447995941, "grad_norm": 2.171875, "learning_rate": 8.603306667998074e-06, "loss": 1.03189812, "memory(GiB)": 369.4, "step": 22205, "train_speed(iter/s)": 0.200875 }, { "acc": 0.74864244, "epoch": 0.5634195839675292, "grad_norm": 2.15625, "learning_rate": 8.602579591308043e-06, "loss": 1.00759239, "memory(GiB)": 369.4, "step": 22210, "train_speed(iter/s)": 0.200881 }, { "acc": 0.72480726, "epoch": 0.5635464231354642, "grad_norm": 2.15625, "learning_rate": 8.601852356160476e-06, "loss": 1.09820881, "memory(GiB)": 369.4, "step": 22215, "train_speed(iter/s)": 0.200887 }, { "acc": 0.7432375, "epoch": 0.5636732623033993, "grad_norm": 2.109375, "learning_rate": 8.60112496258736e-06, "loss": 1.01767807, "memory(GiB)": 369.4, "step": 22220, "train_speed(iter/s)": 0.200896 }, { "acc": 0.73819418, "epoch": 0.5638001014713343, "grad_norm": 2.046875, "learning_rate": 8.600397410620693e-06, "loss": 1.04144363, "memory(GiB)": 369.4, "step": 22225, "train_speed(iter/s)": 0.200904 }, { "acc": 0.74113007, "epoch": 0.5639269406392694, "grad_norm": 2.140625, "learning_rate": 8.599669700292472e-06, "loss": 1.01793251, "memory(GiB)": 369.4, "step": 22230, "train_speed(iter/s)": 0.20091 }, { "acc": 0.75179553, "epoch": 0.5640537798072045, "grad_norm": 2.25, "learning_rate": 8.598941831634707e-06, "loss": 1.03357735, "memory(GiB)": 369.4, "step": 22235, "train_speed(iter/s)": 0.200917 }, { "acc": 0.73746824, "epoch": 0.5641806189751395, "grad_norm": 2.0, "learning_rate": 8.598213804679412e-06, "loss": 1.02078419, "memory(GiB)": 369.4, "step": 22240, "train_speed(iter/s)": 0.200926 }, { "acc": 0.74008465, "epoch": 0.5643074581430746, "grad_norm": 2.09375, "learning_rate": 8.597485619458609e-06, "loss": 1.04842014, "memory(GiB)": 369.4, "step": 22245, "train_speed(iter/s)": 0.200932 }, { "acc": 0.74164472, "epoch": 0.5644342973110097, "grad_norm": 2.140625, "learning_rate": 8.596757276004327e-06, "loss": 1.029212, "memory(GiB)": 369.4, "step": 22250, "train_speed(iter/s)": 0.200937 }, { "acc": 0.75639262, "epoch": 0.5645611364789447, "grad_norm": 2.359375, "learning_rate": 8.5960287743486e-06, "loss": 1.06348629, "memory(GiB)": 369.4, "step": 22255, "train_speed(iter/s)": 0.200945 }, { "acc": 0.755758, "epoch": 0.5646879756468798, "grad_norm": 2.03125, "learning_rate": 8.595300114523473e-06, "loss": 0.97622013, "memory(GiB)": 369.4, "step": 22260, "train_speed(iter/s)": 0.200955 }, { "acc": 0.74649887, "epoch": 0.5648148148148148, "grad_norm": 2.0625, "learning_rate": 8.594571296560997e-06, "loss": 1.06618605, "memory(GiB)": 369.4, "step": 22265, "train_speed(iter/s)": 0.200962 }, { "acc": 0.73679943, "epoch": 0.5649416539827499, "grad_norm": 1.9921875, "learning_rate": 8.593842320493224e-06, "loss": 1.0851819, "memory(GiB)": 369.4, "step": 22270, "train_speed(iter/s)": 0.200969 }, { "acc": 0.75007048, "epoch": 0.565068493150685, "grad_norm": 2.1875, "learning_rate": 8.593113186352222e-06, "loss": 1.05597534, "memory(GiB)": 369.4, "step": 22275, "train_speed(iter/s)": 0.200976 }, { "acc": 0.7297904, "epoch": 0.56519533231862, "grad_norm": 2.328125, "learning_rate": 8.592383894170059e-06, "loss": 1.04515514, "memory(GiB)": 369.4, "step": 22280, "train_speed(iter/s)": 0.200985 }, { "acc": 0.75354395, "epoch": 0.5653221714865551, "grad_norm": 1.921875, "learning_rate": 8.591654443978815e-06, "loss": 0.9952137, "memory(GiB)": 369.4, "step": 22285, "train_speed(iter/s)": 0.200993 }, { "acc": 0.75033197, "epoch": 0.5654490106544902, "grad_norm": 2.296875, "learning_rate": 8.590924835810572e-06, "loss": 0.9954834, "memory(GiB)": 369.4, "step": 22290, "train_speed(iter/s)": 0.201001 }, { "acc": 0.7507174, "epoch": 0.5655758498224251, "grad_norm": 2.328125, "learning_rate": 8.590195069697423e-06, "loss": 0.99402456, "memory(GiB)": 369.4, "step": 22295, "train_speed(iter/s)": 0.201009 }, { "acc": 0.74370966, "epoch": 0.5657026889903602, "grad_norm": 1.8984375, "learning_rate": 8.589465145671465e-06, "loss": 1.00481586, "memory(GiB)": 369.4, "step": 22300, "train_speed(iter/s)": 0.201016 }, { "acc": 0.7381362, "epoch": 0.5658295281582952, "grad_norm": 2.1875, "learning_rate": 8.588735063764803e-06, "loss": 1.03651428, "memory(GiB)": 369.4, "step": 22305, "train_speed(iter/s)": 0.201025 }, { "acc": 0.75156345, "epoch": 0.5659563673262303, "grad_norm": 1.9140625, "learning_rate": 8.588004824009552e-06, "loss": 1.0626461, "memory(GiB)": 369.4, "step": 22310, "train_speed(iter/s)": 0.201033 }, { "acc": 0.74047909, "epoch": 0.5660832064941654, "grad_norm": 2.328125, "learning_rate": 8.58727442643783e-06, "loss": 1.08179235, "memory(GiB)": 369.4, "step": 22315, "train_speed(iter/s)": 0.201033 }, { "acc": 0.7349474, "epoch": 0.5662100456621004, "grad_norm": 2.34375, "learning_rate": 8.586543871081764e-06, "loss": 1.02579231, "memory(GiB)": 369.4, "step": 22320, "train_speed(iter/s)": 0.20104 }, { "acc": 0.75819178, "epoch": 0.5663368848300355, "grad_norm": 2.125, "learning_rate": 8.585813157973482e-06, "loss": 0.9707777, "memory(GiB)": 369.4, "step": 22325, "train_speed(iter/s)": 0.201049 }, { "acc": 0.73352275, "epoch": 0.5664637239979706, "grad_norm": 2.3125, "learning_rate": 8.58508228714513e-06, "loss": 1.01696625, "memory(GiB)": 369.4, "step": 22330, "train_speed(iter/s)": 0.201056 }, { "acc": 0.75367346, "epoch": 0.5665905631659056, "grad_norm": 2.21875, "learning_rate": 8.584351258628852e-06, "loss": 1.0014533, "memory(GiB)": 369.4, "step": 22335, "train_speed(iter/s)": 0.20106 }, { "acc": 0.76553297, "epoch": 0.5667174023338407, "grad_norm": 2.0625, "learning_rate": 8.583620072456803e-06, "loss": 0.90196257, "memory(GiB)": 369.4, "step": 22340, "train_speed(iter/s)": 0.201068 }, { "acc": 0.7580739, "epoch": 0.5668442415017757, "grad_norm": 2.1875, "learning_rate": 8.582888728661142e-06, "loss": 0.9950758, "memory(GiB)": 369.4, "step": 22345, "train_speed(iter/s)": 0.201074 }, { "acc": 0.74363785, "epoch": 0.5669710806697108, "grad_norm": 2.265625, "learning_rate": 8.582157227274042e-06, "loss": 1.01821022, "memory(GiB)": 369.4, "step": 22350, "train_speed(iter/s)": 0.20108 }, { "acc": 0.74474707, "epoch": 0.5670979198376459, "grad_norm": 2.296875, "learning_rate": 8.581425568327671e-06, "loss": 1.01042213, "memory(GiB)": 369.4, "step": 22355, "train_speed(iter/s)": 0.201085 }, { "acc": 0.73765678, "epoch": 0.5672247590055809, "grad_norm": 2.375, "learning_rate": 8.580693751854215e-06, "loss": 1.04819412, "memory(GiB)": 369.4, "step": 22360, "train_speed(iter/s)": 0.201095 }, { "acc": 0.75081472, "epoch": 0.567351598173516, "grad_norm": 2.03125, "learning_rate": 8.57996177788586e-06, "loss": 1.01074753, "memory(GiB)": 369.4, "step": 22365, "train_speed(iter/s)": 0.201103 }, { "acc": 0.74525094, "epoch": 0.5674784373414511, "grad_norm": 2.65625, "learning_rate": 8.579229646454803e-06, "loss": 1.01145868, "memory(GiB)": 369.4, "step": 22370, "train_speed(iter/s)": 0.201111 }, { "acc": 0.76096354, "epoch": 0.5676052765093861, "grad_norm": 3.234375, "learning_rate": 8.578497357593246e-06, "loss": 0.97023993, "memory(GiB)": 369.4, "step": 22375, "train_speed(iter/s)": 0.201121 }, { "acc": 0.75682821, "epoch": 0.5677321156773212, "grad_norm": 1.8125, "learning_rate": 8.5777649113334e-06, "loss": 0.95242682, "memory(GiB)": 369.4, "step": 22380, "train_speed(iter/s)": 0.201124 }, { "acc": 0.74805474, "epoch": 0.5678589548452562, "grad_norm": 2.1875, "learning_rate": 8.577032307707476e-06, "loss": 1.00046368, "memory(GiB)": 369.4, "step": 22385, "train_speed(iter/s)": 0.201132 }, { "acc": 0.72577801, "epoch": 0.5679857940131913, "grad_norm": 2.375, "learning_rate": 8.576299546747704e-06, "loss": 1.08262939, "memory(GiB)": 369.4, "step": 22390, "train_speed(iter/s)": 0.201143 }, { "acc": 0.74838152, "epoch": 0.5681126331811264, "grad_norm": 2.09375, "learning_rate": 8.575566628486309e-06, "loss": 1.02671204, "memory(GiB)": 369.4, "step": 22395, "train_speed(iter/s)": 0.20115 }, { "acc": 0.73009281, "epoch": 0.5682394723490614, "grad_norm": 2.21875, "learning_rate": 8.574833552955532e-06, "loss": 1.04727726, "memory(GiB)": 369.4, "step": 22400, "train_speed(iter/s)": 0.201156 }, { "acc": 0.74192643, "epoch": 0.5683663115169965, "grad_norm": 1.9453125, "learning_rate": 8.574100320187612e-06, "loss": 1.05882082, "memory(GiB)": 369.4, "step": 22405, "train_speed(iter/s)": 0.201161 }, { "acc": 0.74263668, "epoch": 0.5684931506849316, "grad_norm": 2.234375, "learning_rate": 8.573366930214807e-06, "loss": 1.05572987, "memory(GiB)": 369.4, "step": 22410, "train_speed(iter/s)": 0.201167 }, { "acc": 0.73111, "epoch": 0.5686199898528665, "grad_norm": 2.8125, "learning_rate": 8.572633383069366e-06, "loss": 1.06312237, "memory(GiB)": 369.4, "step": 22415, "train_speed(iter/s)": 0.201174 }, { "acc": 0.74129763, "epoch": 0.5687468290208016, "grad_norm": 2.9375, "learning_rate": 8.571899678783561e-06, "loss": 1.05078754, "memory(GiB)": 369.4, "step": 22420, "train_speed(iter/s)": 0.201182 }, { "acc": 0.74646034, "epoch": 0.5688736681887366, "grad_norm": 1.9453125, "learning_rate": 8.57116581738966e-06, "loss": 1.02633705, "memory(GiB)": 369.4, "step": 22425, "train_speed(iter/s)": 0.20119 }, { "acc": 0.74898968, "epoch": 0.5690005073566717, "grad_norm": 1.859375, "learning_rate": 8.570431798919941e-06, "loss": 0.99575195, "memory(GiB)": 369.4, "step": 22430, "train_speed(iter/s)": 0.201198 }, { "acc": 0.75203481, "epoch": 0.5691273465246068, "grad_norm": 2.484375, "learning_rate": 8.569697623406692e-06, "loss": 0.98300428, "memory(GiB)": 369.4, "step": 22435, "train_speed(iter/s)": 0.201202 }, { "acc": 0.74264483, "epoch": 0.5692541856925418, "grad_norm": 2.015625, "learning_rate": 8.568963290882204e-06, "loss": 1.03927574, "memory(GiB)": 369.4, "step": 22440, "train_speed(iter/s)": 0.201207 }, { "acc": 0.75349274, "epoch": 0.5693810248604769, "grad_norm": 2.1875, "learning_rate": 8.568228801378775e-06, "loss": 0.92861223, "memory(GiB)": 369.4, "step": 22445, "train_speed(iter/s)": 0.201213 }, { "acc": 0.75217061, "epoch": 0.569507864028412, "grad_norm": 2.078125, "learning_rate": 8.567494154928713e-06, "loss": 0.99605618, "memory(GiB)": 369.4, "step": 22450, "train_speed(iter/s)": 0.20122 }, { "acc": 0.740872, "epoch": 0.569634703196347, "grad_norm": 1.921875, "learning_rate": 8.566759351564332e-06, "loss": 1.02787457, "memory(GiB)": 369.4, "step": 22455, "train_speed(iter/s)": 0.201227 }, { "acc": 0.73578558, "epoch": 0.5697615423642821, "grad_norm": 2.15625, "learning_rate": 8.566024391317947e-06, "loss": 1.06451197, "memory(GiB)": 369.4, "step": 22460, "train_speed(iter/s)": 0.201233 }, { "acc": 0.75244203, "epoch": 0.5698883815322171, "grad_norm": 2.53125, "learning_rate": 8.565289274221891e-06, "loss": 0.97981224, "memory(GiB)": 369.4, "step": 22465, "train_speed(iter/s)": 0.20124 }, { "acc": 0.75440178, "epoch": 0.5700152207001522, "grad_norm": 1.9921875, "learning_rate": 8.564554000308493e-06, "loss": 1.00179367, "memory(GiB)": 369.4, "step": 22470, "train_speed(iter/s)": 0.201248 }, { "acc": 0.74743395, "epoch": 0.5701420598680873, "grad_norm": 2.265625, "learning_rate": 8.563818569610096e-06, "loss": 0.96430769, "memory(GiB)": 369.4, "step": 22475, "train_speed(iter/s)": 0.201256 }, { "acc": 0.74051023, "epoch": 0.5702688990360223, "grad_norm": 1.96875, "learning_rate": 8.563082982159048e-06, "loss": 1.01390362, "memory(GiB)": 369.4, "step": 22480, "train_speed(iter/s)": 0.201264 }, { "acc": 0.74983091, "epoch": 0.5703957382039574, "grad_norm": 2.234375, "learning_rate": 8.562347237987701e-06, "loss": 1.03478775, "memory(GiB)": 369.4, "step": 22485, "train_speed(iter/s)": 0.201267 }, { "acc": 0.72715702, "epoch": 0.5705225773718925, "grad_norm": 2.46875, "learning_rate": 8.561611337128418e-06, "loss": 1.05993614, "memory(GiB)": 369.4, "step": 22490, "train_speed(iter/s)": 0.201272 }, { "acc": 0.7555191, "epoch": 0.5706494165398275, "grad_norm": 2.25, "learning_rate": 8.560875279613568e-06, "loss": 0.97333603, "memory(GiB)": 369.4, "step": 22495, "train_speed(iter/s)": 0.201278 }, { "acc": 0.73663349, "epoch": 0.5707762557077626, "grad_norm": 2.046875, "learning_rate": 8.560139065475523e-06, "loss": 1.05922489, "memory(GiB)": 369.4, "step": 22500, "train_speed(iter/s)": 0.201283 }, { "acc": 0.76334963, "epoch": 0.5709030948756976, "grad_norm": 2.34375, "learning_rate": 8.559402694746671e-06, "loss": 0.99385357, "memory(GiB)": 369.4, "step": 22505, "train_speed(iter/s)": 0.201291 }, { "acc": 0.75963211, "epoch": 0.5710299340436327, "grad_norm": 1.921875, "learning_rate": 8.558666167459393e-06, "loss": 0.92319508, "memory(GiB)": 369.4, "step": 22510, "train_speed(iter/s)": 0.201297 }, { "acc": 0.74090767, "epoch": 0.5711567732115678, "grad_norm": 2.046875, "learning_rate": 8.55792948364609e-06, "loss": 1.12973423, "memory(GiB)": 369.4, "step": 22515, "train_speed(iter/s)": 0.201304 }, { "acc": 0.75871964, "epoch": 0.5712836123795028, "grad_norm": 2.28125, "learning_rate": 8.557192643339164e-06, "loss": 1.04520187, "memory(GiB)": 369.4, "step": 22520, "train_speed(iter/s)": 0.201307 }, { "acc": 0.75331068, "epoch": 0.5714104515474379, "grad_norm": 1.625, "learning_rate": 8.556455646571022e-06, "loss": 1.00139494, "memory(GiB)": 369.4, "step": 22525, "train_speed(iter/s)": 0.201311 }, { "acc": 0.7571887, "epoch": 0.571537290715373, "grad_norm": 2.203125, "learning_rate": 8.555718493374084e-06, "loss": 1.03027802, "memory(GiB)": 369.4, "step": 22530, "train_speed(iter/s)": 0.201314 }, { "acc": 0.74437914, "epoch": 0.571664129883308, "grad_norm": 2.125, "learning_rate": 8.55498118378077e-06, "loss": 1.03984785, "memory(GiB)": 369.4, "step": 22535, "train_speed(iter/s)": 0.20132 }, { "acc": 0.76544113, "epoch": 0.571790969051243, "grad_norm": 2.921875, "learning_rate": 8.554243717823512e-06, "loss": 0.94664078, "memory(GiB)": 369.4, "step": 22540, "train_speed(iter/s)": 0.201328 }, { "acc": 0.7466465, "epoch": 0.571917808219178, "grad_norm": 2.09375, "learning_rate": 8.553506095534747e-06, "loss": 1.04866676, "memory(GiB)": 369.4, "step": 22545, "train_speed(iter/s)": 0.201335 }, { "acc": 0.75410271, "epoch": 0.5720446473871131, "grad_norm": 2.21875, "learning_rate": 8.55276831694692e-06, "loss": 0.97014465, "memory(GiB)": 369.4, "step": 22550, "train_speed(iter/s)": 0.20134 }, { "acc": 0.74746714, "epoch": 0.5721714865550482, "grad_norm": 2.015625, "learning_rate": 8.552030382092477e-06, "loss": 1.02496433, "memory(GiB)": 369.4, "step": 22555, "train_speed(iter/s)": 0.201345 }, { "acc": 0.75177836, "epoch": 0.5722983257229832, "grad_norm": 2.0625, "learning_rate": 8.551292291003884e-06, "loss": 1.02743664, "memory(GiB)": 369.4, "step": 22560, "train_speed(iter/s)": 0.201351 }, { "acc": 0.75441618, "epoch": 0.5724251648909183, "grad_norm": 2.46875, "learning_rate": 8.550554043713597e-06, "loss": 1.02186995, "memory(GiB)": 369.4, "step": 22565, "train_speed(iter/s)": 0.201358 }, { "acc": 0.75449739, "epoch": 0.5725520040588534, "grad_norm": 2.03125, "learning_rate": 8.549815640254092e-06, "loss": 0.97134304, "memory(GiB)": 369.4, "step": 22570, "train_speed(iter/s)": 0.201365 }, { "acc": 0.75078363, "epoch": 0.5726788432267884, "grad_norm": 2.34375, "learning_rate": 8.549077080657846e-06, "loss": 1.00384836, "memory(GiB)": 369.4, "step": 22575, "train_speed(iter/s)": 0.201374 }, { "acc": 0.73269868, "epoch": 0.5728056823947235, "grad_norm": 2.40625, "learning_rate": 8.548338364957345e-06, "loss": 1.07550459, "memory(GiB)": 369.4, "step": 22580, "train_speed(iter/s)": 0.201377 }, { "acc": 0.7399785, "epoch": 0.5729325215626585, "grad_norm": 2.109375, "learning_rate": 8.54759949318508e-06, "loss": 1.02886343, "memory(GiB)": 369.4, "step": 22585, "train_speed(iter/s)": 0.201383 }, { "acc": 0.74158926, "epoch": 0.5730593607305936, "grad_norm": 1.796875, "learning_rate": 8.546860465373552e-06, "loss": 0.9811698, "memory(GiB)": 369.4, "step": 22590, "train_speed(iter/s)": 0.201388 }, { "acc": 0.75272188, "epoch": 0.5731861998985287, "grad_norm": 2.046875, "learning_rate": 8.546121281555265e-06, "loss": 0.97116728, "memory(GiB)": 369.4, "step": 22595, "train_speed(iter/s)": 0.201394 }, { "acc": 0.75601583, "epoch": 0.5733130390664637, "grad_norm": 2.25, "learning_rate": 8.54538194176273e-06, "loss": 0.9834713, "memory(GiB)": 369.4, "step": 22600, "train_speed(iter/s)": 0.201395 }, { "acc": 0.74257503, "epoch": 0.5734398782343988, "grad_norm": 2.28125, "learning_rate": 8.544642446028469e-06, "loss": 1.00994606, "memory(GiB)": 369.4, "step": 22605, "train_speed(iter/s)": 0.201403 }, { "acc": 0.72771211, "epoch": 0.5735667174023339, "grad_norm": 2.515625, "learning_rate": 8.543902794385008e-06, "loss": 1.10515108, "memory(GiB)": 369.4, "step": 22610, "train_speed(iter/s)": 0.201406 }, { "acc": 0.73522987, "epoch": 0.5736935565702689, "grad_norm": 2.1875, "learning_rate": 8.543162986864879e-06, "loss": 1.06823254, "memory(GiB)": 369.4, "step": 22615, "train_speed(iter/s)": 0.201412 }, { "acc": 0.75547376, "epoch": 0.573820395738204, "grad_norm": 2.203125, "learning_rate": 8.542423023500623e-06, "loss": 0.96773624, "memory(GiB)": 369.4, "step": 22620, "train_speed(iter/s)": 0.201419 }, { "acc": 0.75191269, "epoch": 0.573947234906139, "grad_norm": 2.21875, "learning_rate": 8.541682904324786e-06, "loss": 0.99670219, "memory(GiB)": 369.4, "step": 22625, "train_speed(iter/s)": 0.201424 }, { "acc": 0.74368315, "epoch": 0.5740740740740741, "grad_norm": 2.234375, "learning_rate": 8.540942629369923e-06, "loss": 1.03187065, "memory(GiB)": 369.4, "step": 22630, "train_speed(iter/s)": 0.201431 }, { "acc": 0.72704344, "epoch": 0.5742009132420092, "grad_norm": 2.265625, "learning_rate": 8.540202198668595e-06, "loss": 1.02265167, "memory(GiB)": 369.4, "step": 22635, "train_speed(iter/s)": 0.20144 }, { "acc": 0.74247093, "epoch": 0.5743277524099442, "grad_norm": 2.171875, "learning_rate": 8.539461612253368e-06, "loss": 0.98060341, "memory(GiB)": 369.4, "step": 22640, "train_speed(iter/s)": 0.201443 }, { "acc": 0.74378719, "epoch": 0.5744545915778793, "grad_norm": 2.40625, "learning_rate": 8.538720870156816e-06, "loss": 1.01176424, "memory(GiB)": 369.4, "step": 22645, "train_speed(iter/s)": 0.20145 }, { "acc": 0.74431067, "epoch": 0.5745814307458144, "grad_norm": 2.09375, "learning_rate": 8.53797997241152e-06, "loss": 0.96676769, "memory(GiB)": 369.4, "step": 22650, "train_speed(iter/s)": 0.201457 }, { "acc": 0.75698977, "epoch": 0.5747082699137493, "grad_norm": 2.765625, "learning_rate": 8.537238919050071e-06, "loss": 0.9880024, "memory(GiB)": 369.4, "step": 22655, "train_speed(iter/s)": 0.201467 }, { "acc": 0.75724115, "epoch": 0.5748351090816844, "grad_norm": 2.125, "learning_rate": 8.53649771010506e-06, "loss": 0.95084286, "memory(GiB)": 369.4, "step": 22660, "train_speed(iter/s)": 0.201474 }, { "acc": 0.74179411, "epoch": 0.5749619482496194, "grad_norm": 2.34375, "learning_rate": 8.535756345609092e-06, "loss": 1.00367069, "memory(GiB)": 369.4, "step": 22665, "train_speed(iter/s)": 0.201483 }, { "acc": 0.74804296, "epoch": 0.5750887874175545, "grad_norm": 2.390625, "learning_rate": 8.535014825594772e-06, "loss": 0.98422489, "memory(GiB)": 369.4, "step": 22670, "train_speed(iter/s)": 0.201482 }, { "acc": 0.74814525, "epoch": 0.5752156265854896, "grad_norm": 1.8828125, "learning_rate": 8.534273150094718e-06, "loss": 1.01832657, "memory(GiB)": 369.4, "step": 22675, "train_speed(iter/s)": 0.201483 }, { "acc": 0.73494568, "epoch": 0.5753424657534246, "grad_norm": 2.890625, "learning_rate": 8.533531319141552e-06, "loss": 0.98139, "memory(GiB)": 369.4, "step": 22680, "train_speed(iter/s)": 0.20149 }, { "acc": 0.72794447, "epoch": 0.5754693049213597, "grad_norm": 2.390625, "learning_rate": 8.532789332767902e-06, "loss": 1.07503223, "memory(GiB)": 369.4, "step": 22685, "train_speed(iter/s)": 0.201497 }, { "acc": 0.73732429, "epoch": 0.5755961440892948, "grad_norm": 2.203125, "learning_rate": 8.532047191006405e-06, "loss": 1.00737076, "memory(GiB)": 369.4, "step": 22690, "train_speed(iter/s)": 0.201504 }, { "acc": 0.74117041, "epoch": 0.5757229832572298, "grad_norm": 1.7578125, "learning_rate": 8.531304893889702e-06, "loss": 1.00831833, "memory(GiB)": 369.4, "step": 22695, "train_speed(iter/s)": 0.201512 }, { "acc": 0.76276817, "epoch": 0.5758498224251649, "grad_norm": 1.9921875, "learning_rate": 8.530562441450445e-06, "loss": 1.01218901, "memory(GiB)": 369.4, "step": 22700, "train_speed(iter/s)": 0.201519 }, { "acc": 0.75294266, "epoch": 0.5759766615930999, "grad_norm": 2.15625, "learning_rate": 8.529819833721289e-06, "loss": 0.99287434, "memory(GiB)": 369.4, "step": 22705, "train_speed(iter/s)": 0.201525 }, { "acc": 0.74269772, "epoch": 0.576103500761035, "grad_norm": 2.296875, "learning_rate": 8.529077070734896e-06, "loss": 1.04732265, "memory(GiB)": 369.4, "step": 22710, "train_speed(iter/s)": 0.201534 }, { "acc": 0.74241247, "epoch": 0.5762303399289701, "grad_norm": 2.390625, "learning_rate": 8.528334152523938e-06, "loss": 1.0987278, "memory(GiB)": 369.4, "step": 22715, "train_speed(iter/s)": 0.20154 }, { "acc": 0.75570059, "epoch": 0.5763571790969051, "grad_norm": 2.296875, "learning_rate": 8.52759107912109e-06, "loss": 0.96956844, "memory(GiB)": 369.4, "step": 22720, "train_speed(iter/s)": 0.201547 }, { "acc": 0.75067034, "epoch": 0.5764840182648402, "grad_norm": 2.0625, "learning_rate": 8.526847850559037e-06, "loss": 0.97738361, "memory(GiB)": 369.4, "step": 22725, "train_speed(iter/s)": 0.201554 }, { "acc": 0.73853536, "epoch": 0.5766108574327753, "grad_norm": 1.8046875, "learning_rate": 8.526104466870472e-06, "loss": 1.05799646, "memory(GiB)": 369.4, "step": 22730, "train_speed(iter/s)": 0.201551 }, { "acc": 0.75265517, "epoch": 0.5767376966007103, "grad_norm": 1.9375, "learning_rate": 8.525360928088087e-06, "loss": 0.97723656, "memory(GiB)": 369.4, "step": 22735, "train_speed(iter/s)": 0.201555 }, { "acc": 0.74853435, "epoch": 0.5768645357686454, "grad_norm": 2.046875, "learning_rate": 8.524617234244588e-06, "loss": 1.01336842, "memory(GiB)": 369.4, "step": 22740, "train_speed(iter/s)": 0.20156 }, { "acc": 0.74405832, "epoch": 0.5769913749365804, "grad_norm": 2.234375, "learning_rate": 8.523873385372687e-06, "loss": 0.93044243, "memory(GiB)": 369.4, "step": 22745, "train_speed(iter/s)": 0.201561 }, { "acc": 0.73878717, "epoch": 0.5771182141045155, "grad_norm": 2.203125, "learning_rate": 8.523129381505104e-06, "loss": 1.0715189, "memory(GiB)": 369.4, "step": 22750, "train_speed(iter/s)": 0.201569 }, { "acc": 0.76156826, "epoch": 0.5772450532724506, "grad_norm": 2.171875, "learning_rate": 8.522385222674559e-06, "loss": 0.96379519, "memory(GiB)": 369.4, "step": 22755, "train_speed(iter/s)": 0.201578 }, { "acc": 0.75642729, "epoch": 0.5773718924403856, "grad_norm": 2.109375, "learning_rate": 8.521640908913787e-06, "loss": 0.99734058, "memory(GiB)": 369.4, "step": 22760, "train_speed(iter/s)": 0.201584 }, { "acc": 0.75015888, "epoch": 0.5774987316083207, "grad_norm": 1.984375, "learning_rate": 8.520896440255524e-06, "loss": 1.01610622, "memory(GiB)": 369.4, "step": 22765, "train_speed(iter/s)": 0.201578 }, { "acc": 0.75718431, "epoch": 0.5776255707762558, "grad_norm": 2.515625, "learning_rate": 8.520151816732517e-06, "loss": 1.00084667, "memory(GiB)": 369.4, "step": 22770, "train_speed(iter/s)": 0.201588 }, { "acc": 0.73345242, "epoch": 0.5777524099441907, "grad_norm": 2.4375, "learning_rate": 8.519407038377515e-06, "loss": 1.09035072, "memory(GiB)": 369.4, "step": 22775, "train_speed(iter/s)": 0.201593 }, { "acc": 0.74281044, "epoch": 0.5778792491121258, "grad_norm": 1.9453125, "learning_rate": 8.518662105223279e-06, "loss": 1.04372921, "memory(GiB)": 369.4, "step": 22780, "train_speed(iter/s)": 0.201601 }, { "acc": 0.74353237, "epoch": 0.5780060882800608, "grad_norm": 2.421875, "learning_rate": 8.517917017302574e-06, "loss": 1.04124546, "memory(GiB)": 369.4, "step": 22785, "train_speed(iter/s)": 0.201607 }, { "acc": 0.7411232, "epoch": 0.5781329274479959, "grad_norm": 2.046875, "learning_rate": 8.517171774648172e-06, "loss": 1.07900906, "memory(GiB)": 369.4, "step": 22790, "train_speed(iter/s)": 0.201607 }, { "acc": 0.74347086, "epoch": 0.578259766615931, "grad_norm": 2.0, "learning_rate": 8.516426377292854e-06, "loss": 1.02547798, "memory(GiB)": 369.4, "step": 22795, "train_speed(iter/s)": 0.201614 }, { "acc": 0.74724507, "epoch": 0.578386605783866, "grad_norm": 1.9375, "learning_rate": 8.515680825269404e-06, "loss": 1.00600948, "memory(GiB)": 369.4, "step": 22800, "train_speed(iter/s)": 0.201621 }, { "acc": 0.74303555, "epoch": 0.5785134449518011, "grad_norm": 2.125, "learning_rate": 8.514935118610613e-06, "loss": 1.08561935, "memory(GiB)": 369.4, "step": 22805, "train_speed(iter/s)": 0.201627 }, { "acc": 0.75101395, "epoch": 0.5786402841197362, "grad_norm": 2.03125, "learning_rate": 8.514189257349283e-06, "loss": 0.98817739, "memory(GiB)": 369.4, "step": 22810, "train_speed(iter/s)": 0.20163 }, { "acc": 0.75212908, "epoch": 0.5787671232876712, "grad_norm": 2.0625, "learning_rate": 8.51344324151822e-06, "loss": 0.95551863, "memory(GiB)": 369.4, "step": 22815, "train_speed(iter/s)": 0.201635 }, { "acc": 0.74962006, "epoch": 0.5788939624556063, "grad_norm": 2.234375, "learning_rate": 8.512697071150235e-06, "loss": 0.98266582, "memory(GiB)": 369.4, "step": 22820, "train_speed(iter/s)": 0.201636 }, { "acc": 0.74875722, "epoch": 0.5790208016235413, "grad_norm": 2.21875, "learning_rate": 8.511950746278152e-06, "loss": 0.93796616, "memory(GiB)": 369.4, "step": 22825, "train_speed(iter/s)": 0.201643 }, { "acc": 0.73437414, "epoch": 0.5791476407914764, "grad_norm": 2.34375, "learning_rate": 8.511204266934797e-06, "loss": 1.04684267, "memory(GiB)": 369.4, "step": 22830, "train_speed(iter/s)": 0.201647 }, { "acc": 0.72835732, "epoch": 0.5792744799594115, "grad_norm": 2.0, "learning_rate": 8.510457633152998e-06, "loss": 1.07939854, "memory(GiB)": 369.4, "step": 22835, "train_speed(iter/s)": 0.201653 }, { "acc": 0.73903885, "epoch": 0.5794013191273465, "grad_norm": 2.0, "learning_rate": 8.509710844965602e-06, "loss": 1.02611408, "memory(GiB)": 369.4, "step": 22840, "train_speed(iter/s)": 0.201658 }, { "acc": 0.74031744, "epoch": 0.5795281582952816, "grad_norm": 2.46875, "learning_rate": 8.508963902405451e-06, "loss": 1.05818939, "memory(GiB)": 369.4, "step": 22845, "train_speed(iter/s)": 0.201663 }, { "acc": 0.74725132, "epoch": 0.5796549974632167, "grad_norm": 2.15625, "learning_rate": 8.508216805505403e-06, "loss": 0.97558689, "memory(GiB)": 369.4, "step": 22850, "train_speed(iter/s)": 0.201668 }, { "acc": 0.7461216, "epoch": 0.5797818366311517, "grad_norm": 1.8984375, "learning_rate": 8.507469554298318e-06, "loss": 1.0124321, "memory(GiB)": 369.4, "step": 22855, "train_speed(iter/s)": 0.20167 }, { "acc": 0.75408764, "epoch": 0.5799086757990868, "grad_norm": 2.234375, "learning_rate": 8.506722148817061e-06, "loss": 1.0110733, "memory(GiB)": 369.4, "step": 22860, "train_speed(iter/s)": 0.201678 }, { "acc": 0.75370402, "epoch": 0.5800355149670218, "grad_norm": 2.0, "learning_rate": 8.505974589094505e-06, "loss": 0.96751862, "memory(GiB)": 369.4, "step": 22865, "train_speed(iter/s)": 0.201683 }, { "acc": 0.75348892, "epoch": 0.5801623541349569, "grad_norm": 1.875, "learning_rate": 8.505226875163537e-06, "loss": 1.03395538, "memory(GiB)": 369.4, "step": 22870, "train_speed(iter/s)": 0.201689 }, { "acc": 0.75362778, "epoch": 0.580289193302892, "grad_norm": 2.375, "learning_rate": 8.50447900705704e-06, "loss": 0.98010283, "memory(GiB)": 369.4, "step": 22875, "train_speed(iter/s)": 0.201696 }, { "acc": 0.74887919, "epoch": 0.580416032470827, "grad_norm": 1.640625, "learning_rate": 8.503730984807911e-06, "loss": 0.96311111, "memory(GiB)": 369.4, "step": 22880, "train_speed(iter/s)": 0.201698 }, { "acc": 0.75956178, "epoch": 0.5805428716387621, "grad_norm": 1.9609375, "learning_rate": 8.502982808449049e-06, "loss": 0.96979475, "memory(GiB)": 369.4, "step": 22885, "train_speed(iter/s)": 0.201703 }, { "acc": 0.73647318, "epoch": 0.5806697108066972, "grad_norm": 2.0625, "learning_rate": 8.502234478013363e-06, "loss": 1.09686489, "memory(GiB)": 369.4, "step": 22890, "train_speed(iter/s)": 0.20171 }, { "acc": 0.73931942, "epoch": 0.5807965499746321, "grad_norm": 2.234375, "learning_rate": 8.501485993533769e-06, "loss": 1.04557934, "memory(GiB)": 369.4, "step": 22895, "train_speed(iter/s)": 0.201717 }, { "acc": 0.75094657, "epoch": 0.5809233891425672, "grad_norm": 2.34375, "learning_rate": 8.500737355043188e-06, "loss": 1.01269312, "memory(GiB)": 369.4, "step": 22900, "train_speed(iter/s)": 0.201724 }, { "acc": 0.75487666, "epoch": 0.5810502283105022, "grad_norm": 2.078125, "learning_rate": 8.499988562574549e-06, "loss": 0.99939804, "memory(GiB)": 369.4, "step": 22905, "train_speed(iter/s)": 0.20173 }, { "acc": 0.7573566, "epoch": 0.5811770674784373, "grad_norm": 2.609375, "learning_rate": 8.499239616160787e-06, "loss": 1.04672394, "memory(GiB)": 369.4, "step": 22910, "train_speed(iter/s)": 0.201736 }, { "acc": 0.75317807, "epoch": 0.5813039066463724, "grad_norm": 2.34375, "learning_rate": 8.498490515834841e-06, "loss": 1.00883999, "memory(GiB)": 369.4, "step": 22915, "train_speed(iter/s)": 0.201744 }, { "acc": 0.74067349, "epoch": 0.5814307458143074, "grad_norm": 1.9765625, "learning_rate": 8.497741261629664e-06, "loss": 1.03783741, "memory(GiB)": 369.4, "step": 22920, "train_speed(iter/s)": 0.201747 }, { "acc": 0.74552398, "epoch": 0.5815575849822425, "grad_norm": 2.046875, "learning_rate": 8.496991853578212e-06, "loss": 0.96901283, "memory(GiB)": 369.4, "step": 22925, "train_speed(iter/s)": 0.201756 }, { "acc": 0.74264278, "epoch": 0.5816844241501776, "grad_norm": 2.15625, "learning_rate": 8.496242291713444e-06, "loss": 1.01541595, "memory(GiB)": 369.4, "step": 22930, "train_speed(iter/s)": 0.201761 }, { "acc": 0.74777622, "epoch": 0.5818112633181126, "grad_norm": 2.453125, "learning_rate": 8.495492576068329e-06, "loss": 0.99609108, "memory(GiB)": 369.4, "step": 22935, "train_speed(iter/s)": 0.201766 }, { "acc": 0.74743481, "epoch": 0.5819381024860477, "grad_norm": 2.015625, "learning_rate": 8.494742706675844e-06, "loss": 1.03181229, "memory(GiB)": 369.4, "step": 22940, "train_speed(iter/s)": 0.201771 }, { "acc": 0.74206529, "epoch": 0.5820649416539827, "grad_norm": 2.25, "learning_rate": 8.493992683568975e-06, "loss": 1.06558876, "memory(GiB)": 369.4, "step": 22945, "train_speed(iter/s)": 0.201776 }, { "acc": 0.75361471, "epoch": 0.5821917808219178, "grad_norm": 2.84375, "learning_rate": 8.493242506780705e-06, "loss": 1.01070137, "memory(GiB)": 369.4, "step": 22950, "train_speed(iter/s)": 0.201783 }, { "acc": 0.74281855, "epoch": 0.5823186199898529, "grad_norm": 2.75, "learning_rate": 8.492492176344035e-06, "loss": 1.06944122, "memory(GiB)": 369.4, "step": 22955, "train_speed(iter/s)": 0.201792 }, { "acc": 0.75256166, "epoch": 0.5824454591577879, "grad_norm": 1.8359375, "learning_rate": 8.491741692291967e-06, "loss": 0.9934474, "memory(GiB)": 369.4, "step": 22960, "train_speed(iter/s)": 0.201798 }, { "acc": 0.7279706, "epoch": 0.582572298325723, "grad_norm": 2.53125, "learning_rate": 8.490991054657507e-06, "loss": 1.07604885, "memory(GiB)": 369.4, "step": 22965, "train_speed(iter/s)": 0.201804 }, { "acc": 0.74159794, "epoch": 0.5826991374936581, "grad_norm": 2.25, "learning_rate": 8.490240263473677e-06, "loss": 1.06396904, "memory(GiB)": 369.4, "step": 22970, "train_speed(iter/s)": 0.201806 }, { "acc": 0.74270353, "epoch": 0.5828259766615931, "grad_norm": 2.75, "learning_rate": 8.489489318773496e-06, "loss": 1.04966831, "memory(GiB)": 369.4, "step": 22975, "train_speed(iter/s)": 0.201814 }, { "acc": 0.73716888, "epoch": 0.5829528158295282, "grad_norm": 2.546875, "learning_rate": 8.488738220589996e-06, "loss": 1.08355141, "memory(GiB)": 369.4, "step": 22980, "train_speed(iter/s)": 0.201821 }, { "acc": 0.75542326, "epoch": 0.5830796549974632, "grad_norm": 1.8671875, "learning_rate": 8.487986968956212e-06, "loss": 0.98105869, "memory(GiB)": 369.4, "step": 22985, "train_speed(iter/s)": 0.201827 }, { "acc": 0.75280147, "epoch": 0.5832064941653983, "grad_norm": 1.9453125, "learning_rate": 8.487235563905191e-06, "loss": 0.96603146, "memory(GiB)": 369.4, "step": 22990, "train_speed(iter/s)": 0.201835 }, { "acc": 0.74322052, "epoch": 0.5833333333333334, "grad_norm": 2.359375, "learning_rate": 8.486484005469977e-06, "loss": 0.9780077, "memory(GiB)": 369.4, "step": 22995, "train_speed(iter/s)": 0.201843 }, { "acc": 0.75595007, "epoch": 0.5834601725012684, "grad_norm": 2.046875, "learning_rate": 8.485732293683633e-06, "loss": 0.97658377, "memory(GiB)": 369.4, "step": 23000, "train_speed(iter/s)": 0.201849 }, { "epoch": 0.5834601725012684, "eval_acc": 0.7355491154403874, "eval_loss": 0.9813453555107117, "eval_runtime": 384.5291, "eval_samples_per_second": 16.566, "eval_steps_per_second": 8.283, "step": 23000 }, { "acc": 0.74136386, "epoch": 0.5835870116692035, "grad_norm": 2.5625, "learning_rate": 8.48498042857922e-06, "loss": 1.04021091, "memory(GiB)": 369.4, "step": 23005, "train_speed(iter/s)": 0.200605 }, { "acc": 0.73497419, "epoch": 0.5837138508371386, "grad_norm": 2.03125, "learning_rate": 8.484228410189807e-06, "loss": 1.02053432, "memory(GiB)": 369.4, "step": 23010, "train_speed(iter/s)": 0.200608 }, { "acc": 0.73972306, "epoch": 0.5838406900050735, "grad_norm": 2.171875, "learning_rate": 8.483476238548473e-06, "loss": 1.03810749, "memory(GiB)": 369.4, "step": 23015, "train_speed(iter/s)": 0.200613 }, { "acc": 0.74041147, "epoch": 0.5839675291730086, "grad_norm": 1.96875, "learning_rate": 8.482723913688301e-06, "loss": 1.0249896, "memory(GiB)": 369.4, "step": 23020, "train_speed(iter/s)": 0.20062 }, { "acc": 0.75161524, "epoch": 0.5840943683409436, "grad_norm": 2.171875, "learning_rate": 8.481971435642382e-06, "loss": 0.98914757, "memory(GiB)": 369.4, "step": 23025, "train_speed(iter/s)": 0.200625 }, { "acc": 0.74791999, "epoch": 0.5842212075088787, "grad_norm": 2.03125, "learning_rate": 8.481218804443814e-06, "loss": 0.95551243, "memory(GiB)": 369.4, "step": 23030, "train_speed(iter/s)": 0.200631 }, { "acc": 0.73392563, "epoch": 0.5843480466768138, "grad_norm": 2.125, "learning_rate": 8.480466020125701e-06, "loss": 1.05597582, "memory(GiB)": 369.4, "step": 23035, "train_speed(iter/s)": 0.200636 }, { "acc": 0.73947783, "epoch": 0.5844748858447488, "grad_norm": 1.828125, "learning_rate": 8.479713082721153e-06, "loss": 1.0349987, "memory(GiB)": 369.4, "step": 23040, "train_speed(iter/s)": 0.200642 }, { "acc": 0.76191244, "epoch": 0.5846017250126839, "grad_norm": 2.34375, "learning_rate": 8.478959992263288e-06, "loss": 0.95947132, "memory(GiB)": 369.4, "step": 23045, "train_speed(iter/s)": 0.200646 }, { "acc": 0.74349194, "epoch": 0.584728564180619, "grad_norm": 2.09375, "learning_rate": 8.478206748785229e-06, "loss": 1.05786228, "memory(GiB)": 369.4, "step": 23050, "train_speed(iter/s)": 0.200653 }, { "acc": 0.74732113, "epoch": 0.584855403348554, "grad_norm": 1.8359375, "learning_rate": 8.477453352320108e-06, "loss": 0.98138638, "memory(GiB)": 369.4, "step": 23055, "train_speed(iter/s)": 0.200661 }, { "acc": 0.74996581, "epoch": 0.5849822425164891, "grad_norm": 1.9296875, "learning_rate": 8.476699802901066e-06, "loss": 1.00152283, "memory(GiB)": 369.4, "step": 23060, "train_speed(iter/s)": 0.200668 }, { "acc": 0.74881287, "epoch": 0.5851090816844241, "grad_norm": 2.046875, "learning_rate": 8.47594610056124e-06, "loss": 0.99832096, "memory(GiB)": 369.4, "step": 23065, "train_speed(iter/s)": 0.200675 }, { "acc": 0.7446207, "epoch": 0.5852359208523592, "grad_norm": 2.234375, "learning_rate": 8.475192245333787e-06, "loss": 1.00104589, "memory(GiB)": 369.4, "step": 23070, "train_speed(iter/s)": 0.200681 }, { "acc": 0.75535479, "epoch": 0.5853627600202943, "grad_norm": 2.328125, "learning_rate": 8.474438237251864e-06, "loss": 0.96360683, "memory(GiB)": 369.4, "step": 23075, "train_speed(iter/s)": 0.200687 }, { "acc": 0.74749212, "epoch": 0.5854895991882293, "grad_norm": 1.953125, "learning_rate": 8.473684076348635e-06, "loss": 0.99494276, "memory(GiB)": 369.4, "step": 23080, "train_speed(iter/s)": 0.200692 }, { "acc": 0.74073997, "epoch": 0.5856164383561644, "grad_norm": 2.34375, "learning_rate": 8.472929762657272e-06, "loss": 0.99982853, "memory(GiB)": 369.4, "step": 23085, "train_speed(iter/s)": 0.200699 }, { "acc": 0.75638342, "epoch": 0.5857432775240995, "grad_norm": 2.09375, "learning_rate": 8.472175296210952e-06, "loss": 0.98772955, "memory(GiB)": 369.4, "step": 23090, "train_speed(iter/s)": 0.200706 }, { "acc": 0.74741726, "epoch": 0.5858701166920345, "grad_norm": 2.171875, "learning_rate": 8.471420677042858e-06, "loss": 1.00734386, "memory(GiB)": 369.4, "step": 23095, "train_speed(iter/s)": 0.200713 }, { "acc": 0.7521143, "epoch": 0.5859969558599696, "grad_norm": 2.1875, "learning_rate": 8.470665905186188e-06, "loss": 0.95700264, "memory(GiB)": 369.4, "step": 23100, "train_speed(iter/s)": 0.200719 }, { "acc": 0.74086766, "epoch": 0.5861237950279046, "grad_norm": 1.8515625, "learning_rate": 8.469910980674134e-06, "loss": 1.02784615, "memory(GiB)": 369.4, "step": 23105, "train_speed(iter/s)": 0.200728 }, { "acc": 0.76322908, "epoch": 0.5862506341958397, "grad_norm": 2.171875, "learning_rate": 8.469155903539903e-06, "loss": 0.91041756, "memory(GiB)": 369.4, "step": 23110, "train_speed(iter/s)": 0.200734 }, { "acc": 0.74396639, "epoch": 0.5863774733637748, "grad_norm": 2.5625, "learning_rate": 8.468400673816705e-06, "loss": 1.03989239, "memory(GiB)": 369.4, "step": 23115, "train_speed(iter/s)": 0.200738 }, { "acc": 0.74446607, "epoch": 0.5865043125317098, "grad_norm": 2.5, "learning_rate": 8.467645291537763e-06, "loss": 1.08600702, "memory(GiB)": 369.4, "step": 23120, "train_speed(iter/s)": 0.200745 }, { "acc": 0.74943686, "epoch": 0.5866311516996449, "grad_norm": 2.109375, "learning_rate": 8.466889756736298e-06, "loss": 0.97406235, "memory(GiB)": 369.4, "step": 23125, "train_speed(iter/s)": 0.200754 }, { "acc": 0.74419689, "epoch": 0.58675799086758, "grad_norm": 1.9765625, "learning_rate": 8.466134069445544e-06, "loss": 0.99611683, "memory(GiB)": 369.4, "step": 23130, "train_speed(iter/s)": 0.200758 }, { "acc": 0.74125986, "epoch": 0.586884830035515, "grad_norm": 2.0, "learning_rate": 8.465378229698737e-06, "loss": 1.03315277, "memory(GiB)": 369.4, "step": 23135, "train_speed(iter/s)": 0.200766 }, { "acc": 0.73458757, "epoch": 0.58701166920345, "grad_norm": 2.109375, "learning_rate": 8.464622237529123e-06, "loss": 1.06862259, "memory(GiB)": 369.4, "step": 23140, "train_speed(iter/s)": 0.20077 }, { "acc": 0.75382652, "epoch": 0.587138508371385, "grad_norm": 2.296875, "learning_rate": 8.463866092969958e-06, "loss": 0.96754398, "memory(GiB)": 369.4, "step": 23145, "train_speed(iter/s)": 0.200775 }, { "acc": 0.74776325, "epoch": 0.5872653475393201, "grad_norm": 2.078125, "learning_rate": 8.463109796054495e-06, "loss": 1.03935833, "memory(GiB)": 369.4, "step": 23150, "train_speed(iter/s)": 0.200776 }, { "acc": 0.75018387, "epoch": 0.5873921867072552, "grad_norm": 2.484375, "learning_rate": 8.462353346815999e-06, "loss": 0.99975491, "memory(GiB)": 369.4, "step": 23155, "train_speed(iter/s)": 0.200785 }, { "acc": 0.75127268, "epoch": 0.5875190258751902, "grad_norm": 2.359375, "learning_rate": 8.461596745287747e-06, "loss": 0.99934769, "memory(GiB)": 369.4, "step": 23160, "train_speed(iter/s)": 0.200791 }, { "acc": 0.72511911, "epoch": 0.5876458650431253, "grad_norm": 2.140625, "learning_rate": 8.460839991503016e-06, "loss": 1.07434731, "memory(GiB)": 369.4, "step": 23165, "train_speed(iter/s)": 0.2008 }, { "acc": 0.7661046, "epoch": 0.5877727042110604, "grad_norm": 2.796875, "learning_rate": 8.46008308549509e-06, "loss": 0.9503849, "memory(GiB)": 369.4, "step": 23170, "train_speed(iter/s)": 0.200808 }, { "acc": 0.74849348, "epoch": 0.5878995433789954, "grad_norm": 2.125, "learning_rate": 8.459326027297261e-06, "loss": 0.99531918, "memory(GiB)": 369.4, "step": 23175, "train_speed(iter/s)": 0.200813 }, { "acc": 0.745117, "epoch": 0.5880263825469305, "grad_norm": 2.3125, "learning_rate": 8.45856881694283e-06, "loss": 0.98941917, "memory(GiB)": 369.4, "step": 23180, "train_speed(iter/s)": 0.200821 }, { "acc": 0.74367495, "epoch": 0.5881532217148655, "grad_norm": 2.140625, "learning_rate": 8.4578114544651e-06, "loss": 0.95660334, "memory(GiB)": 369.4, "step": 23185, "train_speed(iter/s)": 0.200825 }, { "acc": 0.75663548, "epoch": 0.5882800608828006, "grad_norm": 2.15625, "learning_rate": 8.457053939897385e-06, "loss": 1.01431656, "memory(GiB)": 369.4, "step": 23190, "train_speed(iter/s)": 0.200832 }, { "acc": 0.76025496, "epoch": 0.5884069000507357, "grad_norm": 1.890625, "learning_rate": 8.456296273273e-06, "loss": 0.97509069, "memory(GiB)": 369.4, "step": 23195, "train_speed(iter/s)": 0.200837 }, { "acc": 0.75572109, "epoch": 0.5885337392186707, "grad_norm": 2.296875, "learning_rate": 8.455538454625276e-06, "loss": 1.00172224, "memory(GiB)": 369.4, "step": 23200, "train_speed(iter/s)": 0.200844 }, { "acc": 0.74111462, "epoch": 0.5886605783866058, "grad_norm": 2.3125, "learning_rate": 8.454780483987544e-06, "loss": 1.01552753, "memory(GiB)": 369.4, "step": 23205, "train_speed(iter/s)": 0.200852 }, { "acc": 0.75521049, "epoch": 0.5887874175545409, "grad_norm": 3.359375, "learning_rate": 8.45402236139314e-06, "loss": 0.97637224, "memory(GiB)": 369.4, "step": 23210, "train_speed(iter/s)": 0.20086 }, { "acc": 0.74858489, "epoch": 0.5889142567224759, "grad_norm": 2.078125, "learning_rate": 8.453264086875411e-06, "loss": 0.99869118, "memory(GiB)": 369.4, "step": 23215, "train_speed(iter/s)": 0.200867 }, { "acc": 0.74718637, "epoch": 0.589041095890411, "grad_norm": 2.4375, "learning_rate": 8.452505660467713e-06, "loss": 0.97620001, "memory(GiB)": 369.4, "step": 23220, "train_speed(iter/s)": 0.200872 }, { "acc": 0.75035768, "epoch": 0.589167935058346, "grad_norm": 2.40625, "learning_rate": 8.451747082203398e-06, "loss": 0.99700451, "memory(GiB)": 369.4, "step": 23225, "train_speed(iter/s)": 0.20088 }, { "acc": 0.75929022, "epoch": 0.5892947742262811, "grad_norm": 2.546875, "learning_rate": 8.450988352115838e-06, "loss": 0.95708694, "memory(GiB)": 369.4, "step": 23230, "train_speed(iter/s)": 0.20089 }, { "acc": 0.74834261, "epoch": 0.5894216133942162, "grad_norm": 1.9921875, "learning_rate": 8.450229470238401e-06, "loss": 1.00642929, "memory(GiB)": 369.4, "step": 23235, "train_speed(iter/s)": 0.200895 }, { "acc": 0.74555492, "epoch": 0.5895484525621512, "grad_norm": 2.46875, "learning_rate": 8.44947043660447e-06, "loss": 1.00169086, "memory(GiB)": 369.4, "step": 23240, "train_speed(iter/s)": 0.2009 }, { "acc": 0.76072769, "epoch": 0.5896752917300863, "grad_norm": 2.453125, "learning_rate": 8.448711251247425e-06, "loss": 0.94392023, "memory(GiB)": 369.4, "step": 23245, "train_speed(iter/s)": 0.200909 }, { "acc": 0.75295606, "epoch": 0.5898021308980214, "grad_norm": 2.375, "learning_rate": 8.447951914200665e-06, "loss": 1.00024481, "memory(GiB)": 369.4, "step": 23250, "train_speed(iter/s)": 0.200916 }, { "acc": 0.75626736, "epoch": 0.5899289700659563, "grad_norm": 2.078125, "learning_rate": 8.447192425497583e-06, "loss": 0.97239981, "memory(GiB)": 369.4, "step": 23255, "train_speed(iter/s)": 0.20092 }, { "acc": 0.76384225, "epoch": 0.5900558092338914, "grad_norm": 2.515625, "learning_rate": 8.44643278517159e-06, "loss": 0.95611057, "memory(GiB)": 369.4, "step": 23260, "train_speed(iter/s)": 0.200925 }, { "acc": 0.73392501, "epoch": 0.5901826484018264, "grad_norm": 2.09375, "learning_rate": 8.445672993256095e-06, "loss": 1.10154161, "memory(GiB)": 369.4, "step": 23265, "train_speed(iter/s)": 0.200933 }, { "acc": 0.74101396, "epoch": 0.5903094875697615, "grad_norm": 2.234375, "learning_rate": 8.444913049784517e-06, "loss": 1.04741898, "memory(GiB)": 369.4, "step": 23270, "train_speed(iter/s)": 0.200939 }, { "acc": 0.75228491, "epoch": 0.5904363267376966, "grad_norm": 2.28125, "learning_rate": 8.444152954790285e-06, "loss": 1.00326004, "memory(GiB)": 369.4, "step": 23275, "train_speed(iter/s)": 0.200944 }, { "acc": 0.7474597, "epoch": 0.5905631659056316, "grad_norm": 2.203125, "learning_rate": 8.443392708306827e-06, "loss": 1.00464573, "memory(GiB)": 369.4, "step": 23280, "train_speed(iter/s)": 0.200948 }, { "acc": 0.75381489, "epoch": 0.5906900050735667, "grad_norm": 2.609375, "learning_rate": 8.442632310367585e-06, "loss": 0.99530907, "memory(GiB)": 369.4, "step": 23285, "train_speed(iter/s)": 0.200956 }, { "acc": 0.75407262, "epoch": 0.5908168442415018, "grad_norm": 2.25, "learning_rate": 8.441871761006001e-06, "loss": 0.99679327, "memory(GiB)": 369.4, "step": 23290, "train_speed(iter/s)": 0.20096 }, { "acc": 0.73767347, "epoch": 0.5909436834094368, "grad_norm": 2.734375, "learning_rate": 8.441111060255533e-06, "loss": 1.04349728, "memory(GiB)": 369.4, "step": 23295, "train_speed(iter/s)": 0.200967 }, { "acc": 0.73508511, "epoch": 0.5910705225773719, "grad_norm": 2.828125, "learning_rate": 8.440350208149637e-06, "loss": 1.09124355, "memory(GiB)": 369.4, "step": 23300, "train_speed(iter/s)": 0.200973 }, { "acc": 0.75405221, "epoch": 0.5911973617453069, "grad_norm": 2.640625, "learning_rate": 8.43958920472178e-06, "loss": 1.01528997, "memory(GiB)": 369.4, "step": 23305, "train_speed(iter/s)": 0.200977 }, { "acc": 0.75356045, "epoch": 0.591324200913242, "grad_norm": 1.875, "learning_rate": 8.43882805000543e-06, "loss": 0.97688932, "memory(GiB)": 369.4, "step": 23310, "train_speed(iter/s)": 0.200982 }, { "acc": 0.74429379, "epoch": 0.5914510400811771, "grad_norm": 2.109375, "learning_rate": 8.43806674403407e-06, "loss": 1.05381823, "memory(GiB)": 369.4, "step": 23315, "train_speed(iter/s)": 0.20099 }, { "acc": 0.75705957, "epoch": 0.5915778792491121, "grad_norm": 2.125, "learning_rate": 8.437305286841187e-06, "loss": 0.97669754, "memory(GiB)": 369.4, "step": 23320, "train_speed(iter/s)": 0.200997 }, { "acc": 0.75281906, "epoch": 0.5917047184170472, "grad_norm": 1.984375, "learning_rate": 8.436543678460269e-06, "loss": 1.00659657, "memory(GiB)": 369.4, "step": 23325, "train_speed(iter/s)": 0.201001 }, { "acc": 0.76086888, "epoch": 0.5918315575849823, "grad_norm": 2.359375, "learning_rate": 8.435781918924817e-06, "loss": 0.98872099, "memory(GiB)": 369.4, "step": 23330, "train_speed(iter/s)": 0.201006 }, { "acc": 0.76037641, "epoch": 0.5919583967529173, "grad_norm": 2.140625, "learning_rate": 8.435020008268335e-06, "loss": 0.96448793, "memory(GiB)": 369.4, "step": 23335, "train_speed(iter/s)": 0.201008 }, { "acc": 0.74298029, "epoch": 0.5920852359208524, "grad_norm": 2.265625, "learning_rate": 8.43425794652434e-06, "loss": 0.96995697, "memory(GiB)": 369.4, "step": 23340, "train_speed(iter/s)": 0.201016 }, { "acc": 0.73823233, "epoch": 0.5922120750887874, "grad_norm": 2.0, "learning_rate": 8.433495733726345e-06, "loss": 1.04432964, "memory(GiB)": 369.4, "step": 23345, "train_speed(iter/s)": 0.201019 }, { "acc": 0.73985291, "epoch": 0.5923389142567225, "grad_norm": 1.8671875, "learning_rate": 8.43273336990788e-06, "loss": 1.05063267, "memory(GiB)": 369.4, "step": 23350, "train_speed(iter/s)": 0.201027 }, { "acc": 0.7517591, "epoch": 0.5924657534246576, "grad_norm": 2.171875, "learning_rate": 8.431970855102475e-06, "loss": 1.03261576, "memory(GiB)": 369.4, "step": 23355, "train_speed(iter/s)": 0.201033 }, { "acc": 0.75044031, "epoch": 0.5925925925925926, "grad_norm": 1.734375, "learning_rate": 8.43120818934367e-06, "loss": 0.98767395, "memory(GiB)": 369.4, "step": 23360, "train_speed(iter/s)": 0.201041 }, { "acc": 0.74376144, "epoch": 0.5927194317605277, "grad_norm": 2.359375, "learning_rate": 8.430445372665008e-06, "loss": 1.04411535, "memory(GiB)": 369.4, "step": 23365, "train_speed(iter/s)": 0.201051 }, { "acc": 0.74985085, "epoch": 0.5928462709284628, "grad_norm": 1.921875, "learning_rate": 8.429682405100042e-06, "loss": 1.02975407, "memory(GiB)": 369.4, "step": 23370, "train_speed(iter/s)": 0.201059 }, { "acc": 0.76562405, "epoch": 0.5929731100963977, "grad_norm": 2.375, "learning_rate": 8.428919286682333e-06, "loss": 0.96424408, "memory(GiB)": 369.4, "step": 23375, "train_speed(iter/s)": 0.201065 }, { "acc": 0.75262823, "epoch": 0.5930999492643328, "grad_norm": 2.046875, "learning_rate": 8.428156017445443e-06, "loss": 0.96882029, "memory(GiB)": 369.4, "step": 23380, "train_speed(iter/s)": 0.201068 }, { "acc": 0.7412518, "epoch": 0.5932267884322678, "grad_norm": 2.390625, "learning_rate": 8.427392597422947e-06, "loss": 1.01811409, "memory(GiB)": 369.4, "step": 23385, "train_speed(iter/s)": 0.201077 }, { "acc": 0.74523621, "epoch": 0.5933536276002029, "grad_norm": 2.0625, "learning_rate": 8.426629026648423e-06, "loss": 1.02505312, "memory(GiB)": 369.4, "step": 23390, "train_speed(iter/s)": 0.201082 }, { "acc": 0.73992434, "epoch": 0.593480466768138, "grad_norm": 2.484375, "learning_rate": 8.425865305155455e-06, "loss": 1.03564606, "memory(GiB)": 369.4, "step": 23395, "train_speed(iter/s)": 0.201089 }, { "acc": 0.75054684, "epoch": 0.593607305936073, "grad_norm": 2.46875, "learning_rate": 8.425101432977636e-06, "loss": 1.00444031, "memory(GiB)": 369.4, "step": 23400, "train_speed(iter/s)": 0.201096 }, { "acc": 0.73272877, "epoch": 0.5937341451040081, "grad_norm": 2.125, "learning_rate": 8.424337410148562e-06, "loss": 1.04236336, "memory(GiB)": 369.4, "step": 23405, "train_speed(iter/s)": 0.201105 }, { "acc": 0.73059883, "epoch": 0.5938609842719432, "grad_norm": 2.3125, "learning_rate": 8.423573236701842e-06, "loss": 1.04791498, "memory(GiB)": 369.4, "step": 23410, "train_speed(iter/s)": 0.201111 }, { "acc": 0.74747157, "epoch": 0.5939878234398782, "grad_norm": 2.140625, "learning_rate": 8.422808912671086e-06, "loss": 1.03122749, "memory(GiB)": 369.4, "step": 23415, "train_speed(iter/s)": 0.201119 }, { "acc": 0.75618191, "epoch": 0.5941146626078133, "grad_norm": 1.7890625, "learning_rate": 8.422044438089911e-06, "loss": 1.01394615, "memory(GiB)": 369.4, "step": 23420, "train_speed(iter/s)": 0.201119 }, { "acc": 0.76222544, "epoch": 0.5942415017757483, "grad_norm": 2.78125, "learning_rate": 8.421279812991944e-06, "loss": 0.98219481, "memory(GiB)": 369.4, "step": 23425, "train_speed(iter/s)": 0.201127 }, { "acc": 0.74933901, "epoch": 0.5943683409436834, "grad_norm": 2.046875, "learning_rate": 8.420515037410817e-06, "loss": 0.99986696, "memory(GiB)": 369.4, "step": 23430, "train_speed(iter/s)": 0.201134 }, { "acc": 0.74053836, "epoch": 0.5944951801116185, "grad_norm": 2.1875, "learning_rate": 8.419750111380166e-06, "loss": 0.99040871, "memory(GiB)": 369.4, "step": 23435, "train_speed(iter/s)": 0.201141 }, { "acc": 0.7500597, "epoch": 0.5946220192795535, "grad_norm": 2.15625, "learning_rate": 8.418985034933637e-06, "loss": 1.01744995, "memory(GiB)": 369.4, "step": 23440, "train_speed(iter/s)": 0.201147 }, { "acc": 0.75135651, "epoch": 0.5947488584474886, "grad_norm": 2.265625, "learning_rate": 8.418219808104882e-06, "loss": 0.95030193, "memory(GiB)": 369.4, "step": 23445, "train_speed(iter/s)": 0.201154 }, { "acc": 0.7566247, "epoch": 0.5948756976154237, "grad_norm": 2.28125, "learning_rate": 8.417454430927559e-06, "loss": 1.02417336, "memory(GiB)": 369.4, "step": 23450, "train_speed(iter/s)": 0.20116 }, { "acc": 0.74091864, "epoch": 0.5950025367833587, "grad_norm": 2.046875, "learning_rate": 8.41668890343533e-06, "loss": 1.0329854, "memory(GiB)": 369.4, "step": 23455, "train_speed(iter/s)": 0.201168 }, { "acc": 0.73021445, "epoch": 0.5951293759512938, "grad_norm": 1.7890625, "learning_rate": 8.41592322566187e-06, "loss": 1.07361631, "memory(GiB)": 369.4, "step": 23460, "train_speed(iter/s)": 0.201164 }, { "acc": 0.74259815, "epoch": 0.5952562151192288, "grad_norm": 2.390625, "learning_rate": 8.415157397640857e-06, "loss": 0.97462559, "memory(GiB)": 369.4, "step": 23465, "train_speed(iter/s)": 0.201168 }, { "acc": 0.74201255, "epoch": 0.5953830542871639, "grad_norm": 2.203125, "learning_rate": 8.414391419405972e-06, "loss": 0.99365387, "memory(GiB)": 369.4, "step": 23470, "train_speed(iter/s)": 0.201173 }, { "acc": 0.74681454, "epoch": 0.595509893455099, "grad_norm": 2.484375, "learning_rate": 8.413625290990909e-06, "loss": 1.01832085, "memory(GiB)": 369.4, "step": 23475, "train_speed(iter/s)": 0.201181 }, { "acc": 0.75952497, "epoch": 0.595636732623034, "grad_norm": 2.109375, "learning_rate": 8.412859012429365e-06, "loss": 0.95914459, "memory(GiB)": 369.4, "step": 23480, "train_speed(iter/s)": 0.201187 }, { "acc": 0.7545413, "epoch": 0.5957635717909691, "grad_norm": 2.140625, "learning_rate": 8.412092583755043e-06, "loss": 0.97583399, "memory(GiB)": 369.4, "step": 23485, "train_speed(iter/s)": 0.201192 }, { "acc": 0.7399662, "epoch": 0.5958904109589042, "grad_norm": 2.140625, "learning_rate": 8.411326005001658e-06, "loss": 1.02911339, "memory(GiB)": 369.4, "step": 23490, "train_speed(iter/s)": 0.201199 }, { "acc": 0.73121505, "epoch": 0.5960172501268391, "grad_norm": 2.3125, "learning_rate": 8.410559276202922e-06, "loss": 1.04875879, "memory(GiB)": 369.4, "step": 23495, "train_speed(iter/s)": 0.201205 }, { "acc": 0.7322175, "epoch": 0.5961440892947742, "grad_norm": 1.875, "learning_rate": 8.409792397392565e-06, "loss": 1.02860279, "memory(GiB)": 369.4, "step": 23500, "train_speed(iter/s)": 0.201209 }, { "acc": 0.74466119, "epoch": 0.5962709284627092, "grad_norm": 2.46875, "learning_rate": 8.40902536860431e-06, "loss": 1.0110693, "memory(GiB)": 369.4, "step": 23505, "train_speed(iter/s)": 0.201215 }, { "acc": 0.75321016, "epoch": 0.5963977676306443, "grad_norm": 2.359375, "learning_rate": 8.408258189871904e-06, "loss": 0.98887281, "memory(GiB)": 369.4, "step": 23510, "train_speed(iter/s)": 0.201208 }, { "acc": 0.75508647, "epoch": 0.5965246067985794, "grad_norm": 2.203125, "learning_rate": 8.407490861229084e-06, "loss": 1.03196335, "memory(GiB)": 369.4, "step": 23515, "train_speed(iter/s)": 0.201213 }, { "acc": 0.73786826, "epoch": 0.5966514459665144, "grad_norm": 2.609375, "learning_rate": 8.406723382709603e-06, "loss": 1.06237144, "memory(GiB)": 369.4, "step": 23520, "train_speed(iter/s)": 0.201218 }, { "acc": 0.75036793, "epoch": 0.5967782851344495, "grad_norm": 2.109375, "learning_rate": 8.405955754347216e-06, "loss": 1.04668379, "memory(GiB)": 369.4, "step": 23525, "train_speed(iter/s)": 0.201222 }, { "acc": 0.74590511, "epoch": 0.5969051243023846, "grad_norm": 2.09375, "learning_rate": 8.40518797617569e-06, "loss": 1.02239513, "memory(GiB)": 369.4, "step": 23530, "train_speed(iter/s)": 0.201226 }, { "acc": 0.76418304, "epoch": 0.5970319634703196, "grad_norm": 2.21875, "learning_rate": 8.404420048228794e-06, "loss": 0.92120838, "memory(GiB)": 369.4, "step": 23535, "train_speed(iter/s)": 0.201228 }, { "acc": 0.7427515, "epoch": 0.5971588026382547, "grad_norm": 1.9296875, "learning_rate": 8.403651970540305e-06, "loss": 1.05560255, "memory(GiB)": 369.4, "step": 23540, "train_speed(iter/s)": 0.201234 }, { "acc": 0.74672146, "epoch": 0.5972856418061897, "grad_norm": 2.25, "learning_rate": 8.402883743144005e-06, "loss": 0.97802782, "memory(GiB)": 369.4, "step": 23545, "train_speed(iter/s)": 0.20124 }, { "acc": 0.74548206, "epoch": 0.5974124809741248, "grad_norm": 2.109375, "learning_rate": 8.402115366073686e-06, "loss": 1.03892555, "memory(GiB)": 369.4, "step": 23550, "train_speed(iter/s)": 0.201247 }, { "acc": 0.752878, "epoch": 0.5975393201420599, "grad_norm": 2.171875, "learning_rate": 8.401346839363143e-06, "loss": 0.98687744, "memory(GiB)": 369.4, "step": 23555, "train_speed(iter/s)": 0.201251 }, { "acc": 0.74638104, "epoch": 0.5976661593099949, "grad_norm": 2.421875, "learning_rate": 8.40057816304618e-06, "loss": 1.01141796, "memory(GiB)": 369.4, "step": 23560, "train_speed(iter/s)": 0.201258 }, { "acc": 0.74301748, "epoch": 0.59779299847793, "grad_norm": 2.578125, "learning_rate": 8.399809337156608e-06, "loss": 1.08420706, "memory(GiB)": 369.4, "step": 23565, "train_speed(iter/s)": 0.201266 }, { "acc": 0.73574352, "epoch": 0.5979198376458651, "grad_norm": 2.0, "learning_rate": 8.39904036172824e-06, "loss": 1.02849684, "memory(GiB)": 369.4, "step": 23570, "train_speed(iter/s)": 0.20127 }, { "acc": 0.75068312, "epoch": 0.5980466768138001, "grad_norm": 2.0625, "learning_rate": 8.398271236794904e-06, "loss": 0.99213324, "memory(GiB)": 369.4, "step": 23575, "train_speed(iter/s)": 0.201275 }, { "acc": 0.73866196, "epoch": 0.5981735159817352, "grad_norm": 2.46875, "learning_rate": 8.397501962390427e-06, "loss": 1.02793331, "memory(GiB)": 369.4, "step": 23580, "train_speed(iter/s)": 0.201283 }, { "acc": 0.7325882, "epoch": 0.5983003551496702, "grad_norm": 2.109375, "learning_rate": 8.396732538548642e-06, "loss": 1.02854042, "memory(GiB)": 369.4, "step": 23585, "train_speed(iter/s)": 0.201286 }, { "acc": 0.75058889, "epoch": 0.5984271943176053, "grad_norm": 2.0625, "learning_rate": 8.395962965303397e-06, "loss": 1.04660244, "memory(GiB)": 369.4, "step": 23590, "train_speed(iter/s)": 0.201288 }, { "acc": 0.76452241, "epoch": 0.5985540334855404, "grad_norm": 1.9453125, "learning_rate": 8.395193242688537e-06, "loss": 0.98003635, "memory(GiB)": 369.4, "step": 23595, "train_speed(iter/s)": 0.201293 }, { "acc": 0.75324793, "epoch": 0.5986808726534754, "grad_norm": 2.09375, "learning_rate": 8.394423370737922e-06, "loss": 0.92304287, "memory(GiB)": 369.4, "step": 23600, "train_speed(iter/s)": 0.201299 }, { "acc": 0.74954352, "epoch": 0.5988077118214105, "grad_norm": 2.09375, "learning_rate": 8.393653349485412e-06, "loss": 0.98103695, "memory(GiB)": 369.4, "step": 23605, "train_speed(iter/s)": 0.201301 }, { "acc": 0.73864627, "epoch": 0.5989345509893456, "grad_norm": 2.734375, "learning_rate": 8.392883178964874e-06, "loss": 1.07145462, "memory(GiB)": 369.4, "step": 23610, "train_speed(iter/s)": 0.201306 }, { "acc": 0.74900603, "epoch": 0.5990613901572805, "grad_norm": 2.28125, "learning_rate": 8.392112859210186e-06, "loss": 0.98530865, "memory(GiB)": 369.4, "step": 23615, "train_speed(iter/s)": 0.201313 }, { "acc": 0.74423389, "epoch": 0.5991882293252156, "grad_norm": 2.15625, "learning_rate": 8.391342390255232e-06, "loss": 1.03141346, "memory(GiB)": 369.4, "step": 23620, "train_speed(iter/s)": 0.20132 }, { "acc": 0.74331474, "epoch": 0.5993150684931506, "grad_norm": 1.96875, "learning_rate": 8.390571772133896e-06, "loss": 1.04511766, "memory(GiB)": 369.4, "step": 23625, "train_speed(iter/s)": 0.201326 }, { "acc": 0.7526814, "epoch": 0.5994419076610857, "grad_norm": 2.1875, "learning_rate": 8.389801004880077e-06, "loss": 0.99067669, "memory(GiB)": 369.4, "step": 23630, "train_speed(iter/s)": 0.201333 }, { "acc": 0.7425478, "epoch": 0.5995687468290208, "grad_norm": 2.15625, "learning_rate": 8.389030088527675e-06, "loss": 1.02570267, "memory(GiB)": 369.4, "step": 23635, "train_speed(iter/s)": 0.201341 }, { "acc": 0.73822384, "epoch": 0.5996955859969558, "grad_norm": 2.171875, "learning_rate": 8.388259023110598e-06, "loss": 1.04471855, "memory(GiB)": 369.4, "step": 23640, "train_speed(iter/s)": 0.201344 }, { "acc": 0.75261097, "epoch": 0.5998224251648909, "grad_norm": 1.984375, "learning_rate": 8.387487808662765e-06, "loss": 1.00322399, "memory(GiB)": 369.4, "step": 23645, "train_speed(iter/s)": 0.201348 }, { "acc": 0.72779851, "epoch": 0.599949264332826, "grad_norm": 2.1875, "learning_rate": 8.38671644521809e-06, "loss": 1.06428757, "memory(GiB)": 369.4, "step": 23650, "train_speed(iter/s)": 0.201354 }, { "acc": 0.74345112, "epoch": 0.600076103500761, "grad_norm": 2.109375, "learning_rate": 8.385944932810508e-06, "loss": 1.01929798, "memory(GiB)": 369.4, "step": 23655, "train_speed(iter/s)": 0.201361 }, { "acc": 0.75872135, "epoch": 0.6002029426686961, "grad_norm": 2.140625, "learning_rate": 8.385173271473948e-06, "loss": 0.99248753, "memory(GiB)": 369.4, "step": 23660, "train_speed(iter/s)": 0.201367 }, { "acc": 0.75901833, "epoch": 0.6003297818366311, "grad_norm": 1.8671875, "learning_rate": 8.384401461242355e-06, "loss": 0.99994688, "memory(GiB)": 369.4, "step": 23665, "train_speed(iter/s)": 0.201374 }, { "acc": 0.74684663, "epoch": 0.6004566210045662, "grad_norm": 2.578125, "learning_rate": 8.383629502149678e-06, "loss": 1.01028633, "memory(GiB)": 369.4, "step": 23670, "train_speed(iter/s)": 0.201381 }, { "acc": 0.74798846, "epoch": 0.6005834601725013, "grad_norm": 2.234375, "learning_rate": 8.382857394229865e-06, "loss": 0.99186325, "memory(GiB)": 369.4, "step": 23675, "train_speed(iter/s)": 0.201389 }, { "acc": 0.74276295, "epoch": 0.6007102993404363, "grad_norm": 2.078125, "learning_rate": 8.382085137516883e-06, "loss": 1.03046131, "memory(GiB)": 369.4, "step": 23680, "train_speed(iter/s)": 0.201393 }, { "acc": 0.75229349, "epoch": 0.6008371385083714, "grad_norm": 2.0, "learning_rate": 8.381312732044696e-06, "loss": 1.02041826, "memory(GiB)": 369.4, "step": 23685, "train_speed(iter/s)": 0.2014 }, { "acc": 0.75623512, "epoch": 0.6009639776763065, "grad_norm": 2.328125, "learning_rate": 8.380540177847278e-06, "loss": 0.98013735, "memory(GiB)": 369.4, "step": 23690, "train_speed(iter/s)": 0.201407 }, { "acc": 0.73653994, "epoch": 0.6010908168442415, "grad_norm": 2.03125, "learning_rate": 8.37976747495861e-06, "loss": 1.06148815, "memory(GiB)": 369.4, "step": 23695, "train_speed(iter/s)": 0.201412 }, { "acc": 0.75140486, "epoch": 0.6012176560121766, "grad_norm": 2.546875, "learning_rate": 8.378994623412679e-06, "loss": 0.96627016, "memory(GiB)": 369.4, "step": 23700, "train_speed(iter/s)": 0.201421 }, { "acc": 0.74226294, "epoch": 0.6013444951801116, "grad_norm": 2.21875, "learning_rate": 8.378221623243478e-06, "loss": 1.01273441, "memory(GiB)": 369.4, "step": 23705, "train_speed(iter/s)": 0.201429 }, { "acc": 0.75046272, "epoch": 0.6014713343480467, "grad_norm": 1.9765625, "learning_rate": 8.377448474485008e-06, "loss": 0.94856358, "memory(GiB)": 369.4, "step": 23710, "train_speed(iter/s)": 0.201436 }, { "acc": 0.75548229, "epoch": 0.6015981735159818, "grad_norm": 1.859375, "learning_rate": 8.376675177171273e-06, "loss": 0.98336048, "memory(GiB)": 369.4, "step": 23715, "train_speed(iter/s)": 0.201445 }, { "acc": 0.75038071, "epoch": 0.6017250126839168, "grad_norm": 2.09375, "learning_rate": 8.375901731336292e-06, "loss": 0.97909174, "memory(GiB)": 369.4, "step": 23720, "train_speed(iter/s)": 0.201453 }, { "acc": 0.75302229, "epoch": 0.6018518518518519, "grad_norm": 2.71875, "learning_rate": 8.375128137014076e-06, "loss": 1.00148315, "memory(GiB)": 369.4, "step": 23725, "train_speed(iter/s)": 0.201455 }, { "acc": 0.74008207, "epoch": 0.601978691019787, "grad_norm": 2.984375, "learning_rate": 8.374354394238658e-06, "loss": 1.06827717, "memory(GiB)": 369.4, "step": 23730, "train_speed(iter/s)": 0.201462 }, { "acc": 0.74921618, "epoch": 0.602105530187722, "grad_norm": 2.1875, "learning_rate": 8.373580503044068e-06, "loss": 0.97966194, "memory(GiB)": 369.4, "step": 23735, "train_speed(iter/s)": 0.201467 }, { "acc": 0.75126019, "epoch": 0.602232369355657, "grad_norm": 2.8125, "learning_rate": 8.372806463464347e-06, "loss": 1.04698429, "memory(GiB)": 369.4, "step": 23740, "train_speed(iter/s)": 0.20147 }, { "acc": 0.7525249, "epoch": 0.602359208523592, "grad_norm": 2.078125, "learning_rate": 8.372032275533538e-06, "loss": 0.92721615, "memory(GiB)": 369.4, "step": 23745, "train_speed(iter/s)": 0.201477 }, { "acc": 0.7511373, "epoch": 0.6024860476915271, "grad_norm": 2.375, "learning_rate": 8.371257939285692e-06, "loss": 1.01795559, "memory(GiB)": 369.4, "step": 23750, "train_speed(iter/s)": 0.20148 }, { "acc": 0.75769768, "epoch": 0.6026128868594622, "grad_norm": 2.171875, "learning_rate": 8.370483454754873e-06, "loss": 0.98942127, "memory(GiB)": 369.4, "step": 23755, "train_speed(iter/s)": 0.201481 }, { "acc": 0.75697398, "epoch": 0.6027397260273972, "grad_norm": 2.03125, "learning_rate": 8.369708821975144e-06, "loss": 1.00225487, "memory(GiB)": 369.4, "step": 23760, "train_speed(iter/s)": 0.201486 }, { "acc": 0.76431532, "epoch": 0.6028665651953323, "grad_norm": 2.3125, "learning_rate": 8.368934040980576e-06, "loss": 0.95951405, "memory(GiB)": 369.4, "step": 23765, "train_speed(iter/s)": 0.20149 }, { "acc": 0.74581642, "epoch": 0.6029934043632674, "grad_norm": 2.515625, "learning_rate": 8.368159111805246e-06, "loss": 1.07790833, "memory(GiB)": 369.4, "step": 23770, "train_speed(iter/s)": 0.201492 }, { "acc": 0.75530262, "epoch": 0.6031202435312024, "grad_norm": 1.9609375, "learning_rate": 8.367384034483242e-06, "loss": 0.99217415, "memory(GiB)": 369.4, "step": 23775, "train_speed(iter/s)": 0.201501 }, { "acc": 0.74285192, "epoch": 0.6032470826991375, "grad_norm": 2.21875, "learning_rate": 8.366608809048653e-06, "loss": 1.0376194, "memory(GiB)": 369.4, "step": 23780, "train_speed(iter/s)": 0.201507 }, { "acc": 0.74467716, "epoch": 0.6033739218670725, "grad_norm": 2.46875, "learning_rate": 8.365833435535579e-06, "loss": 1.02535009, "memory(GiB)": 369.4, "step": 23785, "train_speed(iter/s)": 0.201515 }, { "acc": 0.75122695, "epoch": 0.6035007610350076, "grad_norm": 2.25, "learning_rate": 8.365057913978123e-06, "loss": 1.06829491, "memory(GiB)": 369.4, "step": 23790, "train_speed(iter/s)": 0.20152 }, { "acc": 0.73272457, "epoch": 0.6036276002029427, "grad_norm": 2.4375, "learning_rate": 8.364282244410394e-06, "loss": 1.06736546, "memory(GiB)": 369.4, "step": 23795, "train_speed(iter/s)": 0.201528 }, { "acc": 0.74480929, "epoch": 0.6037544393708777, "grad_norm": 1.9140625, "learning_rate": 8.363506426866513e-06, "loss": 0.99747391, "memory(GiB)": 369.4, "step": 23800, "train_speed(iter/s)": 0.201534 }, { "acc": 0.74669733, "epoch": 0.6038812785388128, "grad_norm": 1.90625, "learning_rate": 8.362730461380602e-06, "loss": 0.9923214, "memory(GiB)": 369.4, "step": 23805, "train_speed(iter/s)": 0.201539 }, { "acc": 0.73976145, "epoch": 0.6040081177067479, "grad_norm": 2.234375, "learning_rate": 8.361954347986793e-06, "loss": 1.01833344, "memory(GiB)": 369.4, "step": 23810, "train_speed(iter/s)": 0.201545 }, { "acc": 0.7623704, "epoch": 0.6041349568746829, "grad_norm": 1.875, "learning_rate": 8.36117808671922e-06, "loss": 0.94098835, "memory(GiB)": 369.4, "step": 23815, "train_speed(iter/s)": 0.201548 }, { "acc": 0.74823036, "epoch": 0.604261796042618, "grad_norm": 2.046875, "learning_rate": 8.36040167761203e-06, "loss": 1.01680927, "memory(GiB)": 369.4, "step": 23820, "train_speed(iter/s)": 0.201555 }, { "acc": 0.75301871, "epoch": 0.604388635210553, "grad_norm": 2.265625, "learning_rate": 8.359625120699368e-06, "loss": 1.03692379, "memory(GiB)": 369.4, "step": 23825, "train_speed(iter/s)": 0.201564 }, { "acc": 0.7532835, "epoch": 0.6045154743784881, "grad_norm": 2.203125, "learning_rate": 8.358848416015397e-06, "loss": 0.97351217, "memory(GiB)": 369.4, "step": 23830, "train_speed(iter/s)": 0.20157 }, { "acc": 0.74058647, "epoch": 0.6046423135464232, "grad_norm": 1.8671875, "learning_rate": 8.358071563594274e-06, "loss": 1.01182432, "memory(GiB)": 369.4, "step": 23835, "train_speed(iter/s)": 0.201577 }, { "acc": 0.74535451, "epoch": 0.6047691527143582, "grad_norm": 2.046875, "learning_rate": 8.357294563470173e-06, "loss": 0.99845343, "memory(GiB)": 369.4, "step": 23840, "train_speed(iter/s)": 0.201586 }, { "acc": 0.75846033, "epoch": 0.6048959918822933, "grad_norm": 2.265625, "learning_rate": 8.356517415677267e-06, "loss": 0.94183235, "memory(GiB)": 369.4, "step": 23845, "train_speed(iter/s)": 0.201588 }, { "acc": 0.72430043, "epoch": 0.6050228310502284, "grad_norm": 2.109375, "learning_rate": 8.355740120249739e-06, "loss": 1.0965929, "memory(GiB)": 369.4, "step": 23850, "train_speed(iter/s)": 0.201592 }, { "acc": 0.73735905, "epoch": 0.6051496702181633, "grad_norm": 2.34375, "learning_rate": 8.354962677221779e-06, "loss": 1.04722767, "memory(GiB)": 369.4, "step": 23855, "train_speed(iter/s)": 0.2016 }, { "acc": 0.74191165, "epoch": 0.6052765093860984, "grad_norm": 2.171875, "learning_rate": 8.35418508662758e-06, "loss": 0.99451447, "memory(GiB)": 369.4, "step": 23860, "train_speed(iter/s)": 0.201608 }, { "acc": 0.74474988, "epoch": 0.6054033485540334, "grad_norm": 2.125, "learning_rate": 8.353407348501346e-06, "loss": 1.00319061, "memory(GiB)": 369.4, "step": 23865, "train_speed(iter/s)": 0.201612 }, { "acc": 0.75294638, "epoch": 0.6055301877219685, "grad_norm": 2.109375, "learning_rate": 8.352629462877286e-06, "loss": 0.99670601, "memory(GiB)": 369.4, "step": 23870, "train_speed(iter/s)": 0.201619 }, { "acc": 0.74013128, "epoch": 0.6056570268899036, "grad_norm": 2.203125, "learning_rate": 8.351851429789613e-06, "loss": 1.02265224, "memory(GiB)": 369.4, "step": 23875, "train_speed(iter/s)": 0.201627 }, { "acc": 0.73207431, "epoch": 0.6057838660578386, "grad_norm": 2.078125, "learning_rate": 8.35107324927255e-06, "loss": 1.01127434, "memory(GiB)": 369.4, "step": 23880, "train_speed(iter/s)": 0.201636 }, { "acc": 0.7423749, "epoch": 0.6059107052257737, "grad_norm": 2.59375, "learning_rate": 8.350294921360323e-06, "loss": 1.06325808, "memory(GiB)": 369.4, "step": 23885, "train_speed(iter/s)": 0.201642 }, { "acc": 0.73591452, "epoch": 0.6060375443937088, "grad_norm": 2.125, "learning_rate": 8.349516446087168e-06, "loss": 1.06878834, "memory(GiB)": 369.4, "step": 23890, "train_speed(iter/s)": 0.201644 }, { "acc": 0.7283886, "epoch": 0.6061643835616438, "grad_norm": 2.234375, "learning_rate": 8.348737823487325e-06, "loss": 1.06622467, "memory(GiB)": 369.4, "step": 23895, "train_speed(iter/s)": 0.20165 }, { "acc": 0.7416028, "epoch": 0.6062912227295789, "grad_norm": 2.1875, "learning_rate": 8.347959053595042e-06, "loss": 1.0493454, "memory(GiB)": 369.4, "step": 23900, "train_speed(iter/s)": 0.201654 }, { "acc": 0.73366404, "epoch": 0.6064180618975139, "grad_norm": 2.40625, "learning_rate": 8.347180136444572e-06, "loss": 1.05351706, "memory(GiB)": 369.4, "step": 23905, "train_speed(iter/s)": 0.201662 }, { "acc": 0.75074415, "epoch": 0.606544901065449, "grad_norm": 2.265625, "learning_rate": 8.346401072070174e-06, "loss": 1.01570282, "memory(GiB)": 369.4, "step": 23910, "train_speed(iter/s)": 0.201667 }, { "acc": 0.73019729, "epoch": 0.6066717402333841, "grad_norm": 2.28125, "learning_rate": 8.345621860506119e-06, "loss": 1.05688343, "memory(GiB)": 369.4, "step": 23915, "train_speed(iter/s)": 0.201673 }, { "acc": 0.75555105, "epoch": 0.6067985794013191, "grad_norm": 2.3125, "learning_rate": 8.344842501786675e-06, "loss": 0.96239243, "memory(GiB)": 369.4, "step": 23920, "train_speed(iter/s)": 0.201679 }, { "acc": 0.74309649, "epoch": 0.6069254185692542, "grad_norm": 2.296875, "learning_rate": 8.344062995946125e-06, "loss": 1.04024105, "memory(GiB)": 369.4, "step": 23925, "train_speed(iter/s)": 0.201686 }, { "acc": 0.74181232, "epoch": 0.6070522577371893, "grad_norm": 2.0625, "learning_rate": 8.343283343018755e-06, "loss": 1.01029444, "memory(GiB)": 369.4, "step": 23930, "train_speed(iter/s)": 0.201687 }, { "acc": 0.74482212, "epoch": 0.6071790969051243, "grad_norm": 2.09375, "learning_rate": 8.342503543038855e-06, "loss": 1.02328148, "memory(GiB)": 369.4, "step": 23935, "train_speed(iter/s)": 0.201689 }, { "acc": 0.73920631, "epoch": 0.6073059360730594, "grad_norm": 2.421875, "learning_rate": 8.341723596040728e-06, "loss": 1.09835634, "memory(GiB)": 369.4, "step": 23940, "train_speed(iter/s)": 0.201696 }, { "acc": 0.728824, "epoch": 0.6074327752409944, "grad_norm": 2.046875, "learning_rate": 8.340943502058675e-06, "loss": 1.06225929, "memory(GiB)": 369.4, "step": 23945, "train_speed(iter/s)": 0.201701 }, { "acc": 0.74484844, "epoch": 0.6075596144089295, "grad_norm": 2.359375, "learning_rate": 8.340163261127014e-06, "loss": 1.03666449, "memory(GiB)": 369.4, "step": 23950, "train_speed(iter/s)": 0.20171 }, { "acc": 0.7364687, "epoch": 0.6076864535768646, "grad_norm": 2.21875, "learning_rate": 8.339382873280058e-06, "loss": 1.05177956, "memory(GiB)": 369.4, "step": 23955, "train_speed(iter/s)": 0.201717 }, { "acc": 0.74503508, "epoch": 0.6078132927447996, "grad_norm": 1.9375, "learning_rate": 8.338602338552136e-06, "loss": 1.03967724, "memory(GiB)": 369.4, "step": 23960, "train_speed(iter/s)": 0.201717 }, { "acc": 0.74638395, "epoch": 0.6079401319127347, "grad_norm": 2.15625, "learning_rate": 8.337821656977574e-06, "loss": 1.01365376, "memory(GiB)": 369.4, "step": 23965, "train_speed(iter/s)": 0.20172 }, { "acc": 0.75599136, "epoch": 0.6080669710806698, "grad_norm": 2.09375, "learning_rate": 8.337040828590715e-06, "loss": 1.01517515, "memory(GiB)": 369.4, "step": 23970, "train_speed(iter/s)": 0.201729 }, { "acc": 0.7364819, "epoch": 0.6081938102486047, "grad_norm": 2.078125, "learning_rate": 8.336259853425901e-06, "loss": 1.00935421, "memory(GiB)": 369.4, "step": 23975, "train_speed(iter/s)": 0.201736 }, { "acc": 0.73659439, "epoch": 0.6083206494165398, "grad_norm": 2.078125, "learning_rate": 8.335478731517484e-06, "loss": 1.01878681, "memory(GiB)": 369.4, "step": 23980, "train_speed(iter/s)": 0.20174 }, { "acc": 0.73429003, "epoch": 0.6084474885844748, "grad_norm": 2.34375, "learning_rate": 8.33469746289982e-06, "loss": 1.03251047, "memory(GiB)": 369.4, "step": 23985, "train_speed(iter/s)": 0.201742 }, { "acc": 0.73865128, "epoch": 0.6085743277524099, "grad_norm": 2.21875, "learning_rate": 8.333916047607274e-06, "loss": 1.02585812, "memory(GiB)": 369.4, "step": 23990, "train_speed(iter/s)": 0.20175 }, { "acc": 0.76029911, "epoch": 0.608701166920345, "grad_norm": 2.59375, "learning_rate": 8.333134485674214e-06, "loss": 1.0000679, "memory(GiB)": 369.4, "step": 23995, "train_speed(iter/s)": 0.201758 }, { "acc": 0.72971067, "epoch": 0.60882800608828, "grad_norm": 2.1875, "learning_rate": 8.33235277713502e-06, "loss": 1.05321093, "memory(GiB)": 369.4, "step": 24000, "train_speed(iter/s)": 0.201763 }, { "epoch": 0.60882800608828, "eval_acc": 0.735890431332214, "eval_loss": 0.9802754521369934, "eval_runtime": 385.1755, "eval_samples_per_second": 16.538, "eval_steps_per_second": 8.269, "step": 24000 }, { "acc": 0.75270243, "epoch": 0.6089548452562151, "grad_norm": 2.828125, "learning_rate": 8.33157092202407e-06, "loss": 1.04412365, "memory(GiB)": 369.4, "step": 24005, "train_speed(iter/s)": 0.200567 }, { "acc": 0.74498258, "epoch": 0.6090816844241502, "grad_norm": 1.546875, "learning_rate": 8.33078892037576e-06, "loss": 1.02107315, "memory(GiB)": 369.4, "step": 24010, "train_speed(iter/s)": 0.20057 }, { "acc": 0.74789276, "epoch": 0.6092085235920852, "grad_norm": 2.4375, "learning_rate": 8.33000677222448e-06, "loss": 1.04497471, "memory(GiB)": 369.4, "step": 24015, "train_speed(iter/s)": 0.200578 }, { "acc": 0.75036268, "epoch": 0.6093353627600203, "grad_norm": 2.0, "learning_rate": 8.329224477604635e-06, "loss": 1.01037235, "memory(GiB)": 369.4, "step": 24020, "train_speed(iter/s)": 0.200587 }, { "acc": 0.74091501, "epoch": 0.6094622019279553, "grad_norm": 2.1875, "learning_rate": 8.328442036550633e-06, "loss": 1.05290527, "memory(GiB)": 369.4, "step": 24025, "train_speed(iter/s)": 0.200595 }, { "acc": 0.74632912, "epoch": 0.6095890410958904, "grad_norm": 2.171875, "learning_rate": 8.327659449096892e-06, "loss": 0.94530878, "memory(GiB)": 369.4, "step": 24030, "train_speed(iter/s)": 0.200604 }, { "acc": 0.76587782, "epoch": 0.6097158802638255, "grad_norm": 2.25, "learning_rate": 8.32687671527783e-06, "loss": 0.95983334, "memory(GiB)": 369.4, "step": 24035, "train_speed(iter/s)": 0.200612 }, { "acc": 0.74922709, "epoch": 0.6098427194317605, "grad_norm": 2.265625, "learning_rate": 8.326093835127878e-06, "loss": 1.04716015, "memory(GiB)": 369.4, "step": 24040, "train_speed(iter/s)": 0.20062 }, { "acc": 0.75199089, "epoch": 0.6099695585996956, "grad_norm": 2.5625, "learning_rate": 8.325310808681466e-06, "loss": 1.00261917, "memory(GiB)": 369.4, "step": 24045, "train_speed(iter/s)": 0.200625 }, { "acc": 0.75612345, "epoch": 0.6100963977676307, "grad_norm": 2.34375, "learning_rate": 8.32452763597304e-06, "loss": 0.94749527, "memory(GiB)": 369.4, "step": 24050, "train_speed(iter/s)": 0.20063 }, { "acc": 0.74366198, "epoch": 0.6102232369355657, "grad_norm": 2.1875, "learning_rate": 8.323744317037048e-06, "loss": 0.96595097, "memory(GiB)": 369.4, "step": 24055, "train_speed(iter/s)": 0.200636 }, { "acc": 0.73650122, "epoch": 0.6103500761035008, "grad_norm": 2.0, "learning_rate": 8.322960851907937e-06, "loss": 1.02085857, "memory(GiB)": 369.4, "step": 24060, "train_speed(iter/s)": 0.20064 }, { "acc": 0.72991467, "epoch": 0.6104769152714358, "grad_norm": 2.5, "learning_rate": 8.322177240620175e-06, "loss": 1.03118, "memory(GiB)": 369.4, "step": 24065, "train_speed(iter/s)": 0.200649 }, { "acc": 0.75601249, "epoch": 0.6106037544393709, "grad_norm": 2.3125, "learning_rate": 8.321393483208224e-06, "loss": 0.99134293, "memory(GiB)": 369.4, "step": 24070, "train_speed(iter/s)": 0.200654 }, { "acc": 0.75259881, "epoch": 0.610730593607306, "grad_norm": 2.078125, "learning_rate": 8.32060957970656e-06, "loss": 1.0043992, "memory(GiB)": 369.4, "step": 24075, "train_speed(iter/s)": 0.200662 }, { "acc": 0.73832769, "epoch": 0.610857432775241, "grad_norm": 1.9453125, "learning_rate": 8.319825530149661e-06, "loss": 1.08822479, "memory(GiB)": 369.4, "step": 24080, "train_speed(iter/s)": 0.200667 }, { "acc": 0.74167566, "epoch": 0.6109842719431761, "grad_norm": 2.421875, "learning_rate": 8.319041334572012e-06, "loss": 1.04055195, "memory(GiB)": 369.4, "step": 24085, "train_speed(iter/s)": 0.200672 }, { "acc": 0.743752, "epoch": 0.6111111111111112, "grad_norm": 2.328125, "learning_rate": 8.318256993008108e-06, "loss": 1.05621891, "memory(GiB)": 369.4, "step": 24090, "train_speed(iter/s)": 0.20068 }, { "acc": 0.73239298, "epoch": 0.6112379502790461, "grad_norm": 2.15625, "learning_rate": 8.317472505492446e-06, "loss": 1.09216385, "memory(GiB)": 369.4, "step": 24095, "train_speed(iter/s)": 0.200688 }, { "acc": 0.76410351, "epoch": 0.6113647894469812, "grad_norm": 2.03125, "learning_rate": 8.31668787205953e-06, "loss": 0.92724514, "memory(GiB)": 369.4, "step": 24100, "train_speed(iter/s)": 0.200693 }, { "acc": 0.75608101, "epoch": 0.6114916286149162, "grad_norm": 1.8515625, "learning_rate": 8.315903092743876e-06, "loss": 1.02020531, "memory(GiB)": 369.4, "step": 24105, "train_speed(iter/s)": 0.200702 }, { "acc": 0.75992107, "epoch": 0.6116184677828513, "grad_norm": 2.109375, "learning_rate": 8.315118167579999e-06, "loss": 0.99990578, "memory(GiB)": 369.4, "step": 24110, "train_speed(iter/s)": 0.20071 }, { "acc": 0.75068388, "epoch": 0.6117453069507864, "grad_norm": 2.328125, "learning_rate": 8.314333096602423e-06, "loss": 0.96947098, "memory(GiB)": 369.4, "step": 24115, "train_speed(iter/s)": 0.200717 }, { "acc": 0.75658317, "epoch": 0.6118721461187214, "grad_norm": 2.09375, "learning_rate": 8.313547879845682e-06, "loss": 1.02306595, "memory(GiB)": 369.4, "step": 24120, "train_speed(iter/s)": 0.200726 }, { "acc": 0.73912764, "epoch": 0.6119989852866565, "grad_norm": 2.765625, "learning_rate": 8.312762517344308e-06, "loss": 1.04338741, "memory(GiB)": 369.4, "step": 24125, "train_speed(iter/s)": 0.200733 }, { "acc": 0.74041042, "epoch": 0.6121258244545916, "grad_norm": 2.546875, "learning_rate": 8.311977009132851e-06, "loss": 1.04597988, "memory(GiB)": 369.4, "step": 24130, "train_speed(iter/s)": 0.200742 }, { "acc": 0.74977627, "epoch": 0.6122526636225266, "grad_norm": 2.09375, "learning_rate": 8.311191355245858e-06, "loss": 1.02064457, "memory(GiB)": 369.4, "step": 24135, "train_speed(iter/s)": 0.200749 }, { "acc": 0.74813643, "epoch": 0.6123795027904617, "grad_norm": 1.7890625, "learning_rate": 8.310405555717884e-06, "loss": 1.03598366, "memory(GiB)": 369.4, "step": 24140, "train_speed(iter/s)": 0.200758 }, { "acc": 0.74082689, "epoch": 0.6125063419583967, "grad_norm": 1.8515625, "learning_rate": 8.309619610583495e-06, "loss": 1.06271, "memory(GiB)": 369.4, "step": 24145, "train_speed(iter/s)": 0.20076 }, { "acc": 0.76420679, "epoch": 0.6126331811263318, "grad_norm": 2.1875, "learning_rate": 8.30883351987726e-06, "loss": 1.00060358, "memory(GiB)": 369.4, "step": 24150, "train_speed(iter/s)": 0.200767 }, { "acc": 0.74046831, "epoch": 0.6127600202942669, "grad_norm": 2.796875, "learning_rate": 8.30804728363375e-06, "loss": 1.02554235, "memory(GiB)": 369.4, "step": 24155, "train_speed(iter/s)": 0.200773 }, { "acc": 0.75547667, "epoch": 0.6128868594622019, "grad_norm": 2.609375, "learning_rate": 8.307260901887556e-06, "loss": 1.00635815, "memory(GiB)": 369.4, "step": 24160, "train_speed(iter/s)": 0.200779 }, { "acc": 0.75148773, "epoch": 0.613013698630137, "grad_norm": 1.9140625, "learning_rate": 8.306474374673259e-06, "loss": 0.9508358, "memory(GiB)": 369.4, "step": 24165, "train_speed(iter/s)": 0.200786 }, { "acc": 0.75714836, "epoch": 0.6131405377980721, "grad_norm": 2.296875, "learning_rate": 8.305687702025457e-06, "loss": 0.98279505, "memory(GiB)": 369.4, "step": 24170, "train_speed(iter/s)": 0.200792 }, { "acc": 0.7519032, "epoch": 0.6132673769660071, "grad_norm": 2.09375, "learning_rate": 8.304900883978753e-06, "loss": 1.0094943, "memory(GiB)": 369.4, "step": 24175, "train_speed(iter/s)": 0.200799 }, { "acc": 0.73939233, "epoch": 0.6133942161339422, "grad_norm": 2.0625, "learning_rate": 8.304113920567751e-06, "loss": 1.03456459, "memory(GiB)": 369.4, "step": 24180, "train_speed(iter/s)": 0.200806 }, { "acc": 0.75245771, "epoch": 0.6135210553018772, "grad_norm": 1.9765625, "learning_rate": 8.303326811827066e-06, "loss": 1.00135994, "memory(GiB)": 369.4, "step": 24185, "train_speed(iter/s)": 0.200814 }, { "acc": 0.75190248, "epoch": 0.6136478944698123, "grad_norm": 1.828125, "learning_rate": 8.302539557791322e-06, "loss": 1.02325802, "memory(GiB)": 369.4, "step": 24190, "train_speed(iter/s)": 0.200822 }, { "acc": 0.73258114, "epoch": 0.6137747336377474, "grad_norm": 2.046875, "learning_rate": 8.301752158495141e-06, "loss": 1.02348328, "memory(GiB)": 369.4, "step": 24195, "train_speed(iter/s)": 0.200825 }, { "acc": 0.7459445, "epoch": 0.6139015728056824, "grad_norm": 2.203125, "learning_rate": 8.300964613973159e-06, "loss": 1.0420826, "memory(GiB)": 369.4, "step": 24200, "train_speed(iter/s)": 0.200833 }, { "acc": 0.73859015, "epoch": 0.6140284119736175, "grad_norm": 2.234375, "learning_rate": 8.300176924260017e-06, "loss": 1.00071373, "memory(GiB)": 369.4, "step": 24205, "train_speed(iter/s)": 0.200836 }, { "acc": 0.74686422, "epoch": 0.6141552511415526, "grad_norm": 2.125, "learning_rate": 8.299389089390359e-06, "loss": 1.03023701, "memory(GiB)": 369.4, "step": 24210, "train_speed(iter/s)": 0.200843 }, { "acc": 0.75149384, "epoch": 0.6142820903094875, "grad_norm": 1.921875, "learning_rate": 8.298601109398838e-06, "loss": 1.00118847, "memory(GiB)": 369.4, "step": 24215, "train_speed(iter/s)": 0.200849 }, { "acc": 0.74343586, "epoch": 0.6144089294774226, "grad_norm": 1.7421875, "learning_rate": 8.297812984320113e-06, "loss": 1.02387419, "memory(GiB)": 369.4, "step": 24220, "train_speed(iter/s)": 0.200854 }, { "acc": 0.74503307, "epoch": 0.6145357686453576, "grad_norm": 2.0625, "learning_rate": 8.297024714188851e-06, "loss": 1.02011604, "memory(GiB)": 369.4, "step": 24225, "train_speed(iter/s)": 0.20086 }, { "acc": 0.74915342, "epoch": 0.6146626078132927, "grad_norm": 2.53125, "learning_rate": 8.296236299039719e-06, "loss": 1.03554573, "memory(GiB)": 369.4, "step": 24230, "train_speed(iter/s)": 0.200864 }, { "acc": 0.74339609, "epoch": 0.6147894469812278, "grad_norm": 2.046875, "learning_rate": 8.295447738907401e-06, "loss": 0.97709866, "memory(GiB)": 369.4, "step": 24235, "train_speed(iter/s)": 0.200872 }, { "acc": 0.74208574, "epoch": 0.6149162861491628, "grad_norm": 2.03125, "learning_rate": 8.294659033826576e-06, "loss": 1.05862923, "memory(GiB)": 369.4, "step": 24240, "train_speed(iter/s)": 0.20088 }, { "acc": 0.73198652, "epoch": 0.6150431253170979, "grad_norm": 2.359375, "learning_rate": 8.293870183831937e-06, "loss": 1.04214401, "memory(GiB)": 369.4, "step": 24245, "train_speed(iter/s)": 0.200885 }, { "acc": 0.75065498, "epoch": 0.615169964485033, "grad_norm": 2.375, "learning_rate": 8.293081188958183e-06, "loss": 0.97079763, "memory(GiB)": 369.4, "step": 24250, "train_speed(iter/s)": 0.20089 }, { "acc": 0.75639482, "epoch": 0.615296803652968, "grad_norm": 2.265625, "learning_rate": 8.292292049240014e-06, "loss": 1.00538473, "memory(GiB)": 369.4, "step": 24255, "train_speed(iter/s)": 0.200894 }, { "acc": 0.73561392, "epoch": 0.6154236428209031, "grad_norm": 2.140625, "learning_rate": 8.291502764712143e-06, "loss": 1.05144711, "memory(GiB)": 369.4, "step": 24260, "train_speed(iter/s)": 0.200901 }, { "acc": 0.75146012, "epoch": 0.6155504819888381, "grad_norm": 2.171875, "learning_rate": 8.290713335409284e-06, "loss": 1.02373714, "memory(GiB)": 369.4, "step": 24265, "train_speed(iter/s)": 0.200908 }, { "acc": 0.74140224, "epoch": 0.6156773211567732, "grad_norm": 1.953125, "learning_rate": 8.28992376136616e-06, "loss": 1.00070524, "memory(GiB)": 369.4, "step": 24270, "train_speed(iter/s)": 0.200911 }, { "acc": 0.73690553, "epoch": 0.6158041603247083, "grad_norm": 2.203125, "learning_rate": 8.289134042617502e-06, "loss": 1.07770042, "memory(GiB)": 369.4, "step": 24275, "train_speed(iter/s)": 0.200915 }, { "acc": 0.75096226, "epoch": 0.6159309994926433, "grad_norm": 2.171875, "learning_rate": 8.288344179198043e-06, "loss": 0.97823639, "memory(GiB)": 369.4, "step": 24280, "train_speed(iter/s)": 0.20092 }, { "acc": 0.74852028, "epoch": 0.6160578386605784, "grad_norm": 2.1875, "learning_rate": 8.287554171142525e-06, "loss": 0.99589529, "memory(GiB)": 369.4, "step": 24285, "train_speed(iter/s)": 0.200925 }, { "acc": 0.73312111, "epoch": 0.6161846778285135, "grad_norm": 2.078125, "learning_rate": 8.2867640184857e-06, "loss": 1.04638376, "memory(GiB)": 369.4, "step": 24290, "train_speed(iter/s)": 0.200933 }, { "acc": 0.72442966, "epoch": 0.6163115169964485, "grad_norm": 2.3125, "learning_rate": 8.285973721262315e-06, "loss": 1.0615469, "memory(GiB)": 369.4, "step": 24295, "train_speed(iter/s)": 0.200938 }, { "acc": 0.74422226, "epoch": 0.6164383561643836, "grad_norm": 2.125, "learning_rate": 8.285183279507135e-06, "loss": 1.02317448, "memory(GiB)": 369.4, "step": 24300, "train_speed(iter/s)": 0.200942 }, { "acc": 0.73931198, "epoch": 0.6165651953323186, "grad_norm": 2.28125, "learning_rate": 8.28439269325493e-06, "loss": 0.97860374, "memory(GiB)": 369.4, "step": 24305, "train_speed(iter/s)": 0.200943 }, { "acc": 0.73783998, "epoch": 0.6166920345002537, "grad_norm": 2.296875, "learning_rate": 8.28360196254047e-06, "loss": 1.05560789, "memory(GiB)": 369.4, "step": 24310, "train_speed(iter/s)": 0.200948 }, { "acc": 0.74325824, "epoch": 0.6168188736681888, "grad_norm": 2.265625, "learning_rate": 8.282811087398535e-06, "loss": 1.00304451, "memory(GiB)": 369.4, "step": 24315, "train_speed(iter/s)": 0.200956 }, { "acc": 0.74194136, "epoch": 0.6169457128361238, "grad_norm": 2.484375, "learning_rate": 8.282020067863911e-06, "loss": 1.04016171, "memory(GiB)": 369.4, "step": 24320, "train_speed(iter/s)": 0.200964 }, { "acc": 0.75712762, "epoch": 0.6170725520040589, "grad_norm": 2.140625, "learning_rate": 8.281228903971391e-06, "loss": 0.97481956, "memory(GiB)": 369.4, "step": 24325, "train_speed(iter/s)": 0.200971 }, { "acc": 0.75140028, "epoch": 0.617199391171994, "grad_norm": 2.640625, "learning_rate": 8.280437595755774e-06, "loss": 0.9539793, "memory(GiB)": 369.4, "step": 24330, "train_speed(iter/s)": 0.200976 }, { "acc": 0.75112333, "epoch": 0.617326230339929, "grad_norm": 2.015625, "learning_rate": 8.279646143251867e-06, "loss": 0.93645678, "memory(GiB)": 369.4, "step": 24335, "train_speed(iter/s)": 0.200983 }, { "acc": 0.75793338, "epoch": 0.617453069507864, "grad_norm": 2.1875, "learning_rate": 8.278854546494479e-06, "loss": 0.98221531, "memory(GiB)": 369.4, "step": 24340, "train_speed(iter/s)": 0.200989 }, { "acc": 0.74595461, "epoch": 0.617579908675799, "grad_norm": 2.171875, "learning_rate": 8.27806280551843e-06, "loss": 0.95771713, "memory(GiB)": 369.4, "step": 24345, "train_speed(iter/s)": 0.200996 }, { "acc": 0.73524675, "epoch": 0.6177067478437341, "grad_norm": 2.359375, "learning_rate": 8.277270920358542e-06, "loss": 1.04198246, "memory(GiB)": 369.4, "step": 24350, "train_speed(iter/s)": 0.201001 }, { "acc": 0.76467776, "epoch": 0.6178335870116692, "grad_norm": 2.078125, "learning_rate": 8.276478891049649e-06, "loss": 1.00762758, "memory(GiB)": 369.4, "step": 24355, "train_speed(iter/s)": 0.200998 }, { "acc": 0.74875622, "epoch": 0.6179604261796042, "grad_norm": 2.328125, "learning_rate": 8.275686717626584e-06, "loss": 0.99424353, "memory(GiB)": 369.4, "step": 24360, "train_speed(iter/s)": 0.201 }, { "acc": 0.75381484, "epoch": 0.6180872653475393, "grad_norm": 1.78125, "learning_rate": 8.274894400124191e-06, "loss": 0.97294416, "memory(GiB)": 369.4, "step": 24365, "train_speed(iter/s)": 0.200999 }, { "acc": 0.74874845, "epoch": 0.6182141045154744, "grad_norm": 2.140625, "learning_rate": 8.274101938577324e-06, "loss": 1.04181614, "memory(GiB)": 369.4, "step": 24370, "train_speed(iter/s)": 0.201004 }, { "acc": 0.75848551, "epoch": 0.6183409436834094, "grad_norm": 2.5625, "learning_rate": 8.273309333020834e-06, "loss": 0.95653267, "memory(GiB)": 369.4, "step": 24375, "train_speed(iter/s)": 0.201009 }, { "acc": 0.7459444, "epoch": 0.6184677828513445, "grad_norm": 1.7890625, "learning_rate": 8.272516583489587e-06, "loss": 1.04002361, "memory(GiB)": 369.4, "step": 24380, "train_speed(iter/s)": 0.201015 }, { "acc": 0.7567276, "epoch": 0.6185946220192795, "grad_norm": 2.3125, "learning_rate": 8.271723690018448e-06, "loss": 0.97615852, "memory(GiB)": 369.4, "step": 24385, "train_speed(iter/s)": 0.201023 }, { "acc": 0.73067651, "epoch": 0.6187214611872146, "grad_norm": 2.25, "learning_rate": 8.270930652642295e-06, "loss": 1.04129343, "memory(GiB)": 369.4, "step": 24390, "train_speed(iter/s)": 0.201031 }, { "acc": 0.74406557, "epoch": 0.6188483003551497, "grad_norm": 2.125, "learning_rate": 8.270137471396007e-06, "loss": 1.02644978, "memory(GiB)": 369.4, "step": 24395, "train_speed(iter/s)": 0.201038 }, { "acc": 0.74885283, "epoch": 0.6189751395230847, "grad_norm": 1.8125, "learning_rate": 8.269344146314475e-06, "loss": 1.01488762, "memory(GiB)": 369.4, "step": 24400, "train_speed(iter/s)": 0.201043 }, { "acc": 0.74918532, "epoch": 0.6191019786910198, "grad_norm": 2.21875, "learning_rate": 8.26855067743259e-06, "loss": 1.05063801, "memory(GiB)": 369.4, "step": 24405, "train_speed(iter/s)": 0.201048 }, { "acc": 0.75488586, "epoch": 0.6192288178589549, "grad_norm": 2.03125, "learning_rate": 8.267757064785254e-06, "loss": 0.99297838, "memory(GiB)": 369.4, "step": 24410, "train_speed(iter/s)": 0.201056 }, { "acc": 0.74761858, "epoch": 0.6193556570268899, "grad_norm": 2.46875, "learning_rate": 8.26696330840737e-06, "loss": 1.0207428, "memory(GiB)": 369.4, "step": 24415, "train_speed(iter/s)": 0.201062 }, { "acc": 0.76044512, "epoch": 0.619482496194825, "grad_norm": 2.234375, "learning_rate": 8.266169408333856e-06, "loss": 0.99342279, "memory(GiB)": 369.4, "step": 24420, "train_speed(iter/s)": 0.201068 }, { "acc": 0.75507274, "epoch": 0.61960933536276, "grad_norm": 2.1875, "learning_rate": 8.265375364599629e-06, "loss": 1.02126303, "memory(GiB)": 369.4, "step": 24425, "train_speed(iter/s)": 0.201076 }, { "acc": 0.76050539, "epoch": 0.6197361745306951, "grad_norm": 2.34375, "learning_rate": 8.264581177239615e-06, "loss": 1.01054878, "memory(GiB)": 369.4, "step": 24430, "train_speed(iter/s)": 0.201081 }, { "acc": 0.75944619, "epoch": 0.6198630136986302, "grad_norm": 2.140625, "learning_rate": 8.263786846288745e-06, "loss": 0.96374264, "memory(GiB)": 369.4, "step": 24435, "train_speed(iter/s)": 0.201089 }, { "acc": 0.75255947, "epoch": 0.6199898528665652, "grad_norm": 2.40625, "learning_rate": 8.262992371781956e-06, "loss": 0.98841343, "memory(GiB)": 369.4, "step": 24440, "train_speed(iter/s)": 0.201094 }, { "acc": 0.73200626, "epoch": 0.6201166920345003, "grad_norm": 2.171875, "learning_rate": 8.262197753754195e-06, "loss": 1.07421532, "memory(GiB)": 369.4, "step": 24445, "train_speed(iter/s)": 0.201103 }, { "acc": 0.74238238, "epoch": 0.6202435312024354, "grad_norm": 2.625, "learning_rate": 8.261402992240414e-06, "loss": 1.03487291, "memory(GiB)": 369.4, "step": 24450, "train_speed(iter/s)": 0.20111 }, { "acc": 0.74514456, "epoch": 0.6203703703703703, "grad_norm": 2.5, "learning_rate": 8.260608087275566e-06, "loss": 1.04124355, "memory(GiB)": 369.4, "step": 24455, "train_speed(iter/s)": 0.201116 }, { "acc": 0.74257874, "epoch": 0.6204972095383054, "grad_norm": 2.03125, "learning_rate": 8.259813038894617e-06, "loss": 1.03968048, "memory(GiB)": 369.4, "step": 24460, "train_speed(iter/s)": 0.201121 }, { "acc": 0.74809952, "epoch": 0.6206240487062404, "grad_norm": 2.0, "learning_rate": 8.259017847132538e-06, "loss": 0.9969861, "memory(GiB)": 369.4, "step": 24465, "train_speed(iter/s)": 0.201123 }, { "acc": 0.73870411, "epoch": 0.6207508878741755, "grad_norm": 2.28125, "learning_rate": 8.258222512024303e-06, "loss": 1.05897369, "memory(GiB)": 369.4, "step": 24470, "train_speed(iter/s)": 0.201132 }, { "acc": 0.75791206, "epoch": 0.6208777270421106, "grad_norm": 1.8828125, "learning_rate": 8.257427033604894e-06, "loss": 0.95063324, "memory(GiB)": 369.4, "step": 24475, "train_speed(iter/s)": 0.20114 }, { "acc": 0.73960905, "epoch": 0.6210045662100456, "grad_norm": 2.09375, "learning_rate": 8.256631411909305e-06, "loss": 1.02361126, "memory(GiB)": 369.4, "step": 24480, "train_speed(iter/s)": 0.201144 }, { "acc": 0.74633455, "epoch": 0.6211314053779807, "grad_norm": 2.28125, "learning_rate": 8.25583564697252e-06, "loss": 0.95015106, "memory(GiB)": 369.4, "step": 24485, "train_speed(iter/s)": 0.201149 }, { "acc": 0.74825306, "epoch": 0.6212582445459158, "grad_norm": 2.171875, "learning_rate": 8.255039738829552e-06, "loss": 1.01030235, "memory(GiB)": 369.4, "step": 24490, "train_speed(iter/s)": 0.201156 }, { "acc": 0.74566145, "epoch": 0.6213850837138508, "grad_norm": 2.3125, "learning_rate": 8.254243687515402e-06, "loss": 0.9883604, "memory(GiB)": 369.4, "step": 24495, "train_speed(iter/s)": 0.20116 }, { "acc": 0.74607506, "epoch": 0.6215119228817859, "grad_norm": 2.09375, "learning_rate": 8.253447493065085e-06, "loss": 1.03372631, "memory(GiB)": 369.4, "step": 24500, "train_speed(iter/s)": 0.201164 }, { "acc": 0.73998208, "epoch": 0.6216387620497209, "grad_norm": 2.21875, "learning_rate": 8.252651155513622e-06, "loss": 1.01619301, "memory(GiB)": 369.4, "step": 24505, "train_speed(iter/s)": 0.201171 }, { "acc": 0.72980566, "epoch": 0.621765601217656, "grad_norm": 2.109375, "learning_rate": 8.251854674896039e-06, "loss": 1.08547535, "memory(GiB)": 369.4, "step": 24510, "train_speed(iter/s)": 0.201176 }, { "acc": 0.73927064, "epoch": 0.6218924403855911, "grad_norm": 2.140625, "learning_rate": 8.251058051247368e-06, "loss": 1.01978807, "memory(GiB)": 369.4, "step": 24515, "train_speed(iter/s)": 0.201182 }, { "acc": 0.75205431, "epoch": 0.6220192795535261, "grad_norm": 2.125, "learning_rate": 8.250261284602651e-06, "loss": 0.99172506, "memory(GiB)": 369.4, "step": 24520, "train_speed(iter/s)": 0.201189 }, { "acc": 0.74527178, "epoch": 0.6221461187214612, "grad_norm": 2.140625, "learning_rate": 8.249464374996932e-06, "loss": 1.03820248, "memory(GiB)": 369.4, "step": 24525, "train_speed(iter/s)": 0.201195 }, { "acc": 0.74331226, "epoch": 0.6222729578893963, "grad_norm": 2.203125, "learning_rate": 8.24866732246526e-06, "loss": 1.04260616, "memory(GiB)": 369.4, "step": 24530, "train_speed(iter/s)": 0.201202 }, { "acc": 0.75027614, "epoch": 0.6223997970573313, "grad_norm": 2.609375, "learning_rate": 8.247870127042695e-06, "loss": 0.99842491, "memory(GiB)": 369.4, "step": 24535, "train_speed(iter/s)": 0.201208 }, { "acc": 0.7423275, "epoch": 0.6225266362252664, "grad_norm": 2.4375, "learning_rate": 8.247072788764302e-06, "loss": 1.00385084, "memory(GiB)": 369.4, "step": 24540, "train_speed(iter/s)": 0.201214 }, { "acc": 0.73271236, "epoch": 0.6226534753932014, "grad_norm": 2.109375, "learning_rate": 8.246275307665147e-06, "loss": 1.05170975, "memory(GiB)": 369.4, "step": 24545, "train_speed(iter/s)": 0.201221 }, { "acc": 0.75231247, "epoch": 0.6227803145611365, "grad_norm": 2.1875, "learning_rate": 8.245477683780316e-06, "loss": 0.93840637, "memory(GiB)": 369.4, "step": 24550, "train_speed(iter/s)": 0.201227 }, { "acc": 0.75493712, "epoch": 0.6229071537290716, "grad_norm": 2.109375, "learning_rate": 8.244679917144883e-06, "loss": 1.00413609, "memory(GiB)": 369.4, "step": 24555, "train_speed(iter/s)": 0.201233 }, { "acc": 0.72872543, "epoch": 0.6230339928970066, "grad_norm": 2.296875, "learning_rate": 8.243882007793941e-06, "loss": 1.08913183, "memory(GiB)": 369.4, "step": 24560, "train_speed(iter/s)": 0.20124 }, { "acc": 0.74825869, "epoch": 0.6231608320649417, "grad_norm": 2.3125, "learning_rate": 8.243083955762588e-06, "loss": 1.00630856, "memory(GiB)": 369.4, "step": 24565, "train_speed(iter/s)": 0.201246 }, { "acc": 0.74674983, "epoch": 0.6232876712328768, "grad_norm": 2.28125, "learning_rate": 8.24228576108592e-06, "loss": 1.02384405, "memory(GiB)": 369.4, "step": 24570, "train_speed(iter/s)": 0.201254 }, { "acc": 0.74328017, "epoch": 0.6234145104008117, "grad_norm": 2.484375, "learning_rate": 8.24148742379905e-06, "loss": 0.98845921, "memory(GiB)": 369.4, "step": 24575, "train_speed(iter/s)": 0.201258 }, { "acc": 0.76183519, "epoch": 0.6235413495687468, "grad_norm": 2.109375, "learning_rate": 8.240688943937092e-06, "loss": 1.00026226, "memory(GiB)": 369.4, "step": 24580, "train_speed(iter/s)": 0.201266 }, { "acc": 0.74896474, "epoch": 0.6236681887366818, "grad_norm": 2.421875, "learning_rate": 8.239890321535163e-06, "loss": 1.01818523, "memory(GiB)": 369.4, "step": 24585, "train_speed(iter/s)": 0.201271 }, { "acc": 0.73902464, "epoch": 0.6237950279046169, "grad_norm": 2.203125, "learning_rate": 8.239091556628395e-06, "loss": 1.04732513, "memory(GiB)": 369.4, "step": 24590, "train_speed(iter/s)": 0.201278 }, { "acc": 0.73950028, "epoch": 0.623921867072552, "grad_norm": 2.171875, "learning_rate": 8.238292649251918e-06, "loss": 1.03579855, "memory(GiB)": 369.4, "step": 24595, "train_speed(iter/s)": 0.201286 }, { "acc": 0.75340357, "epoch": 0.624048706240487, "grad_norm": 2.0, "learning_rate": 8.237493599440871e-06, "loss": 0.99255333, "memory(GiB)": 369.4, "step": 24600, "train_speed(iter/s)": 0.201292 }, { "acc": 0.7249404, "epoch": 0.6241755454084221, "grad_norm": 2.25, "learning_rate": 8.236694407230402e-06, "loss": 1.12898102, "memory(GiB)": 369.4, "step": 24605, "train_speed(iter/s)": 0.201299 }, { "acc": 0.76614571, "epoch": 0.6243023845763572, "grad_norm": 2.109375, "learning_rate": 8.235895072655664e-06, "loss": 0.94894428, "memory(GiB)": 369.4, "step": 24610, "train_speed(iter/s)": 0.201296 }, { "acc": 0.74650869, "epoch": 0.6244292237442922, "grad_norm": 2.328125, "learning_rate": 8.235095595751809e-06, "loss": 1.0242487, "memory(GiB)": 369.4, "step": 24615, "train_speed(iter/s)": 0.2013 }, { "acc": 0.7572001, "epoch": 0.6245560629122273, "grad_norm": 2.265625, "learning_rate": 8.23429597655401e-06, "loss": 1.00425406, "memory(GiB)": 369.4, "step": 24620, "train_speed(iter/s)": 0.201307 }, { "acc": 0.72608452, "epoch": 0.6246829020801623, "grad_norm": 2.453125, "learning_rate": 8.233496215097433e-06, "loss": 1.08003731, "memory(GiB)": 369.4, "step": 24625, "train_speed(iter/s)": 0.201313 }, { "acc": 0.74476333, "epoch": 0.6248097412480974, "grad_norm": 2.171875, "learning_rate": 8.232696311417256e-06, "loss": 0.98133144, "memory(GiB)": 369.4, "step": 24630, "train_speed(iter/s)": 0.201321 }, { "acc": 0.74404526, "epoch": 0.6249365804160325, "grad_norm": 2.40625, "learning_rate": 8.231896265548662e-06, "loss": 1.05146809, "memory(GiB)": 369.4, "step": 24635, "train_speed(iter/s)": 0.201325 }, { "acc": 0.74404745, "epoch": 0.6250634195839675, "grad_norm": 2.0625, "learning_rate": 8.231096077526841e-06, "loss": 1.03180008, "memory(GiB)": 369.4, "step": 24640, "train_speed(iter/s)": 0.201331 }, { "acc": 0.76027193, "epoch": 0.6251902587519026, "grad_norm": 2.171875, "learning_rate": 8.230295747386988e-06, "loss": 0.96722546, "memory(GiB)": 369.4, "step": 24645, "train_speed(iter/s)": 0.20134 }, { "acc": 0.75190353, "epoch": 0.6253170979198377, "grad_norm": 2.125, "learning_rate": 8.229495275164307e-06, "loss": 0.9362977, "memory(GiB)": 369.4, "step": 24650, "train_speed(iter/s)": 0.201347 }, { "acc": 0.74331784, "epoch": 0.6254439370877727, "grad_norm": 2.390625, "learning_rate": 8.228694660894003e-06, "loss": 1.06347694, "memory(GiB)": 369.4, "step": 24655, "train_speed(iter/s)": 0.201354 }, { "acc": 0.7593647, "epoch": 0.6255707762557078, "grad_norm": 2.203125, "learning_rate": 8.227893904611295e-06, "loss": 0.96074505, "memory(GiB)": 369.4, "step": 24660, "train_speed(iter/s)": 0.201361 }, { "acc": 0.75014286, "epoch": 0.6256976154236428, "grad_norm": 1.78125, "learning_rate": 8.2270930063514e-06, "loss": 0.99967556, "memory(GiB)": 369.4, "step": 24665, "train_speed(iter/s)": 0.201368 }, { "acc": 0.74265056, "epoch": 0.6258244545915779, "grad_norm": 2.203125, "learning_rate": 8.226291966149549e-06, "loss": 1.0505784, "memory(GiB)": 369.4, "step": 24670, "train_speed(iter/s)": 0.201374 }, { "acc": 0.75570784, "epoch": 0.625951293759513, "grad_norm": 2.171875, "learning_rate": 8.225490784040971e-06, "loss": 0.94803524, "memory(GiB)": 369.4, "step": 24675, "train_speed(iter/s)": 0.201379 }, { "acc": 0.74133768, "epoch": 0.626078132927448, "grad_norm": 2.640625, "learning_rate": 8.224689460060908e-06, "loss": 1.03585739, "memory(GiB)": 369.4, "step": 24680, "train_speed(iter/s)": 0.201386 }, { "acc": 0.76221094, "epoch": 0.6262049720953831, "grad_norm": 2.09375, "learning_rate": 8.223887994244604e-06, "loss": 0.95367298, "memory(GiB)": 369.4, "step": 24685, "train_speed(iter/s)": 0.201392 }, { "acc": 0.76050797, "epoch": 0.6263318112633182, "grad_norm": 1.984375, "learning_rate": 8.223086386627314e-06, "loss": 0.95685568, "memory(GiB)": 369.4, "step": 24690, "train_speed(iter/s)": 0.2014 }, { "acc": 0.76556196, "epoch": 0.6264586504312532, "grad_norm": 2.09375, "learning_rate": 8.222284637244296e-06, "loss": 0.94345798, "memory(GiB)": 369.4, "step": 24695, "train_speed(iter/s)": 0.201407 }, { "acc": 0.75028982, "epoch": 0.6265854895991883, "grad_norm": 1.859375, "learning_rate": 8.221482746130811e-06, "loss": 0.97842216, "memory(GiB)": 369.4, "step": 24700, "train_speed(iter/s)": 0.201414 }, { "acc": 0.74640975, "epoch": 0.6267123287671232, "grad_norm": 2.140625, "learning_rate": 8.220680713322131e-06, "loss": 1.01588535, "memory(GiB)": 369.4, "step": 24705, "train_speed(iter/s)": 0.201418 }, { "acc": 0.74702616, "epoch": 0.6268391679350583, "grad_norm": 2.046875, "learning_rate": 8.219878538853537e-06, "loss": 0.99839001, "memory(GiB)": 369.4, "step": 24710, "train_speed(iter/s)": 0.20142 }, { "acc": 0.73277092, "epoch": 0.6269660071029934, "grad_norm": 1.9609375, "learning_rate": 8.219076222760307e-06, "loss": 1.04679203, "memory(GiB)": 369.4, "step": 24715, "train_speed(iter/s)": 0.201425 }, { "acc": 0.7300097, "epoch": 0.6270928462709284, "grad_norm": 2.03125, "learning_rate": 8.218273765077734e-06, "loss": 1.01174774, "memory(GiB)": 369.4, "step": 24720, "train_speed(iter/s)": 0.201429 }, { "acc": 0.75440302, "epoch": 0.6272196854388635, "grad_norm": 2.140625, "learning_rate": 8.21747116584111e-06, "loss": 0.9839921, "memory(GiB)": 369.4, "step": 24725, "train_speed(iter/s)": 0.201438 }, { "acc": 0.74269662, "epoch": 0.6273465246067986, "grad_norm": 1.9375, "learning_rate": 8.21666842508574e-06, "loss": 0.97122078, "memory(GiB)": 369.4, "step": 24730, "train_speed(iter/s)": 0.201445 }, { "acc": 0.75139761, "epoch": 0.6274733637747336, "grad_norm": 2.609375, "learning_rate": 8.215865542846932e-06, "loss": 1.09303179, "memory(GiB)": 369.4, "step": 24735, "train_speed(iter/s)": 0.201451 }, { "acc": 0.74000573, "epoch": 0.6276002029426687, "grad_norm": 2.234375, "learning_rate": 8.215062519160002e-06, "loss": 1.04172096, "memory(GiB)": 369.4, "step": 24740, "train_speed(iter/s)": 0.201457 }, { "acc": 0.7503201, "epoch": 0.6277270421106037, "grad_norm": 2.09375, "learning_rate": 8.214259354060263e-06, "loss": 1.02474298, "memory(GiB)": 369.4, "step": 24745, "train_speed(iter/s)": 0.201463 }, { "acc": 0.74456873, "epoch": 0.6278538812785388, "grad_norm": 2.46875, "learning_rate": 8.21345604758305e-06, "loss": 1.00963469, "memory(GiB)": 369.4, "step": 24750, "train_speed(iter/s)": 0.201469 }, { "acc": 0.75185623, "epoch": 0.6279807204464739, "grad_norm": 1.890625, "learning_rate": 8.212652599763693e-06, "loss": 0.98552818, "memory(GiB)": 369.4, "step": 24755, "train_speed(iter/s)": 0.201474 }, { "acc": 0.74589987, "epoch": 0.6281075596144089, "grad_norm": 2.953125, "learning_rate": 8.211849010637532e-06, "loss": 0.99353123, "memory(GiB)": 369.4, "step": 24760, "train_speed(iter/s)": 0.201482 }, { "acc": 0.73846083, "epoch": 0.628234398782344, "grad_norm": 2.046875, "learning_rate": 8.211045280239908e-06, "loss": 0.97884789, "memory(GiB)": 369.4, "step": 24765, "train_speed(iter/s)": 0.201488 }, { "acc": 0.7430438, "epoch": 0.6283612379502791, "grad_norm": 2.140625, "learning_rate": 8.210241408606182e-06, "loss": 0.98496056, "memory(GiB)": 369.4, "step": 24770, "train_speed(iter/s)": 0.201492 }, { "acc": 0.75083179, "epoch": 0.6284880771182141, "grad_norm": 2.140625, "learning_rate": 8.2094373957717e-06, "loss": 1.04032526, "memory(GiB)": 369.4, "step": 24775, "train_speed(iter/s)": 0.201499 }, { "acc": 0.74758215, "epoch": 0.6286149162861492, "grad_norm": 2.203125, "learning_rate": 8.208633241771836e-06, "loss": 0.9890192, "memory(GiB)": 369.4, "step": 24780, "train_speed(iter/s)": 0.201505 }, { "acc": 0.74584999, "epoch": 0.6287417554540842, "grad_norm": 2.1875, "learning_rate": 8.207828946641956e-06, "loss": 1.02772322, "memory(GiB)": 369.4, "step": 24785, "train_speed(iter/s)": 0.201511 }, { "acc": 0.75521164, "epoch": 0.6288685946220193, "grad_norm": 1.7421875, "learning_rate": 8.207024510417436e-06, "loss": 0.99766617, "memory(GiB)": 369.4, "step": 24790, "train_speed(iter/s)": 0.201517 }, { "acc": 0.74673147, "epoch": 0.6289954337899544, "grad_norm": 2.359375, "learning_rate": 8.20621993313366e-06, "loss": 1.02953949, "memory(GiB)": 369.4, "step": 24795, "train_speed(iter/s)": 0.201524 }, { "acc": 0.74859409, "epoch": 0.6291222729578894, "grad_norm": 2.21875, "learning_rate": 8.205415214826018e-06, "loss": 1.1649272, "memory(GiB)": 369.4, "step": 24800, "train_speed(iter/s)": 0.201528 }, { "acc": 0.7415472, "epoch": 0.6292491121258245, "grad_norm": 2.015625, "learning_rate": 8.204610355529901e-06, "loss": 1.02111311, "memory(GiB)": 369.4, "step": 24805, "train_speed(iter/s)": 0.201533 }, { "acc": 0.73837276, "epoch": 0.6293759512937596, "grad_norm": 2.046875, "learning_rate": 8.203805355280715e-06, "loss": 1.0113903, "memory(GiB)": 369.4, "step": 24810, "train_speed(iter/s)": 0.201538 }, { "acc": 0.74716835, "epoch": 0.6295027904616946, "grad_norm": 2.03125, "learning_rate": 8.203000214113865e-06, "loss": 1.03338394, "memory(GiB)": 369.4, "step": 24815, "train_speed(iter/s)": 0.201546 }, { "acc": 0.7490098, "epoch": 0.6296296296296297, "grad_norm": 2.15625, "learning_rate": 8.202194932064767e-06, "loss": 1.01458931, "memory(GiB)": 369.4, "step": 24820, "train_speed(iter/s)": 0.201551 }, { "acc": 0.74594641, "epoch": 0.6297564687975646, "grad_norm": 2.53125, "learning_rate": 8.201389509168836e-06, "loss": 1.03022575, "memory(GiB)": 369.4, "step": 24825, "train_speed(iter/s)": 0.201553 }, { "acc": 0.72532282, "epoch": 0.6298833079654997, "grad_norm": 2.40625, "learning_rate": 8.200583945461502e-06, "loss": 1.0452342, "memory(GiB)": 369.4, "step": 24830, "train_speed(iter/s)": 0.201557 }, { "acc": 0.74071093, "epoch": 0.6300101471334348, "grad_norm": 2.796875, "learning_rate": 8.199778240978197e-06, "loss": 0.98447552, "memory(GiB)": 369.4, "step": 24835, "train_speed(iter/s)": 0.201563 }, { "acc": 0.72836828, "epoch": 0.6301369863013698, "grad_norm": 2.578125, "learning_rate": 8.19897239575436e-06, "loss": 1.05360575, "memory(GiB)": 369.4, "step": 24840, "train_speed(iter/s)": 0.201571 }, { "acc": 0.72923708, "epoch": 0.6302638254693049, "grad_norm": 1.9921875, "learning_rate": 8.198166409825434e-06, "loss": 1.06442451, "memory(GiB)": 369.4, "step": 24845, "train_speed(iter/s)": 0.201577 }, { "acc": 0.74918041, "epoch": 0.63039066463724, "grad_norm": 2.546875, "learning_rate": 8.19736028322687e-06, "loss": 1.01742516, "memory(GiB)": 369.4, "step": 24850, "train_speed(iter/s)": 0.201585 }, { "acc": 0.7451499, "epoch": 0.630517503805175, "grad_norm": 1.8984375, "learning_rate": 8.196554015994126e-06, "loss": 1.01521492, "memory(GiB)": 369.4, "step": 24855, "train_speed(iter/s)": 0.201591 }, { "acc": 0.7508709, "epoch": 0.6306443429731101, "grad_norm": 2.21875, "learning_rate": 8.195747608162665e-06, "loss": 1.01280899, "memory(GiB)": 369.4, "step": 24860, "train_speed(iter/s)": 0.201599 }, { "acc": 0.74791088, "epoch": 0.6307711821410451, "grad_norm": 2.0, "learning_rate": 8.194941059767957e-06, "loss": 1.01124287, "memory(GiB)": 369.4, "step": 24865, "train_speed(iter/s)": 0.201604 }, { "acc": 0.73956404, "epoch": 0.6308980213089802, "grad_norm": 2.234375, "learning_rate": 8.194134370845474e-06, "loss": 1.0165103, "memory(GiB)": 369.4, "step": 24870, "train_speed(iter/s)": 0.201612 }, { "acc": 0.7463448, "epoch": 0.6310248604769153, "grad_norm": 2.5, "learning_rate": 8.193327541430703e-06, "loss": 1.04220943, "memory(GiB)": 369.4, "step": 24875, "train_speed(iter/s)": 0.201618 }, { "acc": 0.75267277, "epoch": 0.6311516996448503, "grad_norm": 2.234375, "learning_rate": 8.192520571559128e-06, "loss": 1.04846706, "memory(GiB)": 369.4, "step": 24880, "train_speed(iter/s)": 0.201625 }, { "acc": 0.75175934, "epoch": 0.6312785388127854, "grad_norm": 2.53125, "learning_rate": 8.191713461266246e-06, "loss": 1.01135235, "memory(GiB)": 369.4, "step": 24885, "train_speed(iter/s)": 0.201632 }, { "acc": 0.74319544, "epoch": 0.6314053779807205, "grad_norm": 2.046875, "learning_rate": 8.190906210587555e-06, "loss": 1.01913433, "memory(GiB)": 369.4, "step": 24890, "train_speed(iter/s)": 0.201637 }, { "acc": 0.75627031, "epoch": 0.6315322171486555, "grad_norm": 1.984375, "learning_rate": 8.190098819558562e-06, "loss": 0.94368362, "memory(GiB)": 369.4, "step": 24895, "train_speed(iter/s)": 0.201644 }, { "acc": 0.74081564, "epoch": 0.6316590563165906, "grad_norm": 1.84375, "learning_rate": 8.189291288214782e-06, "loss": 0.99618645, "memory(GiB)": 369.4, "step": 24900, "train_speed(iter/s)": 0.201652 }, { "acc": 0.73680053, "epoch": 0.6317858954845256, "grad_norm": 1.8046875, "learning_rate": 8.18848361659173e-06, "loss": 1.0365015, "memory(GiB)": 369.4, "step": 24905, "train_speed(iter/s)": 0.201656 }, { "acc": 0.75678272, "epoch": 0.6319127346524607, "grad_norm": 2.140625, "learning_rate": 8.187675804724935e-06, "loss": 1.01222525, "memory(GiB)": 369.4, "step": 24910, "train_speed(iter/s)": 0.201662 }, { "acc": 0.74445705, "epoch": 0.6320395738203958, "grad_norm": 2.03125, "learning_rate": 8.186867852649925e-06, "loss": 0.98885918, "memory(GiB)": 369.4, "step": 24915, "train_speed(iter/s)": 0.201668 }, { "acc": 0.75259223, "epoch": 0.6321664129883308, "grad_norm": 2.203125, "learning_rate": 8.186059760402238e-06, "loss": 1.00695162, "memory(GiB)": 369.4, "step": 24920, "train_speed(iter/s)": 0.201673 }, { "acc": 0.75643163, "epoch": 0.6322932521562659, "grad_norm": 2.09375, "learning_rate": 8.185251528017419e-06, "loss": 1.00791225, "memory(GiB)": 369.4, "step": 24925, "train_speed(iter/s)": 0.201678 }, { "acc": 0.7453722, "epoch": 0.632420091324201, "grad_norm": 2.234375, "learning_rate": 8.184443155531016e-06, "loss": 1.00427122, "memory(GiB)": 369.4, "step": 24930, "train_speed(iter/s)": 0.201685 }, { "acc": 0.75444441, "epoch": 0.632546930492136, "grad_norm": 1.9921875, "learning_rate": 8.183634642978586e-06, "loss": 0.99664688, "memory(GiB)": 369.4, "step": 24935, "train_speed(iter/s)": 0.20169 }, { "acc": 0.74882317, "epoch": 0.632673769660071, "grad_norm": 2.46875, "learning_rate": 8.18282599039569e-06, "loss": 0.98648129, "memory(GiB)": 369.4, "step": 24940, "train_speed(iter/s)": 0.201695 }, { "acc": 0.74301386, "epoch": 0.632800608828006, "grad_norm": 2.53125, "learning_rate": 8.182017197817898e-06, "loss": 1.03991318, "memory(GiB)": 369.4, "step": 24945, "train_speed(iter/s)": 0.201702 }, { "acc": 0.74833326, "epoch": 0.6329274479959411, "grad_norm": 2.0625, "learning_rate": 8.181208265280782e-06, "loss": 1.00993156, "memory(GiB)": 369.4, "step": 24950, "train_speed(iter/s)": 0.201703 }, { "acc": 0.73705521, "epoch": 0.6330542871638762, "grad_norm": 2.28125, "learning_rate": 8.180399192819923e-06, "loss": 1.04347992, "memory(GiB)": 369.4, "step": 24955, "train_speed(iter/s)": 0.20171 }, { "acc": 0.74451675, "epoch": 0.6331811263318112, "grad_norm": 2.609375, "learning_rate": 8.17958998047091e-06, "loss": 1.01787491, "memory(GiB)": 369.4, "step": 24960, "train_speed(iter/s)": 0.201716 }, { "acc": 0.75386286, "epoch": 0.6333079654997463, "grad_norm": 2.421875, "learning_rate": 8.178780628269332e-06, "loss": 0.95343666, "memory(GiB)": 369.4, "step": 24965, "train_speed(iter/s)": 0.201722 }, { "acc": 0.74805484, "epoch": 0.6334348046676814, "grad_norm": 2.125, "learning_rate": 8.177971136250788e-06, "loss": 1.06416903, "memory(GiB)": 369.4, "step": 24970, "train_speed(iter/s)": 0.201722 }, { "acc": 0.74678507, "epoch": 0.6335616438356164, "grad_norm": 2.015625, "learning_rate": 8.177161504450887e-06, "loss": 0.9986167, "memory(GiB)": 369.4, "step": 24975, "train_speed(iter/s)": 0.20173 }, { "acc": 0.74034138, "epoch": 0.6336884830035515, "grad_norm": 2.0, "learning_rate": 8.176351732905239e-06, "loss": 1.05146828, "memory(GiB)": 369.4, "step": 24980, "train_speed(iter/s)": 0.201737 }, { "acc": 0.74950905, "epoch": 0.6338153221714865, "grad_norm": 2.46875, "learning_rate": 8.175541821649459e-06, "loss": 1.04767227, "memory(GiB)": 369.4, "step": 24985, "train_speed(iter/s)": 0.201743 }, { "acc": 0.76734209, "epoch": 0.6339421613394216, "grad_norm": 2.25, "learning_rate": 8.174731770719173e-06, "loss": 0.95047817, "memory(GiB)": 369.4, "step": 24990, "train_speed(iter/s)": 0.201745 }, { "acc": 0.74658403, "epoch": 0.6340690005073567, "grad_norm": 1.96875, "learning_rate": 8.173921580150008e-06, "loss": 1.01711197, "memory(GiB)": 369.4, "step": 24995, "train_speed(iter/s)": 0.201751 }, { "acc": 0.73783355, "epoch": 0.6341958396752917, "grad_norm": 2.34375, "learning_rate": 8.173111249977602e-06, "loss": 1.08297615, "memory(GiB)": 369.4, "step": 25000, "train_speed(iter/s)": 0.201757 }, { "epoch": 0.6341958396752917, "eval_acc": 0.7360028107384581, "eval_loss": 0.979103684425354, "eval_runtime": 384.6458, "eval_samples_per_second": 16.561, "eval_steps_per_second": 8.28, "step": 25000 }, { "acc": 0.75729094, "epoch": 0.6343226788432268, "grad_norm": 2.203125, "learning_rate": 8.172300780237596e-06, "loss": 0.93621464, "memory(GiB)": 369.4, "step": 25005, "train_speed(iter/s)": 0.200607 }, { "acc": 0.74424953, "epoch": 0.6344495180111619, "grad_norm": 2.234375, "learning_rate": 8.171490170965639e-06, "loss": 0.98406258, "memory(GiB)": 369.4, "step": 25010, "train_speed(iter/s)": 0.200615 }, { "acc": 0.76226788, "epoch": 0.6345763571790969, "grad_norm": 2.625, "learning_rate": 8.170679422197385e-06, "loss": 0.95543127, "memory(GiB)": 369.4, "step": 25015, "train_speed(iter/s)": 0.20062 }, { "acc": 0.74663849, "epoch": 0.634703196347032, "grad_norm": 2.328125, "learning_rate": 8.169868533968493e-06, "loss": 0.98571053, "memory(GiB)": 369.4, "step": 25020, "train_speed(iter/s)": 0.200626 }, { "acc": 0.74336824, "epoch": 0.634830035514967, "grad_norm": 1.8828125, "learning_rate": 8.16905750631463e-06, "loss": 1.03214064, "memory(GiB)": 369.4, "step": 25025, "train_speed(iter/s)": 0.200633 }, { "acc": 0.75226946, "epoch": 0.6349568746829021, "grad_norm": 2.90625, "learning_rate": 8.168246339271471e-06, "loss": 0.99833603, "memory(GiB)": 369.4, "step": 25030, "train_speed(iter/s)": 0.200637 }, { "acc": 0.73213792, "epoch": 0.6350837138508372, "grad_norm": 1.9375, "learning_rate": 8.16743503287469e-06, "loss": 1.00828247, "memory(GiB)": 369.4, "step": 25035, "train_speed(iter/s)": 0.200644 }, { "acc": 0.74757004, "epoch": 0.6352105530187722, "grad_norm": 2.359375, "learning_rate": 8.166623587159978e-06, "loss": 0.96358347, "memory(GiB)": 369.4, "step": 25040, "train_speed(iter/s)": 0.200647 }, { "acc": 0.75117016, "epoch": 0.6353373921867073, "grad_norm": 2.234375, "learning_rate": 8.16581200216302e-06, "loss": 0.99594955, "memory(GiB)": 369.4, "step": 25045, "train_speed(iter/s)": 0.200656 }, { "acc": 0.74071808, "epoch": 0.6354642313546424, "grad_norm": 1.8984375, "learning_rate": 8.165000277919517e-06, "loss": 1.02067747, "memory(GiB)": 369.4, "step": 25050, "train_speed(iter/s)": 0.200661 }, { "acc": 0.75011077, "epoch": 0.6355910705225774, "grad_norm": 2.328125, "learning_rate": 8.16418841446517e-06, "loss": 1.00558147, "memory(GiB)": 369.4, "step": 25055, "train_speed(iter/s)": 0.200665 }, { "acc": 0.7453475, "epoch": 0.6357179096905125, "grad_norm": 1.84375, "learning_rate": 8.163376411835691e-06, "loss": 1.01267395, "memory(GiB)": 369.4, "step": 25060, "train_speed(iter/s)": 0.200673 }, { "acc": 0.75674763, "epoch": 0.6358447488584474, "grad_norm": 1.8828125, "learning_rate": 8.162564270066793e-06, "loss": 0.94614162, "memory(GiB)": 369.4, "step": 25065, "train_speed(iter/s)": 0.200674 }, { "acc": 0.75071878, "epoch": 0.6359715880263825, "grad_norm": 2.4375, "learning_rate": 8.1617519891942e-06, "loss": 1.01253872, "memory(GiB)": 369.4, "step": 25070, "train_speed(iter/s)": 0.200679 }, { "acc": 0.74408159, "epoch": 0.6360984271943176, "grad_norm": 1.984375, "learning_rate": 8.160939569253637e-06, "loss": 0.97565546, "memory(GiB)": 369.4, "step": 25075, "train_speed(iter/s)": 0.200685 }, { "acc": 0.7433042, "epoch": 0.6362252663622526, "grad_norm": 2.0625, "learning_rate": 8.160127010280838e-06, "loss": 1.01561375, "memory(GiB)": 369.4, "step": 25080, "train_speed(iter/s)": 0.200691 }, { "acc": 0.73845439, "epoch": 0.6363521055301877, "grad_norm": 2.671875, "learning_rate": 8.159314312311546e-06, "loss": 1.08016167, "memory(GiB)": 369.4, "step": 25085, "train_speed(iter/s)": 0.200696 }, { "acc": 0.74666023, "epoch": 0.6364789446981228, "grad_norm": 2.109375, "learning_rate": 8.158501475381505e-06, "loss": 1.05703583, "memory(GiB)": 369.4, "step": 25090, "train_speed(iter/s)": 0.200699 }, { "acc": 0.75175562, "epoch": 0.6366057838660578, "grad_norm": 1.9140625, "learning_rate": 8.157688499526466e-06, "loss": 0.91630487, "memory(GiB)": 369.4, "step": 25095, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75236096, "epoch": 0.6367326230339929, "grad_norm": 1.9609375, "learning_rate": 8.15687538478219e-06, "loss": 1.02088642, "memory(GiB)": 369.4, "step": 25100, "train_speed(iter/s)": 0.200698 }, { "acc": 0.73485661, "epoch": 0.6368594622019279, "grad_norm": 2.359375, "learning_rate": 8.156062131184439e-06, "loss": 1.04786558, "memory(GiB)": 369.4, "step": 25105, "train_speed(iter/s)": 0.200706 }, { "acc": 0.74586821, "epoch": 0.636986301369863, "grad_norm": 2.40625, "learning_rate": 8.155248738768986e-06, "loss": 1.02865963, "memory(GiB)": 369.4, "step": 25110, "train_speed(iter/s)": 0.200712 }, { "acc": 0.75592318, "epoch": 0.6371131405377981, "grad_norm": 2.015625, "learning_rate": 8.154435207571606e-06, "loss": 0.93476725, "memory(GiB)": 369.4, "step": 25115, "train_speed(iter/s)": 0.200719 }, { "acc": 0.75421081, "epoch": 0.6372399797057331, "grad_norm": 2.78125, "learning_rate": 8.153621537628083e-06, "loss": 0.96934738, "memory(GiB)": 369.4, "step": 25120, "train_speed(iter/s)": 0.200726 }, { "acc": 0.74488955, "epoch": 0.6373668188736682, "grad_norm": 2.25, "learning_rate": 8.152807728974203e-06, "loss": 0.9944849, "memory(GiB)": 369.4, "step": 25125, "train_speed(iter/s)": 0.20073 }, { "acc": 0.76406031, "epoch": 0.6374936580416033, "grad_norm": 2.53125, "learning_rate": 8.151993781645765e-06, "loss": 0.96549683, "memory(GiB)": 369.4, "step": 25130, "train_speed(iter/s)": 0.200735 }, { "acc": 0.72669849, "epoch": 0.6376204972095383, "grad_norm": 3.0625, "learning_rate": 8.151179695678565e-06, "loss": 1.04753742, "memory(GiB)": 369.4, "step": 25135, "train_speed(iter/s)": 0.200743 }, { "acc": 0.75003757, "epoch": 0.6377473363774734, "grad_norm": 2.15625, "learning_rate": 8.150365471108414e-06, "loss": 0.97157249, "memory(GiB)": 369.4, "step": 25140, "train_speed(iter/s)": 0.200748 }, { "acc": 0.73633566, "epoch": 0.6378741755454084, "grad_norm": 2.015625, "learning_rate": 8.149551107971125e-06, "loss": 1.0455267, "memory(GiB)": 369.4, "step": 25145, "train_speed(iter/s)": 0.200755 }, { "acc": 0.72985568, "epoch": 0.6380010147133435, "grad_norm": 2.125, "learning_rate": 8.148736606302517e-06, "loss": 1.05796509, "memory(GiB)": 369.4, "step": 25150, "train_speed(iter/s)": 0.200759 }, { "acc": 0.73625288, "epoch": 0.6381278538812786, "grad_norm": 2.3125, "learning_rate": 8.147921966138412e-06, "loss": 1.04456348, "memory(GiB)": 369.4, "step": 25155, "train_speed(iter/s)": 0.200765 }, { "acc": 0.75609341, "epoch": 0.6382546930492136, "grad_norm": 1.7421875, "learning_rate": 8.147107187514647e-06, "loss": 0.9961174, "memory(GiB)": 369.4, "step": 25160, "train_speed(iter/s)": 0.200769 }, { "acc": 0.75470715, "epoch": 0.6383815322171487, "grad_norm": 1.8125, "learning_rate": 8.146292270467056e-06, "loss": 0.89861784, "memory(GiB)": 369.4, "step": 25165, "train_speed(iter/s)": 0.200769 }, { "acc": 0.75493956, "epoch": 0.6385083713850838, "grad_norm": 2.234375, "learning_rate": 8.145477215031486e-06, "loss": 0.98315125, "memory(GiB)": 369.4, "step": 25170, "train_speed(iter/s)": 0.200773 }, { "acc": 0.74666471, "epoch": 0.6386352105530188, "grad_norm": 2.5, "learning_rate": 8.144662021243782e-06, "loss": 0.96198235, "memory(GiB)": 369.4, "step": 25175, "train_speed(iter/s)": 0.200781 }, { "acc": 0.74676065, "epoch": 0.6387620497209539, "grad_norm": 2.109375, "learning_rate": 8.143846689139805e-06, "loss": 1.07040892, "memory(GiB)": 369.4, "step": 25180, "train_speed(iter/s)": 0.200791 }, { "acc": 0.74098368, "epoch": 0.6388888888888888, "grad_norm": 1.890625, "learning_rate": 8.143031218755411e-06, "loss": 1.03750658, "memory(GiB)": 369.4, "step": 25185, "train_speed(iter/s)": 0.200797 }, { "acc": 0.73168793, "epoch": 0.6390157280568239, "grad_norm": 2.4375, "learning_rate": 8.142215610126474e-06, "loss": 1.06921425, "memory(GiB)": 369.4, "step": 25190, "train_speed(iter/s)": 0.200802 }, { "acc": 0.76263433, "epoch": 0.639142567224759, "grad_norm": 2.1875, "learning_rate": 8.141399863288863e-06, "loss": 0.98110189, "memory(GiB)": 369.4, "step": 25195, "train_speed(iter/s)": 0.200804 }, { "acc": 0.73982029, "epoch": 0.639269406392694, "grad_norm": 3.5, "learning_rate": 8.140583978278463e-06, "loss": 0.98859043, "memory(GiB)": 369.4, "step": 25200, "train_speed(iter/s)": 0.200807 }, { "acc": 0.76273861, "epoch": 0.6393962455606291, "grad_norm": 1.828125, "learning_rate": 8.139767955131157e-06, "loss": 0.89837236, "memory(GiB)": 369.4, "step": 25205, "train_speed(iter/s)": 0.200813 }, { "acc": 0.7583282, "epoch": 0.6395230847285642, "grad_norm": 2.0, "learning_rate": 8.138951793882838e-06, "loss": 0.97107944, "memory(GiB)": 369.4, "step": 25210, "train_speed(iter/s)": 0.20082 }, { "acc": 0.75649748, "epoch": 0.6396499238964992, "grad_norm": 2.046875, "learning_rate": 8.138135494569405e-06, "loss": 0.97707882, "memory(GiB)": 369.4, "step": 25215, "train_speed(iter/s)": 0.200827 }, { "acc": 0.75313606, "epoch": 0.6397767630644343, "grad_norm": 2.28125, "learning_rate": 8.137319057226763e-06, "loss": 0.96301861, "memory(GiB)": 369.4, "step": 25220, "train_speed(iter/s)": 0.200832 }, { "acc": 0.75630989, "epoch": 0.6399036022323693, "grad_norm": 1.9453125, "learning_rate": 8.136502481890821e-06, "loss": 0.99542542, "memory(GiB)": 369.4, "step": 25225, "train_speed(iter/s)": 0.200839 }, { "acc": 0.74964976, "epoch": 0.6400304414003044, "grad_norm": 2.5, "learning_rate": 8.135685768597496e-06, "loss": 0.98569527, "memory(GiB)": 369.4, "step": 25230, "train_speed(iter/s)": 0.200841 }, { "acc": 0.74389248, "epoch": 0.6401572805682395, "grad_norm": 1.8828125, "learning_rate": 8.134868917382713e-06, "loss": 0.99372883, "memory(GiB)": 369.4, "step": 25235, "train_speed(iter/s)": 0.200843 }, { "acc": 0.73934736, "epoch": 0.6402841197361745, "grad_norm": 2.171875, "learning_rate": 8.134051928282396e-06, "loss": 1.03495722, "memory(GiB)": 369.4, "step": 25240, "train_speed(iter/s)": 0.200845 }, { "acc": 0.74615726, "epoch": 0.6404109589041096, "grad_norm": 2.0625, "learning_rate": 8.133234801332484e-06, "loss": 1.03705273, "memory(GiB)": 369.4, "step": 25245, "train_speed(iter/s)": 0.200852 }, { "acc": 0.7506052, "epoch": 0.6405377980720447, "grad_norm": 2.0625, "learning_rate": 8.132417536568918e-06, "loss": 0.98192987, "memory(GiB)": 369.4, "step": 25250, "train_speed(iter/s)": 0.200857 }, { "acc": 0.73155546, "epoch": 0.6406646372399797, "grad_norm": 2.109375, "learning_rate": 8.131600134027641e-06, "loss": 1.05795593, "memory(GiB)": 369.4, "step": 25255, "train_speed(iter/s)": 0.200863 }, { "acc": 0.74489813, "epoch": 0.6407914764079148, "grad_norm": 2.015625, "learning_rate": 8.13078259374461e-06, "loss": 0.99673634, "memory(GiB)": 369.4, "step": 25260, "train_speed(iter/s)": 0.200866 }, { "acc": 0.74354987, "epoch": 0.6409183155758498, "grad_norm": 2.5, "learning_rate": 8.129964915755781e-06, "loss": 0.99870853, "memory(GiB)": 369.4, "step": 25265, "train_speed(iter/s)": 0.20087 }, { "acc": 0.75705423, "epoch": 0.6410451547437849, "grad_norm": 1.796875, "learning_rate": 8.129147100097122e-06, "loss": 0.93815422, "memory(GiB)": 369.4, "step": 25270, "train_speed(iter/s)": 0.200873 }, { "acc": 0.74651713, "epoch": 0.64117199391172, "grad_norm": 1.96875, "learning_rate": 8.128329146804604e-06, "loss": 1.01533194, "memory(GiB)": 369.4, "step": 25275, "train_speed(iter/s)": 0.200873 }, { "acc": 0.74745979, "epoch": 0.641298833079655, "grad_norm": 2.0625, "learning_rate": 8.127511055914201e-06, "loss": 0.99429607, "memory(GiB)": 369.4, "step": 25280, "train_speed(iter/s)": 0.200871 }, { "acc": 0.76788654, "epoch": 0.6414256722475901, "grad_norm": 2.40625, "learning_rate": 8.1266928274619e-06, "loss": 0.96833687, "memory(GiB)": 369.4, "step": 25285, "train_speed(iter/s)": 0.200876 }, { "acc": 0.7468873, "epoch": 0.6415525114155252, "grad_norm": 2.03125, "learning_rate": 8.125874461483687e-06, "loss": 1.00603056, "memory(GiB)": 369.4, "step": 25290, "train_speed(iter/s)": 0.200882 }, { "acc": 0.72998343, "epoch": 0.6416793505834602, "grad_norm": 2.21875, "learning_rate": 8.12505595801556e-06, "loss": 1.05517159, "memory(GiB)": 369.4, "step": 25295, "train_speed(iter/s)": 0.200887 }, { "acc": 0.73658113, "epoch": 0.6418061897513953, "grad_norm": 2.921875, "learning_rate": 8.12423731709352e-06, "loss": 1.0911644, "memory(GiB)": 369.4, "step": 25300, "train_speed(iter/s)": 0.200892 }, { "acc": 0.73996496, "epoch": 0.6419330289193302, "grad_norm": 2.0625, "learning_rate": 8.123418538753573e-06, "loss": 0.99165602, "memory(GiB)": 369.4, "step": 25305, "train_speed(iter/s)": 0.200901 }, { "acc": 0.75453596, "epoch": 0.6420598680872653, "grad_norm": 2.53125, "learning_rate": 8.122599623031735e-06, "loss": 0.97006779, "memory(GiB)": 369.4, "step": 25310, "train_speed(iter/s)": 0.200905 }, { "acc": 0.73877602, "epoch": 0.6421867072552004, "grad_norm": 1.78125, "learning_rate": 8.121780569964024e-06, "loss": 0.98853636, "memory(GiB)": 369.4, "step": 25315, "train_speed(iter/s)": 0.200907 }, { "acc": 0.75294027, "epoch": 0.6423135464231354, "grad_norm": 2.25, "learning_rate": 8.120961379586466e-06, "loss": 1.00415478, "memory(GiB)": 369.4, "step": 25320, "train_speed(iter/s)": 0.200914 }, { "acc": 0.74493942, "epoch": 0.6424403855910705, "grad_norm": 2.1875, "learning_rate": 8.120142051935092e-06, "loss": 1.05740652, "memory(GiB)": 369.4, "step": 25325, "train_speed(iter/s)": 0.200921 }, { "acc": 0.72447538, "epoch": 0.6425672247590056, "grad_norm": 2.53125, "learning_rate": 8.11932258704594e-06, "loss": 1.07692776, "memory(GiB)": 369.4, "step": 25330, "train_speed(iter/s)": 0.200926 }, { "acc": 0.71995678, "epoch": 0.6426940639269406, "grad_norm": 2.71875, "learning_rate": 8.118502984955053e-06, "loss": 1.11883259, "memory(GiB)": 369.4, "step": 25335, "train_speed(iter/s)": 0.200929 }, { "acc": 0.74705057, "epoch": 0.6428209030948757, "grad_norm": 2.15625, "learning_rate": 8.117683245698483e-06, "loss": 0.99765654, "memory(GiB)": 369.4, "step": 25340, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74440827, "epoch": 0.6429477422628107, "grad_norm": 2.234375, "learning_rate": 8.116863369312283e-06, "loss": 1.05735645, "memory(GiB)": 369.4, "step": 25345, "train_speed(iter/s)": 0.200941 }, { "acc": 0.74859405, "epoch": 0.6430745814307458, "grad_norm": 2.265625, "learning_rate": 8.116043355832518e-06, "loss": 1.05636215, "memory(GiB)": 369.4, "step": 25350, "train_speed(iter/s)": 0.200949 }, { "acc": 0.7413209, "epoch": 0.6432014205986809, "grad_norm": 1.8359375, "learning_rate": 8.115223205295253e-06, "loss": 1.0175806, "memory(GiB)": 369.4, "step": 25355, "train_speed(iter/s)": 0.200954 }, { "acc": 0.74845829, "epoch": 0.6433282597666159, "grad_norm": 2.484375, "learning_rate": 8.114402917736563e-06, "loss": 1.01814823, "memory(GiB)": 369.4, "step": 25360, "train_speed(iter/s)": 0.20096 }, { "acc": 0.75141554, "epoch": 0.643455098934551, "grad_norm": 2.34375, "learning_rate": 8.113582493192529e-06, "loss": 0.96117096, "memory(GiB)": 369.4, "step": 25365, "train_speed(iter/s)": 0.200963 }, { "acc": 0.75840664, "epoch": 0.6435819381024861, "grad_norm": 2.296875, "learning_rate": 8.112761931699235e-06, "loss": 1.02792826, "memory(GiB)": 369.4, "step": 25370, "train_speed(iter/s)": 0.200969 }, { "acc": 0.73934269, "epoch": 0.6437087772704211, "grad_norm": 2.125, "learning_rate": 8.111941233292772e-06, "loss": 1.05126839, "memory(GiB)": 369.4, "step": 25375, "train_speed(iter/s)": 0.200976 }, { "acc": 0.75919428, "epoch": 0.6438356164383562, "grad_norm": 2.34375, "learning_rate": 8.111120398009243e-06, "loss": 0.98617001, "memory(GiB)": 369.4, "step": 25380, "train_speed(iter/s)": 0.200981 }, { "acc": 0.74082856, "epoch": 0.6439624556062912, "grad_norm": 1.9609375, "learning_rate": 8.110299425884745e-06, "loss": 1.0150775, "memory(GiB)": 369.4, "step": 25385, "train_speed(iter/s)": 0.200988 }, { "acc": 0.73149796, "epoch": 0.6440892947742263, "grad_norm": 1.96875, "learning_rate": 8.109478316955394e-06, "loss": 1.07188501, "memory(GiB)": 369.4, "step": 25390, "train_speed(iter/s)": 0.200994 }, { "acc": 0.74301796, "epoch": 0.6442161339421614, "grad_norm": 2.078125, "learning_rate": 8.108657071257304e-06, "loss": 1.02836561, "memory(GiB)": 369.4, "step": 25395, "train_speed(iter/s)": 0.200999 }, { "acc": 0.74168653, "epoch": 0.6443429731100964, "grad_norm": 2.265625, "learning_rate": 8.107835688826598e-06, "loss": 1.01830502, "memory(GiB)": 369.4, "step": 25400, "train_speed(iter/s)": 0.201004 }, { "acc": 0.75675263, "epoch": 0.6444698122780315, "grad_norm": 1.8828125, "learning_rate": 8.1070141696994e-06, "loss": 1.00914841, "memory(GiB)": 369.4, "step": 25405, "train_speed(iter/s)": 0.201011 }, { "acc": 0.75294075, "epoch": 0.6445966514459666, "grad_norm": 2.140625, "learning_rate": 8.106192513911849e-06, "loss": 0.99197416, "memory(GiB)": 369.4, "step": 25410, "train_speed(iter/s)": 0.201019 }, { "acc": 0.74153795, "epoch": 0.6447234906139016, "grad_norm": 1.9921875, "learning_rate": 8.105370721500083e-06, "loss": 1.09803886, "memory(GiB)": 369.4, "step": 25415, "train_speed(iter/s)": 0.201021 }, { "acc": 0.74467821, "epoch": 0.6448503297818367, "grad_norm": 2.15625, "learning_rate": 8.104548792500246e-06, "loss": 0.98852587, "memory(GiB)": 369.4, "step": 25420, "train_speed(iter/s)": 0.201023 }, { "acc": 0.74837089, "epoch": 0.6449771689497716, "grad_norm": 2.265625, "learning_rate": 8.103726726948495e-06, "loss": 0.99299707, "memory(GiB)": 369.4, "step": 25425, "train_speed(iter/s)": 0.20103 }, { "acc": 0.74244337, "epoch": 0.6451040081177067, "grad_norm": 1.859375, "learning_rate": 8.102904524880985e-06, "loss": 1.0139677, "memory(GiB)": 369.4, "step": 25430, "train_speed(iter/s)": 0.201037 }, { "acc": 0.7401031, "epoch": 0.6452308472856418, "grad_norm": 1.953125, "learning_rate": 8.10208218633388e-06, "loss": 1.00345955, "memory(GiB)": 369.4, "step": 25435, "train_speed(iter/s)": 0.201044 }, { "acc": 0.74748383, "epoch": 0.6453576864535768, "grad_norm": 2.28125, "learning_rate": 8.10125971134335e-06, "loss": 1.0037776, "memory(GiB)": 369.4, "step": 25440, "train_speed(iter/s)": 0.20105 }, { "acc": 0.75089951, "epoch": 0.6454845256215119, "grad_norm": 2.1875, "learning_rate": 8.100437099945572e-06, "loss": 0.99049797, "memory(GiB)": 369.4, "step": 25445, "train_speed(iter/s)": 0.201059 }, { "acc": 0.75248346, "epoch": 0.645611364789447, "grad_norm": 1.8984375, "learning_rate": 8.099614352176727e-06, "loss": 0.95847893, "memory(GiB)": 369.4, "step": 25450, "train_speed(iter/s)": 0.201066 }, { "acc": 0.74573832, "epoch": 0.645738203957382, "grad_norm": 2.328125, "learning_rate": 8.098791468073007e-06, "loss": 0.99346905, "memory(GiB)": 369.4, "step": 25455, "train_speed(iter/s)": 0.201071 }, { "acc": 0.74913797, "epoch": 0.6458650431253171, "grad_norm": 2.625, "learning_rate": 8.097968447670601e-06, "loss": 0.99985924, "memory(GiB)": 369.4, "step": 25460, "train_speed(iter/s)": 0.201079 }, { "acc": 0.76180182, "epoch": 0.6459918822932521, "grad_norm": 2.40625, "learning_rate": 8.09714529100571e-06, "loss": 0.97547846, "memory(GiB)": 369.4, "step": 25465, "train_speed(iter/s)": 0.201083 }, { "acc": 0.74493618, "epoch": 0.6461187214611872, "grad_norm": 2.25, "learning_rate": 8.096321998114545e-06, "loss": 1.01438065, "memory(GiB)": 369.4, "step": 25470, "train_speed(iter/s)": 0.201088 }, { "acc": 0.75934057, "epoch": 0.6462455606291223, "grad_norm": 2.609375, "learning_rate": 8.09549856903331e-06, "loss": 0.94754505, "memory(GiB)": 369.4, "step": 25475, "train_speed(iter/s)": 0.201094 }, { "acc": 0.73858981, "epoch": 0.6463723997970573, "grad_norm": 2.046875, "learning_rate": 8.094675003798232e-06, "loss": 1.03587933, "memory(GiB)": 369.4, "step": 25480, "train_speed(iter/s)": 0.201101 }, { "acc": 0.76028032, "epoch": 0.6464992389649924, "grad_norm": 2.515625, "learning_rate": 8.093851302445528e-06, "loss": 0.9675024, "memory(GiB)": 369.4, "step": 25485, "train_speed(iter/s)": 0.201104 }, { "acc": 0.74901514, "epoch": 0.6466260781329275, "grad_norm": 2.09375, "learning_rate": 8.093027465011431e-06, "loss": 0.9869709, "memory(GiB)": 369.4, "step": 25490, "train_speed(iter/s)": 0.201111 }, { "acc": 0.7574625, "epoch": 0.6467529173008625, "grad_norm": 2.03125, "learning_rate": 8.092203491532178e-06, "loss": 0.97576199, "memory(GiB)": 369.4, "step": 25495, "train_speed(iter/s)": 0.201116 }, { "acc": 0.74693956, "epoch": 0.6468797564687976, "grad_norm": 2.21875, "learning_rate": 8.091379382044009e-06, "loss": 1.04526987, "memory(GiB)": 369.4, "step": 25500, "train_speed(iter/s)": 0.201121 }, { "acc": 0.74500408, "epoch": 0.6470065956367326, "grad_norm": 2.40625, "learning_rate": 8.090555136583172e-06, "loss": 1.01962166, "memory(GiB)": 369.4, "step": 25505, "train_speed(iter/s)": 0.201126 }, { "acc": 0.74983773, "epoch": 0.6471334348046677, "grad_norm": 2.1875, "learning_rate": 8.089730755185921e-06, "loss": 1.00746212, "memory(GiB)": 369.4, "step": 25510, "train_speed(iter/s)": 0.201132 }, { "acc": 0.74283886, "epoch": 0.6472602739726028, "grad_norm": 2.296875, "learning_rate": 8.088906237888517e-06, "loss": 1.02286215, "memory(GiB)": 369.4, "step": 25515, "train_speed(iter/s)": 0.201133 }, { "acc": 0.74336491, "epoch": 0.6473871131405378, "grad_norm": 1.9140625, "learning_rate": 8.088081584727228e-06, "loss": 1.02201595, "memory(GiB)": 369.4, "step": 25520, "train_speed(iter/s)": 0.201138 }, { "acc": 0.74212494, "epoch": 0.6475139523084729, "grad_norm": 2.609375, "learning_rate": 8.08725679573832e-06, "loss": 0.98077221, "memory(GiB)": 369.4, "step": 25525, "train_speed(iter/s)": 0.201146 }, { "acc": 0.75476866, "epoch": 0.647640791476408, "grad_norm": 2.078125, "learning_rate": 8.086431870958078e-06, "loss": 0.99059258, "memory(GiB)": 369.4, "step": 25530, "train_speed(iter/s)": 0.201148 }, { "acc": 0.74127116, "epoch": 0.647767630644343, "grad_norm": 1.7578125, "learning_rate": 8.085606810422781e-06, "loss": 1.02728786, "memory(GiB)": 369.4, "step": 25535, "train_speed(iter/s)": 0.201155 }, { "acc": 0.73481898, "epoch": 0.647894469812278, "grad_norm": 2.265625, "learning_rate": 8.08478161416872e-06, "loss": 1.06044617, "memory(GiB)": 369.4, "step": 25540, "train_speed(iter/s)": 0.201163 }, { "acc": 0.74414449, "epoch": 0.648021308980213, "grad_norm": 2.03125, "learning_rate": 8.083956282232192e-06, "loss": 1.04142952, "memory(GiB)": 369.4, "step": 25545, "train_speed(iter/s)": 0.201171 }, { "acc": 0.74195738, "epoch": 0.6481481481481481, "grad_norm": 2.21875, "learning_rate": 8.083130814649498e-06, "loss": 0.99614735, "memory(GiB)": 369.4, "step": 25550, "train_speed(iter/s)": 0.201176 }, { "acc": 0.76188955, "epoch": 0.6482749873160832, "grad_norm": 2.25, "learning_rate": 8.082305211456943e-06, "loss": 0.96052294, "memory(GiB)": 369.4, "step": 25555, "train_speed(iter/s)": 0.201182 }, { "acc": 0.73535738, "epoch": 0.6484018264840182, "grad_norm": 1.8984375, "learning_rate": 8.081479472690846e-06, "loss": 1.05721817, "memory(GiB)": 369.4, "step": 25560, "train_speed(iter/s)": 0.201186 }, { "acc": 0.74056754, "epoch": 0.6485286656519533, "grad_norm": 1.71875, "learning_rate": 8.080653598387522e-06, "loss": 0.9808073, "memory(GiB)": 369.4, "step": 25565, "train_speed(iter/s)": 0.201192 }, { "acc": 0.7660995, "epoch": 0.6486555048198884, "grad_norm": 2.3125, "learning_rate": 8.0798275885833e-06, "loss": 0.98052664, "memory(GiB)": 369.4, "step": 25570, "train_speed(iter/s)": 0.201197 }, { "acc": 0.75150671, "epoch": 0.6487823439878234, "grad_norm": 2.203125, "learning_rate": 8.07900144331451e-06, "loss": 0.96880312, "memory(GiB)": 369.4, "step": 25575, "train_speed(iter/s)": 0.201203 }, { "acc": 0.74596448, "epoch": 0.6489091831557585, "grad_norm": 2.578125, "learning_rate": 8.07817516261749e-06, "loss": 1.00768499, "memory(GiB)": 369.4, "step": 25580, "train_speed(iter/s)": 0.201209 }, { "acc": 0.74000769, "epoch": 0.6490360223236935, "grad_norm": 1.8359375, "learning_rate": 8.077348746528583e-06, "loss": 0.98608856, "memory(GiB)": 369.4, "step": 25585, "train_speed(iter/s)": 0.201216 }, { "acc": 0.75335269, "epoch": 0.6491628614916286, "grad_norm": 1.9765625, "learning_rate": 8.076522195084139e-06, "loss": 0.9646492, "memory(GiB)": 369.4, "step": 25590, "train_speed(iter/s)": 0.20122 }, { "acc": 0.75453043, "epoch": 0.6492897006595637, "grad_norm": 1.9140625, "learning_rate": 8.075695508320512e-06, "loss": 1.01013718, "memory(GiB)": 369.4, "step": 25595, "train_speed(iter/s)": 0.201223 }, { "acc": 0.74618192, "epoch": 0.6494165398274987, "grad_norm": 2.5625, "learning_rate": 8.074868686274065e-06, "loss": 1.06768599, "memory(GiB)": 369.4, "step": 25600, "train_speed(iter/s)": 0.201229 }, { "acc": 0.75878625, "epoch": 0.6495433789954338, "grad_norm": 2.3125, "learning_rate": 8.074041728981166e-06, "loss": 0.95686035, "memory(GiB)": 369.4, "step": 25605, "train_speed(iter/s)": 0.201233 }, { "acc": 0.7463666, "epoch": 0.6496702181633689, "grad_norm": 2.03125, "learning_rate": 8.073214636478186e-06, "loss": 1.00092754, "memory(GiB)": 369.4, "step": 25610, "train_speed(iter/s)": 0.201238 }, { "acc": 0.74404602, "epoch": 0.6497970573313039, "grad_norm": 2.9375, "learning_rate": 8.072387408801506e-06, "loss": 1.07703381, "memory(GiB)": 369.4, "step": 25615, "train_speed(iter/s)": 0.201243 }, { "acc": 0.74263172, "epoch": 0.649923896499239, "grad_norm": 1.8203125, "learning_rate": 8.07156004598751e-06, "loss": 0.99676819, "memory(GiB)": 369.4, "step": 25620, "train_speed(iter/s)": 0.201248 }, { "acc": 0.75003014, "epoch": 0.650050735667174, "grad_norm": 2.0, "learning_rate": 8.07073254807259e-06, "loss": 1.01833668, "memory(GiB)": 369.4, "step": 25625, "train_speed(iter/s)": 0.201254 }, { "acc": 0.74470053, "epoch": 0.6501775748351091, "grad_norm": 1.984375, "learning_rate": 8.069904915093144e-06, "loss": 1.02054901, "memory(GiB)": 369.4, "step": 25630, "train_speed(iter/s)": 0.201261 }, { "acc": 0.73972697, "epoch": 0.6503044140030442, "grad_norm": 1.734375, "learning_rate": 8.069077147085571e-06, "loss": 0.9811698, "memory(GiB)": 369.4, "step": 25635, "train_speed(iter/s)": 0.201265 }, { "acc": 0.76143236, "epoch": 0.6504312531709792, "grad_norm": 2.09375, "learning_rate": 8.068249244086283e-06, "loss": 0.97978706, "memory(GiB)": 369.4, "step": 25640, "train_speed(iter/s)": 0.201272 }, { "acc": 0.73448606, "epoch": 0.6505580923389143, "grad_norm": 2.296875, "learning_rate": 8.067421206131696e-06, "loss": 1.06532135, "memory(GiB)": 369.4, "step": 25645, "train_speed(iter/s)": 0.201277 }, { "acc": 0.74652205, "epoch": 0.6506849315068494, "grad_norm": 2.375, "learning_rate": 8.06659303325823e-06, "loss": 0.98827848, "memory(GiB)": 369.4, "step": 25650, "train_speed(iter/s)": 0.201282 }, { "acc": 0.74698315, "epoch": 0.6508117706747844, "grad_norm": 2.1875, "learning_rate": 8.06576472550231e-06, "loss": 1.03368359, "memory(GiB)": 369.4, "step": 25655, "train_speed(iter/s)": 0.201288 }, { "acc": 0.74212523, "epoch": 0.6509386098427195, "grad_norm": 2.28125, "learning_rate": 8.064936282900368e-06, "loss": 1.00601425, "memory(GiB)": 369.4, "step": 25660, "train_speed(iter/s)": 0.201293 }, { "acc": 0.73239746, "epoch": 0.6510654490106544, "grad_norm": 2.359375, "learning_rate": 8.064107705488846e-06, "loss": 1.02146339, "memory(GiB)": 369.4, "step": 25665, "train_speed(iter/s)": 0.201299 }, { "acc": 0.75370302, "epoch": 0.6511922881785895, "grad_norm": 2.3125, "learning_rate": 8.063278993304188e-06, "loss": 0.97682409, "memory(GiB)": 369.4, "step": 25670, "train_speed(iter/s)": 0.201305 }, { "acc": 0.74160213, "epoch": 0.6513191273465246, "grad_norm": 2.40625, "learning_rate": 8.06245014638284e-06, "loss": 1.00169067, "memory(GiB)": 369.4, "step": 25675, "train_speed(iter/s)": 0.201306 }, { "acc": 0.75318823, "epoch": 0.6514459665144596, "grad_norm": 2.5625, "learning_rate": 8.061621164761266e-06, "loss": 0.9478837, "memory(GiB)": 369.4, "step": 25680, "train_speed(iter/s)": 0.20131 }, { "acc": 0.74233398, "epoch": 0.6515728056823947, "grad_norm": 2.109375, "learning_rate": 8.06079204847592e-06, "loss": 0.98795977, "memory(GiB)": 369.4, "step": 25685, "train_speed(iter/s)": 0.20131 }, { "acc": 0.75198946, "epoch": 0.6516996448503298, "grad_norm": 1.90625, "learning_rate": 8.059962797563277e-06, "loss": 0.97646103, "memory(GiB)": 369.4, "step": 25690, "train_speed(iter/s)": 0.201318 }, { "acc": 0.75177455, "epoch": 0.6518264840182648, "grad_norm": 1.8359375, "learning_rate": 8.059133412059808e-06, "loss": 0.97208796, "memory(GiB)": 369.4, "step": 25695, "train_speed(iter/s)": 0.201324 }, { "acc": 0.75066872, "epoch": 0.6519533231861999, "grad_norm": 2.3125, "learning_rate": 8.058303892001993e-06, "loss": 0.9941164, "memory(GiB)": 369.4, "step": 25700, "train_speed(iter/s)": 0.201325 }, { "acc": 0.73765345, "epoch": 0.6520801623541349, "grad_norm": 2.03125, "learning_rate": 8.057474237426318e-06, "loss": 0.98123474, "memory(GiB)": 369.4, "step": 25705, "train_speed(iter/s)": 0.201333 }, { "acc": 0.74252334, "epoch": 0.65220700152207, "grad_norm": 2.28125, "learning_rate": 8.056644448369275e-06, "loss": 1.07736969, "memory(GiB)": 369.4, "step": 25710, "train_speed(iter/s)": 0.201339 }, { "acc": 0.74507499, "epoch": 0.6523338406900051, "grad_norm": 1.9296875, "learning_rate": 8.055814524867364e-06, "loss": 1.0370472, "memory(GiB)": 369.4, "step": 25715, "train_speed(iter/s)": 0.201345 }, { "acc": 0.73675852, "epoch": 0.6524606798579401, "grad_norm": 2.28125, "learning_rate": 8.054984466957085e-06, "loss": 1.08070946, "memory(GiB)": 369.4, "step": 25720, "train_speed(iter/s)": 0.201352 }, { "acc": 0.76549091, "epoch": 0.6525875190258752, "grad_norm": 2.125, "learning_rate": 8.05415427467495e-06, "loss": 0.88928909, "memory(GiB)": 369.4, "step": 25725, "train_speed(iter/s)": 0.201359 }, { "acc": 0.75462322, "epoch": 0.6527143581938103, "grad_norm": 1.984375, "learning_rate": 8.053323948057477e-06, "loss": 1.00843334, "memory(GiB)": 369.4, "step": 25730, "train_speed(iter/s)": 0.201366 }, { "acc": 0.74298496, "epoch": 0.6528411973617453, "grad_norm": 2.21875, "learning_rate": 8.052493487141183e-06, "loss": 0.99805918, "memory(GiB)": 369.4, "step": 25735, "train_speed(iter/s)": 0.20137 }, { "acc": 0.72842879, "epoch": 0.6529680365296804, "grad_norm": 2.015625, "learning_rate": 8.051662891962594e-06, "loss": 1.03302774, "memory(GiB)": 369.4, "step": 25740, "train_speed(iter/s)": 0.201375 }, { "acc": 0.74435587, "epoch": 0.6530948756976154, "grad_norm": 1.8984375, "learning_rate": 8.05083216255825e-06, "loss": 1.03268814, "memory(GiB)": 369.4, "step": 25745, "train_speed(iter/s)": 0.201377 }, { "acc": 0.75300331, "epoch": 0.6532217148655505, "grad_norm": 1.8984375, "learning_rate": 8.050001298964685e-06, "loss": 1.06785622, "memory(GiB)": 369.4, "step": 25750, "train_speed(iter/s)": 0.201379 }, { "acc": 0.75359488, "epoch": 0.6533485540334856, "grad_norm": 2.296875, "learning_rate": 8.049170301218445e-06, "loss": 1.02664776, "memory(GiB)": 369.4, "step": 25755, "train_speed(iter/s)": 0.201385 }, { "acc": 0.75693426, "epoch": 0.6534753932014206, "grad_norm": 2.46875, "learning_rate": 8.048339169356085e-06, "loss": 1.01990728, "memory(GiB)": 369.4, "step": 25760, "train_speed(iter/s)": 0.201392 }, { "acc": 0.75614457, "epoch": 0.6536022323693557, "grad_norm": 1.8984375, "learning_rate": 8.047507903414155e-06, "loss": 0.94472513, "memory(GiB)": 369.4, "step": 25765, "train_speed(iter/s)": 0.201397 }, { "acc": 0.76192303, "epoch": 0.6537290715372908, "grad_norm": 2.0625, "learning_rate": 8.046676503429222e-06, "loss": 0.98495636, "memory(GiB)": 369.4, "step": 25770, "train_speed(iter/s)": 0.201403 }, { "acc": 0.75143905, "epoch": 0.6538559107052258, "grad_norm": 2.265625, "learning_rate": 8.045844969437855e-06, "loss": 0.99822941, "memory(GiB)": 369.4, "step": 25775, "train_speed(iter/s)": 0.201409 }, { "acc": 0.73327837, "epoch": 0.6539827498731609, "grad_norm": 2.171875, "learning_rate": 8.045013301476625e-06, "loss": 1.02659988, "memory(GiB)": 369.4, "step": 25780, "train_speed(iter/s)": 0.201414 }, { "acc": 0.74999905, "epoch": 0.6541095890410958, "grad_norm": 1.8203125, "learning_rate": 8.044181499582117e-06, "loss": 1.01552048, "memory(GiB)": 369.4, "step": 25785, "train_speed(iter/s)": 0.201422 }, { "acc": 0.75360403, "epoch": 0.6542364282090309, "grad_norm": 2.296875, "learning_rate": 8.043349563790917e-06, "loss": 1.02453823, "memory(GiB)": 369.4, "step": 25790, "train_speed(iter/s)": 0.201426 }, { "acc": 0.75155993, "epoch": 0.654363267376966, "grad_norm": 2.21875, "learning_rate": 8.042517494139612e-06, "loss": 1.05082321, "memory(GiB)": 369.4, "step": 25795, "train_speed(iter/s)": 0.201435 }, { "acc": 0.74545937, "epoch": 0.654490106544901, "grad_norm": 1.8515625, "learning_rate": 8.041685290664806e-06, "loss": 0.95632896, "memory(GiB)": 369.4, "step": 25800, "train_speed(iter/s)": 0.201439 }, { "acc": 0.74585705, "epoch": 0.6546169457128361, "grad_norm": 2.109375, "learning_rate": 8.0408529534031e-06, "loss": 0.98624258, "memory(GiB)": 369.4, "step": 25805, "train_speed(iter/s)": 0.201447 }, { "acc": 0.74844894, "epoch": 0.6547437848807712, "grad_norm": 1.7578125, "learning_rate": 8.040020482391105e-06, "loss": 0.99251928, "memory(GiB)": 369.4, "step": 25810, "train_speed(iter/s)": 0.201455 }, { "acc": 0.77407804, "epoch": 0.6548706240487062, "grad_norm": 1.9453125, "learning_rate": 8.039187877665435e-06, "loss": 0.92592993, "memory(GiB)": 369.4, "step": 25815, "train_speed(iter/s)": 0.201462 }, { "acc": 0.75632982, "epoch": 0.6549974632166413, "grad_norm": 2.796875, "learning_rate": 8.038355139262716e-06, "loss": 0.97435017, "memory(GiB)": 369.4, "step": 25820, "train_speed(iter/s)": 0.201466 }, { "acc": 0.76045475, "epoch": 0.6551243023845763, "grad_norm": 2.15625, "learning_rate": 8.037522267219571e-06, "loss": 0.97769346, "memory(GiB)": 369.4, "step": 25825, "train_speed(iter/s)": 0.201469 }, { "acc": 0.73252578, "epoch": 0.6552511415525114, "grad_norm": 2.171875, "learning_rate": 8.036689261572636e-06, "loss": 0.99602203, "memory(GiB)": 369.4, "step": 25830, "train_speed(iter/s)": 0.201476 }, { "acc": 0.74079976, "epoch": 0.6553779807204465, "grad_norm": 2.609375, "learning_rate": 8.035856122358548e-06, "loss": 1.04164419, "memory(GiB)": 369.4, "step": 25835, "train_speed(iter/s)": 0.201484 }, { "acc": 0.74821219, "epoch": 0.6555048198883815, "grad_norm": 1.953125, "learning_rate": 8.035022849613954e-06, "loss": 1.02013607, "memory(GiB)": 369.4, "step": 25840, "train_speed(iter/s)": 0.201489 }, { "acc": 0.74905224, "epoch": 0.6556316590563166, "grad_norm": 2.40625, "learning_rate": 8.034189443375505e-06, "loss": 1.01453323, "memory(GiB)": 369.4, "step": 25845, "train_speed(iter/s)": 0.201494 }, { "acc": 0.74130926, "epoch": 0.6557584982242517, "grad_norm": 2.015625, "learning_rate": 8.033355903679858e-06, "loss": 0.96845951, "memory(GiB)": 369.4, "step": 25850, "train_speed(iter/s)": 0.201503 }, { "acc": 0.75233192, "epoch": 0.6558853373921867, "grad_norm": 2.375, "learning_rate": 8.032522230563676e-06, "loss": 1.03006725, "memory(GiB)": 369.4, "step": 25855, "train_speed(iter/s)": 0.20151 }, { "acc": 0.73105354, "epoch": 0.6560121765601218, "grad_norm": 2.359375, "learning_rate": 8.031688424063625e-06, "loss": 1.00935745, "memory(GiB)": 369.4, "step": 25860, "train_speed(iter/s)": 0.201512 }, { "acc": 0.7506001, "epoch": 0.6561390157280568, "grad_norm": 2.25, "learning_rate": 8.030854484216381e-06, "loss": 1.02319298, "memory(GiB)": 369.4, "step": 25865, "train_speed(iter/s)": 0.201519 }, { "acc": 0.74706445, "epoch": 0.6562658548959919, "grad_norm": 1.8515625, "learning_rate": 8.030020411058627e-06, "loss": 0.99895496, "memory(GiB)": 369.4, "step": 25870, "train_speed(iter/s)": 0.201525 }, { "acc": 0.75051384, "epoch": 0.656392694063927, "grad_norm": 2.09375, "learning_rate": 8.029186204627049e-06, "loss": 0.97161694, "memory(GiB)": 369.4, "step": 25875, "train_speed(iter/s)": 0.201529 }, { "acc": 0.74495106, "epoch": 0.656519533231862, "grad_norm": 2.703125, "learning_rate": 8.028351864958335e-06, "loss": 0.99420662, "memory(GiB)": 369.4, "step": 25880, "train_speed(iter/s)": 0.201533 }, { "acc": 0.76058283, "epoch": 0.6566463723997971, "grad_norm": 2.125, "learning_rate": 8.027517392089185e-06, "loss": 0.9720705, "memory(GiB)": 369.4, "step": 25885, "train_speed(iter/s)": 0.201539 }, { "acc": 0.75552273, "epoch": 0.6567732115677322, "grad_norm": 2.359375, "learning_rate": 8.026682786056304e-06, "loss": 1.04396458, "memory(GiB)": 369.4, "step": 25890, "train_speed(iter/s)": 0.201545 }, { "acc": 0.73438773, "epoch": 0.6569000507356672, "grad_norm": 1.8203125, "learning_rate": 8.025848046896401e-06, "loss": 1.0625989, "memory(GiB)": 369.4, "step": 25895, "train_speed(iter/s)": 0.201549 }, { "acc": 0.74941969, "epoch": 0.6570268899036023, "grad_norm": 2.15625, "learning_rate": 8.02501317464619e-06, "loss": 1.00712147, "memory(GiB)": 369.4, "step": 25900, "train_speed(iter/s)": 0.201557 }, { "acc": 0.75229034, "epoch": 0.6571537290715372, "grad_norm": 2.890625, "learning_rate": 8.024178169342396e-06, "loss": 1.03695221, "memory(GiB)": 369.4, "step": 25905, "train_speed(iter/s)": 0.201563 }, { "acc": 0.75140424, "epoch": 0.6572805682394723, "grad_norm": 2.09375, "learning_rate": 8.023343031021744e-06, "loss": 1.00779781, "memory(GiB)": 369.4, "step": 25910, "train_speed(iter/s)": 0.201565 }, { "acc": 0.73999004, "epoch": 0.6574074074074074, "grad_norm": 1.9921875, "learning_rate": 8.022507759720966e-06, "loss": 1.00888271, "memory(GiB)": 369.4, "step": 25915, "train_speed(iter/s)": 0.201573 }, { "acc": 0.74001646, "epoch": 0.6575342465753424, "grad_norm": 2.15625, "learning_rate": 8.021672355476802e-06, "loss": 1.07879963, "memory(GiB)": 369.4, "step": 25920, "train_speed(iter/s)": 0.201578 }, { "acc": 0.74793692, "epoch": 0.6576610857432775, "grad_norm": 2.078125, "learning_rate": 8.020836818325997e-06, "loss": 0.99961987, "memory(GiB)": 369.4, "step": 25925, "train_speed(iter/s)": 0.201584 }, { "acc": 0.75060067, "epoch": 0.6577879249112126, "grad_norm": 2.640625, "learning_rate": 8.020001148305304e-06, "loss": 0.96782494, "memory(GiB)": 369.4, "step": 25930, "train_speed(iter/s)": 0.20159 }, { "acc": 0.7605978, "epoch": 0.6579147640791476, "grad_norm": 2.25, "learning_rate": 8.019165345451475e-06, "loss": 0.95436115, "memory(GiB)": 369.4, "step": 25935, "train_speed(iter/s)": 0.201596 }, { "acc": 0.75126772, "epoch": 0.6580416032470827, "grad_norm": 2.6875, "learning_rate": 8.018329409801276e-06, "loss": 0.98373337, "memory(GiB)": 369.4, "step": 25940, "train_speed(iter/s)": 0.201602 }, { "acc": 0.75624437, "epoch": 0.6581684424150177, "grad_norm": 1.9453125, "learning_rate": 8.017493341391471e-06, "loss": 0.94460163, "memory(GiB)": 369.4, "step": 25945, "train_speed(iter/s)": 0.201606 }, { "acc": 0.74112139, "epoch": 0.6582952815829528, "grad_norm": 2.109375, "learning_rate": 8.016657140258839e-06, "loss": 0.98373718, "memory(GiB)": 369.4, "step": 25950, "train_speed(iter/s)": 0.201612 }, { "acc": 0.752, "epoch": 0.6584221207508879, "grad_norm": 2.390625, "learning_rate": 8.015820806440157e-06, "loss": 1.01236496, "memory(GiB)": 369.4, "step": 25955, "train_speed(iter/s)": 0.201619 }, { "acc": 0.74017973, "epoch": 0.6585489599188229, "grad_norm": 2.109375, "learning_rate": 8.014984339972211e-06, "loss": 1.00290661, "memory(GiB)": 369.4, "step": 25960, "train_speed(iter/s)": 0.201622 }, { "acc": 0.74596796, "epoch": 0.658675799086758, "grad_norm": 2.09375, "learning_rate": 8.014147740891793e-06, "loss": 1.00811977, "memory(GiB)": 369.4, "step": 25965, "train_speed(iter/s)": 0.201626 }, { "acc": 0.75712252, "epoch": 0.6588026382546931, "grad_norm": 2.46875, "learning_rate": 8.0133110092357e-06, "loss": 0.9700984, "memory(GiB)": 369.4, "step": 25970, "train_speed(iter/s)": 0.201633 }, { "acc": 0.74691296, "epoch": 0.6589294774226281, "grad_norm": 2.328125, "learning_rate": 8.012474145040737e-06, "loss": 1.07076511, "memory(GiB)": 369.4, "step": 25975, "train_speed(iter/s)": 0.201639 }, { "acc": 0.74554396, "epoch": 0.6590563165905632, "grad_norm": 2.09375, "learning_rate": 8.01163714834371e-06, "loss": 1.04043131, "memory(GiB)": 369.4, "step": 25980, "train_speed(iter/s)": 0.201643 }, { "acc": 0.74578114, "epoch": 0.6591831557584982, "grad_norm": 2.765625, "learning_rate": 8.010800019181433e-06, "loss": 1.04702988, "memory(GiB)": 369.4, "step": 25985, "train_speed(iter/s)": 0.20165 }, { "acc": 0.73924942, "epoch": 0.6593099949264333, "grad_norm": 2.109375, "learning_rate": 8.009962757590732e-06, "loss": 1.02036362, "memory(GiB)": 369.4, "step": 25990, "train_speed(iter/s)": 0.201653 }, { "acc": 0.7525116, "epoch": 0.6594368340943684, "grad_norm": 1.96875, "learning_rate": 8.00912536360843e-06, "loss": 0.99195538, "memory(GiB)": 369.4, "step": 25995, "train_speed(iter/s)": 0.201659 }, { "acc": 0.74719372, "epoch": 0.6595636732623034, "grad_norm": 2.40625, "learning_rate": 8.008287837271359e-06, "loss": 1.0003727, "memory(GiB)": 369.4, "step": 26000, "train_speed(iter/s)": 0.201661 }, { "epoch": 0.6595636732623034, "eval_acc": 0.7362246451797799, "eval_loss": 0.9779871106147766, "eval_runtime": 385.0999, "eval_samples_per_second": 16.541, "eval_steps_per_second": 8.271, "step": 26000 }, { "acc": 0.75276546, "epoch": 0.6596905124302385, "grad_norm": 2.3125, "learning_rate": 8.007450178616356e-06, "loss": 0.98504734, "memory(GiB)": 369.4, "step": 26005, "train_speed(iter/s)": 0.200561 }, { "acc": 0.74172645, "epoch": 0.6598173515981736, "grad_norm": 2.265625, "learning_rate": 8.00661238768027e-06, "loss": 0.99339695, "memory(GiB)": 369.4, "step": 26010, "train_speed(iter/s)": 0.200562 }, { "acc": 0.7503828, "epoch": 0.6599441907661086, "grad_norm": 2.4375, "learning_rate": 8.005774464499947e-06, "loss": 0.97472486, "memory(GiB)": 369.4, "step": 26015, "train_speed(iter/s)": 0.200569 }, { "acc": 0.73921394, "epoch": 0.6600710299340437, "grad_norm": 1.9921875, "learning_rate": 8.004936409112243e-06, "loss": 1.011761, "memory(GiB)": 369.4, "step": 26020, "train_speed(iter/s)": 0.200574 }, { "acc": 0.74570484, "epoch": 0.6601978691019786, "grad_norm": 2.109375, "learning_rate": 8.004098221554018e-06, "loss": 1.01027203, "memory(GiB)": 369.4, "step": 26025, "train_speed(iter/s)": 0.200572 }, { "acc": 0.75150661, "epoch": 0.6603247082699137, "grad_norm": 1.890625, "learning_rate": 8.003259901862143e-06, "loss": 0.98757343, "memory(GiB)": 369.4, "step": 26030, "train_speed(iter/s)": 0.200577 }, { "acc": 0.73990078, "epoch": 0.6604515474378488, "grad_norm": 2.1875, "learning_rate": 8.002421450073488e-06, "loss": 1.00117807, "memory(GiB)": 369.4, "step": 26035, "train_speed(iter/s)": 0.200581 }, { "acc": 0.74888144, "epoch": 0.6605783866057838, "grad_norm": 2.5, "learning_rate": 8.001582866224932e-06, "loss": 0.9620575, "memory(GiB)": 369.4, "step": 26040, "train_speed(iter/s)": 0.200588 }, { "acc": 0.7540266, "epoch": 0.6607052257737189, "grad_norm": 2.1875, "learning_rate": 8.000744150353362e-06, "loss": 1.02751846, "memory(GiB)": 369.4, "step": 26045, "train_speed(iter/s)": 0.200593 }, { "acc": 0.74812021, "epoch": 0.660832064941654, "grad_norm": 2.203125, "learning_rate": 7.999905302495667e-06, "loss": 1.00657406, "memory(GiB)": 369.4, "step": 26050, "train_speed(iter/s)": 0.200597 }, { "acc": 0.7495821, "epoch": 0.660958904109589, "grad_norm": 2.296875, "learning_rate": 7.999066322688743e-06, "loss": 0.98510494, "memory(GiB)": 369.4, "step": 26055, "train_speed(iter/s)": 0.200605 }, { "acc": 0.74694457, "epoch": 0.6610857432775241, "grad_norm": 2.8125, "learning_rate": 7.998227210969491e-06, "loss": 1.05575809, "memory(GiB)": 369.4, "step": 26060, "train_speed(iter/s)": 0.200609 }, { "acc": 0.76058168, "epoch": 0.6612125824454591, "grad_norm": 2.125, "learning_rate": 7.997387967374821e-06, "loss": 0.98905611, "memory(GiB)": 369.4, "step": 26065, "train_speed(iter/s)": 0.200615 }, { "acc": 0.74397292, "epoch": 0.6613394216133942, "grad_norm": 2.546875, "learning_rate": 7.996548591941647e-06, "loss": 1.03234625, "memory(GiB)": 369.4, "step": 26070, "train_speed(iter/s)": 0.200621 }, { "acc": 0.75138578, "epoch": 0.6614662607813293, "grad_norm": 2.578125, "learning_rate": 7.995709084706884e-06, "loss": 0.95263119, "memory(GiB)": 369.4, "step": 26075, "train_speed(iter/s)": 0.200628 }, { "acc": 0.7379961, "epoch": 0.6615930999492643, "grad_norm": 2.0625, "learning_rate": 7.994869445707463e-06, "loss": 1.01625481, "memory(GiB)": 369.4, "step": 26080, "train_speed(iter/s)": 0.200632 }, { "acc": 0.74793549, "epoch": 0.6617199391171994, "grad_norm": 2.15625, "learning_rate": 7.994029674980313e-06, "loss": 0.97914009, "memory(GiB)": 369.4, "step": 26085, "train_speed(iter/s)": 0.200635 }, { "acc": 0.7519794, "epoch": 0.6618467782851345, "grad_norm": 1.9453125, "learning_rate": 7.99318977256237e-06, "loss": 0.98136139, "memory(GiB)": 369.4, "step": 26090, "train_speed(iter/s)": 0.200642 }, { "acc": 0.74207191, "epoch": 0.6619736174530695, "grad_norm": 1.8984375, "learning_rate": 7.992349738490576e-06, "loss": 1.02003756, "memory(GiB)": 369.4, "step": 26095, "train_speed(iter/s)": 0.200647 }, { "acc": 0.76016269, "epoch": 0.6621004566210046, "grad_norm": 2.4375, "learning_rate": 7.991509572801883e-06, "loss": 0.97972374, "memory(GiB)": 369.4, "step": 26100, "train_speed(iter/s)": 0.200649 }, { "acc": 0.74279337, "epoch": 0.6622272957889396, "grad_norm": 1.921875, "learning_rate": 7.990669275533241e-06, "loss": 1.00483961, "memory(GiB)": 369.4, "step": 26105, "train_speed(iter/s)": 0.200655 }, { "acc": 0.73742342, "epoch": 0.6623541349568747, "grad_norm": 2.1875, "learning_rate": 7.989828846721613e-06, "loss": 1.02899218, "memory(GiB)": 369.4, "step": 26110, "train_speed(iter/s)": 0.200661 }, { "acc": 0.74483738, "epoch": 0.6624809741248098, "grad_norm": 2.015625, "learning_rate": 7.98898828640396e-06, "loss": 0.98144741, "memory(GiB)": 369.4, "step": 26115, "train_speed(iter/s)": 0.200669 }, { "acc": 0.75844574, "epoch": 0.6626078132927448, "grad_norm": 2.3125, "learning_rate": 7.988147594617262e-06, "loss": 0.98896351, "memory(GiB)": 369.4, "step": 26120, "train_speed(iter/s)": 0.200674 }, { "acc": 0.74969673, "epoch": 0.6627346524606799, "grad_norm": 1.984375, "learning_rate": 7.987306771398489e-06, "loss": 0.97008801, "memory(GiB)": 369.4, "step": 26125, "train_speed(iter/s)": 0.200681 }, { "acc": 0.75562544, "epoch": 0.662861491628615, "grad_norm": 2.21875, "learning_rate": 7.986465816784628e-06, "loss": 1.01661234, "memory(GiB)": 369.4, "step": 26130, "train_speed(iter/s)": 0.200685 }, { "acc": 0.745854, "epoch": 0.66298833079655, "grad_norm": 1.890625, "learning_rate": 7.985624730812667e-06, "loss": 1.00228138, "memory(GiB)": 369.4, "step": 26135, "train_speed(iter/s)": 0.200691 }, { "acc": 0.73725529, "epoch": 0.663115169964485, "grad_norm": 2.40625, "learning_rate": 7.984783513519601e-06, "loss": 1.02612925, "memory(GiB)": 369.4, "step": 26140, "train_speed(iter/s)": 0.200697 }, { "acc": 0.7337976, "epoch": 0.66324200913242, "grad_norm": 2.546875, "learning_rate": 7.98394216494243e-06, "loss": 1.0908536, "memory(GiB)": 369.4, "step": 26145, "train_speed(iter/s)": 0.200702 }, { "acc": 0.74710665, "epoch": 0.6633688483003551, "grad_norm": 2.359375, "learning_rate": 7.983100685118157e-06, "loss": 1.02606773, "memory(GiB)": 369.4, "step": 26150, "train_speed(iter/s)": 0.200704 }, { "acc": 0.75658598, "epoch": 0.6634956874682902, "grad_norm": 1.8359375, "learning_rate": 7.9822590740838e-06, "loss": 0.93890905, "memory(GiB)": 369.4, "step": 26155, "train_speed(iter/s)": 0.200712 }, { "acc": 0.74894142, "epoch": 0.6636225266362252, "grad_norm": 1.9453125, "learning_rate": 7.981417331876373e-06, "loss": 0.95372658, "memory(GiB)": 369.4, "step": 26160, "train_speed(iter/s)": 0.200717 }, { "acc": 0.74715748, "epoch": 0.6637493658041603, "grad_norm": 1.96875, "learning_rate": 7.980575458532901e-06, "loss": 1.04064598, "memory(GiB)": 369.4, "step": 26165, "train_speed(iter/s)": 0.200722 }, { "acc": 0.74840612, "epoch": 0.6638762049720954, "grad_norm": 2.046875, "learning_rate": 7.979733454090415e-06, "loss": 0.99794426, "memory(GiB)": 369.4, "step": 26170, "train_speed(iter/s)": 0.200729 }, { "acc": 0.73811359, "epoch": 0.6640030441400304, "grad_norm": 2.125, "learning_rate": 7.978891318585947e-06, "loss": 0.983529, "memory(GiB)": 369.4, "step": 26175, "train_speed(iter/s)": 0.200731 }, { "acc": 0.74717493, "epoch": 0.6641298833079655, "grad_norm": 2.515625, "learning_rate": 7.978049052056537e-06, "loss": 1.02892218, "memory(GiB)": 369.4, "step": 26180, "train_speed(iter/s)": 0.200739 }, { "acc": 0.75164642, "epoch": 0.6642567224759005, "grad_norm": 2.28125, "learning_rate": 7.977206654539235e-06, "loss": 0.99379854, "memory(GiB)": 369.4, "step": 26185, "train_speed(iter/s)": 0.200747 }, { "acc": 0.73827014, "epoch": 0.6643835616438356, "grad_norm": 2.203125, "learning_rate": 7.976364126071092e-06, "loss": 1.04106159, "memory(GiB)": 369.4, "step": 26190, "train_speed(iter/s)": 0.200756 }, { "acc": 0.7240695, "epoch": 0.6645104008117707, "grad_norm": 2.28125, "learning_rate": 7.975521466689166e-06, "loss": 1.07699223, "memory(GiB)": 369.4, "step": 26195, "train_speed(iter/s)": 0.200763 }, { "acc": 0.74162951, "epoch": 0.6646372399797057, "grad_norm": 2.625, "learning_rate": 7.974678676430523e-06, "loss": 0.99809923, "memory(GiB)": 369.4, "step": 26200, "train_speed(iter/s)": 0.200767 }, { "acc": 0.75479536, "epoch": 0.6647640791476408, "grad_norm": 1.875, "learning_rate": 7.97383575533223e-06, "loss": 0.97624531, "memory(GiB)": 369.4, "step": 26205, "train_speed(iter/s)": 0.200774 }, { "acc": 0.73041506, "epoch": 0.6648909183155759, "grad_norm": 2.09375, "learning_rate": 7.972992703431362e-06, "loss": 1.01645832, "memory(GiB)": 369.4, "step": 26210, "train_speed(iter/s)": 0.200776 }, { "acc": 0.74200726, "epoch": 0.6650177574835109, "grad_norm": 2.359375, "learning_rate": 7.972149520765e-06, "loss": 1.03741741, "memory(GiB)": 369.4, "step": 26215, "train_speed(iter/s)": 0.200781 }, { "acc": 0.74021301, "epoch": 0.665144596651446, "grad_norm": 2.171875, "learning_rate": 7.971306207370236e-06, "loss": 1.00556564, "memory(GiB)": 369.4, "step": 26220, "train_speed(iter/s)": 0.200789 }, { "acc": 0.75051856, "epoch": 0.665271435819381, "grad_norm": 2.125, "learning_rate": 7.970462763284157e-06, "loss": 1.00951214, "memory(GiB)": 369.4, "step": 26225, "train_speed(iter/s)": 0.200796 }, { "acc": 0.74539366, "epoch": 0.6653982749873161, "grad_norm": 2.34375, "learning_rate": 7.969619188543865e-06, "loss": 1.05196037, "memory(GiB)": 369.4, "step": 26230, "train_speed(iter/s)": 0.200802 }, { "acc": 0.74431429, "epoch": 0.6655251141552512, "grad_norm": 2.53125, "learning_rate": 7.968775483186462e-06, "loss": 1.01570129, "memory(GiB)": 369.4, "step": 26235, "train_speed(iter/s)": 0.200809 }, { "acc": 0.76074667, "epoch": 0.6656519533231862, "grad_norm": 2.1875, "learning_rate": 7.967931647249058e-06, "loss": 0.95511684, "memory(GiB)": 369.4, "step": 26240, "train_speed(iter/s)": 0.200814 }, { "acc": 0.73687286, "epoch": 0.6657787924911213, "grad_norm": 2.515625, "learning_rate": 7.967087680768768e-06, "loss": 1.02281551, "memory(GiB)": 369.4, "step": 26245, "train_speed(iter/s)": 0.200818 }, { "acc": 0.75215139, "epoch": 0.6659056316590564, "grad_norm": 2.171875, "learning_rate": 7.966243583782718e-06, "loss": 1.00393448, "memory(GiB)": 369.4, "step": 26250, "train_speed(iter/s)": 0.200824 }, { "acc": 0.73875823, "epoch": 0.6660324708269914, "grad_norm": 1.9921875, "learning_rate": 7.96539935632803e-06, "loss": 1.06496658, "memory(GiB)": 369.4, "step": 26255, "train_speed(iter/s)": 0.20083 }, { "acc": 0.74543662, "epoch": 0.6661593099949265, "grad_norm": 2.25, "learning_rate": 7.964554998441839e-06, "loss": 1.00321293, "memory(GiB)": 369.4, "step": 26260, "train_speed(iter/s)": 0.200836 }, { "acc": 0.76029205, "epoch": 0.6662861491628614, "grad_norm": 2.203125, "learning_rate": 7.963710510161282e-06, "loss": 0.94854126, "memory(GiB)": 369.4, "step": 26265, "train_speed(iter/s)": 0.200842 }, { "acc": 0.75274034, "epoch": 0.6664129883307965, "grad_norm": 2.140625, "learning_rate": 7.962865891523508e-06, "loss": 0.95968504, "memory(GiB)": 369.4, "step": 26270, "train_speed(iter/s)": 0.200842 }, { "acc": 0.73674612, "epoch": 0.6665398274987316, "grad_norm": 2.46875, "learning_rate": 7.96202114256566e-06, "loss": 1.11243134, "memory(GiB)": 369.4, "step": 26275, "train_speed(iter/s)": 0.200848 }, { "acc": 0.75185809, "epoch": 0.6666666666666666, "grad_norm": 2.078125, "learning_rate": 7.961176263324902e-06, "loss": 0.98792696, "memory(GiB)": 369.4, "step": 26280, "train_speed(iter/s)": 0.200852 }, { "acc": 0.74592514, "epoch": 0.6667935058346017, "grad_norm": 2.015625, "learning_rate": 7.960331253838387e-06, "loss": 0.99868536, "memory(GiB)": 369.4, "step": 26285, "train_speed(iter/s)": 0.200853 }, { "acc": 0.74106188, "epoch": 0.6669203450025368, "grad_norm": 1.8515625, "learning_rate": 7.95948611414329e-06, "loss": 0.9868825, "memory(GiB)": 369.4, "step": 26290, "train_speed(iter/s)": 0.200859 }, { "acc": 0.74366169, "epoch": 0.6670471841704718, "grad_norm": 2.21875, "learning_rate": 7.958640844276776e-06, "loss": 1.02549448, "memory(GiB)": 369.4, "step": 26295, "train_speed(iter/s)": 0.200866 }, { "acc": 0.73563819, "epoch": 0.6671740233384069, "grad_norm": 2.03125, "learning_rate": 7.957795444276033e-06, "loss": 1.04045467, "memory(GiB)": 369.4, "step": 26300, "train_speed(iter/s)": 0.200871 }, { "acc": 0.75214949, "epoch": 0.6673008625063419, "grad_norm": 2.34375, "learning_rate": 7.956949914178239e-06, "loss": 0.95707893, "memory(GiB)": 369.4, "step": 26305, "train_speed(iter/s)": 0.200878 }, { "acc": 0.74470434, "epoch": 0.667427701674277, "grad_norm": 2.03125, "learning_rate": 7.956104254020587e-06, "loss": 0.95846205, "memory(GiB)": 369.4, "step": 26310, "train_speed(iter/s)": 0.200884 }, { "acc": 0.75100136, "epoch": 0.6675545408422121, "grad_norm": 2.125, "learning_rate": 7.95525846384027e-06, "loss": 0.99778309, "memory(GiB)": 369.4, "step": 26315, "train_speed(iter/s)": 0.200891 }, { "acc": 0.75570002, "epoch": 0.6676813800101471, "grad_norm": 1.703125, "learning_rate": 7.954412543674493e-06, "loss": 0.96201324, "memory(GiB)": 369.4, "step": 26320, "train_speed(iter/s)": 0.200898 }, { "acc": 0.74494319, "epoch": 0.6678082191780822, "grad_norm": 2.484375, "learning_rate": 7.95356649356046e-06, "loss": 1.05021992, "memory(GiB)": 369.4, "step": 26325, "train_speed(iter/s)": 0.200906 }, { "acc": 0.73532863, "epoch": 0.6679350583460173, "grad_norm": 2.5, "learning_rate": 7.952720313535387e-06, "loss": 1.03623466, "memory(GiB)": 369.4, "step": 26330, "train_speed(iter/s)": 0.200914 }, { "acc": 0.74615479, "epoch": 0.6680618975139523, "grad_norm": 2.1875, "learning_rate": 7.951874003636492e-06, "loss": 1.03083982, "memory(GiB)": 369.4, "step": 26335, "train_speed(iter/s)": 0.200919 }, { "acc": 0.75254636, "epoch": 0.6681887366818874, "grad_norm": 1.859375, "learning_rate": 7.951027563901e-06, "loss": 0.92633476, "memory(GiB)": 369.4, "step": 26340, "train_speed(iter/s)": 0.200925 }, { "acc": 0.73507423, "epoch": 0.6683155758498224, "grad_norm": 2.78125, "learning_rate": 7.950180994366138e-06, "loss": 1.05470982, "memory(GiB)": 369.4, "step": 26345, "train_speed(iter/s)": 0.200934 }, { "acc": 0.75158153, "epoch": 0.6684424150177575, "grad_norm": 2.15625, "learning_rate": 7.949334295069147e-06, "loss": 0.95618219, "memory(GiB)": 369.4, "step": 26350, "train_speed(iter/s)": 0.20094 }, { "acc": 0.73825073, "epoch": 0.6685692541856926, "grad_norm": 2.625, "learning_rate": 7.948487466047263e-06, "loss": 1.04933147, "memory(GiB)": 369.4, "step": 26355, "train_speed(iter/s)": 0.200947 }, { "acc": 0.74196501, "epoch": 0.6686960933536276, "grad_norm": 2.875, "learning_rate": 7.947640507337737e-06, "loss": 1.0211791, "memory(GiB)": 369.4, "step": 26360, "train_speed(iter/s)": 0.200955 }, { "acc": 0.73419323, "epoch": 0.6688229325215627, "grad_norm": 2.359375, "learning_rate": 7.946793418977821e-06, "loss": 1.01518364, "memory(GiB)": 369.4, "step": 26365, "train_speed(iter/s)": 0.200957 }, { "acc": 0.73063745, "epoch": 0.6689497716894978, "grad_norm": 2.203125, "learning_rate": 7.945946201004775e-06, "loss": 1.04382286, "memory(GiB)": 369.4, "step": 26370, "train_speed(iter/s)": 0.200961 }, { "acc": 0.73849511, "epoch": 0.6690766108574328, "grad_norm": 2.21875, "learning_rate": 7.945098853455862e-06, "loss": 0.99827843, "memory(GiB)": 369.4, "step": 26375, "train_speed(iter/s)": 0.200968 }, { "acc": 0.75665278, "epoch": 0.6692034500253679, "grad_norm": 1.8125, "learning_rate": 7.944251376368352e-06, "loss": 0.98123512, "memory(GiB)": 369.4, "step": 26380, "train_speed(iter/s)": 0.200973 }, { "acc": 0.74361391, "epoch": 0.6693302891933028, "grad_norm": 2.140625, "learning_rate": 7.943403769779523e-06, "loss": 1.04813137, "memory(GiB)": 369.4, "step": 26385, "train_speed(iter/s)": 0.200978 }, { "acc": 0.74576678, "epoch": 0.6694571283612379, "grad_norm": 2.609375, "learning_rate": 7.942556033726654e-06, "loss": 1.06128607, "memory(GiB)": 369.4, "step": 26390, "train_speed(iter/s)": 0.200985 }, { "acc": 0.75693445, "epoch": 0.669583967529173, "grad_norm": 2.09375, "learning_rate": 7.941708168247033e-06, "loss": 0.98891611, "memory(GiB)": 369.4, "step": 26395, "train_speed(iter/s)": 0.200988 }, { "acc": 0.75889244, "epoch": 0.669710806697108, "grad_norm": 2.03125, "learning_rate": 7.940860173377952e-06, "loss": 1.01640234, "memory(GiB)": 369.4, "step": 26400, "train_speed(iter/s)": 0.200993 }, { "acc": 0.74336033, "epoch": 0.6698376458650431, "grad_norm": 2.375, "learning_rate": 7.940012049156711e-06, "loss": 1.01858864, "memory(GiB)": 369.4, "step": 26405, "train_speed(iter/s)": 0.201 }, { "acc": 0.75812845, "epoch": 0.6699644850329782, "grad_norm": 1.8515625, "learning_rate": 7.939163795620614e-06, "loss": 0.98407402, "memory(GiB)": 369.4, "step": 26410, "train_speed(iter/s)": 0.201007 }, { "acc": 0.73621235, "epoch": 0.6700913242009132, "grad_norm": 2.328125, "learning_rate": 7.938315412806971e-06, "loss": 1.0584547, "memory(GiB)": 369.4, "step": 26415, "train_speed(iter/s)": 0.201011 }, { "acc": 0.75496087, "epoch": 0.6702181633688483, "grad_norm": 2.140625, "learning_rate": 7.937466900753098e-06, "loss": 0.97526226, "memory(GiB)": 369.4, "step": 26420, "train_speed(iter/s)": 0.201016 }, { "acc": 0.73809757, "epoch": 0.6703450025367833, "grad_norm": 2.65625, "learning_rate": 7.936618259496316e-06, "loss": 1.02667618, "memory(GiB)": 369.4, "step": 26425, "train_speed(iter/s)": 0.201024 }, { "acc": 0.75379477, "epoch": 0.6704718417047184, "grad_norm": 2.265625, "learning_rate": 7.935769489073952e-06, "loss": 0.99000759, "memory(GiB)": 369.4, "step": 26430, "train_speed(iter/s)": 0.201032 }, { "acc": 0.7571147, "epoch": 0.6705986808726535, "grad_norm": 2.21875, "learning_rate": 7.934920589523336e-06, "loss": 0.92625351, "memory(GiB)": 369.4, "step": 26435, "train_speed(iter/s)": 0.201034 }, { "acc": 0.74461904, "epoch": 0.6707255200405885, "grad_norm": 2.109375, "learning_rate": 7.934071560881812e-06, "loss": 1.04946117, "memory(GiB)": 369.4, "step": 26440, "train_speed(iter/s)": 0.201041 }, { "acc": 0.75394831, "epoch": 0.6708523592085236, "grad_norm": 2.15625, "learning_rate": 7.93322240318672e-06, "loss": 0.97061453, "memory(GiB)": 369.4, "step": 26445, "train_speed(iter/s)": 0.201044 }, { "acc": 0.74114628, "epoch": 0.6709791983764587, "grad_norm": 2.046875, "learning_rate": 7.93237311647541e-06, "loss": 1.05063782, "memory(GiB)": 369.4, "step": 26450, "train_speed(iter/s)": 0.201049 }, { "acc": 0.74525261, "epoch": 0.6711060375443937, "grad_norm": 2.296875, "learning_rate": 7.93152370078524e-06, "loss": 0.9543561, "memory(GiB)": 369.4, "step": 26455, "train_speed(iter/s)": 0.201056 }, { "acc": 0.7572453, "epoch": 0.6712328767123288, "grad_norm": 1.953125, "learning_rate": 7.930674156153569e-06, "loss": 0.9756978, "memory(GiB)": 369.4, "step": 26460, "train_speed(iter/s)": 0.20106 }, { "acc": 0.7405118, "epoch": 0.6713597158802638, "grad_norm": 2.078125, "learning_rate": 7.929824482617763e-06, "loss": 1.02689943, "memory(GiB)": 369.4, "step": 26465, "train_speed(iter/s)": 0.20106 }, { "acc": 0.7388968, "epoch": 0.6714865550481989, "grad_norm": 2.640625, "learning_rate": 7.928974680215196e-06, "loss": 1.07207451, "memory(GiB)": 369.4, "step": 26470, "train_speed(iter/s)": 0.201063 }, { "acc": 0.74040184, "epoch": 0.671613394216134, "grad_norm": 2.109375, "learning_rate": 7.928124748983244e-06, "loss": 1.03007717, "memory(GiB)": 369.4, "step": 26475, "train_speed(iter/s)": 0.201068 }, { "acc": 0.74895115, "epoch": 0.671740233384069, "grad_norm": 2.46875, "learning_rate": 7.927274688959294e-06, "loss": 1.04771814, "memory(GiB)": 369.4, "step": 26480, "train_speed(iter/s)": 0.201072 }, { "acc": 0.74173689, "epoch": 0.6718670725520041, "grad_norm": 2.515625, "learning_rate": 7.926424500180734e-06, "loss": 1.03288898, "memory(GiB)": 369.4, "step": 26485, "train_speed(iter/s)": 0.201075 }, { "acc": 0.7630805, "epoch": 0.6719939117199392, "grad_norm": 2.34375, "learning_rate": 7.92557418268496e-06, "loss": 0.97596664, "memory(GiB)": 369.4, "step": 26490, "train_speed(iter/s)": 0.201078 }, { "acc": 0.75342059, "epoch": 0.6721207508878742, "grad_norm": 2.03125, "learning_rate": 7.92472373650937e-06, "loss": 1.00110445, "memory(GiB)": 369.4, "step": 26495, "train_speed(iter/s)": 0.201085 }, { "acc": 0.728233, "epoch": 0.6722475900558093, "grad_norm": 1.8359375, "learning_rate": 7.923873161691373e-06, "loss": 1.09757156, "memory(GiB)": 369.4, "step": 26500, "train_speed(iter/s)": 0.201089 }, { "acc": 0.75885668, "epoch": 0.6723744292237442, "grad_norm": 2.1875, "learning_rate": 7.923022458268379e-06, "loss": 0.97035942, "memory(GiB)": 369.4, "step": 26505, "train_speed(iter/s)": 0.201091 }, { "acc": 0.74843864, "epoch": 0.6725012683916793, "grad_norm": 2.390625, "learning_rate": 7.922171626277809e-06, "loss": 1.03295631, "memory(GiB)": 369.4, "step": 26510, "train_speed(iter/s)": 0.201097 }, { "acc": 0.74625864, "epoch": 0.6726281075596144, "grad_norm": 1.75, "learning_rate": 7.921320665757081e-06, "loss": 1.0141283, "memory(GiB)": 369.4, "step": 26515, "train_speed(iter/s)": 0.201102 }, { "acc": 0.7470809, "epoch": 0.6727549467275494, "grad_norm": 2.59375, "learning_rate": 7.920469576743631e-06, "loss": 0.96391296, "memory(GiB)": 369.4, "step": 26520, "train_speed(iter/s)": 0.201109 }, { "acc": 0.75840368, "epoch": 0.6728817858954845, "grad_norm": 2.375, "learning_rate": 7.919618359274888e-06, "loss": 0.99486179, "memory(GiB)": 369.4, "step": 26525, "train_speed(iter/s)": 0.201117 }, { "acc": 0.75107117, "epoch": 0.6730086250634196, "grad_norm": 2.5625, "learning_rate": 7.918767013388295e-06, "loss": 1.04879932, "memory(GiB)": 369.4, "step": 26530, "train_speed(iter/s)": 0.201123 }, { "acc": 0.7472806, "epoch": 0.6731354642313546, "grad_norm": 2.328125, "learning_rate": 7.917915539121297e-06, "loss": 1.03614006, "memory(GiB)": 369.4, "step": 26535, "train_speed(iter/s)": 0.20113 }, { "acc": 0.72680058, "epoch": 0.6732623033992897, "grad_norm": 2.046875, "learning_rate": 7.917063936511347e-06, "loss": 1.08791981, "memory(GiB)": 369.4, "step": 26540, "train_speed(iter/s)": 0.201131 }, { "acc": 0.74737816, "epoch": 0.6733891425672247, "grad_norm": 2.484375, "learning_rate": 7.9162122055959e-06, "loss": 1.02176361, "memory(GiB)": 369.4, "step": 26545, "train_speed(iter/s)": 0.201138 }, { "acc": 0.74908266, "epoch": 0.6735159817351598, "grad_norm": 2.171875, "learning_rate": 7.91536034641242e-06, "loss": 0.98799477, "memory(GiB)": 369.4, "step": 26550, "train_speed(iter/s)": 0.201145 }, { "acc": 0.74419184, "epoch": 0.6736428209030949, "grad_norm": 3.71875, "learning_rate": 7.914508358998376e-06, "loss": 1.03219452, "memory(GiB)": 369.4, "step": 26555, "train_speed(iter/s)": 0.201153 }, { "acc": 0.76296415, "epoch": 0.6737696600710299, "grad_norm": 2.171875, "learning_rate": 7.913656243391243e-06, "loss": 0.95851059, "memory(GiB)": 369.4, "step": 26560, "train_speed(iter/s)": 0.201161 }, { "acc": 0.74299803, "epoch": 0.673896499238965, "grad_norm": 2.1875, "learning_rate": 7.9128039996285e-06, "loss": 1.02278299, "memory(GiB)": 369.4, "step": 26565, "train_speed(iter/s)": 0.201168 }, { "acc": 0.74209957, "epoch": 0.6740233384069001, "grad_norm": 1.9765625, "learning_rate": 7.911951627747633e-06, "loss": 1.03405809, "memory(GiB)": 369.4, "step": 26570, "train_speed(iter/s)": 0.201173 }, { "acc": 0.73594055, "epoch": 0.6741501775748351, "grad_norm": 2.3125, "learning_rate": 7.91109912778613e-06, "loss": 1.04068508, "memory(GiB)": 369.4, "step": 26575, "train_speed(iter/s)": 0.20118 }, { "acc": 0.74061079, "epoch": 0.6742770167427702, "grad_norm": 1.9765625, "learning_rate": 7.910246499781492e-06, "loss": 1.00249634, "memory(GiB)": 369.4, "step": 26580, "train_speed(iter/s)": 0.201182 }, { "acc": 0.74039421, "epoch": 0.6744038559107052, "grad_norm": 2.03125, "learning_rate": 7.90939374377122e-06, "loss": 1.01561909, "memory(GiB)": 369.4, "step": 26585, "train_speed(iter/s)": 0.201185 }, { "acc": 0.74856167, "epoch": 0.6745306950786403, "grad_norm": 2.015625, "learning_rate": 7.908540859792821e-06, "loss": 0.97084789, "memory(GiB)": 369.4, "step": 26590, "train_speed(iter/s)": 0.201192 }, { "acc": 0.74684734, "epoch": 0.6746575342465754, "grad_norm": 2.078125, "learning_rate": 7.907687847883809e-06, "loss": 0.97503166, "memory(GiB)": 369.4, "step": 26595, "train_speed(iter/s)": 0.201198 }, { "acc": 0.74744101, "epoch": 0.6747843734145104, "grad_norm": 2.40625, "learning_rate": 7.906834708081703e-06, "loss": 1.00318613, "memory(GiB)": 369.4, "step": 26600, "train_speed(iter/s)": 0.201205 }, { "acc": 0.75649285, "epoch": 0.6749112125824455, "grad_norm": 2.21875, "learning_rate": 7.90598144042403e-06, "loss": 0.99075394, "memory(GiB)": 369.4, "step": 26605, "train_speed(iter/s)": 0.201207 }, { "acc": 0.74552717, "epoch": 0.6750380517503806, "grad_norm": 2.296875, "learning_rate": 7.905128044948318e-06, "loss": 1.02530842, "memory(GiB)": 369.4, "step": 26610, "train_speed(iter/s)": 0.201212 }, { "acc": 0.7440321, "epoch": 0.6751648909183156, "grad_norm": 1.8125, "learning_rate": 7.904274521692104e-06, "loss": 1.04437275, "memory(GiB)": 369.4, "step": 26615, "train_speed(iter/s)": 0.201217 }, { "acc": 0.74469919, "epoch": 0.6752917300862507, "grad_norm": 2.015625, "learning_rate": 7.90342087069293e-06, "loss": 1.01770515, "memory(GiB)": 369.4, "step": 26620, "train_speed(iter/s)": 0.201225 }, { "acc": 0.74526949, "epoch": 0.6754185692541856, "grad_norm": 2.21875, "learning_rate": 7.902567091988343e-06, "loss": 0.97619705, "memory(GiB)": 369.4, "step": 26625, "train_speed(iter/s)": 0.201229 }, { "acc": 0.75067673, "epoch": 0.6755454084221207, "grad_norm": 2.359375, "learning_rate": 7.901713185615898e-06, "loss": 0.99806919, "memory(GiB)": 369.4, "step": 26630, "train_speed(iter/s)": 0.201234 }, { "acc": 0.74560924, "epoch": 0.6756722475900558, "grad_norm": 2.359375, "learning_rate": 7.90085915161315e-06, "loss": 1.00589666, "memory(GiB)": 369.4, "step": 26635, "train_speed(iter/s)": 0.201238 }, { "acc": 0.74158192, "epoch": 0.6757990867579908, "grad_norm": 2.015625, "learning_rate": 7.900004990017667e-06, "loss": 1.06330557, "memory(GiB)": 369.4, "step": 26640, "train_speed(iter/s)": 0.201246 }, { "acc": 0.72849574, "epoch": 0.6759259259259259, "grad_norm": 2.375, "learning_rate": 7.899150700867014e-06, "loss": 1.06767521, "memory(GiB)": 369.4, "step": 26645, "train_speed(iter/s)": 0.201254 }, { "acc": 0.74722323, "epoch": 0.676052765093861, "grad_norm": 2.21875, "learning_rate": 7.898296284198772e-06, "loss": 0.97976151, "memory(GiB)": 369.4, "step": 26650, "train_speed(iter/s)": 0.201256 }, { "acc": 0.74464273, "epoch": 0.676179604261796, "grad_norm": 2.46875, "learning_rate": 7.897441740050518e-06, "loss": 1.04963017, "memory(GiB)": 369.4, "step": 26655, "train_speed(iter/s)": 0.201261 }, { "acc": 0.7280889, "epoch": 0.6763064434297311, "grad_norm": 1.953125, "learning_rate": 7.89658706845984e-06, "loss": 1.01649771, "memory(GiB)": 369.4, "step": 26660, "train_speed(iter/s)": 0.201267 }, { "acc": 0.74228525, "epoch": 0.6764332825976661, "grad_norm": 2.0625, "learning_rate": 7.89573226946433e-06, "loss": 1.01143751, "memory(GiB)": 369.4, "step": 26665, "train_speed(iter/s)": 0.201273 }, { "acc": 0.74541383, "epoch": 0.6765601217656012, "grad_norm": 2.765625, "learning_rate": 7.89487734310159e-06, "loss": 1.01876364, "memory(GiB)": 369.4, "step": 26670, "train_speed(iter/s)": 0.201279 }, { "acc": 0.75194845, "epoch": 0.6766869609335363, "grad_norm": 1.9375, "learning_rate": 7.894022289409216e-06, "loss": 1.0070507, "memory(GiB)": 369.4, "step": 26675, "train_speed(iter/s)": 0.201281 }, { "acc": 0.73362312, "epoch": 0.6768138001014713, "grad_norm": 1.78125, "learning_rate": 7.893167108424822e-06, "loss": 1.05401516, "memory(GiB)": 369.4, "step": 26680, "train_speed(iter/s)": 0.201288 }, { "acc": 0.75545001, "epoch": 0.6769406392694064, "grad_norm": 2.109375, "learning_rate": 7.89231180018602e-06, "loss": 0.94595909, "memory(GiB)": 369.4, "step": 26685, "train_speed(iter/s)": 0.201296 }, { "acc": 0.74502497, "epoch": 0.6770674784373415, "grad_norm": 2.203125, "learning_rate": 7.891456364730434e-06, "loss": 1.02951298, "memory(GiB)": 369.4, "step": 26690, "train_speed(iter/s)": 0.201301 }, { "acc": 0.75236344, "epoch": 0.6771943176052765, "grad_norm": 1.96875, "learning_rate": 7.890600802095686e-06, "loss": 1.02780647, "memory(GiB)": 369.4, "step": 26695, "train_speed(iter/s)": 0.201307 }, { "acc": 0.7452065, "epoch": 0.6773211567732116, "grad_norm": 2.28125, "learning_rate": 7.889745112319411e-06, "loss": 1.00747948, "memory(GiB)": 369.4, "step": 26700, "train_speed(iter/s)": 0.201315 }, { "acc": 0.74442925, "epoch": 0.6774479959411466, "grad_norm": 2.078125, "learning_rate": 7.888889295439244e-06, "loss": 1.01564102, "memory(GiB)": 369.4, "step": 26705, "train_speed(iter/s)": 0.20132 }, { "acc": 0.73909764, "epoch": 0.6775748351090817, "grad_norm": 2.28125, "learning_rate": 7.888033351492827e-06, "loss": 1.0353157, "memory(GiB)": 369.4, "step": 26710, "train_speed(iter/s)": 0.201325 }, { "acc": 0.73421459, "epoch": 0.6777016742770168, "grad_norm": 1.953125, "learning_rate": 7.887177280517808e-06, "loss": 1.09441252, "memory(GiB)": 369.4, "step": 26715, "train_speed(iter/s)": 0.201331 }, { "acc": 0.75349731, "epoch": 0.6778285134449518, "grad_norm": 2.25, "learning_rate": 7.886321082551845e-06, "loss": 0.95507851, "memory(GiB)": 369.4, "step": 26720, "train_speed(iter/s)": 0.201337 }, { "acc": 0.75231113, "epoch": 0.6779553526128869, "grad_norm": 2.125, "learning_rate": 7.88546475763259e-06, "loss": 0.96652889, "memory(GiB)": 369.4, "step": 26725, "train_speed(iter/s)": 0.201338 }, { "acc": 0.76103354, "epoch": 0.678082191780822, "grad_norm": 1.7109375, "learning_rate": 7.884608305797716e-06, "loss": 0.95140762, "memory(GiB)": 369.4, "step": 26730, "train_speed(iter/s)": 0.201344 }, { "acc": 0.74020467, "epoch": 0.678209030948757, "grad_norm": 1.9375, "learning_rate": 7.883751727084888e-06, "loss": 1.05388441, "memory(GiB)": 369.4, "step": 26735, "train_speed(iter/s)": 0.201349 }, { "acc": 0.75103188, "epoch": 0.678335870116692, "grad_norm": 2.140625, "learning_rate": 7.882895021531784e-06, "loss": 1.04525166, "memory(GiB)": 369.4, "step": 26740, "train_speed(iter/s)": 0.201356 }, { "acc": 0.75302782, "epoch": 0.678462709284627, "grad_norm": 2.546875, "learning_rate": 7.882038189176085e-06, "loss": 0.99661512, "memory(GiB)": 369.4, "step": 26745, "train_speed(iter/s)": 0.20136 }, { "acc": 0.75216913, "epoch": 0.6785895484525621, "grad_norm": 2.21875, "learning_rate": 7.881181230055481e-06, "loss": 0.99872417, "memory(GiB)": 369.4, "step": 26750, "train_speed(iter/s)": 0.201356 }, { "acc": 0.74544497, "epoch": 0.6787163876204972, "grad_norm": 3.4375, "learning_rate": 7.880324144207663e-06, "loss": 1.0363142, "memory(GiB)": 369.4, "step": 26755, "train_speed(iter/s)": 0.201359 }, { "acc": 0.76156516, "epoch": 0.6788432267884322, "grad_norm": 2.109375, "learning_rate": 7.879466931670328e-06, "loss": 0.99957504, "memory(GiB)": 369.4, "step": 26760, "train_speed(iter/s)": 0.201366 }, { "acc": 0.736269, "epoch": 0.6789700659563673, "grad_norm": 2.640625, "learning_rate": 7.878609592481182e-06, "loss": 1.10180607, "memory(GiB)": 369.4, "step": 26765, "train_speed(iter/s)": 0.201372 }, { "acc": 0.74939775, "epoch": 0.6790969051243024, "grad_norm": 2.1875, "learning_rate": 7.877752126677933e-06, "loss": 1.01994343, "memory(GiB)": 369.4, "step": 26770, "train_speed(iter/s)": 0.201369 }, { "acc": 0.76062598, "epoch": 0.6792237442922374, "grad_norm": 1.953125, "learning_rate": 7.876894534298298e-06, "loss": 0.92738857, "memory(GiB)": 369.4, "step": 26775, "train_speed(iter/s)": 0.201371 }, { "acc": 0.75383205, "epoch": 0.6793505834601725, "grad_norm": 2.296875, "learning_rate": 7.87603681538e-06, "loss": 0.96549368, "memory(GiB)": 369.4, "step": 26780, "train_speed(iter/s)": 0.201376 }, { "acc": 0.73417068, "epoch": 0.6794774226281075, "grad_norm": 2.203125, "learning_rate": 7.875178969960757e-06, "loss": 1.08073845, "memory(GiB)": 369.4, "step": 26785, "train_speed(iter/s)": 0.201381 }, { "acc": 0.74254971, "epoch": 0.6796042617960426, "grad_norm": 1.921875, "learning_rate": 7.87432099807831e-06, "loss": 1.0376442, "memory(GiB)": 369.4, "step": 26790, "train_speed(iter/s)": 0.201386 }, { "acc": 0.75246644, "epoch": 0.6797311009639777, "grad_norm": 2.359375, "learning_rate": 7.87346289977039e-06, "loss": 1.01984034, "memory(GiB)": 369.4, "step": 26795, "train_speed(iter/s)": 0.201388 }, { "acc": 0.75238008, "epoch": 0.6798579401319127, "grad_norm": 2.28125, "learning_rate": 7.872604675074745e-06, "loss": 0.97608376, "memory(GiB)": 369.4, "step": 26800, "train_speed(iter/s)": 0.201393 }, { "acc": 0.74815559, "epoch": 0.6799847792998478, "grad_norm": 2.234375, "learning_rate": 7.871746324029119e-06, "loss": 0.97245045, "memory(GiB)": 369.4, "step": 26805, "train_speed(iter/s)": 0.201399 }, { "acc": 0.75310907, "epoch": 0.6801116184677829, "grad_norm": 2.28125, "learning_rate": 7.87088784667127e-06, "loss": 0.98354111, "memory(GiB)": 369.4, "step": 26810, "train_speed(iter/s)": 0.201405 }, { "acc": 0.73002958, "epoch": 0.6802384576357179, "grad_norm": 2.078125, "learning_rate": 7.870029243038955e-06, "loss": 1.0758152, "memory(GiB)": 369.4, "step": 26815, "train_speed(iter/s)": 0.201411 }, { "acc": 0.73988066, "epoch": 0.680365296803653, "grad_norm": 2.25, "learning_rate": 7.869170513169941e-06, "loss": 1.02458086, "memory(GiB)": 369.4, "step": 26820, "train_speed(iter/s)": 0.201417 }, { "acc": 0.72328448, "epoch": 0.680492135971588, "grad_norm": 2.109375, "learning_rate": 7.868311657101996e-06, "loss": 1.01675587, "memory(GiB)": 369.4, "step": 26825, "train_speed(iter/s)": 0.201423 }, { "acc": 0.73952522, "epoch": 0.6806189751395231, "grad_norm": 2.796875, "learning_rate": 7.8674526748729e-06, "loss": 1.02605295, "memory(GiB)": 369.4, "step": 26830, "train_speed(iter/s)": 0.201431 }, { "acc": 0.77212706, "epoch": 0.6807458143074582, "grad_norm": 2.453125, "learning_rate": 7.866593566520432e-06, "loss": 0.94126663, "memory(GiB)": 369.4, "step": 26835, "train_speed(iter/s)": 0.201437 }, { "acc": 0.73990593, "epoch": 0.6808726534753932, "grad_norm": 2.078125, "learning_rate": 7.865734332082382e-06, "loss": 1.06512508, "memory(GiB)": 369.4, "step": 26840, "train_speed(iter/s)": 0.201442 }, { "acc": 0.75494442, "epoch": 0.6809994926433283, "grad_norm": 2.453125, "learning_rate": 7.86487497159654e-06, "loss": 0.94114094, "memory(GiB)": 369.4, "step": 26845, "train_speed(iter/s)": 0.201446 }, { "acc": 0.74415975, "epoch": 0.6811263318112634, "grad_norm": 2.078125, "learning_rate": 7.864015485100706e-06, "loss": 1.00317183, "memory(GiB)": 369.4, "step": 26850, "train_speed(iter/s)": 0.201451 }, { "acc": 0.75289006, "epoch": 0.6812531709791984, "grad_norm": 2.09375, "learning_rate": 7.863155872632685e-06, "loss": 0.94688864, "memory(GiB)": 369.4, "step": 26855, "train_speed(iter/s)": 0.201456 }, { "acc": 0.74453144, "epoch": 0.6813800101471335, "grad_norm": 2.34375, "learning_rate": 7.862296134230287e-06, "loss": 0.95363216, "memory(GiB)": 369.4, "step": 26860, "train_speed(iter/s)": 0.201462 }, { "acc": 0.7499052, "epoch": 0.6815068493150684, "grad_norm": 2.28125, "learning_rate": 7.861436269931322e-06, "loss": 0.99042854, "memory(GiB)": 369.4, "step": 26865, "train_speed(iter/s)": 0.201468 }, { "acc": 0.74673915, "epoch": 0.6816336884830035, "grad_norm": 2.0625, "learning_rate": 7.860576279773617e-06, "loss": 0.973592, "memory(GiB)": 369.4, "step": 26870, "train_speed(iter/s)": 0.201474 }, { "acc": 0.74502592, "epoch": 0.6817605276509386, "grad_norm": 2.21875, "learning_rate": 7.859716163794995e-06, "loss": 0.98128119, "memory(GiB)": 369.4, "step": 26875, "train_speed(iter/s)": 0.201478 }, { "acc": 0.74961529, "epoch": 0.6818873668188736, "grad_norm": 1.671875, "learning_rate": 7.858855922033289e-06, "loss": 0.99762506, "memory(GiB)": 369.4, "step": 26880, "train_speed(iter/s)": 0.201481 }, { "acc": 0.74255476, "epoch": 0.6820142059868087, "grad_norm": 2.265625, "learning_rate": 7.857995554526334e-06, "loss": 1.05988894, "memory(GiB)": 369.4, "step": 26885, "train_speed(iter/s)": 0.201483 }, { "acc": 0.73628736, "epoch": 0.6821410451547438, "grad_norm": 2.078125, "learning_rate": 7.857135061311977e-06, "loss": 1.04371061, "memory(GiB)": 369.4, "step": 26890, "train_speed(iter/s)": 0.201487 }, { "acc": 0.74623785, "epoch": 0.6822678843226788, "grad_norm": 2.078125, "learning_rate": 7.856274442428062e-06, "loss": 1.05029278, "memory(GiB)": 369.4, "step": 26895, "train_speed(iter/s)": 0.201493 }, { "acc": 0.73701763, "epoch": 0.6823947234906139, "grad_norm": 2.34375, "learning_rate": 7.855413697912446e-06, "loss": 0.99635496, "memory(GiB)": 369.4, "step": 26900, "train_speed(iter/s)": 0.2015 }, { "acc": 0.73929763, "epoch": 0.6825215626585489, "grad_norm": 2.140625, "learning_rate": 7.854552827802987e-06, "loss": 1.03283052, "memory(GiB)": 369.4, "step": 26905, "train_speed(iter/s)": 0.201505 }, { "acc": 0.74917374, "epoch": 0.682648401826484, "grad_norm": 2.90625, "learning_rate": 7.853691832137547e-06, "loss": 1.02353649, "memory(GiB)": 369.4, "step": 26910, "train_speed(iter/s)": 0.20151 }, { "acc": 0.75635257, "epoch": 0.6827752409944191, "grad_norm": 1.96875, "learning_rate": 7.852830710954003e-06, "loss": 0.95115242, "memory(GiB)": 369.4, "step": 26915, "train_speed(iter/s)": 0.201515 }, { "acc": 0.74559603, "epoch": 0.6829020801623541, "grad_norm": 2.3125, "learning_rate": 7.851969464290226e-06, "loss": 1.02136211, "memory(GiB)": 369.4, "step": 26920, "train_speed(iter/s)": 0.20152 }, { "acc": 0.74496489, "epoch": 0.6830289193302892, "grad_norm": 2.5, "learning_rate": 7.851108092184099e-06, "loss": 1.00847549, "memory(GiB)": 369.4, "step": 26925, "train_speed(iter/s)": 0.201523 }, { "acc": 0.74814014, "epoch": 0.6831557584982243, "grad_norm": 2.109375, "learning_rate": 7.850246594673508e-06, "loss": 0.98962708, "memory(GiB)": 369.4, "step": 26930, "train_speed(iter/s)": 0.201531 }, { "acc": 0.7374341, "epoch": 0.6832825976661593, "grad_norm": 2.578125, "learning_rate": 7.849384971796346e-06, "loss": 1.06951113, "memory(GiB)": 369.4, "step": 26935, "train_speed(iter/s)": 0.201535 }, { "acc": 0.75077744, "epoch": 0.6834094368340944, "grad_norm": 2.125, "learning_rate": 7.848523223590514e-06, "loss": 0.99542618, "memory(GiB)": 369.4, "step": 26940, "train_speed(iter/s)": 0.20154 }, { "acc": 0.73563414, "epoch": 0.6835362760020294, "grad_norm": 2.578125, "learning_rate": 7.84766135009391e-06, "loss": 1.05612984, "memory(GiB)": 369.4, "step": 26945, "train_speed(iter/s)": 0.201546 }, { "acc": 0.73904409, "epoch": 0.6836631151699645, "grad_norm": 1.9609375, "learning_rate": 7.846799351344447e-06, "loss": 1.04840527, "memory(GiB)": 369.4, "step": 26950, "train_speed(iter/s)": 0.201547 }, { "acc": 0.75249677, "epoch": 0.6837899543378996, "grad_norm": 2.15625, "learning_rate": 7.845937227380038e-06, "loss": 0.98981009, "memory(GiB)": 369.4, "step": 26955, "train_speed(iter/s)": 0.201553 }, { "acc": 0.73857493, "epoch": 0.6839167935058346, "grad_norm": 2.0625, "learning_rate": 7.845074978238604e-06, "loss": 1.05404758, "memory(GiB)": 369.4, "step": 26960, "train_speed(iter/s)": 0.201559 }, { "acc": 0.74377689, "epoch": 0.6840436326737697, "grad_norm": 2.015625, "learning_rate": 7.84421260395807e-06, "loss": 1.01990643, "memory(GiB)": 369.4, "step": 26965, "train_speed(iter/s)": 0.201565 }, { "acc": 0.74642367, "epoch": 0.6841704718417048, "grad_norm": 2.296875, "learning_rate": 7.84335010457637e-06, "loss": 0.98560524, "memory(GiB)": 369.4, "step": 26970, "train_speed(iter/s)": 0.201571 }, { "acc": 0.76144757, "epoch": 0.6842973110096398, "grad_norm": 2.34375, "learning_rate": 7.842487480131435e-06, "loss": 0.90803928, "memory(GiB)": 369.4, "step": 26975, "train_speed(iter/s)": 0.201578 }, { "acc": 0.73766108, "epoch": 0.6844241501775749, "grad_norm": 2.390625, "learning_rate": 7.84162473066121e-06, "loss": 1.1025857, "memory(GiB)": 369.4, "step": 26980, "train_speed(iter/s)": 0.201583 }, { "acc": 0.7517252, "epoch": 0.6845509893455098, "grad_norm": 2.109375, "learning_rate": 7.840761856203642e-06, "loss": 0.93013077, "memory(GiB)": 369.4, "step": 26985, "train_speed(iter/s)": 0.201588 }, { "acc": 0.73509445, "epoch": 0.6846778285134449, "grad_norm": 2.015625, "learning_rate": 7.839898856796685e-06, "loss": 1.09069271, "memory(GiB)": 369.4, "step": 26990, "train_speed(iter/s)": 0.201595 }, { "acc": 0.73920422, "epoch": 0.68480466768138, "grad_norm": 2.375, "learning_rate": 7.839035732478297e-06, "loss": 1.00694914, "memory(GiB)": 369.4, "step": 26995, "train_speed(iter/s)": 0.201601 }, { "acc": 0.74165745, "epoch": 0.684931506849315, "grad_norm": 2.109375, "learning_rate": 7.838172483286441e-06, "loss": 1.03753204, "memory(GiB)": 369.4, "step": 27000, "train_speed(iter/s)": 0.201608 }, { "epoch": 0.684931506849315, "eval_acc": 0.7363528997437833, "eval_loss": 0.9775540828704834, "eval_runtime": 385.253, "eval_samples_per_second": 16.535, "eval_steps_per_second": 8.267, "step": 27000 }, { "acc": 0.75804791, "epoch": 0.6850583460172501, "grad_norm": 1.96875, "learning_rate": 7.83730910925909e-06, "loss": 0.98812675, "memory(GiB)": 369.4, "step": 27005, "train_speed(iter/s)": 0.200549 }, { "acc": 0.74356441, "epoch": 0.6851851851851852, "grad_norm": 2.03125, "learning_rate": 7.836445610434215e-06, "loss": 0.99951248, "memory(GiB)": 369.4, "step": 27010, "train_speed(iter/s)": 0.200556 }, { "acc": 0.74939861, "epoch": 0.6853120243531202, "grad_norm": 2.296875, "learning_rate": 7.835581986849799e-06, "loss": 1.00957031, "memory(GiB)": 369.4, "step": 27015, "train_speed(iter/s)": 0.200563 }, { "acc": 0.74665813, "epoch": 0.6854388635210553, "grad_norm": 2.046875, "learning_rate": 7.834718238543827e-06, "loss": 1.00034065, "memory(GiB)": 369.4, "step": 27020, "train_speed(iter/s)": 0.200568 }, { "acc": 0.72888746, "epoch": 0.6855657026889903, "grad_norm": 1.90625, "learning_rate": 7.833854365554289e-06, "loss": 1.0257082, "memory(GiB)": 369.4, "step": 27025, "train_speed(iter/s)": 0.200576 }, { "acc": 0.73384657, "epoch": 0.6856925418569254, "grad_norm": 1.859375, "learning_rate": 7.832990367919186e-06, "loss": 1.04022427, "memory(GiB)": 369.4, "step": 27030, "train_speed(iter/s)": 0.200582 }, { "acc": 0.74654894, "epoch": 0.6858193810248605, "grad_norm": 2.625, "learning_rate": 7.832126245676518e-06, "loss": 0.99368496, "memory(GiB)": 369.4, "step": 27035, "train_speed(iter/s)": 0.200582 }, { "acc": 0.75380182, "epoch": 0.6859462201927955, "grad_norm": 2.4375, "learning_rate": 7.831261998864293e-06, "loss": 0.98825588, "memory(GiB)": 369.4, "step": 27040, "train_speed(iter/s)": 0.200589 }, { "acc": 0.74799323, "epoch": 0.6860730593607306, "grad_norm": 2.15625, "learning_rate": 7.830397627520526e-06, "loss": 1.04796963, "memory(GiB)": 369.4, "step": 27045, "train_speed(iter/s)": 0.200595 }, { "acc": 0.7400856, "epoch": 0.6861998985286657, "grad_norm": 2.09375, "learning_rate": 7.82953313168323e-06, "loss": 1.00245848, "memory(GiB)": 369.4, "step": 27050, "train_speed(iter/s)": 0.200601 }, { "acc": 0.74521046, "epoch": 0.6863267376966007, "grad_norm": 2.3125, "learning_rate": 7.828668511390439e-06, "loss": 0.96749001, "memory(GiB)": 369.4, "step": 27055, "train_speed(iter/s)": 0.200603 }, { "acc": 0.73529387, "epoch": 0.6864535768645358, "grad_norm": 2.375, "learning_rate": 7.827803766680176e-06, "loss": 1.03050489, "memory(GiB)": 369.4, "step": 27060, "train_speed(iter/s)": 0.200607 }, { "acc": 0.73818216, "epoch": 0.6865804160324708, "grad_norm": 2.75, "learning_rate": 7.826938897590477e-06, "loss": 1.04173908, "memory(GiB)": 369.4, "step": 27065, "train_speed(iter/s)": 0.200609 }, { "acc": 0.75281587, "epoch": 0.6867072552004059, "grad_norm": 1.9921875, "learning_rate": 7.826073904159384e-06, "loss": 0.96660719, "memory(GiB)": 369.4, "step": 27070, "train_speed(iter/s)": 0.200613 }, { "acc": 0.75047445, "epoch": 0.686834094368341, "grad_norm": 1.96875, "learning_rate": 7.825208786424944e-06, "loss": 1.02159033, "memory(GiB)": 369.4, "step": 27075, "train_speed(iter/s)": 0.200615 }, { "acc": 0.73868694, "epoch": 0.686960933536276, "grad_norm": 2.4375, "learning_rate": 7.824343544425207e-06, "loss": 1.06479855, "memory(GiB)": 369.4, "step": 27080, "train_speed(iter/s)": 0.200623 }, { "acc": 0.73930249, "epoch": 0.6870877727042111, "grad_norm": 1.921875, "learning_rate": 7.823478178198234e-06, "loss": 1.0209197, "memory(GiB)": 369.4, "step": 27085, "train_speed(iter/s)": 0.200628 }, { "acc": 0.7533782, "epoch": 0.6872146118721462, "grad_norm": 1.984375, "learning_rate": 7.822612687782083e-06, "loss": 0.9449708, "memory(GiB)": 369.4, "step": 27090, "train_speed(iter/s)": 0.200636 }, { "acc": 0.75662975, "epoch": 0.6873414510400812, "grad_norm": 2.375, "learning_rate": 7.821747073214823e-06, "loss": 0.93866329, "memory(GiB)": 369.4, "step": 27095, "train_speed(iter/s)": 0.20064 }, { "acc": 0.74875727, "epoch": 0.6874682902080163, "grad_norm": 1.96875, "learning_rate": 7.820881334534529e-06, "loss": 0.97483644, "memory(GiB)": 369.4, "step": 27100, "train_speed(iter/s)": 0.200646 }, { "acc": 0.75808692, "epoch": 0.6875951293759512, "grad_norm": 1.921875, "learning_rate": 7.820015471779278e-06, "loss": 0.92397337, "memory(GiB)": 369.4, "step": 27105, "train_speed(iter/s)": 0.200651 }, { "acc": 0.75721602, "epoch": 0.6877219685438863, "grad_norm": 2.328125, "learning_rate": 7.819149484987159e-06, "loss": 0.97110214, "memory(GiB)": 369.4, "step": 27110, "train_speed(iter/s)": 0.200658 }, { "acc": 0.75631032, "epoch": 0.6878488077118214, "grad_norm": 1.8515625, "learning_rate": 7.818283374196259e-06, "loss": 0.97031736, "memory(GiB)": 369.4, "step": 27115, "train_speed(iter/s)": 0.20066 }, { "acc": 0.74514856, "epoch": 0.6879756468797564, "grad_norm": 2.296875, "learning_rate": 7.817417139444671e-06, "loss": 1.0523243, "memory(GiB)": 369.4, "step": 27120, "train_speed(iter/s)": 0.200667 }, { "acc": 0.73816137, "epoch": 0.6881024860476915, "grad_norm": 2.4375, "learning_rate": 7.8165507807705e-06, "loss": 1.10929165, "memory(GiB)": 369.4, "step": 27125, "train_speed(iter/s)": 0.200675 }, { "acc": 0.73899183, "epoch": 0.6882293252156266, "grad_norm": 2.25, "learning_rate": 7.81568429821185e-06, "loss": 1.05113487, "memory(GiB)": 369.4, "step": 27130, "train_speed(iter/s)": 0.200682 }, { "acc": 0.74639764, "epoch": 0.6883561643835616, "grad_norm": 2.390625, "learning_rate": 7.814817691806834e-06, "loss": 1.01125603, "memory(GiB)": 369.4, "step": 27135, "train_speed(iter/s)": 0.200685 }, { "acc": 0.7514822, "epoch": 0.6884830035514967, "grad_norm": 2.1875, "learning_rate": 7.813950961593569e-06, "loss": 0.99735966, "memory(GiB)": 369.4, "step": 27140, "train_speed(iter/s)": 0.200689 }, { "acc": 0.73952618, "epoch": 0.6886098427194317, "grad_norm": 1.7265625, "learning_rate": 7.813084107610175e-06, "loss": 0.99825687, "memory(GiB)": 369.4, "step": 27145, "train_speed(iter/s)": 0.20069 }, { "acc": 0.75628271, "epoch": 0.6887366818873668, "grad_norm": 1.9765625, "learning_rate": 7.812217129894785e-06, "loss": 0.91234493, "memory(GiB)": 369.4, "step": 27150, "train_speed(iter/s)": 0.200695 }, { "acc": 0.75496931, "epoch": 0.6888635210553019, "grad_norm": 1.96875, "learning_rate": 7.811350028485531e-06, "loss": 0.96483774, "memory(GiB)": 369.4, "step": 27155, "train_speed(iter/s)": 0.200699 }, { "acc": 0.74773626, "epoch": 0.6889903602232369, "grad_norm": 2.28125, "learning_rate": 7.810482803420549e-06, "loss": 0.98722486, "memory(GiB)": 369.4, "step": 27160, "train_speed(iter/s)": 0.200703 }, { "acc": 0.74741654, "epoch": 0.689117199391172, "grad_norm": 2.359375, "learning_rate": 7.809615454737984e-06, "loss": 1.05295124, "memory(GiB)": 369.4, "step": 27165, "train_speed(iter/s)": 0.200706 }, { "acc": 0.74536915, "epoch": 0.6892440385591071, "grad_norm": 1.921875, "learning_rate": 7.808747982475991e-06, "loss": 1.02984257, "memory(GiB)": 369.4, "step": 27170, "train_speed(iter/s)": 0.200714 }, { "acc": 0.74077315, "epoch": 0.6893708777270421, "grad_norm": 2.65625, "learning_rate": 7.807880386672718e-06, "loss": 1.01409063, "memory(GiB)": 369.4, "step": 27175, "train_speed(iter/s)": 0.200722 }, { "acc": 0.72910266, "epoch": 0.6894977168949772, "grad_norm": 2.46875, "learning_rate": 7.807012667366332e-06, "loss": 1.07377586, "memory(GiB)": 369.4, "step": 27180, "train_speed(iter/s)": 0.200726 }, { "acc": 0.74376078, "epoch": 0.6896245560629122, "grad_norm": 2.6875, "learning_rate": 7.806144824594994e-06, "loss": 1.03511086, "memory(GiB)": 369.4, "step": 27185, "train_speed(iter/s)": 0.200731 }, { "acc": 0.74176426, "epoch": 0.6897513952308473, "grad_norm": 2.234375, "learning_rate": 7.805276858396879e-06, "loss": 1.02780628, "memory(GiB)": 369.4, "step": 27190, "train_speed(iter/s)": 0.200737 }, { "acc": 0.75224328, "epoch": 0.6898782343987824, "grad_norm": 2.125, "learning_rate": 7.804408768810164e-06, "loss": 1.00062313, "memory(GiB)": 369.4, "step": 27195, "train_speed(iter/s)": 0.200744 }, { "acc": 0.73659153, "epoch": 0.6900050735667174, "grad_norm": 2.1875, "learning_rate": 7.80354055587303e-06, "loss": 1.06595154, "memory(GiB)": 369.4, "step": 27200, "train_speed(iter/s)": 0.200746 }, { "acc": 0.74272528, "epoch": 0.6901319127346525, "grad_norm": 2.234375, "learning_rate": 7.802672219623665e-06, "loss": 0.99399405, "memory(GiB)": 369.4, "step": 27205, "train_speed(iter/s)": 0.200749 }, { "acc": 0.75787077, "epoch": 0.6902587519025876, "grad_norm": 2.34375, "learning_rate": 7.801803760100264e-06, "loss": 0.99633532, "memory(GiB)": 369.4, "step": 27210, "train_speed(iter/s)": 0.200755 }, { "acc": 0.74678426, "epoch": 0.6903855910705226, "grad_norm": 2.625, "learning_rate": 7.800935177341022e-06, "loss": 0.95861454, "memory(GiB)": 369.4, "step": 27215, "train_speed(iter/s)": 0.200762 }, { "acc": 0.75644131, "epoch": 0.6905124302384577, "grad_norm": 2.0, "learning_rate": 7.800066471384149e-06, "loss": 1.00286903, "memory(GiB)": 369.4, "step": 27220, "train_speed(iter/s)": 0.200766 }, { "acc": 0.75304108, "epoch": 0.6906392694063926, "grad_norm": 2.078125, "learning_rate": 7.799197642267848e-06, "loss": 1.00957794, "memory(GiB)": 369.4, "step": 27225, "train_speed(iter/s)": 0.200773 }, { "acc": 0.73760862, "epoch": 0.6907661085743277, "grad_norm": 2.0, "learning_rate": 7.79832869003034e-06, "loss": 1.02688684, "memory(GiB)": 369.4, "step": 27230, "train_speed(iter/s)": 0.200779 }, { "acc": 0.75053539, "epoch": 0.6908929477422628, "grad_norm": 2.328125, "learning_rate": 7.797459614709842e-06, "loss": 1.0079587, "memory(GiB)": 369.4, "step": 27235, "train_speed(iter/s)": 0.200786 }, { "acc": 0.74583855, "epoch": 0.6910197869101978, "grad_norm": 2.234375, "learning_rate": 7.796590416344578e-06, "loss": 1.00629368, "memory(GiB)": 369.4, "step": 27240, "train_speed(iter/s)": 0.200791 }, { "acc": 0.74363713, "epoch": 0.6911466260781329, "grad_norm": 2.375, "learning_rate": 7.795721094972783e-06, "loss": 1.06473522, "memory(GiB)": 369.4, "step": 27245, "train_speed(iter/s)": 0.200795 }, { "acc": 0.73395596, "epoch": 0.691273465246068, "grad_norm": 2.46875, "learning_rate": 7.794851650632693e-06, "loss": 1.06927586, "memory(GiB)": 369.4, "step": 27250, "train_speed(iter/s)": 0.200803 }, { "acc": 0.75586081, "epoch": 0.691400304414003, "grad_norm": 1.9609375, "learning_rate": 7.793982083362548e-06, "loss": 0.93068619, "memory(GiB)": 369.4, "step": 27255, "train_speed(iter/s)": 0.200809 }, { "acc": 0.75684738, "epoch": 0.6915271435819381, "grad_norm": 2.390625, "learning_rate": 7.7931123932006e-06, "loss": 0.96869612, "memory(GiB)": 369.4, "step": 27260, "train_speed(iter/s)": 0.200814 }, { "acc": 0.7416997, "epoch": 0.6916539827498731, "grad_norm": 2.171875, "learning_rate": 7.792242580185095e-06, "loss": 0.99494619, "memory(GiB)": 369.4, "step": 27265, "train_speed(iter/s)": 0.200821 }, { "acc": 0.75227494, "epoch": 0.6917808219178082, "grad_norm": 2.921875, "learning_rate": 7.791372644354295e-06, "loss": 0.96342869, "memory(GiB)": 369.4, "step": 27270, "train_speed(iter/s)": 0.200828 }, { "acc": 0.74757557, "epoch": 0.6919076610857433, "grad_norm": 2.203125, "learning_rate": 7.790502585746464e-06, "loss": 1.01078501, "memory(GiB)": 369.4, "step": 27275, "train_speed(iter/s)": 0.200835 }, { "acc": 0.75687742, "epoch": 0.6920345002536783, "grad_norm": 2.21875, "learning_rate": 7.789632404399872e-06, "loss": 0.95765476, "memory(GiB)": 369.4, "step": 27280, "train_speed(iter/s)": 0.20084 }, { "acc": 0.74673929, "epoch": 0.6921613394216134, "grad_norm": 2.5625, "learning_rate": 7.788762100352791e-06, "loss": 1.00138626, "memory(GiB)": 369.4, "step": 27285, "train_speed(iter/s)": 0.200847 }, { "acc": 0.75826855, "epoch": 0.6922881785895485, "grad_norm": 2.125, "learning_rate": 7.787891673643501e-06, "loss": 0.98738155, "memory(GiB)": 369.4, "step": 27290, "train_speed(iter/s)": 0.200852 }, { "acc": 0.74657593, "epoch": 0.6924150177574835, "grad_norm": 2.390625, "learning_rate": 7.78702112431029e-06, "loss": 1.02074642, "memory(GiB)": 369.4, "step": 27295, "train_speed(iter/s)": 0.200855 }, { "acc": 0.75150065, "epoch": 0.6925418569254186, "grad_norm": 2.0625, "learning_rate": 7.786150452391446e-06, "loss": 0.97551031, "memory(GiB)": 369.4, "step": 27300, "train_speed(iter/s)": 0.200859 }, { "acc": 0.75822959, "epoch": 0.6926686960933536, "grad_norm": 2.0625, "learning_rate": 7.785279657925265e-06, "loss": 0.95809755, "memory(GiB)": 369.4, "step": 27305, "train_speed(iter/s)": 0.200867 }, { "acc": 0.7449697, "epoch": 0.6927955352612887, "grad_norm": 2.0, "learning_rate": 7.784408740950051e-06, "loss": 1.00181255, "memory(GiB)": 369.4, "step": 27310, "train_speed(iter/s)": 0.200871 }, { "acc": 0.75331841, "epoch": 0.6929223744292238, "grad_norm": 2.15625, "learning_rate": 7.783537701504109e-06, "loss": 1.01129942, "memory(GiB)": 369.4, "step": 27315, "train_speed(iter/s)": 0.200877 }, { "acc": 0.74553413, "epoch": 0.6930492135971588, "grad_norm": 2.140625, "learning_rate": 7.782666539625749e-06, "loss": 1.03821144, "memory(GiB)": 369.4, "step": 27320, "train_speed(iter/s)": 0.200886 }, { "acc": 0.75368471, "epoch": 0.6931760527650939, "grad_norm": 2.40625, "learning_rate": 7.781795255353293e-06, "loss": 0.95220251, "memory(GiB)": 369.4, "step": 27325, "train_speed(iter/s)": 0.200892 }, { "acc": 0.75685601, "epoch": 0.693302891933029, "grad_norm": 2.1875, "learning_rate": 7.780923848725061e-06, "loss": 0.95302572, "memory(GiB)": 369.4, "step": 27330, "train_speed(iter/s)": 0.200899 }, { "acc": 0.73380251, "epoch": 0.693429731100964, "grad_norm": 2.078125, "learning_rate": 7.780052319779382e-06, "loss": 1.0324461, "memory(GiB)": 369.4, "step": 27335, "train_speed(iter/s)": 0.200905 }, { "acc": 0.74016209, "epoch": 0.693556570268899, "grad_norm": 1.90625, "learning_rate": 7.779180668554591e-06, "loss": 1.06599255, "memory(GiB)": 369.4, "step": 27340, "train_speed(iter/s)": 0.20091 }, { "acc": 0.76129971, "epoch": 0.693683409436834, "grad_norm": 1.96875, "learning_rate": 7.778308895089024e-06, "loss": 0.97491159, "memory(GiB)": 369.4, "step": 27345, "train_speed(iter/s)": 0.200915 }, { "acc": 0.76069894, "epoch": 0.6938102486047691, "grad_norm": 2.21875, "learning_rate": 7.77743699942103e-06, "loss": 0.96148767, "memory(GiB)": 369.4, "step": 27350, "train_speed(iter/s)": 0.200922 }, { "acc": 0.74910164, "epoch": 0.6939370877727042, "grad_norm": 2.265625, "learning_rate": 7.776564981588955e-06, "loss": 0.99446087, "memory(GiB)": 369.4, "step": 27355, "train_speed(iter/s)": 0.200927 }, { "acc": 0.74319735, "epoch": 0.6940639269406392, "grad_norm": 2.546875, "learning_rate": 7.775692841631154e-06, "loss": 1.09556351, "memory(GiB)": 369.4, "step": 27360, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74227638, "epoch": 0.6941907661085743, "grad_norm": 1.625, "learning_rate": 7.774820579585993e-06, "loss": 1.04410725, "memory(GiB)": 369.4, "step": 27365, "train_speed(iter/s)": 0.200941 }, { "acc": 0.75396991, "epoch": 0.6943176052765094, "grad_norm": 2.046875, "learning_rate": 7.773948195491831e-06, "loss": 0.9756424, "memory(GiB)": 369.4, "step": 27370, "train_speed(iter/s)": 0.200946 }, { "acc": 0.74620667, "epoch": 0.6944444444444444, "grad_norm": 2.3125, "learning_rate": 7.773075689387044e-06, "loss": 1.04400845, "memory(GiB)": 369.4, "step": 27375, "train_speed(iter/s)": 0.200954 }, { "acc": 0.75591297, "epoch": 0.6945712836123795, "grad_norm": 2.59375, "learning_rate": 7.772203061310008e-06, "loss": 0.98942823, "memory(GiB)": 369.4, "step": 27380, "train_speed(iter/s)": 0.200958 }, { "acc": 0.73946533, "epoch": 0.6946981227803145, "grad_norm": 2.09375, "learning_rate": 7.771330311299104e-06, "loss": 1.10485229, "memory(GiB)": 369.4, "step": 27385, "train_speed(iter/s)": 0.200965 }, { "acc": 0.73856397, "epoch": 0.6948249619482496, "grad_norm": 1.890625, "learning_rate": 7.770457439392719e-06, "loss": 1.04273052, "memory(GiB)": 369.4, "step": 27390, "train_speed(iter/s)": 0.20097 }, { "acc": 0.73171444, "epoch": 0.6949518011161847, "grad_norm": 1.796875, "learning_rate": 7.769584445629247e-06, "loss": 1.07934179, "memory(GiB)": 369.4, "step": 27395, "train_speed(iter/s)": 0.200974 }, { "acc": 0.74174705, "epoch": 0.6950786402841197, "grad_norm": 2.703125, "learning_rate": 7.768711330047087e-06, "loss": 0.98541985, "memory(GiB)": 369.4, "step": 27400, "train_speed(iter/s)": 0.200981 }, { "acc": 0.74558363, "epoch": 0.6952054794520548, "grad_norm": 2.234375, "learning_rate": 7.767838092684638e-06, "loss": 1.06515827, "memory(GiB)": 369.4, "step": 27405, "train_speed(iter/s)": 0.200982 }, { "acc": 0.76117086, "epoch": 0.6953323186199899, "grad_norm": 1.5390625, "learning_rate": 7.766964733580316e-06, "loss": 0.9396842, "memory(GiB)": 369.4, "step": 27410, "train_speed(iter/s)": 0.200984 }, { "acc": 0.74798088, "epoch": 0.6954591577879249, "grad_norm": 2.046875, "learning_rate": 7.76609125277253e-06, "loss": 1.021696, "memory(GiB)": 369.4, "step": 27415, "train_speed(iter/s)": 0.20099 }, { "acc": 0.75052547, "epoch": 0.69558599695586, "grad_norm": 2.90625, "learning_rate": 7.7652176502997e-06, "loss": 0.96987724, "memory(GiB)": 369.4, "step": 27420, "train_speed(iter/s)": 0.200993 }, { "acc": 0.74854345, "epoch": 0.695712836123795, "grad_norm": 2.078125, "learning_rate": 7.764343926200254e-06, "loss": 0.97168617, "memory(GiB)": 369.4, "step": 27425, "train_speed(iter/s)": 0.200999 }, { "acc": 0.74725046, "epoch": 0.6958396752917301, "grad_norm": 2.140625, "learning_rate": 7.763470080512617e-06, "loss": 1.04287777, "memory(GiB)": 369.4, "step": 27430, "train_speed(iter/s)": 0.201007 }, { "acc": 0.74887314, "epoch": 0.6959665144596652, "grad_norm": 2.6875, "learning_rate": 7.762596113275229e-06, "loss": 0.98466225, "memory(GiB)": 369.4, "step": 27435, "train_speed(iter/s)": 0.201013 }, { "acc": 0.7489686, "epoch": 0.6960933536276002, "grad_norm": 2.796875, "learning_rate": 7.761722024526533e-06, "loss": 1.06998625, "memory(GiB)": 369.4, "step": 27440, "train_speed(iter/s)": 0.201022 }, { "acc": 0.74843402, "epoch": 0.6962201927955353, "grad_norm": 2.03125, "learning_rate": 7.760847814304969e-06, "loss": 1.0148941, "memory(GiB)": 369.4, "step": 27445, "train_speed(iter/s)": 0.201027 }, { "acc": 0.73892574, "epoch": 0.6963470319634704, "grad_norm": 2.78125, "learning_rate": 7.759973482648992e-06, "loss": 1.00453377, "memory(GiB)": 369.4, "step": 27450, "train_speed(iter/s)": 0.201031 }, { "acc": 0.74562511, "epoch": 0.6964738711314054, "grad_norm": 1.765625, "learning_rate": 7.75909902959706e-06, "loss": 1.05934572, "memory(GiB)": 369.4, "step": 27455, "train_speed(iter/s)": 0.20104 }, { "acc": 0.7379426, "epoch": 0.6966007102993405, "grad_norm": 2.0625, "learning_rate": 7.758224455187632e-06, "loss": 1.03899174, "memory(GiB)": 369.4, "step": 27460, "train_speed(iter/s)": 0.201044 }, { "acc": 0.74029465, "epoch": 0.6967275494672754, "grad_norm": 2.375, "learning_rate": 7.75734975945918e-06, "loss": 1.07421932, "memory(GiB)": 369.4, "step": 27465, "train_speed(iter/s)": 0.201044 }, { "acc": 0.744771, "epoch": 0.6968543886352105, "grad_norm": 2.46875, "learning_rate": 7.756474942450174e-06, "loss": 1.02055321, "memory(GiB)": 369.4, "step": 27470, "train_speed(iter/s)": 0.20105 }, { "acc": 0.76022282, "epoch": 0.6969812278031456, "grad_norm": 2.5, "learning_rate": 7.755600004199094e-06, "loss": 0.96355438, "memory(GiB)": 369.4, "step": 27475, "train_speed(iter/s)": 0.201053 }, { "acc": 0.75124307, "epoch": 0.6971080669710806, "grad_norm": 2.25, "learning_rate": 7.754724944744423e-06, "loss": 1.00156498, "memory(GiB)": 369.4, "step": 27480, "train_speed(iter/s)": 0.201057 }, { "acc": 0.75576415, "epoch": 0.6972349061390157, "grad_norm": 2.296875, "learning_rate": 7.753849764124648e-06, "loss": 1.04013615, "memory(GiB)": 369.4, "step": 27485, "train_speed(iter/s)": 0.201065 }, { "acc": 0.74493866, "epoch": 0.6973617453069508, "grad_norm": 2.234375, "learning_rate": 7.752974462378268e-06, "loss": 1.02666311, "memory(GiB)": 369.4, "step": 27490, "train_speed(iter/s)": 0.20107 }, { "acc": 0.74508057, "epoch": 0.6974885844748858, "grad_norm": 2.109375, "learning_rate": 7.752099039543778e-06, "loss": 1.03376408, "memory(GiB)": 369.4, "step": 27495, "train_speed(iter/s)": 0.201076 }, { "acc": 0.74655361, "epoch": 0.6976154236428209, "grad_norm": 2.140625, "learning_rate": 7.751223495659685e-06, "loss": 0.9979166, "memory(GiB)": 369.4, "step": 27500, "train_speed(iter/s)": 0.201085 }, { "acc": 0.74749327, "epoch": 0.6977422628107559, "grad_norm": 2.09375, "learning_rate": 7.7503478307645e-06, "loss": 0.99443188, "memory(GiB)": 369.4, "step": 27505, "train_speed(iter/s)": 0.201091 }, { "acc": 0.75203705, "epoch": 0.697869101978691, "grad_norm": 1.9765625, "learning_rate": 7.74947204489674e-06, "loss": 0.97891254, "memory(GiB)": 369.4, "step": 27510, "train_speed(iter/s)": 0.201096 }, { "acc": 0.73759723, "epoch": 0.6979959411466261, "grad_norm": 2.234375, "learning_rate": 7.748596138094922e-06, "loss": 1.09635887, "memory(GiB)": 369.4, "step": 27515, "train_speed(iter/s)": 0.2011 }, { "acc": 0.720367, "epoch": 0.6981227803145611, "grad_norm": 2.3125, "learning_rate": 7.747720110397573e-06, "loss": 1.08329315, "memory(GiB)": 369.4, "step": 27520, "train_speed(iter/s)": 0.201105 }, { "acc": 0.74571633, "epoch": 0.6982496194824962, "grad_norm": 2.09375, "learning_rate": 7.746843961843226e-06, "loss": 1.04217978, "memory(GiB)": 369.4, "step": 27525, "train_speed(iter/s)": 0.20111 }, { "acc": 0.75912776, "epoch": 0.6983764586504313, "grad_norm": 2.1875, "learning_rate": 7.74596769247042e-06, "loss": 0.97877693, "memory(GiB)": 369.4, "step": 27530, "train_speed(iter/s)": 0.201115 }, { "acc": 0.74940004, "epoch": 0.6985032978183663, "grad_norm": 2.328125, "learning_rate": 7.745091302317694e-06, "loss": 1.01425819, "memory(GiB)": 369.4, "step": 27535, "train_speed(iter/s)": 0.201122 }, { "acc": 0.76252804, "epoch": 0.6986301369863014, "grad_norm": 2.171875, "learning_rate": 7.744214791423597e-06, "loss": 1.00445328, "memory(GiB)": 369.4, "step": 27540, "train_speed(iter/s)": 0.20113 }, { "acc": 0.72047319, "epoch": 0.6987569761542364, "grad_norm": 2.203125, "learning_rate": 7.74333815982668e-06, "loss": 1.05354881, "memory(GiB)": 369.4, "step": 27545, "train_speed(iter/s)": 0.201136 }, { "acc": 0.74334445, "epoch": 0.6988838153221715, "grad_norm": 1.9609375, "learning_rate": 7.742461407565504e-06, "loss": 1.00177269, "memory(GiB)": 369.4, "step": 27550, "train_speed(iter/s)": 0.201143 }, { "acc": 0.74994454, "epoch": 0.6990106544901066, "grad_norm": 1.8046875, "learning_rate": 7.741584534678632e-06, "loss": 1.01808395, "memory(GiB)": 369.4, "step": 27555, "train_speed(iter/s)": 0.201149 }, { "acc": 0.75949574, "epoch": 0.6991374936580416, "grad_norm": 1.984375, "learning_rate": 7.74070754120463e-06, "loss": 0.99777288, "memory(GiB)": 369.4, "step": 27560, "train_speed(iter/s)": 0.201154 }, { "acc": 0.74550476, "epoch": 0.6992643328259767, "grad_norm": 2.3125, "learning_rate": 7.739830427182073e-06, "loss": 1.05479183, "memory(GiB)": 369.4, "step": 27565, "train_speed(iter/s)": 0.201162 }, { "acc": 0.74401484, "epoch": 0.6993911719939118, "grad_norm": 3.640625, "learning_rate": 7.738953192649544e-06, "loss": 1.03805904, "memory(GiB)": 369.4, "step": 27570, "train_speed(iter/s)": 0.201167 }, { "acc": 0.73980761, "epoch": 0.6995180111618468, "grad_norm": 1.9921875, "learning_rate": 7.738075837645625e-06, "loss": 1.02484779, "memory(GiB)": 369.4, "step": 27575, "train_speed(iter/s)": 0.201173 }, { "acc": 0.75524187, "epoch": 0.6996448503297819, "grad_norm": 2.296875, "learning_rate": 7.737198362208904e-06, "loss": 0.97145243, "memory(GiB)": 369.4, "step": 27580, "train_speed(iter/s)": 0.201179 }, { "acc": 0.75387449, "epoch": 0.6997716894977168, "grad_norm": 2.34375, "learning_rate": 7.736320766377978e-06, "loss": 1.04197884, "memory(GiB)": 369.4, "step": 27585, "train_speed(iter/s)": 0.201183 }, { "acc": 0.73293505, "epoch": 0.6998985286656519, "grad_norm": 2.1875, "learning_rate": 7.735443050191452e-06, "loss": 0.98641014, "memory(GiB)": 369.4, "step": 27590, "train_speed(iter/s)": 0.201186 }, { "acc": 0.74945698, "epoch": 0.700025367833587, "grad_norm": 2.03125, "learning_rate": 7.734565213687923e-06, "loss": 0.97178993, "memory(GiB)": 369.4, "step": 27595, "train_speed(iter/s)": 0.201192 }, { "acc": 0.74500928, "epoch": 0.700152207001522, "grad_norm": 1.96875, "learning_rate": 7.733687256906009e-06, "loss": 1.00419474, "memory(GiB)": 369.4, "step": 27600, "train_speed(iter/s)": 0.201199 }, { "acc": 0.75145836, "epoch": 0.7002790461694571, "grad_norm": 1.84375, "learning_rate": 7.732809179884324e-06, "loss": 1.00901546, "memory(GiB)": 369.4, "step": 27605, "train_speed(iter/s)": 0.201196 }, { "acc": 0.75493894, "epoch": 0.7004058853373922, "grad_norm": 2.328125, "learning_rate": 7.73193098266149e-06, "loss": 0.97945995, "memory(GiB)": 369.4, "step": 27610, "train_speed(iter/s)": 0.201201 }, { "acc": 0.73979673, "epoch": 0.7005327245053272, "grad_norm": 2.59375, "learning_rate": 7.731052665276135e-06, "loss": 1.04544811, "memory(GiB)": 369.4, "step": 27615, "train_speed(iter/s)": 0.201208 }, { "acc": 0.76407671, "epoch": 0.7006595636732623, "grad_norm": 2.359375, "learning_rate": 7.730174227766892e-06, "loss": 0.97417393, "memory(GiB)": 369.4, "step": 27620, "train_speed(iter/s)": 0.201213 }, { "acc": 0.7563796, "epoch": 0.7007864028411973, "grad_norm": 2.578125, "learning_rate": 7.729295670172394e-06, "loss": 1.00471191, "memory(GiB)": 369.4, "step": 27625, "train_speed(iter/s)": 0.201217 }, { "acc": 0.75110207, "epoch": 0.7009132420091324, "grad_norm": 2.453125, "learning_rate": 7.728416992531287e-06, "loss": 1.01562166, "memory(GiB)": 369.4, "step": 27630, "train_speed(iter/s)": 0.201223 }, { "acc": 0.75274076, "epoch": 0.7010400811770675, "grad_norm": 2.5, "learning_rate": 7.72753819488222e-06, "loss": 1.02672901, "memory(GiB)": 369.4, "step": 27635, "train_speed(iter/s)": 0.20123 }, { "acc": 0.75046325, "epoch": 0.7011669203450025, "grad_norm": 2.03125, "learning_rate": 7.726659277263848e-06, "loss": 0.98598137, "memory(GiB)": 369.4, "step": 27640, "train_speed(iter/s)": 0.201235 }, { "acc": 0.74453254, "epoch": 0.7012937595129376, "grad_norm": 3.421875, "learning_rate": 7.725780239714824e-06, "loss": 1.03251534, "memory(GiB)": 369.4, "step": 27645, "train_speed(iter/s)": 0.201239 }, { "acc": 0.75167999, "epoch": 0.7014205986808727, "grad_norm": 3.46875, "learning_rate": 7.724901082273817e-06, "loss": 0.97376919, "memory(GiB)": 369.4, "step": 27650, "train_speed(iter/s)": 0.201246 }, { "acc": 0.74853458, "epoch": 0.7015474378488077, "grad_norm": 2.453125, "learning_rate": 7.724021804979493e-06, "loss": 0.99715986, "memory(GiB)": 369.4, "step": 27655, "train_speed(iter/s)": 0.20125 }, { "acc": 0.75399323, "epoch": 0.7016742770167428, "grad_norm": 2.015625, "learning_rate": 7.723142407870532e-06, "loss": 1.02482557, "memory(GiB)": 369.4, "step": 27660, "train_speed(iter/s)": 0.201254 }, { "acc": 0.76200047, "epoch": 0.7018011161846778, "grad_norm": 2.15625, "learning_rate": 7.722262890985605e-06, "loss": 1.03522987, "memory(GiB)": 369.4, "step": 27665, "train_speed(iter/s)": 0.201262 }, { "acc": 0.74617758, "epoch": 0.7019279553526129, "grad_norm": 2.296875, "learning_rate": 7.721383254363407e-06, "loss": 1.01843624, "memory(GiB)": 369.4, "step": 27670, "train_speed(iter/s)": 0.201267 }, { "acc": 0.76683788, "epoch": 0.702054794520548, "grad_norm": 2.03125, "learning_rate": 7.72050349804262e-06, "loss": 0.93249664, "memory(GiB)": 369.4, "step": 27675, "train_speed(iter/s)": 0.201271 }, { "acc": 0.75060177, "epoch": 0.702181633688483, "grad_norm": 1.9140625, "learning_rate": 7.719623622061943e-06, "loss": 0.96981993, "memory(GiB)": 369.4, "step": 27680, "train_speed(iter/s)": 0.201275 }, { "acc": 0.73978009, "epoch": 0.7023084728564181, "grad_norm": 2.1875, "learning_rate": 7.718743626460076e-06, "loss": 1.02753696, "memory(GiB)": 369.4, "step": 27685, "train_speed(iter/s)": 0.201281 }, { "acc": 0.76156044, "epoch": 0.7024353120243532, "grad_norm": 2.015625, "learning_rate": 7.71786351127573e-06, "loss": 0.95315018, "memory(GiB)": 369.4, "step": 27690, "train_speed(iter/s)": 0.201285 }, { "acc": 0.75059261, "epoch": 0.7025621511922882, "grad_norm": 2.140625, "learning_rate": 7.71698327654761e-06, "loss": 0.96843891, "memory(GiB)": 369.4, "step": 27695, "train_speed(iter/s)": 0.201282 }, { "acc": 0.73893828, "epoch": 0.7026889903602233, "grad_norm": 2.375, "learning_rate": 7.716102922314435e-06, "loss": 1.0146492, "memory(GiB)": 369.4, "step": 27700, "train_speed(iter/s)": 0.201286 }, { "acc": 0.7346911, "epoch": 0.7028158295281582, "grad_norm": 2.15625, "learning_rate": 7.715222448614926e-06, "loss": 1.06607819, "memory(GiB)": 369.4, "step": 27705, "train_speed(iter/s)": 0.201291 }, { "acc": 0.74826317, "epoch": 0.7029426686960933, "grad_norm": 2.609375, "learning_rate": 7.714341855487812e-06, "loss": 1.02443333, "memory(GiB)": 369.4, "step": 27710, "train_speed(iter/s)": 0.201297 }, { "acc": 0.73715434, "epoch": 0.7030695078640284, "grad_norm": 1.8828125, "learning_rate": 7.713461142971824e-06, "loss": 1.00638828, "memory(GiB)": 369.4, "step": 27715, "train_speed(iter/s)": 0.201304 }, { "acc": 0.75402069, "epoch": 0.7031963470319634, "grad_norm": 2.046875, "learning_rate": 7.712580311105701e-06, "loss": 1.02130604, "memory(GiB)": 369.4, "step": 27720, "train_speed(iter/s)": 0.201307 }, { "acc": 0.74173565, "epoch": 0.7033231861998985, "grad_norm": 2.109375, "learning_rate": 7.711699359928184e-06, "loss": 0.98166513, "memory(GiB)": 369.4, "step": 27725, "train_speed(iter/s)": 0.201312 }, { "acc": 0.76001177, "epoch": 0.7034500253678336, "grad_norm": 2.59375, "learning_rate": 7.710818289478024e-06, "loss": 0.95777874, "memory(GiB)": 369.4, "step": 27730, "train_speed(iter/s)": 0.20132 }, { "acc": 0.75531235, "epoch": 0.7035768645357686, "grad_norm": 2.21875, "learning_rate": 7.709937099793971e-06, "loss": 0.98264008, "memory(GiB)": 369.4, "step": 27735, "train_speed(iter/s)": 0.201326 }, { "acc": 0.73736625, "epoch": 0.7037037037037037, "grad_norm": 2.375, "learning_rate": 7.709055790914787e-06, "loss": 1.03418646, "memory(GiB)": 369.4, "step": 27740, "train_speed(iter/s)": 0.201333 }, { "acc": 0.75279646, "epoch": 0.7038305428716387, "grad_norm": 2.203125, "learning_rate": 7.708174362879234e-06, "loss": 0.98201542, "memory(GiB)": 369.4, "step": 27745, "train_speed(iter/s)": 0.201335 }, { "acc": 0.73606153, "epoch": 0.7039573820395738, "grad_norm": 2.09375, "learning_rate": 7.70729281572608e-06, "loss": 1.06645641, "memory(GiB)": 369.4, "step": 27750, "train_speed(iter/s)": 0.201342 }, { "acc": 0.73301878, "epoch": 0.7040842212075089, "grad_norm": 2.03125, "learning_rate": 7.706411149494102e-06, "loss": 1.07094393, "memory(GiB)": 369.4, "step": 27755, "train_speed(iter/s)": 0.201348 }, { "acc": 0.75422363, "epoch": 0.7042110603754439, "grad_norm": 2.5, "learning_rate": 7.705529364222079e-06, "loss": 1.03911743, "memory(GiB)": 369.4, "step": 27760, "train_speed(iter/s)": 0.201354 }, { "acc": 0.74453664, "epoch": 0.704337899543379, "grad_norm": 2.34375, "learning_rate": 7.704647459948793e-06, "loss": 1.00440092, "memory(GiB)": 369.4, "step": 27765, "train_speed(iter/s)": 0.201363 }, { "acc": 0.74322658, "epoch": 0.7044647387113141, "grad_norm": 2.078125, "learning_rate": 7.703765436713038e-06, "loss": 0.98064194, "memory(GiB)": 369.4, "step": 27770, "train_speed(iter/s)": 0.20137 }, { "acc": 0.76373329, "epoch": 0.7045915778792491, "grad_norm": 2.328125, "learning_rate": 7.702883294553607e-06, "loss": 0.94988441, "memory(GiB)": 369.4, "step": 27775, "train_speed(iter/s)": 0.201374 }, { "acc": 0.75450497, "epoch": 0.7047184170471842, "grad_norm": 2.03125, "learning_rate": 7.702001033509302e-06, "loss": 0.95754528, "memory(GiB)": 369.4, "step": 27780, "train_speed(iter/s)": 0.201379 }, { "acc": 0.73670797, "epoch": 0.7048452562151192, "grad_norm": 1.7578125, "learning_rate": 7.701118653618927e-06, "loss": 1.0305831, "memory(GiB)": 369.4, "step": 27785, "train_speed(iter/s)": 0.201386 }, { "acc": 0.73381691, "epoch": 0.7049720953830543, "grad_norm": 2.296875, "learning_rate": 7.700236154921294e-06, "loss": 1.00705147, "memory(GiB)": 369.4, "step": 27790, "train_speed(iter/s)": 0.201393 }, { "acc": 0.73477874, "epoch": 0.7050989345509894, "grad_norm": 1.875, "learning_rate": 7.699353537455222e-06, "loss": 1.0301302, "memory(GiB)": 369.4, "step": 27795, "train_speed(iter/s)": 0.201398 }, { "acc": 0.74396486, "epoch": 0.7052257737189244, "grad_norm": 2.140625, "learning_rate": 7.698470801259526e-06, "loss": 1.03471699, "memory(GiB)": 369.4, "step": 27800, "train_speed(iter/s)": 0.201403 }, { "acc": 0.74712772, "epoch": 0.7053526128868595, "grad_norm": 2.25, "learning_rate": 7.697587946373037e-06, "loss": 1.03374939, "memory(GiB)": 369.4, "step": 27805, "train_speed(iter/s)": 0.201409 }, { "acc": 0.74697461, "epoch": 0.7054794520547946, "grad_norm": 1.9296875, "learning_rate": 7.696704972834589e-06, "loss": 0.97704372, "memory(GiB)": 369.4, "step": 27810, "train_speed(iter/s)": 0.201414 }, { "acc": 0.74407921, "epoch": 0.7056062912227296, "grad_norm": 2.015625, "learning_rate": 7.695821880683012e-06, "loss": 1.02461262, "memory(GiB)": 369.4, "step": 27815, "train_speed(iter/s)": 0.201421 }, { "acc": 0.74883566, "epoch": 0.7057331303906647, "grad_norm": 2.453125, "learning_rate": 7.694938669957156e-06, "loss": 0.98587093, "memory(GiB)": 369.4, "step": 27820, "train_speed(iter/s)": 0.201426 }, { "acc": 0.75019507, "epoch": 0.7058599695585996, "grad_norm": 2.359375, "learning_rate": 7.694055340695862e-06, "loss": 0.99527721, "memory(GiB)": 369.4, "step": 27825, "train_speed(iter/s)": 0.201429 }, { "acc": 0.74513583, "epoch": 0.7059868087265347, "grad_norm": 2.765625, "learning_rate": 7.693171892937991e-06, "loss": 1.03444786, "memory(GiB)": 369.4, "step": 27830, "train_speed(iter/s)": 0.201435 }, { "acc": 0.7470787, "epoch": 0.7061136478944698, "grad_norm": 2.078125, "learning_rate": 7.692288326722393e-06, "loss": 1.05122585, "memory(GiB)": 369.4, "step": 27835, "train_speed(iter/s)": 0.201438 }, { "acc": 0.74075332, "epoch": 0.7062404870624048, "grad_norm": 2.6875, "learning_rate": 7.691404642087933e-06, "loss": 0.98703823, "memory(GiB)": 369.4, "step": 27840, "train_speed(iter/s)": 0.20144 }, { "acc": 0.75986996, "epoch": 0.7063673262303399, "grad_norm": 2.375, "learning_rate": 7.690520839073484e-06, "loss": 0.98031101, "memory(GiB)": 369.4, "step": 27845, "train_speed(iter/s)": 0.201444 }, { "acc": 0.74916878, "epoch": 0.706494165398275, "grad_norm": 2.25, "learning_rate": 7.689636917717913e-06, "loss": 1.01415157, "memory(GiB)": 369.4, "step": 27850, "train_speed(iter/s)": 0.20145 }, { "acc": 0.75404758, "epoch": 0.70662100456621, "grad_norm": 2.109375, "learning_rate": 7.688752878060103e-06, "loss": 0.98468227, "memory(GiB)": 369.4, "step": 27855, "train_speed(iter/s)": 0.201455 }, { "acc": 0.74250193, "epoch": 0.7067478437341451, "grad_norm": 1.9453125, "learning_rate": 7.687868720138939e-06, "loss": 0.98994427, "memory(GiB)": 369.4, "step": 27860, "train_speed(iter/s)": 0.201463 }, { "acc": 0.73684039, "epoch": 0.7068746829020801, "grad_norm": 2.5625, "learning_rate": 7.686984443993304e-06, "loss": 1.05219517, "memory(GiB)": 369.4, "step": 27865, "train_speed(iter/s)": 0.20147 }, { "acc": 0.73983874, "epoch": 0.7070015220700152, "grad_norm": 2.21875, "learning_rate": 7.6861000496621e-06, "loss": 1.01994257, "memory(GiB)": 369.4, "step": 27870, "train_speed(iter/s)": 0.201475 }, { "acc": 0.74669743, "epoch": 0.7071283612379503, "grad_norm": 2.3125, "learning_rate": 7.685215537184223e-06, "loss": 1.06275406, "memory(GiB)": 369.4, "step": 27875, "train_speed(iter/s)": 0.20148 }, { "acc": 0.7507349, "epoch": 0.7072552004058853, "grad_norm": 2.328125, "learning_rate": 7.684330906598577e-06, "loss": 1.07692795, "memory(GiB)": 369.4, "step": 27880, "train_speed(iter/s)": 0.201486 }, { "acc": 0.75425172, "epoch": 0.7073820395738204, "grad_norm": 2.046875, "learning_rate": 7.683446157944075e-06, "loss": 0.98509254, "memory(GiB)": 369.4, "step": 27885, "train_speed(iter/s)": 0.201487 }, { "acc": 0.73741522, "epoch": 0.7075088787417555, "grad_norm": 2.1875, "learning_rate": 7.682561291259628e-06, "loss": 1.05218687, "memory(GiB)": 369.4, "step": 27890, "train_speed(iter/s)": 0.20149 }, { "acc": 0.74839644, "epoch": 0.7076357179096905, "grad_norm": 1.9609375, "learning_rate": 7.681676306584159e-06, "loss": 0.97888088, "memory(GiB)": 369.4, "step": 27895, "train_speed(iter/s)": 0.201495 }, { "acc": 0.74344397, "epoch": 0.7077625570776256, "grad_norm": 2.171875, "learning_rate": 7.680791203956594e-06, "loss": 0.98183136, "memory(GiB)": 369.4, "step": 27900, "train_speed(iter/s)": 0.201501 }, { "acc": 0.73815317, "epoch": 0.7078893962455606, "grad_norm": 2.140625, "learning_rate": 7.679905983415861e-06, "loss": 1.0326189, "memory(GiB)": 369.4, "step": 27905, "train_speed(iter/s)": 0.201506 }, { "acc": 0.75676947, "epoch": 0.7080162354134957, "grad_norm": 2.015625, "learning_rate": 7.6790206450009e-06, "loss": 0.99171743, "memory(GiB)": 369.4, "step": 27910, "train_speed(iter/s)": 0.201512 }, { "acc": 0.7391499, "epoch": 0.7081430745814308, "grad_norm": 2.15625, "learning_rate": 7.678135188750648e-06, "loss": 1.02836132, "memory(GiB)": 369.4, "step": 27915, "train_speed(iter/s)": 0.201515 }, { "acc": 0.74900208, "epoch": 0.7082699137493658, "grad_norm": 2.0625, "learning_rate": 7.677249614704057e-06, "loss": 0.97846432, "memory(GiB)": 369.4, "step": 27920, "train_speed(iter/s)": 0.20152 }, { "acc": 0.75543671, "epoch": 0.7083967529173009, "grad_norm": 1.9453125, "learning_rate": 7.676363922900073e-06, "loss": 0.97555037, "memory(GiB)": 369.4, "step": 27925, "train_speed(iter/s)": 0.201525 }, { "acc": 0.73823271, "epoch": 0.708523592085236, "grad_norm": 2.34375, "learning_rate": 7.675478113377653e-06, "loss": 1.07480507, "memory(GiB)": 369.4, "step": 27930, "train_speed(iter/s)": 0.201527 }, { "acc": 0.74655929, "epoch": 0.708650431253171, "grad_norm": 2.3125, "learning_rate": 7.674592186175762e-06, "loss": 0.97328987, "memory(GiB)": 369.4, "step": 27935, "train_speed(iter/s)": 0.201532 }, { "acc": 0.74707518, "epoch": 0.708777270421106, "grad_norm": 1.8671875, "learning_rate": 7.673706141333365e-06, "loss": 0.94100399, "memory(GiB)": 369.4, "step": 27940, "train_speed(iter/s)": 0.201535 }, { "acc": 0.74136658, "epoch": 0.708904109589041, "grad_norm": 2.046875, "learning_rate": 7.672819978889435e-06, "loss": 1.0374609, "memory(GiB)": 369.4, "step": 27945, "train_speed(iter/s)": 0.201542 }, { "acc": 0.74320478, "epoch": 0.7090309487569761, "grad_norm": 2.375, "learning_rate": 7.67193369888295e-06, "loss": 1.04357681, "memory(GiB)": 369.4, "step": 27950, "train_speed(iter/s)": 0.201547 }, { "acc": 0.7500206, "epoch": 0.7091577879249112, "grad_norm": 2.328125, "learning_rate": 7.67104730135289e-06, "loss": 0.97274523, "memory(GiB)": 369.4, "step": 27955, "train_speed(iter/s)": 0.20155 }, { "acc": 0.75172663, "epoch": 0.7092846270928462, "grad_norm": 2.15625, "learning_rate": 7.670160786338246e-06, "loss": 1.02916498, "memory(GiB)": 369.4, "step": 27960, "train_speed(iter/s)": 0.201555 }, { "acc": 0.74808517, "epoch": 0.7094114662607813, "grad_norm": 2.234375, "learning_rate": 7.669274153878006e-06, "loss": 0.99890594, "memory(GiB)": 369.4, "step": 27965, "train_speed(iter/s)": 0.201562 }, { "acc": 0.73662338, "epoch": 0.7095383054287164, "grad_norm": 2.140625, "learning_rate": 7.668387404011176e-06, "loss": 1.06491184, "memory(GiB)": 369.4, "step": 27970, "train_speed(iter/s)": 0.201569 }, { "acc": 0.73421268, "epoch": 0.7096651445966514, "grad_norm": 2.28125, "learning_rate": 7.667500536776748e-06, "loss": 1.06007071, "memory(GiB)": 369.4, "step": 27975, "train_speed(iter/s)": 0.201576 }, { "acc": 0.73725319, "epoch": 0.7097919837645865, "grad_norm": 1.9765625, "learning_rate": 7.666613552213742e-06, "loss": 1.02419643, "memory(GiB)": 369.4, "step": 27980, "train_speed(iter/s)": 0.201581 }, { "acc": 0.75884128, "epoch": 0.7099188229325215, "grad_norm": 2.34375, "learning_rate": 7.665726450361165e-06, "loss": 0.98255863, "memory(GiB)": 369.4, "step": 27985, "train_speed(iter/s)": 0.201585 }, { "acc": 0.74854631, "epoch": 0.7100456621004566, "grad_norm": 1.8125, "learning_rate": 7.664839231258036e-06, "loss": 0.99342146, "memory(GiB)": 369.4, "step": 27990, "train_speed(iter/s)": 0.201586 }, { "acc": 0.74807491, "epoch": 0.7101725012683917, "grad_norm": 2.375, "learning_rate": 7.663951894943383e-06, "loss": 1.0200243, "memory(GiB)": 369.4, "step": 27995, "train_speed(iter/s)": 0.201593 }, { "acc": 0.73633709, "epoch": 0.7102993404363267, "grad_norm": 2.375, "learning_rate": 7.66306444145623e-06, "loss": 1.01676636, "memory(GiB)": 369.4, "step": 28000, "train_speed(iter/s)": 0.2016 }, { "epoch": 0.7102993404363267, "eval_acc": 0.7365346285233972, "eval_loss": 0.9763129353523254, "eval_runtime": 385.0514, "eval_samples_per_second": 16.543, "eval_steps_per_second": 8.272, "step": 28000 }, { "acc": 0.73356686, "epoch": 0.7104261796042618, "grad_norm": 2.453125, "learning_rate": 7.662176870835614e-06, "loss": 1.02287617, "memory(GiB)": 369.4, "step": 28005, "train_speed(iter/s)": 0.200577 }, { "acc": 0.76162987, "epoch": 0.7105530187721969, "grad_norm": 2.09375, "learning_rate": 7.661289183120572e-06, "loss": 0.9409914, "memory(GiB)": 369.4, "step": 28010, "train_speed(iter/s)": 0.200579 }, { "acc": 0.74903989, "epoch": 0.7106798579401319, "grad_norm": 2.46875, "learning_rate": 7.66040137835015e-06, "loss": 1.02781734, "memory(GiB)": 369.4, "step": 28015, "train_speed(iter/s)": 0.200582 }, { "acc": 0.72456484, "epoch": 0.710806697108067, "grad_norm": 1.8828125, "learning_rate": 7.659513456563399e-06, "loss": 1.04057522, "memory(GiB)": 369.4, "step": 28020, "train_speed(iter/s)": 0.200583 }, { "acc": 0.75665264, "epoch": 0.710933536276002, "grad_norm": 2.09375, "learning_rate": 7.658625417799372e-06, "loss": 1.00328655, "memory(GiB)": 369.4, "step": 28025, "train_speed(iter/s)": 0.200588 }, { "acc": 0.75030031, "epoch": 0.7110603754439371, "grad_norm": 1.859375, "learning_rate": 7.657737262097128e-06, "loss": 0.98143158, "memory(GiB)": 369.4, "step": 28030, "train_speed(iter/s)": 0.200592 }, { "acc": 0.75220242, "epoch": 0.7111872146118722, "grad_norm": 2.046875, "learning_rate": 7.656848989495733e-06, "loss": 1.01392536, "memory(GiB)": 369.4, "step": 28035, "train_speed(iter/s)": 0.200597 }, { "acc": 0.75481443, "epoch": 0.7113140537798072, "grad_norm": 2.171875, "learning_rate": 7.65596060003426e-06, "loss": 1.03046103, "memory(GiB)": 369.4, "step": 28040, "train_speed(iter/s)": 0.200598 }, { "acc": 0.74375682, "epoch": 0.7114408929477423, "grad_norm": 3.0625, "learning_rate": 7.655072093751779e-06, "loss": 1.06455498, "memory(GiB)": 369.4, "step": 28045, "train_speed(iter/s)": 0.200603 }, { "acc": 0.74534588, "epoch": 0.7115677321156774, "grad_norm": 2.40625, "learning_rate": 7.654183470687375e-06, "loss": 1.01343555, "memory(GiB)": 369.4, "step": 28050, "train_speed(iter/s)": 0.200606 }, { "acc": 0.75634012, "epoch": 0.7116945712836124, "grad_norm": 2.171875, "learning_rate": 7.653294730880131e-06, "loss": 0.98743944, "memory(GiB)": 369.4, "step": 28055, "train_speed(iter/s)": 0.200613 }, { "acc": 0.75832834, "epoch": 0.7118214104515475, "grad_norm": 2.515625, "learning_rate": 7.65240587436914e-06, "loss": 0.91944237, "memory(GiB)": 369.4, "step": 28060, "train_speed(iter/s)": 0.20062 }, { "acc": 0.74433479, "epoch": 0.7119482496194824, "grad_norm": 2.171875, "learning_rate": 7.651516901193494e-06, "loss": 1.04129496, "memory(GiB)": 369.4, "step": 28065, "train_speed(iter/s)": 0.200626 }, { "acc": 0.75189219, "epoch": 0.7120750887874175, "grad_norm": 2.3125, "learning_rate": 7.650627811392298e-06, "loss": 0.98153992, "memory(GiB)": 369.4, "step": 28070, "train_speed(iter/s)": 0.200632 }, { "acc": 0.7319797, "epoch": 0.7122019279553526, "grad_norm": 2.171875, "learning_rate": 7.649738605004658e-06, "loss": 1.03745861, "memory(GiB)": 369.4, "step": 28075, "train_speed(iter/s)": 0.200639 }, { "acc": 0.75161767, "epoch": 0.7123287671232876, "grad_norm": 2.09375, "learning_rate": 7.648849282069682e-06, "loss": 0.93331671, "memory(GiB)": 369.4, "step": 28080, "train_speed(iter/s)": 0.200642 }, { "acc": 0.74888363, "epoch": 0.7124556062912227, "grad_norm": 1.9140625, "learning_rate": 7.647959842626489e-06, "loss": 1.04810276, "memory(GiB)": 369.4, "step": 28085, "train_speed(iter/s)": 0.200647 }, { "acc": 0.73447022, "epoch": 0.7125824454591578, "grad_norm": 2.109375, "learning_rate": 7.6470702867142e-06, "loss": 1.02522545, "memory(GiB)": 369.4, "step": 28090, "train_speed(iter/s)": 0.20065 }, { "acc": 0.7350049, "epoch": 0.7127092846270928, "grad_norm": 2.1875, "learning_rate": 7.646180614371941e-06, "loss": 1.01393061, "memory(GiB)": 369.4, "step": 28095, "train_speed(iter/s)": 0.200656 }, { "acc": 0.74945631, "epoch": 0.7128361237950279, "grad_norm": 2.78125, "learning_rate": 7.645290825638845e-06, "loss": 1.0557106, "memory(GiB)": 369.4, "step": 28100, "train_speed(iter/s)": 0.200663 }, { "acc": 0.74667139, "epoch": 0.7129629629629629, "grad_norm": 2.5625, "learning_rate": 7.644400920554048e-06, "loss": 1.00007858, "memory(GiB)": 369.4, "step": 28105, "train_speed(iter/s)": 0.200668 }, { "acc": 0.7458003, "epoch": 0.713089802130898, "grad_norm": 1.8671875, "learning_rate": 7.64351089915669e-06, "loss": 1.04137506, "memory(GiB)": 369.4, "step": 28110, "train_speed(iter/s)": 0.20067 }, { "acc": 0.76931953, "epoch": 0.7132166412988331, "grad_norm": 2.15625, "learning_rate": 7.642620761485921e-06, "loss": 0.91181316, "memory(GiB)": 369.4, "step": 28115, "train_speed(iter/s)": 0.200676 }, { "acc": 0.75061245, "epoch": 0.7133434804667681, "grad_norm": 2.109375, "learning_rate": 7.641730507580896e-06, "loss": 1.03050957, "memory(GiB)": 369.4, "step": 28120, "train_speed(iter/s)": 0.20068 }, { "acc": 0.73580942, "epoch": 0.7134703196347032, "grad_norm": 1.984375, "learning_rate": 7.640840137480763e-06, "loss": 1.06221838, "memory(GiB)": 369.4, "step": 28125, "train_speed(iter/s)": 0.200684 }, { "acc": 0.74965901, "epoch": 0.7135971588026383, "grad_norm": 2.0, "learning_rate": 7.639949651224697e-06, "loss": 0.96625462, "memory(GiB)": 369.4, "step": 28130, "train_speed(iter/s)": 0.20069 }, { "acc": 0.73812251, "epoch": 0.7137239979705733, "grad_norm": 2.09375, "learning_rate": 7.639059048851853e-06, "loss": 1.05322428, "memory(GiB)": 369.4, "step": 28135, "train_speed(iter/s)": 0.200695 }, { "acc": 0.75706882, "epoch": 0.7138508371385084, "grad_norm": 2.03125, "learning_rate": 7.638168330401412e-06, "loss": 0.93409328, "memory(GiB)": 369.4, "step": 28140, "train_speed(iter/s)": 0.200702 }, { "acc": 0.75336399, "epoch": 0.7139776763064434, "grad_norm": 2.4375, "learning_rate": 7.637277495912548e-06, "loss": 0.968538, "memory(GiB)": 369.4, "step": 28145, "train_speed(iter/s)": 0.200707 }, { "acc": 0.751331, "epoch": 0.7141045154743785, "grad_norm": 1.984375, "learning_rate": 7.636386545424447e-06, "loss": 0.95566254, "memory(GiB)": 369.4, "step": 28150, "train_speed(iter/s)": 0.200712 }, { "acc": 0.73941994, "epoch": 0.7142313546423136, "grad_norm": 2.015625, "learning_rate": 7.635495478976294e-06, "loss": 1.02294998, "memory(GiB)": 369.4, "step": 28155, "train_speed(iter/s)": 0.200716 }, { "acc": 0.74362817, "epoch": 0.7143581938102486, "grad_norm": 2.578125, "learning_rate": 7.634604296607285e-06, "loss": 1.08690434, "memory(GiB)": 369.4, "step": 28160, "train_speed(iter/s)": 0.200722 }, { "acc": 0.75726156, "epoch": 0.7144850329781837, "grad_norm": 2.0, "learning_rate": 7.633712998356612e-06, "loss": 0.93804579, "memory(GiB)": 369.4, "step": 28165, "train_speed(iter/s)": 0.200727 }, { "acc": 0.74032445, "epoch": 0.7146118721461188, "grad_norm": 2.4375, "learning_rate": 7.632821584263486e-06, "loss": 1.01265211, "memory(GiB)": 369.4, "step": 28170, "train_speed(iter/s)": 0.200731 }, { "acc": 0.75874405, "epoch": 0.7147387113140538, "grad_norm": 2.0625, "learning_rate": 7.631930054367112e-06, "loss": 0.9986412, "memory(GiB)": 369.4, "step": 28175, "train_speed(iter/s)": 0.200738 }, { "acc": 0.75160036, "epoch": 0.7148655504819889, "grad_norm": 1.984375, "learning_rate": 7.631038408706703e-06, "loss": 1.01340904, "memory(GiB)": 369.4, "step": 28180, "train_speed(iter/s)": 0.200745 }, { "acc": 0.74280128, "epoch": 0.7149923896499238, "grad_norm": 2.328125, "learning_rate": 7.630146647321476e-06, "loss": 0.97714291, "memory(GiB)": 369.4, "step": 28185, "train_speed(iter/s)": 0.200748 }, { "acc": 0.75013361, "epoch": 0.7151192288178589, "grad_norm": 1.90625, "learning_rate": 7.62925477025066e-06, "loss": 1.04133682, "memory(GiB)": 369.4, "step": 28190, "train_speed(iter/s)": 0.200754 }, { "acc": 0.73441744, "epoch": 0.715246067985794, "grad_norm": 2.078125, "learning_rate": 7.628362777533479e-06, "loss": 1.05883541, "memory(GiB)": 369.4, "step": 28195, "train_speed(iter/s)": 0.20076 }, { "acc": 0.74797487, "epoch": 0.715372907153729, "grad_norm": 2.1875, "learning_rate": 7.627470669209169e-06, "loss": 1.00886631, "memory(GiB)": 369.4, "step": 28200, "train_speed(iter/s)": 0.200767 }, { "acc": 0.75411263, "epoch": 0.7154997463216641, "grad_norm": 2.15625, "learning_rate": 7.626578445316968e-06, "loss": 0.97097111, "memory(GiB)": 369.4, "step": 28205, "train_speed(iter/s)": 0.200773 }, { "acc": 0.74965076, "epoch": 0.7156265854895992, "grad_norm": 2.140625, "learning_rate": 7.62568610589612e-06, "loss": 1.02861099, "memory(GiB)": 369.4, "step": 28210, "train_speed(iter/s)": 0.200781 }, { "acc": 0.74787312, "epoch": 0.7157534246575342, "grad_norm": 2.671875, "learning_rate": 7.624793650985873e-06, "loss": 1.00995121, "memory(GiB)": 369.4, "step": 28215, "train_speed(iter/s)": 0.200787 }, { "acc": 0.73809137, "epoch": 0.7158802638254693, "grad_norm": 2.21875, "learning_rate": 7.6239010806254835e-06, "loss": 1.02337418, "memory(GiB)": 369.4, "step": 28220, "train_speed(iter/s)": 0.200792 }, { "acc": 0.7416729, "epoch": 0.7160071029934043, "grad_norm": 2.234375, "learning_rate": 7.6230083948542084e-06, "loss": 1.01355858, "memory(GiB)": 369.4, "step": 28225, "train_speed(iter/s)": 0.200797 }, { "acc": 0.76105251, "epoch": 0.7161339421613394, "grad_norm": 2.015625, "learning_rate": 7.622115593711314e-06, "loss": 0.93396988, "memory(GiB)": 369.4, "step": 28230, "train_speed(iter/s)": 0.200802 }, { "acc": 0.74771543, "epoch": 0.7162607813292745, "grad_norm": 1.7734375, "learning_rate": 7.62122267723607e-06, "loss": 0.97252541, "memory(GiB)": 369.4, "step": 28235, "train_speed(iter/s)": 0.200806 }, { "acc": 0.7568768, "epoch": 0.7163876204972095, "grad_norm": 2.046875, "learning_rate": 7.620329645467748e-06, "loss": 0.94529781, "memory(GiB)": 369.4, "step": 28240, "train_speed(iter/s)": 0.200808 }, { "acc": 0.72942142, "epoch": 0.7165144596651446, "grad_norm": 2.359375, "learning_rate": 7.61943649844563e-06, "loss": 1.1119545, "memory(GiB)": 369.4, "step": 28245, "train_speed(iter/s)": 0.200816 }, { "acc": 0.73712702, "epoch": 0.7166412988330797, "grad_norm": 2.09375, "learning_rate": 7.618543236209001e-06, "loss": 1.05982227, "memory(GiB)": 369.4, "step": 28250, "train_speed(iter/s)": 0.200821 }, { "acc": 0.74413042, "epoch": 0.7167681380010147, "grad_norm": 2.15625, "learning_rate": 7.617649858797147e-06, "loss": 1.03669691, "memory(GiB)": 369.4, "step": 28255, "train_speed(iter/s)": 0.200828 }, { "acc": 0.7493804, "epoch": 0.7168949771689498, "grad_norm": 2.171875, "learning_rate": 7.616756366249367e-06, "loss": 1.01018486, "memory(GiB)": 369.4, "step": 28260, "train_speed(iter/s)": 0.200831 }, { "acc": 0.73756094, "epoch": 0.7170218163368848, "grad_norm": 2.046875, "learning_rate": 7.6158627586049586e-06, "loss": 1.0540535, "memory(GiB)": 369.4, "step": 28265, "train_speed(iter/s)": 0.200836 }, { "acc": 0.75993204, "epoch": 0.7171486555048199, "grad_norm": 2.015625, "learning_rate": 7.614969035903228e-06, "loss": 0.97281895, "memory(GiB)": 369.4, "step": 28270, "train_speed(iter/s)": 0.20084 }, { "acc": 0.75670075, "epoch": 0.717275494672755, "grad_norm": 2.0, "learning_rate": 7.614075198183482e-06, "loss": 0.9291811, "memory(GiB)": 369.4, "step": 28275, "train_speed(iter/s)": 0.200845 }, { "acc": 0.75052152, "epoch": 0.71740233384069, "grad_norm": 2.90625, "learning_rate": 7.6131812454850406e-06, "loss": 0.98820286, "memory(GiB)": 369.4, "step": 28280, "train_speed(iter/s)": 0.20085 }, { "acc": 0.74285316, "epoch": 0.7175291730086251, "grad_norm": 1.9921875, "learning_rate": 7.612287177847219e-06, "loss": 1.01777134, "memory(GiB)": 369.4, "step": 28285, "train_speed(iter/s)": 0.200854 }, { "acc": 0.75424447, "epoch": 0.7176560121765602, "grad_norm": 1.9765625, "learning_rate": 7.611392995309345e-06, "loss": 0.98719425, "memory(GiB)": 369.4, "step": 28290, "train_speed(iter/s)": 0.20086 }, { "acc": 0.75060968, "epoch": 0.7177828513444952, "grad_norm": 2.734375, "learning_rate": 7.610498697910748e-06, "loss": 0.98564415, "memory(GiB)": 369.4, "step": 28295, "train_speed(iter/s)": 0.200859 }, { "acc": 0.74695649, "epoch": 0.7179096905124303, "grad_norm": 2.0, "learning_rate": 7.609604285690762e-06, "loss": 1.0136816, "memory(GiB)": 369.4, "step": 28300, "train_speed(iter/s)": 0.200865 }, { "acc": 0.75884976, "epoch": 0.7180365296803652, "grad_norm": 2.3125, "learning_rate": 7.608709758688731e-06, "loss": 0.96308994, "memory(GiB)": 369.4, "step": 28305, "train_speed(iter/s)": 0.200869 }, { "acc": 0.74285173, "epoch": 0.7181633688483003, "grad_norm": 2.125, "learning_rate": 7.607815116943995e-06, "loss": 0.98970184, "memory(GiB)": 369.4, "step": 28310, "train_speed(iter/s)": 0.200875 }, { "acc": 0.75361881, "epoch": 0.7182902080162354, "grad_norm": 1.859375, "learning_rate": 7.606920360495908e-06, "loss": 0.96202631, "memory(GiB)": 369.4, "step": 28315, "train_speed(iter/s)": 0.200881 }, { "acc": 0.73835588, "epoch": 0.7184170471841704, "grad_norm": 1.984375, "learning_rate": 7.6060254893838255e-06, "loss": 1.04239979, "memory(GiB)": 369.4, "step": 28320, "train_speed(iter/s)": 0.200885 }, { "acc": 0.73677206, "epoch": 0.7185438863521055, "grad_norm": 2.125, "learning_rate": 7.6051305036471065e-06, "loss": 1.02221479, "memory(GiB)": 369.4, "step": 28325, "train_speed(iter/s)": 0.200892 }, { "acc": 0.76264038, "epoch": 0.7186707255200406, "grad_norm": 2.296875, "learning_rate": 7.604235403325117e-06, "loss": 0.93891134, "memory(GiB)": 369.4, "step": 28330, "train_speed(iter/s)": 0.200894 }, { "acc": 0.73877325, "epoch": 0.7187975646879756, "grad_norm": 2.03125, "learning_rate": 7.603340188457227e-06, "loss": 1.02464037, "memory(GiB)": 369.4, "step": 28335, "train_speed(iter/s)": 0.200901 }, { "acc": 0.74392004, "epoch": 0.7189244038559107, "grad_norm": 2.0625, "learning_rate": 7.602444859082814e-06, "loss": 1.0364996, "memory(GiB)": 369.4, "step": 28340, "train_speed(iter/s)": 0.200906 }, { "acc": 0.74797726, "epoch": 0.7190512430238457, "grad_norm": 2.34375, "learning_rate": 7.601549415241254e-06, "loss": 0.98717318, "memory(GiB)": 369.4, "step": 28345, "train_speed(iter/s)": 0.200909 }, { "acc": 0.74510069, "epoch": 0.7191780821917808, "grad_norm": 2.4375, "learning_rate": 7.6006538569719375e-06, "loss": 0.99040003, "memory(GiB)": 369.4, "step": 28350, "train_speed(iter/s)": 0.200909 }, { "acc": 0.736759, "epoch": 0.7193049213597159, "grad_norm": 2.21875, "learning_rate": 7.599758184314252e-06, "loss": 1.01139164, "memory(GiB)": 369.4, "step": 28355, "train_speed(iter/s)": 0.200914 }, { "acc": 0.74940872, "epoch": 0.7194317605276509, "grad_norm": 2.375, "learning_rate": 7.598862397307596e-06, "loss": 1.01930246, "memory(GiB)": 369.4, "step": 28360, "train_speed(iter/s)": 0.200919 }, { "acc": 0.77476754, "epoch": 0.719558599695586, "grad_norm": 1.9921875, "learning_rate": 7.597966495991368e-06, "loss": 0.91496487, "memory(GiB)": 369.4, "step": 28365, "train_speed(iter/s)": 0.200926 }, { "acc": 0.75593495, "epoch": 0.7196854388635211, "grad_norm": 2.046875, "learning_rate": 7.597070480404974e-06, "loss": 0.91926384, "memory(GiB)": 369.4, "step": 28370, "train_speed(iter/s)": 0.20093 }, { "acc": 0.75068865, "epoch": 0.7198122780314561, "grad_norm": 1.90625, "learning_rate": 7.596174350587826e-06, "loss": 0.96407614, "memory(GiB)": 369.4, "step": 28375, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74166403, "epoch": 0.7199391171993912, "grad_norm": 2.03125, "learning_rate": 7.595278106579339e-06, "loss": 1.00196314, "memory(GiB)": 369.4, "step": 28380, "train_speed(iter/s)": 0.200939 }, { "acc": 0.75675354, "epoch": 0.7200659563673262, "grad_norm": 2.28125, "learning_rate": 7.594381748418933e-06, "loss": 0.97512283, "memory(GiB)": 369.4, "step": 28385, "train_speed(iter/s)": 0.200942 }, { "acc": 0.74941025, "epoch": 0.7201927955352613, "grad_norm": 1.8671875, "learning_rate": 7.593485276146035e-06, "loss": 0.96635313, "memory(GiB)": 369.4, "step": 28390, "train_speed(iter/s)": 0.20095 }, { "acc": 0.76004171, "epoch": 0.7203196347031964, "grad_norm": 3.078125, "learning_rate": 7.592588689800077e-06, "loss": 0.98774776, "memory(GiB)": 369.4, "step": 28395, "train_speed(iter/s)": 0.200956 }, { "acc": 0.74079442, "epoch": 0.7204464738711314, "grad_norm": 2.234375, "learning_rate": 7.591691989420491e-06, "loss": 1.0334938, "memory(GiB)": 369.4, "step": 28400, "train_speed(iter/s)": 0.200964 }, { "acc": 0.75852861, "epoch": 0.7205733130390665, "grad_norm": 2.375, "learning_rate": 7.590795175046721e-06, "loss": 0.96856298, "memory(GiB)": 369.4, "step": 28405, "train_speed(iter/s)": 0.200967 }, { "acc": 0.74742732, "epoch": 0.7207001522070016, "grad_norm": 2.34375, "learning_rate": 7.5898982467182125e-06, "loss": 0.95695438, "memory(GiB)": 369.4, "step": 28410, "train_speed(iter/s)": 0.200971 }, { "acc": 0.73750563, "epoch": 0.7208269913749366, "grad_norm": 2.625, "learning_rate": 7.589001204474416e-06, "loss": 1.01585464, "memory(GiB)": 369.4, "step": 28415, "train_speed(iter/s)": 0.200973 }, { "acc": 0.73099251, "epoch": 0.7209538305428717, "grad_norm": 2.125, "learning_rate": 7.588104048354787e-06, "loss": 1.0160284, "memory(GiB)": 369.4, "step": 28420, "train_speed(iter/s)": 0.20097 }, { "acc": 0.75215645, "epoch": 0.7210806697108066, "grad_norm": 2.15625, "learning_rate": 7.587206778398788e-06, "loss": 0.9838954, "memory(GiB)": 369.4, "step": 28425, "train_speed(iter/s)": 0.200976 }, { "acc": 0.76070685, "epoch": 0.7212075088787417, "grad_norm": 2.71875, "learning_rate": 7.586309394645882e-06, "loss": 0.95251503, "memory(GiB)": 369.4, "step": 28430, "train_speed(iter/s)": 0.200984 }, { "acc": 0.75941734, "epoch": 0.7213343480466768, "grad_norm": 2.140625, "learning_rate": 7.585411897135544e-06, "loss": 0.92191381, "memory(GiB)": 369.4, "step": 28435, "train_speed(iter/s)": 0.200989 }, { "acc": 0.75097108, "epoch": 0.7214611872146118, "grad_norm": 2.515625, "learning_rate": 7.584514285907245e-06, "loss": 0.95306206, "memory(GiB)": 369.4, "step": 28440, "train_speed(iter/s)": 0.200986 }, { "acc": 0.75647721, "epoch": 0.7215880263825469, "grad_norm": 2.234375, "learning_rate": 7.583616561000471e-06, "loss": 0.97318916, "memory(GiB)": 369.4, "step": 28445, "train_speed(iter/s)": 0.200992 }, { "acc": 0.74123669, "epoch": 0.721714865550482, "grad_norm": 1.9296875, "learning_rate": 7.582718722454705e-06, "loss": 1.01374693, "memory(GiB)": 369.4, "step": 28450, "train_speed(iter/s)": 0.200996 }, { "acc": 0.74422832, "epoch": 0.721841704718417, "grad_norm": 2.03125, "learning_rate": 7.581820770309438e-06, "loss": 1.03772163, "memory(GiB)": 369.4, "step": 28455, "train_speed(iter/s)": 0.201001 }, { "acc": 0.7637907, "epoch": 0.7219685438863521, "grad_norm": 2.578125, "learning_rate": 7.580922704604168e-06, "loss": 0.95293236, "memory(GiB)": 369.4, "step": 28460, "train_speed(iter/s)": 0.201002 }, { "acc": 0.73450441, "epoch": 0.7220953830542871, "grad_norm": 2.171875, "learning_rate": 7.5800245253783935e-06, "loss": 1.03954678, "memory(GiB)": 369.4, "step": 28465, "train_speed(iter/s)": 0.201009 }, { "acc": 0.73097901, "epoch": 0.7222222222222222, "grad_norm": 2.265625, "learning_rate": 7.579126232671621e-06, "loss": 1.01825562, "memory(GiB)": 369.4, "step": 28470, "train_speed(iter/s)": 0.201014 }, { "acc": 0.75200386, "epoch": 0.7223490613901573, "grad_norm": 2.5625, "learning_rate": 7.578227826523361e-06, "loss": 1.04920731, "memory(GiB)": 369.4, "step": 28475, "train_speed(iter/s)": 0.20102 }, { "acc": 0.76021695, "epoch": 0.7224759005580923, "grad_norm": 2.046875, "learning_rate": 7.577329306973132e-06, "loss": 0.99438906, "memory(GiB)": 369.4, "step": 28480, "train_speed(iter/s)": 0.201023 }, { "acc": 0.75411139, "epoch": 0.7226027397260274, "grad_norm": 2.21875, "learning_rate": 7.576430674060452e-06, "loss": 0.97431984, "memory(GiB)": 369.4, "step": 28485, "train_speed(iter/s)": 0.201028 }, { "acc": 0.73708735, "epoch": 0.7227295788939625, "grad_norm": 2.078125, "learning_rate": 7.575531927824849e-06, "loss": 1.03609333, "memory(GiB)": 369.4, "step": 28490, "train_speed(iter/s)": 0.201033 }, { "acc": 0.7507153, "epoch": 0.7228564180618975, "grad_norm": 2.390625, "learning_rate": 7.574633068305852e-06, "loss": 1.04662895, "memory(GiB)": 369.4, "step": 28495, "train_speed(iter/s)": 0.201037 }, { "acc": 0.74940333, "epoch": 0.7229832572298326, "grad_norm": 2.734375, "learning_rate": 7.5737340955429995e-06, "loss": 0.99388123, "memory(GiB)": 369.4, "step": 28500, "train_speed(iter/s)": 0.201041 }, { "acc": 0.76095095, "epoch": 0.7231100963977676, "grad_norm": 2.03125, "learning_rate": 7.572835009575828e-06, "loss": 0.92192631, "memory(GiB)": 369.4, "step": 28505, "train_speed(iter/s)": 0.201045 }, { "acc": 0.75090065, "epoch": 0.7232369355657027, "grad_norm": 2.609375, "learning_rate": 7.571935810443886e-06, "loss": 1.01746979, "memory(GiB)": 369.4, "step": 28510, "train_speed(iter/s)": 0.201052 }, { "acc": 0.75236931, "epoch": 0.7233637747336378, "grad_norm": 1.8515625, "learning_rate": 7.571036498186727e-06, "loss": 0.98081465, "memory(GiB)": 369.4, "step": 28515, "train_speed(iter/s)": 0.201058 }, { "acc": 0.74186983, "epoch": 0.7234906139015728, "grad_norm": 2.34375, "learning_rate": 7.570137072843902e-06, "loss": 1.02973804, "memory(GiB)": 369.4, "step": 28520, "train_speed(iter/s)": 0.201064 }, { "acc": 0.73252287, "epoch": 0.7236174530695079, "grad_norm": 2.3125, "learning_rate": 7.569237534454974e-06, "loss": 1.02595158, "memory(GiB)": 369.4, "step": 28525, "train_speed(iter/s)": 0.20107 }, { "acc": 0.73064394, "epoch": 0.723744292237443, "grad_norm": 2.140625, "learning_rate": 7.568337883059509e-06, "loss": 1.06132936, "memory(GiB)": 369.4, "step": 28530, "train_speed(iter/s)": 0.201073 }, { "acc": 0.74651022, "epoch": 0.723871131405378, "grad_norm": 2.328125, "learning_rate": 7.567438118697077e-06, "loss": 1.03018465, "memory(GiB)": 369.4, "step": 28535, "train_speed(iter/s)": 0.201076 }, { "acc": 0.75070863, "epoch": 0.723997970573313, "grad_norm": 2.453125, "learning_rate": 7.566538241407253e-06, "loss": 0.95091782, "memory(GiB)": 369.4, "step": 28540, "train_speed(iter/s)": 0.201081 }, { "acc": 0.73926725, "epoch": 0.724124809741248, "grad_norm": 1.96875, "learning_rate": 7.565638251229617e-06, "loss": 0.99806576, "memory(GiB)": 369.4, "step": 28545, "train_speed(iter/s)": 0.201086 }, { "acc": 0.75515633, "epoch": 0.7242516489091831, "grad_norm": 2.125, "learning_rate": 7.5647381482037585e-06, "loss": 0.94927406, "memory(GiB)": 369.4, "step": 28550, "train_speed(iter/s)": 0.201093 }, { "acc": 0.75150404, "epoch": 0.7243784880771182, "grad_norm": 2.015625, "learning_rate": 7.563837932369264e-06, "loss": 0.99867496, "memory(GiB)": 369.4, "step": 28555, "train_speed(iter/s)": 0.201096 }, { "acc": 0.74914236, "epoch": 0.7245053272450532, "grad_norm": 2.09375, "learning_rate": 7.562937603765732e-06, "loss": 0.96972027, "memory(GiB)": 369.4, "step": 28560, "train_speed(iter/s)": 0.201098 }, { "acc": 0.74745255, "epoch": 0.7246321664129883, "grad_norm": 2.109375, "learning_rate": 7.562037162432761e-06, "loss": 0.9932375, "memory(GiB)": 369.4, "step": 28565, "train_speed(iter/s)": 0.201101 }, { "acc": 0.73842077, "epoch": 0.7247590055809234, "grad_norm": 2.328125, "learning_rate": 7.561136608409956e-06, "loss": 0.97806187, "memory(GiB)": 369.4, "step": 28570, "train_speed(iter/s)": 0.201105 }, { "acc": 0.72503572, "epoch": 0.7248858447488584, "grad_norm": 2.140625, "learning_rate": 7.560235941736929e-06, "loss": 1.11544399, "memory(GiB)": 369.4, "step": 28575, "train_speed(iter/s)": 0.201108 }, { "acc": 0.73836503, "epoch": 0.7250126839167935, "grad_norm": 2.21875, "learning_rate": 7.559335162453294e-06, "loss": 1.04944744, "memory(GiB)": 369.4, "step": 28580, "train_speed(iter/s)": 0.201116 }, { "acc": 0.75540648, "epoch": 0.7251395230847285, "grad_norm": 2.90625, "learning_rate": 7.558434270598672e-06, "loss": 0.97701206, "memory(GiB)": 369.4, "step": 28585, "train_speed(iter/s)": 0.201118 }, { "acc": 0.74930353, "epoch": 0.7252663622526636, "grad_norm": 2.21875, "learning_rate": 7.5575332662126885e-06, "loss": 0.98118019, "memory(GiB)": 369.4, "step": 28590, "train_speed(iter/s)": 0.201124 }, { "acc": 0.74843259, "epoch": 0.7253932014205987, "grad_norm": 2.046875, "learning_rate": 7.556632149334975e-06, "loss": 0.96762781, "memory(GiB)": 369.4, "step": 28595, "train_speed(iter/s)": 0.201129 }, { "acc": 0.75237036, "epoch": 0.7255200405885337, "grad_norm": 1.9375, "learning_rate": 7.555730920005163e-06, "loss": 0.94417267, "memory(GiB)": 369.4, "step": 28600, "train_speed(iter/s)": 0.201133 }, { "acc": 0.72158532, "epoch": 0.7256468797564688, "grad_norm": 2.578125, "learning_rate": 7.554829578262894e-06, "loss": 1.079181, "memory(GiB)": 369.4, "step": 28605, "train_speed(iter/s)": 0.201139 }, { "acc": 0.7467679, "epoch": 0.7257737189244039, "grad_norm": 2.78125, "learning_rate": 7.5539281241478155e-06, "loss": 1.04416332, "memory(GiB)": 369.4, "step": 28610, "train_speed(iter/s)": 0.201145 }, { "acc": 0.75763779, "epoch": 0.7259005580923389, "grad_norm": 1.9453125, "learning_rate": 7.5530265576995756e-06, "loss": 0.94119644, "memory(GiB)": 369.4, "step": 28615, "train_speed(iter/s)": 0.20115 }, { "acc": 0.73387814, "epoch": 0.726027397260274, "grad_norm": 1.984375, "learning_rate": 7.552124878957829e-06, "loss": 1.02869253, "memory(GiB)": 369.4, "step": 28620, "train_speed(iter/s)": 0.201154 }, { "acc": 0.74848018, "epoch": 0.726154236428209, "grad_norm": 2.28125, "learning_rate": 7.551223087962234e-06, "loss": 1.02287588, "memory(GiB)": 369.4, "step": 28625, "train_speed(iter/s)": 0.201157 }, { "acc": 0.76138115, "epoch": 0.7262810755961441, "grad_norm": 2.03125, "learning_rate": 7.55032118475246e-06, "loss": 0.95037432, "memory(GiB)": 369.4, "step": 28630, "train_speed(iter/s)": 0.20116 }, { "acc": 0.74259243, "epoch": 0.7264079147640792, "grad_norm": 1.875, "learning_rate": 7.549419169368171e-06, "loss": 1.03233328, "memory(GiB)": 369.4, "step": 28635, "train_speed(iter/s)": 0.201165 }, { "acc": 0.74123769, "epoch": 0.7265347539320142, "grad_norm": 2.234375, "learning_rate": 7.548517041849048e-06, "loss": 1.02667351, "memory(GiB)": 369.4, "step": 28640, "train_speed(iter/s)": 0.201173 }, { "acc": 0.75140018, "epoch": 0.7266615930999493, "grad_norm": 1.9921875, "learning_rate": 7.547614802234764e-06, "loss": 0.9868845, "memory(GiB)": 369.4, "step": 28645, "train_speed(iter/s)": 0.201179 }, { "acc": 0.76170354, "epoch": 0.7267884322678844, "grad_norm": 2.015625, "learning_rate": 7.546712450565008e-06, "loss": 1.02011452, "memory(GiB)": 369.4, "step": 28650, "train_speed(iter/s)": 0.201183 }, { "acc": 0.75811129, "epoch": 0.7269152714358194, "grad_norm": 1.7421875, "learning_rate": 7.545809986879469e-06, "loss": 0.92061157, "memory(GiB)": 369.4, "step": 28655, "train_speed(iter/s)": 0.201186 }, { "acc": 0.74785404, "epoch": 0.7270421106037545, "grad_norm": 2.609375, "learning_rate": 7.5449074112178385e-06, "loss": 1.00812149, "memory(GiB)": 369.4, "step": 28660, "train_speed(iter/s)": 0.201191 }, { "acc": 0.7491394, "epoch": 0.7271689497716894, "grad_norm": 2.375, "learning_rate": 7.54400472361982e-06, "loss": 1.06311588, "memory(GiB)": 369.4, "step": 28665, "train_speed(iter/s)": 0.201199 }, { "acc": 0.73765917, "epoch": 0.7272957889396245, "grad_norm": 2.09375, "learning_rate": 7.543101924125115e-06, "loss": 1.010499, "memory(GiB)": 369.4, "step": 28670, "train_speed(iter/s)": 0.201203 }, { "acc": 0.7438097, "epoch": 0.7274226281075596, "grad_norm": 2.375, "learning_rate": 7.542199012773432e-06, "loss": 1.04554558, "memory(GiB)": 369.4, "step": 28675, "train_speed(iter/s)": 0.201209 }, { "acc": 0.7451838, "epoch": 0.7275494672754946, "grad_norm": 2.21875, "learning_rate": 7.541295989604488e-06, "loss": 1.04278812, "memory(GiB)": 369.4, "step": 28680, "train_speed(iter/s)": 0.201216 }, { "acc": 0.75466986, "epoch": 0.7276763064434297, "grad_norm": 2.375, "learning_rate": 7.540392854657999e-06, "loss": 0.96357632, "memory(GiB)": 369.4, "step": 28685, "train_speed(iter/s)": 0.201219 }, { "acc": 0.75491037, "epoch": 0.7278031456113648, "grad_norm": 2.65625, "learning_rate": 7.539489607973691e-06, "loss": 1.00107288, "memory(GiB)": 369.4, "step": 28690, "train_speed(iter/s)": 0.201225 }, { "acc": 0.73886247, "epoch": 0.7279299847792998, "grad_norm": 2.59375, "learning_rate": 7.5385862495912905e-06, "loss": 1.07563314, "memory(GiB)": 369.4, "step": 28695, "train_speed(iter/s)": 0.20123 }, { "acc": 0.74580679, "epoch": 0.7280568239472349, "grad_norm": 2.578125, "learning_rate": 7.537682779550537e-06, "loss": 1.02705555, "memory(GiB)": 369.4, "step": 28700, "train_speed(iter/s)": 0.201234 }, { "acc": 0.7334693, "epoch": 0.7281836631151699, "grad_norm": 2.296875, "learning_rate": 7.536779197891159e-06, "loss": 1.03527489, "memory(GiB)": 369.4, "step": 28705, "train_speed(iter/s)": 0.201239 }, { "acc": 0.75682936, "epoch": 0.728310502283105, "grad_norm": 2.015625, "learning_rate": 7.535875504652912e-06, "loss": 0.98776798, "memory(GiB)": 369.4, "step": 28710, "train_speed(iter/s)": 0.201246 }, { "acc": 0.74663668, "epoch": 0.7284373414510401, "grad_norm": 2.109375, "learning_rate": 7.534971699875534e-06, "loss": 1.02946911, "memory(GiB)": 369.4, "step": 28715, "train_speed(iter/s)": 0.20125 }, { "acc": 0.7497468, "epoch": 0.7285641806189751, "grad_norm": 2.125, "learning_rate": 7.534067783598784e-06, "loss": 0.98386555, "memory(GiB)": 369.4, "step": 28720, "train_speed(iter/s)": 0.201255 }, { "acc": 0.73455772, "epoch": 0.7286910197869102, "grad_norm": 2.09375, "learning_rate": 7.533163755862419e-06, "loss": 1.01640339, "memory(GiB)": 369.4, "step": 28725, "train_speed(iter/s)": 0.201258 }, { "acc": 0.73869286, "epoch": 0.7288178589548453, "grad_norm": 2.0625, "learning_rate": 7.5322596167062035e-06, "loss": 1.02789707, "memory(GiB)": 369.4, "step": 28730, "train_speed(iter/s)": 0.201263 }, { "acc": 0.75415392, "epoch": 0.7289446981227803, "grad_norm": 2.015625, "learning_rate": 7.5313553661699035e-06, "loss": 0.9702179, "memory(GiB)": 369.4, "step": 28735, "train_speed(iter/s)": 0.201266 }, { "acc": 0.76689129, "epoch": 0.7290715372907154, "grad_norm": 1.9765625, "learning_rate": 7.530451004293292e-06, "loss": 0.93418293, "memory(GiB)": 369.4, "step": 28740, "train_speed(iter/s)": 0.201271 }, { "acc": 0.74641895, "epoch": 0.7291983764586504, "grad_norm": 2.078125, "learning_rate": 7.5295465311161485e-06, "loss": 0.98483715, "memory(GiB)": 369.4, "step": 28745, "train_speed(iter/s)": 0.201276 }, { "acc": 0.75556574, "epoch": 0.7293252156265855, "grad_norm": 2.109375, "learning_rate": 7.5286419466782546e-06, "loss": 0.96321449, "memory(GiB)": 369.4, "step": 28750, "train_speed(iter/s)": 0.201283 }, { "acc": 0.75129004, "epoch": 0.7294520547945206, "grad_norm": 4.1875, "learning_rate": 7.527737251019399e-06, "loss": 1.02060814, "memory(GiB)": 369.4, "step": 28755, "train_speed(iter/s)": 0.201291 }, { "acc": 0.72903614, "epoch": 0.7295788939624556, "grad_norm": 2.4375, "learning_rate": 7.526832444179373e-06, "loss": 1.06324558, "memory(GiB)": 369.4, "step": 28760, "train_speed(iter/s)": 0.201296 }, { "acc": 0.75993137, "epoch": 0.7297057331303907, "grad_norm": 2.359375, "learning_rate": 7.525927526197974e-06, "loss": 1.02767658, "memory(GiB)": 369.4, "step": 28765, "train_speed(iter/s)": 0.201302 }, { "acc": 0.73790073, "epoch": 0.7298325722983258, "grad_norm": 1.9921875, "learning_rate": 7.5250224971150065e-06, "loss": 1.0476284, "memory(GiB)": 369.4, "step": 28770, "train_speed(iter/s)": 0.201305 }, { "acc": 0.75834208, "epoch": 0.7299594114662608, "grad_norm": 1.921875, "learning_rate": 7.524117356970275e-06, "loss": 0.97412949, "memory(GiB)": 369.4, "step": 28775, "train_speed(iter/s)": 0.20131 }, { "acc": 0.74754744, "epoch": 0.7300862506341959, "grad_norm": 2.359375, "learning_rate": 7.523212105803594e-06, "loss": 1.06938801, "memory(GiB)": 369.4, "step": 28780, "train_speed(iter/s)": 0.201315 }, { "acc": 0.74895582, "epoch": 0.7302130898021308, "grad_norm": 2.5, "learning_rate": 7.522306743654777e-06, "loss": 0.99124718, "memory(GiB)": 369.4, "step": 28785, "train_speed(iter/s)": 0.201322 }, { "acc": 0.75512552, "epoch": 0.7303399289700659, "grad_norm": 2.078125, "learning_rate": 7.521401270563651e-06, "loss": 1.03112793, "memory(GiB)": 369.4, "step": 28790, "train_speed(iter/s)": 0.201327 }, { "acc": 0.73561368, "epoch": 0.730466768138001, "grad_norm": 2.15625, "learning_rate": 7.520495686570037e-06, "loss": 1.00079212, "memory(GiB)": 369.4, "step": 28795, "train_speed(iter/s)": 0.201332 }, { "acc": 0.75667262, "epoch": 0.730593607305936, "grad_norm": 2.234375, "learning_rate": 7.5195899917137716e-06, "loss": 0.96503325, "memory(GiB)": 369.4, "step": 28800, "train_speed(iter/s)": 0.201338 }, { "acc": 0.74183035, "epoch": 0.7307204464738711, "grad_norm": 2.625, "learning_rate": 7.518684186034688e-06, "loss": 0.96855183, "memory(GiB)": 369.4, "step": 28805, "train_speed(iter/s)": 0.201341 }, { "acc": 0.7578598, "epoch": 0.7308472856418062, "grad_norm": 1.7421875, "learning_rate": 7.51777826957263e-06, "loss": 0.98241196, "memory(GiB)": 369.4, "step": 28810, "train_speed(iter/s)": 0.201347 }, { "acc": 0.7473774, "epoch": 0.7309741248097412, "grad_norm": 2.265625, "learning_rate": 7.516872242367441e-06, "loss": 0.94795341, "memory(GiB)": 369.4, "step": 28815, "train_speed(iter/s)": 0.20135 }, { "acc": 0.75086861, "epoch": 0.7311009639776763, "grad_norm": 2.28125, "learning_rate": 7.5159661044589745e-06, "loss": 1.05210114, "memory(GiB)": 369.4, "step": 28820, "train_speed(iter/s)": 0.201352 }, { "acc": 0.75535574, "epoch": 0.7312278031456113, "grad_norm": 2.796875, "learning_rate": 7.515059855887087e-06, "loss": 0.99343929, "memory(GiB)": 369.4, "step": 28825, "train_speed(iter/s)": 0.201355 }, { "acc": 0.74906306, "epoch": 0.7313546423135464, "grad_norm": 2.421875, "learning_rate": 7.514153496691636e-06, "loss": 0.96166668, "memory(GiB)": 369.4, "step": 28830, "train_speed(iter/s)": 0.201358 }, { "acc": 0.75602813, "epoch": 0.7314814814814815, "grad_norm": 2.1875, "learning_rate": 7.513247026912491e-06, "loss": 0.92787418, "memory(GiB)": 369.4, "step": 28835, "train_speed(iter/s)": 0.201362 }, { "acc": 0.75438042, "epoch": 0.7316083206494165, "grad_norm": 2.5, "learning_rate": 7.512340446589521e-06, "loss": 0.98873167, "memory(GiB)": 369.4, "step": 28840, "train_speed(iter/s)": 0.201367 }, { "acc": 0.74770107, "epoch": 0.7317351598173516, "grad_norm": 1.71875, "learning_rate": 7.5114337557625985e-06, "loss": 0.99351368, "memory(GiB)": 369.4, "step": 28845, "train_speed(iter/s)": 0.201373 }, { "acc": 0.77472172, "epoch": 0.7318619989852867, "grad_norm": 2.125, "learning_rate": 7.510526954471611e-06, "loss": 0.899265, "memory(GiB)": 369.4, "step": 28850, "train_speed(iter/s)": 0.201377 }, { "acc": 0.74407582, "epoch": 0.7319888381532217, "grad_norm": 2.375, "learning_rate": 7.509620042756436e-06, "loss": 1.02864084, "memory(GiB)": 369.4, "step": 28855, "train_speed(iter/s)": 0.201378 }, { "acc": 0.73744888, "epoch": 0.7321156773211568, "grad_norm": 2.34375, "learning_rate": 7.508713020656968e-06, "loss": 1.03869572, "memory(GiB)": 369.4, "step": 28860, "train_speed(iter/s)": 0.201381 }, { "acc": 0.74496121, "epoch": 0.7322425164890918, "grad_norm": 2.203125, "learning_rate": 7.5078058882131e-06, "loss": 1.03551559, "memory(GiB)": 369.4, "step": 28865, "train_speed(iter/s)": 0.201384 }, { "acc": 0.7431416, "epoch": 0.7323693556570269, "grad_norm": 2.0625, "learning_rate": 7.506898645464733e-06, "loss": 1.0317255, "memory(GiB)": 369.4, "step": 28870, "train_speed(iter/s)": 0.20139 }, { "acc": 0.756703, "epoch": 0.732496194824962, "grad_norm": 2.203125, "learning_rate": 7.505991292451772e-06, "loss": 0.94000473, "memory(GiB)": 369.4, "step": 28875, "train_speed(iter/s)": 0.201395 }, { "acc": 0.74481421, "epoch": 0.732623033992897, "grad_norm": 1.703125, "learning_rate": 7.505083829214125e-06, "loss": 1.04805899, "memory(GiB)": 369.4, "step": 28880, "train_speed(iter/s)": 0.201402 }, { "acc": 0.73632898, "epoch": 0.7327498731608321, "grad_norm": 1.8125, "learning_rate": 7.5041762557917065e-06, "loss": 1.06884813, "memory(GiB)": 369.4, "step": 28885, "train_speed(iter/s)": 0.201405 }, { "acc": 0.7450057, "epoch": 0.7328767123287672, "grad_norm": 2.0625, "learning_rate": 7.5032685722244355e-06, "loss": 1.05621986, "memory(GiB)": 369.4, "step": 28890, "train_speed(iter/s)": 0.201409 }, { "acc": 0.74097729, "epoch": 0.7330035514967022, "grad_norm": 2.375, "learning_rate": 7.502360778552238e-06, "loss": 0.99876089, "memory(GiB)": 369.4, "step": 28895, "train_speed(iter/s)": 0.201416 }, { "acc": 0.73833113, "epoch": 0.7331303906646373, "grad_norm": 2.15625, "learning_rate": 7.5014528748150405e-06, "loss": 1.01840706, "memory(GiB)": 369.4, "step": 28900, "train_speed(iter/s)": 0.201419 }, { "acc": 0.7439846, "epoch": 0.7332572298325722, "grad_norm": 2.3125, "learning_rate": 7.5005448610527765e-06, "loss": 0.98077469, "memory(GiB)": 369.4, "step": 28905, "train_speed(iter/s)": 0.201425 }, { "acc": 0.7568325, "epoch": 0.7333840690005073, "grad_norm": 1.875, "learning_rate": 7.499636737305386e-06, "loss": 0.96400394, "memory(GiB)": 369.4, "step": 28910, "train_speed(iter/s)": 0.201431 }, { "acc": 0.76379614, "epoch": 0.7335109081684424, "grad_norm": 2.4375, "learning_rate": 7.498728503612811e-06, "loss": 0.96160126, "memory(GiB)": 369.4, "step": 28915, "train_speed(iter/s)": 0.201437 }, { "acc": 0.75080776, "epoch": 0.7336377473363774, "grad_norm": 2.5, "learning_rate": 7.497820160015002e-06, "loss": 1.00911198, "memory(GiB)": 369.4, "step": 28920, "train_speed(iter/s)": 0.201442 }, { "acc": 0.74239349, "epoch": 0.7337645865043125, "grad_norm": 2.0, "learning_rate": 7.496911706551908e-06, "loss": 1.00973873, "memory(GiB)": 369.4, "step": 28925, "train_speed(iter/s)": 0.201447 }, { "acc": 0.76007838, "epoch": 0.7338914256722476, "grad_norm": 2.296875, "learning_rate": 7.496003143263492e-06, "loss": 0.95652657, "memory(GiB)": 369.4, "step": 28930, "train_speed(iter/s)": 0.201454 }, { "acc": 0.7386147, "epoch": 0.7340182648401826, "grad_norm": 2.140625, "learning_rate": 7.495094470189712e-06, "loss": 1.04775534, "memory(GiB)": 369.4, "step": 28935, "train_speed(iter/s)": 0.201457 }, { "acc": 0.72666988, "epoch": 0.7341451040081177, "grad_norm": 2.6875, "learning_rate": 7.4941856873705376e-06, "loss": 1.11164722, "memory(GiB)": 369.4, "step": 28940, "train_speed(iter/s)": 0.201462 }, { "acc": 0.74361939, "epoch": 0.7342719431760527, "grad_norm": 1.96875, "learning_rate": 7.493276794845941e-06, "loss": 1.02880898, "memory(GiB)": 369.4, "step": 28945, "train_speed(iter/s)": 0.201468 }, { "acc": 0.7581131, "epoch": 0.7343987823439878, "grad_norm": 2.1875, "learning_rate": 7.4923677926559005e-06, "loss": 0.95734434, "memory(GiB)": 369.4, "step": 28950, "train_speed(iter/s)": 0.201473 }, { "acc": 0.7503232, "epoch": 0.7345256215119229, "grad_norm": 2.125, "learning_rate": 7.491458680840396e-06, "loss": 1.00798922, "memory(GiB)": 369.4, "step": 28955, "train_speed(iter/s)": 0.20148 }, { "acc": 0.74262943, "epoch": 0.7346524606798579, "grad_norm": 2.140625, "learning_rate": 7.490549459439415e-06, "loss": 1.02422581, "memory(GiB)": 369.4, "step": 28960, "train_speed(iter/s)": 0.201483 }, { "acc": 0.74676304, "epoch": 0.734779299847793, "grad_norm": 1.7109375, "learning_rate": 7.48964012849295e-06, "loss": 0.99599743, "memory(GiB)": 369.4, "step": 28965, "train_speed(iter/s)": 0.20149 }, { "acc": 0.74445972, "epoch": 0.7349061390157281, "grad_norm": 2.1875, "learning_rate": 7.488730688040995e-06, "loss": 0.98214703, "memory(GiB)": 369.4, "step": 28970, "train_speed(iter/s)": 0.201493 }, { "acc": 0.74703522, "epoch": 0.7350329781836631, "grad_norm": 2.453125, "learning_rate": 7.487821138123554e-06, "loss": 1.03697395, "memory(GiB)": 369.4, "step": 28975, "train_speed(iter/s)": 0.201497 }, { "acc": 0.76895351, "epoch": 0.7351598173515982, "grad_norm": 2.15625, "learning_rate": 7.486911478780633e-06, "loss": 0.93770523, "memory(GiB)": 369.4, "step": 28980, "train_speed(iter/s)": 0.2015 }, { "acc": 0.75363598, "epoch": 0.7352866565195332, "grad_norm": 2.09375, "learning_rate": 7.4860017100522395e-06, "loss": 0.97808647, "memory(GiB)": 369.4, "step": 28985, "train_speed(iter/s)": 0.201507 }, { "acc": 0.74319258, "epoch": 0.7354134956874683, "grad_norm": 2.609375, "learning_rate": 7.485091831978394e-06, "loss": 1.00872021, "memory(GiB)": 369.4, "step": 28990, "train_speed(iter/s)": 0.201512 }, { "acc": 0.74502754, "epoch": 0.7355403348554034, "grad_norm": 2.015625, "learning_rate": 7.484181844599113e-06, "loss": 1.04379272, "memory(GiB)": 369.4, "step": 28995, "train_speed(iter/s)": 0.20152 }, { "acc": 0.75643187, "epoch": 0.7356671740233384, "grad_norm": 2.171875, "learning_rate": 7.483271747954425e-06, "loss": 0.94860439, "memory(GiB)": 369.4, "step": 29000, "train_speed(iter/s)": 0.201525 }, { "epoch": 0.7356671740233384, "eval_acc": 0.7367322324607706, "eval_loss": 0.975178599357605, "eval_runtime": 384.4114, "eval_samples_per_second": 16.571, "eval_steps_per_second": 8.285, "step": 29000 }, { "acc": 0.74902906, "epoch": 0.7357940131912735, "grad_norm": 2.5, "learning_rate": 7.482361542084356e-06, "loss": 1.0036582, "memory(GiB)": 369.4, "step": 29005, "train_speed(iter/s)": 0.200542 }, { "acc": 0.75271325, "epoch": 0.7359208523592086, "grad_norm": 1.7265625, "learning_rate": 7.481451227028946e-06, "loss": 0.93208637, "memory(GiB)": 369.4, "step": 29010, "train_speed(iter/s)": 0.200545 }, { "acc": 0.75793304, "epoch": 0.7360476915271436, "grad_norm": 2.375, "learning_rate": 7.4805408028282316e-06, "loss": 0.9577158, "memory(GiB)": 369.4, "step": 29015, "train_speed(iter/s)": 0.200552 }, { "acc": 0.75249119, "epoch": 0.7361745306950787, "grad_norm": 2.0, "learning_rate": 7.479630269522257e-06, "loss": 0.97505035, "memory(GiB)": 369.4, "step": 29020, "train_speed(iter/s)": 0.200558 }, { "acc": 0.7388938, "epoch": 0.7363013698630136, "grad_norm": 2.234375, "learning_rate": 7.478719627151073e-06, "loss": 1.07977085, "memory(GiB)": 369.4, "step": 29025, "train_speed(iter/s)": 0.200563 }, { "acc": 0.73892341, "epoch": 0.7364282090309487, "grad_norm": 2.421875, "learning_rate": 7.4778088757547325e-06, "loss": 1.0197134, "memory(GiB)": 369.4, "step": 29030, "train_speed(iter/s)": 0.200566 }, { "acc": 0.75131168, "epoch": 0.7365550481988838, "grad_norm": 2.359375, "learning_rate": 7.476898015373296e-06, "loss": 1.0453949, "memory(GiB)": 369.4, "step": 29035, "train_speed(iter/s)": 0.20057 }, { "acc": 0.75492525, "epoch": 0.7366818873668188, "grad_norm": 1.9921875, "learning_rate": 7.4759870460468256e-06, "loss": 0.96354694, "memory(GiB)": 369.4, "step": 29040, "train_speed(iter/s)": 0.200577 }, { "acc": 0.73820133, "epoch": 0.7368087265347539, "grad_norm": 2.65625, "learning_rate": 7.475075967815391e-06, "loss": 0.98419704, "memory(GiB)": 369.4, "step": 29045, "train_speed(iter/s)": 0.200582 }, { "acc": 0.73924227, "epoch": 0.736935565702689, "grad_norm": 2.0, "learning_rate": 7.474164780719064e-06, "loss": 0.99951649, "memory(GiB)": 369.4, "step": 29050, "train_speed(iter/s)": 0.200586 }, { "acc": 0.7391118, "epoch": 0.737062404870624, "grad_norm": 2.203125, "learning_rate": 7.473253484797924e-06, "loss": 1.033951, "memory(GiB)": 369.4, "step": 29055, "train_speed(iter/s)": 0.200591 }, { "acc": 0.7575614, "epoch": 0.7371892440385591, "grad_norm": 2.546875, "learning_rate": 7.4723420800920545e-06, "loss": 0.93188095, "memory(GiB)": 369.4, "step": 29060, "train_speed(iter/s)": 0.200599 }, { "acc": 0.75268698, "epoch": 0.7373160832064941, "grad_norm": 2.0625, "learning_rate": 7.47143056664154e-06, "loss": 0.97346621, "memory(GiB)": 369.4, "step": 29065, "train_speed(iter/s)": 0.200605 }, { "acc": 0.7595048, "epoch": 0.7374429223744292, "grad_norm": 2.140625, "learning_rate": 7.470518944486476e-06, "loss": 0.98007936, "memory(GiB)": 369.4, "step": 29070, "train_speed(iter/s)": 0.200611 }, { "acc": 0.74787979, "epoch": 0.7375697615423643, "grad_norm": 2.1875, "learning_rate": 7.469607213666958e-06, "loss": 1.040378, "memory(GiB)": 369.4, "step": 29075, "train_speed(iter/s)": 0.200616 }, { "acc": 0.7468576, "epoch": 0.7376966007102993, "grad_norm": 2.46875, "learning_rate": 7.468695374223092e-06, "loss": 1.00706615, "memory(GiB)": 369.4, "step": 29080, "train_speed(iter/s)": 0.200622 }, { "acc": 0.7509912, "epoch": 0.7378234398782344, "grad_norm": 1.8828125, "learning_rate": 7.4677834261949765e-06, "loss": 1.01379623, "memory(GiB)": 369.4, "step": 29085, "train_speed(iter/s)": 0.200628 }, { "acc": 0.76835833, "epoch": 0.7379502790461695, "grad_norm": 2.125, "learning_rate": 7.466871369622731e-06, "loss": 0.91440754, "memory(GiB)": 369.4, "step": 29090, "train_speed(iter/s)": 0.200634 }, { "acc": 0.73905973, "epoch": 0.7380771182141045, "grad_norm": 2.296875, "learning_rate": 7.465959204546469e-06, "loss": 1.06014633, "memory(GiB)": 369.4, "step": 29095, "train_speed(iter/s)": 0.200636 }, { "acc": 0.74523897, "epoch": 0.7382039573820396, "grad_norm": 2.375, "learning_rate": 7.465046931006311e-06, "loss": 1.02143803, "memory(GiB)": 369.4, "step": 29100, "train_speed(iter/s)": 0.200642 }, { "acc": 0.75338235, "epoch": 0.7383307965499746, "grad_norm": 2.125, "learning_rate": 7.464134549042383e-06, "loss": 0.98160753, "memory(GiB)": 369.4, "step": 29105, "train_speed(iter/s)": 0.200647 }, { "acc": 0.74940414, "epoch": 0.7384576357179097, "grad_norm": 1.921875, "learning_rate": 7.463222058694817e-06, "loss": 1.01202335, "memory(GiB)": 369.4, "step": 29110, "train_speed(iter/s)": 0.200652 }, { "acc": 0.74724975, "epoch": 0.7385844748858448, "grad_norm": 1.9453125, "learning_rate": 7.462309460003747e-06, "loss": 1.02598419, "memory(GiB)": 369.4, "step": 29115, "train_speed(iter/s)": 0.200658 }, { "acc": 0.75252218, "epoch": 0.7387113140537798, "grad_norm": 2.59375, "learning_rate": 7.461396753009314e-06, "loss": 1.03923283, "memory(GiB)": 369.4, "step": 29120, "train_speed(iter/s)": 0.200664 }, { "acc": 0.74923306, "epoch": 0.7388381532217149, "grad_norm": 1.8984375, "learning_rate": 7.460483937751662e-06, "loss": 0.98135424, "memory(GiB)": 369.4, "step": 29125, "train_speed(iter/s)": 0.200669 }, { "acc": 0.73801785, "epoch": 0.73896499238965, "grad_norm": 2.09375, "learning_rate": 7.45957101427094e-06, "loss": 0.96852846, "memory(GiB)": 369.4, "step": 29130, "train_speed(iter/s)": 0.200675 }, { "acc": 0.74188199, "epoch": 0.739091831557585, "grad_norm": 1.984375, "learning_rate": 7.458657982607303e-06, "loss": 1.018859, "memory(GiB)": 369.4, "step": 29135, "train_speed(iter/s)": 0.200681 }, { "acc": 0.74581795, "epoch": 0.73921867072552, "grad_norm": 2.671875, "learning_rate": 7.457744842800913e-06, "loss": 1.00857086, "memory(GiB)": 369.4, "step": 29140, "train_speed(iter/s)": 0.200684 }, { "acc": 0.73350329, "epoch": 0.739345509893455, "grad_norm": 2.25, "learning_rate": 7.45683159489193e-06, "loss": 1.06578026, "memory(GiB)": 369.4, "step": 29145, "train_speed(iter/s)": 0.200688 }, { "acc": 0.74282141, "epoch": 0.7394723490613901, "grad_norm": 2.40625, "learning_rate": 7.455918238920526e-06, "loss": 0.97652969, "memory(GiB)": 369.4, "step": 29150, "train_speed(iter/s)": 0.200693 }, { "acc": 0.74253368, "epoch": 0.7395991882293252, "grad_norm": 2.328125, "learning_rate": 7.455004774926873e-06, "loss": 1.02529345, "memory(GiB)": 369.4, "step": 29155, "train_speed(iter/s)": 0.200696 }, { "acc": 0.73993969, "epoch": 0.7397260273972602, "grad_norm": 2.78125, "learning_rate": 7.454091202951148e-06, "loss": 0.99797516, "memory(GiB)": 369.4, "step": 29160, "train_speed(iter/s)": 0.200698 }, { "acc": 0.74122887, "epoch": 0.7398528665651953, "grad_norm": 2.34375, "learning_rate": 7.453177523033536e-06, "loss": 1.02053461, "memory(GiB)": 369.4, "step": 29165, "train_speed(iter/s)": 0.200703 }, { "acc": 0.73360081, "epoch": 0.7399797057331304, "grad_norm": 2.546875, "learning_rate": 7.452263735214223e-06, "loss": 1.02469921, "memory(GiB)": 369.4, "step": 29170, "train_speed(iter/s)": 0.200707 }, { "acc": 0.75182247, "epoch": 0.7401065449010654, "grad_norm": 1.9453125, "learning_rate": 7.451349839533404e-06, "loss": 1.04584284, "memory(GiB)": 369.4, "step": 29175, "train_speed(iter/s)": 0.200713 }, { "acc": 0.74091611, "epoch": 0.7402333840690005, "grad_norm": 2.28125, "learning_rate": 7.450435836031273e-06, "loss": 0.98607149, "memory(GiB)": 369.4, "step": 29180, "train_speed(iter/s)": 0.200718 }, { "acc": 0.74949789, "epoch": 0.7403602232369355, "grad_norm": 2.21875, "learning_rate": 7.449521724748034e-06, "loss": 0.973876, "memory(GiB)": 369.4, "step": 29185, "train_speed(iter/s)": 0.200721 }, { "acc": 0.75510836, "epoch": 0.7404870624048706, "grad_norm": 2.28125, "learning_rate": 7.4486075057238936e-06, "loss": 0.94637556, "memory(GiB)": 369.4, "step": 29190, "train_speed(iter/s)": 0.200719 }, { "acc": 0.75593386, "epoch": 0.7406139015728057, "grad_norm": 2.375, "learning_rate": 7.447693178999062e-06, "loss": 1.01516151, "memory(GiB)": 369.4, "step": 29195, "train_speed(iter/s)": 0.200716 }, { "acc": 0.74052563, "epoch": 0.7407407407407407, "grad_norm": 2.1875, "learning_rate": 7.446778744613759e-06, "loss": 1.01490574, "memory(GiB)": 369.4, "step": 29200, "train_speed(iter/s)": 0.20072 }, { "acc": 0.75576782, "epoch": 0.7408675799086758, "grad_norm": 2.109375, "learning_rate": 7.445864202608198e-06, "loss": 0.96918125, "memory(GiB)": 369.4, "step": 29205, "train_speed(iter/s)": 0.200727 }, { "acc": 0.73611908, "epoch": 0.7409944190766109, "grad_norm": 2.4375, "learning_rate": 7.444949553022613e-06, "loss": 1.00599003, "memory(GiB)": 369.4, "step": 29210, "train_speed(iter/s)": 0.20073 }, { "acc": 0.74519753, "epoch": 0.7411212582445459, "grad_norm": 2.34375, "learning_rate": 7.444034795897229e-06, "loss": 0.96028061, "memory(GiB)": 369.4, "step": 29215, "train_speed(iter/s)": 0.200734 }, { "acc": 0.75917587, "epoch": 0.741248097412481, "grad_norm": 2.03125, "learning_rate": 7.443119931272285e-06, "loss": 0.98483162, "memory(GiB)": 369.4, "step": 29220, "train_speed(iter/s)": 0.200737 }, { "acc": 0.7562078, "epoch": 0.741374936580416, "grad_norm": 2.28125, "learning_rate": 7.442204959188016e-06, "loss": 1.00131426, "memory(GiB)": 369.4, "step": 29225, "train_speed(iter/s)": 0.200742 }, { "acc": 0.73892183, "epoch": 0.7415017757483511, "grad_norm": 2.140625, "learning_rate": 7.4412898796846724e-06, "loss": 1.02628078, "memory(GiB)": 369.4, "step": 29230, "train_speed(iter/s)": 0.200747 }, { "acc": 0.76112847, "epoch": 0.7416286149162862, "grad_norm": 2.109375, "learning_rate": 7.440374692802497e-06, "loss": 0.95020638, "memory(GiB)": 369.4, "step": 29235, "train_speed(iter/s)": 0.200754 }, { "acc": 0.74783297, "epoch": 0.7417554540842212, "grad_norm": 2.359375, "learning_rate": 7.439459398581747e-06, "loss": 0.95286798, "memory(GiB)": 369.4, "step": 29240, "train_speed(iter/s)": 0.20076 }, { "acc": 0.75014925, "epoch": 0.7418822932521563, "grad_norm": 2.546875, "learning_rate": 7.438543997062684e-06, "loss": 0.93085499, "memory(GiB)": 369.4, "step": 29245, "train_speed(iter/s)": 0.200763 }, { "acc": 0.73151622, "epoch": 0.7420091324200914, "grad_norm": 2.125, "learning_rate": 7.437628488285568e-06, "loss": 0.98359337, "memory(GiB)": 369.4, "step": 29250, "train_speed(iter/s)": 0.20077 }, { "acc": 0.73895783, "epoch": 0.7421359715880264, "grad_norm": 1.90625, "learning_rate": 7.4367128722906665e-06, "loss": 0.96652002, "memory(GiB)": 369.4, "step": 29255, "train_speed(iter/s)": 0.200773 }, { "acc": 0.74769793, "epoch": 0.7422628107559615, "grad_norm": 2.234375, "learning_rate": 7.435797149118255e-06, "loss": 1.01652126, "memory(GiB)": 369.4, "step": 29260, "train_speed(iter/s)": 0.20078 }, { "acc": 0.74727588, "epoch": 0.7423896499238964, "grad_norm": 2.25, "learning_rate": 7.434881318808609e-06, "loss": 1.00663033, "memory(GiB)": 369.4, "step": 29265, "train_speed(iter/s)": 0.200785 }, { "acc": 0.7482955, "epoch": 0.7425164890918315, "grad_norm": 2.171875, "learning_rate": 7.433965381402013e-06, "loss": 0.99671774, "memory(GiB)": 369.4, "step": 29270, "train_speed(iter/s)": 0.200792 }, { "acc": 0.74444928, "epoch": 0.7426433282597666, "grad_norm": 2.375, "learning_rate": 7.4330493369387514e-06, "loss": 0.98226585, "memory(GiB)": 369.4, "step": 29275, "train_speed(iter/s)": 0.200798 }, { "acc": 0.7345952, "epoch": 0.7427701674277016, "grad_norm": 2.171875, "learning_rate": 7.432133185459117e-06, "loss": 0.97674856, "memory(GiB)": 369.4, "step": 29280, "train_speed(iter/s)": 0.200802 }, { "acc": 0.75432072, "epoch": 0.7428970065956367, "grad_norm": 2.203125, "learning_rate": 7.431216927003406e-06, "loss": 0.97962265, "memory(GiB)": 369.4, "step": 29285, "train_speed(iter/s)": 0.200808 }, { "acc": 0.75090284, "epoch": 0.7430238457635718, "grad_norm": 2.25, "learning_rate": 7.430300561611922e-06, "loss": 0.96692104, "memory(GiB)": 369.4, "step": 29290, "train_speed(iter/s)": 0.200811 }, { "acc": 0.75391397, "epoch": 0.7431506849315068, "grad_norm": 1.8671875, "learning_rate": 7.429384089324967e-06, "loss": 0.97769756, "memory(GiB)": 369.4, "step": 29295, "train_speed(iter/s)": 0.200816 }, { "acc": 0.74797096, "epoch": 0.7432775240994419, "grad_norm": 2.21875, "learning_rate": 7.428467510182854e-06, "loss": 0.97699566, "memory(GiB)": 369.4, "step": 29300, "train_speed(iter/s)": 0.200821 }, { "acc": 0.76127739, "epoch": 0.7434043632673769, "grad_norm": 2.203125, "learning_rate": 7.427550824225896e-06, "loss": 0.95620842, "memory(GiB)": 369.4, "step": 29305, "train_speed(iter/s)": 0.200827 }, { "acc": 0.75154428, "epoch": 0.743531202435312, "grad_norm": 2.625, "learning_rate": 7.426634031494417e-06, "loss": 1.02035093, "memory(GiB)": 369.4, "step": 29310, "train_speed(iter/s)": 0.200831 }, { "acc": 0.76007314, "epoch": 0.7436580416032471, "grad_norm": 2.0, "learning_rate": 7.425717132028738e-06, "loss": 0.93925209, "memory(GiB)": 369.4, "step": 29315, "train_speed(iter/s)": 0.200835 }, { "acc": 0.73397818, "epoch": 0.7437848807711821, "grad_norm": 2.234375, "learning_rate": 7.42480012586919e-06, "loss": 1.07530136, "memory(GiB)": 369.4, "step": 29320, "train_speed(iter/s)": 0.200841 }, { "acc": 0.75483789, "epoch": 0.7439117199391172, "grad_norm": 2.125, "learning_rate": 7.423883013056106e-06, "loss": 0.90685787, "memory(GiB)": 369.4, "step": 29325, "train_speed(iter/s)": 0.200844 }, { "acc": 0.7545764, "epoch": 0.7440385591070523, "grad_norm": 2.265625, "learning_rate": 7.422965793629825e-06, "loss": 0.99174156, "memory(GiB)": 369.4, "step": 29330, "train_speed(iter/s)": 0.200849 }, { "acc": 0.75041676, "epoch": 0.7441653982749873, "grad_norm": 2.4375, "learning_rate": 7.422048467630691e-06, "loss": 1.01582012, "memory(GiB)": 369.4, "step": 29335, "train_speed(iter/s)": 0.200853 }, { "acc": 0.75362253, "epoch": 0.7442922374429224, "grad_norm": 2.046875, "learning_rate": 7.421131035099052e-06, "loss": 0.92873983, "memory(GiB)": 369.4, "step": 29340, "train_speed(iter/s)": 0.200857 }, { "acc": 0.75217361, "epoch": 0.7444190766108574, "grad_norm": 2.15625, "learning_rate": 7.42021349607526e-06, "loss": 0.98398542, "memory(GiB)": 369.4, "step": 29345, "train_speed(iter/s)": 0.200863 }, { "acc": 0.76433377, "epoch": 0.7445459157787925, "grad_norm": 1.8203125, "learning_rate": 7.419295850599673e-06, "loss": 0.90423155, "memory(GiB)": 369.4, "step": 29350, "train_speed(iter/s)": 0.200868 }, { "acc": 0.7528904, "epoch": 0.7446727549467276, "grad_norm": 2.359375, "learning_rate": 7.418378098712653e-06, "loss": 0.9941143, "memory(GiB)": 369.4, "step": 29355, "train_speed(iter/s)": 0.200873 }, { "acc": 0.73306875, "epoch": 0.7447995941146626, "grad_norm": 2.078125, "learning_rate": 7.417460240454568e-06, "loss": 1.11438198, "memory(GiB)": 369.4, "step": 29360, "train_speed(iter/s)": 0.200875 }, { "acc": 0.74568901, "epoch": 0.7449264332825977, "grad_norm": 2.21875, "learning_rate": 7.4165422758657865e-06, "loss": 0.99266033, "memory(GiB)": 369.4, "step": 29365, "train_speed(iter/s)": 0.200878 }, { "acc": 0.75568476, "epoch": 0.7450532724505328, "grad_norm": 2.109375, "learning_rate": 7.415624204986689e-06, "loss": 0.96479206, "memory(GiB)": 369.4, "step": 29370, "train_speed(iter/s)": 0.200882 }, { "acc": 0.74332628, "epoch": 0.7451801116184678, "grad_norm": 1.9765625, "learning_rate": 7.4147060278576525e-06, "loss": 1.023843, "memory(GiB)": 369.4, "step": 29375, "train_speed(iter/s)": 0.200886 }, { "acc": 0.75059395, "epoch": 0.7453069507864029, "grad_norm": 2.25, "learning_rate": 7.413787744519064e-06, "loss": 1.01471624, "memory(GiB)": 369.4, "step": 29380, "train_speed(iter/s)": 0.200892 }, { "acc": 0.74433146, "epoch": 0.7454337899543378, "grad_norm": 2.0, "learning_rate": 7.412869355011314e-06, "loss": 0.9645175, "memory(GiB)": 369.4, "step": 29385, "train_speed(iter/s)": 0.200897 }, { "acc": 0.74394979, "epoch": 0.7455606291222729, "grad_norm": 2.203125, "learning_rate": 7.411950859374797e-06, "loss": 1.00281725, "memory(GiB)": 369.4, "step": 29390, "train_speed(iter/s)": 0.2009 }, { "acc": 0.76035657, "epoch": 0.745687468290208, "grad_norm": 2.34375, "learning_rate": 7.411032257649913e-06, "loss": 1.035849, "memory(GiB)": 369.4, "step": 29395, "train_speed(iter/s)": 0.200904 }, { "acc": 0.75101767, "epoch": 0.745814307458143, "grad_norm": 2.25, "learning_rate": 7.410113549877065e-06, "loss": 1.00406647, "memory(GiB)": 369.4, "step": 29400, "train_speed(iter/s)": 0.200908 }, { "acc": 0.74497275, "epoch": 0.7459411466260781, "grad_norm": 1.8359375, "learning_rate": 7.409194736096663e-06, "loss": 1.01172619, "memory(GiB)": 369.4, "step": 29405, "train_speed(iter/s)": 0.200915 }, { "acc": 0.75493731, "epoch": 0.7460679857940132, "grad_norm": 2.078125, "learning_rate": 7.408275816349121e-06, "loss": 0.98217316, "memory(GiB)": 369.4, "step": 29410, "train_speed(iter/s)": 0.200921 }, { "acc": 0.75203962, "epoch": 0.7461948249619482, "grad_norm": 2.140625, "learning_rate": 7.4073567906748555e-06, "loss": 0.98607464, "memory(GiB)": 369.4, "step": 29415, "train_speed(iter/s)": 0.200927 }, { "acc": 0.74617424, "epoch": 0.7463216641298833, "grad_norm": 1.9921875, "learning_rate": 7.406437659114291e-06, "loss": 0.99669075, "memory(GiB)": 369.4, "step": 29420, "train_speed(iter/s)": 0.200929 }, { "acc": 0.73971705, "epoch": 0.7464485032978183, "grad_norm": 2.125, "learning_rate": 7.405518421707854e-06, "loss": 1.04917068, "memory(GiB)": 369.4, "step": 29425, "train_speed(iter/s)": 0.200932 }, { "acc": 0.75590854, "epoch": 0.7465753424657534, "grad_norm": 2.125, "learning_rate": 7.404599078495977e-06, "loss": 0.963867, "memory(GiB)": 369.4, "step": 29430, "train_speed(iter/s)": 0.20094 }, { "acc": 0.74371395, "epoch": 0.7467021816336885, "grad_norm": 1.875, "learning_rate": 7.403679629519096e-06, "loss": 1.0407011, "memory(GiB)": 369.4, "step": 29435, "train_speed(iter/s)": 0.200944 }, { "acc": 0.73706017, "epoch": 0.7468290208016235, "grad_norm": 2.140625, "learning_rate": 7.402760074817654e-06, "loss": 1.07673893, "memory(GiB)": 369.4, "step": 29440, "train_speed(iter/s)": 0.200948 }, { "acc": 0.76457815, "epoch": 0.7469558599695586, "grad_norm": 2.109375, "learning_rate": 7.4018404144320955e-06, "loss": 1.00163574, "memory(GiB)": 369.4, "step": 29445, "train_speed(iter/s)": 0.200953 }, { "acc": 0.75329351, "epoch": 0.7470826991374937, "grad_norm": 2.375, "learning_rate": 7.4009206484028735e-06, "loss": 1.01388454, "memory(GiB)": 369.4, "step": 29450, "train_speed(iter/s)": 0.200959 }, { "acc": 0.75406199, "epoch": 0.7472095383054287, "grad_norm": 2.0625, "learning_rate": 7.400000776770441e-06, "loss": 0.95894356, "memory(GiB)": 369.4, "step": 29455, "train_speed(iter/s)": 0.200965 }, { "acc": 0.7314847, "epoch": 0.7473363774733638, "grad_norm": 3.4375, "learning_rate": 7.39908079957526e-06, "loss": 1.01324749, "memory(GiB)": 369.4, "step": 29460, "train_speed(iter/s)": 0.200968 }, { "acc": 0.74462657, "epoch": 0.7474632166412988, "grad_norm": 2.84375, "learning_rate": 7.398160716857794e-06, "loss": 1.02194901, "memory(GiB)": 369.4, "step": 29465, "train_speed(iter/s)": 0.200971 }, { "acc": 0.75525599, "epoch": 0.7475900558092339, "grad_norm": 2.203125, "learning_rate": 7.397240528658513e-06, "loss": 0.96676168, "memory(GiB)": 369.4, "step": 29470, "train_speed(iter/s)": 0.200975 }, { "acc": 0.74906569, "epoch": 0.747716894977169, "grad_norm": 2.171875, "learning_rate": 7.39632023501789e-06, "loss": 1.00248795, "memory(GiB)": 369.4, "step": 29475, "train_speed(iter/s)": 0.20098 }, { "acc": 0.73959713, "epoch": 0.747843734145104, "grad_norm": 1.9453125, "learning_rate": 7.3953998359764036e-06, "loss": 1.02605534, "memory(GiB)": 369.4, "step": 29480, "train_speed(iter/s)": 0.200983 }, { "acc": 0.74661999, "epoch": 0.7479705733130391, "grad_norm": 2.390625, "learning_rate": 7.394479331574539e-06, "loss": 1.00342274, "memory(GiB)": 369.4, "step": 29485, "train_speed(iter/s)": 0.200986 }, { "acc": 0.73615828, "epoch": 0.7480974124809742, "grad_norm": 2.046875, "learning_rate": 7.393558721852783e-06, "loss": 1.04202595, "memory(GiB)": 369.4, "step": 29490, "train_speed(iter/s)": 0.20099 }, { "acc": 0.73530884, "epoch": 0.7482242516489092, "grad_norm": 2.34375, "learning_rate": 7.392638006851627e-06, "loss": 1.01284752, "memory(GiB)": 369.4, "step": 29495, "train_speed(iter/s)": 0.200996 }, { "acc": 0.76046228, "epoch": 0.7483510908168443, "grad_norm": 2.125, "learning_rate": 7.391717186611569e-06, "loss": 0.9806881, "memory(GiB)": 369.4, "step": 29500, "train_speed(iter/s)": 0.201003 }, { "acc": 0.75335374, "epoch": 0.7484779299847792, "grad_norm": 2.21875, "learning_rate": 7.39079626117311e-06, "loss": 0.98175392, "memory(GiB)": 369.4, "step": 29505, "train_speed(iter/s)": 0.201008 }, { "acc": 0.74264956, "epoch": 0.7486047691527143, "grad_norm": 2.265625, "learning_rate": 7.3898752305767595e-06, "loss": 1.05610542, "memory(GiB)": 369.4, "step": 29510, "train_speed(iter/s)": 0.201012 }, { "acc": 0.74460268, "epoch": 0.7487316083206494, "grad_norm": 2.0625, "learning_rate": 7.3889540948630245e-06, "loss": 1.02414742, "memory(GiB)": 369.4, "step": 29515, "train_speed(iter/s)": 0.201016 }, { "acc": 0.75243883, "epoch": 0.7488584474885844, "grad_norm": 2.546875, "learning_rate": 7.388032854072424e-06, "loss": 0.972017, "memory(GiB)": 369.4, "step": 29520, "train_speed(iter/s)": 0.201022 }, { "acc": 0.73619437, "epoch": 0.7489852866565195, "grad_norm": 2.375, "learning_rate": 7.387111508245476e-06, "loss": 1.07209282, "memory(GiB)": 369.4, "step": 29525, "train_speed(iter/s)": 0.201028 }, { "acc": 0.76543689, "epoch": 0.7491121258244546, "grad_norm": 2.15625, "learning_rate": 7.386190057422706e-06, "loss": 0.93120041, "memory(GiB)": 369.4, "step": 29530, "train_speed(iter/s)": 0.201034 }, { "acc": 0.74393215, "epoch": 0.7492389649923896, "grad_norm": 1.9921875, "learning_rate": 7.385268501644645e-06, "loss": 0.97194166, "memory(GiB)": 369.4, "step": 29535, "train_speed(iter/s)": 0.201036 }, { "acc": 0.75520964, "epoch": 0.7493658041603247, "grad_norm": 2.078125, "learning_rate": 7.384346840951824e-06, "loss": 0.9938509, "memory(GiB)": 369.4, "step": 29540, "train_speed(iter/s)": 0.201039 }, { "acc": 0.74933772, "epoch": 0.7494926433282597, "grad_norm": 2.609375, "learning_rate": 7.383425075384785e-06, "loss": 0.9707716, "memory(GiB)": 369.4, "step": 29545, "train_speed(iter/s)": 0.201043 }, { "acc": 0.76020832, "epoch": 0.7496194824961948, "grad_norm": 2.03125, "learning_rate": 7.382503204984069e-06, "loss": 0.95816021, "memory(GiB)": 369.4, "step": 29550, "train_speed(iter/s)": 0.201047 }, { "acc": 0.74076643, "epoch": 0.7497463216641299, "grad_norm": 2.21875, "learning_rate": 7.381581229790226e-06, "loss": 0.98885136, "memory(GiB)": 369.4, "step": 29555, "train_speed(iter/s)": 0.201054 }, { "acc": 0.74862499, "epoch": 0.7498731608320649, "grad_norm": 2.453125, "learning_rate": 7.380659149843806e-06, "loss": 1.00221996, "memory(GiB)": 369.4, "step": 29560, "train_speed(iter/s)": 0.201057 }, { "acc": 0.74382176, "epoch": 0.75, "grad_norm": 2.03125, "learning_rate": 7.379736965185369e-06, "loss": 0.98886089, "memory(GiB)": 369.4, "step": 29565, "train_speed(iter/s)": 0.201062 }, { "acc": 0.72744427, "epoch": 0.7501268391679351, "grad_norm": 1.6640625, "learning_rate": 7.378814675855475e-06, "loss": 1.05220547, "memory(GiB)": 369.4, "step": 29570, "train_speed(iter/s)": 0.201065 }, { "acc": 0.75447378, "epoch": 0.7502536783358701, "grad_norm": 2.4375, "learning_rate": 7.37789228189469e-06, "loss": 1.00273628, "memory(GiB)": 369.4, "step": 29575, "train_speed(iter/s)": 0.201069 }, { "acc": 0.7498044, "epoch": 0.7503805175038052, "grad_norm": 1.8515625, "learning_rate": 7.376969783343588e-06, "loss": 0.99000931, "memory(GiB)": 369.4, "step": 29580, "train_speed(iter/s)": 0.201073 }, { "acc": 0.75410337, "epoch": 0.7505073566717403, "grad_norm": 2.25, "learning_rate": 7.37604718024274e-06, "loss": 0.96718769, "memory(GiB)": 369.4, "step": 29585, "train_speed(iter/s)": 0.201079 }, { "acc": 0.75597715, "epoch": 0.7506341958396753, "grad_norm": 2.40625, "learning_rate": 7.375124472632732e-06, "loss": 1.01855421, "memory(GiB)": 369.4, "step": 29590, "train_speed(iter/s)": 0.201086 }, { "acc": 0.74451094, "epoch": 0.7507610350076104, "grad_norm": 1.9921875, "learning_rate": 7.374201660554142e-06, "loss": 1.02882729, "memory(GiB)": 369.4, "step": 29595, "train_speed(iter/s)": 0.201092 }, { "acc": 0.74500723, "epoch": 0.7508878741755454, "grad_norm": 2.484375, "learning_rate": 7.373278744047565e-06, "loss": 1.03049278, "memory(GiB)": 369.4, "step": 29600, "train_speed(iter/s)": 0.201098 }, { "acc": 0.74614182, "epoch": 0.7510147133434805, "grad_norm": 2.703125, "learning_rate": 7.372355723153593e-06, "loss": 1.05566931, "memory(GiB)": 369.4, "step": 29605, "train_speed(iter/s)": 0.201102 }, { "acc": 0.74844666, "epoch": 0.7511415525114156, "grad_norm": 2.328125, "learning_rate": 7.371432597912824e-06, "loss": 1.01105318, "memory(GiB)": 369.4, "step": 29610, "train_speed(iter/s)": 0.201106 }, { "acc": 0.75869818, "epoch": 0.7512683916793506, "grad_norm": 2.15625, "learning_rate": 7.3705093683658616e-06, "loss": 1.0010725, "memory(GiB)": 369.4, "step": 29615, "train_speed(iter/s)": 0.201112 }, { "acc": 0.74366617, "epoch": 0.7513952308472857, "grad_norm": 2.484375, "learning_rate": 7.369586034553313e-06, "loss": 1.03924503, "memory(GiB)": 369.4, "step": 29620, "train_speed(iter/s)": 0.201119 }, { "acc": 0.74172087, "epoch": 0.7515220700152208, "grad_norm": 2.34375, "learning_rate": 7.368662596515792e-06, "loss": 1.0742033, "memory(GiB)": 369.4, "step": 29625, "train_speed(iter/s)": 0.201124 }, { "acc": 0.74019537, "epoch": 0.7516489091831557, "grad_norm": 2.515625, "learning_rate": 7.367739054293914e-06, "loss": 1.03071594, "memory(GiB)": 369.4, "step": 29630, "train_speed(iter/s)": 0.201132 }, { "acc": 0.7469471, "epoch": 0.7517757483510908, "grad_norm": 1.78125, "learning_rate": 7.366815407928302e-06, "loss": 0.98526344, "memory(GiB)": 369.4, "step": 29635, "train_speed(iter/s)": 0.201136 }, { "acc": 0.75097113, "epoch": 0.7519025875190258, "grad_norm": 1.9453125, "learning_rate": 7.365891657459582e-06, "loss": 0.96764259, "memory(GiB)": 369.4, "step": 29640, "train_speed(iter/s)": 0.201142 }, { "acc": 0.73764372, "epoch": 0.7520294266869609, "grad_norm": 2.265625, "learning_rate": 7.3649678029283825e-06, "loss": 1.04605694, "memory(GiB)": 369.4, "step": 29645, "train_speed(iter/s)": 0.201145 }, { "acc": 0.74329987, "epoch": 0.752156265854896, "grad_norm": 2.21875, "learning_rate": 7.364043844375342e-06, "loss": 1.03062658, "memory(GiB)": 369.4, "step": 29650, "train_speed(iter/s)": 0.20115 }, { "acc": 0.73921356, "epoch": 0.752283105022831, "grad_norm": 2.03125, "learning_rate": 7.363119781841095e-06, "loss": 1.03159332, "memory(GiB)": 369.4, "step": 29655, "train_speed(iter/s)": 0.201153 }, { "acc": 0.74265242, "epoch": 0.7524099441907661, "grad_norm": 2.015625, "learning_rate": 7.362195615366293e-06, "loss": 1.03669138, "memory(GiB)": 369.4, "step": 29660, "train_speed(iter/s)": 0.201158 }, { "acc": 0.76641021, "epoch": 0.7525367833587012, "grad_norm": 2.578125, "learning_rate": 7.361271344991579e-06, "loss": 0.94338379, "memory(GiB)": 369.4, "step": 29665, "train_speed(iter/s)": 0.201165 }, { "acc": 0.73665514, "epoch": 0.7526636225266362, "grad_norm": 1.9140625, "learning_rate": 7.36034697075761e-06, "loss": 1.03428612, "memory(GiB)": 369.4, "step": 29670, "train_speed(iter/s)": 0.201168 }, { "acc": 0.74653378, "epoch": 0.7527904616945713, "grad_norm": 2.546875, "learning_rate": 7.359422492705043e-06, "loss": 0.98838434, "memory(GiB)": 369.4, "step": 29675, "train_speed(iter/s)": 0.201173 }, { "acc": 0.74948001, "epoch": 0.7529173008625063, "grad_norm": 2.453125, "learning_rate": 7.3584979108745405e-06, "loss": 0.9855011, "memory(GiB)": 369.4, "step": 29680, "train_speed(iter/s)": 0.201179 }, { "acc": 0.75090828, "epoch": 0.7530441400304414, "grad_norm": 2.046875, "learning_rate": 7.357573225306771e-06, "loss": 0.98187275, "memory(GiB)": 369.4, "step": 29685, "train_speed(iter/s)": 0.201183 }, { "acc": 0.74563961, "epoch": 0.7531709791983765, "grad_norm": 2.421875, "learning_rate": 7.356648436042404e-06, "loss": 1.0064086, "memory(GiB)": 369.4, "step": 29690, "train_speed(iter/s)": 0.201188 }, { "acc": 0.73672543, "epoch": 0.7532978183663115, "grad_norm": 2.5625, "learning_rate": 7.355723543122118e-06, "loss": 1.04604225, "memory(GiB)": 369.4, "step": 29695, "train_speed(iter/s)": 0.20119 }, { "acc": 0.77106643, "epoch": 0.7534246575342466, "grad_norm": 2.09375, "learning_rate": 7.354798546586592e-06, "loss": 0.94235458, "memory(GiB)": 369.4, "step": 29700, "train_speed(iter/s)": 0.201196 }, { "acc": 0.75534229, "epoch": 0.7535514967021817, "grad_norm": 2.109375, "learning_rate": 7.353873446476512e-06, "loss": 0.97541962, "memory(GiB)": 369.4, "step": 29705, "train_speed(iter/s)": 0.201202 }, { "acc": 0.74397564, "epoch": 0.7536783358701167, "grad_norm": 2.296875, "learning_rate": 7.3529482428325705e-06, "loss": 1.05639009, "memory(GiB)": 369.4, "step": 29710, "train_speed(iter/s)": 0.201202 }, { "acc": 0.74056835, "epoch": 0.7538051750380518, "grad_norm": 2.359375, "learning_rate": 7.35202293569546e-06, "loss": 1.01643867, "memory(GiB)": 369.4, "step": 29715, "train_speed(iter/s)": 0.201209 }, { "acc": 0.75728889, "epoch": 0.7539320142059868, "grad_norm": 1.796875, "learning_rate": 7.351097525105878e-06, "loss": 0.93320961, "memory(GiB)": 369.4, "step": 29720, "train_speed(iter/s)": 0.201212 }, { "acc": 0.7538209, "epoch": 0.7540588533739219, "grad_norm": 1.9140625, "learning_rate": 7.35017201110453e-06, "loss": 0.94088116, "memory(GiB)": 369.4, "step": 29725, "train_speed(iter/s)": 0.201214 }, { "acc": 0.74349489, "epoch": 0.754185692541857, "grad_norm": 2.078125, "learning_rate": 7.349246393732126e-06, "loss": 1.01669273, "memory(GiB)": 369.4, "step": 29730, "train_speed(iter/s)": 0.20122 }, { "acc": 0.75449519, "epoch": 0.754312531709792, "grad_norm": 2.390625, "learning_rate": 7.3483206730293755e-06, "loss": 0.99437389, "memory(GiB)": 369.4, "step": 29735, "train_speed(iter/s)": 0.20122 }, { "acc": 0.7516407, "epoch": 0.7544393708777271, "grad_norm": 2.109375, "learning_rate": 7.347394849036998e-06, "loss": 1.00396843, "memory(GiB)": 369.4, "step": 29740, "train_speed(iter/s)": 0.201225 }, { "acc": 0.75298176, "epoch": 0.7545662100456622, "grad_norm": 1.984375, "learning_rate": 7.346468921795714e-06, "loss": 0.95011005, "memory(GiB)": 369.4, "step": 29745, "train_speed(iter/s)": 0.201231 }, { "acc": 0.73760195, "epoch": 0.7546930492135971, "grad_norm": 2.09375, "learning_rate": 7.345542891346251e-06, "loss": 1.01649151, "memory(GiB)": 369.4, "step": 29750, "train_speed(iter/s)": 0.201235 }, { "acc": 0.74622974, "epoch": 0.7548198883815322, "grad_norm": 2.265625, "learning_rate": 7.344616757729341e-06, "loss": 1.00166321, "memory(GiB)": 369.4, "step": 29755, "train_speed(iter/s)": 0.20124 }, { "acc": 0.75471163, "epoch": 0.7549467275494672, "grad_norm": 2.125, "learning_rate": 7.343690520985716e-06, "loss": 0.97947102, "memory(GiB)": 369.4, "step": 29760, "train_speed(iter/s)": 0.201246 }, { "acc": 0.75037298, "epoch": 0.7550735667174023, "grad_norm": 1.984375, "learning_rate": 7.342764181156119e-06, "loss": 0.97744122, "memory(GiB)": 369.4, "step": 29765, "train_speed(iter/s)": 0.201252 }, { "acc": 0.75581255, "epoch": 0.7552004058853374, "grad_norm": 2.03125, "learning_rate": 7.341837738281293e-06, "loss": 0.99508104, "memory(GiB)": 369.4, "step": 29770, "train_speed(iter/s)": 0.201255 }, { "acc": 0.7482038, "epoch": 0.7553272450532724, "grad_norm": 3.171875, "learning_rate": 7.3409111924019885e-06, "loss": 1.03428497, "memory(GiB)": 369.4, "step": 29775, "train_speed(iter/s)": 0.201261 }, { "acc": 0.75019741, "epoch": 0.7554540842212075, "grad_norm": 2.5, "learning_rate": 7.3399845435589574e-06, "loss": 1.0029027, "memory(GiB)": 369.4, "step": 29780, "train_speed(iter/s)": 0.201267 }, { "acc": 0.76509218, "epoch": 0.7555809233891426, "grad_norm": 2.21875, "learning_rate": 7.33905779179296e-06, "loss": 0.94975777, "memory(GiB)": 369.4, "step": 29785, "train_speed(iter/s)": 0.201272 }, { "acc": 0.74799504, "epoch": 0.7557077625570776, "grad_norm": 1.984375, "learning_rate": 7.338130937144756e-06, "loss": 0.98670044, "memory(GiB)": 369.4, "step": 29790, "train_speed(iter/s)": 0.201275 }, { "acc": 0.76136637, "epoch": 0.7558346017250127, "grad_norm": 1.9921875, "learning_rate": 7.3372039796551156e-06, "loss": 0.96516962, "memory(GiB)": 369.4, "step": 29795, "train_speed(iter/s)": 0.201281 }, { "acc": 0.74859085, "epoch": 0.7559614408929477, "grad_norm": 2.09375, "learning_rate": 7.33627691936481e-06, "loss": 1.01857548, "memory(GiB)": 369.4, "step": 29800, "train_speed(iter/s)": 0.201287 }, { "acc": 0.76712465, "epoch": 0.7560882800608828, "grad_norm": 1.71875, "learning_rate": 7.335349756314614e-06, "loss": 0.94513016, "memory(GiB)": 369.4, "step": 29805, "train_speed(iter/s)": 0.201289 }, { "acc": 0.74439287, "epoch": 0.7562151192288179, "grad_norm": 1.8203125, "learning_rate": 7.33442249054531e-06, "loss": 1.05614166, "memory(GiB)": 369.4, "step": 29810, "train_speed(iter/s)": 0.201294 }, { "acc": 0.74260874, "epoch": 0.7563419583967529, "grad_norm": 1.84375, "learning_rate": 7.33349512209768e-06, "loss": 1.00908527, "memory(GiB)": 369.4, "step": 29815, "train_speed(iter/s)": 0.201299 }, { "acc": 0.74176216, "epoch": 0.756468797564688, "grad_norm": 2.234375, "learning_rate": 7.332567651012518e-06, "loss": 1.03281116, "memory(GiB)": 369.4, "step": 29820, "train_speed(iter/s)": 0.201305 }, { "acc": 0.74722118, "epoch": 0.7565956367326231, "grad_norm": 2.140625, "learning_rate": 7.331640077330616e-06, "loss": 0.96510296, "memory(GiB)": 369.4, "step": 29825, "train_speed(iter/s)": 0.20131 }, { "acc": 0.75832491, "epoch": 0.7567224759005581, "grad_norm": 2.28125, "learning_rate": 7.330712401092773e-06, "loss": 1.02734661, "memory(GiB)": 369.4, "step": 29830, "train_speed(iter/s)": 0.201316 }, { "acc": 0.74678288, "epoch": 0.7568493150684932, "grad_norm": 1.9921875, "learning_rate": 7.329784622339794e-06, "loss": 0.98346901, "memory(GiB)": 369.4, "step": 29835, "train_speed(iter/s)": 0.201321 }, { "acc": 0.74294009, "epoch": 0.7569761542364282, "grad_norm": 2.484375, "learning_rate": 7.328856741112484e-06, "loss": 1.03018513, "memory(GiB)": 369.4, "step": 29840, "train_speed(iter/s)": 0.201324 }, { "acc": 0.74669819, "epoch": 0.7571029934043633, "grad_norm": 2.34375, "learning_rate": 7.327928757451659e-06, "loss": 0.98499346, "memory(GiB)": 369.4, "step": 29845, "train_speed(iter/s)": 0.201326 }, { "acc": 0.76191168, "epoch": 0.7572298325722984, "grad_norm": 1.8515625, "learning_rate": 7.3270006713981325e-06, "loss": 0.92136707, "memory(GiB)": 369.4, "step": 29850, "train_speed(iter/s)": 0.201331 }, { "acc": 0.74179888, "epoch": 0.7573566717402334, "grad_norm": 2.125, "learning_rate": 7.326072482992728e-06, "loss": 1.0138051, "memory(GiB)": 369.4, "step": 29855, "train_speed(iter/s)": 0.201337 }, { "acc": 0.74190979, "epoch": 0.7574835109081685, "grad_norm": 2.421875, "learning_rate": 7.325144192276269e-06, "loss": 1.01915512, "memory(GiB)": 369.4, "step": 29860, "train_speed(iter/s)": 0.201341 }, { "acc": 0.73766804, "epoch": 0.7576103500761036, "grad_norm": 2.453125, "learning_rate": 7.324215799289588e-06, "loss": 1.0460804, "memory(GiB)": 369.4, "step": 29865, "train_speed(iter/s)": 0.201345 }, { "acc": 0.76012511, "epoch": 0.7577371892440385, "grad_norm": 2.84375, "learning_rate": 7.3232873040735194e-06, "loss": 0.95869799, "memory(GiB)": 369.4, "step": 29870, "train_speed(iter/s)": 0.201349 }, { "acc": 0.74279928, "epoch": 0.7578640284119736, "grad_norm": 2.03125, "learning_rate": 7.322358706668901e-06, "loss": 1.02943611, "memory(GiB)": 369.4, "step": 29875, "train_speed(iter/s)": 0.201353 }, { "acc": 0.75494356, "epoch": 0.7579908675799086, "grad_norm": 1.90625, "learning_rate": 7.321430007116582e-06, "loss": 0.98712358, "memory(GiB)": 369.4, "step": 29880, "train_speed(iter/s)": 0.201356 }, { "acc": 0.73298316, "epoch": 0.7581177067478437, "grad_norm": 2.59375, "learning_rate": 7.320501205457403e-06, "loss": 1.0466383, "memory(GiB)": 369.4, "step": 29885, "train_speed(iter/s)": 0.201359 }, { "acc": 0.73640103, "epoch": 0.7582445459157788, "grad_norm": 2.46875, "learning_rate": 7.319572301732224e-06, "loss": 1.01285954, "memory(GiB)": 369.4, "step": 29890, "train_speed(iter/s)": 0.201362 }, { "acc": 0.74461384, "epoch": 0.7583713850837138, "grad_norm": 1.8828125, "learning_rate": 7.3186432959818956e-06, "loss": 1.00573854, "memory(GiB)": 369.4, "step": 29895, "train_speed(iter/s)": 0.201366 }, { "acc": 0.75434942, "epoch": 0.7584982242516489, "grad_norm": 2.125, "learning_rate": 7.317714188247285e-06, "loss": 0.96778955, "memory(GiB)": 369.4, "step": 29900, "train_speed(iter/s)": 0.201371 }, { "acc": 0.73952599, "epoch": 0.758625063419584, "grad_norm": 2.015625, "learning_rate": 7.316784978569256e-06, "loss": 1.03168383, "memory(GiB)": 369.4, "step": 29905, "train_speed(iter/s)": 0.201377 }, { "acc": 0.7570775, "epoch": 0.758751902587519, "grad_norm": 2.15625, "learning_rate": 7.31585566698868e-06, "loss": 0.91165609, "memory(GiB)": 369.4, "step": 29910, "train_speed(iter/s)": 0.20138 }, { "acc": 0.75391207, "epoch": 0.7588787417554541, "grad_norm": 2.34375, "learning_rate": 7.314926253546433e-06, "loss": 0.9282217, "memory(GiB)": 369.4, "step": 29915, "train_speed(iter/s)": 0.201385 }, { "acc": 0.75415735, "epoch": 0.7590055809233891, "grad_norm": 2.125, "learning_rate": 7.313996738283393e-06, "loss": 0.99133453, "memory(GiB)": 369.4, "step": 29920, "train_speed(iter/s)": 0.201389 }, { "acc": 0.74992933, "epoch": 0.7591324200913242, "grad_norm": 2.609375, "learning_rate": 7.3130671212404455e-06, "loss": 1.03005924, "memory(GiB)": 369.4, "step": 29925, "train_speed(iter/s)": 0.201395 }, { "acc": 0.75846901, "epoch": 0.7592592592592593, "grad_norm": 2.09375, "learning_rate": 7.312137402458479e-06, "loss": 1.00142632, "memory(GiB)": 369.4, "step": 29930, "train_speed(iter/s)": 0.201402 }, { "acc": 0.75100789, "epoch": 0.7593860984271943, "grad_norm": 1.953125, "learning_rate": 7.3112075819783864e-06, "loss": 0.99547215, "memory(GiB)": 369.4, "step": 29935, "train_speed(iter/s)": 0.201394 }, { "acc": 0.73673687, "epoch": 0.7595129375951294, "grad_norm": 2.796875, "learning_rate": 7.310277659841066e-06, "loss": 1.0753231, "memory(GiB)": 369.4, "step": 29940, "train_speed(iter/s)": 0.2014 }, { "acc": 0.73875656, "epoch": 0.7596397767630645, "grad_norm": 1.9609375, "learning_rate": 7.309347636087418e-06, "loss": 1.01355762, "memory(GiB)": 369.4, "step": 29945, "train_speed(iter/s)": 0.201405 }, { "acc": 0.7257009, "epoch": 0.7597666159309995, "grad_norm": 1.828125, "learning_rate": 7.308417510758353e-06, "loss": 1.06811962, "memory(GiB)": 369.4, "step": 29950, "train_speed(iter/s)": 0.20141 }, { "acc": 0.74315467, "epoch": 0.7598934550989346, "grad_norm": 1.96875, "learning_rate": 7.307487283894777e-06, "loss": 1.04506969, "memory(GiB)": 369.4, "step": 29955, "train_speed(iter/s)": 0.201415 }, { "acc": 0.76021533, "epoch": 0.7600202942668696, "grad_norm": 2.046875, "learning_rate": 7.30655695553761e-06, "loss": 0.91348009, "memory(GiB)": 369.4, "step": 29960, "train_speed(iter/s)": 0.201421 }, { "acc": 0.75377502, "epoch": 0.7601471334348047, "grad_norm": 2.1875, "learning_rate": 7.305626525727769e-06, "loss": 0.98142643, "memory(GiB)": 369.4, "step": 29965, "train_speed(iter/s)": 0.201427 }, { "acc": 0.74794245, "epoch": 0.7602739726027398, "grad_norm": 2.015625, "learning_rate": 7.30469599450618e-06, "loss": 1.05458546, "memory(GiB)": 369.4, "step": 29970, "train_speed(iter/s)": 0.201432 }, { "acc": 0.74513731, "epoch": 0.7604008117706748, "grad_norm": 1.78125, "learning_rate": 7.30376536191377e-06, "loss": 1.01370087, "memory(GiB)": 369.4, "step": 29975, "train_speed(iter/s)": 0.201434 }, { "acc": 0.7398118, "epoch": 0.7605276509386099, "grad_norm": 1.8046875, "learning_rate": 7.302834627991477e-06, "loss": 1.03046246, "memory(GiB)": 369.4, "step": 29980, "train_speed(iter/s)": 0.201435 }, { "acc": 0.74747753, "epoch": 0.760654490106545, "grad_norm": 1.8671875, "learning_rate": 7.301903792780233e-06, "loss": 1.01459312, "memory(GiB)": 369.4, "step": 29985, "train_speed(iter/s)": 0.20144 }, { "acc": 0.7626811, "epoch": 0.76078132927448, "grad_norm": 2.0, "learning_rate": 7.300972856320984e-06, "loss": 0.93700619, "memory(GiB)": 369.4, "step": 29990, "train_speed(iter/s)": 0.201446 }, { "acc": 0.74512324, "epoch": 0.760908168442415, "grad_norm": 2.28125, "learning_rate": 7.3000418186546754e-06, "loss": 0.99369526, "memory(GiB)": 369.4, "step": 29995, "train_speed(iter/s)": 0.201454 }, { "acc": 0.7672267, "epoch": 0.76103500761035, "grad_norm": 1.8359375, "learning_rate": 7.299110679822258e-06, "loss": 0.9373579, "memory(GiB)": 369.4, "step": 30000, "train_speed(iter/s)": 0.201457 }, { "epoch": 0.76103500761035, "eval_acc": 0.7369118724038373, "eval_loss": 0.9745659232139587, "eval_runtime": 384.7533, "eval_samples_per_second": 16.556, "eval_steps_per_second": 8.278, "step": 30000 }, { "acc": 0.738381, "epoch": 0.7611618467782851, "grad_norm": 2.171875, "learning_rate": 7.298179439864689e-06, "loss": 1.0592267, "memory(GiB)": 369.4, "step": 30005, "train_speed(iter/s)": 0.200503 }, { "acc": 0.75732431, "epoch": 0.7612886859462202, "grad_norm": 1.8203125, "learning_rate": 7.297248098822926e-06, "loss": 0.94012051, "memory(GiB)": 369.4, "step": 30010, "train_speed(iter/s)": 0.200507 }, { "acc": 0.73925848, "epoch": 0.7614155251141552, "grad_norm": 1.84375, "learning_rate": 7.296316656737936e-06, "loss": 1.01290264, "memory(GiB)": 369.4, "step": 30015, "train_speed(iter/s)": 0.200511 }, { "acc": 0.76054134, "epoch": 0.7615423642820903, "grad_norm": 1.9375, "learning_rate": 7.295385113650689e-06, "loss": 0.98442078, "memory(GiB)": 369.4, "step": 30020, "train_speed(iter/s)": 0.200517 }, { "acc": 0.74812517, "epoch": 0.7616692034500254, "grad_norm": 2.125, "learning_rate": 7.294453469602154e-06, "loss": 0.98525238, "memory(GiB)": 369.4, "step": 30025, "train_speed(iter/s)": 0.20052 }, { "acc": 0.76576385, "epoch": 0.7617960426179604, "grad_norm": 1.9609375, "learning_rate": 7.293521724633313e-06, "loss": 0.98011446, "memory(GiB)": 369.4, "step": 30030, "train_speed(iter/s)": 0.200527 }, { "acc": 0.74365611, "epoch": 0.7619228817858955, "grad_norm": 1.9453125, "learning_rate": 7.2925898787851455e-06, "loss": 0.9997797, "memory(GiB)": 369.4, "step": 30035, "train_speed(iter/s)": 0.200528 }, { "acc": 0.75542889, "epoch": 0.7620497209538305, "grad_norm": 2.109375, "learning_rate": 7.2916579320986415e-06, "loss": 0.98265467, "memory(GiB)": 369.4, "step": 30040, "train_speed(iter/s)": 0.200535 }, { "acc": 0.74176621, "epoch": 0.7621765601217656, "grad_norm": 2.0625, "learning_rate": 7.290725884614787e-06, "loss": 1.04152222, "memory(GiB)": 369.4, "step": 30045, "train_speed(iter/s)": 0.20054 }, { "acc": 0.74105682, "epoch": 0.7623033992897007, "grad_norm": 2.015625, "learning_rate": 7.2897937363745844e-06, "loss": 0.99258518, "memory(GiB)": 369.4, "step": 30050, "train_speed(iter/s)": 0.200541 }, { "acc": 0.74909096, "epoch": 0.7624302384576357, "grad_norm": 2.46875, "learning_rate": 7.2888614874190276e-06, "loss": 0.99440613, "memory(GiB)": 369.4, "step": 30055, "train_speed(iter/s)": 0.200548 }, { "acc": 0.7519794, "epoch": 0.7625570776255708, "grad_norm": 2.4375, "learning_rate": 7.287929137789124e-06, "loss": 1.03050547, "memory(GiB)": 369.4, "step": 30060, "train_speed(iter/s)": 0.200549 }, { "acc": 0.75548191, "epoch": 0.7626839167935059, "grad_norm": 2.53125, "learning_rate": 7.286996687525882e-06, "loss": 1.01973228, "memory(GiB)": 369.4, "step": 30065, "train_speed(iter/s)": 0.200554 }, { "acc": 0.73750224, "epoch": 0.7628107559614409, "grad_norm": 1.9765625, "learning_rate": 7.2860641366703155e-06, "loss": 0.99538155, "memory(GiB)": 369.4, "step": 30070, "train_speed(iter/s)": 0.200557 }, { "acc": 0.7413805, "epoch": 0.762937595129376, "grad_norm": 2.0, "learning_rate": 7.285131485263441e-06, "loss": 1.03146505, "memory(GiB)": 369.4, "step": 30075, "train_speed(iter/s)": 0.200562 }, { "acc": 0.74429369, "epoch": 0.763064434297311, "grad_norm": 2.25, "learning_rate": 7.2841987333462815e-06, "loss": 1.06471653, "memory(GiB)": 369.4, "step": 30080, "train_speed(iter/s)": 0.200567 }, { "acc": 0.74701424, "epoch": 0.7631912734652461, "grad_norm": 2.5, "learning_rate": 7.283265880959863e-06, "loss": 1.02458305, "memory(GiB)": 369.4, "step": 30085, "train_speed(iter/s)": 0.20057 }, { "acc": 0.7393074, "epoch": 0.7633181126331812, "grad_norm": 2.046875, "learning_rate": 7.282332928145219e-06, "loss": 1.00872478, "memory(GiB)": 369.4, "step": 30090, "train_speed(iter/s)": 0.200575 }, { "acc": 0.75010738, "epoch": 0.7634449518011162, "grad_norm": 2.359375, "learning_rate": 7.281399874943381e-06, "loss": 0.9754549, "memory(GiB)": 369.4, "step": 30095, "train_speed(iter/s)": 0.200578 }, { "acc": 0.7433651, "epoch": 0.7635717909690513, "grad_norm": 2.390625, "learning_rate": 7.280466721395393e-06, "loss": 1.0668272, "memory(GiB)": 369.4, "step": 30100, "train_speed(iter/s)": 0.200583 }, { "acc": 0.73652573, "epoch": 0.7636986301369864, "grad_norm": 1.8984375, "learning_rate": 7.279533467542295e-06, "loss": 1.01352968, "memory(GiB)": 369.4, "step": 30105, "train_speed(iter/s)": 0.200588 }, { "acc": 0.74449234, "epoch": 0.7638254693049213, "grad_norm": 2.3125, "learning_rate": 7.2786001134251385e-06, "loss": 1.02307329, "memory(GiB)": 369.4, "step": 30110, "train_speed(iter/s)": 0.200595 }, { "acc": 0.75868979, "epoch": 0.7639523084728564, "grad_norm": 2.3125, "learning_rate": 7.2776666590849744e-06, "loss": 0.97653189, "memory(GiB)": 369.4, "step": 30115, "train_speed(iter/s)": 0.2006 }, { "acc": 0.75843792, "epoch": 0.7640791476407914, "grad_norm": 1.8984375, "learning_rate": 7.276733104562863e-06, "loss": 0.96032219, "memory(GiB)": 369.4, "step": 30120, "train_speed(iter/s)": 0.2006 }, { "acc": 0.74837523, "epoch": 0.7642059868087265, "grad_norm": 2.015625, "learning_rate": 7.275799449899865e-06, "loss": 1.02358131, "memory(GiB)": 369.4, "step": 30125, "train_speed(iter/s)": 0.200602 }, { "acc": 0.75460186, "epoch": 0.7643328259766616, "grad_norm": 2.203125, "learning_rate": 7.274865695137046e-06, "loss": 1.02049894, "memory(GiB)": 369.4, "step": 30130, "train_speed(iter/s)": 0.200606 }, { "acc": 0.7504097, "epoch": 0.7644596651445966, "grad_norm": 2.1875, "learning_rate": 7.273931840315477e-06, "loss": 0.98611584, "memory(GiB)": 369.4, "step": 30135, "train_speed(iter/s)": 0.200613 }, { "acc": 0.75935383, "epoch": 0.7645865043125317, "grad_norm": 1.9609375, "learning_rate": 7.272997885476234e-06, "loss": 0.9822794, "memory(GiB)": 369.4, "step": 30140, "train_speed(iter/s)": 0.20062 }, { "acc": 0.73069539, "epoch": 0.7647133434804668, "grad_norm": 1.828125, "learning_rate": 7.272063830660395e-06, "loss": 1.07209949, "memory(GiB)": 369.4, "step": 30145, "train_speed(iter/s)": 0.200623 }, { "acc": 0.75135183, "epoch": 0.7648401826484018, "grad_norm": 2.046875, "learning_rate": 7.271129675909046e-06, "loss": 0.97043953, "memory(GiB)": 369.4, "step": 30150, "train_speed(iter/s)": 0.20063 }, { "acc": 0.76834345, "epoch": 0.7649670218163369, "grad_norm": 2.203125, "learning_rate": 7.270195421263271e-06, "loss": 0.90848207, "memory(GiB)": 369.4, "step": 30155, "train_speed(iter/s)": 0.200637 }, { "acc": 0.75153499, "epoch": 0.7650938609842719, "grad_norm": 2.09375, "learning_rate": 7.269261066764169e-06, "loss": 0.99725618, "memory(GiB)": 369.4, "step": 30160, "train_speed(iter/s)": 0.200642 }, { "acc": 0.74020429, "epoch": 0.765220700152207, "grad_norm": 2.40625, "learning_rate": 7.268326612452832e-06, "loss": 0.99479809, "memory(GiB)": 369.4, "step": 30165, "train_speed(iter/s)": 0.200649 }, { "acc": 0.75004139, "epoch": 0.7653475393201421, "grad_norm": 2.203125, "learning_rate": 7.267392058370364e-06, "loss": 0.98462734, "memory(GiB)": 369.4, "step": 30170, "train_speed(iter/s)": 0.200656 }, { "acc": 0.75446291, "epoch": 0.7654743784880771, "grad_norm": 2.21875, "learning_rate": 7.2664574045578685e-06, "loss": 0.97212563, "memory(GiB)": 369.4, "step": 30175, "train_speed(iter/s)": 0.20066 }, { "acc": 0.75083113, "epoch": 0.7656012176560122, "grad_norm": 2.015625, "learning_rate": 7.26552265105646e-06, "loss": 1.02399406, "memory(GiB)": 369.4, "step": 30180, "train_speed(iter/s)": 0.200665 }, { "acc": 0.74170933, "epoch": 0.7657280568239473, "grad_norm": 2.1875, "learning_rate": 7.264587797907248e-06, "loss": 1.01945305, "memory(GiB)": 369.4, "step": 30185, "train_speed(iter/s)": 0.200672 }, { "acc": 0.74481363, "epoch": 0.7658548959918823, "grad_norm": 2.21875, "learning_rate": 7.263652845151354e-06, "loss": 0.97682571, "memory(GiB)": 369.4, "step": 30190, "train_speed(iter/s)": 0.200678 }, { "acc": 0.74483061, "epoch": 0.7659817351598174, "grad_norm": 2.328125, "learning_rate": 7.262717792829903e-06, "loss": 1.01431522, "memory(GiB)": 369.4, "step": 30195, "train_speed(iter/s)": 0.200684 }, { "acc": 0.74925098, "epoch": 0.7661085743277524, "grad_norm": 2.296875, "learning_rate": 7.261782640984021e-06, "loss": 0.99102917, "memory(GiB)": 369.4, "step": 30200, "train_speed(iter/s)": 0.200689 }, { "acc": 0.73431587, "epoch": 0.7662354134956875, "grad_norm": 2.015625, "learning_rate": 7.26084738965484e-06, "loss": 1.05527811, "memory(GiB)": 369.4, "step": 30205, "train_speed(iter/s)": 0.20069 }, { "acc": 0.7477499, "epoch": 0.7663622526636226, "grad_norm": 2.25, "learning_rate": 7.2599120388834964e-06, "loss": 1.0076643, "memory(GiB)": 369.4, "step": 30210, "train_speed(iter/s)": 0.200694 }, { "acc": 0.7644268, "epoch": 0.7664890918315576, "grad_norm": 8.4375, "learning_rate": 7.258976588711133e-06, "loss": 0.9815897, "memory(GiB)": 369.4, "step": 30215, "train_speed(iter/s)": 0.2007 }, { "acc": 0.75905776, "epoch": 0.7666159309994927, "grad_norm": 2.296875, "learning_rate": 7.258041039178891e-06, "loss": 0.94742899, "memory(GiB)": 369.4, "step": 30220, "train_speed(iter/s)": 0.200704 }, { "acc": 0.72365689, "epoch": 0.7667427701674278, "grad_norm": 2.453125, "learning_rate": 7.257105390327925e-06, "loss": 1.04279232, "memory(GiB)": 369.4, "step": 30225, "train_speed(iter/s)": 0.200707 }, { "acc": 0.74244385, "epoch": 0.7668696093353627, "grad_norm": 2.0625, "learning_rate": 7.256169642199386e-06, "loss": 0.96687632, "memory(GiB)": 369.4, "step": 30230, "train_speed(iter/s)": 0.200712 }, { "acc": 0.7394289, "epoch": 0.7669964485032978, "grad_norm": 2.21875, "learning_rate": 7.255233794834432e-06, "loss": 1.02067909, "memory(GiB)": 369.4, "step": 30235, "train_speed(iter/s)": 0.200718 }, { "acc": 0.76795855, "epoch": 0.7671232876712328, "grad_norm": 2.375, "learning_rate": 7.254297848274229e-06, "loss": 0.93336439, "memory(GiB)": 369.4, "step": 30240, "train_speed(iter/s)": 0.200723 }, { "acc": 0.74822941, "epoch": 0.7672501268391679, "grad_norm": 2.609375, "learning_rate": 7.25336180255994e-06, "loss": 1.01419935, "memory(GiB)": 369.4, "step": 30245, "train_speed(iter/s)": 0.200727 }, { "acc": 0.75442142, "epoch": 0.767376966007103, "grad_norm": 1.90625, "learning_rate": 7.25242565773274e-06, "loss": 1.04084272, "memory(GiB)": 369.4, "step": 30250, "train_speed(iter/s)": 0.200728 }, { "acc": 0.74422517, "epoch": 0.767503805175038, "grad_norm": 2.125, "learning_rate": 7.251489413833801e-06, "loss": 1.01049223, "memory(GiB)": 369.4, "step": 30255, "train_speed(iter/s)": 0.200732 }, { "acc": 0.74825764, "epoch": 0.7676306443429731, "grad_norm": 2.125, "learning_rate": 7.250553070904307e-06, "loss": 0.98970337, "memory(GiB)": 369.4, "step": 30260, "train_speed(iter/s)": 0.200738 }, { "acc": 0.75014381, "epoch": 0.7677574835109082, "grad_norm": 2.328125, "learning_rate": 7.2496166289854404e-06, "loss": 0.9687809, "memory(GiB)": 369.4, "step": 30265, "train_speed(iter/s)": 0.200744 }, { "acc": 0.75620956, "epoch": 0.7678843226788432, "grad_norm": 2.09375, "learning_rate": 7.24868008811839e-06, "loss": 1.02518387, "memory(GiB)": 369.4, "step": 30270, "train_speed(iter/s)": 0.200751 }, { "acc": 0.7454154, "epoch": 0.7680111618467783, "grad_norm": 2.25, "learning_rate": 7.247743448344351e-06, "loss": 1.00655594, "memory(GiB)": 369.4, "step": 30275, "train_speed(iter/s)": 0.200758 }, { "acc": 0.73694272, "epoch": 0.7681380010147133, "grad_norm": 2.015625, "learning_rate": 7.246806709704519e-06, "loss": 1.05890961, "memory(GiB)": 369.4, "step": 30280, "train_speed(iter/s)": 0.200762 }, { "acc": 0.74391913, "epoch": 0.7682648401826484, "grad_norm": 1.9921875, "learning_rate": 7.245869872240098e-06, "loss": 1.05956297, "memory(GiB)": 369.4, "step": 30285, "train_speed(iter/s)": 0.200762 }, { "acc": 0.74946899, "epoch": 0.7683916793505835, "grad_norm": 2.1875, "learning_rate": 7.244932935992292e-06, "loss": 1.03324862, "memory(GiB)": 369.4, "step": 30290, "train_speed(iter/s)": 0.200766 }, { "acc": 0.74806104, "epoch": 0.7685185185185185, "grad_norm": 2.6875, "learning_rate": 7.243995901002312e-06, "loss": 1.04002666, "memory(GiB)": 369.4, "step": 30295, "train_speed(iter/s)": 0.200771 }, { "acc": 0.74759073, "epoch": 0.7686453576864536, "grad_norm": 2.390625, "learning_rate": 7.243058767311374e-06, "loss": 1.00808935, "memory(GiB)": 369.4, "step": 30300, "train_speed(iter/s)": 0.200778 }, { "acc": 0.75853891, "epoch": 0.7687721968543887, "grad_norm": 2.546875, "learning_rate": 7.2421215349606955e-06, "loss": 0.95883045, "memory(GiB)": 369.4, "step": 30305, "train_speed(iter/s)": 0.200783 }, { "acc": 0.7587606, "epoch": 0.7688990360223237, "grad_norm": 2.375, "learning_rate": 7.241184203991505e-06, "loss": 0.90831909, "memory(GiB)": 369.4, "step": 30310, "train_speed(iter/s)": 0.200788 }, { "acc": 0.74446855, "epoch": 0.7690258751902588, "grad_norm": 2.328125, "learning_rate": 7.240246774445024e-06, "loss": 1.00533123, "memory(GiB)": 369.4, "step": 30315, "train_speed(iter/s)": 0.200795 }, { "acc": 0.74929504, "epoch": 0.7691527143581938, "grad_norm": 2.328125, "learning_rate": 7.23930924636249e-06, "loss": 1.01011095, "memory(GiB)": 369.4, "step": 30320, "train_speed(iter/s)": 0.200799 }, { "acc": 0.73659468, "epoch": 0.7692795535261289, "grad_norm": 2.21875, "learning_rate": 7.238371619785134e-06, "loss": 1.01551819, "memory(GiB)": 369.4, "step": 30325, "train_speed(iter/s)": 0.200802 }, { "acc": 0.74102631, "epoch": 0.769406392694064, "grad_norm": 2.109375, "learning_rate": 7.237433894754205e-06, "loss": 1.0411108, "memory(GiB)": 369.4, "step": 30330, "train_speed(iter/s)": 0.200805 }, { "acc": 0.74777727, "epoch": 0.769533231861999, "grad_norm": 2.390625, "learning_rate": 7.23649607131094e-06, "loss": 1.0285161, "memory(GiB)": 369.4, "step": 30335, "train_speed(iter/s)": 0.200812 }, { "acc": 0.74369287, "epoch": 0.7696600710299341, "grad_norm": 2.015625, "learning_rate": 7.235558149496595e-06, "loss": 0.99460526, "memory(GiB)": 369.4, "step": 30340, "train_speed(iter/s)": 0.20082 }, { "acc": 0.75596995, "epoch": 0.7697869101978692, "grad_norm": 2.125, "learning_rate": 7.23462012935242e-06, "loss": 0.98838425, "memory(GiB)": 369.4, "step": 30345, "train_speed(iter/s)": 0.200824 }, { "acc": 0.73730903, "epoch": 0.7699137493658041, "grad_norm": 2.109375, "learning_rate": 7.233682010919676e-06, "loss": 1.03920021, "memory(GiB)": 369.4, "step": 30350, "train_speed(iter/s)": 0.200831 }, { "acc": 0.75650816, "epoch": 0.7700405885337392, "grad_norm": 1.8046875, "learning_rate": 7.2327437942396236e-06, "loss": 1.00782585, "memory(GiB)": 369.4, "step": 30355, "train_speed(iter/s)": 0.200835 }, { "acc": 0.75872703, "epoch": 0.7701674277016742, "grad_norm": 2.25, "learning_rate": 7.231805479353532e-06, "loss": 0.93729382, "memory(GiB)": 369.4, "step": 30360, "train_speed(iter/s)": 0.200838 }, { "acc": 0.75491104, "epoch": 0.7702942668696093, "grad_norm": 2.015625, "learning_rate": 7.2308670663026705e-06, "loss": 0.99706879, "memory(GiB)": 369.4, "step": 30365, "train_speed(iter/s)": 0.200843 }, { "acc": 0.73645582, "epoch": 0.7704211060375444, "grad_norm": 1.921875, "learning_rate": 7.229928555128315e-06, "loss": 1.03142204, "memory(GiB)": 369.4, "step": 30370, "train_speed(iter/s)": 0.200848 }, { "acc": 0.75029731, "epoch": 0.7705479452054794, "grad_norm": 2.34375, "learning_rate": 7.228989945871745e-06, "loss": 1.01522942, "memory(GiB)": 369.4, "step": 30375, "train_speed(iter/s)": 0.20085 }, { "acc": 0.74914346, "epoch": 0.7706747843734145, "grad_norm": 1.890625, "learning_rate": 7.2280512385742475e-06, "loss": 0.98116016, "memory(GiB)": 369.4, "step": 30380, "train_speed(iter/s)": 0.200855 }, { "acc": 0.75147295, "epoch": 0.7708016235413496, "grad_norm": 2.109375, "learning_rate": 7.227112433277107e-06, "loss": 0.99743023, "memory(GiB)": 369.4, "step": 30385, "train_speed(iter/s)": 0.200859 }, { "acc": 0.74581852, "epoch": 0.7709284627092846, "grad_norm": 2.4375, "learning_rate": 7.2261735300216195e-06, "loss": 1.03212624, "memory(GiB)": 369.4, "step": 30390, "train_speed(iter/s)": 0.200862 }, { "acc": 0.74487081, "epoch": 0.7710553018772197, "grad_norm": 2.234375, "learning_rate": 7.22523452884908e-06, "loss": 1.00010033, "memory(GiB)": 369.4, "step": 30395, "train_speed(iter/s)": 0.200867 }, { "acc": 0.73348727, "epoch": 0.7711821410451547, "grad_norm": 2.15625, "learning_rate": 7.224295429800792e-06, "loss": 1.02453823, "memory(GiB)": 369.4, "step": 30400, "train_speed(iter/s)": 0.200873 }, { "acc": 0.75051918, "epoch": 0.7713089802130898, "grad_norm": 2.0625, "learning_rate": 7.22335623291806e-06, "loss": 1.03183088, "memory(GiB)": 369.4, "step": 30405, "train_speed(iter/s)": 0.200878 }, { "acc": 0.7460412, "epoch": 0.7714358193810249, "grad_norm": 1.9453125, "learning_rate": 7.222416938242194e-06, "loss": 1.03862648, "memory(GiB)": 369.4, "step": 30410, "train_speed(iter/s)": 0.200884 }, { "acc": 0.76409402, "epoch": 0.7715626585489599, "grad_norm": 2.390625, "learning_rate": 7.221477545814509e-06, "loss": 0.90638561, "memory(GiB)": 369.4, "step": 30415, "train_speed(iter/s)": 0.200889 }, { "acc": 0.74910779, "epoch": 0.771689497716895, "grad_norm": 2.03125, "learning_rate": 7.220538055676323e-06, "loss": 1.02667046, "memory(GiB)": 369.4, "step": 30420, "train_speed(iter/s)": 0.200891 }, { "acc": 0.74280729, "epoch": 0.7718163368848301, "grad_norm": 2.640625, "learning_rate": 7.21959846786896e-06, "loss": 0.98072319, "memory(GiB)": 369.4, "step": 30425, "train_speed(iter/s)": 0.200895 }, { "acc": 0.75711355, "epoch": 0.7719431760527651, "grad_norm": 2.265625, "learning_rate": 7.218658782433746e-06, "loss": 0.96322851, "memory(GiB)": 369.4, "step": 30430, "train_speed(iter/s)": 0.2009 }, { "acc": 0.76269083, "epoch": 0.7720700152207002, "grad_norm": 2.234375, "learning_rate": 7.217718999412013e-06, "loss": 0.9108799, "memory(GiB)": 369.4, "step": 30435, "train_speed(iter/s)": 0.200905 }, { "acc": 0.75136633, "epoch": 0.7721968543886352, "grad_norm": 2.359375, "learning_rate": 7.216779118845097e-06, "loss": 1.02308178, "memory(GiB)": 369.4, "step": 30440, "train_speed(iter/s)": 0.200908 }, { "acc": 0.74096737, "epoch": 0.7723236935565703, "grad_norm": 2.359375, "learning_rate": 7.215839140774339e-06, "loss": 0.99181747, "memory(GiB)": 369.4, "step": 30445, "train_speed(iter/s)": 0.200914 }, { "acc": 0.7545722, "epoch": 0.7724505327245054, "grad_norm": 2.09375, "learning_rate": 7.214899065241082e-06, "loss": 1.05215778, "memory(GiB)": 369.4, "step": 30450, "train_speed(iter/s)": 0.20092 }, { "acc": 0.75329223, "epoch": 0.7725773718924404, "grad_norm": 2.609375, "learning_rate": 7.213958892286674e-06, "loss": 0.98652344, "memory(GiB)": 369.4, "step": 30455, "train_speed(iter/s)": 0.200922 }, { "acc": 0.74111967, "epoch": 0.7727042110603755, "grad_norm": 1.9375, "learning_rate": 7.213018621952472e-06, "loss": 0.95832596, "memory(GiB)": 369.4, "step": 30460, "train_speed(iter/s)": 0.200928 }, { "acc": 0.75133257, "epoch": 0.7728310502283106, "grad_norm": 2.171875, "learning_rate": 7.212078254279828e-06, "loss": 0.95346813, "memory(GiB)": 369.4, "step": 30465, "train_speed(iter/s)": 0.200933 }, { "acc": 0.73691273, "epoch": 0.7729578893962455, "grad_norm": 2.125, "learning_rate": 7.211137789310109e-06, "loss": 1.03472099, "memory(GiB)": 369.4, "step": 30470, "train_speed(iter/s)": 0.200938 }, { "acc": 0.7661047, "epoch": 0.7730847285641806, "grad_norm": 2.03125, "learning_rate": 7.2101972270846756e-06, "loss": 0.92486343, "memory(GiB)": 369.4, "step": 30475, "train_speed(iter/s)": 0.200941 }, { "acc": 0.76383076, "epoch": 0.7732115677321156, "grad_norm": 1.9453125, "learning_rate": 7.2092565676449e-06, "loss": 0.9430584, "memory(GiB)": 369.4, "step": 30480, "train_speed(iter/s)": 0.200945 }, { "acc": 0.75982924, "epoch": 0.7733384069000507, "grad_norm": 7.09375, "learning_rate": 7.208315811032158e-06, "loss": 0.93396597, "memory(GiB)": 369.4, "step": 30485, "train_speed(iter/s)": 0.200951 }, { "acc": 0.74629121, "epoch": 0.7734652460679858, "grad_norm": 2.265625, "learning_rate": 7.207374957287828e-06, "loss": 0.99365692, "memory(GiB)": 369.4, "step": 30490, "train_speed(iter/s)": 0.200958 }, { "acc": 0.74642997, "epoch": 0.7735920852359208, "grad_norm": 2.296875, "learning_rate": 7.2064340064532914e-06, "loss": 0.99859676, "memory(GiB)": 369.4, "step": 30495, "train_speed(iter/s)": 0.200961 }, { "acc": 0.72540622, "epoch": 0.7737189244038559, "grad_norm": 2.1875, "learning_rate": 7.205492958569936e-06, "loss": 1.04254074, "memory(GiB)": 369.4, "step": 30500, "train_speed(iter/s)": 0.200969 }, { "acc": 0.74979315, "epoch": 0.773845763571791, "grad_norm": 1.96875, "learning_rate": 7.204551813679154e-06, "loss": 1.07942619, "memory(GiB)": 369.4, "step": 30505, "train_speed(iter/s)": 0.200974 }, { "acc": 0.73855247, "epoch": 0.773972602739726, "grad_norm": 2.28125, "learning_rate": 7.2036105718223405e-06, "loss": 1.0341876, "memory(GiB)": 369.4, "step": 30510, "train_speed(iter/s)": 0.200977 }, { "acc": 0.7571682, "epoch": 0.7740994419076611, "grad_norm": 2.328125, "learning_rate": 7.202669233040896e-06, "loss": 1.0331255, "memory(GiB)": 369.4, "step": 30515, "train_speed(iter/s)": 0.200984 }, { "acc": 0.74478273, "epoch": 0.7742262810755961, "grad_norm": 2.140625, "learning_rate": 7.201727797376223e-06, "loss": 1.03154526, "memory(GiB)": 369.4, "step": 30520, "train_speed(iter/s)": 0.200985 }, { "acc": 0.7526741, "epoch": 0.7743531202435312, "grad_norm": 2.125, "learning_rate": 7.200786264869732e-06, "loss": 1.05138206, "memory(GiB)": 369.4, "step": 30525, "train_speed(iter/s)": 0.20099 }, { "acc": 0.7368494, "epoch": 0.7744799594114663, "grad_norm": 1.8125, "learning_rate": 7.199844635562836e-06, "loss": 0.99297504, "memory(GiB)": 369.4, "step": 30530, "train_speed(iter/s)": 0.200993 }, { "acc": 0.73817544, "epoch": 0.7746067985794013, "grad_norm": 1.703125, "learning_rate": 7.19890290949695e-06, "loss": 1.034128, "memory(GiB)": 369.4, "step": 30535, "train_speed(iter/s)": 0.200998 }, { "acc": 0.73547201, "epoch": 0.7747336377473364, "grad_norm": 2.125, "learning_rate": 7.197961086713498e-06, "loss": 1.02865276, "memory(GiB)": 369.4, "step": 30540, "train_speed(iter/s)": 0.201001 }, { "acc": 0.74523783, "epoch": 0.7748604769152715, "grad_norm": 2.3125, "learning_rate": 7.197019167253904e-06, "loss": 1.01301966, "memory(GiB)": 369.4, "step": 30545, "train_speed(iter/s)": 0.201008 }, { "acc": 0.75222321, "epoch": 0.7749873160832065, "grad_norm": 1.7265625, "learning_rate": 7.196077151159597e-06, "loss": 1.01106386, "memory(GiB)": 369.4, "step": 30550, "train_speed(iter/s)": 0.20101 }, { "acc": 0.75968657, "epoch": 0.7751141552511416, "grad_norm": 2.296875, "learning_rate": 7.195135038472013e-06, "loss": 1.01139278, "memory(GiB)": 369.4, "step": 30555, "train_speed(iter/s)": 0.201014 }, { "acc": 0.76238194, "epoch": 0.7752409944190766, "grad_norm": 2.15625, "learning_rate": 7.194192829232589e-06, "loss": 0.99188175, "memory(GiB)": 369.4, "step": 30560, "train_speed(iter/s)": 0.201016 }, { "acc": 0.75191646, "epoch": 0.7753678335870117, "grad_norm": 2.640625, "learning_rate": 7.1932505234827686e-06, "loss": 0.97486134, "memory(GiB)": 369.4, "step": 30565, "train_speed(iter/s)": 0.201023 }, { "acc": 0.74853506, "epoch": 0.7754946727549468, "grad_norm": 2.203125, "learning_rate": 7.192308121263998e-06, "loss": 1.04170685, "memory(GiB)": 369.4, "step": 30570, "train_speed(iter/s)": 0.201029 }, { "acc": 0.75686655, "epoch": 0.7756215119228818, "grad_norm": 2.140625, "learning_rate": 7.191365622617728e-06, "loss": 0.98713799, "memory(GiB)": 369.4, "step": 30575, "train_speed(iter/s)": 0.201032 }, { "acc": 0.7421114, "epoch": 0.7757483510908169, "grad_norm": 1.8671875, "learning_rate": 7.190423027585414e-06, "loss": 1.04042416, "memory(GiB)": 369.4, "step": 30580, "train_speed(iter/s)": 0.201035 }, { "acc": 0.74553823, "epoch": 0.775875190258752, "grad_norm": 3.03125, "learning_rate": 7.189480336208516e-06, "loss": 1.02324238, "memory(GiB)": 369.4, "step": 30585, "train_speed(iter/s)": 0.201042 }, { "acc": 0.74382114, "epoch": 0.776002029426687, "grad_norm": 2.3125, "learning_rate": 7.188537548528498e-06, "loss": 1.00725727, "memory(GiB)": 369.4, "step": 30590, "train_speed(iter/s)": 0.201045 }, { "acc": 0.74430161, "epoch": 0.776128868594622, "grad_norm": 2.21875, "learning_rate": 7.187594664586826e-06, "loss": 1.01268759, "memory(GiB)": 369.4, "step": 30595, "train_speed(iter/s)": 0.201048 }, { "acc": 0.74662361, "epoch": 0.776255707762557, "grad_norm": 1.90625, "learning_rate": 7.186651684424975e-06, "loss": 1.04428883, "memory(GiB)": 369.4, "step": 30600, "train_speed(iter/s)": 0.201052 }, { "acc": 0.74320431, "epoch": 0.7763825469304921, "grad_norm": 2.796875, "learning_rate": 7.185708608084418e-06, "loss": 1.02123804, "memory(GiB)": 369.4, "step": 30605, "train_speed(iter/s)": 0.201058 }, { "acc": 0.74960246, "epoch": 0.7765093860984272, "grad_norm": 2.046875, "learning_rate": 7.184765435606642e-06, "loss": 1.00526772, "memory(GiB)": 369.4, "step": 30610, "train_speed(iter/s)": 0.201063 }, { "acc": 0.75594611, "epoch": 0.7766362252663622, "grad_norm": 2.25, "learning_rate": 7.183822167033124e-06, "loss": 1.03105745, "memory(GiB)": 369.4, "step": 30615, "train_speed(iter/s)": 0.201066 }, { "acc": 0.74660344, "epoch": 0.7767630644342973, "grad_norm": 2.203125, "learning_rate": 7.18287880240536e-06, "loss": 1.00331392, "memory(GiB)": 369.4, "step": 30620, "train_speed(iter/s)": 0.201072 }, { "acc": 0.76813345, "epoch": 0.7768899036022324, "grad_norm": 2.453125, "learning_rate": 7.1819353417648386e-06, "loss": 0.91939564, "memory(GiB)": 369.4, "step": 30625, "train_speed(iter/s)": 0.201078 }, { "acc": 0.7497508, "epoch": 0.7770167427701674, "grad_norm": 2.015625, "learning_rate": 7.180991785153059e-06, "loss": 0.98994904, "memory(GiB)": 369.4, "step": 30630, "train_speed(iter/s)": 0.201083 }, { "acc": 0.74859819, "epoch": 0.7771435819381025, "grad_norm": 1.9921875, "learning_rate": 7.180048132611524e-06, "loss": 0.98459167, "memory(GiB)": 369.4, "step": 30635, "train_speed(iter/s)": 0.201087 }, { "acc": 0.74266887, "epoch": 0.7772704211060375, "grad_norm": 3.4375, "learning_rate": 7.17910438418174e-06, "loss": 0.96281271, "memory(GiB)": 369.4, "step": 30640, "train_speed(iter/s)": 0.20109 }, { "acc": 0.74635301, "epoch": 0.7773972602739726, "grad_norm": 2.390625, "learning_rate": 7.178160539905214e-06, "loss": 1.01962242, "memory(GiB)": 369.4, "step": 30645, "train_speed(iter/s)": 0.201097 }, { "acc": 0.75773363, "epoch": 0.7775240994419077, "grad_norm": 2.171875, "learning_rate": 7.1772165998234645e-06, "loss": 0.97213154, "memory(GiB)": 369.4, "step": 30650, "train_speed(iter/s)": 0.201098 }, { "acc": 0.74920511, "epoch": 0.7776509386098427, "grad_norm": 1.9140625, "learning_rate": 7.176272563978007e-06, "loss": 1.01301308, "memory(GiB)": 369.4, "step": 30655, "train_speed(iter/s)": 0.201102 }, { "acc": 0.74418335, "epoch": 0.7777777777777778, "grad_norm": 2.265625, "learning_rate": 7.175328432410367e-06, "loss": 1.00630054, "memory(GiB)": 369.4, "step": 30660, "train_speed(iter/s)": 0.201107 }, { "acc": 0.73814163, "epoch": 0.7779046169457129, "grad_norm": 2.3125, "learning_rate": 7.17438420516207e-06, "loss": 1.02509232, "memory(GiB)": 369.4, "step": 30665, "train_speed(iter/s)": 0.201109 }, { "acc": 0.75822115, "epoch": 0.7780314561136479, "grad_norm": 2.1875, "learning_rate": 7.173439882274647e-06, "loss": 0.9723218, "memory(GiB)": 369.4, "step": 30670, "train_speed(iter/s)": 0.201112 }, { "acc": 0.74318447, "epoch": 0.778158295281583, "grad_norm": 2.34375, "learning_rate": 7.172495463789635e-06, "loss": 1.02172194, "memory(GiB)": 369.4, "step": 30675, "train_speed(iter/s)": 0.201118 }, { "acc": 0.74139872, "epoch": 0.778285134449518, "grad_norm": 2.140625, "learning_rate": 7.171550949748574e-06, "loss": 1.02834187, "memory(GiB)": 369.4, "step": 30680, "train_speed(iter/s)": 0.201123 }, { "acc": 0.75202847, "epoch": 0.7784119736174531, "grad_norm": 2.09375, "learning_rate": 7.170606340193003e-06, "loss": 0.9910099, "memory(GiB)": 369.4, "step": 30685, "train_speed(iter/s)": 0.201128 }, { "acc": 0.74358225, "epoch": 0.7785388127853882, "grad_norm": 2.40625, "learning_rate": 7.1696616351644786e-06, "loss": 1.05388966, "memory(GiB)": 369.4, "step": 30690, "train_speed(iter/s)": 0.201125 }, { "acc": 0.75975809, "epoch": 0.7786656519533232, "grad_norm": 2.21875, "learning_rate": 7.168716834704546e-06, "loss": 1.00906963, "memory(GiB)": 369.4, "step": 30695, "train_speed(iter/s)": 0.201132 }, { "acc": 0.75325031, "epoch": 0.7787924911212583, "grad_norm": 2.046875, "learning_rate": 7.167771938854766e-06, "loss": 1.03281393, "memory(GiB)": 369.4, "step": 30700, "train_speed(iter/s)": 0.201138 }, { "acc": 0.74629989, "epoch": 0.7789193302891934, "grad_norm": 2.703125, "learning_rate": 7.166826947656696e-06, "loss": 1.08338919, "memory(GiB)": 369.4, "step": 30705, "train_speed(iter/s)": 0.201144 }, { "acc": 0.74737511, "epoch": 0.7790461694571283, "grad_norm": 1.9453125, "learning_rate": 7.165881861151904e-06, "loss": 1.02714605, "memory(GiB)": 369.4, "step": 30710, "train_speed(iter/s)": 0.201149 }, { "acc": 0.75438356, "epoch": 0.7791730086250634, "grad_norm": 2.234375, "learning_rate": 7.164936679381957e-06, "loss": 0.96855507, "memory(GiB)": 369.4, "step": 30715, "train_speed(iter/s)": 0.201153 }, { "acc": 0.73971086, "epoch": 0.7792998477929984, "grad_norm": 2.390625, "learning_rate": 7.16399140238843e-06, "loss": 1.05766296, "memory(GiB)": 369.4, "step": 30720, "train_speed(iter/s)": 0.201158 }, { "acc": 0.74271259, "epoch": 0.7794266869609335, "grad_norm": 1.84375, "learning_rate": 7.163046030212899e-06, "loss": 1.02408533, "memory(GiB)": 369.4, "step": 30725, "train_speed(iter/s)": 0.201165 }, { "acc": 0.75864286, "epoch": 0.7795535261288686, "grad_norm": 2.015625, "learning_rate": 7.1621005628969475e-06, "loss": 0.9522624, "memory(GiB)": 369.4, "step": 30730, "train_speed(iter/s)": 0.201169 }, { "acc": 0.73200068, "epoch": 0.7796803652968036, "grad_norm": 2.421875, "learning_rate": 7.161155000482159e-06, "loss": 1.08342819, "memory(GiB)": 369.4, "step": 30735, "train_speed(iter/s)": 0.201176 }, { "acc": 0.7265811, "epoch": 0.7798072044647387, "grad_norm": 2.1875, "learning_rate": 7.160209343010125e-06, "loss": 1.05877838, "memory(GiB)": 369.4, "step": 30740, "train_speed(iter/s)": 0.201178 }, { "acc": 0.73628445, "epoch": 0.7799340436326738, "grad_norm": 2.296875, "learning_rate": 7.1592635905224386e-06, "loss": 1.0765027, "memory(GiB)": 369.4, "step": 30745, "train_speed(iter/s)": 0.201184 }, { "acc": 0.73949118, "epoch": 0.7800608828006088, "grad_norm": 2.40625, "learning_rate": 7.1583177430606995e-06, "loss": 1.05792074, "memory(GiB)": 369.4, "step": 30750, "train_speed(iter/s)": 0.201187 }, { "acc": 0.73910189, "epoch": 0.7801877219685439, "grad_norm": 2.390625, "learning_rate": 7.1573718006665095e-06, "loss": 1.06723404, "memory(GiB)": 369.4, "step": 30755, "train_speed(iter/s)": 0.201191 }, { "acc": 0.73384857, "epoch": 0.7803145611364789, "grad_norm": 2.734375, "learning_rate": 7.156425763381477e-06, "loss": 1.07005272, "memory(GiB)": 369.4, "step": 30760, "train_speed(iter/s)": 0.201197 }, { "acc": 0.75486012, "epoch": 0.780441400304414, "grad_norm": 2.328125, "learning_rate": 7.155479631247211e-06, "loss": 0.99191036, "memory(GiB)": 369.4, "step": 30765, "train_speed(iter/s)": 0.201203 }, { "acc": 0.73026624, "epoch": 0.7805682394723491, "grad_norm": 2.03125, "learning_rate": 7.154533404305327e-06, "loss": 1.05783901, "memory(GiB)": 369.4, "step": 30770, "train_speed(iter/s)": 0.201206 }, { "acc": 0.75526524, "epoch": 0.7806950786402841, "grad_norm": 2.171875, "learning_rate": 7.153587082597445e-06, "loss": 0.92061024, "memory(GiB)": 369.4, "step": 30775, "train_speed(iter/s)": 0.20121 }, { "acc": 0.76198587, "epoch": 0.7808219178082192, "grad_norm": 2.140625, "learning_rate": 7.152640666165187e-06, "loss": 0.95263815, "memory(GiB)": 369.4, "step": 30780, "train_speed(iter/s)": 0.201217 }, { "acc": 0.75590553, "epoch": 0.7809487569761543, "grad_norm": 2.015625, "learning_rate": 7.151694155050184e-06, "loss": 0.97421608, "memory(GiB)": 369.4, "step": 30785, "train_speed(iter/s)": 0.20122 }, { "acc": 0.74849205, "epoch": 0.7810755961440893, "grad_norm": 1.9296875, "learning_rate": 7.150747549294064e-06, "loss": 1.02976732, "memory(GiB)": 369.4, "step": 30790, "train_speed(iter/s)": 0.201224 }, { "acc": 0.75241866, "epoch": 0.7812024353120244, "grad_norm": 2.5625, "learning_rate": 7.149800848938464e-06, "loss": 0.9988636, "memory(GiB)": 369.4, "step": 30795, "train_speed(iter/s)": 0.20123 }, { "acc": 0.74762421, "epoch": 0.7813292744799594, "grad_norm": 2.328125, "learning_rate": 7.1488540540250254e-06, "loss": 1.04293861, "memory(GiB)": 369.4, "step": 30800, "train_speed(iter/s)": 0.201235 }, { "acc": 0.74304056, "epoch": 0.7814561136478945, "grad_norm": 2.046875, "learning_rate": 7.14790716459539e-06, "loss": 1.0183897, "memory(GiB)": 369.4, "step": 30805, "train_speed(iter/s)": 0.201237 }, { "acc": 0.75297503, "epoch": 0.7815829528158296, "grad_norm": 1.921875, "learning_rate": 7.146960180691209e-06, "loss": 0.97120781, "memory(GiB)": 369.4, "step": 30810, "train_speed(iter/s)": 0.20124 }, { "acc": 0.73928185, "epoch": 0.7817097919837646, "grad_norm": 2.28125, "learning_rate": 7.146013102354133e-06, "loss": 0.9767395, "memory(GiB)": 369.4, "step": 30815, "train_speed(iter/s)": 0.201247 }, { "acc": 0.75773335, "epoch": 0.7818366311516997, "grad_norm": 2.1875, "learning_rate": 7.145065929625821e-06, "loss": 0.95239372, "memory(GiB)": 369.4, "step": 30820, "train_speed(iter/s)": 0.201252 }, { "acc": 0.75018783, "epoch": 0.7819634703196348, "grad_norm": 2.03125, "learning_rate": 7.1441186625479304e-06, "loss": 1.01895866, "memory(GiB)": 369.4, "step": 30825, "train_speed(iter/s)": 0.201256 }, { "acc": 0.74002829, "epoch": 0.7820903094875697, "grad_norm": 2.3125, "learning_rate": 7.143171301162131e-06, "loss": 1.00812168, "memory(GiB)": 369.4, "step": 30830, "train_speed(iter/s)": 0.201262 }, { "acc": 0.73337913, "epoch": 0.7822171486555048, "grad_norm": 2.421875, "learning_rate": 7.142223845510086e-06, "loss": 1.09993362, "memory(GiB)": 369.4, "step": 30835, "train_speed(iter/s)": 0.201269 }, { "acc": 0.7341897, "epoch": 0.7823439878234398, "grad_norm": 2.46875, "learning_rate": 7.1412762956334746e-06, "loss": 1.04092541, "memory(GiB)": 369.4, "step": 30840, "train_speed(iter/s)": 0.201276 }, { "acc": 0.74523106, "epoch": 0.7824708269913749, "grad_norm": 1.8359375, "learning_rate": 7.140328651573969e-06, "loss": 1.01129665, "memory(GiB)": 369.4, "step": 30845, "train_speed(iter/s)": 0.20128 }, { "acc": 0.74801092, "epoch": 0.78259766615931, "grad_norm": 2.09375, "learning_rate": 7.139380913373255e-06, "loss": 1.01273251, "memory(GiB)": 369.4, "step": 30850, "train_speed(iter/s)": 0.201284 }, { "acc": 0.74278355, "epoch": 0.782724505327245, "grad_norm": 2.0, "learning_rate": 7.138433081073017e-06, "loss": 1.05021572, "memory(GiB)": 369.4, "step": 30855, "train_speed(iter/s)": 0.201289 }, { "acc": 0.75376663, "epoch": 0.7828513444951801, "grad_norm": 1.9453125, "learning_rate": 7.137485154714945e-06, "loss": 0.97334023, "memory(GiB)": 369.4, "step": 30860, "train_speed(iter/s)": 0.201286 }, { "acc": 0.75407043, "epoch": 0.7829781836631152, "grad_norm": 1.953125, "learning_rate": 7.1365371343407304e-06, "loss": 0.98926926, "memory(GiB)": 369.4, "step": 30865, "train_speed(iter/s)": 0.201292 }, { "acc": 0.76000328, "epoch": 0.7831050228310502, "grad_norm": 2.421875, "learning_rate": 7.135589019992076e-06, "loss": 0.99600172, "memory(GiB)": 369.4, "step": 30870, "train_speed(iter/s)": 0.201295 }, { "acc": 0.75352983, "epoch": 0.7832318619989853, "grad_norm": 2.4375, "learning_rate": 7.134640811710681e-06, "loss": 0.97169914, "memory(GiB)": 369.4, "step": 30875, "train_speed(iter/s)": 0.201301 }, { "acc": 0.742838, "epoch": 0.7833587011669203, "grad_norm": 2.40625, "learning_rate": 7.133692509538253e-06, "loss": 0.99756336, "memory(GiB)": 369.4, "step": 30880, "train_speed(iter/s)": 0.201302 }, { "acc": 0.74514799, "epoch": 0.7834855403348554, "grad_norm": 2.28125, "learning_rate": 7.132744113516502e-06, "loss": 1.06816101, "memory(GiB)": 369.4, "step": 30885, "train_speed(iter/s)": 0.201307 }, { "acc": 0.7642951, "epoch": 0.7836123795027905, "grad_norm": 2.15625, "learning_rate": 7.1317956236871436e-06, "loss": 0.94965248, "memory(GiB)": 369.4, "step": 30890, "train_speed(iter/s)": 0.201312 }, { "acc": 0.75511637, "epoch": 0.7837392186707255, "grad_norm": 2.25, "learning_rate": 7.130847040091893e-06, "loss": 0.99140339, "memory(GiB)": 369.4, "step": 30895, "train_speed(iter/s)": 0.201315 }, { "acc": 0.75552936, "epoch": 0.7838660578386606, "grad_norm": 2.140625, "learning_rate": 7.1298983627724795e-06, "loss": 0.9638052, "memory(GiB)": 369.4, "step": 30900, "train_speed(iter/s)": 0.201319 }, { "acc": 0.73102732, "epoch": 0.7839928970065957, "grad_norm": 2.59375, "learning_rate": 7.128949591770624e-06, "loss": 1.05839901, "memory(GiB)": 369.4, "step": 30905, "train_speed(iter/s)": 0.201326 }, { "acc": 0.75290456, "epoch": 0.7841197361745307, "grad_norm": 2.265625, "learning_rate": 7.128000727128063e-06, "loss": 1.04167004, "memory(GiB)": 369.4, "step": 30910, "train_speed(iter/s)": 0.201328 }, { "acc": 0.74194756, "epoch": 0.7842465753424658, "grad_norm": 2.328125, "learning_rate": 7.127051768886527e-06, "loss": 1.0238327, "memory(GiB)": 369.4, "step": 30915, "train_speed(iter/s)": 0.201333 }, { "acc": 0.76801248, "epoch": 0.7843734145104008, "grad_norm": 2.390625, "learning_rate": 7.126102717087758e-06, "loss": 0.95234747, "memory(GiB)": 369.4, "step": 30920, "train_speed(iter/s)": 0.201337 }, { "acc": 0.74522276, "epoch": 0.7845002536783359, "grad_norm": 2.046875, "learning_rate": 7.1251535717735e-06, "loss": 1.02089024, "memory(GiB)": 369.4, "step": 30925, "train_speed(iter/s)": 0.201343 }, { "acc": 0.73660507, "epoch": 0.784627092846271, "grad_norm": 2.6875, "learning_rate": 7.1242043329854995e-06, "loss": 1.05330505, "memory(GiB)": 369.4, "step": 30930, "train_speed(iter/s)": 0.201346 }, { "acc": 0.74631453, "epoch": 0.784753932014206, "grad_norm": 2.125, "learning_rate": 7.123255000765508e-06, "loss": 0.99997177, "memory(GiB)": 369.4, "step": 30935, "train_speed(iter/s)": 0.201352 }, { "acc": 0.75566063, "epoch": 0.7848807711821411, "grad_norm": 2.359375, "learning_rate": 7.122305575155283e-06, "loss": 0.98622065, "memory(GiB)": 369.4, "step": 30940, "train_speed(iter/s)": 0.201357 }, { "acc": 0.74437737, "epoch": 0.7850076103500762, "grad_norm": 2.03125, "learning_rate": 7.121356056196582e-06, "loss": 1.03499346, "memory(GiB)": 369.4, "step": 30945, "train_speed(iter/s)": 0.201361 }, { "acc": 0.76495924, "epoch": 0.7851344495180111, "grad_norm": 1.8125, "learning_rate": 7.1204064439311715e-06, "loss": 0.97569571, "memory(GiB)": 369.4, "step": 30950, "train_speed(iter/s)": 0.201366 }, { "acc": 0.75724945, "epoch": 0.7852612886859462, "grad_norm": 2.40625, "learning_rate": 7.119456738400818e-06, "loss": 0.96264696, "memory(GiB)": 369.4, "step": 30955, "train_speed(iter/s)": 0.201373 }, { "acc": 0.75101805, "epoch": 0.7853881278538812, "grad_norm": 2.03125, "learning_rate": 7.118506939647295e-06, "loss": 0.93196373, "memory(GiB)": 369.4, "step": 30960, "train_speed(iter/s)": 0.201376 }, { "acc": 0.74733849, "epoch": 0.7855149670218163, "grad_norm": 2.375, "learning_rate": 7.1175570477123776e-06, "loss": 1.0073266, "memory(GiB)": 369.4, "step": 30965, "train_speed(iter/s)": 0.201382 }, { "acc": 0.74615221, "epoch": 0.7856418061897514, "grad_norm": 2.140625, "learning_rate": 7.116607062637848e-06, "loss": 0.9966568, "memory(GiB)": 369.4, "step": 30970, "train_speed(iter/s)": 0.201385 }, { "acc": 0.74624906, "epoch": 0.7857686453576864, "grad_norm": 2.40625, "learning_rate": 7.115656984465489e-06, "loss": 0.96414871, "memory(GiB)": 369.4, "step": 30975, "train_speed(iter/s)": 0.201388 }, { "acc": 0.74387307, "epoch": 0.7858954845256215, "grad_norm": 2.25, "learning_rate": 7.114706813237091e-06, "loss": 1.05908165, "memory(GiB)": 369.4, "step": 30980, "train_speed(iter/s)": 0.201392 }, { "acc": 0.75068955, "epoch": 0.7860223236935566, "grad_norm": 2.015625, "learning_rate": 7.1137565489944445e-06, "loss": 0.99675388, "memory(GiB)": 369.4, "step": 30985, "train_speed(iter/s)": 0.201399 }, { "acc": 0.75644264, "epoch": 0.7861491628614916, "grad_norm": 2.125, "learning_rate": 7.112806191779349e-06, "loss": 0.9827301, "memory(GiB)": 369.4, "step": 30990, "train_speed(iter/s)": 0.201402 }, { "acc": 0.73909888, "epoch": 0.7862760020294267, "grad_norm": 1.9609375, "learning_rate": 7.111855741633603e-06, "loss": 1.02902794, "memory(GiB)": 369.4, "step": 30995, "train_speed(iter/s)": 0.201406 }, { "acc": 0.74615588, "epoch": 0.7864028411973617, "grad_norm": 2.828125, "learning_rate": 7.1109051985990145e-06, "loss": 1.01061049, "memory(GiB)": 369.4, "step": 31000, "train_speed(iter/s)": 0.201413 }, { "epoch": 0.7864028411973617, "eval_acc": 0.7370376203639839, "eval_loss": 0.9740020036697388, "eval_runtime": 384.3014, "eval_samples_per_second": 16.576, "eval_steps_per_second": 8.288, "step": 31000 }, { "acc": 0.74516916, "epoch": 0.7865296803652968, "grad_norm": 1.96875, "learning_rate": 7.109954562717389e-06, "loss": 1.03148975, "memory(GiB)": 369.4, "step": 31005, "train_speed(iter/s)": 0.200491 }, { "acc": 0.75511594, "epoch": 0.7866565195332319, "grad_norm": 2.46875, "learning_rate": 7.109003834030543e-06, "loss": 0.99187546, "memory(GiB)": 369.4, "step": 31010, "train_speed(iter/s)": 0.200495 }, { "acc": 0.75298882, "epoch": 0.7867833587011669, "grad_norm": 2.65625, "learning_rate": 7.108053012580291e-06, "loss": 0.97840128, "memory(GiB)": 369.4, "step": 31015, "train_speed(iter/s)": 0.200499 }, { "acc": 0.7505084, "epoch": 0.786910197869102, "grad_norm": 2.015625, "learning_rate": 7.107102098408457e-06, "loss": 0.97086811, "memory(GiB)": 369.4, "step": 31020, "train_speed(iter/s)": 0.200501 }, { "acc": 0.75198145, "epoch": 0.7870370370370371, "grad_norm": 2.046875, "learning_rate": 7.106151091556865e-06, "loss": 1.01540985, "memory(GiB)": 369.4, "step": 31025, "train_speed(iter/s)": 0.200506 }, { "acc": 0.75040274, "epoch": 0.7871638762049721, "grad_norm": 2.046875, "learning_rate": 7.105199992067344e-06, "loss": 1.00786953, "memory(GiB)": 369.4, "step": 31030, "train_speed(iter/s)": 0.200511 }, { "acc": 0.74476361, "epoch": 0.7872907153729072, "grad_norm": 1.9453125, "learning_rate": 7.1042487999817275e-06, "loss": 1.01408768, "memory(GiB)": 369.4, "step": 31035, "train_speed(iter/s)": 0.200516 }, { "acc": 0.75206842, "epoch": 0.7874175545408422, "grad_norm": 2.15625, "learning_rate": 7.103297515341857e-06, "loss": 0.96162148, "memory(GiB)": 369.4, "step": 31040, "train_speed(iter/s)": 0.200522 }, { "acc": 0.74023457, "epoch": 0.7875443937087773, "grad_norm": 2.625, "learning_rate": 7.1023461381895685e-06, "loss": 1.04217415, "memory(GiB)": 369.4, "step": 31045, "train_speed(iter/s)": 0.200526 }, { "acc": 0.75100236, "epoch": 0.7876712328767124, "grad_norm": 2.03125, "learning_rate": 7.1013946685667125e-06, "loss": 1.00795116, "memory(GiB)": 369.4, "step": 31050, "train_speed(iter/s)": 0.200531 }, { "acc": 0.75636988, "epoch": 0.7877980720446474, "grad_norm": 2.25, "learning_rate": 7.100443106515135e-06, "loss": 0.97931013, "memory(GiB)": 369.4, "step": 31055, "train_speed(iter/s)": 0.200535 }, { "acc": 0.7622673, "epoch": 0.7879249112125825, "grad_norm": 2.59375, "learning_rate": 7.099491452076693e-06, "loss": 1.00465069, "memory(GiB)": 369.4, "step": 31060, "train_speed(iter/s)": 0.200542 }, { "acc": 0.74662952, "epoch": 0.7880517503805176, "grad_norm": 3.3125, "learning_rate": 7.098539705293242e-06, "loss": 0.99926968, "memory(GiB)": 369.4, "step": 31065, "train_speed(iter/s)": 0.200548 }, { "acc": 0.73968272, "epoch": 0.7881785895484525, "grad_norm": 2.59375, "learning_rate": 7.097587866206647e-06, "loss": 1.00694065, "memory(GiB)": 369.4, "step": 31070, "train_speed(iter/s)": 0.200555 }, { "acc": 0.75042024, "epoch": 0.7883054287163876, "grad_norm": 2.625, "learning_rate": 7.096635934858772e-06, "loss": 0.98018379, "memory(GiB)": 369.4, "step": 31075, "train_speed(iter/s)": 0.200559 }, { "acc": 0.7492095, "epoch": 0.7884322678843226, "grad_norm": 2.140625, "learning_rate": 7.095683911291488e-06, "loss": 0.99674644, "memory(GiB)": 369.4, "step": 31080, "train_speed(iter/s)": 0.200565 }, { "acc": 0.74663305, "epoch": 0.7885591070522577, "grad_norm": 2.34375, "learning_rate": 7.0947317955466686e-06, "loss": 1.07587109, "memory(GiB)": 369.4, "step": 31085, "train_speed(iter/s)": 0.200572 }, { "acc": 0.74832659, "epoch": 0.7886859462201928, "grad_norm": 2.109375, "learning_rate": 7.093779587666193e-06, "loss": 0.98614693, "memory(GiB)": 369.4, "step": 31090, "train_speed(iter/s)": 0.200578 }, { "acc": 0.75615368, "epoch": 0.7888127853881278, "grad_norm": 1.9921875, "learning_rate": 7.092827287691943e-06, "loss": 0.94311695, "memory(GiB)": 369.4, "step": 31095, "train_speed(iter/s)": 0.200582 }, { "acc": 0.7452857, "epoch": 0.7889396245560629, "grad_norm": 2.15625, "learning_rate": 7.091874895665806e-06, "loss": 1.00214252, "memory(GiB)": 369.4, "step": 31100, "train_speed(iter/s)": 0.200588 }, { "acc": 0.74993401, "epoch": 0.789066463723998, "grad_norm": 2.359375, "learning_rate": 7.09092241162967e-06, "loss": 1.01708965, "memory(GiB)": 369.4, "step": 31105, "train_speed(iter/s)": 0.200592 }, { "acc": 0.75617828, "epoch": 0.789193302891933, "grad_norm": 1.9921875, "learning_rate": 7.089969835625432e-06, "loss": 1.00291538, "memory(GiB)": 369.4, "step": 31110, "train_speed(iter/s)": 0.200598 }, { "acc": 0.73790483, "epoch": 0.7893201420598681, "grad_norm": 2.3125, "learning_rate": 7.089017167694988e-06, "loss": 1.01368923, "memory(GiB)": 369.4, "step": 31115, "train_speed(iter/s)": 0.200604 }, { "acc": 0.74147234, "epoch": 0.7894469812278031, "grad_norm": 2.171875, "learning_rate": 7.088064407880244e-06, "loss": 1.05093737, "memory(GiB)": 369.4, "step": 31120, "train_speed(iter/s)": 0.200609 }, { "acc": 0.752176, "epoch": 0.7895738203957382, "grad_norm": 2.140625, "learning_rate": 7.087111556223103e-06, "loss": 1.00722828, "memory(GiB)": 369.4, "step": 31125, "train_speed(iter/s)": 0.200614 }, { "acc": 0.74536104, "epoch": 0.7897006595636733, "grad_norm": 2.4375, "learning_rate": 7.08615861276548e-06, "loss": 1.03452911, "memory(GiB)": 369.4, "step": 31130, "train_speed(iter/s)": 0.200619 }, { "acc": 0.73677688, "epoch": 0.7898274987316083, "grad_norm": 1.9765625, "learning_rate": 7.085205577549285e-06, "loss": 1.01922874, "memory(GiB)": 369.4, "step": 31135, "train_speed(iter/s)": 0.200623 }, { "acc": 0.75208402, "epoch": 0.7899543378995434, "grad_norm": 1.828125, "learning_rate": 7.08425245061644e-06, "loss": 1.00101442, "memory(GiB)": 369.4, "step": 31140, "train_speed(iter/s)": 0.200625 }, { "acc": 0.74424009, "epoch": 0.7900811770674785, "grad_norm": 2.09375, "learning_rate": 7.083299232008867e-06, "loss": 1.0380703, "memory(GiB)": 369.4, "step": 31145, "train_speed(iter/s)": 0.200629 }, { "acc": 0.74800353, "epoch": 0.7902080162354135, "grad_norm": 2.109375, "learning_rate": 7.082345921768492e-06, "loss": 0.97087803, "memory(GiB)": 369.4, "step": 31150, "train_speed(iter/s)": 0.200633 }, { "acc": 0.74337769, "epoch": 0.7903348554033486, "grad_norm": 2.375, "learning_rate": 7.0813925199372455e-06, "loss": 1.02678614, "memory(GiB)": 369.4, "step": 31155, "train_speed(iter/s)": 0.200637 }, { "acc": 0.73095522, "epoch": 0.7904616945712836, "grad_norm": 1.9921875, "learning_rate": 7.080439026557065e-06, "loss": 1.03914108, "memory(GiB)": 369.4, "step": 31160, "train_speed(iter/s)": 0.200641 }, { "acc": 0.76510272, "epoch": 0.7905885337392187, "grad_norm": 1.84375, "learning_rate": 7.079485441669887e-06, "loss": 0.91753922, "memory(GiB)": 369.4, "step": 31165, "train_speed(iter/s)": 0.200645 }, { "acc": 0.72731133, "epoch": 0.7907153729071538, "grad_norm": 2.15625, "learning_rate": 7.0785317653176534e-06, "loss": 1.02126198, "memory(GiB)": 369.4, "step": 31170, "train_speed(iter/s)": 0.200651 }, { "acc": 0.7344171, "epoch": 0.7908422120750888, "grad_norm": 2.328125, "learning_rate": 7.077577997542316e-06, "loss": 1.02260361, "memory(GiB)": 369.4, "step": 31175, "train_speed(iter/s)": 0.200654 }, { "acc": 0.72815685, "epoch": 0.7909690512430239, "grad_norm": 2.15625, "learning_rate": 7.0766241383858195e-06, "loss": 1.10588436, "memory(GiB)": 369.4, "step": 31180, "train_speed(iter/s)": 0.200659 }, { "acc": 0.75107713, "epoch": 0.791095890410959, "grad_norm": 2.15625, "learning_rate": 7.075670187890123e-06, "loss": 1.00415421, "memory(GiB)": 369.4, "step": 31185, "train_speed(iter/s)": 0.200666 }, { "acc": 0.7429378, "epoch": 0.791222729578894, "grad_norm": 2.078125, "learning_rate": 7.0747161460971845e-06, "loss": 1.02061281, "memory(GiB)": 369.4, "step": 31190, "train_speed(iter/s)": 0.20067 }, { "acc": 0.74109025, "epoch": 0.791349568746829, "grad_norm": 2.484375, "learning_rate": 7.073762013048966e-06, "loss": 1.08877811, "memory(GiB)": 369.4, "step": 31195, "train_speed(iter/s)": 0.200676 }, { "acc": 0.74434872, "epoch": 0.791476407914764, "grad_norm": 2.109375, "learning_rate": 7.072807788787437e-06, "loss": 0.99870329, "memory(GiB)": 369.4, "step": 31200, "train_speed(iter/s)": 0.200682 }, { "acc": 0.75246515, "epoch": 0.7916032470826991, "grad_norm": 2.4375, "learning_rate": 7.071853473354566e-06, "loss": 1.00624485, "memory(GiB)": 369.4, "step": 31205, "train_speed(iter/s)": 0.200683 }, { "acc": 0.75482597, "epoch": 0.7917300862506342, "grad_norm": 1.90625, "learning_rate": 7.070899066792329e-06, "loss": 0.98956776, "memory(GiB)": 369.4, "step": 31210, "train_speed(iter/s)": 0.200688 }, { "acc": 0.7585969, "epoch": 0.7918569254185692, "grad_norm": 2.1875, "learning_rate": 7.069944569142706e-06, "loss": 0.93806124, "memory(GiB)": 369.4, "step": 31215, "train_speed(iter/s)": 0.200693 }, { "acc": 0.73870163, "epoch": 0.7919837645865043, "grad_norm": 2.4375, "learning_rate": 7.068989980447679e-06, "loss": 1.03822727, "memory(GiB)": 369.4, "step": 31220, "train_speed(iter/s)": 0.200698 }, { "acc": 0.75271149, "epoch": 0.7921106037544394, "grad_norm": 2.390625, "learning_rate": 7.068035300749237e-06, "loss": 0.9951643, "memory(GiB)": 369.4, "step": 31225, "train_speed(iter/s)": 0.200705 }, { "acc": 0.73804617, "epoch": 0.7922374429223744, "grad_norm": 1.90625, "learning_rate": 7.067080530089366e-06, "loss": 1.04524174, "memory(GiB)": 369.4, "step": 31230, "train_speed(iter/s)": 0.200707 }, { "acc": 0.73869686, "epoch": 0.7923642820903095, "grad_norm": 2.1875, "learning_rate": 7.066125668510067e-06, "loss": 1.03424988, "memory(GiB)": 369.4, "step": 31235, "train_speed(iter/s)": 0.200712 }, { "acc": 0.74494686, "epoch": 0.7924911212582445, "grad_norm": 2.1875, "learning_rate": 7.065170716053336e-06, "loss": 1.03115845, "memory(GiB)": 369.4, "step": 31240, "train_speed(iter/s)": 0.200718 }, { "acc": 0.73554487, "epoch": 0.7926179604261796, "grad_norm": 2.4375, "learning_rate": 7.064215672761175e-06, "loss": 1.04611111, "memory(GiB)": 369.4, "step": 31245, "train_speed(iter/s)": 0.200721 }, { "acc": 0.75817351, "epoch": 0.7927447995941147, "grad_norm": 2.09375, "learning_rate": 7.063260538675594e-06, "loss": 0.97714119, "memory(GiB)": 369.4, "step": 31250, "train_speed(iter/s)": 0.200725 }, { "acc": 0.75848641, "epoch": 0.7928716387620497, "grad_norm": 2.0625, "learning_rate": 7.062305313838601e-06, "loss": 0.97977905, "memory(GiB)": 369.4, "step": 31255, "train_speed(iter/s)": 0.200729 }, { "acc": 0.74799747, "epoch": 0.7929984779299848, "grad_norm": 2.1875, "learning_rate": 7.061349998292215e-06, "loss": 1.04641628, "memory(GiB)": 369.4, "step": 31260, "train_speed(iter/s)": 0.200735 }, { "acc": 0.74786005, "epoch": 0.7931253170979199, "grad_norm": 2.09375, "learning_rate": 7.060394592078452e-06, "loss": 1.03117304, "memory(GiB)": 369.4, "step": 31265, "train_speed(iter/s)": 0.200739 }, { "acc": 0.75116196, "epoch": 0.7932521562658549, "grad_norm": 2.90625, "learning_rate": 7.0594390952393365e-06, "loss": 1.02721853, "memory(GiB)": 369.4, "step": 31270, "train_speed(iter/s)": 0.200745 }, { "acc": 0.7460824, "epoch": 0.79337899543379, "grad_norm": 2.109375, "learning_rate": 7.058483507816894e-06, "loss": 1.00782394, "memory(GiB)": 369.4, "step": 31275, "train_speed(iter/s)": 0.200749 }, { "acc": 0.75340519, "epoch": 0.793505834601725, "grad_norm": 1.8125, "learning_rate": 7.057527829853157e-06, "loss": 0.97118549, "memory(GiB)": 369.4, "step": 31280, "train_speed(iter/s)": 0.200756 }, { "acc": 0.74254122, "epoch": 0.7936326737696601, "grad_norm": 2.65625, "learning_rate": 7.056572061390159e-06, "loss": 1.03324327, "memory(GiB)": 369.4, "step": 31285, "train_speed(iter/s)": 0.200761 }, { "acc": 0.73389297, "epoch": 0.7937595129375952, "grad_norm": 1.921875, "learning_rate": 7.055616202469939e-06, "loss": 1.04411983, "memory(GiB)": 369.4, "step": 31290, "train_speed(iter/s)": 0.200766 }, { "acc": 0.74569225, "epoch": 0.7938863521055302, "grad_norm": 2.6875, "learning_rate": 7.054660253134543e-06, "loss": 1.03012867, "memory(GiB)": 369.4, "step": 31295, "train_speed(iter/s)": 0.200771 }, { "acc": 0.74609413, "epoch": 0.7940131912734653, "grad_norm": 2.109375, "learning_rate": 7.053704213426015e-06, "loss": 1.06470022, "memory(GiB)": 369.4, "step": 31300, "train_speed(iter/s)": 0.200776 }, { "acc": 0.74119549, "epoch": 0.7941400304414004, "grad_norm": 2.1875, "learning_rate": 7.052748083386406e-06, "loss": 1.03179512, "memory(GiB)": 369.4, "step": 31305, "train_speed(iter/s)": 0.200778 }, { "acc": 0.75400763, "epoch": 0.7942668696093353, "grad_norm": 2.21875, "learning_rate": 7.051791863057772e-06, "loss": 1.00934134, "memory(GiB)": 369.4, "step": 31310, "train_speed(iter/s)": 0.200781 }, { "acc": 0.74417219, "epoch": 0.7943937087772704, "grad_norm": 2.484375, "learning_rate": 7.050835552482171e-06, "loss": 1.00624638, "memory(GiB)": 369.4, "step": 31315, "train_speed(iter/s)": 0.200783 }, { "acc": 0.75075722, "epoch": 0.7945205479452054, "grad_norm": 2.390625, "learning_rate": 7.049879151701666e-06, "loss": 0.99282284, "memory(GiB)": 369.4, "step": 31320, "train_speed(iter/s)": 0.20079 }, { "acc": 0.74732761, "epoch": 0.7946473871131405, "grad_norm": 2.21875, "learning_rate": 7.048922660758324e-06, "loss": 0.96729221, "memory(GiB)": 369.4, "step": 31325, "train_speed(iter/s)": 0.200796 }, { "acc": 0.75112076, "epoch": 0.7947742262810756, "grad_norm": 2.125, "learning_rate": 7.047966079694215e-06, "loss": 0.96384048, "memory(GiB)": 369.4, "step": 31330, "train_speed(iter/s)": 0.200801 }, { "acc": 0.74849148, "epoch": 0.7949010654490106, "grad_norm": 2.3125, "learning_rate": 7.047009408551414e-06, "loss": 0.95457954, "memory(GiB)": 369.4, "step": 31335, "train_speed(iter/s)": 0.200805 }, { "acc": 0.73825169, "epoch": 0.7950279046169457, "grad_norm": 2.125, "learning_rate": 7.046052647372002e-06, "loss": 1.02365551, "memory(GiB)": 369.4, "step": 31340, "train_speed(iter/s)": 0.20081 }, { "acc": 0.74291916, "epoch": 0.7951547437848808, "grad_norm": 2.296875, "learning_rate": 7.045095796198057e-06, "loss": 1.0363287, "memory(GiB)": 369.4, "step": 31345, "train_speed(iter/s)": 0.200812 }, { "acc": 0.73302031, "epoch": 0.7952815829528158, "grad_norm": 2.265625, "learning_rate": 7.044138855071671e-06, "loss": 1.11168976, "memory(GiB)": 369.4, "step": 31350, "train_speed(iter/s)": 0.200818 }, { "acc": 0.74855232, "epoch": 0.7954084221207509, "grad_norm": 2.3125, "learning_rate": 7.043181824034929e-06, "loss": 0.99059124, "memory(GiB)": 369.4, "step": 31355, "train_speed(iter/s)": 0.200822 }, { "acc": 0.75605249, "epoch": 0.7955352612886859, "grad_norm": 2.515625, "learning_rate": 7.042224703129929e-06, "loss": 0.98859711, "memory(GiB)": 369.4, "step": 31360, "train_speed(iter/s)": 0.200829 }, { "acc": 0.74852848, "epoch": 0.795662100456621, "grad_norm": 2.28125, "learning_rate": 7.0412674923987705e-06, "loss": 1.05287075, "memory(GiB)": 369.4, "step": 31365, "train_speed(iter/s)": 0.200835 }, { "acc": 0.76785984, "epoch": 0.7957889396245561, "grad_norm": 2.46875, "learning_rate": 7.040310191883552e-06, "loss": 0.91287384, "memory(GiB)": 369.4, "step": 31370, "train_speed(iter/s)": 0.20084 }, { "acc": 0.74792194, "epoch": 0.7959157787924911, "grad_norm": 2.1875, "learning_rate": 7.039352801626383e-06, "loss": 0.99993649, "memory(GiB)": 369.4, "step": 31375, "train_speed(iter/s)": 0.200845 }, { "acc": 0.76603842, "epoch": 0.7960426179604262, "grad_norm": 2.234375, "learning_rate": 7.0383953216693725e-06, "loss": 0.91294689, "memory(GiB)": 369.4, "step": 31380, "train_speed(iter/s)": 0.200853 }, { "acc": 0.7586648, "epoch": 0.7961694571283613, "grad_norm": 2.328125, "learning_rate": 7.037437752054635e-06, "loss": 1.02194738, "memory(GiB)": 369.4, "step": 31385, "train_speed(iter/s)": 0.200857 }, { "acc": 0.76313152, "epoch": 0.7962962962962963, "grad_norm": 2.25, "learning_rate": 7.036480092824288e-06, "loss": 0.96742334, "memory(GiB)": 369.4, "step": 31390, "train_speed(iter/s)": 0.200863 }, { "acc": 0.7452692, "epoch": 0.7964231354642314, "grad_norm": 2.28125, "learning_rate": 7.035522344020455e-06, "loss": 1.02248192, "memory(GiB)": 369.4, "step": 31395, "train_speed(iter/s)": 0.200867 }, { "acc": 0.74361019, "epoch": 0.7965499746321664, "grad_norm": 2.203125, "learning_rate": 7.034564505685262e-06, "loss": 0.9718111, "memory(GiB)": 369.4, "step": 31400, "train_speed(iter/s)": 0.200872 }, { "acc": 0.73975763, "epoch": 0.7966768138001015, "grad_norm": 2.5, "learning_rate": 7.0336065778608365e-06, "loss": 1.01579981, "memory(GiB)": 369.4, "step": 31405, "train_speed(iter/s)": 0.200877 }, { "acc": 0.75760069, "epoch": 0.7968036529680366, "grad_norm": 1.78125, "learning_rate": 7.032648560589316e-06, "loss": 0.95995522, "memory(GiB)": 369.4, "step": 31410, "train_speed(iter/s)": 0.200883 }, { "acc": 0.7495554, "epoch": 0.7969304921359716, "grad_norm": 2.328125, "learning_rate": 7.031690453912835e-06, "loss": 0.96885996, "memory(GiB)": 369.4, "step": 31415, "train_speed(iter/s)": 0.200888 }, { "acc": 0.75291882, "epoch": 0.7970573313039067, "grad_norm": 2.703125, "learning_rate": 7.030732257873539e-06, "loss": 0.98828239, "memory(GiB)": 369.4, "step": 31420, "train_speed(iter/s)": 0.200892 }, { "acc": 0.74998331, "epoch": 0.7971841704718418, "grad_norm": 1.921875, "learning_rate": 7.02977397251357e-06, "loss": 1.00607643, "memory(GiB)": 369.4, "step": 31425, "train_speed(iter/s)": 0.200898 }, { "acc": 0.75601349, "epoch": 0.7973110096397767, "grad_norm": 1.8203125, "learning_rate": 7.028815597875081e-06, "loss": 0.96559849, "memory(GiB)": 369.4, "step": 31430, "train_speed(iter/s)": 0.200901 }, { "acc": 0.73588133, "epoch": 0.7974378488077118, "grad_norm": 2.0, "learning_rate": 7.027857134000223e-06, "loss": 1.05409374, "memory(GiB)": 369.4, "step": 31435, "train_speed(iter/s)": 0.200907 }, { "acc": 0.75867567, "epoch": 0.7975646879756468, "grad_norm": 2.65625, "learning_rate": 7.026898580931154e-06, "loss": 0.99017067, "memory(GiB)": 369.4, "step": 31440, "train_speed(iter/s)": 0.200913 }, { "acc": 0.76094398, "epoch": 0.7976915271435819, "grad_norm": 2.078125, "learning_rate": 7.025939938710037e-06, "loss": 0.96274366, "memory(GiB)": 369.4, "step": 31445, "train_speed(iter/s)": 0.200918 }, { "acc": 0.72691097, "epoch": 0.797818366311517, "grad_norm": 2.34375, "learning_rate": 7.024981207379036e-06, "loss": 1.05731611, "memory(GiB)": 369.4, "step": 31450, "train_speed(iter/s)": 0.200923 }, { "acc": 0.73457084, "epoch": 0.797945205479452, "grad_norm": 2.328125, "learning_rate": 7.02402238698032e-06, "loss": 1.03522072, "memory(GiB)": 369.4, "step": 31455, "train_speed(iter/s)": 0.200927 }, { "acc": 0.73773565, "epoch": 0.7980720446473871, "grad_norm": 2.5, "learning_rate": 7.023063477556064e-06, "loss": 1.05891876, "memory(GiB)": 369.4, "step": 31460, "train_speed(iter/s)": 0.200932 }, { "acc": 0.75009723, "epoch": 0.7981988838153222, "grad_norm": 2.078125, "learning_rate": 7.0221044791484424e-06, "loss": 0.9753418, "memory(GiB)": 369.4, "step": 31465, "train_speed(iter/s)": 0.200938 }, { "acc": 0.74428787, "epoch": 0.7983257229832572, "grad_norm": 2.15625, "learning_rate": 7.021145391799639e-06, "loss": 0.97281103, "memory(GiB)": 369.4, "step": 31470, "train_speed(iter/s)": 0.200944 }, { "acc": 0.74457574, "epoch": 0.7984525621511923, "grad_norm": 2.0625, "learning_rate": 7.020186215551837e-06, "loss": 1.01143246, "memory(GiB)": 369.4, "step": 31475, "train_speed(iter/s)": 0.200949 }, { "acc": 0.75278358, "epoch": 0.7985794013191273, "grad_norm": 2.296875, "learning_rate": 7.019226950447227e-06, "loss": 1.00702162, "memory(GiB)": 369.4, "step": 31480, "train_speed(iter/s)": 0.200955 }, { "acc": 0.73384376, "epoch": 0.7987062404870624, "grad_norm": 2.421875, "learning_rate": 7.018267596527998e-06, "loss": 1.05060921, "memory(GiB)": 369.4, "step": 31485, "train_speed(iter/s)": 0.200962 }, { "acc": 0.76336751, "epoch": 0.7988330796549975, "grad_norm": 2.078125, "learning_rate": 7.017308153836352e-06, "loss": 0.97106819, "memory(GiB)": 369.4, "step": 31490, "train_speed(iter/s)": 0.200968 }, { "acc": 0.74603233, "epoch": 0.7989599188229325, "grad_norm": 1.9765625, "learning_rate": 7.016348622414484e-06, "loss": 1.02508144, "memory(GiB)": 369.4, "step": 31495, "train_speed(iter/s)": 0.200972 }, { "acc": 0.74690104, "epoch": 0.7990867579908676, "grad_norm": 2.015625, "learning_rate": 7.015389002304604e-06, "loss": 1.02869492, "memory(GiB)": 369.4, "step": 31500, "train_speed(iter/s)": 0.200975 }, { "acc": 0.76170526, "epoch": 0.7992135971588027, "grad_norm": 2.125, "learning_rate": 7.014429293548916e-06, "loss": 0.95813694, "memory(GiB)": 369.4, "step": 31505, "train_speed(iter/s)": 0.200981 }, { "acc": 0.75091162, "epoch": 0.7993404363267377, "grad_norm": 2.3125, "learning_rate": 7.013469496189633e-06, "loss": 1.0234972, "memory(GiB)": 369.4, "step": 31510, "train_speed(iter/s)": 0.200982 }, { "acc": 0.75612836, "epoch": 0.7994672754946728, "grad_norm": 2.421875, "learning_rate": 7.012509610268974e-06, "loss": 0.95634995, "memory(GiB)": 369.4, "step": 31515, "train_speed(iter/s)": 0.200987 }, { "acc": 0.74615431, "epoch": 0.7995941146626078, "grad_norm": 2.140625, "learning_rate": 7.011549635829156e-06, "loss": 1.0141284, "memory(GiB)": 369.4, "step": 31520, "train_speed(iter/s)": 0.200985 }, { "acc": 0.74249296, "epoch": 0.7997209538305429, "grad_norm": 2.265625, "learning_rate": 7.010589572912404e-06, "loss": 0.98688488, "memory(GiB)": 369.4, "step": 31525, "train_speed(iter/s)": 0.20099 }, { "acc": 0.74504204, "epoch": 0.799847792998478, "grad_norm": 2.09375, "learning_rate": 7.009629421560946e-06, "loss": 1.0299921, "memory(GiB)": 369.4, "step": 31530, "train_speed(iter/s)": 0.200995 }, { "acc": 0.74274383, "epoch": 0.799974632166413, "grad_norm": 2.25, "learning_rate": 7.008669181817015e-06, "loss": 1.03974638, "memory(GiB)": 369.4, "step": 31535, "train_speed(iter/s)": 0.200999 }, { "acc": 0.73944721, "epoch": 0.8001014713343481, "grad_norm": 1.984375, "learning_rate": 7.007708853722844e-06, "loss": 0.98969574, "memory(GiB)": 369.4, "step": 31540, "train_speed(iter/s)": 0.201001 }, { "acc": 0.73435917, "epoch": 0.8002283105022832, "grad_norm": 2.1875, "learning_rate": 7.006748437320674e-06, "loss": 1.10946407, "memory(GiB)": 369.4, "step": 31545, "train_speed(iter/s)": 0.201005 }, { "acc": 0.73857007, "epoch": 0.8003551496702181, "grad_norm": 2.015625, "learning_rate": 7.005787932652749e-06, "loss": 1.00664473, "memory(GiB)": 369.4, "step": 31550, "train_speed(iter/s)": 0.201009 }, { "acc": 0.73788233, "epoch": 0.8004819888381532, "grad_norm": 1.9453125, "learning_rate": 7.0048273397613145e-06, "loss": 1.06516666, "memory(GiB)": 369.4, "step": 31555, "train_speed(iter/s)": 0.201012 }, { "acc": 0.74430685, "epoch": 0.8006088280060882, "grad_norm": 2.1875, "learning_rate": 7.003866658688624e-06, "loss": 0.99405842, "memory(GiB)": 369.4, "step": 31560, "train_speed(iter/s)": 0.201014 }, { "acc": 0.75043664, "epoch": 0.8007356671740233, "grad_norm": 2.328125, "learning_rate": 7.0029058894769295e-06, "loss": 1.04876633, "memory(GiB)": 369.4, "step": 31565, "train_speed(iter/s)": 0.20102 }, { "acc": 0.73946571, "epoch": 0.8008625063419584, "grad_norm": 2.171875, "learning_rate": 7.001945032168493e-06, "loss": 1.00820637, "memory(GiB)": 369.4, "step": 31570, "train_speed(iter/s)": 0.201027 }, { "acc": 0.7444252, "epoch": 0.8009893455098934, "grad_norm": 2.140625, "learning_rate": 7.000984086805575e-06, "loss": 1.00514297, "memory(GiB)": 369.4, "step": 31575, "train_speed(iter/s)": 0.201032 }, { "acc": 0.73832293, "epoch": 0.8011161846778285, "grad_norm": 2.25, "learning_rate": 7.000023053430444e-06, "loss": 1.03680363, "memory(GiB)": 369.4, "step": 31580, "train_speed(iter/s)": 0.201034 }, { "acc": 0.7437747, "epoch": 0.8012430238457636, "grad_norm": 2.46875, "learning_rate": 6.999061932085369e-06, "loss": 1.05236454, "memory(GiB)": 369.4, "step": 31585, "train_speed(iter/s)": 0.201035 }, { "acc": 0.74904275, "epoch": 0.8013698630136986, "grad_norm": 2.234375, "learning_rate": 6.9981007228126255e-06, "loss": 0.95623779, "memory(GiB)": 369.4, "step": 31590, "train_speed(iter/s)": 0.201037 }, { "acc": 0.73724546, "epoch": 0.8014967021816337, "grad_norm": 2.03125, "learning_rate": 6.997139425654491e-06, "loss": 1.04344101, "memory(GiB)": 369.4, "step": 31595, "train_speed(iter/s)": 0.201042 }, { "acc": 0.76339583, "epoch": 0.8016235413495687, "grad_norm": 2.40625, "learning_rate": 6.996178040653248e-06, "loss": 0.92569342, "memory(GiB)": 369.4, "step": 31600, "train_speed(iter/s)": 0.201046 }, { "acc": 0.75403891, "epoch": 0.8017503805175038, "grad_norm": 2.375, "learning_rate": 6.995216567851183e-06, "loss": 0.97726212, "memory(GiB)": 369.4, "step": 31605, "train_speed(iter/s)": 0.20104 }, { "acc": 0.73389463, "epoch": 0.8018772196854389, "grad_norm": 2.234375, "learning_rate": 6.994255007290585e-06, "loss": 1.00489235, "memory(GiB)": 369.4, "step": 31610, "train_speed(iter/s)": 0.201045 }, { "acc": 0.75034833, "epoch": 0.8020040588533739, "grad_norm": 1.7734375, "learning_rate": 6.993293359013747e-06, "loss": 0.99043999, "memory(GiB)": 369.4, "step": 31615, "train_speed(iter/s)": 0.201048 }, { "acc": 0.72891016, "epoch": 0.802130898021309, "grad_norm": 2.078125, "learning_rate": 6.992331623062969e-06, "loss": 1.03479595, "memory(GiB)": 369.4, "step": 31620, "train_speed(iter/s)": 0.201052 }, { "acc": 0.74879513, "epoch": 0.8022577371892441, "grad_norm": 2.015625, "learning_rate": 6.9913697994805505e-06, "loss": 1.01587276, "memory(GiB)": 369.4, "step": 31625, "train_speed(iter/s)": 0.201056 }, { "acc": 0.74591856, "epoch": 0.8023845763571791, "grad_norm": 2.078125, "learning_rate": 6.990407888308799e-06, "loss": 0.98418026, "memory(GiB)": 369.4, "step": 31630, "train_speed(iter/s)": 0.20106 }, { "acc": 0.74570098, "epoch": 0.8025114155251142, "grad_norm": 1.8046875, "learning_rate": 6.98944588959002e-06, "loss": 1.00656815, "memory(GiB)": 369.4, "step": 31635, "train_speed(iter/s)": 0.201066 }, { "acc": 0.74760075, "epoch": 0.8026382546930492, "grad_norm": 2.484375, "learning_rate": 6.9884838033665305e-06, "loss": 0.97571125, "memory(GiB)": 369.4, "step": 31640, "train_speed(iter/s)": 0.201068 }, { "acc": 0.73731537, "epoch": 0.8027650938609843, "grad_norm": 1.9921875, "learning_rate": 6.987521629680643e-06, "loss": 1.04650459, "memory(GiB)": 369.4, "step": 31645, "train_speed(iter/s)": 0.201074 }, { "acc": 0.73870001, "epoch": 0.8028919330289194, "grad_norm": 2.171875, "learning_rate": 6.9865593685746815e-06, "loss": 1.0372961, "memory(GiB)": 369.4, "step": 31650, "train_speed(iter/s)": 0.201076 }, { "acc": 0.74477663, "epoch": 0.8030187721968544, "grad_norm": 2.25, "learning_rate": 6.98559702009097e-06, "loss": 1.07833042, "memory(GiB)": 369.4, "step": 31655, "train_speed(iter/s)": 0.201082 }, { "acc": 0.75165792, "epoch": 0.8031456113647895, "grad_norm": 3.0, "learning_rate": 6.984634584271836e-06, "loss": 1.02220945, "memory(GiB)": 369.4, "step": 31660, "train_speed(iter/s)": 0.201088 }, { "acc": 0.7504014, "epoch": 0.8032724505327246, "grad_norm": 2.0625, "learning_rate": 6.983672061159612e-06, "loss": 0.9841301, "memory(GiB)": 369.4, "step": 31665, "train_speed(iter/s)": 0.201093 }, { "acc": 0.75432634, "epoch": 0.8033992897006595, "grad_norm": 2.40625, "learning_rate": 6.982709450796636e-06, "loss": 0.97981739, "memory(GiB)": 369.4, "step": 31670, "train_speed(iter/s)": 0.201098 }, { "acc": 0.7357007, "epoch": 0.8035261288685946, "grad_norm": 2.15625, "learning_rate": 6.981746753225245e-06, "loss": 1.06921291, "memory(GiB)": 369.4, "step": 31675, "train_speed(iter/s)": 0.201103 }, { "acc": 0.75535574, "epoch": 0.8036529680365296, "grad_norm": 2.34375, "learning_rate": 6.980783968487783e-06, "loss": 0.96620798, "memory(GiB)": 369.4, "step": 31680, "train_speed(iter/s)": 0.201107 }, { "acc": 0.72732759, "epoch": 0.8037798072044647, "grad_norm": 2.46875, "learning_rate": 6.9798210966266e-06, "loss": 1.08570213, "memory(GiB)": 369.4, "step": 31685, "train_speed(iter/s)": 0.20111 }, { "acc": 0.74201088, "epoch": 0.8039066463723998, "grad_norm": 1.9453125, "learning_rate": 6.9788581376840455e-06, "loss": 0.97646446, "memory(GiB)": 369.4, "step": 31690, "train_speed(iter/s)": 0.201113 }, { "acc": 0.75141869, "epoch": 0.8040334855403348, "grad_norm": 2.375, "learning_rate": 6.977895091702474e-06, "loss": 1.03240719, "memory(GiB)": 369.4, "step": 31695, "train_speed(iter/s)": 0.20112 }, { "acc": 0.75926681, "epoch": 0.8041603247082699, "grad_norm": 2.28125, "learning_rate": 6.976931958724248e-06, "loss": 0.95756226, "memory(GiB)": 369.4, "step": 31700, "train_speed(iter/s)": 0.201125 }, { "acc": 0.74237475, "epoch": 0.804287163876205, "grad_norm": 2.03125, "learning_rate": 6.975968738791726e-06, "loss": 1.04758492, "memory(GiB)": 369.4, "step": 31705, "train_speed(iter/s)": 0.201132 }, { "acc": 0.73528461, "epoch": 0.80441400304414, "grad_norm": 2.234375, "learning_rate": 6.9750054319472785e-06, "loss": 1.07419462, "memory(GiB)": 369.4, "step": 31710, "train_speed(iter/s)": 0.201137 }, { "acc": 0.75116339, "epoch": 0.8045408422120751, "grad_norm": 2.125, "learning_rate": 6.974042038233272e-06, "loss": 1.00392208, "memory(GiB)": 369.4, "step": 31715, "train_speed(iter/s)": 0.201143 }, { "acc": 0.72528467, "epoch": 0.8046676813800101, "grad_norm": 2.171875, "learning_rate": 6.9730785576920855e-06, "loss": 1.13717928, "memory(GiB)": 369.4, "step": 31720, "train_speed(iter/s)": 0.201148 }, { "acc": 0.75981169, "epoch": 0.8047945205479452, "grad_norm": 2.078125, "learning_rate": 6.972114990366094e-06, "loss": 0.9432209, "memory(GiB)": 369.4, "step": 31725, "train_speed(iter/s)": 0.201152 }, { "acc": 0.73175125, "epoch": 0.8049213597158803, "grad_norm": 2.578125, "learning_rate": 6.97115133629768e-06, "loss": 0.97768078, "memory(GiB)": 369.4, "step": 31730, "train_speed(iter/s)": 0.201158 }, { "acc": 0.75619731, "epoch": 0.8050481988838153, "grad_norm": 2.015625, "learning_rate": 6.970187595529229e-06, "loss": 0.96417236, "memory(GiB)": 369.4, "step": 31735, "train_speed(iter/s)": 0.201162 }, { "acc": 0.74342961, "epoch": 0.8051750380517504, "grad_norm": 2.09375, "learning_rate": 6.969223768103133e-06, "loss": 1.04427042, "memory(GiB)": 369.4, "step": 31740, "train_speed(iter/s)": 0.201167 }, { "acc": 0.74866161, "epoch": 0.8053018772196855, "grad_norm": 2.140625, "learning_rate": 6.968259854061783e-06, "loss": 0.99917402, "memory(GiB)": 369.4, "step": 31745, "train_speed(iter/s)": 0.201171 }, { "acc": 0.74901381, "epoch": 0.8054287163876205, "grad_norm": 1.9453125, "learning_rate": 6.967295853447578e-06, "loss": 0.99698715, "memory(GiB)": 369.4, "step": 31750, "train_speed(iter/s)": 0.201175 }, { "acc": 0.75197849, "epoch": 0.8055555555555556, "grad_norm": 2.375, "learning_rate": 6.966331766302916e-06, "loss": 1.01940994, "memory(GiB)": 369.4, "step": 31755, "train_speed(iter/s)": 0.201182 }, { "acc": 0.74640379, "epoch": 0.8056823947234906, "grad_norm": 2.03125, "learning_rate": 6.965367592670206e-06, "loss": 0.99229488, "memory(GiB)": 369.4, "step": 31760, "train_speed(iter/s)": 0.201187 }, { "acc": 0.74987397, "epoch": 0.8058092338914257, "grad_norm": 2.25, "learning_rate": 6.964403332591854e-06, "loss": 1.06034327, "memory(GiB)": 369.4, "step": 31765, "train_speed(iter/s)": 0.201193 }, { "acc": 0.75232391, "epoch": 0.8059360730593608, "grad_norm": 2.375, "learning_rate": 6.963438986110272e-06, "loss": 0.9748147, "memory(GiB)": 369.4, "step": 31770, "train_speed(iter/s)": 0.201198 }, { "acc": 0.76310205, "epoch": 0.8060629122272958, "grad_norm": 2.546875, "learning_rate": 6.962474553267877e-06, "loss": 0.98200932, "memory(GiB)": 369.4, "step": 31775, "train_speed(iter/s)": 0.201201 }, { "acc": 0.73371735, "epoch": 0.8061897513952309, "grad_norm": 2.1875, "learning_rate": 6.96151003410709e-06, "loss": 1.06584244, "memory(GiB)": 369.4, "step": 31780, "train_speed(iter/s)": 0.201207 }, { "acc": 0.74100637, "epoch": 0.806316590563166, "grad_norm": 2.5625, "learning_rate": 6.960545428670333e-06, "loss": 1.06658649, "memory(GiB)": 369.4, "step": 31785, "train_speed(iter/s)": 0.201212 }, { "acc": 0.7602809, "epoch": 0.806443429731101, "grad_norm": 2.28125, "learning_rate": 6.959580737000038e-06, "loss": 0.98325891, "memory(GiB)": 369.4, "step": 31790, "train_speed(iter/s)": 0.201217 }, { "acc": 0.75450735, "epoch": 0.806570268899036, "grad_norm": 2.03125, "learning_rate": 6.95861595913863e-06, "loss": 0.9737319, "memory(GiB)": 369.4, "step": 31795, "train_speed(iter/s)": 0.201222 }, { "acc": 0.77084985, "epoch": 0.806697108066971, "grad_norm": 1.8203125, "learning_rate": 6.95765109512855e-06, "loss": 0.92097225, "memory(GiB)": 369.4, "step": 31800, "train_speed(iter/s)": 0.201225 }, { "acc": 0.75050344, "epoch": 0.8068239472349061, "grad_norm": 2.34375, "learning_rate": 6.956686145012233e-06, "loss": 0.98262844, "memory(GiB)": 369.4, "step": 31805, "train_speed(iter/s)": 0.201231 }, { "acc": 0.74200449, "epoch": 0.8069507864028412, "grad_norm": 2.03125, "learning_rate": 6.955721108832124e-06, "loss": 0.99012108, "memory(GiB)": 369.4, "step": 31810, "train_speed(iter/s)": 0.201236 }, { "acc": 0.76397462, "epoch": 0.8070776255707762, "grad_norm": 2.203125, "learning_rate": 6.9547559866306695e-06, "loss": 0.94834862, "memory(GiB)": 369.4, "step": 31815, "train_speed(iter/s)": 0.201236 }, { "acc": 0.74874315, "epoch": 0.8072044647387113, "grad_norm": 2.34375, "learning_rate": 6.953790778450318e-06, "loss": 1.03623009, "memory(GiB)": 369.4, "step": 31820, "train_speed(iter/s)": 0.20124 }, { "acc": 0.74485779, "epoch": 0.8073313039066464, "grad_norm": 2.4375, "learning_rate": 6.9528254843335254e-06, "loss": 1.01247625, "memory(GiB)": 369.4, "step": 31825, "train_speed(iter/s)": 0.201243 }, { "acc": 0.75466614, "epoch": 0.8074581430745814, "grad_norm": 2.109375, "learning_rate": 6.95186010432275e-06, "loss": 0.93006496, "memory(GiB)": 369.4, "step": 31830, "train_speed(iter/s)": 0.201248 }, { "acc": 0.74464779, "epoch": 0.8075849822425165, "grad_norm": 2.078125, "learning_rate": 6.950894638460452e-06, "loss": 1.04741955, "memory(GiB)": 369.4, "step": 31835, "train_speed(iter/s)": 0.201253 }, { "acc": 0.74502869, "epoch": 0.8077118214104515, "grad_norm": 2.0, "learning_rate": 6.949929086789098e-06, "loss": 1.01311216, "memory(GiB)": 369.4, "step": 31840, "train_speed(iter/s)": 0.201258 }, { "acc": 0.73980999, "epoch": 0.8078386605783866, "grad_norm": 1.859375, "learning_rate": 6.948963449351156e-06, "loss": 1.03525696, "memory(GiB)": 369.4, "step": 31845, "train_speed(iter/s)": 0.201262 }, { "acc": 0.74304056, "epoch": 0.8079654997463217, "grad_norm": 2.46875, "learning_rate": 6.947997726189102e-06, "loss": 1.04526348, "memory(GiB)": 369.4, "step": 31850, "train_speed(iter/s)": 0.201267 }, { "acc": 0.74141216, "epoch": 0.8080923389142567, "grad_norm": 2.609375, "learning_rate": 6.947031917345409e-06, "loss": 1.03197556, "memory(GiB)": 369.4, "step": 31855, "train_speed(iter/s)": 0.201271 }, { "acc": 0.73808789, "epoch": 0.8082191780821918, "grad_norm": 2.234375, "learning_rate": 6.946066022862561e-06, "loss": 1.03751612, "memory(GiB)": 369.4, "step": 31860, "train_speed(iter/s)": 0.201277 }, { "acc": 0.74018297, "epoch": 0.8083460172501269, "grad_norm": 2.25, "learning_rate": 6.945100042783039e-06, "loss": 1.05262108, "memory(GiB)": 369.4, "step": 31865, "train_speed(iter/s)": 0.201277 }, { "acc": 0.74717865, "epoch": 0.8084728564180619, "grad_norm": 2.296875, "learning_rate": 6.9441339771493345e-06, "loss": 0.97141285, "memory(GiB)": 369.4, "step": 31870, "train_speed(iter/s)": 0.201283 }, { "acc": 0.76333823, "epoch": 0.808599695585997, "grad_norm": 2.09375, "learning_rate": 6.943167826003937e-06, "loss": 0.91472063, "memory(GiB)": 369.4, "step": 31875, "train_speed(iter/s)": 0.201286 }, { "acc": 0.75866022, "epoch": 0.808726534753932, "grad_norm": 1.8828125, "learning_rate": 6.942201589389344e-06, "loss": 0.99871416, "memory(GiB)": 369.4, "step": 31880, "train_speed(iter/s)": 0.201289 }, { "acc": 0.75760927, "epoch": 0.8088533739218671, "grad_norm": 1.8984375, "learning_rate": 6.9412352673480525e-06, "loss": 1.00091267, "memory(GiB)": 369.4, "step": 31885, "train_speed(iter/s)": 0.201291 }, { "acc": 0.73654313, "epoch": 0.8089802130898022, "grad_norm": 1.8515625, "learning_rate": 6.940268859922566e-06, "loss": 1.01722813, "memory(GiB)": 369.4, "step": 31890, "train_speed(iter/s)": 0.201293 }, { "acc": 0.74878764, "epoch": 0.8091070522577372, "grad_norm": 2.25, "learning_rate": 6.939302367155394e-06, "loss": 0.99641304, "memory(GiB)": 369.4, "step": 31895, "train_speed(iter/s)": 0.201297 }, { "acc": 0.73220205, "epoch": 0.8092338914256723, "grad_norm": 2.59375, "learning_rate": 6.9383357890890454e-06, "loss": 1.12502384, "memory(GiB)": 369.4, "step": 31900, "train_speed(iter/s)": 0.201304 }, { "acc": 0.73946691, "epoch": 0.8093607305936074, "grad_norm": 2.34375, "learning_rate": 6.937369125766033e-06, "loss": 1.01422119, "memory(GiB)": 369.4, "step": 31905, "train_speed(iter/s)": 0.201309 }, { "acc": 0.74496813, "epoch": 0.8094875697615423, "grad_norm": 2.28125, "learning_rate": 6.936402377228879e-06, "loss": 1.04474049, "memory(GiB)": 369.4, "step": 31910, "train_speed(iter/s)": 0.201313 }, { "acc": 0.75282454, "epoch": 0.8096144089294774, "grad_norm": 1.8203125, "learning_rate": 6.9354355435201015e-06, "loss": 0.94440441, "memory(GiB)": 369.4, "step": 31915, "train_speed(iter/s)": 0.201318 }, { "acc": 0.74840555, "epoch": 0.8097412480974124, "grad_norm": 2.296875, "learning_rate": 6.934468624682229e-06, "loss": 1.00622816, "memory(GiB)": 369.4, "step": 31920, "train_speed(iter/s)": 0.201319 }, { "acc": 0.75003576, "epoch": 0.8098680872653475, "grad_norm": 2.21875, "learning_rate": 6.933501620757789e-06, "loss": 0.97372208, "memory(GiB)": 369.4, "step": 31925, "train_speed(iter/s)": 0.201323 }, { "acc": 0.75644889, "epoch": 0.8099949264332826, "grad_norm": 2.03125, "learning_rate": 6.932534531789317e-06, "loss": 0.95241261, "memory(GiB)": 369.4, "step": 31930, "train_speed(iter/s)": 0.201323 }, { "acc": 0.74401951, "epoch": 0.8101217656012176, "grad_norm": 2.0, "learning_rate": 6.931567357819344e-06, "loss": 1.00307503, "memory(GiB)": 369.4, "step": 31935, "train_speed(iter/s)": 0.20133 }, { "acc": 0.73686123, "epoch": 0.8102486047691527, "grad_norm": 2.015625, "learning_rate": 6.930600098890419e-06, "loss": 0.99436398, "memory(GiB)": 369.4, "step": 31940, "train_speed(iter/s)": 0.201336 }, { "acc": 0.74271765, "epoch": 0.8103754439370878, "grad_norm": 2.03125, "learning_rate": 6.929632755045079e-06, "loss": 0.99895344, "memory(GiB)": 369.4, "step": 31945, "train_speed(iter/s)": 0.201339 }, { "acc": 0.7544733, "epoch": 0.8105022831050228, "grad_norm": 3.078125, "learning_rate": 6.9286653263258765e-06, "loss": 1.00122223, "memory(GiB)": 369.4, "step": 31950, "train_speed(iter/s)": 0.201344 }, { "acc": 0.73861427, "epoch": 0.8106291222729579, "grad_norm": 2.3125, "learning_rate": 6.927697812775363e-06, "loss": 1.04855595, "memory(GiB)": 369.4, "step": 31955, "train_speed(iter/s)": 0.201349 }, { "acc": 0.73913355, "epoch": 0.8107559614408929, "grad_norm": 2.578125, "learning_rate": 6.926730214436091e-06, "loss": 1.04816589, "memory(GiB)": 369.4, "step": 31960, "train_speed(iter/s)": 0.201354 }, { "acc": 0.75001736, "epoch": 0.810882800608828, "grad_norm": 2.46875, "learning_rate": 6.925762531350624e-06, "loss": 0.99628954, "memory(GiB)": 369.4, "step": 31965, "train_speed(iter/s)": 0.20136 }, { "acc": 0.7304822, "epoch": 0.8110096397767631, "grad_norm": 2.546875, "learning_rate": 6.924794763561522e-06, "loss": 1.04056778, "memory(GiB)": 369.4, "step": 31970, "train_speed(iter/s)": 0.201364 }, { "acc": 0.74295111, "epoch": 0.8111364789446981, "grad_norm": 2.3125, "learning_rate": 6.923826911111353e-06, "loss": 1.02210445, "memory(GiB)": 369.4, "step": 31975, "train_speed(iter/s)": 0.201369 }, { "acc": 0.75266895, "epoch": 0.8112633181126332, "grad_norm": 1.75, "learning_rate": 6.922858974042688e-06, "loss": 0.98143902, "memory(GiB)": 369.4, "step": 31980, "train_speed(iter/s)": 0.201374 }, { "acc": 0.76449862, "epoch": 0.8113901572805683, "grad_norm": 2.421875, "learning_rate": 6.921890952398098e-06, "loss": 0.95954247, "memory(GiB)": 369.4, "step": 31985, "train_speed(iter/s)": 0.201378 }, { "acc": 0.74724865, "epoch": 0.8115169964485033, "grad_norm": 2.3125, "learning_rate": 6.920922846220166e-06, "loss": 1.05017586, "memory(GiB)": 369.4, "step": 31990, "train_speed(iter/s)": 0.201384 }, { "acc": 0.72555065, "epoch": 0.8116438356164384, "grad_norm": 2.296875, "learning_rate": 6.919954655551469e-06, "loss": 1.0184597, "memory(GiB)": 369.4, "step": 31995, "train_speed(iter/s)": 0.201389 }, { "acc": 0.75792956, "epoch": 0.8117706747843734, "grad_norm": 2.109375, "learning_rate": 6.918986380434594e-06, "loss": 0.9684967, "memory(GiB)": 369.4, "step": 32000, "train_speed(iter/s)": 0.20139 }, { "epoch": 0.8117706747843734, "eval_acc": 0.7372051450550764, "eval_loss": 0.9732819199562073, "eval_runtime": 384.8507, "eval_samples_per_second": 16.552, "eval_steps_per_second": 8.276, "step": 32000 }, { "acc": 0.76372604, "epoch": 0.8118975139523085, "grad_norm": 2.59375, "learning_rate": 6.918018020912132e-06, "loss": 1.01710939, "memory(GiB)": 369.4, "step": 32005, "train_speed(iter/s)": 0.200494 }, { "acc": 0.75678673, "epoch": 0.8120243531202436, "grad_norm": 2.1875, "learning_rate": 6.917049577026673e-06, "loss": 0.98080063, "memory(GiB)": 369.4, "step": 32010, "train_speed(iter/s)": 0.200495 }, { "acc": 0.75148573, "epoch": 0.8121511922881786, "grad_norm": 1.8125, "learning_rate": 6.916081048820815e-06, "loss": 0.96673403, "memory(GiB)": 369.4, "step": 32015, "train_speed(iter/s)": 0.200501 }, { "acc": 0.74345102, "epoch": 0.8122780314561137, "grad_norm": 1.96875, "learning_rate": 6.915112436337157e-06, "loss": 1.02327538, "memory(GiB)": 369.4, "step": 32020, "train_speed(iter/s)": 0.200505 }, { "acc": 0.74004364, "epoch": 0.8124048706240488, "grad_norm": 2.0625, "learning_rate": 6.914143739618305e-06, "loss": 1.0422719, "memory(GiB)": 369.4, "step": 32025, "train_speed(iter/s)": 0.200511 }, { "acc": 0.77109551, "epoch": 0.8125317097919837, "grad_norm": 2.296875, "learning_rate": 6.913174958706865e-06, "loss": 0.93338108, "memory(GiB)": 369.4, "step": 32030, "train_speed(iter/s)": 0.200514 }, { "acc": 0.74627404, "epoch": 0.8126585489599188, "grad_norm": 2.015625, "learning_rate": 6.912206093645448e-06, "loss": 1.05004349, "memory(GiB)": 369.4, "step": 32035, "train_speed(iter/s)": 0.20052 }, { "acc": 0.76048646, "epoch": 0.8127853881278538, "grad_norm": 3.21875, "learning_rate": 6.91123714447667e-06, "loss": 1.00752659, "memory(GiB)": 369.4, "step": 32040, "train_speed(iter/s)": 0.200525 }, { "acc": 0.74407902, "epoch": 0.8129122272957889, "grad_norm": 1.921875, "learning_rate": 6.910268111243149e-06, "loss": 0.97103119, "memory(GiB)": 369.4, "step": 32045, "train_speed(iter/s)": 0.200529 }, { "acc": 0.72767763, "epoch": 0.813039066463724, "grad_norm": 2.34375, "learning_rate": 6.909298993987508e-06, "loss": 1.1417943, "memory(GiB)": 369.4, "step": 32050, "train_speed(iter/s)": 0.200532 }, { "acc": 0.74302201, "epoch": 0.813165905631659, "grad_norm": 1.8046875, "learning_rate": 6.908329792752373e-06, "loss": 0.99683552, "memory(GiB)": 369.4, "step": 32055, "train_speed(iter/s)": 0.200537 }, { "acc": 0.75648022, "epoch": 0.8132927447995941, "grad_norm": 2.015625, "learning_rate": 6.907360507580374e-06, "loss": 1.00556555, "memory(GiB)": 369.4, "step": 32060, "train_speed(iter/s)": 0.200541 }, { "acc": 0.73587208, "epoch": 0.8134195839675292, "grad_norm": 2.1875, "learning_rate": 6.9063911385141425e-06, "loss": 1.02858086, "memory(GiB)": 369.4, "step": 32065, "train_speed(iter/s)": 0.200545 }, { "acc": 0.74353681, "epoch": 0.8135464231354642, "grad_norm": 2.0625, "learning_rate": 6.9054216855963194e-06, "loss": 1.01489029, "memory(GiB)": 369.4, "step": 32070, "train_speed(iter/s)": 0.200551 }, { "acc": 0.73681793, "epoch": 0.8136732623033993, "grad_norm": 2.375, "learning_rate": 6.904452148869541e-06, "loss": 1.02442131, "memory(GiB)": 369.4, "step": 32075, "train_speed(iter/s)": 0.200553 }, { "acc": 0.7499228, "epoch": 0.8138001014713343, "grad_norm": 1.8515625, "learning_rate": 6.903482528376457e-06, "loss": 0.95888472, "memory(GiB)": 369.4, "step": 32080, "train_speed(iter/s)": 0.200558 }, { "acc": 0.73771048, "epoch": 0.8139269406392694, "grad_norm": 2.3125, "learning_rate": 6.902512824159711e-06, "loss": 1.07677011, "memory(GiB)": 369.4, "step": 32085, "train_speed(iter/s)": 0.200565 }, { "acc": 0.74905291, "epoch": 0.8140537798072045, "grad_norm": 2.21875, "learning_rate": 6.901543036261957e-06, "loss": 1.00678511, "memory(GiB)": 369.4, "step": 32090, "train_speed(iter/s)": 0.200571 }, { "acc": 0.73932867, "epoch": 0.8141806189751395, "grad_norm": 2.984375, "learning_rate": 6.900573164725852e-06, "loss": 1.0989809, "memory(GiB)": 369.4, "step": 32095, "train_speed(iter/s)": 0.200576 }, { "acc": 0.75429239, "epoch": 0.8143074581430746, "grad_norm": 2.265625, "learning_rate": 6.899603209594052e-06, "loss": 0.98897419, "memory(GiB)": 369.4, "step": 32100, "train_speed(iter/s)": 0.200578 }, { "acc": 0.74836493, "epoch": 0.8144342973110097, "grad_norm": 1.59375, "learning_rate": 6.898633170909224e-06, "loss": 1.01664648, "memory(GiB)": 369.4, "step": 32105, "train_speed(iter/s)": 0.20058 }, { "acc": 0.74147501, "epoch": 0.8145611364789447, "grad_norm": 2.40625, "learning_rate": 6.897663048714031e-06, "loss": 1.03342781, "memory(GiB)": 369.4, "step": 32110, "train_speed(iter/s)": 0.200583 }, { "acc": 0.74674482, "epoch": 0.8146879756468798, "grad_norm": 2.0625, "learning_rate": 6.896692843051145e-06, "loss": 1.03112469, "memory(GiB)": 369.4, "step": 32115, "train_speed(iter/s)": 0.200588 }, { "acc": 0.7470191, "epoch": 0.8148148148148148, "grad_norm": 2.3125, "learning_rate": 6.895722553963239e-06, "loss": 1.0064024, "memory(GiB)": 369.4, "step": 32120, "train_speed(iter/s)": 0.200594 }, { "acc": 0.74187851, "epoch": 0.8149416539827499, "grad_norm": 2.296875, "learning_rate": 6.8947521814929915e-06, "loss": 1.03594456, "memory(GiB)": 369.4, "step": 32125, "train_speed(iter/s)": 0.2006 }, { "acc": 0.72779756, "epoch": 0.815068493150685, "grad_norm": 2.8125, "learning_rate": 6.8937817256830834e-06, "loss": 1.12764359, "memory(GiB)": 369.4, "step": 32130, "train_speed(iter/s)": 0.200605 }, { "acc": 0.74559731, "epoch": 0.81519533231862, "grad_norm": 2.515625, "learning_rate": 6.892811186576199e-06, "loss": 1.0253952, "memory(GiB)": 369.4, "step": 32135, "train_speed(iter/s)": 0.20061 }, { "acc": 0.75048704, "epoch": 0.8153221714865551, "grad_norm": 2.203125, "learning_rate": 6.8918405642150295e-06, "loss": 1.02952681, "memory(GiB)": 369.4, "step": 32140, "train_speed(iter/s)": 0.200615 }, { "acc": 0.74910188, "epoch": 0.8154490106544902, "grad_norm": 1.921875, "learning_rate": 6.890869858642264e-06, "loss": 1.00125313, "memory(GiB)": 369.4, "step": 32145, "train_speed(iter/s)": 0.200619 }, { "acc": 0.74065447, "epoch": 0.8155758498224251, "grad_norm": 2.125, "learning_rate": 6.889899069900603e-06, "loss": 1.02547016, "memory(GiB)": 369.4, "step": 32150, "train_speed(iter/s)": 0.200622 }, { "acc": 0.74363308, "epoch": 0.8157026889903602, "grad_norm": 2.078125, "learning_rate": 6.888928198032741e-06, "loss": 1.00560837, "memory(GiB)": 369.4, "step": 32155, "train_speed(iter/s)": 0.200627 }, { "acc": 0.75614882, "epoch": 0.8158295281582952, "grad_norm": 2.703125, "learning_rate": 6.887957243081384e-06, "loss": 0.98334236, "memory(GiB)": 369.4, "step": 32160, "train_speed(iter/s)": 0.200633 }, { "acc": 0.74775591, "epoch": 0.8159563673262303, "grad_norm": 2.15625, "learning_rate": 6.886986205089237e-06, "loss": 1.01427822, "memory(GiB)": 369.4, "step": 32165, "train_speed(iter/s)": 0.200638 }, { "acc": 0.7603919, "epoch": 0.8160832064941654, "grad_norm": 2.171875, "learning_rate": 6.886015084099011e-06, "loss": 1.00432577, "memory(GiB)": 369.42, "step": 32170, "train_speed(iter/s)": 0.20064 }, { "acc": 0.74391193, "epoch": 0.8162100456621004, "grad_norm": 2.125, "learning_rate": 6.885043880153424e-06, "loss": 0.99788332, "memory(GiB)": 369.42, "step": 32175, "train_speed(iter/s)": 0.200645 }, { "acc": 0.75917711, "epoch": 0.8163368848300355, "grad_norm": 2.40625, "learning_rate": 6.88407259329519e-06, "loss": 0.94866323, "memory(GiB)": 369.42, "step": 32180, "train_speed(iter/s)": 0.200651 }, { "acc": 0.75307813, "epoch": 0.8164637239979706, "grad_norm": 2.171875, "learning_rate": 6.883101223567031e-06, "loss": 1.00174675, "memory(GiB)": 369.42, "step": 32185, "train_speed(iter/s)": 0.200656 }, { "acc": 0.74721355, "epoch": 0.8165905631659056, "grad_norm": 1.921875, "learning_rate": 6.882129771011674e-06, "loss": 0.97648621, "memory(GiB)": 369.42, "step": 32190, "train_speed(iter/s)": 0.200658 }, { "acc": 0.75512848, "epoch": 0.8167174023338407, "grad_norm": 2.140625, "learning_rate": 6.881158235671845e-06, "loss": 0.91741199, "memory(GiB)": 369.42, "step": 32195, "train_speed(iter/s)": 0.20066 }, { "acc": 0.75475435, "epoch": 0.8168442415017757, "grad_norm": 2.0, "learning_rate": 6.8801866175902785e-06, "loss": 0.97278948, "memory(GiB)": 369.42, "step": 32200, "train_speed(iter/s)": 0.200666 }, { "acc": 0.74300032, "epoch": 0.8169710806697108, "grad_norm": 2.15625, "learning_rate": 6.87921491680971e-06, "loss": 1.06768379, "memory(GiB)": 369.42, "step": 32205, "train_speed(iter/s)": 0.200668 }, { "acc": 0.74747305, "epoch": 0.8170979198376459, "grad_norm": 2.25, "learning_rate": 6.878243133372882e-06, "loss": 0.98697758, "memory(GiB)": 369.42, "step": 32210, "train_speed(iter/s)": 0.200672 }, { "acc": 0.75449839, "epoch": 0.8172247590055809, "grad_norm": 2.28125, "learning_rate": 6.877271267322532e-06, "loss": 0.9879715, "memory(GiB)": 369.42, "step": 32215, "train_speed(iter/s)": 0.200678 }, { "acc": 0.73281832, "epoch": 0.817351598173516, "grad_norm": 1.84375, "learning_rate": 6.876299318701412e-06, "loss": 1.06049919, "memory(GiB)": 369.42, "step": 32220, "train_speed(iter/s)": 0.20068 }, { "acc": 0.74077592, "epoch": 0.8174784373414511, "grad_norm": 2.1875, "learning_rate": 6.875327287552269e-06, "loss": 1.01098232, "memory(GiB)": 369.42, "step": 32225, "train_speed(iter/s)": 0.200686 }, { "acc": 0.74111929, "epoch": 0.8176052765093861, "grad_norm": 2.25, "learning_rate": 6.8743551739178615e-06, "loss": 1.02771521, "memory(GiB)": 369.42, "step": 32230, "train_speed(iter/s)": 0.200693 }, { "acc": 0.74644728, "epoch": 0.8177321156773212, "grad_norm": 2.203125, "learning_rate": 6.8733829778409425e-06, "loss": 1.04986324, "memory(GiB)": 369.42, "step": 32235, "train_speed(iter/s)": 0.200698 }, { "acc": 0.76179891, "epoch": 0.8178589548452562, "grad_norm": 2.203125, "learning_rate": 6.872410699364278e-06, "loss": 0.88218918, "memory(GiB)": 369.42, "step": 32240, "train_speed(iter/s)": 0.200702 }, { "acc": 0.74550571, "epoch": 0.8179857940131913, "grad_norm": 2.171875, "learning_rate": 6.8714383385306305e-06, "loss": 1.04623852, "memory(GiB)": 369.42, "step": 32245, "train_speed(iter/s)": 0.200706 }, { "acc": 0.75224514, "epoch": 0.8181126331811264, "grad_norm": 1.75, "learning_rate": 6.870465895382769e-06, "loss": 0.97293205, "memory(GiB)": 369.42, "step": 32250, "train_speed(iter/s)": 0.200711 }, { "acc": 0.76090016, "epoch": 0.8182394723490614, "grad_norm": 1.9453125, "learning_rate": 6.869493369963468e-06, "loss": 0.97980089, "memory(GiB)": 369.42, "step": 32255, "train_speed(iter/s)": 0.200715 }, { "acc": 0.75279613, "epoch": 0.8183663115169965, "grad_norm": 2.515625, "learning_rate": 6.8685207623155e-06, "loss": 1.0145443, "memory(GiB)": 369.42, "step": 32260, "train_speed(iter/s)": 0.200721 }, { "acc": 0.73599892, "epoch": 0.8184931506849316, "grad_norm": 2.140625, "learning_rate": 6.867548072481649e-06, "loss": 1.05687447, "memory(GiB)": 369.42, "step": 32265, "train_speed(iter/s)": 0.200727 }, { "acc": 0.73076382, "epoch": 0.8186199898528665, "grad_norm": 2.046875, "learning_rate": 6.866575300504695e-06, "loss": 1.00764484, "memory(GiB)": 369.42, "step": 32270, "train_speed(iter/s)": 0.200732 }, { "acc": 0.75814996, "epoch": 0.8187468290208016, "grad_norm": 2.609375, "learning_rate": 6.865602446427424e-06, "loss": 0.98949738, "memory(GiB)": 369.42, "step": 32275, "train_speed(iter/s)": 0.200738 }, { "acc": 0.74431295, "epoch": 0.8188736681887366, "grad_norm": 2.640625, "learning_rate": 6.864629510292629e-06, "loss": 1.00238419, "memory(GiB)": 369.42, "step": 32280, "train_speed(iter/s)": 0.20074 }, { "acc": 0.75938263, "epoch": 0.8190005073566717, "grad_norm": 1.65625, "learning_rate": 6.863656492143103e-06, "loss": 0.9522644, "memory(GiB)": 369.42, "step": 32285, "train_speed(iter/s)": 0.20074 }, { "acc": 0.73159657, "epoch": 0.8191273465246068, "grad_norm": 2.578125, "learning_rate": 6.862683392021644e-06, "loss": 1.01675625, "memory(GiB)": 369.42, "step": 32290, "train_speed(iter/s)": 0.200743 }, { "acc": 0.76840229, "epoch": 0.8192541856925418, "grad_norm": 2.265625, "learning_rate": 6.861710209971052e-06, "loss": 0.90686493, "memory(GiB)": 369.42, "step": 32295, "train_speed(iter/s)": 0.200746 }, { "acc": 0.74488993, "epoch": 0.8193810248604769, "grad_norm": 1.9453125, "learning_rate": 6.860736946034136e-06, "loss": 1.03237095, "memory(GiB)": 369.42, "step": 32300, "train_speed(iter/s)": 0.200749 }, { "acc": 0.73501616, "epoch": 0.819507864028412, "grad_norm": 2.1875, "learning_rate": 6.859763600253698e-06, "loss": 1.08153534, "memory(GiB)": 369.42, "step": 32305, "train_speed(iter/s)": 0.200751 }, { "acc": 0.74403758, "epoch": 0.819634703196347, "grad_norm": 1.9375, "learning_rate": 6.858790172672556e-06, "loss": 1.02018604, "memory(GiB)": 369.42, "step": 32310, "train_speed(iter/s)": 0.200755 }, { "acc": 0.75418606, "epoch": 0.8197615423642821, "grad_norm": 2.234375, "learning_rate": 6.857816663333523e-06, "loss": 1.00521145, "memory(GiB)": 369.42, "step": 32315, "train_speed(iter/s)": 0.200762 }, { "acc": 0.76792297, "epoch": 0.8198883815322171, "grad_norm": 2.609375, "learning_rate": 6.856843072279418e-06, "loss": 0.91875439, "memory(GiB)": 369.42, "step": 32320, "train_speed(iter/s)": 0.200765 }, { "acc": 0.74541473, "epoch": 0.8200152207001522, "grad_norm": 2.0, "learning_rate": 6.855869399553065e-06, "loss": 1.05869064, "memory(GiB)": 369.42, "step": 32325, "train_speed(iter/s)": 0.200768 }, { "acc": 0.74074101, "epoch": 0.8201420598680873, "grad_norm": 2.71875, "learning_rate": 6.85489564519729e-06, "loss": 1.00845175, "memory(GiB)": 369.42, "step": 32330, "train_speed(iter/s)": 0.200773 }, { "acc": 0.75939722, "epoch": 0.8202688990360223, "grad_norm": 2.25, "learning_rate": 6.853921809254922e-06, "loss": 0.97233677, "memory(GiB)": 369.42, "step": 32335, "train_speed(iter/s)": 0.200777 }, { "acc": 0.74052792, "epoch": 0.8203957382039574, "grad_norm": 2.34375, "learning_rate": 6.852947891768796e-06, "loss": 0.98026562, "memory(GiB)": 369.42, "step": 32340, "train_speed(iter/s)": 0.200782 }, { "acc": 0.76708946, "epoch": 0.8205225773718925, "grad_norm": 2.328125, "learning_rate": 6.851973892781749e-06, "loss": 0.93165579, "memory(GiB)": 369.42, "step": 32345, "train_speed(iter/s)": 0.200786 }, { "acc": 0.74250278, "epoch": 0.8206494165398275, "grad_norm": 2.125, "learning_rate": 6.850999812336623e-06, "loss": 0.99202175, "memory(GiB)": 369.42, "step": 32350, "train_speed(iter/s)": 0.200784 }, { "acc": 0.74223948, "epoch": 0.8207762557077626, "grad_norm": 2.375, "learning_rate": 6.850025650476259e-06, "loss": 0.99667196, "memory(GiB)": 369.42, "step": 32355, "train_speed(iter/s)": 0.200778 }, { "acc": 0.73612003, "epoch": 0.8209030948756976, "grad_norm": 1.9453125, "learning_rate": 6.849051407243509e-06, "loss": 1.0197607, "memory(GiB)": 369.42, "step": 32360, "train_speed(iter/s)": 0.200781 }, { "acc": 0.73510809, "epoch": 0.8210299340436327, "grad_norm": 2.296875, "learning_rate": 6.8480770826812205e-06, "loss": 1.03634348, "memory(GiB)": 369.42, "step": 32365, "train_speed(iter/s)": 0.200786 }, { "acc": 0.74830947, "epoch": 0.8211567732115678, "grad_norm": 2.328125, "learning_rate": 6.847102676832253e-06, "loss": 1.04516582, "memory(GiB)": 369.42, "step": 32370, "train_speed(iter/s)": 0.200791 }, { "acc": 0.74892621, "epoch": 0.8212836123795028, "grad_norm": 2.421875, "learning_rate": 6.8461281897394615e-06, "loss": 1.02389755, "memory(GiB)": 369.42, "step": 32375, "train_speed(iter/s)": 0.200795 }, { "acc": 0.73757963, "epoch": 0.8214104515474379, "grad_norm": 2.328125, "learning_rate": 6.845153621445711e-06, "loss": 1.0478982, "memory(GiB)": 369.42, "step": 32380, "train_speed(iter/s)": 0.200802 }, { "acc": 0.74146695, "epoch": 0.821537290715373, "grad_norm": 2.0625, "learning_rate": 6.844178971993866e-06, "loss": 1.07023783, "memory(GiB)": 369.42, "step": 32385, "train_speed(iter/s)": 0.200806 }, { "acc": 0.75098586, "epoch": 0.821664129883308, "grad_norm": 2.328125, "learning_rate": 6.843204241426797e-06, "loss": 1.02575951, "memory(GiB)": 369.42, "step": 32390, "train_speed(iter/s)": 0.200813 }, { "acc": 0.75722327, "epoch": 0.821790969051243, "grad_norm": 1.796875, "learning_rate": 6.842229429787375e-06, "loss": 0.99817162, "memory(GiB)": 369.42, "step": 32395, "train_speed(iter/s)": 0.200816 }, { "acc": 0.74639473, "epoch": 0.821917808219178, "grad_norm": 2.15625, "learning_rate": 6.841254537118477e-06, "loss": 0.95461655, "memory(GiB)": 369.42, "step": 32400, "train_speed(iter/s)": 0.20082 }, { "acc": 0.7341629, "epoch": 0.8220446473871131, "grad_norm": 2.25, "learning_rate": 6.840279563462985e-06, "loss": 1.08950491, "memory(GiB)": 369.42, "step": 32405, "train_speed(iter/s)": 0.200826 }, { "acc": 0.75168648, "epoch": 0.8221714865550482, "grad_norm": 2.203125, "learning_rate": 6.839304508863781e-06, "loss": 1.00730429, "memory(GiB)": 369.42, "step": 32410, "train_speed(iter/s)": 0.20083 }, { "acc": 0.74803066, "epoch": 0.8222983257229832, "grad_norm": 1.921875, "learning_rate": 6.838329373363753e-06, "loss": 1.01555452, "memory(GiB)": 369.42, "step": 32415, "train_speed(iter/s)": 0.200833 }, { "acc": 0.75369625, "epoch": 0.8224251648909183, "grad_norm": 1.8046875, "learning_rate": 6.8373541570057924e-06, "loss": 1.00554676, "memory(GiB)": 369.42, "step": 32420, "train_speed(iter/s)": 0.200834 }, { "acc": 0.75158901, "epoch": 0.8225520040588534, "grad_norm": 2.359375, "learning_rate": 6.836378859832791e-06, "loss": 1.03356819, "memory(GiB)": 369.42, "step": 32425, "train_speed(iter/s)": 0.200841 }, { "acc": 0.75184455, "epoch": 0.8226788432267884, "grad_norm": 1.8828125, "learning_rate": 6.83540348188765e-06, "loss": 0.97016373, "memory(GiB)": 369.42, "step": 32430, "train_speed(iter/s)": 0.200845 }, { "acc": 0.75560508, "epoch": 0.8228056823947235, "grad_norm": 2.6875, "learning_rate": 6.834428023213268e-06, "loss": 0.96513872, "memory(GiB)": 369.42, "step": 32435, "train_speed(iter/s)": 0.200852 }, { "acc": 0.74472394, "epoch": 0.8229325215626585, "grad_norm": 1.8828125, "learning_rate": 6.833452483852554e-06, "loss": 0.99012308, "memory(GiB)": 369.42, "step": 32440, "train_speed(iter/s)": 0.200857 }, { "acc": 0.75427451, "epoch": 0.8230593607305936, "grad_norm": 2.515625, "learning_rate": 6.832476863848411e-06, "loss": 0.98254938, "memory(GiB)": 369.42, "step": 32445, "train_speed(iter/s)": 0.200862 }, { "acc": 0.75080976, "epoch": 0.8231861998985287, "grad_norm": 2.3125, "learning_rate": 6.831501163243756e-06, "loss": 1.03700905, "memory(GiB)": 369.42, "step": 32450, "train_speed(iter/s)": 0.200867 }, { "acc": 0.74686699, "epoch": 0.8233130390664637, "grad_norm": 2.90625, "learning_rate": 6.830525382081501e-06, "loss": 1.04262562, "memory(GiB)": 369.42, "step": 32455, "train_speed(iter/s)": 0.200873 }, { "acc": 0.74893146, "epoch": 0.8234398782343988, "grad_norm": 1.796875, "learning_rate": 6.829549520404568e-06, "loss": 0.99408646, "memory(GiB)": 369.42, "step": 32460, "train_speed(iter/s)": 0.200879 }, { "acc": 0.75177975, "epoch": 0.8235667174023339, "grad_norm": 2.125, "learning_rate": 6.828573578255879e-06, "loss": 0.95875072, "memory(GiB)": 369.42, "step": 32465, "train_speed(iter/s)": 0.200879 }, { "acc": 0.74024358, "epoch": 0.8236935565702689, "grad_norm": 2.109375, "learning_rate": 6.82759755567836e-06, "loss": 1.021422, "memory(GiB)": 369.42, "step": 32470, "train_speed(iter/s)": 0.200882 }, { "acc": 0.74668026, "epoch": 0.823820395738204, "grad_norm": 2.1875, "learning_rate": 6.826621452714941e-06, "loss": 1.04091072, "memory(GiB)": 369.42, "step": 32475, "train_speed(iter/s)": 0.200888 }, { "acc": 0.74249525, "epoch": 0.823947234906139, "grad_norm": 2.0, "learning_rate": 6.825645269408556e-06, "loss": 1.01104317, "memory(GiB)": 369.42, "step": 32480, "train_speed(iter/s)": 0.200893 }, { "acc": 0.75209589, "epoch": 0.8240740740740741, "grad_norm": 2.25, "learning_rate": 6.82466900580214e-06, "loss": 0.97187023, "memory(GiB)": 369.42, "step": 32485, "train_speed(iter/s)": 0.200898 }, { "acc": 0.73269849, "epoch": 0.8242009132420092, "grad_norm": 2.15625, "learning_rate": 6.823692661938634e-06, "loss": 1.05793238, "memory(GiB)": 369.42, "step": 32490, "train_speed(iter/s)": 0.200903 }, { "acc": 0.74821157, "epoch": 0.8243277524099442, "grad_norm": 2.296875, "learning_rate": 6.822716237860984e-06, "loss": 1.02145948, "memory(GiB)": 369.42, "step": 32495, "train_speed(iter/s)": 0.200905 }, { "acc": 0.75437794, "epoch": 0.8244545915778793, "grad_norm": 1.7890625, "learning_rate": 6.821739733612135e-06, "loss": 0.9687027, "memory(GiB)": 369.42, "step": 32500, "train_speed(iter/s)": 0.200911 }, { "acc": 0.74778652, "epoch": 0.8245814307458144, "grad_norm": 2.453125, "learning_rate": 6.820763149235039e-06, "loss": 1.03601627, "memory(GiB)": 369.42, "step": 32505, "train_speed(iter/s)": 0.200914 }, { "acc": 0.74925575, "epoch": 0.8247082699137493, "grad_norm": 2.0, "learning_rate": 6.819786484772652e-06, "loss": 0.98342142, "memory(GiB)": 369.42, "step": 32510, "train_speed(iter/s)": 0.200916 }, { "acc": 0.72557745, "epoch": 0.8248351090816844, "grad_norm": 2.265625, "learning_rate": 6.8188097402679275e-06, "loss": 1.09267187, "memory(GiB)": 369.42, "step": 32515, "train_speed(iter/s)": 0.200919 }, { "acc": 0.75822554, "epoch": 0.8249619482496194, "grad_norm": 1.875, "learning_rate": 6.817832915763833e-06, "loss": 0.92524929, "memory(GiB)": 369.42, "step": 32520, "train_speed(iter/s)": 0.200921 }, { "acc": 0.74144812, "epoch": 0.8250887874175545, "grad_norm": 1.90625, "learning_rate": 6.81685601130333e-06, "loss": 0.98517914, "memory(GiB)": 369.42, "step": 32525, "train_speed(iter/s)": 0.200925 }, { "acc": 0.75918198, "epoch": 0.8252156265854896, "grad_norm": 2.828125, "learning_rate": 6.8158790269293885e-06, "loss": 0.96350651, "memory(GiB)": 369.42, "step": 32530, "train_speed(iter/s)": 0.20093 }, { "acc": 0.75682859, "epoch": 0.8253424657534246, "grad_norm": 2.53125, "learning_rate": 6.8149019626849785e-06, "loss": 0.9242836, "memory(GiB)": 369.42, "step": 32535, "train_speed(iter/s)": 0.200934 }, { "acc": 0.73870969, "epoch": 0.8254693049213597, "grad_norm": 2.078125, "learning_rate": 6.813924818613079e-06, "loss": 1.03561497, "memory(GiB)": 369.42, "step": 32540, "train_speed(iter/s)": 0.200937 }, { "acc": 0.7601716, "epoch": 0.8255961440892948, "grad_norm": 2.328125, "learning_rate": 6.812947594756667e-06, "loss": 1.03648186, "memory(GiB)": 369.42, "step": 32545, "train_speed(iter/s)": 0.200941 }, { "acc": 0.76047907, "epoch": 0.8257229832572298, "grad_norm": 2.078125, "learning_rate": 6.811970291158725e-06, "loss": 0.99547977, "memory(GiB)": 369.42, "step": 32550, "train_speed(iter/s)": 0.200945 }, { "acc": 0.75389671, "epoch": 0.8258498224251649, "grad_norm": 2.0625, "learning_rate": 6.810992907862239e-06, "loss": 0.99476566, "memory(GiB)": 369.42, "step": 32555, "train_speed(iter/s)": 0.200949 }, { "acc": 0.74516244, "epoch": 0.8259766615930999, "grad_norm": 2.4375, "learning_rate": 6.810015444910202e-06, "loss": 1.00167265, "memory(GiB)": 369.42, "step": 32560, "train_speed(iter/s)": 0.200954 }, { "acc": 0.74906492, "epoch": 0.826103500761035, "grad_norm": 2.625, "learning_rate": 6.809037902345603e-06, "loss": 1.04154053, "memory(GiB)": 369.42, "step": 32565, "train_speed(iter/s)": 0.20096 }, { "acc": 0.739604, "epoch": 0.8262303399289701, "grad_norm": 2.4375, "learning_rate": 6.808060280211439e-06, "loss": 1.00075302, "memory(GiB)": 369.42, "step": 32570, "train_speed(iter/s)": 0.200963 }, { "acc": 0.75570831, "epoch": 0.8263571790969051, "grad_norm": 2.265625, "learning_rate": 6.807082578550713e-06, "loss": 1.03229675, "memory(GiB)": 369.42, "step": 32575, "train_speed(iter/s)": 0.200968 }, { "acc": 0.74681177, "epoch": 0.8264840182648402, "grad_norm": 2.125, "learning_rate": 6.806104797406428e-06, "loss": 0.9841177, "memory(GiB)": 369.42, "step": 32580, "train_speed(iter/s)": 0.200972 }, { "acc": 0.73950248, "epoch": 0.8266108574327753, "grad_norm": 2.125, "learning_rate": 6.805126936821588e-06, "loss": 1.00301208, "memory(GiB)": 369.42, "step": 32585, "train_speed(iter/s)": 0.200975 }, { "acc": 0.75089092, "epoch": 0.8267376966007103, "grad_norm": 2.171875, "learning_rate": 6.804148996839208e-06, "loss": 1.01720934, "memory(GiB)": 369.42, "step": 32590, "train_speed(iter/s)": 0.20098 }, { "acc": 0.74492416, "epoch": 0.8268645357686454, "grad_norm": 2.109375, "learning_rate": 6.803170977502298e-06, "loss": 1.02890148, "memory(GiB)": 369.42, "step": 32595, "train_speed(iter/s)": 0.200983 }, { "acc": 0.75770845, "epoch": 0.8269913749365804, "grad_norm": 3.234375, "learning_rate": 6.802192878853879e-06, "loss": 0.99616718, "memory(GiB)": 369.42, "step": 32600, "train_speed(iter/s)": 0.200986 }, { "acc": 0.74776487, "epoch": 0.8271182141045155, "grad_norm": 2.265625, "learning_rate": 6.801214700936972e-06, "loss": 0.96628294, "memory(GiB)": 369.42, "step": 32605, "train_speed(iter/s)": 0.20099 }, { "acc": 0.75292997, "epoch": 0.8272450532724506, "grad_norm": 2.203125, "learning_rate": 6.8002364437946e-06, "loss": 0.99048109, "memory(GiB)": 369.42, "step": 32610, "train_speed(iter/s)": 0.200992 }, { "acc": 0.74142962, "epoch": 0.8273718924403856, "grad_norm": 2.421875, "learning_rate": 6.799258107469792e-06, "loss": 1.03380203, "memory(GiB)": 369.42, "step": 32615, "train_speed(iter/s)": 0.200997 }, { "acc": 0.74820099, "epoch": 0.8274987316083207, "grad_norm": 1.7421875, "learning_rate": 6.798279692005578e-06, "loss": 0.96502447, "memory(GiB)": 369.42, "step": 32620, "train_speed(iter/s)": 0.200997 }, { "acc": 0.75671811, "epoch": 0.8276255707762558, "grad_norm": 2.546875, "learning_rate": 6.7973011974449965e-06, "loss": 0.95818481, "memory(GiB)": 369.42, "step": 32625, "train_speed(iter/s)": 0.201 }, { "acc": 0.75463524, "epoch": 0.8277524099441907, "grad_norm": 2.5625, "learning_rate": 6.796322623831082e-06, "loss": 0.97083073, "memory(GiB)": 369.42, "step": 32630, "train_speed(iter/s)": 0.201007 }, { "acc": 0.74278607, "epoch": 0.8278792491121258, "grad_norm": 2.15625, "learning_rate": 6.795343971206879e-06, "loss": 0.99851189, "memory(GiB)": 369.42, "step": 32635, "train_speed(iter/s)": 0.201013 }, { "acc": 0.74868059, "epoch": 0.8280060882800608, "grad_norm": 2.953125, "learning_rate": 6.794365239615433e-06, "loss": 1.00650568, "memory(GiB)": 369.42, "step": 32640, "train_speed(iter/s)": 0.201017 }, { "acc": 0.75388937, "epoch": 0.8281329274479959, "grad_norm": 1.8828125, "learning_rate": 6.793386429099792e-06, "loss": 1.02585278, "memory(GiB)": 369.42, "step": 32645, "train_speed(iter/s)": 0.201019 }, { "acc": 0.75454531, "epoch": 0.828259766615931, "grad_norm": 1.8515625, "learning_rate": 6.79240753970301e-06, "loss": 0.98314342, "memory(GiB)": 369.42, "step": 32650, "train_speed(iter/s)": 0.201023 }, { "acc": 0.73753633, "epoch": 0.828386605783866, "grad_norm": 1.9921875, "learning_rate": 6.791428571468139e-06, "loss": 1.06516266, "memory(GiB)": 369.42, "step": 32655, "train_speed(iter/s)": 0.201026 }, { "acc": 0.7428822, "epoch": 0.8285134449518011, "grad_norm": 2.734375, "learning_rate": 6.7904495244382454e-06, "loss": 1.04614162, "memory(GiB)": 369.42, "step": 32660, "train_speed(iter/s)": 0.201031 }, { "acc": 0.73693924, "epoch": 0.8286402841197362, "grad_norm": 1.90625, "learning_rate": 6.789470398656385e-06, "loss": 1.07207918, "memory(GiB)": 369.42, "step": 32665, "train_speed(iter/s)": 0.201036 }, { "acc": 0.75500183, "epoch": 0.8287671232876712, "grad_norm": 2.21875, "learning_rate": 6.788491194165629e-06, "loss": 0.98727427, "memory(GiB)": 369.42, "step": 32670, "train_speed(iter/s)": 0.201042 }, { "acc": 0.76906834, "epoch": 0.8288939624556063, "grad_norm": 1.984375, "learning_rate": 6.787511911009044e-06, "loss": 0.92573185, "memory(GiB)": 369.42, "step": 32675, "train_speed(iter/s)": 0.201047 }, { "acc": 0.7407979, "epoch": 0.8290208016235413, "grad_norm": 2.09375, "learning_rate": 6.786532549229704e-06, "loss": 1.06758308, "memory(GiB)": 369.42, "step": 32680, "train_speed(iter/s)": 0.201048 }, { "acc": 0.75563011, "epoch": 0.8291476407914764, "grad_norm": 2.078125, "learning_rate": 6.785553108870686e-06, "loss": 0.98218708, "memory(GiB)": 369.42, "step": 32685, "train_speed(iter/s)": 0.201051 }, { "acc": 0.73964081, "epoch": 0.8292744799594115, "grad_norm": 2.09375, "learning_rate": 6.784573589975072e-06, "loss": 0.99925919, "memory(GiB)": 369.42, "step": 32690, "train_speed(iter/s)": 0.201058 }, { "acc": 0.73465576, "epoch": 0.8294013191273465, "grad_norm": 2.625, "learning_rate": 6.783593992585943e-06, "loss": 1.03480883, "memory(GiB)": 369.42, "step": 32695, "train_speed(iter/s)": 0.201063 }, { "acc": 0.74423146, "epoch": 0.8295281582952816, "grad_norm": 2.203125, "learning_rate": 6.7826143167463876e-06, "loss": 1.02147989, "memory(GiB)": 369.42, "step": 32700, "train_speed(iter/s)": 0.201068 }, { "acc": 0.75334072, "epoch": 0.8296549974632167, "grad_norm": 1.9453125, "learning_rate": 6.781634562499495e-06, "loss": 1.00749168, "memory(GiB)": 369.42, "step": 32705, "train_speed(iter/s)": 0.20107 }, { "acc": 0.74721985, "epoch": 0.8297818366311517, "grad_norm": 2.359375, "learning_rate": 6.780654729888361e-06, "loss": 1.0246727, "memory(GiB)": 369.42, "step": 32710, "train_speed(iter/s)": 0.201074 }, { "acc": 0.75174041, "epoch": 0.8299086757990868, "grad_norm": 2.734375, "learning_rate": 6.779674818956081e-06, "loss": 0.99973202, "memory(GiB)": 369.42, "step": 32715, "train_speed(iter/s)": 0.201079 }, { "acc": 0.7405654, "epoch": 0.8300355149670218, "grad_norm": 2.3125, "learning_rate": 6.778694829745756e-06, "loss": 1.03570518, "memory(GiB)": 369.42, "step": 32720, "train_speed(iter/s)": 0.201084 }, { "acc": 0.73593502, "epoch": 0.8301623541349569, "grad_norm": 2.28125, "learning_rate": 6.777714762300492e-06, "loss": 1.01131401, "memory(GiB)": 369.42, "step": 32725, "train_speed(iter/s)": 0.20109 }, { "acc": 0.76133265, "epoch": 0.830289193302892, "grad_norm": 2.0625, "learning_rate": 6.776734616663397e-06, "loss": 0.96399193, "memory(GiB)": 369.42, "step": 32730, "train_speed(iter/s)": 0.201094 }, { "acc": 0.76312523, "epoch": 0.830416032470827, "grad_norm": 1.9609375, "learning_rate": 6.77575439287758e-06, "loss": 0.92760944, "memory(GiB)": 369.42, "step": 32735, "train_speed(iter/s)": 0.201097 }, { "acc": 0.74941454, "epoch": 0.8305428716387621, "grad_norm": 2.28125, "learning_rate": 6.774774090986157e-06, "loss": 0.97333355, "memory(GiB)": 369.42, "step": 32740, "train_speed(iter/s)": 0.201101 }, { "acc": 0.75775995, "epoch": 0.8306697108066972, "grad_norm": 2.125, "learning_rate": 6.773793711032244e-06, "loss": 1.01630173, "memory(GiB)": 369.42, "step": 32745, "train_speed(iter/s)": 0.201106 }, { "acc": 0.74016314, "epoch": 0.8307965499746321, "grad_norm": 2.359375, "learning_rate": 6.772813253058965e-06, "loss": 1.03248882, "memory(GiB)": 369.42, "step": 32750, "train_speed(iter/s)": 0.201111 }, { "acc": 0.75626678, "epoch": 0.8309233891425672, "grad_norm": 2.40625, "learning_rate": 6.771832717109444e-06, "loss": 1.01501446, "memory(GiB)": 369.42, "step": 32755, "train_speed(iter/s)": 0.201115 }, { "acc": 0.7443553, "epoch": 0.8310502283105022, "grad_norm": 1.953125, "learning_rate": 6.77085210322681e-06, "loss": 1.02230129, "memory(GiB)": 369.42, "step": 32760, "train_speed(iter/s)": 0.20112 }, { "acc": 0.73390551, "epoch": 0.8311770674784373, "grad_norm": 1.96875, "learning_rate": 6.769871411454195e-06, "loss": 0.99288368, "memory(GiB)": 369.42, "step": 32765, "train_speed(iter/s)": 0.201122 }, { "acc": 0.72879972, "epoch": 0.8313039066463724, "grad_norm": 2.15625, "learning_rate": 6.768890641834732e-06, "loss": 1.04064207, "memory(GiB)": 369.42, "step": 32770, "train_speed(iter/s)": 0.201127 }, { "acc": 0.74460888, "epoch": 0.8314307458143074, "grad_norm": 1.890625, "learning_rate": 6.767909794411562e-06, "loss": 0.99995718, "memory(GiB)": 369.42, "step": 32775, "train_speed(iter/s)": 0.20113 }, { "acc": 0.74910293, "epoch": 0.8315575849822425, "grad_norm": 2.984375, "learning_rate": 6.7669288692278256e-06, "loss": 1.01463156, "memory(GiB)": 369.42, "step": 32780, "train_speed(iter/s)": 0.201133 }, { "acc": 0.7424696, "epoch": 0.8316844241501776, "grad_norm": 2.09375, "learning_rate": 6.76594786632667e-06, "loss": 0.95054359, "memory(GiB)": 369.42, "step": 32785, "train_speed(iter/s)": 0.201134 }, { "acc": 0.74618053, "epoch": 0.8318112633181126, "grad_norm": 2.296875, "learning_rate": 6.764966785751242e-06, "loss": 1.02548885, "memory(GiB)": 369.42, "step": 32790, "train_speed(iter/s)": 0.201139 }, { "acc": 0.74809437, "epoch": 0.8319381024860477, "grad_norm": 2.046875, "learning_rate": 6.763985627544693e-06, "loss": 0.99379606, "memory(GiB)": 369.42, "step": 32795, "train_speed(iter/s)": 0.201143 }, { "acc": 0.73977237, "epoch": 0.8320649416539827, "grad_norm": 2.15625, "learning_rate": 6.763004391750183e-06, "loss": 0.99856606, "memory(GiB)": 369.42, "step": 32800, "train_speed(iter/s)": 0.201149 }, { "acc": 0.74119463, "epoch": 0.8321917808219178, "grad_norm": 2.59375, "learning_rate": 6.762023078410867e-06, "loss": 1.0429285, "memory(GiB)": 369.42, "step": 32805, "train_speed(iter/s)": 0.201154 }, { "acc": 0.74796262, "epoch": 0.8323186199898529, "grad_norm": 1.8984375, "learning_rate": 6.7610416875699095e-06, "loss": 0.97533035, "memory(GiB)": 369.42, "step": 32810, "train_speed(iter/s)": 0.201159 }, { "acc": 0.75297985, "epoch": 0.8324454591577879, "grad_norm": 1.9453125, "learning_rate": 6.760060219270476e-06, "loss": 1.0311657, "memory(GiB)": 369.42, "step": 32815, "train_speed(iter/s)": 0.201161 }, { "acc": 0.75308361, "epoch": 0.832572298325723, "grad_norm": 2.296875, "learning_rate": 6.759078673555736e-06, "loss": 1.03205261, "memory(GiB)": 369.42, "step": 32820, "train_speed(iter/s)": 0.201165 }, { "acc": 0.74301863, "epoch": 0.8326991374936581, "grad_norm": 2.296875, "learning_rate": 6.758097050468862e-06, "loss": 1.03282423, "memory(GiB)": 369.42, "step": 32825, "train_speed(iter/s)": 0.201171 }, { "acc": 0.75639229, "epoch": 0.8328259766615931, "grad_norm": 2.28125, "learning_rate": 6.757115350053032e-06, "loss": 1.03567505, "memory(GiB)": 369.42, "step": 32830, "train_speed(iter/s)": 0.201177 }, { "acc": 0.74491529, "epoch": 0.8329528158295282, "grad_norm": 1.9609375, "learning_rate": 6.756133572351422e-06, "loss": 1.0326025, "memory(GiB)": 369.42, "step": 32835, "train_speed(iter/s)": 0.201181 }, { "acc": 0.76083722, "epoch": 0.8330796549974632, "grad_norm": 2.296875, "learning_rate": 6.755151717407218e-06, "loss": 0.92270088, "memory(GiB)": 369.42, "step": 32840, "train_speed(iter/s)": 0.201187 }, { "acc": 0.74537344, "epoch": 0.8332064941653983, "grad_norm": 1.78125, "learning_rate": 6.754169785263605e-06, "loss": 1.00752058, "memory(GiB)": 369.42, "step": 32845, "train_speed(iter/s)": 0.201189 }, { "acc": 0.7533917, "epoch": 0.8333333333333334, "grad_norm": 2.390625, "learning_rate": 6.753187775963773e-06, "loss": 0.9926403, "memory(GiB)": 369.42, "step": 32850, "train_speed(iter/s)": 0.201193 }, { "acc": 0.76232305, "epoch": 0.8334601725012684, "grad_norm": 2.46875, "learning_rate": 6.752205689550915e-06, "loss": 0.93336687, "memory(GiB)": 369.42, "step": 32855, "train_speed(iter/s)": 0.201198 }, { "acc": 0.74691248, "epoch": 0.8335870116692035, "grad_norm": 2.0625, "learning_rate": 6.751223526068228e-06, "loss": 0.9894228, "memory(GiB)": 369.42, "step": 32860, "train_speed(iter/s)": 0.201203 }, { "acc": 0.73940182, "epoch": 0.8337138508371386, "grad_norm": 2.015625, "learning_rate": 6.75024128555891e-06, "loss": 1.04580441, "memory(GiB)": 369.42, "step": 32865, "train_speed(iter/s)": 0.201209 }, { "acc": 0.74605875, "epoch": 0.8338406900050735, "grad_norm": 1.859375, "learning_rate": 6.7492589680661695e-06, "loss": 1.00049191, "memory(GiB)": 369.42, "step": 32870, "train_speed(iter/s)": 0.201216 }, { "acc": 0.74164453, "epoch": 0.8339675291730086, "grad_norm": 2.15625, "learning_rate": 6.748276573633207e-06, "loss": 1.03182316, "memory(GiB)": 369.42, "step": 32875, "train_speed(iter/s)": 0.201218 }, { "acc": 0.75062923, "epoch": 0.8340943683409436, "grad_norm": 2.078125, "learning_rate": 6.747294102303237e-06, "loss": 0.99145298, "memory(GiB)": 369.42, "step": 32880, "train_speed(iter/s)": 0.201222 }, { "acc": 0.75048118, "epoch": 0.8342212075088787, "grad_norm": 2.140625, "learning_rate": 6.746311554119469e-06, "loss": 0.97368145, "memory(GiB)": 369.42, "step": 32885, "train_speed(iter/s)": 0.201227 }, { "acc": 0.76331897, "epoch": 0.8343480466768138, "grad_norm": 2.375, "learning_rate": 6.745328929125125e-06, "loss": 0.96204205, "memory(GiB)": 369.42, "step": 32890, "train_speed(iter/s)": 0.201232 }, { "acc": 0.74397001, "epoch": 0.8344748858447488, "grad_norm": 2.21875, "learning_rate": 6.7443462273634195e-06, "loss": 0.96765633, "memory(GiB)": 369.42, "step": 32895, "train_speed(iter/s)": 0.201237 }, { "acc": 0.74915705, "epoch": 0.8346017250126839, "grad_norm": 2.421875, "learning_rate": 6.74336344887758e-06, "loss": 0.98200188, "memory(GiB)": 369.42, "step": 32900, "train_speed(iter/s)": 0.201244 }, { "acc": 0.75147896, "epoch": 0.834728564180619, "grad_norm": 2.25, "learning_rate": 6.742380593710834e-06, "loss": 1.01089544, "memory(GiB)": 369.42, "step": 32905, "train_speed(iter/s)": 0.201246 }, { "acc": 0.7350853, "epoch": 0.834855403348554, "grad_norm": 2.375, "learning_rate": 6.7413976619064085e-06, "loss": 1.07212877, "memory(GiB)": 369.42, "step": 32910, "train_speed(iter/s)": 0.201251 }, { "acc": 0.75520658, "epoch": 0.8349822425164891, "grad_norm": 2.234375, "learning_rate": 6.74041465350754e-06, "loss": 0.93177891, "memory(GiB)": 369.42, "step": 32915, "train_speed(iter/s)": 0.201249 }, { "acc": 0.76518917, "epoch": 0.8351090816844241, "grad_norm": 1.9453125, "learning_rate": 6.739431568557464e-06, "loss": 0.90533457, "memory(GiB)": 369.42, "step": 32920, "train_speed(iter/s)": 0.201254 }, { "acc": 0.73296313, "epoch": 0.8352359208523592, "grad_norm": 1.9921875, "learning_rate": 6.738448407099423e-06, "loss": 1.07307339, "memory(GiB)": 369.42, "step": 32925, "train_speed(iter/s)": 0.201261 }, { "acc": 0.74402542, "epoch": 0.8353627600202943, "grad_norm": 2.265625, "learning_rate": 6.737465169176658e-06, "loss": 0.98893681, "memory(GiB)": 369.42, "step": 32930, "train_speed(iter/s)": 0.201265 }, { "acc": 0.74293509, "epoch": 0.8354895991882293, "grad_norm": 2.1875, "learning_rate": 6.736481854832418e-06, "loss": 1.02880135, "memory(GiB)": 369.42, "step": 32935, "train_speed(iter/s)": 0.201271 }, { "acc": 0.7433054, "epoch": 0.8356164383561644, "grad_norm": 2.453125, "learning_rate": 6.735498464109953e-06, "loss": 1.01739864, "memory(GiB)": 369.42, "step": 32940, "train_speed(iter/s)": 0.201275 }, { "acc": 0.75352912, "epoch": 0.8357432775240995, "grad_norm": 2.625, "learning_rate": 6.734514997052517e-06, "loss": 0.97045403, "memory(GiB)": 369.42, "step": 32945, "train_speed(iter/s)": 0.201279 }, { "acc": 0.75329332, "epoch": 0.8358701166920345, "grad_norm": 2.484375, "learning_rate": 6.733531453703368e-06, "loss": 0.99480839, "memory(GiB)": 369.42, "step": 32950, "train_speed(iter/s)": 0.201281 }, { "acc": 0.74627004, "epoch": 0.8359969558599696, "grad_norm": 2.265625, "learning_rate": 6.732547834105765e-06, "loss": 0.94583683, "memory(GiB)": 369.42, "step": 32955, "train_speed(iter/s)": 0.201287 }, { "acc": 0.75630474, "epoch": 0.8361237950279046, "grad_norm": 2.3125, "learning_rate": 6.731564138302975e-06, "loss": 0.99159279, "memory(GiB)": 369.42, "step": 32960, "train_speed(iter/s)": 0.201293 }, { "acc": 0.75419812, "epoch": 0.8362506341958397, "grad_norm": 2.0, "learning_rate": 6.730580366338261e-06, "loss": 0.99091301, "memory(GiB)": 369.42, "step": 32965, "train_speed(iter/s)": 0.201299 }, { "acc": 0.76083965, "epoch": 0.8363774733637748, "grad_norm": 2.03125, "learning_rate": 6.729596518254897e-06, "loss": 0.95236473, "memory(GiB)": 369.42, "step": 32970, "train_speed(iter/s)": 0.2013 }, { "acc": 0.74888978, "epoch": 0.8365043125317098, "grad_norm": 2.703125, "learning_rate": 6.728612594096155e-06, "loss": 0.97836246, "memory(GiB)": 369.42, "step": 32975, "train_speed(iter/s)": 0.201306 }, { "acc": 0.74276371, "epoch": 0.8366311516996449, "grad_norm": 1.84375, "learning_rate": 6.727628593905315e-06, "loss": 1.0187273, "memory(GiB)": 369.42, "step": 32980, "train_speed(iter/s)": 0.201311 }, { "acc": 0.75740213, "epoch": 0.83675799086758, "grad_norm": 2.21875, "learning_rate": 6.726644517725655e-06, "loss": 1.00818491, "memory(GiB)": 369.42, "step": 32985, "train_speed(iter/s)": 0.201316 }, { "acc": 0.72510986, "epoch": 0.836884830035515, "grad_norm": 2.375, "learning_rate": 6.725660365600462e-06, "loss": 1.06634302, "memory(GiB)": 369.42, "step": 32990, "train_speed(iter/s)": 0.201319 }, { "acc": 0.72706351, "epoch": 0.83701166920345, "grad_norm": 2.53125, "learning_rate": 6.724676137573021e-06, "loss": 1.04357815, "memory(GiB)": 369.42, "step": 32995, "train_speed(iter/s)": 0.201323 }, { "acc": 0.73853221, "epoch": 0.837138508371385, "grad_norm": 2.828125, "learning_rate": 6.723691833686622e-06, "loss": 0.99242458, "memory(GiB)": 369.42, "step": 33000, "train_speed(iter/s)": 0.201329 }, { "epoch": 0.837138508371385, "eval_acc": 0.7372669746168761, "eval_loss": 0.9728506207466125, "eval_runtime": 384.8169, "eval_samples_per_second": 16.553, "eval_steps_per_second": 8.277, "step": 33000 }, { "acc": 0.74481974, "epoch": 0.8372653475393201, "grad_norm": 2.765625, "learning_rate": 6.722707453984561e-06, "loss": 1.02793055, "memory(GiB)": 369.42, "step": 33005, "train_speed(iter/s)": 0.200461 }, { "acc": 0.75960946, "epoch": 0.8373921867072552, "grad_norm": 2.21875, "learning_rate": 6.721722998510135e-06, "loss": 1.03173513, "memory(GiB)": 369.42, "step": 33010, "train_speed(iter/s)": 0.200464 }, { "acc": 0.75205584, "epoch": 0.8375190258751902, "grad_norm": 2.0, "learning_rate": 6.720738467306644e-06, "loss": 0.99150429, "memory(GiB)": 369.42, "step": 33015, "train_speed(iter/s)": 0.200466 }, { "acc": 0.75265799, "epoch": 0.8376458650431253, "grad_norm": 2.03125, "learning_rate": 6.719753860417394e-06, "loss": 1.01561203, "memory(GiB)": 369.42, "step": 33020, "train_speed(iter/s)": 0.20047 }, { "acc": 0.73172073, "epoch": 0.8377727042110604, "grad_norm": 2.5, "learning_rate": 6.718769177885689e-06, "loss": 1.07606268, "memory(GiB)": 369.42, "step": 33025, "train_speed(iter/s)": 0.200475 }, { "acc": 0.76649742, "epoch": 0.8378995433789954, "grad_norm": 1.7578125, "learning_rate": 6.717784419754845e-06, "loss": 0.92295933, "memory(GiB)": 369.42, "step": 33030, "train_speed(iter/s)": 0.200481 }, { "acc": 0.75512996, "epoch": 0.8380263825469305, "grad_norm": 2.28125, "learning_rate": 6.71679958606817e-06, "loss": 0.963447, "memory(GiB)": 369.42, "step": 33035, "train_speed(iter/s)": 0.200487 }, { "acc": 0.73094406, "epoch": 0.8381532217148655, "grad_norm": 2.3125, "learning_rate": 6.715814676868985e-06, "loss": 1.05871277, "memory(GiB)": 369.42, "step": 33040, "train_speed(iter/s)": 0.200491 }, { "acc": 0.73834066, "epoch": 0.8382800608828006, "grad_norm": 1.9140625, "learning_rate": 6.714829692200611e-06, "loss": 1.03781223, "memory(GiB)": 369.42, "step": 33045, "train_speed(iter/s)": 0.200498 }, { "acc": 0.74494624, "epoch": 0.8384069000507357, "grad_norm": 2.78125, "learning_rate": 6.71384463210637e-06, "loss": 1.02159443, "memory(GiB)": 369.42, "step": 33050, "train_speed(iter/s)": 0.200501 }, { "acc": 0.7504137, "epoch": 0.8385337392186707, "grad_norm": 2.078125, "learning_rate": 6.7128594966295904e-06, "loss": 0.97877398, "memory(GiB)": 369.42, "step": 33055, "train_speed(iter/s)": 0.200507 }, { "acc": 0.75633621, "epoch": 0.8386605783866058, "grad_norm": 2.3125, "learning_rate": 6.711874285813602e-06, "loss": 0.96682663, "memory(GiB)": 369.42, "step": 33060, "train_speed(iter/s)": 0.200512 }, { "acc": 0.75264068, "epoch": 0.8387874175545409, "grad_norm": 2.03125, "learning_rate": 6.710888999701741e-06, "loss": 1.00234013, "memory(GiB)": 369.42, "step": 33065, "train_speed(iter/s)": 0.200517 }, { "acc": 0.75395699, "epoch": 0.8389142567224759, "grad_norm": 2.078125, "learning_rate": 6.7099036383373425e-06, "loss": 0.97271938, "memory(GiB)": 369.42, "step": 33070, "train_speed(iter/s)": 0.200518 }, { "acc": 0.74978323, "epoch": 0.839041095890411, "grad_norm": 2.171875, "learning_rate": 6.708918201763748e-06, "loss": 0.98492165, "memory(GiB)": 369.42, "step": 33075, "train_speed(iter/s)": 0.200522 }, { "acc": 0.75158253, "epoch": 0.839167935058346, "grad_norm": 2.125, "learning_rate": 6.707932690024302e-06, "loss": 1.0117506, "memory(GiB)": 369.42, "step": 33080, "train_speed(iter/s)": 0.200525 }, { "acc": 0.74247928, "epoch": 0.8392947742262811, "grad_norm": 2.25, "learning_rate": 6.706947103162348e-06, "loss": 1.00563097, "memory(GiB)": 369.42, "step": 33085, "train_speed(iter/s)": 0.200527 }, { "acc": 0.754741, "epoch": 0.8394216133942162, "grad_norm": 2.359375, "learning_rate": 6.7059614412212425e-06, "loss": 1.01970806, "memory(GiB)": 369.42, "step": 33090, "train_speed(iter/s)": 0.200534 }, { "acc": 0.7469842, "epoch": 0.8395484525621512, "grad_norm": 2.078125, "learning_rate": 6.704975704244334e-06, "loss": 1.00322571, "memory(GiB)": 369.42, "step": 33095, "train_speed(iter/s)": 0.200537 }, { "acc": 0.74004545, "epoch": 0.8396752917300863, "grad_norm": 2.28125, "learning_rate": 6.703989892274985e-06, "loss": 1.06521349, "memory(GiB)": 369.42, "step": 33100, "train_speed(iter/s)": 0.200541 }, { "acc": 0.75890546, "epoch": 0.8398021308980214, "grad_norm": 2.546875, "learning_rate": 6.703004005356549e-06, "loss": 0.94608936, "memory(GiB)": 369.42, "step": 33105, "train_speed(iter/s)": 0.20054 }, { "acc": 0.73490195, "epoch": 0.8399289700659563, "grad_norm": 2.53125, "learning_rate": 6.7020180435323965e-06, "loss": 1.04362068, "memory(GiB)": 369.42, "step": 33110, "train_speed(iter/s)": 0.200546 }, { "acc": 0.73688865, "epoch": 0.8400558092338914, "grad_norm": 2.296875, "learning_rate": 6.701032006845889e-06, "loss": 1.01623993, "memory(GiB)": 369.42, "step": 33115, "train_speed(iter/s)": 0.200549 }, { "acc": 0.75450954, "epoch": 0.8401826484018264, "grad_norm": 2.421875, "learning_rate": 6.700045895340401e-06, "loss": 0.97298164, "memory(GiB)": 369.42, "step": 33120, "train_speed(iter/s)": 0.200545 }, { "acc": 0.75350866, "epoch": 0.8403094875697615, "grad_norm": 3.265625, "learning_rate": 6.699059709059304e-06, "loss": 1.01098633, "memory(GiB)": 369.42, "step": 33125, "train_speed(iter/s)": 0.200551 }, { "acc": 0.76001368, "epoch": 0.8404363267376966, "grad_norm": 2.109375, "learning_rate": 6.698073448045975e-06, "loss": 1.00475798, "memory(GiB)": 369.42, "step": 33130, "train_speed(iter/s)": 0.200555 }, { "acc": 0.75953808, "epoch": 0.8405631659056316, "grad_norm": 2.125, "learning_rate": 6.697087112343795e-06, "loss": 0.94535389, "memory(GiB)": 369.42, "step": 33135, "train_speed(iter/s)": 0.20056 }, { "acc": 0.73619347, "epoch": 0.8406900050735667, "grad_norm": 2.328125, "learning_rate": 6.696100701996146e-06, "loss": 1.05204611, "memory(GiB)": 369.42, "step": 33140, "train_speed(iter/s)": 0.200564 }, { "acc": 0.74813671, "epoch": 0.8408168442415018, "grad_norm": 2.484375, "learning_rate": 6.6951142170464164e-06, "loss": 0.9855154, "memory(GiB)": 369.42, "step": 33145, "train_speed(iter/s)": 0.200569 }, { "acc": 0.74479551, "epoch": 0.8409436834094368, "grad_norm": 2.3125, "learning_rate": 6.694127657537995e-06, "loss": 1.02646084, "memory(GiB)": 369.42, "step": 33150, "train_speed(iter/s)": 0.200576 }, { "acc": 0.74102483, "epoch": 0.8410705225773719, "grad_norm": 1.9765625, "learning_rate": 6.693141023514276e-06, "loss": 0.99887161, "memory(GiB)": 369.42, "step": 33155, "train_speed(iter/s)": 0.200581 }, { "acc": 0.74271111, "epoch": 0.8411973617453069, "grad_norm": 2.15625, "learning_rate": 6.6921543150186555e-06, "loss": 1.01279583, "memory(GiB)": 369.42, "step": 33160, "train_speed(iter/s)": 0.200584 }, { "acc": 0.76160522, "epoch": 0.841324200913242, "grad_norm": 1.984375, "learning_rate": 6.691167532094531e-06, "loss": 0.9888504, "memory(GiB)": 369.42, "step": 33165, "train_speed(iter/s)": 0.200586 }, { "acc": 0.75158901, "epoch": 0.8414510400811771, "grad_norm": 2.84375, "learning_rate": 6.690180674785311e-06, "loss": 1.07615337, "memory(GiB)": 369.42, "step": 33170, "train_speed(iter/s)": 0.200592 }, { "acc": 0.73384438, "epoch": 0.8415778792491121, "grad_norm": 2.21875, "learning_rate": 6.689193743134397e-06, "loss": 1.00798473, "memory(GiB)": 369.42, "step": 33175, "train_speed(iter/s)": 0.200598 }, { "acc": 0.74566302, "epoch": 0.8417047184170472, "grad_norm": 2.34375, "learning_rate": 6.688206737185201e-06, "loss": 0.98823032, "memory(GiB)": 369.42, "step": 33180, "train_speed(iter/s)": 0.200604 }, { "acc": 0.73227272, "epoch": 0.8418315575849823, "grad_norm": 2.328125, "learning_rate": 6.687219656981135e-06, "loss": 1.00213022, "memory(GiB)": 369.42, "step": 33185, "train_speed(iter/s)": 0.200607 }, { "acc": 0.75681744, "epoch": 0.8419583967529173, "grad_norm": 2.1875, "learning_rate": 6.686232502565616e-06, "loss": 1.00590925, "memory(GiB)": 369.42, "step": 33190, "train_speed(iter/s)": 0.200613 }, { "acc": 0.75045614, "epoch": 0.8420852359208524, "grad_norm": 2.1875, "learning_rate": 6.685245273982063e-06, "loss": 1.01190376, "memory(GiB)": 369.42, "step": 33195, "train_speed(iter/s)": 0.200619 }, { "acc": 0.73109903, "epoch": 0.8422120750887874, "grad_norm": 2.1875, "learning_rate": 6.684257971273899e-06, "loss": 1.09502945, "memory(GiB)": 369.42, "step": 33200, "train_speed(iter/s)": 0.200622 }, { "acc": 0.75751467, "epoch": 0.8423389142567225, "grad_norm": 2.015625, "learning_rate": 6.68327059448455e-06, "loss": 0.9494175, "memory(GiB)": 369.42, "step": 33205, "train_speed(iter/s)": 0.200626 }, { "acc": 0.75452027, "epoch": 0.8424657534246576, "grad_norm": 2.46875, "learning_rate": 6.682283143657444e-06, "loss": 0.97288141, "memory(GiB)": 369.42, "step": 33210, "train_speed(iter/s)": 0.200631 }, { "acc": 0.7489677, "epoch": 0.8425925925925926, "grad_norm": 3.015625, "learning_rate": 6.681295618836015e-06, "loss": 1.03858948, "memory(GiB)": 369.42, "step": 33215, "train_speed(iter/s)": 0.200637 }, { "acc": 0.73591671, "epoch": 0.8427194317605277, "grad_norm": 2.0, "learning_rate": 6.680308020063699e-06, "loss": 0.99229546, "memory(GiB)": 369.42, "step": 33220, "train_speed(iter/s)": 0.20064 }, { "acc": 0.74544172, "epoch": 0.8428462709284628, "grad_norm": 2.1875, "learning_rate": 6.679320347383933e-06, "loss": 1.02483559, "memory(GiB)": 369.42, "step": 33225, "train_speed(iter/s)": 0.200645 }, { "acc": 0.753725, "epoch": 0.8429731100963977, "grad_norm": 2.859375, "learning_rate": 6.678332600840161e-06, "loss": 1.01148434, "memory(GiB)": 369.42, "step": 33230, "train_speed(iter/s)": 0.200651 }, { "acc": 0.76914892, "epoch": 0.8430999492643328, "grad_norm": 2.109375, "learning_rate": 6.677344780475827e-06, "loss": 0.96503696, "memory(GiB)": 369.42, "step": 33235, "train_speed(iter/s)": 0.200658 }, { "acc": 0.7665122, "epoch": 0.8432267884322678, "grad_norm": 1.984375, "learning_rate": 6.676356886334383e-06, "loss": 0.970681, "memory(GiB)": 369.42, "step": 33240, "train_speed(iter/s)": 0.200661 }, { "acc": 0.74279079, "epoch": 0.8433536276002029, "grad_norm": 2.296875, "learning_rate": 6.675368918459276e-06, "loss": 1.04878922, "memory(GiB)": 369.42, "step": 33245, "train_speed(iter/s)": 0.200667 }, { "acc": 0.76737366, "epoch": 0.843480466768138, "grad_norm": 2.03125, "learning_rate": 6.674380876893967e-06, "loss": 0.92161074, "memory(GiB)": 369.42, "step": 33250, "train_speed(iter/s)": 0.20067 }, { "acc": 0.75403585, "epoch": 0.843607305936073, "grad_norm": 2.671875, "learning_rate": 6.673392761681908e-06, "loss": 0.97307682, "memory(GiB)": 369.42, "step": 33255, "train_speed(iter/s)": 0.200675 }, { "acc": 0.7579134, "epoch": 0.8437341451040081, "grad_norm": 1.96875, "learning_rate": 6.672404572866566e-06, "loss": 0.95824108, "memory(GiB)": 369.42, "step": 33260, "train_speed(iter/s)": 0.200679 }, { "acc": 0.75196328, "epoch": 0.8438609842719432, "grad_norm": 1.8828125, "learning_rate": 6.671416310491406e-06, "loss": 0.9480195, "memory(GiB)": 369.42, "step": 33265, "train_speed(iter/s)": 0.20068 }, { "acc": 0.73567228, "epoch": 0.8439878234398782, "grad_norm": 2.125, "learning_rate": 6.670427974599891e-06, "loss": 1.06021633, "memory(GiB)": 369.42, "step": 33270, "train_speed(iter/s)": 0.200684 }, { "acc": 0.75037689, "epoch": 0.8441146626078133, "grad_norm": 2.640625, "learning_rate": 6.669439565235498e-06, "loss": 1.0370182, "memory(GiB)": 369.42, "step": 33275, "train_speed(iter/s)": 0.20069 }, { "acc": 0.74888391, "epoch": 0.8442415017757483, "grad_norm": 2.265625, "learning_rate": 6.668451082441698e-06, "loss": 1.01758785, "memory(GiB)": 369.42, "step": 33280, "train_speed(iter/s)": 0.200695 }, { "acc": 0.73303566, "epoch": 0.8443683409436834, "grad_norm": 2.03125, "learning_rate": 6.667462526261972e-06, "loss": 1.00013151, "memory(GiB)": 369.42, "step": 33285, "train_speed(iter/s)": 0.200699 }, { "acc": 0.75674729, "epoch": 0.8444951801116185, "grad_norm": 1.828125, "learning_rate": 6.666473896739798e-06, "loss": 0.96070967, "memory(GiB)": 369.42, "step": 33290, "train_speed(iter/s)": 0.200703 }, { "acc": 0.74999614, "epoch": 0.8446220192795535, "grad_norm": 2.203125, "learning_rate": 6.665485193918663e-06, "loss": 1.00469999, "memory(GiB)": 369.42, "step": 33295, "train_speed(iter/s)": 0.200709 }, { "acc": 0.7362823, "epoch": 0.8447488584474886, "grad_norm": 2.359375, "learning_rate": 6.664496417842053e-06, "loss": 1.04251852, "memory(GiB)": 369.42, "step": 33300, "train_speed(iter/s)": 0.200714 }, { "acc": 0.75310059, "epoch": 0.8448756976154237, "grad_norm": 1.84375, "learning_rate": 6.6635075685534566e-06, "loss": 0.9774128, "memory(GiB)": 369.42, "step": 33305, "train_speed(iter/s)": 0.200719 }, { "acc": 0.7462008, "epoch": 0.8450025367833587, "grad_norm": 2.59375, "learning_rate": 6.662518646096374e-06, "loss": 1.0202179, "memory(GiB)": 369.42, "step": 33310, "train_speed(iter/s)": 0.200724 }, { "acc": 0.74512177, "epoch": 0.8451293759512938, "grad_norm": 1.8984375, "learning_rate": 6.661529650514296e-06, "loss": 0.98692856, "memory(GiB)": 369.42, "step": 33315, "train_speed(iter/s)": 0.200729 }, { "acc": 0.74015594, "epoch": 0.8452562151192288, "grad_norm": 2.046875, "learning_rate": 6.6605405818507274e-06, "loss": 1.03052025, "memory(GiB)": 369.42, "step": 33320, "train_speed(iter/s)": 0.200731 }, { "acc": 0.74507875, "epoch": 0.8453830542871639, "grad_norm": 2.140625, "learning_rate": 6.659551440149169e-06, "loss": 0.996562, "memory(GiB)": 369.42, "step": 33325, "train_speed(iter/s)": 0.200736 }, { "acc": 0.76132021, "epoch": 0.845509893455099, "grad_norm": 2.28125, "learning_rate": 6.65856222545313e-06, "loss": 0.94452763, "memory(GiB)": 369.42, "step": 33330, "train_speed(iter/s)": 0.200739 }, { "acc": 0.75207663, "epoch": 0.845636732623034, "grad_norm": 1.9453125, "learning_rate": 6.657572937806118e-06, "loss": 1.00603113, "memory(GiB)": 369.42, "step": 33335, "train_speed(iter/s)": 0.20074 }, { "acc": 0.74865694, "epoch": 0.8457635717909691, "grad_norm": 1.921875, "learning_rate": 6.656583577251649e-06, "loss": 0.93371401, "memory(GiB)": 369.42, "step": 33340, "train_speed(iter/s)": 0.200745 }, { "acc": 0.72747517, "epoch": 0.8458904109589042, "grad_norm": 2.015625, "learning_rate": 6.655594143833237e-06, "loss": 1.05430002, "memory(GiB)": 369.42, "step": 33345, "train_speed(iter/s)": 0.200751 }, { "acc": 0.74983716, "epoch": 0.8460172501268391, "grad_norm": 2.25, "learning_rate": 6.654604637594404e-06, "loss": 0.9805234, "memory(GiB)": 369.42, "step": 33350, "train_speed(iter/s)": 0.200755 }, { "acc": 0.73313112, "epoch": 0.8461440892947742, "grad_norm": 2.21875, "learning_rate": 6.653615058578672e-06, "loss": 1.06714401, "memory(GiB)": 369.42, "step": 33355, "train_speed(iter/s)": 0.20076 }, { "acc": 0.75964212, "epoch": 0.8462709284627092, "grad_norm": 2.09375, "learning_rate": 6.652625406829566e-06, "loss": 0.97618351, "memory(GiB)": 369.42, "step": 33360, "train_speed(iter/s)": 0.200765 }, { "acc": 0.74611168, "epoch": 0.8463977676306443, "grad_norm": 2.421875, "learning_rate": 6.651635682390616e-06, "loss": 1.0092289, "memory(GiB)": 369.42, "step": 33365, "train_speed(iter/s)": 0.20077 }, { "acc": 0.75183582, "epoch": 0.8465246067985794, "grad_norm": 2.203125, "learning_rate": 6.650645885305356e-06, "loss": 0.9870101, "memory(GiB)": 369.42, "step": 33370, "train_speed(iter/s)": 0.200774 }, { "acc": 0.74289932, "epoch": 0.8466514459665144, "grad_norm": 2.671875, "learning_rate": 6.649656015617319e-06, "loss": 1.06347065, "memory(GiB)": 369.42, "step": 33375, "train_speed(iter/s)": 0.200779 }, { "acc": 0.73570719, "epoch": 0.8467782851344495, "grad_norm": 1.890625, "learning_rate": 6.648666073370046e-06, "loss": 0.98990049, "memory(GiB)": 369.42, "step": 33380, "train_speed(iter/s)": 0.200781 }, { "acc": 0.75011802, "epoch": 0.8469051243023846, "grad_norm": 1.9765625, "learning_rate": 6.647676058607076e-06, "loss": 1.02739067, "memory(GiB)": 369.42, "step": 33385, "train_speed(iter/s)": 0.200787 }, { "acc": 0.73959551, "epoch": 0.8470319634703196, "grad_norm": 2.4375, "learning_rate": 6.64668597137196e-06, "loss": 1.03786259, "memory(GiB)": 369.42, "step": 33390, "train_speed(iter/s)": 0.200792 }, { "acc": 0.75622034, "epoch": 0.8471588026382547, "grad_norm": 2.578125, "learning_rate": 6.645695811708241e-06, "loss": 0.99894152, "memory(GiB)": 369.42, "step": 33395, "train_speed(iter/s)": 0.200797 }, { "acc": 0.7501688, "epoch": 0.8472856418061897, "grad_norm": 2.046875, "learning_rate": 6.644705579659474e-06, "loss": 0.99186678, "memory(GiB)": 369.42, "step": 33400, "train_speed(iter/s)": 0.200802 }, { "acc": 0.74254699, "epoch": 0.8474124809741248, "grad_norm": 2.390625, "learning_rate": 6.643715275269212e-06, "loss": 1.00553055, "memory(GiB)": 369.42, "step": 33405, "train_speed(iter/s)": 0.200806 }, { "acc": 0.75262585, "epoch": 0.8475393201420599, "grad_norm": 2.375, "learning_rate": 6.642724898581013e-06, "loss": 1.01707172, "memory(GiB)": 369.42, "step": 33410, "train_speed(iter/s)": 0.200811 }, { "acc": 0.76226168, "epoch": 0.8476661593099949, "grad_norm": 2.078125, "learning_rate": 6.6417344496384394e-06, "loss": 0.99304457, "memory(GiB)": 369.42, "step": 33415, "train_speed(iter/s)": 0.200814 }, { "acc": 0.72714419, "epoch": 0.84779299847793, "grad_norm": 1.6640625, "learning_rate": 6.640743928485054e-06, "loss": 1.03135147, "memory(GiB)": 369.42, "step": 33420, "train_speed(iter/s)": 0.200817 }, { "acc": 0.75507731, "epoch": 0.8479198376458651, "grad_norm": 2.921875, "learning_rate": 6.639753335164426e-06, "loss": 1.02866821, "memory(GiB)": 369.42, "step": 33425, "train_speed(iter/s)": 0.200823 }, { "acc": 0.74665203, "epoch": 0.8480466768138001, "grad_norm": 2.140625, "learning_rate": 6.638762669720126e-06, "loss": 1.01234121, "memory(GiB)": 369.42, "step": 33430, "train_speed(iter/s)": 0.200826 }, { "acc": 0.76467981, "epoch": 0.8481735159817352, "grad_norm": 2.359375, "learning_rate": 6.637771932195726e-06, "loss": 0.95708389, "memory(GiB)": 369.42, "step": 33435, "train_speed(iter/s)": 0.200829 }, { "acc": 0.75924397, "epoch": 0.8483003551496702, "grad_norm": 2.125, "learning_rate": 6.636781122634804e-06, "loss": 0.9811388, "memory(GiB)": 369.42, "step": 33440, "train_speed(iter/s)": 0.200833 }, { "acc": 0.76866097, "epoch": 0.8484271943176053, "grad_norm": 1.9140625, "learning_rate": 6.635790241080941e-06, "loss": 0.8791316, "memory(GiB)": 369.42, "step": 33445, "train_speed(iter/s)": 0.200838 }, { "acc": 0.75120769, "epoch": 0.8485540334855404, "grad_norm": 2.203125, "learning_rate": 6.634799287577721e-06, "loss": 0.9821559, "memory(GiB)": 369.42, "step": 33450, "train_speed(iter/s)": 0.200838 }, { "acc": 0.75607896, "epoch": 0.8486808726534754, "grad_norm": 2.296875, "learning_rate": 6.6338082621687286e-06, "loss": 0.91708717, "memory(GiB)": 369.42, "step": 33455, "train_speed(iter/s)": 0.200841 }, { "acc": 0.74021301, "epoch": 0.8488077118214105, "grad_norm": 2.96875, "learning_rate": 6.6328171648975545e-06, "loss": 1.06787949, "memory(GiB)": 369.42, "step": 33460, "train_speed(iter/s)": 0.200846 }, { "acc": 0.76086626, "epoch": 0.8489345509893456, "grad_norm": 2.34375, "learning_rate": 6.63182599580779e-06, "loss": 0.96266804, "memory(GiB)": 369.42, "step": 33465, "train_speed(iter/s)": 0.200851 }, { "acc": 0.75021315, "epoch": 0.8490613901572805, "grad_norm": 2.03125, "learning_rate": 6.630834754943036e-06, "loss": 0.99590969, "memory(GiB)": 369.42, "step": 33470, "train_speed(iter/s)": 0.200856 }, { "acc": 0.75379004, "epoch": 0.8491882293252156, "grad_norm": 2.578125, "learning_rate": 6.629843442346886e-06, "loss": 1.01266575, "memory(GiB)": 369.42, "step": 33475, "train_speed(iter/s)": 0.200862 }, { "acc": 0.74957361, "epoch": 0.8493150684931506, "grad_norm": 2.40625, "learning_rate": 6.628852058062944e-06, "loss": 1.03010006, "memory(GiB)": 369.42, "step": 33480, "train_speed(iter/s)": 0.200866 }, { "acc": 0.72420826, "epoch": 0.8494419076610857, "grad_norm": 1.953125, "learning_rate": 6.627860602134818e-06, "loss": 1.12974205, "memory(GiB)": 369.42, "step": 33485, "train_speed(iter/s)": 0.200872 }, { "acc": 0.74329543, "epoch": 0.8495687468290208, "grad_norm": 1.921875, "learning_rate": 6.626869074606113e-06, "loss": 1.03544893, "memory(GiB)": 369.42, "step": 33490, "train_speed(iter/s)": 0.200878 }, { "acc": 0.73427782, "epoch": 0.8496955859969558, "grad_norm": 2.109375, "learning_rate": 6.625877475520445e-06, "loss": 1.0813015, "memory(GiB)": 369.42, "step": 33495, "train_speed(iter/s)": 0.200883 }, { "acc": 0.75978003, "epoch": 0.8498224251648909, "grad_norm": 2.15625, "learning_rate": 6.624885804921425e-06, "loss": 0.92576313, "memory(GiB)": 369.42, "step": 33500, "train_speed(iter/s)": 0.200889 }, { "acc": 0.75390964, "epoch": 0.849949264332826, "grad_norm": 2.515625, "learning_rate": 6.623894062852673e-06, "loss": 0.94484882, "memory(GiB)": 369.42, "step": 33505, "train_speed(iter/s)": 0.200892 }, { "acc": 0.77151499, "epoch": 0.850076103500761, "grad_norm": 1.8359375, "learning_rate": 6.62290224935781e-06, "loss": 0.93425627, "memory(GiB)": 369.42, "step": 33510, "train_speed(iter/s)": 0.200895 }, { "acc": 0.75509405, "epoch": 0.8502029426686961, "grad_norm": 1.8125, "learning_rate": 6.621910364480461e-06, "loss": 0.99782429, "memory(GiB)": 369.42, "step": 33515, "train_speed(iter/s)": 0.2009 }, { "acc": 0.74648504, "epoch": 0.8503297818366311, "grad_norm": 2.28125, "learning_rate": 6.620918408264252e-06, "loss": 1.05278168, "memory(GiB)": 369.42, "step": 33520, "train_speed(iter/s)": 0.200905 }, { "acc": 0.73718367, "epoch": 0.8504566210045662, "grad_norm": 1.9921875, "learning_rate": 6.6199263807528136e-06, "loss": 1.06286602, "memory(GiB)": 369.42, "step": 33525, "train_speed(iter/s)": 0.200908 }, { "acc": 0.75326452, "epoch": 0.8505834601725013, "grad_norm": 2.234375, "learning_rate": 6.618934281989783e-06, "loss": 1.00673885, "memory(GiB)": 369.42, "step": 33530, "train_speed(iter/s)": 0.200913 }, { "acc": 0.7492609, "epoch": 0.8507102993404363, "grad_norm": 2.078125, "learning_rate": 6.6179421120187915e-06, "loss": 0.9485281, "memory(GiB)": 369.42, "step": 33535, "train_speed(iter/s)": 0.200918 }, { "acc": 0.76308146, "epoch": 0.8508371385083714, "grad_norm": 2.234375, "learning_rate": 6.616949870883486e-06, "loss": 0.93947716, "memory(GiB)": 369.42, "step": 33540, "train_speed(iter/s)": 0.200919 }, { "acc": 0.74598293, "epoch": 0.8509639776763065, "grad_norm": 2.453125, "learning_rate": 6.615957558627503e-06, "loss": 1.03239202, "memory(GiB)": 369.42, "step": 33545, "train_speed(iter/s)": 0.200923 }, { "acc": 0.75069118, "epoch": 0.8510908168442415, "grad_norm": 2.21875, "learning_rate": 6.6149651752944945e-06, "loss": 0.9961688, "memory(GiB)": 369.42, "step": 33550, "train_speed(iter/s)": 0.200928 }, { "acc": 0.7464922, "epoch": 0.8512176560121766, "grad_norm": 2.359375, "learning_rate": 6.613972720928105e-06, "loss": 1.00499039, "memory(GiB)": 369.42, "step": 33555, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74999371, "epoch": 0.8513444951801116, "grad_norm": 2.609375, "learning_rate": 6.61298019557199e-06, "loss": 0.94321108, "memory(GiB)": 369.42, "step": 33560, "train_speed(iter/s)": 0.20094 }, { "acc": 0.754037, "epoch": 0.8514713343480467, "grad_norm": 1.96875, "learning_rate": 6.6119875992698045e-06, "loss": 0.95592785, "memory(GiB)": 369.42, "step": 33565, "train_speed(iter/s)": 0.200944 }, { "acc": 0.7596241, "epoch": 0.8515981735159818, "grad_norm": 2.21875, "learning_rate": 6.610994932065207e-06, "loss": 0.92767506, "memory(GiB)": 369.42, "step": 33570, "train_speed(iter/s)": 0.200949 }, { "acc": 0.74081459, "epoch": 0.8517250126839168, "grad_norm": 2.171875, "learning_rate": 6.610002194001861e-06, "loss": 1.02091618, "memory(GiB)": 369.42, "step": 33575, "train_speed(iter/s)": 0.200953 }, { "acc": 0.75645475, "epoch": 0.8518518518518519, "grad_norm": 2.28125, "learning_rate": 6.609009385123429e-06, "loss": 0.94907093, "memory(GiB)": 369.42, "step": 33580, "train_speed(iter/s)": 0.200957 }, { "acc": 0.75143089, "epoch": 0.851978691019787, "grad_norm": 2.171875, "learning_rate": 6.608016505473582e-06, "loss": 0.99495697, "memory(GiB)": 369.42, "step": 33585, "train_speed(iter/s)": 0.20096 }, { "acc": 0.74327459, "epoch": 0.852105530187722, "grad_norm": 2.390625, "learning_rate": 6.60702355509599e-06, "loss": 1.02375526, "memory(GiB)": 369.42, "step": 33590, "train_speed(iter/s)": 0.200962 }, { "acc": 0.7529357, "epoch": 0.852232369355657, "grad_norm": 1.984375, "learning_rate": 6.606030534034326e-06, "loss": 1.00506382, "memory(GiB)": 369.42, "step": 33595, "train_speed(iter/s)": 0.200967 }, { "acc": 0.74042602, "epoch": 0.852359208523592, "grad_norm": 2.03125, "learning_rate": 6.6050374423322685e-06, "loss": 1.06895704, "memory(GiB)": 369.42, "step": 33600, "train_speed(iter/s)": 0.20097 }, { "acc": 0.729602, "epoch": 0.8524860476915271, "grad_norm": 2.546875, "learning_rate": 6.604044280033498e-06, "loss": 1.04786453, "memory(GiB)": 369.42, "step": 33605, "train_speed(iter/s)": 0.200976 }, { "acc": 0.73253198, "epoch": 0.8526128868594622, "grad_norm": 2.6875, "learning_rate": 6.6030510471817e-06, "loss": 1.06126595, "memory(GiB)": 369.42, "step": 33610, "train_speed(iter/s)": 0.200981 }, { "acc": 0.73572121, "epoch": 0.8527397260273972, "grad_norm": 2.015625, "learning_rate": 6.602057743820558e-06, "loss": 1.05530033, "memory(GiB)": 369.42, "step": 33615, "train_speed(iter/s)": 0.200987 }, { "acc": 0.76010084, "epoch": 0.8528665651953323, "grad_norm": 2.90625, "learning_rate": 6.601064369993766e-06, "loss": 0.95405598, "memory(GiB)": 369.42, "step": 33620, "train_speed(iter/s)": 0.200991 }, { "acc": 0.74112267, "epoch": 0.8529934043632674, "grad_norm": 2.046875, "learning_rate": 6.600070925745012e-06, "loss": 1.02335081, "memory(GiB)": 369.42, "step": 33625, "train_speed(iter/s)": 0.200994 }, { "acc": 0.74662371, "epoch": 0.8531202435312024, "grad_norm": 2.34375, "learning_rate": 6.599077411117998e-06, "loss": 1.02738457, "memory(GiB)": 369.42, "step": 33630, "train_speed(iter/s)": 0.201 }, { "acc": 0.76081858, "epoch": 0.8532470826991375, "grad_norm": 1.9921875, "learning_rate": 6.598083826156418e-06, "loss": 0.9536541, "memory(GiB)": 369.42, "step": 33635, "train_speed(iter/s)": 0.201005 }, { "acc": 0.75513101, "epoch": 0.8533739218670725, "grad_norm": 2.109375, "learning_rate": 6.597090170903977e-06, "loss": 0.99550428, "memory(GiB)": 369.42, "step": 33640, "train_speed(iter/s)": 0.201011 }, { "acc": 0.74359407, "epoch": 0.8535007610350076, "grad_norm": 2.546875, "learning_rate": 6.596096445404381e-06, "loss": 0.97012634, "memory(GiB)": 369.42, "step": 33645, "train_speed(iter/s)": 0.201014 }, { "acc": 0.7459065, "epoch": 0.8536276002029427, "grad_norm": 1.96875, "learning_rate": 6.595102649701336e-06, "loss": 0.99232874, "memory(GiB)": 369.42, "step": 33650, "train_speed(iter/s)": 0.201019 }, { "acc": 0.75130501, "epoch": 0.8537544393708777, "grad_norm": 2.1875, "learning_rate": 6.5941087838385545e-06, "loss": 0.99299793, "memory(GiB)": 369.42, "step": 33655, "train_speed(iter/s)": 0.201026 }, { "acc": 0.75639296, "epoch": 0.8538812785388128, "grad_norm": 2.15625, "learning_rate": 6.593114847859752e-06, "loss": 0.98370743, "memory(GiB)": 369.42, "step": 33660, "train_speed(iter/s)": 0.201028 }, { "acc": 0.74817896, "epoch": 0.8540081177067479, "grad_norm": 2.796875, "learning_rate": 6.592120841808646e-06, "loss": 1.01819038, "memory(GiB)": 369.42, "step": 33665, "train_speed(iter/s)": 0.201032 }, { "acc": 0.75196848, "epoch": 0.8541349568746829, "grad_norm": 1.9296875, "learning_rate": 6.5911267657289564e-06, "loss": 1.03054695, "memory(GiB)": 369.42, "step": 33670, "train_speed(iter/s)": 0.201036 }, { "acc": 0.74715996, "epoch": 0.854261796042618, "grad_norm": 1.84375, "learning_rate": 6.590132619664408e-06, "loss": 0.99793472, "memory(GiB)": 369.42, "step": 33675, "train_speed(iter/s)": 0.20104 }, { "acc": 0.76099877, "epoch": 0.854388635210553, "grad_norm": 2.109375, "learning_rate": 6.589138403658728e-06, "loss": 0.96908665, "memory(GiB)": 369.42, "step": 33680, "train_speed(iter/s)": 0.201043 }, { "acc": 0.73838787, "epoch": 0.8545154743784881, "grad_norm": 2.25, "learning_rate": 6.588144117755645e-06, "loss": 1.04691315, "memory(GiB)": 369.42, "step": 33685, "train_speed(iter/s)": 0.201048 }, { "acc": 0.75591879, "epoch": 0.8546423135464232, "grad_norm": 2.25, "learning_rate": 6.5871497619988945e-06, "loss": 0.99202919, "memory(GiB)": 369.42, "step": 33690, "train_speed(iter/s)": 0.201053 }, { "acc": 0.74047537, "epoch": 0.8547691527143582, "grad_norm": 2.0625, "learning_rate": 6.586155336432211e-06, "loss": 1.03458977, "memory(GiB)": 369.42, "step": 33695, "train_speed(iter/s)": 0.201059 }, { "acc": 0.75642691, "epoch": 0.8548959918822933, "grad_norm": 1.8828125, "learning_rate": 6.585160841099333e-06, "loss": 0.9254323, "memory(GiB)": 369.42, "step": 33700, "train_speed(iter/s)": 0.201063 }, { "acc": 0.75795779, "epoch": 0.8550228310502284, "grad_norm": 2.15625, "learning_rate": 6.584166276044005e-06, "loss": 1.01045504, "memory(GiB)": 369.42, "step": 33705, "train_speed(iter/s)": 0.201066 }, { "acc": 0.7428134, "epoch": 0.8551496702181633, "grad_norm": 1.7734375, "learning_rate": 6.583171641309971e-06, "loss": 1.03590832, "memory(GiB)": 369.42, "step": 33710, "train_speed(iter/s)": 0.20107 }, { "acc": 0.74175982, "epoch": 0.8552765093860984, "grad_norm": 2.515625, "learning_rate": 6.58217693694098e-06, "loss": 1.01065235, "memory(GiB)": 369.42, "step": 33715, "train_speed(iter/s)": 0.201073 }, { "acc": 0.7438652, "epoch": 0.8554033485540334, "grad_norm": 2.390625, "learning_rate": 6.581182162980784e-06, "loss": 0.98619785, "memory(GiB)": 369.42, "step": 33720, "train_speed(iter/s)": 0.201076 }, { "acc": 0.73918877, "epoch": 0.8555301877219685, "grad_norm": 2.015625, "learning_rate": 6.580187319473137e-06, "loss": 1.06191368, "memory(GiB)": 369.42, "step": 33725, "train_speed(iter/s)": 0.201082 }, { "acc": 0.73756084, "epoch": 0.8556570268899036, "grad_norm": 2.046875, "learning_rate": 6.579192406461796e-06, "loss": 1.0268486, "memory(GiB)": 369.42, "step": 33730, "train_speed(iter/s)": 0.201082 }, { "acc": 0.74537063, "epoch": 0.8557838660578386, "grad_norm": 2.453125, "learning_rate": 6.5781974239905225e-06, "loss": 1.06131296, "memory(GiB)": 369.42, "step": 33735, "train_speed(iter/s)": 0.201088 }, { "acc": 0.73499918, "epoch": 0.8559107052257737, "grad_norm": 2.25, "learning_rate": 6.57720237210308e-06, "loss": 0.99982681, "memory(GiB)": 369.42, "step": 33740, "train_speed(iter/s)": 0.201093 }, { "acc": 0.74621334, "epoch": 0.8560375443937088, "grad_norm": 2.390625, "learning_rate": 6.576207250843235e-06, "loss": 0.96980724, "memory(GiB)": 369.42, "step": 33745, "train_speed(iter/s)": 0.201098 }, { "acc": 0.76030135, "epoch": 0.8561643835616438, "grad_norm": 1.953125, "learning_rate": 6.575212060254759e-06, "loss": 0.98750153, "memory(GiB)": 369.42, "step": 33750, "train_speed(iter/s)": 0.201102 }, { "acc": 0.74915037, "epoch": 0.8562912227295789, "grad_norm": 2.203125, "learning_rate": 6.574216800381424e-06, "loss": 1.01841354, "memory(GiB)": 369.42, "step": 33755, "train_speed(iter/s)": 0.201106 }, { "acc": 0.75367832, "epoch": 0.8564180618975139, "grad_norm": 2.5, "learning_rate": 6.573221471267005e-06, "loss": 0.9536458, "memory(GiB)": 369.42, "step": 33760, "train_speed(iter/s)": 0.201109 }, { "acc": 0.75155926, "epoch": 0.856544901065449, "grad_norm": 1.9296875, "learning_rate": 6.572226072955281e-06, "loss": 0.99056273, "memory(GiB)": 369.42, "step": 33765, "train_speed(iter/s)": 0.201113 }, { "acc": 0.76094913, "epoch": 0.8566717402333841, "grad_norm": 2.4375, "learning_rate": 6.571230605490036e-06, "loss": 0.94465275, "memory(GiB)": 369.42, "step": 33770, "train_speed(iter/s)": 0.201117 }, { "acc": 0.75757113, "epoch": 0.8567985794013191, "grad_norm": 2.21875, "learning_rate": 6.570235068915053e-06, "loss": 0.9709795, "memory(GiB)": 369.42, "step": 33775, "train_speed(iter/s)": 0.20112 }, { "acc": 0.75586805, "epoch": 0.8569254185692542, "grad_norm": 2.109375, "learning_rate": 6.569239463274122e-06, "loss": 1.01070919, "memory(GiB)": 369.42, "step": 33780, "train_speed(iter/s)": 0.201124 }, { "acc": 0.73489847, "epoch": 0.8570522577371893, "grad_norm": 1.921875, "learning_rate": 6.568243788611033e-06, "loss": 1.00930462, "memory(GiB)": 369.42, "step": 33785, "train_speed(iter/s)": 0.201128 }, { "acc": 0.7531743, "epoch": 0.8571790969051243, "grad_norm": 1.90625, "learning_rate": 6.56724804496958e-06, "loss": 0.96127796, "memory(GiB)": 369.42, "step": 33790, "train_speed(iter/s)": 0.201132 }, { "acc": 0.74594841, "epoch": 0.8573059360730594, "grad_norm": 1.921875, "learning_rate": 6.566252232393561e-06, "loss": 1.01063137, "memory(GiB)": 369.42, "step": 33795, "train_speed(iter/s)": 0.201138 }, { "acc": 0.7516387, "epoch": 0.8574327752409944, "grad_norm": 1.9375, "learning_rate": 6.565256350926777e-06, "loss": 0.97311325, "memory(GiB)": 369.42, "step": 33800, "train_speed(iter/s)": 0.20114 }, { "acc": 0.75974779, "epoch": 0.8575596144089295, "grad_norm": 1.9296875, "learning_rate": 6.5642604006130286e-06, "loss": 0.94685669, "memory(GiB)": 369.42, "step": 33805, "train_speed(iter/s)": 0.201146 }, { "acc": 0.73946228, "epoch": 0.8576864535768646, "grad_norm": 2.015625, "learning_rate": 6.563264381496124e-06, "loss": 1.08411522, "memory(GiB)": 369.42, "step": 33810, "train_speed(iter/s)": 0.201149 }, { "acc": 0.74045382, "epoch": 0.8578132927447996, "grad_norm": 3.1875, "learning_rate": 6.562268293619872e-06, "loss": 1.0799468, "memory(GiB)": 369.42, "step": 33815, "train_speed(iter/s)": 0.201155 }, { "acc": 0.75508404, "epoch": 0.8579401319127347, "grad_norm": 2.296875, "learning_rate": 6.561272137028089e-06, "loss": 0.99426823, "memory(GiB)": 369.42, "step": 33820, "train_speed(iter/s)": 0.20116 }, { "acc": 0.74035478, "epoch": 0.8580669710806698, "grad_norm": 2.234375, "learning_rate": 6.560275911764582e-06, "loss": 0.98257351, "memory(GiB)": 369.42, "step": 33825, "train_speed(iter/s)": 0.201162 }, { "acc": 0.75176859, "epoch": 0.8581938102486047, "grad_norm": 2.28125, "learning_rate": 6.5592796178731776e-06, "loss": 1.00272293, "memory(GiB)": 369.42, "step": 33830, "train_speed(iter/s)": 0.201168 }, { "acc": 0.72890134, "epoch": 0.8583206494165398, "grad_norm": 2.15625, "learning_rate": 6.5582832553976924e-06, "loss": 1.00211105, "memory(GiB)": 369.42, "step": 33835, "train_speed(iter/s)": 0.201173 }, { "acc": 0.74938879, "epoch": 0.8584474885844748, "grad_norm": 1.84375, "learning_rate": 6.557286824381955e-06, "loss": 1.00177755, "memory(GiB)": 369.42, "step": 33840, "train_speed(iter/s)": 0.201176 }, { "acc": 0.7498939, "epoch": 0.8585743277524099, "grad_norm": 2.53125, "learning_rate": 6.556290324869786e-06, "loss": 1.0251152, "memory(GiB)": 369.42, "step": 33845, "train_speed(iter/s)": 0.201182 }, { "acc": 0.76219864, "epoch": 0.858701166920345, "grad_norm": 2.0625, "learning_rate": 6.555293756905024e-06, "loss": 0.97237864, "memory(GiB)": 369.42, "step": 33850, "train_speed(iter/s)": 0.201184 }, { "acc": 0.75850763, "epoch": 0.85882800608828, "grad_norm": 2.0, "learning_rate": 6.554297120531497e-06, "loss": 0.92653484, "memory(GiB)": 369.42, "step": 33855, "train_speed(iter/s)": 0.20119 }, { "acc": 0.74121847, "epoch": 0.8589548452562151, "grad_norm": 1.8359375, "learning_rate": 6.553300415793042e-06, "loss": 1.01541824, "memory(GiB)": 369.42, "step": 33860, "train_speed(iter/s)": 0.201195 }, { "acc": 0.74150295, "epoch": 0.8590816844241502, "grad_norm": 1.921875, "learning_rate": 6.552303642733502e-06, "loss": 1.03005362, "memory(GiB)": 369.42, "step": 33865, "train_speed(iter/s)": 0.201193 }, { "acc": 0.75389013, "epoch": 0.8592085235920852, "grad_norm": 2.03125, "learning_rate": 6.551306801396715e-06, "loss": 1.01673203, "memory(GiB)": 369.42, "step": 33870, "train_speed(iter/s)": 0.201197 }, { "acc": 0.74680977, "epoch": 0.8593353627600203, "grad_norm": 2.234375, "learning_rate": 6.550309891826531e-06, "loss": 0.95704269, "memory(GiB)": 369.42, "step": 33875, "train_speed(iter/s)": 0.201201 }, { "acc": 0.75711975, "epoch": 0.8594622019279553, "grad_norm": 2.25, "learning_rate": 6.5493129140667955e-06, "loss": 0.9546772, "memory(GiB)": 369.42, "step": 33880, "train_speed(iter/s)": 0.201206 }, { "acc": 0.73858485, "epoch": 0.8595890410958904, "grad_norm": 2.609375, "learning_rate": 6.54831586816136e-06, "loss": 1.04063272, "memory(GiB)": 369.42, "step": 33885, "train_speed(iter/s)": 0.201213 }, { "acc": 0.74975872, "epoch": 0.8597158802638255, "grad_norm": 2.0625, "learning_rate": 6.54731875415408e-06, "loss": 1.00791483, "memory(GiB)": 369.42, "step": 33890, "train_speed(iter/s)": 0.201217 }, { "acc": 0.75599928, "epoch": 0.8598427194317605, "grad_norm": 1.8046875, "learning_rate": 6.546321572088814e-06, "loss": 0.98470459, "memory(GiB)": 369.42, "step": 33895, "train_speed(iter/s)": 0.201222 }, { "acc": 0.74832292, "epoch": 0.8599695585996956, "grad_norm": 2.015625, "learning_rate": 6.545324322009421e-06, "loss": 1.05283451, "memory(GiB)": 369.42, "step": 33900, "train_speed(iter/s)": 0.201228 }, { "acc": 0.76028252, "epoch": 0.8600963977676307, "grad_norm": 2.359375, "learning_rate": 6.544327003959765e-06, "loss": 0.95712862, "memory(GiB)": 369.42, "step": 33905, "train_speed(iter/s)": 0.201234 }, { "acc": 0.74934464, "epoch": 0.8602232369355657, "grad_norm": 2.234375, "learning_rate": 6.543329617983713e-06, "loss": 1.02299061, "memory(GiB)": 369.42, "step": 33910, "train_speed(iter/s)": 0.20124 }, { "acc": 0.75971141, "epoch": 0.8603500761035008, "grad_norm": 2.28125, "learning_rate": 6.5423321641251316e-06, "loss": 1.04095097, "memory(GiB)": 369.42, "step": 33915, "train_speed(iter/s)": 0.201245 }, { "acc": 0.73557196, "epoch": 0.8604769152714358, "grad_norm": 2.375, "learning_rate": 6.541334642427898e-06, "loss": 1.05436401, "memory(GiB)": 369.42, "step": 33920, "train_speed(iter/s)": 0.201252 }, { "acc": 0.7486187, "epoch": 0.8606037544393709, "grad_norm": 1.8671875, "learning_rate": 6.540337052935884e-06, "loss": 0.96055813, "memory(GiB)": 369.42, "step": 33925, "train_speed(iter/s)": 0.201257 }, { "acc": 0.74067874, "epoch": 0.860730593607306, "grad_norm": 2.234375, "learning_rate": 6.53933939569297e-06, "loss": 1.01788998, "memory(GiB)": 369.42, "step": 33930, "train_speed(iter/s)": 0.201262 }, { "acc": 0.74210849, "epoch": 0.860857432775241, "grad_norm": 2.1875, "learning_rate": 6.538341670743037e-06, "loss": 1.02656908, "memory(GiB)": 369.42, "step": 33935, "train_speed(iter/s)": 0.201265 }, { "acc": 0.75719256, "epoch": 0.8609842719431761, "grad_norm": 2.21875, "learning_rate": 6.537343878129969e-06, "loss": 0.99360809, "memory(GiB)": 369.42, "step": 33940, "train_speed(iter/s)": 0.201269 }, { "acc": 0.74230604, "epoch": 0.8611111111111112, "grad_norm": 2.703125, "learning_rate": 6.5363460178976524e-06, "loss": 1.01493101, "memory(GiB)": 369.42, "step": 33945, "train_speed(iter/s)": 0.201271 }, { "acc": 0.73475108, "epoch": 0.8612379502790461, "grad_norm": 2.0625, "learning_rate": 6.53534809008998e-06, "loss": 1.06126347, "memory(GiB)": 369.42, "step": 33950, "train_speed(iter/s)": 0.201277 }, { "acc": 0.73892851, "epoch": 0.8613647894469812, "grad_norm": 1.9375, "learning_rate": 6.534350094750843e-06, "loss": 1.01836262, "memory(GiB)": 369.42, "step": 33955, "train_speed(iter/s)": 0.20128 }, { "acc": 0.73944626, "epoch": 0.8614916286149162, "grad_norm": 2.234375, "learning_rate": 6.5333520319241385e-06, "loss": 1.02613459, "memory(GiB)": 369.42, "step": 33960, "train_speed(iter/s)": 0.201284 }, { "acc": 0.74113417, "epoch": 0.8616184677828513, "grad_norm": 2.53125, "learning_rate": 6.532353901653765e-06, "loss": 1.06047697, "memory(GiB)": 369.42, "step": 33965, "train_speed(iter/s)": 0.201289 }, { "acc": 0.74166431, "epoch": 0.8617453069507864, "grad_norm": 2.0, "learning_rate": 6.531355703983627e-06, "loss": 1.0272747, "memory(GiB)": 369.42, "step": 33970, "train_speed(iter/s)": 0.201295 }, { "acc": 0.75361214, "epoch": 0.8618721461187214, "grad_norm": 2.1875, "learning_rate": 6.530357438957626e-06, "loss": 1.00155735, "memory(GiB)": 369.42, "step": 33975, "train_speed(iter/s)": 0.201299 }, { "acc": 0.75408049, "epoch": 0.8619989852866565, "grad_norm": 2.46875, "learning_rate": 6.529359106619675e-06, "loss": 0.9942872, "memory(GiB)": 369.42, "step": 33980, "train_speed(iter/s)": 0.201305 }, { "acc": 0.74299517, "epoch": 0.8621258244545916, "grad_norm": 1.8359375, "learning_rate": 6.528360707013681e-06, "loss": 0.98794575, "memory(GiB)": 369.42, "step": 33985, "train_speed(iter/s)": 0.201306 }, { "acc": 0.75974226, "epoch": 0.8622526636225266, "grad_norm": 2.609375, "learning_rate": 6.52736224018356e-06, "loss": 1.02003508, "memory(GiB)": 369.42, "step": 33990, "train_speed(iter/s)": 0.201312 }, { "acc": 0.75326796, "epoch": 0.8623795027904617, "grad_norm": 2.0, "learning_rate": 6.526363706173227e-06, "loss": 1.03140945, "memory(GiB)": 369.42, "step": 33995, "train_speed(iter/s)": 0.201316 }, { "acc": 0.76750512, "epoch": 0.8625063419583967, "grad_norm": 2.109375, "learning_rate": 6.525365105026605e-06, "loss": 0.91743603, "memory(GiB)": 369.42, "step": 34000, "train_speed(iter/s)": 0.201321 }, { "epoch": 0.8625063419583967, "eval_acc": 0.7373041559074177, "eval_loss": 0.9723386168479919, "eval_runtime": 385.2246, "eval_samples_per_second": 16.536, "eval_steps_per_second": 8.268, "step": 34000 }, { "acc": 0.76018543, "epoch": 0.8626331811263318, "grad_norm": 2.09375, "learning_rate": 6.524366436787615e-06, "loss": 0.93795147, "memory(GiB)": 369.42, "step": 34005, "train_speed(iter/s)": 0.200478 }, { "acc": 0.7461381, "epoch": 0.8627600202942669, "grad_norm": 2.078125, "learning_rate": 6.523367701500183e-06, "loss": 1.02012291, "memory(GiB)": 369.42, "step": 34010, "train_speed(iter/s)": 0.200474 }, { "acc": 0.74260626, "epoch": 0.8628868594622019, "grad_norm": 1.984375, "learning_rate": 6.5223688992082375e-06, "loss": 1.01615086, "memory(GiB)": 369.42, "step": 34015, "train_speed(iter/s)": 0.200478 }, { "acc": 0.74116259, "epoch": 0.863013698630137, "grad_norm": 2.46875, "learning_rate": 6.521370029955713e-06, "loss": 1.05455065, "memory(GiB)": 369.42, "step": 34020, "train_speed(iter/s)": 0.200482 }, { "acc": 0.73547411, "epoch": 0.8631405377980721, "grad_norm": 1.9765625, "learning_rate": 6.520371093786541e-06, "loss": 1.05830479, "memory(GiB)": 369.42, "step": 34025, "train_speed(iter/s)": 0.200486 }, { "acc": 0.76395454, "epoch": 0.8632673769660071, "grad_norm": 2.453125, "learning_rate": 6.51937209074466e-06, "loss": 0.95259819, "memory(GiB)": 369.42, "step": 34030, "train_speed(iter/s)": 0.20049 }, { "acc": 0.74876652, "epoch": 0.8633942161339422, "grad_norm": 2.140625, "learning_rate": 6.51837302087401e-06, "loss": 0.98345165, "memory(GiB)": 369.42, "step": 34035, "train_speed(iter/s)": 0.200495 }, { "acc": 0.74705529, "epoch": 0.8635210553018772, "grad_norm": 2.828125, "learning_rate": 6.517373884218539e-06, "loss": 1.02793217, "memory(GiB)": 369.42, "step": 34040, "train_speed(iter/s)": 0.200498 }, { "acc": 0.73702126, "epoch": 0.8636478944698123, "grad_norm": 2.46875, "learning_rate": 6.5163746808221865e-06, "loss": 1.03444452, "memory(GiB)": 369.42, "step": 34045, "train_speed(iter/s)": 0.200504 }, { "acc": 0.73621025, "epoch": 0.8637747336377474, "grad_norm": 1.84375, "learning_rate": 6.515375410728907e-06, "loss": 1.00933294, "memory(GiB)": 369.42, "step": 34050, "train_speed(iter/s)": 0.200509 }, { "acc": 0.73944912, "epoch": 0.8639015728056824, "grad_norm": 1.71875, "learning_rate": 6.51437607398265e-06, "loss": 1.01451359, "memory(GiB)": 369.42, "step": 34055, "train_speed(iter/s)": 0.200512 }, { "acc": 0.75699863, "epoch": 0.8640284119736175, "grad_norm": 2.5625, "learning_rate": 6.513376670627374e-06, "loss": 0.98741188, "memory(GiB)": 369.42, "step": 34060, "train_speed(iter/s)": 0.200515 }, { "acc": 0.74515438, "epoch": 0.8641552511415526, "grad_norm": 2.90625, "learning_rate": 6.512377200707033e-06, "loss": 1.02532339, "memory(GiB)": 369.42, "step": 34065, "train_speed(iter/s)": 0.200522 }, { "acc": 0.75346828, "epoch": 0.8642820903094875, "grad_norm": 2.40625, "learning_rate": 6.511377664265591e-06, "loss": 1.03129416, "memory(GiB)": 369.42, "step": 34070, "train_speed(iter/s)": 0.200524 }, { "acc": 0.75511909, "epoch": 0.8644089294774226, "grad_norm": 2.078125, "learning_rate": 6.510378061347013e-06, "loss": 1.02665586, "memory(GiB)": 369.42, "step": 34075, "train_speed(iter/s)": 0.200527 }, { "acc": 0.74515796, "epoch": 0.8645357686453576, "grad_norm": 2.578125, "learning_rate": 6.509378391995264e-06, "loss": 1.02400246, "memory(GiB)": 369.42, "step": 34080, "train_speed(iter/s)": 0.200532 }, { "acc": 0.74716468, "epoch": 0.8646626078132927, "grad_norm": 1.890625, "learning_rate": 6.508378656254314e-06, "loss": 1.05877523, "memory(GiB)": 369.42, "step": 34085, "train_speed(iter/s)": 0.200535 }, { "acc": 0.76293545, "epoch": 0.8647894469812278, "grad_norm": 2.015625, "learning_rate": 6.507378854168136e-06, "loss": 0.95483189, "memory(GiB)": 369.42, "step": 34090, "train_speed(iter/s)": 0.200538 }, { "acc": 0.73949337, "epoch": 0.8649162861491628, "grad_norm": 2.125, "learning_rate": 6.506378985780707e-06, "loss": 0.98811016, "memory(GiB)": 369.42, "step": 34095, "train_speed(iter/s)": 0.200541 }, { "acc": 0.75222731, "epoch": 0.8650431253170979, "grad_norm": 2.328125, "learning_rate": 6.505379051136004e-06, "loss": 1.00371246, "memory(GiB)": 369.42, "step": 34100, "train_speed(iter/s)": 0.200547 }, { "acc": 0.75550723, "epoch": 0.865169964485033, "grad_norm": 2.0625, "learning_rate": 6.504379050278009e-06, "loss": 0.97419624, "memory(GiB)": 369.42, "step": 34105, "train_speed(iter/s)": 0.200551 }, { "acc": 0.74168911, "epoch": 0.865296803652968, "grad_norm": 2.25, "learning_rate": 6.503378983250707e-06, "loss": 1.07001629, "memory(GiB)": 369.42, "step": 34110, "train_speed(iter/s)": 0.200555 }, { "acc": 0.75454726, "epoch": 0.8654236428209031, "grad_norm": 2.6875, "learning_rate": 6.5023788500980855e-06, "loss": 1.01593018, "memory(GiB)": 369.42, "step": 34115, "train_speed(iter/s)": 0.200558 }, { "acc": 0.74865227, "epoch": 0.8655504819888381, "grad_norm": 2.0, "learning_rate": 6.501378650864135e-06, "loss": 0.97737236, "memory(GiB)": 369.42, "step": 34120, "train_speed(iter/s)": 0.200564 }, { "acc": 0.74244204, "epoch": 0.8656773211567732, "grad_norm": 2.171875, "learning_rate": 6.500378385592847e-06, "loss": 1.0168973, "memory(GiB)": 369.42, "step": 34125, "train_speed(iter/s)": 0.200569 }, { "acc": 0.75309777, "epoch": 0.8658041603247083, "grad_norm": 2.796875, "learning_rate": 6.49937805432822e-06, "loss": 0.99774542, "memory(GiB)": 369.42, "step": 34130, "train_speed(iter/s)": 0.200573 }, { "acc": 0.75112848, "epoch": 0.8659309994926433, "grad_norm": 2.265625, "learning_rate": 6.498377657114251e-06, "loss": 0.97940273, "memory(GiB)": 369.42, "step": 34135, "train_speed(iter/s)": 0.200578 }, { "acc": 0.76366048, "epoch": 0.8660578386605784, "grad_norm": 2.28125, "learning_rate": 6.497377193994944e-06, "loss": 0.97399101, "memory(GiB)": 369.42, "step": 34140, "train_speed(iter/s)": 0.200583 }, { "acc": 0.75194883, "epoch": 0.8661846778285135, "grad_norm": 2.0625, "learning_rate": 6.496376665014301e-06, "loss": 0.99548712, "memory(GiB)": 369.42, "step": 34145, "train_speed(iter/s)": 0.200588 }, { "acc": 0.74328489, "epoch": 0.8663115169964485, "grad_norm": 1.8203125, "learning_rate": 6.4953760702163325e-06, "loss": 1.03427353, "memory(GiB)": 369.42, "step": 34150, "train_speed(iter/s)": 0.200592 }, { "acc": 0.75214024, "epoch": 0.8664383561643836, "grad_norm": 1.890625, "learning_rate": 6.494375409645049e-06, "loss": 0.99729671, "memory(GiB)": 369.42, "step": 34155, "train_speed(iter/s)": 0.200597 }, { "acc": 0.74755545, "epoch": 0.8665651953323186, "grad_norm": 2.09375, "learning_rate": 6.493374683344462e-06, "loss": 0.97526646, "memory(GiB)": 369.42, "step": 34160, "train_speed(iter/s)": 0.200596 }, { "acc": 0.75448599, "epoch": 0.8666920345002537, "grad_norm": 2.453125, "learning_rate": 6.492373891358589e-06, "loss": 0.95675783, "memory(GiB)": 369.42, "step": 34165, "train_speed(iter/s)": 0.200602 }, { "acc": 0.74486551, "epoch": 0.8668188736681888, "grad_norm": 2.703125, "learning_rate": 6.4913730337314495e-06, "loss": 1.01651917, "memory(GiB)": 369.42, "step": 34170, "train_speed(iter/s)": 0.200607 }, { "acc": 0.74435654, "epoch": 0.8669457128361238, "grad_norm": 1.765625, "learning_rate": 6.490372110507066e-06, "loss": 1.02181759, "memory(GiB)": 369.42, "step": 34175, "train_speed(iter/s)": 0.200612 }, { "acc": 0.75891705, "epoch": 0.8670725520040589, "grad_norm": 1.953125, "learning_rate": 6.489371121729462e-06, "loss": 0.96483717, "memory(GiB)": 369.42, "step": 34180, "train_speed(iter/s)": 0.200616 }, { "acc": 0.74835968, "epoch": 0.867199391171994, "grad_norm": 2.25, "learning_rate": 6.4883700674426666e-06, "loss": 0.94327726, "memory(GiB)": 369.42, "step": 34185, "train_speed(iter/s)": 0.20062 }, { "acc": 0.74998722, "epoch": 0.867326230339929, "grad_norm": 2.125, "learning_rate": 6.4873689476907105e-06, "loss": 0.99500446, "memory(GiB)": 369.42, "step": 34190, "train_speed(iter/s)": 0.200625 }, { "acc": 0.7477078, "epoch": 0.867453069507864, "grad_norm": 2.421875, "learning_rate": 6.486367762517628e-06, "loss": 1.02454348, "memory(GiB)": 369.42, "step": 34195, "train_speed(iter/s)": 0.200628 }, { "acc": 0.74662328, "epoch": 0.867579908675799, "grad_norm": 2.09375, "learning_rate": 6.4853665119674556e-06, "loss": 1.04325771, "memory(GiB)": 369.42, "step": 34200, "train_speed(iter/s)": 0.200631 }, { "acc": 0.74122753, "epoch": 0.8677067478437341, "grad_norm": 1.9921875, "learning_rate": 6.484365196084231e-06, "loss": 0.96881275, "memory(GiB)": 369.42, "step": 34205, "train_speed(iter/s)": 0.200634 }, { "acc": 0.753339, "epoch": 0.8678335870116692, "grad_norm": 1.8125, "learning_rate": 6.4833638149119985e-06, "loss": 0.97367325, "memory(GiB)": 369.42, "step": 34210, "train_speed(iter/s)": 0.200637 }, { "acc": 0.74976826, "epoch": 0.8679604261796042, "grad_norm": 2.21875, "learning_rate": 6.4823623684948034e-06, "loss": 1.04058475, "memory(GiB)": 369.42, "step": 34215, "train_speed(iter/s)": 0.200639 }, { "acc": 0.75937228, "epoch": 0.8680872653475393, "grad_norm": 2.21875, "learning_rate": 6.4813608568766924e-06, "loss": 1.01208773, "memory(GiB)": 369.42, "step": 34220, "train_speed(iter/s)": 0.200644 }, { "acc": 0.73974938, "epoch": 0.8682141045154744, "grad_norm": 2.1875, "learning_rate": 6.480359280101717e-06, "loss": 1.02020483, "memory(GiB)": 369.42, "step": 34225, "train_speed(iter/s)": 0.20065 }, { "acc": 0.75378599, "epoch": 0.8683409436834094, "grad_norm": 2.09375, "learning_rate": 6.479357638213931e-06, "loss": 0.96319981, "memory(GiB)": 369.42, "step": 34230, "train_speed(iter/s)": 0.200652 }, { "acc": 0.75337744, "epoch": 0.8684677828513445, "grad_norm": 2.265625, "learning_rate": 6.478355931257392e-06, "loss": 0.97989502, "memory(GiB)": 369.42, "step": 34235, "train_speed(iter/s)": 0.200657 }, { "acc": 0.75914898, "epoch": 0.8685946220192795, "grad_norm": 2.03125, "learning_rate": 6.477354159276158e-06, "loss": 0.99020462, "memory(GiB)": 369.42, "step": 34240, "train_speed(iter/s)": 0.200663 }, { "acc": 0.73864479, "epoch": 0.8687214611872146, "grad_norm": 2.078125, "learning_rate": 6.476352322314292e-06, "loss": 0.99137554, "memory(GiB)": 369.42, "step": 34245, "train_speed(iter/s)": 0.200667 }, { "acc": 0.7498642, "epoch": 0.8688483003551497, "grad_norm": 2.34375, "learning_rate": 6.47535042041586e-06, "loss": 1.03834591, "memory(GiB)": 369.42, "step": 34250, "train_speed(iter/s)": 0.200673 }, { "acc": 0.72215185, "epoch": 0.8689751395230847, "grad_norm": 1.984375, "learning_rate": 6.474348453624929e-06, "loss": 1.11338558, "memory(GiB)": 369.42, "step": 34255, "train_speed(iter/s)": 0.200677 }, { "acc": 0.75273328, "epoch": 0.8691019786910198, "grad_norm": 2.328125, "learning_rate": 6.473346421985571e-06, "loss": 0.99112644, "memory(GiB)": 369.42, "step": 34260, "train_speed(iter/s)": 0.20068 }, { "acc": 0.74414797, "epoch": 0.8692288178589549, "grad_norm": 2.265625, "learning_rate": 6.472344325541859e-06, "loss": 1.04156666, "memory(GiB)": 369.42, "step": 34265, "train_speed(iter/s)": 0.200685 }, { "acc": 0.75074863, "epoch": 0.8693556570268899, "grad_norm": 2.125, "learning_rate": 6.4713421643378715e-06, "loss": 1.05879059, "memory(GiB)": 369.42, "step": 34270, "train_speed(iter/s)": 0.200691 }, { "acc": 0.73841524, "epoch": 0.869482496194825, "grad_norm": 2.109375, "learning_rate": 6.470339938417685e-06, "loss": 0.97211723, "memory(GiB)": 369.42, "step": 34275, "train_speed(iter/s)": 0.200697 }, { "acc": 0.76259842, "epoch": 0.86960933536276, "grad_norm": 2.1875, "learning_rate": 6.469337647825384e-06, "loss": 0.97667017, "memory(GiB)": 369.42, "step": 34280, "train_speed(iter/s)": 0.200703 }, { "acc": 0.74766741, "epoch": 0.8697361745306951, "grad_norm": 2.796875, "learning_rate": 6.468335292605053e-06, "loss": 0.99937458, "memory(GiB)": 369.42, "step": 34285, "train_speed(iter/s)": 0.200708 }, { "acc": 0.75157461, "epoch": 0.8698630136986302, "grad_norm": 2.265625, "learning_rate": 6.467332872800779e-06, "loss": 0.95336876, "memory(GiB)": 369.42, "step": 34290, "train_speed(iter/s)": 0.200713 }, { "acc": 0.75244327, "epoch": 0.8699898528665652, "grad_norm": 2.203125, "learning_rate": 6.466330388456655e-06, "loss": 1.01203041, "memory(GiB)": 369.42, "step": 34295, "train_speed(iter/s)": 0.200717 }, { "acc": 0.74401646, "epoch": 0.8701166920345003, "grad_norm": 2.5625, "learning_rate": 6.465327839616774e-06, "loss": 1.03212538, "memory(GiB)": 369.42, "step": 34300, "train_speed(iter/s)": 0.200723 }, { "acc": 0.75018873, "epoch": 0.8702435312024354, "grad_norm": 2.171875, "learning_rate": 6.464325226325232e-06, "loss": 0.96640167, "memory(GiB)": 369.42, "step": 34305, "train_speed(iter/s)": 0.200726 }, { "acc": 0.73958683, "epoch": 0.8703703703703703, "grad_norm": 2.40625, "learning_rate": 6.46332254862613e-06, "loss": 1.0697608, "memory(GiB)": 369.42, "step": 34310, "train_speed(iter/s)": 0.20073 }, { "acc": 0.74802084, "epoch": 0.8704972095383054, "grad_norm": 1.8671875, "learning_rate": 6.462319806563568e-06, "loss": 0.99317856, "memory(GiB)": 369.42, "step": 34315, "train_speed(iter/s)": 0.200733 }, { "acc": 0.74602137, "epoch": 0.8706240487062404, "grad_norm": 2.296875, "learning_rate": 6.461317000181653e-06, "loss": 0.97575073, "memory(GiB)": 369.42, "step": 34320, "train_speed(iter/s)": 0.200738 }, { "acc": 0.74775858, "epoch": 0.8707508878741755, "grad_norm": 2.484375, "learning_rate": 6.460314129524491e-06, "loss": 1.01404696, "memory(GiB)": 369.42, "step": 34325, "train_speed(iter/s)": 0.200743 }, { "acc": 0.74492183, "epoch": 0.8708777270421106, "grad_norm": 1.96875, "learning_rate": 6.4593111946361945e-06, "loss": 1.01845531, "memory(GiB)": 369.42, "step": 34330, "train_speed(iter/s)": 0.200748 }, { "acc": 0.73917904, "epoch": 0.8710045662100456, "grad_norm": 2.03125, "learning_rate": 6.458308195560874e-06, "loss": 1.0004735, "memory(GiB)": 369.42, "step": 34335, "train_speed(iter/s)": 0.200752 }, { "acc": 0.74558048, "epoch": 0.8711314053779807, "grad_norm": 2.59375, "learning_rate": 6.4573051323426515e-06, "loss": 1.08773413, "memory(GiB)": 369.42, "step": 34340, "train_speed(iter/s)": 0.200758 }, { "acc": 0.7437973, "epoch": 0.8712582445459158, "grad_norm": 2.484375, "learning_rate": 6.456302005025641e-06, "loss": 1.02891045, "memory(GiB)": 369.42, "step": 34345, "train_speed(iter/s)": 0.200762 }, { "acc": 0.73112259, "epoch": 0.8713850837138508, "grad_norm": 2.5, "learning_rate": 6.4552988136539675e-06, "loss": 1.04236202, "memory(GiB)": 369.42, "step": 34350, "train_speed(iter/s)": 0.200765 }, { "acc": 0.75762658, "epoch": 0.8715119228817859, "grad_norm": 2.40625, "learning_rate": 6.454295558271752e-06, "loss": 0.97901287, "memory(GiB)": 369.42, "step": 34355, "train_speed(iter/s)": 0.200768 }, { "acc": 0.74810791, "epoch": 0.8716387620497209, "grad_norm": 1.9765625, "learning_rate": 6.4532922389231275e-06, "loss": 0.94547997, "memory(GiB)": 369.42, "step": 34360, "train_speed(iter/s)": 0.200773 }, { "acc": 0.7485291, "epoch": 0.871765601217656, "grad_norm": 1.9921875, "learning_rate": 6.452288855652222e-06, "loss": 0.97877254, "memory(GiB)": 369.42, "step": 34365, "train_speed(iter/s)": 0.200778 }, { "acc": 0.75435581, "epoch": 0.8718924403855911, "grad_norm": 2.046875, "learning_rate": 6.451285408503167e-06, "loss": 0.97095089, "memory(GiB)": 369.42, "step": 34370, "train_speed(iter/s)": 0.200779 }, { "acc": 0.73953629, "epoch": 0.8720192795535261, "grad_norm": 1.96875, "learning_rate": 6.450281897520102e-06, "loss": 1.01927395, "memory(GiB)": 369.42, "step": 34375, "train_speed(iter/s)": 0.200783 }, { "acc": 0.75920448, "epoch": 0.8721461187214612, "grad_norm": 1.9140625, "learning_rate": 6.449278322747164e-06, "loss": 0.97773094, "memory(GiB)": 369.42, "step": 34380, "train_speed(iter/s)": 0.200785 }, { "acc": 0.75640602, "epoch": 0.8722729578893963, "grad_norm": 2.203125, "learning_rate": 6.448274684228494e-06, "loss": 0.98079777, "memory(GiB)": 369.42, "step": 34385, "train_speed(iter/s)": 0.200791 }, { "acc": 0.76622772, "epoch": 0.8723997970573313, "grad_norm": 2.328125, "learning_rate": 6.447270982008237e-06, "loss": 0.93119259, "memory(GiB)": 369.42, "step": 34390, "train_speed(iter/s)": 0.200795 }, { "acc": 0.73744869, "epoch": 0.8725266362252664, "grad_norm": 2.3125, "learning_rate": 6.446267216130541e-06, "loss": 1.04652538, "memory(GiB)": 369.42, "step": 34395, "train_speed(iter/s)": 0.2008 }, { "acc": 0.75127497, "epoch": 0.8726534753932014, "grad_norm": 1.9765625, "learning_rate": 6.4452633866395555e-06, "loss": 0.99851379, "memory(GiB)": 369.42, "step": 34400, "train_speed(iter/s)": 0.200805 }, { "acc": 0.75100203, "epoch": 0.8727803145611365, "grad_norm": 1.875, "learning_rate": 6.444259493579433e-06, "loss": 0.98984871, "memory(GiB)": 369.42, "step": 34405, "train_speed(iter/s)": 0.200809 }, { "acc": 0.74654942, "epoch": 0.8729071537290716, "grad_norm": 2.296875, "learning_rate": 6.443255536994331e-06, "loss": 0.98487682, "memory(GiB)": 369.42, "step": 34410, "train_speed(iter/s)": 0.200815 }, { "acc": 0.75248899, "epoch": 0.8730339928970066, "grad_norm": 2.171875, "learning_rate": 6.442251516928406e-06, "loss": 0.98885288, "memory(GiB)": 369.42, "step": 34415, "train_speed(iter/s)": 0.20082 }, { "acc": 0.75746088, "epoch": 0.8731608320649417, "grad_norm": 1.84375, "learning_rate": 6.441247433425821e-06, "loss": 0.943787, "memory(GiB)": 369.42, "step": 34420, "train_speed(iter/s)": 0.200824 }, { "acc": 0.74325323, "epoch": 0.8732876712328768, "grad_norm": 2.21875, "learning_rate": 6.4402432865307384e-06, "loss": 1.0370924, "memory(GiB)": 369.42, "step": 34425, "train_speed(iter/s)": 0.20083 }, { "acc": 0.74487114, "epoch": 0.8734145104008117, "grad_norm": 2.125, "learning_rate": 6.439239076287327e-06, "loss": 1.03510838, "memory(GiB)": 369.42, "step": 34430, "train_speed(iter/s)": 0.200835 }, { "acc": 0.74837093, "epoch": 0.8735413495687468, "grad_norm": 2.125, "learning_rate": 6.438234802739753e-06, "loss": 1.01091356, "memory(GiB)": 369.42, "step": 34435, "train_speed(iter/s)": 0.20084 }, { "acc": 0.75277882, "epoch": 0.8736681887366818, "grad_norm": 2.578125, "learning_rate": 6.4372304659321935e-06, "loss": 1.00017891, "memory(GiB)": 369.42, "step": 34440, "train_speed(iter/s)": 0.200841 }, { "acc": 0.75055256, "epoch": 0.8737950279046169, "grad_norm": 2.015625, "learning_rate": 6.43622606590882e-06, "loss": 1.00299482, "memory(GiB)": 369.42, "step": 34445, "train_speed(iter/s)": 0.200846 }, { "acc": 0.74144926, "epoch": 0.873921867072552, "grad_norm": 2.03125, "learning_rate": 6.4352216027138125e-06, "loss": 0.97022209, "memory(GiB)": 369.42, "step": 34450, "train_speed(iter/s)": 0.200847 }, { "acc": 0.74160919, "epoch": 0.874048706240487, "grad_norm": 2.46875, "learning_rate": 6.434217076391351e-06, "loss": 1.01434698, "memory(GiB)": 369.42, "step": 34455, "train_speed(iter/s)": 0.200853 }, { "acc": 0.74981298, "epoch": 0.8741755454084221, "grad_norm": 2.1875, "learning_rate": 6.433212486985618e-06, "loss": 0.98671436, "memory(GiB)": 369.42, "step": 34460, "train_speed(iter/s)": 0.200859 }, { "acc": 0.74763041, "epoch": 0.8743023845763572, "grad_norm": 1.875, "learning_rate": 6.432207834540802e-06, "loss": 1.01668053, "memory(GiB)": 369.42, "step": 34465, "train_speed(iter/s)": 0.200861 }, { "acc": 0.7584166, "epoch": 0.8744292237442922, "grad_norm": 1.8515625, "learning_rate": 6.431203119101093e-06, "loss": 0.97536144, "memory(GiB)": 369.42, "step": 34470, "train_speed(iter/s)": 0.200863 }, { "acc": 0.74006681, "epoch": 0.8745560629122273, "grad_norm": 2.296875, "learning_rate": 6.430198340710677e-06, "loss": 1.00242357, "memory(GiB)": 369.42, "step": 34475, "train_speed(iter/s)": 0.200869 }, { "acc": 0.75800042, "epoch": 0.8746829020801623, "grad_norm": 2.203125, "learning_rate": 6.4291934994137566e-06, "loss": 0.98191872, "memory(GiB)": 369.42, "step": 34480, "train_speed(iter/s)": 0.200873 }, { "acc": 0.75218554, "epoch": 0.8748097412480974, "grad_norm": 2.578125, "learning_rate": 6.428188595254521e-06, "loss": 0.99680805, "memory(GiB)": 369.42, "step": 34485, "train_speed(iter/s)": 0.200877 }, { "acc": 0.75505629, "epoch": 0.8749365804160325, "grad_norm": 2.859375, "learning_rate": 6.427183628277178e-06, "loss": 1.0131958, "memory(GiB)": 369.42, "step": 34490, "train_speed(iter/s)": 0.200881 }, { "acc": 0.74270115, "epoch": 0.8750634195839675, "grad_norm": 1.8984375, "learning_rate": 6.426178598525925e-06, "loss": 1.02799568, "memory(GiB)": 369.42, "step": 34495, "train_speed(iter/s)": 0.200885 }, { "acc": 0.74760909, "epoch": 0.8751902587519026, "grad_norm": 1.8515625, "learning_rate": 6.4251735060449725e-06, "loss": 1.06088381, "memory(GiB)": 369.42, "step": 34500, "train_speed(iter/s)": 0.20089 }, { "acc": 0.7350503, "epoch": 0.8753170979198377, "grad_norm": 2.140625, "learning_rate": 6.424168350878524e-06, "loss": 1.02020626, "memory(GiB)": 369.42, "step": 34505, "train_speed(iter/s)": 0.200895 }, { "acc": 0.75019221, "epoch": 0.8754439370877727, "grad_norm": 2.8125, "learning_rate": 6.423163133070792e-06, "loss": 0.96670361, "memory(GiB)": 369.42, "step": 34510, "train_speed(iter/s)": 0.200898 }, { "acc": 0.75422401, "epoch": 0.8755707762557078, "grad_norm": 2.421875, "learning_rate": 6.422157852665993e-06, "loss": 1.06561985, "memory(GiB)": 369.42, "step": 34515, "train_speed(iter/s)": 0.200903 }, { "acc": 0.75720358, "epoch": 0.8756976154236428, "grad_norm": 2.03125, "learning_rate": 6.421152509708342e-06, "loss": 0.94625177, "memory(GiB)": 369.42, "step": 34520, "train_speed(iter/s)": 0.200907 }, { "acc": 0.75192451, "epoch": 0.8758244545915779, "grad_norm": 2.578125, "learning_rate": 6.4201471042420595e-06, "loss": 0.97064486, "memory(GiB)": 369.42, "step": 34525, "train_speed(iter/s)": 0.200913 }, { "acc": 0.75156174, "epoch": 0.875951293759513, "grad_norm": 1.9921875, "learning_rate": 6.419141636311366e-06, "loss": 0.97897625, "memory(GiB)": 369.42, "step": 34530, "train_speed(iter/s)": 0.200917 }, { "acc": 0.74912901, "epoch": 0.876078132927448, "grad_norm": 2.234375, "learning_rate": 6.4181361059604875e-06, "loss": 1.01363506, "memory(GiB)": 369.42, "step": 34535, "train_speed(iter/s)": 0.200921 }, { "acc": 0.73937402, "epoch": 0.8762049720953831, "grad_norm": 2.0625, "learning_rate": 6.4171305132336515e-06, "loss": 1.00902462, "memory(GiB)": 369.42, "step": 34540, "train_speed(iter/s)": 0.200926 }, { "acc": 0.73869095, "epoch": 0.8763318112633182, "grad_norm": 1.9140625, "learning_rate": 6.416124858175088e-06, "loss": 1.00086489, "memory(GiB)": 369.42, "step": 34545, "train_speed(iter/s)": 0.200931 }, { "acc": 0.74639812, "epoch": 0.8764586504312532, "grad_norm": 2.3125, "learning_rate": 6.415119140829031e-06, "loss": 1.10950909, "memory(GiB)": 369.42, "step": 34550, "train_speed(iter/s)": 0.200937 }, { "acc": 0.74403529, "epoch": 0.8765854895991883, "grad_norm": 2.078125, "learning_rate": 6.414113361239715e-06, "loss": 0.98821011, "memory(GiB)": 369.42, "step": 34555, "train_speed(iter/s)": 0.200942 }, { "acc": 0.75567293, "epoch": 0.8767123287671232, "grad_norm": 2.046875, "learning_rate": 6.4131075194513825e-06, "loss": 0.9208559, "memory(GiB)": 369.42, "step": 34560, "train_speed(iter/s)": 0.200946 }, { "acc": 0.76396027, "epoch": 0.8768391679350583, "grad_norm": 1.875, "learning_rate": 6.41210161550827e-06, "loss": 0.87192249, "memory(GiB)": 369.42, "step": 34565, "train_speed(iter/s)": 0.20095 }, { "acc": 0.74255342, "epoch": 0.8769660071029934, "grad_norm": 1.9609375, "learning_rate": 6.411095649454626e-06, "loss": 1.01077271, "memory(GiB)": 369.42, "step": 34570, "train_speed(iter/s)": 0.200955 }, { "acc": 0.7450542, "epoch": 0.8770928462709284, "grad_norm": 2.28125, "learning_rate": 6.410089621334693e-06, "loss": 1.02941608, "memory(GiB)": 369.42, "step": 34575, "train_speed(iter/s)": 0.20096 }, { "acc": 0.75258942, "epoch": 0.8772196854388635, "grad_norm": 2.0, "learning_rate": 6.4090835311927236e-06, "loss": 0.92519732, "memory(GiB)": 369.42, "step": 34580, "train_speed(iter/s)": 0.200962 }, { "acc": 0.76373625, "epoch": 0.8773465246067986, "grad_norm": 2.0625, "learning_rate": 6.40807737907297e-06, "loss": 0.9951087, "memory(GiB)": 369.42, "step": 34585, "train_speed(iter/s)": 0.200966 }, { "acc": 0.74252267, "epoch": 0.8774733637747336, "grad_norm": 2.78125, "learning_rate": 6.407071165019686e-06, "loss": 1.02172318, "memory(GiB)": 369.42, "step": 34590, "train_speed(iter/s)": 0.200968 }, { "acc": 0.76061192, "epoch": 0.8776002029426687, "grad_norm": 1.9140625, "learning_rate": 6.40606488907713e-06, "loss": 0.96804695, "memory(GiB)": 369.42, "step": 34595, "train_speed(iter/s)": 0.200973 }, { "acc": 0.74177413, "epoch": 0.8777270421106037, "grad_norm": 1.96875, "learning_rate": 6.4050585512895624e-06, "loss": 1.02098446, "memory(GiB)": 369.42, "step": 34600, "train_speed(iter/s)": 0.200978 }, { "acc": 0.75816431, "epoch": 0.8778538812785388, "grad_norm": 1.96875, "learning_rate": 6.4040521517012475e-06, "loss": 0.93980904, "memory(GiB)": 369.42, "step": 34605, "train_speed(iter/s)": 0.200982 }, { "acc": 0.73644409, "epoch": 0.8779807204464739, "grad_norm": 2.46875, "learning_rate": 6.40304569035645e-06, "loss": 1.03174114, "memory(GiB)": 369.42, "step": 34610, "train_speed(iter/s)": 0.200985 }, { "acc": 0.74571457, "epoch": 0.8781075596144089, "grad_norm": 2.1875, "learning_rate": 6.402039167299439e-06, "loss": 1.00871468, "memory(GiB)": 369.42, "step": 34615, "train_speed(iter/s)": 0.20099 }, { "acc": 0.73808088, "epoch": 0.878234398782344, "grad_norm": 2.265625, "learning_rate": 6.401032582574485e-06, "loss": 1.04188366, "memory(GiB)": 369.42, "step": 34620, "train_speed(iter/s)": 0.200993 }, { "acc": 0.74852524, "epoch": 0.8783612379502791, "grad_norm": 2.546875, "learning_rate": 6.400025936225862e-06, "loss": 1.0200304, "memory(GiB)": 369.42, "step": 34625, "train_speed(iter/s)": 0.200996 }, { "acc": 0.73541665, "epoch": 0.8784880771182141, "grad_norm": 2.015625, "learning_rate": 6.399019228297851e-06, "loss": 1.04155655, "memory(GiB)": 369.42, "step": 34630, "train_speed(iter/s)": 0.201001 }, { "acc": 0.7562428, "epoch": 0.8786149162861492, "grad_norm": 2.703125, "learning_rate": 6.398012458834724e-06, "loss": 0.96626816, "memory(GiB)": 369.42, "step": 34635, "train_speed(iter/s)": 0.201005 }, { "acc": 0.74411063, "epoch": 0.8787417554540842, "grad_norm": 2.015625, "learning_rate": 6.397005627880771e-06, "loss": 0.9851594, "memory(GiB)": 369.42, "step": 34640, "train_speed(iter/s)": 0.201008 }, { "acc": 0.74496269, "epoch": 0.8788685946220193, "grad_norm": 2.390625, "learning_rate": 6.395998735480271e-06, "loss": 1.03544159, "memory(GiB)": 369.42, "step": 34645, "train_speed(iter/s)": 0.201014 }, { "acc": 0.74104433, "epoch": 0.8789954337899544, "grad_norm": 1.9921875, "learning_rate": 6.394991781677516e-06, "loss": 0.96621666, "memory(GiB)": 369.42, "step": 34650, "train_speed(iter/s)": 0.201016 }, { "acc": 0.73653069, "epoch": 0.8791222729578894, "grad_norm": 2.21875, "learning_rate": 6.393984766516792e-06, "loss": 1.0536026, "memory(GiB)": 369.42, "step": 34655, "train_speed(iter/s)": 0.201019 }, { "acc": 0.73069329, "epoch": 0.8792491121258245, "grad_norm": 2.171875, "learning_rate": 6.392977690042395e-06, "loss": 1.13548632, "memory(GiB)": 369.42, "step": 34660, "train_speed(iter/s)": 0.201024 }, { "acc": 0.75940456, "epoch": 0.8793759512937596, "grad_norm": 2.578125, "learning_rate": 6.3919705522986205e-06, "loss": 0.96497879, "memory(GiB)": 369.42, "step": 34665, "train_speed(iter/s)": 0.201028 }, { "acc": 0.76459341, "epoch": 0.8795027904616946, "grad_norm": 2.1875, "learning_rate": 6.390963353329767e-06, "loss": 0.95263395, "memory(GiB)": 369.42, "step": 34670, "train_speed(iter/s)": 0.201032 }, { "acc": 0.73128085, "epoch": 0.8796296296296297, "grad_norm": 1.96875, "learning_rate": 6.389956093180134e-06, "loss": 1.05122766, "memory(GiB)": 369.42, "step": 34675, "train_speed(iter/s)": 0.201036 }, { "acc": 0.76427979, "epoch": 0.8797564687975646, "grad_norm": 1.9765625, "learning_rate": 6.388948771894025e-06, "loss": 0.96959419, "memory(GiB)": 369.42, "step": 34680, "train_speed(iter/s)": 0.201042 }, { "acc": 0.7572504, "epoch": 0.8798833079654997, "grad_norm": 1.90625, "learning_rate": 6.38794138951575e-06, "loss": 0.98283291, "memory(GiB)": 369.42, "step": 34685, "train_speed(iter/s)": 0.201046 }, { "acc": 0.75742526, "epoch": 0.8800101471334348, "grad_norm": 2.046875, "learning_rate": 6.386933946089615e-06, "loss": 0.99855747, "memory(GiB)": 369.42, "step": 34690, "train_speed(iter/s)": 0.201048 }, { "acc": 0.75433407, "epoch": 0.8801369863013698, "grad_norm": 2.34375, "learning_rate": 6.385926441659933e-06, "loss": 0.9916481, "memory(GiB)": 369.42, "step": 34695, "train_speed(iter/s)": 0.201054 }, { "acc": 0.74091892, "epoch": 0.8802638254693049, "grad_norm": 2.4375, "learning_rate": 6.38491887627102e-06, "loss": 1.03371563, "memory(GiB)": 369.42, "step": 34700, "train_speed(iter/s)": 0.201055 }, { "acc": 0.7430131, "epoch": 0.88039066463724, "grad_norm": 11.5, "learning_rate": 6.383911249967188e-06, "loss": 1.0112421, "memory(GiB)": 369.42, "step": 34705, "train_speed(iter/s)": 0.201057 }, { "acc": 0.74484653, "epoch": 0.880517503805175, "grad_norm": 2.421875, "learning_rate": 6.382903562792764e-06, "loss": 1.02220287, "memory(GiB)": 369.42, "step": 34710, "train_speed(iter/s)": 0.201061 }, { "acc": 0.73643227, "epoch": 0.8806443429731101, "grad_norm": 2.46875, "learning_rate": 6.381895814792065e-06, "loss": 1.08688545, "memory(GiB)": 369.42, "step": 34715, "train_speed(iter/s)": 0.201064 }, { "acc": 0.74902034, "epoch": 0.8807711821410451, "grad_norm": 1.984375, "learning_rate": 6.38088800600942e-06, "loss": 0.98781118, "memory(GiB)": 369.42, "step": 34720, "train_speed(iter/s)": 0.201069 }, { "acc": 0.75161433, "epoch": 0.8808980213089802, "grad_norm": 2.109375, "learning_rate": 6.3798801364891535e-06, "loss": 1.02923441, "memory(GiB)": 369.42, "step": 34725, "train_speed(iter/s)": 0.201074 }, { "acc": 0.74632225, "epoch": 0.8810248604769153, "grad_norm": 2.296875, "learning_rate": 6.378872206275599e-06, "loss": 0.98948126, "memory(GiB)": 369.42, "step": 34730, "train_speed(iter/s)": 0.20108 }, { "acc": 0.74774771, "epoch": 0.8811516996448503, "grad_norm": 2.203125, "learning_rate": 6.377864215413088e-06, "loss": 0.96261864, "memory(GiB)": 369.42, "step": 34735, "train_speed(iter/s)": 0.201082 }, { "acc": 0.75481453, "epoch": 0.8812785388127854, "grad_norm": 2.15625, "learning_rate": 6.376856163945957e-06, "loss": 1.02059641, "memory(GiB)": 369.42, "step": 34740, "train_speed(iter/s)": 0.201086 }, { "acc": 0.74875956, "epoch": 0.8814053779807205, "grad_norm": 2.5625, "learning_rate": 6.375848051918546e-06, "loss": 0.99990711, "memory(GiB)": 369.42, "step": 34745, "train_speed(iter/s)": 0.201088 }, { "acc": 0.75166302, "epoch": 0.8815322171486555, "grad_norm": 2.09375, "learning_rate": 6.374839879375194e-06, "loss": 0.95885315, "memory(GiB)": 369.42, "step": 34750, "train_speed(iter/s)": 0.20109 }, { "acc": 0.7381566, "epoch": 0.8816590563165906, "grad_norm": 1.9453125, "learning_rate": 6.373831646360245e-06, "loss": 0.98070059, "memory(GiB)": 369.42, "step": 34755, "train_speed(iter/s)": 0.201089 }, { "acc": 0.74535275, "epoch": 0.8817858954845256, "grad_norm": 2.140625, "learning_rate": 6.372823352918048e-06, "loss": 1.02951746, "memory(GiB)": 369.42, "step": 34760, "train_speed(iter/s)": 0.201094 }, { "acc": 0.74495406, "epoch": 0.8819127346524607, "grad_norm": 2.0625, "learning_rate": 6.371814999092951e-06, "loss": 0.98230686, "memory(GiB)": 369.42, "step": 34765, "train_speed(iter/s)": 0.201098 }, { "acc": 0.74455748, "epoch": 0.8820395738203958, "grad_norm": 2.0625, "learning_rate": 6.370806584929305e-06, "loss": 1.05117064, "memory(GiB)": 369.42, "step": 34770, "train_speed(iter/s)": 0.201103 }, { "acc": 0.74526949, "epoch": 0.8821664129883308, "grad_norm": 1.7734375, "learning_rate": 6.369798110471463e-06, "loss": 1.01532593, "memory(GiB)": 369.42, "step": 34775, "train_speed(iter/s)": 0.201108 }, { "acc": 0.75731163, "epoch": 0.8822932521562659, "grad_norm": 2.25, "learning_rate": 6.368789575763787e-06, "loss": 0.95703869, "memory(GiB)": 369.42, "step": 34780, "train_speed(iter/s)": 0.201112 }, { "acc": 0.74429669, "epoch": 0.882420091324201, "grad_norm": 2.484375, "learning_rate": 6.367780980850633e-06, "loss": 0.97715158, "memory(GiB)": 369.42, "step": 34785, "train_speed(iter/s)": 0.201115 }, { "acc": 0.74784179, "epoch": 0.882546930492136, "grad_norm": 2.421875, "learning_rate": 6.366772325776367e-06, "loss": 1.01607304, "memory(GiB)": 369.42, "step": 34790, "train_speed(iter/s)": 0.201112 }, { "acc": 0.75228953, "epoch": 0.882673769660071, "grad_norm": 2.234375, "learning_rate": 6.365763610585349e-06, "loss": 0.96278534, "memory(GiB)": 369.42, "step": 34795, "train_speed(iter/s)": 0.201116 }, { "acc": 0.75789566, "epoch": 0.882800608828006, "grad_norm": 2.390625, "learning_rate": 6.3647548353219515e-06, "loss": 0.99355297, "memory(GiB)": 369.42, "step": 34800, "train_speed(iter/s)": 0.201118 }, { "acc": 0.75909042, "epoch": 0.8829274479959411, "grad_norm": 2.46875, "learning_rate": 6.363746000030543e-06, "loss": 0.99956474, "memory(GiB)": 369.42, "step": 34805, "train_speed(iter/s)": 0.20112 }, { "acc": 0.74887185, "epoch": 0.8830542871638762, "grad_norm": 2.453125, "learning_rate": 6.362737104755497e-06, "loss": 1.01477795, "memory(GiB)": 369.42, "step": 34810, "train_speed(iter/s)": 0.201126 }, { "acc": 0.75306659, "epoch": 0.8831811263318112, "grad_norm": 2.140625, "learning_rate": 6.361728149541188e-06, "loss": 1.00297318, "memory(GiB)": 369.42, "step": 34815, "train_speed(iter/s)": 0.201129 }, { "acc": 0.75414476, "epoch": 0.8833079654997463, "grad_norm": 2.046875, "learning_rate": 6.360719134431995e-06, "loss": 1.02438774, "memory(GiB)": 369.42, "step": 34820, "train_speed(iter/s)": 0.201133 }, { "acc": 0.75518298, "epoch": 0.8834348046676814, "grad_norm": 2.78125, "learning_rate": 6.359710059472299e-06, "loss": 1.00235777, "memory(GiB)": 369.42, "step": 34825, "train_speed(iter/s)": 0.201136 }, { "acc": 0.75468006, "epoch": 0.8835616438356164, "grad_norm": 2.25, "learning_rate": 6.358700924706486e-06, "loss": 1.00709286, "memory(GiB)": 369.42, "step": 34830, "train_speed(iter/s)": 0.201139 }, { "acc": 0.74930792, "epoch": 0.8836884830035515, "grad_norm": 2.265625, "learning_rate": 6.357691730178939e-06, "loss": 1.0136754, "memory(GiB)": 369.42, "step": 34835, "train_speed(iter/s)": 0.201145 }, { "acc": 0.74256048, "epoch": 0.8838153221714865, "grad_norm": 2.21875, "learning_rate": 6.356682475934048e-06, "loss": 1.05730743, "memory(GiB)": 369.42, "step": 34840, "train_speed(iter/s)": 0.20115 }, { "acc": 0.74327011, "epoch": 0.8839421613394216, "grad_norm": 2.46875, "learning_rate": 6.3556731620162036e-06, "loss": 1.00118866, "memory(GiB)": 369.42, "step": 34845, "train_speed(iter/s)": 0.201154 }, { "acc": 0.75571194, "epoch": 0.8840690005073567, "grad_norm": 2.765625, "learning_rate": 6.354663788469803e-06, "loss": 0.98098345, "memory(GiB)": 369.42, "step": 34850, "train_speed(iter/s)": 0.201158 }, { "acc": 0.74240322, "epoch": 0.8841958396752917, "grad_norm": 2.453125, "learning_rate": 6.353654355339238e-06, "loss": 1.07836552, "memory(GiB)": 369.42, "step": 34855, "train_speed(iter/s)": 0.201163 }, { "acc": 0.75466952, "epoch": 0.8843226788432268, "grad_norm": 2.296875, "learning_rate": 6.352644862668914e-06, "loss": 0.99379406, "memory(GiB)": 369.42, "step": 34860, "train_speed(iter/s)": 0.201167 }, { "acc": 0.76123781, "epoch": 0.8844495180111619, "grad_norm": 2.015625, "learning_rate": 6.351635310503228e-06, "loss": 0.96342869, "memory(GiB)": 369.42, "step": 34865, "train_speed(iter/s)": 0.201172 }, { "acc": 0.74964342, "epoch": 0.8845763571790969, "grad_norm": 2.65625, "learning_rate": 6.3506256988865865e-06, "loss": 0.98470917, "memory(GiB)": 369.42, "step": 34870, "train_speed(iter/s)": 0.201178 }, { "acc": 0.76725159, "epoch": 0.884703196347032, "grad_norm": 1.9921875, "learning_rate": 6.349616027863397e-06, "loss": 0.88553791, "memory(GiB)": 369.42, "step": 34875, "train_speed(iter/s)": 0.20118 }, { "acc": 0.74498634, "epoch": 0.884830035514967, "grad_norm": 2.0, "learning_rate": 6.34860629747807e-06, "loss": 1.00753517, "memory(GiB)": 369.42, "step": 34880, "train_speed(iter/s)": 0.201184 }, { "acc": 0.73725481, "epoch": 0.8849568746829021, "grad_norm": 2.953125, "learning_rate": 6.347596507775016e-06, "loss": 1.03338356, "memory(GiB)": 369.42, "step": 34885, "train_speed(iter/s)": 0.201189 }, { "acc": 0.7647727, "epoch": 0.8850837138508372, "grad_norm": 1.9921875, "learning_rate": 6.3465866587986505e-06, "loss": 0.92617741, "memory(GiB)": 369.42, "step": 34890, "train_speed(iter/s)": 0.201192 }, { "acc": 0.7601202, "epoch": 0.8852105530187722, "grad_norm": 1.984375, "learning_rate": 6.345576750593392e-06, "loss": 0.9571578, "memory(GiB)": 369.42, "step": 34895, "train_speed(iter/s)": 0.201194 }, { "acc": 0.73349476, "epoch": 0.8853373921867073, "grad_norm": 2.5625, "learning_rate": 6.34456678320366e-06, "loss": 1.08346472, "memory(GiB)": 369.42, "step": 34900, "train_speed(iter/s)": 0.201199 }, { "acc": 0.73817244, "epoch": 0.8854642313546424, "grad_norm": 1.96875, "learning_rate": 6.343556756673879e-06, "loss": 1.05983791, "memory(GiB)": 369.42, "step": 34905, "train_speed(iter/s)": 0.201201 }, { "acc": 0.74697657, "epoch": 0.8855910705225774, "grad_norm": 1.875, "learning_rate": 6.3425466710484726e-06, "loss": 1.01046448, "memory(GiB)": 369.42, "step": 34910, "train_speed(iter/s)": 0.201207 }, { "acc": 0.74279118, "epoch": 0.8857179096905125, "grad_norm": 2.328125, "learning_rate": 6.3415365263718686e-06, "loss": 1.04852104, "memory(GiB)": 369.42, "step": 34915, "train_speed(iter/s)": 0.201208 }, { "acc": 0.74155979, "epoch": 0.8858447488584474, "grad_norm": 2.015625, "learning_rate": 6.340526322688501e-06, "loss": 0.97472372, "memory(GiB)": 369.42, "step": 34920, "train_speed(iter/s)": 0.201212 }, { "acc": 0.75028658, "epoch": 0.8859715880263825, "grad_norm": 2.359375, "learning_rate": 6.339516060042798e-06, "loss": 1.02810135, "memory(GiB)": 369.42, "step": 34925, "train_speed(iter/s)": 0.201216 }, { "acc": 0.73921757, "epoch": 0.8860984271943176, "grad_norm": 2.34375, "learning_rate": 6.3385057384792e-06, "loss": 1.0136548, "memory(GiB)": 369.42, "step": 34930, "train_speed(iter/s)": 0.201217 }, { "acc": 0.73428597, "epoch": 0.8862252663622526, "grad_norm": 2.765625, "learning_rate": 6.337495358042143e-06, "loss": 1.03978844, "memory(GiB)": 369.42, "step": 34935, "train_speed(iter/s)": 0.20122 }, { "acc": 0.74872622, "epoch": 0.8863521055301877, "grad_norm": 2.0625, "learning_rate": 6.336484918776069e-06, "loss": 1.02158899, "memory(GiB)": 369.42, "step": 34940, "train_speed(iter/s)": 0.201223 }, { "acc": 0.75525656, "epoch": 0.8864789446981228, "grad_norm": 1.96875, "learning_rate": 6.335474420725421e-06, "loss": 1.00180855, "memory(GiB)": 369.42, "step": 34945, "train_speed(iter/s)": 0.201229 }, { "acc": 0.73675976, "epoch": 0.8866057838660578, "grad_norm": 2.234375, "learning_rate": 6.334463863934646e-06, "loss": 1.03216286, "memory(GiB)": 369.42, "step": 34950, "train_speed(iter/s)": 0.201233 }, { "acc": 0.74741001, "epoch": 0.8867326230339929, "grad_norm": 1.9375, "learning_rate": 6.333453248448192e-06, "loss": 0.96551189, "memory(GiB)": 369.42, "step": 34955, "train_speed(iter/s)": 0.201236 }, { "acc": 0.73505144, "epoch": 0.8868594622019279, "grad_norm": 2.078125, "learning_rate": 6.33244257431051e-06, "loss": 1.10357056, "memory(GiB)": 369.42, "step": 34960, "train_speed(iter/s)": 0.201243 }, { "acc": 0.73849192, "epoch": 0.886986301369863, "grad_norm": 1.96875, "learning_rate": 6.331431841566056e-06, "loss": 1.01760902, "memory(GiB)": 369.42, "step": 34965, "train_speed(iter/s)": 0.201247 }, { "acc": 0.75035667, "epoch": 0.8871131405377981, "grad_norm": 2.453125, "learning_rate": 6.330421050259283e-06, "loss": 0.99672642, "memory(GiB)": 369.42, "step": 34970, "train_speed(iter/s)": 0.201253 }, { "acc": 0.75505953, "epoch": 0.8872399797057331, "grad_norm": 2.578125, "learning_rate": 6.329410200434655e-06, "loss": 0.97494068, "memory(GiB)": 369.42, "step": 34975, "train_speed(iter/s)": 0.201258 }, { "acc": 0.74773979, "epoch": 0.8873668188736682, "grad_norm": 2.140625, "learning_rate": 6.328399292136629e-06, "loss": 1.02467976, "memory(GiB)": 369.42, "step": 34980, "train_speed(iter/s)": 0.201263 }, { "acc": 0.74095893, "epoch": 0.8874936580416033, "grad_norm": 3.015625, "learning_rate": 6.327388325409672e-06, "loss": 1.02702007, "memory(GiB)": 369.42, "step": 34985, "train_speed(iter/s)": 0.201267 }, { "acc": 0.73275814, "epoch": 0.8876204972095383, "grad_norm": 1.9453125, "learning_rate": 6.326377300298251e-06, "loss": 1.03552322, "memory(GiB)": 369.42, "step": 34990, "train_speed(iter/s)": 0.201271 }, { "acc": 0.75517125, "epoch": 0.8877473363774734, "grad_norm": 2.28125, "learning_rate": 6.325366216846832e-06, "loss": 0.98296261, "memory(GiB)": 369.42, "step": 34995, "train_speed(iter/s)": 0.201276 }, { "acc": 0.75435629, "epoch": 0.8878741755454084, "grad_norm": 2.625, "learning_rate": 6.324355075099893e-06, "loss": 1.05127258, "memory(GiB)": 369.42, "step": 35000, "train_speed(iter/s)": 0.201279 }, { "epoch": 0.8878741755454084, "eval_acc": 0.7373442615691257, "eval_loss": 0.9718271493911743, "eval_runtime": 384.8806, "eval_samples_per_second": 16.551, "eval_steps_per_second": 8.275, "step": 35000 }, { "acc": 0.74613066, "epoch": 0.8880010147133435, "grad_norm": 1.96875, "learning_rate": 6.3233438751019016e-06, "loss": 1.02121878, "memory(GiB)": 369.42, "step": 35005, "train_speed(iter/s)": 0.200461 }, { "acc": 0.75489292, "epoch": 0.8881278538812786, "grad_norm": 2.40625, "learning_rate": 6.322332616897341e-06, "loss": 0.95922737, "memory(GiB)": 369.42, "step": 35010, "train_speed(iter/s)": 0.200466 }, { "acc": 0.75163908, "epoch": 0.8882546930492136, "grad_norm": 2.328125, "learning_rate": 6.321321300530685e-06, "loss": 0.9412466, "memory(GiB)": 369.42, "step": 35015, "train_speed(iter/s)": 0.200471 }, { "acc": 0.76699648, "epoch": 0.8883815322171487, "grad_norm": 2.453125, "learning_rate": 6.320309926046421e-06, "loss": 0.95883102, "memory(GiB)": 369.42, "step": 35020, "train_speed(iter/s)": 0.200473 }, { "acc": 0.73128529, "epoch": 0.8885083713850838, "grad_norm": 2.171875, "learning_rate": 6.319298493489032e-06, "loss": 1.07038975, "memory(GiB)": 369.42, "step": 35025, "train_speed(iter/s)": 0.20048 }, { "acc": 0.74402118, "epoch": 0.8886352105530188, "grad_norm": 2.390625, "learning_rate": 6.318287002903004e-06, "loss": 1.0075963, "memory(GiB)": 369.42, "step": 35030, "train_speed(iter/s)": 0.200483 }, { "acc": 0.73123493, "epoch": 0.8887620497209539, "grad_norm": 2.421875, "learning_rate": 6.317275454332829e-06, "loss": 1.01824274, "memory(GiB)": 369.42, "step": 35035, "train_speed(iter/s)": 0.200489 }, { "acc": 0.74644775, "epoch": 0.8888888888888888, "grad_norm": 2.46875, "learning_rate": 6.3162638478229965e-06, "loss": 1.03601065, "memory(GiB)": 369.42, "step": 35040, "train_speed(iter/s)": 0.200495 }, { "acc": 0.74905672, "epoch": 0.8890157280568239, "grad_norm": 2.21875, "learning_rate": 6.315252183418005e-06, "loss": 0.99055729, "memory(GiB)": 369.42, "step": 35045, "train_speed(iter/s)": 0.200499 }, { "acc": 0.7426744, "epoch": 0.889142567224759, "grad_norm": 1.9375, "learning_rate": 6.31424046116235e-06, "loss": 1.10174065, "memory(GiB)": 369.42, "step": 35050, "train_speed(iter/s)": 0.200504 }, { "acc": 0.74050531, "epoch": 0.889269406392694, "grad_norm": 2.125, "learning_rate": 6.313228681100532e-06, "loss": 1.01281319, "memory(GiB)": 369.42, "step": 35055, "train_speed(iter/s)": 0.200509 }, { "acc": 0.75125794, "epoch": 0.8893962455606291, "grad_norm": 2.109375, "learning_rate": 6.312216843277052e-06, "loss": 0.98678045, "memory(GiB)": 369.42, "step": 35060, "train_speed(iter/s)": 0.200514 }, { "acc": 0.7180584, "epoch": 0.8895230847285642, "grad_norm": 2.140625, "learning_rate": 6.3112049477364165e-06, "loss": 1.12358284, "memory(GiB)": 369.42, "step": 35065, "train_speed(iter/s)": 0.200518 }, { "acc": 0.74459772, "epoch": 0.8896499238964992, "grad_norm": 2.515625, "learning_rate": 6.310192994523137e-06, "loss": 1.0554224, "memory(GiB)": 369.42, "step": 35070, "train_speed(iter/s)": 0.200523 }, { "acc": 0.74536753, "epoch": 0.8897767630644343, "grad_norm": 2.09375, "learning_rate": 6.309180983681716e-06, "loss": 1.0206954, "memory(GiB)": 369.42, "step": 35075, "train_speed(iter/s)": 0.200526 }, { "acc": 0.74002905, "epoch": 0.8899036022323693, "grad_norm": 2.125, "learning_rate": 6.308168915256671e-06, "loss": 1.05146675, "memory(GiB)": 369.42, "step": 35080, "train_speed(iter/s)": 0.200531 }, { "acc": 0.73920259, "epoch": 0.8900304414003044, "grad_norm": 2.28125, "learning_rate": 6.307156789292518e-06, "loss": 1.07091789, "memory(GiB)": 369.42, "step": 35085, "train_speed(iter/s)": 0.200536 }, { "acc": 0.7537694, "epoch": 0.8901572805682395, "grad_norm": 2.28125, "learning_rate": 6.306144605833773e-06, "loss": 1.00788832, "memory(GiB)": 369.42, "step": 35090, "train_speed(iter/s)": 0.20054 }, { "acc": 0.7329793, "epoch": 0.8902841197361745, "grad_norm": 2.265625, "learning_rate": 6.305132364924955e-06, "loss": 1.0024622, "memory(GiB)": 369.42, "step": 35095, "train_speed(iter/s)": 0.200544 }, { "acc": 0.74541702, "epoch": 0.8904109589041096, "grad_norm": 2.5, "learning_rate": 6.3041200666105905e-06, "loss": 0.9895916, "memory(GiB)": 369.42, "step": 35100, "train_speed(iter/s)": 0.200548 }, { "acc": 0.74499621, "epoch": 0.8905377980720447, "grad_norm": 1.8046875, "learning_rate": 6.303107710935202e-06, "loss": 1.01141415, "memory(GiB)": 369.42, "step": 35105, "train_speed(iter/s)": 0.200553 }, { "acc": 0.73216267, "epoch": 0.8906646372399797, "grad_norm": 2.0, "learning_rate": 6.302095297943319e-06, "loss": 1.00655985, "memory(GiB)": 369.42, "step": 35110, "train_speed(iter/s)": 0.200556 }, { "acc": 0.7444953, "epoch": 0.8907914764079148, "grad_norm": 1.96875, "learning_rate": 6.301082827679472e-06, "loss": 1.01216602, "memory(GiB)": 369.42, "step": 35115, "train_speed(iter/s)": 0.20056 }, { "acc": 0.75714722, "epoch": 0.8909183155758498, "grad_norm": 2.265625, "learning_rate": 6.300070300188192e-06, "loss": 0.96257095, "memory(GiB)": 369.42, "step": 35120, "train_speed(iter/s)": 0.200565 }, { "acc": 0.75893917, "epoch": 0.8910451547437849, "grad_norm": 2.875, "learning_rate": 6.2990577155140164e-06, "loss": 1.0061121, "memory(GiB)": 369.42, "step": 35125, "train_speed(iter/s)": 0.200571 }, { "acc": 0.75208712, "epoch": 0.89117199391172, "grad_norm": 2.140625, "learning_rate": 6.298045073701483e-06, "loss": 0.98638859, "memory(GiB)": 369.42, "step": 35130, "train_speed(iter/s)": 0.200576 }, { "acc": 0.74967232, "epoch": 0.891298833079655, "grad_norm": 2.0, "learning_rate": 6.29703237479513e-06, "loss": 0.97810202, "memory(GiB)": 369.42, "step": 35135, "train_speed(iter/s)": 0.200577 }, { "acc": 0.74636645, "epoch": 0.8914256722475901, "grad_norm": 2.0625, "learning_rate": 6.296019618839505e-06, "loss": 1.04095631, "memory(GiB)": 369.42, "step": 35140, "train_speed(iter/s)": 0.200581 }, { "acc": 0.75949717, "epoch": 0.8915525114155252, "grad_norm": 2.03125, "learning_rate": 6.295006805879149e-06, "loss": 0.98319073, "memory(GiB)": 369.42, "step": 35145, "train_speed(iter/s)": 0.200586 }, { "acc": 0.74382915, "epoch": 0.8916793505834602, "grad_norm": 2.328125, "learning_rate": 6.293993935958613e-06, "loss": 0.95821323, "memory(GiB)": 369.42, "step": 35150, "train_speed(iter/s)": 0.200592 }, { "acc": 0.7539938, "epoch": 0.8918061897513953, "grad_norm": 1.890625, "learning_rate": 6.292981009122445e-06, "loss": 0.99343328, "memory(GiB)": 369.42, "step": 35155, "train_speed(iter/s)": 0.200597 }, { "acc": 0.74242535, "epoch": 0.8919330289193302, "grad_norm": 2.546875, "learning_rate": 6.291968025415202e-06, "loss": 1.00087929, "memory(GiB)": 369.42, "step": 35160, "train_speed(iter/s)": 0.2006 }, { "acc": 0.74844866, "epoch": 0.8920598680872653, "grad_norm": 2.125, "learning_rate": 6.290954984881434e-06, "loss": 0.98848152, "memory(GiB)": 369.42, "step": 35165, "train_speed(iter/s)": 0.200606 }, { "acc": 0.74951744, "epoch": 0.8921867072552004, "grad_norm": 2.4375, "learning_rate": 6.289941887565703e-06, "loss": 1.02924452, "memory(GiB)": 369.42, "step": 35170, "train_speed(iter/s)": 0.200611 }, { "acc": 0.75743036, "epoch": 0.8923135464231354, "grad_norm": 2.234375, "learning_rate": 6.288928733512569e-06, "loss": 0.98823137, "memory(GiB)": 369.42, "step": 35175, "train_speed(iter/s)": 0.200615 }, { "acc": 0.76611342, "epoch": 0.8924403855910705, "grad_norm": 2.234375, "learning_rate": 6.287915522766596e-06, "loss": 0.92409859, "memory(GiB)": 369.42, "step": 35180, "train_speed(iter/s)": 0.20062 }, { "acc": 0.74876275, "epoch": 0.8925672247590056, "grad_norm": 2.28125, "learning_rate": 6.2869022553723465e-06, "loss": 0.96630268, "memory(GiB)": 369.42, "step": 35185, "train_speed(iter/s)": 0.200625 }, { "acc": 0.75567303, "epoch": 0.8926940639269406, "grad_norm": 1.9609375, "learning_rate": 6.285888931374391e-06, "loss": 0.97952223, "memory(GiB)": 369.42, "step": 35190, "train_speed(iter/s)": 0.200631 }, { "acc": 0.74712305, "epoch": 0.8928209030948757, "grad_norm": 2.171875, "learning_rate": 6.284875550817299e-06, "loss": 0.96004162, "memory(GiB)": 369.42, "step": 35195, "train_speed(iter/s)": 0.200636 }, { "acc": 0.74429321, "epoch": 0.8929477422628107, "grad_norm": 2.140625, "learning_rate": 6.2838621137456425e-06, "loss": 1.00339108, "memory(GiB)": 369.42, "step": 35200, "train_speed(iter/s)": 0.20064 }, { "acc": 0.74696531, "epoch": 0.8930745814307458, "grad_norm": 1.71875, "learning_rate": 6.282848620203999e-06, "loss": 0.98292961, "memory(GiB)": 369.42, "step": 35205, "train_speed(iter/s)": 0.200646 }, { "acc": 0.74993029, "epoch": 0.8932014205986809, "grad_norm": 2.046875, "learning_rate": 6.2818350702369466e-06, "loss": 0.98663349, "memory(GiB)": 369.42, "step": 35210, "train_speed(iter/s)": 0.200652 }, { "acc": 0.74714565, "epoch": 0.8933282597666159, "grad_norm": 2.59375, "learning_rate": 6.280821463889063e-06, "loss": 1.01859779, "memory(GiB)": 369.42, "step": 35215, "train_speed(iter/s)": 0.200657 }, { "acc": 0.75202112, "epoch": 0.893455098934551, "grad_norm": 2.09375, "learning_rate": 6.279807801204936e-06, "loss": 0.99328671, "memory(GiB)": 369.42, "step": 35220, "train_speed(iter/s)": 0.200661 }, { "acc": 0.76164541, "epoch": 0.8935819381024861, "grad_norm": 2.015625, "learning_rate": 6.278794082229145e-06, "loss": 0.98150311, "memory(GiB)": 369.42, "step": 35225, "train_speed(iter/s)": 0.200664 }, { "acc": 0.75211396, "epoch": 0.8937087772704211, "grad_norm": 2.359375, "learning_rate": 6.2777803070062825e-06, "loss": 1.026618, "memory(GiB)": 369.42, "step": 35230, "train_speed(iter/s)": 0.200667 }, { "acc": 0.75269709, "epoch": 0.8938356164383562, "grad_norm": 2.015625, "learning_rate": 6.276766475580935e-06, "loss": 0.9563633, "memory(GiB)": 369.42, "step": 35235, "train_speed(iter/s)": 0.200671 }, { "acc": 0.73447704, "epoch": 0.8939624556062912, "grad_norm": 2.34375, "learning_rate": 6.2757525879977e-06, "loss": 1.07543631, "memory(GiB)": 369.42, "step": 35240, "train_speed(iter/s)": 0.200675 }, { "acc": 0.7486989, "epoch": 0.8940892947742263, "grad_norm": 2.34375, "learning_rate": 6.27473864430117e-06, "loss": 1.04554768, "memory(GiB)": 369.42, "step": 35245, "train_speed(iter/s)": 0.200679 }, { "acc": 0.73654633, "epoch": 0.8942161339421614, "grad_norm": 2.25, "learning_rate": 6.273724644535942e-06, "loss": 1.02965879, "memory(GiB)": 369.42, "step": 35250, "train_speed(iter/s)": 0.200682 }, { "acc": 0.76127672, "epoch": 0.8943429731100964, "grad_norm": 1.984375, "learning_rate": 6.272710588746619e-06, "loss": 0.96030197, "memory(GiB)": 369.42, "step": 35255, "train_speed(iter/s)": 0.200683 }, { "acc": 0.74282913, "epoch": 0.8944698122780315, "grad_norm": 2.09375, "learning_rate": 6.271696476977801e-06, "loss": 1.0569931, "memory(GiB)": 369.42, "step": 35260, "train_speed(iter/s)": 0.200687 }, { "acc": 0.75740366, "epoch": 0.8945966514459666, "grad_norm": 2.078125, "learning_rate": 6.270682309274094e-06, "loss": 0.96279011, "memory(GiB)": 369.42, "step": 35265, "train_speed(iter/s)": 0.200689 }, { "acc": 0.74979663, "epoch": 0.8947234906139016, "grad_norm": 2.125, "learning_rate": 6.269668085680106e-06, "loss": 0.99933987, "memory(GiB)": 369.42, "step": 35270, "train_speed(iter/s)": 0.200693 }, { "acc": 0.74337626, "epoch": 0.8948503297818367, "grad_norm": 2.6875, "learning_rate": 6.268653806240448e-06, "loss": 1.04610739, "memory(GiB)": 369.42, "step": 35275, "train_speed(iter/s)": 0.200698 }, { "acc": 0.75605512, "epoch": 0.8949771689497716, "grad_norm": 2.03125, "learning_rate": 6.26763947099973e-06, "loss": 0.98671074, "memory(GiB)": 369.42, "step": 35280, "train_speed(iter/s)": 0.200702 }, { "acc": 0.74304152, "epoch": 0.8951040081177067, "grad_norm": 2.234375, "learning_rate": 6.266625080002569e-06, "loss": 1.07209435, "memory(GiB)": 369.42, "step": 35285, "train_speed(iter/s)": 0.200707 }, { "acc": 0.73363614, "epoch": 0.8952308472856418, "grad_norm": 2.09375, "learning_rate": 6.265610633293582e-06, "loss": 1.03417091, "memory(GiB)": 369.42, "step": 35290, "train_speed(iter/s)": 0.200711 }, { "acc": 0.74143925, "epoch": 0.8953576864535768, "grad_norm": 2.46875, "learning_rate": 6.264596130917389e-06, "loss": 0.99853544, "memory(GiB)": 369.42, "step": 35295, "train_speed(iter/s)": 0.200715 }, { "acc": 0.75888867, "epoch": 0.8954845256215119, "grad_norm": 2.390625, "learning_rate": 6.2635815729186124e-06, "loss": 0.92712536, "memory(GiB)": 369.42, "step": 35300, "train_speed(iter/s)": 0.200719 }, { "acc": 0.73718548, "epoch": 0.895611364789447, "grad_norm": 2.5625, "learning_rate": 6.2625669593418744e-06, "loss": 1.03477097, "memory(GiB)": 369.42, "step": 35305, "train_speed(iter/s)": 0.200723 }, { "acc": 0.74591627, "epoch": 0.895738203957382, "grad_norm": 2.578125, "learning_rate": 6.261552290231807e-06, "loss": 0.97674437, "memory(GiB)": 369.42, "step": 35310, "train_speed(iter/s)": 0.200727 }, { "acc": 0.75917263, "epoch": 0.8958650431253171, "grad_norm": 1.953125, "learning_rate": 6.260537565633037e-06, "loss": 0.93538666, "memory(GiB)": 369.42, "step": 35315, "train_speed(iter/s)": 0.20073 }, { "acc": 0.75236993, "epoch": 0.8959918822932521, "grad_norm": 2.046875, "learning_rate": 6.259522785590197e-06, "loss": 0.99312706, "memory(GiB)": 369.42, "step": 35320, "train_speed(iter/s)": 0.200734 }, { "acc": 0.7374671, "epoch": 0.8961187214611872, "grad_norm": 2.5625, "learning_rate": 6.2585079501479205e-06, "loss": 1.03334341, "memory(GiB)": 369.42, "step": 35325, "train_speed(iter/s)": 0.20074 }, { "acc": 0.7486722, "epoch": 0.8962455606291223, "grad_norm": 2.25, "learning_rate": 6.257493059350848e-06, "loss": 0.96240644, "memory(GiB)": 369.42, "step": 35330, "train_speed(iter/s)": 0.200746 }, { "acc": 0.750208, "epoch": 0.8963723997970573, "grad_norm": 1.8046875, "learning_rate": 6.256478113243613e-06, "loss": 1.02016249, "memory(GiB)": 369.42, "step": 35335, "train_speed(iter/s)": 0.200751 }, { "acc": 0.74853773, "epoch": 0.8964992389649924, "grad_norm": 2.046875, "learning_rate": 6.255463111870864e-06, "loss": 0.98116999, "memory(GiB)": 369.42, "step": 35340, "train_speed(iter/s)": 0.200755 }, { "acc": 0.75154886, "epoch": 0.8966260781329275, "grad_norm": 2.109375, "learning_rate": 6.25444805527724e-06, "loss": 1.01462498, "memory(GiB)": 369.42, "step": 35345, "train_speed(iter/s)": 0.200759 }, { "acc": 0.73992701, "epoch": 0.8967529173008625, "grad_norm": 2.484375, "learning_rate": 6.253432943507391e-06, "loss": 1.06607838, "memory(GiB)": 369.42, "step": 35350, "train_speed(iter/s)": 0.200765 }, { "acc": 0.74950037, "epoch": 0.8968797564687976, "grad_norm": 2.171875, "learning_rate": 6.252417776605964e-06, "loss": 0.99287567, "memory(GiB)": 369.42, "step": 35355, "train_speed(iter/s)": 0.200769 }, { "acc": 0.75152602, "epoch": 0.8970065956367326, "grad_norm": 2.453125, "learning_rate": 6.251402554617613e-06, "loss": 0.97129383, "memory(GiB)": 369.42, "step": 35360, "train_speed(iter/s)": 0.200772 }, { "acc": 0.73290997, "epoch": 0.8971334348046677, "grad_norm": 2.9375, "learning_rate": 6.2503872775869886e-06, "loss": 1.05639763, "memory(GiB)": 369.42, "step": 35365, "train_speed(iter/s)": 0.200776 }, { "acc": 0.76004267, "epoch": 0.8972602739726028, "grad_norm": 2.1875, "learning_rate": 6.249371945558751e-06, "loss": 0.98392029, "memory(GiB)": 369.42, "step": 35370, "train_speed(iter/s)": 0.20078 }, { "acc": 0.75657821, "epoch": 0.8973871131405378, "grad_norm": 2.25, "learning_rate": 6.248356558577555e-06, "loss": 0.96302128, "memory(GiB)": 369.42, "step": 35375, "train_speed(iter/s)": 0.200784 }, { "acc": 0.7464715, "epoch": 0.8975139523084729, "grad_norm": 2.53125, "learning_rate": 6.247341116688067e-06, "loss": 1.04771481, "memory(GiB)": 369.42, "step": 35380, "train_speed(iter/s)": 0.20079 }, { "acc": 0.75696239, "epoch": 0.897640791476408, "grad_norm": 2.078125, "learning_rate": 6.246325619934945e-06, "loss": 0.97604847, "memory(GiB)": 369.42, "step": 35385, "train_speed(iter/s)": 0.200794 }, { "acc": 0.74920206, "epoch": 0.897767630644343, "grad_norm": 2.578125, "learning_rate": 6.245310068362859e-06, "loss": 0.96291084, "memory(GiB)": 369.42, "step": 35390, "train_speed(iter/s)": 0.200799 }, { "acc": 0.75414023, "epoch": 0.897894469812278, "grad_norm": 2.171875, "learning_rate": 6.244294462016476e-06, "loss": 1.02488689, "memory(GiB)": 369.42, "step": 35395, "train_speed(iter/s)": 0.200804 }, { "acc": 0.76276197, "epoch": 0.898021308980213, "grad_norm": 2.375, "learning_rate": 6.243278800940468e-06, "loss": 0.94974079, "memory(GiB)": 369.42, "step": 35400, "train_speed(iter/s)": 0.200809 }, { "acc": 0.75291367, "epoch": 0.8981481481481481, "grad_norm": 2.359375, "learning_rate": 6.242263085179506e-06, "loss": 0.99449215, "memory(GiB)": 369.42, "step": 35405, "train_speed(iter/s)": 0.200814 }, { "acc": 0.74359641, "epoch": 0.8982749873160832, "grad_norm": 1.875, "learning_rate": 6.241247314778269e-06, "loss": 1.01548023, "memory(GiB)": 369.42, "step": 35410, "train_speed(iter/s)": 0.20082 }, { "acc": 0.74048128, "epoch": 0.8984018264840182, "grad_norm": 1.96875, "learning_rate": 6.240231489781432e-06, "loss": 1.06315069, "memory(GiB)": 369.42, "step": 35415, "train_speed(iter/s)": 0.200825 }, { "acc": 0.74159021, "epoch": 0.8985286656519533, "grad_norm": 2.328125, "learning_rate": 6.239215610233678e-06, "loss": 1.01344643, "memory(GiB)": 369.42, "step": 35420, "train_speed(iter/s)": 0.200831 }, { "acc": 0.75918665, "epoch": 0.8986555048198884, "grad_norm": 2.46875, "learning_rate": 6.238199676179688e-06, "loss": 0.96333342, "memory(GiB)": 369.42, "step": 35425, "train_speed(iter/s)": 0.200838 }, { "acc": 0.75155907, "epoch": 0.8987823439878234, "grad_norm": 2.296875, "learning_rate": 6.2371836876641475e-06, "loss": 0.96354427, "memory(GiB)": 369.42, "step": 35430, "train_speed(iter/s)": 0.200841 }, { "acc": 0.7440773, "epoch": 0.8989091831557585, "grad_norm": 2.25, "learning_rate": 6.236167644731745e-06, "loss": 1.02124081, "memory(GiB)": 369.42, "step": 35435, "train_speed(iter/s)": 0.200843 }, { "acc": 0.74358654, "epoch": 0.8990360223236935, "grad_norm": 2.21875, "learning_rate": 6.235151547427172e-06, "loss": 1.00324402, "memory(GiB)": 369.42, "step": 35440, "train_speed(iter/s)": 0.200848 }, { "acc": 0.75012259, "epoch": 0.8991628614916286, "grad_norm": 2.109375, "learning_rate": 6.2341353957951165e-06, "loss": 1.00318336, "memory(GiB)": 369.42, "step": 35445, "train_speed(iter/s)": 0.200851 }, { "acc": 0.72210426, "epoch": 0.8992897006595637, "grad_norm": 2.546875, "learning_rate": 6.233119189880279e-06, "loss": 1.04035301, "memory(GiB)": 369.42, "step": 35450, "train_speed(iter/s)": 0.200856 }, { "acc": 0.73547554, "epoch": 0.8994165398274987, "grad_norm": 2.15625, "learning_rate": 6.232102929727353e-06, "loss": 1.02821274, "memory(GiB)": 369.42, "step": 35455, "train_speed(iter/s)": 0.200862 }, { "acc": 0.7387888, "epoch": 0.8995433789954338, "grad_norm": 1.921875, "learning_rate": 6.231086615381039e-06, "loss": 1.05731544, "memory(GiB)": 369.42, "step": 35460, "train_speed(iter/s)": 0.200865 }, { "acc": 0.75989332, "epoch": 0.8996702181633689, "grad_norm": 2.515625, "learning_rate": 6.2300702468860385e-06, "loss": 1.01718979, "memory(GiB)": 369.42, "step": 35465, "train_speed(iter/s)": 0.20087 }, { "acc": 0.75173969, "epoch": 0.8997970573313039, "grad_norm": 2.265625, "learning_rate": 6.229053824287058e-06, "loss": 0.95340433, "memory(GiB)": 369.42, "step": 35470, "train_speed(iter/s)": 0.200875 }, { "acc": 0.73375168, "epoch": 0.899923896499239, "grad_norm": 2.5625, "learning_rate": 6.228037347628803e-06, "loss": 1.03073368, "memory(GiB)": 369.42, "step": 35475, "train_speed(iter/s)": 0.200879 }, { "acc": 0.75695734, "epoch": 0.900050735667174, "grad_norm": 1.984375, "learning_rate": 6.227020816955982e-06, "loss": 0.97424412, "memory(GiB)": 369.42, "step": 35480, "train_speed(iter/s)": 0.200882 }, { "acc": 0.74968204, "epoch": 0.9001775748351091, "grad_norm": 2.03125, "learning_rate": 6.226004232313308e-06, "loss": 1.00677605, "memory(GiB)": 369.42, "step": 35485, "train_speed(iter/s)": 0.200887 }, { "acc": 0.74677935, "epoch": 0.9003044140030442, "grad_norm": 1.984375, "learning_rate": 6.224987593745493e-06, "loss": 1.00261717, "memory(GiB)": 369.42, "step": 35490, "train_speed(iter/s)": 0.200891 }, { "acc": 0.7440155, "epoch": 0.9004312531709792, "grad_norm": 2.46875, "learning_rate": 6.223970901297255e-06, "loss": 0.99013824, "memory(GiB)": 369.42, "step": 35495, "train_speed(iter/s)": 0.200894 }, { "acc": 0.75223532, "epoch": 0.9005580923389143, "grad_norm": 2.46875, "learning_rate": 6.222954155013312e-06, "loss": 1.00599918, "memory(GiB)": 369.42, "step": 35500, "train_speed(iter/s)": 0.2009 }, { "acc": 0.75874524, "epoch": 0.9006849315068494, "grad_norm": 2.15625, "learning_rate": 6.221937354938386e-06, "loss": 0.97395077, "memory(GiB)": 369.42, "step": 35505, "train_speed(iter/s)": 0.200905 }, { "acc": 0.75019417, "epoch": 0.9008117706747844, "grad_norm": 2.109375, "learning_rate": 6.2209205011171995e-06, "loss": 1.00932083, "memory(GiB)": 369.42, "step": 35510, "train_speed(iter/s)": 0.200908 }, { "acc": 0.73753152, "epoch": 0.9009386098427195, "grad_norm": 2.234375, "learning_rate": 6.219903593594476e-06, "loss": 1.0106226, "memory(GiB)": 369.42, "step": 35515, "train_speed(iter/s)": 0.200914 }, { "acc": 0.76232719, "epoch": 0.9010654490106544, "grad_norm": 2.421875, "learning_rate": 6.218886632414949e-06, "loss": 0.94424686, "memory(GiB)": 369.42, "step": 35520, "train_speed(iter/s)": 0.200918 }, { "acc": 0.74368935, "epoch": 0.9011922881785895, "grad_norm": 2.109375, "learning_rate": 6.217869617623343e-06, "loss": 1.03404903, "memory(GiB)": 369.42, "step": 35525, "train_speed(iter/s)": 0.200921 }, { "acc": 0.76037278, "epoch": 0.9013191273465246, "grad_norm": 2.125, "learning_rate": 6.216852549264396e-06, "loss": 0.97968826, "memory(GiB)": 369.42, "step": 35530, "train_speed(iter/s)": 0.200927 }, { "acc": 0.75585241, "epoch": 0.9014459665144596, "grad_norm": 2.140625, "learning_rate": 6.215835427382842e-06, "loss": 0.97840185, "memory(GiB)": 369.42, "step": 35535, "train_speed(iter/s)": 0.20093 }, { "acc": 0.74757185, "epoch": 0.9015728056823947, "grad_norm": 1.96875, "learning_rate": 6.214818252023415e-06, "loss": 1.01554394, "memory(GiB)": 369.42, "step": 35540, "train_speed(iter/s)": 0.200934 }, { "acc": 0.75653763, "epoch": 0.9016996448503298, "grad_norm": 1.9296875, "learning_rate": 6.2138010232308585e-06, "loss": 0.95194244, "memory(GiB)": 369.42, "step": 35545, "train_speed(iter/s)": 0.200938 }, { "acc": 0.75780678, "epoch": 0.9018264840182648, "grad_norm": 2.078125, "learning_rate": 6.212783741049915e-06, "loss": 0.98805752, "memory(GiB)": 369.42, "step": 35550, "train_speed(iter/s)": 0.200941 }, { "acc": 0.75013566, "epoch": 0.9019533231861999, "grad_norm": 1.671875, "learning_rate": 6.211766405525326e-06, "loss": 0.95587292, "memory(GiB)": 369.42, "step": 35555, "train_speed(iter/s)": 0.200945 }, { "acc": 0.74500151, "epoch": 0.9020801623541349, "grad_norm": 2.34375, "learning_rate": 6.210749016701842e-06, "loss": 1.03236189, "memory(GiB)": 369.42, "step": 35560, "train_speed(iter/s)": 0.20095 }, { "acc": 0.73634911, "epoch": 0.90220700152207, "grad_norm": 2.53125, "learning_rate": 6.2097315746242095e-06, "loss": 1.02661343, "memory(GiB)": 369.42, "step": 35565, "train_speed(iter/s)": 0.200953 }, { "acc": 0.72801962, "epoch": 0.9023338406900051, "grad_norm": 2.0625, "learning_rate": 6.208714079337181e-06, "loss": 1.06368427, "memory(GiB)": 369.42, "step": 35570, "train_speed(iter/s)": 0.200955 }, { "acc": 0.74344554, "epoch": 0.9024606798579401, "grad_norm": 1.953125, "learning_rate": 6.207696530885511e-06, "loss": 1.01683617, "memory(GiB)": 369.42, "step": 35575, "train_speed(iter/s)": 0.20096 }, { "acc": 0.74518719, "epoch": 0.9025875190258752, "grad_norm": 2.28125, "learning_rate": 6.2066789293139565e-06, "loss": 1.02127457, "memory(GiB)": 369.42, "step": 35580, "train_speed(iter/s)": 0.200962 }, { "acc": 0.72742453, "epoch": 0.9027143581938103, "grad_norm": 2.28125, "learning_rate": 6.2056612746672736e-06, "loss": 1.07243252, "memory(GiB)": 369.42, "step": 35585, "train_speed(iter/s)": 0.200965 }, { "acc": 0.75251141, "epoch": 0.9028411973617453, "grad_norm": 2.03125, "learning_rate": 6.204643566990227e-06, "loss": 1.02055616, "memory(GiB)": 369.42, "step": 35590, "train_speed(iter/s)": 0.200971 }, { "acc": 0.74583726, "epoch": 0.9029680365296804, "grad_norm": 2.171875, "learning_rate": 6.2036258063275764e-06, "loss": 0.97483358, "memory(GiB)": 369.42, "step": 35595, "train_speed(iter/s)": 0.200975 }, { "acc": 0.74600997, "epoch": 0.9030948756976154, "grad_norm": 2.625, "learning_rate": 6.20260799272409e-06, "loss": 0.98388662, "memory(GiB)": 369.42, "step": 35600, "train_speed(iter/s)": 0.200978 }, { "acc": 0.75525923, "epoch": 0.9032217148655505, "grad_norm": 1.9453125, "learning_rate": 6.201590126224534e-06, "loss": 1.03906784, "memory(GiB)": 369.42, "step": 35605, "train_speed(iter/s)": 0.200981 }, { "acc": 0.75539198, "epoch": 0.9033485540334856, "grad_norm": 1.90625, "learning_rate": 6.20057220687368e-06, "loss": 0.96340828, "memory(GiB)": 369.42, "step": 35610, "train_speed(iter/s)": 0.200977 }, { "acc": 0.74119534, "epoch": 0.9034753932014206, "grad_norm": 1.890625, "learning_rate": 6.199554234716301e-06, "loss": 0.99964848, "memory(GiB)": 369.42, "step": 35615, "train_speed(iter/s)": 0.200981 }, { "acc": 0.73955035, "epoch": 0.9036022323693557, "grad_norm": 2.328125, "learning_rate": 6.19853620979717e-06, "loss": 0.97248096, "memory(GiB)": 369.42, "step": 35620, "train_speed(iter/s)": 0.200986 }, { "acc": 0.75213122, "epoch": 0.9037290715372908, "grad_norm": 2.28125, "learning_rate": 6.1975181321610655e-06, "loss": 0.99681005, "memory(GiB)": 369.42, "step": 35625, "train_speed(iter/s)": 0.200992 }, { "acc": 0.73658962, "epoch": 0.9038559107052258, "grad_norm": 2.34375, "learning_rate": 6.1965000018527676e-06, "loss": 1.05325089, "memory(GiB)": 369.42, "step": 35630, "train_speed(iter/s)": 0.200995 }, { "acc": 0.744666, "epoch": 0.9039827498731609, "grad_norm": 2.1875, "learning_rate": 6.195481818917057e-06, "loss": 1.01024437, "memory(GiB)": 369.42, "step": 35635, "train_speed(iter/s)": 0.201 }, { "acc": 0.74570518, "epoch": 0.9041095890410958, "grad_norm": 2.1875, "learning_rate": 6.194463583398719e-06, "loss": 1.02342072, "memory(GiB)": 369.42, "step": 35640, "train_speed(iter/s)": 0.201005 }, { "acc": 0.74450836, "epoch": 0.9042364282090309, "grad_norm": 2.125, "learning_rate": 6.193445295342538e-06, "loss": 0.96478329, "memory(GiB)": 369.42, "step": 35645, "train_speed(iter/s)": 0.20101 }, { "acc": 0.73192077, "epoch": 0.904363267376966, "grad_norm": 1.875, "learning_rate": 6.192426954793308e-06, "loss": 1.02930946, "memory(GiB)": 369.42, "step": 35650, "train_speed(iter/s)": 0.201015 }, { "acc": 0.74939036, "epoch": 0.904490106544901, "grad_norm": 2.078125, "learning_rate": 6.1914085617958135e-06, "loss": 0.99963512, "memory(GiB)": 369.42, "step": 35655, "train_speed(iter/s)": 0.20102 }, { "acc": 0.75257726, "epoch": 0.9046169457128361, "grad_norm": 2.171875, "learning_rate": 6.190390116394853e-06, "loss": 0.99255486, "memory(GiB)": 369.42, "step": 35660, "train_speed(iter/s)": 0.201014 }, { "acc": 0.73613729, "epoch": 0.9047437848807712, "grad_norm": 2.40625, "learning_rate": 6.189371618635219e-06, "loss": 1.01956978, "memory(GiB)": 369.42, "step": 35665, "train_speed(iter/s)": 0.201021 }, { "acc": 0.74009619, "epoch": 0.9048706240487062, "grad_norm": 2.015625, "learning_rate": 6.188353068561714e-06, "loss": 0.97410412, "memory(GiB)": 369.42, "step": 35670, "train_speed(iter/s)": 0.201024 }, { "acc": 0.75719843, "epoch": 0.9049974632166413, "grad_norm": 2.265625, "learning_rate": 6.187334466219133e-06, "loss": 0.95625315, "memory(GiB)": 369.42, "step": 35675, "train_speed(iter/s)": 0.20103 }, { "acc": 0.75348115, "epoch": 0.9051243023845763, "grad_norm": 2.25, "learning_rate": 6.18631581165228e-06, "loss": 0.95180798, "memory(GiB)": 369.42, "step": 35680, "train_speed(iter/s)": 0.201035 }, { "acc": 0.73903971, "epoch": 0.9052511415525114, "grad_norm": 2.34375, "learning_rate": 6.185297104905963e-06, "loss": 1.01587124, "memory(GiB)": 369.42, "step": 35685, "train_speed(iter/s)": 0.201039 }, { "acc": 0.73750577, "epoch": 0.9053779807204465, "grad_norm": 2.234375, "learning_rate": 6.184278346024988e-06, "loss": 1.04245033, "memory(GiB)": 369.42, "step": 35690, "train_speed(iter/s)": 0.201045 }, { "acc": 0.74863162, "epoch": 0.9055048198883815, "grad_norm": 2.0625, "learning_rate": 6.183259535054163e-06, "loss": 0.97567139, "memory(GiB)": 369.42, "step": 35695, "train_speed(iter/s)": 0.201049 }, { "acc": 0.73727741, "epoch": 0.9056316590563166, "grad_norm": 1.9609375, "learning_rate": 6.1822406720383e-06, "loss": 1.01729984, "memory(GiB)": 369.42, "step": 35700, "train_speed(iter/s)": 0.201054 }, { "acc": 0.74771576, "epoch": 0.9057584982242517, "grad_norm": 1.9765625, "learning_rate": 6.181221757022215e-06, "loss": 1.02584209, "memory(GiB)": 369.42, "step": 35705, "train_speed(iter/s)": 0.201057 }, { "acc": 0.74325895, "epoch": 0.9058853373921867, "grad_norm": 2.359375, "learning_rate": 6.180202790050724e-06, "loss": 1.00756226, "memory(GiB)": 369.42, "step": 35710, "train_speed(iter/s)": 0.20106 }, { "acc": 0.75027866, "epoch": 0.9060121765601218, "grad_norm": 2.140625, "learning_rate": 6.179183771168643e-06, "loss": 1.01426811, "memory(GiB)": 369.42, "step": 35715, "train_speed(iter/s)": 0.201063 }, { "acc": 0.7414155, "epoch": 0.9061390157280568, "grad_norm": 2.21875, "learning_rate": 6.1781647004207965e-06, "loss": 1.02792902, "memory(GiB)": 369.42, "step": 35720, "train_speed(iter/s)": 0.201065 }, { "acc": 0.73727093, "epoch": 0.9062658548959919, "grad_norm": 2.328125, "learning_rate": 6.177145577852005e-06, "loss": 1.04489775, "memory(GiB)": 369.42, "step": 35725, "train_speed(iter/s)": 0.201067 }, { "acc": 0.73593674, "epoch": 0.906392694063927, "grad_norm": 2.21875, "learning_rate": 6.176126403507097e-06, "loss": 1.03549709, "memory(GiB)": 369.42, "step": 35730, "train_speed(iter/s)": 0.201072 }, { "acc": 0.7513196, "epoch": 0.906519533231862, "grad_norm": 2.09375, "learning_rate": 6.175107177430897e-06, "loss": 0.98420858, "memory(GiB)": 369.42, "step": 35735, "train_speed(iter/s)": 0.201078 }, { "acc": 0.74667401, "epoch": 0.9066463723997971, "grad_norm": 2.109375, "learning_rate": 6.17408789966824e-06, "loss": 0.97658253, "memory(GiB)": 369.42, "step": 35740, "train_speed(iter/s)": 0.201082 }, { "acc": 0.75366058, "epoch": 0.9067732115677322, "grad_norm": 1.9765625, "learning_rate": 6.173068570263951e-06, "loss": 0.9744442, "memory(GiB)": 369.42, "step": 35745, "train_speed(iter/s)": 0.201085 }, { "acc": 0.74477448, "epoch": 0.9069000507356672, "grad_norm": 2.234375, "learning_rate": 6.172049189262872e-06, "loss": 1.0089241, "memory(GiB)": 369.42, "step": 35750, "train_speed(iter/s)": 0.20109 }, { "acc": 0.74853005, "epoch": 0.9070268899036023, "grad_norm": 2.0, "learning_rate": 6.1710297567098354e-06, "loss": 1.03022842, "memory(GiB)": 369.42, "step": 35755, "train_speed(iter/s)": 0.201094 }, { "acc": 0.74833345, "epoch": 0.9071537290715372, "grad_norm": 2.53125, "learning_rate": 6.170010272649682e-06, "loss": 1.02150955, "memory(GiB)": 369.42, "step": 35760, "train_speed(iter/s)": 0.201097 }, { "acc": 0.73523283, "epoch": 0.9072805682394723, "grad_norm": 2.40625, "learning_rate": 6.168990737127254e-06, "loss": 1.02877388, "memory(GiB)": 369.42, "step": 35765, "train_speed(iter/s)": 0.201101 }, { "acc": 0.74375544, "epoch": 0.9074074074074074, "grad_norm": 2.28125, "learning_rate": 6.167971150187394e-06, "loss": 0.99110174, "memory(GiB)": 369.42, "step": 35770, "train_speed(iter/s)": 0.201104 }, { "acc": 0.73459015, "epoch": 0.9075342465753424, "grad_norm": 1.9296875, "learning_rate": 6.166951511874948e-06, "loss": 1.06666107, "memory(GiB)": 369.42, "step": 35775, "train_speed(iter/s)": 0.201109 }, { "acc": 0.74519291, "epoch": 0.9076610857432775, "grad_norm": 2.625, "learning_rate": 6.165931822234764e-06, "loss": 0.98364506, "memory(GiB)": 369.42, "step": 35780, "train_speed(iter/s)": 0.201114 }, { "acc": 0.74624529, "epoch": 0.9077879249112126, "grad_norm": 2.765625, "learning_rate": 6.164912081311694e-06, "loss": 1.06670895, "memory(GiB)": 369.42, "step": 35785, "train_speed(iter/s)": 0.201119 }, { "acc": 0.75108399, "epoch": 0.9079147640791476, "grad_norm": 2.171875, "learning_rate": 6.163892289150588e-06, "loss": 0.93702564, "memory(GiB)": 369.42, "step": 35790, "train_speed(iter/s)": 0.201124 }, { "acc": 0.75950041, "epoch": 0.9080416032470827, "grad_norm": 2.453125, "learning_rate": 6.162872445796303e-06, "loss": 0.99026089, "memory(GiB)": 369.42, "step": 35795, "train_speed(iter/s)": 0.201129 }, { "acc": 0.73852415, "epoch": 0.9081684424150177, "grad_norm": 1.9296875, "learning_rate": 6.161852551293697e-06, "loss": 1.03814907, "memory(GiB)": 369.42, "step": 35800, "train_speed(iter/s)": 0.201133 }, { "acc": 0.74283371, "epoch": 0.9082952815829528, "grad_norm": 2.640625, "learning_rate": 6.160832605687628e-06, "loss": 1.04042711, "memory(GiB)": 369.42, "step": 35805, "train_speed(iter/s)": 0.201137 }, { "acc": 0.74868059, "epoch": 0.9084221207508879, "grad_norm": 2.25, "learning_rate": 6.159812609022961e-06, "loss": 1.0090229, "memory(GiB)": 369.42, "step": 35810, "train_speed(iter/s)": 0.201142 }, { "acc": 0.74006448, "epoch": 0.9085489599188229, "grad_norm": 1.8125, "learning_rate": 6.158792561344553e-06, "loss": 1.04006948, "memory(GiB)": 369.42, "step": 35815, "train_speed(iter/s)": 0.201145 }, { "acc": 0.74260454, "epoch": 0.908675799086758, "grad_norm": 2.40625, "learning_rate": 6.157772462697277e-06, "loss": 1.02530785, "memory(GiB)": 369.42, "step": 35820, "train_speed(iter/s)": 0.201148 }, { "acc": 0.75032163, "epoch": 0.9088026382546931, "grad_norm": 2.09375, "learning_rate": 6.156752313125998e-06, "loss": 1.01850548, "memory(GiB)": 369.42, "step": 35825, "train_speed(iter/s)": 0.201154 }, { "acc": 0.75267105, "epoch": 0.9089294774226281, "grad_norm": 2.5625, "learning_rate": 6.155732112675587e-06, "loss": 1.02819672, "memory(GiB)": 369.42, "step": 35830, "train_speed(iter/s)": 0.201158 }, { "acc": 0.76697383, "epoch": 0.9090563165905632, "grad_norm": 2.21875, "learning_rate": 6.154711861390919e-06, "loss": 0.91718559, "memory(GiB)": 369.42, "step": 35835, "train_speed(iter/s)": 0.201161 }, { "acc": 0.74790134, "epoch": 0.9091831557584982, "grad_norm": 1.953125, "learning_rate": 6.153691559316868e-06, "loss": 1.01970673, "memory(GiB)": 369.42, "step": 35840, "train_speed(iter/s)": 0.201165 }, { "acc": 0.73363724, "epoch": 0.9093099949264333, "grad_norm": 2.25, "learning_rate": 6.152671206498311e-06, "loss": 1.08495369, "memory(GiB)": 369.42, "step": 35845, "train_speed(iter/s)": 0.20117 }, { "acc": 0.75835361, "epoch": 0.9094368340943684, "grad_norm": 2.21875, "learning_rate": 6.151650802980128e-06, "loss": 0.95053711, "memory(GiB)": 369.42, "step": 35850, "train_speed(iter/s)": 0.201173 }, { "acc": 0.740341, "epoch": 0.9095636732623034, "grad_norm": 2.3125, "learning_rate": 6.150630348807201e-06, "loss": 1.08384113, "memory(GiB)": 369.42, "step": 35855, "train_speed(iter/s)": 0.201177 }, { "acc": 0.73188848, "epoch": 0.9096905124302385, "grad_norm": 2.75, "learning_rate": 6.149609844024413e-06, "loss": 1.0059166, "memory(GiB)": 369.42, "step": 35860, "train_speed(iter/s)": 0.201183 }, { "acc": 0.74710579, "epoch": 0.9098173515981736, "grad_norm": 2.015625, "learning_rate": 6.148589288676652e-06, "loss": 1.08330641, "memory(GiB)": 369.42, "step": 35865, "train_speed(iter/s)": 0.201189 }, { "acc": 0.75524726, "epoch": 0.9099441907661086, "grad_norm": 2.53125, "learning_rate": 6.147568682808808e-06, "loss": 0.98324242, "memory(GiB)": 369.42, "step": 35870, "train_speed(iter/s)": 0.201195 }, { "acc": 0.75225306, "epoch": 0.9100710299340437, "grad_norm": 2.34375, "learning_rate": 6.146548026465766e-06, "loss": 1.04543476, "memory(GiB)": 369.42, "step": 35875, "train_speed(iter/s)": 0.201199 }, { "acc": 0.75399504, "epoch": 0.9101978691019786, "grad_norm": 2.265625, "learning_rate": 6.145527319692427e-06, "loss": 0.94385071, "memory(GiB)": 369.42, "step": 35880, "train_speed(iter/s)": 0.201203 }, { "acc": 0.74291854, "epoch": 0.9103247082699137, "grad_norm": 2.140625, "learning_rate": 6.144506562533678e-06, "loss": 0.97069702, "memory(GiB)": 369.42, "step": 35885, "train_speed(iter/s)": 0.201209 }, { "acc": 0.75502043, "epoch": 0.9104515474378488, "grad_norm": 2.25, "learning_rate": 6.143485755034425e-06, "loss": 0.97616558, "memory(GiB)": 369.42, "step": 35890, "train_speed(iter/s)": 0.201213 }, { "acc": 0.7428452, "epoch": 0.9105783866057838, "grad_norm": 2.546875, "learning_rate": 6.14246489723956e-06, "loss": 0.99810371, "memory(GiB)": 369.42, "step": 35895, "train_speed(iter/s)": 0.201217 }, { "acc": 0.74670296, "epoch": 0.9107052257737189, "grad_norm": 2.609375, "learning_rate": 6.141443989193988e-06, "loss": 1.00854988, "memory(GiB)": 369.42, "step": 35900, "train_speed(iter/s)": 0.201223 }, { "acc": 0.73926125, "epoch": 0.910832064941654, "grad_norm": 2.90625, "learning_rate": 6.140423030942615e-06, "loss": 1.09345608, "memory(GiB)": 369.42, "step": 35905, "train_speed(iter/s)": 0.201226 }, { "acc": 0.75342398, "epoch": 0.910958904109589, "grad_norm": 1.9765625, "learning_rate": 6.139402022530344e-06, "loss": 0.99206734, "memory(GiB)": 369.42, "step": 35910, "train_speed(iter/s)": 0.201231 }, { "acc": 0.73800173, "epoch": 0.9110857432775241, "grad_norm": 2.234375, "learning_rate": 6.138380964002087e-06, "loss": 1.01606293, "memory(GiB)": 369.42, "step": 35915, "train_speed(iter/s)": 0.201236 }, { "acc": 0.76088376, "epoch": 0.9112125824454591, "grad_norm": 2.5625, "learning_rate": 6.13735985540275e-06, "loss": 0.95225563, "memory(GiB)": 369.42, "step": 35920, "train_speed(iter/s)": 0.20124 }, { "acc": 0.74670172, "epoch": 0.9113394216133942, "grad_norm": 2.359375, "learning_rate": 6.13633869677725e-06, "loss": 0.96914158, "memory(GiB)": 369.42, "step": 35925, "train_speed(iter/s)": 0.201241 }, { "acc": 0.74276714, "epoch": 0.9114662607813293, "grad_norm": 1.9296875, "learning_rate": 6.1353174881705e-06, "loss": 1.00719671, "memory(GiB)": 369.42, "step": 35930, "train_speed(iter/s)": 0.201243 }, { "acc": 0.73291903, "epoch": 0.9115930999492643, "grad_norm": 2.125, "learning_rate": 6.134296229627419e-06, "loss": 1.05866947, "memory(GiB)": 369.42, "step": 35935, "train_speed(iter/s)": 0.201246 }, { "acc": 0.75052061, "epoch": 0.9117199391171994, "grad_norm": 2.109375, "learning_rate": 6.1332749211929255e-06, "loss": 1.02316723, "memory(GiB)": 369.42, "step": 35940, "train_speed(iter/s)": 0.20125 }, { "acc": 0.76793013, "epoch": 0.9118467782851345, "grad_norm": 2.265625, "learning_rate": 6.132253562911941e-06, "loss": 0.94332561, "memory(GiB)": 369.42, "step": 35945, "train_speed(iter/s)": 0.201255 }, { "acc": 0.75753336, "epoch": 0.9119736174530695, "grad_norm": 2.203125, "learning_rate": 6.1312321548293895e-06, "loss": 0.98031921, "memory(GiB)": 369.42, "step": 35950, "train_speed(iter/s)": 0.201261 }, { "acc": 0.74189577, "epoch": 0.9121004566210046, "grad_norm": 2.109375, "learning_rate": 6.130210696990197e-06, "loss": 1.09110031, "memory(GiB)": 369.42, "step": 35955, "train_speed(iter/s)": 0.201263 }, { "acc": 0.74602327, "epoch": 0.9122272957889396, "grad_norm": 2.53125, "learning_rate": 6.129189189439293e-06, "loss": 1.05897427, "memory(GiB)": 369.42, "step": 35960, "train_speed(iter/s)": 0.201267 }, { "acc": 0.74144812, "epoch": 0.9123541349568747, "grad_norm": 2.46875, "learning_rate": 6.128167632221605e-06, "loss": 1.04916344, "memory(GiB)": 369.42, "step": 35965, "train_speed(iter/s)": 0.201272 }, { "acc": 0.75043716, "epoch": 0.9124809741248098, "grad_norm": 2.078125, "learning_rate": 6.127146025382069e-06, "loss": 0.95858688, "memory(GiB)": 369.42, "step": 35970, "train_speed(iter/s)": 0.201274 }, { "acc": 0.73988848, "epoch": 0.9126078132927448, "grad_norm": 2.375, "learning_rate": 6.126124368965619e-06, "loss": 1.07963028, "memory(GiB)": 369.42, "step": 35975, "train_speed(iter/s)": 0.20128 }, { "acc": 0.72567997, "epoch": 0.9127346524606799, "grad_norm": 2.296875, "learning_rate": 6.125102663017191e-06, "loss": 1.13071117, "memory(GiB)": 369.42, "step": 35980, "train_speed(iter/s)": 0.201284 }, { "acc": 0.75330296, "epoch": 0.912861491628615, "grad_norm": 2.078125, "learning_rate": 6.124080907581724e-06, "loss": 0.99356842, "memory(GiB)": 369.42, "step": 35985, "train_speed(iter/s)": 0.201287 }, { "acc": 0.75294223, "epoch": 0.91298833079655, "grad_norm": 2.03125, "learning_rate": 6.1230591027041605e-06, "loss": 0.99017429, "memory(GiB)": 369.42, "step": 35990, "train_speed(iter/s)": 0.201288 }, { "acc": 0.75784912, "epoch": 0.913115169964485, "grad_norm": 1.96875, "learning_rate": 6.1220372484294444e-06, "loss": 0.94009151, "memory(GiB)": 369.42, "step": 35995, "train_speed(iter/s)": 0.201293 }, { "acc": 0.74208746, "epoch": 0.91324200913242, "grad_norm": 2.28125, "learning_rate": 6.12101534480252e-06, "loss": 0.99688911, "memory(GiB)": 369.42, "step": 36000, "train_speed(iter/s)": 0.201297 }, { "epoch": 0.91324200913242, "eval_acc": 0.7374779471081521, "eval_loss": 0.9712907671928406, "eval_runtime": 384.907, "eval_samples_per_second": 16.549, "eval_steps_per_second": 8.275, "step": 36000 }, { "acc": 0.74623337, "epoch": 0.9133688483003551, "grad_norm": 2.359375, "learning_rate": 6.119993391868335e-06, "loss": 0.97935734, "memory(GiB)": 369.42, "step": 36005, "train_speed(iter/s)": 0.200501 }, { "acc": 0.75936918, "epoch": 0.9134956874682902, "grad_norm": 2.09375, "learning_rate": 6.118971389671842e-06, "loss": 0.98962669, "memory(GiB)": 369.42, "step": 36010, "train_speed(iter/s)": 0.200507 }, { "acc": 0.74065123, "epoch": 0.9136225266362252, "grad_norm": 1.90625, "learning_rate": 6.117949338257989e-06, "loss": 1.02356558, "memory(GiB)": 369.42, "step": 36015, "train_speed(iter/s)": 0.20051 }, { "acc": 0.75472775, "epoch": 0.9137493658041603, "grad_norm": 2.28125, "learning_rate": 6.116927237671735e-06, "loss": 0.97744808, "memory(GiB)": 369.42, "step": 36020, "train_speed(iter/s)": 0.200514 }, { "acc": 0.74671478, "epoch": 0.9138762049720954, "grad_norm": 2.515625, "learning_rate": 6.115905087958032e-06, "loss": 0.97316265, "memory(GiB)": 369.42, "step": 36025, "train_speed(iter/s)": 0.200519 }, { "acc": 0.74370193, "epoch": 0.9140030441400304, "grad_norm": 2.453125, "learning_rate": 6.114882889161844e-06, "loss": 1.04425087, "memory(GiB)": 369.42, "step": 36030, "train_speed(iter/s)": 0.200522 }, { "acc": 0.75825987, "epoch": 0.9141298833079655, "grad_norm": 2.1875, "learning_rate": 6.113860641328127e-06, "loss": 1.00989866, "memory(GiB)": 369.42, "step": 36035, "train_speed(iter/s)": 0.200528 }, { "acc": 0.74280534, "epoch": 0.9142567224759005, "grad_norm": 2.171875, "learning_rate": 6.112838344501846e-06, "loss": 0.97100201, "memory(GiB)": 369.42, "step": 36040, "train_speed(iter/s)": 0.200533 }, { "acc": 0.74820294, "epoch": 0.9143835616438356, "grad_norm": 2.34375, "learning_rate": 6.111815998727966e-06, "loss": 1.03741989, "memory(GiB)": 369.42, "step": 36045, "train_speed(iter/s)": 0.200538 }, { "acc": 0.75224619, "epoch": 0.9145104008117707, "grad_norm": 2.09375, "learning_rate": 6.110793604051455e-06, "loss": 0.97260761, "memory(GiB)": 369.42, "step": 36050, "train_speed(iter/s)": 0.200541 }, { "acc": 0.74618778, "epoch": 0.9146372399797057, "grad_norm": 2.5625, "learning_rate": 6.109771160517283e-06, "loss": 1.02036514, "memory(GiB)": 369.42, "step": 36055, "train_speed(iter/s)": 0.200544 }, { "acc": 0.74736643, "epoch": 0.9147640791476408, "grad_norm": 2.53125, "learning_rate": 6.108748668170419e-06, "loss": 1.01303825, "memory(GiB)": 369.42, "step": 36060, "train_speed(iter/s)": 0.200547 }, { "acc": 0.75784512, "epoch": 0.9148909183155759, "grad_norm": 1.890625, "learning_rate": 6.1077261270558385e-06, "loss": 0.99531212, "memory(GiB)": 369.42, "step": 36065, "train_speed(iter/s)": 0.200549 }, { "acc": 0.74224815, "epoch": 0.9150177574835109, "grad_norm": 1.796875, "learning_rate": 6.106703537218518e-06, "loss": 1.02883244, "memory(GiB)": 369.42, "step": 36070, "train_speed(iter/s)": 0.200551 }, { "acc": 0.74572954, "epoch": 0.915144596651446, "grad_norm": 1.9609375, "learning_rate": 6.105680898703434e-06, "loss": 1.03288479, "memory(GiB)": 369.42, "step": 36075, "train_speed(iter/s)": 0.200555 }, { "acc": 0.74102545, "epoch": 0.915271435819381, "grad_norm": 1.859375, "learning_rate": 6.104658211555568e-06, "loss": 1.04144917, "memory(GiB)": 369.42, "step": 36080, "train_speed(iter/s)": 0.200557 }, { "acc": 0.74671264, "epoch": 0.9153982749873161, "grad_norm": 2.515625, "learning_rate": 6.103635475819902e-06, "loss": 1.03024244, "memory(GiB)": 369.42, "step": 36085, "train_speed(iter/s)": 0.200561 }, { "acc": 0.75136538, "epoch": 0.9155251141552512, "grad_norm": 1.9921875, "learning_rate": 6.102612691541422e-06, "loss": 1.01090393, "memory(GiB)": 369.42, "step": 36090, "train_speed(iter/s)": 0.200566 }, { "acc": 0.73097973, "epoch": 0.9156519533231862, "grad_norm": 2.15625, "learning_rate": 6.10158985876511e-06, "loss": 1.01113319, "memory(GiB)": 369.42, "step": 36095, "train_speed(iter/s)": 0.200572 }, { "acc": 0.7499248, "epoch": 0.9157787924911213, "grad_norm": 2.21875, "learning_rate": 6.10056697753596e-06, "loss": 0.96933451, "memory(GiB)": 369.42, "step": 36100, "train_speed(iter/s)": 0.200576 }, { "acc": 0.73917131, "epoch": 0.9159056316590564, "grad_norm": 1.921875, "learning_rate": 6.0995440478989595e-06, "loss": 0.98877811, "memory(GiB)": 369.42, "step": 36105, "train_speed(iter/s)": 0.200579 }, { "acc": 0.77884817, "epoch": 0.9160324708269914, "grad_norm": 2.765625, "learning_rate": 6.098521069899104e-06, "loss": 0.84633369, "memory(GiB)": 369.42, "step": 36110, "train_speed(iter/s)": 0.200581 }, { "acc": 0.74275966, "epoch": 0.9161593099949265, "grad_norm": 2.703125, "learning_rate": 6.097498043581385e-06, "loss": 1.07175932, "memory(GiB)": 369.42, "step": 36115, "train_speed(iter/s)": 0.200585 }, { "acc": 0.75756922, "epoch": 0.9162861491628614, "grad_norm": 2.609375, "learning_rate": 6.096474968990804e-06, "loss": 0.97994633, "memory(GiB)": 369.42, "step": 36120, "train_speed(iter/s)": 0.200587 }, { "acc": 0.7463089, "epoch": 0.9164129883307965, "grad_norm": 1.8828125, "learning_rate": 6.095451846172358e-06, "loss": 1.02043724, "memory(GiB)": 369.42, "step": 36125, "train_speed(iter/s)": 0.200592 }, { "acc": 0.74009123, "epoch": 0.9165398274987316, "grad_norm": 2.5, "learning_rate": 6.094428675171049e-06, "loss": 1.025457, "memory(GiB)": 369.42, "step": 36130, "train_speed(iter/s)": 0.200596 }, { "acc": 0.75147653, "epoch": 0.9166666666666666, "grad_norm": 2.46875, "learning_rate": 6.09340545603188e-06, "loss": 1.01730156, "memory(GiB)": 369.42, "step": 36135, "train_speed(iter/s)": 0.200601 }, { "acc": 0.75967464, "epoch": 0.9167935058346017, "grad_norm": 2.09375, "learning_rate": 6.092382188799858e-06, "loss": 0.99925022, "memory(GiB)": 369.42, "step": 36140, "train_speed(iter/s)": 0.200606 }, { "acc": 0.7529294, "epoch": 0.9169203450025368, "grad_norm": 2.109375, "learning_rate": 6.09135887351999e-06, "loss": 0.99021206, "memory(GiB)": 369.42, "step": 36145, "train_speed(iter/s)": 0.200611 }, { "acc": 0.74900351, "epoch": 0.9170471841704718, "grad_norm": 2.953125, "learning_rate": 6.090335510237286e-06, "loss": 0.98866329, "memory(GiB)": 369.42, "step": 36150, "train_speed(iter/s)": 0.200616 }, { "acc": 0.74533482, "epoch": 0.9171740233384069, "grad_norm": 2.046875, "learning_rate": 6.089312098996758e-06, "loss": 0.95936279, "memory(GiB)": 369.42, "step": 36155, "train_speed(iter/s)": 0.200618 }, { "acc": 0.7537993, "epoch": 0.9173008625063419, "grad_norm": 2.34375, "learning_rate": 6.088288639843422e-06, "loss": 0.98832712, "memory(GiB)": 369.42, "step": 36160, "train_speed(iter/s)": 0.200624 }, { "acc": 0.75788088, "epoch": 0.917427701674277, "grad_norm": 2.359375, "learning_rate": 6.08726513282229e-06, "loss": 0.98228073, "memory(GiB)": 369.42, "step": 36165, "train_speed(iter/s)": 0.200626 }, { "acc": 0.74928346, "epoch": 0.9175545408422121, "grad_norm": 2.0625, "learning_rate": 6.0862415779783855e-06, "loss": 0.99353666, "memory(GiB)": 369.42, "step": 36170, "train_speed(iter/s)": 0.200629 }, { "acc": 0.73797121, "epoch": 0.9176813800101471, "grad_norm": 2.453125, "learning_rate": 6.085217975356726e-06, "loss": 1.04557123, "memory(GiB)": 369.42, "step": 36175, "train_speed(iter/s)": 0.200633 }, { "acc": 0.73646336, "epoch": 0.9178082191780822, "grad_norm": 1.953125, "learning_rate": 6.084194325002335e-06, "loss": 1.03323956, "memory(GiB)": 369.42, "step": 36180, "train_speed(iter/s)": 0.200635 }, { "acc": 0.75313158, "epoch": 0.9179350583460173, "grad_norm": 2.15625, "learning_rate": 6.083170626960237e-06, "loss": 0.99668636, "memory(GiB)": 369.42, "step": 36185, "train_speed(iter/s)": 0.200638 }, { "acc": 0.75352859, "epoch": 0.9180618975139523, "grad_norm": 1.8125, "learning_rate": 6.082146881275458e-06, "loss": 0.981213, "memory(GiB)": 369.42, "step": 36190, "train_speed(iter/s)": 0.200642 }, { "acc": 0.74952564, "epoch": 0.9181887366818874, "grad_norm": 2.28125, "learning_rate": 6.081123087993028e-06, "loss": 0.99844494, "memory(GiB)": 369.42, "step": 36195, "train_speed(iter/s)": 0.200646 }, { "acc": 0.73273783, "epoch": 0.9183155758498224, "grad_norm": 2.65625, "learning_rate": 6.0800992471579775e-06, "loss": 1.05190182, "memory(GiB)": 369.42, "step": 36200, "train_speed(iter/s)": 0.200652 }, { "acc": 0.75638189, "epoch": 0.9184424150177575, "grad_norm": 2.015625, "learning_rate": 6.079075358815341e-06, "loss": 0.97152042, "memory(GiB)": 369.42, "step": 36205, "train_speed(iter/s)": 0.200656 }, { "acc": 0.74317913, "epoch": 0.9185692541856926, "grad_norm": 2.171875, "learning_rate": 6.078051423010152e-06, "loss": 1.01550465, "memory(GiB)": 369.42, "step": 36210, "train_speed(iter/s)": 0.20066 }, { "acc": 0.74815845, "epoch": 0.9186960933536276, "grad_norm": 2.234375, "learning_rate": 6.077027439787448e-06, "loss": 1.04541159, "memory(GiB)": 369.42, "step": 36215, "train_speed(iter/s)": 0.200663 }, { "acc": 0.74246683, "epoch": 0.9188229325215627, "grad_norm": 2.03125, "learning_rate": 6.076003409192268e-06, "loss": 1.04252014, "memory(GiB)": 369.42, "step": 36220, "train_speed(iter/s)": 0.200666 }, { "acc": 0.74961367, "epoch": 0.9189497716894978, "grad_norm": 2.265625, "learning_rate": 6.074979331269656e-06, "loss": 1.00882721, "memory(GiB)": 369.42, "step": 36225, "train_speed(iter/s)": 0.200672 }, { "acc": 0.74376345, "epoch": 0.9190766108574328, "grad_norm": 1.9140625, "learning_rate": 6.0739552060646525e-06, "loss": 1.0309597, "memory(GiB)": 369.42, "step": 36230, "train_speed(iter/s)": 0.200677 }, { "acc": 0.75715446, "epoch": 0.9192034500253679, "grad_norm": 2.421875, "learning_rate": 6.0729310336223025e-06, "loss": 0.96752195, "memory(GiB)": 369.42, "step": 36235, "train_speed(iter/s)": 0.200683 }, { "acc": 0.75516844, "epoch": 0.9193302891933028, "grad_norm": 1.8515625, "learning_rate": 6.071906813987658e-06, "loss": 0.96822929, "memory(GiB)": 369.42, "step": 36240, "train_speed(iter/s)": 0.200686 }, { "acc": 0.7523982, "epoch": 0.9194571283612379, "grad_norm": 2.515625, "learning_rate": 6.070882547205764e-06, "loss": 1.02632761, "memory(GiB)": 369.42, "step": 36245, "train_speed(iter/s)": 0.200688 }, { "acc": 0.75672669, "epoch": 0.919583967529173, "grad_norm": 2.09375, "learning_rate": 6.069858233321677e-06, "loss": 0.95373211, "memory(GiB)": 369.42, "step": 36250, "train_speed(iter/s)": 0.200688 }, { "acc": 0.75080805, "epoch": 0.919710806697108, "grad_norm": 2.1875, "learning_rate": 6.068833872380445e-06, "loss": 0.97942801, "memory(GiB)": 369.42, "step": 36255, "train_speed(iter/s)": 0.200692 }, { "acc": 0.73624973, "epoch": 0.9198376458650431, "grad_norm": 2.453125, "learning_rate": 6.067809464427129e-06, "loss": 1.05959206, "memory(GiB)": 369.42, "step": 36260, "train_speed(iter/s)": 0.200697 }, { "acc": 0.75408306, "epoch": 0.9199644850329782, "grad_norm": 2.234375, "learning_rate": 6.066785009506786e-06, "loss": 0.98353329, "memory(GiB)": 369.42, "step": 36265, "train_speed(iter/s)": 0.200702 }, { "acc": 0.75398426, "epoch": 0.9200913242009132, "grad_norm": 2.3125, "learning_rate": 6.065760507664474e-06, "loss": 0.98503656, "memory(GiB)": 369.42, "step": 36270, "train_speed(iter/s)": 0.200705 }, { "acc": 0.749195, "epoch": 0.9202181633688483, "grad_norm": 2.78125, "learning_rate": 6.064735958945258e-06, "loss": 1.01627922, "memory(GiB)": 369.42, "step": 36275, "train_speed(iter/s)": 0.200709 }, { "acc": 0.75874357, "epoch": 0.9203450025367833, "grad_norm": 2.53125, "learning_rate": 6.0637113633942006e-06, "loss": 0.97440681, "memory(GiB)": 369.42, "step": 36280, "train_speed(iter/s)": 0.200712 }, { "acc": 0.72871027, "epoch": 0.9204718417047184, "grad_norm": 2.03125, "learning_rate": 6.0626867210563675e-06, "loss": 1.09589691, "memory(GiB)": 369.42, "step": 36285, "train_speed(iter/s)": 0.200715 }, { "acc": 0.75473142, "epoch": 0.9205986808726535, "grad_norm": 1.9453125, "learning_rate": 6.061662031976828e-06, "loss": 0.9904089, "memory(GiB)": 369.42, "step": 36290, "train_speed(iter/s)": 0.200721 }, { "acc": 0.74829454, "epoch": 0.9207255200405885, "grad_norm": 1.9453125, "learning_rate": 6.0606372962006534e-06, "loss": 0.98745518, "memory(GiB)": 369.42, "step": 36295, "train_speed(iter/s)": 0.200727 }, { "acc": 0.73367114, "epoch": 0.9208523592085236, "grad_norm": 2.078125, "learning_rate": 6.0596125137729145e-06, "loss": 1.03408489, "memory(GiB)": 369.42, "step": 36300, "train_speed(iter/s)": 0.200729 }, { "acc": 0.74813523, "epoch": 0.9209791983764587, "grad_norm": 2.21875, "learning_rate": 6.058587684738685e-06, "loss": 1.03484306, "memory(GiB)": 369.42, "step": 36305, "train_speed(iter/s)": 0.200734 }, { "acc": 0.74823914, "epoch": 0.9211060375443937, "grad_norm": 2.578125, "learning_rate": 6.057562809143045e-06, "loss": 0.97337036, "memory(GiB)": 369.42, "step": 36310, "train_speed(iter/s)": 0.200737 }, { "acc": 0.74490614, "epoch": 0.9212328767123288, "grad_norm": 1.984375, "learning_rate": 6.056537887031069e-06, "loss": 1.01107941, "memory(GiB)": 369.42, "step": 36315, "train_speed(iter/s)": 0.200741 }, { "acc": 0.75256262, "epoch": 0.9213597158802638, "grad_norm": 2.078125, "learning_rate": 6.055512918447841e-06, "loss": 1.01148949, "memory(GiB)": 369.42, "step": 36320, "train_speed(iter/s)": 0.200747 }, { "acc": 0.74042416, "epoch": 0.9214865550481989, "grad_norm": 1.9765625, "learning_rate": 6.054487903438442e-06, "loss": 0.99166574, "memory(GiB)": 369.42, "step": 36325, "train_speed(iter/s)": 0.200751 }, { "acc": 0.75183916, "epoch": 0.921613394216134, "grad_norm": 2.40625, "learning_rate": 6.0534628420479576e-06, "loss": 0.99804106, "memory(GiB)": 369.42, "step": 36330, "train_speed(iter/s)": 0.200755 }, { "acc": 0.74371109, "epoch": 0.921740233384069, "grad_norm": 2.28125, "learning_rate": 6.0524377343214724e-06, "loss": 1.07969475, "memory(GiB)": 369.42, "step": 36335, "train_speed(iter/s)": 0.200758 }, { "acc": 0.73696856, "epoch": 0.9218670725520041, "grad_norm": 2.28125, "learning_rate": 6.051412580304079e-06, "loss": 1.0390049, "memory(GiB)": 369.42, "step": 36340, "train_speed(iter/s)": 0.200763 }, { "acc": 0.74164324, "epoch": 0.9219939117199392, "grad_norm": 1.8359375, "learning_rate": 6.050387380040864e-06, "loss": 1.02633038, "memory(GiB)": 369.42, "step": 36345, "train_speed(iter/s)": 0.200766 }, { "acc": 0.73790836, "epoch": 0.9221207508878742, "grad_norm": 2.078125, "learning_rate": 6.049362133576924e-06, "loss": 1.02238064, "memory(GiB)": 369.42, "step": 36350, "train_speed(iter/s)": 0.200769 }, { "acc": 0.76925993, "epoch": 0.9222475900558093, "grad_norm": 2.328125, "learning_rate": 6.048336840957351e-06, "loss": 0.9643631, "memory(GiB)": 369.42, "step": 36355, "train_speed(iter/s)": 0.200773 }, { "acc": 0.75484328, "epoch": 0.9223744292237442, "grad_norm": 2.0625, "learning_rate": 6.047311502227245e-06, "loss": 1.0178196, "memory(GiB)": 369.42, "step": 36360, "train_speed(iter/s)": 0.200778 }, { "acc": 0.73957748, "epoch": 0.9225012683916793, "grad_norm": 2.21875, "learning_rate": 6.046286117431703e-06, "loss": 1.02655487, "memory(GiB)": 369.42, "step": 36365, "train_speed(iter/s)": 0.20078 }, { "acc": 0.7456934, "epoch": 0.9226281075596144, "grad_norm": 1.7578125, "learning_rate": 6.0452606866158246e-06, "loss": 1.04772997, "memory(GiB)": 369.42, "step": 36370, "train_speed(iter/s)": 0.200783 }, { "acc": 0.75381641, "epoch": 0.9227549467275494, "grad_norm": 2.359375, "learning_rate": 6.044235209824716e-06, "loss": 0.95797062, "memory(GiB)": 369.42, "step": 36375, "train_speed(iter/s)": 0.200788 }, { "acc": 0.73746238, "epoch": 0.9228817858954845, "grad_norm": 2.21875, "learning_rate": 6.04320968710348e-06, "loss": 1.07682934, "memory(GiB)": 369.42, "step": 36380, "train_speed(iter/s)": 0.200791 }, { "acc": 0.73452129, "epoch": 0.9230086250634196, "grad_norm": 2.390625, "learning_rate": 6.042184118497223e-06, "loss": 1.04703922, "memory(GiB)": 369.42, "step": 36385, "train_speed(iter/s)": 0.200796 }, { "acc": 0.75281415, "epoch": 0.9231354642313546, "grad_norm": 2.359375, "learning_rate": 6.0411585040510576e-06, "loss": 0.99169035, "memory(GiB)": 369.42, "step": 36390, "train_speed(iter/s)": 0.2008 }, { "acc": 0.7499403, "epoch": 0.9232623033992897, "grad_norm": 2.03125, "learning_rate": 6.040132843810091e-06, "loss": 0.99736853, "memory(GiB)": 369.42, "step": 36395, "train_speed(iter/s)": 0.200806 }, { "acc": 0.76945314, "epoch": 0.9233891425672247, "grad_norm": 2.40625, "learning_rate": 6.03910713781944e-06, "loss": 0.92595387, "memory(GiB)": 369.42, "step": 36400, "train_speed(iter/s)": 0.200808 }, { "acc": 0.73587699, "epoch": 0.9235159817351598, "grad_norm": 2.390625, "learning_rate": 6.038081386124216e-06, "loss": 1.03063984, "memory(GiB)": 369.42, "step": 36405, "train_speed(iter/s)": 0.200813 }, { "acc": 0.75884323, "epoch": 0.9236428209030949, "grad_norm": 2.15625, "learning_rate": 6.037055588769539e-06, "loss": 0.94596443, "memory(GiB)": 369.42, "step": 36410, "train_speed(iter/s)": 0.200817 }, { "acc": 0.75320172, "epoch": 0.9237696600710299, "grad_norm": 1.921875, "learning_rate": 6.036029745800527e-06, "loss": 0.95515385, "memory(GiB)": 369.42, "step": 36415, "train_speed(iter/s)": 0.200822 }, { "acc": 0.74645772, "epoch": 0.923896499238965, "grad_norm": 2.015625, "learning_rate": 6.0350038572623e-06, "loss": 1.0539072, "memory(GiB)": 369.42, "step": 36420, "train_speed(iter/s)": 0.200826 }, { "acc": 0.73838043, "epoch": 0.9240233384069001, "grad_norm": 2.09375, "learning_rate": 6.033977923199984e-06, "loss": 1.01228781, "memory(GiB)": 369.42, "step": 36425, "train_speed(iter/s)": 0.20083 }, { "acc": 0.75706182, "epoch": 0.9241501775748351, "grad_norm": 2.453125, "learning_rate": 6.032951943658702e-06, "loss": 0.96051311, "memory(GiB)": 369.42, "step": 36430, "train_speed(iter/s)": 0.200828 }, { "acc": 0.74232168, "epoch": 0.9242770167427702, "grad_norm": 2.09375, "learning_rate": 6.031925918683582e-06, "loss": 0.99794502, "memory(GiB)": 369.42, "step": 36435, "train_speed(iter/s)": 0.200831 }, { "acc": 0.74445977, "epoch": 0.9244038559107052, "grad_norm": 2.453125, "learning_rate": 6.030899848319754e-06, "loss": 1.06632833, "memory(GiB)": 369.42, "step": 36440, "train_speed(iter/s)": 0.200832 }, { "acc": 0.73954391, "epoch": 0.9245306950786403, "grad_norm": 2.109375, "learning_rate": 6.029873732612346e-06, "loss": 1.02789268, "memory(GiB)": 369.42, "step": 36445, "train_speed(iter/s)": 0.200836 }, { "acc": 0.7422605, "epoch": 0.9246575342465754, "grad_norm": 2.515625, "learning_rate": 6.028847571606493e-06, "loss": 1.04904232, "memory(GiB)": 369.42, "step": 36450, "train_speed(iter/s)": 0.20084 }, { "acc": 0.73402653, "epoch": 0.9247843734145104, "grad_norm": 2.03125, "learning_rate": 6.0278213653473305e-06, "loss": 1.01476974, "memory(GiB)": 369.42, "step": 36455, "train_speed(iter/s)": 0.200842 }, { "acc": 0.74483385, "epoch": 0.9249112125824455, "grad_norm": 1.9765625, "learning_rate": 6.026795113879998e-06, "loss": 1.01573143, "memory(GiB)": 369.42, "step": 36460, "train_speed(iter/s)": 0.200847 }, { "acc": 0.7464829, "epoch": 0.9250380517503806, "grad_norm": 1.921875, "learning_rate": 6.025768817249629e-06, "loss": 1.03364983, "memory(GiB)": 369.42, "step": 36465, "train_speed(iter/s)": 0.200853 }, { "acc": 0.7289371, "epoch": 0.9251648909183156, "grad_norm": 2.203125, "learning_rate": 6.024742475501369e-06, "loss": 1.05280313, "memory(GiB)": 369.42, "step": 36470, "train_speed(iter/s)": 0.200857 }, { "acc": 0.74810266, "epoch": 0.9252917300862507, "grad_norm": 2.171875, "learning_rate": 6.023716088680359e-06, "loss": 1.02505245, "memory(GiB)": 369.42, "step": 36475, "train_speed(iter/s)": 0.200861 }, { "acc": 0.74720039, "epoch": 0.9254185692541856, "grad_norm": 2.109375, "learning_rate": 6.022689656831746e-06, "loss": 0.93562498, "memory(GiB)": 369.42, "step": 36480, "train_speed(iter/s)": 0.200865 }, { "acc": 0.74765358, "epoch": 0.9255454084221207, "grad_norm": 2.84375, "learning_rate": 6.021663180000675e-06, "loss": 0.96617641, "memory(GiB)": 369.42, "step": 36485, "train_speed(iter/s)": 0.200871 }, { "acc": 0.74643879, "epoch": 0.9256722475900558, "grad_norm": 2.265625, "learning_rate": 6.020636658232297e-06, "loss": 0.9960701, "memory(GiB)": 369.42, "step": 36490, "train_speed(iter/s)": 0.200875 }, { "acc": 0.74542665, "epoch": 0.9257990867579908, "grad_norm": 2.296875, "learning_rate": 6.019610091571762e-06, "loss": 1.01565199, "memory(GiB)": 369.42, "step": 36495, "train_speed(iter/s)": 0.20088 }, { "acc": 0.74319882, "epoch": 0.9259259259259259, "grad_norm": 2.6875, "learning_rate": 6.018583480064222e-06, "loss": 1.03371353, "memory(GiB)": 369.42, "step": 36500, "train_speed(iter/s)": 0.200884 }, { "acc": 0.75948973, "epoch": 0.926052765093861, "grad_norm": 2.046875, "learning_rate": 6.017556823754833e-06, "loss": 0.90842857, "memory(GiB)": 369.42, "step": 36505, "train_speed(iter/s)": 0.200888 }, { "acc": 0.75827336, "epoch": 0.926179604261796, "grad_norm": 2.984375, "learning_rate": 6.016530122688753e-06, "loss": 0.95983582, "memory(GiB)": 369.42, "step": 36510, "train_speed(iter/s)": 0.200891 }, { "acc": 0.75746245, "epoch": 0.9263064434297311, "grad_norm": 2.328125, "learning_rate": 6.015503376911138e-06, "loss": 1.01519108, "memory(GiB)": 369.42, "step": 36515, "train_speed(iter/s)": 0.200896 }, { "acc": 0.75964079, "epoch": 0.9264332825976661, "grad_norm": 1.96875, "learning_rate": 6.0144765864671515e-06, "loss": 0.98966999, "memory(GiB)": 369.42, "step": 36520, "train_speed(iter/s)": 0.2009 }, { "acc": 0.71886892, "epoch": 0.9265601217656012, "grad_norm": 2.140625, "learning_rate": 6.013449751401954e-06, "loss": 1.07583447, "memory(GiB)": 369.42, "step": 36525, "train_speed(iter/s)": 0.200904 }, { "acc": 0.75575094, "epoch": 0.9266869609335363, "grad_norm": 2.4375, "learning_rate": 6.012422871760715e-06, "loss": 0.97232704, "memory(GiB)": 369.42, "step": 36530, "train_speed(iter/s)": 0.200907 }, { "acc": 0.74080348, "epoch": 0.9268138001014713, "grad_norm": 2.171875, "learning_rate": 6.011395947588594e-06, "loss": 1.03758163, "memory(GiB)": 369.42, "step": 36535, "train_speed(iter/s)": 0.200911 }, { "acc": 0.73733606, "epoch": 0.9269406392694064, "grad_norm": 2.03125, "learning_rate": 6.010368978930767e-06, "loss": 1.03328085, "memory(GiB)": 369.42, "step": 36540, "train_speed(iter/s)": 0.200915 }, { "acc": 0.74677067, "epoch": 0.9270674784373415, "grad_norm": 2.203125, "learning_rate": 6.0093419658323995e-06, "loss": 1.06463823, "memory(GiB)": 369.42, "step": 36545, "train_speed(iter/s)": 0.20092 }, { "acc": 0.7397995, "epoch": 0.9271943176052765, "grad_norm": 2.265625, "learning_rate": 6.0083149083386675e-06, "loss": 1.08758316, "memory(GiB)": 369.42, "step": 36550, "train_speed(iter/s)": 0.200925 }, { "acc": 0.7288887, "epoch": 0.9273211567732116, "grad_norm": 2.234375, "learning_rate": 6.007287806494742e-06, "loss": 1.03845215, "memory(GiB)": 369.42, "step": 36555, "train_speed(iter/s)": 0.200929 }, { "acc": 0.75688915, "epoch": 0.9274479959411466, "grad_norm": 2.125, "learning_rate": 6.006260660345802e-06, "loss": 1.01745796, "memory(GiB)": 369.42, "step": 36560, "train_speed(iter/s)": 0.200934 }, { "acc": 0.75191488, "epoch": 0.9275748351090817, "grad_norm": 1.984375, "learning_rate": 6.005233469937027e-06, "loss": 1.02014122, "memory(GiB)": 369.42, "step": 36565, "train_speed(iter/s)": 0.200937 }, { "acc": 0.75871902, "epoch": 0.9277016742770168, "grad_norm": 2.5, "learning_rate": 6.004206235313594e-06, "loss": 0.95264168, "memory(GiB)": 369.42, "step": 36570, "train_speed(iter/s)": 0.200939 }, { "acc": 0.75036821, "epoch": 0.9278285134449518, "grad_norm": 2.015625, "learning_rate": 6.003178956520688e-06, "loss": 0.99047642, "memory(GiB)": 369.42, "step": 36575, "train_speed(iter/s)": 0.200943 }, { "acc": 0.74427185, "epoch": 0.9279553526128869, "grad_norm": 1.8203125, "learning_rate": 6.002151633603493e-06, "loss": 1.03635101, "memory(GiB)": 369.42, "step": 36580, "train_speed(iter/s)": 0.200948 }, { "acc": 0.74963026, "epoch": 0.928082191780822, "grad_norm": 1.9453125, "learning_rate": 6.0011242666071945e-06, "loss": 0.98785114, "memory(GiB)": 369.42, "step": 36585, "train_speed(iter/s)": 0.200946 }, { "acc": 0.73851786, "epoch": 0.928209030948757, "grad_norm": 2.015625, "learning_rate": 6.000096855576982e-06, "loss": 1.00640259, "memory(GiB)": 369.42, "step": 36590, "train_speed(iter/s)": 0.200949 }, { "acc": 0.74767485, "epoch": 0.928335870116692, "grad_norm": 2.125, "learning_rate": 5.999069400558044e-06, "loss": 1.02888184, "memory(GiB)": 369.42, "step": 36595, "train_speed(iter/s)": 0.20095 }, { "acc": 0.75128798, "epoch": 0.928462709284627, "grad_norm": 2.265625, "learning_rate": 5.998041901595573e-06, "loss": 0.97337494, "memory(GiB)": 369.42, "step": 36600, "train_speed(iter/s)": 0.200954 }, { "acc": 0.72980833, "epoch": 0.9285895484525621, "grad_norm": 2.265625, "learning_rate": 5.997014358734763e-06, "loss": 1.0145359, "memory(GiB)": 369.42, "step": 36605, "train_speed(iter/s)": 0.200954 }, { "acc": 0.75133801, "epoch": 0.9287163876204972, "grad_norm": 2.03125, "learning_rate": 5.995986772020811e-06, "loss": 0.97093916, "memory(GiB)": 369.42, "step": 36610, "train_speed(iter/s)": 0.200959 }, { "acc": 0.75227294, "epoch": 0.9288432267884322, "grad_norm": 1.9609375, "learning_rate": 5.994959141498913e-06, "loss": 0.98852253, "memory(GiB)": 369.42, "step": 36615, "train_speed(iter/s)": 0.200964 }, { "acc": 0.74884291, "epoch": 0.9289700659563673, "grad_norm": 1.8359375, "learning_rate": 5.993931467214272e-06, "loss": 1.00101576, "memory(GiB)": 369.42, "step": 36620, "train_speed(iter/s)": 0.200968 }, { "acc": 0.74712362, "epoch": 0.9290969051243024, "grad_norm": 2.0625, "learning_rate": 5.992903749212084e-06, "loss": 1.0136446, "memory(GiB)": 369.42, "step": 36625, "train_speed(iter/s)": 0.200972 }, { "acc": 0.74312363, "epoch": 0.9292237442922374, "grad_norm": 2.34375, "learning_rate": 5.991875987537559e-06, "loss": 1.01839943, "memory(GiB)": 369.42, "step": 36630, "train_speed(iter/s)": 0.200975 }, { "acc": 0.75818434, "epoch": 0.9293505834601725, "grad_norm": 2.109375, "learning_rate": 5.990848182235898e-06, "loss": 0.98427238, "memory(GiB)": 369.42, "step": 36635, "train_speed(iter/s)": 0.200981 }, { "acc": 0.73923321, "epoch": 0.9294774226281075, "grad_norm": 2.796875, "learning_rate": 5.98982033335231e-06, "loss": 1.06112099, "memory(GiB)": 369.42, "step": 36640, "train_speed(iter/s)": 0.200983 }, { "acc": 0.75613942, "epoch": 0.9296042617960426, "grad_norm": 2.09375, "learning_rate": 5.988792440932006e-06, "loss": 0.90842972, "memory(GiB)": 369.42, "step": 36645, "train_speed(iter/s)": 0.200986 }, { "acc": 0.75985203, "epoch": 0.9297311009639777, "grad_norm": 2.0, "learning_rate": 5.987764505020195e-06, "loss": 0.93062744, "memory(GiB)": 369.42, "step": 36650, "train_speed(iter/s)": 0.20099 }, { "acc": 0.73078051, "epoch": 0.9298579401319127, "grad_norm": 2.296875, "learning_rate": 5.986736525662091e-06, "loss": 1.03019295, "memory(GiB)": 369.42, "step": 36655, "train_speed(iter/s)": 0.200995 }, { "acc": 0.74106493, "epoch": 0.9299847792998478, "grad_norm": 1.890625, "learning_rate": 5.985708502902909e-06, "loss": 1.00144987, "memory(GiB)": 369.42, "step": 36660, "train_speed(iter/s)": 0.200996 }, { "acc": 0.75054493, "epoch": 0.9301116184677829, "grad_norm": 2.203125, "learning_rate": 5.984680436787867e-06, "loss": 0.98758955, "memory(GiB)": 369.42, "step": 36665, "train_speed(iter/s)": 0.201 }, { "acc": 0.74103203, "epoch": 0.9302384576357179, "grad_norm": 2.140625, "learning_rate": 5.983652327362182e-06, "loss": 1.0292326, "memory(GiB)": 369.42, "step": 36670, "train_speed(iter/s)": 0.201005 }, { "acc": 0.74930382, "epoch": 0.930365296803653, "grad_norm": 1.984375, "learning_rate": 5.982624174671077e-06, "loss": 1.03539124, "memory(GiB)": 369.42, "step": 36675, "train_speed(iter/s)": 0.201007 }, { "acc": 0.75656104, "epoch": 0.930492135971588, "grad_norm": 2.265625, "learning_rate": 5.981595978759773e-06, "loss": 1.02672768, "memory(GiB)": 369.42, "step": 36680, "train_speed(iter/s)": 0.201013 }, { "acc": 0.74559431, "epoch": 0.9306189751395231, "grad_norm": 1.8515625, "learning_rate": 5.980567739673495e-06, "loss": 1.04534779, "memory(GiB)": 369.42, "step": 36685, "train_speed(iter/s)": 0.201018 }, { "acc": 0.75889683, "epoch": 0.9307458143074582, "grad_norm": 2.140625, "learning_rate": 5.979539457457472e-06, "loss": 0.99516735, "memory(GiB)": 369.42, "step": 36690, "train_speed(iter/s)": 0.201024 }, { "acc": 0.74910059, "epoch": 0.9308726534753932, "grad_norm": 2.40625, "learning_rate": 5.978511132156928e-06, "loss": 1.00670242, "memory(GiB)": 369.42, "step": 36695, "train_speed(iter/s)": 0.201029 }, { "acc": 0.74501233, "epoch": 0.9309994926433283, "grad_norm": 2.5625, "learning_rate": 5.9774827638170965e-06, "loss": 1.03214207, "memory(GiB)": 369.42, "step": 36700, "train_speed(iter/s)": 0.201034 }, { "acc": 0.75741243, "epoch": 0.9311263318112634, "grad_norm": 2.015625, "learning_rate": 5.9764543524832085e-06, "loss": 0.94683914, "memory(GiB)": 369.42, "step": 36705, "train_speed(iter/s)": 0.201037 }, { "acc": 0.75850916, "epoch": 0.9312531709791984, "grad_norm": 2.40625, "learning_rate": 5.975425898200499e-06, "loss": 0.98596153, "memory(GiB)": 369.42, "step": 36710, "train_speed(iter/s)": 0.201042 }, { "acc": 0.73936763, "epoch": 0.9313800101471335, "grad_norm": 2.40625, "learning_rate": 5.974397401014202e-06, "loss": 1.05132484, "memory(GiB)": 369.42, "step": 36715, "train_speed(iter/s)": 0.201046 }, { "acc": 0.74064612, "epoch": 0.9315068493150684, "grad_norm": 1.96875, "learning_rate": 5.973368860969559e-06, "loss": 1.0600605, "memory(GiB)": 369.42, "step": 36720, "train_speed(iter/s)": 0.201049 }, { "acc": 0.74639521, "epoch": 0.9316336884830035, "grad_norm": 2.5625, "learning_rate": 5.972340278111808e-06, "loss": 1.04596977, "memory(GiB)": 369.42, "step": 36725, "train_speed(iter/s)": 0.201051 }, { "acc": 0.75218749, "epoch": 0.9317605276509386, "grad_norm": 1.875, "learning_rate": 5.9713116524861895e-06, "loss": 0.92440357, "memory(GiB)": 369.42, "step": 36730, "train_speed(iter/s)": 0.201052 }, { "acc": 0.74907055, "epoch": 0.9318873668188736, "grad_norm": 2.359375, "learning_rate": 5.970282984137947e-06, "loss": 1.02015095, "memory(GiB)": 369.42, "step": 36735, "train_speed(iter/s)": 0.201051 }, { "acc": 0.75871859, "epoch": 0.9320142059868087, "grad_norm": 1.8359375, "learning_rate": 5.969254273112328e-06, "loss": 0.97813511, "memory(GiB)": 369.42, "step": 36740, "train_speed(iter/s)": 0.201056 }, { "acc": 0.75216346, "epoch": 0.9321410451547438, "grad_norm": 2.34375, "learning_rate": 5.968225519454577e-06, "loss": 1.02589254, "memory(GiB)": 369.42, "step": 36745, "train_speed(iter/s)": 0.201056 }, { "acc": 0.75500669, "epoch": 0.9322678843226788, "grad_norm": 2.1875, "learning_rate": 5.967196723209947e-06, "loss": 0.98240004, "memory(GiB)": 369.42, "step": 36750, "train_speed(iter/s)": 0.201059 }, { "acc": 0.73727541, "epoch": 0.9323947234906139, "grad_norm": 2.65625, "learning_rate": 5.966167884423686e-06, "loss": 1.07449284, "memory(GiB)": 369.42, "step": 36755, "train_speed(iter/s)": 0.201064 }, { "acc": 0.7693717, "epoch": 0.9325215626585489, "grad_norm": 2.140625, "learning_rate": 5.965139003141048e-06, "loss": 0.91734715, "memory(GiB)": 369.42, "step": 36760, "train_speed(iter/s)": 0.201068 }, { "acc": 0.7448926, "epoch": 0.932648401826484, "grad_norm": 2.109375, "learning_rate": 5.964110079407287e-06, "loss": 0.99721146, "memory(GiB)": 369.42, "step": 36765, "train_speed(iter/s)": 0.201074 }, { "acc": 0.74008608, "epoch": 0.9327752409944191, "grad_norm": 2.0625, "learning_rate": 5.9630811132676625e-06, "loss": 1.05438423, "memory(GiB)": 369.42, "step": 36770, "train_speed(iter/s)": 0.201077 }, { "acc": 0.75030613, "epoch": 0.9329020801623541, "grad_norm": 2.125, "learning_rate": 5.962052104767427e-06, "loss": 1.04277163, "memory(GiB)": 369.42, "step": 36775, "train_speed(iter/s)": 0.20108 }, { "acc": 0.7486022, "epoch": 0.9330289193302892, "grad_norm": 2.28125, "learning_rate": 5.961023053951848e-06, "loss": 1.01673155, "memory(GiB)": 369.42, "step": 36780, "train_speed(iter/s)": 0.201082 }, { "acc": 0.74948325, "epoch": 0.9331557584982243, "grad_norm": 2.484375, "learning_rate": 5.9599939608661825e-06, "loss": 1.05870533, "memory(GiB)": 369.42, "step": 36785, "train_speed(iter/s)": 0.201085 }, { "acc": 0.73941851, "epoch": 0.9332825976661593, "grad_norm": 2.0, "learning_rate": 5.9589648255556975e-06, "loss": 1.03100891, "memory(GiB)": 369.42, "step": 36790, "train_speed(iter/s)": 0.201087 }, { "acc": 0.74844871, "epoch": 0.9334094368340944, "grad_norm": 1.96875, "learning_rate": 5.957935648065658e-06, "loss": 0.99354038, "memory(GiB)": 369.42, "step": 36795, "train_speed(iter/s)": 0.201091 }, { "acc": 0.74551592, "epoch": 0.9335362760020294, "grad_norm": 2.78125, "learning_rate": 5.956906428441331e-06, "loss": 1.01270094, "memory(GiB)": 369.42, "step": 36800, "train_speed(iter/s)": 0.201096 }, { "acc": 0.75298295, "epoch": 0.9336631151699645, "grad_norm": 1.84375, "learning_rate": 5.955877166727988e-06, "loss": 0.98041935, "memory(GiB)": 369.42, "step": 36805, "train_speed(iter/s)": 0.201101 }, { "acc": 0.74598341, "epoch": 0.9337899543378996, "grad_norm": 2.484375, "learning_rate": 5.954847862970898e-06, "loss": 1.00962696, "memory(GiB)": 369.42, "step": 36810, "train_speed(iter/s)": 0.201105 }, { "acc": 0.73484292, "epoch": 0.9339167935058346, "grad_norm": 1.96875, "learning_rate": 5.953818517215338e-06, "loss": 1.06799898, "memory(GiB)": 369.42, "step": 36815, "train_speed(iter/s)": 0.201108 }, { "acc": 0.75556989, "epoch": 0.9340436326737697, "grad_norm": 2.0, "learning_rate": 5.95278912950658e-06, "loss": 1.01111317, "memory(GiB)": 369.42, "step": 36820, "train_speed(iter/s)": 0.201113 }, { "acc": 0.76344481, "epoch": 0.9341704718417048, "grad_norm": 1.984375, "learning_rate": 5.9517596998899e-06, "loss": 0.93435583, "memory(GiB)": 369.42, "step": 36825, "train_speed(iter/s)": 0.201118 }, { "acc": 0.75963106, "epoch": 0.9342973110096398, "grad_norm": 2.265625, "learning_rate": 5.9507302284105836e-06, "loss": 0.99561491, "memory(GiB)": 369.42, "step": 36830, "train_speed(iter/s)": 0.201119 }, { "acc": 0.75741887, "epoch": 0.9344241501775749, "grad_norm": 1.984375, "learning_rate": 5.949700715113904e-06, "loss": 1.02969246, "memory(GiB)": 369.42, "step": 36835, "train_speed(iter/s)": 0.201124 }, { "acc": 0.75226684, "epoch": 0.9345509893455098, "grad_norm": 2.484375, "learning_rate": 5.9486711600451484e-06, "loss": 0.96760635, "memory(GiB)": 369.42, "step": 36840, "train_speed(iter/s)": 0.201129 }, { "acc": 0.74994087, "epoch": 0.9346778285134449, "grad_norm": 2.03125, "learning_rate": 5.9476415632495974e-06, "loss": 1.04316273, "memory(GiB)": 369.42, "step": 36845, "train_speed(iter/s)": 0.201133 }, { "acc": 0.73908644, "epoch": 0.93480466768138, "grad_norm": 2.046875, "learning_rate": 5.946611924772542e-06, "loss": 0.98841496, "memory(GiB)": 369.42, "step": 36850, "train_speed(iter/s)": 0.201137 }, { "acc": 0.75614243, "epoch": 0.934931506849315, "grad_norm": 1.9375, "learning_rate": 5.945582244659267e-06, "loss": 0.98938742, "memory(GiB)": 369.42, "step": 36855, "train_speed(iter/s)": 0.201142 }, { "acc": 0.75081115, "epoch": 0.9350583460172501, "grad_norm": 2.21875, "learning_rate": 5.944552522955063e-06, "loss": 1.03150759, "memory(GiB)": 369.42, "step": 36860, "train_speed(iter/s)": 0.201146 }, { "acc": 0.76120534, "epoch": 0.9351851851851852, "grad_norm": 1.6328125, "learning_rate": 5.943522759705221e-06, "loss": 0.98082581, "memory(GiB)": 369.42, "step": 36865, "train_speed(iter/s)": 0.201146 }, { "acc": 0.73623514, "epoch": 0.9353120243531202, "grad_norm": 1.859375, "learning_rate": 5.942492954955037e-06, "loss": 1.072507, "memory(GiB)": 369.42, "step": 36870, "train_speed(iter/s)": 0.201151 }, { "acc": 0.73467722, "epoch": 0.9354388635210553, "grad_norm": 2.125, "learning_rate": 5.941463108749804e-06, "loss": 1.04599571, "memory(GiB)": 369.42, "step": 36875, "train_speed(iter/s)": 0.201155 }, { "acc": 0.7472496, "epoch": 0.9355657026889903, "grad_norm": 2.625, "learning_rate": 5.940433221134821e-06, "loss": 1.01268559, "memory(GiB)": 369.42, "step": 36880, "train_speed(iter/s)": 0.201159 }, { "acc": 0.74866037, "epoch": 0.9356925418569254, "grad_norm": 2.390625, "learning_rate": 5.9394032921553856e-06, "loss": 1.01809692, "memory(GiB)": 369.42, "step": 36885, "train_speed(iter/s)": 0.201161 }, { "acc": 0.7552896, "epoch": 0.9358193810248605, "grad_norm": 2.5, "learning_rate": 5.9383733218568e-06, "loss": 1.03498278, "memory(GiB)": 369.42, "step": 36890, "train_speed(iter/s)": 0.201165 }, { "acc": 0.73866315, "epoch": 0.9359462201927955, "grad_norm": 2.015625, "learning_rate": 5.937343310284365e-06, "loss": 1.02042637, "memory(GiB)": 369.42, "step": 36895, "train_speed(iter/s)": 0.20117 }, { "acc": 0.74222202, "epoch": 0.9360730593607306, "grad_norm": 1.8828125, "learning_rate": 5.936313257483387e-06, "loss": 1.02784977, "memory(GiB)": 369.42, "step": 36900, "train_speed(iter/s)": 0.201176 }, { "acc": 0.74347982, "epoch": 0.9361998985286657, "grad_norm": 1.890625, "learning_rate": 5.935283163499171e-06, "loss": 1.0455554, "memory(GiB)": 369.42, "step": 36905, "train_speed(iter/s)": 0.201179 }, { "acc": 0.7403614, "epoch": 0.9363267376966007, "grad_norm": 2.0, "learning_rate": 5.9342530283770274e-06, "loss": 1.03119144, "memory(GiB)": 369.42, "step": 36910, "train_speed(iter/s)": 0.20118 }, { "acc": 0.75873861, "epoch": 0.9364535768645358, "grad_norm": 1.7421875, "learning_rate": 5.9332228521622615e-06, "loss": 0.93155251, "memory(GiB)": 369.42, "step": 36915, "train_speed(iter/s)": 0.201183 }, { "acc": 0.75847659, "epoch": 0.9365804160324708, "grad_norm": 2.140625, "learning_rate": 5.93219263490019e-06, "loss": 0.99583035, "memory(GiB)": 369.42, "step": 36920, "train_speed(iter/s)": 0.201185 }, { "acc": 0.74889765, "epoch": 0.9367072552004059, "grad_norm": 2.296875, "learning_rate": 5.931162376636123e-06, "loss": 1.07330141, "memory(GiB)": 369.42, "step": 36925, "train_speed(iter/s)": 0.201187 }, { "acc": 0.74363794, "epoch": 0.936834094368341, "grad_norm": 2.421875, "learning_rate": 5.93013207741538e-06, "loss": 0.96563835, "memory(GiB)": 369.42, "step": 36930, "train_speed(iter/s)": 0.201191 }, { "acc": 0.76383944, "epoch": 0.936960933536276, "grad_norm": 2.0, "learning_rate": 5.929101737283274e-06, "loss": 0.94422016, "memory(GiB)": 369.42, "step": 36935, "train_speed(iter/s)": 0.201196 }, { "acc": 0.74487734, "epoch": 0.9370877727042111, "grad_norm": 2.21875, "learning_rate": 5.928071356285126e-06, "loss": 1.00589104, "memory(GiB)": 369.42, "step": 36940, "train_speed(iter/s)": 0.201199 }, { "acc": 0.76201172, "epoch": 0.9372146118721462, "grad_norm": 2.109375, "learning_rate": 5.927040934466255e-06, "loss": 0.96591921, "memory(GiB)": 369.42, "step": 36945, "train_speed(iter/s)": 0.201203 }, { "acc": 0.76059799, "epoch": 0.9373414510400812, "grad_norm": 1.8984375, "learning_rate": 5.926010471871986e-06, "loss": 0.94772186, "memory(GiB)": 369.42, "step": 36950, "train_speed(iter/s)": 0.201208 }, { "acc": 0.75050964, "epoch": 0.9374682902080163, "grad_norm": 1.78125, "learning_rate": 5.924979968547642e-06, "loss": 0.95791092, "memory(GiB)": 369.42, "step": 36955, "train_speed(iter/s)": 0.201212 }, { "acc": 0.72740569, "epoch": 0.9375951293759512, "grad_norm": 2.5625, "learning_rate": 5.9239494245385485e-06, "loss": 1.10173283, "memory(GiB)": 369.42, "step": 36960, "train_speed(iter/s)": 0.201216 }, { "acc": 0.75091238, "epoch": 0.9377219685438863, "grad_norm": 2.296875, "learning_rate": 5.9229188398900325e-06, "loss": 1.00412025, "memory(GiB)": 369.42, "step": 36965, "train_speed(iter/s)": 0.20122 }, { "acc": 0.73376694, "epoch": 0.9378488077118214, "grad_norm": 2.0, "learning_rate": 5.921888214647429e-06, "loss": 1.04405031, "memory(GiB)": 369.42, "step": 36970, "train_speed(iter/s)": 0.201225 }, { "acc": 0.74623127, "epoch": 0.9379756468797564, "grad_norm": 2.15625, "learning_rate": 5.920857548856064e-06, "loss": 0.97724152, "memory(GiB)": 369.42, "step": 36975, "train_speed(iter/s)": 0.201229 }, { "acc": 0.74775009, "epoch": 0.9381024860476915, "grad_norm": 2.3125, "learning_rate": 5.919826842561274e-06, "loss": 0.96952267, "memory(GiB)": 369.42, "step": 36980, "train_speed(iter/s)": 0.201234 }, { "acc": 0.7449326, "epoch": 0.9382293252156266, "grad_norm": 2.15625, "learning_rate": 5.91879609580839e-06, "loss": 1.05479946, "memory(GiB)": 369.42, "step": 36985, "train_speed(iter/s)": 0.201239 }, { "acc": 0.73835154, "epoch": 0.9383561643835616, "grad_norm": 2.390625, "learning_rate": 5.917765308642754e-06, "loss": 1.06917524, "memory(GiB)": 369.42, "step": 36990, "train_speed(iter/s)": 0.201243 }, { "acc": 0.74691105, "epoch": 0.9384830035514967, "grad_norm": 2.15625, "learning_rate": 5.9167344811097014e-06, "loss": 1.04268284, "memory(GiB)": 369.42, "step": 36995, "train_speed(iter/s)": 0.201249 }, { "acc": 0.75620785, "epoch": 0.9386098427194317, "grad_norm": 2.234375, "learning_rate": 5.9157036132545735e-06, "loss": 0.96092873, "memory(GiB)": 369.42, "step": 37000, "train_speed(iter/s)": 0.201253 }, { "epoch": 0.9386098427194317, "eval_acc": 0.7375840600047542, "eval_loss": 0.9711504578590393, "eval_runtime": 385.1392, "eval_samples_per_second": 16.539, "eval_steps_per_second": 8.27, "step": 37000 }, { "acc": 0.75989265, "epoch": 0.9387366818873668, "grad_norm": 2.6875, "learning_rate": 5.914672705122713e-06, "loss": 0.99831219, "memory(GiB)": 369.42, "step": 37005, "train_speed(iter/s)": 0.200479 }, { "acc": 0.75041866, "epoch": 0.9388635210553019, "grad_norm": 2.421875, "learning_rate": 5.9136417567594615e-06, "loss": 0.98178749, "memory(GiB)": 369.42, "step": 37010, "train_speed(iter/s)": 0.200485 }, { "acc": 0.74841614, "epoch": 0.9389903602232369, "grad_norm": 2.3125, "learning_rate": 5.9126107682101675e-06, "loss": 1.03775043, "memory(GiB)": 369.42, "step": 37015, "train_speed(iter/s)": 0.200489 }, { "acc": 0.7297842, "epoch": 0.939117199391172, "grad_norm": 1.6875, "learning_rate": 5.911579739520178e-06, "loss": 1.03870583, "memory(GiB)": 369.42, "step": 37020, "train_speed(iter/s)": 0.200491 }, { "acc": 0.74391651, "epoch": 0.9392440385591071, "grad_norm": 2.3125, "learning_rate": 5.91054867073484e-06, "loss": 1.02883005, "memory(GiB)": 369.42, "step": 37025, "train_speed(iter/s)": 0.200494 }, { "acc": 0.73677979, "epoch": 0.9393708777270421, "grad_norm": 2.328125, "learning_rate": 5.909517561899508e-06, "loss": 1.02303696, "memory(GiB)": 369.42, "step": 37030, "train_speed(iter/s)": 0.200499 }, { "acc": 0.72396746, "epoch": 0.9394977168949772, "grad_norm": 2.28125, "learning_rate": 5.908486413059532e-06, "loss": 1.0341629, "memory(GiB)": 369.42, "step": 37035, "train_speed(iter/s)": 0.200505 }, { "acc": 0.74344707, "epoch": 0.9396245560629122, "grad_norm": 1.7421875, "learning_rate": 5.907455224260268e-06, "loss": 1.00233498, "memory(GiB)": 369.42, "step": 37040, "train_speed(iter/s)": 0.200508 }, { "acc": 0.74937105, "epoch": 0.9397513952308473, "grad_norm": 2.484375, "learning_rate": 5.9064239955470704e-06, "loss": 0.99145937, "memory(GiB)": 369.42, "step": 37045, "train_speed(iter/s)": 0.200514 }, { "acc": 0.72710381, "epoch": 0.9398782343987824, "grad_norm": 2.328125, "learning_rate": 5.9053927269653e-06, "loss": 1.09581165, "memory(GiB)": 369.42, "step": 37050, "train_speed(iter/s)": 0.200518 }, { "acc": 0.76117234, "epoch": 0.9400050735667174, "grad_norm": 2.0625, "learning_rate": 5.904361418560314e-06, "loss": 0.97605095, "memory(GiB)": 369.42, "step": 37055, "train_speed(iter/s)": 0.200523 }, { "acc": 0.75307865, "epoch": 0.9401319127346525, "grad_norm": 2.375, "learning_rate": 5.903330070377477e-06, "loss": 1.03247995, "memory(GiB)": 369.42, "step": 37060, "train_speed(iter/s)": 0.200527 }, { "acc": 0.75436182, "epoch": 0.9402587519025876, "grad_norm": 2.53125, "learning_rate": 5.902298682462147e-06, "loss": 0.94234734, "memory(GiB)": 369.42, "step": 37065, "train_speed(iter/s)": 0.20053 }, { "acc": 0.75307403, "epoch": 0.9403855910705226, "grad_norm": 2.453125, "learning_rate": 5.901267254859695e-06, "loss": 1.011199, "memory(GiB)": 369.42, "step": 37070, "train_speed(iter/s)": 0.200535 }, { "acc": 0.73954177, "epoch": 0.9405124302384577, "grad_norm": 1.8046875, "learning_rate": 5.900235787615485e-06, "loss": 1.07918062, "memory(GiB)": 369.42, "step": 37075, "train_speed(iter/s)": 0.200538 }, { "acc": 0.75106688, "epoch": 0.9406392694063926, "grad_norm": 2.375, "learning_rate": 5.8992042807748866e-06, "loss": 0.98946104, "memory(GiB)": 369.42, "step": 37080, "train_speed(iter/s)": 0.200543 }, { "acc": 0.74502439, "epoch": 0.9407661085743277, "grad_norm": 2.328125, "learning_rate": 5.898172734383267e-06, "loss": 1.02592344, "memory(GiB)": 369.42, "step": 37085, "train_speed(iter/s)": 0.200547 }, { "acc": 0.73382754, "epoch": 0.9408929477422628, "grad_norm": 2.390625, "learning_rate": 5.897141148486003e-06, "loss": 1.04643803, "memory(GiB)": 369.42, "step": 37090, "train_speed(iter/s)": 0.20055 }, { "acc": 0.75271034, "epoch": 0.9410197869101978, "grad_norm": 2.296875, "learning_rate": 5.8961095231284645e-06, "loss": 0.98050785, "memory(GiB)": 369.42, "step": 37095, "train_speed(iter/s)": 0.200554 }, { "acc": 0.74604015, "epoch": 0.9411466260781329, "grad_norm": 1.9921875, "learning_rate": 5.895077858356029e-06, "loss": 1.00798779, "memory(GiB)": 369.42, "step": 37100, "train_speed(iter/s)": 0.20056 }, { "acc": 0.75107822, "epoch": 0.941273465246068, "grad_norm": 2.1875, "learning_rate": 5.8940461542140725e-06, "loss": 0.96893368, "memory(GiB)": 369.42, "step": 37105, "train_speed(iter/s)": 0.200565 }, { "acc": 0.76036868, "epoch": 0.941400304414003, "grad_norm": 2.4375, "learning_rate": 5.893014410747975e-06, "loss": 0.95684509, "memory(GiB)": 369.42, "step": 37110, "train_speed(iter/s)": 0.20057 }, { "acc": 0.75449433, "epoch": 0.9415271435819381, "grad_norm": 2.0625, "learning_rate": 5.891982628003114e-06, "loss": 1.02586327, "memory(GiB)": 369.42, "step": 37115, "train_speed(iter/s)": 0.200575 }, { "acc": 0.74088182, "epoch": 0.9416539827498731, "grad_norm": 2.59375, "learning_rate": 5.890950806024879e-06, "loss": 1.01271935, "memory(GiB)": 369.42, "step": 37120, "train_speed(iter/s)": 0.20058 }, { "acc": 0.74695807, "epoch": 0.9417808219178082, "grad_norm": 2.28125, "learning_rate": 5.889918944858647e-06, "loss": 1.03322773, "memory(GiB)": 369.42, "step": 37125, "train_speed(iter/s)": 0.200583 }, { "acc": 0.75875845, "epoch": 0.9419076610857433, "grad_norm": 2.359375, "learning_rate": 5.888887044549808e-06, "loss": 0.99519005, "memory(GiB)": 369.42, "step": 37130, "train_speed(iter/s)": 0.200587 }, { "acc": 0.73820934, "epoch": 0.9420345002536783, "grad_norm": 2.171875, "learning_rate": 5.887855105143746e-06, "loss": 1.01570377, "memory(GiB)": 369.42, "step": 37135, "train_speed(iter/s)": 0.200592 }, { "acc": 0.75129695, "epoch": 0.9421613394216134, "grad_norm": 1.8515625, "learning_rate": 5.886823126685855e-06, "loss": 0.96275864, "memory(GiB)": 369.42, "step": 37140, "train_speed(iter/s)": 0.200597 }, { "acc": 0.74007225, "epoch": 0.9422881785895485, "grad_norm": 2.421875, "learning_rate": 5.8857911092215214e-06, "loss": 1.0635047, "memory(GiB)": 369.42, "step": 37145, "train_speed(iter/s)": 0.200603 }, { "acc": 0.74548903, "epoch": 0.9424150177574835, "grad_norm": 2.234375, "learning_rate": 5.884759052796142e-06, "loss": 1.05292797, "memory(GiB)": 369.42, "step": 37150, "train_speed(iter/s)": 0.200607 }, { "acc": 0.74989681, "epoch": 0.9425418569254186, "grad_norm": 2.171875, "learning_rate": 5.883726957455108e-06, "loss": 1.00421705, "memory(GiB)": 369.42, "step": 37155, "train_speed(iter/s)": 0.200609 }, { "acc": 0.74817648, "epoch": 0.9426686960933536, "grad_norm": 1.8984375, "learning_rate": 5.8826948232438176e-06, "loss": 1.01277618, "memory(GiB)": 369.42, "step": 37160, "train_speed(iter/s)": 0.200614 }, { "acc": 0.75117464, "epoch": 0.9427955352612887, "grad_norm": 1.9296875, "learning_rate": 5.881662650207667e-06, "loss": 0.99170132, "memory(GiB)": 369.42, "step": 37165, "train_speed(iter/s)": 0.200618 }, { "acc": 0.76313162, "epoch": 0.9429223744292238, "grad_norm": 2.40625, "learning_rate": 5.880630438392057e-06, "loss": 0.97533703, "memory(GiB)": 369.42, "step": 37170, "train_speed(iter/s)": 0.200624 }, { "acc": 0.7517911, "epoch": 0.9430492135971588, "grad_norm": 1.953125, "learning_rate": 5.879598187842389e-06, "loss": 0.97556982, "memory(GiB)": 369.42, "step": 37175, "train_speed(iter/s)": 0.200627 }, { "acc": 0.75153565, "epoch": 0.9431760527650939, "grad_norm": 2.4375, "learning_rate": 5.878565898604066e-06, "loss": 1.02678328, "memory(GiB)": 369.42, "step": 37180, "train_speed(iter/s)": 0.200631 }, { "acc": 0.75556622, "epoch": 0.943302891933029, "grad_norm": 1.859375, "learning_rate": 5.87753357072249e-06, "loss": 0.93017969, "memory(GiB)": 369.42, "step": 37185, "train_speed(iter/s)": 0.200632 }, { "acc": 0.75974112, "epoch": 0.943429731100964, "grad_norm": 2.328125, "learning_rate": 5.876501204243072e-06, "loss": 0.98734465, "memory(GiB)": 369.42, "step": 37190, "train_speed(iter/s)": 0.200638 }, { "acc": 0.76509638, "epoch": 0.943556570268899, "grad_norm": 2.578125, "learning_rate": 5.875468799211217e-06, "loss": 0.96085157, "memory(GiB)": 369.42, "step": 37195, "train_speed(iter/s)": 0.200641 }, { "acc": 0.74560986, "epoch": 0.943683409436834, "grad_norm": 1.8671875, "learning_rate": 5.874436355672337e-06, "loss": 0.96316805, "memory(GiB)": 369.42, "step": 37200, "train_speed(iter/s)": 0.200637 }, { "acc": 0.74267483, "epoch": 0.9438102486047691, "grad_norm": 2.109375, "learning_rate": 5.873403873671839e-06, "loss": 0.98804798, "memory(GiB)": 369.42, "step": 37205, "train_speed(iter/s)": 0.200641 }, { "acc": 0.74371347, "epoch": 0.9439370877727042, "grad_norm": 2.3125, "learning_rate": 5.872371353255142e-06, "loss": 1.03363419, "memory(GiB)": 369.42, "step": 37210, "train_speed(iter/s)": 0.200645 }, { "acc": 0.75986538, "epoch": 0.9440639269406392, "grad_norm": 2.390625, "learning_rate": 5.871338794467656e-06, "loss": 1.00370779, "memory(GiB)": 369.42, "step": 37215, "train_speed(iter/s)": 0.200648 }, { "acc": 0.74118319, "epoch": 0.9441907661085743, "grad_norm": 2.015625, "learning_rate": 5.8703061973548e-06, "loss": 0.97130051, "memory(GiB)": 369.42, "step": 37220, "train_speed(iter/s)": 0.200651 }, { "acc": 0.74960537, "epoch": 0.9443176052765094, "grad_norm": 1.625, "learning_rate": 5.869273561961992e-06, "loss": 0.98285456, "memory(GiB)": 369.42, "step": 37225, "train_speed(iter/s)": 0.200655 }, { "acc": 0.75240374, "epoch": 0.9444444444444444, "grad_norm": 2.09375, "learning_rate": 5.8682408883346535e-06, "loss": 1.01178818, "memory(GiB)": 369.42, "step": 37230, "train_speed(iter/s)": 0.200658 }, { "acc": 0.75071778, "epoch": 0.9445712836123795, "grad_norm": 2.109375, "learning_rate": 5.867208176518202e-06, "loss": 0.98992004, "memory(GiB)": 369.42, "step": 37235, "train_speed(iter/s)": 0.200663 }, { "acc": 0.75315418, "epoch": 0.9446981227803145, "grad_norm": 3.09375, "learning_rate": 5.866175426558064e-06, "loss": 1.00002871, "memory(GiB)": 369.42, "step": 37240, "train_speed(iter/s)": 0.200665 }, { "acc": 0.75080757, "epoch": 0.9448249619482496, "grad_norm": 2.15625, "learning_rate": 5.865142638499664e-06, "loss": 1.08185034, "memory(GiB)": 369.42, "step": 37245, "train_speed(iter/s)": 0.20067 }, { "acc": 0.73172741, "epoch": 0.9449518011161847, "grad_norm": 2.140625, "learning_rate": 5.864109812388426e-06, "loss": 1.07369089, "memory(GiB)": 369.42, "step": 37250, "train_speed(iter/s)": 0.200675 }, { "acc": 0.74760876, "epoch": 0.9450786402841197, "grad_norm": 1.96875, "learning_rate": 5.863076948269782e-06, "loss": 0.98385897, "memory(GiB)": 369.42, "step": 37255, "train_speed(iter/s)": 0.200679 }, { "acc": 0.74965076, "epoch": 0.9452054794520548, "grad_norm": 1.875, "learning_rate": 5.862044046189162e-06, "loss": 1.0152216, "memory(GiB)": 369.42, "step": 37260, "train_speed(iter/s)": 0.200683 }, { "acc": 0.7583951, "epoch": 0.9453323186199899, "grad_norm": 2.1875, "learning_rate": 5.8610111061919924e-06, "loss": 0.98998489, "memory(GiB)": 369.42, "step": 37265, "train_speed(iter/s)": 0.200687 }, { "acc": 0.74533844, "epoch": 0.9454591577879249, "grad_norm": 2.4375, "learning_rate": 5.859978128323713e-06, "loss": 1.02572899, "memory(GiB)": 369.42, "step": 37270, "train_speed(iter/s)": 0.200692 }, { "acc": 0.74952765, "epoch": 0.94558599695586, "grad_norm": 2.171875, "learning_rate": 5.858945112629755e-06, "loss": 0.98366337, "memory(GiB)": 369.42, "step": 37275, "train_speed(iter/s)": 0.200694 }, { "acc": 0.7585319, "epoch": 0.945712836123795, "grad_norm": 2.03125, "learning_rate": 5.857912059155557e-06, "loss": 0.98737392, "memory(GiB)": 369.42, "step": 37280, "train_speed(iter/s)": 0.200695 }, { "acc": 0.75795279, "epoch": 0.9458396752917301, "grad_norm": 1.8828125, "learning_rate": 5.856878967946555e-06, "loss": 0.96594124, "memory(GiB)": 369.42, "step": 37285, "train_speed(iter/s)": 0.2007 }, { "acc": 0.74261241, "epoch": 0.9459665144596652, "grad_norm": 2.375, "learning_rate": 5.855845839048191e-06, "loss": 1.01696653, "memory(GiB)": 369.42, "step": 37290, "train_speed(iter/s)": 0.200704 }, { "acc": 0.74895749, "epoch": 0.9460933536276002, "grad_norm": 1.9921875, "learning_rate": 5.854812672505906e-06, "loss": 1.03925228, "memory(GiB)": 369.42, "step": 37295, "train_speed(iter/s)": 0.200707 }, { "acc": 0.75864916, "epoch": 0.9462201927955353, "grad_norm": 2.390625, "learning_rate": 5.853779468365144e-06, "loss": 0.95378885, "memory(GiB)": 369.42, "step": 37300, "train_speed(iter/s)": 0.200713 }, { "acc": 0.74315701, "epoch": 0.9463470319634704, "grad_norm": 2.0625, "learning_rate": 5.852746226671348e-06, "loss": 1.05740147, "memory(GiB)": 369.42, "step": 37305, "train_speed(iter/s)": 0.200718 }, { "acc": 0.74047365, "epoch": 0.9464738711314054, "grad_norm": 1.9609375, "learning_rate": 5.851712947469966e-06, "loss": 0.99792709, "memory(GiB)": 369.42, "step": 37310, "train_speed(iter/s)": 0.200722 }, { "acc": 0.74009771, "epoch": 0.9466007102993405, "grad_norm": 2.1875, "learning_rate": 5.850679630806446e-06, "loss": 1.0518631, "memory(GiB)": 369.42, "step": 37315, "train_speed(iter/s)": 0.200724 }, { "acc": 0.74936333, "epoch": 0.9467275494672754, "grad_norm": 2.109375, "learning_rate": 5.849646276726237e-06, "loss": 1.02211094, "memory(GiB)": 369.42, "step": 37320, "train_speed(iter/s)": 0.200727 }, { "acc": 0.74748859, "epoch": 0.9468543886352105, "grad_norm": 2.625, "learning_rate": 5.848612885274792e-06, "loss": 1.01691351, "memory(GiB)": 369.42, "step": 37325, "train_speed(iter/s)": 0.200728 }, { "acc": 0.74028273, "epoch": 0.9469812278031456, "grad_norm": 2.125, "learning_rate": 5.847579456497564e-06, "loss": 1.0611474, "memory(GiB)": 369.42, "step": 37330, "train_speed(iter/s)": 0.200726 }, { "acc": 0.74766464, "epoch": 0.9471080669710806, "grad_norm": 2.25, "learning_rate": 5.8465459904400065e-06, "loss": 0.96299591, "memory(GiB)": 369.42, "step": 37335, "train_speed(iter/s)": 0.200729 }, { "acc": 0.75165625, "epoch": 0.9472349061390157, "grad_norm": 2.140625, "learning_rate": 5.845512487147579e-06, "loss": 1.00441418, "memory(GiB)": 369.42, "step": 37340, "train_speed(iter/s)": 0.200731 }, { "acc": 0.74916763, "epoch": 0.9473617453069508, "grad_norm": 2.359375, "learning_rate": 5.844478946665733e-06, "loss": 1.00801048, "memory(GiB)": 369.42, "step": 37345, "train_speed(iter/s)": 0.200736 }, { "acc": 0.75630889, "epoch": 0.9474885844748858, "grad_norm": 1.90625, "learning_rate": 5.843445369039937e-06, "loss": 1.03553352, "memory(GiB)": 369.42, "step": 37350, "train_speed(iter/s)": 0.200739 }, { "acc": 0.7678112, "epoch": 0.9476154236428209, "grad_norm": 2.4375, "learning_rate": 5.842411754315645e-06, "loss": 0.92507858, "memory(GiB)": 369.42, "step": 37355, "train_speed(iter/s)": 0.200743 }, { "acc": 0.74721451, "epoch": 0.9477422628107559, "grad_norm": 2.046875, "learning_rate": 5.841378102538324e-06, "loss": 1.05622616, "memory(GiB)": 369.42, "step": 37360, "train_speed(iter/s)": 0.200746 }, { "acc": 0.75108814, "epoch": 0.947869101978691, "grad_norm": 2.53125, "learning_rate": 5.840344413753438e-06, "loss": 0.97501163, "memory(GiB)": 369.42, "step": 37365, "train_speed(iter/s)": 0.200749 }, { "acc": 0.74525156, "epoch": 0.9479959411466261, "grad_norm": 1.8671875, "learning_rate": 5.8393106880064535e-06, "loss": 0.97895584, "memory(GiB)": 369.42, "step": 37370, "train_speed(iter/s)": 0.200754 }, { "acc": 0.74872351, "epoch": 0.9481227803145611, "grad_norm": 2.125, "learning_rate": 5.838276925342836e-06, "loss": 0.97649651, "memory(GiB)": 369.42, "step": 37375, "train_speed(iter/s)": 0.200759 }, { "acc": 0.74702039, "epoch": 0.9482496194824962, "grad_norm": 2.171875, "learning_rate": 5.837243125808058e-06, "loss": 0.98384209, "memory(GiB)": 369.42, "step": 37380, "train_speed(iter/s)": 0.200762 }, { "acc": 0.73726058, "epoch": 0.9483764586504313, "grad_norm": 2.53125, "learning_rate": 5.8362092894475886e-06, "loss": 1.05861244, "memory(GiB)": 369.42, "step": 37385, "train_speed(iter/s)": 0.200766 }, { "acc": 0.75003929, "epoch": 0.9485032978183663, "grad_norm": 1.953125, "learning_rate": 5.835175416306901e-06, "loss": 0.99760647, "memory(GiB)": 369.42, "step": 37390, "train_speed(iter/s)": 0.200772 }, { "acc": 0.75527787, "epoch": 0.9486301369863014, "grad_norm": 2.421875, "learning_rate": 5.83414150643147e-06, "loss": 0.95485706, "memory(GiB)": 369.42, "step": 37395, "train_speed(iter/s)": 0.200776 }, { "acc": 0.73653283, "epoch": 0.9487569761542364, "grad_norm": 2.125, "learning_rate": 5.833107559866772e-06, "loss": 0.91635332, "memory(GiB)": 369.42, "step": 37400, "train_speed(iter/s)": 0.200778 }, { "acc": 0.74992547, "epoch": 0.9488838153221715, "grad_norm": 2.765625, "learning_rate": 5.832073576658282e-06, "loss": 1.0145525, "memory(GiB)": 369.42, "step": 37405, "train_speed(iter/s)": 0.200782 }, { "acc": 0.74606552, "epoch": 0.9490106544901066, "grad_norm": 2.375, "learning_rate": 5.831039556851485e-06, "loss": 0.98546181, "memory(GiB)": 369.42, "step": 37410, "train_speed(iter/s)": 0.200787 }, { "acc": 0.74073219, "epoch": 0.9491374936580416, "grad_norm": 1.796875, "learning_rate": 5.8300055004918535e-06, "loss": 1.00182552, "memory(GiB)": 369.42, "step": 37415, "train_speed(iter/s)": 0.200789 }, { "acc": 0.73904681, "epoch": 0.9492643328259767, "grad_norm": 2.1875, "learning_rate": 5.828971407624877e-06, "loss": 1.05409145, "memory(GiB)": 369.42, "step": 37420, "train_speed(iter/s)": 0.200791 }, { "acc": 0.74800091, "epoch": 0.9493911719939118, "grad_norm": 2.03125, "learning_rate": 5.827937278296037e-06, "loss": 0.98246498, "memory(GiB)": 369.42, "step": 37425, "train_speed(iter/s)": 0.200794 }, { "acc": 0.76150351, "epoch": 0.9495180111618468, "grad_norm": 1.796875, "learning_rate": 5.826903112550819e-06, "loss": 0.96935072, "memory(GiB)": 369.42, "step": 37430, "train_speed(iter/s)": 0.200798 }, { "acc": 0.75046825, "epoch": 0.9496448503297819, "grad_norm": 2.28125, "learning_rate": 5.825868910434708e-06, "loss": 0.97235031, "memory(GiB)": 369.42, "step": 37435, "train_speed(iter/s)": 0.200801 }, { "acc": 0.76020021, "epoch": 0.9497716894977168, "grad_norm": 1.9140625, "learning_rate": 5.824834671993197e-06, "loss": 0.9725235, "memory(GiB)": 369.42, "step": 37440, "train_speed(iter/s)": 0.200805 }, { "acc": 0.74081435, "epoch": 0.9498985286656519, "grad_norm": 2.375, "learning_rate": 5.823800397271774e-06, "loss": 1.02292805, "memory(GiB)": 369.42, "step": 37445, "train_speed(iter/s)": 0.200809 }, { "acc": 0.74530125, "epoch": 0.950025367833587, "grad_norm": 2.46875, "learning_rate": 5.822766086315932e-06, "loss": 1.02817307, "memory(GiB)": 369.42, "step": 37450, "train_speed(iter/s)": 0.200813 }, { "acc": 0.74809675, "epoch": 0.950152207001522, "grad_norm": 2.171875, "learning_rate": 5.821731739171164e-06, "loss": 1.02063274, "memory(GiB)": 369.42, "step": 37455, "train_speed(iter/s)": 0.200817 }, { "acc": 0.75010266, "epoch": 0.9502790461694571, "grad_norm": 2.21875, "learning_rate": 5.820697355882965e-06, "loss": 1.04657841, "memory(GiB)": 369.42, "step": 37460, "train_speed(iter/s)": 0.20082 }, { "acc": 0.7482914, "epoch": 0.9504058853373922, "grad_norm": 2.515625, "learning_rate": 5.819662936496833e-06, "loss": 1.02164679, "memory(GiB)": 369.42, "step": 37465, "train_speed(iter/s)": 0.200822 }, { "acc": 0.74085641, "epoch": 0.9505327245053272, "grad_norm": 1.859375, "learning_rate": 5.818628481058265e-06, "loss": 0.96304111, "memory(GiB)": 369.42, "step": 37470, "train_speed(iter/s)": 0.200826 }, { "acc": 0.74010363, "epoch": 0.9506595636732623, "grad_norm": 1.984375, "learning_rate": 5.81759398961276e-06, "loss": 1.02875404, "memory(GiB)": 369.42, "step": 37475, "train_speed(iter/s)": 0.20083 }, { "acc": 0.74664631, "epoch": 0.9507864028411973, "grad_norm": 1.984375, "learning_rate": 5.816559462205824e-06, "loss": 0.99451008, "memory(GiB)": 369.42, "step": 37480, "train_speed(iter/s)": 0.200834 }, { "acc": 0.75652604, "epoch": 0.9509132420091324, "grad_norm": 2.296875, "learning_rate": 5.815524898882954e-06, "loss": 0.95654392, "memory(GiB)": 369.42, "step": 37485, "train_speed(iter/s)": 0.200839 }, { "acc": 0.75804715, "epoch": 0.9510400811770675, "grad_norm": 2.25, "learning_rate": 5.8144902996896615e-06, "loss": 0.99264517, "memory(GiB)": 369.42, "step": 37490, "train_speed(iter/s)": 0.200841 }, { "acc": 0.7360775, "epoch": 0.9511669203450025, "grad_norm": 1.9453125, "learning_rate": 5.813455664671446e-06, "loss": 1.06390266, "memory(GiB)": 369.42, "step": 37495, "train_speed(iter/s)": 0.200846 }, { "acc": 0.75780163, "epoch": 0.9512937595129376, "grad_norm": 2.28125, "learning_rate": 5.812420993873819e-06, "loss": 0.93877831, "memory(GiB)": 369.42, "step": 37500, "train_speed(iter/s)": 0.20085 }, { "acc": 0.75262299, "epoch": 0.9514205986808727, "grad_norm": 2.328125, "learning_rate": 5.81138628734229e-06, "loss": 0.99223557, "memory(GiB)": 369.42, "step": 37505, "train_speed(iter/s)": 0.200855 }, { "acc": 0.74836493, "epoch": 0.9515474378488077, "grad_norm": 2.375, "learning_rate": 5.81035154512237e-06, "loss": 0.96311989, "memory(GiB)": 369.42, "step": 37510, "train_speed(iter/s)": 0.200859 }, { "acc": 0.7412046, "epoch": 0.9516742770167428, "grad_norm": 2.21875, "learning_rate": 5.809316767259571e-06, "loss": 1.02052536, "memory(GiB)": 369.42, "step": 37515, "train_speed(iter/s)": 0.200864 }, { "acc": 0.73826866, "epoch": 0.9518011161846778, "grad_norm": 2.125, "learning_rate": 5.808281953799408e-06, "loss": 1.0205924, "memory(GiB)": 369.42, "step": 37520, "train_speed(iter/s)": 0.200868 }, { "acc": 0.73059373, "epoch": 0.9519279553526129, "grad_norm": 2.421875, "learning_rate": 5.807247104787395e-06, "loss": 1.02008123, "memory(GiB)": 369.42, "step": 37525, "train_speed(iter/s)": 0.200874 }, { "acc": 0.7593976, "epoch": 0.952054794520548, "grad_norm": 2.046875, "learning_rate": 5.806212220269049e-06, "loss": 0.98971748, "memory(GiB)": 369.42, "step": 37530, "train_speed(iter/s)": 0.200878 }, { "acc": 0.73627024, "epoch": 0.952181633688483, "grad_norm": 2.203125, "learning_rate": 5.805177300289891e-06, "loss": 1.04207935, "memory(GiB)": 369.42, "step": 37535, "train_speed(iter/s)": 0.200879 }, { "acc": 0.74672251, "epoch": 0.9523084728564181, "grad_norm": 2.15625, "learning_rate": 5.804142344895441e-06, "loss": 1.01753788, "memory(GiB)": 369.42, "step": 37540, "train_speed(iter/s)": 0.200883 }, { "acc": 0.75004663, "epoch": 0.9524353120243532, "grad_norm": 2.265625, "learning_rate": 5.803107354131221e-06, "loss": 0.97397003, "memory(GiB)": 369.42, "step": 37545, "train_speed(iter/s)": 0.200886 }, { "acc": 0.74777541, "epoch": 0.9525621511922882, "grad_norm": 2.09375, "learning_rate": 5.802072328042753e-06, "loss": 1.0169837, "memory(GiB)": 369.42, "step": 37550, "train_speed(iter/s)": 0.200891 }, { "acc": 0.73991308, "epoch": 0.9526889903602233, "grad_norm": 2.0, "learning_rate": 5.8010372666755625e-06, "loss": 1.04882011, "memory(GiB)": 369.42, "step": 37555, "train_speed(iter/s)": 0.200893 }, { "acc": 0.74789352, "epoch": 0.9528158295281582, "grad_norm": 2.375, "learning_rate": 5.800002170075179e-06, "loss": 1.02101469, "memory(GiB)": 369.42, "step": 37560, "train_speed(iter/s)": 0.200895 }, { "acc": 0.76364002, "epoch": 0.9529426686960933, "grad_norm": 2.109375, "learning_rate": 5.798967038287125e-06, "loss": 0.97219067, "memory(GiB)": 369.42, "step": 37565, "train_speed(iter/s)": 0.2009 }, { "acc": 0.75054398, "epoch": 0.9530695078640284, "grad_norm": 2.53125, "learning_rate": 5.797931871356936e-06, "loss": 0.99162111, "memory(GiB)": 369.42, "step": 37570, "train_speed(iter/s)": 0.200905 }, { "acc": 0.74105334, "epoch": 0.9531963470319634, "grad_norm": 2.15625, "learning_rate": 5.796896669330139e-06, "loss": 1.01780033, "memory(GiB)": 369.42, "step": 37575, "train_speed(iter/s)": 0.20091 }, { "acc": 0.73682237, "epoch": 0.9533231861998985, "grad_norm": 2.15625, "learning_rate": 5.79586143225227e-06, "loss": 1.05570593, "memory(GiB)": 369.42, "step": 37580, "train_speed(iter/s)": 0.200913 }, { "acc": 0.74120002, "epoch": 0.9534500253678336, "grad_norm": 2.515625, "learning_rate": 5.79482616016886e-06, "loss": 1.0496788, "memory(GiB)": 369.42, "step": 37585, "train_speed(iter/s)": 0.200918 }, { "acc": 0.74625273, "epoch": 0.9535768645357686, "grad_norm": 2.640625, "learning_rate": 5.793790853125449e-06, "loss": 1.01585121, "memory(GiB)": 369.42, "step": 37590, "train_speed(iter/s)": 0.200921 }, { "acc": 0.74422798, "epoch": 0.9537037037037037, "grad_norm": 2.234375, "learning_rate": 5.792755511167572e-06, "loss": 1.01752453, "memory(GiB)": 369.42, "step": 37595, "train_speed(iter/s)": 0.200926 }, { "acc": 0.75243549, "epoch": 0.9538305428716387, "grad_norm": 2.234375, "learning_rate": 5.7917201343407685e-06, "loss": 0.99740992, "memory(GiB)": 369.42, "step": 37600, "train_speed(iter/s)": 0.20093 }, { "acc": 0.74515605, "epoch": 0.9539573820395738, "grad_norm": 2.5, "learning_rate": 5.790684722690577e-06, "loss": 1.06909924, "memory(GiB)": 369.42, "step": 37605, "train_speed(iter/s)": 0.200932 }, { "acc": 0.74348679, "epoch": 0.9540842212075089, "grad_norm": 2.53125, "learning_rate": 5.789649276262542e-06, "loss": 1.05126915, "memory(GiB)": 369.42, "step": 37610, "train_speed(iter/s)": 0.200934 }, { "acc": 0.7503902, "epoch": 0.9542110603754439, "grad_norm": 2.484375, "learning_rate": 5.788613795102207e-06, "loss": 0.9927207, "memory(GiB)": 369.42, "step": 37615, "train_speed(iter/s)": 0.200939 }, { "acc": 0.75149765, "epoch": 0.954337899543379, "grad_norm": 1.9453125, "learning_rate": 5.787578279255116e-06, "loss": 0.95085917, "memory(GiB)": 369.42, "step": 37620, "train_speed(iter/s)": 0.200942 }, { "acc": 0.76474876, "epoch": 0.9544647387113141, "grad_norm": 2.046875, "learning_rate": 5.786542728766815e-06, "loss": 0.97078981, "memory(GiB)": 369.42, "step": 37625, "train_speed(iter/s)": 0.200946 }, { "acc": 0.7449666, "epoch": 0.9545915778792491, "grad_norm": 3.203125, "learning_rate": 5.785507143682856e-06, "loss": 0.99095011, "memory(GiB)": 369.42, "step": 37630, "train_speed(iter/s)": 0.200949 }, { "acc": 0.73603239, "epoch": 0.9547184170471842, "grad_norm": 2.15625, "learning_rate": 5.784471524048782e-06, "loss": 1.00457048, "memory(GiB)": 369.42, "step": 37635, "train_speed(iter/s)": 0.200954 }, { "acc": 0.74497142, "epoch": 0.9548452562151192, "grad_norm": 2.53125, "learning_rate": 5.783435869910151e-06, "loss": 1.03713932, "memory(GiB)": 369.42, "step": 37640, "train_speed(iter/s)": 0.200957 }, { "acc": 0.74797745, "epoch": 0.9549720953830543, "grad_norm": 1.828125, "learning_rate": 5.782400181312511e-06, "loss": 1.01946144, "memory(GiB)": 369.42, "step": 37645, "train_speed(iter/s)": 0.20096 }, { "acc": 0.75495491, "epoch": 0.9550989345509894, "grad_norm": 2.078125, "learning_rate": 5.781364458301419e-06, "loss": 0.97917738, "memory(GiB)": 369.42, "step": 37650, "train_speed(iter/s)": 0.200961 }, { "acc": 0.75009356, "epoch": 0.9552257737189244, "grad_norm": 1.8828125, "learning_rate": 5.780328700922427e-06, "loss": 1.02013874, "memory(GiB)": 369.42, "step": 37655, "train_speed(iter/s)": 0.200964 }, { "acc": 0.76880188, "epoch": 0.9553526128868595, "grad_norm": 2.09375, "learning_rate": 5.779292909221097e-06, "loss": 0.99469824, "memory(GiB)": 369.42, "step": 37660, "train_speed(iter/s)": 0.200969 }, { "acc": 0.75196247, "epoch": 0.9554794520547946, "grad_norm": 2.0625, "learning_rate": 5.778257083242986e-06, "loss": 1.00549984, "memory(GiB)": 369.42, "step": 37665, "train_speed(iter/s)": 0.200975 }, { "acc": 0.74115372, "epoch": 0.9556062912227296, "grad_norm": 1.828125, "learning_rate": 5.777221223033653e-06, "loss": 1.00207081, "memory(GiB)": 369.42, "step": 37670, "train_speed(iter/s)": 0.200977 }, { "acc": 0.74535656, "epoch": 0.9557331303906647, "grad_norm": 1.8046875, "learning_rate": 5.77618532863866e-06, "loss": 1.00709333, "memory(GiB)": 369.42, "step": 37675, "train_speed(iter/s)": 0.200979 }, { "acc": 0.76230278, "epoch": 0.9558599695585996, "grad_norm": 2.59375, "learning_rate": 5.775149400103572e-06, "loss": 0.91514397, "memory(GiB)": 369.42, "step": 37680, "train_speed(iter/s)": 0.200981 }, { "acc": 0.74397783, "epoch": 0.9559868087265347, "grad_norm": 2.3125, "learning_rate": 5.774113437473953e-06, "loss": 1.00998917, "memory(GiB)": 369.42, "step": 37685, "train_speed(iter/s)": 0.200985 }, { "acc": 0.74506779, "epoch": 0.9561136478944698, "grad_norm": 2.109375, "learning_rate": 5.7730774407953675e-06, "loss": 0.97689476, "memory(GiB)": 369.42, "step": 37690, "train_speed(iter/s)": 0.200987 }, { "acc": 0.76127834, "epoch": 0.9562404870624048, "grad_norm": 2.421875, "learning_rate": 5.772041410113384e-06, "loss": 0.94342785, "memory(GiB)": 369.42, "step": 37695, "train_speed(iter/s)": 0.200991 }, { "acc": 0.74546051, "epoch": 0.9563673262303399, "grad_norm": 2.21875, "learning_rate": 5.771005345473575e-06, "loss": 0.99740982, "memory(GiB)": 369.42, "step": 37700, "train_speed(iter/s)": 0.200997 }, { "acc": 0.75190811, "epoch": 0.956494165398275, "grad_norm": 2.03125, "learning_rate": 5.769969246921505e-06, "loss": 0.97562656, "memory(GiB)": 369.42, "step": 37705, "train_speed(iter/s)": 0.201001 }, { "acc": 0.75081382, "epoch": 0.95662100456621, "grad_norm": 2.078125, "learning_rate": 5.768933114502753e-06, "loss": 1.00157166, "memory(GiB)": 369.42, "step": 37710, "train_speed(iter/s)": 0.201005 }, { "acc": 0.7598443, "epoch": 0.9567478437341451, "grad_norm": 2.515625, "learning_rate": 5.7678969482628875e-06, "loss": 0.96644192, "memory(GiB)": 369.42, "step": 37715, "train_speed(iter/s)": 0.20101 }, { "acc": 0.75198021, "epoch": 0.9568746829020801, "grad_norm": 2.59375, "learning_rate": 5.766860748247488e-06, "loss": 0.95135212, "memory(GiB)": 369.42, "step": 37720, "train_speed(iter/s)": 0.201013 }, { "acc": 0.73290768, "epoch": 0.9570015220700152, "grad_norm": 1.9921875, "learning_rate": 5.765824514502126e-06, "loss": 1.02780304, "memory(GiB)": 369.42, "step": 37725, "train_speed(iter/s)": 0.201018 }, { "acc": 0.76659136, "epoch": 0.9571283612379503, "grad_norm": 3.3125, "learning_rate": 5.7647882470723846e-06, "loss": 0.97965746, "memory(GiB)": 369.42, "step": 37730, "train_speed(iter/s)": 0.201021 }, { "acc": 0.76611824, "epoch": 0.9572552004058853, "grad_norm": 2.734375, "learning_rate": 5.763751946003842e-06, "loss": 0.93519812, "memory(GiB)": 369.42, "step": 37735, "train_speed(iter/s)": 0.201025 }, { "acc": 0.74495602, "epoch": 0.9573820395738204, "grad_norm": 2.03125, "learning_rate": 5.7627156113420775e-06, "loss": 0.97542572, "memory(GiB)": 369.42, "step": 37740, "train_speed(iter/s)": 0.201027 }, { "acc": 0.74834709, "epoch": 0.9575088787417555, "grad_norm": 2.171875, "learning_rate": 5.761679243132677e-06, "loss": 0.98064308, "memory(GiB)": 369.42, "step": 37745, "train_speed(iter/s)": 0.20103 }, { "acc": 0.73476553, "epoch": 0.9576357179096905, "grad_norm": 2.15625, "learning_rate": 5.760642841421222e-06, "loss": 1.03485107, "memory(GiB)": 369.42, "step": 37750, "train_speed(iter/s)": 0.201032 }, { "acc": 0.75866966, "epoch": 0.9577625570776256, "grad_norm": 2.390625, "learning_rate": 5.759606406253299e-06, "loss": 0.95055742, "memory(GiB)": 369.42, "step": 37755, "train_speed(iter/s)": 0.201034 }, { "acc": 0.75235929, "epoch": 0.9578893962455606, "grad_norm": 1.9453125, "learning_rate": 5.758569937674494e-06, "loss": 0.93686371, "memory(GiB)": 369.42, "step": 37760, "train_speed(iter/s)": 0.201036 }, { "acc": 0.75720363, "epoch": 0.9580162354134957, "grad_norm": 2.25, "learning_rate": 5.7575334357303954e-06, "loss": 0.96311855, "memory(GiB)": 369.42, "step": 37765, "train_speed(iter/s)": 0.20104 }, { "acc": 0.75155463, "epoch": 0.9581430745814308, "grad_norm": 2.03125, "learning_rate": 5.756496900466596e-06, "loss": 0.97882357, "memory(GiB)": 369.42, "step": 37770, "train_speed(iter/s)": 0.201045 }, { "acc": 0.74727306, "epoch": 0.9582699137493658, "grad_norm": 2.03125, "learning_rate": 5.755460331928684e-06, "loss": 0.99112091, "memory(GiB)": 369.42, "step": 37775, "train_speed(iter/s)": 0.201049 }, { "acc": 0.74171863, "epoch": 0.9583967529173009, "grad_norm": 2.234375, "learning_rate": 5.754423730162257e-06, "loss": 1.0565506, "memory(GiB)": 369.42, "step": 37780, "train_speed(iter/s)": 0.201051 }, { "acc": 0.75500374, "epoch": 0.958523592085236, "grad_norm": 1.8515625, "learning_rate": 5.753387095212901e-06, "loss": 0.99538116, "memory(GiB)": 369.42, "step": 37785, "train_speed(iter/s)": 0.201056 }, { "acc": 0.75342255, "epoch": 0.958650431253171, "grad_norm": 2.53125, "learning_rate": 5.752350427126221e-06, "loss": 1.02737532, "memory(GiB)": 369.42, "step": 37790, "train_speed(iter/s)": 0.201061 }, { "acc": 0.74977221, "epoch": 0.958777270421106, "grad_norm": 1.8984375, "learning_rate": 5.751313725947808e-06, "loss": 0.95403042, "memory(GiB)": 369.42, "step": 37795, "train_speed(iter/s)": 0.201064 }, { "acc": 0.75785851, "epoch": 0.958904109589041, "grad_norm": 2.3125, "learning_rate": 5.7502769917232635e-06, "loss": 0.96746511, "memory(GiB)": 369.42, "step": 37800, "train_speed(iter/s)": 0.201069 }, { "acc": 0.73868361, "epoch": 0.9590309487569761, "grad_norm": 2.0625, "learning_rate": 5.7492402244981885e-06, "loss": 1.02881031, "memory(GiB)": 369.42, "step": 37805, "train_speed(iter/s)": 0.201073 }, { "acc": 0.75265822, "epoch": 0.9591577879249112, "grad_norm": 2.40625, "learning_rate": 5.748203424318182e-06, "loss": 1.00324974, "memory(GiB)": 369.42, "step": 37810, "train_speed(iter/s)": 0.201078 }, { "acc": 0.76855059, "epoch": 0.9592846270928462, "grad_norm": 2.234375, "learning_rate": 5.747166591228849e-06, "loss": 0.92601986, "memory(GiB)": 369.42, "step": 37815, "train_speed(iter/s)": 0.201081 }, { "acc": 0.74563189, "epoch": 0.9594114662607813, "grad_norm": 2.203125, "learning_rate": 5.746129725275793e-06, "loss": 1.01294088, "memory(GiB)": 369.42, "step": 37820, "train_speed(iter/s)": 0.201086 }, { "acc": 0.74406319, "epoch": 0.9595383054287164, "grad_norm": 2.28125, "learning_rate": 5.74509282650462e-06, "loss": 1.03943768, "memory(GiB)": 369.42, "step": 37825, "train_speed(iter/s)": 0.201091 }, { "acc": 0.73687143, "epoch": 0.9596651445966514, "grad_norm": 2.0625, "learning_rate": 5.744055894960938e-06, "loss": 1.02004404, "memory(GiB)": 369.42, "step": 37830, "train_speed(iter/s)": 0.201095 }, { "acc": 0.73764343, "epoch": 0.9597919837645865, "grad_norm": 2.34375, "learning_rate": 5.743018930690357e-06, "loss": 1.08570499, "memory(GiB)": 369.42, "step": 37835, "train_speed(iter/s)": 0.2011 }, { "acc": 0.74669991, "epoch": 0.9599188229325215, "grad_norm": 2.21875, "learning_rate": 5.7419819337384855e-06, "loss": 1.01933174, "memory(GiB)": 369.42, "step": 37840, "train_speed(iter/s)": 0.201106 }, { "acc": 0.74596596, "epoch": 0.9600456621004566, "grad_norm": 2.40625, "learning_rate": 5.740944904150934e-06, "loss": 0.96598759, "memory(GiB)": 369.42, "step": 37845, "train_speed(iter/s)": 0.201111 }, { "acc": 0.74780588, "epoch": 0.9601725012683917, "grad_norm": 2.234375, "learning_rate": 5.739907841973321e-06, "loss": 0.97832928, "memory(GiB)": 369.42, "step": 37850, "train_speed(iter/s)": 0.201112 }, { "acc": 0.75079794, "epoch": 0.9602993404363267, "grad_norm": 2.40625, "learning_rate": 5.738870747251255e-06, "loss": 1.01357813, "memory(GiB)": 369.42, "step": 37855, "train_speed(iter/s)": 0.201117 }, { "acc": 0.74582129, "epoch": 0.9604261796042618, "grad_norm": 2.25, "learning_rate": 5.737833620030357e-06, "loss": 1.01786127, "memory(GiB)": 369.42, "step": 37860, "train_speed(iter/s)": 0.201122 }, { "acc": 0.77003565, "epoch": 0.9605530187721969, "grad_norm": 1.8828125, "learning_rate": 5.7367964603562385e-06, "loss": 0.92215748, "memory(GiB)": 369.42, "step": 37865, "train_speed(iter/s)": 0.201126 }, { "acc": 0.76031299, "epoch": 0.9606798579401319, "grad_norm": 1.984375, "learning_rate": 5.7357592682745245e-06, "loss": 1.0075983, "memory(GiB)": 369.42, "step": 37870, "train_speed(iter/s)": 0.20113 }, { "acc": 0.75934148, "epoch": 0.960806697108067, "grad_norm": 2.328125, "learning_rate": 5.734722043830833e-06, "loss": 0.93741035, "memory(GiB)": 369.42, "step": 37875, "train_speed(iter/s)": 0.201132 }, { "acc": 0.74970369, "epoch": 0.960933536276002, "grad_norm": 2.34375, "learning_rate": 5.7336847870707855e-06, "loss": 1.00529366, "memory(GiB)": 369.42, "step": 37880, "train_speed(iter/s)": 0.201137 }, { "acc": 0.73950324, "epoch": 0.9610603754439371, "grad_norm": 2.03125, "learning_rate": 5.732647498040006e-06, "loss": 1.03060169, "memory(GiB)": 369.42, "step": 37885, "train_speed(iter/s)": 0.201141 }, { "acc": 0.74418235, "epoch": 0.9611872146118722, "grad_norm": 2.359375, "learning_rate": 5.731610176784118e-06, "loss": 0.97859097, "memory(GiB)": 369.42, "step": 37890, "train_speed(iter/s)": 0.201144 }, { "acc": 0.74793568, "epoch": 0.9613140537798072, "grad_norm": 2.0, "learning_rate": 5.730572823348748e-06, "loss": 0.94158106, "memory(GiB)": 369.42, "step": 37895, "train_speed(iter/s)": 0.201148 }, { "acc": 0.74341202, "epoch": 0.9614408929477423, "grad_norm": 2.328125, "learning_rate": 5.729535437779523e-06, "loss": 1.01735306, "memory(GiB)": 369.42, "step": 37900, "train_speed(iter/s)": 0.20115 }, { "acc": 0.72994566, "epoch": 0.9615677321156774, "grad_norm": 2.046875, "learning_rate": 5.728498020122073e-06, "loss": 1.03571558, "memory(GiB)": 369.42, "step": 37905, "train_speed(iter/s)": 0.201155 }, { "acc": 0.75303478, "epoch": 0.9616945712836124, "grad_norm": 2.28125, "learning_rate": 5.727460570422028e-06, "loss": 0.96269455, "memory(GiB)": 369.42, "step": 37910, "train_speed(iter/s)": 0.201155 }, { "acc": 0.75813551, "epoch": 0.9618214104515475, "grad_norm": 2.3125, "learning_rate": 5.726423088725017e-06, "loss": 0.9922389, "memory(GiB)": 369.42, "step": 37915, "train_speed(iter/s)": 0.201158 }, { "acc": 0.73851318, "epoch": 0.9619482496194824, "grad_norm": 1.7734375, "learning_rate": 5.725385575076677e-06, "loss": 0.98031406, "memory(GiB)": 369.42, "step": 37920, "train_speed(iter/s)": 0.201163 }, { "acc": 0.73849363, "epoch": 0.9620750887874175, "grad_norm": 2.28125, "learning_rate": 5.7243480295226405e-06, "loss": 1.00893345, "memory(GiB)": 369.42, "step": 37925, "train_speed(iter/s)": 0.201168 }, { "acc": 0.75562835, "epoch": 0.9622019279553526, "grad_norm": 2.203125, "learning_rate": 5.723310452108545e-06, "loss": 0.97385702, "memory(GiB)": 369.42, "step": 37930, "train_speed(iter/s)": 0.201174 }, { "acc": 0.73381267, "epoch": 0.9623287671232876, "grad_norm": 2.015625, "learning_rate": 5.722272842880023e-06, "loss": 1.05490189, "memory(GiB)": 369.42, "step": 37935, "train_speed(iter/s)": 0.201178 }, { "acc": 0.75238638, "epoch": 0.9624556062912227, "grad_norm": 1.9921875, "learning_rate": 5.7212352018827215e-06, "loss": 0.99400339, "memory(GiB)": 369.42, "step": 37940, "train_speed(iter/s)": 0.20118 }, { "acc": 0.74894309, "epoch": 0.9625824454591578, "grad_norm": 2.09375, "learning_rate": 5.720197529162272e-06, "loss": 0.98914232, "memory(GiB)": 369.42, "step": 37945, "train_speed(iter/s)": 0.201181 }, { "acc": 0.75805216, "epoch": 0.9627092846270928, "grad_norm": 1.7578125, "learning_rate": 5.719159824764321e-06, "loss": 0.91134548, "memory(GiB)": 369.42, "step": 37950, "train_speed(iter/s)": 0.201183 }, { "acc": 0.74651041, "epoch": 0.9628361237950279, "grad_norm": 2.015625, "learning_rate": 5.71812208873451e-06, "loss": 1.03161716, "memory(GiB)": 369.42, "step": 37955, "train_speed(iter/s)": 0.201187 }, { "acc": 0.74907875, "epoch": 0.9629629629629629, "grad_norm": 2.109375, "learning_rate": 5.717084321118482e-06, "loss": 0.98232288, "memory(GiB)": 369.42, "step": 37960, "train_speed(iter/s)": 0.201191 }, { "acc": 0.75022774, "epoch": 0.963089802130898, "grad_norm": 2.328125, "learning_rate": 5.716046521961887e-06, "loss": 1.03195744, "memory(GiB)": 369.42, "step": 37965, "train_speed(iter/s)": 0.201195 }, { "acc": 0.75814819, "epoch": 0.9632166412988331, "grad_norm": 2.53125, "learning_rate": 5.715008691310366e-06, "loss": 1.00485258, "memory(GiB)": 369.42, "step": 37970, "train_speed(iter/s)": 0.2012 }, { "acc": 0.73911505, "epoch": 0.9633434804667681, "grad_norm": 2.5, "learning_rate": 5.713970829209573e-06, "loss": 1.05820818, "memory(GiB)": 369.42, "step": 37975, "train_speed(iter/s)": 0.201204 }, { "acc": 0.73926544, "epoch": 0.9634703196347032, "grad_norm": 2.046875, "learning_rate": 5.712932935705153e-06, "loss": 1.07114105, "memory(GiB)": 369.42, "step": 37980, "train_speed(iter/s)": 0.201206 }, { "acc": 0.73238387, "epoch": 0.9635971588026383, "grad_norm": 2.25, "learning_rate": 5.711895010842762e-06, "loss": 1.08379078, "memory(GiB)": 369.42, "step": 37985, "train_speed(iter/s)": 0.201211 }, { "acc": 0.74882703, "epoch": 0.9637239979705733, "grad_norm": 2.046875, "learning_rate": 5.710857054668048e-06, "loss": 1.0607378, "memory(GiB)": 369.42, "step": 37990, "train_speed(iter/s)": 0.201215 }, { "acc": 0.75825396, "epoch": 0.9638508371385084, "grad_norm": 2.21875, "learning_rate": 5.7098190672266675e-06, "loss": 1.00637112, "memory(GiB)": 369.42, "step": 37995, "train_speed(iter/s)": 0.201217 }, { "acc": 0.75063515, "epoch": 0.9639776763064434, "grad_norm": 2.171875, "learning_rate": 5.708781048564276e-06, "loss": 0.9678957, "memory(GiB)": 369.42, "step": 38000, "train_speed(iter/s)": 0.201222 }, { "epoch": 0.9639776763064434, "eval_acc": 0.7376229123645337, "eval_loss": 0.970623254776001, "eval_runtime": 384.2898, "eval_samples_per_second": 16.576, "eval_steps_per_second": 8.288, "step": 38000 }, { "acc": 0.75650373, "epoch": 0.9641045154743785, "grad_norm": 2.9375, "learning_rate": 5.707742998726527e-06, "loss": 0.98460588, "memory(GiB)": 369.42, "step": 38005, "train_speed(iter/s)": 0.200471 }, { "acc": 0.74700947, "epoch": 0.9642313546423136, "grad_norm": 2.484375, "learning_rate": 5.706704917759085e-06, "loss": 0.95660019, "memory(GiB)": 369.42, "step": 38010, "train_speed(iter/s)": 0.200476 }, { "acc": 0.76958914, "epoch": 0.9643581938102486, "grad_norm": 2.015625, "learning_rate": 5.705666805707603e-06, "loss": 0.93567886, "memory(GiB)": 369.42, "step": 38015, "train_speed(iter/s)": 0.20048 }, { "acc": 0.74054751, "epoch": 0.9644850329781837, "grad_norm": 1.9140625, "learning_rate": 5.704628662617744e-06, "loss": 0.98843784, "memory(GiB)": 369.42, "step": 38020, "train_speed(iter/s)": 0.200486 }, { "acc": 0.76126018, "epoch": 0.9646118721461188, "grad_norm": 1.9765625, "learning_rate": 5.703590488535171e-06, "loss": 0.97851143, "memory(GiB)": 369.42, "step": 38025, "train_speed(iter/s)": 0.20049 }, { "acc": 0.7510828, "epoch": 0.9647387113140538, "grad_norm": 2.375, "learning_rate": 5.702552283505548e-06, "loss": 0.9805069, "memory(GiB)": 369.42, "step": 38030, "train_speed(iter/s)": 0.200494 }, { "acc": 0.75812559, "epoch": 0.9648655504819889, "grad_norm": 1.8984375, "learning_rate": 5.7015140475745376e-06, "loss": 0.96242638, "memory(GiB)": 369.42, "step": 38035, "train_speed(iter/s)": 0.200498 }, { "acc": 0.75346208, "epoch": 0.9649923896499238, "grad_norm": 2.265625, "learning_rate": 5.700475780787809e-06, "loss": 0.97683401, "memory(GiB)": 369.42, "step": 38040, "train_speed(iter/s)": 0.200504 }, { "acc": 0.74680948, "epoch": 0.9651192288178589, "grad_norm": 2.03125, "learning_rate": 5.699437483191027e-06, "loss": 0.95629711, "memory(GiB)": 369.42, "step": 38045, "train_speed(iter/s)": 0.200509 }, { "acc": 0.75849032, "epoch": 0.965246067985794, "grad_norm": 2.0625, "learning_rate": 5.6983991548298615e-06, "loss": 0.98000975, "memory(GiB)": 369.42, "step": 38050, "train_speed(iter/s)": 0.200513 }, { "acc": 0.75794535, "epoch": 0.965372907153729, "grad_norm": 2.6875, "learning_rate": 5.697360795749983e-06, "loss": 0.98844128, "memory(GiB)": 369.42, "step": 38055, "train_speed(iter/s)": 0.200517 }, { "acc": 0.74388418, "epoch": 0.9654997463216641, "grad_norm": 2.140625, "learning_rate": 5.696322405997064e-06, "loss": 1.02575321, "memory(GiB)": 369.42, "step": 38060, "train_speed(iter/s)": 0.200522 }, { "acc": 0.74726272, "epoch": 0.9656265854895992, "grad_norm": 2.546875, "learning_rate": 5.695283985616775e-06, "loss": 1.01818113, "memory(GiB)": 369.42, "step": 38065, "train_speed(iter/s)": 0.200524 }, { "acc": 0.74053907, "epoch": 0.9657534246575342, "grad_norm": 1.890625, "learning_rate": 5.694245534654795e-06, "loss": 1.03284159, "memory(GiB)": 369.42, "step": 38070, "train_speed(iter/s)": 0.200529 }, { "acc": 0.75824299, "epoch": 0.9658802638254693, "grad_norm": 2.53125, "learning_rate": 5.693207053156794e-06, "loss": 0.8927866, "memory(GiB)": 369.42, "step": 38075, "train_speed(iter/s)": 0.200533 }, { "acc": 0.75096765, "epoch": 0.9660071029934043, "grad_norm": 1.8046875, "learning_rate": 5.692168541168455e-06, "loss": 0.97783623, "memory(GiB)": 369.42, "step": 38080, "train_speed(iter/s)": 0.200537 }, { "acc": 0.76013727, "epoch": 0.9661339421613394, "grad_norm": 1.890625, "learning_rate": 5.691129998735449e-06, "loss": 0.93751736, "memory(GiB)": 369.42, "step": 38085, "train_speed(iter/s)": 0.200542 }, { "acc": 0.74544401, "epoch": 0.9662607813292745, "grad_norm": 2.203125, "learning_rate": 5.690091425903464e-06, "loss": 1.05340395, "memory(GiB)": 369.42, "step": 38090, "train_speed(iter/s)": 0.200542 }, { "acc": 0.74684167, "epoch": 0.9663876204972095, "grad_norm": 2.34375, "learning_rate": 5.689052822718175e-06, "loss": 1.00176468, "memory(GiB)": 369.42, "step": 38095, "train_speed(iter/s)": 0.200547 }, { "acc": 0.75947676, "epoch": 0.9665144596651446, "grad_norm": 2.453125, "learning_rate": 5.688014189225266e-06, "loss": 0.95185318, "memory(GiB)": 369.42, "step": 38100, "train_speed(iter/s)": 0.200552 }, { "acc": 0.74741421, "epoch": 0.9666412988330797, "grad_norm": 2.46875, "learning_rate": 5.686975525470423e-06, "loss": 1.047929, "memory(GiB)": 369.42, "step": 38105, "train_speed(iter/s)": 0.200555 }, { "acc": 0.75075722, "epoch": 0.9667681380010147, "grad_norm": 2.3125, "learning_rate": 5.685936831499328e-06, "loss": 1.00513716, "memory(GiB)": 369.42, "step": 38110, "train_speed(iter/s)": 0.200556 }, { "acc": 0.75108185, "epoch": 0.9668949771689498, "grad_norm": 2.265625, "learning_rate": 5.684898107357669e-06, "loss": 1.00119972, "memory(GiB)": 369.42, "step": 38115, "train_speed(iter/s)": 0.200557 }, { "acc": 0.74202957, "epoch": 0.9670218163368848, "grad_norm": 2.234375, "learning_rate": 5.683859353091133e-06, "loss": 1.07405357, "memory(GiB)": 369.42, "step": 38120, "train_speed(iter/s)": 0.200563 }, { "acc": 0.73820362, "epoch": 0.9671486555048199, "grad_norm": 1.953125, "learning_rate": 5.6828205687454094e-06, "loss": 1.04730816, "memory(GiB)": 369.42, "step": 38125, "train_speed(iter/s)": 0.200566 }, { "acc": 0.74112172, "epoch": 0.967275494672755, "grad_norm": 2.015625, "learning_rate": 5.68178175436619e-06, "loss": 1.04806795, "memory(GiB)": 369.42, "step": 38130, "train_speed(iter/s)": 0.20057 }, { "acc": 0.7478641, "epoch": 0.96740233384069, "grad_norm": 2.25, "learning_rate": 5.680742909999163e-06, "loss": 1.03967113, "memory(GiB)": 369.42, "step": 38135, "train_speed(iter/s)": 0.200574 }, { "acc": 0.73028822, "epoch": 0.9675291730086251, "grad_norm": 2.6875, "learning_rate": 5.679704035690026e-06, "loss": 1.06888628, "memory(GiB)": 369.42, "step": 38140, "train_speed(iter/s)": 0.200576 }, { "acc": 0.75361052, "epoch": 0.9676560121765602, "grad_norm": 1.8203125, "learning_rate": 5.6786651314844675e-06, "loss": 0.94646702, "memory(GiB)": 369.42, "step": 38145, "train_speed(iter/s)": 0.20058 }, { "acc": 0.74734221, "epoch": 0.9677828513444952, "grad_norm": 2.234375, "learning_rate": 5.67762619742819e-06, "loss": 0.9980629, "memory(GiB)": 369.42, "step": 38150, "train_speed(iter/s)": 0.200585 }, { "acc": 0.74138737, "epoch": 0.9679096905124303, "grad_norm": 2.25, "learning_rate": 5.676587233566885e-06, "loss": 1.01781178, "memory(GiB)": 369.42, "step": 38155, "train_speed(iter/s)": 0.20059 }, { "acc": 0.74576445, "epoch": 0.9680365296803652, "grad_norm": 1.9453125, "learning_rate": 5.675548239946254e-06, "loss": 0.98633022, "memory(GiB)": 369.42, "step": 38160, "train_speed(iter/s)": 0.200596 }, { "acc": 0.72968683, "epoch": 0.9681633688483003, "grad_norm": 2.453125, "learning_rate": 5.674509216611993e-06, "loss": 1.07414427, "memory(GiB)": 369.42, "step": 38165, "train_speed(iter/s)": 0.2006 }, { "acc": 0.74693213, "epoch": 0.9682902080162354, "grad_norm": 2.375, "learning_rate": 5.673470163609806e-06, "loss": 0.98154354, "memory(GiB)": 369.42, "step": 38170, "train_speed(iter/s)": 0.200603 }, { "acc": 0.76136074, "epoch": 0.9684170471841704, "grad_norm": 1.84375, "learning_rate": 5.672431080985395e-06, "loss": 0.93257446, "memory(GiB)": 369.42, "step": 38175, "train_speed(iter/s)": 0.200605 }, { "acc": 0.75066128, "epoch": 0.9685438863521055, "grad_norm": 2.34375, "learning_rate": 5.671391968784464e-06, "loss": 0.98988028, "memory(GiB)": 369.42, "step": 38180, "train_speed(iter/s)": 0.20061 }, { "acc": 0.75361609, "epoch": 0.9686707255200406, "grad_norm": 2.578125, "learning_rate": 5.670352827052715e-06, "loss": 0.99915104, "memory(GiB)": 369.42, "step": 38185, "train_speed(iter/s)": 0.200616 }, { "acc": 0.76020336, "epoch": 0.9687975646879756, "grad_norm": 2.125, "learning_rate": 5.6693136558358565e-06, "loss": 0.98309813, "memory(GiB)": 369.42, "step": 38190, "train_speed(iter/s)": 0.200621 }, { "acc": 0.73921442, "epoch": 0.9689244038559107, "grad_norm": 2.171875, "learning_rate": 5.668274455179595e-06, "loss": 1.00900574, "memory(GiB)": 369.42, "step": 38195, "train_speed(iter/s)": 0.200623 }, { "acc": 0.73179431, "epoch": 0.9690512430238457, "grad_norm": 2.34375, "learning_rate": 5.667235225129639e-06, "loss": 1.03902569, "memory(GiB)": 369.42, "step": 38200, "train_speed(iter/s)": 0.200625 }, { "acc": 0.74553194, "epoch": 0.9691780821917808, "grad_norm": 2.796875, "learning_rate": 5.6661959657317e-06, "loss": 1.00288219, "memory(GiB)": 369.42, "step": 38205, "train_speed(iter/s)": 0.200622 }, { "acc": 0.76342359, "epoch": 0.9693049213597159, "grad_norm": 2.828125, "learning_rate": 5.665156677031487e-06, "loss": 0.97397556, "memory(GiB)": 369.42, "step": 38210, "train_speed(iter/s)": 0.200627 }, { "acc": 0.7481307, "epoch": 0.9694317605276509, "grad_norm": 2.46875, "learning_rate": 5.664117359074712e-06, "loss": 1.05245304, "memory(GiB)": 369.42, "step": 38215, "train_speed(iter/s)": 0.200633 }, { "acc": 0.74503484, "epoch": 0.969558599695586, "grad_norm": 2.421875, "learning_rate": 5.6630780119070935e-06, "loss": 1.04936047, "memory(GiB)": 369.42, "step": 38220, "train_speed(iter/s)": 0.200636 }, { "acc": 0.74278116, "epoch": 0.9696854388635211, "grad_norm": 2.296875, "learning_rate": 5.6620386355743415e-06, "loss": 1.03766623, "memory(GiB)": 369.42, "step": 38225, "train_speed(iter/s)": 0.200639 }, { "acc": 0.75015116, "epoch": 0.9698122780314561, "grad_norm": 2.234375, "learning_rate": 5.660999230122177e-06, "loss": 0.94213514, "memory(GiB)": 369.42, "step": 38230, "train_speed(iter/s)": 0.200643 }, { "acc": 0.74318376, "epoch": 0.9699391171993912, "grad_norm": 2.328125, "learning_rate": 5.659959795596313e-06, "loss": 0.99879913, "memory(GiB)": 369.42, "step": 38235, "train_speed(iter/s)": 0.200648 }, { "acc": 0.73971004, "epoch": 0.9700659563673262, "grad_norm": 2.34375, "learning_rate": 5.65892033204247e-06, "loss": 1.02685547, "memory(GiB)": 369.42, "step": 38240, "train_speed(iter/s)": 0.200652 }, { "acc": 0.76567922, "epoch": 0.9701927955352613, "grad_norm": 1.953125, "learning_rate": 5.657880839506371e-06, "loss": 0.92239094, "memory(GiB)": 369.42, "step": 38245, "train_speed(iter/s)": 0.200657 }, { "acc": 0.75176516, "epoch": 0.9703196347031964, "grad_norm": 2.90625, "learning_rate": 5.656841318033735e-06, "loss": 0.99367809, "memory(GiB)": 369.42, "step": 38250, "train_speed(iter/s)": 0.200659 }, { "acc": 0.74443111, "epoch": 0.9704464738711314, "grad_norm": 2.296875, "learning_rate": 5.6558017676702846e-06, "loss": 0.97658863, "memory(GiB)": 369.42, "step": 38255, "train_speed(iter/s)": 0.200662 }, { "acc": 0.74810877, "epoch": 0.9705733130390665, "grad_norm": 2.28125, "learning_rate": 5.654762188461744e-06, "loss": 1.02377062, "memory(GiB)": 369.42, "step": 38260, "train_speed(iter/s)": 0.200666 }, { "acc": 0.73718023, "epoch": 0.9707001522070016, "grad_norm": 2.046875, "learning_rate": 5.653722580453841e-06, "loss": 1.02497482, "memory(GiB)": 369.42, "step": 38265, "train_speed(iter/s)": 0.200672 }, { "acc": 0.73425913, "epoch": 0.9708269913749366, "grad_norm": 2.1875, "learning_rate": 5.652682943692299e-06, "loss": 1.06118431, "memory(GiB)": 369.42, "step": 38270, "train_speed(iter/s)": 0.200674 }, { "acc": 0.75527992, "epoch": 0.9709538305428717, "grad_norm": 3.0, "learning_rate": 5.651643278222847e-06, "loss": 1.01695137, "memory(GiB)": 369.42, "step": 38275, "train_speed(iter/s)": 0.20068 }, { "acc": 0.7629323, "epoch": 0.9710806697108066, "grad_norm": 2.203125, "learning_rate": 5.6506035840912145e-06, "loss": 0.95578575, "memory(GiB)": 369.42, "step": 38280, "train_speed(iter/s)": 0.200684 }, { "acc": 0.75685291, "epoch": 0.9712075088787417, "grad_norm": 2.09375, "learning_rate": 5.649563861343131e-06, "loss": 1.01041431, "memory(GiB)": 369.42, "step": 38285, "train_speed(iter/s)": 0.200688 }, { "acc": 0.73584952, "epoch": 0.9713343480466768, "grad_norm": 2.34375, "learning_rate": 5.648524110024331e-06, "loss": 1.01521091, "memory(GiB)": 369.42, "step": 38290, "train_speed(iter/s)": 0.200692 }, { "acc": 0.75725527, "epoch": 0.9714611872146118, "grad_norm": 2.1875, "learning_rate": 5.647484330180542e-06, "loss": 0.97134609, "memory(GiB)": 369.42, "step": 38295, "train_speed(iter/s)": 0.200695 }, { "acc": 0.74985261, "epoch": 0.9715880263825469, "grad_norm": 2.5, "learning_rate": 5.646444521857504e-06, "loss": 1.01189156, "memory(GiB)": 369.42, "step": 38300, "train_speed(iter/s)": 0.200698 }, { "acc": 0.76643634, "epoch": 0.971714865550482, "grad_norm": 2.0625, "learning_rate": 5.645404685100948e-06, "loss": 0.89971752, "memory(GiB)": 369.42, "step": 38305, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75402274, "epoch": 0.971841704718417, "grad_norm": 2.140625, "learning_rate": 5.644364819956613e-06, "loss": 1.02029495, "memory(GiB)": 369.42, "step": 38310, "train_speed(iter/s)": 0.200706 }, { "acc": 0.74860282, "epoch": 0.9719685438863521, "grad_norm": 2.34375, "learning_rate": 5.643324926470236e-06, "loss": 1.00112286, "memory(GiB)": 369.42, "step": 38315, "train_speed(iter/s)": 0.20071 }, { "acc": 0.7574163, "epoch": 0.9720953830542871, "grad_norm": 2.53125, "learning_rate": 5.642285004687557e-06, "loss": 0.98583908, "memory(GiB)": 369.42, "step": 38320, "train_speed(iter/s)": 0.200715 }, { "acc": 0.75776868, "epoch": 0.9722222222222222, "grad_norm": 2.296875, "learning_rate": 5.6412450546543165e-06, "loss": 0.99800186, "memory(GiB)": 369.42, "step": 38325, "train_speed(iter/s)": 0.200718 }, { "acc": 0.73624797, "epoch": 0.9723490613901573, "grad_norm": 2.25, "learning_rate": 5.640205076416254e-06, "loss": 1.09945641, "memory(GiB)": 369.42, "step": 38330, "train_speed(iter/s)": 0.200723 }, { "acc": 0.73707972, "epoch": 0.9724759005580923, "grad_norm": 1.9296875, "learning_rate": 5.639165070019116e-06, "loss": 1.03390312, "memory(GiB)": 369.42, "step": 38335, "train_speed(iter/s)": 0.200727 }, { "acc": 0.74823227, "epoch": 0.9726027397260274, "grad_norm": 2.46875, "learning_rate": 5.638125035508642e-06, "loss": 1.01456442, "memory(GiB)": 369.42, "step": 38340, "train_speed(iter/s)": 0.200732 }, { "acc": 0.75562611, "epoch": 0.9727295788939625, "grad_norm": 1.546875, "learning_rate": 5.6370849729305825e-06, "loss": 0.99786091, "memory(GiB)": 369.42, "step": 38345, "train_speed(iter/s)": 0.200734 }, { "acc": 0.73507795, "epoch": 0.9728564180618975, "grad_norm": 2.234375, "learning_rate": 5.63604488233068e-06, "loss": 1.06424904, "memory(GiB)": 369.42, "step": 38350, "train_speed(iter/s)": 0.200738 }, { "acc": 0.74737568, "epoch": 0.9729832572298326, "grad_norm": 2.25, "learning_rate": 5.635004763754683e-06, "loss": 0.94932709, "memory(GiB)": 369.42, "step": 38355, "train_speed(iter/s)": 0.200742 }, { "acc": 0.74085464, "epoch": 0.9731100963977676, "grad_norm": 2.53125, "learning_rate": 5.633964617248345e-06, "loss": 1.03021202, "memory(GiB)": 369.42, "step": 38360, "train_speed(iter/s)": 0.200745 }, { "acc": 0.75315895, "epoch": 0.9732369355657027, "grad_norm": 1.9453125, "learning_rate": 5.6329244428574085e-06, "loss": 0.9563942, "memory(GiB)": 369.42, "step": 38365, "train_speed(iter/s)": 0.200748 }, { "acc": 0.74688807, "epoch": 0.9733637747336378, "grad_norm": 2.15625, "learning_rate": 5.631884240627632e-06, "loss": 0.99419594, "memory(GiB)": 369.42, "step": 38370, "train_speed(iter/s)": 0.200753 }, { "acc": 0.74968529, "epoch": 0.9734906139015728, "grad_norm": 1.890625, "learning_rate": 5.6308440106047634e-06, "loss": 1.00500679, "memory(GiB)": 369.42, "step": 38375, "train_speed(iter/s)": 0.200757 }, { "acc": 0.74989667, "epoch": 0.9736174530695079, "grad_norm": 2.28125, "learning_rate": 5.62980375283456e-06, "loss": 0.96901722, "memory(GiB)": 369.42, "step": 38380, "train_speed(iter/s)": 0.200759 }, { "acc": 0.7457778, "epoch": 0.973744292237443, "grad_norm": 2.234375, "learning_rate": 5.628763467362775e-06, "loss": 0.99042463, "memory(GiB)": 369.42, "step": 38385, "train_speed(iter/s)": 0.20076 }, { "acc": 0.7396997, "epoch": 0.973871131405378, "grad_norm": 2.171875, "learning_rate": 5.627723154235165e-06, "loss": 1.00682831, "memory(GiB)": 369.42, "step": 38390, "train_speed(iter/s)": 0.200764 }, { "acc": 0.74838858, "epoch": 0.973997970573313, "grad_norm": 2.015625, "learning_rate": 5.62668281349749e-06, "loss": 1.04241238, "memory(GiB)": 369.42, "step": 38395, "train_speed(iter/s)": 0.200768 }, { "acc": 0.73945036, "epoch": 0.974124809741248, "grad_norm": 2.296875, "learning_rate": 5.625642445195505e-06, "loss": 1.0862524, "memory(GiB)": 369.42, "step": 38400, "train_speed(iter/s)": 0.200773 }, { "acc": 0.7508482, "epoch": 0.9742516489091831, "grad_norm": 2.515625, "learning_rate": 5.6246020493749735e-06, "loss": 1.01535511, "memory(GiB)": 369.42, "step": 38405, "train_speed(iter/s)": 0.200777 }, { "acc": 0.74931021, "epoch": 0.9743784880771182, "grad_norm": 2.25, "learning_rate": 5.623561626081654e-06, "loss": 1.00845871, "memory(GiB)": 369.42, "step": 38410, "train_speed(iter/s)": 0.20078 }, { "acc": 0.74675941, "epoch": 0.9745053272450532, "grad_norm": 2.1875, "learning_rate": 5.622521175361311e-06, "loss": 0.98147402, "memory(GiB)": 369.42, "step": 38415, "train_speed(iter/s)": 0.200782 }, { "acc": 0.75123048, "epoch": 0.9746321664129883, "grad_norm": 2.1875, "learning_rate": 5.621480697259707e-06, "loss": 1.0412571, "memory(GiB)": 369.42, "step": 38420, "train_speed(iter/s)": 0.200787 }, { "acc": 0.74196134, "epoch": 0.9747590055809234, "grad_norm": 2.09375, "learning_rate": 5.620440191822607e-06, "loss": 0.99443779, "memory(GiB)": 369.42, "step": 38425, "train_speed(iter/s)": 0.200791 }, { "acc": 0.73029213, "epoch": 0.9748858447488584, "grad_norm": 1.9453125, "learning_rate": 5.619399659095778e-06, "loss": 1.03399239, "memory(GiB)": 369.42, "step": 38430, "train_speed(iter/s)": 0.200795 }, { "acc": 0.74747047, "epoch": 0.9750126839167935, "grad_norm": 2.5625, "learning_rate": 5.618359099124985e-06, "loss": 1.01033821, "memory(GiB)": 369.42, "step": 38435, "train_speed(iter/s)": 0.200802 }, { "acc": 0.75212078, "epoch": 0.9751395230847285, "grad_norm": 1.9296875, "learning_rate": 5.617318511956001e-06, "loss": 1.0254652, "memory(GiB)": 369.42, "step": 38440, "train_speed(iter/s)": 0.200806 }, { "acc": 0.74094825, "epoch": 0.9752663622526636, "grad_norm": 2.515625, "learning_rate": 5.61627789763459e-06, "loss": 1.02066612, "memory(GiB)": 369.42, "step": 38445, "train_speed(iter/s)": 0.200808 }, { "acc": 0.75499501, "epoch": 0.9753932014205987, "grad_norm": 2.109375, "learning_rate": 5.6152372562065275e-06, "loss": 0.97863169, "memory(GiB)": 369.42, "step": 38450, "train_speed(iter/s)": 0.200812 }, { "acc": 0.74041309, "epoch": 0.9755200405885337, "grad_norm": 1.9296875, "learning_rate": 5.614196587717581e-06, "loss": 1.00373983, "memory(GiB)": 369.42, "step": 38455, "train_speed(iter/s)": 0.200816 }, { "acc": 0.7651, "epoch": 0.9756468797564688, "grad_norm": 1.7578125, "learning_rate": 5.613155892213529e-06, "loss": 0.97028923, "memory(GiB)": 369.42, "step": 38460, "train_speed(iter/s)": 0.200821 }, { "acc": 0.75914698, "epoch": 0.9757737189244039, "grad_norm": 2.09375, "learning_rate": 5.612115169740142e-06, "loss": 0.97599926, "memory(GiB)": 369.42, "step": 38465, "train_speed(iter/s)": 0.200823 }, { "acc": 0.75871739, "epoch": 0.9759005580923389, "grad_norm": 2.25, "learning_rate": 5.611074420343197e-06, "loss": 0.98644962, "memory(GiB)": 369.42, "step": 38470, "train_speed(iter/s)": 0.200827 }, { "acc": 0.75685778, "epoch": 0.976027397260274, "grad_norm": 2.078125, "learning_rate": 5.610033644068471e-06, "loss": 1.00693483, "memory(GiB)": 369.42, "step": 38475, "train_speed(iter/s)": 0.200833 }, { "acc": 0.75995979, "epoch": 0.976154236428209, "grad_norm": 1.6953125, "learning_rate": 5.608992840961742e-06, "loss": 0.95092449, "memory(GiB)": 369.42, "step": 38480, "train_speed(iter/s)": 0.200836 }, { "acc": 0.74805565, "epoch": 0.9762810755961441, "grad_norm": 2.234375, "learning_rate": 5.6079520110687876e-06, "loss": 0.97759991, "memory(GiB)": 369.42, "step": 38485, "train_speed(iter/s)": 0.200841 }, { "acc": 0.7571312, "epoch": 0.9764079147640792, "grad_norm": 2.09375, "learning_rate": 5.606911154435392e-06, "loss": 0.98214951, "memory(GiB)": 369.42, "step": 38490, "train_speed(iter/s)": 0.200846 }, { "acc": 0.74421015, "epoch": 0.9765347539320142, "grad_norm": 2.625, "learning_rate": 5.605870271107332e-06, "loss": 0.99946232, "memory(GiB)": 369.42, "step": 38495, "train_speed(iter/s)": 0.200851 }, { "acc": 0.74634371, "epoch": 0.9766615930999493, "grad_norm": 2.296875, "learning_rate": 5.6048293611303925e-06, "loss": 1.02002201, "memory(GiB)": 369.42, "step": 38500, "train_speed(iter/s)": 0.200854 }, { "acc": 0.75841084, "epoch": 0.9767884322678844, "grad_norm": 1.796875, "learning_rate": 5.603788424550357e-06, "loss": 0.99913263, "memory(GiB)": 369.42, "step": 38505, "train_speed(iter/s)": 0.200858 }, { "acc": 0.74886098, "epoch": 0.9769152714358194, "grad_norm": 1.7890625, "learning_rate": 5.602747461413014e-06, "loss": 1.00128231, "memory(GiB)": 369.42, "step": 38510, "train_speed(iter/s)": 0.200859 }, { "acc": 0.76277823, "epoch": 0.9770421106037545, "grad_norm": 2.59375, "learning_rate": 5.6017064717641435e-06, "loss": 0.90642185, "memory(GiB)": 369.42, "step": 38515, "train_speed(iter/s)": 0.200862 }, { "acc": 0.74727554, "epoch": 0.9771689497716894, "grad_norm": 2.046875, "learning_rate": 5.600665455649538e-06, "loss": 1.01530724, "memory(GiB)": 369.42, "step": 38520, "train_speed(iter/s)": 0.200866 }, { "acc": 0.75234947, "epoch": 0.9772957889396245, "grad_norm": 2.21875, "learning_rate": 5.599624413114981e-06, "loss": 0.977703, "memory(GiB)": 369.42, "step": 38525, "train_speed(iter/s)": 0.200871 }, { "acc": 0.74652281, "epoch": 0.9774226281075596, "grad_norm": 2.078125, "learning_rate": 5.5985833442062676e-06, "loss": 1.04158955, "memory(GiB)": 369.42, "step": 38530, "train_speed(iter/s)": 0.200876 }, { "acc": 0.75616803, "epoch": 0.9775494672754946, "grad_norm": 2.078125, "learning_rate": 5.597542248969185e-06, "loss": 0.93774738, "memory(GiB)": 369.42, "step": 38535, "train_speed(iter/s)": 0.20088 }, { "acc": 0.73726778, "epoch": 0.9776763064434297, "grad_norm": 2.109375, "learning_rate": 5.596501127449527e-06, "loss": 1.03152714, "memory(GiB)": 369.42, "step": 38540, "train_speed(iter/s)": 0.200882 }, { "acc": 0.75107045, "epoch": 0.9778031456113648, "grad_norm": 2.125, "learning_rate": 5.595459979693086e-06, "loss": 0.98044138, "memory(GiB)": 369.42, "step": 38545, "train_speed(iter/s)": 0.200887 }, { "acc": 0.75404863, "epoch": 0.9779299847792998, "grad_norm": 2.203125, "learning_rate": 5.594418805745657e-06, "loss": 0.991078, "memory(GiB)": 369.42, "step": 38550, "train_speed(iter/s)": 0.200891 }, { "acc": 0.74238324, "epoch": 0.9780568239472349, "grad_norm": 2.28125, "learning_rate": 5.593377605653035e-06, "loss": 0.97731581, "memory(GiB)": 369.42, "step": 38555, "train_speed(iter/s)": 0.200895 }, { "acc": 0.75316062, "epoch": 0.9781836631151699, "grad_norm": 2.15625, "learning_rate": 5.592336379461018e-06, "loss": 0.98252316, "memory(GiB)": 369.42, "step": 38560, "train_speed(iter/s)": 0.200901 }, { "acc": 0.74725409, "epoch": 0.978310502283105, "grad_norm": 2.140625, "learning_rate": 5.5912951272154004e-06, "loss": 0.97248535, "memory(GiB)": 369.42, "step": 38565, "train_speed(iter/s)": 0.200906 }, { "acc": 0.7448792, "epoch": 0.9784373414510401, "grad_norm": 1.984375, "learning_rate": 5.590253848961984e-06, "loss": 1.05503187, "memory(GiB)": 369.42, "step": 38570, "train_speed(iter/s)": 0.20091 }, { "acc": 0.74100037, "epoch": 0.9785641806189751, "grad_norm": 1.9765625, "learning_rate": 5.589212544746566e-06, "loss": 1.01369781, "memory(GiB)": 369.42, "step": 38575, "train_speed(iter/s)": 0.200913 }, { "acc": 0.75492544, "epoch": 0.9786910197869102, "grad_norm": 2.515625, "learning_rate": 5.588171214614953e-06, "loss": 1.02863674, "memory(GiB)": 369.42, "step": 38580, "train_speed(iter/s)": 0.200916 }, { "acc": 0.75129633, "epoch": 0.9788178589548453, "grad_norm": 1.875, "learning_rate": 5.587129858612941e-06, "loss": 0.97662134, "memory(GiB)": 369.42, "step": 38585, "train_speed(iter/s)": 0.200919 }, { "acc": 0.74518366, "epoch": 0.9789446981227803, "grad_norm": 1.8125, "learning_rate": 5.586088476786339e-06, "loss": 1.00608444, "memory(GiB)": 369.42, "step": 38590, "train_speed(iter/s)": 0.200924 }, { "acc": 0.73890333, "epoch": 0.9790715372907154, "grad_norm": 2.109375, "learning_rate": 5.585047069180947e-06, "loss": 1.05526199, "memory(GiB)": 369.42, "step": 38595, "train_speed(iter/s)": 0.200929 }, { "acc": 0.74790258, "epoch": 0.9791983764586504, "grad_norm": 2.390625, "learning_rate": 5.5840056358425755e-06, "loss": 1.01262903, "memory(GiB)": 369.42, "step": 38600, "train_speed(iter/s)": 0.200932 }, { "acc": 0.73339648, "epoch": 0.9793252156265855, "grad_norm": 2.390625, "learning_rate": 5.582964176817025e-06, "loss": 1.06861439, "memory(GiB)": 369.42, "step": 38605, "train_speed(iter/s)": 0.200935 }, { "acc": 0.75763083, "epoch": 0.9794520547945206, "grad_norm": 2.5, "learning_rate": 5.58192269215011e-06, "loss": 0.99959717, "memory(GiB)": 369.42, "step": 38610, "train_speed(iter/s)": 0.200938 }, { "acc": 0.76249714, "epoch": 0.9795788939624556, "grad_norm": 2.109375, "learning_rate": 5.580881181887636e-06, "loss": 0.99270124, "memory(GiB)": 369.42, "step": 38615, "train_speed(iter/s)": 0.200943 }, { "acc": 0.74755569, "epoch": 0.9797057331303907, "grad_norm": 1.7734375, "learning_rate": 5.579839646075414e-06, "loss": 0.99507532, "memory(GiB)": 369.42, "step": 38620, "train_speed(iter/s)": 0.200946 }, { "acc": 0.74142742, "epoch": 0.9798325722983258, "grad_norm": 1.65625, "learning_rate": 5.578798084759257e-06, "loss": 1.02414713, "memory(GiB)": 369.42, "step": 38625, "train_speed(iter/s)": 0.200949 }, { "acc": 0.75680399, "epoch": 0.9799594114662608, "grad_norm": 2.359375, "learning_rate": 5.577756497984975e-06, "loss": 0.95290689, "memory(GiB)": 369.42, "step": 38630, "train_speed(iter/s)": 0.20095 }, { "acc": 0.75092535, "epoch": 0.9800862506341959, "grad_norm": 1.9765625, "learning_rate": 5.576714885798382e-06, "loss": 0.99623575, "memory(GiB)": 369.42, "step": 38635, "train_speed(iter/s)": 0.200954 }, { "acc": 0.75850382, "epoch": 0.9802130898021308, "grad_norm": 2.1875, "learning_rate": 5.575673248245295e-06, "loss": 0.95866985, "memory(GiB)": 369.42, "step": 38640, "train_speed(iter/s)": 0.200958 }, { "acc": 0.73477902, "epoch": 0.9803399289700659, "grad_norm": 3.046875, "learning_rate": 5.574631585371527e-06, "loss": 1.0739852, "memory(GiB)": 369.42, "step": 38645, "train_speed(iter/s)": 0.200961 }, { "acc": 0.7425962, "epoch": 0.980466768138001, "grad_norm": 2.328125, "learning_rate": 5.573589897222897e-06, "loss": 0.96061058, "memory(GiB)": 369.42, "step": 38650, "train_speed(iter/s)": 0.200965 }, { "acc": 0.75297055, "epoch": 0.980593607305936, "grad_norm": 1.9140625, "learning_rate": 5.572548183845222e-06, "loss": 0.95449886, "memory(GiB)": 369.42, "step": 38655, "train_speed(iter/s)": 0.200969 }, { "acc": 0.73743949, "epoch": 0.9807204464738711, "grad_norm": 2.046875, "learning_rate": 5.571506445284322e-06, "loss": 1.01818581, "memory(GiB)": 369.42, "step": 38660, "train_speed(iter/s)": 0.200973 }, { "acc": 0.75727959, "epoch": 0.9808472856418062, "grad_norm": 1.6953125, "learning_rate": 5.570464681586017e-06, "loss": 0.99386654, "memory(GiB)": 369.42, "step": 38665, "train_speed(iter/s)": 0.200974 }, { "acc": 0.74039946, "epoch": 0.9809741248097412, "grad_norm": 2.21875, "learning_rate": 5.569422892796129e-06, "loss": 1.01613178, "memory(GiB)": 369.42, "step": 38670, "train_speed(iter/s)": 0.200976 }, { "acc": 0.73624849, "epoch": 0.9811009639776763, "grad_norm": 2.15625, "learning_rate": 5.568381078960479e-06, "loss": 1.09630508, "memory(GiB)": 369.42, "step": 38675, "train_speed(iter/s)": 0.200981 }, { "acc": 0.75762005, "epoch": 0.9812278031456113, "grad_norm": 2.640625, "learning_rate": 5.567339240124892e-06, "loss": 0.9550251, "memory(GiB)": 369.42, "step": 38680, "train_speed(iter/s)": 0.200986 }, { "acc": 0.74986291, "epoch": 0.9813546423135464, "grad_norm": 1.9921875, "learning_rate": 5.5662973763351915e-06, "loss": 1.01009779, "memory(GiB)": 369.42, "step": 38685, "train_speed(iter/s)": 0.200989 }, { "acc": 0.74309425, "epoch": 0.9814814814814815, "grad_norm": 2.53125, "learning_rate": 5.565255487637204e-06, "loss": 1.01353836, "memory(GiB)": 369.42, "step": 38690, "train_speed(iter/s)": 0.200994 }, { "acc": 0.75038018, "epoch": 0.9816083206494165, "grad_norm": 2.4375, "learning_rate": 5.564213574076757e-06, "loss": 1.01199131, "memory(GiB)": 369.42, "step": 38695, "train_speed(iter/s)": 0.200998 }, { "acc": 0.73952141, "epoch": 0.9817351598173516, "grad_norm": 2.4375, "learning_rate": 5.563171635699678e-06, "loss": 1.03684196, "memory(GiB)": 369.42, "step": 38700, "train_speed(iter/s)": 0.201003 }, { "acc": 0.76143579, "epoch": 0.9818619989852867, "grad_norm": 2.1875, "learning_rate": 5.562129672551796e-06, "loss": 0.93766651, "memory(GiB)": 369.42, "step": 38705, "train_speed(iter/s)": 0.201007 }, { "acc": 0.72445631, "epoch": 0.9819888381532217, "grad_norm": 2.078125, "learning_rate": 5.561087684678941e-06, "loss": 1.04594412, "memory(GiB)": 369.42, "step": 38710, "train_speed(iter/s)": 0.20101 }, { "acc": 0.7335639, "epoch": 0.9821156773211568, "grad_norm": 2.046875, "learning_rate": 5.560045672126945e-06, "loss": 1.01353111, "memory(GiB)": 369.42, "step": 38715, "train_speed(iter/s)": 0.201015 }, { "acc": 0.74473143, "epoch": 0.9822425164890918, "grad_norm": 2.265625, "learning_rate": 5.55900363494164e-06, "loss": 0.98444462, "memory(GiB)": 369.42, "step": 38720, "train_speed(iter/s)": 0.201018 }, { "acc": 0.75111523, "epoch": 0.9823693556570269, "grad_norm": 2.125, "learning_rate": 5.557961573168857e-06, "loss": 1.01356182, "memory(GiB)": 369.42, "step": 38725, "train_speed(iter/s)": 0.20102 }, { "acc": 0.74784279, "epoch": 0.982496194824962, "grad_norm": 2.546875, "learning_rate": 5.5569194868544376e-06, "loss": 1.02174873, "memory(GiB)": 369.42, "step": 38730, "train_speed(iter/s)": 0.201024 }, { "acc": 0.7432951, "epoch": 0.982623033992897, "grad_norm": 2.28125, "learning_rate": 5.555877376044209e-06, "loss": 0.97208462, "memory(GiB)": 369.42, "step": 38735, "train_speed(iter/s)": 0.201027 }, { "acc": 0.76779442, "epoch": 0.9827498731608321, "grad_norm": 2.25, "learning_rate": 5.554835240784013e-06, "loss": 0.98016071, "memory(GiB)": 369.42, "step": 38740, "train_speed(iter/s)": 0.201032 }, { "acc": 0.74465432, "epoch": 0.9828767123287672, "grad_norm": 2.28125, "learning_rate": 5.553793081119685e-06, "loss": 1.05902157, "memory(GiB)": 369.42, "step": 38745, "train_speed(iter/s)": 0.201034 }, { "acc": 0.74503913, "epoch": 0.9830035514967022, "grad_norm": 1.9296875, "learning_rate": 5.552750897097065e-06, "loss": 0.99803047, "memory(GiB)": 369.42, "step": 38750, "train_speed(iter/s)": 0.201037 }, { "acc": 0.7340055, "epoch": 0.9831303906646373, "grad_norm": 2.578125, "learning_rate": 5.551708688761993e-06, "loss": 1.02249527, "memory(GiB)": 369.42, "step": 38755, "train_speed(iter/s)": 0.201042 }, { "acc": 0.74943476, "epoch": 0.9832572298325722, "grad_norm": 2.125, "learning_rate": 5.550666456160311e-06, "loss": 0.99545631, "memory(GiB)": 369.42, "step": 38760, "train_speed(iter/s)": 0.201043 }, { "acc": 0.75020404, "epoch": 0.9833840690005073, "grad_norm": 1.8828125, "learning_rate": 5.549624199337857e-06, "loss": 1.02096558, "memory(GiB)": 369.42, "step": 38765, "train_speed(iter/s)": 0.201047 }, { "acc": 0.74565854, "epoch": 0.9835109081684424, "grad_norm": 2.53125, "learning_rate": 5.548581918340479e-06, "loss": 1.01916904, "memory(GiB)": 369.42, "step": 38770, "train_speed(iter/s)": 0.201053 }, { "acc": 0.75427532, "epoch": 0.9836377473363774, "grad_norm": 2.40625, "learning_rate": 5.547539613214019e-06, "loss": 0.95892477, "memory(GiB)": 369.42, "step": 38775, "train_speed(iter/s)": 0.201056 }, { "acc": 0.73645668, "epoch": 0.9837645865043125, "grad_norm": 2.671875, "learning_rate": 5.546497284004321e-06, "loss": 1.01548233, "memory(GiB)": 369.42, "step": 38780, "train_speed(iter/s)": 0.201062 }, { "acc": 0.74804111, "epoch": 0.9838914256722476, "grad_norm": 2.40625, "learning_rate": 5.545454930757233e-06, "loss": 1.01361408, "memory(GiB)": 369.42, "step": 38785, "train_speed(iter/s)": 0.201067 }, { "acc": 0.76061182, "epoch": 0.9840182648401826, "grad_norm": 2.015625, "learning_rate": 5.544412553518602e-06, "loss": 0.92867804, "memory(GiB)": 369.42, "step": 38790, "train_speed(iter/s)": 0.201071 }, { "acc": 0.75141888, "epoch": 0.9841451040081177, "grad_norm": 1.8984375, "learning_rate": 5.543370152334275e-06, "loss": 0.99526463, "memory(GiB)": 369.42, "step": 38795, "train_speed(iter/s)": 0.201075 }, { "acc": 0.75315232, "epoch": 0.9842719431760527, "grad_norm": 1.8671875, "learning_rate": 5.542327727250105e-06, "loss": 0.96868811, "memory(GiB)": 369.42, "step": 38800, "train_speed(iter/s)": 0.201077 }, { "acc": 0.74006681, "epoch": 0.9843987823439878, "grad_norm": 2.25, "learning_rate": 5.5412852783119385e-06, "loss": 1.02083473, "memory(GiB)": 369.42, "step": 38805, "train_speed(iter/s)": 0.20108 }, { "acc": 0.74672494, "epoch": 0.9845256215119229, "grad_norm": 1.7734375, "learning_rate": 5.54024280556563e-06, "loss": 0.99775124, "memory(GiB)": 369.42, "step": 38810, "train_speed(iter/s)": 0.201085 }, { "acc": 0.74787807, "epoch": 0.9846524606798579, "grad_norm": 2.671875, "learning_rate": 5.53920030905703e-06, "loss": 1.04006548, "memory(GiB)": 369.42, "step": 38815, "train_speed(iter/s)": 0.201089 }, { "acc": 0.742383, "epoch": 0.984779299847793, "grad_norm": 1.9140625, "learning_rate": 5.538157788831993e-06, "loss": 0.98170261, "memory(GiB)": 369.42, "step": 38820, "train_speed(iter/s)": 0.201093 }, { "acc": 0.74809284, "epoch": 0.9849061390157281, "grad_norm": 2.65625, "learning_rate": 5.537115244936374e-06, "loss": 1.00416775, "memory(GiB)": 369.42, "step": 38825, "train_speed(iter/s)": 0.201098 }, { "acc": 0.73483577, "epoch": 0.9850329781836631, "grad_norm": 2.640625, "learning_rate": 5.536072677416029e-06, "loss": 1.05887814, "memory(GiB)": 369.42, "step": 38830, "train_speed(iter/s)": 0.201102 }, { "acc": 0.75715609, "epoch": 0.9851598173515982, "grad_norm": 2.140625, "learning_rate": 5.535030086316814e-06, "loss": 0.95778437, "memory(GiB)": 369.42, "step": 38835, "train_speed(iter/s)": 0.201105 }, { "acc": 0.75477657, "epoch": 0.9852866565195332, "grad_norm": 1.7890625, "learning_rate": 5.533987471684586e-06, "loss": 0.95164595, "memory(GiB)": 369.42, "step": 38840, "train_speed(iter/s)": 0.201106 }, { "acc": 0.74289665, "epoch": 0.9854134956874683, "grad_norm": 1.984375, "learning_rate": 5.532944833565207e-06, "loss": 0.98666286, "memory(GiB)": 369.42, "step": 38845, "train_speed(iter/s)": 0.201109 }, { "acc": 0.74501057, "epoch": 0.9855403348554034, "grad_norm": 2.0625, "learning_rate": 5.531902172004533e-06, "loss": 0.99624462, "memory(GiB)": 369.42, "step": 38850, "train_speed(iter/s)": 0.201112 }, { "acc": 0.74618235, "epoch": 0.9856671740233384, "grad_norm": 2.21875, "learning_rate": 5.530859487048427e-06, "loss": 1.03144169, "memory(GiB)": 369.42, "step": 38855, "train_speed(iter/s)": 0.201113 }, { "acc": 0.72825127, "epoch": 0.9857940131912735, "grad_norm": 1.921875, "learning_rate": 5.529816778742752e-06, "loss": 1.09780827, "memory(GiB)": 369.42, "step": 38860, "train_speed(iter/s)": 0.201115 }, { "acc": 0.76808333, "epoch": 0.9859208523592086, "grad_norm": 2.296875, "learning_rate": 5.528774047133369e-06, "loss": 0.93193312, "memory(GiB)": 369.42, "step": 38865, "train_speed(iter/s)": 0.201119 }, { "acc": 0.75239391, "epoch": 0.9860476915271436, "grad_norm": 1.8515625, "learning_rate": 5.527731292266142e-06, "loss": 0.97101955, "memory(GiB)": 369.42, "step": 38870, "train_speed(iter/s)": 0.201122 }, { "acc": 0.75062675, "epoch": 0.9861745306950787, "grad_norm": 1.984375, "learning_rate": 5.5266885141869355e-06, "loss": 1.03000021, "memory(GiB)": 369.42, "step": 38875, "train_speed(iter/s)": 0.201126 }, { "acc": 0.75424976, "epoch": 0.9863013698630136, "grad_norm": 2.484375, "learning_rate": 5.5256457129416185e-06, "loss": 1.02506866, "memory(GiB)": 369.42, "step": 38880, "train_speed(iter/s)": 0.201131 }, { "acc": 0.76611452, "epoch": 0.9864282090309487, "grad_norm": 2.0, "learning_rate": 5.524602888576055e-06, "loss": 0.98191757, "memory(GiB)": 369.42, "step": 38885, "train_speed(iter/s)": 0.201136 }, { "acc": 0.72847614, "epoch": 0.9865550481988838, "grad_norm": 2.125, "learning_rate": 5.523560041136116e-06, "loss": 1.04573956, "memory(GiB)": 369.42, "step": 38890, "train_speed(iter/s)": 0.201139 }, { "acc": 0.7218009, "epoch": 0.9866818873668188, "grad_norm": 2.234375, "learning_rate": 5.522517170667667e-06, "loss": 1.08220348, "memory(GiB)": 369.42, "step": 38895, "train_speed(iter/s)": 0.201144 }, { "acc": 0.75818086, "epoch": 0.9868087265347539, "grad_norm": 2.34375, "learning_rate": 5.5214742772165806e-06, "loss": 0.96560087, "memory(GiB)": 369.42, "step": 38900, "train_speed(iter/s)": 0.201148 }, { "acc": 0.73706055, "epoch": 0.986935565702689, "grad_norm": 2.46875, "learning_rate": 5.520431360828728e-06, "loss": 1.05755444, "memory(GiB)": 369.42, "step": 38905, "train_speed(iter/s)": 0.201153 }, { "acc": 0.73685775, "epoch": 0.987062404870624, "grad_norm": 2.515625, "learning_rate": 5.51938842154998e-06, "loss": 1.05240126, "memory(GiB)": 369.42, "step": 38910, "train_speed(iter/s)": 0.201156 }, { "acc": 0.75045261, "epoch": 0.9871892440385591, "grad_norm": 2.515625, "learning_rate": 5.51834545942621e-06, "loss": 0.95775318, "memory(GiB)": 369.42, "step": 38915, "train_speed(iter/s)": 0.201159 }, { "acc": 0.73838081, "epoch": 0.9873160832064941, "grad_norm": 2.1875, "learning_rate": 5.5173024745032925e-06, "loss": 0.9994236, "memory(GiB)": 369.42, "step": 38920, "train_speed(iter/s)": 0.201163 }, { "acc": 0.74198337, "epoch": 0.9874429223744292, "grad_norm": 2.25, "learning_rate": 5.516259466827103e-06, "loss": 1.05457554, "memory(GiB)": 369.42, "step": 38925, "train_speed(iter/s)": 0.201167 }, { "acc": 0.75365267, "epoch": 0.9875697615423643, "grad_norm": 2.078125, "learning_rate": 5.515216436443517e-06, "loss": 0.99522467, "memory(GiB)": 369.42, "step": 38930, "train_speed(iter/s)": 0.201171 }, { "acc": 0.75211582, "epoch": 0.9876966007102993, "grad_norm": 2.6875, "learning_rate": 5.514173383398412e-06, "loss": 1.04096737, "memory(GiB)": 369.42, "step": 38935, "train_speed(iter/s)": 0.201175 }, { "acc": 0.74049006, "epoch": 0.9878234398782344, "grad_norm": 2.046875, "learning_rate": 5.513130307737666e-06, "loss": 0.99538059, "memory(GiB)": 369.42, "step": 38940, "train_speed(iter/s)": 0.201178 }, { "acc": 0.73500586, "epoch": 0.9879502790461695, "grad_norm": 1.9921875, "learning_rate": 5.512087209507157e-06, "loss": 1.07305775, "memory(GiB)": 369.42, "step": 38945, "train_speed(iter/s)": 0.201181 }, { "acc": 0.74907455, "epoch": 0.9880771182141045, "grad_norm": 2.046875, "learning_rate": 5.5110440887527684e-06, "loss": 0.94934559, "memory(GiB)": 369.42, "step": 38950, "train_speed(iter/s)": 0.201184 }, { "acc": 0.75652742, "epoch": 0.9882039573820396, "grad_norm": 1.828125, "learning_rate": 5.510000945520377e-06, "loss": 0.97115574, "memory(GiB)": 369.42, "step": 38955, "train_speed(iter/s)": 0.201179 }, { "acc": 0.75039206, "epoch": 0.9883307965499746, "grad_norm": 1.828125, "learning_rate": 5.508957779855869e-06, "loss": 0.9992897, "memory(GiB)": 369.42, "step": 38960, "train_speed(iter/s)": 0.201184 }, { "acc": 0.75032501, "epoch": 0.9884576357179097, "grad_norm": 2.34375, "learning_rate": 5.507914591805124e-06, "loss": 1.02235184, "memory(GiB)": 369.42, "step": 38965, "train_speed(iter/s)": 0.201187 }, { "acc": 0.75226836, "epoch": 0.9885844748858448, "grad_norm": 2.046875, "learning_rate": 5.506871381414027e-06, "loss": 0.96288366, "memory(GiB)": 369.42, "step": 38970, "train_speed(iter/s)": 0.20119 }, { "acc": 0.74592218, "epoch": 0.9887113140537798, "grad_norm": 2.59375, "learning_rate": 5.505828148728465e-06, "loss": 1.00179501, "memory(GiB)": 369.42, "step": 38975, "train_speed(iter/s)": 0.201194 }, { "acc": 0.75796232, "epoch": 0.9888381532217149, "grad_norm": 2.171875, "learning_rate": 5.5047848937943225e-06, "loss": 0.97631721, "memory(GiB)": 369.42, "step": 38980, "train_speed(iter/s)": 0.201199 }, { "acc": 0.75006747, "epoch": 0.98896499238965, "grad_norm": 2.265625, "learning_rate": 5.503741616657486e-06, "loss": 0.98384209, "memory(GiB)": 369.42, "step": 38985, "train_speed(iter/s)": 0.2012 }, { "acc": 0.75065708, "epoch": 0.989091831557585, "grad_norm": 2.53125, "learning_rate": 5.502698317363846e-06, "loss": 1.02178364, "memory(GiB)": 369.42, "step": 38990, "train_speed(iter/s)": 0.201204 }, { "acc": 0.75077424, "epoch": 0.98921867072552, "grad_norm": 2.34375, "learning_rate": 5.501654995959288e-06, "loss": 1.00287504, "memory(GiB)": 369.42, "step": 38995, "train_speed(iter/s)": 0.201206 }, { "acc": 0.73881469, "epoch": 0.989345509893455, "grad_norm": 2.140625, "learning_rate": 5.5006116524897034e-06, "loss": 1.01494341, "memory(GiB)": 369.42, "step": 39000, "train_speed(iter/s)": 0.201211 }, { "epoch": 0.989345509893455, "eval_acc": 0.7377469892554426, "eval_loss": 0.9705737233161926, "eval_runtime": 384.7713, "eval_samples_per_second": 16.555, "eval_steps_per_second": 8.278, "step": 39000 }, { "acc": 0.74906135, "epoch": 0.9894723490613901, "grad_norm": 2.5, "learning_rate": 5.499568287000984e-06, "loss": 1.04088898, "memory(GiB)": 369.42, "step": 39005, "train_speed(iter/s)": 0.200477 }, { "acc": 0.7523735, "epoch": 0.9895991882293252, "grad_norm": 2.25, "learning_rate": 5.49852489953902e-06, "loss": 1.00436707, "memory(GiB)": 369.42, "step": 39010, "train_speed(iter/s)": 0.200482 }, { "acc": 0.75278544, "epoch": 0.9897260273972602, "grad_norm": 2.28125, "learning_rate": 5.497481490149705e-06, "loss": 0.97195396, "memory(GiB)": 369.42, "step": 39015, "train_speed(iter/s)": 0.200486 }, { "acc": 0.75026503, "epoch": 0.9898528665651953, "grad_norm": 2.078125, "learning_rate": 5.496438058878936e-06, "loss": 1.0040247, "memory(GiB)": 369.42, "step": 39020, "train_speed(iter/s)": 0.20049 }, { "acc": 0.73954377, "epoch": 0.9899797057331304, "grad_norm": 2.140625, "learning_rate": 5.4953946057726005e-06, "loss": 1.06840839, "memory(GiB)": 369.42, "step": 39025, "train_speed(iter/s)": 0.200492 }, { "acc": 0.75232501, "epoch": 0.9901065449010654, "grad_norm": 1.9921875, "learning_rate": 5.494351130876602e-06, "loss": 0.98969717, "memory(GiB)": 369.42, "step": 39030, "train_speed(iter/s)": 0.200497 }, { "acc": 0.75709672, "epoch": 0.9902333840690005, "grad_norm": 1.96875, "learning_rate": 5.493307634236831e-06, "loss": 0.96947689, "memory(GiB)": 369.42, "step": 39035, "train_speed(iter/s)": 0.2005 }, { "acc": 0.74526777, "epoch": 0.9903602232369355, "grad_norm": 2.1875, "learning_rate": 5.492264115899189e-06, "loss": 0.98556461, "memory(GiB)": 369.42, "step": 39040, "train_speed(iter/s)": 0.200502 }, { "acc": 0.7431674, "epoch": 0.9904870624048706, "grad_norm": 2.9375, "learning_rate": 5.491220575909573e-06, "loss": 1.02652111, "memory(GiB)": 369.42, "step": 39045, "train_speed(iter/s)": 0.200508 }, { "acc": 0.74683657, "epoch": 0.9906139015728057, "grad_norm": 1.890625, "learning_rate": 5.4901770143138835e-06, "loss": 1.04721165, "memory(GiB)": 369.42, "step": 39050, "train_speed(iter/s)": 0.200511 }, { "acc": 0.74814892, "epoch": 0.9907407407407407, "grad_norm": 1.703125, "learning_rate": 5.48913343115802e-06, "loss": 0.9833437, "memory(GiB)": 369.42, "step": 39055, "train_speed(iter/s)": 0.200515 }, { "acc": 0.74433045, "epoch": 0.9908675799086758, "grad_norm": 1.84375, "learning_rate": 5.488089826487884e-06, "loss": 0.96447086, "memory(GiB)": 369.42, "step": 39060, "train_speed(iter/s)": 0.200518 }, { "acc": 0.75859594, "epoch": 0.9909944190766109, "grad_norm": 1.828125, "learning_rate": 5.48704620034938e-06, "loss": 0.9719347, "memory(GiB)": 369.42, "step": 39065, "train_speed(iter/s)": 0.200523 }, { "acc": 0.75305018, "epoch": 0.9911212582445459, "grad_norm": 2.234375, "learning_rate": 5.486002552788408e-06, "loss": 0.98763294, "memory(GiB)": 369.42, "step": 39070, "train_speed(iter/s)": 0.200526 }, { "acc": 0.73978448, "epoch": 0.991248097412481, "grad_norm": 2.390625, "learning_rate": 5.4849588838508734e-06, "loss": 0.98522148, "memory(GiB)": 369.42, "step": 39075, "train_speed(iter/s)": 0.200527 }, { "acc": 0.7584177, "epoch": 0.991374936580416, "grad_norm": 2.421875, "learning_rate": 5.483915193582684e-06, "loss": 1.0106102, "memory(GiB)": 369.42, "step": 39080, "train_speed(iter/s)": 0.200527 }, { "acc": 0.74023371, "epoch": 0.9915017757483511, "grad_norm": 2.0625, "learning_rate": 5.482871482029742e-06, "loss": 0.96238918, "memory(GiB)": 369.42, "step": 39085, "train_speed(iter/s)": 0.20053 }, { "acc": 0.74909258, "epoch": 0.9916286149162862, "grad_norm": 1.8828125, "learning_rate": 5.4818277492379565e-06, "loss": 1.03487644, "memory(GiB)": 369.42, "step": 39090, "train_speed(iter/s)": 0.200533 }, { "acc": 0.74789, "epoch": 0.9917554540842212, "grad_norm": 2.15625, "learning_rate": 5.480783995253236e-06, "loss": 1.01760807, "memory(GiB)": 369.42, "step": 39095, "train_speed(iter/s)": 0.200539 }, { "acc": 0.75473213, "epoch": 0.9918822932521563, "grad_norm": 1.9921875, "learning_rate": 5.47974022012149e-06, "loss": 0.98479795, "memory(GiB)": 369.42, "step": 39100, "train_speed(iter/s)": 0.200541 }, { "acc": 0.74984713, "epoch": 0.9920091324200914, "grad_norm": 2.484375, "learning_rate": 5.478696423888624e-06, "loss": 0.96173706, "memory(GiB)": 369.42, "step": 39105, "train_speed(iter/s)": 0.200544 }, { "acc": 0.74765825, "epoch": 0.9921359715880264, "grad_norm": 2.15625, "learning_rate": 5.477652606600555e-06, "loss": 0.97656717, "memory(GiB)": 369.42, "step": 39110, "train_speed(iter/s)": 0.20055 }, { "acc": 0.75732398, "epoch": 0.9922628107559615, "grad_norm": 2.28125, "learning_rate": 5.47660876830319e-06, "loss": 0.97552233, "memory(GiB)": 369.42, "step": 39115, "train_speed(iter/s)": 0.200553 }, { "acc": 0.75827675, "epoch": 0.9923896499238964, "grad_norm": 2.1875, "learning_rate": 5.475564909042444e-06, "loss": 0.93470936, "memory(GiB)": 369.42, "step": 39120, "train_speed(iter/s)": 0.200558 }, { "acc": 0.74532752, "epoch": 0.9925164890918315, "grad_norm": 2.0625, "learning_rate": 5.4745210288642306e-06, "loss": 0.9942914, "memory(GiB)": 369.42, "step": 39125, "train_speed(iter/s)": 0.200562 }, { "acc": 0.73801956, "epoch": 0.9926433282597666, "grad_norm": 1.890625, "learning_rate": 5.473477127814464e-06, "loss": 0.9975997, "memory(GiB)": 369.42, "step": 39130, "train_speed(iter/s)": 0.200566 }, { "acc": 0.73159785, "epoch": 0.9927701674277016, "grad_norm": 2.3125, "learning_rate": 5.472433205939058e-06, "loss": 1.06781235, "memory(GiB)": 369.42, "step": 39135, "train_speed(iter/s)": 0.20057 }, { "acc": 0.76071692, "epoch": 0.9928970065956367, "grad_norm": 2.5, "learning_rate": 5.471389263283932e-06, "loss": 0.94275103, "memory(GiB)": 369.42, "step": 39140, "train_speed(iter/s)": 0.200574 }, { "acc": 0.76187382, "epoch": 0.9930238457635718, "grad_norm": 2.59375, "learning_rate": 5.4703452998950005e-06, "loss": 0.96174831, "memory(GiB)": 369.42, "step": 39145, "train_speed(iter/s)": 0.200576 }, { "acc": 0.74699621, "epoch": 0.9931506849315068, "grad_norm": 2.40625, "learning_rate": 5.469301315818183e-06, "loss": 1.04860268, "memory(GiB)": 369.42, "step": 39150, "train_speed(iter/s)": 0.20058 }, { "acc": 0.75097914, "epoch": 0.9932775240994419, "grad_norm": 1.9765625, "learning_rate": 5.468257311099399e-06, "loss": 0.91709557, "memory(GiB)": 369.42, "step": 39155, "train_speed(iter/s)": 0.200583 }, { "acc": 0.76603918, "epoch": 0.9934043632673769, "grad_norm": 2.125, "learning_rate": 5.467213285784567e-06, "loss": 0.93833103, "memory(GiB)": 369.42, "step": 39160, "train_speed(iter/s)": 0.200588 }, { "acc": 0.74698138, "epoch": 0.993531202435312, "grad_norm": 2.0625, "learning_rate": 5.466169239919608e-06, "loss": 1.02316914, "memory(GiB)": 369.42, "step": 39165, "train_speed(iter/s)": 0.200593 }, { "acc": 0.74458399, "epoch": 0.9936580416032471, "grad_norm": 2.421875, "learning_rate": 5.465125173550446e-06, "loss": 1.03745441, "memory(GiB)": 369.42, "step": 39170, "train_speed(iter/s)": 0.200596 }, { "acc": 0.75070343, "epoch": 0.9937848807711821, "grad_norm": 2.3125, "learning_rate": 5.464081086723001e-06, "loss": 1.01576214, "memory(GiB)": 369.42, "step": 39175, "train_speed(iter/s)": 0.2006 }, { "acc": 0.74142952, "epoch": 0.9939117199391172, "grad_norm": 2.65625, "learning_rate": 5.4630369794832006e-06, "loss": 1.02795811, "memory(GiB)": 369.42, "step": 39180, "train_speed(iter/s)": 0.200602 }, { "acc": 0.75338497, "epoch": 0.9940385591070523, "grad_norm": 2.21875, "learning_rate": 5.461992851876963e-06, "loss": 1.03523788, "memory(GiB)": 369.42, "step": 39185, "train_speed(iter/s)": 0.200603 }, { "acc": 0.74690094, "epoch": 0.9941653982749873, "grad_norm": 2.703125, "learning_rate": 5.460948703950218e-06, "loss": 1.07016258, "memory(GiB)": 369.42, "step": 39190, "train_speed(iter/s)": 0.200604 }, { "acc": 0.73430247, "epoch": 0.9942922374429224, "grad_norm": 2.53125, "learning_rate": 5.459904535748892e-06, "loss": 1.08947983, "memory(GiB)": 369.42, "step": 39195, "train_speed(iter/s)": 0.200609 }, { "acc": 0.76741118, "epoch": 0.9944190766108574, "grad_norm": 2.140625, "learning_rate": 5.458860347318912e-06, "loss": 0.9240859, "memory(GiB)": 369.42, "step": 39200, "train_speed(iter/s)": 0.200612 }, { "acc": 0.75090017, "epoch": 0.9945459157787925, "grad_norm": 2.390625, "learning_rate": 5.457816138706203e-06, "loss": 1.04730892, "memory(GiB)": 369.42, "step": 39205, "train_speed(iter/s)": 0.200617 }, { "acc": 0.75945578, "epoch": 0.9946727549467276, "grad_norm": 2.890625, "learning_rate": 5.456771909956697e-06, "loss": 0.9861619, "memory(GiB)": 369.42, "step": 39210, "train_speed(iter/s)": 0.200621 }, { "acc": 0.7544323, "epoch": 0.9947995941146626, "grad_norm": 2.421875, "learning_rate": 5.455727661116324e-06, "loss": 0.99644566, "memory(GiB)": 369.42, "step": 39215, "train_speed(iter/s)": 0.200624 }, { "acc": 0.75019503, "epoch": 0.9949264332825977, "grad_norm": 3.140625, "learning_rate": 5.454683392231014e-06, "loss": 1.01935158, "memory(GiB)": 369.42, "step": 39220, "train_speed(iter/s)": 0.200628 }, { "acc": 0.75236654, "epoch": 0.9950532724505328, "grad_norm": 1.9765625, "learning_rate": 5.453639103346697e-06, "loss": 0.98746948, "memory(GiB)": 369.42, "step": 39225, "train_speed(iter/s)": 0.200633 }, { "acc": 0.74513569, "epoch": 0.9951801116184678, "grad_norm": 2.546875, "learning_rate": 5.452594794509307e-06, "loss": 0.98716154, "memory(GiB)": 369.42, "step": 39230, "train_speed(iter/s)": 0.200636 }, { "acc": 0.75333433, "epoch": 0.9953069507864029, "grad_norm": 2.171875, "learning_rate": 5.4515504657647765e-06, "loss": 1.03785954, "memory(GiB)": 369.42, "step": 39235, "train_speed(iter/s)": 0.200639 }, { "acc": 0.74988956, "epoch": 0.9954337899543378, "grad_norm": 2.078125, "learning_rate": 5.450506117159044e-06, "loss": 1.06598492, "memory(GiB)": 369.42, "step": 39240, "train_speed(iter/s)": 0.200642 }, { "acc": 0.74988518, "epoch": 0.9955606291222729, "grad_norm": 2.8125, "learning_rate": 5.449461748738037e-06, "loss": 1.03245697, "memory(GiB)": 369.42, "step": 39245, "train_speed(iter/s)": 0.200644 }, { "acc": 0.74886246, "epoch": 0.995687468290208, "grad_norm": 2.828125, "learning_rate": 5.448417360547699e-06, "loss": 1.00162983, "memory(GiB)": 369.42, "step": 39250, "train_speed(iter/s)": 0.200648 }, { "acc": 0.75650959, "epoch": 0.995814307458143, "grad_norm": 2.015625, "learning_rate": 5.44737295263396e-06, "loss": 0.94966297, "memory(GiB)": 369.42, "step": 39255, "train_speed(iter/s)": 0.20065 }, { "acc": 0.73961353, "epoch": 0.9959411466260781, "grad_norm": 2.21875, "learning_rate": 5.446328525042764e-06, "loss": 1.05939064, "memory(GiB)": 369.42, "step": 39260, "train_speed(iter/s)": 0.200653 }, { "acc": 0.7394177, "epoch": 0.9960679857940132, "grad_norm": 2.390625, "learning_rate": 5.4452840778200456e-06, "loss": 1.03777065, "memory(GiB)": 369.42, "step": 39265, "train_speed(iter/s)": 0.200659 }, { "acc": 0.74093771, "epoch": 0.9961948249619482, "grad_norm": 2.5, "learning_rate": 5.444239611011746e-06, "loss": 1.03041344, "memory(GiB)": 369.42, "step": 39270, "train_speed(iter/s)": 0.200662 }, { "acc": 0.7464407, "epoch": 0.9963216641298833, "grad_norm": 2.0, "learning_rate": 5.443195124663804e-06, "loss": 1.01782789, "memory(GiB)": 369.42, "step": 39275, "train_speed(iter/s)": 0.200665 }, { "acc": 0.74673257, "epoch": 0.9964485032978183, "grad_norm": 2.375, "learning_rate": 5.442150618822162e-06, "loss": 1.04695625, "memory(GiB)": 369.42, "step": 39280, "train_speed(iter/s)": 0.20067 }, { "acc": 0.75278091, "epoch": 0.9965753424657534, "grad_norm": 2.125, "learning_rate": 5.441106093532762e-06, "loss": 1.05065317, "memory(GiB)": 369.42, "step": 39285, "train_speed(iter/s)": 0.200671 }, { "acc": 0.7572813, "epoch": 0.9967021816336885, "grad_norm": 1.796875, "learning_rate": 5.440061548841546e-06, "loss": 0.97254858, "memory(GiB)": 369.42, "step": 39290, "train_speed(iter/s)": 0.200675 }, { "acc": 0.74875002, "epoch": 0.9968290208016235, "grad_norm": 2.4375, "learning_rate": 5.43901698479446e-06, "loss": 1.01106243, "memory(GiB)": 369.42, "step": 39295, "train_speed(iter/s)": 0.20068 }, { "acc": 0.74107103, "epoch": 0.9969558599695586, "grad_norm": 2.1875, "learning_rate": 5.4379724014374455e-06, "loss": 1.01137285, "memory(GiB)": 369.42, "step": 39300, "train_speed(iter/s)": 0.200685 }, { "acc": 0.73633933, "epoch": 0.9970826991374937, "grad_norm": 1.796875, "learning_rate": 5.436927798816448e-06, "loss": 1.02487926, "memory(GiB)": 369.42, "step": 39305, "train_speed(iter/s)": 0.20069 }, { "acc": 0.7691678, "epoch": 0.9972095383054287, "grad_norm": 2.5625, "learning_rate": 5.4358831769774174e-06, "loss": 0.94171, "memory(GiB)": 369.42, "step": 39310, "train_speed(iter/s)": 0.200694 }, { "acc": 0.74815578, "epoch": 0.9973363774733638, "grad_norm": 2.046875, "learning_rate": 5.434838535966298e-06, "loss": 1.03772268, "memory(GiB)": 369.42, "step": 39315, "train_speed(iter/s)": 0.200697 }, { "acc": 0.74434605, "epoch": 0.9974632166412988, "grad_norm": 2.40625, "learning_rate": 5.43379387582904e-06, "loss": 1.00261917, "memory(GiB)": 369.42, "step": 39320, "train_speed(iter/s)": 0.200703 }, { "acc": 0.75465555, "epoch": 0.9975900558092339, "grad_norm": 2.015625, "learning_rate": 5.432749196611587e-06, "loss": 0.97060318, "memory(GiB)": 369.42, "step": 39325, "train_speed(iter/s)": 0.200707 }, { "acc": 0.74851508, "epoch": 0.997716894977169, "grad_norm": 2.140625, "learning_rate": 5.431704498359896e-06, "loss": 0.9790966, "memory(GiB)": 369.42, "step": 39330, "train_speed(iter/s)": 0.200711 }, { "acc": 0.75160036, "epoch": 0.997843734145104, "grad_norm": 2.515625, "learning_rate": 5.43065978111991e-06, "loss": 0.99769573, "memory(GiB)": 369.42, "step": 39335, "train_speed(iter/s)": 0.200713 }, { "acc": 0.7478065, "epoch": 0.9979705733130391, "grad_norm": 2.390625, "learning_rate": 5.429615044937586e-06, "loss": 0.96711903, "memory(GiB)": 369.42, "step": 39340, "train_speed(iter/s)": 0.200718 }, { "acc": 0.73989286, "epoch": 0.9980974124809742, "grad_norm": 2.15625, "learning_rate": 5.4285702898588754e-06, "loss": 1.04156485, "memory(GiB)": 369.42, "step": 39345, "train_speed(iter/s)": 0.20072 }, { "acc": 0.75442553, "epoch": 0.9982242516489092, "grad_norm": 2.0625, "learning_rate": 5.427525515929729e-06, "loss": 0.96349087, "memory(GiB)": 369.42, "step": 39350, "train_speed(iter/s)": 0.200725 }, { "acc": 0.76715555, "epoch": 0.9983510908168443, "grad_norm": 2.515625, "learning_rate": 5.426480723196102e-06, "loss": 0.94301968, "memory(GiB)": 369.42, "step": 39355, "train_speed(iter/s)": 0.20073 }, { "acc": 0.75286541, "epoch": 0.9984779299847792, "grad_norm": 2.140625, "learning_rate": 5.425435911703948e-06, "loss": 0.98206177, "memory(GiB)": 369.42, "step": 39360, "train_speed(iter/s)": 0.200733 }, { "acc": 0.74418402, "epoch": 0.9986047691527143, "grad_norm": 2.3125, "learning_rate": 5.424391081499223e-06, "loss": 0.99484253, "memory(GiB)": 369.42, "step": 39365, "train_speed(iter/s)": 0.200737 }, { "acc": 0.75274582, "epoch": 0.9987316083206494, "grad_norm": 1.953125, "learning_rate": 5.423346232627884e-06, "loss": 0.98313055, "memory(GiB)": 369.42, "step": 39370, "train_speed(iter/s)": 0.200741 }, { "acc": 0.74162145, "epoch": 0.9988584474885844, "grad_norm": 2.5625, "learning_rate": 5.422301365135887e-06, "loss": 1.03286991, "memory(GiB)": 369.42, "step": 39375, "train_speed(iter/s)": 0.200742 }, { "acc": 0.74060249, "epoch": 0.9989852866565195, "grad_norm": 2.453125, "learning_rate": 5.421256479069191e-06, "loss": 1.03016853, "memory(GiB)": 369.42, "step": 39380, "train_speed(iter/s)": 0.200746 }, { "acc": 0.7587265, "epoch": 0.9991121258244546, "grad_norm": 2.109375, "learning_rate": 5.420211574473754e-06, "loss": 0.95001698, "memory(GiB)": 369.42, "step": 39385, "train_speed(iter/s)": 0.200752 }, { "acc": 0.74868965, "epoch": 0.9992389649923896, "grad_norm": 2.09375, "learning_rate": 5.419166651395536e-06, "loss": 1.0042944, "memory(GiB)": 369.42, "step": 39390, "train_speed(iter/s)": 0.200755 }, { "acc": 0.75938873, "epoch": 0.9993658041603247, "grad_norm": 2.125, "learning_rate": 5.418121709880497e-06, "loss": 0.93497515, "memory(GiB)": 369.42, "step": 39395, "train_speed(iter/s)": 0.200759 }, { "acc": 0.74913921, "epoch": 0.9994926433282597, "grad_norm": 2.53125, "learning_rate": 5.4170767499746e-06, "loss": 1.00431004, "memory(GiB)": 369.42, "step": 39400, "train_speed(iter/s)": 0.200762 }, { "acc": 0.74578724, "epoch": 0.9996194824961948, "grad_norm": 2.4375, "learning_rate": 5.416031771723803e-06, "loss": 1.02310791, "memory(GiB)": 369.42, "step": 39405, "train_speed(iter/s)": 0.200766 }, { "acc": 0.75924602, "epoch": 0.9997463216641299, "grad_norm": 2.65625, "learning_rate": 5.414986775174073e-06, "loss": 0.96423988, "memory(GiB)": 369.42, "step": 39410, "train_speed(iter/s)": 0.200771 }, { "acc": 0.76264644, "epoch": 0.9998731608320649, "grad_norm": 1.71875, "learning_rate": 5.41394176037137e-06, "loss": 0.96116571, "memory(GiB)": 369.42, "step": 39415, "train_speed(iter/s)": 0.200775 }, { "acc": 0.73521447, "epoch": 1.0, "grad_norm": 2.203125, "learning_rate": 5.412896727361663e-06, "loss": 1.07088375, "memory(GiB)": 369.42, "step": 39420, "train_speed(iter/s)": 0.200779 }, { "acc": 0.75494847, "epoch": 1.000126839167935, "grad_norm": 2.40625, "learning_rate": 5.411851676190912e-06, "loss": 0.99681892, "memory(GiB)": 369.42, "step": 39425, "train_speed(iter/s)": 0.200781 }, { "acc": 0.74448709, "epoch": 1.0002536783358702, "grad_norm": 2.078125, "learning_rate": 5.4108066069050864e-06, "loss": 1.00164013, "memory(GiB)": 369.42, "step": 39430, "train_speed(iter/s)": 0.200784 }, { "acc": 0.73621683, "epoch": 1.0003805175038052, "grad_norm": 2.109375, "learning_rate": 5.409761519550153e-06, "loss": 1.03248138, "memory(GiB)": 369.42, "step": 39435, "train_speed(iter/s)": 0.200787 }, { "acc": 0.75825682, "epoch": 1.0005073566717402, "grad_norm": 2.28125, "learning_rate": 5.408716414172077e-06, "loss": 0.96016483, "memory(GiB)": 369.42, "step": 39440, "train_speed(iter/s)": 0.20079 }, { "acc": 0.75849748, "epoch": 1.0006341958396754, "grad_norm": 2.65625, "learning_rate": 5.407671290816829e-06, "loss": 0.9612999, "memory(GiB)": 369.42, "step": 39445, "train_speed(iter/s)": 0.200794 }, { "acc": 0.73329048, "epoch": 1.0007610350076104, "grad_norm": 2.328125, "learning_rate": 5.406626149530378e-06, "loss": 1.05807543, "memory(GiB)": 369.42, "step": 39450, "train_speed(iter/s)": 0.200796 }, { "acc": 0.7581347, "epoch": 1.0008878741755454, "grad_norm": 1.828125, "learning_rate": 5.405580990358692e-06, "loss": 0.94280729, "memory(GiB)": 369.42, "step": 39455, "train_speed(iter/s)": 0.200799 }, { "acc": 0.75225453, "epoch": 1.0010147133434804, "grad_norm": 2.265625, "learning_rate": 5.404535813347746e-06, "loss": 0.99452209, "memory(GiB)": 369.42, "step": 39460, "train_speed(iter/s)": 0.200804 }, { "acc": 0.75236816, "epoch": 1.0011415525114156, "grad_norm": 2.4375, "learning_rate": 5.403490618543505e-06, "loss": 1.01410389, "memory(GiB)": 369.42, "step": 39465, "train_speed(iter/s)": 0.200808 }, { "acc": 0.76253872, "epoch": 1.0012683916793506, "grad_norm": 2.078125, "learning_rate": 5.40244540599195e-06, "loss": 0.96119881, "memory(GiB)": 369.42, "step": 39470, "train_speed(iter/s)": 0.200811 }, { "acc": 0.75000887, "epoch": 1.0013952308472855, "grad_norm": 1.6796875, "learning_rate": 5.401400175739045e-06, "loss": 1.02664204, "memory(GiB)": 369.42, "step": 39475, "train_speed(iter/s)": 0.200815 }, { "acc": 0.74694905, "epoch": 1.0015220700152208, "grad_norm": 2.296875, "learning_rate": 5.400354927830769e-06, "loss": 0.99880924, "memory(GiB)": 369.42, "step": 39480, "train_speed(iter/s)": 0.20082 }, { "acc": 0.75508747, "epoch": 1.0016489091831557, "grad_norm": 2.25, "learning_rate": 5.399309662313097e-06, "loss": 0.97261953, "memory(GiB)": 369.42, "step": 39485, "train_speed(iter/s)": 0.200823 }, { "acc": 0.75323005, "epoch": 1.0017757483510907, "grad_norm": 2.234375, "learning_rate": 5.3982643792320024e-06, "loss": 0.97123718, "memory(GiB)": 369.42, "step": 39490, "train_speed(iter/s)": 0.200826 }, { "acc": 0.75010552, "epoch": 1.001902587519026, "grad_norm": 1.984375, "learning_rate": 5.397219078633462e-06, "loss": 0.97393894, "memory(GiB)": 369.42, "step": 39495, "train_speed(iter/s)": 0.20083 }, { "acc": 0.75213537, "epoch": 1.002029426686961, "grad_norm": 2.53125, "learning_rate": 5.3961737605634546e-06, "loss": 1.01139107, "memory(GiB)": 369.42, "step": 39500, "train_speed(iter/s)": 0.200834 }, { "acc": 0.7564074, "epoch": 1.002156265854896, "grad_norm": 2.171875, "learning_rate": 5.395128425067954e-06, "loss": 0.97902451, "memory(GiB)": 369.42, "step": 39505, "train_speed(iter/s)": 0.200836 }, { "acc": 0.76851864, "epoch": 1.0022831050228311, "grad_norm": 2.03125, "learning_rate": 5.394083072192944e-06, "loss": 0.92802124, "memory(GiB)": 369.42, "step": 39510, "train_speed(iter/s)": 0.200842 }, { "acc": 0.75609894, "epoch": 1.0024099441907661, "grad_norm": 2.515625, "learning_rate": 5.393037701984399e-06, "loss": 0.93015079, "memory(GiB)": 369.42, "step": 39515, "train_speed(iter/s)": 0.200846 }, { "acc": 0.74729772, "epoch": 1.0025367833587011, "grad_norm": 1.984375, "learning_rate": 5.391992314488303e-06, "loss": 0.97322063, "memory(GiB)": 369.42, "step": 39520, "train_speed(iter/s)": 0.200849 }, { "acc": 0.74423285, "epoch": 1.0026636225266363, "grad_norm": 1.828125, "learning_rate": 5.3909469097506314e-06, "loss": 0.9785305, "memory(GiB)": 369.42, "step": 39525, "train_speed(iter/s)": 0.200853 }, { "acc": 0.74063578, "epoch": 1.0027904616945713, "grad_norm": 2.375, "learning_rate": 5.389901487817373e-06, "loss": 1.02098303, "memory(GiB)": 369.42, "step": 39530, "train_speed(iter/s)": 0.200856 }, { "acc": 0.76952, "epoch": 1.0029173008625063, "grad_norm": 2.25, "learning_rate": 5.388856048734505e-06, "loss": 0.96087074, "memory(GiB)": 369.42, "step": 39535, "train_speed(iter/s)": 0.20086 }, { "acc": 0.74296241, "epoch": 1.0030441400304415, "grad_norm": 2.078125, "learning_rate": 5.3878105925480115e-06, "loss": 1.05249825, "memory(GiB)": 369.42, "step": 39540, "train_speed(iter/s)": 0.200865 }, { "acc": 0.74984837, "epoch": 1.0031709791983765, "grad_norm": 2.0, "learning_rate": 5.3867651193038765e-06, "loss": 0.99047661, "memory(GiB)": 369.42, "step": 39545, "train_speed(iter/s)": 0.200868 }, { "acc": 0.7544724, "epoch": 1.0032978183663115, "grad_norm": 2.09375, "learning_rate": 5.385719629048086e-06, "loss": 0.98210373, "memory(GiB)": 369.42, "step": 39550, "train_speed(iter/s)": 0.200872 }, { "acc": 0.74084659, "epoch": 1.0034246575342465, "grad_norm": 1.8359375, "learning_rate": 5.384674121826622e-06, "loss": 1.02549229, "memory(GiB)": 369.42, "step": 39555, "train_speed(iter/s)": 0.200876 }, { "acc": 0.75151577, "epoch": 1.0035514967021817, "grad_norm": 2.484375, "learning_rate": 5.383628597685474e-06, "loss": 0.99181023, "memory(GiB)": 369.42, "step": 39560, "train_speed(iter/s)": 0.20088 }, { "acc": 0.77321672, "epoch": 1.0036783358701167, "grad_norm": 2.046875, "learning_rate": 5.382583056670627e-06, "loss": 0.91619873, "memory(GiB)": 369.42, "step": 39565, "train_speed(iter/s)": 0.200884 }, { "acc": 0.75724573, "epoch": 1.0038051750380517, "grad_norm": 2.375, "learning_rate": 5.38153749882807e-06, "loss": 1.00746775, "memory(GiB)": 369.42, "step": 39570, "train_speed(iter/s)": 0.200889 }, { "acc": 0.75133886, "epoch": 1.0039320142059869, "grad_norm": 2.40625, "learning_rate": 5.38049192420379e-06, "loss": 0.98683586, "memory(GiB)": 369.42, "step": 39575, "train_speed(iter/s)": 0.200895 }, { "acc": 0.7526866, "epoch": 1.0040588533739219, "grad_norm": 1.78125, "learning_rate": 5.3794463328437766e-06, "loss": 0.97631016, "memory(GiB)": 369.42, "step": 39580, "train_speed(iter/s)": 0.200899 }, { "acc": 0.76247978, "epoch": 1.0041856925418569, "grad_norm": 2.375, "learning_rate": 5.3784007247940185e-06, "loss": 0.95133858, "memory(GiB)": 369.42, "step": 39585, "train_speed(iter/s)": 0.200904 }, { "acc": 0.75795593, "epoch": 1.004312531709792, "grad_norm": 2.703125, "learning_rate": 5.377355100100508e-06, "loss": 0.90659466, "memory(GiB)": 369.42, "step": 39590, "train_speed(iter/s)": 0.200907 }, { "acc": 0.76468267, "epoch": 1.004439370877727, "grad_norm": 2.40625, "learning_rate": 5.376309458809235e-06, "loss": 0.92878647, "memory(GiB)": 369.42, "step": 39595, "train_speed(iter/s)": 0.20091 }, { "acc": 0.73708501, "epoch": 1.004566210045662, "grad_norm": 2.265625, "learning_rate": 5.375263800966192e-06, "loss": 1.0024704, "memory(GiB)": 369.42, "step": 39600, "train_speed(iter/s)": 0.200913 }, { "acc": 0.74943686, "epoch": 1.0046930492135973, "grad_norm": 2.734375, "learning_rate": 5.374218126617371e-06, "loss": 0.96427422, "memory(GiB)": 369.42, "step": 39605, "train_speed(iter/s)": 0.200916 }, { "acc": 0.74860573, "epoch": 1.0048198883815322, "grad_norm": 1.9921875, "learning_rate": 5.373172435808768e-06, "loss": 1.02633781, "memory(GiB)": 369.42, "step": 39610, "train_speed(iter/s)": 0.20092 }, { "acc": 0.75201273, "epoch": 1.0049467275494672, "grad_norm": 2.328125, "learning_rate": 5.372126728586372e-06, "loss": 1.01271486, "memory(GiB)": 369.42, "step": 39615, "train_speed(iter/s)": 0.200924 }, { "acc": 0.74933939, "epoch": 1.0050735667174022, "grad_norm": 2.171875, "learning_rate": 5.371081004996184e-06, "loss": 0.97749043, "memory(GiB)": 369.42, "step": 39620, "train_speed(iter/s)": 0.200927 }, { "acc": 0.73713422, "epoch": 1.0052004058853374, "grad_norm": 1.9296875, "learning_rate": 5.370035265084195e-06, "loss": 0.98697548, "memory(GiB)": 369.42, "step": 39625, "train_speed(iter/s)": 0.200931 }, { "acc": 0.75513196, "epoch": 1.0053272450532724, "grad_norm": 2.46875, "learning_rate": 5.3689895088964025e-06, "loss": 0.99248543, "memory(GiB)": 369.42, "step": 39630, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74685187, "epoch": 1.0054540842212074, "grad_norm": 2.265625, "learning_rate": 5.367943736478806e-06, "loss": 1.00941315, "memory(GiB)": 369.42, "step": 39635, "train_speed(iter/s)": 0.200938 }, { "acc": 0.74059668, "epoch": 1.0055809233891426, "grad_norm": 2.328125, "learning_rate": 5.3668979478774e-06, "loss": 1.05962048, "memory(GiB)": 369.42, "step": 39640, "train_speed(iter/s)": 0.200944 }, { "acc": 0.76201124, "epoch": 1.0057077625570776, "grad_norm": 2.203125, "learning_rate": 5.3658521431381836e-06, "loss": 0.96223354, "memory(GiB)": 369.42, "step": 39645, "train_speed(iter/s)": 0.200947 }, { "acc": 0.75487366, "epoch": 1.0058346017250126, "grad_norm": 2.0, "learning_rate": 5.364806322307158e-06, "loss": 0.96928539, "memory(GiB)": 369.42, "step": 39650, "train_speed(iter/s)": 0.200952 }, { "acc": 0.74635973, "epoch": 1.0059614408929478, "grad_norm": 2.28125, "learning_rate": 5.363760485430321e-06, "loss": 1.0034565, "memory(GiB)": 369.42, "step": 39655, "train_speed(iter/s)": 0.200956 }, { "acc": 0.75854578, "epoch": 1.0060882800608828, "grad_norm": 2.21875, "learning_rate": 5.3627146325536725e-06, "loss": 1.01576834, "memory(GiB)": 369.42, "step": 39660, "train_speed(iter/s)": 0.200959 }, { "acc": 0.77541938, "epoch": 1.0062151192288178, "grad_norm": 2.28125, "learning_rate": 5.361668763723216e-06, "loss": 0.88898258, "memory(GiB)": 369.42, "step": 39665, "train_speed(iter/s)": 0.200956 }, { "acc": 0.75516777, "epoch": 1.006341958396753, "grad_norm": 2.0625, "learning_rate": 5.360622878984954e-06, "loss": 0.97017746, "memory(GiB)": 369.42, "step": 39670, "train_speed(iter/s)": 0.200959 }, { "acc": 0.75221395, "epoch": 1.006468797564688, "grad_norm": 1.9609375, "learning_rate": 5.359576978384885e-06, "loss": 0.97536888, "memory(GiB)": 369.42, "step": 39675, "train_speed(iter/s)": 0.200963 }, { "acc": 0.75194774, "epoch": 1.006595636732623, "grad_norm": 2.140625, "learning_rate": 5.358531061969018e-06, "loss": 0.98309612, "memory(GiB)": 369.42, "step": 39680, "train_speed(iter/s)": 0.200967 }, { "acc": 0.74431286, "epoch": 1.0067224759005582, "grad_norm": 2.1875, "learning_rate": 5.357485129783351e-06, "loss": 1.06875343, "memory(GiB)": 369.42, "step": 39685, "train_speed(iter/s)": 0.20097 }, { "acc": 0.74726906, "epoch": 1.0068493150684932, "grad_norm": 2.40625, "learning_rate": 5.356439181873895e-06, "loss": 1.00556316, "memory(GiB)": 369.42, "step": 39690, "train_speed(iter/s)": 0.200975 }, { "acc": 0.76114559, "epoch": 1.0069761542364282, "grad_norm": 2.34375, "learning_rate": 5.35539321828665e-06, "loss": 0.92683783, "memory(GiB)": 369.42, "step": 39695, "train_speed(iter/s)": 0.200979 }, { "acc": 0.74647474, "epoch": 1.0071029934043634, "grad_norm": 2.015625, "learning_rate": 5.354347239067625e-06, "loss": 1.01889229, "memory(GiB)": 369.42, "step": 39700, "train_speed(iter/s)": 0.200981 }, { "acc": 0.75813313, "epoch": 1.0072298325722984, "grad_norm": 2.21875, "learning_rate": 5.3533012442628275e-06, "loss": 0.93721237, "memory(GiB)": 369.42, "step": 39705, "train_speed(iter/s)": 0.200985 }, { "acc": 0.76545639, "epoch": 1.0073566717402334, "grad_norm": 2.515625, "learning_rate": 5.3522552339182635e-06, "loss": 0.9083271, "memory(GiB)": 369.42, "step": 39710, "train_speed(iter/s)": 0.200986 }, { "acc": 0.75520058, "epoch": 1.0074835109081683, "grad_norm": 2.265625, "learning_rate": 5.351209208079941e-06, "loss": 1.02207375, "memory(GiB)": 369.42, "step": 39715, "train_speed(iter/s)": 0.200991 }, { "acc": 0.75274229, "epoch": 1.0076103500761036, "grad_norm": 2.34375, "learning_rate": 5.35016316679387e-06, "loss": 0.9879178, "memory(GiB)": 369.42, "step": 39720, "train_speed(iter/s)": 0.200996 }, { "acc": 0.75714178, "epoch": 1.0077371892440385, "grad_norm": 2.421875, "learning_rate": 5.349117110106059e-06, "loss": 0.90967293, "memory(GiB)": 369.42, "step": 39725, "train_speed(iter/s)": 0.201001 }, { "acc": 0.749228, "epoch": 1.0078640284119735, "grad_norm": 2.0, "learning_rate": 5.34807103806252e-06, "loss": 0.94461565, "memory(GiB)": 369.42, "step": 39730, "train_speed(iter/s)": 0.201004 }, { "acc": 0.74212451, "epoch": 1.0079908675799087, "grad_norm": 2.15625, "learning_rate": 5.347024950709262e-06, "loss": 1.00686493, "memory(GiB)": 369.42, "step": 39735, "train_speed(iter/s)": 0.201006 }, { "acc": 0.73935509, "epoch": 1.0081177067478437, "grad_norm": 2.09375, "learning_rate": 5.345978848092297e-06, "loss": 1.00975189, "memory(GiB)": 369.42, "step": 39740, "train_speed(iter/s)": 0.201011 }, { "acc": 0.76104383, "epoch": 1.0082445459157787, "grad_norm": 2.6875, "learning_rate": 5.344932730257637e-06, "loss": 0.96210213, "memory(GiB)": 369.42, "step": 39745, "train_speed(iter/s)": 0.201016 }, { "acc": 0.75153503, "epoch": 1.008371385083714, "grad_norm": 2.546875, "learning_rate": 5.343886597251298e-06, "loss": 1.01940632, "memory(GiB)": 369.42, "step": 39750, "train_speed(iter/s)": 0.201018 }, { "acc": 0.75329103, "epoch": 1.008498224251649, "grad_norm": 2.15625, "learning_rate": 5.342840449119287e-06, "loss": 0.98150997, "memory(GiB)": 369.42, "step": 39755, "train_speed(iter/s)": 0.201022 }, { "acc": 0.75071697, "epoch": 1.008625063419584, "grad_norm": 2.453125, "learning_rate": 5.341794285907627e-06, "loss": 0.99350243, "memory(GiB)": 369.42, "step": 39760, "train_speed(iter/s)": 0.201026 }, { "acc": 0.74681635, "epoch": 1.0087519025875191, "grad_norm": 2.03125, "learning_rate": 5.340748107662324e-06, "loss": 1.03584747, "memory(GiB)": 369.42, "step": 39765, "train_speed(iter/s)": 0.201031 }, { "acc": 0.7463644, "epoch": 1.0088787417554541, "grad_norm": 1.984375, "learning_rate": 5.339701914429402e-06, "loss": 1.02140541, "memory(GiB)": 369.42, "step": 39770, "train_speed(iter/s)": 0.201034 }, { "acc": 0.74535518, "epoch": 1.009005580923389, "grad_norm": 2.3125, "learning_rate": 5.338655706254871e-06, "loss": 1.0687542, "memory(GiB)": 369.42, "step": 39775, "train_speed(iter/s)": 0.201033 }, { "acc": 0.74172554, "epoch": 1.009132420091324, "grad_norm": 1.6640625, "learning_rate": 5.33760948318475e-06, "loss": 1.00535269, "memory(GiB)": 369.42, "step": 39780, "train_speed(iter/s)": 0.201037 }, { "acc": 0.75126791, "epoch": 1.0092592592592593, "grad_norm": 2.34375, "learning_rate": 5.336563245265056e-06, "loss": 1.01676397, "memory(GiB)": 369.42, "step": 39785, "train_speed(iter/s)": 0.20104 }, { "acc": 0.74306593, "epoch": 1.0093860984271943, "grad_norm": 2.390625, "learning_rate": 5.3355169925418095e-06, "loss": 1.03968019, "memory(GiB)": 369.42, "step": 39790, "train_speed(iter/s)": 0.201043 }, { "acc": 0.74881773, "epoch": 1.0095129375951293, "grad_norm": 1.7734375, "learning_rate": 5.334470725061027e-06, "loss": 1.0033165, "memory(GiB)": 369.42, "step": 39795, "train_speed(iter/s)": 0.201047 }, { "acc": 0.75451412, "epoch": 1.0096397767630645, "grad_norm": 2.140625, "learning_rate": 5.333424442868729e-06, "loss": 1.00664158, "memory(GiB)": 369.42, "step": 39800, "train_speed(iter/s)": 0.201051 }, { "acc": 0.74394569, "epoch": 1.0097666159309995, "grad_norm": 2.40625, "learning_rate": 5.3323781460109345e-06, "loss": 1.03250875, "memory(GiB)": 369.42, "step": 39805, "train_speed(iter/s)": 0.201054 }, { "acc": 0.75403838, "epoch": 1.0098934550989345, "grad_norm": 2.546875, "learning_rate": 5.3313318345336665e-06, "loss": 0.97920952, "memory(GiB)": 369.42, "step": 39810, "train_speed(iter/s)": 0.201057 }, { "acc": 0.76735249, "epoch": 1.0100202942668697, "grad_norm": 2.3125, "learning_rate": 5.330285508482944e-06, "loss": 0.9918746, "memory(GiB)": 369.42, "step": 39815, "train_speed(iter/s)": 0.201062 }, { "acc": 0.75629425, "epoch": 1.0101471334348047, "grad_norm": 2.9375, "learning_rate": 5.3292391679047905e-06, "loss": 0.98838987, "memory(GiB)": 369.42, "step": 39820, "train_speed(iter/s)": 0.201064 }, { "acc": 0.74289522, "epoch": 1.0102739726027397, "grad_norm": 1.875, "learning_rate": 5.328192812845228e-06, "loss": 1.02976265, "memory(GiB)": 369.42, "step": 39825, "train_speed(iter/s)": 0.201068 }, { "acc": 0.7598835, "epoch": 1.0104008117706749, "grad_norm": 2.09375, "learning_rate": 5.3271464433502805e-06, "loss": 0.93900795, "memory(GiB)": 369.42, "step": 39830, "train_speed(iter/s)": 0.201071 }, { "acc": 0.75039792, "epoch": 1.0105276509386099, "grad_norm": 2.140625, "learning_rate": 5.3261000594659715e-06, "loss": 1.03581104, "memory(GiB)": 369.42, "step": 39835, "train_speed(iter/s)": 0.201075 }, { "acc": 0.73561993, "epoch": 1.0106544901065448, "grad_norm": 2.25, "learning_rate": 5.3250536612383275e-06, "loss": 1.05576668, "memory(GiB)": 369.42, "step": 39840, "train_speed(iter/s)": 0.201075 }, { "acc": 0.75954275, "epoch": 1.01078132927448, "grad_norm": 1.8515625, "learning_rate": 5.32400724871337e-06, "loss": 0.92309456, "memory(GiB)": 369.42, "step": 39845, "train_speed(iter/s)": 0.201077 }, { "acc": 0.75819602, "epoch": 1.010908168442415, "grad_norm": 2.171875, "learning_rate": 5.322960821937129e-06, "loss": 0.96596947, "memory(GiB)": 369.42, "step": 39850, "train_speed(iter/s)": 0.201082 }, { "acc": 0.7342031, "epoch": 1.01103500761035, "grad_norm": 2.375, "learning_rate": 5.321914380955628e-06, "loss": 1.02663383, "memory(GiB)": 369.42, "step": 39855, "train_speed(iter/s)": 0.201085 }, { "acc": 0.75496531, "epoch": 1.0111618467782852, "grad_norm": 2.0625, "learning_rate": 5.320867925814896e-06, "loss": 0.97910194, "memory(GiB)": 369.42, "step": 39860, "train_speed(iter/s)": 0.201088 }, { "acc": 0.76106224, "epoch": 1.0112886859462202, "grad_norm": 2.15625, "learning_rate": 5.31982145656096e-06, "loss": 0.98915987, "memory(GiB)": 369.42, "step": 39865, "train_speed(iter/s)": 0.201094 }, { "acc": 0.75041046, "epoch": 1.0114155251141552, "grad_norm": 1.8359375, "learning_rate": 5.318774973239849e-06, "loss": 0.93298006, "memory(GiB)": 369.42, "step": 39870, "train_speed(iter/s)": 0.201096 }, { "acc": 0.75189967, "epoch": 1.0115423642820902, "grad_norm": 1.8984375, "learning_rate": 5.31772847589759e-06, "loss": 0.99047146, "memory(GiB)": 369.42, "step": 39875, "train_speed(iter/s)": 0.201101 }, { "acc": 0.7577302, "epoch": 1.0116692034500254, "grad_norm": 2.4375, "learning_rate": 5.316681964580215e-06, "loss": 0.95711298, "memory(GiB)": 369.42, "step": 39880, "train_speed(iter/s)": 0.201106 }, { "acc": 0.75264997, "epoch": 1.0117960426179604, "grad_norm": 2.75, "learning_rate": 5.315635439333753e-06, "loss": 0.98705873, "memory(GiB)": 369.42, "step": 39885, "train_speed(iter/s)": 0.201106 }, { "acc": 0.74756708, "epoch": 1.0119228817858954, "grad_norm": 2.125, "learning_rate": 5.314588900204235e-06, "loss": 0.99214382, "memory(GiB)": 369.42, "step": 39890, "train_speed(iter/s)": 0.201111 }, { "acc": 0.74334536, "epoch": 1.0120497209538306, "grad_norm": 2.453125, "learning_rate": 5.313542347237692e-06, "loss": 1.03835602, "memory(GiB)": 369.42, "step": 39895, "train_speed(iter/s)": 0.201116 }, { "acc": 0.73788347, "epoch": 1.0121765601217656, "grad_norm": 2.5, "learning_rate": 5.312495780480159e-06, "loss": 1.02488346, "memory(GiB)": 369.42, "step": 39900, "train_speed(iter/s)": 0.20112 }, { "acc": 0.76077547, "epoch": 1.0123033992897006, "grad_norm": 1.9609375, "learning_rate": 5.311449199977664e-06, "loss": 0.99039679, "memory(GiB)": 369.42, "step": 39905, "train_speed(iter/s)": 0.201124 }, { "acc": 0.76120033, "epoch": 1.0124302384576358, "grad_norm": 2.34375, "learning_rate": 5.310402605776245e-06, "loss": 0.95283108, "memory(GiB)": 369.42, "step": 39910, "train_speed(iter/s)": 0.201128 }, { "acc": 0.74886637, "epoch": 1.0125570776255708, "grad_norm": 1.96875, "learning_rate": 5.309355997921931e-06, "loss": 0.9838666, "memory(GiB)": 369.42, "step": 39915, "train_speed(iter/s)": 0.201131 }, { "acc": 0.76557727, "epoch": 1.0126839167935058, "grad_norm": 2.046875, "learning_rate": 5.308309376460761e-06, "loss": 0.99549561, "memory(GiB)": 369.42, "step": 39920, "train_speed(iter/s)": 0.201134 }, { "acc": 0.76075974, "epoch": 1.012810755961441, "grad_norm": 1.8984375, "learning_rate": 5.307262741438767e-06, "loss": 0.94132881, "memory(GiB)": 369.42, "step": 39925, "train_speed(iter/s)": 0.201138 }, { "acc": 0.76021261, "epoch": 1.012937595129376, "grad_norm": 2.125, "learning_rate": 5.3062160929019855e-06, "loss": 0.96204605, "memory(GiB)": 369.42, "step": 39930, "train_speed(iter/s)": 0.201144 }, { "acc": 0.76257029, "epoch": 1.013064434297311, "grad_norm": 2.03125, "learning_rate": 5.305169430896454e-06, "loss": 0.93156052, "memory(GiB)": 369.42, "step": 39935, "train_speed(iter/s)": 0.201148 }, { "acc": 0.75875731, "epoch": 1.013191273465246, "grad_norm": 1.8203125, "learning_rate": 5.304122755468209e-06, "loss": 0.90983658, "memory(GiB)": 369.42, "step": 39940, "train_speed(iter/s)": 0.201151 }, { "acc": 0.75277891, "epoch": 1.0133181126331812, "grad_norm": 2.359375, "learning_rate": 5.303076066663286e-06, "loss": 0.92173347, "memory(GiB)": 369.42, "step": 39945, "train_speed(iter/s)": 0.201154 }, { "acc": 0.74436464, "epoch": 1.0134449518011162, "grad_norm": 2.171875, "learning_rate": 5.302029364527726e-06, "loss": 0.98701801, "memory(GiB)": 369.42, "step": 39950, "train_speed(iter/s)": 0.201156 }, { "acc": 0.74370074, "epoch": 1.0135717909690511, "grad_norm": 2.625, "learning_rate": 5.3009826491075645e-06, "loss": 1.05429144, "memory(GiB)": 369.42, "step": 39955, "train_speed(iter/s)": 0.20116 }, { "acc": 0.75300426, "epoch": 1.0136986301369864, "grad_norm": 2.328125, "learning_rate": 5.299935920448843e-06, "loss": 0.95740814, "memory(GiB)": 369.42, "step": 39960, "train_speed(iter/s)": 0.201163 }, { "acc": 0.75605364, "epoch": 1.0138254693049213, "grad_norm": 2.015625, "learning_rate": 5.298889178597599e-06, "loss": 0.96264133, "memory(GiB)": 369.42, "step": 39965, "train_speed(iter/s)": 0.201167 }, { "acc": 0.7434103, "epoch": 1.0139523084728563, "grad_norm": 2.578125, "learning_rate": 5.297842423599877e-06, "loss": 0.98402863, "memory(GiB)": 369.42, "step": 39970, "train_speed(iter/s)": 0.201171 }, { "acc": 0.76144772, "epoch": 1.0140791476407915, "grad_norm": 2.28125, "learning_rate": 5.296795655501714e-06, "loss": 1.0053133, "memory(GiB)": 369.42, "step": 39975, "train_speed(iter/s)": 0.201176 }, { "acc": 0.75232878, "epoch": 1.0142059868087265, "grad_norm": 2.25, "learning_rate": 5.295748874349155e-06, "loss": 1.03090668, "memory(GiB)": 369.42, "step": 39980, "train_speed(iter/s)": 0.201179 }, { "acc": 0.74804583, "epoch": 1.0143328259766615, "grad_norm": 2.0625, "learning_rate": 5.294702080188236e-06, "loss": 0.9501339, "memory(GiB)": 369.42, "step": 39985, "train_speed(iter/s)": 0.20118 }, { "acc": 0.75002642, "epoch": 1.0144596651445967, "grad_norm": 2.1875, "learning_rate": 5.293655273065008e-06, "loss": 0.96966515, "memory(GiB)": 369.42, "step": 39990, "train_speed(iter/s)": 0.201184 }, { "acc": 0.76137538, "epoch": 1.0145865043125317, "grad_norm": 1.9765625, "learning_rate": 5.2926084530255076e-06, "loss": 0.95002193, "memory(GiB)": 369.42, "step": 39995, "train_speed(iter/s)": 0.201188 }, { "acc": 0.75106907, "epoch": 1.0147133434804667, "grad_norm": 2.171875, "learning_rate": 5.291561620115781e-06, "loss": 0.97068415, "memory(GiB)": 369.42, "step": 40000, "train_speed(iter/s)": 0.201191 }, { "epoch": 1.0147133434804667, "eval_acc": 0.7377210876822562, "eval_loss": 0.9705473780632019, "eval_runtime": 385.4953, "eval_samples_per_second": 16.524, "eval_steps_per_second": 8.262, "step": 40000 }, { "acc": 0.74539175, "epoch": 1.014840182648402, "grad_norm": 2.3125, "learning_rate": 5.290514774381874e-06, "loss": 1.04351482, "memory(GiB)": 369.42, "step": 40005, "train_speed(iter/s)": 0.200471 }, { "acc": 0.75486193, "epoch": 1.014967021816337, "grad_norm": 2.234375, "learning_rate": 5.289467915869829e-06, "loss": 0.98337746, "memory(GiB)": 369.42, "step": 40010, "train_speed(iter/s)": 0.200475 }, { "acc": 0.75751505, "epoch": 1.015093860984272, "grad_norm": 2.359375, "learning_rate": 5.288421044625694e-06, "loss": 0.98726215, "memory(GiB)": 369.42, "step": 40015, "train_speed(iter/s)": 0.200477 }, { "acc": 0.74939337, "epoch": 1.0152207001522071, "grad_norm": 2.15625, "learning_rate": 5.287374160695513e-06, "loss": 0.9990757, "memory(GiB)": 369.42, "step": 40020, "train_speed(iter/s)": 0.200482 }, { "acc": 0.74987135, "epoch": 1.015347539320142, "grad_norm": 1.984375, "learning_rate": 5.286327264125332e-06, "loss": 1.0487133, "memory(GiB)": 369.42, "step": 40025, "train_speed(iter/s)": 0.200487 }, { "acc": 0.74662733, "epoch": 1.015474378488077, "grad_norm": 2.34375, "learning_rate": 5.285280354961202e-06, "loss": 1.0529541, "memory(GiB)": 369.42, "step": 40030, "train_speed(iter/s)": 0.200491 }, { "acc": 0.75651512, "epoch": 1.015601217656012, "grad_norm": 2.21875, "learning_rate": 5.284233433249167e-06, "loss": 0.92253599, "memory(GiB)": 369.42, "step": 40035, "train_speed(iter/s)": 0.200495 }, { "acc": 0.7476326, "epoch": 1.0157280568239473, "grad_norm": 2.640625, "learning_rate": 5.283186499035276e-06, "loss": 0.94651022, "memory(GiB)": 369.42, "step": 40040, "train_speed(iter/s)": 0.200499 }, { "acc": 0.7513587, "epoch": 1.0158548959918823, "grad_norm": 2.203125, "learning_rate": 5.2821395523655795e-06, "loss": 0.94912682, "memory(GiB)": 369.42, "step": 40045, "train_speed(iter/s)": 0.200501 }, { "acc": 0.7637991, "epoch": 1.0159817351598173, "grad_norm": 2.3125, "learning_rate": 5.281092593286127e-06, "loss": 0.93436241, "memory(GiB)": 369.42, "step": 40050, "train_speed(iter/s)": 0.200501 }, { "acc": 0.7531106, "epoch": 1.0161085743277525, "grad_norm": 2.046875, "learning_rate": 5.280045621842964e-06, "loss": 0.97561836, "memory(GiB)": 369.42, "step": 40055, "train_speed(iter/s)": 0.200507 }, { "acc": 0.73737535, "epoch": 1.0162354134956875, "grad_norm": 1.875, "learning_rate": 5.278998638082148e-06, "loss": 0.9997016, "memory(GiB)": 369.42, "step": 40060, "train_speed(iter/s)": 0.200509 }, { "acc": 0.74917812, "epoch": 1.0163622526636225, "grad_norm": 2.015625, "learning_rate": 5.277951642049722e-06, "loss": 0.99846773, "memory(GiB)": 369.42, "step": 40065, "train_speed(iter/s)": 0.20051 }, { "acc": 0.75223846, "epoch": 1.0164890918315577, "grad_norm": 1.921875, "learning_rate": 5.276904633791745e-06, "loss": 0.95761395, "memory(GiB)": 369.42, "step": 40070, "train_speed(iter/s)": 0.200514 }, { "acc": 0.75210409, "epoch": 1.0166159309994927, "grad_norm": 2.28125, "learning_rate": 5.275857613354265e-06, "loss": 0.96193123, "memory(GiB)": 369.42, "step": 40075, "train_speed(iter/s)": 0.200518 }, { "acc": 0.76234188, "epoch": 1.0167427701674276, "grad_norm": 1.8828125, "learning_rate": 5.274810580783335e-06, "loss": 0.96318684, "memory(GiB)": 369.42, "step": 40080, "train_speed(iter/s)": 0.200522 }, { "acc": 0.76547432, "epoch": 1.0168696093353629, "grad_norm": 2.125, "learning_rate": 5.2737635361250094e-06, "loss": 0.96016178, "memory(GiB)": 369.42, "step": 40085, "train_speed(iter/s)": 0.200526 }, { "acc": 0.74822259, "epoch": 1.0169964485032978, "grad_norm": 2.359375, "learning_rate": 5.2727164794253415e-06, "loss": 0.97352991, "memory(GiB)": 369.42, "step": 40090, "train_speed(iter/s)": 0.20053 }, { "acc": 0.74282804, "epoch": 1.0171232876712328, "grad_norm": 2.296875, "learning_rate": 5.271669410730384e-06, "loss": 0.99573689, "memory(GiB)": 369.42, "step": 40095, "train_speed(iter/s)": 0.200535 }, { "acc": 0.75135412, "epoch": 1.0172501268391678, "grad_norm": 2.15625, "learning_rate": 5.270622330086194e-06, "loss": 1.01522694, "memory(GiB)": 369.42, "step": 40100, "train_speed(iter/s)": 0.20054 }, { "acc": 0.74460535, "epoch": 1.017376966007103, "grad_norm": 1.9453125, "learning_rate": 5.269575237538827e-06, "loss": 0.96282644, "memory(GiB)": 369.42, "step": 40105, "train_speed(iter/s)": 0.200542 }, { "acc": 0.73920264, "epoch": 1.017503805175038, "grad_norm": 2.203125, "learning_rate": 5.268528133134335e-06, "loss": 1.00100765, "memory(GiB)": 369.42, "step": 40110, "train_speed(iter/s)": 0.200545 }, { "acc": 0.75375986, "epoch": 1.017630644342973, "grad_norm": 2.15625, "learning_rate": 5.267481016918776e-06, "loss": 0.95225906, "memory(GiB)": 369.42, "step": 40115, "train_speed(iter/s)": 0.200548 }, { "acc": 0.75395417, "epoch": 1.0177574835109082, "grad_norm": 2.21875, "learning_rate": 5.266433888938212e-06, "loss": 0.96301117, "memory(GiB)": 369.42, "step": 40120, "train_speed(iter/s)": 0.200552 }, { "acc": 0.76211996, "epoch": 1.0178843226788432, "grad_norm": 2.046875, "learning_rate": 5.265386749238691e-06, "loss": 0.94663668, "memory(GiB)": 369.42, "step": 40125, "train_speed(iter/s)": 0.200556 }, { "acc": 0.74715176, "epoch": 1.0180111618467782, "grad_norm": 2.171875, "learning_rate": 5.26433959786628e-06, "loss": 0.96206818, "memory(GiB)": 369.42, "step": 40130, "train_speed(iter/s)": 0.200556 }, { "acc": 0.7479836, "epoch": 1.0181380010147134, "grad_norm": 2.515625, "learning_rate": 5.263292434867031e-06, "loss": 0.97687397, "memory(GiB)": 369.42, "step": 40135, "train_speed(iter/s)": 0.200559 }, { "acc": 0.7608644, "epoch": 1.0182648401826484, "grad_norm": 1.7890625, "learning_rate": 5.262245260287006e-06, "loss": 0.93304729, "memory(GiB)": 369.42, "step": 40140, "train_speed(iter/s)": 0.200562 }, { "acc": 0.74158587, "epoch": 1.0183916793505834, "grad_norm": 2.875, "learning_rate": 5.261198074172262e-06, "loss": 1.02256374, "memory(GiB)": 369.42, "step": 40145, "train_speed(iter/s)": 0.200567 }, { "acc": 0.75795956, "epoch": 1.0185185185185186, "grad_norm": 2.234375, "learning_rate": 5.260150876568862e-06, "loss": 0.98723984, "memory(GiB)": 369.42, "step": 40150, "train_speed(iter/s)": 0.200572 }, { "acc": 0.75011468, "epoch": 1.0186453576864536, "grad_norm": 1.9140625, "learning_rate": 5.259103667522866e-06, "loss": 1.00245647, "memory(GiB)": 369.42, "step": 40155, "train_speed(iter/s)": 0.200576 }, { "acc": 0.7520052, "epoch": 1.0187721968543886, "grad_norm": 1.9453125, "learning_rate": 5.258056447080333e-06, "loss": 1.01918468, "memory(GiB)": 369.42, "step": 40160, "train_speed(iter/s)": 0.200579 }, { "acc": 0.74845028, "epoch": 1.0188990360223238, "grad_norm": 1.953125, "learning_rate": 5.257009215287325e-06, "loss": 1.0215107, "memory(GiB)": 369.42, "step": 40165, "train_speed(iter/s)": 0.200583 }, { "acc": 0.75351639, "epoch": 1.0190258751902588, "grad_norm": 2.09375, "learning_rate": 5.255961972189905e-06, "loss": 0.9961277, "memory(GiB)": 369.42, "step": 40170, "train_speed(iter/s)": 0.200588 }, { "acc": 0.74099998, "epoch": 1.0191527143581938, "grad_norm": 2.265625, "learning_rate": 5.254914717834133e-06, "loss": 1.00437689, "memory(GiB)": 369.42, "step": 40175, "train_speed(iter/s)": 0.200591 }, { "acc": 0.75017014, "epoch": 1.019279553526129, "grad_norm": 2.0, "learning_rate": 5.253867452266075e-06, "loss": 0.95822887, "memory(GiB)": 369.42, "step": 40180, "train_speed(iter/s)": 0.200595 }, { "acc": 0.76960034, "epoch": 1.019406392694064, "grad_norm": 2.53125, "learning_rate": 5.252820175531792e-06, "loss": 0.98029375, "memory(GiB)": 369.42, "step": 40185, "train_speed(iter/s)": 0.2006 }, { "acc": 0.74972486, "epoch": 1.019533231861999, "grad_norm": 2.796875, "learning_rate": 5.25177288767735e-06, "loss": 1.0028512, "memory(GiB)": 369.42, "step": 40190, "train_speed(iter/s)": 0.200604 }, { "acc": 0.75933237, "epoch": 1.019660071029934, "grad_norm": 2.109375, "learning_rate": 5.250725588748811e-06, "loss": 0.94964828, "memory(GiB)": 369.42, "step": 40195, "train_speed(iter/s)": 0.200609 }, { "acc": 0.74012709, "epoch": 1.0197869101978692, "grad_norm": 2.09375, "learning_rate": 5.249678278792243e-06, "loss": 1.07032633, "memory(GiB)": 369.42, "step": 40200, "train_speed(iter/s)": 0.200611 }, { "acc": 0.76831131, "epoch": 1.0199137493658041, "grad_norm": 2.15625, "learning_rate": 5.248630957853708e-06, "loss": 0.93627377, "memory(GiB)": 369.42, "step": 40205, "train_speed(iter/s)": 0.200615 }, { "acc": 0.7537786, "epoch": 1.0200405885337391, "grad_norm": 2.046875, "learning_rate": 5.247583625979276e-06, "loss": 0.94224567, "memory(GiB)": 369.42, "step": 40210, "train_speed(iter/s)": 0.200617 }, { "acc": 0.73825655, "epoch": 1.0201674277016743, "grad_norm": 2.515625, "learning_rate": 5.246536283215007e-06, "loss": 1.03235111, "memory(GiB)": 369.42, "step": 40215, "train_speed(iter/s)": 0.200621 }, { "acc": 0.75336103, "epoch": 1.0202942668696093, "grad_norm": 2.125, "learning_rate": 5.245488929606974e-06, "loss": 0.9808075, "memory(GiB)": 369.42, "step": 40220, "train_speed(iter/s)": 0.200624 }, { "acc": 0.75646334, "epoch": 1.0204211060375443, "grad_norm": 1.671875, "learning_rate": 5.244441565201241e-06, "loss": 0.98862667, "memory(GiB)": 369.42, "step": 40225, "train_speed(iter/s)": 0.200627 }, { "acc": 0.75101743, "epoch": 1.0205479452054795, "grad_norm": 2.28125, "learning_rate": 5.243394190043877e-06, "loss": 0.95197916, "memory(GiB)": 369.42, "step": 40230, "train_speed(iter/s)": 0.200631 }, { "acc": 0.75756035, "epoch": 1.0206747843734145, "grad_norm": 2.25, "learning_rate": 5.242346804180949e-06, "loss": 0.97151136, "memory(GiB)": 369.42, "step": 40235, "train_speed(iter/s)": 0.200636 }, { "acc": 0.7539938, "epoch": 1.0208016235413495, "grad_norm": 2.0625, "learning_rate": 5.241299407658528e-06, "loss": 0.98247766, "memory(GiB)": 369.42, "step": 40240, "train_speed(iter/s)": 0.20064 }, { "acc": 0.76157932, "epoch": 1.0209284627092847, "grad_norm": 1.984375, "learning_rate": 5.240252000522681e-06, "loss": 0.94627514, "memory(GiB)": 369.42, "step": 40245, "train_speed(iter/s)": 0.20064 }, { "acc": 0.7448844, "epoch": 1.0210553018772197, "grad_norm": 2.515625, "learning_rate": 5.239204582819479e-06, "loss": 1.01879044, "memory(GiB)": 369.42, "step": 40250, "train_speed(iter/s)": 0.200645 }, { "acc": 0.75007792, "epoch": 1.0211821410451547, "grad_norm": 2.46875, "learning_rate": 5.238157154594989e-06, "loss": 0.97506504, "memory(GiB)": 369.42, "step": 40255, "train_speed(iter/s)": 0.200649 }, { "acc": 0.76032438, "epoch": 1.0213089802130897, "grad_norm": 2.0625, "learning_rate": 5.237109715895287e-06, "loss": 0.91175327, "memory(GiB)": 369.42, "step": 40260, "train_speed(iter/s)": 0.200653 }, { "acc": 0.76369743, "epoch": 1.021435819381025, "grad_norm": 2.109375, "learning_rate": 5.2360622667664385e-06, "loss": 0.95752249, "memory(GiB)": 369.42, "step": 40265, "train_speed(iter/s)": 0.200658 }, { "acc": 0.76841197, "epoch": 1.02156265854896, "grad_norm": 2.328125, "learning_rate": 5.235014807254521e-06, "loss": 0.93105106, "memory(GiB)": 369.42, "step": 40270, "train_speed(iter/s)": 0.20066 }, { "acc": 0.7496376, "epoch": 1.0216894977168949, "grad_norm": 2.4375, "learning_rate": 5.233967337405599e-06, "loss": 0.9913805, "memory(GiB)": 369.42, "step": 40275, "train_speed(iter/s)": 0.200665 }, { "acc": 0.75852571, "epoch": 1.02181633688483, "grad_norm": 1.8671875, "learning_rate": 5.232919857265752e-06, "loss": 0.95737782, "memory(GiB)": 369.42, "step": 40280, "train_speed(iter/s)": 0.200667 }, { "acc": 0.74352646, "epoch": 1.021943176052765, "grad_norm": 2.046875, "learning_rate": 5.231872366881048e-06, "loss": 0.97222347, "memory(GiB)": 369.42, "step": 40285, "train_speed(iter/s)": 0.200672 }, { "acc": 0.75083766, "epoch": 1.0220700152207, "grad_norm": 1.875, "learning_rate": 5.230824866297563e-06, "loss": 0.98030605, "memory(GiB)": 369.42, "step": 40290, "train_speed(iter/s)": 0.200673 }, { "acc": 0.75709534, "epoch": 1.0221968543886353, "grad_norm": 2.203125, "learning_rate": 5.229777355561368e-06, "loss": 0.97802277, "memory(GiB)": 369.42, "step": 40295, "train_speed(iter/s)": 0.200677 }, { "acc": 0.75870466, "epoch": 1.0223236935565703, "grad_norm": 2.59375, "learning_rate": 5.2287298347185415e-06, "loss": 0.96344185, "memory(GiB)": 369.42, "step": 40300, "train_speed(iter/s)": 0.200681 }, { "acc": 0.73947983, "epoch": 1.0224505327245053, "grad_norm": 2.234375, "learning_rate": 5.227682303815155e-06, "loss": 0.99399662, "memory(GiB)": 369.42, "step": 40305, "train_speed(iter/s)": 0.200681 }, { "acc": 0.74795771, "epoch": 1.0225773718924405, "grad_norm": 1.9453125, "learning_rate": 5.226634762897284e-06, "loss": 0.99694958, "memory(GiB)": 369.42, "step": 40310, "train_speed(iter/s)": 0.200685 }, { "acc": 0.75325999, "epoch": 1.0227042110603755, "grad_norm": 1.96875, "learning_rate": 5.225587212011004e-06, "loss": 0.9306591, "memory(GiB)": 369.42, "step": 40315, "train_speed(iter/s)": 0.200689 }, { "acc": 0.74998446, "epoch": 1.0228310502283104, "grad_norm": 2.0625, "learning_rate": 5.224539651202391e-06, "loss": 0.95492134, "memory(GiB)": 369.42, "step": 40320, "train_speed(iter/s)": 0.200693 }, { "acc": 0.75413775, "epoch": 1.0229578893962457, "grad_norm": 1.9921875, "learning_rate": 5.223492080517523e-06, "loss": 0.99346848, "memory(GiB)": 369.42, "step": 40325, "train_speed(iter/s)": 0.200694 }, { "acc": 0.74568958, "epoch": 1.0230847285641806, "grad_norm": 2.5625, "learning_rate": 5.2224445000024744e-06, "loss": 1.01728058, "memory(GiB)": 369.42, "step": 40330, "train_speed(iter/s)": 0.200697 }, { "acc": 0.74265099, "epoch": 1.0232115677321156, "grad_norm": 2.25, "learning_rate": 5.221396909703322e-06, "loss": 0.96931324, "memory(GiB)": 369.42, "step": 40335, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75932426, "epoch": 1.0233384069000508, "grad_norm": 2.015625, "learning_rate": 5.220349309666148e-06, "loss": 0.91373291, "memory(GiB)": 369.42, "step": 40340, "train_speed(iter/s)": 0.200706 }, { "acc": 0.74684658, "epoch": 1.0234652460679858, "grad_norm": 2.609375, "learning_rate": 5.2193016999370265e-06, "loss": 1.00240879, "memory(GiB)": 369.42, "step": 40345, "train_speed(iter/s)": 0.20071 }, { "acc": 0.74714489, "epoch": 1.0235920852359208, "grad_norm": 2.09375, "learning_rate": 5.218254080562038e-06, "loss": 0.95851307, "memory(GiB)": 369.42, "step": 40350, "train_speed(iter/s)": 0.200716 }, { "acc": 0.76801572, "epoch": 1.0237189244038558, "grad_norm": 2.515625, "learning_rate": 5.2172064515872585e-06, "loss": 0.94886999, "memory(GiB)": 369.42, "step": 40355, "train_speed(iter/s)": 0.200721 }, { "acc": 0.74694109, "epoch": 1.023845763571791, "grad_norm": 2.578125, "learning_rate": 5.21615881305877e-06, "loss": 0.99633141, "memory(GiB)": 369.42, "step": 40360, "train_speed(iter/s)": 0.200725 }, { "acc": 0.74274955, "epoch": 1.023972602739726, "grad_norm": 2.03125, "learning_rate": 5.215111165022653e-06, "loss": 0.9605485, "memory(GiB)": 369.42, "step": 40365, "train_speed(iter/s)": 0.200727 }, { "acc": 0.76290922, "epoch": 1.024099441907661, "grad_norm": 1.9765625, "learning_rate": 5.2140635075249856e-06, "loss": 0.90740938, "memory(GiB)": 369.42, "step": 40370, "train_speed(iter/s)": 0.200729 }, { "acc": 0.76055326, "epoch": 1.0242262810755962, "grad_norm": 2.203125, "learning_rate": 5.213015840611851e-06, "loss": 0.96253109, "memory(GiB)": 369.42, "step": 40375, "train_speed(iter/s)": 0.200733 }, { "acc": 0.7442327, "epoch": 1.0243531202435312, "grad_norm": 2.515625, "learning_rate": 5.211968164329328e-06, "loss": 1.01371412, "memory(GiB)": 369.42, "step": 40380, "train_speed(iter/s)": 0.200736 }, { "acc": 0.7540678, "epoch": 1.0244799594114662, "grad_norm": 2.140625, "learning_rate": 5.210920478723497e-06, "loss": 0.99934368, "memory(GiB)": 369.42, "step": 40385, "train_speed(iter/s)": 0.20074 }, { "acc": 0.74198627, "epoch": 1.0246067985794014, "grad_norm": 2.375, "learning_rate": 5.209872783840443e-06, "loss": 1.05740738, "memory(GiB)": 369.42, "step": 40390, "train_speed(iter/s)": 0.200742 }, { "acc": 0.74930716, "epoch": 1.0247336377473364, "grad_norm": 1.9296875, "learning_rate": 5.208825079726248e-06, "loss": 0.99701376, "memory(GiB)": 369.42, "step": 40395, "train_speed(iter/s)": 0.200745 }, { "acc": 0.7550724, "epoch": 1.0248604769152714, "grad_norm": 2.0625, "learning_rate": 5.207777366426992e-06, "loss": 0.98587017, "memory(GiB)": 369.42, "step": 40400, "train_speed(iter/s)": 0.200747 }, { "acc": 0.74730687, "epoch": 1.0249873160832066, "grad_norm": 2.296875, "learning_rate": 5.206729643988759e-06, "loss": 0.98529949, "memory(GiB)": 369.42, "step": 40405, "train_speed(iter/s)": 0.200753 }, { "acc": 0.75882378, "epoch": 1.0251141552511416, "grad_norm": 2.21875, "learning_rate": 5.205681912457635e-06, "loss": 0.95595064, "memory(GiB)": 369.42, "step": 40410, "train_speed(iter/s)": 0.200754 }, { "acc": 0.7418644, "epoch": 1.0252409944190766, "grad_norm": 2.828125, "learning_rate": 5.204634171879701e-06, "loss": 1.07528543, "memory(GiB)": 369.42, "step": 40415, "train_speed(iter/s)": 0.200757 }, { "acc": 0.75867858, "epoch": 1.0253678335870116, "grad_norm": 1.7578125, "learning_rate": 5.2035864223010445e-06, "loss": 0.90105486, "memory(GiB)": 369.42, "step": 40420, "train_speed(iter/s)": 0.200761 }, { "acc": 0.74153385, "epoch": 1.0254946727549468, "grad_norm": 2.421875, "learning_rate": 5.202538663767746e-06, "loss": 1.02211781, "memory(GiB)": 369.42, "step": 40425, "train_speed(iter/s)": 0.200764 }, { "acc": 0.74531636, "epoch": 1.0256215119228818, "grad_norm": 2.28125, "learning_rate": 5.201490896325895e-06, "loss": 1.01046963, "memory(GiB)": 369.42, "step": 40430, "train_speed(iter/s)": 0.200769 }, { "acc": 0.75056577, "epoch": 1.0257483510908167, "grad_norm": 2.046875, "learning_rate": 5.200443120021572e-06, "loss": 0.99021187, "memory(GiB)": 369.42, "step": 40435, "train_speed(iter/s)": 0.200773 }, { "acc": 0.75936432, "epoch": 1.025875190258752, "grad_norm": 2.15625, "learning_rate": 5.199395334900868e-06, "loss": 0.96186476, "memory(GiB)": 369.42, "step": 40440, "train_speed(iter/s)": 0.200778 }, { "acc": 0.75026078, "epoch": 1.026002029426687, "grad_norm": 2.515625, "learning_rate": 5.198347541009866e-06, "loss": 0.97329311, "memory(GiB)": 369.42, "step": 40445, "train_speed(iter/s)": 0.200782 }, { "acc": 0.74647236, "epoch": 1.026128868594622, "grad_norm": 2.3125, "learning_rate": 5.197299738394654e-06, "loss": 1.02344046, "memory(GiB)": 369.42, "step": 40450, "train_speed(iter/s)": 0.200784 }, { "acc": 0.75144606, "epoch": 1.0262557077625571, "grad_norm": 1.953125, "learning_rate": 5.196251927101318e-06, "loss": 0.97472382, "memory(GiB)": 369.42, "step": 40455, "train_speed(iter/s)": 0.200788 }, { "acc": 0.76052356, "epoch": 1.0263825469304921, "grad_norm": 2.171875, "learning_rate": 5.195204107175946e-06, "loss": 0.88892756, "memory(GiB)": 369.42, "step": 40460, "train_speed(iter/s)": 0.200792 }, { "acc": 0.74903245, "epoch": 1.0265093860984271, "grad_norm": 2.421875, "learning_rate": 5.194156278664627e-06, "loss": 1.02542019, "memory(GiB)": 369.42, "step": 40465, "train_speed(iter/s)": 0.200795 }, { "acc": 0.75455933, "epoch": 1.0266362252663623, "grad_norm": 1.9140625, "learning_rate": 5.1931084416134466e-06, "loss": 0.99637375, "memory(GiB)": 369.42, "step": 40470, "train_speed(iter/s)": 0.200799 }, { "acc": 0.75087452, "epoch": 1.0267630644342973, "grad_norm": 1.7890625, "learning_rate": 5.192060596068496e-06, "loss": 1.00308266, "memory(GiB)": 369.42, "step": 40475, "train_speed(iter/s)": 0.200803 }, { "acc": 0.75508165, "epoch": 1.0268899036022323, "grad_norm": 2.109375, "learning_rate": 5.191012742075863e-06, "loss": 0.973036, "memory(GiB)": 369.42, "step": 40480, "train_speed(iter/s)": 0.200806 }, { "acc": 0.755896, "epoch": 1.0270167427701675, "grad_norm": 3.21875, "learning_rate": 5.189964879681635e-06, "loss": 1.03686094, "memory(GiB)": 369.42, "step": 40485, "train_speed(iter/s)": 0.200806 }, { "acc": 0.75798235, "epoch": 1.0271435819381025, "grad_norm": 2.109375, "learning_rate": 5.188917008931905e-06, "loss": 0.97192163, "memory(GiB)": 369.42, "step": 40490, "train_speed(iter/s)": 0.200811 }, { "acc": 0.75300255, "epoch": 1.0272704211060375, "grad_norm": 2.75, "learning_rate": 5.18786912987276e-06, "loss": 0.98537579, "memory(GiB)": 369.42, "step": 40495, "train_speed(iter/s)": 0.200816 }, { "acc": 0.75832238, "epoch": 1.0273972602739727, "grad_norm": 2.0625, "learning_rate": 5.186821242550294e-06, "loss": 0.94212217, "memory(GiB)": 369.42, "step": 40500, "train_speed(iter/s)": 0.200819 }, { "acc": 0.74268541, "epoch": 1.0275240994419077, "grad_norm": 2.171875, "learning_rate": 5.185773347010594e-06, "loss": 0.98626575, "memory(GiB)": 369.42, "step": 40505, "train_speed(iter/s)": 0.200823 }, { "acc": 0.75922451, "epoch": 1.0276509386098427, "grad_norm": 2.109375, "learning_rate": 5.184725443299753e-06, "loss": 0.98040657, "memory(GiB)": 369.42, "step": 40510, "train_speed(iter/s)": 0.200829 }, { "acc": 0.76789141, "epoch": 1.0277777777777777, "grad_norm": 2.25, "learning_rate": 5.183677531463863e-06, "loss": 0.95584106, "memory(GiB)": 369.42, "step": 40515, "train_speed(iter/s)": 0.200833 }, { "acc": 0.74930515, "epoch": 1.027904616945713, "grad_norm": 2.015625, "learning_rate": 5.182629611549015e-06, "loss": 1.01537952, "memory(GiB)": 369.42, "step": 40520, "train_speed(iter/s)": 0.20083 }, { "acc": 0.75765901, "epoch": 1.0280314561136479, "grad_norm": 2.21875, "learning_rate": 5.181581683601301e-06, "loss": 0.92936897, "memory(GiB)": 369.42, "step": 40525, "train_speed(iter/s)": 0.200834 }, { "acc": 0.75146232, "epoch": 1.0281582952815829, "grad_norm": 2.734375, "learning_rate": 5.1805337476668135e-06, "loss": 1.0923069, "memory(GiB)": 369.42, "step": 40530, "train_speed(iter/s)": 0.200837 }, { "acc": 0.7536581, "epoch": 1.028285134449518, "grad_norm": 1.765625, "learning_rate": 5.179485803791646e-06, "loss": 1.00349102, "memory(GiB)": 369.42, "step": 40535, "train_speed(iter/s)": 0.200838 }, { "acc": 0.75055828, "epoch": 1.028411973617453, "grad_norm": 2.25, "learning_rate": 5.178437852021892e-06, "loss": 1.02682734, "memory(GiB)": 369.42, "step": 40540, "train_speed(iter/s)": 0.200843 }, { "acc": 0.76012797, "epoch": 1.028538812785388, "grad_norm": 2.5, "learning_rate": 5.177389892403645e-06, "loss": 0.95895023, "memory(GiB)": 369.42, "step": 40545, "train_speed(iter/s)": 0.200847 }, { "acc": 0.75187483, "epoch": 1.0286656519533233, "grad_norm": 2.296875, "learning_rate": 5.176341924982997e-06, "loss": 0.95624723, "memory(GiB)": 369.42, "step": 40550, "train_speed(iter/s)": 0.20085 }, { "acc": 0.76175699, "epoch": 1.0287924911212583, "grad_norm": 2.484375, "learning_rate": 5.1752939498060435e-06, "loss": 0.95034122, "memory(GiB)": 369.42, "step": 40555, "train_speed(iter/s)": 0.200855 }, { "acc": 0.74573493, "epoch": 1.0289193302891932, "grad_norm": 2.140625, "learning_rate": 5.174245966918883e-06, "loss": 1.0597374, "memory(GiB)": 369.42, "step": 40560, "train_speed(iter/s)": 0.200859 }, { "acc": 0.73628483, "epoch": 1.0290461694571285, "grad_norm": 2.140625, "learning_rate": 5.173197976367603e-06, "loss": 0.97189884, "memory(GiB)": 369.42, "step": 40565, "train_speed(iter/s)": 0.200864 }, { "acc": 0.7528986, "epoch": 1.0291730086250634, "grad_norm": 2.046875, "learning_rate": 5.1721499781983055e-06, "loss": 0.94286299, "memory(GiB)": 369.42, "step": 40570, "train_speed(iter/s)": 0.200868 }, { "acc": 0.73653803, "epoch": 1.0292998477929984, "grad_norm": 1.90625, "learning_rate": 5.171101972457081e-06, "loss": 0.99318771, "memory(GiB)": 369.42, "step": 40575, "train_speed(iter/s)": 0.200873 }, { "acc": 0.75207357, "epoch": 1.0294266869609334, "grad_norm": 2.046875, "learning_rate": 5.170053959190029e-06, "loss": 0.99000902, "memory(GiB)": 369.42, "step": 40580, "train_speed(iter/s)": 0.20087 }, { "acc": 0.75173273, "epoch": 1.0295535261288686, "grad_norm": 2.3125, "learning_rate": 5.169005938443245e-06, "loss": 1.01728191, "memory(GiB)": 369.42, "step": 40585, "train_speed(iter/s)": 0.200874 }, { "acc": 0.74811106, "epoch": 1.0296803652968036, "grad_norm": 2.03125, "learning_rate": 5.1679579102628245e-06, "loss": 0.99555321, "memory(GiB)": 369.42, "step": 40590, "train_speed(iter/s)": 0.200879 }, { "acc": 0.74452763, "epoch": 1.0298072044647386, "grad_norm": 2.15625, "learning_rate": 5.166909874694866e-06, "loss": 0.98435421, "memory(GiB)": 369.42, "step": 40595, "train_speed(iter/s)": 0.200883 }, { "acc": 0.7425292, "epoch": 1.0299340436326738, "grad_norm": 2.625, "learning_rate": 5.165861831785465e-06, "loss": 1.09131689, "memory(GiB)": 369.42, "step": 40600, "train_speed(iter/s)": 0.200887 }, { "acc": 0.74717226, "epoch": 1.0300608828006088, "grad_norm": 2.109375, "learning_rate": 5.164813781580721e-06, "loss": 0.98001766, "memory(GiB)": 369.42, "step": 40605, "train_speed(iter/s)": 0.200889 }, { "acc": 0.74973106, "epoch": 1.0301877219685438, "grad_norm": 2.046875, "learning_rate": 5.16376572412673e-06, "loss": 0.97230721, "memory(GiB)": 369.42, "step": 40610, "train_speed(iter/s)": 0.200893 }, { "acc": 0.7616004, "epoch": 1.030314561136479, "grad_norm": 2.109375, "learning_rate": 5.162717659469593e-06, "loss": 0.99644413, "memory(GiB)": 369.42, "step": 40615, "train_speed(iter/s)": 0.200895 }, { "acc": 0.76282024, "epoch": 1.030441400304414, "grad_norm": 1.9609375, "learning_rate": 5.161669587655406e-06, "loss": 0.92077971, "memory(GiB)": 369.42, "step": 40620, "train_speed(iter/s)": 0.200897 }, { "acc": 0.75919056, "epoch": 1.030568239472349, "grad_norm": 2.578125, "learning_rate": 5.160621508730267e-06, "loss": 0.96784897, "memory(GiB)": 369.42, "step": 40625, "train_speed(iter/s)": 0.2009 }, { "acc": 0.74359779, "epoch": 1.0306950786402842, "grad_norm": 2.125, "learning_rate": 5.15957342274028e-06, "loss": 1.01612778, "memory(GiB)": 369.42, "step": 40630, "train_speed(iter/s)": 0.200904 }, { "acc": 0.75157385, "epoch": 1.0308219178082192, "grad_norm": 2.84375, "learning_rate": 5.158525329731539e-06, "loss": 0.98940601, "memory(GiB)": 369.42, "step": 40635, "train_speed(iter/s)": 0.200908 }, { "acc": 0.74394331, "epoch": 1.0309487569761542, "grad_norm": 2.5, "learning_rate": 5.157477229750149e-06, "loss": 1.0603734, "memory(GiB)": 369.42, "step": 40640, "train_speed(iter/s)": 0.200912 }, { "acc": 0.7548604, "epoch": 1.0310755961440894, "grad_norm": 2.109375, "learning_rate": 5.156429122842204e-06, "loss": 1.00378056, "memory(GiB)": 369.42, "step": 40645, "train_speed(iter/s)": 0.200915 }, { "acc": 0.7554184, "epoch": 1.0312024353120244, "grad_norm": 2.53125, "learning_rate": 5.15538100905381e-06, "loss": 0.96902704, "memory(GiB)": 369.42, "step": 40650, "train_speed(iter/s)": 0.200915 }, { "acc": 0.75477748, "epoch": 1.0313292744799594, "grad_norm": 1.7578125, "learning_rate": 5.154332888431064e-06, "loss": 0.97117043, "memory(GiB)": 369.42, "step": 40655, "train_speed(iter/s)": 0.200918 }, { "acc": 0.74746008, "epoch": 1.0314561136478946, "grad_norm": 2.375, "learning_rate": 5.15328476102007e-06, "loss": 1.0077034, "memory(GiB)": 369.42, "step": 40660, "train_speed(iter/s)": 0.200922 }, { "acc": 0.74908247, "epoch": 1.0315829528158296, "grad_norm": 1.8828125, "learning_rate": 5.1522366268669264e-06, "loss": 0.97849941, "memory(GiB)": 369.42, "step": 40665, "train_speed(iter/s)": 0.200926 }, { "acc": 0.74993491, "epoch": 1.0317097919837646, "grad_norm": 2.140625, "learning_rate": 5.1511884860177376e-06, "loss": 0.97572155, "memory(GiB)": 369.42, "step": 40670, "train_speed(iter/s)": 0.200929 }, { "acc": 0.74996939, "epoch": 1.0318366311516995, "grad_norm": 2.234375, "learning_rate": 5.150140338518603e-06, "loss": 1.00337477, "memory(GiB)": 369.42, "step": 40675, "train_speed(iter/s)": 0.200932 }, { "acc": 0.76376905, "epoch": 1.0319634703196348, "grad_norm": 1.890625, "learning_rate": 5.149092184415627e-06, "loss": 0.92716846, "memory(GiB)": 369.42, "step": 40680, "train_speed(iter/s)": 0.200935 }, { "acc": 0.77308331, "epoch": 1.0320903094875697, "grad_norm": 2.078125, "learning_rate": 5.148044023754911e-06, "loss": 0.97320166, "memory(GiB)": 369.42, "step": 40685, "train_speed(iter/s)": 0.200939 }, { "acc": 0.73519835, "epoch": 1.0322171486555047, "grad_norm": 1.9375, "learning_rate": 5.146995856582557e-06, "loss": 1.04333267, "memory(GiB)": 369.42, "step": 40690, "train_speed(iter/s)": 0.200942 }, { "acc": 0.75094175, "epoch": 1.03234398782344, "grad_norm": 2.078125, "learning_rate": 5.14594768294467e-06, "loss": 1.02889585, "memory(GiB)": 369.42, "step": 40695, "train_speed(iter/s)": 0.200947 }, { "acc": 0.74556351, "epoch": 1.032470826991375, "grad_norm": 2.265625, "learning_rate": 5.1448995028873515e-06, "loss": 1.01999702, "memory(GiB)": 369.42, "step": 40700, "train_speed(iter/s)": 0.20095 }, { "acc": 0.75461149, "epoch": 1.03259766615931, "grad_norm": 2.046875, "learning_rate": 5.143851316456706e-06, "loss": 0.93547459, "memory(GiB)": 369.42, "step": 40705, "train_speed(iter/s)": 0.200949 }, { "acc": 0.75225654, "epoch": 1.0327245053272451, "grad_norm": 2.578125, "learning_rate": 5.142803123698838e-06, "loss": 1.0162529, "memory(GiB)": 369.42, "step": 40710, "train_speed(iter/s)": 0.200953 }, { "acc": 0.76399035, "epoch": 1.0328513444951801, "grad_norm": 2.109375, "learning_rate": 5.14175492465985e-06, "loss": 0.94530487, "memory(GiB)": 369.42, "step": 40715, "train_speed(iter/s)": 0.200957 }, { "acc": 0.75021811, "epoch": 1.0329781836631151, "grad_norm": 2.34375, "learning_rate": 5.14070671938585e-06, "loss": 0.99801903, "memory(GiB)": 369.42, "step": 40720, "train_speed(iter/s)": 0.200959 }, { "acc": 0.75594511, "epoch": 1.0331050228310503, "grad_norm": 1.984375, "learning_rate": 5.139658507922937e-06, "loss": 0.97303963, "memory(GiB)": 369.42, "step": 40725, "train_speed(iter/s)": 0.200962 }, { "acc": 0.74495659, "epoch": 1.0332318619989853, "grad_norm": 2.546875, "learning_rate": 5.138610290317221e-06, "loss": 1.01196404, "memory(GiB)": 369.42, "step": 40730, "train_speed(iter/s)": 0.200966 }, { "acc": 0.75040007, "epoch": 1.0333587011669203, "grad_norm": 2.09375, "learning_rate": 5.137562066614805e-06, "loss": 1.00721893, "memory(GiB)": 369.42, "step": 40735, "train_speed(iter/s)": 0.20097 }, { "acc": 0.76290021, "epoch": 1.0334855403348553, "grad_norm": 2.71875, "learning_rate": 5.136513836861795e-06, "loss": 0.91869268, "memory(GiB)": 369.42, "step": 40740, "train_speed(iter/s)": 0.200974 }, { "acc": 0.75636258, "epoch": 1.0336123795027905, "grad_norm": 1.96875, "learning_rate": 5.135465601104298e-06, "loss": 1.03095093, "memory(GiB)": 369.42, "step": 40745, "train_speed(iter/s)": 0.200978 }, { "acc": 0.767588, "epoch": 1.0337392186707255, "grad_norm": 2.046875, "learning_rate": 5.134417359388418e-06, "loss": 0.95613918, "memory(GiB)": 369.42, "step": 40750, "train_speed(iter/s)": 0.200983 }, { "acc": 0.73950987, "epoch": 1.0338660578386605, "grad_norm": 2.21875, "learning_rate": 5.133369111760264e-06, "loss": 1.02279568, "memory(GiB)": 369.42, "step": 40755, "train_speed(iter/s)": 0.200985 }, { "acc": 0.75770044, "epoch": 1.0339928970065957, "grad_norm": 2.296875, "learning_rate": 5.132320858265939e-06, "loss": 0.96756363, "memory(GiB)": 369.42, "step": 40760, "train_speed(iter/s)": 0.20099 }, { "acc": 0.75463495, "epoch": 1.0341197361745307, "grad_norm": 2.625, "learning_rate": 5.131272598951554e-06, "loss": 0.98590946, "memory(GiB)": 369.42, "step": 40765, "train_speed(iter/s)": 0.200992 }, { "acc": 0.75722847, "epoch": 1.0342465753424657, "grad_norm": 1.765625, "learning_rate": 5.130224333863212e-06, "loss": 0.92029858, "memory(GiB)": 369.42, "step": 40770, "train_speed(iter/s)": 0.200994 }, { "acc": 0.75027056, "epoch": 1.0343734145104009, "grad_norm": 2.296875, "learning_rate": 5.129176063047022e-06, "loss": 0.99078407, "memory(GiB)": 369.42, "step": 40775, "train_speed(iter/s)": 0.200999 }, { "acc": 0.73789063, "epoch": 1.0345002536783359, "grad_norm": 2.3125, "learning_rate": 5.128127786549094e-06, "loss": 1.04666424, "memory(GiB)": 369.42, "step": 40780, "train_speed(iter/s)": 0.201002 }, { "acc": 0.74959307, "epoch": 1.0346270928462709, "grad_norm": 2.234375, "learning_rate": 5.127079504415532e-06, "loss": 0.99566936, "memory(GiB)": 369.42, "step": 40785, "train_speed(iter/s)": 0.201006 }, { "acc": 0.76409874, "epoch": 1.034753932014206, "grad_norm": 2.359375, "learning_rate": 5.126031216692449e-06, "loss": 0.91232929, "memory(GiB)": 369.42, "step": 40790, "train_speed(iter/s)": 0.201009 }, { "acc": 0.75109243, "epoch": 1.034880771182141, "grad_norm": 1.84375, "learning_rate": 5.124982923425947e-06, "loss": 0.98603125, "memory(GiB)": 369.42, "step": 40795, "train_speed(iter/s)": 0.201012 }, { "acc": 0.74692912, "epoch": 1.035007610350076, "grad_norm": 1.921875, "learning_rate": 5.123934624662139e-06, "loss": 0.97600632, "memory(GiB)": 369.42, "step": 40800, "train_speed(iter/s)": 0.201016 }, { "acc": 0.74236288, "epoch": 1.0351344495180113, "grad_norm": 2.546875, "learning_rate": 5.1228863204471335e-06, "loss": 1.0040555, "memory(GiB)": 369.42, "step": 40805, "train_speed(iter/s)": 0.20102 }, { "acc": 0.74599347, "epoch": 1.0352612886859462, "grad_norm": 1.734375, "learning_rate": 5.121838010827039e-06, "loss": 0.96292725, "memory(GiB)": 369.42, "step": 40810, "train_speed(iter/s)": 0.201023 }, { "acc": 0.74331608, "epoch": 1.0353881278538812, "grad_norm": 2.203125, "learning_rate": 5.120789695847965e-06, "loss": 0.99661083, "memory(GiB)": 369.42, "step": 40815, "train_speed(iter/s)": 0.201027 }, { "acc": 0.75151477, "epoch": 1.0355149670218164, "grad_norm": 2.328125, "learning_rate": 5.119741375556021e-06, "loss": 0.99274683, "memory(GiB)": 369.42, "step": 40820, "train_speed(iter/s)": 0.20103 }, { "acc": 0.7700779, "epoch": 1.0356418061897514, "grad_norm": 1.9765625, "learning_rate": 5.118693049997316e-06, "loss": 0.92784996, "memory(GiB)": 369.42, "step": 40825, "train_speed(iter/s)": 0.201034 }, { "acc": 0.74391775, "epoch": 1.0357686453576864, "grad_norm": 2.328125, "learning_rate": 5.117644719217961e-06, "loss": 1.02322941, "memory(GiB)": 369.42, "step": 40830, "train_speed(iter/s)": 0.201039 }, { "acc": 0.74505167, "epoch": 1.0358954845256214, "grad_norm": 2.296875, "learning_rate": 5.116596383264066e-06, "loss": 0.99927931, "memory(GiB)": 369.42, "step": 40835, "train_speed(iter/s)": 0.201042 }, { "acc": 0.73669848, "epoch": 1.0360223236935566, "grad_norm": 3.28125, "learning_rate": 5.115548042181742e-06, "loss": 1.05328732, "memory(GiB)": 369.42, "step": 40840, "train_speed(iter/s)": 0.201043 }, { "acc": 0.76207347, "epoch": 1.0361491628614916, "grad_norm": 1.7734375, "learning_rate": 5.114499696017098e-06, "loss": 0.88896179, "memory(GiB)": 369.42, "step": 40845, "train_speed(iter/s)": 0.201044 }, { "acc": 0.74290953, "epoch": 1.0362760020294266, "grad_norm": 1.9921875, "learning_rate": 5.1134513448162475e-06, "loss": 1.04708357, "memory(GiB)": 369.42, "step": 40850, "train_speed(iter/s)": 0.201047 }, { "acc": 0.73857865, "epoch": 1.0364028411973618, "grad_norm": 2.0625, "learning_rate": 5.112402988625299e-06, "loss": 1.01582069, "memory(GiB)": 369.42, "step": 40855, "train_speed(iter/s)": 0.201051 }, { "acc": 0.75730152, "epoch": 1.0365296803652968, "grad_norm": 2.359375, "learning_rate": 5.111354627490367e-06, "loss": 0.96137753, "memory(GiB)": 369.42, "step": 40860, "train_speed(iter/s)": 0.201055 }, { "acc": 0.74366474, "epoch": 1.0366565195332318, "grad_norm": 2.171875, "learning_rate": 5.110306261457559e-06, "loss": 1.03045998, "memory(GiB)": 369.42, "step": 40865, "train_speed(iter/s)": 0.201059 }, { "acc": 0.7420496, "epoch": 1.036783358701167, "grad_norm": 1.8984375, "learning_rate": 5.109257890572991e-06, "loss": 1.07432766, "memory(GiB)": 369.42, "step": 40870, "train_speed(iter/s)": 0.201062 }, { "acc": 0.75506849, "epoch": 1.036910197869102, "grad_norm": 2.578125, "learning_rate": 5.108209514882772e-06, "loss": 0.98982573, "memory(GiB)": 369.42, "step": 40875, "train_speed(iter/s)": 0.201064 }, { "acc": 0.74496298, "epoch": 1.037037037037037, "grad_norm": 2.53125, "learning_rate": 5.107161134433017e-06, "loss": 0.99297543, "memory(GiB)": 369.42, "step": 40880, "train_speed(iter/s)": 0.201066 }, { "acc": 0.75431194, "epoch": 1.0371638762049722, "grad_norm": 2.328125, "learning_rate": 5.106112749269835e-06, "loss": 0.944907, "memory(GiB)": 369.42, "step": 40885, "train_speed(iter/s)": 0.201066 }, { "acc": 0.74960003, "epoch": 1.0372907153729072, "grad_norm": 2.3125, "learning_rate": 5.105064359439341e-06, "loss": 1.0253334, "memory(GiB)": 369.42, "step": 40890, "train_speed(iter/s)": 0.201071 }, { "acc": 0.75924621, "epoch": 1.0374175545408422, "grad_norm": 2.1875, "learning_rate": 5.1040159649876485e-06, "loss": 0.93187828, "memory(GiB)": 369.42, "step": 40895, "train_speed(iter/s)": 0.201073 }, { "acc": 0.74361858, "epoch": 1.0375443937087772, "grad_norm": 2.40625, "learning_rate": 5.102967565960868e-06, "loss": 1.02398348, "memory(GiB)": 369.42, "step": 40900, "train_speed(iter/s)": 0.201076 }, { "acc": 0.75033312, "epoch": 1.0376712328767124, "grad_norm": 2.109375, "learning_rate": 5.101919162405116e-06, "loss": 1.01109295, "memory(GiB)": 369.42, "step": 40905, "train_speed(iter/s)": 0.201077 }, { "acc": 0.75541897, "epoch": 1.0377980720446474, "grad_norm": 2.3125, "learning_rate": 5.100870754366503e-06, "loss": 0.97858801, "memory(GiB)": 369.42, "step": 40910, "train_speed(iter/s)": 0.20108 }, { "acc": 0.7682045, "epoch": 1.0379249112125823, "grad_norm": 2.296875, "learning_rate": 5.099822341891144e-06, "loss": 0.8978466, "memory(GiB)": 369.42, "step": 40915, "train_speed(iter/s)": 0.201085 }, { "acc": 0.74716334, "epoch": 1.0380517503805176, "grad_norm": 2.34375, "learning_rate": 5.098773925025152e-06, "loss": 1.02751942, "memory(GiB)": 369.42, "step": 40920, "train_speed(iter/s)": 0.20109 }, { "acc": 0.7403739, "epoch": 1.0381785895484525, "grad_norm": 2.140625, "learning_rate": 5.097725503814643e-06, "loss": 1.02879391, "memory(GiB)": 369.42, "step": 40925, "train_speed(iter/s)": 0.201093 }, { "acc": 0.75345802, "epoch": 1.0383054287163875, "grad_norm": 1.96875, "learning_rate": 5.09667707830573e-06, "loss": 0.92485161, "memory(GiB)": 369.42, "step": 40930, "train_speed(iter/s)": 0.201097 }, { "acc": 0.76822357, "epoch": 1.0384322678843227, "grad_norm": 2.296875, "learning_rate": 5.095628648544526e-06, "loss": 0.89204845, "memory(GiB)": 369.42, "step": 40935, "train_speed(iter/s)": 0.201101 }, { "acc": 0.75794201, "epoch": 1.0385591070522577, "grad_norm": 2.0625, "learning_rate": 5.0945802145771495e-06, "loss": 0.91687965, "memory(GiB)": 369.42, "step": 40940, "train_speed(iter/s)": 0.201106 }, { "acc": 0.75602188, "epoch": 1.0386859462201927, "grad_norm": 2.0625, "learning_rate": 5.093531776449711e-06, "loss": 0.94239092, "memory(GiB)": 369.42, "step": 40945, "train_speed(iter/s)": 0.201108 }, { "acc": 0.75537548, "epoch": 1.038812785388128, "grad_norm": 2.296875, "learning_rate": 5.092483334208327e-06, "loss": 1.00657177, "memory(GiB)": 369.42, "step": 40950, "train_speed(iter/s)": 0.201112 }, { "acc": 0.74813519, "epoch": 1.038939624556063, "grad_norm": 2.140625, "learning_rate": 5.091434887899114e-06, "loss": 1.02179489, "memory(GiB)": 369.42, "step": 40955, "train_speed(iter/s)": 0.201115 }, { "acc": 0.75859146, "epoch": 1.039066463723998, "grad_norm": 2.515625, "learning_rate": 5.0903864375681866e-06, "loss": 0.98048706, "memory(GiB)": 369.42, "step": 40960, "train_speed(iter/s)": 0.201118 }, { "acc": 0.75627766, "epoch": 1.0391933028919331, "grad_norm": 2.046875, "learning_rate": 5.0893379832616594e-06, "loss": 0.98139772, "memory(GiB)": 369.42, "step": 40965, "train_speed(iter/s)": 0.201119 }, { "acc": 0.75633831, "epoch": 1.0393201420598681, "grad_norm": 2.578125, "learning_rate": 5.08828952502565e-06, "loss": 0.96915627, "memory(GiB)": 369.42, "step": 40970, "train_speed(iter/s)": 0.201123 }, { "acc": 0.7617784, "epoch": 1.039446981227803, "grad_norm": 1.875, "learning_rate": 5.087241062906272e-06, "loss": 0.96432076, "memory(GiB)": 369.42, "step": 40975, "train_speed(iter/s)": 0.201128 }, { "acc": 0.74619598, "epoch": 1.0395738203957383, "grad_norm": 1.921875, "learning_rate": 5.086192596949643e-06, "loss": 0.98721123, "memory(GiB)": 369.42, "step": 40980, "train_speed(iter/s)": 0.201131 }, { "acc": 0.75411949, "epoch": 1.0397006595636733, "grad_norm": 2.203125, "learning_rate": 5.085144127201879e-06, "loss": 0.95898142, "memory(GiB)": 369.42, "step": 40985, "train_speed(iter/s)": 0.201136 }, { "acc": 0.75802503, "epoch": 1.0398274987316083, "grad_norm": 2.390625, "learning_rate": 5.084095653709096e-06, "loss": 0.92686958, "memory(GiB)": 369.42, "step": 40990, "train_speed(iter/s)": 0.20114 }, { "acc": 0.7633317, "epoch": 1.0399543378995433, "grad_norm": 2.484375, "learning_rate": 5.0830471765174096e-06, "loss": 1.00450649, "memory(GiB)": 369.42, "step": 40995, "train_speed(iter/s)": 0.201142 }, { "acc": 0.74758692, "epoch": 1.0400811770674785, "grad_norm": 2.078125, "learning_rate": 5.0819986956729395e-06, "loss": 1.02043133, "memory(GiB)": 369.42, "step": 41000, "train_speed(iter/s)": 0.201146 }, { "epoch": 1.0400811770674785, "eval_acc": 0.7377607755766546, "eval_loss": 0.9703150987625122, "eval_runtime": 384.8974, "eval_samples_per_second": 16.55, "eval_steps_per_second": 8.275, "step": 41000 }, { "acc": 0.75626631, "epoch": 1.0402080162354135, "grad_norm": 2.453125, "learning_rate": 5.080950211221799e-06, "loss": 0.96321402, "memory(GiB)": 369.42, "step": 41005, "train_speed(iter/s)": 0.20045 }, { "acc": 0.74357071, "epoch": 1.0403348554033485, "grad_norm": 2.359375, "learning_rate": 5.079901723210109e-06, "loss": 0.99705429, "memory(GiB)": 369.42, "step": 41010, "train_speed(iter/s)": 0.200455 }, { "acc": 0.74954891, "epoch": 1.0404616945712837, "grad_norm": 2.265625, "learning_rate": 5.078853231683981e-06, "loss": 0.99574356, "memory(GiB)": 369.42, "step": 41015, "train_speed(iter/s)": 0.20046 }, { "acc": 0.74840937, "epoch": 1.0405885337392187, "grad_norm": 2.25, "learning_rate": 5.077804736689539e-06, "loss": 1.01256657, "memory(GiB)": 369.42, "step": 41020, "train_speed(iter/s)": 0.200465 }, { "acc": 0.75544157, "epoch": 1.0407153729071537, "grad_norm": 2.375, "learning_rate": 5.0767562382728955e-06, "loss": 0.96831264, "memory(GiB)": 369.42, "step": 41025, "train_speed(iter/s)": 0.200469 }, { "acc": 0.74772944, "epoch": 1.0408422120750889, "grad_norm": 2.46875, "learning_rate": 5.075707736480171e-06, "loss": 1.07293577, "memory(GiB)": 369.42, "step": 41030, "train_speed(iter/s)": 0.200473 }, { "acc": 0.74887762, "epoch": 1.0409690512430239, "grad_norm": 2.265625, "learning_rate": 5.074659231357482e-06, "loss": 1.0421134, "memory(GiB)": 369.42, "step": 41035, "train_speed(iter/s)": 0.200476 }, { "acc": 0.7566534, "epoch": 1.0410958904109588, "grad_norm": 1.8359375, "learning_rate": 5.073610722950947e-06, "loss": 0.98914957, "memory(GiB)": 369.42, "step": 41040, "train_speed(iter/s)": 0.200479 }, { "acc": 0.7518117, "epoch": 1.041222729578894, "grad_norm": 2.546875, "learning_rate": 5.072562211306683e-06, "loss": 0.96273251, "memory(GiB)": 369.42, "step": 41045, "train_speed(iter/s)": 0.200482 }, { "acc": 0.74038744, "epoch": 1.041349568746829, "grad_norm": 2.421875, "learning_rate": 5.071513696470809e-06, "loss": 1.03179502, "memory(GiB)": 369.42, "step": 41050, "train_speed(iter/s)": 0.200485 }, { "acc": 0.75678511, "epoch": 1.041476407914764, "grad_norm": 2.734375, "learning_rate": 5.070465178489443e-06, "loss": 1.00921001, "memory(GiB)": 369.42, "step": 41055, "train_speed(iter/s)": 0.200491 }, { "acc": 0.75362096, "epoch": 1.041603247082699, "grad_norm": 1.984375, "learning_rate": 5.069416657408704e-06, "loss": 0.97216606, "memory(GiB)": 369.42, "step": 41060, "train_speed(iter/s)": 0.200494 }, { "acc": 0.7539505, "epoch": 1.0417300862506342, "grad_norm": 1.6796875, "learning_rate": 5.0683681332747105e-06, "loss": 1.01213579, "memory(GiB)": 369.42, "step": 41065, "train_speed(iter/s)": 0.200496 }, { "acc": 0.74912777, "epoch": 1.0418569254185692, "grad_norm": 2.0, "learning_rate": 5.067319606133583e-06, "loss": 1.02135696, "memory(GiB)": 369.42, "step": 41070, "train_speed(iter/s)": 0.200499 }, { "acc": 0.76333141, "epoch": 1.0419837645865042, "grad_norm": 2.515625, "learning_rate": 5.066271076031436e-06, "loss": 0.95955467, "memory(GiB)": 369.42, "step": 41075, "train_speed(iter/s)": 0.200503 }, { "acc": 0.75082736, "epoch": 1.0421106037544394, "grad_norm": 2.046875, "learning_rate": 5.065222543014394e-06, "loss": 0.9963644, "memory(GiB)": 369.42, "step": 41080, "train_speed(iter/s)": 0.200507 }, { "acc": 0.75029306, "epoch": 1.0422374429223744, "grad_norm": 2.625, "learning_rate": 5.06417400712857e-06, "loss": 1.00824203, "memory(GiB)": 369.42, "step": 41085, "train_speed(iter/s)": 0.20051 }, { "acc": 0.75665016, "epoch": 1.0423642820903094, "grad_norm": 2.234375, "learning_rate": 5.0631254684200906e-06, "loss": 0.92600918, "memory(GiB)": 369.42, "step": 41090, "train_speed(iter/s)": 0.200515 }, { "acc": 0.76273623, "epoch": 1.0424911212582446, "grad_norm": 2.03125, "learning_rate": 5.062076926935068e-06, "loss": 0.95414, "memory(GiB)": 369.42, "step": 41095, "train_speed(iter/s)": 0.20052 }, { "acc": 0.75834446, "epoch": 1.0426179604261796, "grad_norm": 2.171875, "learning_rate": 5.061028382719626e-06, "loss": 0.92339983, "memory(GiB)": 369.42, "step": 41100, "train_speed(iter/s)": 0.200524 }, { "acc": 0.75126715, "epoch": 1.0427447995941146, "grad_norm": 2.15625, "learning_rate": 5.0599798358198835e-06, "loss": 0.94957409, "memory(GiB)": 369.42, "step": 41105, "train_speed(iter/s)": 0.200529 }, { "acc": 0.75613585, "epoch": 1.0428716387620498, "grad_norm": 2.15625, "learning_rate": 5.0589312862819605e-06, "loss": 0.98620014, "memory(GiB)": 369.42, "step": 41110, "train_speed(iter/s)": 0.200532 }, { "acc": 0.75324225, "epoch": 1.0429984779299848, "grad_norm": 2.078125, "learning_rate": 5.057882734151977e-06, "loss": 0.94505615, "memory(GiB)": 369.42, "step": 41115, "train_speed(iter/s)": 0.200536 }, { "acc": 0.75352936, "epoch": 1.0431253170979198, "grad_norm": 2.109375, "learning_rate": 5.05683417947605e-06, "loss": 1.0110652, "memory(GiB)": 369.42, "step": 41120, "train_speed(iter/s)": 0.20054 }, { "acc": 0.74976029, "epoch": 1.043252156265855, "grad_norm": 2.3125, "learning_rate": 5.055785622300303e-06, "loss": 0.98836079, "memory(GiB)": 369.42, "step": 41125, "train_speed(iter/s)": 0.200544 }, { "acc": 0.74333029, "epoch": 1.04337899543379, "grad_norm": 1.890625, "learning_rate": 5.054737062670857e-06, "loss": 1.01949234, "memory(GiB)": 369.42, "step": 41130, "train_speed(iter/s)": 0.200547 }, { "acc": 0.74519982, "epoch": 1.043505834601725, "grad_norm": 2.140625, "learning_rate": 5.053688500633828e-06, "loss": 0.97739811, "memory(GiB)": 369.42, "step": 41135, "train_speed(iter/s)": 0.200551 }, { "acc": 0.75437145, "epoch": 1.0436326737696602, "grad_norm": 2.328125, "learning_rate": 5.052639936235341e-06, "loss": 0.97873278, "memory(GiB)": 369.42, "step": 41140, "train_speed(iter/s)": 0.200554 }, { "acc": 0.75214424, "epoch": 1.0437595129375952, "grad_norm": 2.78125, "learning_rate": 5.051591369521513e-06, "loss": 0.98005524, "memory(GiB)": 369.42, "step": 41145, "train_speed(iter/s)": 0.200559 }, { "acc": 0.74408822, "epoch": 1.0438863521055302, "grad_norm": 2.359375, "learning_rate": 5.050542800538469e-06, "loss": 1.00460949, "memory(GiB)": 369.42, "step": 41150, "train_speed(iter/s)": 0.200562 }, { "acc": 0.74649806, "epoch": 1.0440131912734651, "grad_norm": 2.125, "learning_rate": 5.049494229332324e-06, "loss": 1.05410652, "memory(GiB)": 369.42, "step": 41155, "train_speed(iter/s)": 0.200566 }, { "acc": 0.75972385, "epoch": 1.0441400304414004, "grad_norm": 1.9921875, "learning_rate": 5.048445655949204e-06, "loss": 0.98254147, "memory(GiB)": 369.42, "step": 41160, "train_speed(iter/s)": 0.200568 }, { "acc": 0.73442855, "epoch": 1.0442668696093353, "grad_norm": 2.03125, "learning_rate": 5.047397080435225e-06, "loss": 1.09091463, "memory(GiB)": 369.42, "step": 41165, "train_speed(iter/s)": 0.200572 }, { "acc": 0.75671482, "epoch": 1.0443937087772703, "grad_norm": 2.21875, "learning_rate": 5.046348502836512e-06, "loss": 1.01590271, "memory(GiB)": 369.42, "step": 41170, "train_speed(iter/s)": 0.200576 }, { "acc": 0.74043384, "epoch": 1.0445205479452055, "grad_norm": 1.953125, "learning_rate": 5.045299923199186e-06, "loss": 1.00530119, "memory(GiB)": 369.42, "step": 41175, "train_speed(iter/s)": 0.200579 }, { "acc": 0.75376339, "epoch": 1.0446473871131405, "grad_norm": 1.7265625, "learning_rate": 5.044251341569366e-06, "loss": 0.93934708, "memory(GiB)": 369.42, "step": 41180, "train_speed(iter/s)": 0.200581 }, { "acc": 0.73908653, "epoch": 1.0447742262810755, "grad_norm": 2.6875, "learning_rate": 5.043202757993175e-06, "loss": 1.05693417, "memory(GiB)": 369.42, "step": 41185, "train_speed(iter/s)": 0.200586 }, { "acc": 0.74074688, "epoch": 1.0449010654490107, "grad_norm": 2.78125, "learning_rate": 5.042154172516734e-06, "loss": 1.02874985, "memory(GiB)": 369.42, "step": 41190, "train_speed(iter/s)": 0.200591 }, { "acc": 0.74810572, "epoch": 1.0450279046169457, "grad_norm": 2.171875, "learning_rate": 5.041105585186164e-06, "loss": 0.98693752, "memory(GiB)": 369.42, "step": 41195, "train_speed(iter/s)": 0.200595 }, { "acc": 0.76359186, "epoch": 1.0451547437848807, "grad_norm": 1.90625, "learning_rate": 5.040056996047587e-06, "loss": 0.99824772, "memory(GiB)": 369.42, "step": 41200, "train_speed(iter/s)": 0.200598 }, { "acc": 0.75368595, "epoch": 1.045281582952816, "grad_norm": 2.03125, "learning_rate": 5.039008405147125e-06, "loss": 0.94426975, "memory(GiB)": 369.42, "step": 41205, "train_speed(iter/s)": 0.200603 }, { "acc": 0.74719715, "epoch": 1.045408422120751, "grad_norm": 2.4375, "learning_rate": 5.0379598125308984e-06, "loss": 1.00636101, "memory(GiB)": 369.42, "step": 41210, "train_speed(iter/s)": 0.200607 }, { "acc": 0.7375371, "epoch": 1.045535261288686, "grad_norm": 2.296875, "learning_rate": 5.036911218245029e-06, "loss": 1.05735455, "memory(GiB)": 369.42, "step": 41215, "train_speed(iter/s)": 0.20061 }, { "acc": 0.74786263, "epoch": 1.045662100456621, "grad_norm": 1.90625, "learning_rate": 5.035862622335641e-06, "loss": 0.99791994, "memory(GiB)": 369.42, "step": 41220, "train_speed(iter/s)": 0.20061 }, { "acc": 0.76718245, "epoch": 1.045788939624556, "grad_norm": 2.203125, "learning_rate": 5.034814024848853e-06, "loss": 0.94083309, "memory(GiB)": 369.42, "step": 41225, "train_speed(iter/s)": 0.200613 }, { "acc": 0.75662756, "epoch": 1.045915778792491, "grad_norm": 1.9921875, "learning_rate": 5.033765425830791e-06, "loss": 0.96590452, "memory(GiB)": 369.42, "step": 41230, "train_speed(iter/s)": 0.200617 }, { "acc": 0.75869207, "epoch": 1.046042617960426, "grad_norm": 2.65625, "learning_rate": 5.032716825327573e-06, "loss": 0.96369972, "memory(GiB)": 369.42, "step": 41235, "train_speed(iter/s)": 0.20062 }, { "acc": 0.75097284, "epoch": 1.0461694571283613, "grad_norm": 2.328125, "learning_rate": 5.031668223385323e-06, "loss": 0.96520538, "memory(GiB)": 369.42, "step": 41240, "train_speed(iter/s)": 0.200623 }, { "acc": 0.74248514, "epoch": 1.0462962962962963, "grad_norm": 2.4375, "learning_rate": 5.030619620050163e-06, "loss": 1.02029123, "memory(GiB)": 369.42, "step": 41245, "train_speed(iter/s)": 0.200627 }, { "acc": 0.77169199, "epoch": 1.0464231354642313, "grad_norm": 2.328125, "learning_rate": 5.029571015368217e-06, "loss": 0.88735085, "memory(GiB)": 369.42, "step": 41250, "train_speed(iter/s)": 0.20063 }, { "acc": 0.74210143, "epoch": 1.0465499746321665, "grad_norm": 2.109375, "learning_rate": 5.028522409385605e-06, "loss": 1.00074387, "memory(GiB)": 369.42, "step": 41255, "train_speed(iter/s)": 0.200634 }, { "acc": 0.7548933, "epoch": 1.0466768138001015, "grad_norm": 1.8515625, "learning_rate": 5.0274738021484495e-06, "loss": 0.94744091, "memory(GiB)": 369.42, "step": 41260, "train_speed(iter/s)": 0.200637 }, { "acc": 0.73927913, "epoch": 1.0468036529680365, "grad_norm": 2.265625, "learning_rate": 5.026425193702874e-06, "loss": 1.00451984, "memory(GiB)": 369.42, "step": 41265, "train_speed(iter/s)": 0.200641 }, { "acc": 0.76849642, "epoch": 1.0469304921359717, "grad_norm": 2.078125, "learning_rate": 5.025376584095001e-06, "loss": 0.89771662, "memory(GiB)": 369.42, "step": 41270, "train_speed(iter/s)": 0.200644 }, { "acc": 0.76251812, "epoch": 1.0470573313039067, "grad_norm": 2.46875, "learning_rate": 5.024327973370951e-06, "loss": 0.95675869, "memory(GiB)": 369.42, "step": 41275, "train_speed(iter/s)": 0.200648 }, { "acc": 0.73861217, "epoch": 1.0471841704718416, "grad_norm": 2.40625, "learning_rate": 5.02327936157685e-06, "loss": 1.0134305, "memory(GiB)": 369.42, "step": 41280, "train_speed(iter/s)": 0.200651 }, { "acc": 0.75154934, "epoch": 1.0473110096397769, "grad_norm": 2.109375, "learning_rate": 5.022230748758816e-06, "loss": 0.96860809, "memory(GiB)": 369.42, "step": 41285, "train_speed(iter/s)": 0.200653 }, { "acc": 0.75729885, "epoch": 1.0474378488077118, "grad_norm": 2.453125, "learning_rate": 5.021182134962978e-06, "loss": 1.02038717, "memory(GiB)": 369.42, "step": 41290, "train_speed(iter/s)": 0.200653 }, { "acc": 0.75772228, "epoch": 1.0475646879756468, "grad_norm": 1.8203125, "learning_rate": 5.020133520235453e-06, "loss": 0.99568787, "memory(GiB)": 369.42, "step": 41295, "train_speed(iter/s)": 0.200654 }, { "acc": 0.73855162, "epoch": 1.047691527143582, "grad_norm": 2.109375, "learning_rate": 5.019084904622367e-06, "loss": 1.03011684, "memory(GiB)": 369.42, "step": 41300, "train_speed(iter/s)": 0.200658 }, { "acc": 0.7426466, "epoch": 1.047818366311517, "grad_norm": 1.875, "learning_rate": 5.01803628816984e-06, "loss": 0.97996731, "memory(GiB)": 369.42, "step": 41305, "train_speed(iter/s)": 0.20066 }, { "acc": 0.74708233, "epoch": 1.047945205479452, "grad_norm": 2.4375, "learning_rate": 5.016987670923998e-06, "loss": 0.98940735, "memory(GiB)": 369.42, "step": 41310, "train_speed(iter/s)": 0.200664 }, { "acc": 0.76560507, "epoch": 1.048072044647387, "grad_norm": 2.203125, "learning_rate": 5.0159390529309615e-06, "loss": 0.90228128, "memory(GiB)": 369.42, "step": 41315, "train_speed(iter/s)": 0.200667 }, { "acc": 0.75770416, "epoch": 1.0481988838153222, "grad_norm": 2.3125, "learning_rate": 5.014890434236854e-06, "loss": 0.96420898, "memory(GiB)": 369.42, "step": 41320, "train_speed(iter/s)": 0.200665 }, { "acc": 0.73550367, "epoch": 1.0483257229832572, "grad_norm": 2.40625, "learning_rate": 5.0138418148878e-06, "loss": 1.02743416, "memory(GiB)": 369.42, "step": 41325, "train_speed(iter/s)": 0.200669 }, { "acc": 0.74927588, "epoch": 1.0484525621511922, "grad_norm": 2.484375, "learning_rate": 5.01279319492992e-06, "loss": 0.98389606, "memory(GiB)": 369.42, "step": 41330, "train_speed(iter/s)": 0.200672 }, { "acc": 0.74068489, "epoch": 1.0485794013191274, "grad_norm": 2.109375, "learning_rate": 5.01174457440934e-06, "loss": 0.98992977, "memory(GiB)": 369.42, "step": 41335, "train_speed(iter/s)": 0.200676 }, { "acc": 0.76966944, "epoch": 1.0487062404870624, "grad_norm": 2.09375, "learning_rate": 5.010695953372179e-06, "loss": 0.95949764, "memory(GiB)": 369.42, "step": 41340, "train_speed(iter/s)": 0.200679 }, { "acc": 0.75458126, "epoch": 1.0488330796549974, "grad_norm": 2.140625, "learning_rate": 5.009647331864563e-06, "loss": 0.9706358, "memory(GiB)": 369.42, "step": 41345, "train_speed(iter/s)": 0.200684 }, { "acc": 0.75174232, "epoch": 1.0489599188229326, "grad_norm": 2.15625, "learning_rate": 5.008598709932615e-06, "loss": 1.00283852, "memory(GiB)": 369.42, "step": 41350, "train_speed(iter/s)": 0.200687 }, { "acc": 0.74398375, "epoch": 1.0490867579908676, "grad_norm": 2.109375, "learning_rate": 5.007550087622456e-06, "loss": 1.03306694, "memory(GiB)": 369.42, "step": 41355, "train_speed(iter/s)": 0.200689 }, { "acc": 0.76169577, "epoch": 1.0492135971588026, "grad_norm": 2.3125, "learning_rate": 5.0065014649802124e-06, "loss": 0.95588837, "memory(GiB)": 369.42, "step": 41360, "train_speed(iter/s)": 0.200693 }, { "acc": 0.74360404, "epoch": 1.0493404363267378, "grad_norm": 1.9765625, "learning_rate": 5.005452842052003e-06, "loss": 0.99822617, "memory(GiB)": 369.42, "step": 41365, "train_speed(iter/s)": 0.200696 }, { "acc": 0.74664373, "epoch": 1.0494672754946728, "grad_norm": 2.15625, "learning_rate": 5.004404218883955e-06, "loss": 1.02119837, "memory(GiB)": 369.42, "step": 41370, "train_speed(iter/s)": 0.200697 }, { "acc": 0.75959797, "epoch": 1.0495941146626078, "grad_norm": 2.53125, "learning_rate": 5.0033555955221875e-06, "loss": 0.96833906, "memory(GiB)": 369.42, "step": 41375, "train_speed(iter/s)": 0.200703 }, { "acc": 0.75886793, "epoch": 1.0497209538305428, "grad_norm": 2.015625, "learning_rate": 5.002306972012829e-06, "loss": 0.94708424, "memory(GiB)": 369.42, "step": 41380, "train_speed(iter/s)": 0.200706 }, { "acc": 0.74631948, "epoch": 1.049847792998478, "grad_norm": 1.875, "learning_rate": 5.001258348401998e-06, "loss": 0.97293892, "memory(GiB)": 369.42, "step": 41385, "train_speed(iter/s)": 0.20071 }, { "acc": 0.75063791, "epoch": 1.049974632166413, "grad_norm": 2.078125, "learning_rate": 5.000209724735819e-06, "loss": 1.02118053, "memory(GiB)": 369.42, "step": 41390, "train_speed(iter/s)": 0.200713 }, { "acc": 0.74757786, "epoch": 1.050101471334348, "grad_norm": 2.359375, "learning_rate": 4.999161101060416e-06, "loss": 1.07066584, "memory(GiB)": 369.42, "step": 41395, "train_speed(iter/s)": 0.200718 }, { "acc": 0.73944821, "epoch": 1.0502283105022832, "grad_norm": 2.328125, "learning_rate": 4.99811247742191e-06, "loss": 1.03297567, "memory(GiB)": 369.42, "step": 41400, "train_speed(iter/s)": 0.200722 }, { "acc": 0.75215836, "epoch": 1.0503551496702181, "grad_norm": 2.046875, "learning_rate": 4.9970638538664275e-06, "loss": 1.00650434, "memory(GiB)": 369.42, "step": 41405, "train_speed(iter/s)": 0.200724 }, { "acc": 0.75469079, "epoch": 1.0504819888381531, "grad_norm": 1.8125, "learning_rate": 4.996015230440091e-06, "loss": 0.97713757, "memory(GiB)": 369.42, "step": 41410, "train_speed(iter/s)": 0.200728 }, { "acc": 0.76943579, "epoch": 1.0506088280060883, "grad_norm": 2.3125, "learning_rate": 4.99496660718902e-06, "loss": 0.91161556, "memory(GiB)": 369.42, "step": 41415, "train_speed(iter/s)": 0.200732 }, { "acc": 0.74693022, "epoch": 1.0507356671740233, "grad_norm": 1.9609375, "learning_rate": 4.99391798415934e-06, "loss": 0.98923626, "memory(GiB)": 369.42, "step": 41420, "train_speed(iter/s)": 0.200735 }, { "acc": 0.751159, "epoch": 1.0508625063419583, "grad_norm": 2.1875, "learning_rate": 4.992869361397175e-06, "loss": 0.99092598, "memory(GiB)": 369.42, "step": 41425, "train_speed(iter/s)": 0.200738 }, { "acc": 0.75395494, "epoch": 1.0509893455098935, "grad_norm": 1.8828125, "learning_rate": 4.991820738948649e-06, "loss": 0.94682503, "memory(GiB)": 369.42, "step": 41430, "train_speed(iter/s)": 0.200741 }, { "acc": 0.74602652, "epoch": 1.0511161846778285, "grad_norm": 2.453125, "learning_rate": 4.9907721168598805e-06, "loss": 0.99532804, "memory(GiB)": 369.42, "step": 41435, "train_speed(iter/s)": 0.200742 }, { "acc": 0.76392813, "epoch": 1.0512430238457635, "grad_norm": 1.96875, "learning_rate": 4.989723495176997e-06, "loss": 0.93286047, "memory(GiB)": 369.42, "step": 41440, "train_speed(iter/s)": 0.200746 }, { "acc": 0.7696826, "epoch": 1.0513698630136987, "grad_norm": 1.9765625, "learning_rate": 4.988674873946118e-06, "loss": 0.90685997, "memory(GiB)": 369.42, "step": 41445, "train_speed(iter/s)": 0.200745 }, { "acc": 0.77268615, "epoch": 1.0514967021816337, "grad_norm": 2.515625, "learning_rate": 4.987626253213373e-06, "loss": 0.94080257, "memory(GiB)": 369.42, "step": 41450, "train_speed(iter/s)": 0.200747 }, { "acc": 0.76157112, "epoch": 1.0516235413495687, "grad_norm": 2.34375, "learning_rate": 4.986577633024877e-06, "loss": 0.97724075, "memory(GiB)": 369.42, "step": 41455, "train_speed(iter/s)": 0.200751 }, { "acc": 0.75041943, "epoch": 1.051750380517504, "grad_norm": 2.484375, "learning_rate": 4.985529013426758e-06, "loss": 0.94955463, "memory(GiB)": 369.42, "step": 41460, "train_speed(iter/s)": 0.200756 }, { "acc": 0.75713081, "epoch": 1.051877219685439, "grad_norm": 2.171875, "learning_rate": 4.984480394465136e-06, "loss": 0.92933388, "memory(GiB)": 369.42, "step": 41465, "train_speed(iter/s)": 0.200757 }, { "acc": 0.74517555, "epoch": 1.052004058853374, "grad_norm": 1.7890625, "learning_rate": 4.9834317761861385e-06, "loss": 0.99445829, "memory(GiB)": 369.42, "step": 41470, "train_speed(iter/s)": 0.200761 }, { "acc": 0.74332113, "epoch": 1.0521308980213089, "grad_norm": 2.265625, "learning_rate": 4.982383158635884e-06, "loss": 1.01654015, "memory(GiB)": 369.42, "step": 41475, "train_speed(iter/s)": 0.200766 }, { "acc": 0.74300823, "epoch": 1.052257737189244, "grad_norm": 2.234375, "learning_rate": 4.981334541860496e-06, "loss": 1.00391655, "memory(GiB)": 369.42, "step": 41480, "train_speed(iter/s)": 0.200769 }, { "acc": 0.7429183, "epoch": 1.052384576357179, "grad_norm": 2.1875, "learning_rate": 4.980285925906098e-06, "loss": 0.973248, "memory(GiB)": 369.42, "step": 41485, "train_speed(iter/s)": 0.200772 }, { "acc": 0.75805364, "epoch": 1.052511415525114, "grad_norm": 2.046875, "learning_rate": 4.9792373108188155e-06, "loss": 0.98384857, "memory(GiB)": 369.42, "step": 41490, "train_speed(iter/s)": 0.200774 }, { "acc": 0.74796615, "epoch": 1.0526382546930493, "grad_norm": 2.609375, "learning_rate": 4.978188696644767e-06, "loss": 0.94989243, "memory(GiB)": 369.42, "step": 41495, "train_speed(iter/s)": 0.200778 }, { "acc": 0.76420832, "epoch": 1.0527650938609843, "grad_norm": 1.9140625, "learning_rate": 4.977140083430075e-06, "loss": 0.9593132, "memory(GiB)": 369.42, "step": 41500, "train_speed(iter/s)": 0.200779 }, { "acc": 0.75878153, "epoch": 1.0528919330289193, "grad_norm": 1.8046875, "learning_rate": 4.976091471220867e-06, "loss": 0.9652832, "memory(GiB)": 369.42, "step": 41505, "train_speed(iter/s)": 0.200782 }, { "acc": 0.74473577, "epoch": 1.0530187721968545, "grad_norm": 1.9453125, "learning_rate": 4.975042860063263e-06, "loss": 1.01234808, "memory(GiB)": 369.42, "step": 41510, "train_speed(iter/s)": 0.200785 }, { "acc": 0.75486474, "epoch": 1.0531456113647895, "grad_norm": 2.40625, "learning_rate": 4.973994250003384e-06, "loss": 0.99554415, "memory(GiB)": 369.42, "step": 41515, "train_speed(iter/s)": 0.200789 }, { "acc": 0.73478823, "epoch": 1.0532724505327244, "grad_norm": 2.15625, "learning_rate": 4.972945641087355e-06, "loss": 1.04024944, "memory(GiB)": 369.42, "step": 41520, "train_speed(iter/s)": 0.200789 }, { "acc": 0.75562563, "epoch": 1.0533992897006597, "grad_norm": 2.0, "learning_rate": 4.9718970333612955e-06, "loss": 0.9729311, "memory(GiB)": 369.42, "step": 41525, "train_speed(iter/s)": 0.200794 }, { "acc": 0.74395208, "epoch": 1.0535261288685946, "grad_norm": 2.359375, "learning_rate": 4.970848426871333e-06, "loss": 1.00569916, "memory(GiB)": 369.42, "step": 41530, "train_speed(iter/s)": 0.200799 }, { "acc": 0.74462652, "epoch": 1.0536529680365296, "grad_norm": 2.171875, "learning_rate": 4.9697998216635854e-06, "loss": 0.99724331, "memory(GiB)": 369.42, "step": 41535, "train_speed(iter/s)": 0.200801 }, { "acc": 0.75901113, "epoch": 1.0537798072044646, "grad_norm": 2.046875, "learning_rate": 4.9687512177841765e-06, "loss": 0.9416647, "memory(GiB)": 369.42, "step": 41540, "train_speed(iter/s)": 0.200805 }, { "acc": 0.74369578, "epoch": 1.0539066463723998, "grad_norm": 2.140625, "learning_rate": 4.967702615279227e-06, "loss": 1.02014217, "memory(GiB)": 369.42, "step": 41545, "train_speed(iter/s)": 0.200808 }, { "acc": 0.73352895, "epoch": 1.0540334855403348, "grad_norm": 2.1875, "learning_rate": 4.966654014194863e-06, "loss": 1.04991283, "memory(GiB)": 369.42, "step": 41550, "train_speed(iter/s)": 0.200812 }, { "acc": 0.76581416, "epoch": 1.0541603247082698, "grad_norm": 1.84375, "learning_rate": 4.965605414577204e-06, "loss": 0.91253605, "memory(GiB)": 369.42, "step": 41555, "train_speed(iter/s)": 0.200815 }, { "acc": 0.76334677, "epoch": 1.054287163876205, "grad_norm": 2.265625, "learning_rate": 4.964556816472371e-06, "loss": 0.96425934, "memory(GiB)": 369.42, "step": 41560, "train_speed(iter/s)": 0.200818 }, { "acc": 0.75311599, "epoch": 1.05441400304414, "grad_norm": 1.859375, "learning_rate": 4.9635082199264874e-06, "loss": 0.9904356, "memory(GiB)": 369.42, "step": 41565, "train_speed(iter/s)": 0.200818 }, { "acc": 0.75709362, "epoch": 1.054540842212075, "grad_norm": 2.1875, "learning_rate": 4.962459624985677e-06, "loss": 0.96451883, "memory(GiB)": 369.42, "step": 41570, "train_speed(iter/s)": 0.200821 }, { "acc": 0.75042877, "epoch": 1.0546676813800102, "grad_norm": 2.109375, "learning_rate": 4.961411031696059e-06, "loss": 0.9860899, "memory(GiB)": 369.42, "step": 41575, "train_speed(iter/s)": 0.200824 }, { "acc": 0.7568244, "epoch": 1.0547945205479452, "grad_norm": 2.28125, "learning_rate": 4.960362440103756e-06, "loss": 0.9896286, "memory(GiB)": 369.42, "step": 41580, "train_speed(iter/s)": 0.20083 }, { "acc": 0.75806069, "epoch": 1.0549213597158802, "grad_norm": 2.75, "learning_rate": 4.95931385025489e-06, "loss": 0.95769234, "memory(GiB)": 369.42, "step": 41585, "train_speed(iter/s)": 0.200832 }, { "acc": 0.75602789, "epoch": 1.0550481988838154, "grad_norm": 1.9296875, "learning_rate": 4.958265262195584e-06, "loss": 0.98229074, "memory(GiB)": 369.42, "step": 41590, "train_speed(iter/s)": 0.200837 }, { "acc": 0.7623086, "epoch": 1.0551750380517504, "grad_norm": 2.265625, "learning_rate": 4.957216675971955e-06, "loss": 0.96859617, "memory(GiB)": 369.42, "step": 41595, "train_speed(iter/s)": 0.20084 }, { "acc": 0.74586892, "epoch": 1.0553018772196854, "grad_norm": 2.140625, "learning_rate": 4.9561680916301295e-06, "loss": 1.03835278, "memory(GiB)": 369.42, "step": 41600, "train_speed(iter/s)": 0.200843 }, { "acc": 0.75545721, "epoch": 1.0554287163876206, "grad_norm": 2.21875, "learning_rate": 4.955119509216226e-06, "loss": 0.98295984, "memory(GiB)": 369.42, "step": 41605, "train_speed(iter/s)": 0.200846 }, { "acc": 0.75129137, "epoch": 1.0555555555555556, "grad_norm": 1.90625, "learning_rate": 4.9540709287763685e-06, "loss": 1.00823994, "memory(GiB)": 369.42, "step": 41610, "train_speed(iter/s)": 0.20085 }, { "acc": 0.7499855, "epoch": 1.0556823947234906, "grad_norm": 2.734375, "learning_rate": 4.953022350356676e-06, "loss": 1.00955715, "memory(GiB)": 369.42, "step": 41615, "train_speed(iter/s)": 0.200855 }, { "acc": 0.75166512, "epoch": 1.0558092338914258, "grad_norm": 2.53125, "learning_rate": 4.951973774003269e-06, "loss": 1.00936432, "memory(GiB)": 369.42, "step": 41620, "train_speed(iter/s)": 0.200858 }, { "acc": 0.74783654, "epoch": 1.0559360730593608, "grad_norm": 2.1875, "learning_rate": 4.950925199762271e-06, "loss": 1.02566967, "memory(GiB)": 369.42, "step": 41625, "train_speed(iter/s)": 0.20086 }, { "acc": 0.75401754, "epoch": 1.0560629122272958, "grad_norm": 2.375, "learning_rate": 4.949876627679803e-06, "loss": 1.01415129, "memory(GiB)": 369.42, "step": 41630, "train_speed(iter/s)": 0.200863 }, { "acc": 0.76254711, "epoch": 1.0561897513952307, "grad_norm": 2.390625, "learning_rate": 4.948828057801983e-06, "loss": 0.99227381, "memory(GiB)": 369.42, "step": 41635, "train_speed(iter/s)": 0.200869 }, { "acc": 0.74209523, "epoch": 1.056316590563166, "grad_norm": 2.1875, "learning_rate": 4.947779490174933e-06, "loss": 1.00456924, "memory(GiB)": 369.42, "step": 41640, "train_speed(iter/s)": 0.200872 }, { "acc": 0.75884047, "epoch": 1.056443429731101, "grad_norm": 2.390625, "learning_rate": 4.946730924844775e-06, "loss": 0.97724152, "memory(GiB)": 369.42, "step": 41645, "train_speed(iter/s)": 0.200875 }, { "acc": 0.7522007, "epoch": 1.056570268899036, "grad_norm": 2.125, "learning_rate": 4.945682361857631e-06, "loss": 0.98424606, "memory(GiB)": 369.42, "step": 41650, "train_speed(iter/s)": 0.200879 }, { "acc": 0.75642304, "epoch": 1.0566971080669711, "grad_norm": 2.25, "learning_rate": 4.944633801259615e-06, "loss": 0.98725748, "memory(GiB)": 369.42, "step": 41655, "train_speed(iter/s)": 0.200882 }, { "acc": 0.74815412, "epoch": 1.0568239472349061, "grad_norm": 2.53125, "learning_rate": 4.943585243096854e-06, "loss": 1.00900288, "memory(GiB)": 369.42, "step": 41660, "train_speed(iter/s)": 0.200884 }, { "acc": 0.76295767, "epoch": 1.0569507864028411, "grad_norm": 1.8984375, "learning_rate": 4.942536687415465e-06, "loss": 0.93356018, "memory(GiB)": 369.42, "step": 41665, "train_speed(iter/s)": 0.200887 }, { "acc": 0.73534708, "epoch": 1.0570776255707763, "grad_norm": 2.421875, "learning_rate": 4.941488134261571e-06, "loss": 1.06421976, "memory(GiB)": 369.42, "step": 41670, "train_speed(iter/s)": 0.200891 }, { "acc": 0.7597887, "epoch": 1.0572044647387113, "grad_norm": 2.25, "learning_rate": 4.940439583681288e-06, "loss": 0.90567989, "memory(GiB)": 369.42, "step": 41675, "train_speed(iter/s)": 0.200895 }, { "acc": 0.75187998, "epoch": 1.0573313039066463, "grad_norm": 2.125, "learning_rate": 4.939391035720739e-06, "loss": 0.98260632, "memory(GiB)": 369.42, "step": 41680, "train_speed(iter/s)": 0.200899 }, { "acc": 0.76408844, "epoch": 1.0574581430745815, "grad_norm": 1.9453125, "learning_rate": 4.938342490426041e-06, "loss": 0.91433544, "memory(GiB)": 369.42, "step": 41685, "train_speed(iter/s)": 0.200903 }, { "acc": 0.75619483, "epoch": 1.0575849822425165, "grad_norm": 2.28125, "learning_rate": 4.937293947843318e-06, "loss": 0.98516645, "memory(GiB)": 369.42, "step": 41690, "train_speed(iter/s)": 0.200908 }, { "acc": 0.7529273, "epoch": 1.0577118214104515, "grad_norm": 1.8515625, "learning_rate": 4.936245408018687e-06, "loss": 0.94565659, "memory(GiB)": 369.42, "step": 41695, "train_speed(iter/s)": 0.200912 }, { "acc": 0.75331879, "epoch": 1.0578386605783865, "grad_norm": 2.25, "learning_rate": 4.935196870998265e-06, "loss": 0.99331226, "memory(GiB)": 369.42, "step": 41700, "train_speed(iter/s)": 0.200914 }, { "acc": 0.74625916, "epoch": 1.0579654997463217, "grad_norm": 2.71875, "learning_rate": 4.934148336828176e-06, "loss": 1.02456017, "memory(GiB)": 369.42, "step": 41705, "train_speed(iter/s)": 0.200917 }, { "acc": 0.73828726, "epoch": 1.0580923389142567, "grad_norm": 2.578125, "learning_rate": 4.933099805554538e-06, "loss": 1.06305227, "memory(GiB)": 369.42, "step": 41710, "train_speed(iter/s)": 0.20092 }, { "acc": 0.73687534, "epoch": 1.0582191780821917, "grad_norm": 2.15625, "learning_rate": 4.932051277223468e-06, "loss": 1.04667635, "memory(GiB)": 369.42, "step": 41715, "train_speed(iter/s)": 0.200924 }, { "acc": 0.75335183, "epoch": 1.058346017250127, "grad_norm": 2.3125, "learning_rate": 4.931002751881086e-06, "loss": 0.99925137, "memory(GiB)": 369.42, "step": 41720, "train_speed(iter/s)": 0.200926 }, { "acc": 0.75509853, "epoch": 1.0584728564180619, "grad_norm": 2.515625, "learning_rate": 4.929954229573512e-06, "loss": 0.95872183, "memory(GiB)": 369.42, "step": 41725, "train_speed(iter/s)": 0.20093 }, { "acc": 0.74714913, "epoch": 1.0585996955859969, "grad_norm": 2.03125, "learning_rate": 4.9289057103468635e-06, "loss": 0.94584923, "memory(GiB)": 369.42, "step": 41730, "train_speed(iter/s)": 0.200933 }, { "acc": 0.753052, "epoch": 1.058726534753932, "grad_norm": 1.953125, "learning_rate": 4.927857194247258e-06, "loss": 0.99810352, "memory(GiB)": 369.42, "step": 41735, "train_speed(iter/s)": 0.200936 }, { "acc": 0.73729534, "epoch": 1.058853373921867, "grad_norm": 2.03125, "learning_rate": 4.926808681320816e-06, "loss": 1.01757126, "memory(GiB)": 369.42, "step": 41740, "train_speed(iter/s)": 0.200941 }, { "acc": 0.74222746, "epoch": 1.058980213089802, "grad_norm": 2.703125, "learning_rate": 4.925760171613654e-06, "loss": 1.01509209, "memory(GiB)": 369.42, "step": 41745, "train_speed(iter/s)": 0.200945 }, { "acc": 0.75207386, "epoch": 1.0591070522577373, "grad_norm": 2.0, "learning_rate": 4.9247116651718925e-06, "loss": 0.98268261, "memory(GiB)": 369.42, "step": 41750, "train_speed(iter/s)": 0.200948 }, { "acc": 0.7503047, "epoch": 1.0592338914256723, "grad_norm": 2.53125, "learning_rate": 4.9236631620416486e-06, "loss": 1.06233959, "memory(GiB)": 369.42, "step": 41755, "train_speed(iter/s)": 0.200952 }, { "acc": 0.75560341, "epoch": 1.0593607305936072, "grad_norm": 2.0, "learning_rate": 4.922614662269038e-06, "loss": 1.00476875, "memory(GiB)": 369.42, "step": 41760, "train_speed(iter/s)": 0.200956 }, { "acc": 0.75613022, "epoch": 1.0594875697615425, "grad_norm": 1.9609375, "learning_rate": 4.9215661659001805e-06, "loss": 0.96233845, "memory(GiB)": 369.42, "step": 41765, "train_speed(iter/s)": 0.200956 }, { "acc": 0.7679131, "epoch": 1.0596144089294774, "grad_norm": 2.390625, "learning_rate": 4.920517672981195e-06, "loss": 0.93549957, "memory(GiB)": 369.42, "step": 41770, "train_speed(iter/s)": 0.200961 }, { "acc": 0.7507956, "epoch": 1.0597412480974124, "grad_norm": 2.125, "learning_rate": 4.919469183558195e-06, "loss": 0.98597488, "memory(GiB)": 369.42, "step": 41775, "train_speed(iter/s)": 0.200964 }, { "acc": 0.75967245, "epoch": 1.0598680872653476, "grad_norm": 1.9609375, "learning_rate": 4.9184206976773e-06, "loss": 0.95143871, "memory(GiB)": 369.42, "step": 41780, "train_speed(iter/s)": 0.200966 }, { "acc": 0.74015503, "epoch": 1.0599949264332826, "grad_norm": 2.375, "learning_rate": 4.917372215384627e-06, "loss": 1.06201057, "memory(GiB)": 369.42, "step": 41785, "train_speed(iter/s)": 0.200969 }, { "acc": 0.75403171, "epoch": 1.0601217656012176, "grad_norm": 2.46875, "learning_rate": 4.916323736726295e-06, "loss": 0.95918732, "memory(GiB)": 369.42, "step": 41790, "train_speed(iter/s)": 0.200972 }, { "acc": 0.74315658, "epoch": 1.0602486047691526, "grad_norm": 2.046875, "learning_rate": 4.9152752617484156e-06, "loss": 0.99208984, "memory(GiB)": 369.42, "step": 41795, "train_speed(iter/s)": 0.200975 }, { "acc": 0.75219717, "epoch": 1.0603754439370878, "grad_norm": 2.125, "learning_rate": 4.91422679049711e-06, "loss": 0.99368248, "memory(GiB)": 369.42, "step": 41800, "train_speed(iter/s)": 0.200978 }, { "acc": 0.75455847, "epoch": 1.0605022831050228, "grad_norm": 2.171875, "learning_rate": 4.913178323018493e-06, "loss": 0.95898294, "memory(GiB)": 369.42, "step": 41805, "train_speed(iter/s)": 0.200982 }, { "acc": 0.76138086, "epoch": 1.0606291222729578, "grad_norm": 2.59375, "learning_rate": 4.912129859358682e-06, "loss": 0.97283211, "memory(GiB)": 369.42, "step": 41810, "train_speed(iter/s)": 0.200986 }, { "acc": 0.76400414, "epoch": 1.060755961440893, "grad_norm": 2.25, "learning_rate": 4.9110813995637905e-06, "loss": 0.92617397, "memory(GiB)": 369.42, "step": 41815, "train_speed(iter/s)": 0.200988 }, { "acc": 0.75856266, "epoch": 1.060882800608828, "grad_norm": 2.0, "learning_rate": 4.910032943679936e-06, "loss": 0.9531477, "memory(GiB)": 369.42, "step": 41820, "train_speed(iter/s)": 0.200992 }, { "acc": 0.75606956, "epoch": 1.061009639776763, "grad_norm": 2.1875, "learning_rate": 4.908984491753234e-06, "loss": 0.96290722, "memory(GiB)": 369.42, "step": 41825, "train_speed(iter/s)": 0.200995 }, { "acc": 0.74736967, "epoch": 1.0611364789446982, "grad_norm": 2.34375, "learning_rate": 4.907936043829802e-06, "loss": 0.99387512, "memory(GiB)": 369.42, "step": 41830, "train_speed(iter/s)": 0.200999 }, { "acc": 0.74957533, "epoch": 1.0612633181126332, "grad_norm": 2.046875, "learning_rate": 4.906887599955754e-06, "loss": 1.00259609, "memory(GiB)": 369.42, "step": 41835, "train_speed(iter/s)": 0.201002 }, { "acc": 0.76023622, "epoch": 1.0613901572805682, "grad_norm": 2.234375, "learning_rate": 4.905839160177203e-06, "loss": 0.9532835, "memory(GiB)": 369.42, "step": 41840, "train_speed(iter/s)": 0.201005 }, { "acc": 0.74743509, "epoch": 1.0615169964485034, "grad_norm": 2.3125, "learning_rate": 4.904790724540267e-06, "loss": 0.964328, "memory(GiB)": 369.42, "step": 41845, "train_speed(iter/s)": 0.201009 }, { "acc": 0.75639019, "epoch": 1.0616438356164384, "grad_norm": 2.25, "learning_rate": 4.903742293091061e-06, "loss": 0.97119961, "memory(GiB)": 369.42, "step": 41850, "train_speed(iter/s)": 0.201011 }, { "acc": 0.74548187, "epoch": 1.0617706747843734, "grad_norm": 2.0625, "learning_rate": 4.902693865875698e-06, "loss": 1.0309103, "memory(GiB)": 369.42, "step": 41855, "train_speed(iter/s)": 0.201011 }, { "acc": 0.75009899, "epoch": 1.0618975139523084, "grad_norm": 2.5625, "learning_rate": 4.901645442940293e-06, "loss": 0.97646027, "memory(GiB)": 369.42, "step": 41860, "train_speed(iter/s)": 0.201014 }, { "acc": 0.7581161, "epoch": 1.0620243531202436, "grad_norm": 1.96875, "learning_rate": 4.900597024330961e-06, "loss": 0.94744492, "memory(GiB)": 369.42, "step": 41865, "train_speed(iter/s)": 0.201016 }, { "acc": 0.75100737, "epoch": 1.0621511922881786, "grad_norm": 2.375, "learning_rate": 4.899548610093816e-06, "loss": 1.01589375, "memory(GiB)": 369.42, "step": 41870, "train_speed(iter/s)": 0.201019 }, { "acc": 0.75506811, "epoch": 1.0622780314561135, "grad_norm": 2.828125, "learning_rate": 4.89850020027497e-06, "loss": 0.98097515, "memory(GiB)": 369.42, "step": 41875, "train_speed(iter/s)": 0.201023 }, { "acc": 0.75262136, "epoch": 1.0624048706240488, "grad_norm": 2.1875, "learning_rate": 4.89745179492054e-06, "loss": 0.97737951, "memory(GiB)": 369.42, "step": 41880, "train_speed(iter/s)": 0.201023 }, { "acc": 0.75475245, "epoch": 1.0625317097919837, "grad_norm": 2.1875, "learning_rate": 4.896403394076636e-06, "loss": 0.99815407, "memory(GiB)": 369.42, "step": 41885, "train_speed(iter/s)": 0.201027 }, { "acc": 0.74246864, "epoch": 1.0626585489599187, "grad_norm": 2.671875, "learning_rate": 4.895354997789377e-06, "loss": 1.02226582, "memory(GiB)": 369.42, "step": 41890, "train_speed(iter/s)": 0.201032 }, { "acc": 0.75570316, "epoch": 1.062785388127854, "grad_norm": 2.03125, "learning_rate": 4.894306606104869e-06, "loss": 0.95620728, "memory(GiB)": 369.42, "step": 41895, "train_speed(iter/s)": 0.201034 }, { "acc": 0.75890288, "epoch": 1.062912227295789, "grad_norm": 1.859375, "learning_rate": 4.893258219069229e-06, "loss": 0.95780811, "memory(GiB)": 369.42, "step": 41900, "train_speed(iter/s)": 0.201037 }, { "acc": 0.76158218, "epoch": 1.063039066463724, "grad_norm": 2.15625, "learning_rate": 4.892209836728569e-06, "loss": 0.95911427, "memory(GiB)": 369.42, "step": 41905, "train_speed(iter/s)": 0.201038 }, { "acc": 0.75302801, "epoch": 1.0631659056316591, "grad_norm": 1.8828125, "learning_rate": 4.891161459129003e-06, "loss": 1.02138386, "memory(GiB)": 369.42, "step": 41910, "train_speed(iter/s)": 0.201042 }, { "acc": 0.76602273, "epoch": 1.0632927447995941, "grad_norm": 2.265625, "learning_rate": 4.890113086316641e-06, "loss": 0.98463097, "memory(GiB)": 369.42, "step": 41915, "train_speed(iter/s)": 0.201044 }, { "acc": 0.75726185, "epoch": 1.0634195839675291, "grad_norm": 1.9296875, "learning_rate": 4.889064718337595e-06, "loss": 0.94740305, "memory(GiB)": 369.42, "step": 41920, "train_speed(iter/s)": 0.201048 }, { "acc": 0.76252356, "epoch": 1.0635464231354643, "grad_norm": 1.7890625, "learning_rate": 4.888016355237979e-06, "loss": 0.95368137, "memory(GiB)": 369.42, "step": 41925, "train_speed(iter/s)": 0.201051 }, { "acc": 0.75038733, "epoch": 1.0636732623033993, "grad_norm": 3.71875, "learning_rate": 4.886967997063905e-06, "loss": 0.95705805, "memory(GiB)": 369.42, "step": 41930, "train_speed(iter/s)": 0.201054 }, { "acc": 0.7635128, "epoch": 1.0638001014713343, "grad_norm": 2.390625, "learning_rate": 4.885919643861482e-06, "loss": 0.91984787, "memory(GiB)": 369.42, "step": 41935, "train_speed(iter/s)": 0.201059 }, { "acc": 0.76140294, "epoch": 1.0639269406392695, "grad_norm": 2.578125, "learning_rate": 4.884871295676821e-06, "loss": 0.91317539, "memory(GiB)": 369.42, "step": 41940, "train_speed(iter/s)": 0.201059 }, { "acc": 0.75762157, "epoch": 1.0640537798072045, "grad_norm": 2.15625, "learning_rate": 4.883822952556036e-06, "loss": 0.97197399, "memory(GiB)": 369.42, "step": 41945, "train_speed(iter/s)": 0.201064 }, { "acc": 0.7573987, "epoch": 1.0641806189751395, "grad_norm": 2.359375, "learning_rate": 4.882774614545237e-06, "loss": 0.97833939, "memory(GiB)": 369.42, "step": 41950, "train_speed(iter/s)": 0.201068 }, { "acc": 0.74624281, "epoch": 1.0643074581430745, "grad_norm": 2.359375, "learning_rate": 4.881726281690531e-06, "loss": 0.9963294, "memory(GiB)": 369.42, "step": 41955, "train_speed(iter/s)": 0.20107 }, { "acc": 0.7335743, "epoch": 1.0644342973110097, "grad_norm": 2.5625, "learning_rate": 4.8806779540380335e-06, "loss": 1.03102875, "memory(GiB)": 369.42, "step": 41960, "train_speed(iter/s)": 0.201073 }, { "acc": 0.76297593, "epoch": 1.0645611364789447, "grad_norm": 1.828125, "learning_rate": 4.879629631633851e-06, "loss": 0.96189976, "memory(GiB)": 369.42, "step": 41965, "train_speed(iter/s)": 0.201077 }, { "acc": 0.7448184, "epoch": 1.0646879756468797, "grad_norm": 2.484375, "learning_rate": 4.8785813145240965e-06, "loss": 0.99502869, "memory(GiB)": 369.42, "step": 41970, "train_speed(iter/s)": 0.20108 }, { "acc": 0.74870005, "epoch": 1.0648148148148149, "grad_norm": 2.140625, "learning_rate": 4.877533002754877e-06, "loss": 0.98309326, "memory(GiB)": 369.42, "step": 41975, "train_speed(iter/s)": 0.201084 }, { "acc": 0.73843689, "epoch": 1.0649416539827499, "grad_norm": 1.875, "learning_rate": 4.8764846963723025e-06, "loss": 1.02832966, "memory(GiB)": 369.42, "step": 41980, "train_speed(iter/s)": 0.201089 }, { "acc": 0.74705958, "epoch": 1.0650684931506849, "grad_norm": 2.5, "learning_rate": 4.875436395422481e-06, "loss": 0.99414234, "memory(GiB)": 369.42, "step": 41985, "train_speed(iter/s)": 0.201092 }, { "acc": 0.75564003, "epoch": 1.06519533231862, "grad_norm": 2.0, "learning_rate": 4.874388099951527e-06, "loss": 0.96862888, "memory(GiB)": 369.42, "step": 41990, "train_speed(iter/s)": 0.201097 }, { "acc": 0.77117767, "epoch": 1.065322171486555, "grad_norm": 1.8984375, "learning_rate": 4.873339810005543e-06, "loss": 0.93914948, "memory(GiB)": 369.42, "step": 41995, "train_speed(iter/s)": 0.201099 }, { "acc": 0.73494644, "epoch": 1.06544901065449, "grad_norm": 2.203125, "learning_rate": 4.872291525630638e-06, "loss": 1.02633476, "memory(GiB)": 369.42, "step": 42000, "train_speed(iter/s)": 0.201102 }, { "epoch": 1.06544901065449, "eval_acc": 0.7377937791941018, "eval_loss": 0.9703523516654968, "eval_runtime": 385.2215, "eval_samples_per_second": 16.536, "eval_steps_per_second": 8.268, "step": 42000 }, { "acc": 0.75595922, "epoch": 1.0655758498224253, "grad_norm": 2.328125, "learning_rate": 4.871243246872923e-06, "loss": 0.96498098, "memory(GiB)": 369.42, "step": 42005, "train_speed(iter/s)": 0.200422 }, { "acc": 0.75335112, "epoch": 1.0657026889903602, "grad_norm": 2.15625, "learning_rate": 4.870194973778506e-06, "loss": 1.0021781, "memory(GiB)": 369.42, "step": 42010, "train_speed(iter/s)": 0.200424 }, { "acc": 0.74911098, "epoch": 1.0658295281582952, "grad_norm": 1.96875, "learning_rate": 4.869146706393493e-06, "loss": 1.00974522, "memory(GiB)": 369.42, "step": 42015, "train_speed(iter/s)": 0.200428 }, { "acc": 0.76168728, "epoch": 1.0659563673262302, "grad_norm": 2.40625, "learning_rate": 4.868098444763991e-06, "loss": 0.93676491, "memory(GiB)": 369.42, "step": 42020, "train_speed(iter/s)": 0.200432 }, { "acc": 0.74793196, "epoch": 1.0660832064941654, "grad_norm": 2.734375, "learning_rate": 4.86705018893611e-06, "loss": 1.01043053, "memory(GiB)": 369.42, "step": 42025, "train_speed(iter/s)": 0.200437 }, { "acc": 0.76041651, "epoch": 1.0662100456621004, "grad_norm": 1.875, "learning_rate": 4.866001938955955e-06, "loss": 0.9709507, "memory(GiB)": 369.42, "step": 42030, "train_speed(iter/s)": 0.200439 }, { "acc": 0.74695907, "epoch": 1.0663368848300354, "grad_norm": 1.9453125, "learning_rate": 4.864953694869632e-06, "loss": 0.98657074, "memory(GiB)": 369.42, "step": 42035, "train_speed(iter/s)": 0.200443 }, { "acc": 0.76489296, "epoch": 1.0664637239979706, "grad_norm": 3.078125, "learning_rate": 4.863905456723249e-06, "loss": 0.97003965, "memory(GiB)": 369.42, "step": 42040, "train_speed(iter/s)": 0.200447 }, { "acc": 0.73735914, "epoch": 1.0665905631659056, "grad_norm": 2.03125, "learning_rate": 4.8628572245629105e-06, "loss": 1.05848808, "memory(GiB)": 369.42, "step": 42045, "train_speed(iter/s)": 0.200451 }, { "acc": 0.75096159, "epoch": 1.0667174023338406, "grad_norm": 2.28125, "learning_rate": 4.861808998434726e-06, "loss": 0.96476879, "memory(GiB)": 369.42, "step": 42050, "train_speed(iter/s)": 0.200454 }, { "acc": 0.75992527, "epoch": 1.0668442415017758, "grad_norm": 2.453125, "learning_rate": 4.860760778384797e-06, "loss": 0.95783386, "memory(GiB)": 369.42, "step": 42055, "train_speed(iter/s)": 0.200454 }, { "acc": 0.75013876, "epoch": 1.0669710806697108, "grad_norm": 1.8984375, "learning_rate": 4.85971256445923e-06, "loss": 0.95933437, "memory(GiB)": 369.42, "step": 42060, "train_speed(iter/s)": 0.200456 }, { "acc": 0.74705901, "epoch": 1.0670979198376458, "grad_norm": 2.453125, "learning_rate": 4.858664356704131e-06, "loss": 1.01452389, "memory(GiB)": 369.42, "step": 42065, "train_speed(iter/s)": 0.20046 }, { "acc": 0.77663231, "epoch": 1.067224759005581, "grad_norm": 2.5, "learning_rate": 4.857616155165606e-06, "loss": 0.8881567, "memory(GiB)": 369.42, "step": 42070, "train_speed(iter/s)": 0.200465 }, { "acc": 0.73089499, "epoch": 1.067351598173516, "grad_norm": 2.09375, "learning_rate": 4.856567959889758e-06, "loss": 1.10184078, "memory(GiB)": 369.42, "step": 42075, "train_speed(iter/s)": 0.200467 }, { "acc": 0.76538253, "epoch": 1.067478437341451, "grad_norm": 2.046875, "learning_rate": 4.855519770922691e-06, "loss": 0.98907795, "memory(GiB)": 369.42, "step": 42080, "train_speed(iter/s)": 0.200471 }, { "acc": 0.7518877, "epoch": 1.0676052765093862, "grad_norm": 2.515625, "learning_rate": 4.8544715883105084e-06, "loss": 0.98134289, "memory(GiB)": 369.42, "step": 42085, "train_speed(iter/s)": 0.200473 }, { "acc": 0.75611601, "epoch": 1.0677321156773212, "grad_norm": 2.203125, "learning_rate": 4.853423412099318e-06, "loss": 1.00223179, "memory(GiB)": 369.42, "step": 42090, "train_speed(iter/s)": 0.200476 }, { "acc": 0.74894156, "epoch": 1.0678589548452562, "grad_norm": 1.9453125, "learning_rate": 4.852375242335217e-06, "loss": 0.98934593, "memory(GiB)": 369.42, "step": 42095, "train_speed(iter/s)": 0.200481 }, { "acc": 0.75072889, "epoch": 1.0679857940131914, "grad_norm": 2.171875, "learning_rate": 4.851327079064314e-06, "loss": 0.99094791, "memory(GiB)": 369.42, "step": 42100, "train_speed(iter/s)": 0.200485 }, { "acc": 0.74434471, "epoch": 1.0681126331811264, "grad_norm": 2.34375, "learning_rate": 4.850278922332708e-06, "loss": 1.02721004, "memory(GiB)": 369.42, "step": 42105, "train_speed(iter/s)": 0.200488 }, { "acc": 0.75809317, "epoch": 1.0682394723490614, "grad_norm": 2.421875, "learning_rate": 4.849230772186508e-06, "loss": 0.97007542, "memory(GiB)": 369.42, "step": 42110, "train_speed(iter/s)": 0.20049 }, { "acc": 0.75701056, "epoch": 1.0683663115169963, "grad_norm": 1.9375, "learning_rate": 4.848182628671806e-06, "loss": 0.95546055, "memory(GiB)": 369.42, "step": 42115, "train_speed(iter/s)": 0.200493 }, { "acc": 0.76214285, "epoch": 1.0684931506849316, "grad_norm": 2.0, "learning_rate": 4.847134491834713e-06, "loss": 0.9338707, "memory(GiB)": 369.42, "step": 42120, "train_speed(iter/s)": 0.200497 }, { "acc": 0.75003347, "epoch": 1.0686199898528665, "grad_norm": 1.984375, "learning_rate": 4.846086361721326e-06, "loss": 0.96174164, "memory(GiB)": 369.42, "step": 42125, "train_speed(iter/s)": 0.200502 }, { "acc": 0.74964943, "epoch": 1.0687468290208015, "grad_norm": 2.25, "learning_rate": 4.84503823837775e-06, "loss": 0.96860981, "memory(GiB)": 369.42, "step": 42130, "train_speed(iter/s)": 0.200507 }, { "acc": 0.75428519, "epoch": 1.0688736681887367, "grad_norm": 1.828125, "learning_rate": 4.843990121850083e-06, "loss": 0.95923529, "memory(GiB)": 369.42, "step": 42135, "train_speed(iter/s)": 0.200511 }, { "acc": 0.73992338, "epoch": 1.0690005073566717, "grad_norm": 2.265625, "learning_rate": 4.842942012184426e-06, "loss": 0.9956213, "memory(GiB)": 369.42, "step": 42140, "train_speed(iter/s)": 0.200514 }, { "acc": 0.74226961, "epoch": 1.0691273465246067, "grad_norm": 1.8515625, "learning_rate": 4.841893909426881e-06, "loss": 1.00609455, "memory(GiB)": 369.42, "step": 42145, "train_speed(iter/s)": 0.200517 }, { "acc": 0.75083284, "epoch": 1.069254185692542, "grad_norm": 2.703125, "learning_rate": 4.84084581362355e-06, "loss": 1.03161554, "memory(GiB)": 369.42, "step": 42150, "train_speed(iter/s)": 0.200522 }, { "acc": 0.75522795, "epoch": 1.069381024860477, "grad_norm": 2.109375, "learning_rate": 4.839797724820529e-06, "loss": 0.98507977, "memory(GiB)": 369.42, "step": 42155, "train_speed(iter/s)": 0.200524 }, { "acc": 0.74582567, "epoch": 1.069507864028412, "grad_norm": 2.609375, "learning_rate": 4.838749643063918e-06, "loss": 1.01294022, "memory(GiB)": 369.42, "step": 42160, "train_speed(iter/s)": 0.200528 }, { "acc": 0.74522924, "epoch": 1.0696347031963471, "grad_norm": 2.0625, "learning_rate": 4.837701568399819e-06, "loss": 1.02231617, "memory(GiB)": 369.42, "step": 42165, "train_speed(iter/s)": 0.200532 }, { "acc": 0.74272213, "epoch": 1.0697615423642821, "grad_norm": 1.75, "learning_rate": 4.836653500874331e-06, "loss": 0.98807831, "memory(GiB)": 369.42, "step": 42170, "train_speed(iter/s)": 0.200537 }, { "acc": 0.75192242, "epoch": 1.069888381532217, "grad_norm": 2.171875, "learning_rate": 4.835605440533549e-06, "loss": 1.02027369, "memory(GiB)": 369.42, "step": 42175, "train_speed(iter/s)": 0.200541 }, { "acc": 0.77159281, "epoch": 1.070015220700152, "grad_norm": 2.09375, "learning_rate": 4.834557387423575e-06, "loss": 0.9046258, "memory(GiB)": 369.42, "step": 42180, "train_speed(iter/s)": 0.200544 }, { "acc": 0.75119014, "epoch": 1.0701420598680873, "grad_norm": 1.8359375, "learning_rate": 4.833509341590503e-06, "loss": 0.96842957, "memory(GiB)": 369.42, "step": 42185, "train_speed(iter/s)": 0.200549 }, { "acc": 0.75732117, "epoch": 1.0702688990360223, "grad_norm": 2.234375, "learning_rate": 4.8324613030804374e-06, "loss": 0.9219986, "memory(GiB)": 369.42, "step": 42190, "train_speed(iter/s)": 0.200545 }, { "acc": 0.73920441, "epoch": 1.0703957382039573, "grad_norm": 2.375, "learning_rate": 4.83141327193947e-06, "loss": 1.02562618, "memory(GiB)": 369.42, "step": 42195, "train_speed(iter/s)": 0.200549 }, { "acc": 0.76227083, "epoch": 1.0705225773718925, "grad_norm": 2.296875, "learning_rate": 4.8303652482137e-06, "loss": 0.95164413, "memory(GiB)": 369.42, "step": 42200, "train_speed(iter/s)": 0.200553 }, { "acc": 0.73881121, "epoch": 1.0706494165398275, "grad_norm": 2.0625, "learning_rate": 4.829317231949222e-06, "loss": 0.99046974, "memory(GiB)": 369.42, "step": 42205, "train_speed(iter/s)": 0.200556 }, { "acc": 0.76191072, "epoch": 1.0707762557077625, "grad_norm": 2.171875, "learning_rate": 4.828269223192137e-06, "loss": 0.97959538, "memory(GiB)": 369.42, "step": 42210, "train_speed(iter/s)": 0.200559 }, { "acc": 0.75107517, "epoch": 1.0709030948756977, "grad_norm": 2.34375, "learning_rate": 4.827221221988537e-06, "loss": 0.9855731, "memory(GiB)": 369.42, "step": 42215, "train_speed(iter/s)": 0.200563 }, { "acc": 0.75464582, "epoch": 1.0710299340436327, "grad_norm": 1.984375, "learning_rate": 4.826173228384518e-06, "loss": 0.95755758, "memory(GiB)": 369.42, "step": 42220, "train_speed(iter/s)": 0.200567 }, { "acc": 0.74855204, "epoch": 1.0711567732115677, "grad_norm": 1.9375, "learning_rate": 4.8251252424261775e-06, "loss": 1.0227541, "memory(GiB)": 369.42, "step": 42225, "train_speed(iter/s)": 0.20057 }, { "acc": 0.76252365, "epoch": 1.0712836123795029, "grad_norm": 1.953125, "learning_rate": 4.8240772641596105e-06, "loss": 0.9052742, "memory(GiB)": 369.42, "step": 42230, "train_speed(iter/s)": 0.200572 }, { "acc": 0.75755105, "epoch": 1.0714104515474379, "grad_norm": 2.078125, "learning_rate": 4.82302929363091e-06, "loss": 0.98151464, "memory(GiB)": 369.42, "step": 42235, "train_speed(iter/s)": 0.200577 }, { "acc": 0.74564719, "epoch": 1.0715372907153728, "grad_norm": 2.03125, "learning_rate": 4.8219813308861705e-06, "loss": 1.01156635, "memory(GiB)": 369.42, "step": 42240, "train_speed(iter/s)": 0.200581 }, { "acc": 0.76474333, "epoch": 1.071664129883308, "grad_norm": 2.109375, "learning_rate": 4.820933375971487e-06, "loss": 0.90969944, "memory(GiB)": 369.42, "step": 42245, "train_speed(iter/s)": 0.200584 }, { "acc": 0.75892706, "epoch": 1.071790969051243, "grad_norm": 2.25, "learning_rate": 4.819885428932955e-06, "loss": 0.99217987, "memory(GiB)": 369.42, "step": 42250, "train_speed(iter/s)": 0.200589 }, { "acc": 0.76401701, "epoch": 1.071917808219178, "grad_norm": 2.1875, "learning_rate": 4.818837489816664e-06, "loss": 0.96069241, "memory(GiB)": 369.42, "step": 42255, "train_speed(iter/s)": 0.200594 }, { "acc": 0.7480577, "epoch": 1.0720446473871132, "grad_norm": 1.84375, "learning_rate": 4.81778955866871e-06, "loss": 1.06736622, "memory(GiB)": 369.42, "step": 42260, "train_speed(iter/s)": 0.200598 }, { "acc": 0.76487055, "epoch": 1.0721714865550482, "grad_norm": 2.046875, "learning_rate": 4.816741635535183e-06, "loss": 0.96016464, "memory(GiB)": 369.42, "step": 42265, "train_speed(iter/s)": 0.200602 }, { "acc": 0.74140515, "epoch": 1.0722983257229832, "grad_norm": 2.203125, "learning_rate": 4.81569372046218e-06, "loss": 1.050107, "memory(GiB)": 369.42, "step": 42270, "train_speed(iter/s)": 0.200606 }, { "acc": 0.74157085, "epoch": 1.0724251648909182, "grad_norm": 2.578125, "learning_rate": 4.814645813495788e-06, "loss": 1.04223747, "memory(GiB)": 369.42, "step": 42275, "train_speed(iter/s)": 0.200609 }, { "acc": 0.7476491, "epoch": 1.0725520040588534, "grad_norm": 1.765625, "learning_rate": 4.8135979146821e-06, "loss": 0.99763603, "memory(GiB)": 369.42, "step": 42280, "train_speed(iter/s)": 0.200613 }, { "acc": 0.74821453, "epoch": 1.0726788432267884, "grad_norm": 2.40625, "learning_rate": 4.81255002406721e-06, "loss": 0.95194902, "memory(GiB)": 369.42, "step": 42285, "train_speed(iter/s)": 0.200618 }, { "acc": 0.76766543, "epoch": 1.0728056823947234, "grad_norm": 1.890625, "learning_rate": 4.811502141697206e-06, "loss": 0.92241745, "memory(GiB)": 369.42, "step": 42290, "train_speed(iter/s)": 0.200621 }, { "acc": 0.75035009, "epoch": 1.0729325215626586, "grad_norm": 1.7578125, "learning_rate": 4.81045426761818e-06, "loss": 0.98219986, "memory(GiB)": 369.42, "step": 42295, "train_speed(iter/s)": 0.200625 }, { "acc": 0.75420246, "epoch": 1.0730593607305936, "grad_norm": 1.8359375, "learning_rate": 4.80940640187622e-06, "loss": 1.01435432, "memory(GiB)": 369.42, "step": 42300, "train_speed(iter/s)": 0.200627 }, { "acc": 0.75931396, "epoch": 1.0731861998985286, "grad_norm": 2.1875, "learning_rate": 4.808358544517418e-06, "loss": 0.959095, "memory(GiB)": 369.42, "step": 42305, "train_speed(iter/s)": 0.200631 }, { "acc": 0.76803737, "epoch": 1.0733130390664638, "grad_norm": 2.328125, "learning_rate": 4.807310695587865e-06, "loss": 0.92314758, "memory(GiB)": 369.42, "step": 42310, "train_speed(iter/s)": 0.200635 }, { "acc": 0.72837791, "epoch": 1.0734398782343988, "grad_norm": 2.125, "learning_rate": 4.8062628551336445e-06, "loss": 1.06628914, "memory(GiB)": 369.42, "step": 42315, "train_speed(iter/s)": 0.200638 }, { "acc": 0.74757385, "epoch": 1.0735667174023338, "grad_norm": 2.1875, "learning_rate": 4.80521502320085e-06, "loss": 1.00522051, "memory(GiB)": 369.42, "step": 42320, "train_speed(iter/s)": 0.200642 }, { "acc": 0.74064908, "epoch": 1.073693556570269, "grad_norm": 2.375, "learning_rate": 4.804167199835567e-06, "loss": 0.97686386, "memory(GiB)": 369.42, "step": 42325, "train_speed(iter/s)": 0.200644 }, { "acc": 0.75324202, "epoch": 1.073820395738204, "grad_norm": 2.109375, "learning_rate": 4.8031193850838894e-06, "loss": 0.93835583, "memory(GiB)": 369.42, "step": 42330, "train_speed(iter/s)": 0.200647 }, { "acc": 0.75151095, "epoch": 1.073947234906139, "grad_norm": 2.078125, "learning_rate": 4.802071578991896e-06, "loss": 1.00545311, "memory(GiB)": 369.42, "step": 42335, "train_speed(iter/s)": 0.200652 }, { "acc": 0.74892387, "epoch": 1.074074074074074, "grad_norm": 1.9140625, "learning_rate": 4.801023781605679e-06, "loss": 0.99518871, "memory(GiB)": 369.42, "step": 42340, "train_speed(iter/s)": 0.200655 }, { "acc": 0.74656157, "epoch": 1.0742009132420092, "grad_norm": 1.84375, "learning_rate": 4.799975992971325e-06, "loss": 1.00706787, "memory(GiB)": 369.42, "step": 42345, "train_speed(iter/s)": 0.200657 }, { "acc": 0.75702386, "epoch": 1.0743277524099442, "grad_norm": 2.078125, "learning_rate": 4.798928213134921e-06, "loss": 0.97700558, "memory(GiB)": 369.42, "step": 42350, "train_speed(iter/s)": 0.200661 }, { "acc": 0.75515189, "epoch": 1.0744545915778791, "grad_norm": 2.296875, "learning_rate": 4.797880442142551e-06, "loss": 1.01988068, "memory(GiB)": 369.42, "step": 42355, "train_speed(iter/s)": 0.200661 }, { "acc": 0.75275593, "epoch": 1.0745814307458144, "grad_norm": 2.28125, "learning_rate": 4.7968326800403e-06, "loss": 0.94386997, "memory(GiB)": 369.42, "step": 42360, "train_speed(iter/s)": 0.200665 }, { "acc": 0.74426122, "epoch": 1.0747082699137493, "grad_norm": 2.515625, "learning_rate": 4.795784926874255e-06, "loss": 0.98858862, "memory(GiB)": 369.42, "step": 42365, "train_speed(iter/s)": 0.20067 }, { "acc": 0.75923862, "epoch": 1.0748351090816843, "grad_norm": 2.328125, "learning_rate": 4.794737182690503e-06, "loss": 0.98065901, "memory(GiB)": 369.42, "step": 42370, "train_speed(iter/s)": 0.200675 }, { "acc": 0.73931322, "epoch": 1.0749619482496195, "grad_norm": 1.9375, "learning_rate": 4.793689447535126e-06, "loss": 1.00691528, "memory(GiB)": 369.42, "step": 42375, "train_speed(iter/s)": 0.200679 }, { "acc": 0.76557274, "epoch": 1.0750887874175545, "grad_norm": 2.328125, "learning_rate": 4.792641721454206e-06, "loss": 0.9756196, "memory(GiB)": 369.42, "step": 42380, "train_speed(iter/s)": 0.200683 }, { "acc": 0.75583744, "epoch": 1.0752156265854895, "grad_norm": 2.09375, "learning_rate": 4.79159400449383e-06, "loss": 0.96788597, "memory(GiB)": 369.42, "step": 42385, "train_speed(iter/s)": 0.200687 }, { "acc": 0.75948629, "epoch": 1.0753424657534247, "grad_norm": 1.8828125, "learning_rate": 4.7905462967000816e-06, "loss": 0.95724506, "memory(GiB)": 369.42, "step": 42390, "train_speed(iter/s)": 0.20069 }, { "acc": 0.74114561, "epoch": 1.0754693049213597, "grad_norm": 2.40625, "learning_rate": 4.789498598119039e-06, "loss": 1.04677849, "memory(GiB)": 369.42, "step": 42395, "train_speed(iter/s)": 0.200694 }, { "acc": 0.75292015, "epoch": 1.0755961440892947, "grad_norm": 1.9765625, "learning_rate": 4.78845090879679e-06, "loss": 0.99422207, "memory(GiB)": 369.42, "step": 42400, "train_speed(iter/s)": 0.2007 }, { "acc": 0.75807228, "epoch": 1.07572298325723, "grad_norm": 2.46875, "learning_rate": 4.787403228779413e-06, "loss": 0.97093744, "memory(GiB)": 369.42, "step": 42405, "train_speed(iter/s)": 0.200703 }, { "acc": 0.74750495, "epoch": 1.075849822425165, "grad_norm": 2.84375, "learning_rate": 4.786355558112994e-06, "loss": 1.02053928, "memory(GiB)": 369.42, "step": 42410, "train_speed(iter/s)": 0.200706 }, { "acc": 0.76895933, "epoch": 1.0759766615931, "grad_norm": 1.9140625, "learning_rate": 4.78530789684361e-06, "loss": 0.94018288, "memory(GiB)": 369.42, "step": 42415, "train_speed(iter/s)": 0.200711 }, { "acc": 0.74771333, "epoch": 1.0761035007610351, "grad_norm": 1.6640625, "learning_rate": 4.784260245017343e-06, "loss": 1.00844498, "memory(GiB)": 369.42, "step": 42420, "train_speed(iter/s)": 0.200715 }, { "acc": 0.74615612, "epoch": 1.07623033992897, "grad_norm": 2.21875, "learning_rate": 4.7832126026802725e-06, "loss": 1.00437431, "memory(GiB)": 369.42, "step": 42425, "train_speed(iter/s)": 0.200718 }, { "acc": 0.75387187, "epoch": 1.076357179096905, "grad_norm": 2.078125, "learning_rate": 4.782164969878482e-06, "loss": 0.9147007, "memory(GiB)": 369.42, "step": 42430, "train_speed(iter/s)": 0.200722 }, { "acc": 0.73702173, "epoch": 1.07648401826484, "grad_norm": 2.03125, "learning_rate": 4.781117346658047e-06, "loss": 1.01973057, "memory(GiB)": 369.42, "step": 42435, "train_speed(iter/s)": 0.200725 }, { "acc": 0.74345341, "epoch": 1.0766108574327753, "grad_norm": 2.046875, "learning_rate": 4.780069733065048e-06, "loss": 0.97969723, "memory(GiB)": 369.42, "step": 42440, "train_speed(iter/s)": 0.200728 }, { "acc": 0.7614625, "epoch": 1.0767376966007103, "grad_norm": 3.015625, "learning_rate": 4.779022129145566e-06, "loss": 0.94583578, "memory(GiB)": 369.42, "step": 42445, "train_speed(iter/s)": 0.200731 }, { "acc": 0.74655161, "epoch": 1.0768645357686453, "grad_norm": 2.03125, "learning_rate": 4.777974534945677e-06, "loss": 1.03302498, "memory(GiB)": 369.42, "step": 42450, "train_speed(iter/s)": 0.200735 }, { "acc": 0.74492226, "epoch": 1.0769913749365805, "grad_norm": 1.78125, "learning_rate": 4.776926950511457e-06, "loss": 1.05734444, "memory(GiB)": 369.42, "step": 42455, "train_speed(iter/s)": 0.200739 }, { "acc": 0.73333969, "epoch": 1.0771182141045155, "grad_norm": 1.859375, "learning_rate": 4.775879375888986e-06, "loss": 1.03645, "memory(GiB)": 369.42, "step": 42460, "train_speed(iter/s)": 0.200742 }, { "acc": 0.7556613, "epoch": 1.0772450532724505, "grad_norm": 2.234375, "learning_rate": 4.774831811124343e-06, "loss": 0.97739229, "memory(GiB)": 369.42, "step": 42465, "train_speed(iter/s)": 0.200744 }, { "acc": 0.75378089, "epoch": 1.0773718924403857, "grad_norm": 2.078125, "learning_rate": 4.773784256263601e-06, "loss": 0.96788368, "memory(GiB)": 369.42, "step": 42470, "train_speed(iter/s)": 0.200748 }, { "acc": 0.74277306, "epoch": 1.0774987316083207, "grad_norm": 2.21875, "learning_rate": 4.7727367113528374e-06, "loss": 1.01953697, "memory(GiB)": 369.42, "step": 42475, "train_speed(iter/s)": 0.200749 }, { "acc": 0.74951067, "epoch": 1.0776255707762556, "grad_norm": 1.953125, "learning_rate": 4.771689176438128e-06, "loss": 1.0259304, "memory(GiB)": 369.42, "step": 42480, "train_speed(iter/s)": 0.200751 }, { "acc": 0.76010027, "epoch": 1.0777524099441909, "grad_norm": 2.546875, "learning_rate": 4.770641651565546e-06, "loss": 0.92664089, "memory(GiB)": 369.42, "step": 42485, "train_speed(iter/s)": 0.200754 }, { "acc": 0.76059294, "epoch": 1.0778792491121258, "grad_norm": 2.15625, "learning_rate": 4.769594136781172e-06, "loss": 0.9717926, "memory(GiB)": 369.42, "step": 42490, "train_speed(iter/s)": 0.200758 }, { "acc": 0.75877929, "epoch": 1.0780060882800608, "grad_norm": 2.5, "learning_rate": 4.768546632131074e-06, "loss": 0.97981272, "memory(GiB)": 369.42, "step": 42495, "train_speed(iter/s)": 0.200763 }, { "acc": 0.7487504, "epoch": 1.0781329274479958, "grad_norm": 2.8125, "learning_rate": 4.767499137661328e-06, "loss": 1.01595402, "memory(GiB)": 369.42, "step": 42500, "train_speed(iter/s)": 0.200764 }, { "acc": 0.75725069, "epoch": 1.078259766615931, "grad_norm": 1.7734375, "learning_rate": 4.76645165341801e-06, "loss": 0.8970809, "memory(GiB)": 369.42, "step": 42505, "train_speed(iter/s)": 0.200767 }, { "acc": 0.74095974, "epoch": 1.078386605783866, "grad_norm": 2.125, "learning_rate": 4.76540417944719e-06, "loss": 0.98520012, "memory(GiB)": 369.42, "step": 42510, "train_speed(iter/s)": 0.200769 }, { "acc": 0.75072913, "epoch": 1.078513444951801, "grad_norm": 1.8125, "learning_rate": 4.764356715794942e-06, "loss": 0.99002972, "memory(GiB)": 369.42, "step": 42515, "train_speed(iter/s)": 0.200771 }, { "acc": 0.7599268, "epoch": 1.0786402841197362, "grad_norm": 2.8125, "learning_rate": 4.763309262507336e-06, "loss": 0.91030474, "memory(GiB)": 369.42, "step": 42520, "train_speed(iter/s)": 0.200776 }, { "acc": 0.75090303, "epoch": 1.0787671232876712, "grad_norm": 2.265625, "learning_rate": 4.762261819630447e-06, "loss": 1.00413942, "memory(GiB)": 369.42, "step": 42525, "train_speed(iter/s)": 0.200779 }, { "acc": 0.76504354, "epoch": 1.0788939624556062, "grad_norm": 2.1875, "learning_rate": 4.761214387210345e-06, "loss": 0.9243206, "memory(GiB)": 369.42, "step": 42530, "train_speed(iter/s)": 0.200782 }, { "acc": 0.76835642, "epoch": 1.0790208016235414, "grad_norm": 2.03125, "learning_rate": 4.760166965293099e-06, "loss": 0.97492752, "memory(GiB)": 369.42, "step": 42535, "train_speed(iter/s)": 0.200786 }, { "acc": 0.76157255, "epoch": 1.0791476407914764, "grad_norm": 2.15625, "learning_rate": 4.759119553924781e-06, "loss": 0.9323719, "memory(GiB)": 369.42, "step": 42540, "train_speed(iter/s)": 0.200789 }, { "acc": 0.74725075, "epoch": 1.0792744799594114, "grad_norm": 2.09375, "learning_rate": 4.758072153151461e-06, "loss": 0.98268795, "memory(GiB)": 369.42, "step": 42545, "train_speed(iter/s)": 0.200791 }, { "acc": 0.74889002, "epoch": 1.0794013191273466, "grad_norm": 2.078125, "learning_rate": 4.757024763019209e-06, "loss": 0.94621706, "memory(GiB)": 369.42, "step": 42550, "train_speed(iter/s)": 0.200794 }, { "acc": 0.75753498, "epoch": 1.0795281582952816, "grad_norm": 1.875, "learning_rate": 4.755977383574091e-06, "loss": 0.96668444, "memory(GiB)": 369.42, "step": 42555, "train_speed(iter/s)": 0.200798 }, { "acc": 0.7558105, "epoch": 1.0796549974632166, "grad_norm": 2.203125, "learning_rate": 4.754930014862177e-06, "loss": 0.99245949, "memory(GiB)": 369.42, "step": 42560, "train_speed(iter/s)": 0.200803 }, { "acc": 0.75171881, "epoch": 1.0797818366311518, "grad_norm": 2.0625, "learning_rate": 4.753882656929535e-06, "loss": 1.0386282, "memory(GiB)": 369.42, "step": 42565, "train_speed(iter/s)": 0.200807 }, { "acc": 0.75217857, "epoch": 1.0799086757990868, "grad_norm": 2.0, "learning_rate": 4.752835309822234e-06, "loss": 0.9780159, "memory(GiB)": 369.42, "step": 42570, "train_speed(iter/s)": 0.200811 }, { "acc": 0.76022511, "epoch": 1.0800355149670218, "grad_norm": 2.203125, "learning_rate": 4.7517879735863385e-06, "loss": 0.9571619, "memory(GiB)": 369.42, "step": 42575, "train_speed(iter/s)": 0.200813 }, { "acc": 0.74116445, "epoch": 1.080162354134957, "grad_norm": 1.828125, "learning_rate": 4.750740648267916e-06, "loss": 1.00016584, "memory(GiB)": 369.42, "step": 42580, "train_speed(iter/s)": 0.200817 }, { "acc": 0.744028, "epoch": 1.080289193302892, "grad_norm": 1.765625, "learning_rate": 4.749693333913033e-06, "loss": 0.982654, "memory(GiB)": 369.42, "step": 42585, "train_speed(iter/s)": 0.200822 }, { "acc": 0.7504734, "epoch": 1.080416032470827, "grad_norm": 2.203125, "learning_rate": 4.748646030567755e-06, "loss": 0.97871342, "memory(GiB)": 369.42, "step": 42590, "train_speed(iter/s)": 0.200824 }, { "acc": 0.75200877, "epoch": 1.080542871638762, "grad_norm": 2.3125, "learning_rate": 4.747598738278147e-06, "loss": 1.00353756, "memory(GiB)": 369.42, "step": 42595, "train_speed(iter/s)": 0.200829 }, { "acc": 0.75238199, "epoch": 1.0806697108066972, "grad_norm": 1.7578125, "learning_rate": 4.746551457090272e-06, "loss": 1.00004559, "memory(GiB)": 369.42, "step": 42600, "train_speed(iter/s)": 0.200832 }, { "acc": 0.75140958, "epoch": 1.0807965499746321, "grad_norm": 2.140625, "learning_rate": 4.745504187050197e-06, "loss": 1.02400379, "memory(GiB)": 369.42, "step": 42605, "train_speed(iter/s)": 0.200834 }, { "acc": 0.74561186, "epoch": 1.0809233891425671, "grad_norm": 1.9140625, "learning_rate": 4.744456928203985e-06, "loss": 0.98375559, "memory(GiB)": 369.42, "step": 42610, "train_speed(iter/s)": 0.200835 }, { "acc": 0.74451971, "epoch": 1.0810502283105023, "grad_norm": 2.984375, "learning_rate": 4.743409680597695e-06, "loss": 1.0058115, "memory(GiB)": 369.42, "step": 42615, "train_speed(iter/s)": 0.200838 }, { "acc": 0.74128304, "epoch": 1.0811770674784373, "grad_norm": 2.296875, "learning_rate": 4.742362444277394e-06, "loss": 1.03237152, "memory(GiB)": 369.42, "step": 42620, "train_speed(iter/s)": 0.200843 }, { "acc": 0.73977757, "epoch": 1.0813039066463723, "grad_norm": 2.109375, "learning_rate": 4.741315219289142e-06, "loss": 1.02579432, "memory(GiB)": 369.42, "step": 42625, "train_speed(iter/s)": 0.200846 }, { "acc": 0.75987062, "epoch": 1.0814307458143075, "grad_norm": 2.125, "learning_rate": 4.740268005679005e-06, "loss": 0.95211964, "memory(GiB)": 369.42, "step": 42630, "train_speed(iter/s)": 0.200849 }, { "acc": 0.75569048, "epoch": 1.0815575849822425, "grad_norm": 2.71875, "learning_rate": 4.739220803493039e-06, "loss": 0.9473115, "memory(GiB)": 369.42, "step": 42635, "train_speed(iter/s)": 0.200849 }, { "acc": 0.76212597, "epoch": 1.0816844241501775, "grad_norm": 2.703125, "learning_rate": 4.738173612777306e-06, "loss": 1.0029232, "memory(GiB)": 369.42, "step": 42640, "train_speed(iter/s)": 0.200853 }, { "acc": 0.73553972, "epoch": 1.0818112633181127, "grad_norm": 2.09375, "learning_rate": 4.737126433577866e-06, "loss": 1.08368177, "memory(GiB)": 369.42, "step": 42645, "train_speed(iter/s)": 0.200855 }, { "acc": 0.74395161, "epoch": 1.0819381024860477, "grad_norm": 2.21875, "learning_rate": 4.736079265940781e-06, "loss": 1.00634184, "memory(GiB)": 369.42, "step": 42650, "train_speed(iter/s)": 0.200859 }, { "acc": 0.73744993, "epoch": 1.0820649416539827, "grad_norm": 2.46875, "learning_rate": 4.735032109912107e-06, "loss": 1.0065237, "memory(GiB)": 369.42, "step": 42655, "train_speed(iter/s)": 0.200865 }, { "acc": 0.73877926, "epoch": 1.0821917808219177, "grad_norm": 2.109375, "learning_rate": 4.733984965537903e-06, "loss": 0.97598228, "memory(GiB)": 369.42, "step": 42660, "train_speed(iter/s)": 0.200866 }, { "acc": 0.76601906, "epoch": 1.082318619989853, "grad_norm": 2.109375, "learning_rate": 4.732937832864229e-06, "loss": 0.93211765, "memory(GiB)": 369.42, "step": 42665, "train_speed(iter/s)": 0.20087 }, { "acc": 0.73678875, "epoch": 1.082445459157788, "grad_norm": 2.25, "learning_rate": 4.731890711937141e-06, "loss": 1.03576031, "memory(GiB)": 369.42, "step": 42670, "train_speed(iter/s)": 0.200873 }, { "acc": 0.76075535, "epoch": 1.0825722983257229, "grad_norm": 2.390625, "learning_rate": 4.730843602802696e-06, "loss": 0.95786343, "memory(GiB)": 369.42, "step": 42675, "train_speed(iter/s)": 0.200877 }, { "acc": 0.75654678, "epoch": 1.082699137493658, "grad_norm": 2.515625, "learning_rate": 4.729796505506951e-06, "loss": 0.99051867, "memory(GiB)": 369.42, "step": 42680, "train_speed(iter/s)": 0.200881 }, { "acc": 0.74018059, "epoch": 1.082825976661593, "grad_norm": 1.9453125, "learning_rate": 4.728749420095964e-06, "loss": 1.02495422, "memory(GiB)": 369.42, "step": 42685, "train_speed(iter/s)": 0.200883 }, { "acc": 0.75291262, "epoch": 1.082952815829528, "grad_norm": 2.140625, "learning_rate": 4.727702346615788e-06, "loss": 1.01617451, "memory(GiB)": 369.42, "step": 42690, "train_speed(iter/s)": 0.200887 }, { "acc": 0.75206957, "epoch": 1.0830796549974633, "grad_norm": 1.8046875, "learning_rate": 4.726655285112477e-06, "loss": 0.95440607, "memory(GiB)": 369.42, "step": 42695, "train_speed(iter/s)": 0.200891 }, { "acc": 0.74928403, "epoch": 1.0832064941653983, "grad_norm": 2.328125, "learning_rate": 4.725608235632088e-06, "loss": 0.97984972, "memory(GiB)": 369.42, "step": 42700, "train_speed(iter/s)": 0.200894 }, { "acc": 0.75764899, "epoch": 1.0833333333333333, "grad_norm": 2.34375, "learning_rate": 4.724561198220672e-06, "loss": 0.98383188, "memory(GiB)": 369.42, "step": 42705, "train_speed(iter/s)": 0.200895 }, { "acc": 0.74463978, "epoch": 1.0834601725012685, "grad_norm": 2.03125, "learning_rate": 4.723514172924287e-06, "loss": 1.01863651, "memory(GiB)": 369.42, "step": 42710, "train_speed(iter/s)": 0.200898 }, { "acc": 0.74748878, "epoch": 1.0835870116692035, "grad_norm": 1.859375, "learning_rate": 4.7224671597889825e-06, "loss": 0.97974033, "memory(GiB)": 369.42, "step": 42715, "train_speed(iter/s)": 0.200901 }, { "acc": 0.74347038, "epoch": 1.0837138508371384, "grad_norm": 2.515625, "learning_rate": 4.72142015886081e-06, "loss": 1.02349968, "memory(GiB)": 369.42, "step": 42720, "train_speed(iter/s)": 0.200904 }, { "acc": 0.76254711, "epoch": 1.0838406900050737, "grad_norm": 2.1875, "learning_rate": 4.720373170185823e-06, "loss": 0.97242508, "memory(GiB)": 369.42, "step": 42725, "train_speed(iter/s)": 0.200906 }, { "acc": 0.75537577, "epoch": 1.0839675291730086, "grad_norm": 2.3125, "learning_rate": 4.719326193810075e-06, "loss": 1.01418972, "memory(GiB)": 369.42, "step": 42730, "train_speed(iter/s)": 0.20091 }, { "acc": 0.73986702, "epoch": 1.0840943683409436, "grad_norm": 2.59375, "learning_rate": 4.718279229779612e-06, "loss": 1.04047642, "memory(GiB)": 369.42, "step": 42735, "train_speed(iter/s)": 0.200915 }, { "acc": 0.74562855, "epoch": 1.0842212075088788, "grad_norm": 2.0625, "learning_rate": 4.717232278140485e-06, "loss": 1.06689377, "memory(GiB)": 369.42, "step": 42740, "train_speed(iter/s)": 0.200918 }, { "acc": 0.76419668, "epoch": 1.0843480466768138, "grad_norm": 1.84375, "learning_rate": 4.716185338938746e-06, "loss": 0.93654394, "memory(GiB)": 369.42, "step": 42745, "train_speed(iter/s)": 0.200921 }, { "acc": 0.75166931, "epoch": 1.0844748858447488, "grad_norm": 2.421875, "learning_rate": 4.7151384122204445e-06, "loss": 0.93927498, "memory(GiB)": 369.42, "step": 42750, "train_speed(iter/s)": 0.200925 }, { "acc": 0.74770603, "epoch": 1.0846017250126838, "grad_norm": 2.265625, "learning_rate": 4.7140914980316254e-06, "loss": 0.95453892, "memory(GiB)": 369.42, "step": 42755, "train_speed(iter/s)": 0.200927 }, { "acc": 0.7517745, "epoch": 1.084728564180619, "grad_norm": 2.4375, "learning_rate": 4.713044596418339e-06, "loss": 0.98054476, "memory(GiB)": 369.42, "step": 42760, "train_speed(iter/s)": 0.200932 }, { "acc": 0.75420237, "epoch": 1.084855403348554, "grad_norm": 2.125, "learning_rate": 4.711997707426632e-06, "loss": 0.98804493, "memory(GiB)": 369.42, "step": 42765, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74264603, "epoch": 1.084982242516489, "grad_norm": 2.234375, "learning_rate": 4.710950831102555e-06, "loss": 1.01599503, "memory(GiB)": 369.42, "step": 42770, "train_speed(iter/s)": 0.200937 }, { "acc": 0.74921503, "epoch": 1.0851090816844242, "grad_norm": 2.484375, "learning_rate": 4.709903967492147e-06, "loss": 0.96600342, "memory(GiB)": 369.42, "step": 42775, "train_speed(iter/s)": 0.200941 }, { "acc": 0.73973703, "epoch": 1.0852359208523592, "grad_norm": 2.1875, "learning_rate": 4.7088571166414595e-06, "loss": 0.98485832, "memory(GiB)": 369.42, "step": 42780, "train_speed(iter/s)": 0.200946 }, { "acc": 0.75044594, "epoch": 1.0853627600202942, "grad_norm": 1.8515625, "learning_rate": 4.707810278596534e-06, "loss": 1.00144367, "memory(GiB)": 369.42, "step": 42785, "train_speed(iter/s)": 0.200949 }, { "acc": 0.76352515, "epoch": 1.0854895991882294, "grad_norm": 2.6875, "learning_rate": 4.7067634534034205e-06, "loss": 0.96838074, "memory(GiB)": 369.42, "step": 42790, "train_speed(iter/s)": 0.200954 }, { "acc": 0.76327353, "epoch": 1.0856164383561644, "grad_norm": 2.328125, "learning_rate": 4.705716641108157e-06, "loss": 0.95393906, "memory(GiB)": 369.42, "step": 42795, "train_speed(iter/s)": 0.200959 }, { "acc": 0.75122166, "epoch": 1.0857432775240994, "grad_norm": 2.40625, "learning_rate": 4.7046698417567894e-06, "loss": 1.0185256, "memory(GiB)": 369.42, "step": 42800, "train_speed(iter/s)": 0.200962 }, { "acc": 0.75407457, "epoch": 1.0858701166920346, "grad_norm": 2.421875, "learning_rate": 4.7036230553953616e-06, "loss": 1.00208549, "memory(GiB)": 369.42, "step": 42805, "train_speed(iter/s)": 0.200964 }, { "acc": 0.7630177, "epoch": 1.0859969558599696, "grad_norm": 2.453125, "learning_rate": 4.702576282069916e-06, "loss": 0.91544361, "memory(GiB)": 369.42, "step": 42810, "train_speed(iter/s)": 0.200964 }, { "acc": 0.74373221, "epoch": 1.0861237950279046, "grad_norm": 2.296875, "learning_rate": 4.701529521826492e-06, "loss": 1.04303684, "memory(GiB)": 369.42, "step": 42815, "train_speed(iter/s)": 0.200968 }, { "acc": 0.76843729, "epoch": 1.0862506341958396, "grad_norm": 2.203125, "learning_rate": 4.700482774711131e-06, "loss": 0.9927412, "memory(GiB)": 369.42, "step": 42820, "train_speed(iter/s)": 0.200972 }, { "acc": 0.76553831, "epoch": 1.0863774733637748, "grad_norm": 2.203125, "learning_rate": 4.699436040769877e-06, "loss": 0.93695202, "memory(GiB)": 369.42, "step": 42825, "train_speed(iter/s)": 0.200976 }, { "acc": 0.76015654, "epoch": 1.0865043125317098, "grad_norm": 2.1875, "learning_rate": 4.698389320048768e-06, "loss": 0.96889992, "memory(GiB)": 369.42, "step": 42830, "train_speed(iter/s)": 0.200979 }, { "acc": 0.74676571, "epoch": 1.0866311516996447, "grad_norm": 2.21875, "learning_rate": 4.697342612593841e-06, "loss": 0.98747234, "memory(GiB)": 369.42, "step": 42835, "train_speed(iter/s)": 0.200982 }, { "acc": 0.75412931, "epoch": 1.08675799086758, "grad_norm": 2.0, "learning_rate": 4.696295918451139e-06, "loss": 0.95947495, "memory(GiB)": 369.42, "step": 42840, "train_speed(iter/s)": 0.200985 }, { "acc": 0.76125503, "epoch": 1.086884830035515, "grad_norm": 2.0, "learning_rate": 4.695249237666697e-06, "loss": 0.95619869, "memory(GiB)": 369.42, "step": 42845, "train_speed(iter/s)": 0.200988 }, { "acc": 0.77010555, "epoch": 1.08701166920345, "grad_norm": 2.1875, "learning_rate": 4.694202570286556e-06, "loss": 0.91477776, "memory(GiB)": 369.42, "step": 42850, "train_speed(iter/s)": 0.200988 }, { "acc": 0.75659428, "epoch": 1.0871385083713851, "grad_norm": 1.9765625, "learning_rate": 4.693155916356751e-06, "loss": 0.99879436, "memory(GiB)": 369.42, "step": 42855, "train_speed(iter/s)": 0.200991 }, { "acc": 0.73519249, "epoch": 1.0872653475393201, "grad_norm": 2.328125, "learning_rate": 4.692109275923318e-06, "loss": 1.06037064, "memory(GiB)": 369.42, "step": 42860, "train_speed(iter/s)": 0.200995 }, { "acc": 0.74943271, "epoch": 1.0873921867072551, "grad_norm": 2.34375, "learning_rate": 4.6910626490322925e-06, "loss": 1.0194622, "memory(GiB)": 369.42, "step": 42865, "train_speed(iter/s)": 0.200999 }, { "acc": 0.76057882, "epoch": 1.0875190258751903, "grad_norm": 2.59375, "learning_rate": 4.690016035729714e-06, "loss": 0.95604668, "memory(GiB)": 369.42, "step": 42870, "train_speed(iter/s)": 0.201003 }, { "acc": 0.75064363, "epoch": 1.0876458650431253, "grad_norm": 2.40625, "learning_rate": 4.688969436061612e-06, "loss": 0.97237082, "memory(GiB)": 369.42, "step": 42875, "train_speed(iter/s)": 0.201006 }, { "acc": 0.75082254, "epoch": 1.0877727042110603, "grad_norm": 2.40625, "learning_rate": 4.687922850074022e-06, "loss": 0.99735889, "memory(GiB)": 369.42, "step": 42880, "train_speed(iter/s)": 0.20101 }, { "acc": 0.75413275, "epoch": 1.0878995433789955, "grad_norm": 2.09375, "learning_rate": 4.686876277812981e-06, "loss": 1.03119678, "memory(GiB)": 369.42, "step": 42885, "train_speed(iter/s)": 0.201013 }, { "acc": 0.74576216, "epoch": 1.0880263825469305, "grad_norm": 2.25, "learning_rate": 4.685829719324519e-06, "loss": 1.04409075, "memory(GiB)": 369.42, "step": 42890, "train_speed(iter/s)": 0.201013 }, { "acc": 0.74582939, "epoch": 1.0881532217148655, "grad_norm": 2.25, "learning_rate": 4.6847831746546664e-06, "loss": 1.04320526, "memory(GiB)": 369.42, "step": 42895, "train_speed(iter/s)": 0.201018 }, { "acc": 0.75852361, "epoch": 1.0882800608828007, "grad_norm": 2.8125, "learning_rate": 4.683736643849459e-06, "loss": 0.96838036, "memory(GiB)": 369.42, "step": 42900, "train_speed(iter/s)": 0.201022 }, { "acc": 0.75378289, "epoch": 1.0884069000507357, "grad_norm": 2.234375, "learning_rate": 4.6826901269549255e-06, "loss": 0.95048218, "memory(GiB)": 369.42, "step": 42905, "train_speed(iter/s)": 0.201023 }, { "acc": 0.7465991, "epoch": 1.0885337392186707, "grad_norm": 2.453125, "learning_rate": 4.681643624017097e-06, "loss": 1.00407887, "memory(GiB)": 369.42, "step": 42910, "train_speed(iter/s)": 0.201028 }, { "acc": 0.7688652, "epoch": 1.0886605783866057, "grad_norm": 2.390625, "learning_rate": 4.680597135082002e-06, "loss": 0.90566664, "memory(GiB)": 369.42, "step": 42915, "train_speed(iter/s)": 0.201031 }, { "acc": 0.77578821, "epoch": 1.088787417554541, "grad_norm": 2.359375, "learning_rate": 4.679550660195673e-06, "loss": 0.9478178, "memory(GiB)": 369.42, "step": 42920, "train_speed(iter/s)": 0.201035 }, { "acc": 0.75431633, "epoch": 1.0889142567224759, "grad_norm": 2.515625, "learning_rate": 4.6785041994041345e-06, "loss": 1.00711575, "memory(GiB)": 369.42, "step": 42925, "train_speed(iter/s)": 0.201037 }, { "acc": 0.74139876, "epoch": 1.0890410958904109, "grad_norm": 2.203125, "learning_rate": 4.6774577527534195e-06, "loss": 0.99013462, "memory(GiB)": 369.42, "step": 42930, "train_speed(iter/s)": 0.201042 }, { "acc": 0.75512676, "epoch": 1.089167935058346, "grad_norm": 2.0625, "learning_rate": 4.676411320289551e-06, "loss": 1.03368645, "memory(GiB)": 369.42, "step": 42935, "train_speed(iter/s)": 0.201046 }, { "acc": 0.75444183, "epoch": 1.089294774226281, "grad_norm": 2.390625, "learning_rate": 4.675364902058556e-06, "loss": 1.0181077, "memory(GiB)": 369.42, "step": 42940, "train_speed(iter/s)": 0.20105 }, { "acc": 0.76429138, "epoch": 1.089421613394216, "grad_norm": 2.375, "learning_rate": 4.674318498106464e-06, "loss": 1.00361061, "memory(GiB)": 369.42, "step": 42945, "train_speed(iter/s)": 0.201054 }, { "acc": 0.75835662, "epoch": 1.0895484525621513, "grad_norm": 2.640625, "learning_rate": 4.6732721084792985e-06, "loss": 0.99576244, "memory(GiB)": 369.42, "step": 42950, "train_speed(iter/s)": 0.201058 }, { "acc": 0.74104986, "epoch": 1.0896752917300863, "grad_norm": 2.15625, "learning_rate": 4.672225733223084e-06, "loss": 1.0150877, "memory(GiB)": 369.42, "step": 42955, "train_speed(iter/s)": 0.201062 }, { "acc": 0.76171503, "epoch": 1.0898021308980212, "grad_norm": 2.453125, "learning_rate": 4.671179372383844e-06, "loss": 0.94865799, "memory(GiB)": 369.42, "step": 42960, "train_speed(iter/s)": 0.201065 }, { "acc": 0.75681553, "epoch": 1.0899289700659565, "grad_norm": 2.0625, "learning_rate": 4.670133026007604e-06, "loss": 0.98683891, "memory(GiB)": 369.42, "step": 42965, "train_speed(iter/s)": 0.201069 }, { "acc": 0.74763794, "epoch": 1.0900558092338914, "grad_norm": 2.453125, "learning_rate": 4.669086694140388e-06, "loss": 1.02504349, "memory(GiB)": 369.42, "step": 42970, "train_speed(iter/s)": 0.201067 }, { "acc": 0.75118551, "epoch": 1.0901826484018264, "grad_norm": 1.8984375, "learning_rate": 4.668040376828214e-06, "loss": 0.9684267, "memory(GiB)": 369.42, "step": 42975, "train_speed(iter/s)": 0.20107 }, { "acc": 0.76822567, "epoch": 1.0903094875697614, "grad_norm": 2.171875, "learning_rate": 4.666994074117108e-06, "loss": 0.98094978, "memory(GiB)": 369.42, "step": 42980, "train_speed(iter/s)": 0.201072 }, { "acc": 0.74834609, "epoch": 1.0904363267376966, "grad_norm": 2.203125, "learning_rate": 4.665947786053088e-06, "loss": 1.03431129, "memory(GiB)": 369.42, "step": 42985, "train_speed(iter/s)": 0.201075 }, { "acc": 0.73797135, "epoch": 1.0905631659056316, "grad_norm": 2.203125, "learning_rate": 4.664901512682179e-06, "loss": 1.02024984, "memory(GiB)": 369.42, "step": 42990, "train_speed(iter/s)": 0.201079 }, { "acc": 0.75385284, "epoch": 1.0906900050735666, "grad_norm": 4.6875, "learning_rate": 4.663855254050394e-06, "loss": 0.97548656, "memory(GiB)": 369.42, "step": 42995, "train_speed(iter/s)": 0.20108 }, { "acc": 0.73757591, "epoch": 1.0908168442415018, "grad_norm": 2.328125, "learning_rate": 4.662809010203757e-06, "loss": 1.00683384, "memory(GiB)": 369.42, "step": 43000, "train_speed(iter/s)": 0.201084 }, { "epoch": 1.0908168442415018, "eval_acc": 0.7377390516765628, "eval_loss": 0.9702948927879333, "eval_runtime": 384.6155, "eval_samples_per_second": 16.562, "eval_steps_per_second": 8.281, "step": 43000 }, { "acc": 0.74887161, "epoch": 1.0909436834094368, "grad_norm": 2.109375, "learning_rate": 4.661762781188284e-06, "loss": 1.01392527, "memory(GiB)": 369.42, "step": 43005, "train_speed(iter/s)": 0.200416 }, { "acc": 0.75993915, "epoch": 1.0910705225773718, "grad_norm": 1.8984375, "learning_rate": 4.660716567049997e-06, "loss": 0.94952374, "memory(GiB)": 369.42, "step": 43010, "train_speed(iter/s)": 0.20042 }, { "acc": 0.74987431, "epoch": 1.091197361745307, "grad_norm": 2.140625, "learning_rate": 4.659670367834908e-06, "loss": 0.98040676, "memory(GiB)": 369.42, "step": 43015, "train_speed(iter/s)": 0.200423 }, { "acc": 0.75501366, "epoch": 1.091324200913242, "grad_norm": 2.609375, "learning_rate": 4.658624183589035e-06, "loss": 0.99844589, "memory(GiB)": 369.42, "step": 43020, "train_speed(iter/s)": 0.200428 }, { "acc": 0.76287947, "epoch": 1.091451040081177, "grad_norm": 1.875, "learning_rate": 4.657578014358395e-06, "loss": 0.9760603, "memory(GiB)": 369.42, "step": 43025, "train_speed(iter/s)": 0.200431 }, { "acc": 0.7401577, "epoch": 1.0915778792491122, "grad_norm": 1.984375, "learning_rate": 4.656531860189005e-06, "loss": 0.98366604, "memory(GiB)": 369.42, "step": 43030, "train_speed(iter/s)": 0.200434 }, { "acc": 0.75251923, "epoch": 1.0917047184170472, "grad_norm": 2.03125, "learning_rate": 4.655485721126875e-06, "loss": 0.97327652, "memory(GiB)": 369.42, "step": 43035, "train_speed(iter/s)": 0.200438 }, { "acc": 0.74526901, "epoch": 1.0918315575849822, "grad_norm": 2.546875, "learning_rate": 4.6544395972180214e-06, "loss": 0.99016342, "memory(GiB)": 369.42, "step": 43040, "train_speed(iter/s)": 0.20044 }, { "acc": 0.73756914, "epoch": 1.0919583967529174, "grad_norm": 2.359375, "learning_rate": 4.653393488508457e-06, "loss": 1.02543869, "memory(GiB)": 369.42, "step": 43045, "train_speed(iter/s)": 0.200444 }, { "acc": 0.73980837, "epoch": 1.0920852359208524, "grad_norm": 2.375, "learning_rate": 4.652347395044197e-06, "loss": 0.98825188, "memory(GiB)": 369.42, "step": 43050, "train_speed(iter/s)": 0.200448 }, { "acc": 0.75126514, "epoch": 1.0922120750887874, "grad_norm": 1.859375, "learning_rate": 4.651301316871247e-06, "loss": 1.00863724, "memory(GiB)": 369.42, "step": 43055, "train_speed(iter/s)": 0.20045 }, { "acc": 0.75126114, "epoch": 1.0923389142567226, "grad_norm": 2.171875, "learning_rate": 4.6502552540356235e-06, "loss": 0.99794979, "memory(GiB)": 369.42, "step": 43060, "train_speed(iter/s)": 0.200455 }, { "acc": 0.75369487, "epoch": 1.0924657534246576, "grad_norm": 2.15625, "learning_rate": 4.649209206583335e-06, "loss": 1.02590637, "memory(GiB)": 369.42, "step": 43065, "train_speed(iter/s)": 0.20046 }, { "acc": 0.77294173, "epoch": 1.0925925925925926, "grad_norm": 1.9609375, "learning_rate": 4.648163174560393e-06, "loss": 0.96791401, "memory(GiB)": 369.42, "step": 43070, "train_speed(iter/s)": 0.200462 }, { "acc": 0.75584326, "epoch": 1.0927194317605275, "grad_norm": 2.09375, "learning_rate": 4.647117158012804e-06, "loss": 0.99274588, "memory(GiB)": 369.42, "step": 43075, "train_speed(iter/s)": 0.200466 }, { "acc": 0.74996328, "epoch": 1.0928462709284628, "grad_norm": 2.109375, "learning_rate": 4.646071156986579e-06, "loss": 0.98407259, "memory(GiB)": 369.42, "step": 43080, "train_speed(iter/s)": 0.20047 }, { "acc": 0.74662719, "epoch": 1.0929731100963977, "grad_norm": 2.359375, "learning_rate": 4.645025171527723e-06, "loss": 1.00277681, "memory(GiB)": 369.42, "step": 43085, "train_speed(iter/s)": 0.200472 }, { "acc": 0.74289198, "epoch": 1.0930999492643327, "grad_norm": 2.046875, "learning_rate": 4.643979201682247e-06, "loss": 1.01386375, "memory(GiB)": 369.42, "step": 43090, "train_speed(iter/s)": 0.200477 }, { "acc": 0.7486753, "epoch": 1.093226788432268, "grad_norm": 2.234375, "learning_rate": 4.642933247496155e-06, "loss": 1.00140133, "memory(GiB)": 369.42, "step": 43095, "train_speed(iter/s)": 0.200481 }, { "acc": 0.74164019, "epoch": 1.093353627600203, "grad_norm": 2.484375, "learning_rate": 4.641887309015451e-06, "loss": 1.02196903, "memory(GiB)": 369.42, "step": 43100, "train_speed(iter/s)": 0.200483 }, { "acc": 0.75317621, "epoch": 1.093480466768138, "grad_norm": 1.8046875, "learning_rate": 4.640841386286143e-06, "loss": 0.96088581, "memory(GiB)": 369.42, "step": 43105, "train_speed(iter/s)": 0.200486 }, { "acc": 0.74355164, "epoch": 1.0936073059360731, "grad_norm": 2.21875, "learning_rate": 4.639795479354236e-06, "loss": 1.05159674, "memory(GiB)": 369.42, "step": 43110, "train_speed(iter/s)": 0.200489 }, { "acc": 0.75678568, "epoch": 1.0937341451040081, "grad_norm": 3.1875, "learning_rate": 4.6387495882657295e-06, "loss": 0.99024582, "memory(GiB)": 369.42, "step": 43115, "train_speed(iter/s)": 0.200486 }, { "acc": 0.7467967, "epoch": 1.0938609842719431, "grad_norm": 2.15625, "learning_rate": 4.63770371306663e-06, "loss": 1.00936537, "memory(GiB)": 369.42, "step": 43120, "train_speed(iter/s)": 0.200489 }, { "acc": 0.75250053, "epoch": 1.0939878234398783, "grad_norm": 2.375, "learning_rate": 4.636657853802939e-06, "loss": 0.93497572, "memory(GiB)": 369.42, "step": 43125, "train_speed(iter/s)": 0.200491 }, { "acc": 0.76043501, "epoch": 1.0941146626078133, "grad_norm": 2.3125, "learning_rate": 4.635612010520659e-06, "loss": 1.04659672, "memory(GiB)": 369.42, "step": 43130, "train_speed(iter/s)": 0.200496 }, { "acc": 0.75105257, "epoch": 1.0942415017757483, "grad_norm": 2.5, "learning_rate": 4.6345661832657866e-06, "loss": 0.99773502, "memory(GiB)": 369.42, "step": 43135, "train_speed(iter/s)": 0.2005 }, { "acc": 0.75625544, "epoch": 1.0943683409436833, "grad_norm": 2.046875, "learning_rate": 4.633520372084327e-06, "loss": 0.94788895, "memory(GiB)": 369.42, "step": 43140, "train_speed(iter/s)": 0.200502 }, { "acc": 0.75035014, "epoch": 1.0944951801116185, "grad_norm": 2.078125, "learning_rate": 4.632474577022276e-06, "loss": 1.04273396, "memory(GiB)": 369.42, "step": 43145, "train_speed(iter/s)": 0.200506 }, { "acc": 0.75857506, "epoch": 1.0946220192795535, "grad_norm": 2.171875, "learning_rate": 4.631428798125637e-06, "loss": 0.96126347, "memory(GiB)": 369.42, "step": 43150, "train_speed(iter/s)": 0.200508 }, { "acc": 0.75151534, "epoch": 1.0947488584474885, "grad_norm": 1.828125, "learning_rate": 4.630383035440403e-06, "loss": 0.97453976, "memory(GiB)": 369.42, "step": 43155, "train_speed(iter/s)": 0.200511 }, { "acc": 0.76614733, "epoch": 1.0948756976154237, "grad_norm": 1.9921875, "learning_rate": 4.6293372890125724e-06, "loss": 0.97563076, "memory(GiB)": 369.42, "step": 43160, "train_speed(iter/s)": 0.200514 }, { "acc": 0.75695028, "epoch": 1.0950025367833587, "grad_norm": 1.84375, "learning_rate": 4.628291558888144e-06, "loss": 1.00087032, "memory(GiB)": 369.42, "step": 43165, "train_speed(iter/s)": 0.200516 }, { "acc": 0.75732718, "epoch": 1.0951293759512937, "grad_norm": 2.171875, "learning_rate": 4.627245845113113e-06, "loss": 0.96805038, "memory(GiB)": 369.42, "step": 43170, "train_speed(iter/s)": 0.200521 }, { "acc": 0.74872961, "epoch": 1.0952562151192289, "grad_norm": 2.359375, "learning_rate": 4.626200147733474e-06, "loss": 0.98103514, "memory(GiB)": 369.42, "step": 43175, "train_speed(iter/s)": 0.200524 }, { "acc": 0.7476809, "epoch": 1.0953830542871639, "grad_norm": 2.140625, "learning_rate": 4.62515446679522e-06, "loss": 1.05144024, "memory(GiB)": 369.42, "step": 43180, "train_speed(iter/s)": 0.200525 }, { "acc": 0.76809969, "epoch": 1.0955098934550989, "grad_norm": 2.015625, "learning_rate": 4.624108802344347e-06, "loss": 0.90919933, "memory(GiB)": 369.42, "step": 43185, "train_speed(iter/s)": 0.20053 }, { "acc": 0.75442715, "epoch": 1.095636732623034, "grad_norm": 2.578125, "learning_rate": 4.623063154426848e-06, "loss": 0.98422947, "memory(GiB)": 369.42, "step": 43190, "train_speed(iter/s)": 0.200533 }, { "acc": 0.74818125, "epoch": 1.095763571790969, "grad_norm": 2.125, "learning_rate": 4.622017523088712e-06, "loss": 0.97464714, "memory(GiB)": 369.42, "step": 43195, "train_speed(iter/s)": 0.200537 }, { "acc": 0.74919481, "epoch": 1.095890410958904, "grad_norm": 2.34375, "learning_rate": 4.620971908375934e-06, "loss": 0.99574547, "memory(GiB)": 369.42, "step": 43200, "train_speed(iter/s)": 0.20054 }, { "acc": 0.76451564, "epoch": 1.0960172501268393, "grad_norm": 2.625, "learning_rate": 4.619926310334503e-06, "loss": 0.94317608, "memory(GiB)": 369.42, "step": 43205, "train_speed(iter/s)": 0.200544 }, { "acc": 0.74418378, "epoch": 1.0961440892947742, "grad_norm": 2.046875, "learning_rate": 4.618880729010413e-06, "loss": 1.01733189, "memory(GiB)": 369.42, "step": 43210, "train_speed(iter/s)": 0.20055 }, { "acc": 0.7571888, "epoch": 1.0962709284627092, "grad_norm": 2.078125, "learning_rate": 4.617835164449647e-06, "loss": 0.95378971, "memory(GiB)": 369.42, "step": 43215, "train_speed(iter/s)": 0.200553 }, { "acc": 0.75195827, "epoch": 1.0963977676306444, "grad_norm": 2.515625, "learning_rate": 4.616789616698197e-06, "loss": 0.99051819, "memory(GiB)": 369.42, "step": 43220, "train_speed(iter/s)": 0.200557 }, { "acc": 0.73845916, "epoch": 1.0965246067985794, "grad_norm": 2.5, "learning_rate": 4.61574408580205e-06, "loss": 1.00154343, "memory(GiB)": 369.42, "step": 43225, "train_speed(iter/s)": 0.200561 }, { "acc": 0.76419744, "epoch": 1.0966514459665144, "grad_norm": 2.34375, "learning_rate": 4.614698571807196e-06, "loss": 0.93944931, "memory(GiB)": 369.42, "step": 43230, "train_speed(iter/s)": 0.200563 }, { "acc": 0.74699507, "epoch": 1.0967782851344494, "grad_norm": 2.28125, "learning_rate": 4.6136530747596185e-06, "loss": 1.00694141, "memory(GiB)": 369.42, "step": 43235, "train_speed(iter/s)": 0.200567 }, { "acc": 0.75040255, "epoch": 1.0969051243023846, "grad_norm": 2.09375, "learning_rate": 4.612607594705301e-06, "loss": 1.0231514, "memory(GiB)": 369.42, "step": 43240, "train_speed(iter/s)": 0.200572 }, { "acc": 0.76683764, "epoch": 1.0970319634703196, "grad_norm": 2.53125, "learning_rate": 4.611562131690234e-06, "loss": 0.96018295, "memory(GiB)": 369.42, "step": 43245, "train_speed(iter/s)": 0.200576 }, { "acc": 0.74861755, "epoch": 1.0971588026382546, "grad_norm": 2.078125, "learning_rate": 4.610516685760399e-06, "loss": 0.98001041, "memory(GiB)": 369.42, "step": 43250, "train_speed(iter/s)": 0.200578 }, { "acc": 0.76106133, "epoch": 1.0972856418061898, "grad_norm": 2.03125, "learning_rate": 4.6094712569617775e-06, "loss": 0.94601421, "memory(GiB)": 369.42, "step": 43255, "train_speed(iter/s)": 0.200581 }, { "acc": 0.74991989, "epoch": 1.0974124809741248, "grad_norm": 2.140625, "learning_rate": 4.608425845340353e-06, "loss": 0.99480209, "memory(GiB)": 369.42, "step": 43260, "train_speed(iter/s)": 0.200586 }, { "acc": 0.73192892, "epoch": 1.0975393201420598, "grad_norm": 2.296875, "learning_rate": 4.607380450942109e-06, "loss": 1.04139872, "memory(GiB)": 369.42, "step": 43265, "train_speed(iter/s)": 0.20059 }, { "acc": 0.74977612, "epoch": 1.097666159309995, "grad_norm": 2.25, "learning_rate": 4.606335073813028e-06, "loss": 1.01636124, "memory(GiB)": 369.42, "step": 43270, "train_speed(iter/s)": 0.200595 }, { "acc": 0.74033861, "epoch": 1.09779299847793, "grad_norm": 1.9921875, "learning_rate": 4.605289713999085e-06, "loss": 1.0239706, "memory(GiB)": 369.42, "step": 43275, "train_speed(iter/s)": 0.200598 }, { "acc": 0.74891305, "epoch": 1.097919837645865, "grad_norm": 1.7265625, "learning_rate": 4.604244371546263e-06, "loss": 0.97725344, "memory(GiB)": 369.42, "step": 43280, "train_speed(iter/s)": 0.200602 }, { "acc": 0.7372858, "epoch": 1.0980466768138002, "grad_norm": 1.9453125, "learning_rate": 4.603199046500539e-06, "loss": 1.05126286, "memory(GiB)": 369.42, "step": 43285, "train_speed(iter/s)": 0.200605 }, { "acc": 0.75333004, "epoch": 1.0981735159817352, "grad_norm": 2.25, "learning_rate": 4.602153738907896e-06, "loss": 0.97266598, "memory(GiB)": 369.42, "step": 43290, "train_speed(iter/s)": 0.200608 }, { "acc": 0.75179672, "epoch": 1.0983003551496702, "grad_norm": 2.25, "learning_rate": 4.601108448814306e-06, "loss": 0.97973328, "memory(GiB)": 369.42, "step": 43295, "train_speed(iter/s)": 0.200612 }, { "acc": 0.7473649, "epoch": 1.0984271943176052, "grad_norm": 1.8515625, "learning_rate": 4.600063176265749e-06, "loss": 0.92627029, "memory(GiB)": 369.42, "step": 43300, "train_speed(iter/s)": 0.200616 }, { "acc": 0.756252, "epoch": 1.0985540334855404, "grad_norm": 2.1875, "learning_rate": 4.599017921308196e-06, "loss": 0.99561062, "memory(GiB)": 369.42, "step": 43305, "train_speed(iter/s)": 0.200618 }, { "acc": 0.75191894, "epoch": 1.0986808726534754, "grad_norm": 2.3125, "learning_rate": 4.5979726839876285e-06, "loss": 0.96867256, "memory(GiB)": 369.42, "step": 43310, "train_speed(iter/s)": 0.200622 }, { "acc": 0.74827704, "epoch": 1.0988077118214103, "grad_norm": 1.984375, "learning_rate": 4.596927464350015e-06, "loss": 1.01139374, "memory(GiB)": 369.42, "step": 43315, "train_speed(iter/s)": 0.200625 }, { "acc": 0.74810081, "epoch": 1.0989345509893456, "grad_norm": 2.015625, "learning_rate": 4.595882262441331e-06, "loss": 0.96086359, "memory(GiB)": 369.42, "step": 43320, "train_speed(iter/s)": 0.200628 }, { "acc": 0.75440354, "epoch": 1.0990613901572805, "grad_norm": 1.8046875, "learning_rate": 4.5948370783075505e-06, "loss": 0.97948475, "memory(GiB)": 369.42, "step": 43325, "train_speed(iter/s)": 0.200629 }, { "acc": 0.75108376, "epoch": 1.0991882293252155, "grad_norm": 2.28125, "learning_rate": 4.5937919119946445e-06, "loss": 0.99564028, "memory(GiB)": 369.42, "step": 43330, "train_speed(iter/s)": 0.200634 }, { "acc": 0.75864902, "epoch": 1.0993150684931507, "grad_norm": 2.453125, "learning_rate": 4.592746763548582e-06, "loss": 0.94183407, "memory(GiB)": 369.42, "step": 43335, "train_speed(iter/s)": 0.200636 }, { "acc": 0.7471036, "epoch": 1.0994419076610857, "grad_norm": 2.078125, "learning_rate": 4.591701633015336e-06, "loss": 0.99304905, "memory(GiB)": 369.42, "step": 43340, "train_speed(iter/s)": 0.200638 }, { "acc": 0.75512524, "epoch": 1.0995687468290207, "grad_norm": 2.0625, "learning_rate": 4.590656520440876e-06, "loss": 1.01285419, "memory(GiB)": 369.42, "step": 43345, "train_speed(iter/s)": 0.200643 }, { "acc": 0.76887646, "epoch": 1.099695585996956, "grad_norm": 2.234375, "learning_rate": 4.58961142587117e-06, "loss": 0.92157736, "memory(GiB)": 369.42, "step": 43350, "train_speed(iter/s)": 0.200645 }, { "acc": 0.74358492, "epoch": 1.099822425164891, "grad_norm": 2.125, "learning_rate": 4.588566349352185e-06, "loss": 0.94809666, "memory(GiB)": 369.42, "step": 43355, "train_speed(iter/s)": 0.200648 }, { "acc": 0.75254297, "epoch": 1.099949264332826, "grad_norm": 2.28125, "learning_rate": 4.5875212909298885e-06, "loss": 0.96233273, "memory(GiB)": 369.42, "step": 43360, "train_speed(iter/s)": 0.200652 }, { "acc": 0.75855417, "epoch": 1.1000761035007611, "grad_norm": 2.046875, "learning_rate": 4.586476250650246e-06, "loss": 0.94499741, "memory(GiB)": 369.42, "step": 43365, "train_speed(iter/s)": 0.200655 }, { "acc": 0.74841385, "epoch": 1.1002029426686961, "grad_norm": 1.921875, "learning_rate": 4.585431228559228e-06, "loss": 0.96125031, "memory(GiB)": 369.42, "step": 43370, "train_speed(iter/s)": 0.20066 }, { "acc": 0.74922276, "epoch": 1.100329781836631, "grad_norm": 2.0, "learning_rate": 4.584386224702792e-06, "loss": 0.96394291, "memory(GiB)": 369.42, "step": 43375, "train_speed(iter/s)": 0.200664 }, { "acc": 0.74946051, "epoch": 1.1004566210045663, "grad_norm": 2.5625, "learning_rate": 4.583341239126906e-06, "loss": 0.99577961, "memory(GiB)": 369.42, "step": 43380, "train_speed(iter/s)": 0.200669 }, { "acc": 0.74849381, "epoch": 1.1005834601725013, "grad_norm": 2.0625, "learning_rate": 4.582296271877534e-06, "loss": 1.05900316, "memory(GiB)": 369.42, "step": 43385, "train_speed(iter/s)": 0.200673 }, { "acc": 0.74741459, "epoch": 1.1007102993404363, "grad_norm": 2.3125, "learning_rate": 4.581251323000636e-06, "loss": 0.99805412, "memory(GiB)": 369.42, "step": 43390, "train_speed(iter/s)": 0.200676 }, { "acc": 0.73181891, "epoch": 1.1008371385083713, "grad_norm": 1.9609375, "learning_rate": 4.580206392542175e-06, "loss": 1.0059391, "memory(GiB)": 369.42, "step": 43395, "train_speed(iter/s)": 0.200678 }, { "acc": 0.75001431, "epoch": 1.1009639776763065, "grad_norm": 2.140625, "learning_rate": 4.579161480548109e-06, "loss": 0.9914938, "memory(GiB)": 369.42, "step": 43400, "train_speed(iter/s)": 0.200682 }, { "acc": 0.74869461, "epoch": 1.1010908168442415, "grad_norm": 1.828125, "learning_rate": 4.578116587064402e-06, "loss": 1.00031033, "memory(GiB)": 369.42, "step": 43405, "train_speed(iter/s)": 0.200684 }, { "acc": 0.75674014, "epoch": 1.1012176560121765, "grad_norm": 3.25, "learning_rate": 4.577071712137012e-06, "loss": 0.967939, "memory(GiB)": 369.42, "step": 43410, "train_speed(iter/s)": 0.200688 }, { "acc": 0.75976763, "epoch": 1.1013444951801117, "grad_norm": 2.0, "learning_rate": 4.576026855811893e-06, "loss": 0.9958189, "memory(GiB)": 369.42, "step": 43415, "train_speed(iter/s)": 0.200692 }, { "acc": 0.76201229, "epoch": 1.1014713343480467, "grad_norm": 3.328125, "learning_rate": 4.5749820181350095e-06, "loss": 0.96210766, "memory(GiB)": 369.42, "step": 43420, "train_speed(iter/s)": 0.200696 }, { "acc": 0.74302449, "epoch": 1.1015981735159817, "grad_norm": 2.015625, "learning_rate": 4.57393719915231e-06, "loss": 0.96830101, "memory(GiB)": 369.42, "step": 43425, "train_speed(iter/s)": 0.2007 }, { "acc": 0.74995122, "epoch": 1.1017250126839169, "grad_norm": 2.234375, "learning_rate": 4.5728923989097604e-06, "loss": 0.9889473, "memory(GiB)": 369.42, "step": 43430, "train_speed(iter/s)": 0.200701 }, { "acc": 0.76156335, "epoch": 1.1018518518518519, "grad_norm": 2.09375, "learning_rate": 4.571847617453306e-06, "loss": 0.92360172, "memory(GiB)": 369.42, "step": 43435, "train_speed(iter/s)": 0.200704 }, { "acc": 0.74867058, "epoch": 1.1019786910197868, "grad_norm": 2.125, "learning_rate": 4.570802854828906e-06, "loss": 1.01038284, "memory(GiB)": 369.42, "step": 43440, "train_speed(iter/s)": 0.200707 }, { "acc": 0.75833616, "epoch": 1.102105530187722, "grad_norm": 1.8828125, "learning_rate": 4.569758111082512e-06, "loss": 0.93810425, "memory(GiB)": 369.42, "step": 43445, "train_speed(iter/s)": 0.20071 }, { "acc": 0.76751137, "epoch": 1.102232369355657, "grad_norm": 2.75, "learning_rate": 4.568713386260078e-06, "loss": 0.91821804, "memory(GiB)": 369.42, "step": 43450, "train_speed(iter/s)": 0.200714 }, { "acc": 0.7742487, "epoch": 1.102359208523592, "grad_norm": 2.390625, "learning_rate": 4.567668680407555e-06, "loss": 0.96382732, "memory(GiB)": 369.42, "step": 43455, "train_speed(iter/s)": 0.200719 }, { "acc": 0.74421797, "epoch": 1.102486047691527, "grad_norm": 2.28125, "learning_rate": 4.566623993570893e-06, "loss": 1.00864506, "memory(GiB)": 369.42, "step": 43460, "train_speed(iter/s)": 0.200723 }, { "acc": 0.72911057, "epoch": 1.1026128868594622, "grad_norm": 2.109375, "learning_rate": 4.565579325796043e-06, "loss": 1.09157276, "memory(GiB)": 369.42, "step": 43465, "train_speed(iter/s)": 0.200727 }, { "acc": 0.75403042, "epoch": 1.1027397260273972, "grad_norm": 2.0625, "learning_rate": 4.564534677128954e-06, "loss": 0.9972702, "memory(GiB)": 369.42, "step": 43470, "train_speed(iter/s)": 0.200731 }, { "acc": 0.74941978, "epoch": 1.1028665651953322, "grad_norm": 1.96875, "learning_rate": 4.563490047615574e-06, "loss": 0.98814106, "memory(GiB)": 369.42, "step": 43475, "train_speed(iter/s)": 0.200734 }, { "acc": 0.75634789, "epoch": 1.1029934043632674, "grad_norm": 2.171875, "learning_rate": 4.56244543730185e-06, "loss": 0.95726652, "memory(GiB)": 369.42, "step": 43480, "train_speed(iter/s)": 0.200738 }, { "acc": 0.75786476, "epoch": 1.1031202435312024, "grad_norm": 2.0625, "learning_rate": 4.561400846233729e-06, "loss": 0.95575142, "memory(GiB)": 369.42, "step": 43485, "train_speed(iter/s)": 0.20074 }, { "acc": 0.75371342, "epoch": 1.1032470826991374, "grad_norm": 1.875, "learning_rate": 4.56035627445716e-06, "loss": 0.9456975, "memory(GiB)": 369.42, "step": 43490, "train_speed(iter/s)": 0.200743 }, { "acc": 0.73166723, "epoch": 1.1033739218670726, "grad_norm": 2.59375, "learning_rate": 4.55931172201808e-06, "loss": 1.08362665, "memory(GiB)": 369.42, "step": 43495, "train_speed(iter/s)": 0.200747 }, { "acc": 0.74634466, "epoch": 1.1035007610350076, "grad_norm": 2.25, "learning_rate": 4.558267188962441e-06, "loss": 0.97568722, "memory(GiB)": 369.42, "step": 43500, "train_speed(iter/s)": 0.200749 }, { "acc": 0.75018802, "epoch": 1.1036276002029426, "grad_norm": 1.84375, "learning_rate": 4.557222675336182e-06, "loss": 1.00842638, "memory(GiB)": 369.42, "step": 43505, "train_speed(iter/s)": 0.200753 }, { "acc": 0.74784164, "epoch": 1.1037544393708778, "grad_norm": 2.25, "learning_rate": 4.556178181185249e-06, "loss": 0.98300896, "memory(GiB)": 369.42, "step": 43510, "train_speed(iter/s)": 0.200754 }, { "acc": 0.74656544, "epoch": 1.1038812785388128, "grad_norm": 2.15625, "learning_rate": 4.555133706555579e-06, "loss": 1.04281521, "memory(GiB)": 369.42, "step": 43515, "train_speed(iter/s)": 0.200758 }, { "acc": 0.74346838, "epoch": 1.1040081177067478, "grad_norm": 2.28125, "learning_rate": 4.554089251493115e-06, "loss": 1.02843971, "memory(GiB)": 369.42, "step": 43520, "train_speed(iter/s)": 0.200762 }, { "acc": 0.7577158, "epoch": 1.104134956874683, "grad_norm": 2.46875, "learning_rate": 4.553044816043796e-06, "loss": 0.93956032, "memory(GiB)": 369.42, "step": 43525, "train_speed(iter/s)": 0.200764 }, { "acc": 0.75140467, "epoch": 1.104261796042618, "grad_norm": 2.390625, "learning_rate": 4.552000400253563e-06, "loss": 0.9892231, "memory(GiB)": 369.42, "step": 43530, "train_speed(iter/s)": 0.200768 }, { "acc": 0.75643358, "epoch": 1.104388635210553, "grad_norm": 2.3125, "learning_rate": 4.550956004168352e-06, "loss": 0.97183704, "memory(GiB)": 369.42, "step": 43535, "train_speed(iter/s)": 0.200771 }, { "acc": 0.74868417, "epoch": 1.1045154743784882, "grad_norm": 1.828125, "learning_rate": 4.5499116278341e-06, "loss": 1.00954676, "memory(GiB)": 369.42, "step": 43540, "train_speed(iter/s)": 0.200773 }, { "acc": 0.75475225, "epoch": 1.1046423135464232, "grad_norm": 2.203125, "learning_rate": 4.548867271296745e-06, "loss": 1.00154877, "memory(GiB)": 369.42, "step": 43545, "train_speed(iter/s)": 0.200776 }, { "acc": 0.76914697, "epoch": 1.1047691527143582, "grad_norm": 2.1875, "learning_rate": 4.547822934602222e-06, "loss": 0.92299938, "memory(GiB)": 369.42, "step": 43550, "train_speed(iter/s)": 0.20078 }, { "acc": 0.76191735, "epoch": 1.1048959918822931, "grad_norm": 2.28125, "learning_rate": 4.5467786177964635e-06, "loss": 0.97482214, "memory(GiB)": 369.42, "step": 43555, "train_speed(iter/s)": 0.200782 }, { "acc": 0.74472623, "epoch": 1.1050228310502284, "grad_norm": 2.34375, "learning_rate": 4.545734320925406e-06, "loss": 1.0335434, "memory(GiB)": 369.42, "step": 43560, "train_speed(iter/s)": 0.200785 }, { "acc": 0.75810637, "epoch": 1.1051496702181633, "grad_norm": 2.453125, "learning_rate": 4.544690044034981e-06, "loss": 0.98522415, "memory(GiB)": 369.42, "step": 43565, "train_speed(iter/s)": 0.200789 }, { "acc": 0.75464945, "epoch": 1.1052765093860983, "grad_norm": 2.03125, "learning_rate": 4.543645787171122e-06, "loss": 0.98231792, "memory(GiB)": 369.42, "step": 43570, "train_speed(iter/s)": 0.200794 }, { "acc": 0.74125562, "epoch": 1.1054033485540335, "grad_norm": 2.125, "learning_rate": 4.5426015503797565e-06, "loss": 1.02704353, "memory(GiB)": 369.42, "step": 43575, "train_speed(iter/s)": 0.200797 }, { "acc": 0.75082855, "epoch": 1.1055301877219685, "grad_norm": 2.4375, "learning_rate": 4.5415573337068185e-06, "loss": 1.01252937, "memory(GiB)": 369.42, "step": 43580, "train_speed(iter/s)": 0.2008 }, { "acc": 0.74979315, "epoch": 1.1056570268899035, "grad_norm": 2.40625, "learning_rate": 4.540513137198233e-06, "loss": 1.02284603, "memory(GiB)": 369.42, "step": 43585, "train_speed(iter/s)": 0.200803 }, { "acc": 0.75798035, "epoch": 1.1057838660578387, "grad_norm": 2.109375, "learning_rate": 4.539468960899936e-06, "loss": 0.93392239, "memory(GiB)": 369.42, "step": 43590, "train_speed(iter/s)": 0.200807 }, { "acc": 0.73900766, "epoch": 1.1059107052257737, "grad_norm": 2.59375, "learning_rate": 4.538424804857847e-06, "loss": 1.03861389, "memory(GiB)": 369.42, "step": 43595, "train_speed(iter/s)": 0.200807 }, { "acc": 0.74877634, "epoch": 1.1060375443937087, "grad_norm": 1.8984375, "learning_rate": 4.537380669117896e-06, "loss": 1.00357962, "memory(GiB)": 369.42, "step": 43600, "train_speed(iter/s)": 0.200812 }, { "acc": 0.74760141, "epoch": 1.106164383561644, "grad_norm": 2.625, "learning_rate": 4.536336553726008e-06, "loss": 0.984268, "memory(GiB)": 369.42, "step": 43605, "train_speed(iter/s)": 0.200814 }, { "acc": 0.76056652, "epoch": 1.106291222729579, "grad_norm": 1.921875, "learning_rate": 4.535292458728112e-06, "loss": 0.96356783, "memory(GiB)": 369.42, "step": 43610, "train_speed(iter/s)": 0.200818 }, { "acc": 0.74137268, "epoch": 1.106418061897514, "grad_norm": 2.4375, "learning_rate": 4.534248384170126e-06, "loss": 1.01896534, "memory(GiB)": 369.42, "step": 43615, "train_speed(iter/s)": 0.200822 }, { "acc": 0.75209203, "epoch": 1.106544901065449, "grad_norm": 2.25, "learning_rate": 4.533204330097974e-06, "loss": 0.98457165, "memory(GiB)": 369.42, "step": 43620, "train_speed(iter/s)": 0.200826 }, { "acc": 0.75623407, "epoch": 1.106671740233384, "grad_norm": 1.8359375, "learning_rate": 4.532160296557581e-06, "loss": 0.98349085, "memory(GiB)": 369.42, "step": 43625, "train_speed(iter/s)": 0.20083 }, { "acc": 0.76404147, "epoch": 1.106798579401319, "grad_norm": 3.109375, "learning_rate": 4.531116283594868e-06, "loss": 0.95891075, "memory(GiB)": 369.42, "step": 43630, "train_speed(iter/s)": 0.200833 }, { "acc": 0.76618481, "epoch": 1.106925418569254, "grad_norm": 2.453125, "learning_rate": 4.530072291255753e-06, "loss": 0.95891972, "memory(GiB)": 369.42, "step": 43635, "train_speed(iter/s)": 0.200836 }, { "acc": 0.75764055, "epoch": 1.1070522577371893, "grad_norm": 2.171875, "learning_rate": 4.529028319586157e-06, "loss": 0.99128456, "memory(GiB)": 369.42, "step": 43640, "train_speed(iter/s)": 0.200841 }, { "acc": 0.75330992, "epoch": 1.1071790969051243, "grad_norm": 2.4375, "learning_rate": 4.527984368631997e-06, "loss": 1.05015469, "memory(GiB)": 369.42, "step": 43645, "train_speed(iter/s)": 0.200842 }, { "acc": 0.74228249, "epoch": 1.1073059360730593, "grad_norm": 2.21875, "learning_rate": 4.526940438439196e-06, "loss": 0.91843872, "memory(GiB)": 369.42, "step": 43650, "train_speed(iter/s)": 0.200844 }, { "acc": 0.7560009, "epoch": 1.1074327752409945, "grad_norm": 2.6875, "learning_rate": 4.525896529053662e-06, "loss": 1.0099247, "memory(GiB)": 369.42, "step": 43655, "train_speed(iter/s)": 0.200847 }, { "acc": 0.75241642, "epoch": 1.1075596144089295, "grad_norm": 2.046875, "learning_rate": 4.524852640521318e-06, "loss": 0.98123245, "memory(GiB)": 369.42, "step": 43660, "train_speed(iter/s)": 0.200851 }, { "acc": 0.75245609, "epoch": 1.1076864535768645, "grad_norm": 2.15625, "learning_rate": 4.523808772888073e-06, "loss": 0.94775534, "memory(GiB)": 369.42, "step": 43665, "train_speed(iter/s)": 0.200854 }, { "acc": 0.75514841, "epoch": 1.1078132927447997, "grad_norm": 1.9375, "learning_rate": 4.522764926199848e-06, "loss": 0.97698059, "memory(GiB)": 369.42, "step": 43670, "train_speed(iter/s)": 0.200857 }, { "acc": 0.750387, "epoch": 1.1079401319127347, "grad_norm": 2.125, "learning_rate": 4.5217211005025516e-06, "loss": 0.94710093, "memory(GiB)": 369.42, "step": 43675, "train_speed(iter/s)": 0.20086 }, { "acc": 0.74467068, "epoch": 1.1080669710806696, "grad_norm": 2.375, "learning_rate": 4.520677295842095e-06, "loss": 1.02368641, "memory(GiB)": 369.42, "step": 43680, "train_speed(iter/s)": 0.200864 }, { "acc": 0.77085009, "epoch": 1.1081938102486049, "grad_norm": 2.59375, "learning_rate": 4.5196335122643915e-06, "loss": 0.89935246, "memory(GiB)": 369.42, "step": 43685, "train_speed(iter/s)": 0.200866 }, { "acc": 0.76194258, "epoch": 1.1083206494165398, "grad_norm": 2.203125, "learning_rate": 4.518589749815352e-06, "loss": 0.93973703, "memory(GiB)": 369.42, "step": 43690, "train_speed(iter/s)": 0.20087 }, { "acc": 0.75639434, "epoch": 1.1084474885844748, "grad_norm": 2.3125, "learning_rate": 4.517546008540884e-06, "loss": 0.97046757, "memory(GiB)": 369.42, "step": 43695, "train_speed(iter/s)": 0.200873 }, { "acc": 0.75218115, "epoch": 1.10857432775241, "grad_norm": 2.0, "learning_rate": 4.5165022884868946e-06, "loss": 0.98279934, "memory(GiB)": 369.42, "step": 43700, "train_speed(iter/s)": 0.200877 }, { "acc": 0.73797431, "epoch": 1.108701166920345, "grad_norm": 2.34375, "learning_rate": 4.515458589699295e-06, "loss": 1.06049185, "memory(GiB)": 369.42, "step": 43705, "train_speed(iter/s)": 0.200881 }, { "acc": 0.75110917, "epoch": 1.10882800608828, "grad_norm": 1.921875, "learning_rate": 4.514414912223991e-06, "loss": 0.97673836, "memory(GiB)": 369.42, "step": 43710, "train_speed(iter/s)": 0.200884 }, { "acc": 0.75598783, "epoch": 1.108954845256215, "grad_norm": 2.21875, "learning_rate": 4.513371256106885e-06, "loss": 0.95491619, "memory(GiB)": 369.42, "step": 43715, "train_speed(iter/s)": 0.20089 }, { "acc": 0.74543052, "epoch": 1.1090816844241502, "grad_norm": 2.46875, "learning_rate": 4.512327621393885e-06, "loss": 0.97859383, "memory(GiB)": 369.42, "step": 43720, "train_speed(iter/s)": 0.200893 }, { "acc": 0.7661624, "epoch": 1.1092085235920852, "grad_norm": 2.078125, "learning_rate": 4.511284008130892e-06, "loss": 0.94124928, "memory(GiB)": 369.42, "step": 43725, "train_speed(iter/s)": 0.200897 }, { "acc": 0.74259887, "epoch": 1.1093353627600202, "grad_norm": 2.0625, "learning_rate": 4.510240416363813e-06, "loss": 0.97099762, "memory(GiB)": 369.42, "step": 43730, "train_speed(iter/s)": 0.2009 }, { "acc": 0.73834019, "epoch": 1.1094622019279554, "grad_norm": 2.03125, "learning_rate": 4.5091968461385455e-06, "loss": 1.02487316, "memory(GiB)": 369.42, "step": 43735, "train_speed(iter/s)": 0.200903 }, { "acc": 0.75211849, "epoch": 1.1095890410958904, "grad_norm": 2.046875, "learning_rate": 4.508153297500993e-06, "loss": 0.99596195, "memory(GiB)": 369.42, "step": 43740, "train_speed(iter/s)": 0.200907 }, { "acc": 0.75700345, "epoch": 1.1097158802638254, "grad_norm": 2.171875, "learning_rate": 4.507109770497052e-06, "loss": 0.95996399, "memory(GiB)": 369.42, "step": 43745, "train_speed(iter/s)": 0.200909 }, { "acc": 0.76785369, "epoch": 1.1098427194317606, "grad_norm": 2.15625, "learning_rate": 4.506066265172626e-06, "loss": 0.93932629, "memory(GiB)": 369.42, "step": 43750, "train_speed(iter/s)": 0.200915 }, { "acc": 0.7261591, "epoch": 1.1099695585996956, "grad_norm": 2.15625, "learning_rate": 4.505022781573611e-06, "loss": 1.03543234, "memory(GiB)": 369.42, "step": 43755, "train_speed(iter/s)": 0.200918 }, { "acc": 0.75508566, "epoch": 1.1100963977676306, "grad_norm": 2.125, "learning_rate": 4.503979319745902e-06, "loss": 0.99075718, "memory(GiB)": 369.42, "step": 43760, "train_speed(iter/s)": 0.200921 }, { "acc": 0.75873494, "epoch": 1.1102232369355658, "grad_norm": 2.0625, "learning_rate": 4.502935879735398e-06, "loss": 0.97224808, "memory(GiB)": 369.42, "step": 43765, "train_speed(iter/s)": 0.200924 }, { "acc": 0.74759007, "epoch": 1.1103500761035008, "grad_norm": 1.96875, "learning_rate": 4.5018924615879956e-06, "loss": 0.99239044, "memory(GiB)": 369.42, "step": 43770, "train_speed(iter/s)": 0.200927 }, { "acc": 0.75271854, "epoch": 1.1104769152714358, "grad_norm": 2.0, "learning_rate": 4.500849065349584e-06, "loss": 0.99500179, "memory(GiB)": 369.42, "step": 43775, "train_speed(iter/s)": 0.20093 }, { "acc": 0.74548464, "epoch": 1.1106037544393708, "grad_norm": 2.5, "learning_rate": 4.499805691066059e-06, "loss": 0.96761894, "memory(GiB)": 369.42, "step": 43780, "train_speed(iter/s)": 0.200932 }, { "acc": 0.75445428, "epoch": 1.110730593607306, "grad_norm": 2.390625, "learning_rate": 4.498762338783314e-06, "loss": 0.97540016, "memory(GiB)": 369.42, "step": 43785, "train_speed(iter/s)": 0.200935 }, { "acc": 0.7558177, "epoch": 1.110857432775241, "grad_norm": 2.25, "learning_rate": 4.49771900854724e-06, "loss": 0.95997381, "memory(GiB)": 369.42, "step": 43790, "train_speed(iter/s)": 0.200938 }, { "acc": 0.73409901, "epoch": 1.110984271943176, "grad_norm": 2.265625, "learning_rate": 4.496675700403724e-06, "loss": 1.05486355, "memory(GiB)": 369.42, "step": 43795, "train_speed(iter/s)": 0.20094 }, { "acc": 0.75205731, "epoch": 1.1111111111111112, "grad_norm": 2.15625, "learning_rate": 4.495632414398659e-06, "loss": 1.01879311, "memory(GiB)": 369.42, "step": 43800, "train_speed(iter/s)": 0.200944 }, { "acc": 0.7431632, "epoch": 1.1112379502790461, "grad_norm": 2.1875, "learning_rate": 4.494589150577932e-06, "loss": 1.04146442, "memory(GiB)": 369.42, "step": 43805, "train_speed(iter/s)": 0.200947 }, { "acc": 0.75829477, "epoch": 1.1113647894469811, "grad_norm": 2.484375, "learning_rate": 4.493545908987432e-06, "loss": 1.00954561, "memory(GiB)": 369.42, "step": 43810, "train_speed(iter/s)": 0.200951 }, { "acc": 0.73649082, "epoch": 1.1114916286149163, "grad_norm": 2.015625, "learning_rate": 4.492502689673044e-06, "loss": 1.06987953, "memory(GiB)": 369.42, "step": 43815, "train_speed(iter/s)": 0.200954 }, { "acc": 0.76740813, "epoch": 1.1116184677828513, "grad_norm": 2.203125, "learning_rate": 4.491459492680651e-06, "loss": 0.97434139, "memory(GiB)": 369.42, "step": 43820, "train_speed(iter/s)": 0.200957 }, { "acc": 0.75695248, "epoch": 1.1117453069507863, "grad_norm": 2.28125, "learning_rate": 4.4904163180561425e-06, "loss": 0.93239441, "memory(GiB)": 369.42, "step": 43825, "train_speed(iter/s)": 0.200961 }, { "acc": 0.73919897, "epoch": 1.1118721461187215, "grad_norm": 2.09375, "learning_rate": 4.4893731658453996e-06, "loss": 0.98900194, "memory(GiB)": 369.42, "step": 43830, "train_speed(iter/s)": 0.200966 }, { "acc": 0.75629644, "epoch": 1.1119989852866565, "grad_norm": 2.046875, "learning_rate": 4.4883300360943035e-06, "loss": 0.98589582, "memory(GiB)": 369.42, "step": 43835, "train_speed(iter/s)": 0.20097 }, { "acc": 0.74353313, "epoch": 1.1121258244545915, "grad_norm": 2.296875, "learning_rate": 4.4872869288487366e-06, "loss": 1.01000538, "memory(GiB)": 369.42, "step": 43840, "train_speed(iter/s)": 0.200972 }, { "acc": 0.7443716, "epoch": 1.1122526636225267, "grad_norm": 1.8984375, "learning_rate": 4.48624384415458e-06, "loss": 0.94858789, "memory(GiB)": 369.42, "step": 43845, "train_speed(iter/s)": 0.200975 }, { "acc": 0.75211711, "epoch": 1.1123795027904617, "grad_norm": 2.1875, "learning_rate": 4.485200782057715e-06, "loss": 1.03686905, "memory(GiB)": 369.42, "step": 43850, "train_speed(iter/s)": 0.200979 }, { "acc": 0.73991776, "epoch": 1.1125063419583967, "grad_norm": 2.5, "learning_rate": 4.4841577426040145e-06, "loss": 1.02152472, "memory(GiB)": 369.42, "step": 43855, "train_speed(iter/s)": 0.200981 }, { "acc": 0.75840111, "epoch": 1.112633181126332, "grad_norm": 2.203125, "learning_rate": 4.483114725839361e-06, "loss": 0.97721596, "memory(GiB)": 369.42, "step": 43860, "train_speed(iter/s)": 0.20098 }, { "acc": 0.74800401, "epoch": 1.112760020294267, "grad_norm": 2.390625, "learning_rate": 4.482071731809629e-06, "loss": 1.01643095, "memory(GiB)": 369.42, "step": 43865, "train_speed(iter/s)": 0.200984 }, { "acc": 0.74023399, "epoch": 1.112886859462202, "grad_norm": 2.296875, "learning_rate": 4.481028760560697e-06, "loss": 1.00117474, "memory(GiB)": 369.42, "step": 43870, "train_speed(iter/s)": 0.200985 }, { "acc": 0.75472975, "epoch": 1.1130136986301369, "grad_norm": 2.09375, "learning_rate": 4.479985812138435e-06, "loss": 0.98166838, "memory(GiB)": 369.42, "step": 43875, "train_speed(iter/s)": 0.200988 }, { "acc": 0.75036306, "epoch": 1.113140537798072, "grad_norm": 2.109375, "learning_rate": 4.478942886588719e-06, "loss": 0.96106987, "memory(GiB)": 369.42, "step": 43880, "train_speed(iter/s)": 0.200993 }, { "acc": 0.74824409, "epoch": 1.113267376966007, "grad_norm": 1.9375, "learning_rate": 4.47789998395742e-06, "loss": 1.00084267, "memory(GiB)": 369.42, "step": 43885, "train_speed(iter/s)": 0.200992 }, { "acc": 0.76879292, "epoch": 1.113394216133942, "grad_norm": 2.4375, "learning_rate": 4.476857104290413e-06, "loss": 0.97088051, "memory(GiB)": 369.42, "step": 43890, "train_speed(iter/s)": 0.200997 }, { "acc": 0.75847964, "epoch": 1.1135210553018773, "grad_norm": 2.390625, "learning_rate": 4.4758142476335655e-06, "loss": 0.9523077, "memory(GiB)": 369.42, "step": 43895, "train_speed(iter/s)": 0.200999 }, { "acc": 0.75892439, "epoch": 1.1136478944698123, "grad_norm": 1.8046875, "learning_rate": 4.474771414032747e-06, "loss": 1.01718111, "memory(GiB)": 369.42, "step": 43900, "train_speed(iter/s)": 0.201002 }, { "acc": 0.75850115, "epoch": 1.1137747336377473, "grad_norm": 2.40625, "learning_rate": 4.473728603533827e-06, "loss": 0.97977543, "memory(GiB)": 369.42, "step": 43905, "train_speed(iter/s)": 0.201006 }, { "acc": 0.74827533, "epoch": 1.1139015728056825, "grad_norm": 2.328125, "learning_rate": 4.472685816182674e-06, "loss": 0.9661459, "memory(GiB)": 369.42, "step": 43910, "train_speed(iter/s)": 0.201009 }, { "acc": 0.74975061, "epoch": 1.1140284119736175, "grad_norm": 2.546875, "learning_rate": 4.471643052025152e-06, "loss": 1.00537004, "memory(GiB)": 369.42, "step": 43915, "train_speed(iter/s)": 0.201011 }, { "acc": 0.74337654, "epoch": 1.1141552511415524, "grad_norm": 2.21875, "learning_rate": 4.470600311107127e-06, "loss": 1.01064329, "memory(GiB)": 369.42, "step": 43920, "train_speed(iter/s)": 0.201013 }, { "acc": 0.74711332, "epoch": 1.1142820903094877, "grad_norm": 2.234375, "learning_rate": 4.469557593474464e-06, "loss": 0.9590085, "memory(GiB)": 369.42, "step": 43925, "train_speed(iter/s)": 0.201017 }, { "acc": 0.74874787, "epoch": 1.1144089294774226, "grad_norm": 2.1875, "learning_rate": 4.468514899173027e-06, "loss": 1.02207565, "memory(GiB)": 369.42, "step": 43930, "train_speed(iter/s)": 0.201021 }, { "acc": 0.76282492, "epoch": 1.1145357686453576, "grad_norm": 2.171875, "learning_rate": 4.4674722282486775e-06, "loss": 0.92394238, "memory(GiB)": 369.42, "step": 43935, "train_speed(iter/s)": 0.201023 }, { "acc": 0.75949335, "epoch": 1.1146626078132926, "grad_norm": 2.15625, "learning_rate": 4.4664295807472765e-06, "loss": 0.97119122, "memory(GiB)": 369.42, "step": 43940, "train_speed(iter/s)": 0.201024 }, { "acc": 0.74895263, "epoch": 1.1147894469812278, "grad_norm": 2.53125, "learning_rate": 4.465386956714684e-06, "loss": 0.97797709, "memory(GiB)": 369.42, "step": 43945, "train_speed(iter/s)": 0.201026 }, { "acc": 0.74944, "epoch": 1.1149162861491628, "grad_norm": 2.234375, "learning_rate": 4.4643443561967625e-06, "loss": 0.97686405, "memory(GiB)": 369.42, "step": 43950, "train_speed(iter/s)": 0.201029 }, { "acc": 0.75854058, "epoch": 1.1150431253170978, "grad_norm": 2.09375, "learning_rate": 4.463301779239366e-06, "loss": 0.89628849, "memory(GiB)": 369.42, "step": 43955, "train_speed(iter/s)": 0.201031 }, { "acc": 0.76593275, "epoch": 1.115169964485033, "grad_norm": 2.125, "learning_rate": 4.462259225888354e-06, "loss": 0.94509726, "memory(GiB)": 369.42, "step": 43960, "train_speed(iter/s)": 0.201034 }, { "acc": 0.73891273, "epoch": 1.115296803652968, "grad_norm": 2.046875, "learning_rate": 4.4612166961895805e-06, "loss": 1.04487219, "memory(GiB)": 369.42, "step": 43965, "train_speed(iter/s)": 0.201037 }, { "acc": 0.75264072, "epoch": 1.115423642820903, "grad_norm": 1.953125, "learning_rate": 4.460174190188905e-06, "loss": 0.96768074, "memory(GiB)": 369.42, "step": 43970, "train_speed(iter/s)": 0.201041 }, { "acc": 0.74745731, "epoch": 1.1155504819888382, "grad_norm": 2.328125, "learning_rate": 4.459131707932177e-06, "loss": 1.04859877, "memory(GiB)": 369.42, "step": 43975, "train_speed(iter/s)": 0.201045 }, { "acc": 0.74784307, "epoch": 1.1156773211567732, "grad_norm": 2.171875, "learning_rate": 4.458089249465251e-06, "loss": 1.01141186, "memory(GiB)": 369.42, "step": 43980, "train_speed(iter/s)": 0.201048 }, { "acc": 0.73563395, "epoch": 1.1158041603247082, "grad_norm": 1.9453125, "learning_rate": 4.45704681483398e-06, "loss": 1.06262093, "memory(GiB)": 369.42, "step": 43985, "train_speed(iter/s)": 0.201051 }, { "acc": 0.74538431, "epoch": 1.1159309994926434, "grad_norm": 2.046875, "learning_rate": 4.456004404084215e-06, "loss": 0.99076347, "memory(GiB)": 369.42, "step": 43990, "train_speed(iter/s)": 0.201056 }, { "acc": 0.74009905, "epoch": 1.1160578386605784, "grad_norm": 2.609375, "learning_rate": 4.454962017261803e-06, "loss": 1.06182499, "memory(GiB)": 369.42, "step": 43995, "train_speed(iter/s)": 0.201059 }, { "acc": 0.74614458, "epoch": 1.1161846778285134, "grad_norm": 2.125, "learning_rate": 4.453919654412596e-06, "loss": 0.99366837, "memory(GiB)": 369.42, "step": 44000, "train_speed(iter/s)": 0.201064 }, { "epoch": 1.1161846778285134, "eval_acc": 0.7378305427173341, "eval_loss": 0.9702962636947632, "eval_runtime": 385.3708, "eval_samples_per_second": 16.53, "eval_steps_per_second": 8.265, "step": 44000 }, { "acc": 0.75208349, "epoch": 1.1163115169964486, "grad_norm": 2.140625, "learning_rate": 4.45287731558244e-06, "loss": 0.95327759, "memory(GiB)": 369.42, "step": 44005, "train_speed(iter/s)": 0.200414 }, { "acc": 0.75308414, "epoch": 1.1164383561643836, "grad_norm": 1.8046875, "learning_rate": 4.451835000817185e-06, "loss": 0.98241978, "memory(GiB)": 369.42, "step": 44010, "train_speed(iter/s)": 0.200418 }, { "acc": 0.76296291, "epoch": 1.1165651953323186, "grad_norm": 2.34375, "learning_rate": 4.450792710162672e-06, "loss": 0.99365873, "memory(GiB)": 369.42, "step": 44015, "train_speed(iter/s)": 0.200422 }, { "acc": 0.74589901, "epoch": 1.1166920345002538, "grad_norm": 2.296875, "learning_rate": 4.449750443664747e-06, "loss": 0.97223682, "memory(GiB)": 369.42, "step": 44020, "train_speed(iter/s)": 0.200425 }, { "acc": 0.75669012, "epoch": 1.1168188736681888, "grad_norm": 1.8125, "learning_rate": 4.448708201369254e-06, "loss": 0.97862463, "memory(GiB)": 369.42, "step": 44025, "train_speed(iter/s)": 0.200429 }, { "acc": 0.74513836, "epoch": 1.1169457128361238, "grad_norm": 2.046875, "learning_rate": 4.4476659833220374e-06, "loss": 1.01475515, "memory(GiB)": 369.42, "step": 44030, "train_speed(iter/s)": 0.200432 }, { "acc": 0.75582581, "epoch": 1.1170725520040587, "grad_norm": 2.34375, "learning_rate": 4.4466237895689365e-06, "loss": 0.9670702, "memory(GiB)": 369.42, "step": 44035, "train_speed(iter/s)": 0.200434 }, { "acc": 0.7474514, "epoch": 1.117199391171994, "grad_norm": 2.625, "learning_rate": 4.44558162015579e-06, "loss": 0.97993317, "memory(GiB)": 369.42, "step": 44040, "train_speed(iter/s)": 0.200438 }, { "acc": 0.74255605, "epoch": 1.117326230339929, "grad_norm": 2.390625, "learning_rate": 4.444539475128441e-06, "loss": 1.01772585, "memory(GiB)": 369.42, "step": 44045, "train_speed(iter/s)": 0.20044 }, { "acc": 0.75763121, "epoch": 1.117453069507864, "grad_norm": 2.0625, "learning_rate": 4.443497354532726e-06, "loss": 1.00126629, "memory(GiB)": 369.42, "step": 44050, "train_speed(iter/s)": 0.200444 }, { "acc": 0.75586853, "epoch": 1.1175799086757991, "grad_norm": 2.234375, "learning_rate": 4.442455258414482e-06, "loss": 0.94273796, "memory(GiB)": 369.42, "step": 44055, "train_speed(iter/s)": 0.200447 }, { "acc": 0.74721751, "epoch": 1.1177067478437341, "grad_norm": 2.234375, "learning_rate": 4.441413186819543e-06, "loss": 0.95807037, "memory(GiB)": 369.42, "step": 44060, "train_speed(iter/s)": 0.200449 }, { "acc": 0.75708895, "epoch": 1.1178335870116691, "grad_norm": 2.53125, "learning_rate": 4.440371139793747e-06, "loss": 0.94986658, "memory(GiB)": 369.42, "step": 44065, "train_speed(iter/s)": 0.200453 }, { "acc": 0.75045443, "epoch": 1.1179604261796043, "grad_norm": 2.0625, "learning_rate": 4.43932911738293e-06, "loss": 0.99405422, "memory(GiB)": 369.42, "step": 44070, "train_speed(iter/s)": 0.200456 }, { "acc": 0.7530035, "epoch": 1.1180872653475393, "grad_norm": 2.390625, "learning_rate": 4.438287119632917e-06, "loss": 0.96619301, "memory(GiB)": 369.42, "step": 44075, "train_speed(iter/s)": 0.20046 }, { "acc": 0.7524066, "epoch": 1.1182141045154743, "grad_norm": 3.625, "learning_rate": 4.4372451465895465e-06, "loss": 0.98936615, "memory(GiB)": 369.42, "step": 44080, "train_speed(iter/s)": 0.200464 }, { "acc": 0.74889936, "epoch": 1.1183409436834095, "grad_norm": 2.328125, "learning_rate": 4.436203198298645e-06, "loss": 1.01692543, "memory(GiB)": 369.42, "step": 44085, "train_speed(iter/s)": 0.200468 }, { "acc": 0.74782019, "epoch": 1.1184677828513445, "grad_norm": 1.9375, "learning_rate": 4.435161274806049e-06, "loss": 0.96156406, "memory(GiB)": 369.42, "step": 44090, "train_speed(iter/s)": 0.200472 }, { "acc": 0.74034033, "epoch": 1.1185946220192795, "grad_norm": 1.984375, "learning_rate": 4.4341193761575765e-06, "loss": 1.03589668, "memory(GiB)": 369.42, "step": 44095, "train_speed(iter/s)": 0.200475 }, { "acc": 0.76014004, "epoch": 1.1187214611872145, "grad_norm": 2.34375, "learning_rate": 4.433077502399063e-06, "loss": 0.98295031, "memory(GiB)": 369.42, "step": 44100, "train_speed(iter/s)": 0.200477 }, { "acc": 0.75720482, "epoch": 1.1188483003551497, "grad_norm": 1.96875, "learning_rate": 4.43203565357633e-06, "loss": 0.92486706, "memory(GiB)": 369.42, "step": 44105, "train_speed(iter/s)": 0.20048 }, { "acc": 0.75563183, "epoch": 1.1189751395230847, "grad_norm": 2.21875, "learning_rate": 4.430993829735208e-06, "loss": 0.98075047, "memory(GiB)": 369.42, "step": 44110, "train_speed(iter/s)": 0.200483 }, { "acc": 0.75161228, "epoch": 1.1191019786910197, "grad_norm": 2.015625, "learning_rate": 4.429952030921516e-06, "loss": 0.9549778, "memory(GiB)": 369.42, "step": 44115, "train_speed(iter/s)": 0.200486 }, { "acc": 0.75968409, "epoch": 1.119228817858955, "grad_norm": 2.671875, "learning_rate": 4.428910257181077e-06, "loss": 0.98778524, "memory(GiB)": 369.42, "step": 44120, "train_speed(iter/s)": 0.20049 }, { "acc": 0.76553602, "epoch": 1.1193556570268899, "grad_norm": 2.734375, "learning_rate": 4.427868508559717e-06, "loss": 0.96329441, "memory(GiB)": 369.42, "step": 44125, "train_speed(iter/s)": 0.200493 }, { "acc": 0.75797853, "epoch": 1.1194824961948249, "grad_norm": 2.4375, "learning_rate": 4.426826785103256e-06, "loss": 0.92325039, "memory(GiB)": 369.42, "step": 44130, "train_speed(iter/s)": 0.200492 }, { "acc": 0.74792266, "epoch": 1.11960933536276, "grad_norm": 2.515625, "learning_rate": 4.425785086857509e-06, "loss": 1.00060158, "memory(GiB)": 369.42, "step": 44135, "train_speed(iter/s)": 0.200496 }, { "acc": 0.74564066, "epoch": 1.119736174530695, "grad_norm": 2.171875, "learning_rate": 4.424743413868298e-06, "loss": 1.00061722, "memory(GiB)": 369.42, "step": 44140, "train_speed(iter/s)": 0.200499 }, { "acc": 0.75935826, "epoch": 1.11986301369863, "grad_norm": 2.359375, "learning_rate": 4.42370176618144e-06, "loss": 0.97377491, "memory(GiB)": 369.42, "step": 44145, "train_speed(iter/s)": 0.200502 }, { "acc": 0.74439044, "epoch": 1.1199898528665653, "grad_norm": 2.09375, "learning_rate": 4.422660143842753e-06, "loss": 0.96870232, "memory(GiB)": 369.42, "step": 44150, "train_speed(iter/s)": 0.200507 }, { "acc": 0.75262527, "epoch": 1.1201166920345003, "grad_norm": 2.515625, "learning_rate": 4.421618546898048e-06, "loss": 1.0109993, "memory(GiB)": 369.42, "step": 44155, "train_speed(iter/s)": 0.200508 }, { "acc": 0.75961409, "epoch": 1.1202435312024352, "grad_norm": 2.03125, "learning_rate": 4.420576975393143e-06, "loss": 0.94309931, "memory(GiB)": 369.42, "step": 44160, "train_speed(iter/s)": 0.200511 }, { "acc": 0.76150312, "epoch": 1.1203703703703705, "grad_norm": 2.125, "learning_rate": 4.4195354293738484e-06, "loss": 0.94659691, "memory(GiB)": 369.42, "step": 44165, "train_speed(iter/s)": 0.200514 }, { "acc": 0.74049778, "epoch": 1.1204972095383054, "grad_norm": 2.234375, "learning_rate": 4.418493908885979e-06, "loss": 0.97833977, "memory(GiB)": 369.42, "step": 44170, "train_speed(iter/s)": 0.200518 }, { "acc": 0.75171618, "epoch": 1.1206240487062404, "grad_norm": 1.953125, "learning_rate": 4.417452413975343e-06, "loss": 1.00610409, "memory(GiB)": 369.42, "step": 44175, "train_speed(iter/s)": 0.200521 }, { "acc": 0.74953318, "epoch": 1.1207508878741756, "grad_norm": 1.984375, "learning_rate": 4.4164109446877514e-06, "loss": 0.97720108, "memory(GiB)": 369.42, "step": 44180, "train_speed(iter/s)": 0.200525 }, { "acc": 0.76191497, "epoch": 1.1208777270421106, "grad_norm": 1.8515625, "learning_rate": 4.41536950106901e-06, "loss": 0.96842232, "memory(GiB)": 369.42, "step": 44185, "train_speed(iter/s)": 0.200529 }, { "acc": 0.76754141, "epoch": 1.1210045662100456, "grad_norm": 2.5, "learning_rate": 4.414328083164931e-06, "loss": 0.93490191, "memory(GiB)": 369.42, "step": 44190, "train_speed(iter/s)": 0.200529 }, { "acc": 0.74406772, "epoch": 1.1211314053779806, "grad_norm": 2.109375, "learning_rate": 4.4132866910213154e-06, "loss": 1.03016205, "memory(GiB)": 369.42, "step": 44195, "train_speed(iter/s)": 0.200533 }, { "acc": 0.75283298, "epoch": 1.1212582445459158, "grad_norm": 2.171875, "learning_rate": 4.41224532468397e-06, "loss": 0.9017827, "memory(GiB)": 369.42, "step": 44200, "train_speed(iter/s)": 0.200537 }, { "acc": 0.75509815, "epoch": 1.1213850837138508, "grad_norm": 2.125, "learning_rate": 4.411203984198701e-06, "loss": 0.99280796, "memory(GiB)": 369.42, "step": 44205, "train_speed(iter/s)": 0.200539 }, { "acc": 0.74130473, "epoch": 1.1215119228817858, "grad_norm": 1.75, "learning_rate": 4.41016266961131e-06, "loss": 1.00546799, "memory(GiB)": 369.42, "step": 44210, "train_speed(iter/s)": 0.200543 }, { "acc": 0.75547333, "epoch": 1.121638762049721, "grad_norm": 2.9375, "learning_rate": 4.409121380967597e-06, "loss": 1.01514587, "memory(GiB)": 369.42, "step": 44215, "train_speed(iter/s)": 0.200545 }, { "acc": 0.75733986, "epoch": 1.121765601217656, "grad_norm": 3.171875, "learning_rate": 4.408080118313364e-06, "loss": 1.00954962, "memory(GiB)": 369.42, "step": 44220, "train_speed(iter/s)": 0.200549 }, { "acc": 0.7532311, "epoch": 1.121892440385591, "grad_norm": 2.5625, "learning_rate": 4.40703888169441e-06, "loss": 1.00583591, "memory(GiB)": 369.42, "step": 44225, "train_speed(iter/s)": 0.200554 }, { "acc": 0.75663309, "epoch": 1.1220192795535262, "grad_norm": 2.140625, "learning_rate": 4.4059976711565355e-06, "loss": 0.92631865, "memory(GiB)": 369.42, "step": 44230, "train_speed(iter/s)": 0.200557 }, { "acc": 0.75569897, "epoch": 1.1221461187214612, "grad_norm": 2.28125, "learning_rate": 4.404956486745532e-06, "loss": 0.93993998, "memory(GiB)": 369.42, "step": 44235, "train_speed(iter/s)": 0.200561 }, { "acc": 0.75213451, "epoch": 1.1222729578893962, "grad_norm": 2.65625, "learning_rate": 4.403915328507201e-06, "loss": 1.01550474, "memory(GiB)": 369.42, "step": 44240, "train_speed(iter/s)": 0.200565 }, { "acc": 0.73818541, "epoch": 1.1223997970573314, "grad_norm": 2.203125, "learning_rate": 4.4028741964873334e-06, "loss": 0.96608295, "memory(GiB)": 369.42, "step": 44245, "train_speed(iter/s)": 0.200569 }, { "acc": 0.74467201, "epoch": 1.1225266362252664, "grad_norm": 2.546875, "learning_rate": 4.4018330907317275e-06, "loss": 0.98436928, "memory(GiB)": 369.42, "step": 44250, "train_speed(iter/s)": 0.200572 }, { "acc": 0.73391504, "epoch": 1.1226534753932014, "grad_norm": 2.25, "learning_rate": 4.400792011286171e-06, "loss": 1.07899799, "memory(GiB)": 369.42, "step": 44255, "train_speed(iter/s)": 0.200575 }, { "acc": 0.76320896, "epoch": 1.1227803145611364, "grad_norm": 2.125, "learning_rate": 4.3997509581964566e-06, "loss": 0.96752949, "memory(GiB)": 369.42, "step": 44260, "train_speed(iter/s)": 0.200576 }, { "acc": 0.74954686, "epoch": 1.1229071537290716, "grad_norm": 2.265625, "learning_rate": 4.398709931508376e-06, "loss": 0.97195187, "memory(GiB)": 369.42, "step": 44265, "train_speed(iter/s)": 0.200581 }, { "acc": 0.75239568, "epoch": 1.1230339928970066, "grad_norm": 2.796875, "learning_rate": 4.397668931267718e-06, "loss": 0.94030724, "memory(GiB)": 369.42, "step": 44270, "train_speed(iter/s)": 0.200582 }, { "acc": 0.74911017, "epoch": 1.1231608320649416, "grad_norm": 2.03125, "learning_rate": 4.396627957520269e-06, "loss": 0.99608793, "memory(GiB)": 369.42, "step": 44275, "train_speed(iter/s)": 0.200586 }, { "acc": 0.74542899, "epoch": 1.1232876712328768, "grad_norm": 2.34375, "learning_rate": 4.395587010311815e-06, "loss": 0.98591576, "memory(GiB)": 369.42, "step": 44280, "train_speed(iter/s)": 0.200589 }, { "acc": 0.75485778, "epoch": 1.1234145104008117, "grad_norm": 1.9453125, "learning_rate": 4.394546089688143e-06, "loss": 0.97838163, "memory(GiB)": 369.42, "step": 44285, "train_speed(iter/s)": 0.200592 }, { "acc": 0.7492548, "epoch": 1.1235413495687467, "grad_norm": 2.421875, "learning_rate": 4.3935051956950395e-06, "loss": 0.99977913, "memory(GiB)": 369.42, "step": 44290, "train_speed(iter/s)": 0.200597 }, { "acc": 0.74686251, "epoch": 1.123668188736682, "grad_norm": 2.25, "learning_rate": 4.3924643283782824e-06, "loss": 0.97574081, "memory(GiB)": 369.42, "step": 44295, "train_speed(iter/s)": 0.200601 }, { "acc": 0.75673389, "epoch": 1.123795027904617, "grad_norm": 2.28125, "learning_rate": 4.391423487783657e-06, "loss": 0.97321415, "memory(GiB)": 369.42, "step": 44300, "train_speed(iter/s)": 0.200605 }, { "acc": 0.76085863, "epoch": 1.123921867072552, "grad_norm": 2.421875, "learning_rate": 4.3903826739569444e-06, "loss": 0.97394562, "memory(GiB)": 369.42, "step": 44305, "train_speed(iter/s)": 0.200609 }, { "acc": 0.75857048, "epoch": 1.1240487062404871, "grad_norm": 2.34375, "learning_rate": 4.389341886943926e-06, "loss": 0.97281847, "memory(GiB)": 369.42, "step": 44310, "train_speed(iter/s)": 0.200611 }, { "acc": 0.75269985, "epoch": 1.1241755454084221, "grad_norm": 2.296875, "learning_rate": 4.388301126790374e-06, "loss": 0.96524124, "memory(GiB)": 369.42, "step": 44315, "train_speed(iter/s)": 0.200615 }, { "acc": 0.75440035, "epoch": 1.1243023845763571, "grad_norm": 2.203125, "learning_rate": 4.387260393542071e-06, "loss": 0.99271421, "memory(GiB)": 369.42, "step": 44320, "train_speed(iter/s)": 0.200617 }, { "acc": 0.74393559, "epoch": 1.1244292237442923, "grad_norm": 2.0625, "learning_rate": 4.38621968724479e-06, "loss": 1.05002174, "memory(GiB)": 369.42, "step": 44325, "train_speed(iter/s)": 0.200622 }, { "acc": 0.75245495, "epoch": 1.1245560629122273, "grad_norm": 2.1875, "learning_rate": 4.385179007944311e-06, "loss": 0.98732777, "memory(GiB)": 369.42, "step": 44330, "train_speed(iter/s)": 0.200622 }, { "acc": 0.74862242, "epoch": 1.1246829020801623, "grad_norm": 3.015625, "learning_rate": 4.384138355686402e-06, "loss": 1.03374767, "memory(GiB)": 369.42, "step": 44335, "train_speed(iter/s)": 0.200625 }, { "acc": 0.75454745, "epoch": 1.1248097412480975, "grad_norm": 2.109375, "learning_rate": 4.383097730516837e-06, "loss": 0.99809132, "memory(GiB)": 369.42, "step": 44340, "train_speed(iter/s)": 0.200628 }, { "acc": 0.76112289, "epoch": 1.1249365804160325, "grad_norm": 2.28125, "learning_rate": 4.382057132481389e-06, "loss": 0.96162758, "memory(GiB)": 369.42, "step": 44345, "train_speed(iter/s)": 0.200632 }, { "acc": 0.76066689, "epoch": 1.1250634195839675, "grad_norm": 2.5, "learning_rate": 4.381016561625829e-06, "loss": 0.95566196, "memory(GiB)": 369.42, "step": 44350, "train_speed(iter/s)": 0.200635 }, { "acc": 0.74516301, "epoch": 1.1251902587519025, "grad_norm": 2.390625, "learning_rate": 4.379976017995922e-06, "loss": 1.02735481, "memory(GiB)": 369.42, "step": 44355, "train_speed(iter/s)": 0.200637 }, { "acc": 0.75063829, "epoch": 1.1253170979198377, "grad_norm": 2.46875, "learning_rate": 4.378935501637438e-06, "loss": 1.04231968, "memory(GiB)": 369.42, "step": 44360, "train_speed(iter/s)": 0.200641 }, { "acc": 0.74699292, "epoch": 1.1254439370877727, "grad_norm": 2.203125, "learning_rate": 4.377895012596144e-06, "loss": 0.97135983, "memory(GiB)": 369.42, "step": 44365, "train_speed(iter/s)": 0.200644 }, { "acc": 0.75390682, "epoch": 1.1255707762557077, "grad_norm": 2.140625, "learning_rate": 4.376854550917805e-06, "loss": 1.00159302, "memory(GiB)": 369.42, "step": 44370, "train_speed(iter/s)": 0.200648 }, { "acc": 0.75421534, "epoch": 1.1256976154236429, "grad_norm": 2.0625, "learning_rate": 4.375814116648184e-06, "loss": 1.01538944, "memory(GiB)": 369.42, "step": 44375, "train_speed(iter/s)": 0.200651 }, { "acc": 0.75099058, "epoch": 1.1258244545915779, "grad_norm": 1.890625, "learning_rate": 4.374773709833045e-06, "loss": 0.98937531, "memory(GiB)": 369.42, "step": 44380, "train_speed(iter/s)": 0.200653 }, { "acc": 0.75073953, "epoch": 1.1259512937595129, "grad_norm": 2.78125, "learning_rate": 4.37373333051815e-06, "loss": 0.98097372, "memory(GiB)": 369.42, "step": 44385, "train_speed(iter/s)": 0.200654 }, { "acc": 0.73374219, "epoch": 1.126078132927448, "grad_norm": 2.140625, "learning_rate": 4.37269297874926e-06, "loss": 1.04771442, "memory(GiB)": 369.42, "step": 44390, "train_speed(iter/s)": 0.200659 }, { "acc": 0.74007988, "epoch": 1.126204972095383, "grad_norm": 2.125, "learning_rate": 4.371652654572134e-06, "loss": 1.03140297, "memory(GiB)": 369.42, "step": 44395, "train_speed(iter/s)": 0.200662 }, { "acc": 0.76403661, "epoch": 1.126331811263318, "grad_norm": 2.1875, "learning_rate": 4.370612358032529e-06, "loss": 0.95697908, "memory(GiB)": 369.42, "step": 44400, "train_speed(iter/s)": 0.200665 }, { "acc": 0.74499917, "epoch": 1.1264586504312533, "grad_norm": 2.71875, "learning_rate": 4.369572089176201e-06, "loss": 0.96808367, "memory(GiB)": 369.42, "step": 44405, "train_speed(iter/s)": 0.200668 }, { "acc": 0.75000668, "epoch": 1.1265854895991883, "grad_norm": 2.46875, "learning_rate": 4.3685318480489095e-06, "loss": 0.97708359, "memory(GiB)": 369.42, "step": 44410, "train_speed(iter/s)": 0.200671 }, { "acc": 0.75993223, "epoch": 1.1267123287671232, "grad_norm": 2.328125, "learning_rate": 4.367491634696405e-06, "loss": 0.9494442, "memory(GiB)": 369.42, "step": 44415, "train_speed(iter/s)": 0.200674 }, { "acc": 0.75152779, "epoch": 1.1268391679350582, "grad_norm": 2.4375, "learning_rate": 4.366451449164442e-06, "loss": 0.95385723, "memory(GiB)": 369.42, "step": 44420, "train_speed(iter/s)": 0.200677 }, { "acc": 0.75576963, "epoch": 1.1269660071029934, "grad_norm": 2.015625, "learning_rate": 4.365411291498774e-06, "loss": 0.95299416, "memory(GiB)": 369.42, "step": 44425, "train_speed(iter/s)": 0.200681 }, { "acc": 0.75094957, "epoch": 1.1270928462709284, "grad_norm": 2.296875, "learning_rate": 4.364371161745151e-06, "loss": 1.00958652, "memory(GiB)": 369.42, "step": 44430, "train_speed(iter/s)": 0.200686 }, { "acc": 0.75654511, "epoch": 1.1272196854388636, "grad_norm": 2.140625, "learning_rate": 4.363331059949321e-06, "loss": 0.94892826, "memory(GiB)": 369.42, "step": 44435, "train_speed(iter/s)": 0.200689 }, { "acc": 0.74659719, "epoch": 1.1273465246067986, "grad_norm": 1.9453125, "learning_rate": 4.362290986157034e-06, "loss": 0.99107714, "memory(GiB)": 369.42, "step": 44440, "train_speed(iter/s)": 0.200694 }, { "acc": 0.75457253, "epoch": 1.1274733637747336, "grad_norm": 2.15625, "learning_rate": 4.361250940414036e-06, "loss": 0.97797651, "memory(GiB)": 369.42, "step": 44445, "train_speed(iter/s)": 0.200696 }, { "acc": 0.75038691, "epoch": 1.1276002029426686, "grad_norm": 2.359375, "learning_rate": 4.360210922766076e-06, "loss": 0.98475399, "memory(GiB)": 369.42, "step": 44450, "train_speed(iter/s)": 0.2007 }, { "acc": 0.72868271, "epoch": 1.1277270421106038, "grad_norm": 2.203125, "learning_rate": 4.359170933258893e-06, "loss": 1.08256502, "memory(GiB)": 369.42, "step": 44455, "train_speed(iter/s)": 0.200703 }, { "acc": 0.76311898, "epoch": 1.1278538812785388, "grad_norm": 2.234375, "learning_rate": 4.358130971938235e-06, "loss": 0.96690121, "memory(GiB)": 369.42, "step": 44460, "train_speed(iter/s)": 0.200708 }, { "acc": 0.74247293, "epoch": 1.1279807204464738, "grad_norm": 2.03125, "learning_rate": 4.357091038849841e-06, "loss": 1.00452003, "memory(GiB)": 369.42, "step": 44465, "train_speed(iter/s)": 0.200711 }, { "acc": 0.76312003, "epoch": 1.128107559614409, "grad_norm": 2.21875, "learning_rate": 4.356051134039455e-06, "loss": 0.97051697, "memory(GiB)": 369.42, "step": 44470, "train_speed(iter/s)": 0.200715 }, { "acc": 0.75761766, "epoch": 1.128234398782344, "grad_norm": 1.984375, "learning_rate": 4.3550112575528155e-06, "loss": 0.9546299, "memory(GiB)": 369.42, "step": 44475, "train_speed(iter/s)": 0.200719 }, { "acc": 0.76815434, "epoch": 1.128361237950279, "grad_norm": 1.96875, "learning_rate": 4.353971409435659e-06, "loss": 0.91244364, "memory(GiB)": 369.42, "step": 44480, "train_speed(iter/s)": 0.200723 }, { "acc": 0.76139402, "epoch": 1.1284880771182142, "grad_norm": 2.46875, "learning_rate": 4.352931589733725e-06, "loss": 0.94185352, "memory(GiB)": 369.42, "step": 44485, "train_speed(iter/s)": 0.200725 }, { "acc": 0.75057392, "epoch": 1.1286149162861492, "grad_norm": 2.75, "learning_rate": 4.35189179849275e-06, "loss": 1.02143993, "memory(GiB)": 369.42, "step": 44490, "train_speed(iter/s)": 0.200729 }, { "acc": 0.75617447, "epoch": 1.1287417554540842, "grad_norm": 2.078125, "learning_rate": 4.350852035758466e-06, "loss": 1.03817892, "memory(GiB)": 369.42, "step": 44495, "train_speed(iter/s)": 0.200731 }, { "acc": 0.75226183, "epoch": 1.1288685946220194, "grad_norm": 1.734375, "learning_rate": 4.3498123015766066e-06, "loss": 0.97496824, "memory(GiB)": 369.42, "step": 44500, "train_speed(iter/s)": 0.200735 }, { "acc": 0.76547418, "epoch": 1.1289954337899544, "grad_norm": 2.75, "learning_rate": 4.348772595992906e-06, "loss": 0.97919588, "memory(GiB)": 369.42, "step": 44505, "train_speed(iter/s)": 0.200738 }, { "acc": 0.75760593, "epoch": 1.1291222729578894, "grad_norm": 2.265625, "learning_rate": 4.347732919053096e-06, "loss": 0.97246771, "memory(GiB)": 369.42, "step": 44510, "train_speed(iter/s)": 0.200742 }, { "acc": 0.74984965, "epoch": 1.1292491121258244, "grad_norm": 2.109375, "learning_rate": 4.346693270802902e-06, "loss": 1.02300797, "memory(GiB)": 369.42, "step": 44515, "train_speed(iter/s)": 0.200746 }, { "acc": 0.75774755, "epoch": 1.1293759512937596, "grad_norm": 2.21875, "learning_rate": 4.345653651288055e-06, "loss": 0.96402435, "memory(GiB)": 369.42, "step": 44520, "train_speed(iter/s)": 0.200749 }, { "acc": 0.7496376, "epoch": 1.1295027904616946, "grad_norm": 2.40625, "learning_rate": 4.344614060554281e-06, "loss": 0.9376955, "memory(GiB)": 369.42, "step": 44525, "train_speed(iter/s)": 0.200754 }, { "acc": 0.75718679, "epoch": 1.1296296296296295, "grad_norm": 2.203125, "learning_rate": 4.343574498647311e-06, "loss": 0.98980656, "memory(GiB)": 369.42, "step": 44530, "train_speed(iter/s)": 0.200757 }, { "acc": 0.75906916, "epoch": 1.1297564687975648, "grad_norm": 2.03125, "learning_rate": 4.342534965612861e-06, "loss": 0.92446156, "memory(GiB)": 369.42, "step": 44535, "train_speed(iter/s)": 0.200762 }, { "acc": 0.74830813, "epoch": 1.1298833079654997, "grad_norm": 2.296875, "learning_rate": 4.34149546149666e-06, "loss": 0.97166443, "memory(GiB)": 369.42, "step": 44540, "train_speed(iter/s)": 0.200765 }, { "acc": 0.75081358, "epoch": 1.1300101471334347, "grad_norm": 2.296875, "learning_rate": 4.340455986344428e-06, "loss": 1.04920797, "memory(GiB)": 369.42, "step": 44545, "train_speed(iter/s)": 0.20077 }, { "acc": 0.75636015, "epoch": 1.13013698630137, "grad_norm": 2.234375, "learning_rate": 4.3394165402018875e-06, "loss": 1.00044737, "memory(GiB)": 369.42, "step": 44550, "train_speed(iter/s)": 0.200774 }, { "acc": 0.75887132, "epoch": 1.130263825469305, "grad_norm": 2.140625, "learning_rate": 4.338377123114757e-06, "loss": 0.97348461, "memory(GiB)": 369.42, "step": 44555, "train_speed(iter/s)": 0.200775 }, { "acc": 0.74045658, "epoch": 1.13039066463724, "grad_norm": 1.953125, "learning_rate": 4.337337735128752e-06, "loss": 1.04187765, "memory(GiB)": 369.42, "step": 44560, "train_speed(iter/s)": 0.200778 }, { "acc": 0.74722223, "epoch": 1.1305175038051751, "grad_norm": 1.6640625, "learning_rate": 4.336298376289594e-06, "loss": 1.01307964, "memory(GiB)": 369.42, "step": 44565, "train_speed(iter/s)": 0.20078 }, { "acc": 0.73899279, "epoch": 1.1306443429731101, "grad_norm": 2.296875, "learning_rate": 4.335259046642998e-06, "loss": 1.02976675, "memory(GiB)": 369.42, "step": 44570, "train_speed(iter/s)": 0.200785 }, { "acc": 0.7532474, "epoch": 1.130771182141045, "grad_norm": 1.9140625, "learning_rate": 4.334219746234675e-06, "loss": 0.9942955, "memory(GiB)": 369.42, "step": 44575, "train_speed(iter/s)": 0.200789 }, { "acc": 0.74463882, "epoch": 1.13089802130898, "grad_norm": 2.203125, "learning_rate": 4.3331804751103395e-06, "loss": 0.97019711, "memory(GiB)": 369.42, "step": 44580, "train_speed(iter/s)": 0.200791 }, { "acc": 0.74866495, "epoch": 1.1310248604769153, "grad_norm": 2.515625, "learning_rate": 4.332141233315705e-06, "loss": 0.97521725, "memory(GiB)": 369.42, "step": 44585, "train_speed(iter/s)": 0.200795 }, { "acc": 0.7707675, "epoch": 1.1311516996448503, "grad_norm": 1.9921875, "learning_rate": 4.331102020896482e-06, "loss": 0.98433666, "memory(GiB)": 369.42, "step": 44590, "train_speed(iter/s)": 0.200799 }, { "acc": 0.76143494, "epoch": 1.1312785388127855, "grad_norm": 2.328125, "learning_rate": 4.330062837898376e-06, "loss": 0.92907734, "memory(GiB)": 369.42, "step": 44595, "train_speed(iter/s)": 0.200801 }, { "acc": 0.75394793, "epoch": 1.1314053779807205, "grad_norm": 2.140625, "learning_rate": 4.3290236843670985e-06, "loss": 1.01611252, "memory(GiB)": 369.42, "step": 44600, "train_speed(iter/s)": 0.200805 }, { "acc": 0.75075006, "epoch": 1.1315322171486555, "grad_norm": 2.0625, "learning_rate": 4.327984560348354e-06, "loss": 0.98447304, "memory(GiB)": 369.42, "step": 44605, "train_speed(iter/s)": 0.20081 }, { "acc": 0.75902748, "epoch": 1.1316590563165905, "grad_norm": 1.953125, "learning_rate": 4.3269454658878516e-06, "loss": 0.961726, "memory(GiB)": 369.42, "step": 44610, "train_speed(iter/s)": 0.200812 }, { "acc": 0.75675974, "epoch": 1.1317858954845257, "grad_norm": 2.0625, "learning_rate": 4.325906401031291e-06, "loss": 0.98592224, "memory(GiB)": 369.42, "step": 44615, "train_speed(iter/s)": 0.200815 }, { "acc": 0.75504355, "epoch": 1.1319127346524607, "grad_norm": 2.34375, "learning_rate": 4.324867365824376e-06, "loss": 0.9599144, "memory(GiB)": 369.42, "step": 44620, "train_speed(iter/s)": 0.200819 }, { "acc": 0.76406832, "epoch": 1.1320395738203957, "grad_norm": 2.140625, "learning_rate": 4.323828360312809e-06, "loss": 0.97840405, "memory(GiB)": 369.42, "step": 44625, "train_speed(iter/s)": 0.200823 }, { "acc": 0.75717964, "epoch": 1.1321664129883309, "grad_norm": 2.4375, "learning_rate": 4.32278938454229e-06, "loss": 0.97820301, "memory(GiB)": 369.42, "step": 44630, "train_speed(iter/s)": 0.200826 }, { "acc": 0.75190077, "epoch": 1.1322932521562659, "grad_norm": 2.53125, "learning_rate": 4.321750438558517e-06, "loss": 0.9386507, "memory(GiB)": 369.42, "step": 44635, "train_speed(iter/s)": 0.20083 }, { "acc": 0.74508924, "epoch": 1.1324200913242009, "grad_norm": 2.078125, "learning_rate": 4.3207115224071874e-06, "loss": 0.92977848, "memory(GiB)": 369.42, "step": 44640, "train_speed(iter/s)": 0.200833 }, { "acc": 0.73979526, "epoch": 1.132546930492136, "grad_norm": 2.09375, "learning_rate": 4.319672636133998e-06, "loss": 1.02016115, "memory(GiB)": 369.42, "step": 44645, "train_speed(iter/s)": 0.200838 }, { "acc": 0.75851316, "epoch": 1.132673769660071, "grad_norm": 2.34375, "learning_rate": 4.318633779784646e-06, "loss": 1.01572075, "memory(GiB)": 369.42, "step": 44650, "train_speed(iter/s)": 0.200842 }, { "acc": 0.74615421, "epoch": 1.132800608828006, "grad_norm": 2.328125, "learning_rate": 4.317594953404818e-06, "loss": 1.03408089, "memory(GiB)": 369.42, "step": 44655, "train_speed(iter/s)": 0.200844 }, { "acc": 0.75627851, "epoch": 1.1329274479959413, "grad_norm": 1.6796875, "learning_rate": 4.316556157040213e-06, "loss": 0.95005302, "memory(GiB)": 369.42, "step": 44660, "train_speed(iter/s)": 0.200849 }, { "acc": 0.73600512, "epoch": 1.1330542871638762, "grad_norm": 2.234375, "learning_rate": 4.315517390736519e-06, "loss": 1.02571211, "memory(GiB)": 369.42, "step": 44665, "train_speed(iter/s)": 0.200854 }, { "acc": 0.73223295, "epoch": 1.1331811263318112, "grad_norm": 1.765625, "learning_rate": 4.314478654539429e-06, "loss": 0.99875431, "memory(GiB)": 369.42, "step": 44670, "train_speed(iter/s)": 0.200856 }, { "acc": 0.7431283, "epoch": 1.1333079654997462, "grad_norm": 1.953125, "learning_rate": 4.313439948494625e-06, "loss": 1.02265406, "memory(GiB)": 369.42, "step": 44675, "train_speed(iter/s)": 0.20086 }, { "acc": 0.7547955, "epoch": 1.1334348046676814, "grad_norm": 2.5625, "learning_rate": 4.312401272647799e-06, "loss": 0.98717232, "memory(GiB)": 369.42, "step": 44680, "train_speed(iter/s)": 0.20086 }, { "acc": 0.74911304, "epoch": 1.1335616438356164, "grad_norm": 2.125, "learning_rate": 4.311362627044633e-06, "loss": 0.98124523, "memory(GiB)": 369.42, "step": 44685, "train_speed(iter/s)": 0.200863 }, { "acc": 0.7454504, "epoch": 1.1336884830035514, "grad_norm": 2.4375, "learning_rate": 4.310324011730816e-06, "loss": 1.02577171, "memory(GiB)": 369.42, "step": 44690, "train_speed(iter/s)": 0.200867 }, { "acc": 0.74382, "epoch": 1.1338153221714866, "grad_norm": 2.21875, "learning_rate": 4.309285426752027e-06, "loss": 1.00292625, "memory(GiB)": 369.42, "step": 44695, "train_speed(iter/s)": 0.20087 }, { "acc": 0.75204649, "epoch": 1.1339421613394216, "grad_norm": 2.28125, "learning_rate": 4.308246872153947e-06, "loss": 0.98765631, "memory(GiB)": 369.42, "step": 44700, "train_speed(iter/s)": 0.200875 }, { "acc": 0.75495787, "epoch": 1.1340690005073566, "grad_norm": 1.9375, "learning_rate": 4.307208347982259e-06, "loss": 0.93953896, "memory(GiB)": 369.42, "step": 44705, "train_speed(iter/s)": 0.200878 }, { "acc": 0.75894051, "epoch": 1.1341958396752918, "grad_norm": 2.25, "learning_rate": 4.306169854282643e-06, "loss": 0.9377367, "memory(GiB)": 369.42, "step": 44710, "train_speed(iter/s)": 0.200882 }, { "acc": 0.75890946, "epoch": 1.1343226788432268, "grad_norm": 2.5, "learning_rate": 4.305131391100773e-06, "loss": 0.97507248, "memory(GiB)": 369.42, "step": 44715, "train_speed(iter/s)": 0.200884 }, { "acc": 0.76107445, "epoch": 1.1344495180111618, "grad_norm": 2.015625, "learning_rate": 4.304092958482325e-06, "loss": 0.99201126, "memory(GiB)": 369.42, "step": 44720, "train_speed(iter/s)": 0.200885 }, { "acc": 0.75774279, "epoch": 1.134576357179097, "grad_norm": 2.8125, "learning_rate": 4.303054556472978e-06, "loss": 0.94620705, "memory(GiB)": 369.42, "step": 44725, "train_speed(iter/s)": 0.200888 }, { "acc": 0.7463882, "epoch": 1.134703196347032, "grad_norm": 2.140625, "learning_rate": 4.3020161851184036e-06, "loss": 1.0211853, "memory(GiB)": 369.42, "step": 44730, "train_speed(iter/s)": 0.20089 }, { "acc": 0.74219065, "epoch": 1.134830035514967, "grad_norm": 2.609375, "learning_rate": 4.300977844464273e-06, "loss": 1.00139132, "memory(GiB)": 369.42, "step": 44735, "train_speed(iter/s)": 0.200894 }, { "acc": 0.75884514, "epoch": 1.134956874682902, "grad_norm": 2.3125, "learning_rate": 4.2999395345562564e-06, "loss": 0.93339453, "memory(GiB)": 369.42, "step": 44740, "train_speed(iter/s)": 0.200897 }, { "acc": 0.74339848, "epoch": 1.1350837138508372, "grad_norm": 2.9375, "learning_rate": 4.298901255440025e-06, "loss": 0.9914113, "memory(GiB)": 369.42, "step": 44745, "train_speed(iter/s)": 0.200901 }, { "acc": 0.74118476, "epoch": 1.1352105530187722, "grad_norm": 2.765625, "learning_rate": 4.297863007161249e-06, "loss": 0.99324789, "memory(GiB)": 369.42, "step": 44750, "train_speed(iter/s)": 0.200904 }, { "acc": 0.74821029, "epoch": 1.1353373921867074, "grad_norm": 1.921875, "learning_rate": 4.29682478976559e-06, "loss": 0.96885614, "memory(GiB)": 369.42, "step": 44755, "train_speed(iter/s)": 0.200908 }, { "acc": 0.7474329, "epoch": 1.1354642313546424, "grad_norm": 2.390625, "learning_rate": 4.295786603298717e-06, "loss": 0.97523746, "memory(GiB)": 369.42, "step": 44760, "train_speed(iter/s)": 0.200912 }, { "acc": 0.75378847, "epoch": 1.1355910705225774, "grad_norm": 1.921875, "learning_rate": 4.294748447806293e-06, "loss": 0.95407448, "memory(GiB)": 369.42, "step": 44765, "train_speed(iter/s)": 0.200916 }, { "acc": 0.74934421, "epoch": 1.1357179096905123, "grad_norm": 2.359375, "learning_rate": 4.293710323333983e-06, "loss": 0.98726654, "memory(GiB)": 369.42, "step": 44770, "train_speed(iter/s)": 0.200914 }, { "acc": 0.74705753, "epoch": 1.1358447488584476, "grad_norm": 2.125, "learning_rate": 4.292672229927445e-06, "loss": 0.97919044, "memory(GiB)": 369.42, "step": 44775, "train_speed(iter/s)": 0.200917 }, { "acc": 0.75967884, "epoch": 1.1359715880263825, "grad_norm": 1.9921875, "learning_rate": 4.2916341676323386e-06, "loss": 0.95857773, "memory(GiB)": 369.42, "step": 44780, "train_speed(iter/s)": 0.200919 }, { "acc": 0.75789948, "epoch": 1.1360984271943175, "grad_norm": 2.171875, "learning_rate": 4.290596136494326e-06, "loss": 0.91573572, "memory(GiB)": 369.42, "step": 44785, "train_speed(iter/s)": 0.200919 }, { "acc": 0.74009352, "epoch": 1.1362252663622527, "grad_norm": 2.71875, "learning_rate": 4.289558136559063e-06, "loss": 1.03626041, "memory(GiB)": 369.42, "step": 44790, "train_speed(iter/s)": 0.200922 }, { "acc": 0.7556951, "epoch": 1.1363521055301877, "grad_norm": 2.046875, "learning_rate": 4.288520167872203e-06, "loss": 0.95333328, "memory(GiB)": 369.42, "step": 44795, "train_speed(iter/s)": 0.200926 }, { "acc": 0.74265766, "epoch": 1.1364789446981227, "grad_norm": 2.078125, "learning_rate": 4.287482230479404e-06, "loss": 1.05037127, "memory(GiB)": 369.42, "step": 44800, "train_speed(iter/s)": 0.20093 }, { "acc": 0.74462886, "epoch": 1.136605783866058, "grad_norm": 2.375, "learning_rate": 4.286444324426318e-06, "loss": 1.0190093, "memory(GiB)": 369.42, "step": 44805, "train_speed(iter/s)": 0.200933 }, { "acc": 0.75625849, "epoch": 1.136732623033993, "grad_norm": 1.8828125, "learning_rate": 4.2854064497585964e-06, "loss": 0.95568714, "memory(GiB)": 369.42, "step": 44810, "train_speed(iter/s)": 0.200935 }, { "acc": 0.73816481, "epoch": 1.136859462201928, "grad_norm": 2.203125, "learning_rate": 4.284368606521888e-06, "loss": 1.03677044, "memory(GiB)": 369.42, "step": 44815, "train_speed(iter/s)": 0.200939 }, { "acc": 0.76436787, "epoch": 1.1369863013698631, "grad_norm": 1.8671875, "learning_rate": 4.283330794761845e-06, "loss": 1.001161, "memory(GiB)": 369.42, "step": 44820, "train_speed(iter/s)": 0.200943 }, { "acc": 0.7555336, "epoch": 1.137113140537798, "grad_norm": 1.96875, "learning_rate": 4.282293014524112e-06, "loss": 0.98708153, "memory(GiB)": 369.42, "step": 44825, "train_speed(iter/s)": 0.200943 }, { "acc": 0.75413265, "epoch": 1.137239979705733, "grad_norm": 1.9375, "learning_rate": 4.281255265854338e-06, "loss": 0.98254547, "memory(GiB)": 369.42, "step": 44830, "train_speed(iter/s)": 0.200947 }, { "acc": 0.75132604, "epoch": 1.137366818873668, "grad_norm": 1.9140625, "learning_rate": 4.280217548798166e-06, "loss": 0.94470348, "memory(GiB)": 369.42, "step": 44835, "train_speed(iter/s)": 0.20095 }, { "acc": 0.75153651, "epoch": 1.1374936580416033, "grad_norm": 2.625, "learning_rate": 4.279179863401239e-06, "loss": 0.9809515, "memory(GiB)": 369.42, "step": 44840, "train_speed(iter/s)": 0.200952 }, { "acc": 0.76813078, "epoch": 1.1376204972095383, "grad_norm": 2.421875, "learning_rate": 4.278142209709199e-06, "loss": 0.93002434, "memory(GiB)": 369.42, "step": 44845, "train_speed(iter/s)": 0.200954 }, { "acc": 0.75440903, "epoch": 1.1377473363774733, "grad_norm": 2.546875, "learning_rate": 4.277104587767691e-06, "loss": 1.03322487, "memory(GiB)": 369.42, "step": 44850, "train_speed(iter/s)": 0.200957 }, { "acc": 0.75034504, "epoch": 1.1378741755454085, "grad_norm": 3.109375, "learning_rate": 4.276066997622348e-06, "loss": 0.9924305, "memory(GiB)": 369.42, "step": 44855, "train_speed(iter/s)": 0.200961 }, { "acc": 0.77067871, "epoch": 1.1380010147133435, "grad_norm": 2.296875, "learning_rate": 4.27502943931881e-06, "loss": 0.9454401, "memory(GiB)": 369.42, "step": 44860, "train_speed(iter/s)": 0.200965 }, { "acc": 0.73957338, "epoch": 1.1381278538812785, "grad_norm": 1.90625, "learning_rate": 4.273991912902716e-06, "loss": 1.01102228, "memory(GiB)": 369.42, "step": 44865, "train_speed(iter/s)": 0.200969 }, { "acc": 0.73702416, "epoch": 1.1382546930492137, "grad_norm": 2.296875, "learning_rate": 4.272954418419699e-06, "loss": 1.00657539, "memory(GiB)": 369.42, "step": 44870, "train_speed(iter/s)": 0.200974 }, { "acc": 0.75908718, "epoch": 1.1383815322171487, "grad_norm": 2.15625, "learning_rate": 4.2719169559153905e-06, "loss": 0.94732609, "memory(GiB)": 369.42, "step": 44875, "train_speed(iter/s)": 0.200979 }, { "acc": 0.76215754, "epoch": 1.1385083713850837, "grad_norm": 1.9296875, "learning_rate": 4.270879525435426e-06, "loss": 0.96348019, "memory(GiB)": 369.42, "step": 44880, "train_speed(iter/s)": 0.200981 }, { "acc": 0.75507851, "epoch": 1.1386352105530189, "grad_norm": 2.0, "learning_rate": 4.269842127025435e-06, "loss": 1.01405287, "memory(GiB)": 369.42, "step": 44885, "train_speed(iter/s)": 0.200985 }, { "acc": 0.74367433, "epoch": 1.1387620497209539, "grad_norm": 2.21875, "learning_rate": 4.2688047607310504e-06, "loss": 0.98430786, "memory(GiB)": 369.42, "step": 44890, "train_speed(iter/s)": 0.200989 }, { "acc": 0.73959112, "epoch": 1.1388888888888888, "grad_norm": 2.25, "learning_rate": 4.267767426597893e-06, "loss": 1.02306614, "memory(GiB)": 369.42, "step": 44895, "train_speed(iter/s)": 0.200993 }, { "acc": 0.75020223, "epoch": 1.1390157280568238, "grad_norm": 1.65625, "learning_rate": 4.266730124671594e-06, "loss": 0.98537388, "memory(GiB)": 369.42, "step": 44900, "train_speed(iter/s)": 0.200995 }, { "acc": 0.75869908, "epoch": 1.139142567224759, "grad_norm": 2.1875, "learning_rate": 4.265692854997778e-06, "loss": 0.94350462, "memory(GiB)": 369.42, "step": 44905, "train_speed(iter/s)": 0.200998 }, { "acc": 0.76784029, "epoch": 1.139269406392694, "grad_norm": 2.0, "learning_rate": 4.2646556176220714e-06, "loss": 0.95961609, "memory(GiB)": 369.42, "step": 44910, "train_speed(iter/s)": 0.201002 }, { "acc": 0.75550184, "epoch": 1.1393962455606292, "grad_norm": 2.0, "learning_rate": 4.263618412590092e-06, "loss": 0.96296043, "memory(GiB)": 369.42, "step": 44915, "train_speed(iter/s)": 0.201004 }, { "acc": 0.74612141, "epoch": 1.1395230847285642, "grad_norm": 2.140625, "learning_rate": 4.2625812399474604e-06, "loss": 0.99230328, "memory(GiB)": 369.42, "step": 44920, "train_speed(iter/s)": 0.201005 }, { "acc": 0.75364866, "epoch": 1.1396499238964992, "grad_norm": 2.046875, "learning_rate": 4.2615440997398e-06, "loss": 0.9926652, "memory(GiB)": 369.42, "step": 44925, "train_speed(iter/s)": 0.201009 }, { "acc": 0.76129761, "epoch": 1.1397767630644342, "grad_norm": 2.234375, "learning_rate": 4.2605069920127284e-06, "loss": 0.93111181, "memory(GiB)": 369.42, "step": 44930, "train_speed(iter/s)": 0.201012 }, { "acc": 0.75368404, "epoch": 1.1399036022323694, "grad_norm": 2.3125, "learning_rate": 4.25946991681186e-06, "loss": 0.99951859, "memory(GiB)": 369.42, "step": 44935, "train_speed(iter/s)": 0.201016 }, { "acc": 0.75467877, "epoch": 1.1400304414003044, "grad_norm": 2.125, "learning_rate": 4.258432874182809e-06, "loss": 0.99744091, "memory(GiB)": 369.42, "step": 44940, "train_speed(iter/s)": 0.201019 }, { "acc": 0.75188065, "epoch": 1.1401572805682394, "grad_norm": 1.9921875, "learning_rate": 4.2573958641711925e-06, "loss": 0.99930859, "memory(GiB)": 369.42, "step": 44945, "train_speed(iter/s)": 0.201023 }, { "acc": 0.75770102, "epoch": 1.1402841197361746, "grad_norm": 1.84375, "learning_rate": 4.256358886822622e-06, "loss": 0.92966757, "memory(GiB)": 369.42, "step": 44950, "train_speed(iter/s)": 0.201025 }, { "acc": 0.74737501, "epoch": 1.1404109589041096, "grad_norm": 2.546875, "learning_rate": 4.255321942182707e-06, "loss": 0.96492844, "memory(GiB)": 369.42, "step": 44955, "train_speed(iter/s)": 0.201028 }, { "acc": 0.75050936, "epoch": 1.1405377980720446, "grad_norm": 2.015625, "learning_rate": 4.254285030297058e-06, "loss": 0.97450447, "memory(GiB)": 369.42, "step": 44960, "train_speed(iter/s)": 0.201031 }, { "acc": 0.75767155, "epoch": 1.1406646372399798, "grad_norm": 1.9609375, "learning_rate": 4.2532481512112814e-06, "loss": 0.99871044, "memory(GiB)": 369.42, "step": 44965, "train_speed(iter/s)": 0.201035 }, { "acc": 0.74892931, "epoch": 1.1407914764079148, "grad_norm": 2.4375, "learning_rate": 4.252211304970988e-06, "loss": 1.02341385, "memory(GiB)": 369.42, "step": 44970, "train_speed(iter/s)": 0.20104 }, { "acc": 0.74823556, "epoch": 1.1409183155758498, "grad_norm": 2.34375, "learning_rate": 4.251174491621778e-06, "loss": 0.98286304, "memory(GiB)": 369.42, "step": 44975, "train_speed(iter/s)": 0.201041 }, { "acc": 0.75595055, "epoch": 1.141045154743785, "grad_norm": 2.046875, "learning_rate": 4.250137711209258e-06, "loss": 1.00261889, "memory(GiB)": 369.42, "step": 44980, "train_speed(iter/s)": 0.201045 }, { "acc": 0.74379625, "epoch": 1.14117199391172, "grad_norm": 2.328125, "learning_rate": 4.249100963779028e-06, "loss": 1.06804562, "memory(GiB)": 369.42, "step": 44985, "train_speed(iter/s)": 0.201049 }, { "acc": 0.7581521, "epoch": 1.141298833079655, "grad_norm": 2.484375, "learning_rate": 4.248064249376692e-06, "loss": 0.99412518, "memory(GiB)": 369.42, "step": 44990, "train_speed(iter/s)": 0.201052 }, { "acc": 0.74801006, "epoch": 1.14142567224759, "grad_norm": 1.9609375, "learning_rate": 4.2470275680478466e-06, "loss": 0.96878948, "memory(GiB)": 369.42, "step": 44995, "train_speed(iter/s)": 0.201054 }, { "acc": 0.75402107, "epoch": 1.1415525114155252, "grad_norm": 2.40625, "learning_rate": 4.2459909198380886e-06, "loss": 0.95559788, "memory(GiB)": 369.42, "step": 45000, "train_speed(iter/s)": 0.201057 }, { "epoch": 1.1415525114155252, "eval_acc": 0.7378426579693083, "eval_loss": 0.970187783241272, "eval_runtime": 384.6052, "eval_samples_per_second": 16.562, "eval_steps_per_second": 8.281, "step": 45000 }, { "acc": 0.75987864, "epoch": 1.1416793505834602, "grad_norm": 2.265625, "learning_rate": 4.244954304793019e-06, "loss": 0.96349602, "memory(GiB)": 369.42, "step": 45005, "train_speed(iter/s)": 0.20042 }, { "acc": 0.74360032, "epoch": 1.1418061897513951, "grad_norm": 1.9921875, "learning_rate": 4.2439177229582304e-06, "loss": 1.01330776, "memory(GiB)": 369.42, "step": 45010, "train_speed(iter/s)": 0.200423 }, { "acc": 0.75190239, "epoch": 1.1419330289193304, "grad_norm": 2.09375, "learning_rate": 4.242881174379313e-06, "loss": 0.9832428, "memory(GiB)": 369.42, "step": 45015, "train_speed(iter/s)": 0.200425 }, { "acc": 0.74636388, "epoch": 1.1420598680872653, "grad_norm": 1.9375, "learning_rate": 4.241844659101865e-06, "loss": 0.98868027, "memory(GiB)": 369.42, "step": 45020, "train_speed(iter/s)": 0.200428 }, { "acc": 0.73716021, "epoch": 1.1421867072552003, "grad_norm": 1.8984375, "learning_rate": 4.240808177171472e-06, "loss": 1.08387785, "memory(GiB)": 369.42, "step": 45025, "train_speed(iter/s)": 0.20043 }, { "acc": 0.75065584, "epoch": 1.1423135464231355, "grad_norm": 2.171875, "learning_rate": 4.239771728633727e-06, "loss": 0.95540352, "memory(GiB)": 369.42, "step": 45030, "train_speed(iter/s)": 0.200433 }, { "acc": 0.71802158, "epoch": 1.1424403855910705, "grad_norm": 1.9453125, "learning_rate": 4.238735313534213e-06, "loss": 1.06768131, "memory(GiB)": 369.42, "step": 45035, "train_speed(iter/s)": 0.200437 }, { "acc": 0.74691586, "epoch": 1.1425672247590055, "grad_norm": 2.453125, "learning_rate": 4.23769893191852e-06, "loss": 1.0289772, "memory(GiB)": 369.42, "step": 45040, "train_speed(iter/s)": 0.200441 }, { "acc": 0.7543951, "epoch": 1.1426940639269407, "grad_norm": 1.8828125, "learning_rate": 4.236662583832229e-06, "loss": 0.91750031, "memory(GiB)": 369.42, "step": 45045, "train_speed(iter/s)": 0.200444 }, { "acc": 0.7521842, "epoch": 1.1428209030948757, "grad_norm": 2.09375, "learning_rate": 4.23562626932093e-06, "loss": 1.06471481, "memory(GiB)": 369.42, "step": 45050, "train_speed(iter/s)": 0.200449 }, { "acc": 0.74638004, "epoch": 1.1429477422628107, "grad_norm": 2.15625, "learning_rate": 4.234589988430198e-06, "loss": 1.01776819, "memory(GiB)": 369.42, "step": 45055, "train_speed(iter/s)": 0.200452 }, { "acc": 0.75020633, "epoch": 1.1430745814307457, "grad_norm": 2.25, "learning_rate": 4.233553741205615e-06, "loss": 1.01917477, "memory(GiB)": 369.42, "step": 45060, "train_speed(iter/s)": 0.200453 }, { "acc": 0.73745599, "epoch": 1.143201420598681, "grad_norm": 2.40625, "learning_rate": 4.2325175276927614e-06, "loss": 0.98967113, "memory(GiB)": 369.42, "step": 45065, "train_speed(iter/s)": 0.200458 }, { "acc": 0.75624828, "epoch": 1.143328259766616, "grad_norm": 1.9765625, "learning_rate": 4.231481347937214e-06, "loss": 0.98286572, "memory(GiB)": 369.42, "step": 45070, "train_speed(iter/s)": 0.200459 }, { "acc": 0.74982586, "epoch": 1.143455098934551, "grad_norm": 2.109375, "learning_rate": 4.230445201984547e-06, "loss": 0.9961525, "memory(GiB)": 369.42, "step": 45075, "train_speed(iter/s)": 0.200462 }, { "acc": 0.75334907, "epoch": 1.143581938102486, "grad_norm": 2.03125, "learning_rate": 4.229409089880336e-06, "loss": 0.99515648, "memory(GiB)": 369.42, "step": 45080, "train_speed(iter/s)": 0.200467 }, { "acc": 0.74775419, "epoch": 1.143708777270421, "grad_norm": 2.34375, "learning_rate": 4.2283730116701535e-06, "loss": 1.00071297, "memory(GiB)": 369.42, "step": 45085, "train_speed(iter/s)": 0.200472 }, { "acc": 0.74915438, "epoch": 1.143835616438356, "grad_norm": 2.421875, "learning_rate": 4.227336967399573e-06, "loss": 0.98958282, "memory(GiB)": 369.42, "step": 45090, "train_speed(iter/s)": 0.200475 }, { "acc": 0.75956569, "epoch": 1.1439624556062913, "grad_norm": 2.078125, "learning_rate": 4.2263009571141585e-06, "loss": 0.97311411, "memory(GiB)": 369.42, "step": 45095, "train_speed(iter/s)": 0.200479 }, { "acc": 0.75604124, "epoch": 1.1440892947742263, "grad_norm": 2.09375, "learning_rate": 4.225264980859485e-06, "loss": 0.98461742, "memory(GiB)": 369.42, "step": 45100, "train_speed(iter/s)": 0.200484 }, { "acc": 0.74213638, "epoch": 1.1442161339421613, "grad_norm": 2.15625, "learning_rate": 4.224229038681115e-06, "loss": 1.00461483, "memory(GiB)": 369.42, "step": 45105, "train_speed(iter/s)": 0.200488 }, { "acc": 0.74737659, "epoch": 1.1443429731100965, "grad_norm": 1.9765625, "learning_rate": 4.223193130624619e-06, "loss": 0.98420963, "memory(GiB)": 369.42, "step": 45110, "train_speed(iter/s)": 0.200491 }, { "acc": 0.76232414, "epoch": 1.1444698122780315, "grad_norm": 2.1875, "learning_rate": 4.222157256735553e-06, "loss": 0.93204803, "memory(GiB)": 369.42, "step": 45115, "train_speed(iter/s)": 0.200493 }, { "acc": 0.76204195, "epoch": 1.1445966514459665, "grad_norm": 2.34375, "learning_rate": 4.2211214170594865e-06, "loss": 0.97582607, "memory(GiB)": 369.42, "step": 45120, "train_speed(iter/s)": 0.200495 }, { "acc": 0.73713403, "epoch": 1.1447234906139017, "grad_norm": 2.359375, "learning_rate": 4.220085611641976e-06, "loss": 1.01715193, "memory(GiB)": 369.42, "step": 45125, "train_speed(iter/s)": 0.200498 }, { "acc": 0.74814692, "epoch": 1.1448503297818367, "grad_norm": 1.875, "learning_rate": 4.2190498405285826e-06, "loss": 1.02529583, "memory(GiB)": 369.42, "step": 45130, "train_speed(iter/s)": 0.200502 }, { "acc": 0.75510387, "epoch": 1.1449771689497716, "grad_norm": 2.171875, "learning_rate": 4.218014103764865e-06, "loss": 0.98729086, "memory(GiB)": 369.42, "step": 45135, "train_speed(iter/s)": 0.200505 }, { "acc": 0.73878975, "epoch": 1.1451040081177069, "grad_norm": 2.125, "learning_rate": 4.216978401396376e-06, "loss": 1.0245575, "memory(GiB)": 369.42, "step": 45140, "train_speed(iter/s)": 0.200507 }, { "acc": 0.74458184, "epoch": 1.1452308472856418, "grad_norm": 2.609375, "learning_rate": 4.215942733468675e-06, "loss": 1.03849421, "memory(GiB)": 369.42, "step": 45145, "train_speed(iter/s)": 0.200511 }, { "acc": 0.76908627, "epoch": 1.1453576864535768, "grad_norm": 2.25, "learning_rate": 4.2149071000273134e-06, "loss": 0.91800156, "memory(GiB)": 369.42, "step": 45150, "train_speed(iter/s)": 0.200515 }, { "acc": 0.75453458, "epoch": 1.1454845256215118, "grad_norm": 2.71875, "learning_rate": 4.213871501117842e-06, "loss": 0.93514147, "memory(GiB)": 369.42, "step": 45155, "train_speed(iter/s)": 0.200518 }, { "acc": 0.73846064, "epoch": 1.145611364789447, "grad_norm": 2.203125, "learning_rate": 4.212835936785811e-06, "loss": 1.09896069, "memory(GiB)": 369.42, "step": 45160, "train_speed(iter/s)": 0.200522 }, { "acc": 0.76019316, "epoch": 1.145738203957382, "grad_norm": 2.09375, "learning_rate": 4.21180040707677e-06, "loss": 0.94392166, "memory(GiB)": 369.42, "step": 45165, "train_speed(iter/s)": 0.200527 }, { "acc": 0.75552025, "epoch": 1.145865043125317, "grad_norm": 2.078125, "learning_rate": 4.2107649120362684e-06, "loss": 1.02933426, "memory(GiB)": 369.42, "step": 45170, "train_speed(iter/s)": 0.200529 }, { "acc": 0.74064007, "epoch": 1.1459918822932522, "grad_norm": 1.8359375, "learning_rate": 4.2097294517098465e-06, "loss": 1.01825523, "memory(GiB)": 369.42, "step": 45175, "train_speed(iter/s)": 0.200533 }, { "acc": 0.74883385, "epoch": 1.1461187214611872, "grad_norm": 2.09375, "learning_rate": 4.208694026143054e-06, "loss": 1.04403391, "memory(GiB)": 369.42, "step": 45180, "train_speed(iter/s)": 0.200535 }, { "acc": 0.74849463, "epoch": 1.1462455606291222, "grad_norm": 2.59375, "learning_rate": 4.2076586353814295e-06, "loss": 1.10479469, "memory(GiB)": 369.42, "step": 45185, "train_speed(iter/s)": 0.200538 }, { "acc": 0.74692307, "epoch": 1.1463723997970574, "grad_norm": 2.078125, "learning_rate": 4.2066232794705174e-06, "loss": 0.99199133, "memory(GiB)": 369.42, "step": 45190, "train_speed(iter/s)": 0.20054 }, { "acc": 0.7489995, "epoch": 1.1464992389649924, "grad_norm": 2.046875, "learning_rate": 4.205587958455854e-06, "loss": 0.9819066, "memory(GiB)": 369.42, "step": 45195, "train_speed(iter/s)": 0.200543 }, { "acc": 0.75349383, "epoch": 1.1466260781329274, "grad_norm": 1.9296875, "learning_rate": 4.204552672382981e-06, "loss": 1.00366287, "memory(GiB)": 369.42, "step": 45200, "train_speed(iter/s)": 0.200548 }, { "acc": 0.7460412, "epoch": 1.1467529173008626, "grad_norm": 1.96875, "learning_rate": 4.203517421297431e-06, "loss": 0.96854362, "memory(GiB)": 369.42, "step": 45205, "train_speed(iter/s)": 0.200552 }, { "acc": 0.74944448, "epoch": 1.1468797564687976, "grad_norm": 2.1875, "learning_rate": 4.202482205244742e-06, "loss": 0.95129299, "memory(GiB)": 369.42, "step": 45210, "train_speed(iter/s)": 0.200557 }, { "acc": 0.76497912, "epoch": 1.1470065956367326, "grad_norm": 2.34375, "learning_rate": 4.201447024270446e-06, "loss": 0.97546082, "memory(GiB)": 369.42, "step": 45215, "train_speed(iter/s)": 0.200562 }, { "acc": 0.74028978, "epoch": 1.1471334348046676, "grad_norm": 2.171875, "learning_rate": 4.200411878420074e-06, "loss": 1.05985546, "memory(GiB)": 369.42, "step": 45220, "train_speed(iter/s)": 0.200565 }, { "acc": 0.75079899, "epoch": 1.1472602739726028, "grad_norm": 2.390625, "learning_rate": 4.199376767739158e-06, "loss": 0.97047148, "memory(GiB)": 369.42, "step": 45225, "train_speed(iter/s)": 0.200567 }, { "acc": 0.75941286, "epoch": 1.1473871131405378, "grad_norm": 2.375, "learning_rate": 4.1983416922732276e-06, "loss": 0.96531277, "memory(GiB)": 369.42, "step": 45230, "train_speed(iter/s)": 0.200569 }, { "acc": 0.75650005, "epoch": 1.147513952308473, "grad_norm": 1.9453125, "learning_rate": 4.197306652067807e-06, "loss": 0.93456974, "memory(GiB)": 369.42, "step": 45235, "train_speed(iter/s)": 0.200574 }, { "acc": 0.76561441, "epoch": 1.147640791476408, "grad_norm": 2.265625, "learning_rate": 4.196271647168425e-06, "loss": 0.95940723, "memory(GiB)": 369.42, "step": 45240, "train_speed(iter/s)": 0.200575 }, { "acc": 0.7470005, "epoch": 1.147767630644343, "grad_norm": 2.328125, "learning_rate": 4.195236677620604e-06, "loss": 1.05988083, "memory(GiB)": 369.42, "step": 45245, "train_speed(iter/s)": 0.200579 }, { "acc": 0.76627021, "epoch": 1.147894469812278, "grad_norm": 2.6875, "learning_rate": 4.1942017434698675e-06, "loss": 0.91359329, "memory(GiB)": 369.42, "step": 45250, "train_speed(iter/s)": 0.200581 }, { "acc": 0.75315533, "epoch": 1.1480213089802132, "grad_norm": 2.015625, "learning_rate": 4.1931668447617346e-06, "loss": 0.97549858, "memory(GiB)": 369.42, "step": 45255, "train_speed(iter/s)": 0.200585 }, { "acc": 0.75273867, "epoch": 1.1481481481481481, "grad_norm": 2.25, "learning_rate": 4.192131981541727e-06, "loss": 1.04816685, "memory(GiB)": 369.42, "step": 45260, "train_speed(iter/s)": 0.200589 }, { "acc": 0.76008358, "epoch": 1.1482749873160831, "grad_norm": 2.140625, "learning_rate": 4.19109715385536e-06, "loss": 0.95279846, "memory(GiB)": 369.42, "step": 45265, "train_speed(iter/s)": 0.200592 }, { "acc": 0.74323082, "epoch": 1.1484018264840183, "grad_norm": 2.03125, "learning_rate": 4.190062361748154e-06, "loss": 0.98316727, "memory(GiB)": 369.42, "step": 45270, "train_speed(iter/s)": 0.200595 }, { "acc": 0.73977718, "epoch": 1.1485286656519533, "grad_norm": 2.359375, "learning_rate": 4.189027605265621e-06, "loss": 0.97879381, "memory(GiB)": 369.42, "step": 45275, "train_speed(iter/s)": 0.200597 }, { "acc": 0.75392456, "epoch": 1.1486555048198883, "grad_norm": 2.640625, "learning_rate": 4.187992884453273e-06, "loss": 1.0055809, "memory(GiB)": 369.42, "step": 45280, "train_speed(iter/s)": 0.200601 }, { "acc": 0.75301409, "epoch": 1.1487823439878235, "grad_norm": 1.9765625, "learning_rate": 4.186958199356624e-06, "loss": 1.00486832, "memory(GiB)": 369.42, "step": 45285, "train_speed(iter/s)": 0.200602 }, { "acc": 0.74711962, "epoch": 1.1489091831557585, "grad_norm": 2.1875, "learning_rate": 4.185923550021185e-06, "loss": 1.00922251, "memory(GiB)": 369.42, "step": 45290, "train_speed(iter/s)": 0.200607 }, { "acc": 0.75051174, "epoch": 1.1490360223236935, "grad_norm": 1.875, "learning_rate": 4.1848889364924625e-06, "loss": 0.99407911, "memory(GiB)": 369.42, "step": 45295, "train_speed(iter/s)": 0.200608 }, { "acc": 0.73963604, "epoch": 1.1491628614916287, "grad_norm": 2.375, "learning_rate": 4.183854358815962e-06, "loss": 1.04804325, "memory(GiB)": 369.42, "step": 45300, "train_speed(iter/s)": 0.200609 }, { "acc": 0.75066342, "epoch": 1.1492897006595637, "grad_norm": 2.359375, "learning_rate": 4.182819817037192e-06, "loss": 0.98271961, "memory(GiB)": 369.42, "step": 45305, "train_speed(iter/s)": 0.200614 }, { "acc": 0.7459271, "epoch": 1.1494165398274987, "grad_norm": 2.359375, "learning_rate": 4.181785311201655e-06, "loss": 0.98920288, "memory(GiB)": 369.42, "step": 45310, "train_speed(iter/s)": 0.200617 }, { "acc": 0.73860283, "epoch": 1.1495433789954337, "grad_norm": 2.140625, "learning_rate": 4.1807508413548515e-06, "loss": 1.04876022, "memory(GiB)": 369.42, "step": 45315, "train_speed(iter/s)": 0.200619 }, { "acc": 0.76655464, "epoch": 1.149670218163369, "grad_norm": 2.09375, "learning_rate": 4.179716407542285e-06, "loss": 0.92377377, "memory(GiB)": 369.42, "step": 45320, "train_speed(iter/s)": 0.200622 }, { "acc": 0.73821068, "epoch": 1.1497970573313039, "grad_norm": 2.171875, "learning_rate": 4.178682009809452e-06, "loss": 1.02520666, "memory(GiB)": 369.42, "step": 45325, "train_speed(iter/s)": 0.200627 }, { "acc": 0.75423832, "epoch": 1.1499238964992389, "grad_norm": 1.8046875, "learning_rate": 4.177647648201854e-06, "loss": 0.97218561, "memory(GiB)": 369.42, "step": 45330, "train_speed(iter/s)": 0.200632 }, { "acc": 0.75392661, "epoch": 1.150050735667174, "grad_norm": 2.0625, "learning_rate": 4.1766133227649815e-06, "loss": 0.95987759, "memory(GiB)": 369.42, "step": 45335, "train_speed(iter/s)": 0.200635 }, { "acc": 0.76091471, "epoch": 1.150177574835109, "grad_norm": 2.015625, "learning_rate": 4.175579033544332e-06, "loss": 0.94665413, "memory(GiB)": 369.42, "step": 45340, "train_speed(iter/s)": 0.200639 }, { "acc": 0.74312272, "epoch": 1.150304414003044, "grad_norm": 2.171875, "learning_rate": 4.174544780585395e-06, "loss": 1.02037907, "memory(GiB)": 369.42, "step": 45345, "train_speed(iter/s)": 0.200644 }, { "acc": 0.74608231, "epoch": 1.1504312531709793, "grad_norm": 3.734375, "learning_rate": 4.1735105639336686e-06, "loss": 1.06476192, "memory(GiB)": 369.42, "step": 45350, "train_speed(iter/s)": 0.200647 }, { "acc": 0.74430652, "epoch": 1.1505580923389143, "grad_norm": 1.921875, "learning_rate": 4.172476383634635e-06, "loss": 0.99402313, "memory(GiB)": 369.42, "step": 45355, "train_speed(iter/s)": 0.20065 }, { "acc": 0.74702063, "epoch": 1.1506849315068493, "grad_norm": 2.28125, "learning_rate": 4.171442239733783e-06, "loss": 1.05325241, "memory(GiB)": 369.42, "step": 45360, "train_speed(iter/s)": 0.200652 }, { "acc": 0.74111114, "epoch": 1.1508117706747845, "grad_norm": 1.9609375, "learning_rate": 4.170408132276603e-06, "loss": 1.05518932, "memory(GiB)": 369.42, "step": 45365, "train_speed(iter/s)": 0.200655 }, { "acc": 0.74202175, "epoch": 1.1509386098427195, "grad_norm": 2.96875, "learning_rate": 4.1693740613085776e-06, "loss": 1.0228116, "memory(GiB)": 369.42, "step": 45370, "train_speed(iter/s)": 0.200659 }, { "acc": 0.76439886, "epoch": 1.1510654490106544, "grad_norm": 2.25, "learning_rate": 4.168340026875188e-06, "loss": 0.96637259, "memory(GiB)": 369.42, "step": 45375, "train_speed(iter/s)": 0.200664 }, { "acc": 0.75602336, "epoch": 1.1511922881785894, "grad_norm": 1.9921875, "learning_rate": 4.167306029021917e-06, "loss": 0.95971966, "memory(GiB)": 369.42, "step": 45380, "train_speed(iter/s)": 0.200669 }, { "acc": 0.74855566, "epoch": 1.1513191273465246, "grad_norm": 2.421875, "learning_rate": 4.166272067794246e-06, "loss": 1.0092186, "memory(GiB)": 369.42, "step": 45385, "train_speed(iter/s)": 0.200672 }, { "acc": 0.74057784, "epoch": 1.1514459665144596, "grad_norm": 2.03125, "learning_rate": 4.165238143237651e-06, "loss": 1.02108669, "memory(GiB)": 369.42, "step": 45390, "train_speed(iter/s)": 0.200676 }, { "acc": 0.74490356, "epoch": 1.1515728056823948, "grad_norm": 1.9140625, "learning_rate": 4.164204255397608e-06, "loss": 0.9844758, "memory(GiB)": 369.42, "step": 45395, "train_speed(iter/s)": 0.200679 }, { "acc": 0.73895516, "epoch": 1.1516996448503298, "grad_norm": 1.8046875, "learning_rate": 4.163170404319596e-06, "loss": 0.99838142, "memory(GiB)": 369.42, "step": 45400, "train_speed(iter/s)": 0.200682 }, { "acc": 0.7551446, "epoch": 1.1518264840182648, "grad_norm": 2.1875, "learning_rate": 4.1621365900490825e-06, "loss": 0.99079342, "memory(GiB)": 369.42, "step": 45405, "train_speed(iter/s)": 0.200686 }, { "acc": 0.75873852, "epoch": 1.1519533231861998, "grad_norm": 2.421875, "learning_rate": 4.1611028126315455e-06, "loss": 0.98622475, "memory(GiB)": 369.42, "step": 45410, "train_speed(iter/s)": 0.20069 }, { "acc": 0.74750142, "epoch": 1.152080162354135, "grad_norm": 2.078125, "learning_rate": 4.160069072112451e-06, "loss": 0.9998209, "memory(GiB)": 369.42, "step": 45415, "train_speed(iter/s)": 0.200692 }, { "acc": 0.75830889, "epoch": 1.15220700152207, "grad_norm": 1.7421875, "learning_rate": 4.1590353685372695e-06, "loss": 0.93554993, "memory(GiB)": 369.42, "step": 45420, "train_speed(iter/s)": 0.200694 }, { "acc": 0.75734849, "epoch": 1.152333840690005, "grad_norm": 2.4375, "learning_rate": 4.158001701951465e-06, "loss": 0.91113338, "memory(GiB)": 369.42, "step": 45425, "train_speed(iter/s)": 0.200697 }, { "acc": 0.74953928, "epoch": 1.1524606798579402, "grad_norm": 2.40625, "learning_rate": 4.156968072400508e-06, "loss": 1.00543556, "memory(GiB)": 369.42, "step": 45430, "train_speed(iter/s)": 0.2007 }, { "acc": 0.74069653, "epoch": 1.1525875190258752, "grad_norm": 2.5, "learning_rate": 4.155934479929858e-06, "loss": 1.0189477, "memory(GiB)": 369.42, "step": 45435, "train_speed(iter/s)": 0.200704 }, { "acc": 0.78010454, "epoch": 1.1527143581938102, "grad_norm": 2.859375, "learning_rate": 4.154900924584976e-06, "loss": 0.8728344, "memory(GiB)": 369.42, "step": 45440, "train_speed(iter/s)": 0.200707 }, { "acc": 0.74232368, "epoch": 1.1528411973617454, "grad_norm": 2.125, "learning_rate": 4.153867406411327e-06, "loss": 1.04141445, "memory(GiB)": 369.42, "step": 45445, "train_speed(iter/s)": 0.200711 }, { "acc": 0.76023359, "epoch": 1.1529680365296804, "grad_norm": 2.078125, "learning_rate": 4.152833925454367e-06, "loss": 0.92269039, "memory(GiB)": 369.42, "step": 45450, "train_speed(iter/s)": 0.200713 }, { "acc": 0.75712695, "epoch": 1.1530948756976154, "grad_norm": 2.109375, "learning_rate": 4.1518004817595515e-06, "loss": 0.95784597, "memory(GiB)": 369.42, "step": 45455, "train_speed(iter/s)": 0.200713 }, { "acc": 0.73039532, "epoch": 1.1532217148655506, "grad_norm": 2.53125, "learning_rate": 4.150767075372338e-06, "loss": 1.07962532, "memory(GiB)": 369.42, "step": 45460, "train_speed(iter/s)": 0.200716 }, { "acc": 0.7513607, "epoch": 1.1533485540334856, "grad_norm": 2.125, "learning_rate": 4.149733706338182e-06, "loss": 0.95051489, "memory(GiB)": 369.42, "step": 45465, "train_speed(iter/s)": 0.200719 }, { "acc": 0.75689716, "epoch": 1.1534753932014206, "grad_norm": 2.15625, "learning_rate": 4.148700374702533e-06, "loss": 0.98287468, "memory(GiB)": 369.42, "step": 45470, "train_speed(iter/s)": 0.20072 }, { "acc": 0.75918331, "epoch": 1.1536022323693556, "grad_norm": 1.9765625, "learning_rate": 4.147667080510841e-06, "loss": 0.9824563, "memory(GiB)": 369.42, "step": 45475, "train_speed(iter/s)": 0.200722 }, { "acc": 0.76155024, "epoch": 1.1537290715372908, "grad_norm": 1.8515625, "learning_rate": 4.146633823808557e-06, "loss": 0.97193336, "memory(GiB)": 369.42, "step": 45480, "train_speed(iter/s)": 0.200725 }, { "acc": 0.74088135, "epoch": 1.1538559107052258, "grad_norm": 2.3125, "learning_rate": 4.145600604641127e-06, "loss": 1.00529089, "memory(GiB)": 369.42, "step": 45485, "train_speed(iter/s)": 0.200728 }, { "acc": 0.75318041, "epoch": 1.1539827498731607, "grad_norm": 2.265625, "learning_rate": 4.1445674230539985e-06, "loss": 0.95168123, "memory(GiB)": 369.42, "step": 45490, "train_speed(iter/s)": 0.200731 }, { "acc": 0.76320696, "epoch": 1.154109589041096, "grad_norm": 2.34375, "learning_rate": 4.143534279092613e-06, "loss": 0.97435322, "memory(GiB)": 369.42, "step": 45495, "train_speed(iter/s)": 0.200734 }, { "acc": 0.74347887, "epoch": 1.154236428209031, "grad_norm": 2.515625, "learning_rate": 4.142501172802412e-06, "loss": 1.05786572, "memory(GiB)": 369.42, "step": 45500, "train_speed(iter/s)": 0.200732 }, { "acc": 0.74648161, "epoch": 1.154363267376966, "grad_norm": 2.8125, "learning_rate": 4.14146810422884e-06, "loss": 1.01636848, "memory(GiB)": 369.42, "step": 45505, "train_speed(iter/s)": 0.200734 }, { "acc": 0.75030184, "epoch": 1.1544901065449011, "grad_norm": 2.203125, "learning_rate": 4.140435073417335e-06, "loss": 0.94425545, "memory(GiB)": 369.42, "step": 45510, "train_speed(iter/s)": 0.200737 }, { "acc": 0.74908371, "epoch": 1.1546169457128361, "grad_norm": 1.9609375, "learning_rate": 4.139402080413331e-06, "loss": 0.98656492, "memory(GiB)": 369.42, "step": 45515, "train_speed(iter/s)": 0.200739 }, { "acc": 0.74995184, "epoch": 1.1547437848807711, "grad_norm": 2.171875, "learning_rate": 4.138369125262266e-06, "loss": 1.00320339, "memory(GiB)": 369.42, "step": 45520, "train_speed(iter/s)": 0.200742 }, { "acc": 0.74522943, "epoch": 1.1548706240487063, "grad_norm": 2.15625, "learning_rate": 4.137336208009574e-06, "loss": 1.01109505, "memory(GiB)": 369.42, "step": 45525, "train_speed(iter/s)": 0.200745 }, { "acc": 0.74879503, "epoch": 1.1549974632166413, "grad_norm": 2.125, "learning_rate": 4.136303328700688e-06, "loss": 1.0175045, "memory(GiB)": 369.42, "step": 45530, "train_speed(iter/s)": 0.200749 }, { "acc": 0.73307333, "epoch": 1.1551243023845763, "grad_norm": 2.234375, "learning_rate": 4.135270487381037e-06, "loss": 0.99747362, "memory(GiB)": 369.42, "step": 45535, "train_speed(iter/s)": 0.200748 }, { "acc": 0.76173067, "epoch": 1.1552511415525113, "grad_norm": 1.9375, "learning_rate": 4.13423768409605e-06, "loss": 0.94905682, "memory(GiB)": 369.42, "step": 45540, "train_speed(iter/s)": 0.200752 }, { "acc": 0.75141692, "epoch": 1.1553779807204465, "grad_norm": 1.9921875, "learning_rate": 4.133204918891155e-06, "loss": 0.9933445, "memory(GiB)": 369.42, "step": 45545, "train_speed(iter/s)": 0.200756 }, { "acc": 0.7614502, "epoch": 1.1555048198883815, "grad_norm": 2.140625, "learning_rate": 4.132172191811781e-06, "loss": 0.97459879, "memory(GiB)": 369.42, "step": 45550, "train_speed(iter/s)": 0.20076 }, { "acc": 0.75252318, "epoch": 1.1556316590563167, "grad_norm": 2.0, "learning_rate": 4.131139502903345e-06, "loss": 0.97934999, "memory(GiB)": 369.42, "step": 45555, "train_speed(iter/s)": 0.200763 }, { "acc": 0.74571471, "epoch": 1.1557584982242517, "grad_norm": 1.8515625, "learning_rate": 4.130106852211273e-06, "loss": 1.00541401, "memory(GiB)": 369.42, "step": 45560, "train_speed(iter/s)": 0.200767 }, { "acc": 0.75631571, "epoch": 1.1558853373921867, "grad_norm": 2.15625, "learning_rate": 4.129074239780986e-06, "loss": 0.95181999, "memory(GiB)": 369.42, "step": 45565, "train_speed(iter/s)": 0.200769 }, { "acc": 0.76188145, "epoch": 1.1560121765601217, "grad_norm": 2.296875, "learning_rate": 4.128041665657903e-06, "loss": 0.9970089, "memory(GiB)": 369.42, "step": 45570, "train_speed(iter/s)": 0.200773 }, { "acc": 0.75100141, "epoch": 1.1561390157280569, "grad_norm": 2.171875, "learning_rate": 4.127009129887441e-06, "loss": 0.97035923, "memory(GiB)": 369.42, "step": 45575, "train_speed(iter/s)": 0.200778 }, { "acc": 0.74503355, "epoch": 1.1562658548959919, "grad_norm": 2.296875, "learning_rate": 4.125976632515013e-06, "loss": 1.00966206, "memory(GiB)": 369.42, "step": 45580, "train_speed(iter/s)": 0.200782 }, { "acc": 0.75139465, "epoch": 1.1563926940639269, "grad_norm": 2.046875, "learning_rate": 4.124944173586036e-06, "loss": 0.99578171, "memory(GiB)": 369.42, "step": 45585, "train_speed(iter/s)": 0.200785 }, { "acc": 0.74443674, "epoch": 1.156519533231862, "grad_norm": 2.296875, "learning_rate": 4.123911753145922e-06, "loss": 1.01432743, "memory(GiB)": 369.42, "step": 45590, "train_speed(iter/s)": 0.200787 }, { "acc": 0.75007877, "epoch": 1.156646372399797, "grad_norm": 2.140625, "learning_rate": 4.12287937124008e-06, "loss": 1.00337563, "memory(GiB)": 369.42, "step": 45595, "train_speed(iter/s)": 0.200791 }, { "acc": 0.74863796, "epoch": 1.156773211567732, "grad_norm": 2.5, "learning_rate": 4.121847027913918e-06, "loss": 1.00046196, "memory(GiB)": 369.42, "step": 45600, "train_speed(iter/s)": 0.200795 }, { "acc": 0.75792894, "epoch": 1.1569000507356673, "grad_norm": 2.453125, "learning_rate": 4.1208147232128456e-06, "loss": 0.9830843, "memory(GiB)": 369.42, "step": 45605, "train_speed(iter/s)": 0.200799 }, { "acc": 0.73839254, "epoch": 1.1570268899036023, "grad_norm": 2.203125, "learning_rate": 4.119782457182267e-06, "loss": 1.00869007, "memory(GiB)": 369.42, "step": 45610, "train_speed(iter/s)": 0.2008 }, { "acc": 0.76368332, "epoch": 1.1571537290715372, "grad_norm": 2.09375, "learning_rate": 4.118750229867585e-06, "loss": 0.95794678, "memory(GiB)": 369.42, "step": 45615, "train_speed(iter/s)": 0.200803 }, { "acc": 0.74294195, "epoch": 1.1572805682394725, "grad_norm": 2.484375, "learning_rate": 4.117718041314204e-06, "loss": 1.00558529, "memory(GiB)": 369.42, "step": 45620, "train_speed(iter/s)": 0.200807 }, { "acc": 0.74154387, "epoch": 1.1574074074074074, "grad_norm": 2.0, "learning_rate": 4.11668589156752e-06, "loss": 0.9991642, "memory(GiB)": 369.42, "step": 45625, "train_speed(iter/s)": 0.20081 }, { "acc": 0.76815963, "epoch": 1.1575342465753424, "grad_norm": 2.09375, "learning_rate": 4.115653780672937e-06, "loss": 0.92887897, "memory(GiB)": 369.42, "step": 45630, "train_speed(iter/s)": 0.200814 }, { "acc": 0.75124621, "epoch": 1.1576610857432774, "grad_norm": 2.1875, "learning_rate": 4.1146217086758475e-06, "loss": 0.98714409, "memory(GiB)": 369.42, "step": 45635, "train_speed(iter/s)": 0.200817 }, { "acc": 0.7656642, "epoch": 1.1577879249112126, "grad_norm": 2.140625, "learning_rate": 4.113589675621649e-06, "loss": 0.8994276, "memory(GiB)": 369.42, "step": 45640, "train_speed(iter/s)": 0.20082 }, { "acc": 0.76002054, "epoch": 1.1579147640791476, "grad_norm": 1.8125, "learning_rate": 4.112557681555733e-06, "loss": 0.94770412, "memory(GiB)": 369.42, "step": 45645, "train_speed(iter/s)": 0.200824 }, { "acc": 0.75343933, "epoch": 1.1580416032470826, "grad_norm": 2.140625, "learning_rate": 4.111525726523494e-06, "loss": 1.01410275, "memory(GiB)": 369.42, "step": 45650, "train_speed(iter/s)": 0.200828 }, { "acc": 0.76380539, "epoch": 1.1581684424150178, "grad_norm": 1.8984375, "learning_rate": 4.110493810570319e-06, "loss": 0.92287207, "memory(GiB)": 369.42, "step": 45655, "train_speed(iter/s)": 0.200832 }, { "acc": 0.75015945, "epoch": 1.1582952815829528, "grad_norm": 2.28125, "learning_rate": 4.109461933741598e-06, "loss": 0.99688206, "memory(GiB)": 369.42, "step": 45660, "train_speed(iter/s)": 0.200836 }, { "acc": 0.74357109, "epoch": 1.1584221207508878, "grad_norm": 2.375, "learning_rate": 4.108430096082716e-06, "loss": 1.03302326, "memory(GiB)": 369.42, "step": 45665, "train_speed(iter/s)": 0.20084 }, { "acc": 0.74453821, "epoch": 1.158548959918823, "grad_norm": 2.0, "learning_rate": 4.107398297639062e-06, "loss": 1.02182713, "memory(GiB)": 369.42, "step": 45670, "train_speed(iter/s)": 0.200841 }, { "acc": 0.74794312, "epoch": 1.158675799086758, "grad_norm": 2.265625, "learning_rate": 4.106366538456013e-06, "loss": 0.99965839, "memory(GiB)": 369.42, "step": 45675, "train_speed(iter/s)": 0.200846 }, { "acc": 0.76611099, "epoch": 1.158802638254693, "grad_norm": 2.4375, "learning_rate": 4.105334818578954e-06, "loss": 0.95865021, "memory(GiB)": 369.42, "step": 45680, "train_speed(iter/s)": 0.20085 }, { "acc": 0.74957504, "epoch": 1.1589294774226282, "grad_norm": 2.28125, "learning_rate": 4.104303138053265e-06, "loss": 1.04420071, "memory(GiB)": 369.42, "step": 45685, "train_speed(iter/s)": 0.200855 }, { "acc": 0.77010717, "epoch": 1.1590563165905632, "grad_norm": 2.25, "learning_rate": 4.103271496924323e-06, "loss": 0.93017597, "memory(GiB)": 369.42, "step": 45690, "train_speed(iter/s)": 0.200859 }, { "acc": 0.75678654, "epoch": 1.1591831557584982, "grad_norm": 2.5625, "learning_rate": 4.102239895237503e-06, "loss": 0.99829273, "memory(GiB)": 369.42, "step": 45695, "train_speed(iter/s)": 0.200863 }, { "acc": 0.76486578, "epoch": 1.1593099949264332, "grad_norm": 2.234375, "learning_rate": 4.101208333038181e-06, "loss": 0.91270447, "memory(GiB)": 369.42, "step": 45700, "train_speed(iter/s)": 0.200866 }, { "acc": 0.74239502, "epoch": 1.1594368340943684, "grad_norm": 2.5625, "learning_rate": 4.1001768103717285e-06, "loss": 1.04038925, "memory(GiB)": 369.42, "step": 45705, "train_speed(iter/s)": 0.200869 }, { "acc": 0.74660745, "epoch": 1.1595636732623034, "grad_norm": 2.671875, "learning_rate": 4.09914532728352e-06, "loss": 0.92613134, "memory(GiB)": 369.42, "step": 45710, "train_speed(iter/s)": 0.200873 }, { "acc": 0.74590502, "epoch": 1.1596905124302386, "grad_norm": 2.46875, "learning_rate": 4.09811388381892e-06, "loss": 1.02801075, "memory(GiB)": 369.42, "step": 45715, "train_speed(iter/s)": 0.200874 }, { "acc": 0.76645021, "epoch": 1.1598173515981736, "grad_norm": 2.375, "learning_rate": 4.097082480023298e-06, "loss": 0.94981794, "memory(GiB)": 369.42, "step": 45720, "train_speed(iter/s)": 0.200879 }, { "acc": 0.74323092, "epoch": 1.1599441907661086, "grad_norm": 2.09375, "learning_rate": 4.09605111594202e-06, "loss": 1.01890945, "memory(GiB)": 369.42, "step": 45725, "train_speed(iter/s)": 0.200881 }, { "acc": 0.74608374, "epoch": 1.1600710299340435, "grad_norm": 1.921875, "learning_rate": 4.095019791620451e-06, "loss": 0.99874067, "memory(GiB)": 369.42, "step": 45730, "train_speed(iter/s)": 0.200883 }, { "acc": 0.75342941, "epoch": 1.1601978691019788, "grad_norm": 1.984375, "learning_rate": 4.093988507103951e-06, "loss": 0.96353226, "memory(GiB)": 369.42, "step": 45735, "train_speed(iter/s)": 0.200886 }, { "acc": 0.76126604, "epoch": 1.1603247082699137, "grad_norm": 2.34375, "learning_rate": 4.09295726243788e-06, "loss": 0.93427181, "memory(GiB)": 369.42, "step": 45740, "train_speed(iter/s)": 0.200888 }, { "acc": 0.7619257, "epoch": 1.1604515474378487, "grad_norm": 3.015625, "learning_rate": 4.091926057667601e-06, "loss": 1.01426334, "memory(GiB)": 369.42, "step": 45745, "train_speed(iter/s)": 0.200893 }, { "acc": 0.74832516, "epoch": 1.160578386605784, "grad_norm": 2.46875, "learning_rate": 4.0908948928384675e-06, "loss": 0.9438591, "memory(GiB)": 369.42, "step": 45750, "train_speed(iter/s)": 0.200897 }, { "acc": 0.75597658, "epoch": 1.160705225773719, "grad_norm": 2.4375, "learning_rate": 4.089863767995835e-06, "loss": 1.0208849, "memory(GiB)": 369.42, "step": 45755, "train_speed(iter/s)": 0.200901 }, { "acc": 0.74607663, "epoch": 1.160832064941654, "grad_norm": 2.109375, "learning_rate": 4.088832683185057e-06, "loss": 0.97384663, "memory(GiB)": 369.42, "step": 45760, "train_speed(iter/s)": 0.200904 }, { "acc": 0.74118996, "epoch": 1.1609589041095891, "grad_norm": 2.53125, "learning_rate": 4.087801638451485e-06, "loss": 1.02863703, "memory(GiB)": 369.42, "step": 45765, "train_speed(iter/s)": 0.200908 }, { "acc": 0.74534869, "epoch": 1.1610857432775241, "grad_norm": 1.96875, "learning_rate": 4.086770633840472e-06, "loss": 1.01564608, "memory(GiB)": 369.42, "step": 45770, "train_speed(iter/s)": 0.200909 }, { "acc": 0.74808946, "epoch": 1.161212582445459, "grad_norm": 2.6875, "learning_rate": 4.085739669397362e-06, "loss": 0.99696159, "memory(GiB)": 369.42, "step": 45775, "train_speed(iter/s)": 0.200913 }, { "acc": 0.7480649, "epoch": 1.1613394216133943, "grad_norm": 1.796875, "learning_rate": 4.084708745167504e-06, "loss": 0.96023884, "memory(GiB)": 369.42, "step": 45780, "train_speed(iter/s)": 0.200917 }, { "acc": 0.73089638, "epoch": 1.1614662607813293, "grad_norm": 2.265625, "learning_rate": 4.08367786119624e-06, "loss": 1.07571659, "memory(GiB)": 369.42, "step": 45785, "train_speed(iter/s)": 0.200919 }, { "acc": 0.76288767, "epoch": 1.1615930999492643, "grad_norm": 2.5, "learning_rate": 4.082647017528918e-06, "loss": 0.96520014, "memory(GiB)": 369.42, "step": 45790, "train_speed(iter/s)": 0.200921 }, { "acc": 0.75277872, "epoch": 1.1617199391171993, "grad_norm": 2.65625, "learning_rate": 4.081616214210874e-06, "loss": 0.98243732, "memory(GiB)": 369.42, "step": 45795, "train_speed(iter/s)": 0.200924 }, { "acc": 0.75591488, "epoch": 1.1618467782851345, "grad_norm": 1.8671875, "learning_rate": 4.0805854512874485e-06, "loss": 0.98900414, "memory(GiB)": 369.42, "step": 45800, "train_speed(iter/s)": 0.200928 }, { "acc": 0.75208454, "epoch": 1.1619736174530695, "grad_norm": 1.9453125, "learning_rate": 4.079554728803981e-06, "loss": 0.96852436, "memory(GiB)": 369.42, "step": 45805, "train_speed(iter/s)": 0.20093 }, { "acc": 0.76033945, "epoch": 1.1621004566210045, "grad_norm": 2.671875, "learning_rate": 4.078524046805806e-06, "loss": 0.97027302, "memory(GiB)": 369.42, "step": 45810, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74514875, "epoch": 1.1622272957889397, "grad_norm": 2.359375, "learning_rate": 4.0774934053382576e-06, "loss": 0.9896965, "memory(GiB)": 369.42, "step": 45815, "train_speed(iter/s)": 0.200937 }, { "acc": 0.75582561, "epoch": 1.1623541349568747, "grad_norm": 2.328125, "learning_rate": 4.076462804446667e-06, "loss": 0.99246874, "memory(GiB)": 369.42, "step": 45820, "train_speed(iter/s)": 0.200941 }, { "acc": 0.75486751, "epoch": 1.1624809741248097, "grad_norm": 2.421875, "learning_rate": 4.0754322441763654e-06, "loss": 1.01425829, "memory(GiB)": 369.42, "step": 45825, "train_speed(iter/s)": 0.200944 }, { "acc": 0.74781475, "epoch": 1.1626078132927449, "grad_norm": 1.8984375, "learning_rate": 4.0744017245726834e-06, "loss": 1.02209911, "memory(GiB)": 369.42, "step": 45830, "train_speed(iter/s)": 0.200945 }, { "acc": 0.74512172, "epoch": 1.1627346524606799, "grad_norm": 2.453125, "learning_rate": 4.073371245680944e-06, "loss": 1.02167578, "memory(GiB)": 369.42, "step": 45835, "train_speed(iter/s)": 0.200945 }, { "acc": 0.74416175, "epoch": 1.1628614916286149, "grad_norm": 2.078125, "learning_rate": 4.0723408075464754e-06, "loss": 1.02884636, "memory(GiB)": 369.42, "step": 45840, "train_speed(iter/s)": 0.200949 }, { "acc": 0.7428493, "epoch": 1.16298833079655, "grad_norm": 2.203125, "learning_rate": 4.071310410214598e-06, "loss": 1.00936975, "memory(GiB)": 369.42, "step": 45845, "train_speed(iter/s)": 0.200953 }, { "acc": 0.74811916, "epoch": 1.163115169964485, "grad_norm": 1.78125, "learning_rate": 4.070280053730639e-06, "loss": 1.0610651, "memory(GiB)": 369.42, "step": 45850, "train_speed(iter/s)": 0.200955 }, { "acc": 0.75129213, "epoch": 1.16324200913242, "grad_norm": 2.359375, "learning_rate": 4.069249738139911e-06, "loss": 0.98436375, "memory(GiB)": 369.42, "step": 45855, "train_speed(iter/s)": 0.200958 }, { "acc": 0.75310316, "epoch": 1.163368848300355, "grad_norm": 2.625, "learning_rate": 4.068219463487736e-06, "loss": 0.97827635, "memory(GiB)": 369.42, "step": 45860, "train_speed(iter/s)": 0.20096 }, { "acc": 0.75408707, "epoch": 1.1634956874682902, "grad_norm": 2.078125, "learning_rate": 4.0671892298194286e-06, "loss": 1.00041447, "memory(GiB)": 369.42, "step": 45865, "train_speed(iter/s)": 0.200961 }, { "acc": 0.7523097, "epoch": 1.1636225266362252, "grad_norm": 1.953125, "learning_rate": 4.066159037180304e-06, "loss": 0.99548454, "memory(GiB)": 369.42, "step": 45870, "train_speed(iter/s)": 0.200963 }, { "acc": 0.74614677, "epoch": 1.1637493658041604, "grad_norm": 2.140625, "learning_rate": 4.065128885615674e-06, "loss": 1.00838194, "memory(GiB)": 369.42, "step": 45875, "train_speed(iter/s)": 0.200967 }, { "acc": 0.73724194, "epoch": 1.1638762049720954, "grad_norm": 2.015625, "learning_rate": 4.064098775170849e-06, "loss": 1.00697842, "memory(GiB)": 369.42, "step": 45880, "train_speed(iter/s)": 0.20097 }, { "acc": 0.74973421, "epoch": 1.1640030441400304, "grad_norm": 2.1875, "learning_rate": 4.063068705891139e-06, "loss": 0.99893169, "memory(GiB)": 369.42, "step": 45885, "train_speed(iter/s)": 0.200972 }, { "acc": 0.75109034, "epoch": 1.1641298833079654, "grad_norm": 1.8203125, "learning_rate": 4.062038677821852e-06, "loss": 1.00098581, "memory(GiB)": 369.42, "step": 45890, "train_speed(iter/s)": 0.200973 }, { "acc": 0.7557312, "epoch": 1.1642567224759006, "grad_norm": 2.265625, "learning_rate": 4.061008691008289e-06, "loss": 0.98370953, "memory(GiB)": 369.42, "step": 45895, "train_speed(iter/s)": 0.200976 }, { "acc": 0.76209803, "epoch": 1.1643835616438356, "grad_norm": 2.015625, "learning_rate": 4.059978745495757e-06, "loss": 0.95467892, "memory(GiB)": 369.42, "step": 45900, "train_speed(iter/s)": 0.200978 }, { "acc": 0.75345263, "epoch": 1.1645104008117706, "grad_norm": 1.78125, "learning_rate": 4.058948841329557e-06, "loss": 0.98209095, "memory(GiB)": 369.42, "step": 45905, "train_speed(iter/s)": 0.20098 }, { "acc": 0.747646, "epoch": 1.1646372399797058, "grad_norm": 2.078125, "learning_rate": 4.057918978554989e-06, "loss": 1.02009668, "memory(GiB)": 369.42, "step": 45910, "train_speed(iter/s)": 0.200981 }, { "acc": 0.75700784, "epoch": 1.1647640791476408, "grad_norm": 2.375, "learning_rate": 4.056889157217348e-06, "loss": 0.97282333, "memory(GiB)": 369.42, "step": 45915, "train_speed(iter/s)": 0.200984 }, { "acc": 0.73869209, "epoch": 1.1648909183155758, "grad_norm": 2.359375, "learning_rate": 4.0558593773619346e-06, "loss": 1.04537048, "memory(GiB)": 369.42, "step": 45920, "train_speed(iter/s)": 0.200987 }, { "acc": 0.74882326, "epoch": 1.165017757483511, "grad_norm": 2.4375, "learning_rate": 4.05482963903404e-06, "loss": 0.96598759, "memory(GiB)": 369.42, "step": 45925, "train_speed(iter/s)": 0.200991 }, { "acc": 0.74852152, "epoch": 1.165144596651446, "grad_norm": 1.859375, "learning_rate": 4.05379994227896e-06, "loss": 1.05322943, "memory(GiB)": 369.42, "step": 45930, "train_speed(iter/s)": 0.200995 }, { "acc": 0.73651552, "epoch": 1.165271435819381, "grad_norm": 2.3125, "learning_rate": 4.052770287141981e-06, "loss": 1.01560192, "memory(GiB)": 369.42, "step": 45935, "train_speed(iter/s)": 0.200998 }, { "acc": 0.76122913, "epoch": 1.1653982749873162, "grad_norm": 2.1875, "learning_rate": 4.051740673668393e-06, "loss": 0.91663952, "memory(GiB)": 369.42, "step": 45940, "train_speed(iter/s)": 0.201002 }, { "acc": 0.74810572, "epoch": 1.1655251141552512, "grad_norm": 2.171875, "learning_rate": 4.0507111019034855e-06, "loss": 0.9771553, "memory(GiB)": 369.42, "step": 45945, "train_speed(iter/s)": 0.201005 }, { "acc": 0.75161781, "epoch": 1.1656519533231862, "grad_norm": 2.3125, "learning_rate": 4.049681571892543e-06, "loss": 0.98328323, "memory(GiB)": 369.42, "step": 45950, "train_speed(iter/s)": 0.201007 }, { "acc": 0.75301919, "epoch": 1.1657787924911212, "grad_norm": 2.125, "learning_rate": 4.048652083680847e-06, "loss": 0.98684969, "memory(GiB)": 369.42, "step": 45955, "train_speed(iter/s)": 0.20101 }, { "acc": 0.73099775, "epoch": 1.1659056316590564, "grad_norm": 1.984375, "learning_rate": 4.047622637313678e-06, "loss": 1.10826855, "memory(GiB)": 369.42, "step": 45960, "train_speed(iter/s)": 0.201014 }, { "acc": 0.76200933, "epoch": 1.1660324708269914, "grad_norm": 2.046875, "learning_rate": 4.046593232836319e-06, "loss": 0.95748816, "memory(GiB)": 369.42, "step": 45965, "train_speed(iter/s)": 0.201018 }, { "acc": 0.7591342, "epoch": 1.1661593099949263, "grad_norm": 2.265625, "learning_rate": 4.045563870294047e-06, "loss": 1.01118526, "memory(GiB)": 369.42, "step": 45970, "train_speed(iter/s)": 0.201021 }, { "acc": 0.75453987, "epoch": 1.1662861491628616, "grad_norm": 2.296875, "learning_rate": 4.044534549732135e-06, "loss": 0.92499523, "memory(GiB)": 369.42, "step": 45975, "train_speed(iter/s)": 0.201024 }, { "acc": 0.7675106, "epoch": 1.1664129883307965, "grad_norm": 2.234375, "learning_rate": 4.043505271195861e-06, "loss": 0.91168823, "memory(GiB)": 369.42, "step": 45980, "train_speed(iter/s)": 0.201027 }, { "acc": 0.75350022, "epoch": 1.1665398274987315, "grad_norm": 2.59375, "learning_rate": 4.042476034730494e-06, "loss": 1.02053823, "memory(GiB)": 369.42, "step": 45985, "train_speed(iter/s)": 0.20103 }, { "acc": 0.73978386, "epoch": 1.1666666666666667, "grad_norm": 2.265625, "learning_rate": 4.041446840381309e-06, "loss": 1.04320107, "memory(GiB)": 369.42, "step": 45990, "train_speed(iter/s)": 0.201034 }, { "acc": 0.76008902, "epoch": 1.1667935058346017, "grad_norm": 2.140625, "learning_rate": 4.040417688193569e-06, "loss": 0.94981785, "memory(GiB)": 369.42, "step": 45995, "train_speed(iter/s)": 0.201035 }, { "acc": 0.75310097, "epoch": 1.1669203450025367, "grad_norm": 2.296875, "learning_rate": 4.039388578212545e-06, "loss": 0.97165842, "memory(GiB)": 369.42, "step": 46000, "train_speed(iter/s)": 0.201038 }, { "epoch": 1.1669203450025367, "eval_acc": 0.7378054766787666, "eval_loss": 0.9700219631195068, "eval_runtime": 385.5376, "eval_samples_per_second": 16.522, "eval_steps_per_second": 8.261, "step": 46000 }, { "acc": 0.7520133, "epoch": 1.167047184170472, "grad_norm": 2.25, "learning_rate": 4.0383595104834975e-06, "loss": 0.99242477, "memory(GiB)": 369.42, "step": 46005, "train_speed(iter/s)": 0.200413 }, { "acc": 0.74793663, "epoch": 1.167174023338407, "grad_norm": 2.546875, "learning_rate": 4.037330485051695e-06, "loss": 0.99445772, "memory(GiB)": 369.42, "step": 46010, "train_speed(iter/s)": 0.200418 }, { "acc": 0.74414206, "epoch": 1.167300862506342, "grad_norm": 1.78125, "learning_rate": 4.0363015019623955e-06, "loss": 1.02449322, "memory(GiB)": 369.42, "step": 46015, "train_speed(iter/s)": 0.200422 }, { "acc": 0.77309365, "epoch": 1.167427701674277, "grad_norm": 1.8828125, "learning_rate": 4.0352725612608565e-06, "loss": 0.94441547, "memory(GiB)": 369.42, "step": 46020, "train_speed(iter/s)": 0.200426 }, { "acc": 0.75140686, "epoch": 1.167554540842212, "grad_norm": 2.421875, "learning_rate": 4.0342436629923385e-06, "loss": 0.98272448, "memory(GiB)": 369.42, "step": 46025, "train_speed(iter/s)": 0.200429 }, { "acc": 0.75016565, "epoch": 1.167681380010147, "grad_norm": 2.328125, "learning_rate": 4.033214807202098e-06, "loss": 1.00226412, "memory(GiB)": 369.42, "step": 46030, "train_speed(iter/s)": 0.200432 }, { "acc": 0.74140449, "epoch": 1.1678082191780823, "grad_norm": 2.28125, "learning_rate": 4.032185993935385e-06, "loss": 1.01990833, "memory(GiB)": 369.42, "step": 46035, "train_speed(iter/s)": 0.200436 }, { "acc": 0.75636482, "epoch": 1.1679350583460173, "grad_norm": 2.3125, "learning_rate": 4.031157223237452e-06, "loss": 0.91958637, "memory(GiB)": 369.42, "step": 46040, "train_speed(iter/s)": 0.200438 }, { "acc": 0.76245699, "epoch": 1.1680618975139523, "grad_norm": 2.109375, "learning_rate": 4.0301284951535504e-06, "loss": 0.92376919, "memory(GiB)": 369.42, "step": 46045, "train_speed(iter/s)": 0.200441 }, { "acc": 0.76615667, "epoch": 1.1681887366818873, "grad_norm": 2.015625, "learning_rate": 4.029099809728929e-06, "loss": 0.98901234, "memory(GiB)": 369.42, "step": 46050, "train_speed(iter/s)": 0.200443 }, { "acc": 0.74813881, "epoch": 1.1683155758498225, "grad_norm": 2.4375, "learning_rate": 4.028071167008831e-06, "loss": 1.05552769, "memory(GiB)": 369.42, "step": 46055, "train_speed(iter/s)": 0.200447 }, { "acc": 0.74992161, "epoch": 1.1684424150177575, "grad_norm": 2.234375, "learning_rate": 4.027042567038503e-06, "loss": 0.95031986, "memory(GiB)": 369.42, "step": 46060, "train_speed(iter/s)": 0.200451 }, { "acc": 0.75499458, "epoch": 1.1685692541856925, "grad_norm": 1.921875, "learning_rate": 4.026014009863186e-06, "loss": 1.03167, "memory(GiB)": 369.42, "step": 46065, "train_speed(iter/s)": 0.200453 }, { "acc": 0.75122056, "epoch": 1.1686960933536277, "grad_norm": 2.390625, "learning_rate": 4.024985495528124e-06, "loss": 0.98274956, "memory(GiB)": 369.42, "step": 46070, "train_speed(iter/s)": 0.200456 }, { "acc": 0.75181522, "epoch": 1.1688229325215627, "grad_norm": 2.1875, "learning_rate": 4.023957024078552e-06, "loss": 0.9872591, "memory(GiB)": 369.42, "step": 46075, "train_speed(iter/s)": 0.200459 }, { "acc": 0.75328808, "epoch": 1.1689497716894977, "grad_norm": 2.4375, "learning_rate": 4.022928595559707e-06, "loss": 0.96242142, "memory(GiB)": 369.42, "step": 46080, "train_speed(iter/s)": 0.200461 }, { "acc": 0.75344763, "epoch": 1.1690766108574329, "grad_norm": 1.984375, "learning_rate": 4.021900210016824e-06, "loss": 0.98783092, "memory(GiB)": 369.42, "step": 46085, "train_speed(iter/s)": 0.200462 }, { "acc": 0.76323333, "epoch": 1.1692034500253679, "grad_norm": 1.984375, "learning_rate": 4.020871867495139e-06, "loss": 0.92334251, "memory(GiB)": 369.42, "step": 46090, "train_speed(iter/s)": 0.200465 }, { "acc": 0.75745964, "epoch": 1.1693302891933028, "grad_norm": 2.234375, "learning_rate": 4.01984356803988e-06, "loss": 0.95915985, "memory(GiB)": 369.42, "step": 46095, "train_speed(iter/s)": 0.200468 }, { "acc": 0.76152153, "epoch": 1.169457128361238, "grad_norm": 1.96875, "learning_rate": 4.018815311696274e-06, "loss": 0.97286959, "memory(GiB)": 369.42, "step": 46100, "train_speed(iter/s)": 0.200472 }, { "acc": 0.74333415, "epoch": 1.169583967529173, "grad_norm": 2.4375, "learning_rate": 4.017787098509555e-06, "loss": 1.00914593, "memory(GiB)": 369.42, "step": 46105, "train_speed(iter/s)": 0.200474 }, { "acc": 0.74499655, "epoch": 1.169710806697108, "grad_norm": 2.09375, "learning_rate": 4.016758928524944e-06, "loss": 0.98427944, "memory(GiB)": 369.42, "step": 46110, "train_speed(iter/s)": 0.200476 }, { "acc": 0.75958815, "epoch": 1.169837645865043, "grad_norm": 2.140625, "learning_rate": 4.015730801787663e-06, "loss": 0.94155054, "memory(GiB)": 369.42, "step": 46115, "train_speed(iter/s)": 0.20048 }, { "acc": 0.74507084, "epoch": 1.1699644850329782, "grad_norm": 2.09375, "learning_rate": 4.014702718342938e-06, "loss": 1.00762653, "memory(GiB)": 369.42, "step": 46120, "train_speed(iter/s)": 0.200484 }, { "acc": 0.74052634, "epoch": 1.1700913242009132, "grad_norm": 2.0, "learning_rate": 4.013674678235985e-06, "loss": 0.98571892, "memory(GiB)": 369.42, "step": 46125, "train_speed(iter/s)": 0.200487 }, { "acc": 0.74774647, "epoch": 1.1702181633688482, "grad_norm": 2.0625, "learning_rate": 4.012646681512026e-06, "loss": 0.9802021, "memory(GiB)": 369.42, "step": 46130, "train_speed(iter/s)": 0.20049 }, { "acc": 0.74867339, "epoch": 1.1703450025367834, "grad_norm": 2.171875, "learning_rate": 4.011618728216271e-06, "loss": 1.03233309, "memory(GiB)": 369.42, "step": 46135, "train_speed(iter/s)": 0.200494 }, { "acc": 0.74810596, "epoch": 1.1704718417047184, "grad_norm": 2.1875, "learning_rate": 4.010590818393938e-06, "loss": 1.03236485, "memory(GiB)": 369.42, "step": 46140, "train_speed(iter/s)": 0.200498 }, { "acc": 0.7671783, "epoch": 1.1705986808726534, "grad_norm": 2.25, "learning_rate": 4.009562952090238e-06, "loss": 0.97560682, "memory(GiB)": 369.42, "step": 46145, "train_speed(iter/s)": 0.2005 }, { "acc": 0.75582976, "epoch": 1.1707255200405886, "grad_norm": 2.078125, "learning_rate": 4.0085351293503825e-06, "loss": 0.91707573, "memory(GiB)": 369.42, "step": 46150, "train_speed(iter/s)": 0.200504 }, { "acc": 0.73703566, "epoch": 1.1708523592085236, "grad_norm": 1.8359375, "learning_rate": 4.007507350219578e-06, "loss": 1.04161873, "memory(GiB)": 369.42, "step": 46155, "train_speed(iter/s)": 0.200507 }, { "acc": 0.7556026, "epoch": 1.1709791983764586, "grad_norm": 2.265625, "learning_rate": 4.0064796147430305e-06, "loss": 0.9685771, "memory(GiB)": 369.42, "step": 46160, "train_speed(iter/s)": 0.20051 }, { "acc": 0.76365504, "epoch": 1.1711060375443938, "grad_norm": 2.0, "learning_rate": 4.005451922965946e-06, "loss": 0.98795328, "memory(GiB)": 369.42, "step": 46165, "train_speed(iter/s)": 0.200514 }, { "acc": 0.75580683, "epoch": 1.1712328767123288, "grad_norm": 1.96875, "learning_rate": 4.0044242749335285e-06, "loss": 0.9785759, "memory(GiB)": 369.42, "step": 46170, "train_speed(iter/s)": 0.200517 }, { "acc": 0.74349418, "epoch": 1.1713597158802638, "grad_norm": 2.0625, "learning_rate": 4.0033966706909735e-06, "loss": 1.05208607, "memory(GiB)": 369.42, "step": 46175, "train_speed(iter/s)": 0.20052 }, { "acc": 0.75686793, "epoch": 1.1714865550481988, "grad_norm": 2.046875, "learning_rate": 4.002369110283482e-06, "loss": 0.96586151, "memory(GiB)": 369.42, "step": 46180, "train_speed(iter/s)": 0.20052 }, { "acc": 0.73899422, "epoch": 1.171613394216134, "grad_norm": 1.953125, "learning_rate": 4.001341593756253e-06, "loss": 0.99285011, "memory(GiB)": 369.42, "step": 46185, "train_speed(iter/s)": 0.200522 }, { "acc": 0.7613853, "epoch": 1.171740233384069, "grad_norm": 2.765625, "learning_rate": 4.00031412115448e-06, "loss": 0.98621092, "memory(GiB)": 369.42, "step": 46190, "train_speed(iter/s)": 0.200526 }, { "acc": 0.76031084, "epoch": 1.1718670725520042, "grad_norm": 2.109375, "learning_rate": 3.999286692523352e-06, "loss": 0.89309111, "memory(GiB)": 369.42, "step": 46195, "train_speed(iter/s)": 0.200531 }, { "acc": 0.76293344, "epoch": 1.1719939117199392, "grad_norm": 1.90625, "learning_rate": 3.998259307908065e-06, "loss": 0.92798071, "memory(GiB)": 369.42, "step": 46200, "train_speed(iter/s)": 0.200535 }, { "acc": 0.75007691, "epoch": 1.1721207508878742, "grad_norm": 2.59375, "learning_rate": 3.997231967353806e-06, "loss": 0.96677723, "memory(GiB)": 369.42, "step": 46205, "train_speed(iter/s)": 0.20054 }, { "acc": 0.76088352, "epoch": 1.1722475900558091, "grad_norm": 1.9921875, "learning_rate": 3.996204670905765e-06, "loss": 0.97290955, "memory(GiB)": 369.42, "step": 46210, "train_speed(iter/s)": 0.200543 }, { "acc": 0.76286135, "epoch": 1.1723744292237444, "grad_norm": 2.21875, "learning_rate": 3.9951774186091195e-06, "loss": 0.97996702, "memory(GiB)": 369.42, "step": 46215, "train_speed(iter/s)": 0.200548 }, { "acc": 0.7693552, "epoch": 1.1725012683916793, "grad_norm": 2.546875, "learning_rate": 3.9941502105090594e-06, "loss": 0.97786713, "memory(GiB)": 369.42, "step": 46220, "train_speed(iter/s)": 0.200549 }, { "acc": 0.76797867, "epoch": 1.1726281075596143, "grad_norm": 1.96875, "learning_rate": 3.9931230466507634e-06, "loss": 0.9110158, "memory(GiB)": 369.42, "step": 46225, "train_speed(iter/s)": 0.200551 }, { "acc": 0.75438604, "epoch": 1.1727549467275495, "grad_norm": 2.03125, "learning_rate": 3.992095927079412e-06, "loss": 1.03062706, "memory(GiB)": 369.42, "step": 46230, "train_speed(iter/s)": 0.200555 }, { "acc": 0.76130805, "epoch": 1.1728817858954845, "grad_norm": 2.5, "learning_rate": 3.991068851840182e-06, "loss": 0.94548626, "memory(GiB)": 369.42, "step": 46235, "train_speed(iter/s)": 0.200558 }, { "acc": 0.77150803, "epoch": 1.1730086250634195, "grad_norm": 2.328125, "learning_rate": 3.990041820978246e-06, "loss": 0.91499929, "memory(GiB)": 369.42, "step": 46240, "train_speed(iter/s)": 0.20056 }, { "acc": 0.76352291, "epoch": 1.1731354642313547, "grad_norm": 2.5, "learning_rate": 3.989014834538782e-06, "loss": 0.93207884, "memory(GiB)": 369.42, "step": 46245, "train_speed(iter/s)": 0.200563 }, { "acc": 0.75865293, "epoch": 1.1732623033992897, "grad_norm": 2.625, "learning_rate": 3.987987892566959e-06, "loss": 0.97049122, "memory(GiB)": 369.42, "step": 46250, "train_speed(iter/s)": 0.200566 }, { "acc": 0.76375008, "epoch": 1.1733891425672247, "grad_norm": 1.859375, "learning_rate": 3.986960995107948e-06, "loss": 0.96217728, "memory(GiB)": 369.42, "step": 46255, "train_speed(iter/s)": 0.200568 }, { "acc": 0.76470995, "epoch": 1.17351598173516, "grad_norm": 2.109375, "learning_rate": 3.985934142206912e-06, "loss": 0.95307598, "memory(GiB)": 369.42, "step": 46260, "train_speed(iter/s)": 0.200571 }, { "acc": 0.75603614, "epoch": 1.173642820903095, "grad_norm": 2.078125, "learning_rate": 3.984907333909022e-06, "loss": 0.9873724, "memory(GiB)": 369.42, "step": 46265, "train_speed(iter/s)": 0.200573 }, { "acc": 0.74130435, "epoch": 1.17376966007103, "grad_norm": 2.265625, "learning_rate": 3.983880570259441e-06, "loss": 1.01776962, "memory(GiB)": 369.42, "step": 46270, "train_speed(iter/s)": 0.200575 }, { "acc": 0.75611782, "epoch": 1.1738964992389649, "grad_norm": 2.28125, "learning_rate": 3.982853851303327e-06, "loss": 0.94368305, "memory(GiB)": 369.42, "step": 46275, "train_speed(iter/s)": 0.200579 }, { "acc": 0.74167061, "epoch": 1.1740233384069, "grad_norm": 2.5, "learning_rate": 3.981827177085842e-06, "loss": 1.01227798, "memory(GiB)": 369.42, "step": 46280, "train_speed(iter/s)": 0.200581 }, { "acc": 0.7609839, "epoch": 1.174150177574835, "grad_norm": 2.40625, "learning_rate": 3.980800547652143e-06, "loss": 0.97922096, "memory(GiB)": 369.42, "step": 46285, "train_speed(iter/s)": 0.200586 }, { "acc": 0.73727932, "epoch": 1.17427701674277, "grad_norm": 2.140625, "learning_rate": 3.979773963047388e-06, "loss": 1.09654617, "memory(GiB)": 369.42, "step": 46290, "train_speed(iter/s)": 0.200591 }, { "acc": 0.74780264, "epoch": 1.1744038559107053, "grad_norm": 2.09375, "learning_rate": 3.978747423316729e-06, "loss": 0.96859198, "memory(GiB)": 369.42, "step": 46295, "train_speed(iter/s)": 0.200593 }, { "acc": 0.74494457, "epoch": 1.1745306950786403, "grad_norm": 1.9296875, "learning_rate": 3.977720928505317e-06, "loss": 1.02411003, "memory(GiB)": 369.42, "step": 46300, "train_speed(iter/s)": 0.200597 }, { "acc": 0.74792867, "epoch": 1.1746575342465753, "grad_norm": 2.84375, "learning_rate": 3.976694478658301e-06, "loss": 1.01595039, "memory(GiB)": 369.42, "step": 46305, "train_speed(iter/s)": 0.2006 }, { "acc": 0.75765638, "epoch": 1.1747843734145105, "grad_norm": 1.9453125, "learning_rate": 3.975668073820834e-06, "loss": 0.94759216, "memory(GiB)": 369.42, "step": 46310, "train_speed(iter/s)": 0.200603 }, { "acc": 0.77470064, "epoch": 1.1749112125824455, "grad_norm": 2.484375, "learning_rate": 3.9746417140380576e-06, "loss": 0.87674398, "memory(GiB)": 369.42, "step": 46315, "train_speed(iter/s)": 0.200607 }, { "acc": 0.74681306, "epoch": 1.1750380517503805, "grad_norm": 2.5625, "learning_rate": 3.973615399355114e-06, "loss": 1.00652151, "memory(GiB)": 369.42, "step": 46320, "train_speed(iter/s)": 0.200608 }, { "acc": 0.74868126, "epoch": 1.1751648909183157, "grad_norm": 2.0625, "learning_rate": 3.97258912981715e-06, "loss": 1.03616695, "memory(GiB)": 369.42, "step": 46325, "train_speed(iter/s)": 0.200611 }, { "acc": 0.75749121, "epoch": 1.1752917300862507, "grad_norm": 2.234375, "learning_rate": 3.9715629054693035e-06, "loss": 0.95466347, "memory(GiB)": 369.42, "step": 46330, "train_speed(iter/s)": 0.200614 }, { "acc": 0.76151285, "epoch": 1.1754185692541856, "grad_norm": 2.25, "learning_rate": 3.970536726356711e-06, "loss": 0.96024275, "memory(GiB)": 369.42, "step": 46335, "train_speed(iter/s)": 0.200614 }, { "acc": 0.74982429, "epoch": 1.1755454084221206, "grad_norm": 1.8671875, "learning_rate": 3.969510592524509e-06, "loss": 1.00128651, "memory(GiB)": 369.42, "step": 46340, "train_speed(iter/s)": 0.200616 }, { "acc": 0.75211258, "epoch": 1.1756722475900558, "grad_norm": 1.890625, "learning_rate": 3.968484504017833e-06, "loss": 1.00652065, "memory(GiB)": 369.42, "step": 46345, "train_speed(iter/s)": 0.200618 }, { "acc": 0.76426344, "epoch": 1.1757990867579908, "grad_norm": 2.5625, "learning_rate": 3.967458460881815e-06, "loss": 0.94676571, "memory(GiB)": 369.42, "step": 46350, "train_speed(iter/s)": 0.200621 }, { "acc": 0.76105676, "epoch": 1.175925925925926, "grad_norm": 2.0625, "learning_rate": 3.96643246316158e-06, "loss": 0.95115204, "memory(GiB)": 369.42, "step": 46355, "train_speed(iter/s)": 0.200624 }, { "acc": 0.76024384, "epoch": 1.176052765093861, "grad_norm": 2.015625, "learning_rate": 3.965406510902263e-06, "loss": 0.94693871, "memory(GiB)": 369.42, "step": 46360, "train_speed(iter/s)": 0.200626 }, { "acc": 0.7649447, "epoch": 1.176179604261796, "grad_norm": 2.234375, "learning_rate": 3.9643806041489855e-06, "loss": 0.94790993, "memory(GiB)": 369.42, "step": 46365, "train_speed(iter/s)": 0.200629 }, { "acc": 0.7482007, "epoch": 1.176306443429731, "grad_norm": 2.0625, "learning_rate": 3.963354742946874e-06, "loss": 0.97538376, "memory(GiB)": 369.42, "step": 46370, "train_speed(iter/s)": 0.200627 }, { "acc": 0.75225925, "epoch": 1.1764332825976662, "grad_norm": 1.78125, "learning_rate": 3.962328927341048e-06, "loss": 1.00223999, "memory(GiB)": 369.42, "step": 46375, "train_speed(iter/s)": 0.20063 }, { "acc": 0.76815825, "epoch": 1.1765601217656012, "grad_norm": 2.640625, "learning_rate": 3.961303157376628e-06, "loss": 0.94447193, "memory(GiB)": 369.42, "step": 46380, "train_speed(iter/s)": 0.200635 }, { "acc": 0.75825014, "epoch": 1.1766869609335362, "grad_norm": 2.40625, "learning_rate": 3.960277433098734e-06, "loss": 0.95590115, "memory(GiB)": 369.42, "step": 46385, "train_speed(iter/s)": 0.20064 }, { "acc": 0.74562006, "epoch": 1.1768138001014714, "grad_norm": 2.453125, "learning_rate": 3.959251754552481e-06, "loss": 1.05494556, "memory(GiB)": 369.42, "step": 46390, "train_speed(iter/s)": 0.200644 }, { "acc": 0.73831186, "epoch": 1.1769406392694064, "grad_norm": 1.953125, "learning_rate": 3.958226121782982e-06, "loss": 1.03037529, "memory(GiB)": 369.42, "step": 46395, "train_speed(iter/s)": 0.200647 }, { "acc": 0.75086699, "epoch": 1.1770674784373414, "grad_norm": 2.265625, "learning_rate": 3.9572005348353486e-06, "loss": 0.94820967, "memory(GiB)": 369.42, "step": 46400, "train_speed(iter/s)": 0.200648 }, { "acc": 0.74091578, "epoch": 1.1771943176052766, "grad_norm": 2.546875, "learning_rate": 3.956174993754691e-06, "loss": 1.00950079, "memory(GiB)": 369.42, "step": 46405, "train_speed(iter/s)": 0.200653 }, { "acc": 0.77590618, "epoch": 1.1773211567732116, "grad_norm": 1.796875, "learning_rate": 3.955149498586119e-06, "loss": 0.87346935, "memory(GiB)": 369.42, "step": 46410, "train_speed(iter/s)": 0.200656 }, { "acc": 0.74547343, "epoch": 1.1774479959411466, "grad_norm": 2.765625, "learning_rate": 3.954124049374736e-06, "loss": 1.0147686, "memory(GiB)": 369.42, "step": 46415, "train_speed(iter/s)": 0.20066 }, { "acc": 0.75368786, "epoch": 1.1775748351090818, "grad_norm": 2.078125, "learning_rate": 3.9530986461656465e-06, "loss": 0.95065727, "memory(GiB)": 369.42, "step": 46420, "train_speed(iter/s)": 0.200661 }, { "acc": 0.75215344, "epoch": 1.1777016742770168, "grad_norm": 2.3125, "learning_rate": 3.952073289003953e-06, "loss": 0.97088528, "memory(GiB)": 369.42, "step": 46425, "train_speed(iter/s)": 0.200663 }, { "acc": 0.76671467, "epoch": 1.1778285134449518, "grad_norm": 2.1875, "learning_rate": 3.9510479779347566e-06, "loss": 0.90961771, "memory(GiB)": 369.42, "step": 46430, "train_speed(iter/s)": 0.200667 }, { "acc": 0.75472484, "epoch": 1.1779553526128868, "grad_norm": 2.328125, "learning_rate": 3.950022713003151e-06, "loss": 0.99724169, "memory(GiB)": 369.42, "step": 46435, "train_speed(iter/s)": 0.200672 }, { "acc": 0.74644623, "epoch": 1.178082191780822, "grad_norm": 2.15625, "learning_rate": 3.9489974942542355e-06, "loss": 0.98815269, "memory(GiB)": 369.42, "step": 46440, "train_speed(iter/s)": 0.200675 }, { "acc": 0.751791, "epoch": 1.178209030948757, "grad_norm": 2.296875, "learning_rate": 3.947972321733101e-06, "loss": 1.00069771, "memory(GiB)": 369.42, "step": 46445, "train_speed(iter/s)": 0.200677 }, { "acc": 0.75541468, "epoch": 1.178335870116692, "grad_norm": 2.046875, "learning_rate": 3.946947195484843e-06, "loss": 0.97729263, "memory(GiB)": 369.42, "step": 46450, "train_speed(iter/s)": 0.200681 }, { "acc": 0.75208421, "epoch": 1.1784627092846272, "grad_norm": 2.390625, "learning_rate": 3.945922115554548e-06, "loss": 0.9894495, "memory(GiB)": 369.42, "step": 46455, "train_speed(iter/s)": 0.200684 }, { "acc": 0.75637236, "epoch": 1.1785895484525621, "grad_norm": 1.9609375, "learning_rate": 3.944897081987303e-06, "loss": 1.02767611, "memory(GiB)": 369.42, "step": 46460, "train_speed(iter/s)": 0.200688 }, { "acc": 0.7420464, "epoch": 1.1787163876204971, "grad_norm": 2.90625, "learning_rate": 3.943872094828197e-06, "loss": 1.01979313, "memory(GiB)": 369.42, "step": 46465, "train_speed(iter/s)": 0.200692 }, { "acc": 0.76572361, "epoch": 1.1788432267884323, "grad_norm": 2.015625, "learning_rate": 3.942847154122312e-06, "loss": 0.93255014, "memory(GiB)": 369.42, "step": 46470, "train_speed(iter/s)": 0.200696 }, { "acc": 0.75841017, "epoch": 1.1789700659563673, "grad_norm": 2.421875, "learning_rate": 3.941822259914728e-06, "loss": 0.94660721, "memory(GiB)": 369.42, "step": 46475, "train_speed(iter/s)": 0.2007 }, { "acc": 0.75549269, "epoch": 1.1790969051243023, "grad_norm": 1.984375, "learning_rate": 3.940797412250524e-06, "loss": 0.99576912, "memory(GiB)": 369.42, "step": 46480, "train_speed(iter/s)": 0.200703 }, { "acc": 0.75401163, "epoch": 1.1792237442922375, "grad_norm": 1.7421875, "learning_rate": 3.93977261117478e-06, "loss": 0.99557037, "memory(GiB)": 369.42, "step": 46485, "train_speed(iter/s)": 0.200707 }, { "acc": 0.74393559, "epoch": 1.1793505834601725, "grad_norm": 1.9765625, "learning_rate": 3.938747856732572e-06, "loss": 0.98470001, "memory(GiB)": 369.42, "step": 46490, "train_speed(iter/s)": 0.20071 }, { "acc": 0.76126204, "epoch": 1.1794774226281075, "grad_norm": 2.140625, "learning_rate": 3.9377231489689685e-06, "loss": 1.00264511, "memory(GiB)": 369.42, "step": 46495, "train_speed(iter/s)": 0.200713 }, { "acc": 0.7470048, "epoch": 1.1796042617960425, "grad_norm": 2.109375, "learning_rate": 3.936698487929045e-06, "loss": 0.99269285, "memory(GiB)": 369.42, "step": 46500, "train_speed(iter/s)": 0.200717 }, { "acc": 0.76244869, "epoch": 1.1797311009639777, "grad_norm": 2.0, "learning_rate": 3.935673873657868e-06, "loss": 0.93092728, "memory(GiB)": 369.42, "step": 46505, "train_speed(iter/s)": 0.20072 }, { "acc": 0.74726186, "epoch": 1.1798579401319127, "grad_norm": 2.484375, "learning_rate": 3.934649306200508e-06, "loss": 1.00653305, "memory(GiB)": 369.42, "step": 46510, "train_speed(iter/s)": 0.200722 }, { "acc": 0.75981245, "epoch": 1.179984779299848, "grad_norm": 2.109375, "learning_rate": 3.933624785602027e-06, "loss": 0.95950947, "memory(GiB)": 369.42, "step": 46515, "train_speed(iter/s)": 0.200725 }, { "acc": 0.7488204, "epoch": 1.180111618467783, "grad_norm": 1.96875, "learning_rate": 3.932600311907489e-06, "loss": 0.97247543, "memory(GiB)": 369.42, "step": 46520, "train_speed(iter/s)": 0.20073 }, { "acc": 0.74674211, "epoch": 1.1802384576357179, "grad_norm": 2.0, "learning_rate": 3.931575885161955e-06, "loss": 1.01772003, "memory(GiB)": 369.42, "step": 46525, "train_speed(iter/s)": 0.200732 }, { "acc": 0.74751978, "epoch": 1.1803652968036529, "grad_norm": 2.1875, "learning_rate": 3.930551505410484e-06, "loss": 0.9665575, "memory(GiB)": 369.42, "step": 46530, "train_speed(iter/s)": 0.200735 }, { "acc": 0.75409737, "epoch": 1.180492135971588, "grad_norm": 2.09375, "learning_rate": 3.929527172698132e-06, "loss": 0.97956638, "memory(GiB)": 369.42, "step": 46535, "train_speed(iter/s)": 0.200739 }, { "acc": 0.74161844, "epoch": 1.180618975139523, "grad_norm": 1.9609375, "learning_rate": 3.928502887069954e-06, "loss": 1.00989208, "memory(GiB)": 369.42, "step": 46540, "train_speed(iter/s)": 0.200741 }, { "acc": 0.76884837, "epoch": 1.180745814307458, "grad_norm": 2.140625, "learning_rate": 3.927478648571003e-06, "loss": 0.89711142, "memory(GiB)": 369.42, "step": 46545, "train_speed(iter/s)": 0.200742 }, { "acc": 0.75724072, "epoch": 1.1808726534753933, "grad_norm": 2.46875, "learning_rate": 3.926454457246331e-06, "loss": 0.97523537, "memory(GiB)": 369.42, "step": 46550, "train_speed(iter/s)": 0.200745 }, { "acc": 0.7475872, "epoch": 1.1809994926433283, "grad_norm": 2.234375, "learning_rate": 3.9254303131409834e-06, "loss": 1.0074255, "memory(GiB)": 369.42, "step": 46555, "train_speed(iter/s)": 0.200749 }, { "acc": 0.75899677, "epoch": 1.1811263318112633, "grad_norm": 2.609375, "learning_rate": 3.924406216300009e-06, "loss": 0.96744404, "memory(GiB)": 369.42, "step": 46560, "train_speed(iter/s)": 0.200752 }, { "acc": 0.76043205, "epoch": 1.1812531709791985, "grad_norm": 2.140625, "learning_rate": 3.92338216676845e-06, "loss": 0.96284637, "memory(GiB)": 369.42, "step": 46565, "train_speed(iter/s)": 0.200753 }, { "acc": 0.7468451, "epoch": 1.1813800101471335, "grad_norm": 2.3125, "learning_rate": 3.922358164591353e-06, "loss": 1.02024632, "memory(GiB)": 369.42, "step": 46570, "train_speed(iter/s)": 0.200756 }, { "acc": 0.75291824, "epoch": 1.1815068493150684, "grad_norm": 1.890625, "learning_rate": 3.921334209813752e-06, "loss": 0.99036341, "memory(GiB)": 369.42, "step": 46575, "train_speed(iter/s)": 0.200759 }, { "acc": 0.7408452, "epoch": 1.1816336884830037, "grad_norm": 1.953125, "learning_rate": 3.92031030248069e-06, "loss": 1.02248993, "memory(GiB)": 369.42, "step": 46580, "train_speed(iter/s)": 0.200761 }, { "acc": 0.73657484, "epoch": 1.1817605276509386, "grad_norm": 2.109375, "learning_rate": 3.9192864426372e-06, "loss": 1.02455692, "memory(GiB)": 369.42, "step": 46585, "train_speed(iter/s)": 0.200765 }, { "acc": 0.76709833, "epoch": 1.1818873668188736, "grad_norm": 2.078125, "learning_rate": 3.918262630328319e-06, "loss": 0.93933525, "memory(GiB)": 369.42, "step": 46590, "train_speed(iter/s)": 0.200769 }, { "acc": 0.75468526, "epoch": 1.1820142059868086, "grad_norm": 1.8671875, "learning_rate": 3.917238865599077e-06, "loss": 0.98569984, "memory(GiB)": 369.42, "step": 46595, "train_speed(iter/s)": 0.200772 }, { "acc": 0.77144742, "epoch": 1.1821410451547438, "grad_norm": 2.09375, "learning_rate": 3.916215148494502e-06, "loss": 0.94587479, "memory(GiB)": 369.42, "step": 46600, "train_speed(iter/s)": 0.200774 }, { "acc": 0.73714743, "epoch": 1.1822678843226788, "grad_norm": 2.5625, "learning_rate": 3.9151914790596255e-06, "loss": 1.03150959, "memory(GiB)": 369.42, "step": 46605, "train_speed(iter/s)": 0.200779 }, { "acc": 0.75792546, "epoch": 1.1823947234906138, "grad_norm": 2.3125, "learning_rate": 3.914167857339472e-06, "loss": 0.96416302, "memory(GiB)": 369.42, "step": 46610, "train_speed(iter/s)": 0.200782 }, { "acc": 0.74968643, "epoch": 1.182521562658549, "grad_norm": 2.390625, "learning_rate": 3.913144283379061e-06, "loss": 0.9967144, "memory(GiB)": 369.42, "step": 46615, "train_speed(iter/s)": 0.200785 }, { "acc": 0.74781561, "epoch": 1.182648401826484, "grad_norm": 2.328125, "learning_rate": 3.912120757223418e-06, "loss": 1.04234161, "memory(GiB)": 369.42, "step": 46620, "train_speed(iter/s)": 0.200788 }, { "acc": 0.75156775, "epoch": 1.182775240994419, "grad_norm": 2.34375, "learning_rate": 3.911097278917561e-06, "loss": 1.00198574, "memory(GiB)": 369.42, "step": 46625, "train_speed(iter/s)": 0.200791 }, { "acc": 0.75770187, "epoch": 1.1829020801623542, "grad_norm": 2.171875, "learning_rate": 3.91007384850651e-06, "loss": 0.9600317, "memory(GiB)": 369.42, "step": 46630, "train_speed(iter/s)": 0.20079 }, { "acc": 0.76220655, "epoch": 1.1830289193302892, "grad_norm": 1.875, "learning_rate": 3.909050466035274e-06, "loss": 0.91683159, "memory(GiB)": 369.42, "step": 46635, "train_speed(iter/s)": 0.200793 }, { "acc": 0.75283689, "epoch": 1.1831557584982242, "grad_norm": 2.015625, "learning_rate": 3.90802713154887e-06, "loss": 0.97541943, "memory(GiB)": 369.42, "step": 46640, "train_speed(iter/s)": 0.200797 }, { "acc": 0.76638298, "epoch": 1.1832825976661594, "grad_norm": 2.421875, "learning_rate": 3.9070038450923074e-06, "loss": 0.91141205, "memory(GiB)": 369.42, "step": 46645, "train_speed(iter/s)": 0.2008 }, { "acc": 0.73959985, "epoch": 1.1834094368340944, "grad_norm": 2.328125, "learning_rate": 3.9059806067105985e-06, "loss": 1.00308609, "memory(GiB)": 369.42, "step": 46650, "train_speed(iter/s)": 0.200804 }, { "acc": 0.73981495, "epoch": 1.1835362760020294, "grad_norm": 2.21875, "learning_rate": 3.904957416448744e-06, "loss": 1.01532154, "memory(GiB)": 369.42, "step": 46655, "train_speed(iter/s)": 0.200809 }, { "acc": 0.76317081, "epoch": 1.1836631151699644, "grad_norm": 2.4375, "learning_rate": 3.903934274351753e-06, "loss": 0.96031036, "memory(GiB)": 369.42, "step": 46660, "train_speed(iter/s)": 0.200812 }, { "acc": 0.75125794, "epoch": 1.1837899543378996, "grad_norm": 2.09375, "learning_rate": 3.9029111804646245e-06, "loss": 0.98085432, "memory(GiB)": 369.42, "step": 46665, "train_speed(iter/s)": 0.200815 }, { "acc": 0.7529747, "epoch": 1.1839167935058346, "grad_norm": 2.390625, "learning_rate": 3.9018881348323626e-06, "loss": 0.98278599, "memory(GiB)": 369.42, "step": 46670, "train_speed(iter/s)": 0.20082 }, { "acc": 0.76045604, "epoch": 1.1840436326737698, "grad_norm": 1.875, "learning_rate": 3.9008651374999615e-06, "loss": 0.94468679, "memory(GiB)": 369.42, "step": 46675, "train_speed(iter/s)": 0.200822 }, { "acc": 0.76114855, "epoch": 1.1841704718417048, "grad_norm": 1.9921875, "learning_rate": 3.899842188512419e-06, "loss": 0.94655895, "memory(GiB)": 369.42, "step": 46680, "train_speed(iter/s)": 0.200825 }, { "acc": 0.75252657, "epoch": 1.1842973110096398, "grad_norm": 1.8125, "learning_rate": 3.898819287914729e-06, "loss": 0.94542332, "memory(GiB)": 369.42, "step": 46685, "train_speed(iter/s)": 0.200829 }, { "acc": 0.74598923, "epoch": 1.1844241501775747, "grad_norm": 2.328125, "learning_rate": 3.897796435751885e-06, "loss": 1.00506554, "memory(GiB)": 369.42, "step": 46690, "train_speed(iter/s)": 0.200831 }, { "acc": 0.73568468, "epoch": 1.18455098934551, "grad_norm": 2.328125, "learning_rate": 3.896773632068873e-06, "loss": 1.05187874, "memory(GiB)": 369.42, "step": 46695, "train_speed(iter/s)": 0.200833 }, { "acc": 0.74315114, "epoch": 1.184677828513445, "grad_norm": 2.21875, "learning_rate": 3.8957508769106825e-06, "loss": 1.00811768, "memory(GiB)": 369.42, "step": 46700, "train_speed(iter/s)": 0.200837 }, { "acc": 0.75204105, "epoch": 1.18480466768138, "grad_norm": 1.8984375, "learning_rate": 3.894728170322298e-06, "loss": 1.00957441, "memory(GiB)": 369.42, "step": 46705, "train_speed(iter/s)": 0.20084 }, { "acc": 0.76492634, "epoch": 1.1849315068493151, "grad_norm": 2.046875, "learning_rate": 3.893705512348705e-06, "loss": 0.94263878, "memory(GiB)": 369.42, "step": 46710, "train_speed(iter/s)": 0.200843 }, { "acc": 0.75986352, "epoch": 1.1850583460172501, "grad_norm": 1.8671875, "learning_rate": 3.89268290303488e-06, "loss": 1.00855217, "memory(GiB)": 369.42, "step": 46715, "train_speed(iter/s)": 0.200843 }, { "acc": 0.76427946, "epoch": 1.1851851851851851, "grad_norm": 1.9765625, "learning_rate": 3.891660342425807e-06, "loss": 0.92325611, "memory(GiB)": 369.42, "step": 46720, "train_speed(iter/s)": 0.200847 }, { "acc": 0.76766496, "epoch": 1.1853120243531203, "grad_norm": 2.421875, "learning_rate": 3.890637830566459e-06, "loss": 0.93521461, "memory(GiB)": 369.42, "step": 46725, "train_speed(iter/s)": 0.20085 }, { "acc": 0.74839754, "epoch": 1.1854388635210553, "grad_norm": 2.859375, "learning_rate": 3.889615367501815e-06, "loss": 1.01966858, "memory(GiB)": 369.42, "step": 46730, "train_speed(iter/s)": 0.200851 }, { "acc": 0.76032619, "epoch": 1.1855657026889903, "grad_norm": 1.875, "learning_rate": 3.888592953276842e-06, "loss": 0.97704935, "memory(GiB)": 369.42, "step": 46735, "train_speed(iter/s)": 0.200853 }, { "acc": 0.76468163, "epoch": 1.1856925418569255, "grad_norm": 1.890625, "learning_rate": 3.8875705879365135e-06, "loss": 0.87489071, "memory(GiB)": 369.42, "step": 46740, "train_speed(iter/s)": 0.200857 }, { "acc": 0.76160917, "epoch": 1.1858193810248605, "grad_norm": 2.296875, "learning_rate": 3.886548271525797e-06, "loss": 0.93225632, "memory(GiB)": 369.42, "step": 46745, "train_speed(iter/s)": 0.20086 }, { "acc": 0.76089625, "epoch": 1.1859462201927955, "grad_norm": 2.4375, "learning_rate": 3.88552600408966e-06, "loss": 0.93835459, "memory(GiB)": 369.42, "step": 46750, "train_speed(iter/s)": 0.200865 }, { "acc": 0.74875054, "epoch": 1.1860730593607305, "grad_norm": 1.984375, "learning_rate": 3.8845037856730646e-06, "loss": 1.03430138, "memory(GiB)": 369.42, "step": 46755, "train_speed(iter/s)": 0.200868 }, { "acc": 0.76118617, "epoch": 1.1861998985286657, "grad_norm": 2.09375, "learning_rate": 3.883481616320972e-06, "loss": 0.99663992, "memory(GiB)": 369.42, "step": 46760, "train_speed(iter/s)": 0.200872 }, { "acc": 0.75960569, "epoch": 1.1863267376966007, "grad_norm": 2.0625, "learning_rate": 3.882459496078343e-06, "loss": 0.992311, "memory(GiB)": 369.42, "step": 46765, "train_speed(iter/s)": 0.200876 }, { "acc": 0.75296717, "epoch": 1.1864535768645357, "grad_norm": 2.6875, "learning_rate": 3.881437424990137e-06, "loss": 0.94782448, "memory(GiB)": 369.42, "step": 46770, "train_speed(iter/s)": 0.200878 }, { "acc": 0.74935255, "epoch": 1.1865804160324709, "grad_norm": 2.421875, "learning_rate": 3.880415403101304e-06, "loss": 0.99423523, "memory(GiB)": 369.42, "step": 46775, "train_speed(iter/s)": 0.200883 }, { "acc": 0.76262989, "epoch": 1.1867072552004059, "grad_norm": 1.96875, "learning_rate": 3.879393430456801e-06, "loss": 0.94359779, "memory(GiB)": 369.42, "step": 46780, "train_speed(iter/s)": 0.200885 }, { "acc": 0.74666767, "epoch": 1.1868340943683409, "grad_norm": 2.109375, "learning_rate": 3.87837150710158e-06, "loss": 1.03329325, "memory(GiB)": 369.42, "step": 46785, "train_speed(iter/s)": 0.200889 }, { "acc": 0.75698514, "epoch": 1.186960933536276, "grad_norm": 2.484375, "learning_rate": 3.877349633080587e-06, "loss": 0.90483017, "memory(GiB)": 369.42, "step": 46790, "train_speed(iter/s)": 0.200892 }, { "acc": 0.75126166, "epoch": 1.187087772704211, "grad_norm": 2.109375, "learning_rate": 3.876327808438767e-06, "loss": 1.00635834, "memory(GiB)": 369.42, "step": 46795, "train_speed(iter/s)": 0.200895 }, { "acc": 0.75161467, "epoch": 1.187214611872146, "grad_norm": 2.296875, "learning_rate": 3.875306033221069e-06, "loss": 0.95852013, "memory(GiB)": 369.42, "step": 46800, "train_speed(iter/s)": 0.200898 }, { "acc": 0.7416337, "epoch": 1.1873414510400813, "grad_norm": 2.40625, "learning_rate": 3.874284307472432e-06, "loss": 1.05351191, "memory(GiB)": 369.42, "step": 46805, "train_speed(iter/s)": 0.200898 }, { "acc": 0.74659214, "epoch": 1.1874682902080163, "grad_norm": 2.1875, "learning_rate": 3.873262631237799e-06, "loss": 1.02142849, "memory(GiB)": 369.42, "step": 46810, "train_speed(iter/s)": 0.200901 }, { "acc": 0.73758554, "epoch": 1.1875951293759512, "grad_norm": 2.234375, "learning_rate": 3.872241004562105e-06, "loss": 1.05878305, "memory(GiB)": 369.42, "step": 46815, "train_speed(iter/s)": 0.200905 }, { "acc": 0.76214285, "epoch": 1.1877219685438862, "grad_norm": 2.703125, "learning_rate": 3.871219427490285e-06, "loss": 0.92708302, "memory(GiB)": 369.42, "step": 46820, "train_speed(iter/s)": 0.200909 }, { "acc": 0.75198727, "epoch": 1.1878488077118214, "grad_norm": 2.453125, "learning_rate": 3.870197900067276e-06, "loss": 1.01750011, "memory(GiB)": 369.42, "step": 46825, "train_speed(iter/s)": 0.200913 }, { "acc": 0.76205034, "epoch": 1.1879756468797564, "grad_norm": 2.078125, "learning_rate": 3.869176422338009e-06, "loss": 0.92347164, "memory(GiB)": 369.42, "step": 46830, "train_speed(iter/s)": 0.200917 }, { "acc": 0.76057873, "epoch": 1.1881024860476916, "grad_norm": 2.484375, "learning_rate": 3.868154994347409e-06, "loss": 0.96650295, "memory(GiB)": 369.42, "step": 46835, "train_speed(iter/s)": 0.200922 }, { "acc": 0.75859251, "epoch": 1.1882293252156266, "grad_norm": 2.546875, "learning_rate": 3.867133616140406e-06, "loss": 0.92574425, "memory(GiB)": 369.42, "step": 46840, "train_speed(iter/s)": 0.200924 }, { "acc": 0.7550106, "epoch": 1.1883561643835616, "grad_norm": 2.265625, "learning_rate": 3.866112287761926e-06, "loss": 1.01167479, "memory(GiB)": 369.42, "step": 46845, "train_speed(iter/s)": 0.200924 }, { "acc": 0.74569397, "epoch": 1.1884830035514966, "grad_norm": 2.625, "learning_rate": 3.86509100925689e-06, "loss": 1.01518612, "memory(GiB)": 369.42, "step": 46850, "train_speed(iter/s)": 0.200927 }, { "acc": 0.74509954, "epoch": 1.1886098427194318, "grad_norm": 1.9921875, "learning_rate": 3.8640697806702166e-06, "loss": 0.97066555, "memory(GiB)": 369.42, "step": 46855, "train_speed(iter/s)": 0.20093 }, { "acc": 0.75049276, "epoch": 1.1887366818873668, "grad_norm": 2.65625, "learning_rate": 3.8630486020468265e-06, "loss": 0.99566174, "memory(GiB)": 369.42, "step": 46860, "train_speed(iter/s)": 0.200933 }, { "acc": 0.75046129, "epoch": 1.1888635210553018, "grad_norm": 2.421875, "learning_rate": 3.862027473431634e-06, "loss": 0.97947979, "memory(GiB)": 369.42, "step": 46865, "train_speed(iter/s)": 0.200937 }, { "acc": 0.76026402, "epoch": 1.188990360223237, "grad_norm": 2.6875, "learning_rate": 3.861006394869558e-06, "loss": 0.93431168, "memory(GiB)": 369.42, "step": 46870, "train_speed(iter/s)": 0.200942 }, { "acc": 0.74942055, "epoch": 1.189117199391172, "grad_norm": 2.015625, "learning_rate": 3.859985366405502e-06, "loss": 0.98447962, "memory(GiB)": 369.42, "step": 46875, "train_speed(iter/s)": 0.200946 }, { "acc": 0.74891081, "epoch": 1.189244038559107, "grad_norm": 1.7265625, "learning_rate": 3.85896438808438e-06, "loss": 0.99331207, "memory(GiB)": 369.42, "step": 46880, "train_speed(iter/s)": 0.200948 }, { "acc": 0.74033332, "epoch": 1.1893708777270422, "grad_norm": 2.3125, "learning_rate": 3.857943459951099e-06, "loss": 1.00331039, "memory(GiB)": 369.42, "step": 46885, "train_speed(iter/s)": 0.200951 }, { "acc": 0.74377737, "epoch": 1.1894977168949772, "grad_norm": 2.25, "learning_rate": 3.856922582050565e-06, "loss": 0.99730282, "memory(GiB)": 369.42, "step": 46890, "train_speed(iter/s)": 0.200954 }, { "acc": 0.77161818, "epoch": 1.1896245560629122, "grad_norm": 2.09375, "learning_rate": 3.855901754427678e-06, "loss": 0.94245243, "memory(GiB)": 369.42, "step": 46895, "train_speed(iter/s)": 0.200956 }, { "acc": 0.73632689, "epoch": 1.1897513952308474, "grad_norm": 2.40625, "learning_rate": 3.854880977127339e-06, "loss": 1.04211578, "memory(GiB)": 369.42, "step": 46900, "train_speed(iter/s)": 0.200958 }, { "acc": 0.75606213, "epoch": 1.1898782343987824, "grad_norm": 2.234375, "learning_rate": 3.8538602501944475e-06, "loss": 0.97981129, "memory(GiB)": 369.42, "step": 46905, "train_speed(iter/s)": 0.20096 }, { "acc": 0.75393491, "epoch": 1.1900050735667174, "grad_norm": 2.5, "learning_rate": 3.852839573673902e-06, "loss": 0.94999943, "memory(GiB)": 369.42, "step": 46910, "train_speed(iter/s)": 0.200963 }, { "acc": 0.7560194, "epoch": 1.1901319127346524, "grad_norm": 2.046875, "learning_rate": 3.851818947610591e-06, "loss": 0.98691158, "memory(GiB)": 369.42, "step": 46915, "train_speed(iter/s)": 0.200967 }, { "acc": 0.75527287, "epoch": 1.1902587519025876, "grad_norm": 2.265625, "learning_rate": 3.850798372049409e-06, "loss": 0.96782475, "memory(GiB)": 369.42, "step": 46920, "train_speed(iter/s)": 0.200969 }, { "acc": 0.75300074, "epoch": 1.1903855910705226, "grad_norm": 2.15625, "learning_rate": 3.849777847035246e-06, "loss": 0.9776947, "memory(GiB)": 369.42, "step": 46925, "train_speed(iter/s)": 0.200973 }, { "acc": 0.76095862, "epoch": 1.1905124302384575, "grad_norm": 2.203125, "learning_rate": 3.84875737261299e-06, "loss": 0.92065649, "memory(GiB)": 369.42, "step": 46930, "train_speed(iter/s)": 0.200978 }, { "acc": 0.74640489, "epoch": 1.1906392694063928, "grad_norm": 2.296875, "learning_rate": 3.847736948827523e-06, "loss": 1.01540966, "memory(GiB)": 369.42, "step": 46935, "train_speed(iter/s)": 0.20098 }, { "acc": 0.76818733, "epoch": 1.1907661085743277, "grad_norm": 2.296875, "learning_rate": 3.846716575723729e-06, "loss": 0.94313107, "memory(GiB)": 369.42, "step": 46940, "train_speed(iter/s)": 0.200984 }, { "acc": 0.75126171, "epoch": 1.1908929477422627, "grad_norm": 1.8671875, "learning_rate": 3.845696253346489e-06, "loss": 1.01832762, "memory(GiB)": 369.42, "step": 46945, "train_speed(iter/s)": 0.200987 }, { "acc": 0.75622292, "epoch": 1.191019786910198, "grad_norm": 2.265625, "learning_rate": 3.8446759817406835e-06, "loss": 0.98706532, "memory(GiB)": 369.42, "step": 46950, "train_speed(iter/s)": 0.200988 }, { "acc": 0.7503902, "epoch": 1.191146626078133, "grad_norm": 1.96875, "learning_rate": 3.8436557609511856e-06, "loss": 1.01746302, "memory(GiB)": 369.42, "step": 46955, "train_speed(iter/s)": 0.200991 }, { "acc": 0.77260823, "epoch": 1.191273465246068, "grad_norm": 2.125, "learning_rate": 3.842635591022869e-06, "loss": 0.86687469, "memory(GiB)": 369.42, "step": 46960, "train_speed(iter/s)": 0.200992 }, { "acc": 0.75693808, "epoch": 1.1914003044140031, "grad_norm": 2.484375, "learning_rate": 3.8416154720006065e-06, "loss": 0.95489845, "memory(GiB)": 369.42, "step": 46965, "train_speed(iter/s)": 0.200994 }, { "acc": 0.74778018, "epoch": 1.1915271435819381, "grad_norm": 1.875, "learning_rate": 3.840595403929269e-06, "loss": 1.00378151, "memory(GiB)": 369.42, "step": 46970, "train_speed(iter/s)": 0.200998 }, { "acc": 0.746387, "epoch": 1.191653982749873, "grad_norm": 2.203125, "learning_rate": 3.839575386853721e-06, "loss": 0.95320482, "memory(GiB)": 369.42, "step": 46975, "train_speed(iter/s)": 0.201001 }, { "acc": 0.751297, "epoch": 1.191780821917808, "grad_norm": 2.078125, "learning_rate": 3.838555420818827e-06, "loss": 0.97770481, "memory(GiB)": 369.42, "step": 46980, "train_speed(iter/s)": 0.201005 }, { "acc": 0.75249257, "epoch": 1.1919076610857433, "grad_norm": 2.390625, "learning_rate": 3.837535505869453e-06, "loss": 0.98365688, "memory(GiB)": 369.42, "step": 46985, "train_speed(iter/s)": 0.201009 }, { "acc": 0.74927902, "epoch": 1.1920345002536783, "grad_norm": 1.96875, "learning_rate": 3.836515642050458e-06, "loss": 1.0113369, "memory(GiB)": 369.42, "step": 46990, "train_speed(iter/s)": 0.201012 }, { "acc": 0.7378191, "epoch": 1.1921613394216135, "grad_norm": 2.171875, "learning_rate": 3.835495829406698e-06, "loss": 0.99808407, "memory(GiB)": 369.42, "step": 46995, "train_speed(iter/s)": 0.201016 }, { "acc": 0.75947657, "epoch": 1.1922881785895485, "grad_norm": 2.25, "learning_rate": 3.834476067983031e-06, "loss": 0.98056192, "memory(GiB)": 369.42, "step": 47000, "train_speed(iter/s)": 0.201016 }, { "epoch": 1.1922881785895485, "eval_acc": 0.7378272005788584, "eval_loss": 0.9700992703437805, "eval_runtime": 385.3115, "eval_samples_per_second": 16.532, "eval_steps_per_second": 8.266, "step": 47000 }, { "acc": 0.75073881, "epoch": 1.1924150177574835, "grad_norm": 1.9921875, "learning_rate": 3.83345635782431e-06, "loss": 1.0358983, "memory(GiB)": 369.42, "step": 47005, "train_speed(iter/s)": 0.20041 }, { "acc": 0.74625831, "epoch": 1.1925418569254185, "grad_norm": 2.421875, "learning_rate": 3.832436698975388e-06, "loss": 1.00370045, "memory(GiB)": 369.42, "step": 47010, "train_speed(iter/s)": 0.200413 }, { "acc": 0.77028894, "epoch": 1.1926686960933537, "grad_norm": 2.109375, "learning_rate": 3.831417091481111e-06, "loss": 0.9128129, "memory(GiB)": 369.42, "step": 47015, "train_speed(iter/s)": 0.200417 }, { "acc": 0.765345, "epoch": 1.1927955352612887, "grad_norm": 2.09375, "learning_rate": 3.830397535386328e-06, "loss": 0.953685, "memory(GiB)": 369.42, "step": 47020, "train_speed(iter/s)": 0.200419 }, { "acc": 0.74831572, "epoch": 1.1929223744292237, "grad_norm": 2.359375, "learning_rate": 3.829378030735883e-06, "loss": 1.03820639, "memory(GiB)": 369.42, "step": 47025, "train_speed(iter/s)": 0.200422 }, { "acc": 0.75561047, "epoch": 1.1930492135971589, "grad_norm": 2.0625, "learning_rate": 3.82835857757462e-06, "loss": 0.96946716, "memory(GiB)": 369.42, "step": 47030, "train_speed(iter/s)": 0.200425 }, { "acc": 0.75799837, "epoch": 1.1931760527650939, "grad_norm": 1.8046875, "learning_rate": 3.827339175947378e-06, "loss": 1.0024766, "memory(GiB)": 369.42, "step": 47035, "train_speed(iter/s)": 0.200428 }, { "acc": 0.74840832, "epoch": 1.1933028919330289, "grad_norm": 2.1875, "learning_rate": 3.826319825898992e-06, "loss": 0.96349325, "memory(GiB)": 369.42, "step": 47040, "train_speed(iter/s)": 0.200428 }, { "acc": 0.75703154, "epoch": 1.193429731100964, "grad_norm": 2.28125, "learning_rate": 3.825300527474302e-06, "loss": 1.0018774, "memory(GiB)": 369.42, "step": 47045, "train_speed(iter/s)": 0.200432 }, { "acc": 0.74341254, "epoch": 1.193556570268899, "grad_norm": 2.4375, "learning_rate": 3.824281280718141e-06, "loss": 1.02135677, "memory(GiB)": 369.42, "step": 47050, "train_speed(iter/s)": 0.200434 }, { "acc": 0.75115509, "epoch": 1.193683409436834, "grad_norm": 2.671875, "learning_rate": 3.823262085675337e-06, "loss": 0.99242916, "memory(GiB)": 369.42, "step": 47055, "train_speed(iter/s)": 0.200437 }, { "acc": 0.74617577, "epoch": 1.1938102486047693, "grad_norm": 2.359375, "learning_rate": 3.822242942390718e-06, "loss": 1.01670284, "memory(GiB)": 369.42, "step": 47060, "train_speed(iter/s)": 0.20044 }, { "acc": 0.75278206, "epoch": 1.1939370877727042, "grad_norm": 2.1875, "learning_rate": 3.821223850909115e-06, "loss": 0.98372793, "memory(GiB)": 369.42, "step": 47065, "train_speed(iter/s)": 0.200442 }, { "acc": 0.74849181, "epoch": 1.1940639269406392, "grad_norm": 2.40625, "learning_rate": 3.820204811275351e-06, "loss": 1.00168915, "memory(GiB)": 369.42, "step": 47070, "train_speed(iter/s)": 0.200445 }, { "acc": 0.76088772, "epoch": 1.1941907661085742, "grad_norm": 1.9453125, "learning_rate": 3.8191858235342446e-06, "loss": 1.00927076, "memory(GiB)": 369.42, "step": 47075, "train_speed(iter/s)": 0.200447 }, { "acc": 0.74210277, "epoch": 1.1943176052765094, "grad_norm": 2.625, "learning_rate": 3.818166887730618e-06, "loss": 1.02098083, "memory(GiB)": 369.42, "step": 47080, "train_speed(iter/s)": 0.200452 }, { "acc": 0.73811808, "epoch": 1.1944444444444444, "grad_norm": 2.078125, "learning_rate": 3.817148003909288e-06, "loss": 1.05460339, "memory(GiB)": 369.42, "step": 47085, "train_speed(iter/s)": 0.200457 }, { "acc": 0.74904523, "epoch": 1.1945712836123794, "grad_norm": 1.875, "learning_rate": 3.816129172115073e-06, "loss": 0.9579277, "memory(GiB)": 369.42, "step": 47090, "train_speed(iter/s)": 0.200461 }, { "acc": 0.73883519, "epoch": 1.1946981227803146, "grad_norm": 2.015625, "learning_rate": 3.815110392392778e-06, "loss": 0.99827175, "memory(GiB)": 369.42, "step": 47095, "train_speed(iter/s)": 0.200463 }, { "acc": 0.75340767, "epoch": 1.1948249619482496, "grad_norm": 2.078125, "learning_rate": 3.8140916647872204e-06, "loss": 0.98185387, "memory(GiB)": 369.42, "step": 47100, "train_speed(iter/s)": 0.200466 }, { "acc": 0.74019713, "epoch": 1.1949518011161846, "grad_norm": 2.21875, "learning_rate": 3.813072989343205e-06, "loss": 1.03293076, "memory(GiB)": 369.42, "step": 47105, "train_speed(iter/s)": 0.200469 }, { "acc": 0.75689564, "epoch": 1.1950786402841198, "grad_norm": 1.90625, "learning_rate": 3.812054366105541e-06, "loss": 0.99352303, "memory(GiB)": 369.42, "step": 47110, "train_speed(iter/s)": 0.200471 }, { "acc": 0.75669589, "epoch": 1.1952054794520548, "grad_norm": 1.9765625, "learning_rate": 3.8110357951190284e-06, "loss": 0.90132046, "memory(GiB)": 369.42, "step": 47115, "train_speed(iter/s)": 0.200469 }, { "acc": 0.72065468, "epoch": 1.1953323186199898, "grad_norm": 2.078125, "learning_rate": 3.8100172764284694e-06, "loss": 1.05590572, "memory(GiB)": 369.42, "step": 47120, "train_speed(iter/s)": 0.200471 }, { "acc": 0.7653985, "epoch": 1.195459157787925, "grad_norm": 2.0625, "learning_rate": 3.8089988100786635e-06, "loss": 0.95089588, "memory(GiB)": 369.42, "step": 47125, "train_speed(iter/s)": 0.200474 }, { "acc": 0.7437645, "epoch": 1.19558599695586, "grad_norm": 2.40625, "learning_rate": 3.807980396114409e-06, "loss": 0.9457634, "memory(GiB)": 369.42, "step": 47130, "train_speed(iter/s)": 0.200479 }, { "acc": 0.75002155, "epoch": 1.195712836123795, "grad_norm": 2.03125, "learning_rate": 3.8069620345804974e-06, "loss": 0.95613842, "memory(GiB)": 369.42, "step": 47135, "train_speed(iter/s)": 0.200482 }, { "acc": 0.74484534, "epoch": 1.19583967529173, "grad_norm": 2.15625, "learning_rate": 3.8059437255217214e-06, "loss": 1.03547478, "memory(GiB)": 369.42, "step": 47140, "train_speed(iter/s)": 0.200486 }, { "acc": 0.76305523, "epoch": 1.1959665144596652, "grad_norm": 2.140625, "learning_rate": 3.8049254689828723e-06, "loss": 0.97630234, "memory(GiB)": 369.42, "step": 47145, "train_speed(iter/s)": 0.200487 }, { "acc": 0.74893537, "epoch": 1.1960933536276002, "grad_norm": 2.421875, "learning_rate": 3.8039072650087377e-06, "loss": 0.98285732, "memory(GiB)": 369.42, "step": 47150, "train_speed(iter/s)": 0.200491 }, { "acc": 0.75215635, "epoch": 1.1962201927955354, "grad_norm": 1.9140625, "learning_rate": 3.8028891136440994e-06, "loss": 0.93769932, "memory(GiB)": 369.42, "step": 47155, "train_speed(iter/s)": 0.200495 }, { "acc": 0.74182148, "epoch": 1.1963470319634704, "grad_norm": 1.921875, "learning_rate": 3.801871014933744e-06, "loss": 0.96511021, "memory(GiB)": 369.42, "step": 47160, "train_speed(iter/s)": 0.200497 }, { "acc": 0.74837217, "epoch": 1.1964738711314054, "grad_norm": 2.609375, "learning_rate": 3.8008529689224493e-06, "loss": 0.9891716, "memory(GiB)": 369.42, "step": 47165, "train_speed(iter/s)": 0.200499 }, { "acc": 0.75232859, "epoch": 1.1966007102993403, "grad_norm": 2.265625, "learning_rate": 3.7998349756549974e-06, "loss": 0.95934563, "memory(GiB)": 369.42, "step": 47170, "train_speed(iter/s)": 0.200503 }, { "acc": 0.73141394, "epoch": 1.1967275494672756, "grad_norm": 2.140625, "learning_rate": 3.79881703517616e-06, "loss": 1.11075735, "memory(GiB)": 369.42, "step": 47175, "train_speed(iter/s)": 0.200507 }, { "acc": 0.75476589, "epoch": 1.1968543886352105, "grad_norm": 2.5, "learning_rate": 3.797799147530713e-06, "loss": 0.98389034, "memory(GiB)": 369.42, "step": 47180, "train_speed(iter/s)": 0.200509 }, { "acc": 0.75580635, "epoch": 1.1969812278031455, "grad_norm": 2.34375, "learning_rate": 3.796781312763425e-06, "loss": 0.9543952, "memory(GiB)": 369.42, "step": 47185, "train_speed(iter/s)": 0.200513 }, { "acc": 0.75539694, "epoch": 1.1971080669710807, "grad_norm": 2.015625, "learning_rate": 3.79576353091907e-06, "loss": 0.9495575, "memory(GiB)": 369.42, "step": 47190, "train_speed(iter/s)": 0.200516 }, { "acc": 0.76656389, "epoch": 1.1972349061390157, "grad_norm": 2.375, "learning_rate": 3.7947458020424094e-06, "loss": 0.9911005, "memory(GiB)": 369.42, "step": 47195, "train_speed(iter/s)": 0.20052 }, { "acc": 0.75731797, "epoch": 1.1973617453069507, "grad_norm": 2.03125, "learning_rate": 3.793728126178209e-06, "loss": 1.00117168, "memory(GiB)": 369.42, "step": 47200, "train_speed(iter/s)": 0.200521 }, { "acc": 0.7556118, "epoch": 1.197488584474886, "grad_norm": 2.3125, "learning_rate": 3.792710503371232e-06, "loss": 0.92123508, "memory(GiB)": 369.42, "step": 47205, "train_speed(iter/s)": 0.200521 }, { "acc": 0.74102821, "epoch": 1.197615423642821, "grad_norm": 2.34375, "learning_rate": 3.7916929336662386e-06, "loss": 1.05848923, "memory(GiB)": 369.42, "step": 47210, "train_speed(iter/s)": 0.200525 }, { "acc": 0.75494137, "epoch": 1.197742262810756, "grad_norm": 2.203125, "learning_rate": 3.790675417107982e-06, "loss": 0.95315285, "memory(GiB)": 369.42, "step": 47215, "train_speed(iter/s)": 0.200527 }, { "acc": 0.75545053, "epoch": 1.1978691019786911, "grad_norm": 2.203125, "learning_rate": 3.7896579537412213e-06, "loss": 0.96417084, "memory(GiB)": 369.42, "step": 47220, "train_speed(iter/s)": 0.200531 }, { "acc": 0.72990932, "epoch": 1.197995941146626, "grad_norm": 2.25, "learning_rate": 3.7886405436107076e-06, "loss": 1.07369289, "memory(GiB)": 369.42, "step": 47225, "train_speed(iter/s)": 0.200536 }, { "acc": 0.75325608, "epoch": 1.198122780314561, "grad_norm": 2.296875, "learning_rate": 3.7876231867611917e-06, "loss": 0.94809494, "memory(GiB)": 369.42, "step": 47230, "train_speed(iter/s)": 0.200539 }, { "acc": 0.74308004, "epoch": 1.198249619482496, "grad_norm": 2.015625, "learning_rate": 3.7866058832374197e-06, "loss": 1.01208057, "memory(GiB)": 369.42, "step": 47235, "train_speed(iter/s)": 0.20054 }, { "acc": 0.74264793, "epoch": 1.1983764586504313, "grad_norm": 3.40625, "learning_rate": 3.7855886330841383e-06, "loss": 1.02054281, "memory(GiB)": 369.42, "step": 47240, "train_speed(iter/s)": 0.200544 }, { "acc": 0.75401373, "epoch": 1.1985032978183663, "grad_norm": 2.546875, "learning_rate": 3.7845714363460908e-06, "loss": 0.98923435, "memory(GiB)": 369.42, "step": 47245, "train_speed(iter/s)": 0.200547 }, { "acc": 0.75567408, "epoch": 1.1986301369863013, "grad_norm": 2.171875, "learning_rate": 3.78355429306802e-06, "loss": 1.00405674, "memory(GiB)": 369.42, "step": 47250, "train_speed(iter/s)": 0.200549 }, { "acc": 0.76356936, "epoch": 1.1987569761542365, "grad_norm": 2.234375, "learning_rate": 3.7825372032946605e-06, "loss": 0.94029236, "memory(GiB)": 369.42, "step": 47255, "train_speed(iter/s)": 0.200552 }, { "acc": 0.75657749, "epoch": 1.1988838153221715, "grad_norm": 2.203125, "learning_rate": 3.7815201670707502e-06, "loss": 0.98392582, "memory(GiB)": 369.42, "step": 47260, "train_speed(iter/s)": 0.200555 }, { "acc": 0.74923429, "epoch": 1.1990106544901065, "grad_norm": 2.1875, "learning_rate": 3.7805031844410235e-06, "loss": 1.07391376, "memory(GiB)": 369.42, "step": 47265, "train_speed(iter/s)": 0.200559 }, { "acc": 0.75701542, "epoch": 1.1991374936580417, "grad_norm": 2.09375, "learning_rate": 3.7794862554502126e-06, "loss": 0.94731312, "memory(GiB)": 369.42, "step": 47270, "train_speed(iter/s)": 0.200563 }, { "acc": 0.74937506, "epoch": 1.1992643328259767, "grad_norm": 2.390625, "learning_rate": 3.778469380143045e-06, "loss": 0.97749844, "memory(GiB)": 369.42, "step": 47275, "train_speed(iter/s)": 0.200567 }, { "acc": 0.76115255, "epoch": 1.1993911719939117, "grad_norm": 2.0625, "learning_rate": 3.777452558564246e-06, "loss": 0.96981192, "memory(GiB)": 369.42, "step": 47280, "train_speed(iter/s)": 0.200569 }, { "acc": 0.75756674, "epoch": 1.1995180111618469, "grad_norm": 2.125, "learning_rate": 3.776435790758543e-06, "loss": 0.93967028, "memory(GiB)": 369.42, "step": 47285, "train_speed(iter/s)": 0.200573 }, { "acc": 0.76274214, "epoch": 1.1996448503297819, "grad_norm": 2.390625, "learning_rate": 3.7754190767706577e-06, "loss": 0.89092083, "memory(GiB)": 369.42, "step": 47290, "train_speed(iter/s)": 0.200576 }, { "acc": 0.75013824, "epoch": 1.1997716894977168, "grad_norm": 2.375, "learning_rate": 3.774402416645307e-06, "loss": 0.99665461, "memory(GiB)": 369.42, "step": 47295, "train_speed(iter/s)": 0.200578 }, { "acc": 0.75520992, "epoch": 1.1998985286656518, "grad_norm": 2.140625, "learning_rate": 3.77338581042721e-06, "loss": 0.99983845, "memory(GiB)": 369.42, "step": 47300, "train_speed(iter/s)": 0.20058 }, { "acc": 0.76687918, "epoch": 1.200025367833587, "grad_norm": 2.71875, "learning_rate": 3.7723692581610817e-06, "loss": 0.98923035, "memory(GiB)": 369.42, "step": 47305, "train_speed(iter/s)": 0.200585 }, { "acc": 0.75432749, "epoch": 1.200152207001522, "grad_norm": 2.03125, "learning_rate": 3.771352759891637e-06, "loss": 1.0180233, "memory(GiB)": 369.42, "step": 47310, "train_speed(iter/s)": 0.200588 }, { "acc": 0.74833097, "epoch": 1.2002790461694572, "grad_norm": 2.171875, "learning_rate": 3.7703363156635807e-06, "loss": 1.04226456, "memory(GiB)": 369.42, "step": 47315, "train_speed(iter/s)": 0.200591 }, { "acc": 0.76127548, "epoch": 1.2004058853373922, "grad_norm": 2.15625, "learning_rate": 3.769319925521624e-06, "loss": 1.00544224, "memory(GiB)": 369.42, "step": 47320, "train_speed(iter/s)": 0.200596 }, { "acc": 0.76607504, "epoch": 1.2005327245053272, "grad_norm": 2.40625, "learning_rate": 3.76830358951047e-06, "loss": 0.96935844, "memory(GiB)": 369.42, "step": 47325, "train_speed(iter/s)": 0.2006 }, { "acc": 0.74945526, "epoch": 1.2006595636732622, "grad_norm": 2.734375, "learning_rate": 3.767287307674826e-06, "loss": 0.95002117, "memory(GiB)": 369.42, "step": 47330, "train_speed(iter/s)": 0.200604 }, { "acc": 0.75605307, "epoch": 1.2007864028411974, "grad_norm": 2.328125, "learning_rate": 3.766271080059389e-06, "loss": 0.97221689, "memory(GiB)": 369.42, "step": 47335, "train_speed(iter/s)": 0.200608 }, { "acc": 0.75975466, "epoch": 1.2009132420091324, "grad_norm": 2.65625, "learning_rate": 3.7652549067088568e-06, "loss": 0.96175499, "memory(GiB)": 369.42, "step": 47340, "train_speed(iter/s)": 0.20061 }, { "acc": 0.75901384, "epoch": 1.2010400811770674, "grad_norm": 2.53125, "learning_rate": 3.7642387876679275e-06, "loss": 1.00121937, "memory(GiB)": 369.42, "step": 47345, "train_speed(iter/s)": 0.200613 }, { "acc": 0.74006767, "epoch": 1.2011669203450026, "grad_norm": 1.7578125, "learning_rate": 3.7632227229812947e-06, "loss": 0.98734694, "memory(GiB)": 369.42, "step": 47350, "train_speed(iter/s)": 0.200615 }, { "acc": 0.7464201, "epoch": 1.2012937595129376, "grad_norm": 2.390625, "learning_rate": 3.7622067126936475e-06, "loss": 1.02412691, "memory(GiB)": 369.42, "step": 47355, "train_speed(iter/s)": 0.200617 }, { "acc": 0.76407719, "epoch": 1.2014205986808726, "grad_norm": 2.078125, "learning_rate": 3.761190756849674e-06, "loss": 0.91917896, "memory(GiB)": 369.42, "step": 47360, "train_speed(iter/s)": 0.20062 }, { "acc": 0.75620232, "epoch": 1.2015474378488078, "grad_norm": 2.328125, "learning_rate": 3.7601748554940633e-06, "loss": 0.9721489, "memory(GiB)": 369.42, "step": 47365, "train_speed(iter/s)": 0.200624 }, { "acc": 0.7538888, "epoch": 1.2016742770167428, "grad_norm": 1.9296875, "learning_rate": 3.7591590086714984e-06, "loss": 0.95328159, "memory(GiB)": 369.42, "step": 47370, "train_speed(iter/s)": 0.200626 }, { "acc": 0.74653015, "epoch": 1.2018011161846778, "grad_norm": 2.734375, "learning_rate": 3.7581432164266587e-06, "loss": 0.96597805, "memory(GiB)": 369.42, "step": 47375, "train_speed(iter/s)": 0.200627 }, { "acc": 0.75336905, "epoch": 1.201927955352613, "grad_norm": 2.09375, "learning_rate": 3.7571274788042255e-06, "loss": 0.98223162, "memory(GiB)": 369.42, "step": 47380, "train_speed(iter/s)": 0.200631 }, { "acc": 0.75123291, "epoch": 1.202054794520548, "grad_norm": 2.328125, "learning_rate": 3.756111795848874e-06, "loss": 1.00709724, "memory(GiB)": 369.42, "step": 47385, "train_speed(iter/s)": 0.200633 }, { "acc": 0.74639254, "epoch": 1.202181633688483, "grad_norm": 2.234375, "learning_rate": 3.755096167605281e-06, "loss": 1.02627821, "memory(GiB)": 369.42, "step": 47390, "train_speed(iter/s)": 0.200637 }, { "acc": 0.7604743, "epoch": 1.202308472856418, "grad_norm": 2.046875, "learning_rate": 3.7540805941181165e-06, "loss": 0.9741333, "memory(GiB)": 369.42, "step": 47395, "train_speed(iter/s)": 0.20064 }, { "acc": 0.75508604, "epoch": 1.2024353120243532, "grad_norm": 2.109375, "learning_rate": 3.7530650754320492e-06, "loss": 1.0110611, "memory(GiB)": 369.42, "step": 47400, "train_speed(iter/s)": 0.200644 }, { "acc": 0.77108893, "epoch": 1.2025621511922882, "grad_norm": 1.90625, "learning_rate": 3.752049611591746e-06, "loss": 0.90195484, "memory(GiB)": 369.42, "step": 47405, "train_speed(iter/s)": 0.200646 }, { "acc": 0.74758253, "epoch": 1.2026889903602231, "grad_norm": 2.546875, "learning_rate": 3.7510342026418756e-06, "loss": 0.98818016, "memory(GiB)": 369.42, "step": 47410, "train_speed(iter/s)": 0.20065 }, { "acc": 0.75382113, "epoch": 1.2028158295281584, "grad_norm": 2.5, "learning_rate": 3.7500188486270948e-06, "loss": 1.00115166, "memory(GiB)": 369.42, "step": 47415, "train_speed(iter/s)": 0.200653 }, { "acc": 0.73789387, "epoch": 1.2029426686960933, "grad_norm": 2.359375, "learning_rate": 3.7490035495920664e-06, "loss": 1.05701485, "memory(GiB)": 369.42, "step": 47420, "train_speed(iter/s)": 0.200658 }, { "acc": 0.77070131, "epoch": 1.2030695078640283, "grad_norm": 1.9765625, "learning_rate": 3.747988305581447e-06, "loss": 0.91322927, "memory(GiB)": 369.42, "step": 47425, "train_speed(iter/s)": 0.200661 }, { "acc": 0.75442157, "epoch": 1.2031963470319635, "grad_norm": 2.375, "learning_rate": 3.7469731166398933e-06, "loss": 0.98558254, "memory(GiB)": 369.42, "step": 47430, "train_speed(iter/s)": 0.200663 }, { "acc": 0.751231, "epoch": 1.2033231861998985, "grad_norm": 2.0625, "learning_rate": 3.745957982812054e-06, "loss": 1.01344032, "memory(GiB)": 369.42, "step": 47435, "train_speed(iter/s)": 0.200667 }, { "acc": 0.74732399, "epoch": 1.2034500253678335, "grad_norm": 2.09375, "learning_rate": 3.744942904142582e-06, "loss": 0.99786758, "memory(GiB)": 369.42, "step": 47440, "train_speed(iter/s)": 0.200669 }, { "acc": 0.76633778, "epoch": 1.2035768645357687, "grad_norm": 1.953125, "learning_rate": 3.743927880676125e-06, "loss": 0.91945553, "memory(GiB)": 369.42, "step": 47445, "train_speed(iter/s)": 0.200671 }, { "acc": 0.76141691, "epoch": 1.2037037037037037, "grad_norm": 1.796875, "learning_rate": 3.742912912457329e-06, "loss": 0.93631783, "memory(GiB)": 369.42, "step": 47450, "train_speed(iter/s)": 0.200674 }, { "acc": 0.74905338, "epoch": 1.2038305428716387, "grad_norm": 2.234375, "learning_rate": 3.7418979995308336e-06, "loss": 0.96767921, "memory(GiB)": 369.42, "step": 47455, "train_speed(iter/s)": 0.200677 }, { "acc": 0.75018892, "epoch": 1.2039573820395737, "grad_norm": 2.015625, "learning_rate": 3.740883141941282e-06, "loss": 0.97601128, "memory(GiB)": 369.42, "step": 47460, "train_speed(iter/s)": 0.200679 }, { "acc": 0.75646038, "epoch": 1.204084221207509, "grad_norm": 1.9921875, "learning_rate": 3.7398683397333103e-06, "loss": 0.98115616, "memory(GiB)": 369.42, "step": 47465, "train_speed(iter/s)": 0.200682 }, { "acc": 0.74989395, "epoch": 1.204211060375444, "grad_norm": 1.8203125, "learning_rate": 3.7388535929515573e-06, "loss": 0.97530899, "memory(GiB)": 369.42, "step": 47470, "train_speed(iter/s)": 0.200686 }, { "acc": 0.75876646, "epoch": 1.204337899543379, "grad_norm": 2.375, "learning_rate": 3.737838901640653e-06, "loss": 0.92887268, "memory(GiB)": 369.42, "step": 47475, "train_speed(iter/s)": 0.200689 }, { "acc": 0.75914202, "epoch": 1.204464738711314, "grad_norm": 2.859375, "learning_rate": 3.736824265845228e-06, "loss": 0.96075535, "memory(GiB)": 369.42, "step": 47480, "train_speed(iter/s)": 0.200693 }, { "acc": 0.76109638, "epoch": 1.204591577879249, "grad_norm": 2.109375, "learning_rate": 3.7358096856099118e-06, "loss": 0.95866957, "memory(GiB)": 369.42, "step": 47485, "train_speed(iter/s)": 0.200696 }, { "acc": 0.74726949, "epoch": 1.204718417047184, "grad_norm": 2.15625, "learning_rate": 3.7347951609793315e-06, "loss": 0.95460176, "memory(GiB)": 369.42, "step": 47490, "train_speed(iter/s)": 0.2007 }, { "acc": 0.7422441, "epoch": 1.2048452562151193, "grad_norm": 2.1875, "learning_rate": 3.7337806919981077e-06, "loss": 1.00754204, "memory(GiB)": 369.42, "step": 47495, "train_speed(iter/s)": 0.200702 }, { "acc": 0.75092344, "epoch": 1.2049720953830543, "grad_norm": 2.3125, "learning_rate": 3.732766278710861e-06, "loss": 0.98402138, "memory(GiB)": 369.42, "step": 47500, "train_speed(iter/s)": 0.200705 }, { "acc": 0.75631266, "epoch": 1.2050989345509893, "grad_norm": 2.09375, "learning_rate": 3.7317519211622123e-06, "loss": 0.96779003, "memory(GiB)": 369.42, "step": 47505, "train_speed(iter/s)": 0.200708 }, { "acc": 0.74909201, "epoch": 1.2052257737189245, "grad_norm": 2.046875, "learning_rate": 3.7307376193967772e-06, "loss": 1.01893425, "memory(GiB)": 369.42, "step": 47510, "train_speed(iter/s)": 0.200711 }, { "acc": 0.752283, "epoch": 1.2053526128868595, "grad_norm": 2.109375, "learning_rate": 3.7297233734591664e-06, "loss": 0.96178827, "memory(GiB)": 369.42, "step": 47515, "train_speed(iter/s)": 0.200712 }, { "acc": 0.74793901, "epoch": 1.2054794520547945, "grad_norm": 2.625, "learning_rate": 3.7287091833939948e-06, "loss": 1.01636362, "memory(GiB)": 369.42, "step": 47520, "train_speed(iter/s)": 0.200714 }, { "acc": 0.76213388, "epoch": 1.2056062912227297, "grad_norm": 1.96875, "learning_rate": 3.7276950492458675e-06, "loss": 0.94414139, "memory(GiB)": 369.42, "step": 47525, "train_speed(iter/s)": 0.200718 }, { "acc": 0.74359393, "epoch": 1.2057331303906647, "grad_norm": 2.125, "learning_rate": 3.7266809710593956e-06, "loss": 1.02223091, "memory(GiB)": 369.42, "step": 47530, "train_speed(iter/s)": 0.200722 }, { "acc": 0.7649879, "epoch": 1.2058599695585996, "grad_norm": 2.28125, "learning_rate": 3.7256669488791763e-06, "loss": 0.94566288, "memory(GiB)": 369.42, "step": 47535, "train_speed(iter/s)": 0.200725 }, { "acc": 0.75651922, "epoch": 1.2059868087265349, "grad_norm": 2.140625, "learning_rate": 3.7246529827498156e-06, "loss": 0.9118885, "memory(GiB)": 369.42, "step": 47540, "train_speed(iter/s)": 0.200727 }, { "acc": 0.75402603, "epoch": 1.2061136478944698, "grad_norm": 2.015625, "learning_rate": 3.7236390727159094e-06, "loss": 0.9713542, "memory(GiB)": 369.42, "step": 47545, "train_speed(iter/s)": 0.200728 }, { "acc": 0.76619558, "epoch": 1.2062404870624048, "grad_norm": 1.9296875, "learning_rate": 3.7226252188220573e-06, "loss": 0.92003431, "memory(GiB)": 369.42, "step": 47550, "train_speed(iter/s)": 0.200731 }, { "acc": 0.74583349, "epoch": 1.2063673262303398, "grad_norm": 2.359375, "learning_rate": 3.7216114211128505e-06, "loss": 1.00570307, "memory(GiB)": 369.42, "step": 47555, "train_speed(iter/s)": 0.200735 }, { "acc": 0.75296402, "epoch": 1.206494165398275, "grad_norm": 2.3125, "learning_rate": 3.720597679632879e-06, "loss": 0.9608448, "memory(GiB)": 369.42, "step": 47560, "train_speed(iter/s)": 0.200738 }, { "acc": 0.75112667, "epoch": 1.20662100456621, "grad_norm": 2.8125, "learning_rate": 3.7195839944267357e-06, "loss": 1.03019009, "memory(GiB)": 369.42, "step": 47565, "train_speed(iter/s)": 0.200742 }, { "acc": 0.76710567, "epoch": 1.206747843734145, "grad_norm": 1.796875, "learning_rate": 3.718570365539006e-06, "loss": 0.91928844, "memory(GiB)": 369.42, "step": 47570, "train_speed(iter/s)": 0.200744 }, { "acc": 0.76000433, "epoch": 1.2068746829020802, "grad_norm": 2.515625, "learning_rate": 3.717556793014271e-06, "loss": 0.97323265, "memory(GiB)": 369.42, "step": 47575, "train_speed(iter/s)": 0.200747 }, { "acc": 0.74929371, "epoch": 1.2070015220700152, "grad_norm": 2.078125, "learning_rate": 3.716543276897113e-06, "loss": 1.02632408, "memory(GiB)": 369.42, "step": 47580, "train_speed(iter/s)": 0.200751 }, { "acc": 0.74674129, "epoch": 1.2071283612379502, "grad_norm": 2.03125, "learning_rate": 3.715529817232114e-06, "loss": 0.96494083, "memory(GiB)": 369.42, "step": 47585, "train_speed(iter/s)": 0.200753 }, { "acc": 0.74774084, "epoch": 1.2072552004058854, "grad_norm": 2.5, "learning_rate": 3.7145164140638483e-06, "loss": 1.00106678, "memory(GiB)": 369.42, "step": 47590, "train_speed(iter/s)": 0.200757 }, { "acc": 0.74778724, "epoch": 1.2073820395738204, "grad_norm": 2.09375, "learning_rate": 3.713503067436889e-06, "loss": 1.00410929, "memory(GiB)": 369.42, "step": 47595, "train_speed(iter/s)": 0.200759 }, { "acc": 0.75883894, "epoch": 1.2075088787417554, "grad_norm": 2.1875, "learning_rate": 3.7124897773958084e-06, "loss": 0.94611998, "memory(GiB)": 369.42, "step": 47600, "train_speed(iter/s)": 0.200762 }, { "acc": 0.75212059, "epoch": 1.2076357179096906, "grad_norm": 2.359375, "learning_rate": 3.7114765439851752e-06, "loss": 1.03178005, "memory(GiB)": 369.42, "step": 47605, "train_speed(iter/s)": 0.200766 }, { "acc": 0.75774055, "epoch": 1.2077625570776256, "grad_norm": 1.859375, "learning_rate": 3.7104633672495584e-06, "loss": 0.94954529, "memory(GiB)": 369.42, "step": 47610, "train_speed(iter/s)": 0.200768 }, { "acc": 0.7683279, "epoch": 1.2078893962455606, "grad_norm": 2.609375, "learning_rate": 3.709450247233519e-06, "loss": 0.91371536, "memory(GiB)": 369.42, "step": 47615, "train_speed(iter/s)": 0.200772 }, { "acc": 0.73654814, "epoch": 1.2080162354134956, "grad_norm": 3.5625, "learning_rate": 3.7084371839816204e-06, "loss": 1.02953548, "memory(GiB)": 369.42, "step": 47620, "train_speed(iter/s)": 0.200776 }, { "acc": 0.76356506, "epoch": 1.2081430745814308, "grad_norm": 1.8125, "learning_rate": 3.707424177538419e-06, "loss": 0.94198265, "memory(GiB)": 369.42, "step": 47625, "train_speed(iter/s)": 0.200779 }, { "acc": 0.75554638, "epoch": 1.2082699137493658, "grad_norm": 2.21875, "learning_rate": 3.7064112279484753e-06, "loss": 0.97929611, "memory(GiB)": 369.42, "step": 47630, "train_speed(iter/s)": 0.200782 }, { "acc": 0.75716043, "epoch": 1.208396752917301, "grad_norm": 2.453125, "learning_rate": 3.7053983352563407e-06, "loss": 0.96263237, "memory(GiB)": 369.42, "step": 47635, "train_speed(iter/s)": 0.200785 }, { "acc": 0.75443192, "epoch": 1.208523592085236, "grad_norm": 1.8515625, "learning_rate": 3.704385499506565e-06, "loss": 0.96474295, "memory(GiB)": 369.42, "step": 47640, "train_speed(iter/s)": 0.200789 }, { "acc": 0.7390204, "epoch": 1.208650431253171, "grad_norm": 2.15625, "learning_rate": 3.703372720743702e-06, "loss": 1.01823301, "memory(GiB)": 369.42, "step": 47645, "train_speed(iter/s)": 0.200791 }, { "acc": 0.75293636, "epoch": 1.208777270421106, "grad_norm": 2.0625, "learning_rate": 3.7023599990122966e-06, "loss": 0.95808144, "memory(GiB)": 369.42, "step": 47650, "train_speed(iter/s)": 0.200793 }, { "acc": 0.74397812, "epoch": 1.2089041095890412, "grad_norm": 2.21875, "learning_rate": 3.7013473343568897e-06, "loss": 1.01823673, "memory(GiB)": 369.42, "step": 47655, "train_speed(iter/s)": 0.200797 }, { "acc": 0.75994306, "epoch": 1.2090309487569761, "grad_norm": 2.34375, "learning_rate": 3.700334726822026e-06, "loss": 0.95606327, "memory(GiB)": 369.42, "step": 47660, "train_speed(iter/s)": 0.200801 }, { "acc": 0.76910982, "epoch": 1.2091577879249111, "grad_norm": 2.390625, "learning_rate": 3.6993221764522435e-06, "loss": 0.89508953, "memory(GiB)": 369.42, "step": 47665, "train_speed(iter/s)": 0.200805 }, { "acc": 0.75304193, "epoch": 1.2092846270928463, "grad_norm": 2.046875, "learning_rate": 3.6983096832920806e-06, "loss": 0.98010292, "memory(GiB)": 369.42, "step": 47670, "train_speed(iter/s)": 0.200807 }, { "acc": 0.76158028, "epoch": 1.2094114662607813, "grad_norm": 2.203125, "learning_rate": 3.697297247386066e-06, "loss": 1.00521278, "memory(GiB)": 369.42, "step": 47675, "train_speed(iter/s)": 0.20081 }, { "acc": 0.74999566, "epoch": 1.2095383054287163, "grad_norm": 1.9921875, "learning_rate": 3.6962848687787365e-06, "loss": 1.05142651, "memory(GiB)": 369.42, "step": 47680, "train_speed(iter/s)": 0.200813 }, { "acc": 0.75094633, "epoch": 1.2096651445966515, "grad_norm": 1.96875, "learning_rate": 3.6952725475146183e-06, "loss": 0.93290854, "memory(GiB)": 369.42, "step": 47685, "train_speed(iter/s)": 0.200814 }, { "acc": 0.74977074, "epoch": 1.2097919837645865, "grad_norm": 2.0625, "learning_rate": 3.69426028363824e-06, "loss": 0.9738307, "memory(GiB)": 369.42, "step": 47690, "train_speed(iter/s)": 0.200816 }, { "acc": 0.7662323, "epoch": 1.2099188229325215, "grad_norm": 2.515625, "learning_rate": 3.6932480771941237e-06, "loss": 0.93057022, "memory(GiB)": 369.42, "step": 47695, "train_speed(iter/s)": 0.200817 }, { "acc": 0.75695667, "epoch": 1.2100456621004567, "grad_norm": 2.125, "learning_rate": 3.6922359282267904e-06, "loss": 0.95547161, "memory(GiB)": 369.42, "step": 47700, "train_speed(iter/s)": 0.200821 }, { "acc": 0.75622768, "epoch": 1.2101725012683917, "grad_norm": 1.875, "learning_rate": 3.6912238367807606e-06, "loss": 0.98162365, "memory(GiB)": 369.42, "step": 47705, "train_speed(iter/s)": 0.200824 }, { "acc": 0.74740219, "epoch": 1.2102993404363267, "grad_norm": 2.203125, "learning_rate": 3.6902118029005507e-06, "loss": 1.01524315, "memory(GiB)": 369.42, "step": 47710, "train_speed(iter/s)": 0.200828 }, { "acc": 0.75294161, "epoch": 1.2104261796042617, "grad_norm": 2.3125, "learning_rate": 3.6891998266306717e-06, "loss": 1.00555534, "memory(GiB)": 369.42, "step": 47715, "train_speed(iter/s)": 0.200831 }, { "acc": 0.75368848, "epoch": 1.210553018772197, "grad_norm": 2.234375, "learning_rate": 3.688187908015636e-06, "loss": 0.96993952, "memory(GiB)": 369.42, "step": 47720, "train_speed(iter/s)": 0.200832 }, { "acc": 0.75878925, "epoch": 1.2106798579401319, "grad_norm": 2.765625, "learning_rate": 3.6871760470999546e-06, "loss": 1.00107346, "memory(GiB)": 369.42, "step": 47725, "train_speed(iter/s)": 0.200836 }, { "acc": 0.75448904, "epoch": 1.2108066971080669, "grad_norm": 2.0625, "learning_rate": 3.6861642439281325e-06, "loss": 0.95535812, "memory(GiB)": 369.42, "step": 47730, "train_speed(iter/s)": 0.200839 }, { "acc": 0.74318595, "epoch": 1.210933536276002, "grad_norm": 2.109375, "learning_rate": 3.6851524985446707e-06, "loss": 1.03266973, "memory(GiB)": 369.42, "step": 47735, "train_speed(iter/s)": 0.200844 }, { "acc": 0.74962611, "epoch": 1.211060375443937, "grad_norm": 2.28125, "learning_rate": 3.6841408109940737e-06, "loss": 0.94102974, "memory(GiB)": 369.42, "step": 47740, "train_speed(iter/s)": 0.200849 }, { "acc": 0.75056133, "epoch": 1.211187214611872, "grad_norm": 2.5625, "learning_rate": 3.6831291813208377e-06, "loss": 0.99519129, "memory(GiB)": 369.42, "step": 47745, "train_speed(iter/s)": 0.20085 }, { "acc": 0.73164968, "epoch": 1.2113140537798073, "grad_norm": 2.359375, "learning_rate": 3.682117609569462e-06, "loss": 1.05461922, "memory(GiB)": 369.42, "step": 47750, "train_speed(iter/s)": 0.200852 }, { "acc": 0.74661446, "epoch": 1.2114408929477423, "grad_norm": 3.0, "learning_rate": 3.681106095784436e-06, "loss": 1.02514067, "memory(GiB)": 369.42, "step": 47755, "train_speed(iter/s)": 0.200855 }, { "acc": 0.77070465, "epoch": 1.2115677321156773, "grad_norm": 1.6796875, "learning_rate": 3.6800946400102522e-06, "loss": 0.90891638, "memory(GiB)": 369.42, "step": 47760, "train_speed(iter/s)": 0.200857 }, { "acc": 0.75517406, "epoch": 1.2116945712836125, "grad_norm": 2.3125, "learning_rate": 3.6790832422913984e-06, "loss": 0.99298735, "memory(GiB)": 369.42, "step": 47765, "train_speed(iter/s)": 0.20086 }, { "acc": 0.75131493, "epoch": 1.2118214104515475, "grad_norm": 3.09375, "learning_rate": 3.6780719026723632e-06, "loss": 0.98528671, "memory(GiB)": 369.42, "step": 47770, "train_speed(iter/s)": 0.200864 }, { "acc": 0.74125271, "epoch": 1.2119482496194824, "grad_norm": 2.21875, "learning_rate": 3.677060621197627e-06, "loss": 1.05829792, "memory(GiB)": 369.42, "step": 47775, "train_speed(iter/s)": 0.200868 }, { "acc": 0.74473829, "epoch": 1.2120750887874174, "grad_norm": 1.8203125, "learning_rate": 3.6760493979116696e-06, "loss": 0.96470366, "memory(GiB)": 369.42, "step": 47780, "train_speed(iter/s)": 0.200871 }, { "acc": 0.7651454, "epoch": 1.2122019279553526, "grad_norm": 2.0, "learning_rate": 3.6750382328589725e-06, "loss": 0.97495842, "memory(GiB)": 369.42, "step": 47785, "train_speed(iter/s)": 0.200873 }, { "acc": 0.74702778, "epoch": 1.2123287671232876, "grad_norm": 2.71875, "learning_rate": 3.67402712608401e-06, "loss": 0.99579105, "memory(GiB)": 369.42, "step": 47790, "train_speed(iter/s)": 0.200877 }, { "acc": 0.75072827, "epoch": 1.2124556062912228, "grad_norm": 2.0, "learning_rate": 3.673016077631253e-06, "loss": 0.984202, "memory(GiB)": 369.42, "step": 47795, "train_speed(iter/s)": 0.200881 }, { "acc": 0.74350238, "epoch": 1.2125824454591578, "grad_norm": 2.53125, "learning_rate": 3.672005087545173e-06, "loss": 1.01990557, "memory(GiB)": 369.42, "step": 47800, "train_speed(iter/s)": 0.200884 }, { "acc": 0.75703182, "epoch": 1.2127092846270928, "grad_norm": 2.125, "learning_rate": 3.6709941558702393e-06, "loss": 0.97856789, "memory(GiB)": 369.42, "step": 47805, "train_speed(iter/s)": 0.200887 }, { "acc": 0.75041986, "epoch": 1.2128361237950278, "grad_norm": 2.234375, "learning_rate": 3.6699832826509174e-06, "loss": 1.03681192, "memory(GiB)": 369.42, "step": 47810, "train_speed(iter/s)": 0.20089 }, { "acc": 0.74099402, "epoch": 1.212962962962963, "grad_norm": 2.125, "learning_rate": 3.6689724679316665e-06, "loss": 1.04524107, "memory(GiB)": 369.42, "step": 47815, "train_speed(iter/s)": 0.200893 }, { "acc": 0.74376459, "epoch": 1.213089802130898, "grad_norm": 2.359375, "learning_rate": 3.66796171175695e-06, "loss": 1.01848164, "memory(GiB)": 369.42, "step": 47820, "train_speed(iter/s)": 0.200896 }, { "acc": 0.75812292, "epoch": 1.213216641298833, "grad_norm": 2.109375, "learning_rate": 3.666951014171224e-06, "loss": 0.97693939, "memory(GiB)": 369.42, "step": 47825, "train_speed(iter/s)": 0.2009 }, { "acc": 0.74767175, "epoch": 1.2133434804667682, "grad_norm": 1.9140625, "learning_rate": 3.6659403752189453e-06, "loss": 0.96208248, "memory(GiB)": 369.42, "step": 47830, "train_speed(iter/s)": 0.200903 }, { "acc": 0.75583525, "epoch": 1.2134703196347032, "grad_norm": 2.1875, "learning_rate": 3.664929794944565e-06, "loss": 0.94296846, "memory(GiB)": 369.42, "step": 47835, "train_speed(iter/s)": 0.200905 }, { "acc": 0.75833197, "epoch": 1.2135971588026382, "grad_norm": 2.1875, "learning_rate": 3.663919273392532e-06, "loss": 0.96979351, "memory(GiB)": 369.42, "step": 47840, "train_speed(iter/s)": 0.200908 }, { "acc": 0.76616278, "epoch": 1.2137239979705734, "grad_norm": 1.8203125, "learning_rate": 3.662908810607294e-06, "loss": 0.90163107, "memory(GiB)": 369.42, "step": 47845, "train_speed(iter/s)": 0.20091 }, { "acc": 0.75364413, "epoch": 1.2138508371385084, "grad_norm": 2.0625, "learning_rate": 3.6618984066332986e-06, "loss": 1.02716303, "memory(GiB)": 369.42, "step": 47850, "train_speed(iter/s)": 0.200913 }, { "acc": 0.75452566, "epoch": 1.2139776763064434, "grad_norm": 2.546875, "learning_rate": 3.660888061514984e-06, "loss": 0.96422138, "memory(GiB)": 369.42, "step": 47855, "train_speed(iter/s)": 0.200914 }, { "acc": 0.75700359, "epoch": 1.2141045154743786, "grad_norm": 2.296875, "learning_rate": 3.6598777752967896e-06, "loss": 1.00945663, "memory(GiB)": 369.42, "step": 47860, "train_speed(iter/s)": 0.200916 }, { "acc": 0.75852509, "epoch": 1.2142313546423136, "grad_norm": 2.203125, "learning_rate": 3.658867548023156e-06, "loss": 0.98500481, "memory(GiB)": 369.42, "step": 47865, "train_speed(iter/s)": 0.200916 }, { "acc": 0.75487642, "epoch": 1.2143581938102486, "grad_norm": 2.515625, "learning_rate": 3.657857379738515e-06, "loss": 1.01336555, "memory(GiB)": 369.42, "step": 47870, "train_speed(iter/s)": 0.200918 }, { "acc": 0.74595127, "epoch": 1.2144850329781836, "grad_norm": 2.484375, "learning_rate": 3.656847270487298e-06, "loss": 1.02991142, "memory(GiB)": 369.42, "step": 47875, "train_speed(iter/s)": 0.200921 }, { "acc": 0.74465322, "epoch": 1.2146118721461188, "grad_norm": 2.21875, "learning_rate": 3.655837220313936e-06, "loss": 0.99878178, "memory(GiB)": 369.42, "step": 47880, "train_speed(iter/s)": 0.200924 }, { "acc": 0.73941708, "epoch": 1.2147387113140538, "grad_norm": 1.7265625, "learning_rate": 3.654827229262852e-06, "loss": 1.01057625, "memory(GiB)": 369.42, "step": 47885, "train_speed(iter/s)": 0.200926 }, { "acc": 0.76359782, "epoch": 1.2148655504819887, "grad_norm": 2.25, "learning_rate": 3.653817297378476e-06, "loss": 0.91956959, "memory(GiB)": 369.42, "step": 47890, "train_speed(iter/s)": 0.200929 }, { "acc": 0.75927734, "epoch": 1.214992389649924, "grad_norm": 1.96875, "learning_rate": 3.6528074247052225e-06, "loss": 0.94119759, "memory(GiB)": 369.42, "step": 47895, "train_speed(iter/s)": 0.200933 }, { "acc": 0.75666113, "epoch": 1.215119228817859, "grad_norm": 1.6953125, "learning_rate": 3.651797611287514e-06, "loss": 0.92662029, "memory(GiB)": 369.42, "step": 47900, "train_speed(iter/s)": 0.200936 }, { "acc": 0.77033615, "epoch": 1.215246067985794, "grad_norm": 9.5, "learning_rate": 3.6507878571697646e-06, "loss": 0.94230394, "memory(GiB)": 369.42, "step": 47905, "train_speed(iter/s)": 0.200939 }, { "acc": 0.75273962, "epoch": 1.2153729071537291, "grad_norm": 1.984375, "learning_rate": 3.6497781623963915e-06, "loss": 0.97944794, "memory(GiB)": 369.42, "step": 47910, "train_speed(iter/s)": 0.200943 }, { "acc": 0.74519539, "epoch": 1.2154997463216641, "grad_norm": 2.078125, "learning_rate": 3.648768527011802e-06, "loss": 1.01029453, "memory(GiB)": 369.42, "step": 47915, "train_speed(iter/s)": 0.200945 }, { "acc": 0.74858303, "epoch": 1.2156265854895991, "grad_norm": 2.203125, "learning_rate": 3.6477589510604044e-06, "loss": 1.03139572, "memory(GiB)": 369.42, "step": 47920, "train_speed(iter/s)": 0.200949 }, { "acc": 0.74675126, "epoch": 1.2157534246575343, "grad_norm": 1.875, "learning_rate": 3.646749434586607e-06, "loss": 1.00056734, "memory(GiB)": 369.42, "step": 47925, "train_speed(iter/s)": 0.200952 }, { "acc": 0.76384196, "epoch": 1.2158802638254693, "grad_norm": 2.421875, "learning_rate": 3.645739977634811e-06, "loss": 0.9294158, "memory(GiB)": 369.42, "step": 47930, "train_speed(iter/s)": 0.200956 }, { "acc": 0.75483923, "epoch": 1.2160071029934043, "grad_norm": 2.1875, "learning_rate": 3.6447305802494177e-06, "loss": 0.97906494, "memory(GiB)": 369.42, "step": 47935, "train_speed(iter/s)": 0.20096 }, { "acc": 0.75571628, "epoch": 1.2161339421613393, "grad_norm": 2.234375, "learning_rate": 3.6437212424748227e-06, "loss": 1.02804937, "memory(GiB)": 369.42, "step": 47940, "train_speed(iter/s)": 0.200964 }, { "acc": 0.74982786, "epoch": 1.2162607813292745, "grad_norm": 2.625, "learning_rate": 3.642711964355423e-06, "loss": 1.00868464, "memory(GiB)": 369.42, "step": 47945, "train_speed(iter/s)": 0.200966 }, { "acc": 0.73853626, "epoch": 1.2163876204972095, "grad_norm": 2.34375, "learning_rate": 3.6417027459356134e-06, "loss": 1.02546186, "memory(GiB)": 369.42, "step": 47950, "train_speed(iter/s)": 0.200969 }, { "acc": 0.73725557, "epoch": 1.2165144596651447, "grad_norm": 2.171875, "learning_rate": 3.640693587259778e-06, "loss": 1.07295876, "memory(GiB)": 369.42, "step": 47955, "train_speed(iter/s)": 0.200971 }, { "acc": 0.73830924, "epoch": 1.2166412988330797, "grad_norm": 1.7421875, "learning_rate": 3.6396844883723092e-06, "loss": 1.05329628, "memory(GiB)": 369.42, "step": 47960, "train_speed(iter/s)": 0.200974 }, { "acc": 0.75080481, "epoch": 1.2167681380010147, "grad_norm": 2.109375, "learning_rate": 3.6386754493175893e-06, "loss": 1.03460531, "memory(GiB)": 369.42, "step": 47965, "train_speed(iter/s)": 0.200977 }, { "acc": 0.74602847, "epoch": 1.2168949771689497, "grad_norm": 1.796875, "learning_rate": 3.637666470140003e-06, "loss": 0.99545975, "memory(GiB)": 369.42, "step": 47970, "train_speed(iter/s)": 0.200979 }, { "acc": 0.7576211, "epoch": 1.2170218163368849, "grad_norm": 2.09375, "learning_rate": 3.6366575508839265e-06, "loss": 0.98315039, "memory(GiB)": 369.42, "step": 47975, "train_speed(iter/s)": 0.200983 }, { "acc": 0.74493079, "epoch": 1.2171486555048199, "grad_norm": 2.375, "learning_rate": 3.635648691593737e-06, "loss": 1.06178875, "memory(GiB)": 369.42, "step": 47980, "train_speed(iter/s)": 0.200985 }, { "acc": 0.75695353, "epoch": 1.2172754946727549, "grad_norm": 2.0625, "learning_rate": 3.6346398923138094e-06, "loss": 0.98997078, "memory(GiB)": 369.42, "step": 47985, "train_speed(iter/s)": 0.200988 }, { "acc": 0.75433216, "epoch": 1.21740233384069, "grad_norm": 1.9765625, "learning_rate": 3.633631153088517e-06, "loss": 0.98580675, "memory(GiB)": 369.42, "step": 47990, "train_speed(iter/s)": 0.200992 }, { "acc": 0.74000492, "epoch": 1.217529173008625, "grad_norm": 2.1875, "learning_rate": 3.6326224739622255e-06, "loss": 1.06359291, "memory(GiB)": 369.42, "step": 47995, "train_speed(iter/s)": 0.200995 }, { "acc": 0.74853349, "epoch": 1.21765601217656, "grad_norm": 2.328125, "learning_rate": 3.6316138549793024e-06, "loss": 0.98147278, "memory(GiB)": 369.42, "step": 48000, "train_speed(iter/s)": 0.200997 }, { "epoch": 1.21765601217656, "eval_acc": 0.737899892090704, "eval_loss": 0.9700331091880798, "eval_runtime": 385.6386, "eval_samples_per_second": 16.518, "eval_steps_per_second": 8.259, "step": 48000 }, { "acc": 0.75966239, "epoch": 1.2177828513444953, "grad_norm": 2.125, "learning_rate": 3.630605296184111e-06, "loss": 0.94758205, "memory(GiB)": 369.42, "step": 48005, "train_speed(iter/s)": 0.200402 }, { "acc": 0.74157982, "epoch": 1.2179096905124303, "grad_norm": 2.15625, "learning_rate": 3.6295967976210146e-06, "loss": 1.02467203, "memory(GiB)": 369.42, "step": 48010, "train_speed(iter/s)": 0.200405 }, { "acc": 0.73693447, "epoch": 1.2180365296803652, "grad_norm": 2.203125, "learning_rate": 3.6285883593343685e-06, "loss": 1.06577463, "memory(GiB)": 369.42, "step": 48015, "train_speed(iter/s)": 0.200408 }, { "acc": 0.75340133, "epoch": 1.2181633688483005, "grad_norm": 2.203125, "learning_rate": 3.6275799813685274e-06, "loss": 0.99106216, "memory(GiB)": 369.42, "step": 48020, "train_speed(iter/s)": 0.200411 }, { "acc": 0.74760809, "epoch": 1.2182902080162354, "grad_norm": 2.21875, "learning_rate": 3.6265716637678484e-06, "loss": 0.99055157, "memory(GiB)": 369.42, "step": 48025, "train_speed(iter/s)": 0.200414 }, { "acc": 0.75268288, "epoch": 1.2184170471841704, "grad_norm": 2.65625, "learning_rate": 3.62556340657668e-06, "loss": 0.96916533, "memory(GiB)": 369.42, "step": 48030, "train_speed(iter/s)": 0.200417 }, { "acc": 0.7518888, "epoch": 1.2185438863521054, "grad_norm": 2.1875, "learning_rate": 3.6245552098393665e-06, "loss": 1.04314728, "memory(GiB)": 369.42, "step": 48035, "train_speed(iter/s)": 0.200421 }, { "acc": 0.7467433, "epoch": 1.2186707255200406, "grad_norm": 2.171875, "learning_rate": 3.6235470736002576e-06, "loss": 0.95976677, "memory(GiB)": 369.42, "step": 48040, "train_speed(iter/s)": 0.200424 }, { "acc": 0.75484009, "epoch": 1.2187975646879756, "grad_norm": 1.953125, "learning_rate": 3.622538997903693e-06, "loss": 0.98907642, "memory(GiB)": 369.42, "step": 48045, "train_speed(iter/s)": 0.200427 }, { "acc": 0.74458652, "epoch": 1.2189244038559106, "grad_norm": 2.140625, "learning_rate": 3.621530982794015e-06, "loss": 1.0124403, "memory(GiB)": 369.42, "step": 48050, "train_speed(iter/s)": 0.20043 }, { "acc": 0.74215846, "epoch": 1.2190512430238458, "grad_norm": 2.3125, "learning_rate": 3.620523028315558e-06, "loss": 1.05381737, "memory(GiB)": 369.42, "step": 48055, "train_speed(iter/s)": 0.200433 }, { "acc": 0.74821205, "epoch": 1.2191780821917808, "grad_norm": 2.234375, "learning_rate": 3.6195151345126556e-06, "loss": 1.04485855, "memory(GiB)": 369.42, "step": 48060, "train_speed(iter/s)": 0.200436 }, { "acc": 0.76386251, "epoch": 1.2193049213597158, "grad_norm": 1.8046875, "learning_rate": 3.6185073014296425e-06, "loss": 0.94530144, "memory(GiB)": 369.42, "step": 48065, "train_speed(iter/s)": 0.200437 }, { "acc": 0.75715384, "epoch": 1.219431760527651, "grad_norm": 2.453125, "learning_rate": 3.6174995291108474e-06, "loss": 0.93670216, "memory(GiB)": 369.42, "step": 48070, "train_speed(iter/s)": 0.20044 }, { "acc": 0.7451179, "epoch": 1.219558599695586, "grad_norm": 2.28125, "learning_rate": 3.6164918176005937e-06, "loss": 1.01375961, "memory(GiB)": 369.42, "step": 48075, "train_speed(iter/s)": 0.200443 }, { "acc": 0.73737698, "epoch": 1.219685438863521, "grad_norm": 2.09375, "learning_rate": 3.6154841669432062e-06, "loss": 0.95489502, "memory(GiB)": 369.42, "step": 48080, "train_speed(iter/s)": 0.200445 }, { "acc": 0.75888948, "epoch": 1.2198122780314562, "grad_norm": 2.125, "learning_rate": 3.614476577183007e-06, "loss": 0.97216263, "memory(GiB)": 369.42, "step": 48085, "train_speed(iter/s)": 0.200449 }, { "acc": 0.7524929, "epoch": 1.2199391171993912, "grad_norm": 1.8203125, "learning_rate": 3.6134690483643154e-06, "loss": 0.96734505, "memory(GiB)": 369.42, "step": 48090, "train_speed(iter/s)": 0.200452 }, { "acc": 0.74249859, "epoch": 1.2200659563673262, "grad_norm": 2.296875, "learning_rate": 3.6124615805314434e-06, "loss": 1.00714293, "memory(GiB)": 369.42, "step": 48095, "train_speed(iter/s)": 0.200455 }, { "acc": 0.73814373, "epoch": 1.2201927955352612, "grad_norm": 2.078125, "learning_rate": 3.611454173728707e-06, "loss": 0.99718914, "memory(GiB)": 369.42, "step": 48100, "train_speed(iter/s)": 0.200459 }, { "acc": 0.75590911, "epoch": 1.2203196347031964, "grad_norm": 2.09375, "learning_rate": 3.610446828000414e-06, "loss": 0.97628412, "memory(GiB)": 369.42, "step": 48105, "train_speed(iter/s)": 0.200459 }, { "acc": 0.74735999, "epoch": 1.2204464738711314, "grad_norm": 1.890625, "learning_rate": 3.609439543390877e-06, "loss": 1.00198135, "memory(GiB)": 369.42, "step": 48110, "train_speed(iter/s)": 0.200463 }, { "acc": 0.74921794, "epoch": 1.2205733130390666, "grad_norm": 2.046875, "learning_rate": 3.608432319944394e-06, "loss": 1.01683979, "memory(GiB)": 369.42, "step": 48115, "train_speed(iter/s)": 0.200466 }, { "acc": 0.75396156, "epoch": 1.2207001522070016, "grad_norm": 2.625, "learning_rate": 3.607425157705271e-06, "loss": 0.96344242, "memory(GiB)": 369.42, "step": 48120, "train_speed(iter/s)": 0.200469 }, { "acc": 0.74955063, "epoch": 1.2208269913749366, "grad_norm": 1.9609375, "learning_rate": 3.6064180567178064e-06, "loss": 0.99790764, "memory(GiB)": 369.42, "step": 48125, "train_speed(iter/s)": 0.200473 }, { "acc": 0.75698967, "epoch": 1.2209538305428715, "grad_norm": 1.9453125, "learning_rate": 3.6054110170263002e-06, "loss": 0.97858505, "memory(GiB)": 369.42, "step": 48130, "train_speed(iter/s)": 0.200474 }, { "acc": 0.7550046, "epoch": 1.2210806697108068, "grad_norm": 2.84375, "learning_rate": 3.6044040386750423e-06, "loss": 0.96177158, "memory(GiB)": 369.42, "step": 48135, "train_speed(iter/s)": 0.200478 }, { "acc": 0.76051579, "epoch": 1.2212075088787417, "grad_norm": 2.5625, "learning_rate": 3.6033971217083242e-06, "loss": 0.9598978, "memory(GiB)": 369.42, "step": 48140, "train_speed(iter/s)": 0.20048 }, { "acc": 0.76116557, "epoch": 1.2213343480466767, "grad_norm": 2.03125, "learning_rate": 3.602390266170438e-06, "loss": 0.94954376, "memory(GiB)": 369.42, "step": 48145, "train_speed(iter/s)": 0.200484 }, { "acc": 0.75680871, "epoch": 1.221461187214612, "grad_norm": 2.8125, "learning_rate": 3.6013834721056683e-06, "loss": 0.98575745, "memory(GiB)": 369.42, "step": 48150, "train_speed(iter/s)": 0.200488 }, { "acc": 0.76115894, "epoch": 1.221588026382547, "grad_norm": 2.390625, "learning_rate": 3.6003767395582967e-06, "loss": 0.88532715, "memory(GiB)": 369.42, "step": 48155, "train_speed(iter/s)": 0.200492 }, { "acc": 0.7602921, "epoch": 1.221714865550482, "grad_norm": 2.40625, "learning_rate": 3.599370068572604e-06, "loss": 0.89807911, "memory(GiB)": 369.42, "step": 48160, "train_speed(iter/s)": 0.200494 }, { "acc": 0.74748049, "epoch": 1.2218417047184171, "grad_norm": 2.203125, "learning_rate": 3.5983634591928705e-06, "loss": 0.97981033, "memory(GiB)": 369.42, "step": 48165, "train_speed(iter/s)": 0.200496 }, { "acc": 0.76292658, "epoch": 1.2219685438863521, "grad_norm": 2.140625, "learning_rate": 3.5973569114633704e-06, "loss": 0.96669788, "memory(GiB)": 369.42, "step": 48170, "train_speed(iter/s)": 0.200499 }, { "acc": 0.75729017, "epoch": 1.222095383054287, "grad_norm": 2.328125, "learning_rate": 3.5963504254283743e-06, "loss": 1.01911869, "memory(GiB)": 369.42, "step": 48175, "train_speed(iter/s)": 0.200502 }, { "acc": 0.75330992, "epoch": 1.2222222222222223, "grad_norm": 2.015625, "learning_rate": 3.595344001132154e-06, "loss": 0.96756172, "memory(GiB)": 369.42, "step": 48180, "train_speed(iter/s)": 0.200506 }, { "acc": 0.7497386, "epoch": 1.2223490613901573, "grad_norm": 2.140625, "learning_rate": 3.5943376386189744e-06, "loss": 0.94881496, "memory(GiB)": 369.42, "step": 48185, "train_speed(iter/s)": 0.200508 }, { "acc": 0.76410241, "epoch": 1.2224759005580923, "grad_norm": 2.3125, "learning_rate": 3.5933313379331047e-06, "loss": 0.9341506, "memory(GiB)": 369.42, "step": 48190, "train_speed(iter/s)": 0.200511 }, { "acc": 0.74511714, "epoch": 1.2226027397260273, "grad_norm": 2.0625, "learning_rate": 3.5923250991188e-06, "loss": 1.02713051, "memory(GiB)": 369.42, "step": 48195, "train_speed(iter/s)": 0.200511 }, { "acc": 0.74192686, "epoch": 1.2227295788939625, "grad_norm": 2.28125, "learning_rate": 3.591318922220324e-06, "loss": 1.00960464, "memory(GiB)": 369.42, "step": 48200, "train_speed(iter/s)": 0.200514 }, { "acc": 0.7497983, "epoch": 1.2228564180618975, "grad_norm": 2.6875, "learning_rate": 3.5903128072819287e-06, "loss": 1.01971703, "memory(GiB)": 369.42, "step": 48205, "train_speed(iter/s)": 0.200517 }, { "acc": 0.75503912, "epoch": 1.2229832572298325, "grad_norm": 2.171875, "learning_rate": 3.5893067543478733e-06, "loss": 1.00098286, "memory(GiB)": 369.42, "step": 48210, "train_speed(iter/s)": 0.200521 }, { "acc": 0.73919849, "epoch": 1.2231100963977677, "grad_norm": 2.265625, "learning_rate": 3.5883007634624033e-06, "loss": 1.05942974, "memory(GiB)": 369.42, "step": 48215, "train_speed(iter/s)": 0.200523 }, { "acc": 0.74779358, "epoch": 1.2232369355657027, "grad_norm": 1.90625, "learning_rate": 3.5872948346697676e-06, "loss": 0.96451206, "memory(GiB)": 369.42, "step": 48220, "train_speed(iter/s)": 0.200527 }, { "acc": 0.77208204, "epoch": 1.2233637747336377, "grad_norm": 2.21875, "learning_rate": 3.5862889680142133e-06, "loss": 0.92837772, "memory(GiB)": 369.42, "step": 48225, "train_speed(iter/s)": 0.20053 }, { "acc": 0.74512877, "epoch": 1.2234906139015729, "grad_norm": 2.171875, "learning_rate": 3.5852831635399833e-06, "loss": 1.00294304, "memory(GiB)": 369.42, "step": 48230, "train_speed(iter/s)": 0.200533 }, { "acc": 0.75155287, "epoch": 1.2236174530695079, "grad_norm": 2.34375, "learning_rate": 3.5842774212913144e-06, "loss": 1.0216012, "memory(GiB)": 369.42, "step": 48235, "train_speed(iter/s)": 0.200536 }, { "acc": 0.75225592, "epoch": 1.2237442922374429, "grad_norm": 2.390625, "learning_rate": 3.583271741312445e-06, "loss": 0.98573837, "memory(GiB)": 369.42, "step": 48240, "train_speed(iter/s)": 0.200539 }, { "acc": 0.7540956, "epoch": 1.223871131405378, "grad_norm": 2.5625, "learning_rate": 3.58226612364761e-06, "loss": 0.95795355, "memory(GiB)": 369.42, "step": 48245, "train_speed(iter/s)": 0.200543 }, { "acc": 0.75760064, "epoch": 1.223997970573313, "grad_norm": 1.9609375, "learning_rate": 3.581260568341042e-06, "loss": 0.95432339, "memory(GiB)": 369.42, "step": 48250, "train_speed(iter/s)": 0.200547 }, { "acc": 0.74309559, "epoch": 1.224124809741248, "grad_norm": 2.15625, "learning_rate": 3.580255075436967e-06, "loss": 1.0366888, "memory(GiB)": 369.42, "step": 48255, "train_speed(iter/s)": 0.200552 }, { "acc": 0.74566002, "epoch": 1.224251648909183, "grad_norm": 1.890625, "learning_rate": 3.5792496449796127e-06, "loss": 0.97536182, "memory(GiB)": 369.42, "step": 48260, "train_speed(iter/s)": 0.200553 }, { "acc": 0.76324501, "epoch": 1.2243784880771182, "grad_norm": 2.046875, "learning_rate": 3.578244277013201e-06, "loss": 0.96743412, "memory(GiB)": 369.42, "step": 48265, "train_speed(iter/s)": 0.200556 }, { "acc": 0.75849018, "epoch": 1.2245053272450532, "grad_norm": 1.6953125, "learning_rate": 3.5772389715819568e-06, "loss": 0.98840714, "memory(GiB)": 369.42, "step": 48270, "train_speed(iter/s)": 0.200559 }, { "acc": 0.76023951, "epoch": 1.2246321664129884, "grad_norm": 2.265625, "learning_rate": 3.5762337287300925e-06, "loss": 1.00038204, "memory(GiB)": 369.42, "step": 48275, "train_speed(iter/s)": 0.200564 }, { "acc": 0.74625916, "epoch": 1.2247590055809234, "grad_norm": 2.171875, "learning_rate": 3.575228548501825e-06, "loss": 1.00339203, "memory(GiB)": 369.42, "step": 48280, "train_speed(iter/s)": 0.200568 }, { "acc": 0.74903536, "epoch": 1.2248858447488584, "grad_norm": 2.265625, "learning_rate": 3.574223430941368e-06, "loss": 0.98717346, "memory(GiB)": 369.42, "step": 48285, "train_speed(iter/s)": 0.200571 }, { "acc": 0.74187393, "epoch": 1.2250126839167934, "grad_norm": 2.4375, "learning_rate": 3.573218376092932e-06, "loss": 1.04862976, "memory(GiB)": 369.42, "step": 48290, "train_speed(iter/s)": 0.200576 }, { "acc": 0.75708065, "epoch": 1.2251395230847286, "grad_norm": 2.609375, "learning_rate": 3.5722133840007197e-06, "loss": 0.98179417, "memory(GiB)": 369.42, "step": 48295, "train_speed(iter/s)": 0.20058 }, { "acc": 0.74896894, "epoch": 1.2252663622526636, "grad_norm": 2.390625, "learning_rate": 3.5712084547089367e-06, "loss": 0.94625053, "memory(GiB)": 369.42, "step": 48300, "train_speed(iter/s)": 0.200585 }, { "acc": 0.75891542, "epoch": 1.2253932014205986, "grad_norm": 2.203125, "learning_rate": 3.5702035882617857e-06, "loss": 0.9483367, "memory(GiB)": 369.42, "step": 48305, "train_speed(iter/s)": 0.200588 }, { "acc": 0.74226527, "epoch": 1.2255200405885338, "grad_norm": 2.5, "learning_rate": 3.5691987847034667e-06, "loss": 1.01777878, "memory(GiB)": 369.42, "step": 48310, "train_speed(iter/s)": 0.200589 }, { "acc": 0.74724503, "epoch": 1.2256468797564688, "grad_norm": 2.34375, "learning_rate": 3.5681940440781705e-06, "loss": 0.97293224, "memory(GiB)": 369.42, "step": 48315, "train_speed(iter/s)": 0.200591 }, { "acc": 0.75244646, "epoch": 1.2257737189244038, "grad_norm": 2.03125, "learning_rate": 3.5671893664300934e-06, "loss": 0.95662022, "memory(GiB)": 369.42, "step": 48320, "train_speed(iter/s)": 0.200593 }, { "acc": 0.76648874, "epoch": 1.225900558092339, "grad_norm": 1.9609375, "learning_rate": 3.5661847518034244e-06, "loss": 0.90100136, "memory(GiB)": 369.42, "step": 48325, "train_speed(iter/s)": 0.200597 }, { "acc": 0.73704424, "epoch": 1.226027397260274, "grad_norm": 1.9921875, "learning_rate": 3.5651802002423543e-06, "loss": 1.00009632, "memory(GiB)": 369.42, "step": 48330, "train_speed(iter/s)": 0.200599 }, { "acc": 0.75965266, "epoch": 1.226154236428209, "grad_norm": 2.421875, "learning_rate": 3.5641757117910625e-06, "loss": 0.95280666, "memory(GiB)": 369.42, "step": 48335, "train_speed(iter/s)": 0.2006 }, { "acc": 0.73975267, "epoch": 1.2262810755961442, "grad_norm": 2.203125, "learning_rate": 3.563171286493734e-06, "loss": 1.02765789, "memory(GiB)": 369.42, "step": 48340, "train_speed(iter/s)": 0.200604 }, { "acc": 0.73811293, "epoch": 1.2264079147640792, "grad_norm": 2.171875, "learning_rate": 3.5621669243945457e-06, "loss": 1.01793308, "memory(GiB)": 369.42, "step": 48345, "train_speed(iter/s)": 0.200606 }, { "acc": 0.7482358, "epoch": 1.2265347539320142, "grad_norm": 1.953125, "learning_rate": 3.5611626255376785e-06, "loss": 0.99416456, "memory(GiB)": 369.42, "step": 48350, "train_speed(iter/s)": 0.20061 }, { "acc": 0.75963545, "epoch": 1.2266615930999492, "grad_norm": 2.0, "learning_rate": 3.560158389967302e-06, "loss": 0.99120865, "memory(GiB)": 369.42, "step": 48355, "train_speed(iter/s)": 0.200613 }, { "acc": 0.75376191, "epoch": 1.2267884322678844, "grad_norm": 2.625, "learning_rate": 3.559154217727586e-06, "loss": 0.99838028, "memory(GiB)": 369.42, "step": 48360, "train_speed(iter/s)": 0.200616 }, { "acc": 0.74684854, "epoch": 1.2269152714358194, "grad_norm": 2.265625, "learning_rate": 3.5581501088627026e-06, "loss": 1.03361874, "memory(GiB)": 369.42, "step": 48365, "train_speed(iter/s)": 0.20062 }, { "acc": 0.75395827, "epoch": 1.2270421106037543, "grad_norm": 2.375, "learning_rate": 3.557146063416815e-06, "loss": 1.00293331, "memory(GiB)": 369.42, "step": 48370, "train_speed(iter/s)": 0.200623 }, { "acc": 0.74891701, "epoch": 1.2271689497716896, "grad_norm": 2.171875, "learning_rate": 3.5561420814340843e-06, "loss": 1.04137573, "memory(GiB)": 369.42, "step": 48375, "train_speed(iter/s)": 0.200627 }, { "acc": 0.73422861, "epoch": 1.2272957889396245, "grad_norm": 2.265625, "learning_rate": 3.555138162958671e-06, "loss": 1.05629988, "memory(GiB)": 369.42, "step": 48380, "train_speed(iter/s)": 0.200628 }, { "acc": 0.76050673, "epoch": 1.2274226281075595, "grad_norm": 2.453125, "learning_rate": 3.5541343080347325e-06, "loss": 0.91138191, "memory(GiB)": 369.42, "step": 48385, "train_speed(iter/s)": 0.200632 }, { "acc": 0.7499465, "epoch": 1.2275494672754947, "grad_norm": 2.140625, "learning_rate": 3.5531305167064234e-06, "loss": 0.98135471, "memory(GiB)": 369.42, "step": 48390, "train_speed(iter/s)": 0.200632 }, { "acc": 0.76156502, "epoch": 1.2276763064434297, "grad_norm": 2.34375, "learning_rate": 3.5521267890178922e-06, "loss": 0.99100552, "memory(GiB)": 369.42, "step": 48395, "train_speed(iter/s)": 0.200635 }, { "acc": 0.74983525, "epoch": 1.2278031456113647, "grad_norm": 1.875, "learning_rate": 3.5511231250132905e-06, "loss": 0.94736204, "memory(GiB)": 369.42, "step": 48400, "train_speed(iter/s)": 0.200638 }, { "acc": 0.75590081, "epoch": 1.2279299847793, "grad_norm": 1.953125, "learning_rate": 3.550119524736761e-06, "loss": 1.0108078, "memory(GiB)": 369.42, "step": 48405, "train_speed(iter/s)": 0.20064 }, { "acc": 0.74004726, "epoch": 1.228056823947235, "grad_norm": 2.390625, "learning_rate": 3.5491159882324513e-06, "loss": 0.97387867, "memory(GiB)": 369.42, "step": 48410, "train_speed(iter/s)": 0.200643 }, { "acc": 0.75196133, "epoch": 1.22818366311517, "grad_norm": 1.9140625, "learning_rate": 3.548112515544495e-06, "loss": 0.96650877, "memory(GiB)": 369.42, "step": 48415, "train_speed(iter/s)": 0.200646 }, { "acc": 0.75936241, "epoch": 1.228310502283105, "grad_norm": 2.8125, "learning_rate": 3.547109106717034e-06, "loss": 0.98430939, "memory(GiB)": 369.42, "step": 48420, "train_speed(iter/s)": 0.20065 }, { "acc": 0.75007358, "epoch": 1.22843734145104, "grad_norm": 2.171875, "learning_rate": 3.546105761794199e-06, "loss": 1.00698547, "memory(GiB)": 369.42, "step": 48425, "train_speed(iter/s)": 0.200653 }, { "acc": 0.75073938, "epoch": 1.228564180618975, "grad_norm": 2.078125, "learning_rate": 3.5451024808201268e-06, "loss": 0.96205616, "memory(GiB)": 369.42, "step": 48430, "train_speed(iter/s)": 0.200657 }, { "acc": 0.72913013, "epoch": 1.2286910197869103, "grad_norm": 2.109375, "learning_rate": 3.5440992638389417e-06, "loss": 1.05237513, "memory(GiB)": 369.42, "step": 48435, "train_speed(iter/s)": 0.200659 }, { "acc": 0.75737615, "epoch": 1.2288178589548453, "grad_norm": 2.078125, "learning_rate": 3.5430961108947705e-06, "loss": 0.98983746, "memory(GiB)": 369.42, "step": 48440, "train_speed(iter/s)": 0.200662 }, { "acc": 0.74013081, "epoch": 1.2289446981227803, "grad_norm": 1.84375, "learning_rate": 3.5420930220317373e-06, "loss": 0.99027042, "memory(GiB)": 369.42, "step": 48445, "train_speed(iter/s)": 0.200666 }, { "acc": 0.7676877, "epoch": 1.2290715372907153, "grad_norm": 2.234375, "learning_rate": 3.541089997293964e-06, "loss": 0.92393036, "memory(GiB)": 369.42, "step": 48450, "train_speed(iter/s)": 0.200669 }, { "acc": 0.76051254, "epoch": 1.2291983764586505, "grad_norm": 2.515625, "learning_rate": 3.5400870367255635e-06, "loss": 0.94332285, "memory(GiB)": 369.42, "step": 48455, "train_speed(iter/s)": 0.200674 }, { "acc": 0.75004253, "epoch": 1.2293252156265855, "grad_norm": 2.609375, "learning_rate": 3.539084140370654e-06, "loss": 0.96883602, "memory(GiB)": 369.42, "step": 48460, "train_speed(iter/s)": 0.200677 }, { "acc": 0.75287027, "epoch": 1.2294520547945205, "grad_norm": 2.046875, "learning_rate": 3.538081308273347e-06, "loss": 1.00986671, "memory(GiB)": 369.42, "step": 48465, "train_speed(iter/s)": 0.20068 }, { "acc": 0.75341868, "epoch": 1.2295788939624557, "grad_norm": 2.46875, "learning_rate": 3.537078540477752e-06, "loss": 0.96150551, "memory(GiB)": 369.42, "step": 48470, "train_speed(iter/s)": 0.200683 }, { "acc": 0.73702068, "epoch": 1.2297057331303907, "grad_norm": 2.1875, "learning_rate": 3.5360758370279722e-06, "loss": 1.02987118, "memory(GiB)": 369.42, "step": 48475, "train_speed(iter/s)": 0.200687 }, { "acc": 0.76446934, "epoch": 1.2298325722983257, "grad_norm": 2.09375, "learning_rate": 3.535073197968114e-06, "loss": 0.95138292, "memory(GiB)": 369.42, "step": 48480, "train_speed(iter/s)": 0.20069 }, { "acc": 0.75097384, "epoch": 1.2299594114662609, "grad_norm": 2.359375, "learning_rate": 3.5340706233422763e-06, "loss": 1.03119526, "memory(GiB)": 369.42, "step": 48485, "train_speed(iter/s)": 0.200693 }, { "acc": 0.74334002, "epoch": 1.2300862506341959, "grad_norm": 2.4375, "learning_rate": 3.5330681131945588e-06, "loss": 1.01485271, "memory(GiB)": 369.42, "step": 48490, "train_speed(iter/s)": 0.200697 }, { "acc": 0.74128218, "epoch": 1.2302130898021308, "grad_norm": 1.96875, "learning_rate": 3.5320656675690546e-06, "loss": 0.976371, "memory(GiB)": 369.42, "step": 48495, "train_speed(iter/s)": 0.200699 }, { "acc": 0.75191083, "epoch": 1.230339928970066, "grad_norm": 2.46875, "learning_rate": 3.531063286509855e-06, "loss": 0.96791468, "memory(GiB)": 369.42, "step": 48500, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75561352, "epoch": 1.230466768138001, "grad_norm": 2.28125, "learning_rate": 3.530060970061051e-06, "loss": 0.99125671, "memory(GiB)": 369.42, "step": 48505, "train_speed(iter/s)": 0.200704 }, { "acc": 0.7480093, "epoch": 1.230593607305936, "grad_norm": 2.171875, "learning_rate": 3.52905871826673e-06, "loss": 1.04282856, "memory(GiB)": 369.42, "step": 48510, "train_speed(iter/s)": 0.200707 }, { "acc": 0.75612698, "epoch": 1.230720446473871, "grad_norm": 2.0625, "learning_rate": 3.5280565311709725e-06, "loss": 0.93698711, "memory(GiB)": 369.42, "step": 48515, "train_speed(iter/s)": 0.20071 }, { "acc": 0.74993334, "epoch": 1.2308472856418062, "grad_norm": 2.359375, "learning_rate": 3.5270544088178597e-06, "loss": 1.00464668, "memory(GiB)": 369.42, "step": 48520, "train_speed(iter/s)": 0.200712 }, { "acc": 0.74629774, "epoch": 1.2309741248097412, "grad_norm": 2.125, "learning_rate": 3.526052351251471e-06, "loss": 1.02666998, "memory(GiB)": 369.42, "step": 48525, "train_speed(iter/s)": 0.200715 }, { "acc": 0.75382528, "epoch": 1.2311009639776762, "grad_norm": 2.28125, "learning_rate": 3.5250503585158825e-06, "loss": 1.01250801, "memory(GiB)": 369.42, "step": 48530, "train_speed(iter/s)": 0.200719 }, { "acc": 0.75795441, "epoch": 1.2312278031456114, "grad_norm": 2.3125, "learning_rate": 3.5240484306551615e-06, "loss": 0.96924, "memory(GiB)": 369.42, "step": 48535, "train_speed(iter/s)": 0.20072 }, { "acc": 0.75277805, "epoch": 1.2313546423135464, "grad_norm": 2.453125, "learning_rate": 3.5230465677133813e-06, "loss": 1.00882683, "memory(GiB)": 369.42, "step": 48540, "train_speed(iter/s)": 0.200723 }, { "acc": 0.74266825, "epoch": 1.2314814814814814, "grad_norm": 2.3125, "learning_rate": 3.5220447697346063e-06, "loss": 1.0155488, "memory(GiB)": 369.42, "step": 48545, "train_speed(iter/s)": 0.200726 }, { "acc": 0.73864532, "epoch": 1.2316083206494166, "grad_norm": 2.5, "learning_rate": 3.521043036762903e-06, "loss": 1.03161201, "memory(GiB)": 369.42, "step": 48550, "train_speed(iter/s)": 0.200729 }, { "acc": 0.74454002, "epoch": 1.2317351598173516, "grad_norm": 1.96875, "learning_rate": 3.5200413688423284e-06, "loss": 0.99504938, "memory(GiB)": 369.42, "step": 48555, "train_speed(iter/s)": 0.200733 }, { "acc": 0.74360628, "epoch": 1.2318619989852866, "grad_norm": 2.375, "learning_rate": 3.519039766016943e-06, "loss": 1.05602551, "memory(GiB)": 369.42, "step": 48560, "train_speed(iter/s)": 0.200734 }, { "acc": 0.74206891, "epoch": 1.2319888381532218, "grad_norm": 2.296875, "learning_rate": 3.5180382283307983e-06, "loss": 1.08163681, "memory(GiB)": 369.42, "step": 48565, "train_speed(iter/s)": 0.200737 }, { "acc": 0.76942968, "epoch": 1.2321156773211568, "grad_norm": 1.8203125, "learning_rate": 3.517036755827952e-06, "loss": 0.91378784, "memory(GiB)": 369.42, "step": 48570, "train_speed(iter/s)": 0.200739 }, { "acc": 0.7540627, "epoch": 1.2322425164890918, "grad_norm": 2.0, "learning_rate": 3.516035348552449e-06, "loss": 1.0172821, "memory(GiB)": 369.42, "step": 48575, "train_speed(iter/s)": 0.200743 }, { "acc": 0.75244617, "epoch": 1.2323693556570268, "grad_norm": 2.296875, "learning_rate": 3.515034006548335e-06, "loss": 0.9935441, "memory(GiB)": 369.42, "step": 48580, "train_speed(iter/s)": 0.200747 }, { "acc": 0.76671438, "epoch": 1.232496194824962, "grad_norm": 2.46875, "learning_rate": 3.5140327298596565e-06, "loss": 0.98284416, "memory(GiB)": 369.42, "step": 48585, "train_speed(iter/s)": 0.20075 }, { "acc": 0.74672499, "epoch": 1.232623033992897, "grad_norm": 1.9765625, "learning_rate": 3.5130315185304547e-06, "loss": 0.97163057, "memory(GiB)": 369.42, "step": 48590, "train_speed(iter/s)": 0.200754 }, { "acc": 0.7459537, "epoch": 1.2327498731608322, "grad_norm": 2.09375, "learning_rate": 3.5120303726047642e-06, "loss": 0.97703199, "memory(GiB)": 369.42, "step": 48595, "train_speed(iter/s)": 0.200757 }, { "acc": 0.74518375, "epoch": 1.2328767123287672, "grad_norm": 2.078125, "learning_rate": 3.51102929212662e-06, "loss": 1.0684804, "memory(GiB)": 369.42, "step": 48600, "train_speed(iter/s)": 0.200761 }, { "acc": 0.74529295, "epoch": 1.2330035514967022, "grad_norm": 2.171875, "learning_rate": 3.5100282771400563e-06, "loss": 1.05232792, "memory(GiB)": 369.42, "step": 48605, "train_speed(iter/s)": 0.200764 }, { "acc": 0.75168428, "epoch": 1.2331303906646371, "grad_norm": 1.8515625, "learning_rate": 3.5090273276891023e-06, "loss": 0.98171291, "memory(GiB)": 369.42, "step": 48610, "train_speed(iter/s)": 0.200767 }, { "acc": 0.7534061, "epoch": 1.2332572298325724, "grad_norm": 2.03125, "learning_rate": 3.5080264438177815e-06, "loss": 0.95252457, "memory(GiB)": 369.42, "step": 48615, "train_speed(iter/s)": 0.20077 }, { "acc": 0.74400702, "epoch": 1.2333840690005073, "grad_norm": 1.890625, "learning_rate": 3.50702562557012e-06, "loss": 1.02608414, "memory(GiB)": 369.42, "step": 48620, "train_speed(iter/s)": 0.200773 }, { "acc": 0.74482155, "epoch": 1.2335109081684423, "grad_norm": 1.796875, "learning_rate": 3.506024872990135e-06, "loss": 1.02263851, "memory(GiB)": 369.42, "step": 48625, "train_speed(iter/s)": 0.200777 }, { "acc": 0.74903011, "epoch": 1.2336377473363775, "grad_norm": 2.15625, "learning_rate": 3.5050241861218493e-06, "loss": 0.9720417, "memory(GiB)": 369.42, "step": 48630, "train_speed(iter/s)": 0.200778 }, { "acc": 0.75499969, "epoch": 1.2337645865043125, "grad_norm": 2.53125, "learning_rate": 3.5040235650092725e-06, "loss": 0.96152973, "memory(GiB)": 369.42, "step": 48635, "train_speed(iter/s)": 0.200781 }, { "acc": 0.77187161, "epoch": 1.2338914256722475, "grad_norm": 2.4375, "learning_rate": 3.503023009696419e-06, "loss": 0.88279419, "memory(GiB)": 369.42, "step": 48640, "train_speed(iter/s)": 0.20078 }, { "acc": 0.74684553, "epoch": 1.2340182648401827, "grad_norm": 2.15625, "learning_rate": 3.5020225202272963e-06, "loss": 1.00428066, "memory(GiB)": 369.42, "step": 48645, "train_speed(iter/s)": 0.200783 }, { "acc": 0.74112611, "epoch": 1.2341451040081177, "grad_norm": 2.046875, "learning_rate": 3.501022096645913e-06, "loss": 1.00454788, "memory(GiB)": 369.42, "step": 48650, "train_speed(iter/s)": 0.200785 }, { "acc": 0.74598408, "epoch": 1.2342719431760527, "grad_norm": 2.296875, "learning_rate": 3.5000217389962685e-06, "loss": 0.99504089, "memory(GiB)": 369.42, "step": 48655, "train_speed(iter/s)": 0.200789 }, { "acc": 0.75171728, "epoch": 1.234398782343988, "grad_norm": 1.9296875, "learning_rate": 3.499021447322365e-06, "loss": 1.03379631, "memory(GiB)": 369.42, "step": 48660, "train_speed(iter/s)": 0.200794 }, { "acc": 0.74962406, "epoch": 1.234525621511923, "grad_norm": 1.9296875, "learning_rate": 3.4980212216681997e-06, "loss": 0.99321966, "memory(GiB)": 369.42, "step": 48665, "train_speed(iter/s)": 0.200797 }, { "acc": 0.76440916, "epoch": 1.234652460679858, "grad_norm": 1.84375, "learning_rate": 3.4970210620777687e-06, "loss": 0.95354557, "memory(GiB)": 369.42, "step": 48670, "train_speed(iter/s)": 0.200801 }, { "acc": 0.75760498, "epoch": 1.2347792998477929, "grad_norm": 1.9921875, "learning_rate": 3.496020968595059e-06, "loss": 1.00522633, "memory(GiB)": 369.42, "step": 48675, "train_speed(iter/s)": 0.200806 }, { "acc": 0.75759869, "epoch": 1.234906139015728, "grad_norm": 1.7734375, "learning_rate": 3.4950209412640634e-06, "loss": 0.96715794, "memory(GiB)": 369.42, "step": 48680, "train_speed(iter/s)": 0.20081 }, { "acc": 0.75828953, "epoch": 1.235032978183663, "grad_norm": 1.8828125, "learning_rate": 3.494020980128766e-06, "loss": 0.9111577, "memory(GiB)": 369.42, "step": 48685, "train_speed(iter/s)": 0.200812 }, { "acc": 0.76277609, "epoch": 1.235159817351598, "grad_norm": 2.453125, "learning_rate": 3.4930210852331505e-06, "loss": 0.9289156, "memory(GiB)": 369.42, "step": 48690, "train_speed(iter/s)": 0.200817 }, { "acc": 0.75329838, "epoch": 1.2352866565195333, "grad_norm": 2.265625, "learning_rate": 3.4920212566211943e-06, "loss": 0.92124748, "memory(GiB)": 369.42, "step": 48695, "train_speed(iter/s)": 0.20082 }, { "acc": 0.7515645, "epoch": 1.2354134956874683, "grad_norm": 2.03125, "learning_rate": 3.491021494336876e-06, "loss": 1.00616245, "memory(GiB)": 369.42, "step": 48700, "train_speed(iter/s)": 0.200823 }, { "acc": 0.74103317, "epoch": 1.2355403348554033, "grad_norm": 2.15625, "learning_rate": 3.4900217984241692e-06, "loss": 0.9836359, "memory(GiB)": 369.42, "step": 48705, "train_speed(iter/s)": 0.200826 }, { "acc": 0.74676914, "epoch": 1.2356671740233385, "grad_norm": 3.03125, "learning_rate": 3.4890221689270466e-06, "loss": 0.98137207, "memory(GiB)": 369.42, "step": 48710, "train_speed(iter/s)": 0.200827 }, { "acc": 0.7582283, "epoch": 1.2357940131912735, "grad_norm": 2.3125, "learning_rate": 3.488022605889475e-06, "loss": 0.97819118, "memory(GiB)": 369.42, "step": 48715, "train_speed(iter/s)": 0.20083 }, { "acc": 0.75934887, "epoch": 1.2359208523592085, "grad_norm": 2.234375, "learning_rate": 3.4870231093554172e-06, "loss": 0.94809809, "memory(GiB)": 369.42, "step": 48720, "train_speed(iter/s)": 0.200833 }, { "acc": 0.74530296, "epoch": 1.2360476915271437, "grad_norm": 1.9375, "learning_rate": 3.4860236793688407e-06, "loss": 1.01380444, "memory(GiB)": 369.42, "step": 48725, "train_speed(iter/s)": 0.200837 }, { "acc": 0.74290595, "epoch": 1.2361745306950787, "grad_norm": 2.21875, "learning_rate": 3.4850243159737024e-06, "loss": 0.98120689, "memory(GiB)": 369.42, "step": 48730, "train_speed(iter/s)": 0.200841 }, { "acc": 0.74752517, "epoch": 1.2363013698630136, "grad_norm": 2.015625, "learning_rate": 3.4840250192139574e-06, "loss": 0.97898827, "memory(GiB)": 369.42, "step": 48735, "train_speed(iter/s)": 0.200843 }, { "acc": 0.74952497, "epoch": 1.2364282090309486, "grad_norm": 2.28125, "learning_rate": 3.4830257891335595e-06, "loss": 0.93194256, "memory(GiB)": 369.42, "step": 48740, "train_speed(iter/s)": 0.200846 }, { "acc": 0.75291219, "epoch": 1.2365550481988838, "grad_norm": 1.9375, "learning_rate": 3.4820266257764613e-06, "loss": 0.90576696, "memory(GiB)": 369.42, "step": 48745, "train_speed(iter/s)": 0.20085 }, { "acc": 0.76635809, "epoch": 1.2366818873668188, "grad_norm": 1.9140625, "learning_rate": 3.4810275291866103e-06, "loss": 0.93729715, "memory(GiB)": 369.42, "step": 48750, "train_speed(iter/s)": 0.200852 }, { "acc": 0.75012541, "epoch": 1.236808726534754, "grad_norm": 2.359375, "learning_rate": 3.4800284994079487e-06, "loss": 0.95226784, "memory(GiB)": 369.42, "step": 48755, "train_speed(iter/s)": 0.200856 }, { "acc": 0.73815279, "epoch": 1.236935565702689, "grad_norm": 2.125, "learning_rate": 3.4790295364844207e-06, "loss": 1.02468853, "memory(GiB)": 369.42, "step": 48760, "train_speed(iter/s)": 0.20086 }, { "acc": 0.74477472, "epoch": 1.237062404870624, "grad_norm": 1.96875, "learning_rate": 3.4780306404599628e-06, "loss": 0.96317539, "memory(GiB)": 369.42, "step": 48765, "train_speed(iter/s)": 0.200863 }, { "acc": 0.74079857, "epoch": 1.237189244038559, "grad_norm": 2.578125, "learning_rate": 3.4770318113785164e-06, "loss": 1.06882534, "memory(GiB)": 369.42, "step": 48770, "train_speed(iter/s)": 0.200867 }, { "acc": 0.75040178, "epoch": 1.2373160832064942, "grad_norm": 2.5, "learning_rate": 3.4760330492840065e-06, "loss": 1.03973999, "memory(GiB)": 369.42, "step": 48775, "train_speed(iter/s)": 0.200871 }, { "acc": 0.74815874, "epoch": 1.2374429223744292, "grad_norm": 2.15625, "learning_rate": 3.4750343542203684e-06, "loss": 1.01456213, "memory(GiB)": 369.42, "step": 48780, "train_speed(iter/s)": 0.200873 }, { "acc": 0.74213719, "epoch": 1.2375697615423642, "grad_norm": 2.0625, "learning_rate": 3.474035726231527e-06, "loss": 1.01567707, "memory(GiB)": 369.42, "step": 48785, "train_speed(iter/s)": 0.200876 }, { "acc": 0.75311942, "epoch": 1.2376966007102994, "grad_norm": 1.9140625, "learning_rate": 3.473037165361409e-06, "loss": 1.01714325, "memory(GiB)": 369.42, "step": 48790, "train_speed(iter/s)": 0.200879 }, { "acc": 0.75493717, "epoch": 1.2378234398782344, "grad_norm": 2.34375, "learning_rate": 3.4720386716539333e-06, "loss": 0.97720823, "memory(GiB)": 369.42, "step": 48795, "train_speed(iter/s)": 0.200881 }, { "acc": 0.7484416, "epoch": 1.2379502790461694, "grad_norm": 1.9921875, "learning_rate": 3.471040245153018e-06, "loss": 0.99971371, "memory(GiB)": 369.42, "step": 48800, "train_speed(iter/s)": 0.200886 }, { "acc": 0.74341116, "epoch": 1.2380771182141046, "grad_norm": 2.125, "learning_rate": 3.4700418859025793e-06, "loss": 1.03127775, "memory(GiB)": 369.42, "step": 48805, "train_speed(iter/s)": 0.200889 }, { "acc": 0.75909224, "epoch": 1.2382039573820396, "grad_norm": 2.4375, "learning_rate": 3.4690435939465307e-06, "loss": 0.98314199, "memory(GiB)": 369.42, "step": 48810, "train_speed(iter/s)": 0.200893 }, { "acc": 0.75056386, "epoch": 1.2383307965499746, "grad_norm": 2.4375, "learning_rate": 3.4680453693287786e-06, "loss": 0.93171329, "memory(GiB)": 369.42, "step": 48815, "train_speed(iter/s)": 0.200893 }, { "acc": 0.7504601, "epoch": 1.2384576357179098, "grad_norm": 2.46875, "learning_rate": 3.4670472120932297e-06, "loss": 1.02396193, "memory(GiB)": 369.42, "step": 48820, "train_speed(iter/s)": 0.200894 }, { "acc": 0.74539871, "epoch": 1.2385844748858448, "grad_norm": 2.359375, "learning_rate": 3.46604912228379e-06, "loss": 1.03575039, "memory(GiB)": 369.42, "step": 48825, "train_speed(iter/s)": 0.200896 }, { "acc": 0.75511923, "epoch": 1.2387113140537798, "grad_norm": 2.5, "learning_rate": 3.46505109994436e-06, "loss": 0.98499908, "memory(GiB)": 369.42, "step": 48830, "train_speed(iter/s)": 0.200899 }, { "acc": 0.74727583, "epoch": 1.2388381532217148, "grad_norm": 1.9921875, "learning_rate": 3.464053145118833e-06, "loss": 0.9198, "memory(GiB)": 369.42, "step": 48835, "train_speed(iter/s)": 0.200899 }, { "acc": 0.75861154, "epoch": 1.23896499238965, "grad_norm": 2.46875, "learning_rate": 3.4630552578511073e-06, "loss": 0.97399464, "memory(GiB)": 369.42, "step": 48840, "train_speed(iter/s)": 0.200903 }, { "acc": 0.73887815, "epoch": 1.239091831557585, "grad_norm": 2.171875, "learning_rate": 3.4620574381850723e-06, "loss": 0.98892832, "memory(GiB)": 369.42, "step": 48845, "train_speed(iter/s)": 0.200905 }, { "acc": 0.75165777, "epoch": 1.23921867072552, "grad_norm": 2.0625, "learning_rate": 3.4610596861646194e-06, "loss": 0.97074871, "memory(GiB)": 369.42, "step": 48850, "train_speed(iter/s)": 0.200902 }, { "acc": 0.7640029, "epoch": 1.2393455098934552, "grad_norm": 2.078125, "learning_rate": 3.460062001833632e-06, "loss": 0.94429207, "memory(GiB)": 369.42, "step": 48855, "train_speed(iter/s)": 0.200905 }, { "acc": 0.77117357, "epoch": 1.2394723490613901, "grad_norm": 2.515625, "learning_rate": 3.459064385235993e-06, "loss": 0.9819334, "memory(GiB)": 369.42, "step": 48860, "train_speed(iter/s)": 0.200907 }, { "acc": 0.74380207, "epoch": 1.2395991882293251, "grad_norm": 2.203125, "learning_rate": 3.45806683641558e-06, "loss": 0.97728996, "memory(GiB)": 369.42, "step": 48865, "train_speed(iter/s)": 0.200908 }, { "acc": 0.76003103, "epoch": 1.2397260273972603, "grad_norm": 1.9765625, "learning_rate": 3.457069355416275e-06, "loss": 0.9404707, "memory(GiB)": 369.42, "step": 48870, "train_speed(iter/s)": 0.200912 }, { "acc": 0.75227795, "epoch": 1.2398528665651953, "grad_norm": 2.265625, "learning_rate": 3.456071942281947e-06, "loss": 1.01777086, "memory(GiB)": 369.42, "step": 48875, "train_speed(iter/s)": 0.200916 }, { "acc": 0.74039602, "epoch": 1.2399797057331303, "grad_norm": 1.9609375, "learning_rate": 3.455074597056467e-06, "loss": 1.03148985, "memory(GiB)": 369.42, "step": 48880, "train_speed(iter/s)": 0.200918 }, { "acc": 0.75513783, "epoch": 1.2401065449010655, "grad_norm": 2.21875, "learning_rate": 3.454077319783705e-06, "loss": 0.90970325, "memory(GiB)": 369.42, "step": 48885, "train_speed(iter/s)": 0.200921 }, { "acc": 0.74058142, "epoch": 1.2402333840690005, "grad_norm": 2.0625, "learning_rate": 3.4530801105075257e-06, "loss": 1.0216444, "memory(GiB)": 369.42, "step": 48890, "train_speed(iter/s)": 0.200923 }, { "acc": 0.75150151, "epoch": 1.2403602232369355, "grad_norm": 2.390625, "learning_rate": 3.4520829692717874e-06, "loss": 1.01647367, "memory(GiB)": 369.42, "step": 48895, "train_speed(iter/s)": 0.200925 }, { "acc": 0.74863691, "epoch": 1.2404870624048705, "grad_norm": 1.90625, "learning_rate": 3.451085896120352e-06, "loss": 1.03265228, "memory(GiB)": 369.42, "step": 48900, "train_speed(iter/s)": 0.200927 }, { "acc": 0.74679203, "epoch": 1.2406139015728057, "grad_norm": 1.953125, "learning_rate": 3.450088891097074e-06, "loss": 0.99676552, "memory(GiB)": 369.42, "step": 48905, "train_speed(iter/s)": 0.20093 }, { "acc": 0.75135112, "epoch": 1.2407407407407407, "grad_norm": 2.28125, "learning_rate": 3.4490919542458085e-06, "loss": 0.98644304, "memory(GiB)": 369.42, "step": 48910, "train_speed(iter/s)": 0.200933 }, { "acc": 0.73479538, "epoch": 1.240867579908676, "grad_norm": 2.078125, "learning_rate": 3.4480950856104002e-06, "loss": 1.02410183, "memory(GiB)": 369.42, "step": 48915, "train_speed(iter/s)": 0.200937 }, { "acc": 0.75277195, "epoch": 1.240994419076611, "grad_norm": 2.078125, "learning_rate": 3.4470982852347e-06, "loss": 0.98958883, "memory(GiB)": 369.42, "step": 48920, "train_speed(iter/s)": 0.200941 }, { "acc": 0.74047699, "epoch": 1.2411212582445459, "grad_norm": 2.28125, "learning_rate": 3.44610155316255e-06, "loss": 1.03934631, "memory(GiB)": 369.42, "step": 48925, "train_speed(iter/s)": 0.200945 }, { "acc": 0.75150437, "epoch": 1.2412480974124809, "grad_norm": 2.15625, "learning_rate": 3.4451048894377925e-06, "loss": 0.98079033, "memory(GiB)": 369.42, "step": 48930, "train_speed(iter/s)": 0.200947 }, { "acc": 0.75804949, "epoch": 1.241374936580416, "grad_norm": 2.28125, "learning_rate": 3.444108294104264e-06, "loss": 0.96491737, "memory(GiB)": 369.42, "step": 48935, "train_speed(iter/s)": 0.200951 }, { "acc": 0.7532917, "epoch": 1.241501775748351, "grad_norm": 2.0, "learning_rate": 3.443111767205797e-06, "loss": 1.01531086, "memory(GiB)": 369.42, "step": 48940, "train_speed(iter/s)": 0.200951 }, { "acc": 0.74794011, "epoch": 1.241628614916286, "grad_norm": 2.609375, "learning_rate": 3.442115308786227e-06, "loss": 1.01481609, "memory(GiB)": 369.42, "step": 48945, "train_speed(iter/s)": 0.200956 }, { "acc": 0.75008945, "epoch": 1.2417554540842213, "grad_norm": 2.359375, "learning_rate": 3.4411189188893822e-06, "loss": 1.02201118, "memory(GiB)": 369.42, "step": 48950, "train_speed(iter/s)": 0.20096 }, { "acc": 0.74788551, "epoch": 1.2418822932521563, "grad_norm": 2.0625, "learning_rate": 3.4401225975590867e-06, "loss": 1.01177034, "memory(GiB)": 369.42, "step": 48955, "train_speed(iter/s)": 0.200963 }, { "acc": 0.74141521, "epoch": 1.2420091324200913, "grad_norm": 1.859375, "learning_rate": 3.439126344839163e-06, "loss": 0.97777081, "memory(GiB)": 369.42, "step": 48960, "train_speed(iter/s)": 0.200966 }, { "acc": 0.75430779, "epoch": 1.2421359715880265, "grad_norm": 2.0, "learning_rate": 3.438130160773431e-06, "loss": 0.97780857, "memory(GiB)": 369.42, "step": 48965, "train_speed(iter/s)": 0.200969 }, { "acc": 0.74536099, "epoch": 1.2422628107559615, "grad_norm": 1.828125, "learning_rate": 3.43713404540571e-06, "loss": 1.06058598, "memory(GiB)": 369.42, "step": 48970, "train_speed(iter/s)": 0.200972 }, { "acc": 0.75075622, "epoch": 1.2423896499238964, "grad_norm": 2.09375, "learning_rate": 3.4361379987798094e-06, "loss": 1.02529917, "memory(GiB)": 369.42, "step": 48975, "train_speed(iter/s)": 0.200974 }, { "acc": 0.76428099, "epoch": 1.2425164890918317, "grad_norm": 2.1875, "learning_rate": 3.435142020939542e-06, "loss": 0.96733561, "memory(GiB)": 369.42, "step": 48980, "train_speed(iter/s)": 0.200975 }, { "acc": 0.7558044, "epoch": 1.2426433282597666, "grad_norm": 2.359375, "learning_rate": 3.4341461119287144e-06, "loss": 0.95346737, "memory(GiB)": 369.42, "step": 48985, "train_speed(iter/s)": 0.200977 }, { "acc": 0.75960374, "epoch": 1.2427701674277016, "grad_norm": 2.34375, "learning_rate": 3.433150271791135e-06, "loss": 0.99172993, "memory(GiB)": 369.42, "step": 48990, "train_speed(iter/s)": 0.20098 }, { "acc": 0.75445223, "epoch": 1.2428970065956366, "grad_norm": 2.40625, "learning_rate": 3.432154500570599e-06, "loss": 0.94663792, "memory(GiB)": 369.42, "step": 48995, "train_speed(iter/s)": 0.200984 }, { "acc": 0.75922813, "epoch": 1.2430238457635718, "grad_norm": 1.875, "learning_rate": 3.431158798310909e-06, "loss": 0.98350763, "memory(GiB)": 369.42, "step": 49000, "train_speed(iter/s)": 0.200988 }, { "epoch": 1.2430238457635718, "eval_acc": 0.7378568620578299, "eval_loss": 0.9699128270149231, "eval_runtime": 385.6588, "eval_samples_per_second": 16.517, "eval_steps_per_second": 8.259, "step": 49000 }, { "acc": 0.74465504, "epoch": 1.2431506849315068, "grad_norm": 2.28125, "learning_rate": 3.4301631650558588e-06, "loss": 1.01186981, "memory(GiB)": 369.42, "step": 49005, "train_speed(iter/s)": 0.200403 }, { "acc": 0.73621435, "epoch": 1.2432775240994418, "grad_norm": 2.09375, "learning_rate": 3.4291676008492424e-06, "loss": 0.99347954, "memory(GiB)": 369.42, "step": 49010, "train_speed(iter/s)": 0.200408 }, { "acc": 0.76570668, "epoch": 1.243404363267377, "grad_norm": 2.03125, "learning_rate": 3.428172105734848e-06, "loss": 0.94999304, "memory(GiB)": 369.42, "step": 49015, "train_speed(iter/s)": 0.200411 }, { "acc": 0.74923582, "epoch": 1.243531202435312, "grad_norm": 2.234375, "learning_rate": 3.4271766797564608e-06, "loss": 1.00866985, "memory(GiB)": 369.42, "step": 49020, "train_speed(iter/s)": 0.200413 }, { "acc": 0.74818864, "epoch": 1.243658041603247, "grad_norm": 2.359375, "learning_rate": 3.4261813229578665e-06, "loss": 1.00539913, "memory(GiB)": 369.42, "step": 49025, "train_speed(iter/s)": 0.200417 }, { "acc": 0.75698371, "epoch": 1.2437848807711822, "grad_norm": 2.296875, "learning_rate": 3.425186035382846e-06, "loss": 0.96974421, "memory(GiB)": 369.42, "step": 49030, "train_speed(iter/s)": 0.200419 }, { "acc": 0.75598035, "epoch": 1.2439117199391172, "grad_norm": 2.296875, "learning_rate": 3.4241908170751727e-06, "loss": 0.9725235, "memory(GiB)": 369.42, "step": 49035, "train_speed(iter/s)": 0.200422 }, { "acc": 0.74851618, "epoch": 1.2440385591070522, "grad_norm": 2.109375, "learning_rate": 3.4231956680786217e-06, "loss": 0.97136393, "memory(GiB)": 369.42, "step": 49040, "train_speed(iter/s)": 0.200425 }, { "acc": 0.75016017, "epoch": 1.2441653982749874, "grad_norm": 1.9609375, "learning_rate": 3.422200588436967e-06, "loss": 0.99887714, "memory(GiB)": 369.42, "step": 49045, "train_speed(iter/s)": 0.200427 }, { "acc": 0.74338608, "epoch": 1.2442922374429224, "grad_norm": 1.828125, "learning_rate": 3.4212055781939744e-06, "loss": 1.00702915, "memory(GiB)": 369.42, "step": 49050, "train_speed(iter/s)": 0.20043 }, { "acc": 0.74388943, "epoch": 1.2444190766108574, "grad_norm": 2.4375, "learning_rate": 3.4202106373934085e-06, "loss": 1.0501688, "memory(GiB)": 369.42, "step": 49055, "train_speed(iter/s)": 0.200434 }, { "acc": 0.75007467, "epoch": 1.2445459157787924, "grad_norm": 2.265625, "learning_rate": 3.4192157660790324e-06, "loss": 1.01310997, "memory(GiB)": 369.42, "step": 49060, "train_speed(iter/s)": 0.200435 }, { "acc": 0.74374537, "epoch": 1.2446727549467276, "grad_norm": 1.8359375, "learning_rate": 3.418220964294604e-06, "loss": 0.99176397, "memory(GiB)": 369.42, "step": 49065, "train_speed(iter/s)": 0.200438 }, { "acc": 0.74693079, "epoch": 1.2447995941146626, "grad_norm": 2.125, "learning_rate": 3.417226232083881e-06, "loss": 1.00377541, "memory(GiB)": 369.42, "step": 49070, "train_speed(iter/s)": 0.200441 }, { "acc": 0.74620719, "epoch": 1.2449264332825978, "grad_norm": 2.15625, "learning_rate": 3.416231569490615e-06, "loss": 1.00479259, "memory(GiB)": 369.42, "step": 49075, "train_speed(iter/s)": 0.200442 }, { "acc": 0.7767437, "epoch": 1.2450532724505328, "grad_norm": 2.109375, "learning_rate": 3.4152369765585545e-06, "loss": 0.84397316, "memory(GiB)": 369.42, "step": 49080, "train_speed(iter/s)": 0.200446 }, { "acc": 0.75299993, "epoch": 1.2451801116184678, "grad_norm": 2.359375, "learning_rate": 3.4142424533314474e-06, "loss": 0.96889629, "memory(GiB)": 369.42, "step": 49085, "train_speed(iter/s)": 0.20045 }, { "acc": 0.73998766, "epoch": 1.2453069507864027, "grad_norm": 2.28125, "learning_rate": 3.4132479998530383e-06, "loss": 1.02318945, "memory(GiB)": 369.42, "step": 49090, "train_speed(iter/s)": 0.200452 }, { "acc": 0.7507144, "epoch": 1.245433789954338, "grad_norm": 2.046875, "learning_rate": 3.4122536161670656e-06, "loss": 0.97595387, "memory(GiB)": 369.42, "step": 49095, "train_speed(iter/s)": 0.200455 }, { "acc": 0.75907321, "epoch": 1.245560629122273, "grad_norm": 2.4375, "learning_rate": 3.411259302317267e-06, "loss": 0.95804119, "memory(GiB)": 369.42, "step": 49100, "train_speed(iter/s)": 0.200458 }, { "acc": 0.74582863, "epoch": 1.245687468290208, "grad_norm": 2.046875, "learning_rate": 3.410265058347378e-06, "loss": 0.94557152, "memory(GiB)": 369.42, "step": 49105, "train_speed(iter/s)": 0.20046 }, { "acc": 0.75934448, "epoch": 1.2458143074581431, "grad_norm": 2.109375, "learning_rate": 3.4092708843011303e-06, "loss": 0.95798206, "memory(GiB)": 369.42, "step": 49110, "train_speed(iter/s)": 0.200462 }, { "acc": 0.74454837, "epoch": 1.2459411466260781, "grad_norm": 2.265625, "learning_rate": 3.4082767802222493e-06, "loss": 1.0223732, "memory(GiB)": 369.42, "step": 49115, "train_speed(iter/s)": 0.200465 }, { "acc": 0.74732575, "epoch": 1.2460679857940131, "grad_norm": 2.484375, "learning_rate": 3.4072827461544635e-06, "loss": 0.99488077, "memory(GiB)": 369.42, "step": 49120, "train_speed(iter/s)": 0.200467 }, { "acc": 0.750562, "epoch": 1.2461948249619483, "grad_norm": 2.25, "learning_rate": 3.4062887821414935e-06, "loss": 0.97714195, "memory(GiB)": 369.42, "step": 49125, "train_speed(iter/s)": 0.20047 }, { "acc": 0.75171452, "epoch": 1.2463216641298833, "grad_norm": 2.0, "learning_rate": 3.4052948882270585e-06, "loss": 0.97310801, "memory(GiB)": 369.42, "step": 49130, "train_speed(iter/s)": 0.200472 }, { "acc": 0.74726896, "epoch": 1.2464485032978183, "grad_norm": 2.765625, "learning_rate": 3.404301064454873e-06, "loss": 1.07307587, "memory(GiB)": 369.42, "step": 49135, "train_speed(iter/s)": 0.200473 }, { "acc": 0.75816402, "epoch": 1.2465753424657535, "grad_norm": 2.234375, "learning_rate": 3.4033073108686515e-06, "loss": 0.97174273, "memory(GiB)": 369.42, "step": 49140, "train_speed(iter/s)": 0.200477 }, { "acc": 0.73602104, "epoch": 1.2467021816336885, "grad_norm": 2.0625, "learning_rate": 3.4023136275121026e-06, "loss": 1.02751026, "memory(GiB)": 369.42, "step": 49145, "train_speed(iter/s)": 0.20048 }, { "acc": 0.72765107, "epoch": 1.2468290208016235, "grad_norm": 2.671875, "learning_rate": 3.401320014428935e-06, "loss": 1.10529423, "memory(GiB)": 369.42, "step": 49150, "train_speed(iter/s)": 0.200485 }, { "acc": 0.75008636, "epoch": 1.2469558599695585, "grad_norm": 2.078125, "learning_rate": 3.40032647166285e-06, "loss": 0.98765917, "memory(GiB)": 369.42, "step": 49155, "train_speed(iter/s)": 0.200488 }, { "acc": 0.76176419, "epoch": 1.2470826991374937, "grad_norm": 1.8984375, "learning_rate": 3.3993329992575473e-06, "loss": 0.96117878, "memory(GiB)": 369.42, "step": 49160, "train_speed(iter/s)": 0.20049 }, { "acc": 0.74427443, "epoch": 1.2472095383054287, "grad_norm": 2.03125, "learning_rate": 3.3983395972567277e-06, "loss": 1.00688257, "memory(GiB)": 369.42, "step": 49165, "train_speed(iter/s)": 0.200494 }, { "acc": 0.75578189, "epoch": 1.2473363774733637, "grad_norm": 2.390625, "learning_rate": 3.397346265704084e-06, "loss": 0.95676546, "memory(GiB)": 369.42, "step": 49170, "train_speed(iter/s)": 0.200497 }, { "acc": 0.77830019, "epoch": 1.2474632166412989, "grad_norm": 2.40625, "learning_rate": 3.396353004643306e-06, "loss": 0.94594536, "memory(GiB)": 369.42, "step": 49175, "train_speed(iter/s)": 0.200501 }, { "acc": 0.75308032, "epoch": 1.2475900558092339, "grad_norm": 2.046875, "learning_rate": 3.3953598141180817e-06, "loss": 0.98082523, "memory(GiB)": 369.42, "step": 49180, "train_speed(iter/s)": 0.200505 }, { "acc": 0.75955615, "epoch": 1.2477168949771689, "grad_norm": 3.03125, "learning_rate": 3.3943666941720978e-06, "loss": 0.95375767, "memory(GiB)": 369.42, "step": 49185, "train_speed(iter/s)": 0.200507 }, { "acc": 0.74960089, "epoch": 1.247843734145104, "grad_norm": 1.921875, "learning_rate": 3.3933736448490363e-06, "loss": 0.99334755, "memory(GiB)": 369.42, "step": 49190, "train_speed(iter/s)": 0.20051 }, { "acc": 0.75129795, "epoch": 1.247970573313039, "grad_norm": 2.859375, "learning_rate": 3.392380666192573e-06, "loss": 0.98528709, "memory(GiB)": 369.42, "step": 49195, "train_speed(iter/s)": 0.200514 }, { "acc": 0.74488993, "epoch": 1.248097412480974, "grad_norm": 1.9609375, "learning_rate": 3.391387758246386e-06, "loss": 1.05271854, "memory(GiB)": 369.42, "step": 49200, "train_speed(iter/s)": 0.200518 }, { "acc": 0.76705317, "epoch": 1.2482242516489093, "grad_norm": 1.9609375, "learning_rate": 3.3903949210541477e-06, "loss": 0.89380646, "memory(GiB)": 369.42, "step": 49205, "train_speed(iter/s)": 0.200521 }, { "acc": 0.75170264, "epoch": 1.2483510908168443, "grad_norm": 1.8828125, "learning_rate": 3.389402154659529e-06, "loss": 0.99909859, "memory(GiB)": 369.42, "step": 49210, "train_speed(iter/s)": 0.200523 }, { "acc": 0.74476047, "epoch": 1.2484779299847792, "grad_norm": 2.453125, "learning_rate": 3.388409459106192e-06, "loss": 1.03441219, "memory(GiB)": 369.42, "step": 49215, "train_speed(iter/s)": 0.200526 }, { "acc": 0.75187902, "epoch": 1.2486047691527142, "grad_norm": 2.015625, "learning_rate": 3.3874168344378024e-06, "loss": 0.95616417, "memory(GiB)": 369.42, "step": 49220, "train_speed(iter/s)": 0.200529 }, { "acc": 0.74785433, "epoch": 1.2487316083206494, "grad_norm": 2.234375, "learning_rate": 3.38642428069802e-06, "loss": 0.94083538, "memory(GiB)": 369.42, "step": 49225, "train_speed(iter/s)": 0.200533 }, { "acc": 0.75995197, "epoch": 1.2488584474885844, "grad_norm": 2.625, "learning_rate": 3.385431797930503e-06, "loss": 1.01200256, "memory(GiB)": 369.42, "step": 49230, "train_speed(iter/s)": 0.200535 }, { "acc": 0.74232206, "epoch": 1.2489852866565196, "grad_norm": 2.03125, "learning_rate": 3.3844393861789036e-06, "loss": 1.06423874, "memory(GiB)": 369.42, "step": 49235, "train_speed(iter/s)": 0.200537 }, { "acc": 0.75377955, "epoch": 1.2491121258244546, "grad_norm": 2.640625, "learning_rate": 3.383447045486872e-06, "loss": 1.01467304, "memory(GiB)": 369.42, "step": 49240, "train_speed(iter/s)": 0.200539 }, { "acc": 0.73604136, "epoch": 1.2492389649923896, "grad_norm": 1.984375, "learning_rate": 3.382454775898057e-06, "loss": 1.04655113, "memory(GiB)": 369.42, "step": 49245, "train_speed(iter/s)": 0.200543 }, { "acc": 0.75466223, "epoch": 1.2493658041603246, "grad_norm": 2.375, "learning_rate": 3.381462577456104e-06, "loss": 0.98167944, "memory(GiB)": 369.42, "step": 49250, "train_speed(iter/s)": 0.200545 }, { "acc": 0.75435019, "epoch": 1.2494926433282598, "grad_norm": 2.125, "learning_rate": 3.3804704502046527e-06, "loss": 0.9716197, "memory(GiB)": 369.42, "step": 49255, "train_speed(iter/s)": 0.200546 }, { "acc": 0.76022282, "epoch": 1.2496194824961948, "grad_norm": 2.03125, "learning_rate": 3.3794783941873406e-06, "loss": 0.96248112, "memory(GiB)": 369.42, "step": 49260, "train_speed(iter/s)": 0.20055 }, { "acc": 0.75259609, "epoch": 1.2497463216641298, "grad_norm": 2.65625, "learning_rate": 3.3784864094478044e-06, "loss": 0.99780083, "memory(GiB)": 369.42, "step": 49265, "train_speed(iter/s)": 0.200552 }, { "acc": 0.74603653, "epoch": 1.249873160832065, "grad_norm": 2.046875, "learning_rate": 3.377494496029677e-06, "loss": 0.99052706, "memory(GiB)": 369.42, "step": 49270, "train_speed(iter/s)": 0.200552 }, { "acc": 0.76038389, "epoch": 1.25, "grad_norm": 2.265625, "learning_rate": 3.3765026539765832e-06, "loss": 0.98292904, "memory(GiB)": 369.42, "step": 49275, "train_speed(iter/s)": 0.200556 }, { "acc": 0.74371061, "epoch": 1.250126839167935, "grad_norm": 2.359375, "learning_rate": 3.375510883332152e-06, "loss": 1.03310623, "memory(GiB)": 369.42, "step": 49280, "train_speed(iter/s)": 0.20056 }, { "acc": 0.74522514, "epoch": 1.2502536783358702, "grad_norm": 1.9296875, "learning_rate": 3.3745191841400037e-06, "loss": 0.99690714, "memory(GiB)": 369.42, "step": 49285, "train_speed(iter/s)": 0.200563 }, { "acc": 0.74174519, "epoch": 1.2503805175038052, "grad_norm": 1.9140625, "learning_rate": 3.373527556443762e-06, "loss": 1.03862419, "memory(GiB)": 369.42, "step": 49290, "train_speed(iter/s)": 0.200564 }, { "acc": 0.75398264, "epoch": 1.2505073566717402, "grad_norm": 2.09375, "learning_rate": 3.372536000287038e-06, "loss": 0.99026432, "memory(GiB)": 369.42, "step": 49295, "train_speed(iter/s)": 0.200567 }, { "acc": 0.76271143, "epoch": 1.2506341958396754, "grad_norm": 2.546875, "learning_rate": 3.3715445157134474e-06, "loss": 0.96561184, "memory(GiB)": 369.42, "step": 49300, "train_speed(iter/s)": 0.200571 }, { "acc": 0.74540215, "epoch": 1.2507610350076104, "grad_norm": 1.96875, "learning_rate": 3.370553102766598e-06, "loss": 0.95055828, "memory(GiB)": 369.42, "step": 49305, "train_speed(iter/s)": 0.200572 }, { "acc": 0.74334993, "epoch": 1.2508878741755454, "grad_norm": 2.03125, "learning_rate": 3.369561761490101e-06, "loss": 0.99771042, "memory(GiB)": 369.42, "step": 49310, "train_speed(iter/s)": 0.200575 }, { "acc": 0.74141016, "epoch": 1.2510147133434804, "grad_norm": 2.328125, "learning_rate": 3.3685704919275553e-06, "loss": 1.05533085, "memory(GiB)": 369.42, "step": 49315, "train_speed(iter/s)": 0.200578 }, { "acc": 0.76511297, "epoch": 1.2511415525114156, "grad_norm": 2.09375, "learning_rate": 3.3675792941225625e-06, "loss": 0.9298254, "memory(GiB)": 369.42, "step": 49320, "train_speed(iter/s)": 0.200579 }, { "acc": 0.7605669, "epoch": 1.2512683916793506, "grad_norm": 2.4375, "learning_rate": 3.3665881681187214e-06, "loss": 0.97021036, "memory(GiB)": 369.42, "step": 49325, "train_speed(iter/s)": 0.200581 }, { "acc": 0.76041236, "epoch": 1.2513952308472858, "grad_norm": 2.546875, "learning_rate": 3.3655971139596265e-06, "loss": 0.97968082, "memory(GiB)": 369.42, "step": 49330, "train_speed(iter/s)": 0.200584 }, { "acc": 0.74393072, "epoch": 1.2515220700152208, "grad_norm": 2.125, "learning_rate": 3.3646061316888655e-06, "loss": 1.02660217, "memory(GiB)": 369.42, "step": 49335, "train_speed(iter/s)": 0.200588 }, { "acc": 0.74337349, "epoch": 1.2516489091831557, "grad_norm": 2.734375, "learning_rate": 3.3636152213500295e-06, "loss": 1.03419418, "memory(GiB)": 369.42, "step": 49340, "train_speed(iter/s)": 0.200589 }, { "acc": 0.74705448, "epoch": 1.2517757483510907, "grad_norm": 2.421875, "learning_rate": 3.362624382986702e-06, "loss": 0.99263296, "memory(GiB)": 369.42, "step": 49345, "train_speed(iter/s)": 0.200592 }, { "acc": 0.75336242, "epoch": 1.251902587519026, "grad_norm": 1.9375, "learning_rate": 3.3616336166424653e-06, "loss": 0.98794069, "memory(GiB)": 369.42, "step": 49350, "train_speed(iter/s)": 0.200595 }, { "acc": 0.75725679, "epoch": 1.252029426686961, "grad_norm": 2.234375, "learning_rate": 3.360642922360895e-06, "loss": 0.93119087, "memory(GiB)": 369.42, "step": 49355, "train_speed(iter/s)": 0.200597 }, { "acc": 0.74235449, "epoch": 1.252156265854896, "grad_norm": 2.3125, "learning_rate": 3.3596523001855684e-06, "loss": 1.07490902, "memory(GiB)": 369.42, "step": 49360, "train_speed(iter/s)": 0.200599 }, { "acc": 0.77213655, "epoch": 1.2522831050228311, "grad_norm": 2.109375, "learning_rate": 3.358661750160057e-06, "loss": 0.91547232, "memory(GiB)": 369.42, "step": 49365, "train_speed(iter/s)": 0.200601 }, { "acc": 0.74986067, "epoch": 1.2524099441907661, "grad_norm": 1.8125, "learning_rate": 3.3576712723279326e-06, "loss": 0.98656502, "memory(GiB)": 369.42, "step": 49370, "train_speed(iter/s)": 0.200605 }, { "acc": 0.74048767, "epoch": 1.2525367833587011, "grad_norm": 2.1875, "learning_rate": 3.3566808667327566e-06, "loss": 1.07703733, "memory(GiB)": 369.42, "step": 49375, "train_speed(iter/s)": 0.200609 }, { "acc": 0.76379595, "epoch": 1.252663622526636, "grad_norm": 2.28125, "learning_rate": 3.355690533418091e-06, "loss": 0.97804871, "memory(GiB)": 369.42, "step": 49380, "train_speed(iter/s)": 0.200613 }, { "acc": 0.74101982, "epoch": 1.2527904616945713, "grad_norm": 1.9140625, "learning_rate": 3.354700272427499e-06, "loss": 1.0297039, "memory(GiB)": 369.42, "step": 49385, "train_speed(iter/s)": 0.200613 }, { "acc": 0.75506506, "epoch": 1.2529173008625063, "grad_norm": 2.140625, "learning_rate": 3.3537100838045356e-06, "loss": 1.04549923, "memory(GiB)": 369.42, "step": 49390, "train_speed(iter/s)": 0.200615 }, { "acc": 0.7544971, "epoch": 1.2530441400304415, "grad_norm": 2.453125, "learning_rate": 3.3527199675927526e-06, "loss": 1.00822258, "memory(GiB)": 369.42, "step": 49395, "train_speed(iter/s)": 0.200618 }, { "acc": 0.74439774, "epoch": 1.2531709791983765, "grad_norm": 2.1875, "learning_rate": 3.3517299238356982e-06, "loss": 1.04334955, "memory(GiB)": 369.42, "step": 49400, "train_speed(iter/s)": 0.200622 }, { "acc": 0.74776001, "epoch": 1.2532978183663115, "grad_norm": 1.9609375, "learning_rate": 3.3507399525769214e-06, "loss": 0.96937389, "memory(GiB)": 369.42, "step": 49405, "train_speed(iter/s)": 0.200624 }, { "acc": 0.73453255, "epoch": 1.2534246575342465, "grad_norm": 2.3125, "learning_rate": 3.3497500538599664e-06, "loss": 1.05122881, "memory(GiB)": 369.42, "step": 49410, "train_speed(iter/s)": 0.200628 }, { "acc": 0.74896312, "epoch": 1.2535514967021817, "grad_norm": 1.984375, "learning_rate": 3.34876022772837e-06, "loss": 1.01628456, "memory(GiB)": 369.42, "step": 49415, "train_speed(iter/s)": 0.200631 }, { "acc": 0.74854107, "epoch": 1.2536783358701167, "grad_norm": 2.15625, "learning_rate": 3.347770474225672e-06, "loss": 1.04662151, "memory(GiB)": 369.42, "step": 49420, "train_speed(iter/s)": 0.200634 }, { "acc": 0.74620204, "epoch": 1.2538051750380519, "grad_norm": 2.046875, "learning_rate": 3.3467807933954034e-06, "loss": 0.97269516, "memory(GiB)": 369.42, "step": 49425, "train_speed(iter/s)": 0.200637 }, { "acc": 0.74549479, "epoch": 1.2539320142059869, "grad_norm": 1.828125, "learning_rate": 3.345791185281101e-06, "loss": 0.98576403, "memory(GiB)": 369.42, "step": 49430, "train_speed(iter/s)": 0.200642 }, { "acc": 0.762011, "epoch": 1.2540588533739219, "grad_norm": 2.125, "learning_rate": 3.3448016499262836e-06, "loss": 0.99912043, "memory(GiB)": 369.42, "step": 49435, "train_speed(iter/s)": 0.200645 }, { "acc": 0.7477469, "epoch": 1.2541856925418569, "grad_norm": 2.125, "learning_rate": 3.3438121873744812e-06, "loss": 1.02663841, "memory(GiB)": 369.42, "step": 49440, "train_speed(iter/s)": 0.200648 }, { "acc": 0.75071621, "epoch": 1.254312531709792, "grad_norm": 1.765625, "learning_rate": 3.342822797669212e-06, "loss": 0.97227802, "memory(GiB)": 369.42, "step": 49445, "train_speed(iter/s)": 0.200651 }, { "acc": 0.74210043, "epoch": 1.254439370877727, "grad_norm": 2.1875, "learning_rate": 3.3418334808539966e-06, "loss": 1.04973068, "memory(GiB)": 369.42, "step": 49450, "train_speed(iter/s)": 0.200653 }, { "acc": 0.7385304, "epoch": 1.254566210045662, "grad_norm": 1.9765625, "learning_rate": 3.340844236972347e-06, "loss": 1.0550745, "memory(GiB)": 369.42, "step": 49455, "train_speed(iter/s)": 0.200657 }, { "acc": 0.76247606, "epoch": 1.2546930492135973, "grad_norm": 2.25, "learning_rate": 3.3398550660677748e-06, "loss": 0.97471409, "memory(GiB)": 369.42, "step": 49460, "train_speed(iter/s)": 0.200661 }, { "acc": 0.74540091, "epoch": 1.2548198883815322, "grad_norm": 2.8125, "learning_rate": 3.3388659681837898e-06, "loss": 1.01440239, "memory(GiB)": 369.42, "step": 49465, "train_speed(iter/s)": 0.200663 }, { "acc": 0.738098, "epoch": 1.2549467275494672, "grad_norm": 2.890625, "learning_rate": 3.3378769433638965e-06, "loss": 1.05250072, "memory(GiB)": 369.42, "step": 49470, "train_speed(iter/s)": 0.200667 }, { "acc": 0.74643564, "epoch": 1.2550735667174022, "grad_norm": 2.3125, "learning_rate": 3.336887991651595e-06, "loss": 0.99698448, "memory(GiB)": 369.42, "step": 49475, "train_speed(iter/s)": 0.200669 }, { "acc": 0.74507036, "epoch": 1.2552004058853374, "grad_norm": 1.8515625, "learning_rate": 3.3358991130903845e-06, "loss": 0.99814415, "memory(GiB)": 369.42, "step": 49480, "train_speed(iter/s)": 0.200673 }, { "acc": 0.77555461, "epoch": 1.2553272450532724, "grad_norm": 2.140625, "learning_rate": 3.334910307723761e-06, "loss": 0.91706581, "memory(GiB)": 369.42, "step": 49485, "train_speed(iter/s)": 0.200678 }, { "acc": 0.76210327, "epoch": 1.2554540842212076, "grad_norm": 2.375, "learning_rate": 3.333921575595218e-06, "loss": 0.98293839, "memory(GiB)": 369.42, "step": 49490, "train_speed(iter/s)": 0.200682 }, { "acc": 0.7338604, "epoch": 1.2555809233891426, "grad_norm": 2.125, "learning_rate": 3.3329329167482404e-06, "loss": 0.9977375, "memory(GiB)": 369.42, "step": 49495, "train_speed(iter/s)": 0.200685 }, { "acc": 0.76271915, "epoch": 1.2557077625570776, "grad_norm": 2.25, "learning_rate": 3.331944331226317e-06, "loss": 0.95665455, "memory(GiB)": 369.42, "step": 49500, "train_speed(iter/s)": 0.200689 }, { "acc": 0.76119776, "epoch": 1.2558346017250126, "grad_norm": 1.90625, "learning_rate": 3.330955819072928e-06, "loss": 0.99975567, "memory(GiB)": 369.42, "step": 49505, "train_speed(iter/s)": 0.200693 }, { "acc": 0.77892466, "epoch": 1.2559614408929478, "grad_norm": 2.34375, "learning_rate": 3.329967380331556e-06, "loss": 0.90047541, "memory(GiB)": 369.42, "step": 49510, "train_speed(iter/s)": 0.200696 }, { "acc": 0.74739985, "epoch": 1.2560882800608828, "grad_norm": 3.359375, "learning_rate": 3.3289790150456737e-06, "loss": 0.9710947, "memory(GiB)": 369.42, "step": 49515, "train_speed(iter/s)": 0.2007 }, { "acc": 0.75394592, "epoch": 1.2562151192288178, "grad_norm": 1.8671875, "learning_rate": 3.327990723258755e-06, "loss": 0.96557941, "memory(GiB)": 369.42, "step": 49520, "train_speed(iter/s)": 0.200704 }, { "acc": 0.7495965, "epoch": 1.256341958396753, "grad_norm": 1.9296875, "learning_rate": 3.3270025050142684e-06, "loss": 0.95289135, "memory(GiB)": 369.42, "step": 49525, "train_speed(iter/s)": 0.200707 }, { "acc": 0.75337391, "epoch": 1.256468797564688, "grad_norm": 1.9296875, "learning_rate": 3.3260143603556827e-06, "loss": 0.9700366, "memory(GiB)": 369.42, "step": 49530, "train_speed(iter/s)": 0.200711 }, { "acc": 0.74247532, "epoch": 1.256595636732623, "grad_norm": 2.28125, "learning_rate": 3.3250262893264583e-06, "loss": 0.98604889, "memory(GiB)": 369.42, "step": 49535, "train_speed(iter/s)": 0.200714 }, { "acc": 0.74030232, "epoch": 1.256722475900558, "grad_norm": 2.375, "learning_rate": 3.3240382919700555e-06, "loss": 0.98333931, "memory(GiB)": 369.42, "step": 49540, "train_speed(iter/s)": 0.200717 }, { "acc": 0.75221686, "epoch": 1.2568493150684932, "grad_norm": 2.265625, "learning_rate": 3.3230503683299316e-06, "loss": 0.9801342, "memory(GiB)": 369.42, "step": 49545, "train_speed(iter/s)": 0.20072 }, { "acc": 0.76006842, "epoch": 1.2569761542364282, "grad_norm": 2.65625, "learning_rate": 3.3220625184495404e-06, "loss": 1.00240946, "memory(GiB)": 369.42, "step": 49550, "train_speed(iter/s)": 0.200724 }, { "acc": 0.74336443, "epoch": 1.2571029934043634, "grad_norm": 2.390625, "learning_rate": 3.3210747423723293e-06, "loss": 1.03889332, "memory(GiB)": 369.42, "step": 49555, "train_speed(iter/s)": 0.200727 }, { "acc": 0.75121222, "epoch": 1.2572298325722984, "grad_norm": 2.09375, "learning_rate": 3.3200870401417486e-06, "loss": 1.03843536, "memory(GiB)": 369.42, "step": 49560, "train_speed(iter/s)": 0.20073 }, { "acc": 0.75278225, "epoch": 1.2573566717402334, "grad_norm": 1.9609375, "learning_rate": 3.3190994118012387e-06, "loss": 1.01008091, "memory(GiB)": 369.42, "step": 49565, "train_speed(iter/s)": 0.200734 }, { "acc": 0.75160313, "epoch": 1.2574835109081683, "grad_norm": 2.234375, "learning_rate": 3.318111857394244e-06, "loss": 1.01394978, "memory(GiB)": 369.42, "step": 49570, "train_speed(iter/s)": 0.200737 }, { "acc": 0.76558404, "epoch": 1.2576103500761036, "grad_norm": 2.25, "learning_rate": 3.3171243769641957e-06, "loss": 0.97017775, "memory(GiB)": 369.42, "step": 49575, "train_speed(iter/s)": 0.20074 }, { "acc": 0.74611444, "epoch": 1.2577371892440385, "grad_norm": 2.53125, "learning_rate": 3.316136970554532e-06, "loss": 1.02613945, "memory(GiB)": 369.42, "step": 49580, "train_speed(iter/s)": 0.200743 }, { "acc": 0.75759306, "epoch": 1.2578640284119738, "grad_norm": 2.28125, "learning_rate": 3.315149638208681e-06, "loss": 0.99155445, "memory(GiB)": 369.42, "step": 49585, "train_speed(iter/s)": 0.200745 }, { "acc": 0.75734949, "epoch": 1.2579908675799087, "grad_norm": 2.4375, "learning_rate": 3.3141623799700738e-06, "loss": 0.97962894, "memory(GiB)": 369.42, "step": 49590, "train_speed(iter/s)": 0.200745 }, { "acc": 0.7347084, "epoch": 1.2581177067478437, "grad_norm": 2.484375, "learning_rate": 3.3131751958821313e-06, "loss": 1.06600933, "memory(GiB)": 369.42, "step": 49595, "train_speed(iter/s)": 0.200748 }, { "acc": 0.75377774, "epoch": 1.2582445459157787, "grad_norm": 2.5625, "learning_rate": 3.312188085988273e-06, "loss": 0.97003298, "memory(GiB)": 369.42, "step": 49600, "train_speed(iter/s)": 0.20075 }, { "acc": 0.75878453, "epoch": 1.258371385083714, "grad_norm": 2.03125, "learning_rate": 3.311201050331919e-06, "loss": 0.99736891, "memory(GiB)": 369.42, "step": 49605, "train_speed(iter/s)": 0.200753 }, { "acc": 0.75999231, "epoch": 1.258498224251649, "grad_norm": 2.015625, "learning_rate": 3.310214088956485e-06, "loss": 0.96219664, "memory(GiB)": 369.42, "step": 49610, "train_speed(iter/s)": 0.200756 }, { "acc": 0.77147284, "epoch": 1.258625063419584, "grad_norm": 2.140625, "learning_rate": 3.3092272019053773e-06, "loss": 0.88010473, "memory(GiB)": 369.42, "step": 49615, "train_speed(iter/s)": 0.200759 }, { "acc": 0.76796026, "epoch": 1.2587519025875191, "grad_norm": 1.796875, "learning_rate": 3.308240389222006e-06, "loss": 0.91013336, "memory(GiB)": 369.42, "step": 49620, "train_speed(iter/s)": 0.20076 }, { "acc": 0.75550394, "epoch": 1.2588787417554541, "grad_norm": 1.96875, "learning_rate": 3.3072536509497762e-06, "loss": 0.94492626, "memory(GiB)": 369.42, "step": 49625, "train_speed(iter/s)": 0.200762 }, { "acc": 0.75804482, "epoch": 1.259005580923389, "grad_norm": 1.890625, "learning_rate": 3.306266987132089e-06, "loss": 0.9587801, "memory(GiB)": 369.42, "step": 49630, "train_speed(iter/s)": 0.200765 }, { "acc": 0.76262102, "epoch": 1.259132420091324, "grad_norm": 2.140625, "learning_rate": 3.3052803978123405e-06, "loss": 0.92300825, "memory(GiB)": 369.42, "step": 49635, "train_speed(iter/s)": 0.200769 }, { "acc": 0.75667543, "epoch": 1.2592592592592593, "grad_norm": 1.7890625, "learning_rate": 3.3042938830339264e-06, "loss": 0.95166626, "memory(GiB)": 369.42, "step": 49640, "train_speed(iter/s)": 0.200772 }, { "acc": 0.75649395, "epoch": 1.2593860984271943, "grad_norm": 2.15625, "learning_rate": 3.303307442840238e-06, "loss": 0.93116531, "memory(GiB)": 369.42, "step": 49645, "train_speed(iter/s)": 0.200776 }, { "acc": 0.75882339, "epoch": 1.2595129375951295, "grad_norm": 1.7890625, "learning_rate": 3.302321077274666e-06, "loss": 0.92755756, "memory(GiB)": 369.42, "step": 49650, "train_speed(iter/s)": 0.200779 }, { "acc": 0.73142366, "epoch": 1.2596397767630645, "grad_norm": 1.921875, "learning_rate": 3.30133478638059e-06, "loss": 1.07718868, "memory(GiB)": 369.42, "step": 49655, "train_speed(iter/s)": 0.200782 }, { "acc": 0.75259104, "epoch": 1.2597666159309995, "grad_norm": 1.84375, "learning_rate": 3.300348570201395e-06, "loss": 0.95271759, "memory(GiB)": 369.42, "step": 49660, "train_speed(iter/s)": 0.200784 }, { "acc": 0.74454317, "epoch": 1.2598934550989345, "grad_norm": 2.078125, "learning_rate": 3.299362428780457e-06, "loss": 1.01598167, "memory(GiB)": 369.42, "step": 49665, "train_speed(iter/s)": 0.200787 }, { "acc": 0.76468029, "epoch": 1.2600202942668697, "grad_norm": 2.453125, "learning_rate": 3.298376362161154e-06, "loss": 0.87537403, "memory(GiB)": 369.42, "step": 49670, "train_speed(iter/s)": 0.20079 }, { "acc": 0.76079731, "epoch": 1.2601471334348047, "grad_norm": 2.296875, "learning_rate": 3.297390370386856e-06, "loss": 0.9816761, "memory(GiB)": 369.42, "step": 49675, "train_speed(iter/s)": 0.200792 }, { "acc": 0.73056054, "epoch": 1.2602739726027397, "grad_norm": 2.140625, "learning_rate": 3.2964044535009288e-06, "loss": 1.05445194, "memory(GiB)": 369.42, "step": 49680, "train_speed(iter/s)": 0.200794 }, { "acc": 0.75248899, "epoch": 1.2604008117706749, "grad_norm": 2.265625, "learning_rate": 3.2954186115467412e-06, "loss": 0.97287827, "memory(GiB)": 369.42, "step": 49685, "train_speed(iter/s)": 0.200796 }, { "acc": 0.74615679, "epoch": 1.2605276509386099, "grad_norm": 2.125, "learning_rate": 3.2944328445676543e-06, "loss": 0.99140816, "memory(GiB)": 369.42, "step": 49690, "train_speed(iter/s)": 0.200799 }, { "acc": 0.74313126, "epoch": 1.2606544901065448, "grad_norm": 2.15625, "learning_rate": 3.2934471526070254e-06, "loss": 1.04049873, "memory(GiB)": 369.42, "step": 49695, "train_speed(iter/s)": 0.200802 }, { "acc": 0.76313758, "epoch": 1.2607813292744798, "grad_norm": 2.5, "learning_rate": 3.2924615357082078e-06, "loss": 0.9734745, "memory(GiB)": 369.42, "step": 49700, "train_speed(iter/s)": 0.200807 }, { "acc": 0.74328198, "epoch": 1.260908168442415, "grad_norm": 2.40625, "learning_rate": 3.2914759939145574e-06, "loss": 1.03509302, "memory(GiB)": 369.42, "step": 49705, "train_speed(iter/s)": 0.200809 }, { "acc": 0.74502001, "epoch": 1.26103500761035, "grad_norm": 2.390625, "learning_rate": 3.2904905272694214e-06, "loss": 1.00588055, "memory(GiB)": 369.42, "step": 49710, "train_speed(iter/s)": 0.200811 }, { "acc": 0.75316873, "epoch": 1.2611618467782852, "grad_norm": 2.046875, "learning_rate": 3.289505135816142e-06, "loss": 0.95742245, "memory(GiB)": 369.42, "step": 49715, "train_speed(iter/s)": 0.200814 }, { "acc": 0.74381609, "epoch": 1.2612886859462202, "grad_norm": 2.15625, "learning_rate": 3.2885198195980653e-06, "loss": 0.99029331, "memory(GiB)": 369.42, "step": 49720, "train_speed(iter/s)": 0.200816 }, { "acc": 0.7459487, "epoch": 1.2614155251141552, "grad_norm": 2.203125, "learning_rate": 3.287534578658527e-06, "loss": 0.99034815, "memory(GiB)": 369.42, "step": 49725, "train_speed(iter/s)": 0.200819 }, { "acc": 0.75462575, "epoch": 1.2615423642820902, "grad_norm": 2.21875, "learning_rate": 3.2865494130408657e-06, "loss": 0.9769413, "memory(GiB)": 369.42, "step": 49730, "train_speed(iter/s)": 0.200823 }, { "acc": 0.74818735, "epoch": 1.2616692034500254, "grad_norm": 2.5, "learning_rate": 3.2855643227884097e-06, "loss": 1.00945854, "memory(GiB)": 369.42, "step": 49735, "train_speed(iter/s)": 0.200826 }, { "acc": 0.75524249, "epoch": 1.2617960426179604, "grad_norm": 2.015625, "learning_rate": 3.28457930794449e-06, "loss": 0.9607893, "memory(GiB)": 369.42, "step": 49740, "train_speed(iter/s)": 0.200828 }, { "acc": 0.76606798, "epoch": 1.2619228817858956, "grad_norm": 2.296875, "learning_rate": 3.283594368552429e-06, "loss": 0.93905764, "memory(GiB)": 369.42, "step": 49745, "train_speed(iter/s)": 0.200832 }, { "acc": 0.74199982, "epoch": 1.2620497209538306, "grad_norm": 2.59375, "learning_rate": 3.282609504655554e-06, "loss": 1.02265806, "memory(GiB)": 369.42, "step": 49750, "train_speed(iter/s)": 0.200833 }, { "acc": 0.75006771, "epoch": 1.2621765601217656, "grad_norm": 2.6875, "learning_rate": 3.281624716297179e-06, "loss": 0.95267544, "memory(GiB)": 369.42, "step": 49755, "train_speed(iter/s)": 0.200836 }, { "acc": 0.75150843, "epoch": 1.2623033992897006, "grad_norm": 2.3125, "learning_rate": 3.28064000352062e-06, "loss": 1.00180159, "memory(GiB)": 369.42, "step": 49760, "train_speed(iter/s)": 0.200839 }, { "acc": 0.74863653, "epoch": 1.2624302384576358, "grad_norm": 2.109375, "learning_rate": 3.279655366369191e-06, "loss": 0.97591782, "memory(GiB)": 369.42, "step": 49765, "train_speed(iter/s)": 0.200841 }, { "acc": 0.74324703, "epoch": 1.2625570776255708, "grad_norm": 1.9609375, "learning_rate": 3.2786708048862e-06, "loss": 1.02183943, "memory(GiB)": 369.42, "step": 49770, "train_speed(iter/s)": 0.200844 }, { "acc": 0.75454912, "epoch": 1.2626839167935058, "grad_norm": 2.59375, "learning_rate": 3.2776863191149517e-06, "loss": 1.01202221, "memory(GiB)": 369.42, "step": 49775, "train_speed(iter/s)": 0.200848 }, { "acc": 0.7563921, "epoch": 1.262810755961441, "grad_norm": 2.265625, "learning_rate": 3.2767019090987483e-06, "loss": 1.00697002, "memory(GiB)": 369.42, "step": 49780, "train_speed(iter/s)": 0.200851 }, { "acc": 0.73671827, "epoch": 1.262937595129376, "grad_norm": 1.8671875, "learning_rate": 3.27571757488089e-06, "loss": 1.071422, "memory(GiB)": 369.42, "step": 49785, "train_speed(iter/s)": 0.200853 }, { "acc": 0.75479383, "epoch": 1.263064434297311, "grad_norm": 2.28125, "learning_rate": 3.274733316504672e-06, "loss": 0.96206913, "memory(GiB)": 369.42, "step": 49790, "train_speed(iter/s)": 0.200854 }, { "acc": 0.75483475, "epoch": 1.263191273465246, "grad_norm": 1.7421875, "learning_rate": 3.273749134013383e-06, "loss": 0.91969852, "memory(GiB)": 369.42, "step": 49795, "train_speed(iter/s)": 0.200857 }, { "acc": 0.75961428, "epoch": 1.2633181126331812, "grad_norm": 2.03125, "learning_rate": 3.2727650274503154e-06, "loss": 0.9755661, "memory(GiB)": 369.42, "step": 49800, "train_speed(iter/s)": 0.20086 }, { "acc": 0.75339613, "epoch": 1.2634449518011162, "grad_norm": 2.359375, "learning_rate": 3.2717809968587523e-06, "loss": 0.96407194, "memory(GiB)": 369.42, "step": 49805, "train_speed(iter/s)": 0.200864 }, { "acc": 0.75724859, "epoch": 1.2635717909690514, "grad_norm": 2.328125, "learning_rate": 3.270797042281979e-06, "loss": 0.96735954, "memory(GiB)": 369.42, "step": 49810, "train_speed(iter/s)": 0.200866 }, { "acc": 0.74306822, "epoch": 1.2636986301369864, "grad_norm": 1.9453125, "learning_rate": 3.269813163763271e-06, "loss": 1.08568707, "memory(GiB)": 369.42, "step": 49815, "train_speed(iter/s)": 0.200869 }, { "acc": 0.75372677, "epoch": 1.2638254693049213, "grad_norm": 2.234375, "learning_rate": 3.268829361345904e-06, "loss": 0.97318802, "memory(GiB)": 369.42, "step": 49820, "train_speed(iter/s)": 0.20087 }, { "acc": 0.76007571, "epoch": 1.2639523084728563, "grad_norm": 2.140625, "learning_rate": 3.2678456350731526e-06, "loss": 0.94844704, "memory(GiB)": 369.42, "step": 49825, "train_speed(iter/s)": 0.200874 }, { "acc": 0.76516113, "epoch": 1.2640791476407915, "grad_norm": 2.390625, "learning_rate": 3.266861984988283e-06, "loss": 0.92973566, "memory(GiB)": 369.42, "step": 49830, "train_speed(iter/s)": 0.200877 }, { "acc": 0.76165576, "epoch": 1.2642059868087265, "grad_norm": 2.484375, "learning_rate": 3.2658784111345614e-06, "loss": 0.98903666, "memory(GiB)": 369.42, "step": 49835, "train_speed(iter/s)": 0.200881 }, { "acc": 0.75224705, "epoch": 1.2643328259766615, "grad_norm": 2.109375, "learning_rate": 3.2648949135552482e-06, "loss": 0.92481518, "memory(GiB)": 369.42, "step": 49840, "train_speed(iter/s)": 0.200883 }, { "acc": 0.75089283, "epoch": 1.2644596651445967, "grad_norm": 3.15625, "learning_rate": 3.2639114922936045e-06, "loss": 1.00177526, "memory(GiB)": 369.42, "step": 49845, "train_speed(iter/s)": 0.200885 }, { "acc": 0.76311717, "epoch": 1.2645865043125317, "grad_norm": 2.34375, "learning_rate": 3.2629281473928855e-06, "loss": 0.90258598, "memory(GiB)": 369.42, "step": 49850, "train_speed(iter/s)": 0.200889 }, { "acc": 0.7514329, "epoch": 1.2647133434804667, "grad_norm": 1.8671875, "learning_rate": 3.26194487889634e-06, "loss": 0.98395729, "memory(GiB)": 369.42, "step": 49855, "train_speed(iter/s)": 0.200893 }, { "acc": 0.76438599, "epoch": 1.2648401826484017, "grad_norm": 2.328125, "learning_rate": 3.2609616868472192e-06, "loss": 0.93901138, "memory(GiB)": 369.42, "step": 49860, "train_speed(iter/s)": 0.200897 }, { "acc": 0.75233531, "epoch": 1.264967021816337, "grad_norm": 2.40625, "learning_rate": 3.259978571288767e-06, "loss": 0.95422249, "memory(GiB)": 369.42, "step": 49865, "train_speed(iter/s)": 0.2009 }, { "acc": 0.73992147, "epoch": 1.265093860984272, "grad_norm": 2.09375, "learning_rate": 3.2589955322642293e-06, "loss": 1.06072788, "memory(GiB)": 369.42, "step": 49870, "train_speed(iter/s)": 0.200903 }, { "acc": 0.74959641, "epoch": 1.2652207001522071, "grad_norm": 1.734375, "learning_rate": 3.2580125698168376e-06, "loss": 0.95024147, "memory(GiB)": 369.42, "step": 49875, "train_speed(iter/s)": 0.200906 }, { "acc": 0.74070892, "epoch": 1.265347539320142, "grad_norm": 2.21875, "learning_rate": 3.2570296839898314e-06, "loss": 1.0429472, "memory(GiB)": 369.42, "step": 49880, "train_speed(iter/s)": 0.200908 }, { "acc": 0.75862899, "epoch": 1.265474378488077, "grad_norm": 2.515625, "learning_rate": 3.2560468748264405e-06, "loss": 0.95714149, "memory(GiB)": 369.42, "step": 49885, "train_speed(iter/s)": 0.200911 }, { "acc": 0.74700394, "epoch": 1.265601217656012, "grad_norm": 2.09375, "learning_rate": 3.2550641423698965e-06, "loss": 0.99639854, "memory(GiB)": 369.42, "step": 49890, "train_speed(iter/s)": 0.200915 }, { "acc": 0.73565092, "epoch": 1.2657280568239473, "grad_norm": 2.015625, "learning_rate": 3.2540814866634206e-06, "loss": 1.02351437, "memory(GiB)": 369.42, "step": 49895, "train_speed(iter/s)": 0.200917 }, { "acc": 0.74240427, "epoch": 1.2658548959918823, "grad_norm": 2.1875, "learning_rate": 3.2530989077502355e-06, "loss": 0.98612614, "memory(GiB)": 369.42, "step": 49900, "train_speed(iter/s)": 0.200918 }, { "acc": 0.76719046, "epoch": 1.2659817351598175, "grad_norm": 2.578125, "learning_rate": 3.252116405673561e-06, "loss": 0.92155094, "memory(GiB)": 369.42, "step": 49905, "train_speed(iter/s)": 0.200921 }, { "acc": 0.75854692, "epoch": 1.2661085743277525, "grad_norm": 2.53125, "learning_rate": 3.2511339804766107e-06, "loss": 0.94651909, "memory(GiB)": 369.42, "step": 49910, "train_speed(iter/s)": 0.200923 }, { "acc": 0.76351528, "epoch": 1.2662354134956875, "grad_norm": 2.3125, "learning_rate": 3.250151632202596e-06, "loss": 0.95980892, "memory(GiB)": 369.42, "step": 49915, "train_speed(iter/s)": 0.200925 }, { "acc": 0.7606967, "epoch": 1.2663622526636225, "grad_norm": 2.671875, "learning_rate": 3.249169360894724e-06, "loss": 0.96121216, "memory(GiB)": 369.42, "step": 49920, "train_speed(iter/s)": 0.200928 }, { "acc": 0.74397535, "epoch": 1.2664890918315577, "grad_norm": 2.53125, "learning_rate": 3.2481871665962006e-06, "loss": 1.07042494, "memory(GiB)": 369.42, "step": 49925, "train_speed(iter/s)": 0.200932 }, { "acc": 0.76359768, "epoch": 1.2666159309994927, "grad_norm": 2.109375, "learning_rate": 3.2472050493502282e-06, "loss": 0.93071308, "memory(GiB)": 369.42, "step": 49930, "train_speed(iter/s)": 0.200935 }, { "acc": 0.76003761, "epoch": 1.2667427701674276, "grad_norm": 2.09375, "learning_rate": 3.2462230092000017e-06, "loss": 0.93484755, "memory(GiB)": 369.42, "step": 49935, "train_speed(iter/s)": 0.200937 }, { "acc": 0.75527472, "epoch": 1.2668696093353629, "grad_norm": 2.0625, "learning_rate": 3.2452410461887184e-06, "loss": 0.95663033, "memory(GiB)": 369.42, "step": 49940, "train_speed(iter/s)": 0.200939 }, { "acc": 0.76215239, "epoch": 1.2669964485032978, "grad_norm": 2.0625, "learning_rate": 3.244259160359567e-06, "loss": 0.96209335, "memory(GiB)": 369.42, "step": 49945, "train_speed(iter/s)": 0.200943 }, { "acc": 0.7493536, "epoch": 1.2671232876712328, "grad_norm": 2.15625, "learning_rate": 3.2432773517557385e-06, "loss": 0.98874969, "memory(GiB)": 369.42, "step": 49950, "train_speed(iter/s)": 0.200943 }, { "acc": 0.75793929, "epoch": 1.2672501268391678, "grad_norm": 1.8828125, "learning_rate": 3.2422956204204147e-06, "loss": 0.93817501, "memory(GiB)": 369.42, "step": 49955, "train_speed(iter/s)": 0.200946 }, { "acc": 0.74822936, "epoch": 1.267376966007103, "grad_norm": 2.4375, "learning_rate": 3.2413139663967763e-06, "loss": 1.03894215, "memory(GiB)": 369.42, "step": 49960, "train_speed(iter/s)": 0.200949 }, { "acc": 0.76468759, "epoch": 1.267503805175038, "grad_norm": 2.125, "learning_rate": 3.2403323897280013e-06, "loss": 0.91313238, "memory(GiB)": 369.42, "step": 49965, "train_speed(iter/s)": 0.200952 }, { "acc": 0.7552474, "epoch": 1.2676306443429732, "grad_norm": 2.25, "learning_rate": 3.2393508904572663e-06, "loss": 0.9373455, "memory(GiB)": 369.42, "step": 49970, "train_speed(iter/s)": 0.200956 }, { "acc": 0.74445372, "epoch": 1.2677574835109082, "grad_norm": 1.7578125, "learning_rate": 3.2383694686277382e-06, "loss": 1.01629496, "memory(GiB)": 369.42, "step": 49975, "train_speed(iter/s)": 0.200958 }, { "acc": 0.7393961, "epoch": 1.2678843226788432, "grad_norm": 2.03125, "learning_rate": 3.2373881242825857e-06, "loss": 0.9919364, "memory(GiB)": 369.42, "step": 49980, "train_speed(iter/s)": 0.200961 }, { "acc": 0.75262775, "epoch": 1.2680111618467782, "grad_norm": 1.96875, "learning_rate": 3.236406857464973e-06, "loss": 0.97387962, "memory(GiB)": 369.42, "step": 49985, "train_speed(iter/s)": 0.200963 }, { "acc": 0.74983139, "epoch": 1.2681380010147134, "grad_norm": 1.9609375, "learning_rate": 3.235425668218063e-06, "loss": 0.98015032, "memory(GiB)": 369.42, "step": 49990, "train_speed(iter/s)": 0.200966 }, { "acc": 0.74555674, "epoch": 1.2682648401826484, "grad_norm": 1.9765625, "learning_rate": 3.234444556585007e-06, "loss": 0.99991703, "memory(GiB)": 369.42, "step": 49995, "train_speed(iter/s)": 0.20097 }, { "acc": 0.74320431, "epoch": 1.2683916793505834, "grad_norm": 2.046875, "learning_rate": 3.233463522608964e-06, "loss": 1.00511036, "memory(GiB)": 369.42, "step": 50000, "train_speed(iter/s)": 0.200973 }, { "epoch": 1.2683916793505834, "eval_acc": 0.737939162217793, "eval_loss": 0.969763457775116, "eval_runtime": 384.535, "eval_samples_per_second": 16.565, "eval_steps_per_second": 8.283, "step": 50000 }, { "acc": 0.76524005, "epoch": 1.2685185185185186, "grad_norm": 2.234375, "learning_rate": 3.2324825663330818e-06, "loss": 0.9864933, "memory(GiB)": 369.42, "step": 50005, "train_speed(iter/s)": 0.2004 }, { "acc": 0.75825176, "epoch": 1.2686453576864536, "grad_norm": 2.421875, "learning_rate": 3.231501687800509e-06, "loss": 0.91118717, "memory(GiB)": 369.42, "step": 50010, "train_speed(iter/s)": 0.200403 }, { "acc": 0.74916258, "epoch": 1.2687721968543886, "grad_norm": 2.140625, "learning_rate": 3.2305208870543857e-06, "loss": 0.96948729, "memory(GiB)": 369.42, "step": 50015, "train_speed(iter/s)": 0.200406 }, { "acc": 0.76028595, "epoch": 1.2688990360223236, "grad_norm": 2.421875, "learning_rate": 3.2295401641378544e-06, "loss": 0.97287045, "memory(GiB)": 369.42, "step": 50020, "train_speed(iter/s)": 0.200409 }, { "acc": 0.75766487, "epoch": 1.2690258751902588, "grad_norm": 2.34375, "learning_rate": 3.2285595190940513e-06, "loss": 0.96997042, "memory(GiB)": 369.42, "step": 50025, "train_speed(iter/s)": 0.200413 }, { "acc": 0.74697747, "epoch": 1.2691527143581938, "grad_norm": 2.328125, "learning_rate": 3.2275789519661103e-06, "loss": 1.05403814, "memory(GiB)": 369.42, "step": 50030, "train_speed(iter/s)": 0.200417 }, { "acc": 0.75350304, "epoch": 1.269279553526129, "grad_norm": 2.171875, "learning_rate": 3.2265984627971595e-06, "loss": 1.01135426, "memory(GiB)": 369.42, "step": 50035, "train_speed(iter/s)": 0.200419 }, { "acc": 0.74838891, "epoch": 1.269406392694064, "grad_norm": 2.15625, "learning_rate": 3.225618051630326e-06, "loss": 1.03092556, "memory(GiB)": 369.42, "step": 50040, "train_speed(iter/s)": 0.200423 }, { "acc": 0.76322403, "epoch": 1.269533231861999, "grad_norm": 1.9140625, "learning_rate": 3.2246377185087325e-06, "loss": 1.00010414, "memory(GiB)": 369.42, "step": 50045, "train_speed(iter/s)": 0.200426 }, { "acc": 0.74298506, "epoch": 1.269660071029934, "grad_norm": 2.5625, "learning_rate": 3.2236574634755003e-06, "loss": 1.01130447, "memory(GiB)": 369.42, "step": 50050, "train_speed(iter/s)": 0.200429 }, { "acc": 0.7317162, "epoch": 1.2697869101978692, "grad_norm": 1.9609375, "learning_rate": 3.222677286573742e-06, "loss": 1.07334671, "memory(GiB)": 369.42, "step": 50055, "train_speed(iter/s)": 0.200432 }, { "acc": 0.74924102, "epoch": 1.2699137493658041, "grad_norm": 2.15625, "learning_rate": 3.221697187846571e-06, "loss": 1.01150742, "memory(GiB)": 369.42, "step": 50060, "train_speed(iter/s)": 0.200435 }, { "acc": 0.76013103, "epoch": 1.2700405885337394, "grad_norm": 2.46875, "learning_rate": 3.2207171673370984e-06, "loss": 1.01061306, "memory(GiB)": 369.42, "step": 50065, "train_speed(iter/s)": 0.200437 }, { "acc": 0.75010929, "epoch": 1.2701674277016743, "grad_norm": 2.140625, "learning_rate": 3.2197372250884295e-06, "loss": 0.99376755, "memory(GiB)": 369.42, "step": 50070, "train_speed(iter/s)": 0.20044 }, { "acc": 0.75010099, "epoch": 1.2702942668696093, "grad_norm": 2.15625, "learning_rate": 3.218757361143664e-06, "loss": 0.99371386, "memory(GiB)": 369.42, "step": 50075, "train_speed(iter/s)": 0.200441 }, { "acc": 0.75766993, "epoch": 1.2704211060375443, "grad_norm": 2.296875, "learning_rate": 3.2177775755459034e-06, "loss": 0.99572573, "memory(GiB)": 369.42, "step": 50080, "train_speed(iter/s)": 0.200444 }, { "acc": 0.75095921, "epoch": 1.2705479452054795, "grad_norm": 1.984375, "learning_rate": 3.216797868338241e-06, "loss": 0.9747694, "memory(GiB)": 369.42, "step": 50085, "train_speed(iter/s)": 0.200447 }, { "acc": 0.75511789, "epoch": 1.2706747843734145, "grad_norm": 1.953125, "learning_rate": 3.215818239563773e-06, "loss": 0.98059063, "memory(GiB)": 369.42, "step": 50090, "train_speed(iter/s)": 0.20045 }, { "acc": 0.75835838, "epoch": 1.2708016235413495, "grad_norm": 2.0625, "learning_rate": 3.2148386892655814e-06, "loss": 0.92290287, "memory(GiB)": 369.42, "step": 50095, "train_speed(iter/s)": 0.200454 }, { "acc": 0.74516764, "epoch": 1.2709284627092847, "grad_norm": 2.265625, "learning_rate": 3.2138592174867556e-06, "loss": 1.03346615, "memory(GiB)": 369.42, "step": 50100, "train_speed(iter/s)": 0.200456 }, { "acc": 0.76409445, "epoch": 1.2710553018772197, "grad_norm": 2.453125, "learning_rate": 3.2128798242703745e-06, "loss": 0.96542768, "memory(GiB)": 369.42, "step": 50105, "train_speed(iter/s)": 0.200458 }, { "acc": 0.75199175, "epoch": 1.2711821410451547, "grad_norm": 2.046875, "learning_rate": 3.2119005096595203e-06, "loss": 0.97439632, "memory(GiB)": 369.42, "step": 50110, "train_speed(iter/s)": 0.200461 }, { "acc": 0.75961599, "epoch": 1.2713089802130897, "grad_norm": 1.9296875, "learning_rate": 3.2109212736972636e-06, "loss": 0.9759203, "memory(GiB)": 369.42, "step": 50115, "train_speed(iter/s)": 0.200464 }, { "acc": 0.75477161, "epoch": 1.271435819381025, "grad_norm": 2.078125, "learning_rate": 3.2099421164266758e-06, "loss": 0.92164879, "memory(GiB)": 369.42, "step": 50120, "train_speed(iter/s)": 0.200467 }, { "acc": 0.75010595, "epoch": 1.27156265854896, "grad_norm": 2.875, "learning_rate": 3.2089630378908264e-06, "loss": 0.98874054, "memory(GiB)": 369.42, "step": 50125, "train_speed(iter/s)": 0.20047 }, { "acc": 0.75868864, "epoch": 1.271689497716895, "grad_norm": 1.796875, "learning_rate": 3.207984038132781e-06, "loss": 0.95205879, "memory(GiB)": 369.42, "step": 50130, "train_speed(iter/s)": 0.200474 }, { "acc": 0.74822769, "epoch": 1.27181633688483, "grad_norm": 2.0625, "learning_rate": 3.2070051171955966e-06, "loss": 1.02165546, "memory(GiB)": 369.42, "step": 50135, "train_speed(iter/s)": 0.200477 }, { "acc": 0.76601229, "epoch": 1.271943176052765, "grad_norm": 2.015625, "learning_rate": 3.206026275122332e-06, "loss": 0.89438848, "memory(GiB)": 369.42, "step": 50140, "train_speed(iter/s)": 0.20048 }, { "acc": 0.77146397, "epoch": 1.2720700152207, "grad_norm": 2.265625, "learning_rate": 3.205047511956042e-06, "loss": 0.94494553, "memory(GiB)": 369.42, "step": 50145, "train_speed(iter/s)": 0.200484 }, { "acc": 0.74894447, "epoch": 1.2721968543886353, "grad_norm": 2.25, "learning_rate": 3.204068827739777e-06, "loss": 1.03878012, "memory(GiB)": 369.42, "step": 50150, "train_speed(iter/s)": 0.200488 }, { "acc": 0.74877548, "epoch": 1.2723236935565703, "grad_norm": 2.171875, "learning_rate": 3.2030902225165814e-06, "loss": 0.99809246, "memory(GiB)": 369.42, "step": 50155, "train_speed(iter/s)": 0.200491 }, { "acc": 0.7647788, "epoch": 1.2724505327245053, "grad_norm": 1.71875, "learning_rate": 3.2021116963295016e-06, "loss": 0.96903477, "memory(GiB)": 369.42, "step": 50160, "train_speed(iter/s)": 0.200493 }, { "acc": 0.7506989, "epoch": 1.2725773718924405, "grad_norm": 1.9375, "learning_rate": 3.2011332492215753e-06, "loss": 0.98986797, "memory(GiB)": 369.42, "step": 50165, "train_speed(iter/s)": 0.200497 }, { "acc": 0.75302653, "epoch": 1.2727042110603755, "grad_norm": 2.3125, "learning_rate": 3.200154881235842e-06, "loss": 0.98097496, "memory(GiB)": 369.42, "step": 50170, "train_speed(iter/s)": 0.200501 }, { "acc": 0.75301237, "epoch": 1.2728310502283104, "grad_norm": 2.203125, "learning_rate": 3.1991765924153316e-06, "loss": 0.98902073, "memory(GiB)": 369.42, "step": 50175, "train_speed(iter/s)": 0.200504 }, { "acc": 0.74591875, "epoch": 1.2729578893962454, "grad_norm": 2.03125, "learning_rate": 3.198198382803075e-06, "loss": 0.97281208, "memory(GiB)": 369.42, "step": 50180, "train_speed(iter/s)": 0.200507 }, { "acc": 0.73789692, "epoch": 1.2730847285641806, "grad_norm": 2.34375, "learning_rate": 3.197220252442097e-06, "loss": 0.99433365, "memory(GiB)": 369.42, "step": 50185, "train_speed(iter/s)": 0.20051 }, { "acc": 0.74495149, "epoch": 1.2732115677321156, "grad_norm": 2.0625, "learning_rate": 3.1962422013754237e-06, "loss": 0.98037243, "memory(GiB)": 369.42, "step": 50190, "train_speed(iter/s)": 0.200514 }, { "acc": 0.76512785, "epoch": 1.2733384069000508, "grad_norm": 2.09375, "learning_rate": 3.1952642296460696e-06, "loss": 0.94022999, "memory(GiB)": 369.42, "step": 50195, "train_speed(iter/s)": 0.200516 }, { "acc": 0.73605537, "epoch": 1.2734652460679858, "grad_norm": 2.1875, "learning_rate": 3.194286337297051e-06, "loss": 1.04850683, "memory(GiB)": 369.42, "step": 50200, "train_speed(iter/s)": 0.200519 }, { "acc": 0.74924259, "epoch": 1.2735920852359208, "grad_norm": 3.0, "learning_rate": 3.1933085243713837e-06, "loss": 0.94562855, "memory(GiB)": 369.42, "step": 50205, "train_speed(iter/s)": 0.200523 }, { "acc": 0.76273775, "epoch": 1.2737189244038558, "grad_norm": 2.171875, "learning_rate": 3.1923307909120736e-06, "loss": 1.01352329, "memory(GiB)": 369.42, "step": 50210, "train_speed(iter/s)": 0.200525 }, { "acc": 0.75508184, "epoch": 1.273845763571791, "grad_norm": 2.046875, "learning_rate": 3.1913531369621243e-06, "loss": 0.95408211, "memory(GiB)": 369.42, "step": 50215, "train_speed(iter/s)": 0.200528 }, { "acc": 0.74880571, "epoch": 1.273972602739726, "grad_norm": 2.171875, "learning_rate": 3.19037556256454e-06, "loss": 0.97939777, "memory(GiB)": 369.42, "step": 50220, "train_speed(iter/s)": 0.200532 }, { "acc": 0.74915504, "epoch": 1.2740994419076612, "grad_norm": 2.421875, "learning_rate": 3.189398067762318e-06, "loss": 1.0067049, "memory(GiB)": 369.42, "step": 50225, "train_speed(iter/s)": 0.200536 }, { "acc": 0.7508359, "epoch": 1.2742262810755962, "grad_norm": 2.421875, "learning_rate": 3.1884206525984535e-06, "loss": 1.01426353, "memory(GiB)": 369.42, "step": 50230, "train_speed(iter/s)": 0.200538 }, { "acc": 0.74703164, "epoch": 1.2743531202435312, "grad_norm": 2.078125, "learning_rate": 3.1874433171159348e-06, "loss": 1.02530403, "memory(GiB)": 369.42, "step": 50235, "train_speed(iter/s)": 0.200543 }, { "acc": 0.75179038, "epoch": 1.2744799594114662, "grad_norm": 1.9140625, "learning_rate": 3.1864660613577523e-06, "loss": 0.992449, "memory(GiB)": 369.42, "step": 50240, "train_speed(iter/s)": 0.200545 }, { "acc": 0.75756636, "epoch": 1.2746067985794014, "grad_norm": 2.328125, "learning_rate": 3.185488885366889e-06, "loss": 0.92900448, "memory(GiB)": 369.42, "step": 50245, "train_speed(iter/s)": 0.200546 }, { "acc": 0.75225639, "epoch": 1.2747336377473364, "grad_norm": 1.8359375, "learning_rate": 3.1845117891863274e-06, "loss": 0.97285633, "memory(GiB)": 369.42, "step": 50250, "train_speed(iter/s)": 0.200548 }, { "acc": 0.74858513, "epoch": 1.2748604769152714, "grad_norm": 2.109375, "learning_rate": 3.1835347728590414e-06, "loss": 0.98921814, "memory(GiB)": 369.42, "step": 50255, "train_speed(iter/s)": 0.200548 }, { "acc": 0.74733658, "epoch": 1.2749873160832066, "grad_norm": 2.265625, "learning_rate": 3.1825578364280064e-06, "loss": 0.9687191, "memory(GiB)": 369.42, "step": 50260, "train_speed(iter/s)": 0.200553 }, { "acc": 0.75763183, "epoch": 1.2751141552511416, "grad_norm": 2.109375, "learning_rate": 3.181580979936192e-06, "loss": 0.98429565, "memory(GiB)": 369.42, "step": 50265, "train_speed(iter/s)": 0.200556 }, { "acc": 0.75153303, "epoch": 1.2752409944190766, "grad_norm": 1.984375, "learning_rate": 3.1806042034265656e-06, "loss": 1.00250072, "memory(GiB)": 369.42, "step": 50270, "train_speed(iter/s)": 0.20056 }, { "acc": 0.74570565, "epoch": 1.2753678335870116, "grad_norm": 2.171875, "learning_rate": 3.179627506942089e-06, "loss": 0.99816637, "memory(GiB)": 369.42, "step": 50275, "train_speed(iter/s)": 0.200563 }, { "acc": 0.75678663, "epoch": 1.2754946727549468, "grad_norm": 2.0625, "learning_rate": 3.17865089052572e-06, "loss": 1.01572132, "memory(GiB)": 369.42, "step": 50280, "train_speed(iter/s)": 0.200566 }, { "acc": 0.75898609, "epoch": 1.2756215119228818, "grad_norm": 2.0, "learning_rate": 3.177674354220418e-06, "loss": 0.97559414, "memory(GiB)": 369.42, "step": 50285, "train_speed(iter/s)": 0.200569 }, { "acc": 0.75662169, "epoch": 1.275748351090817, "grad_norm": 2.203125, "learning_rate": 3.1766978980691355e-06, "loss": 0.96732416, "memory(GiB)": 369.42, "step": 50290, "train_speed(iter/s)": 0.200573 }, { "acc": 0.76113601, "epoch": 1.275875190258752, "grad_norm": 1.96875, "learning_rate": 3.1757215221148173e-06, "loss": 1.00124016, "memory(GiB)": 369.42, "step": 50295, "train_speed(iter/s)": 0.200576 }, { "acc": 0.75064659, "epoch": 1.276002029426687, "grad_norm": 1.953125, "learning_rate": 3.174745226400413e-06, "loss": 0.98711052, "memory(GiB)": 369.42, "step": 50300, "train_speed(iter/s)": 0.20058 }, { "acc": 0.75562248, "epoch": 1.276128868594622, "grad_norm": 2.21875, "learning_rate": 3.1737690109688613e-06, "loss": 0.96368666, "memory(GiB)": 369.42, "step": 50305, "train_speed(iter/s)": 0.20058 }, { "acc": 0.75536394, "epoch": 1.2762557077625571, "grad_norm": 2.171875, "learning_rate": 3.1727928758631054e-06, "loss": 0.96464577, "memory(GiB)": 369.42, "step": 50310, "train_speed(iter/s)": 0.200583 }, { "acc": 0.74828329, "epoch": 1.2763825469304921, "grad_norm": 2.125, "learning_rate": 3.1718168211260734e-06, "loss": 1.03033609, "memory(GiB)": 369.42, "step": 50315, "train_speed(iter/s)": 0.200585 }, { "acc": 0.74125562, "epoch": 1.2765093860984271, "grad_norm": 2.734375, "learning_rate": 3.1708408468007014e-06, "loss": 1.03555088, "memory(GiB)": 369.42, "step": 50320, "train_speed(iter/s)": 0.200588 }, { "acc": 0.74789362, "epoch": 1.2766362252663623, "grad_norm": 2.0, "learning_rate": 3.169864952929914e-06, "loss": 1.00461111, "memory(GiB)": 369.42, "step": 50325, "train_speed(iter/s)": 0.200591 }, { "acc": 0.75568094, "epoch": 1.2767630644342973, "grad_norm": 1.859375, "learning_rate": 3.168889139556639e-06, "loss": 0.96904678, "memory(GiB)": 369.42, "step": 50330, "train_speed(iter/s)": 0.200595 }, { "acc": 0.75637636, "epoch": 1.2768899036022323, "grad_norm": 1.828125, "learning_rate": 3.1679134067237942e-06, "loss": 0.97862053, "memory(GiB)": 369.42, "step": 50335, "train_speed(iter/s)": 0.200598 }, { "acc": 0.75925608, "epoch": 1.2770167427701673, "grad_norm": 2.203125, "learning_rate": 3.166937754474296e-06, "loss": 0.96391335, "memory(GiB)": 369.42, "step": 50340, "train_speed(iter/s)": 0.200601 }, { "acc": 0.75938787, "epoch": 1.2771435819381025, "grad_norm": 2.125, "learning_rate": 3.165962182851061e-06, "loss": 0.96027298, "memory(GiB)": 369.42, "step": 50345, "train_speed(iter/s)": 0.200604 }, { "acc": 0.75986257, "epoch": 1.2772704211060375, "grad_norm": 2.265625, "learning_rate": 3.1649866918969984e-06, "loss": 0.95040436, "memory(GiB)": 369.42, "step": 50350, "train_speed(iter/s)": 0.200607 }, { "acc": 0.74964447, "epoch": 1.2773972602739727, "grad_norm": 2.046875, "learning_rate": 3.164011281655013e-06, "loss": 0.98566504, "memory(GiB)": 369.42, "step": 50355, "train_speed(iter/s)": 0.200608 }, { "acc": 0.74547606, "epoch": 1.2775240994419077, "grad_norm": 2.140625, "learning_rate": 3.163035952168007e-06, "loss": 1.03150444, "memory(GiB)": 369.42, "step": 50360, "train_speed(iter/s)": 0.20061 }, { "acc": 0.75121136, "epoch": 1.2776509386098427, "grad_norm": 2.984375, "learning_rate": 3.1620607034788832e-06, "loss": 1.06820812, "memory(GiB)": 369.42, "step": 50365, "train_speed(iter/s)": 0.200612 }, { "acc": 0.74632678, "epoch": 1.2777777777777777, "grad_norm": 1.9921875, "learning_rate": 3.1610855356305354e-06, "loss": 1.0307663, "memory(GiB)": 369.42, "step": 50370, "train_speed(iter/s)": 0.200615 }, { "acc": 0.75354395, "epoch": 1.277904616945713, "grad_norm": 2.234375, "learning_rate": 3.160110448665854e-06, "loss": 0.99687767, "memory(GiB)": 369.42, "step": 50375, "train_speed(iter/s)": 0.200617 }, { "acc": 0.7553771, "epoch": 1.2780314561136479, "grad_norm": 1.828125, "learning_rate": 3.159135442627731e-06, "loss": 0.93736286, "memory(GiB)": 369.42, "step": 50380, "train_speed(iter/s)": 0.200617 }, { "acc": 0.76095667, "epoch": 1.278158295281583, "grad_norm": 1.9921875, "learning_rate": 3.158160517559049e-06, "loss": 1.00224257, "memory(GiB)": 369.42, "step": 50385, "train_speed(iter/s)": 0.200618 }, { "acc": 0.74564962, "epoch": 1.278285134449518, "grad_norm": 2.078125, "learning_rate": 3.157185673502693e-06, "loss": 1.02673779, "memory(GiB)": 369.42, "step": 50390, "train_speed(iter/s)": 0.200617 }, { "acc": 0.74298878, "epoch": 1.278411973617453, "grad_norm": 2.59375, "learning_rate": 3.156210910501537e-06, "loss": 0.98682442, "memory(GiB)": 369.42, "step": 50395, "train_speed(iter/s)": 0.20062 }, { "acc": 0.75154686, "epoch": 1.278538812785388, "grad_norm": 1.859375, "learning_rate": 3.155236228598457e-06, "loss": 1.00091934, "memory(GiB)": 369.42, "step": 50400, "train_speed(iter/s)": 0.200624 }, { "acc": 0.74141693, "epoch": 1.2786656519533233, "grad_norm": 2.4375, "learning_rate": 3.1542616278363238e-06, "loss": 1.00685329, "memory(GiB)": 369.42, "step": 50405, "train_speed(iter/s)": 0.200622 }, { "acc": 0.74425168, "epoch": 1.2787924911212583, "grad_norm": 2.28125, "learning_rate": 3.1532871082580064e-06, "loss": 1.03241596, "memory(GiB)": 369.42, "step": 50410, "train_speed(iter/s)": 0.200624 }, { "acc": 0.75794725, "epoch": 1.2789193302891932, "grad_norm": 2.171875, "learning_rate": 3.152312669906366e-06, "loss": 0.97660599, "memory(GiB)": 369.42, "step": 50415, "train_speed(iter/s)": 0.200628 }, { "acc": 0.76401405, "epoch": 1.2790461694571285, "grad_norm": 2.109375, "learning_rate": 3.1513383128242624e-06, "loss": 0.93645439, "memory(GiB)": 369.42, "step": 50420, "train_speed(iter/s)": 0.200631 }, { "acc": 0.75220189, "epoch": 1.2791730086250634, "grad_norm": 2.609375, "learning_rate": 3.150364037054555e-06, "loss": 1.03180943, "memory(GiB)": 369.42, "step": 50425, "train_speed(iter/s)": 0.200632 }, { "acc": 0.74650164, "epoch": 1.2792998477929984, "grad_norm": 2.171875, "learning_rate": 3.149389842640096e-06, "loss": 0.96956921, "memory(GiB)": 369.42, "step": 50430, "train_speed(iter/s)": 0.200635 }, { "acc": 0.76039534, "epoch": 1.2794266869609334, "grad_norm": 2.625, "learning_rate": 3.148415729623732e-06, "loss": 0.96370449, "memory(GiB)": 369.42, "step": 50435, "train_speed(iter/s)": 0.200638 }, { "acc": 0.7582768, "epoch": 1.2795535261288686, "grad_norm": 1.7421875, "learning_rate": 3.1474416980483126e-06, "loss": 1.05134277, "memory(GiB)": 369.42, "step": 50440, "train_speed(iter/s)": 0.200642 }, { "acc": 0.76365576, "epoch": 1.2796803652968036, "grad_norm": 2.40625, "learning_rate": 3.1464677479566774e-06, "loss": 0.98630695, "memory(GiB)": 369.42, "step": 50445, "train_speed(iter/s)": 0.200644 }, { "acc": 0.74931755, "epoch": 1.2798072044647388, "grad_norm": 2.21875, "learning_rate": 3.1454938793916677e-06, "loss": 1.03310223, "memory(GiB)": 369.42, "step": 50450, "train_speed(iter/s)": 0.200648 }, { "acc": 0.74960437, "epoch": 1.2799340436326738, "grad_norm": 2.125, "learning_rate": 3.144520092396115e-06, "loss": 0.98552341, "memory(GiB)": 369.42, "step": 50455, "train_speed(iter/s)": 0.200652 }, { "acc": 0.74990625, "epoch": 1.2800608828006088, "grad_norm": 1.9140625, "learning_rate": 3.1435463870128536e-06, "loss": 0.96935749, "memory(GiB)": 369.42, "step": 50460, "train_speed(iter/s)": 0.200654 }, { "acc": 0.77096767, "epoch": 1.2801877219685438, "grad_norm": 2.1875, "learning_rate": 3.142572763284709e-06, "loss": 0.9243042, "memory(GiB)": 369.42, "step": 50465, "train_speed(iter/s)": 0.200658 }, { "acc": 0.74831839, "epoch": 1.280314561136479, "grad_norm": 2.21875, "learning_rate": 3.14159922125451e-06, "loss": 0.985993, "memory(GiB)": 369.42, "step": 50470, "train_speed(iter/s)": 0.200661 }, { "acc": 0.74852304, "epoch": 1.280441400304414, "grad_norm": 1.59375, "learning_rate": 3.1406257609650724e-06, "loss": 1.00341053, "memory(GiB)": 369.42, "step": 50475, "train_speed(iter/s)": 0.200663 }, { "acc": 0.75296683, "epoch": 1.280568239472349, "grad_norm": 2.71875, "learning_rate": 3.139652382459215e-06, "loss": 1.02517033, "memory(GiB)": 369.42, "step": 50480, "train_speed(iter/s)": 0.200667 }, { "acc": 0.74471588, "epoch": 1.2806950786402842, "grad_norm": 2.265625, "learning_rate": 3.1386790857797535e-06, "loss": 1.02886448, "memory(GiB)": 369.42, "step": 50485, "train_speed(iter/s)": 0.200672 }, { "acc": 0.76851926, "epoch": 1.2808219178082192, "grad_norm": 2.546875, "learning_rate": 3.1377058709694957e-06, "loss": 0.9547718, "memory(GiB)": 369.42, "step": 50490, "train_speed(iter/s)": 0.200674 }, { "acc": 0.7470798, "epoch": 1.2809487569761542, "grad_norm": 2.25, "learning_rate": 3.1367327380712477e-06, "loss": 0.94515743, "memory(GiB)": 369.42, "step": 50495, "train_speed(iter/s)": 0.200675 }, { "acc": 0.7501018, "epoch": 1.2810755961440892, "grad_norm": 1.984375, "learning_rate": 3.135759687127812e-06, "loss": 0.9615406, "memory(GiB)": 369.42, "step": 50500, "train_speed(iter/s)": 0.200679 }, { "acc": 0.7512372, "epoch": 1.2812024353120244, "grad_norm": 1.7421875, "learning_rate": 3.134786718181989e-06, "loss": 0.98789616, "memory(GiB)": 369.42, "step": 50505, "train_speed(iter/s)": 0.200681 }, { "acc": 0.74573822, "epoch": 1.2813292744799594, "grad_norm": 2.140625, "learning_rate": 3.133813831276575e-06, "loss": 1.00424538, "memory(GiB)": 369.42, "step": 50510, "train_speed(iter/s)": 0.200684 }, { "acc": 0.7517458, "epoch": 1.2814561136478946, "grad_norm": 2.671875, "learning_rate": 3.1328410264543585e-06, "loss": 1.00575619, "memory(GiB)": 369.42, "step": 50515, "train_speed(iter/s)": 0.200686 }, { "acc": 0.75765667, "epoch": 1.2815829528158296, "grad_norm": 2.265625, "learning_rate": 3.131868303758131e-06, "loss": 0.94516611, "memory(GiB)": 369.42, "step": 50520, "train_speed(iter/s)": 0.200688 }, { "acc": 0.75529041, "epoch": 1.2817097919837646, "grad_norm": 2.40625, "learning_rate": 3.1308956632306754e-06, "loss": 0.95327034, "memory(GiB)": 369.42, "step": 50525, "train_speed(iter/s)": 0.200692 }, { "acc": 0.74460592, "epoch": 1.2818366311516995, "grad_norm": 2.171875, "learning_rate": 3.129923104914776e-06, "loss": 1.03726044, "memory(GiB)": 369.42, "step": 50530, "train_speed(iter/s)": 0.200694 }, { "acc": 0.76327724, "epoch": 1.2819634703196348, "grad_norm": 2.40625, "learning_rate": 3.1289506288532045e-06, "loss": 0.91056366, "memory(GiB)": 369.42, "step": 50535, "train_speed(iter/s)": 0.200697 }, { "acc": 0.7374382, "epoch": 1.2820903094875697, "grad_norm": 2.15625, "learning_rate": 3.12797823508874e-06, "loss": 1.02361393, "memory(GiB)": 369.42, "step": 50540, "train_speed(iter/s)": 0.200698 }, { "acc": 0.75685711, "epoch": 1.282217148655505, "grad_norm": 1.984375, "learning_rate": 3.127005923664149e-06, "loss": 0.94684677, "memory(GiB)": 369.42, "step": 50545, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75262899, "epoch": 1.28234398782344, "grad_norm": 2.03125, "learning_rate": 3.1260336946222014e-06, "loss": 0.95779018, "memory(GiB)": 369.42, "step": 50550, "train_speed(iter/s)": 0.200703 }, { "acc": 0.76125555, "epoch": 1.282470826991375, "grad_norm": 2.234375, "learning_rate": 3.1250615480056584e-06, "loss": 0.90750923, "memory(GiB)": 369.42, "step": 50555, "train_speed(iter/s)": 0.200706 }, { "acc": 0.76246028, "epoch": 1.28259766615931, "grad_norm": 1.8359375, "learning_rate": 3.124089483857278e-06, "loss": 0.93780575, "memory(GiB)": 369.42, "step": 50560, "train_speed(iter/s)": 0.200708 }, { "acc": 0.74866037, "epoch": 1.2827245053272451, "grad_norm": 2.875, "learning_rate": 3.123117502219819e-06, "loss": 0.99923992, "memory(GiB)": 369.42, "step": 50565, "train_speed(iter/s)": 0.200711 }, { "acc": 0.7539382, "epoch": 1.2828513444951801, "grad_norm": 2.328125, "learning_rate": 3.122145603136032e-06, "loss": 0.97340775, "memory(GiB)": 369.42, "step": 50570, "train_speed(iter/s)": 0.200713 }, { "acc": 0.7595252, "epoch": 1.2829781836631151, "grad_norm": 2.515625, "learning_rate": 3.1211737866486653e-06, "loss": 1.00434685, "memory(GiB)": 369.42, "step": 50575, "train_speed(iter/s)": 0.200715 }, { "acc": 0.74888582, "epoch": 1.2831050228310503, "grad_norm": 1.9296875, "learning_rate": 3.1202020528004627e-06, "loss": 0.9625988, "memory(GiB)": 369.42, "step": 50580, "train_speed(iter/s)": 0.200716 }, { "acc": 0.75234919, "epoch": 1.2832318619989853, "grad_norm": 2.046875, "learning_rate": 3.119230401634167e-06, "loss": 0.99484444, "memory(GiB)": 369.42, "step": 50585, "train_speed(iter/s)": 0.200718 }, { "acc": 0.75462408, "epoch": 1.2833587011669203, "grad_norm": 2.046875, "learning_rate": 3.118258833192517e-06, "loss": 0.94671068, "memory(GiB)": 369.42, "step": 50590, "train_speed(iter/s)": 0.200723 }, { "acc": 0.75028253, "epoch": 1.2834855403348553, "grad_norm": 2.140625, "learning_rate": 3.117287347518242e-06, "loss": 0.98245029, "memory(GiB)": 369.42, "step": 50595, "train_speed(iter/s)": 0.200726 }, { "acc": 0.7455884, "epoch": 1.2836123795027905, "grad_norm": 2.609375, "learning_rate": 3.116315944654077e-06, "loss": 1.07967758, "memory(GiB)": 369.42, "step": 50600, "train_speed(iter/s)": 0.20073 }, { "acc": 0.7476048, "epoch": 1.2837392186707255, "grad_norm": 2.203125, "learning_rate": 3.115344624642745e-06, "loss": 0.99520473, "memory(GiB)": 369.42, "step": 50605, "train_speed(iter/s)": 0.200731 }, { "acc": 0.76005049, "epoch": 1.2838660578386607, "grad_norm": 2.421875, "learning_rate": 3.1143733875269734e-06, "loss": 0.97168941, "memory(GiB)": 369.42, "step": 50610, "train_speed(iter/s)": 0.200734 }, { "acc": 0.77342424, "epoch": 1.2839928970065957, "grad_norm": 2.125, "learning_rate": 3.1134022333494774e-06, "loss": 0.91590223, "memory(GiB)": 369.42, "step": 50615, "train_speed(iter/s)": 0.200735 }, { "acc": 0.75959463, "epoch": 1.2841197361745307, "grad_norm": 2.03125, "learning_rate": 3.1124311621529744e-06, "loss": 0.9321064, "memory(GiB)": 369.42, "step": 50620, "train_speed(iter/s)": 0.200739 }, { "acc": 0.76050749, "epoch": 1.2842465753424657, "grad_norm": 2.28125, "learning_rate": 3.111460173980175e-06, "loss": 1.00490627, "memory(GiB)": 369.42, "step": 50625, "train_speed(iter/s)": 0.200742 }, { "acc": 0.75370502, "epoch": 1.2843734145104009, "grad_norm": 1.90625, "learning_rate": 3.110489268873792e-06, "loss": 0.96756544, "memory(GiB)": 369.42, "step": 50630, "train_speed(iter/s)": 0.200745 }, { "acc": 0.7604013, "epoch": 1.2845002536783359, "grad_norm": 2.453125, "learning_rate": 3.1095184468765248e-06, "loss": 1.00220585, "memory(GiB)": 369.42, "step": 50635, "train_speed(iter/s)": 0.200749 }, { "acc": 0.74790211, "epoch": 1.2846270928462709, "grad_norm": 1.796875, "learning_rate": 3.1085477080310766e-06, "loss": 1.00323467, "memory(GiB)": 369.42, "step": 50640, "train_speed(iter/s)": 0.200753 }, { "acc": 0.75428147, "epoch": 1.284753932014206, "grad_norm": 2.359375, "learning_rate": 3.1075770523801453e-06, "loss": 0.97326393, "memory(GiB)": 369.42, "step": 50645, "train_speed(iter/s)": 0.200757 }, { "acc": 0.76020069, "epoch": 1.284880771182141, "grad_norm": 1.921875, "learning_rate": 3.106606479966426e-06, "loss": 0.95127335, "memory(GiB)": 369.42, "step": 50650, "train_speed(iter/s)": 0.200759 }, { "acc": 0.77331572, "epoch": 1.285007610350076, "grad_norm": 2.296875, "learning_rate": 3.1056359908326044e-06, "loss": 0.99038715, "memory(GiB)": 369.42, "step": 50655, "train_speed(iter/s)": 0.200762 }, { "acc": 0.74059262, "epoch": 1.285134449518011, "grad_norm": 2.234375, "learning_rate": 3.1046655850213707e-06, "loss": 0.95245323, "memory(GiB)": 369.42, "step": 50660, "train_speed(iter/s)": 0.200766 }, { "acc": 0.75472889, "epoch": 1.2852612886859462, "grad_norm": 2.28125, "learning_rate": 3.103695262575407e-06, "loss": 0.99088154, "memory(GiB)": 369.42, "step": 50665, "train_speed(iter/s)": 0.200769 }, { "acc": 0.74856014, "epoch": 1.2853881278538812, "grad_norm": 2.0625, "learning_rate": 3.102725023537393e-06, "loss": 0.99648437, "memory(GiB)": 369.42, "step": 50670, "train_speed(iter/s)": 0.200774 }, { "acc": 0.75837698, "epoch": 1.2855149670218164, "grad_norm": 1.984375, "learning_rate": 3.1017548679500008e-06, "loss": 0.90350304, "memory(GiB)": 369.42, "step": 50675, "train_speed(iter/s)": 0.200777 }, { "acc": 0.74566832, "epoch": 1.2856418061897514, "grad_norm": 2.8125, "learning_rate": 3.1007847958559057e-06, "loss": 0.97725487, "memory(GiB)": 369.42, "step": 50680, "train_speed(iter/s)": 0.200779 }, { "acc": 0.74701772, "epoch": 1.2857686453576864, "grad_norm": 2.296875, "learning_rate": 3.099814807297774e-06, "loss": 1.01540766, "memory(GiB)": 369.42, "step": 50685, "train_speed(iter/s)": 0.200783 }, { "acc": 0.76399136, "epoch": 1.2858954845256214, "grad_norm": 2.03125, "learning_rate": 3.098844902318272e-06, "loss": 0.95111675, "memory(GiB)": 369.42, "step": 50690, "train_speed(iter/s)": 0.200784 }, { "acc": 0.75888944, "epoch": 1.2860223236935566, "grad_norm": 2.15625, "learning_rate": 3.0978750809600596e-06, "loss": 1.00832291, "memory(GiB)": 369.42, "step": 50695, "train_speed(iter/s)": 0.200786 }, { "acc": 0.76209331, "epoch": 1.2861491628614916, "grad_norm": 2.390625, "learning_rate": 3.0969053432657913e-06, "loss": 0.95187473, "memory(GiB)": 369.42, "step": 50700, "train_speed(iter/s)": 0.200787 }, { "acc": 0.7563756, "epoch": 1.2862760020294268, "grad_norm": 2.140625, "learning_rate": 3.0959356892781246e-06, "loss": 0.94994392, "memory(GiB)": 369.42, "step": 50705, "train_speed(iter/s)": 0.200791 }, { "acc": 0.7563849, "epoch": 1.2864028411973618, "grad_norm": 2.46875, "learning_rate": 3.0949661190397072e-06, "loss": 0.98407793, "memory(GiB)": 369.42, "step": 50710, "train_speed(iter/s)": 0.200794 }, { "acc": 0.74941821, "epoch": 1.2865296803652968, "grad_norm": 2.421875, "learning_rate": 3.0939966325931852e-06, "loss": 1.04769154, "memory(GiB)": 369.42, "step": 50715, "train_speed(iter/s)": 0.200798 }, { "acc": 0.73634462, "epoch": 1.2866565195332318, "grad_norm": 2.046875, "learning_rate": 3.0930272299812e-06, "loss": 1.03678932, "memory(GiB)": 369.42, "step": 50720, "train_speed(iter/s)": 0.200802 }, { "acc": 0.74834895, "epoch": 1.286783358701167, "grad_norm": 2.03125, "learning_rate": 3.0920579112463916e-06, "loss": 0.95029621, "memory(GiB)": 369.42, "step": 50725, "train_speed(iter/s)": 0.200806 }, { "acc": 0.74884887, "epoch": 1.286910197869102, "grad_norm": 2.109375, "learning_rate": 3.0910886764313964e-06, "loss": 1.04052372, "memory(GiB)": 369.42, "step": 50730, "train_speed(iter/s)": 0.20081 }, { "acc": 0.76138754, "epoch": 1.287037037037037, "grad_norm": 2.09375, "learning_rate": 3.0901195255788406e-06, "loss": 0.94290085, "memory(GiB)": 369.42, "step": 50735, "train_speed(iter/s)": 0.200813 }, { "acc": 0.76767063, "epoch": 1.2871638762049722, "grad_norm": 2.15625, "learning_rate": 3.089150458731357e-06, "loss": 0.90719481, "memory(GiB)": 369.42, "step": 50740, "train_speed(iter/s)": 0.200815 }, { "acc": 0.76819496, "epoch": 1.2872907153729072, "grad_norm": 2.15625, "learning_rate": 3.0881814759315666e-06, "loss": 0.96261473, "memory(GiB)": 369.42, "step": 50745, "train_speed(iter/s)": 0.200818 }, { "acc": 0.7482626, "epoch": 1.2874175545408422, "grad_norm": 2.125, "learning_rate": 3.0872125772220934e-06, "loss": 1.00934353, "memory(GiB)": 369.42, "step": 50750, "train_speed(iter/s)": 0.200819 }, { "acc": 0.75181479, "epoch": 1.2875443937087772, "grad_norm": 2.40625, "learning_rate": 3.0862437626455483e-06, "loss": 1.02914591, "memory(GiB)": 369.42, "step": 50755, "train_speed(iter/s)": 0.200821 }, { "acc": 0.76254244, "epoch": 1.2876712328767124, "grad_norm": 2.21875, "learning_rate": 3.0852750322445473e-06, "loss": 0.90853195, "memory(GiB)": 369.42, "step": 50760, "train_speed(iter/s)": 0.200825 }, { "acc": 0.75430307, "epoch": 1.2877980720446474, "grad_norm": 2.203125, "learning_rate": 3.0843063860616982e-06, "loss": 0.99039927, "memory(GiB)": 369.42, "step": 50765, "train_speed(iter/s)": 0.200824 }, { "acc": 0.74903793, "epoch": 1.2879249112125826, "grad_norm": 2.171875, "learning_rate": 3.0833378241396094e-06, "loss": 1.00737829, "memory(GiB)": 369.42, "step": 50770, "train_speed(iter/s)": 0.200828 }, { "acc": 0.74087276, "epoch": 1.2880517503805176, "grad_norm": 1.9453125, "learning_rate": 3.0823693465208794e-06, "loss": 0.99563446, "memory(GiB)": 369.42, "step": 50775, "train_speed(iter/s)": 0.200831 }, { "acc": 0.75097914, "epoch": 1.2881785895484525, "grad_norm": 2.390625, "learning_rate": 3.081400953248106e-06, "loss": 0.97268066, "memory(GiB)": 369.42, "step": 50780, "train_speed(iter/s)": 0.200834 }, { "acc": 0.75023127, "epoch": 1.2883054287163875, "grad_norm": 2.890625, "learning_rate": 3.0804326443638854e-06, "loss": 0.98890657, "memory(GiB)": 369.42, "step": 50785, "train_speed(iter/s)": 0.200837 }, { "acc": 0.74289675, "epoch": 1.2884322678843227, "grad_norm": 2.25, "learning_rate": 3.0794644199108087e-06, "loss": 1.03175249, "memory(GiB)": 369.42, "step": 50790, "train_speed(iter/s)": 0.200841 }, { "acc": 0.74704266, "epoch": 1.2885591070522577, "grad_norm": 2.1875, "learning_rate": 3.07849627993146e-06, "loss": 0.9964653, "memory(GiB)": 369.42, "step": 50795, "train_speed(iter/s)": 0.200844 }, { "acc": 0.75368538, "epoch": 1.2886859462201927, "grad_norm": 1.8984375, "learning_rate": 3.0775282244684233e-06, "loss": 0.94335651, "memory(GiB)": 369.42, "step": 50800, "train_speed(iter/s)": 0.200847 }, { "acc": 0.74612923, "epoch": 1.288812785388128, "grad_norm": 1.953125, "learning_rate": 3.076560253564279e-06, "loss": 1.01066284, "memory(GiB)": 369.42, "step": 50805, "train_speed(iter/s)": 0.200851 }, { "acc": 0.7557724, "epoch": 1.288939624556063, "grad_norm": 2.65625, "learning_rate": 3.0755923672616038e-06, "loss": 1.00719957, "memory(GiB)": 369.42, "step": 50810, "train_speed(iter/s)": 0.200853 }, { "acc": 0.74253516, "epoch": 1.289066463723998, "grad_norm": 2.21875, "learning_rate": 3.074624565602966e-06, "loss": 1.03787251, "memory(GiB)": 369.42, "step": 50815, "train_speed(iter/s)": 0.200855 }, { "acc": 0.74865746, "epoch": 1.289193302891933, "grad_norm": 2.296875, "learning_rate": 3.073656848630937e-06, "loss": 0.98346405, "memory(GiB)": 369.42, "step": 50820, "train_speed(iter/s)": 0.200857 }, { "acc": 0.74391546, "epoch": 1.2893201420598681, "grad_norm": 1.765625, "learning_rate": 3.0726892163880784e-06, "loss": 0.98959503, "memory(GiB)": 369.42, "step": 50825, "train_speed(iter/s)": 0.200857 }, { "acc": 0.73953142, "epoch": 1.289446981227803, "grad_norm": 1.9453125, "learning_rate": 3.071721668916956e-06, "loss": 0.99536934, "memory(GiB)": 369.42, "step": 50830, "train_speed(iter/s)": 0.200859 }, { "acc": 0.76366901, "epoch": 1.2895738203957383, "grad_norm": 2.296875, "learning_rate": 3.0707542062601225e-06, "loss": 0.91231127, "memory(GiB)": 369.42, "step": 50835, "train_speed(iter/s)": 0.200861 }, { "acc": 0.74247913, "epoch": 1.2897006595636733, "grad_norm": 2.03125, "learning_rate": 3.0697868284601323e-06, "loss": 0.99355907, "memory(GiB)": 369.42, "step": 50840, "train_speed(iter/s)": 0.200863 }, { "acc": 0.7657527, "epoch": 1.2898274987316083, "grad_norm": 2.34375, "learning_rate": 3.068819535559534e-06, "loss": 0.93209324, "memory(GiB)": 369.42, "step": 50845, "train_speed(iter/s)": 0.200865 }, { "acc": 0.75896101, "epoch": 1.2899543378995433, "grad_norm": 2.1875, "learning_rate": 3.0678523276008774e-06, "loss": 0.9139122, "memory(GiB)": 369.42, "step": 50850, "train_speed(iter/s)": 0.200868 }, { "acc": 0.76408963, "epoch": 1.2900811770674785, "grad_norm": 2.203125, "learning_rate": 3.0668852046267e-06, "loss": 0.92283821, "memory(GiB)": 369.42, "step": 50855, "train_speed(iter/s)": 0.200872 }, { "acc": 0.75267839, "epoch": 1.2902080162354135, "grad_norm": 2.25, "learning_rate": 3.0659181666795413e-06, "loss": 0.98309851, "memory(GiB)": 369.42, "step": 50860, "train_speed(iter/s)": 0.200875 }, { "acc": 0.74258261, "epoch": 1.2903348554033487, "grad_norm": 2.203125, "learning_rate": 3.0649512138019376e-06, "loss": 1.00481119, "memory(GiB)": 369.42, "step": 50865, "train_speed(iter/s)": 0.200878 }, { "acc": 0.76141434, "epoch": 1.2904616945712837, "grad_norm": 2.21875, "learning_rate": 3.0639843460364203e-06, "loss": 0.95209618, "memory(GiB)": 369.42, "step": 50870, "train_speed(iter/s)": 0.200882 }, { "acc": 0.74890347, "epoch": 1.2905885337392187, "grad_norm": 2.046875, "learning_rate": 3.0630175634255134e-06, "loss": 1.00774403, "memory(GiB)": 369.42, "step": 50875, "train_speed(iter/s)": 0.200883 }, { "acc": 0.74267092, "epoch": 1.2907153729071537, "grad_norm": 1.9921875, "learning_rate": 3.062050866011742e-06, "loss": 1.01126356, "memory(GiB)": 369.42, "step": 50880, "train_speed(iter/s)": 0.200885 }, { "acc": 0.74264917, "epoch": 1.2908422120750889, "grad_norm": 2.3125, "learning_rate": 3.0610842538376264e-06, "loss": 1.04947271, "memory(GiB)": 369.42, "step": 50885, "train_speed(iter/s)": 0.200886 }, { "acc": 0.7530304, "epoch": 1.2909690512430239, "grad_norm": 2.0625, "learning_rate": 3.060117726945683e-06, "loss": 1.02163048, "memory(GiB)": 369.42, "step": 50890, "train_speed(iter/s)": 0.200889 }, { "acc": 0.73272729, "epoch": 1.2910958904109588, "grad_norm": 2.09375, "learning_rate": 3.059151285378421e-06, "loss": 1.03997355, "memory(GiB)": 369.42, "step": 50895, "train_speed(iter/s)": 0.200892 }, { "acc": 0.75769544, "epoch": 1.291222729578894, "grad_norm": 2.234375, "learning_rate": 3.0581849291783518e-06, "loss": 0.94608517, "memory(GiB)": 369.42, "step": 50900, "train_speed(iter/s)": 0.200896 }, { "acc": 0.75919385, "epoch": 1.291349568746829, "grad_norm": 2.109375, "learning_rate": 3.057218658387977e-06, "loss": 0.97589016, "memory(GiB)": 369.42, "step": 50905, "train_speed(iter/s)": 0.200899 }, { "acc": 0.75179219, "epoch": 1.291476407914764, "grad_norm": 2.796875, "learning_rate": 3.0562524730498023e-06, "loss": 1.00201759, "memory(GiB)": 369.42, "step": 50910, "train_speed(iter/s)": 0.200903 }, { "acc": 0.74508824, "epoch": 1.291603247082699, "grad_norm": 2.453125, "learning_rate": 3.055286373206321e-06, "loss": 1.02098675, "memory(GiB)": 369.42, "step": 50915, "train_speed(iter/s)": 0.200907 }, { "acc": 0.75825214, "epoch": 1.2917300862506342, "grad_norm": 1.984375, "learning_rate": 3.054320358900027e-06, "loss": 1.02835178, "memory(GiB)": 369.42, "step": 50920, "train_speed(iter/s)": 0.20091 }, { "acc": 0.74209976, "epoch": 1.2918569254185692, "grad_norm": 2.296875, "learning_rate": 3.053354430173411e-06, "loss": 1.0229826, "memory(GiB)": 369.42, "step": 50925, "train_speed(iter/s)": 0.200911 }, { "acc": 0.76912899, "epoch": 1.2919837645865044, "grad_norm": 2.65625, "learning_rate": 3.0523885870689595e-06, "loss": 0.92789021, "memory(GiB)": 369.42, "step": 50930, "train_speed(iter/s)": 0.200915 }, { "acc": 0.76756821, "epoch": 1.2921106037544394, "grad_norm": 2.328125, "learning_rate": 3.051422829629152e-06, "loss": 0.91949825, "memory(GiB)": 369.42, "step": 50935, "train_speed(iter/s)": 0.200916 }, { "acc": 0.7557785, "epoch": 1.2922374429223744, "grad_norm": 1.75, "learning_rate": 3.050457157896467e-06, "loss": 0.97501478, "memory(GiB)": 369.42, "step": 50940, "train_speed(iter/s)": 0.20092 }, { "acc": 0.75652666, "epoch": 1.2923642820903094, "grad_norm": 1.84375, "learning_rate": 3.049491571913382e-06, "loss": 0.95611095, "memory(GiB)": 369.42, "step": 50945, "train_speed(iter/s)": 0.200921 }, { "acc": 0.74018011, "epoch": 1.2924911212582446, "grad_norm": 2.0625, "learning_rate": 3.048526071722367e-06, "loss": 0.99473763, "memory(GiB)": 369.42, "step": 50950, "train_speed(iter/s)": 0.200925 }, { "acc": 0.75131927, "epoch": 1.2926179604261796, "grad_norm": 2.359375, "learning_rate": 3.047560657365886e-06, "loss": 0.97102184, "memory(GiB)": 369.42, "step": 50955, "train_speed(iter/s)": 0.200929 }, { "acc": 0.74117355, "epoch": 1.2927447995941146, "grad_norm": 3.078125, "learning_rate": 3.046595328886405e-06, "loss": 1.02707024, "memory(GiB)": 369.42, "step": 50960, "train_speed(iter/s)": 0.200933 }, { "acc": 0.75204725, "epoch": 1.2928716387620498, "grad_norm": 2.09375, "learning_rate": 3.0456300863263815e-06, "loss": 0.95199375, "memory(GiB)": 369.42, "step": 50965, "train_speed(iter/s)": 0.200935 }, { "acc": 0.75640583, "epoch": 1.2929984779299848, "grad_norm": 1.90625, "learning_rate": 3.044664929728276e-06, "loss": 0.92936649, "memory(GiB)": 369.42, "step": 50970, "train_speed(iter/s)": 0.20094 }, { "acc": 0.76221838, "epoch": 1.2931253170979198, "grad_norm": 2.296875, "learning_rate": 3.0436998591345336e-06, "loss": 0.91799698, "memory(GiB)": 369.42, "step": 50975, "train_speed(iter/s)": 0.200943 }, { "acc": 0.74645452, "epoch": 1.2932521562658548, "grad_norm": 2.71875, "learning_rate": 3.042734874587607e-06, "loss": 1.01129456, "memory(GiB)": 369.42, "step": 50980, "train_speed(iter/s)": 0.200947 }, { "acc": 0.74003248, "epoch": 1.29337899543379, "grad_norm": 2.28125, "learning_rate": 3.041769976129938e-06, "loss": 0.98811684, "memory(GiB)": 369.42, "step": 50985, "train_speed(iter/s)": 0.20095 }, { "acc": 0.75842066, "epoch": 1.293505834601725, "grad_norm": 2.0625, "learning_rate": 3.0408051638039697e-06, "loss": 0.98515854, "memory(GiB)": 369.42, "step": 50990, "train_speed(iter/s)": 0.200951 }, { "acc": 0.7359973, "epoch": 1.2936326737696602, "grad_norm": 3.578125, "learning_rate": 3.039840437652137e-06, "loss": 1.07619095, "memory(GiB)": 369.42, "step": 50995, "train_speed(iter/s)": 0.200954 }, { "acc": 0.75227928, "epoch": 1.2937595129375952, "grad_norm": 2.375, "learning_rate": 3.0388757977168724e-06, "loss": 0.98101063, "memory(GiB)": 369.42, "step": 51000, "train_speed(iter/s)": 0.200957 }, { "epoch": 1.2937595129375952, "eval_acc": 0.7379228692927241, "eval_loss": 0.9696890115737915, "eval_runtime": 385.6067, "eval_samples_per_second": 16.519, "eval_steps_per_second": 8.26, "step": 51000 }, { "acc": 0.75257015, "epoch": 1.2938863521055302, "grad_norm": 2.3125, "learning_rate": 3.0379112440406066e-06, "loss": 0.95217781, "memory(GiB)": 369.42, "step": 51005, "train_speed(iter/s)": 0.200396 }, { "acc": 0.74636908, "epoch": 1.2940131912734651, "grad_norm": 2.046875, "learning_rate": 3.036946776665766e-06, "loss": 1.00013123, "memory(GiB)": 369.42, "step": 51010, "train_speed(iter/s)": 0.200399 }, { "acc": 0.7571135, "epoch": 1.2941400304414004, "grad_norm": 2.34375, "learning_rate": 3.0359823956347695e-06, "loss": 0.92267799, "memory(GiB)": 369.42, "step": 51015, "train_speed(iter/s)": 0.200402 }, { "acc": 0.75659437, "epoch": 1.2942668696093353, "grad_norm": 1.921875, "learning_rate": 3.0350181009900347e-06, "loss": 1.02484131, "memory(GiB)": 369.42, "step": 51020, "train_speed(iter/s)": 0.200407 }, { "acc": 0.76623659, "epoch": 1.2943937087772706, "grad_norm": 2.046875, "learning_rate": 3.0340538927739784e-06, "loss": 0.91854687, "memory(GiB)": 369.42, "step": 51025, "train_speed(iter/s)": 0.200409 }, { "acc": 0.76690884, "epoch": 1.2945205479452055, "grad_norm": 2.109375, "learning_rate": 3.0330897710290093e-06, "loss": 0.98304548, "memory(GiB)": 369.42, "step": 51030, "train_speed(iter/s)": 0.200412 }, { "acc": 0.74248824, "epoch": 1.2946473871131405, "grad_norm": 2.109375, "learning_rate": 3.032125735797532e-06, "loss": 0.99869862, "memory(GiB)": 369.42, "step": 51035, "train_speed(iter/s)": 0.200414 }, { "acc": 0.7325841, "epoch": 1.2947742262810755, "grad_norm": 2.53125, "learning_rate": 3.031161787121952e-06, "loss": 1.06516685, "memory(GiB)": 369.42, "step": 51040, "train_speed(iter/s)": 0.200418 }, { "acc": 0.7594192, "epoch": 1.2949010654490107, "grad_norm": 1.875, "learning_rate": 3.0301979250446655e-06, "loss": 1.00487823, "memory(GiB)": 369.42, "step": 51045, "train_speed(iter/s)": 0.20042 }, { "acc": 0.7755013, "epoch": 1.2950279046169457, "grad_norm": 2.78125, "learning_rate": 3.029234149608071e-06, "loss": 0.91718979, "memory(GiB)": 369.42, "step": 51050, "train_speed(iter/s)": 0.200423 }, { "acc": 0.75880594, "epoch": 1.2951547437848807, "grad_norm": 2.359375, "learning_rate": 3.0282704608545566e-06, "loss": 0.95750275, "memory(GiB)": 369.42, "step": 51055, "train_speed(iter/s)": 0.200427 }, { "acc": 0.75642824, "epoch": 1.295281582952816, "grad_norm": 2.171875, "learning_rate": 3.0273068588265097e-06, "loss": 0.98818483, "memory(GiB)": 369.42, "step": 51060, "train_speed(iter/s)": 0.20043 }, { "acc": 0.74492445, "epoch": 1.295408422120751, "grad_norm": 1.9453125, "learning_rate": 3.0263433435663143e-06, "loss": 1.01186562, "memory(GiB)": 369.42, "step": 51065, "train_speed(iter/s)": 0.200429 }, { "acc": 0.74571781, "epoch": 1.295535261288686, "grad_norm": 2.140625, "learning_rate": 3.0253799151163522e-06, "loss": 1.03301163, "memory(GiB)": 369.42, "step": 51070, "train_speed(iter/s)": 0.200431 }, { "acc": 0.74678316, "epoch": 1.295662100456621, "grad_norm": 1.984375, "learning_rate": 3.0244165735189967e-06, "loss": 0.99523878, "memory(GiB)": 369.42, "step": 51075, "train_speed(iter/s)": 0.200433 }, { "acc": 0.73451376, "epoch": 1.295788939624556, "grad_norm": 2.359375, "learning_rate": 3.023453318816619e-06, "loss": 1.06164761, "memory(GiB)": 369.42, "step": 51080, "train_speed(iter/s)": 0.200435 }, { "acc": 0.74762106, "epoch": 1.295915778792491, "grad_norm": 2.3125, "learning_rate": 3.022490151051591e-06, "loss": 0.9478344, "memory(GiB)": 369.42, "step": 51085, "train_speed(iter/s)": 0.200437 }, { "acc": 0.75397682, "epoch": 1.2960426179604263, "grad_norm": 2.828125, "learning_rate": 3.0215270702662753e-06, "loss": 1.01637783, "memory(GiB)": 369.42, "step": 51090, "train_speed(iter/s)": 0.20044 }, { "acc": 0.76558752, "epoch": 1.2961694571283613, "grad_norm": 2.203125, "learning_rate": 3.020564076503031e-06, "loss": 0.90752153, "memory(GiB)": 369.42, "step": 51095, "train_speed(iter/s)": 0.200442 }, { "acc": 0.74527149, "epoch": 1.2962962962962963, "grad_norm": 1.7109375, "learning_rate": 3.019601169804216e-06, "loss": 1.02733059, "memory(GiB)": 369.42, "step": 51100, "train_speed(iter/s)": 0.200444 }, { "acc": 0.75823898, "epoch": 1.2964231354642313, "grad_norm": 1.703125, "learning_rate": 3.018638350212184e-06, "loss": 1.01311827, "memory(GiB)": 369.42, "step": 51105, "train_speed(iter/s)": 0.200447 }, { "acc": 0.75194483, "epoch": 1.2965499746321665, "grad_norm": 1.953125, "learning_rate": 3.0176756177692845e-06, "loss": 0.97864437, "memory(GiB)": 369.42, "step": 51110, "train_speed(iter/s)": 0.20045 }, { "acc": 0.74116292, "epoch": 1.2966768138001015, "grad_norm": 2.15625, "learning_rate": 3.01671297251786e-06, "loss": 1.02306919, "memory(GiB)": 369.42, "step": 51115, "train_speed(iter/s)": 0.200452 }, { "acc": 0.75682974, "epoch": 1.2968036529680365, "grad_norm": 1.9765625, "learning_rate": 3.0157504145002546e-06, "loss": 0.95255375, "memory(GiB)": 369.42, "step": 51120, "train_speed(iter/s)": 0.200455 }, { "acc": 0.74217806, "epoch": 1.2969304921359717, "grad_norm": 2.15625, "learning_rate": 3.0147879437588046e-06, "loss": 0.96787319, "memory(GiB)": 369.42, "step": 51125, "train_speed(iter/s)": 0.200458 }, { "acc": 0.75548706, "epoch": 1.2970573313039067, "grad_norm": 2.640625, "learning_rate": 3.013825560335845e-06, "loss": 0.95059948, "memory(GiB)": 369.42, "step": 51130, "train_speed(iter/s)": 0.200461 }, { "acc": 0.75999956, "epoch": 1.2971841704718416, "grad_norm": 2.328125, "learning_rate": 3.0128632642737044e-06, "loss": 0.91478672, "memory(GiB)": 369.42, "step": 51135, "train_speed(iter/s)": 0.200464 }, { "acc": 0.75921988, "epoch": 1.2973110096397766, "grad_norm": 2.515625, "learning_rate": 3.0119010556147088e-06, "loss": 0.94367218, "memory(GiB)": 369.42, "step": 51140, "train_speed(iter/s)": 0.200465 }, { "acc": 0.75798101, "epoch": 1.2974378488077118, "grad_norm": 1.9140625, "learning_rate": 3.0109389344011813e-06, "loss": 0.97817516, "memory(GiB)": 369.42, "step": 51145, "train_speed(iter/s)": 0.200469 }, { "acc": 0.7541564, "epoch": 1.2975646879756468, "grad_norm": 2.03125, "learning_rate": 3.0099769006754415e-06, "loss": 0.96581564, "memory(GiB)": 369.42, "step": 51150, "train_speed(iter/s)": 0.200472 }, { "acc": 0.74550514, "epoch": 1.297691527143582, "grad_norm": 2.109375, "learning_rate": 3.0090149544798007e-06, "loss": 0.99035931, "memory(GiB)": 369.42, "step": 51155, "train_speed(iter/s)": 0.200474 }, { "acc": 0.73671641, "epoch": 1.297818366311517, "grad_norm": 2.265625, "learning_rate": 3.0080530958565712e-06, "loss": 1.00748806, "memory(GiB)": 369.42, "step": 51160, "train_speed(iter/s)": 0.200476 }, { "acc": 0.75132461, "epoch": 1.297945205479452, "grad_norm": 2.40625, "learning_rate": 3.0070913248480602e-06, "loss": 0.99033623, "memory(GiB)": 369.42, "step": 51165, "train_speed(iter/s)": 0.20048 }, { "acc": 0.76089678, "epoch": 1.298072044647387, "grad_norm": 1.84375, "learning_rate": 3.0061296414965724e-06, "loss": 0.96617413, "memory(GiB)": 369.42, "step": 51170, "train_speed(iter/s)": 0.200483 }, { "acc": 0.75970211, "epoch": 1.2981988838153222, "grad_norm": 1.859375, "learning_rate": 3.005168045844402e-06, "loss": 1.00897312, "memory(GiB)": 369.42, "step": 51175, "train_speed(iter/s)": 0.200486 }, { "acc": 0.75789719, "epoch": 1.2983257229832572, "grad_norm": 2.40625, "learning_rate": 3.0042065379338486e-06, "loss": 0.98003883, "memory(GiB)": 369.42, "step": 51180, "train_speed(iter/s)": 0.20049 }, { "acc": 0.75104446, "epoch": 1.2984525621511924, "grad_norm": 2.296875, "learning_rate": 3.003245117807201e-06, "loss": 0.99239769, "memory(GiB)": 369.42, "step": 51185, "train_speed(iter/s)": 0.200493 }, { "acc": 0.75688114, "epoch": 1.2985794013191274, "grad_norm": 1.9609375, "learning_rate": 3.0022837855067514e-06, "loss": 0.943011, "memory(GiB)": 369.42, "step": 51190, "train_speed(iter/s)": 0.200496 }, { "acc": 0.76334314, "epoch": 1.2987062404870624, "grad_norm": 2.046875, "learning_rate": 3.0013225410747772e-06, "loss": 0.91296597, "memory(GiB)": 369.42, "step": 51195, "train_speed(iter/s)": 0.200499 }, { "acc": 0.73508229, "epoch": 1.2988330796549974, "grad_norm": 2.3125, "learning_rate": 3.0003613845535617e-06, "loss": 0.99832153, "memory(GiB)": 369.42, "step": 51200, "train_speed(iter/s)": 0.200504 }, { "acc": 0.75359888, "epoch": 1.2989599188229326, "grad_norm": 2.328125, "learning_rate": 2.9994003159853793e-06, "loss": 0.98440742, "memory(GiB)": 369.42, "step": 51205, "train_speed(iter/s)": 0.200508 }, { "acc": 0.75622005, "epoch": 1.2990867579908676, "grad_norm": 2.296875, "learning_rate": 2.998439335412505e-06, "loss": 0.94935684, "memory(GiB)": 369.42, "step": 51210, "train_speed(iter/s)": 0.200512 }, { "acc": 0.7592485, "epoch": 1.2992135971588026, "grad_norm": 1.7421875, "learning_rate": 2.9974784428772043e-06, "loss": 0.94680882, "memory(GiB)": 369.42, "step": 51215, "train_speed(iter/s)": 0.200514 }, { "acc": 0.75169897, "epoch": 1.2993404363267378, "grad_norm": 1.96875, "learning_rate": 2.996517638421741e-06, "loss": 1.00665646, "memory(GiB)": 369.42, "step": 51220, "train_speed(iter/s)": 0.200517 }, { "acc": 0.7588397, "epoch": 1.2994672754946728, "grad_norm": 2.015625, "learning_rate": 2.9955569220883777e-06, "loss": 0.97017117, "memory(GiB)": 369.42, "step": 51225, "train_speed(iter/s)": 0.20052 }, { "acc": 0.74527516, "epoch": 1.2995941146626078, "grad_norm": 2.234375, "learning_rate": 2.9945962939193718e-06, "loss": 1.01563568, "memory(GiB)": 369.42, "step": 51230, "train_speed(iter/s)": 0.200521 }, { "acc": 0.76191816, "epoch": 1.2997209538305428, "grad_norm": 1.8203125, "learning_rate": 2.9936357539569728e-06, "loss": 0.95423279, "memory(GiB)": 369.42, "step": 51235, "train_speed(iter/s)": 0.200525 }, { "acc": 0.75615082, "epoch": 1.299847792998478, "grad_norm": 2.28125, "learning_rate": 2.9926753022434306e-06, "loss": 0.9387722, "memory(GiB)": 369.42, "step": 51240, "train_speed(iter/s)": 0.200529 }, { "acc": 0.76574025, "epoch": 1.299974632166413, "grad_norm": 2.046875, "learning_rate": 2.9917149388209908e-06, "loss": 0.94922218, "memory(GiB)": 369.42, "step": 51245, "train_speed(iter/s)": 0.200531 }, { "acc": 0.75329247, "epoch": 1.3001014713343482, "grad_norm": 2.125, "learning_rate": 2.9907546637318964e-06, "loss": 0.98731289, "memory(GiB)": 369.42, "step": 51250, "train_speed(iter/s)": 0.200534 }, { "acc": 0.76088028, "epoch": 1.3002283105022832, "grad_norm": 2.984375, "learning_rate": 2.98979447701838e-06, "loss": 0.99292068, "memory(GiB)": 369.42, "step": 51255, "train_speed(iter/s)": 0.200538 }, { "acc": 0.76211772, "epoch": 1.3003551496702181, "grad_norm": 2.5, "learning_rate": 2.988834378722679e-06, "loss": 0.99518127, "memory(GiB)": 369.42, "step": 51260, "train_speed(iter/s)": 0.200535 }, { "acc": 0.74854479, "epoch": 1.3004819888381531, "grad_norm": 2.015625, "learning_rate": 2.9878743688870193e-06, "loss": 1.00193615, "memory(GiB)": 369.42, "step": 51265, "train_speed(iter/s)": 0.200539 }, { "acc": 0.75113354, "epoch": 1.3006088280060883, "grad_norm": 1.875, "learning_rate": 2.9869144475536306e-06, "loss": 0.98263016, "memory(GiB)": 369.42, "step": 51270, "train_speed(iter/s)": 0.200543 }, { "acc": 0.73952222, "epoch": 1.3007356671740233, "grad_norm": 1.9296875, "learning_rate": 2.9859546147647316e-06, "loss": 1.04554939, "memory(GiB)": 369.42, "step": 51275, "train_speed(iter/s)": 0.200546 }, { "acc": 0.74287758, "epoch": 1.3008625063419583, "grad_norm": 1.9609375, "learning_rate": 2.984994870562541e-06, "loss": 1.00908241, "memory(GiB)": 369.42, "step": 51280, "train_speed(iter/s)": 0.200547 }, { "acc": 0.74556141, "epoch": 1.3009893455098935, "grad_norm": 2.0625, "learning_rate": 2.9840352149892703e-06, "loss": 0.97937794, "memory(GiB)": 369.42, "step": 51285, "train_speed(iter/s)": 0.20055 }, { "acc": 0.75738115, "epoch": 1.3011161846778285, "grad_norm": 2.34375, "learning_rate": 2.9830756480871343e-06, "loss": 0.99274921, "memory(GiB)": 369.42, "step": 51290, "train_speed(iter/s)": 0.200553 }, { "acc": 0.75683279, "epoch": 1.3012430238457635, "grad_norm": 2.140625, "learning_rate": 2.9821161698983347e-06, "loss": 0.99788914, "memory(GiB)": 369.42, "step": 51295, "train_speed(iter/s)": 0.200555 }, { "acc": 0.75179753, "epoch": 1.3013698630136985, "grad_norm": 2.234375, "learning_rate": 2.9811567804650747e-06, "loss": 0.95189133, "memory(GiB)": 369.42, "step": 51300, "train_speed(iter/s)": 0.200559 }, { "acc": 0.74730196, "epoch": 1.3014967021816337, "grad_norm": 2.140625, "learning_rate": 2.980197479829554e-06, "loss": 0.99808273, "memory(GiB)": 369.42, "step": 51305, "train_speed(iter/s)": 0.200561 }, { "acc": 0.75916309, "epoch": 1.3016235413495687, "grad_norm": 2.140625, "learning_rate": 2.9792382680339666e-06, "loss": 0.93391008, "memory(GiB)": 369.42, "step": 51310, "train_speed(iter/s)": 0.200564 }, { "acc": 0.7394413, "epoch": 1.301750380517504, "grad_norm": 2.34375, "learning_rate": 2.9782791451205e-06, "loss": 0.99450226, "memory(GiB)": 369.42, "step": 51315, "train_speed(iter/s)": 0.200567 }, { "acc": 0.73593512, "epoch": 1.301877219685439, "grad_norm": 2.46875, "learning_rate": 2.9773201111313444e-06, "loss": 1.03715811, "memory(GiB)": 369.42, "step": 51320, "train_speed(iter/s)": 0.20057 }, { "acc": 0.76084833, "epoch": 1.302004058853374, "grad_norm": 2.21875, "learning_rate": 2.9763611661086806e-06, "loss": 0.97698135, "memory(GiB)": 369.42, "step": 51325, "train_speed(iter/s)": 0.200572 }, { "acc": 0.7671237, "epoch": 1.3021308980213089, "grad_norm": 2.34375, "learning_rate": 2.975402310094689e-06, "loss": 0.94142227, "memory(GiB)": 369.42, "step": 51330, "train_speed(iter/s)": 0.200575 }, { "acc": 0.74858308, "epoch": 1.302257737189244, "grad_norm": 1.8828125, "learning_rate": 2.9744435431315403e-06, "loss": 1.00013981, "memory(GiB)": 369.42, "step": 51335, "train_speed(iter/s)": 0.200578 }, { "acc": 0.74314384, "epoch": 1.302384576357179, "grad_norm": 2.703125, "learning_rate": 2.9734848652614097e-06, "loss": 1.05604286, "memory(GiB)": 369.42, "step": 51340, "train_speed(iter/s)": 0.200582 }, { "acc": 0.76343236, "epoch": 1.3025114155251143, "grad_norm": 2.515625, "learning_rate": 2.972526276526461e-06, "loss": 0.99903021, "memory(GiB)": 369.42, "step": 51345, "train_speed(iter/s)": 0.200585 }, { "acc": 0.76343312, "epoch": 1.3026382546930493, "grad_norm": 2.125, "learning_rate": 2.971567776968861e-06, "loss": 0.98081837, "memory(GiB)": 369.42, "step": 51350, "train_speed(iter/s)": 0.200588 }, { "acc": 0.76792879, "epoch": 1.3027650938609843, "grad_norm": 2.171875, "learning_rate": 2.9706093666307654e-06, "loss": 0.97364464, "memory(GiB)": 369.42, "step": 51355, "train_speed(iter/s)": 0.200589 }, { "acc": 0.74470563, "epoch": 1.3028919330289193, "grad_norm": 2.046875, "learning_rate": 2.969651045554329e-06, "loss": 0.98489132, "memory(GiB)": 369.42, "step": 51360, "train_speed(iter/s)": 0.200592 }, { "acc": 0.74753113, "epoch": 1.3030187721968545, "grad_norm": 2.140625, "learning_rate": 2.968692813781706e-06, "loss": 0.99027987, "memory(GiB)": 369.42, "step": 51365, "train_speed(iter/s)": 0.200595 }, { "acc": 0.74487085, "epoch": 1.3031456113647895, "grad_norm": 2.3125, "learning_rate": 2.9677346713550437e-06, "loss": 0.98234091, "memory(GiB)": 369.42, "step": 51370, "train_speed(iter/s)": 0.200597 }, { "acc": 0.76752071, "epoch": 1.3032724505327244, "grad_norm": 1.6328125, "learning_rate": 2.966776618316482e-06, "loss": 0.93373137, "memory(GiB)": 369.42, "step": 51375, "train_speed(iter/s)": 0.200601 }, { "acc": 0.74744782, "epoch": 1.3033992897006597, "grad_norm": 1.953125, "learning_rate": 2.9658186547081612e-06, "loss": 1.00909462, "memory(GiB)": 369.42, "step": 51380, "train_speed(iter/s)": 0.200603 }, { "acc": 0.75966039, "epoch": 1.3035261288685946, "grad_norm": 2.34375, "learning_rate": 2.9648607805722197e-06, "loss": 0.95951843, "memory(GiB)": 369.42, "step": 51385, "train_speed(iter/s)": 0.200606 }, { "acc": 0.76211748, "epoch": 1.3036529680365296, "grad_norm": 2.171875, "learning_rate": 2.963902995950788e-06, "loss": 0.93851299, "memory(GiB)": 369.42, "step": 51390, "train_speed(iter/s)": 0.200609 }, { "acc": 0.74172516, "epoch": 1.3037798072044646, "grad_norm": 2.484375, "learning_rate": 2.962945300885991e-06, "loss": 1.01993198, "memory(GiB)": 369.42, "step": 51395, "train_speed(iter/s)": 0.200612 }, { "acc": 0.75161152, "epoch": 1.3039066463723998, "grad_norm": 2.4375, "learning_rate": 2.9619876954199557e-06, "loss": 0.97728205, "memory(GiB)": 369.42, "step": 51400, "train_speed(iter/s)": 0.200613 }, { "acc": 0.76366138, "epoch": 1.3040334855403348, "grad_norm": 1.8828125, "learning_rate": 2.9610301795947992e-06, "loss": 0.99642811, "memory(GiB)": 369.42, "step": 51405, "train_speed(iter/s)": 0.200615 }, { "acc": 0.74358883, "epoch": 1.30416032470827, "grad_norm": 2.265625, "learning_rate": 2.9600727534526417e-06, "loss": 1.03142967, "memory(GiB)": 369.42, "step": 51410, "train_speed(iter/s)": 0.200619 }, { "acc": 0.75262861, "epoch": 1.304287163876205, "grad_norm": 2.4375, "learning_rate": 2.9591154170355895e-06, "loss": 1.02547455, "memory(GiB)": 369.42, "step": 51415, "train_speed(iter/s)": 0.200621 }, { "acc": 0.74127054, "epoch": 1.30441400304414, "grad_norm": 1.90625, "learning_rate": 2.9581581703857545e-06, "loss": 1.03521767, "memory(GiB)": 369.42, "step": 51420, "train_speed(iter/s)": 0.200624 }, { "acc": 0.75349617, "epoch": 1.304540842212075, "grad_norm": 2.25, "learning_rate": 2.9572010135452377e-06, "loss": 0.94948683, "memory(GiB)": 369.42, "step": 51425, "train_speed(iter/s)": 0.200628 }, { "acc": 0.76511936, "epoch": 1.3046676813800102, "grad_norm": 2.125, "learning_rate": 2.9562439465561425e-06, "loss": 1.00498219, "memory(GiB)": 369.42, "step": 51430, "train_speed(iter/s)": 0.200632 }, { "acc": 0.76127415, "epoch": 1.3047945205479452, "grad_norm": 2.265625, "learning_rate": 2.955286969460563e-06, "loss": 0.96072264, "memory(GiB)": 369.42, "step": 51435, "train_speed(iter/s)": 0.200634 }, { "acc": 0.74982972, "epoch": 1.3049213597158802, "grad_norm": 2.234375, "learning_rate": 2.9543300823005903e-06, "loss": 0.9985178, "memory(GiB)": 369.42, "step": 51440, "train_speed(iter/s)": 0.200637 }, { "acc": 0.7477962, "epoch": 1.3050481988838154, "grad_norm": 1.953125, "learning_rate": 2.953373285118315e-06, "loss": 0.96570568, "memory(GiB)": 369.42, "step": 51445, "train_speed(iter/s)": 0.20064 }, { "acc": 0.7456419, "epoch": 1.3051750380517504, "grad_norm": 2.265625, "learning_rate": 2.9524165779558206e-06, "loss": 1.01110392, "memory(GiB)": 369.42, "step": 51450, "train_speed(iter/s)": 0.200643 }, { "acc": 0.75290251, "epoch": 1.3053018772196854, "grad_norm": 2.265625, "learning_rate": 2.9514599608551865e-06, "loss": 0.95323944, "memory(GiB)": 369.42, "step": 51455, "train_speed(iter/s)": 0.200647 }, { "acc": 0.74886885, "epoch": 1.3054287163876204, "grad_norm": 2.0, "learning_rate": 2.9505034338584882e-06, "loss": 1.04645996, "memory(GiB)": 369.42, "step": 51460, "train_speed(iter/s)": 0.200649 }, { "acc": 0.77021775, "epoch": 1.3055555555555556, "grad_norm": 2.25, "learning_rate": 2.9495469970078e-06, "loss": 0.9773057, "memory(GiB)": 369.42, "step": 51465, "train_speed(iter/s)": 0.200652 }, { "acc": 0.74311347, "epoch": 1.3056823947234906, "grad_norm": 2.34375, "learning_rate": 2.9485906503451907e-06, "loss": 1.04963112, "memory(GiB)": 369.42, "step": 51470, "train_speed(iter/s)": 0.200656 }, { "acc": 0.7411871, "epoch": 1.3058092338914258, "grad_norm": 2.0625, "learning_rate": 2.9476343939127217e-06, "loss": 0.99572382, "memory(GiB)": 369.42, "step": 51475, "train_speed(iter/s)": 0.20066 }, { "acc": 0.75973406, "epoch": 1.3059360730593608, "grad_norm": 2.03125, "learning_rate": 2.9466782277524554e-06, "loss": 0.92114525, "memory(GiB)": 369.42, "step": 51480, "train_speed(iter/s)": 0.200663 }, { "acc": 0.7493741, "epoch": 1.3060629122272958, "grad_norm": 2.265625, "learning_rate": 2.9457221519064477e-06, "loss": 1.04329872, "memory(GiB)": 369.42, "step": 51485, "train_speed(iter/s)": 0.200667 }, { "acc": 0.75231838, "epoch": 1.3061897513952307, "grad_norm": 2.34375, "learning_rate": 2.944766166416754e-06, "loss": 0.98245525, "memory(GiB)": 369.42, "step": 51490, "train_speed(iter/s)": 0.200669 }, { "acc": 0.75222678, "epoch": 1.306316590563166, "grad_norm": 2.046875, "learning_rate": 2.943810271325418e-06, "loss": 0.98981066, "memory(GiB)": 369.42, "step": 51495, "train_speed(iter/s)": 0.200671 }, { "acc": 0.74791527, "epoch": 1.306443429731101, "grad_norm": 2.125, "learning_rate": 2.9428544666744873e-06, "loss": 0.96431265, "memory(GiB)": 369.42, "step": 51500, "train_speed(iter/s)": 0.200675 }, { "acc": 0.74326868, "epoch": 1.3065702688990362, "grad_norm": 2.25, "learning_rate": 2.9418987525060004e-06, "loss": 1.04859095, "memory(GiB)": 369.42, "step": 51505, "train_speed(iter/s)": 0.200677 }, { "acc": 0.76269999, "epoch": 1.3066971080669711, "grad_norm": 2.46875, "learning_rate": 2.9409431288619973e-06, "loss": 1.02311573, "memory(GiB)": 369.42, "step": 51510, "train_speed(iter/s)": 0.20068 }, { "acc": 0.74139767, "epoch": 1.3068239472349061, "grad_norm": 2.171875, "learning_rate": 2.939987595784507e-06, "loss": 1.04601097, "memory(GiB)": 369.42, "step": 51515, "train_speed(iter/s)": 0.200682 }, { "acc": 0.74543171, "epoch": 1.3069507864028411, "grad_norm": 2.3125, "learning_rate": 2.9390321533155585e-06, "loss": 1.06243286, "memory(GiB)": 369.42, "step": 51520, "train_speed(iter/s)": 0.200684 }, { "acc": 0.75499163, "epoch": 1.3070776255707763, "grad_norm": 2.109375, "learning_rate": 2.9380768014971794e-06, "loss": 0.98607349, "memory(GiB)": 369.42, "step": 51525, "train_speed(iter/s)": 0.200688 }, { "acc": 0.76494789, "epoch": 1.3072044647387113, "grad_norm": 2.171875, "learning_rate": 2.937121540371389e-06, "loss": 0.91802521, "memory(GiB)": 369.42, "step": 51530, "train_speed(iter/s)": 0.20069 }, { "acc": 0.75940232, "epoch": 1.3073313039066463, "grad_norm": 2.578125, "learning_rate": 2.9361663699802007e-06, "loss": 0.99005413, "memory(GiB)": 369.42, "step": 51535, "train_speed(iter/s)": 0.200692 }, { "acc": 0.75334167, "epoch": 1.3074581430745815, "grad_norm": 1.9140625, "learning_rate": 2.9352112903656315e-06, "loss": 0.95905571, "memory(GiB)": 369.42, "step": 51540, "train_speed(iter/s)": 0.200695 }, { "acc": 0.74368477, "epoch": 1.3075849822425165, "grad_norm": 1.96875, "learning_rate": 2.9342563015696866e-06, "loss": 1.06629601, "memory(GiB)": 369.42, "step": 51545, "train_speed(iter/s)": 0.200697 }, { "acc": 0.7643609, "epoch": 1.3077118214104515, "grad_norm": 2.09375, "learning_rate": 2.9333014036343765e-06, "loss": 0.92588577, "memory(GiB)": 369.42, "step": 51550, "train_speed(iter/s)": 0.200699 }, { "acc": 0.75463977, "epoch": 1.3078386605783865, "grad_norm": 2.03125, "learning_rate": 2.932346596601694e-06, "loss": 0.96488991, "memory(GiB)": 369.42, "step": 51555, "train_speed(iter/s)": 0.200701 }, { "acc": 0.74272828, "epoch": 1.3079654997463217, "grad_norm": 2.0625, "learning_rate": 2.931391880513641e-06, "loss": 0.98629189, "memory(GiB)": 369.42, "step": 51560, "train_speed(iter/s)": 0.200705 }, { "acc": 0.74432945, "epoch": 1.3080923389142567, "grad_norm": 2.46875, "learning_rate": 2.9304372554122074e-06, "loss": 1.03143482, "memory(GiB)": 369.42, "step": 51565, "train_speed(iter/s)": 0.200707 }, { "acc": 0.74523468, "epoch": 1.308219178082192, "grad_norm": 2.3125, "learning_rate": 2.9294827213393857e-06, "loss": 0.99441853, "memory(GiB)": 369.42, "step": 51570, "train_speed(iter/s)": 0.200711 }, { "acc": 0.76115179, "epoch": 1.308346017250127, "grad_norm": 2.15625, "learning_rate": 2.9285282783371567e-06, "loss": 0.92949772, "memory(GiB)": 369.42, "step": 51575, "train_speed(iter/s)": 0.200714 }, { "acc": 0.75310116, "epoch": 1.3084728564180619, "grad_norm": 2.390625, "learning_rate": 2.9275739264475013e-06, "loss": 0.96647091, "memory(GiB)": 369.42, "step": 51580, "train_speed(iter/s)": 0.200717 }, { "acc": 0.74925919, "epoch": 1.3085996955859969, "grad_norm": 2.09375, "learning_rate": 2.926619665712399e-06, "loss": 0.96150923, "memory(GiB)": 369.42, "step": 51585, "train_speed(iter/s)": 0.20072 }, { "acc": 0.7411201, "epoch": 1.308726534753932, "grad_norm": 2.859375, "learning_rate": 2.9256654961738217e-06, "loss": 1.02156458, "memory(GiB)": 369.42, "step": 51590, "train_speed(iter/s)": 0.200724 }, { "acc": 0.76171007, "epoch": 1.308853373921867, "grad_norm": 2.015625, "learning_rate": 2.9247114178737356e-06, "loss": 0.93137379, "memory(GiB)": 369.42, "step": 51595, "train_speed(iter/s)": 0.200727 }, { "acc": 0.7487586, "epoch": 1.308980213089802, "grad_norm": 1.8828125, "learning_rate": 2.9237574308541063e-06, "loss": 0.98041658, "memory(GiB)": 369.42, "step": 51600, "train_speed(iter/s)": 0.20073 }, { "acc": 0.74657469, "epoch": 1.3091070522577373, "grad_norm": 1.9453125, "learning_rate": 2.9228035351568955e-06, "loss": 1.01362829, "memory(GiB)": 369.42, "step": 51605, "train_speed(iter/s)": 0.200732 }, { "acc": 0.76179323, "epoch": 1.3092338914256723, "grad_norm": 2.609375, "learning_rate": 2.92184973082406e-06, "loss": 0.92367926, "memory(GiB)": 369.42, "step": 51610, "train_speed(iter/s)": 0.200736 }, { "acc": 0.75519409, "epoch": 1.3093607305936072, "grad_norm": 2.0625, "learning_rate": 2.920896017897551e-06, "loss": 0.97404089, "memory(GiB)": 369.42, "step": 51615, "train_speed(iter/s)": 0.20074 }, { "acc": 0.74441657, "epoch": 1.3094875697615422, "grad_norm": 1.8359375, "learning_rate": 2.9199423964193176e-06, "loss": 1.02696552, "memory(GiB)": 369.42, "step": 51620, "train_speed(iter/s)": 0.200743 }, { "acc": 0.75265231, "epoch": 1.3096144089294774, "grad_norm": 2.109375, "learning_rate": 2.9189888664313045e-06, "loss": 0.98040905, "memory(GiB)": 369.42, "step": 51625, "train_speed(iter/s)": 0.200745 }, { "acc": 0.74708891, "epoch": 1.3097412480974124, "grad_norm": 2.09375, "learning_rate": 2.9180354279754517e-06, "loss": 1.01851158, "memory(GiB)": 369.42, "step": 51630, "train_speed(iter/s)": 0.200749 }, { "acc": 0.75389795, "epoch": 1.3098680872653476, "grad_norm": 2.4375, "learning_rate": 2.9170820810936968e-06, "loss": 0.99914989, "memory(GiB)": 369.42, "step": 51635, "train_speed(iter/s)": 0.200751 }, { "acc": 0.7483036, "epoch": 1.3099949264332826, "grad_norm": 2.8125, "learning_rate": 2.9161288258279715e-06, "loss": 0.98290367, "memory(GiB)": 369.42, "step": 51640, "train_speed(iter/s)": 0.200755 }, { "acc": 0.74537883, "epoch": 1.3101217656012176, "grad_norm": 2.015625, "learning_rate": 2.9151756622202037e-06, "loss": 0.97999191, "memory(GiB)": 369.42, "step": 51645, "train_speed(iter/s)": 0.200758 }, { "acc": 0.74773078, "epoch": 1.3102486047691526, "grad_norm": 2.765625, "learning_rate": 2.914222590312319e-06, "loss": 1.01617012, "memory(GiB)": 369.42, "step": 51650, "train_speed(iter/s)": 0.20076 }, { "acc": 0.75230308, "epoch": 1.3103754439370878, "grad_norm": 2.90625, "learning_rate": 2.9132696101462366e-06, "loss": 0.99917879, "memory(GiB)": 369.42, "step": 51655, "train_speed(iter/s)": 0.200763 }, { "acc": 0.74083443, "epoch": 1.3105022831050228, "grad_norm": 2.359375, "learning_rate": 2.912316721763874e-06, "loss": 1.07198696, "memory(GiB)": 369.42, "step": 51660, "train_speed(iter/s)": 0.200767 }, { "acc": 0.76903009, "epoch": 1.310629122272958, "grad_norm": 2.03125, "learning_rate": 2.9113639252071395e-06, "loss": 0.92232437, "memory(GiB)": 369.42, "step": 51665, "train_speed(iter/s)": 0.200771 }, { "acc": 0.74550385, "epoch": 1.310755961440893, "grad_norm": 2.1875, "learning_rate": 2.91041122051795e-06, "loss": 0.99662437, "memory(GiB)": 369.42, "step": 51670, "train_speed(iter/s)": 0.200774 }, { "acc": 0.74075208, "epoch": 1.310882800608828, "grad_norm": 1.8515625, "learning_rate": 2.9094586077382016e-06, "loss": 1.03928423, "memory(GiB)": 369.42, "step": 51675, "train_speed(iter/s)": 0.200777 }, { "acc": 0.74599681, "epoch": 1.311009639776763, "grad_norm": 2.59375, "learning_rate": 2.9085060869097977e-06, "loss": 0.96217442, "memory(GiB)": 369.42, "step": 51680, "train_speed(iter/s)": 0.20078 }, { "acc": 0.75485935, "epoch": 1.3111364789446982, "grad_norm": 2.015625, "learning_rate": 2.907553658074631e-06, "loss": 0.91399612, "memory(GiB)": 369.42, "step": 51685, "train_speed(iter/s)": 0.200781 }, { "acc": 0.7439785, "epoch": 1.3112633181126332, "grad_norm": 2.015625, "learning_rate": 2.906601321274601e-06, "loss": 1.01596584, "memory(GiB)": 369.42, "step": 51690, "train_speed(iter/s)": 0.200785 }, { "acc": 0.75230389, "epoch": 1.3113901572805682, "grad_norm": 1.921875, "learning_rate": 2.90564907655159e-06, "loss": 1.02719612, "memory(GiB)": 369.42, "step": 51695, "train_speed(iter/s)": 0.200788 }, { "acc": 0.7599515, "epoch": 1.3115169964485034, "grad_norm": 2.078125, "learning_rate": 2.9046969239474808e-06, "loss": 0.93382378, "memory(GiB)": 369.42, "step": 51700, "train_speed(iter/s)": 0.20079 }, { "acc": 0.75223303, "epoch": 1.3116438356164384, "grad_norm": 2.0625, "learning_rate": 2.9037448635041574e-06, "loss": 1.01064301, "memory(GiB)": 369.42, "step": 51705, "train_speed(iter/s)": 0.200794 }, { "acc": 0.74941792, "epoch": 1.3117706747843734, "grad_norm": 1.984375, "learning_rate": 2.9027928952634964e-06, "loss": 0.96273746, "memory(GiB)": 369.42, "step": 51710, "train_speed(iter/s)": 0.200797 }, { "acc": 0.75915465, "epoch": 1.3118975139523084, "grad_norm": 2.140625, "learning_rate": 2.901841019267363e-06, "loss": 0.90676851, "memory(GiB)": 369.42, "step": 51715, "train_speed(iter/s)": 0.2008 }, { "acc": 0.76043091, "epoch": 1.3120243531202436, "grad_norm": 2.140625, "learning_rate": 2.900889235557631e-06, "loss": 0.97403183, "memory(GiB)": 369.42, "step": 51720, "train_speed(iter/s)": 0.200804 }, { "acc": 0.76707678, "epoch": 1.3121511922881786, "grad_norm": 2.078125, "learning_rate": 2.8999375441761627e-06, "loss": 0.93609447, "memory(GiB)": 369.42, "step": 51725, "train_speed(iter/s)": 0.200807 }, { "acc": 0.75276508, "epoch": 1.3122780314561138, "grad_norm": 1.9609375, "learning_rate": 2.8989859451648193e-06, "loss": 0.95540829, "memory(GiB)": 369.42, "step": 51730, "train_speed(iter/s)": 0.200811 }, { "acc": 0.75906835, "epoch": 1.3124048706240488, "grad_norm": 2.625, "learning_rate": 2.89803443856545e-06, "loss": 0.93330498, "memory(GiB)": 369.42, "step": 51735, "train_speed(iter/s)": 0.200813 }, { "acc": 0.75300093, "epoch": 1.3125317097919837, "grad_norm": 1.9921875, "learning_rate": 2.897083024419913e-06, "loss": 0.99161091, "memory(GiB)": 369.42, "step": 51740, "train_speed(iter/s)": 0.200817 }, { "acc": 0.75867119, "epoch": 1.3126585489599187, "grad_norm": 2.5, "learning_rate": 2.8961317027700534e-06, "loss": 0.96464252, "memory(GiB)": 369.42, "step": 51745, "train_speed(iter/s)": 0.20082 }, { "acc": 0.74548368, "epoch": 1.312785388127854, "grad_norm": 2.28125, "learning_rate": 2.8951804736577148e-06, "loss": 0.98210764, "memory(GiB)": 369.42, "step": 51750, "train_speed(iter/s)": 0.200822 }, { "acc": 0.76443768, "epoch": 1.312912227295789, "grad_norm": 2.40625, "learning_rate": 2.894229337124736e-06, "loss": 0.98556767, "memory(GiB)": 369.42, "step": 51755, "train_speed(iter/s)": 0.200824 }, { "acc": 0.73943262, "epoch": 1.313039066463724, "grad_norm": 2.046875, "learning_rate": 2.8932782932129524e-06, "loss": 1.01461573, "memory(GiB)": 369.42, "step": 51760, "train_speed(iter/s)": 0.200827 }, { "acc": 0.75424714, "epoch": 1.3131659056316591, "grad_norm": 2.28125, "learning_rate": 2.8923273419641956e-06, "loss": 0.95050716, "memory(GiB)": 369.42, "step": 51765, "train_speed(iter/s)": 0.20083 }, { "acc": 0.7542829, "epoch": 1.3132927447995941, "grad_norm": 2.375, "learning_rate": 2.891376483420292e-06, "loss": 0.98549967, "memory(GiB)": 369.42, "step": 51770, "train_speed(iter/s)": 0.200832 }, { "acc": 0.75582199, "epoch": 1.3134195839675291, "grad_norm": 2.390625, "learning_rate": 2.8904257176230655e-06, "loss": 0.99457169, "memory(GiB)": 369.42, "step": 51775, "train_speed(iter/s)": 0.200835 }, { "acc": 0.76004939, "epoch": 1.313546423135464, "grad_norm": 2.03125, "learning_rate": 2.8894750446143345e-06, "loss": 0.9363307, "memory(GiB)": 369.42, "step": 51780, "train_speed(iter/s)": 0.200837 }, { "acc": 0.74707117, "epoch": 1.3136732623033993, "grad_norm": 2.53125, "learning_rate": 2.8885244644359134e-06, "loss": 0.98260956, "memory(GiB)": 369.42, "step": 51785, "train_speed(iter/s)": 0.200841 }, { "acc": 0.74900723, "epoch": 1.3138001014713343, "grad_norm": 2.265625, "learning_rate": 2.887573977129614e-06, "loss": 1.0081089, "memory(GiB)": 369.42, "step": 51790, "train_speed(iter/s)": 0.200843 }, { "acc": 0.76046743, "epoch": 1.3139269406392695, "grad_norm": 2.390625, "learning_rate": 2.886623582737242e-06, "loss": 0.97979069, "memory(GiB)": 369.42, "step": 51795, "train_speed(iter/s)": 0.200845 }, { "acc": 0.7528367, "epoch": 1.3140537798072045, "grad_norm": 2.046875, "learning_rate": 2.8856732813006007e-06, "loss": 1.03165131, "memory(GiB)": 369.42, "step": 51800, "train_speed(iter/s)": 0.200848 }, { "acc": 0.74338746, "epoch": 1.3141806189751395, "grad_norm": 2.0625, "learning_rate": 2.8847230728614854e-06, "loss": 1.07128029, "memory(GiB)": 369.42, "step": 51805, "train_speed(iter/s)": 0.200851 }, { "acc": 0.76571398, "epoch": 1.3143074581430745, "grad_norm": 2.109375, "learning_rate": 2.883772957461698e-06, "loss": 0.91395531, "memory(GiB)": 369.42, "step": 51810, "train_speed(iter/s)": 0.200848 }, { "acc": 0.74536467, "epoch": 1.3144342973110097, "grad_norm": 2.4375, "learning_rate": 2.8828229351430224e-06, "loss": 1.04905128, "memory(GiB)": 369.42, "step": 51815, "train_speed(iter/s)": 0.200853 }, { "acc": 0.75289412, "epoch": 1.3145611364789447, "grad_norm": 2.390625, "learning_rate": 2.881873005947247e-06, "loss": 0.95310841, "memory(GiB)": 369.42, "step": 51820, "train_speed(iter/s)": 0.200855 }, { "acc": 0.74702702, "epoch": 1.31468797564688, "grad_norm": 2.0, "learning_rate": 2.88092316991615e-06, "loss": 1.00923166, "memory(GiB)": 369.42, "step": 51825, "train_speed(iter/s)": 0.200859 }, { "acc": 0.76018949, "epoch": 1.3148148148148149, "grad_norm": 2.21875, "learning_rate": 2.879973427091518e-06, "loss": 0.94814329, "memory(GiB)": 369.42, "step": 51830, "train_speed(iter/s)": 0.200863 }, { "acc": 0.75872889, "epoch": 1.3149416539827499, "grad_norm": 2.375, "learning_rate": 2.879023777515118e-06, "loss": 0.96664534, "memory(GiB)": 369.42, "step": 51835, "train_speed(iter/s)": 0.200867 }, { "acc": 0.74763737, "epoch": 1.3150684931506849, "grad_norm": 1.859375, "learning_rate": 2.8780742212287192e-06, "loss": 0.95498028, "memory(GiB)": 369.42, "step": 51840, "train_speed(iter/s)": 0.20087 }, { "acc": 0.73971777, "epoch": 1.31519533231862, "grad_norm": 2.21875, "learning_rate": 2.8771247582740924e-06, "loss": 1.03001595, "memory(GiB)": 369.42, "step": 51845, "train_speed(iter/s)": 0.200874 }, { "acc": 0.74957547, "epoch": 1.315322171486555, "grad_norm": 2.125, "learning_rate": 2.876175388692999e-06, "loss": 0.98111801, "memory(GiB)": 369.42, "step": 51850, "train_speed(iter/s)": 0.200877 }, { "acc": 0.7437995, "epoch": 1.31544901065449, "grad_norm": 2.453125, "learning_rate": 2.875226112527192e-06, "loss": 1.04644222, "memory(GiB)": 369.42, "step": 51855, "train_speed(iter/s)": 0.200881 }, { "acc": 0.75087337, "epoch": 1.3155758498224253, "grad_norm": 2.03125, "learning_rate": 2.8742769298184246e-06, "loss": 1.01826859, "memory(GiB)": 369.42, "step": 51860, "train_speed(iter/s)": 0.200885 }, { "acc": 0.75487976, "epoch": 1.3157026889903602, "grad_norm": 1.7578125, "learning_rate": 2.8733278406084507e-06, "loss": 0.9806221, "memory(GiB)": 369.42, "step": 51865, "train_speed(iter/s)": 0.200887 }, { "acc": 0.75896873, "epoch": 1.3158295281582952, "grad_norm": 2.203125, "learning_rate": 2.872378844939015e-06, "loss": 0.97808523, "memory(GiB)": 369.42, "step": 51870, "train_speed(iter/s)": 0.200889 }, { "acc": 0.74948664, "epoch": 1.3159563673262302, "grad_norm": 2.09375, "learning_rate": 2.871429942851853e-06, "loss": 1.01162643, "memory(GiB)": 369.42, "step": 51875, "train_speed(iter/s)": 0.200892 }, { "acc": 0.77257309, "epoch": 1.3160832064941654, "grad_norm": 2.625, "learning_rate": 2.8704811343887075e-06, "loss": 0.9056859, "memory(GiB)": 369.42, "step": 51880, "train_speed(iter/s)": 0.200894 }, { "acc": 0.75665398, "epoch": 1.3162100456621004, "grad_norm": 2.4375, "learning_rate": 2.86953241959131e-06, "loss": 0.97267418, "memory(GiB)": 369.42, "step": 51885, "train_speed(iter/s)": 0.200898 }, { "acc": 0.75336752, "epoch": 1.3163368848300356, "grad_norm": 2.046875, "learning_rate": 2.8685837985013874e-06, "loss": 0.94568329, "memory(GiB)": 369.42, "step": 51890, "train_speed(iter/s)": 0.200901 }, { "acc": 0.75979586, "epoch": 1.3164637239979706, "grad_norm": 2.09375, "learning_rate": 2.867635271160666e-06, "loss": 0.94907417, "memory(GiB)": 369.42, "step": 51895, "train_speed(iter/s)": 0.200905 }, { "acc": 0.73894272, "epoch": 1.3165905631659056, "grad_norm": 2.28125, "learning_rate": 2.8666868376108658e-06, "loss": 1.09223404, "memory(GiB)": 369.42, "step": 51900, "train_speed(iter/s)": 0.200909 }, { "acc": 0.74357147, "epoch": 1.3167174023338406, "grad_norm": 2.421875, "learning_rate": 2.865738497893703e-06, "loss": 0.98018513, "memory(GiB)": 369.42, "step": 51905, "train_speed(iter/s)": 0.20091 }, { "acc": 0.7502636, "epoch": 1.3168442415017758, "grad_norm": 2.296875, "learning_rate": 2.8647902520508896e-06, "loss": 1.01469078, "memory(GiB)": 369.42, "step": 51910, "train_speed(iter/s)": 0.200912 }, { "acc": 0.74429665, "epoch": 1.3169710806697108, "grad_norm": 2.109375, "learning_rate": 2.8638421001241346e-06, "loss": 1.01230774, "memory(GiB)": 369.42, "step": 51915, "train_speed(iter/s)": 0.200915 }, { "acc": 0.74861741, "epoch": 1.3170979198376458, "grad_norm": 2.421875, "learning_rate": 2.8628940421551404e-06, "loss": 0.99073391, "memory(GiB)": 369.42, "step": 51920, "train_speed(iter/s)": 0.200917 }, { "acc": 0.73787174, "epoch": 1.317224759005581, "grad_norm": 2.296875, "learning_rate": 2.861946078185608e-06, "loss": 1.04264135, "memory(GiB)": 369.42, "step": 51925, "train_speed(iter/s)": 0.200919 }, { "acc": 0.75430875, "epoch": 1.317351598173516, "grad_norm": 2.296875, "learning_rate": 2.860998208257233e-06, "loss": 0.97814159, "memory(GiB)": 369.42, "step": 51930, "train_speed(iter/s)": 0.20092 }, { "acc": 0.74560022, "epoch": 1.317478437341451, "grad_norm": 2.078125, "learning_rate": 2.860050432411707e-06, "loss": 1.0282239, "memory(GiB)": 369.42, "step": 51935, "train_speed(iter/s)": 0.200923 }, { "acc": 0.74878073, "epoch": 1.317605276509386, "grad_norm": 1.8828125, "learning_rate": 2.8591027506907167e-06, "loss": 1.01920776, "memory(GiB)": 369.42, "step": 51940, "train_speed(iter/s)": 0.200925 }, { "acc": 0.75809031, "epoch": 1.3177321156773212, "grad_norm": 2.0625, "learning_rate": 2.858155163135946e-06, "loss": 0.98461981, "memory(GiB)": 369.42, "step": 51945, "train_speed(iter/s)": 0.200928 }, { "acc": 0.73840675, "epoch": 1.3178589548452562, "grad_norm": 2.046875, "learning_rate": 2.857207669789074e-06, "loss": 1.02106628, "memory(GiB)": 369.42, "step": 51950, "train_speed(iter/s)": 0.200932 }, { "acc": 0.73817234, "epoch": 1.3179857940131914, "grad_norm": 2.203125, "learning_rate": 2.8562602706917754e-06, "loss": 1.00583305, "memory(GiB)": 369.42, "step": 51955, "train_speed(iter/s)": 0.200934 }, { "acc": 0.75791235, "epoch": 1.3181126331811264, "grad_norm": 2.03125, "learning_rate": 2.8553129658857215e-06, "loss": 0.96094627, "memory(GiB)": 369.42, "step": 51960, "train_speed(iter/s)": 0.200935 }, { "acc": 0.73594503, "epoch": 1.3182394723490614, "grad_norm": 2.015625, "learning_rate": 2.854365755412576e-06, "loss": 0.99034481, "memory(GiB)": 369.42, "step": 51965, "train_speed(iter/s)": 0.200935 }, { "acc": 0.76133366, "epoch": 1.3183663115169963, "grad_norm": 2.40625, "learning_rate": 2.8534186393140083e-06, "loss": 0.93078899, "memory(GiB)": 369.42, "step": 51970, "train_speed(iter/s)": 0.200938 }, { "acc": 0.75351214, "epoch": 1.3184931506849316, "grad_norm": 2.34375, "learning_rate": 2.8524716176316715e-06, "loss": 0.95619507, "memory(GiB)": 369.42, "step": 51975, "train_speed(iter/s)": 0.200939 }, { "acc": 0.73229671, "epoch": 1.3186199898528665, "grad_norm": 2.765625, "learning_rate": 2.851524690407218e-06, "loss": 1.08855038, "memory(GiB)": 369.42, "step": 51980, "train_speed(iter/s)": 0.200942 }, { "acc": 0.75480943, "epoch": 1.3187468290208018, "grad_norm": 1.8203125, "learning_rate": 2.8505778576823036e-06, "loss": 0.97550316, "memory(GiB)": 369.42, "step": 51985, "train_speed(iter/s)": 0.200944 }, { "acc": 0.74240618, "epoch": 1.3188736681887367, "grad_norm": 2.578125, "learning_rate": 2.849631119498573e-06, "loss": 1.03709087, "memory(GiB)": 369.42, "step": 51990, "train_speed(iter/s)": 0.200947 }, { "acc": 0.73911018, "epoch": 1.3190005073566717, "grad_norm": 1.9609375, "learning_rate": 2.8486844758976652e-06, "loss": 1.03278189, "memory(GiB)": 369.42, "step": 51995, "train_speed(iter/s)": 0.20095 }, { "acc": 0.7620553, "epoch": 1.3191273465246067, "grad_norm": 2.28125, "learning_rate": 2.8477379269212157e-06, "loss": 0.94143219, "memory(GiB)": 369.42, "step": 52000, "train_speed(iter/s)": 0.200953 }, { "epoch": 1.3191273465246067, "eval_acc": 0.7379470997966726, "eval_loss": 0.969680905342102, "eval_runtime": 385.8605, "eval_samples_per_second": 16.509, "eval_steps_per_second": 8.254, "step": 52000 }, { "acc": 0.7540103, "epoch": 1.319254185692542, "grad_norm": 2.046875, "learning_rate": 2.846791472610865e-06, "loss": 0.98085346, "memory(GiB)": 369.42, "step": 52005, "train_speed(iter/s)": 0.200404 }, { "acc": 0.74155135, "epoch": 1.319381024860477, "grad_norm": 2.1875, "learning_rate": 2.845845113008239e-06, "loss": 1.01563759, "memory(GiB)": 369.42, "step": 52010, "train_speed(iter/s)": 0.200409 }, { "acc": 0.75352015, "epoch": 1.319507864028412, "grad_norm": 2.046875, "learning_rate": 2.84489884815496e-06, "loss": 0.99570704, "memory(GiB)": 369.42, "step": 52015, "train_speed(iter/s)": 0.200413 }, { "acc": 0.73571825, "epoch": 1.3196347031963471, "grad_norm": 1.921875, "learning_rate": 2.843952678092653e-06, "loss": 0.97767506, "memory(GiB)": 369.42, "step": 52020, "train_speed(iter/s)": 0.200416 }, { "acc": 0.74141469, "epoch": 1.3197615423642821, "grad_norm": 2.34375, "learning_rate": 2.8430066028629328e-06, "loss": 1.00304737, "memory(GiB)": 369.42, "step": 52025, "train_speed(iter/s)": 0.20042 }, { "acc": 0.75590186, "epoch": 1.319888381532217, "grad_norm": 2.03125, "learning_rate": 2.842060622507415e-06, "loss": 0.95908298, "memory(GiB)": 369.42, "step": 52030, "train_speed(iter/s)": 0.200424 }, { "acc": 0.75942173, "epoch": 1.320015220700152, "grad_norm": 1.9921875, "learning_rate": 2.841114737067702e-06, "loss": 0.94152241, "memory(GiB)": 369.42, "step": 52035, "train_speed(iter/s)": 0.200427 }, { "acc": 0.76531734, "epoch": 1.3201420598680873, "grad_norm": 1.9453125, "learning_rate": 2.840168946585402e-06, "loss": 0.90878325, "memory(GiB)": 369.42, "step": 52040, "train_speed(iter/s)": 0.20043 }, { "acc": 0.77215481, "epoch": 1.3202688990360223, "grad_norm": 2.171875, "learning_rate": 2.8392232511021158e-06, "loss": 0.92684097, "memory(GiB)": 369.42, "step": 52045, "train_speed(iter/s)": 0.200432 }, { "acc": 0.74414206, "epoch": 1.3203957382039575, "grad_norm": 2.109375, "learning_rate": 2.8382776506594385e-06, "loss": 1.03096046, "memory(GiB)": 369.42, "step": 52050, "train_speed(iter/s)": 0.200435 }, { "acc": 0.75491138, "epoch": 1.3205225773718925, "grad_norm": 2.125, "learning_rate": 2.837332145298961e-06, "loss": 0.96834002, "memory(GiB)": 369.42, "step": 52055, "train_speed(iter/s)": 0.20044 }, { "acc": 0.74641728, "epoch": 1.3206494165398275, "grad_norm": 2.71875, "learning_rate": 2.836386735062271e-06, "loss": 1.02411137, "memory(GiB)": 369.42, "step": 52060, "train_speed(iter/s)": 0.200443 }, { "acc": 0.76676064, "epoch": 1.3207762557077625, "grad_norm": 2.359375, "learning_rate": 2.835441419990953e-06, "loss": 0.931147, "memory(GiB)": 369.42, "step": 52065, "train_speed(iter/s)": 0.200446 }, { "acc": 0.74399748, "epoch": 1.3209030948756977, "grad_norm": 2.015625, "learning_rate": 2.834496200126585e-06, "loss": 1.02095623, "memory(GiB)": 369.42, "step": 52070, "train_speed(iter/s)": 0.200449 }, { "acc": 0.73937626, "epoch": 1.3210299340436327, "grad_norm": 2.390625, "learning_rate": 2.8335510755107426e-06, "loss": 1.06331758, "memory(GiB)": 369.42, "step": 52075, "train_speed(iter/s)": 0.200454 }, { "acc": 0.75867624, "epoch": 1.3211567732115677, "grad_norm": 2.375, "learning_rate": 2.8326060461849966e-06, "loss": 1.0021059, "memory(GiB)": 369.42, "step": 52080, "train_speed(iter/s)": 0.200456 }, { "acc": 0.75698395, "epoch": 1.3212836123795029, "grad_norm": 2.703125, "learning_rate": 2.8316611121909126e-06, "loss": 0.96322422, "memory(GiB)": 369.42, "step": 52085, "train_speed(iter/s)": 0.200458 }, { "acc": 0.74727154, "epoch": 1.3214104515474379, "grad_norm": 2.15625, "learning_rate": 2.8307162735700544e-06, "loss": 1.07210379, "memory(GiB)": 369.42, "step": 52090, "train_speed(iter/s)": 0.200462 }, { "acc": 0.76108341, "epoch": 1.3215372907153728, "grad_norm": 2.109375, "learning_rate": 2.8297715303639796e-06, "loss": 0.91478863, "memory(GiB)": 369.42, "step": 52095, "train_speed(iter/s)": 0.200463 }, { "acc": 0.73441429, "epoch": 1.3216641298833078, "grad_norm": 1.921875, "learning_rate": 2.8288268826142423e-06, "loss": 1.04629049, "memory(GiB)": 369.42, "step": 52100, "train_speed(iter/s)": 0.200465 }, { "acc": 0.75218201, "epoch": 1.321790969051243, "grad_norm": 2.359375, "learning_rate": 2.8278823303623905e-06, "loss": 0.99902573, "memory(GiB)": 369.42, "step": 52105, "train_speed(iter/s)": 0.200468 }, { "acc": 0.74474835, "epoch": 1.321917808219178, "grad_norm": 2.125, "learning_rate": 2.8269378736499754e-06, "loss": 1.02879276, "memory(GiB)": 369.42, "step": 52110, "train_speed(iter/s)": 0.200471 }, { "acc": 0.75169883, "epoch": 1.3220446473871132, "grad_norm": 2.296875, "learning_rate": 2.8259935125185323e-06, "loss": 1.03577747, "memory(GiB)": 369.42, "step": 52115, "train_speed(iter/s)": 0.200474 }, { "acc": 0.74278021, "epoch": 1.3221714865550482, "grad_norm": 2.484375, "learning_rate": 2.8250492470096008e-06, "loss": 1.0521349, "memory(GiB)": 369.42, "step": 52120, "train_speed(iter/s)": 0.200478 }, { "acc": 0.76071625, "epoch": 1.3222983257229832, "grad_norm": 2.5625, "learning_rate": 2.824105077164712e-06, "loss": 0.95634251, "memory(GiB)": 369.42, "step": 52125, "train_speed(iter/s)": 0.20048 }, { "acc": 0.73999128, "epoch": 1.3224251648909182, "grad_norm": 1.953125, "learning_rate": 2.823161003025401e-06, "loss": 1.01686354, "memory(GiB)": 369.42, "step": 52130, "train_speed(iter/s)": 0.200483 }, { "acc": 0.74980516, "epoch": 1.3225520040588534, "grad_norm": 2.9375, "learning_rate": 2.822217024633186e-06, "loss": 1.04395428, "memory(GiB)": 369.42, "step": 52135, "train_speed(iter/s)": 0.200486 }, { "acc": 0.75570316, "epoch": 1.3226788432267884, "grad_norm": 2.125, "learning_rate": 2.821273142029587e-06, "loss": 0.99275475, "memory(GiB)": 369.42, "step": 52140, "train_speed(iter/s)": 0.20049 }, { "acc": 0.73627634, "epoch": 1.3228056823947236, "grad_norm": 2.4375, "learning_rate": 2.820329355256124e-06, "loss": 1.01723995, "memory(GiB)": 369.42, "step": 52145, "train_speed(iter/s)": 0.200493 }, { "acc": 0.75344896, "epoch": 1.3229325215626586, "grad_norm": 2.46875, "learning_rate": 2.8193856643543106e-06, "loss": 0.99166946, "memory(GiB)": 369.42, "step": 52150, "train_speed(iter/s)": 0.200494 }, { "acc": 0.75051918, "epoch": 1.3230593607305936, "grad_norm": 2.71875, "learning_rate": 2.8184420693656468e-06, "loss": 0.97729874, "memory(GiB)": 369.42, "step": 52155, "train_speed(iter/s)": 0.200498 }, { "acc": 0.75827312, "epoch": 1.3231861998985286, "grad_norm": 2.140625, "learning_rate": 2.817498570331643e-06, "loss": 0.93767509, "memory(GiB)": 369.42, "step": 52160, "train_speed(iter/s)": 0.2005 }, { "acc": 0.75455151, "epoch": 1.3233130390664638, "grad_norm": 2.03125, "learning_rate": 2.816555167293795e-06, "loss": 0.94467258, "memory(GiB)": 369.42, "step": 52165, "train_speed(iter/s)": 0.200504 }, { "acc": 0.75045071, "epoch": 1.3234398782343988, "grad_norm": 2.078125, "learning_rate": 2.815611860293603e-06, "loss": 1.03234882, "memory(GiB)": 369.42, "step": 52170, "train_speed(iter/s)": 0.200501 }, { "acc": 0.75881157, "epoch": 1.3235667174023338, "grad_norm": 2.203125, "learning_rate": 2.814668649372549e-06, "loss": 0.95535545, "memory(GiB)": 369.42, "step": 52175, "train_speed(iter/s)": 0.200504 }, { "acc": 0.75043383, "epoch": 1.323693556570269, "grad_norm": 2.453125, "learning_rate": 2.8137255345721266e-06, "loss": 1.0305912, "memory(GiB)": 369.42, "step": 52180, "train_speed(iter/s)": 0.200507 }, { "acc": 0.75295181, "epoch": 1.323820395738204, "grad_norm": 2.4375, "learning_rate": 2.8127825159338163e-06, "loss": 1.00566816, "memory(GiB)": 369.42, "step": 52185, "train_speed(iter/s)": 0.200509 }, { "acc": 0.75430331, "epoch": 1.323947234906139, "grad_norm": 2.296875, "learning_rate": 2.8118395934990962e-06, "loss": 1.0302392, "memory(GiB)": 369.42, "step": 52190, "train_speed(iter/s)": 0.200511 }, { "acc": 0.74246063, "epoch": 1.324074074074074, "grad_norm": 2.265625, "learning_rate": 2.81089676730944e-06, "loss": 0.99811096, "memory(GiB)": 369.42, "step": 52195, "train_speed(iter/s)": 0.200515 }, { "acc": 0.75735855, "epoch": 1.3242009132420092, "grad_norm": 1.9140625, "learning_rate": 2.8099540374063185e-06, "loss": 0.94466114, "memory(GiB)": 369.42, "step": 52200, "train_speed(iter/s)": 0.200517 }, { "acc": 0.74776859, "epoch": 1.3243277524099442, "grad_norm": 1.859375, "learning_rate": 2.8090114038311956e-06, "loss": 1.01377106, "memory(GiB)": 369.42, "step": 52205, "train_speed(iter/s)": 0.200521 }, { "acc": 0.7598805, "epoch": 1.3244545915778794, "grad_norm": 2.375, "learning_rate": 2.8080688666255328e-06, "loss": 0.95327644, "memory(GiB)": 369.42, "step": 52210, "train_speed(iter/s)": 0.200521 }, { "acc": 0.74890437, "epoch": 1.3245814307458144, "grad_norm": 2.015625, "learning_rate": 2.8071264258307884e-06, "loss": 0.96740208, "memory(GiB)": 369.42, "step": 52215, "train_speed(iter/s)": 0.200522 }, { "acc": 0.75578375, "epoch": 1.3247082699137493, "grad_norm": 2.109375, "learning_rate": 2.8061840814884133e-06, "loss": 0.95523796, "memory(GiB)": 369.42, "step": 52220, "train_speed(iter/s)": 0.200524 }, { "acc": 0.76899619, "epoch": 1.3248351090816843, "grad_norm": 1.90625, "learning_rate": 2.805241833639858e-06, "loss": 0.96742039, "memory(GiB)": 369.42, "step": 52225, "train_speed(iter/s)": 0.200529 }, { "acc": 0.74345903, "epoch": 1.3249619482496195, "grad_norm": 1.875, "learning_rate": 2.804299682326565e-06, "loss": 1.01398296, "memory(GiB)": 369.42, "step": 52230, "train_speed(iter/s)": 0.200531 }, { "acc": 0.75417943, "epoch": 1.3250887874175545, "grad_norm": 2.15625, "learning_rate": 2.8033576275899752e-06, "loss": 1.02142715, "memory(GiB)": 369.42, "step": 52235, "train_speed(iter/s)": 0.200533 }, { "acc": 0.74693704, "epoch": 1.3252156265854895, "grad_norm": 2.109375, "learning_rate": 2.8024156694715242e-06, "loss": 1.02281609, "memory(GiB)": 369.42, "step": 52240, "train_speed(iter/s)": 0.200535 }, { "acc": 0.75142498, "epoch": 1.3253424657534247, "grad_norm": 2.078125, "learning_rate": 2.8014738080126424e-06, "loss": 1.0080246, "memory(GiB)": 369.42, "step": 52245, "train_speed(iter/s)": 0.200538 }, { "acc": 0.75673428, "epoch": 1.3254693049213597, "grad_norm": 1.8515625, "learning_rate": 2.8005320432547612e-06, "loss": 0.97799339, "memory(GiB)": 369.42, "step": 52250, "train_speed(iter/s)": 0.200541 }, { "acc": 0.74362288, "epoch": 1.3255961440892947, "grad_norm": 2.359375, "learning_rate": 2.7995903752392993e-06, "loss": 1.01005163, "memory(GiB)": 369.42, "step": 52255, "train_speed(iter/s)": 0.200544 }, { "acc": 0.77101173, "epoch": 1.3257229832572297, "grad_norm": 2.359375, "learning_rate": 2.7986488040076764e-06, "loss": 0.92254133, "memory(GiB)": 369.42, "step": 52260, "train_speed(iter/s)": 0.200547 }, { "acc": 0.7483737, "epoch": 1.325849822425165, "grad_norm": 2.5625, "learning_rate": 2.797707329601306e-06, "loss": 1.04189053, "memory(GiB)": 369.42, "step": 52265, "train_speed(iter/s)": 0.20055 }, { "acc": 0.75085888, "epoch": 1.3259766615931, "grad_norm": 2.484375, "learning_rate": 2.7967659520616032e-06, "loss": 0.9875761, "memory(GiB)": 369.42, "step": 52270, "train_speed(iter/s)": 0.200554 }, { "acc": 0.75674062, "epoch": 1.3261035007610351, "grad_norm": 1.8515625, "learning_rate": 2.7958246714299685e-06, "loss": 0.88669968, "memory(GiB)": 369.42, "step": 52275, "train_speed(iter/s)": 0.200555 }, { "acc": 0.74266114, "epoch": 1.32623033992897, "grad_norm": 1.796875, "learning_rate": 2.7948834877478035e-06, "loss": 1.01470528, "memory(GiB)": 369.42, "step": 52280, "train_speed(iter/s)": 0.200558 }, { "acc": 0.73692102, "epoch": 1.326357179096905, "grad_norm": 2.421875, "learning_rate": 2.7939424010565107e-06, "loss": 1.04753723, "memory(GiB)": 369.42, "step": 52285, "train_speed(iter/s)": 0.200562 }, { "acc": 0.75494342, "epoch": 1.32648401826484, "grad_norm": 2.0625, "learning_rate": 2.793001411397482e-06, "loss": 1.02115631, "memory(GiB)": 369.42, "step": 52290, "train_speed(iter/s)": 0.200566 }, { "acc": 0.75488486, "epoch": 1.3266108574327753, "grad_norm": 2.484375, "learning_rate": 2.792060518812103e-06, "loss": 0.98455334, "memory(GiB)": 369.42, "step": 52295, "train_speed(iter/s)": 0.200569 }, { "acc": 0.75635414, "epoch": 1.3267376966007103, "grad_norm": 2.453125, "learning_rate": 2.7911197233417574e-06, "loss": 0.97218752, "memory(GiB)": 369.42, "step": 52300, "train_speed(iter/s)": 0.200571 }, { "acc": 0.74676776, "epoch": 1.3268645357686455, "grad_norm": 2.25, "learning_rate": 2.790179025027831e-06, "loss": 1.02323895, "memory(GiB)": 369.42, "step": 52305, "train_speed(iter/s)": 0.200573 }, { "acc": 0.73453264, "epoch": 1.3269913749365805, "grad_norm": 2.578125, "learning_rate": 2.789238423911699e-06, "loss": 1.02554398, "memory(GiB)": 369.42, "step": 52310, "train_speed(iter/s)": 0.200577 }, { "acc": 0.75828004, "epoch": 1.3271182141045155, "grad_norm": 2.375, "learning_rate": 2.788297920034727e-06, "loss": 0.96068954, "memory(GiB)": 369.42, "step": 52315, "train_speed(iter/s)": 0.20058 }, { "acc": 0.75572343, "epoch": 1.3272450532724505, "grad_norm": 2.5, "learning_rate": 2.78735751343829e-06, "loss": 1.01154137, "memory(GiB)": 369.42, "step": 52320, "train_speed(iter/s)": 0.200583 }, { "acc": 0.74534926, "epoch": 1.3273718924403857, "grad_norm": 2.546875, "learning_rate": 2.786417204163748e-06, "loss": 0.98965292, "memory(GiB)": 369.42, "step": 52325, "train_speed(iter/s)": 0.200587 }, { "acc": 0.75013294, "epoch": 1.3274987316083207, "grad_norm": 1.890625, "learning_rate": 2.7854769922524593e-06, "loss": 0.99085102, "memory(GiB)": 369.42, "step": 52330, "train_speed(iter/s)": 0.20059 }, { "acc": 0.7584084, "epoch": 1.3276255707762556, "grad_norm": 2.171875, "learning_rate": 2.7845368777457803e-06, "loss": 0.9636034, "memory(GiB)": 369.42, "step": 52335, "train_speed(iter/s)": 0.200592 }, { "acc": 0.7515667, "epoch": 1.3277524099441909, "grad_norm": 2.125, "learning_rate": 2.7835968606850616e-06, "loss": 1.03216038, "memory(GiB)": 369.42, "step": 52340, "train_speed(iter/s)": 0.200593 }, { "acc": 0.74950018, "epoch": 1.3278792491121258, "grad_norm": 2.125, "learning_rate": 2.782656941111648e-06, "loss": 0.97017899, "memory(GiB)": 369.42, "step": 52345, "train_speed(iter/s)": 0.200597 }, { "acc": 0.76520972, "epoch": 1.3280060882800608, "grad_norm": 3.296875, "learning_rate": 2.7817171190668812e-06, "loss": 0.93024492, "memory(GiB)": 369.42, "step": 52350, "train_speed(iter/s)": 0.200599 }, { "acc": 0.74089804, "epoch": 1.3281329274479958, "grad_norm": 2.40625, "learning_rate": 2.7807773945921e-06, "loss": 1.02651653, "memory(GiB)": 369.42, "step": 52355, "train_speed(iter/s)": 0.200602 }, { "acc": 0.76553183, "epoch": 1.328259766615931, "grad_norm": 1.9921875, "learning_rate": 2.7798377677286363e-06, "loss": 0.95504837, "memory(GiB)": 369.42, "step": 52360, "train_speed(iter/s)": 0.200606 }, { "acc": 0.75200319, "epoch": 1.328386605783866, "grad_norm": 2.25, "learning_rate": 2.778898238517821e-06, "loss": 0.95349312, "memory(GiB)": 369.42, "step": 52365, "train_speed(iter/s)": 0.200608 }, { "acc": 0.7512023, "epoch": 1.3285134449518012, "grad_norm": 2.078125, "learning_rate": 2.7779588070009767e-06, "loss": 1.00357475, "memory(GiB)": 369.42, "step": 52370, "train_speed(iter/s)": 0.200611 }, { "acc": 0.75186515, "epoch": 1.3286402841197362, "grad_norm": 2.140625, "learning_rate": 2.7770194732194256e-06, "loss": 0.99136925, "memory(GiB)": 369.42, "step": 52375, "train_speed(iter/s)": 0.200613 }, { "acc": 0.74835892, "epoch": 1.3287671232876712, "grad_norm": 1.9140625, "learning_rate": 2.7760802372144825e-06, "loss": 1.02013378, "memory(GiB)": 369.42, "step": 52380, "train_speed(iter/s)": 0.200616 }, { "acc": 0.75894113, "epoch": 1.3288939624556062, "grad_norm": 2.375, "learning_rate": 2.7751410990274596e-06, "loss": 0.98430653, "memory(GiB)": 369.42, "step": 52385, "train_speed(iter/s)": 0.200619 }, { "acc": 0.75806522, "epoch": 1.3290208016235414, "grad_norm": 2.1875, "learning_rate": 2.774202058699664e-06, "loss": 0.89898701, "memory(GiB)": 369.42, "step": 52390, "train_speed(iter/s)": 0.200621 }, { "acc": 0.74962053, "epoch": 1.3291476407914764, "grad_norm": 1.9609375, "learning_rate": 2.7732631162724005e-06, "loss": 0.97683811, "memory(GiB)": 369.42, "step": 52395, "train_speed(iter/s)": 0.200624 }, { "acc": 0.75265589, "epoch": 1.3292744799594114, "grad_norm": 1.640625, "learning_rate": 2.772324271786966e-06, "loss": 0.96924076, "memory(GiB)": 369.42, "step": 52400, "train_speed(iter/s)": 0.200626 }, { "acc": 0.75728931, "epoch": 1.3294013191273466, "grad_norm": 2.25, "learning_rate": 2.7713855252846545e-06, "loss": 1.01127453, "memory(GiB)": 369.42, "step": 52405, "train_speed(iter/s)": 0.20063 }, { "acc": 0.75677519, "epoch": 1.3295281582952816, "grad_norm": 2.09375, "learning_rate": 2.7704468768067616e-06, "loss": 0.93010817, "memory(GiB)": 369.42, "step": 52410, "train_speed(iter/s)": 0.200633 }, { "acc": 0.73929191, "epoch": 1.3296549974632166, "grad_norm": 2.625, "learning_rate": 2.7695083263945664e-06, "loss": 1.05158787, "memory(GiB)": 369.42, "step": 52415, "train_speed(iter/s)": 0.200635 }, { "acc": 0.73727899, "epoch": 1.3297818366311516, "grad_norm": 2.1875, "learning_rate": 2.7685698740893516e-06, "loss": 1.04568748, "memory(GiB)": 369.42, "step": 52420, "train_speed(iter/s)": 0.200638 }, { "acc": 0.75002255, "epoch": 1.3299086757990868, "grad_norm": 2.4375, "learning_rate": 2.7676315199323995e-06, "loss": 0.97147789, "memory(GiB)": 369.42, "step": 52425, "train_speed(iter/s)": 0.200641 }, { "acc": 0.767381, "epoch": 1.3300355149670218, "grad_norm": 2.5, "learning_rate": 2.7666932639649814e-06, "loss": 0.91322947, "memory(GiB)": 369.42, "step": 52430, "train_speed(iter/s)": 0.200644 }, { "acc": 0.74478369, "epoch": 1.330162354134957, "grad_norm": 2.0625, "learning_rate": 2.765755106228362e-06, "loss": 0.98896198, "memory(GiB)": 369.42, "step": 52435, "train_speed(iter/s)": 0.200647 }, { "acc": 0.74940958, "epoch": 1.330289193302892, "grad_norm": 2.15625, "learning_rate": 2.764817046763807e-06, "loss": 0.94534349, "memory(GiB)": 369.42, "step": 52440, "train_speed(iter/s)": 0.20065 }, { "acc": 0.75259724, "epoch": 1.330416032470827, "grad_norm": 2.078125, "learning_rate": 2.7638790856125786e-06, "loss": 1.01850491, "memory(GiB)": 369.42, "step": 52445, "train_speed(iter/s)": 0.200654 }, { "acc": 0.74199524, "epoch": 1.330542871638762, "grad_norm": 2.1875, "learning_rate": 2.7629412228159346e-06, "loss": 1.02138538, "memory(GiB)": 369.42, "step": 52450, "train_speed(iter/s)": 0.200656 }, { "acc": 0.74054646, "epoch": 1.3306697108066972, "grad_norm": 2.453125, "learning_rate": 2.762003458415119e-06, "loss": 0.98851385, "memory(GiB)": 369.42, "step": 52455, "train_speed(iter/s)": 0.200658 }, { "acc": 0.75250931, "epoch": 1.3307965499746321, "grad_norm": 2.0625, "learning_rate": 2.7610657924513853e-06, "loss": 0.99604731, "memory(GiB)": 369.42, "step": 52460, "train_speed(iter/s)": 0.200662 }, { "acc": 0.74826808, "epoch": 1.3309233891425674, "grad_norm": 2.203125, "learning_rate": 2.7601282249659737e-06, "loss": 1.02246237, "memory(GiB)": 369.42, "step": 52465, "train_speed(iter/s)": 0.200665 }, { "acc": 0.74220972, "epoch": 1.3310502283105023, "grad_norm": 2.34375, "learning_rate": 2.759190756000126e-06, "loss": 1.0311491, "memory(GiB)": 369.42, "step": 52470, "train_speed(iter/s)": 0.200667 }, { "acc": 0.74229021, "epoch": 1.3311770674784373, "grad_norm": 2.71875, "learning_rate": 2.7582533855950687e-06, "loss": 1.03521538, "memory(GiB)": 369.42, "step": 52475, "train_speed(iter/s)": 0.20067 }, { "acc": 0.74868994, "epoch": 1.3313039066463723, "grad_norm": 1.96875, "learning_rate": 2.757316113792038e-06, "loss": 1.00348282, "memory(GiB)": 369.42, "step": 52480, "train_speed(iter/s)": 0.200674 }, { "acc": 0.74379387, "epoch": 1.3314307458143075, "grad_norm": 1.9375, "learning_rate": 2.756378940632258e-06, "loss": 0.99945002, "memory(GiB)": 369.42, "step": 52485, "train_speed(iter/s)": 0.200676 }, { "acc": 0.74408894, "epoch": 1.3315575849822425, "grad_norm": 2.390625, "learning_rate": 2.755441866156949e-06, "loss": 1.0177597, "memory(GiB)": 369.42, "step": 52490, "train_speed(iter/s)": 0.20068 }, { "acc": 0.75056953, "epoch": 1.3316844241501775, "grad_norm": 2.171875, "learning_rate": 2.7545048904073278e-06, "loss": 1.03394337, "memory(GiB)": 369.42, "step": 52495, "train_speed(iter/s)": 0.200683 }, { "acc": 0.75674639, "epoch": 1.3318112633181127, "grad_norm": 2.0625, "learning_rate": 2.7535680134246067e-06, "loss": 0.97181606, "memory(GiB)": 369.42, "step": 52500, "train_speed(iter/s)": 0.200686 }, { "acc": 0.76596665, "epoch": 1.3319381024860477, "grad_norm": 2.03125, "learning_rate": 2.752631235249995e-06, "loss": 0.99296494, "memory(GiB)": 369.42, "step": 52505, "train_speed(iter/s)": 0.200686 }, { "acc": 0.76083403, "epoch": 1.3320649416539827, "grad_norm": 2.421875, "learning_rate": 2.7516945559246945e-06, "loss": 0.9266613, "memory(GiB)": 369.42, "step": 52510, "train_speed(iter/s)": 0.20069 }, { "acc": 0.75100064, "epoch": 1.3321917808219177, "grad_norm": 2.125, "learning_rate": 2.7507579754899053e-06, "loss": 0.96393681, "memory(GiB)": 369.42, "step": 52515, "train_speed(iter/s)": 0.200691 }, { "acc": 0.75843334, "epoch": 1.332318619989853, "grad_norm": 2.28125, "learning_rate": 2.749821493986823e-06, "loss": 0.92301607, "memory(GiB)": 369.42, "step": 52520, "train_speed(iter/s)": 0.200694 }, { "acc": 0.75209951, "epoch": 1.332445459157788, "grad_norm": 2.453125, "learning_rate": 2.748885111456637e-06, "loss": 1.00021954, "memory(GiB)": 369.42, "step": 52525, "train_speed(iter/s)": 0.200697 }, { "acc": 0.76012731, "epoch": 1.332572298325723, "grad_norm": 2.609375, "learning_rate": 2.7479488279405354e-06, "loss": 0.99156628, "memory(GiB)": 369.42, "step": 52530, "train_speed(iter/s)": 0.200702 }, { "acc": 0.75700879, "epoch": 1.332699137493658, "grad_norm": 2.421875, "learning_rate": 2.7470126434796984e-06, "loss": 0.95589428, "memory(GiB)": 369.42, "step": 52535, "train_speed(iter/s)": 0.200705 }, { "acc": 0.75013323, "epoch": 1.332825976661593, "grad_norm": 2.203125, "learning_rate": 2.746076558115304e-06, "loss": 0.98071709, "memory(GiB)": 369.42, "step": 52540, "train_speed(iter/s)": 0.200707 }, { "acc": 0.75958376, "epoch": 1.332952815829528, "grad_norm": 2.4375, "learning_rate": 2.7451405718885237e-06, "loss": 0.97673302, "memory(GiB)": 369.42, "step": 52545, "train_speed(iter/s)": 0.20071 }, { "acc": 0.74227524, "epoch": 1.3330796549974633, "grad_norm": 2.140625, "learning_rate": 2.7442046848405328e-06, "loss": 0.98004684, "memory(GiB)": 369.42, "step": 52550, "train_speed(iter/s)": 0.200712 }, { "acc": 0.76106367, "epoch": 1.3332064941653983, "grad_norm": 1.9375, "learning_rate": 2.743268897012489e-06, "loss": 0.95055628, "memory(GiB)": 369.42, "step": 52555, "train_speed(iter/s)": 0.200714 }, { "acc": 0.74213362, "epoch": 1.3333333333333333, "grad_norm": 2.5, "learning_rate": 2.7423332084455543e-06, "loss": 1.00251102, "memory(GiB)": 369.42, "step": 52560, "train_speed(iter/s)": 0.200713 }, { "acc": 0.77329507, "epoch": 1.3334601725012685, "grad_norm": 2.3125, "learning_rate": 2.741397619180883e-06, "loss": 0.91810608, "memory(GiB)": 369.42, "step": 52565, "train_speed(iter/s)": 0.200716 }, { "acc": 0.75456266, "epoch": 1.3335870116692035, "grad_norm": 2.109375, "learning_rate": 2.740462129259633e-06, "loss": 1.01795864, "memory(GiB)": 369.42, "step": 52570, "train_speed(iter/s)": 0.20072 }, { "acc": 0.75038395, "epoch": 1.3337138508371384, "grad_norm": 1.9921875, "learning_rate": 2.739526738722944e-06, "loss": 1.04287949, "memory(GiB)": 369.42, "step": 52575, "train_speed(iter/s)": 0.200723 }, { "acc": 0.75008583, "epoch": 1.3338406900050734, "grad_norm": 1.8984375, "learning_rate": 2.738591447611959e-06, "loss": 0.95875092, "memory(GiB)": 369.42, "step": 52580, "train_speed(iter/s)": 0.200725 }, { "acc": 0.73887315, "epoch": 1.3339675291730086, "grad_norm": 2.21875, "learning_rate": 2.7376562559678214e-06, "loss": 0.99549427, "memory(GiB)": 369.42, "step": 52585, "train_speed(iter/s)": 0.200728 }, { "acc": 0.75170732, "epoch": 1.3340943683409436, "grad_norm": 2.546875, "learning_rate": 2.7367211638316637e-06, "loss": 1.04936666, "memory(GiB)": 369.42, "step": 52590, "train_speed(iter/s)": 0.200731 }, { "acc": 0.75888052, "epoch": 1.3342212075088788, "grad_norm": 1.9453125, "learning_rate": 2.735786171244611e-06, "loss": 0.9629015, "memory(GiB)": 369.42, "step": 52595, "train_speed(iter/s)": 0.200734 }, { "acc": 0.74001722, "epoch": 1.3343480466768138, "grad_norm": 2.1875, "learning_rate": 2.7348512782477922e-06, "loss": 0.99132843, "memory(GiB)": 369.42, "step": 52600, "train_speed(iter/s)": 0.200736 }, { "acc": 0.76291342, "epoch": 1.3344748858447488, "grad_norm": 2.34375, "learning_rate": 2.7339164848823287e-06, "loss": 0.97276783, "memory(GiB)": 369.42, "step": 52605, "train_speed(iter/s)": 0.200737 }, { "acc": 0.75373707, "epoch": 1.3346017250126838, "grad_norm": 2.21875, "learning_rate": 2.7329817911893365e-06, "loss": 0.99334574, "memory(GiB)": 369.42, "step": 52610, "train_speed(iter/s)": 0.20074 }, { "acc": 0.74709563, "epoch": 1.334728564180619, "grad_norm": 2.21875, "learning_rate": 2.7320471972099226e-06, "loss": 0.99733191, "memory(GiB)": 369.42, "step": 52615, "train_speed(iter/s)": 0.200741 }, { "acc": 0.74777946, "epoch": 1.334855403348554, "grad_norm": 2.015625, "learning_rate": 2.7311127029852007e-06, "loss": 0.95846348, "memory(GiB)": 369.42, "step": 52620, "train_speed(iter/s)": 0.200745 }, { "acc": 0.73325272, "epoch": 1.3349822425164892, "grad_norm": 2.15625, "learning_rate": 2.7301783085562726e-06, "loss": 1.07242794, "memory(GiB)": 369.42, "step": 52625, "train_speed(iter/s)": 0.200748 }, { "acc": 0.74942627, "epoch": 1.3351090816844242, "grad_norm": 2.125, "learning_rate": 2.7292440139642364e-06, "loss": 0.99502392, "memory(GiB)": 369.42, "step": 52630, "train_speed(iter/s)": 0.200749 }, { "acc": 0.74916306, "epoch": 1.3352359208523592, "grad_norm": 2.125, "learning_rate": 2.7283098192501855e-06, "loss": 0.97363253, "memory(GiB)": 369.42, "step": 52635, "train_speed(iter/s)": 0.200753 }, { "acc": 0.75551548, "epoch": 1.3353627600202942, "grad_norm": 2.140625, "learning_rate": 2.7273757244552124e-06, "loss": 0.92417507, "memory(GiB)": 369.42, "step": 52640, "train_speed(iter/s)": 0.200757 }, { "acc": 0.75179391, "epoch": 1.3354895991882294, "grad_norm": 2.1875, "learning_rate": 2.726441729620401e-06, "loss": 0.99228916, "memory(GiB)": 369.42, "step": 52645, "train_speed(iter/s)": 0.200759 }, { "acc": 0.75590458, "epoch": 1.3356164383561644, "grad_norm": 2.0625, "learning_rate": 2.725507834786833e-06, "loss": 0.99890175, "memory(GiB)": 369.42, "step": 52650, "train_speed(iter/s)": 0.200763 }, { "acc": 0.76200213, "epoch": 1.3357432775240994, "grad_norm": 2.46875, "learning_rate": 2.7245740399955857e-06, "loss": 0.97314863, "memory(GiB)": 369.42, "step": 52655, "train_speed(iter/s)": 0.200766 }, { "acc": 0.75537066, "epoch": 1.3358701166920346, "grad_norm": 2.0625, "learning_rate": 2.72364034528773e-06, "loss": 0.97632895, "memory(GiB)": 369.42, "step": 52660, "train_speed(iter/s)": 0.200769 }, { "acc": 0.75716677, "epoch": 1.3359969558599696, "grad_norm": 2.234375, "learning_rate": 2.722706750704337e-06, "loss": 0.97449284, "memory(GiB)": 369.42, "step": 52665, "train_speed(iter/s)": 0.200772 }, { "acc": 0.74450359, "epoch": 1.3361237950279046, "grad_norm": 2.15625, "learning_rate": 2.7217732562864673e-06, "loss": 0.9910717, "memory(GiB)": 369.42, "step": 52670, "train_speed(iter/s)": 0.200776 }, { "acc": 0.73750496, "epoch": 1.3362506341958396, "grad_norm": 2.84375, "learning_rate": 2.720839862075181e-06, "loss": 1.04774837, "memory(GiB)": 369.42, "step": 52675, "train_speed(iter/s)": 0.20078 }, { "acc": 0.75005059, "epoch": 1.3363774733637748, "grad_norm": 1.875, "learning_rate": 2.7199065681115344e-06, "loss": 1.0007782, "memory(GiB)": 369.42, "step": 52680, "train_speed(iter/s)": 0.200782 }, { "acc": 0.75185671, "epoch": 1.3365043125317098, "grad_norm": 2.453125, "learning_rate": 2.7189733744365742e-06, "loss": 0.98634453, "memory(GiB)": 369.42, "step": 52685, "train_speed(iter/s)": 0.200785 }, { "acc": 0.75892334, "epoch": 1.336631151699645, "grad_norm": 2.1875, "learning_rate": 2.718040281091353e-06, "loss": 0.94746475, "memory(GiB)": 369.42, "step": 52690, "train_speed(iter/s)": 0.200789 }, { "acc": 0.75636387, "epoch": 1.33675799086758, "grad_norm": 2.234375, "learning_rate": 2.717107288116906e-06, "loss": 0.93798952, "memory(GiB)": 369.42, "step": 52695, "train_speed(iter/s)": 0.200793 }, { "acc": 0.743573, "epoch": 1.336884830035515, "grad_norm": 2.21875, "learning_rate": 2.716174395554274e-06, "loss": 1.00825024, "memory(GiB)": 369.42, "step": 52700, "train_speed(iter/s)": 0.200796 }, { "acc": 0.7476779, "epoch": 1.33701166920345, "grad_norm": 2.0, "learning_rate": 2.715241603444486e-06, "loss": 1.03141947, "memory(GiB)": 369.42, "step": 52705, "train_speed(iter/s)": 0.2008 }, { "acc": 0.75716257, "epoch": 1.3371385083713851, "grad_norm": 2.1875, "learning_rate": 2.714308911828577e-06, "loss": 0.93093529, "memory(GiB)": 369.42, "step": 52710, "train_speed(iter/s)": 0.200802 }, { "acc": 0.74511299, "epoch": 1.3372653475393201, "grad_norm": 2.28125, "learning_rate": 2.713376320747565e-06, "loss": 1.04688148, "memory(GiB)": 369.42, "step": 52715, "train_speed(iter/s)": 0.200805 }, { "acc": 0.74451418, "epoch": 1.3373921867072551, "grad_norm": 2.109375, "learning_rate": 2.7124438302424696e-06, "loss": 0.98042555, "memory(GiB)": 369.42, "step": 52720, "train_speed(iter/s)": 0.200808 }, { "acc": 0.74161701, "epoch": 1.3375190258751903, "grad_norm": 2.59375, "learning_rate": 2.711511440354309e-06, "loss": 1.02810898, "memory(GiB)": 369.42, "step": 52725, "train_speed(iter/s)": 0.200811 }, { "acc": 0.75229611, "epoch": 1.3376458650431253, "grad_norm": 2.546875, "learning_rate": 2.710579151124095e-06, "loss": 0.97996655, "memory(GiB)": 369.42, "step": 52730, "train_speed(iter/s)": 0.200814 }, { "acc": 0.7546711, "epoch": 1.3377727042110603, "grad_norm": 2.140625, "learning_rate": 2.70964696259283e-06, "loss": 0.94634743, "memory(GiB)": 369.42, "step": 52735, "train_speed(iter/s)": 0.200817 }, { "acc": 0.75228348, "epoch": 1.3378995433789953, "grad_norm": 2.046875, "learning_rate": 2.7087148748015146e-06, "loss": 0.96650715, "memory(GiB)": 369.42, "step": 52740, "train_speed(iter/s)": 0.20082 }, { "acc": 0.7344017, "epoch": 1.3380263825469305, "grad_norm": 2.125, "learning_rate": 2.7077828877911517e-06, "loss": 1.10977068, "memory(GiB)": 369.42, "step": 52745, "train_speed(iter/s)": 0.200823 }, { "acc": 0.75692401, "epoch": 1.3381532217148655, "grad_norm": 2.484375, "learning_rate": 2.706851001602733e-06, "loss": 0.92268143, "memory(GiB)": 369.42, "step": 52750, "train_speed(iter/s)": 0.200826 }, { "acc": 0.75647922, "epoch": 1.3382800608828007, "grad_norm": 1.921875, "learning_rate": 2.7059192162772407e-06, "loss": 0.9774147, "memory(GiB)": 369.42, "step": 52755, "train_speed(iter/s)": 0.200829 }, { "acc": 0.74850636, "epoch": 1.3384069000507357, "grad_norm": 2.359375, "learning_rate": 2.704987531855666e-06, "loss": 0.99822054, "memory(GiB)": 369.42, "step": 52760, "train_speed(iter/s)": 0.200832 }, { "acc": 0.75673571, "epoch": 1.3385337392186707, "grad_norm": 2.015625, "learning_rate": 2.704055948378986e-06, "loss": 0.98882675, "memory(GiB)": 369.42, "step": 52765, "train_speed(iter/s)": 0.200835 }, { "acc": 0.7498373, "epoch": 1.3386605783866057, "grad_norm": 2.265625, "learning_rate": 2.7031244658881773e-06, "loss": 0.92529526, "memory(GiB)": 369.42, "step": 52770, "train_speed(iter/s)": 0.200838 }, { "acc": 0.75636044, "epoch": 1.338787417554541, "grad_norm": 2.703125, "learning_rate": 2.7021930844242085e-06, "loss": 0.99284334, "memory(GiB)": 369.42, "step": 52775, "train_speed(iter/s)": 0.200841 }, { "acc": 0.76056652, "epoch": 1.3389142567224759, "grad_norm": 2.5, "learning_rate": 2.7012618040280463e-06, "loss": 0.97095528, "memory(GiB)": 369.42, "step": 52780, "train_speed(iter/s)": 0.200843 }, { "acc": 0.76203203, "epoch": 1.339041095890411, "grad_norm": 2.453125, "learning_rate": 2.7003306247406536e-06, "loss": 0.98029308, "memory(GiB)": 369.42, "step": 52785, "train_speed(iter/s)": 0.200846 }, { "acc": 0.73901119, "epoch": 1.339167935058346, "grad_norm": 2.203125, "learning_rate": 2.6993995466029877e-06, "loss": 1.07494354, "memory(GiB)": 369.42, "step": 52790, "train_speed(iter/s)": 0.200849 }, { "acc": 0.74042168, "epoch": 1.339294774226281, "grad_norm": 1.9140625, "learning_rate": 2.6984685696560002e-06, "loss": 0.9991909, "memory(GiB)": 369.42, "step": 52795, "train_speed(iter/s)": 0.200852 }, { "acc": 0.750348, "epoch": 1.339421613394216, "grad_norm": 1.765625, "learning_rate": 2.6975376939406418e-06, "loss": 0.96499014, "memory(GiB)": 369.42, "step": 52800, "train_speed(iter/s)": 0.200854 }, { "acc": 0.75469728, "epoch": 1.3395484525621513, "grad_norm": 1.8828125, "learning_rate": 2.6966069194978537e-06, "loss": 0.96681728, "memory(GiB)": 369.42, "step": 52805, "train_speed(iter/s)": 0.200858 }, { "acc": 0.73492446, "epoch": 1.3396752917300863, "grad_norm": 2.265625, "learning_rate": 2.6956762463685787e-06, "loss": 1.04272461, "memory(GiB)": 369.42, "step": 52810, "train_speed(iter/s)": 0.200859 }, { "acc": 0.7513772, "epoch": 1.3398021308980212, "grad_norm": 2.078125, "learning_rate": 2.69474567459375e-06, "loss": 1.02899971, "memory(GiB)": 369.42, "step": 52815, "train_speed(iter/s)": 0.200861 }, { "acc": 0.75172396, "epoch": 1.3399289700659565, "grad_norm": 2.40625, "learning_rate": 2.693815204214299e-06, "loss": 1.00780954, "memory(GiB)": 369.42, "step": 52820, "train_speed(iter/s)": 0.200865 }, { "acc": 0.75502739, "epoch": 1.3400558092338914, "grad_norm": 2.140625, "learning_rate": 2.692884835271151e-06, "loss": 0.9979435, "memory(GiB)": 369.42, "step": 52825, "train_speed(iter/s)": 0.200868 }, { "acc": 0.76484356, "epoch": 1.3401826484018264, "grad_norm": 2.765625, "learning_rate": 2.6919545678052296e-06, "loss": 1.00219021, "memory(GiB)": 369.42, "step": 52830, "train_speed(iter/s)": 0.200871 }, { "acc": 0.76686964, "epoch": 1.3403094875697614, "grad_norm": 2.59375, "learning_rate": 2.69102440185745e-06, "loss": 0.9053606, "memory(GiB)": 369.42, "step": 52835, "train_speed(iter/s)": 0.200873 }, { "acc": 0.73751955, "epoch": 1.3404363267376966, "grad_norm": 2.09375, "learning_rate": 2.690094337468726e-06, "loss": 1.0835886, "memory(GiB)": 369.42, "step": 52840, "train_speed(iter/s)": 0.200876 }, { "acc": 0.74424009, "epoch": 1.3405631659056316, "grad_norm": 2.40625, "learning_rate": 2.6891643746799643e-06, "loss": 1.01194458, "memory(GiB)": 369.42, "step": 52845, "train_speed(iter/s)": 0.200879 }, { "acc": 0.72316294, "epoch": 1.3406900050735668, "grad_norm": 2.015625, "learning_rate": 2.6882345135320753e-06, "loss": 1.0460371, "memory(GiB)": 369.42, "step": 52850, "train_speed(iter/s)": 0.200882 }, { "acc": 0.75831342, "epoch": 1.3408168442415018, "grad_norm": 2.03125, "learning_rate": 2.68730475406595e-06, "loss": 1.04314232, "memory(GiB)": 369.42, "step": 52855, "train_speed(iter/s)": 0.200883 }, { "acc": 0.73860264, "epoch": 1.3409436834094368, "grad_norm": 2.109375, "learning_rate": 2.6863750963224867e-06, "loss": 1.03748636, "memory(GiB)": 369.42, "step": 52860, "train_speed(iter/s)": 0.200887 }, { "acc": 0.73838072, "epoch": 1.3410705225773718, "grad_norm": 2.03125, "learning_rate": 2.685445540342577e-06, "loss": 1.02086735, "memory(GiB)": 369.42, "step": 52865, "train_speed(iter/s)": 0.200888 }, { "acc": 0.73538694, "epoch": 1.341197361745307, "grad_norm": 2.53125, "learning_rate": 2.6845160861671094e-06, "loss": 1.08243017, "memory(GiB)": 369.42, "step": 52870, "train_speed(iter/s)": 0.20089 }, { "acc": 0.75361295, "epoch": 1.341324200913242, "grad_norm": 2.359375, "learning_rate": 2.6835867338369593e-06, "loss": 0.94339762, "memory(GiB)": 369.42, "step": 52875, "train_speed(iter/s)": 0.200894 }, { "acc": 0.7486475, "epoch": 1.341451040081177, "grad_norm": 2.15625, "learning_rate": 2.6826574833930053e-06, "loss": 0.93138428, "memory(GiB)": 369.42, "step": 52880, "train_speed(iter/s)": 0.200897 }, { "acc": 0.74448891, "epoch": 1.3415778792491122, "grad_norm": 2.546875, "learning_rate": 2.681728334876123e-06, "loss": 0.99443531, "memory(GiB)": 369.42, "step": 52885, "train_speed(iter/s)": 0.200901 }, { "acc": 0.76397491, "epoch": 1.3417047184170472, "grad_norm": 2.28125, "learning_rate": 2.6807992883271806e-06, "loss": 0.89714317, "memory(GiB)": 369.42, "step": 52890, "train_speed(iter/s)": 0.200904 }, { "acc": 0.75946574, "epoch": 1.3418315575849822, "grad_norm": 2.71875, "learning_rate": 2.6798703437870364e-06, "loss": 0.93976164, "memory(GiB)": 369.42, "step": 52895, "train_speed(iter/s)": 0.200906 }, { "acc": 0.75401449, "epoch": 1.3419583967529172, "grad_norm": 2.34375, "learning_rate": 2.678941501296555e-06, "loss": 0.97664881, "memory(GiB)": 369.42, "step": 52900, "train_speed(iter/s)": 0.20091 }, { "acc": 0.76167288, "epoch": 1.3420852359208524, "grad_norm": 2.25, "learning_rate": 2.6780127608965896e-06, "loss": 0.97952862, "memory(GiB)": 369.42, "step": 52905, "train_speed(iter/s)": 0.200913 }, { "acc": 0.76295376, "epoch": 1.3422120750887874, "grad_norm": 2.1875, "learning_rate": 2.677084122627991e-06, "loss": 0.9975132, "memory(GiB)": 369.42, "step": 52910, "train_speed(iter/s)": 0.200913 }, { "acc": 0.7453804, "epoch": 1.3423389142567226, "grad_norm": 1.6640625, "learning_rate": 2.6761555865316003e-06, "loss": 1.03194437, "memory(GiB)": 369.42, "step": 52915, "train_speed(iter/s)": 0.200917 }, { "acc": 0.75707211, "epoch": 1.3424657534246576, "grad_norm": 2.28125, "learning_rate": 2.6752271526482644e-06, "loss": 1.00124779, "memory(GiB)": 369.42, "step": 52920, "train_speed(iter/s)": 0.200921 }, { "acc": 0.74587841, "epoch": 1.3425925925925926, "grad_norm": 2.09375, "learning_rate": 2.674298821018817e-06, "loss": 1.00751114, "memory(GiB)": 369.42, "step": 52925, "train_speed(iter/s)": 0.200924 }, { "acc": 0.74674425, "epoch": 1.3427194317605275, "grad_norm": 2.703125, "learning_rate": 2.673370591684091e-06, "loss": 1.03659029, "memory(GiB)": 369.42, "step": 52930, "train_speed(iter/s)": 0.200927 }, { "acc": 0.74180608, "epoch": 1.3428462709284628, "grad_norm": 2.140625, "learning_rate": 2.672442464684915e-06, "loss": 1.03346176, "memory(GiB)": 369.42, "step": 52935, "train_speed(iter/s)": 0.20093 }, { "acc": 0.76372471, "epoch": 1.3429731100963977, "grad_norm": 1.9140625, "learning_rate": 2.671514440062111e-06, "loss": 0.95417252, "memory(GiB)": 369.42, "step": 52940, "train_speed(iter/s)": 0.200933 }, { "acc": 0.75278988, "epoch": 1.343099949264333, "grad_norm": 2.265625, "learning_rate": 2.6705865178564973e-06, "loss": 1.00844469, "memory(GiB)": 369.42, "step": 52945, "train_speed(iter/s)": 0.200935 }, { "acc": 0.74907112, "epoch": 1.343226788432268, "grad_norm": 2.0625, "learning_rate": 2.6696586981088886e-06, "loss": 1.01121149, "memory(GiB)": 369.42, "step": 52950, "train_speed(iter/s)": 0.200937 }, { "acc": 0.73906488, "epoch": 1.343353627600203, "grad_norm": 2.5625, "learning_rate": 2.6687309808600947e-06, "loss": 1.01613503, "memory(GiB)": 369.42, "step": 52955, "train_speed(iter/s)": 0.20094 }, { "acc": 0.74807611, "epoch": 1.343480466768138, "grad_norm": 2.109375, "learning_rate": 2.6678033661509208e-06, "loss": 0.99681644, "memory(GiB)": 369.42, "step": 52960, "train_speed(iter/s)": 0.200943 }, { "acc": 0.74621992, "epoch": 1.3436073059360731, "grad_norm": 2.203125, "learning_rate": 2.6668758540221665e-06, "loss": 1.04154797, "memory(GiB)": 369.42, "step": 52965, "train_speed(iter/s)": 0.200946 }, { "acc": 0.74873028, "epoch": 1.3437341451040081, "grad_norm": 2.4375, "learning_rate": 2.66594844451463e-06, "loss": 0.91461105, "memory(GiB)": 369.42, "step": 52970, "train_speed(iter/s)": 0.200948 }, { "acc": 0.73747406, "epoch": 1.3438609842719431, "grad_norm": 2.0625, "learning_rate": 2.6650211376691008e-06, "loss": 1.06934967, "memory(GiB)": 369.42, "step": 52975, "train_speed(iter/s)": 0.200951 }, { "acc": 0.7499023, "epoch": 1.3439878234398783, "grad_norm": 2.125, "learning_rate": 2.664093933526368e-06, "loss": 1.00019226, "memory(GiB)": 369.42, "step": 52980, "train_speed(iter/s)": 0.200953 }, { "acc": 0.74460554, "epoch": 1.3441146626078133, "grad_norm": 2.296875, "learning_rate": 2.6631668321272097e-06, "loss": 0.99318266, "memory(GiB)": 369.42, "step": 52985, "train_speed(iter/s)": 0.200956 }, { "acc": 0.7462204, "epoch": 1.3442415017757483, "grad_norm": 2.625, "learning_rate": 2.6622398335124122e-06, "loss": 1.00454617, "memory(GiB)": 369.42, "step": 52990, "train_speed(iter/s)": 0.200959 }, { "acc": 0.7487042, "epoch": 1.3443683409436833, "grad_norm": 2.0625, "learning_rate": 2.661312937722742e-06, "loss": 1.002104, "memory(GiB)": 369.42, "step": 52995, "train_speed(iter/s)": 0.200961 }, { "acc": 0.75555286, "epoch": 1.3444951801116185, "grad_norm": 2.0, "learning_rate": 2.6603861447989703e-06, "loss": 0.97267437, "memory(GiB)": 369.42, "step": 53000, "train_speed(iter/s)": 0.200964 }, { "epoch": 1.3444951801116185, "eval_acc": 0.7378660529386379, "eval_loss": 0.9696493744850159, "eval_runtime": 385.2286, "eval_samples_per_second": 16.536, "eval_steps_per_second": 8.268, "step": 53000 }, { "acc": 0.75325727, "epoch": 1.3446220192795535, "grad_norm": 2.109375, "learning_rate": 2.65945945478186e-06, "loss": 0.94361191, "memory(GiB)": 369.42, "step": 53005, "train_speed(iter/s)": 0.200422 }, { "acc": 0.76422167, "epoch": 1.3447488584474887, "grad_norm": 1.9609375, "learning_rate": 2.658532867712176e-06, "loss": 0.94073124, "memory(GiB)": 369.42, "step": 53010, "train_speed(iter/s)": 0.200426 }, { "acc": 0.75189981, "epoch": 1.3448756976154237, "grad_norm": 2.1875, "learning_rate": 2.6576063836306687e-06, "loss": 0.93607121, "memory(GiB)": 369.42, "step": 53015, "train_speed(iter/s)": 0.200428 }, { "acc": 0.75719213, "epoch": 1.3450025367833587, "grad_norm": 2.109375, "learning_rate": 2.656680002578088e-06, "loss": 0.97021217, "memory(GiB)": 369.42, "step": 53020, "train_speed(iter/s)": 0.200432 }, { "acc": 0.76060009, "epoch": 1.3451293759512937, "grad_norm": 2.015625, "learning_rate": 2.655753724595186e-06, "loss": 0.97365875, "memory(GiB)": 369.42, "step": 53025, "train_speed(iter/s)": 0.200436 }, { "acc": 0.74530807, "epoch": 1.3452562151192289, "grad_norm": 2.1875, "learning_rate": 2.6548275497227028e-06, "loss": 0.96937046, "memory(GiB)": 369.42, "step": 53030, "train_speed(iter/s)": 0.200438 }, { "acc": 0.75640721, "epoch": 1.3453830542871639, "grad_norm": 1.9296875, "learning_rate": 2.6539014780013707e-06, "loss": 0.96434889, "memory(GiB)": 369.42, "step": 53035, "train_speed(iter/s)": 0.200441 }, { "acc": 0.74858093, "epoch": 1.3455098934550989, "grad_norm": 2.390625, "learning_rate": 2.6529755094719276e-06, "loss": 1.03025827, "memory(GiB)": 369.42, "step": 53040, "train_speed(iter/s)": 0.200442 }, { "acc": 0.74348469, "epoch": 1.345636732623034, "grad_norm": 2.328125, "learning_rate": 2.652049644175101e-06, "loss": 0.99448719, "memory(GiB)": 369.42, "step": 53045, "train_speed(iter/s)": 0.200445 }, { "acc": 0.75453787, "epoch": 1.345763571790969, "grad_norm": 1.8984375, "learning_rate": 2.6511238821516154e-06, "loss": 0.98939772, "memory(GiB)": 369.42, "step": 53050, "train_speed(iter/s)": 0.200447 }, { "acc": 0.75108624, "epoch": 1.345890410958904, "grad_norm": 2.03125, "learning_rate": 2.650198223442185e-06, "loss": 0.95305595, "memory(GiB)": 369.42, "step": 53055, "train_speed(iter/s)": 0.20045 }, { "acc": 0.74990001, "epoch": 1.346017250126839, "grad_norm": 2.1875, "learning_rate": 2.6492726680875296e-06, "loss": 0.99861956, "memory(GiB)": 369.42, "step": 53060, "train_speed(iter/s)": 0.200452 }, { "acc": 0.76239491, "epoch": 1.3461440892947742, "grad_norm": 2.359375, "learning_rate": 2.6483472161283576e-06, "loss": 0.96741467, "memory(GiB)": 369.42, "step": 53065, "train_speed(iter/s)": 0.200454 }, { "acc": 0.74906287, "epoch": 1.3462709284627092, "grad_norm": 2.390625, "learning_rate": 2.647421867605374e-06, "loss": 0.98007736, "memory(GiB)": 369.42, "step": 53070, "train_speed(iter/s)": 0.200458 }, { "acc": 0.75259781, "epoch": 1.3463977676306444, "grad_norm": 2.140625, "learning_rate": 2.6464966225592804e-06, "loss": 0.9907896, "memory(GiB)": 369.42, "step": 53075, "train_speed(iter/s)": 0.200461 }, { "acc": 0.7474555, "epoch": 1.3465246067985794, "grad_norm": 2.171875, "learning_rate": 2.645571481030773e-06, "loss": 1.01467113, "memory(GiB)": 369.42, "step": 53080, "train_speed(iter/s)": 0.200464 }, { "acc": 0.75990882, "epoch": 1.3466514459665144, "grad_norm": 2.9375, "learning_rate": 2.6446464430605434e-06, "loss": 0.99912605, "memory(GiB)": 369.42, "step": 53085, "train_speed(iter/s)": 0.200466 }, { "acc": 0.755058, "epoch": 1.3467782851344494, "grad_norm": 2.015625, "learning_rate": 2.6437215086892797e-06, "loss": 1.00276194, "memory(GiB)": 369.42, "step": 53090, "train_speed(iter/s)": 0.200469 }, { "acc": 0.76341324, "epoch": 1.3469051243023846, "grad_norm": 2.296875, "learning_rate": 2.642796677957664e-06, "loss": 0.92826557, "memory(GiB)": 369.42, "step": 53095, "train_speed(iter/s)": 0.200471 }, { "acc": 0.75034275, "epoch": 1.3470319634703196, "grad_norm": 2.03125, "learning_rate": 2.641871950906374e-06, "loss": 1.03724632, "memory(GiB)": 369.42, "step": 53100, "train_speed(iter/s)": 0.200474 }, { "acc": 0.7571106, "epoch": 1.3471588026382548, "grad_norm": 1.9296875, "learning_rate": 2.6409473275760843e-06, "loss": 1.00020666, "memory(GiB)": 369.42, "step": 53105, "train_speed(iter/s)": 0.200477 }, { "acc": 0.75282106, "epoch": 1.3472856418061898, "grad_norm": 2.25, "learning_rate": 2.640022808007463e-06, "loss": 1.03043976, "memory(GiB)": 369.42, "step": 53110, "train_speed(iter/s)": 0.200481 }, { "acc": 0.7465344, "epoch": 1.3474124809741248, "grad_norm": 2.015625, "learning_rate": 2.639098392241176e-06, "loss": 1.0255909, "memory(GiB)": 369.42, "step": 53115, "train_speed(iter/s)": 0.200484 }, { "acc": 0.74164925, "epoch": 1.3475393201420598, "grad_norm": 2.140625, "learning_rate": 2.6381740803178826e-06, "loss": 0.97667065, "memory(GiB)": 369.42, "step": 53120, "train_speed(iter/s)": 0.200486 }, { "acc": 0.7481874, "epoch": 1.347666159309995, "grad_norm": 2.0625, "learning_rate": 2.6372498722782346e-06, "loss": 0.96205139, "memory(GiB)": 369.42, "step": 53125, "train_speed(iter/s)": 0.200489 }, { "acc": 0.74457941, "epoch": 1.34779299847793, "grad_norm": 2.03125, "learning_rate": 2.6363257681628907e-06, "loss": 1.01428251, "memory(GiB)": 369.42, "step": 53130, "train_speed(iter/s)": 0.200491 }, { "acc": 0.76078176, "epoch": 1.347919837645865, "grad_norm": 2.546875, "learning_rate": 2.635401768012491e-06, "loss": 0.93670044, "memory(GiB)": 369.42, "step": 53135, "train_speed(iter/s)": 0.200494 }, { "acc": 0.75514565, "epoch": 1.3480466768138002, "grad_norm": 2.609375, "learning_rate": 2.6344778718676783e-06, "loss": 0.968326, "memory(GiB)": 369.42, "step": 53140, "train_speed(iter/s)": 0.200495 }, { "acc": 0.74684887, "epoch": 1.3481735159817352, "grad_norm": 2.5, "learning_rate": 2.6335540797690886e-06, "loss": 1.00606518, "memory(GiB)": 369.42, "step": 53145, "train_speed(iter/s)": 0.200499 }, { "acc": 0.7679523, "epoch": 1.3483003551496702, "grad_norm": 1.7734375, "learning_rate": 2.632630391757359e-06, "loss": 0.93679867, "memory(GiB)": 369.42, "step": 53150, "train_speed(iter/s)": 0.200502 }, { "acc": 0.73976974, "epoch": 1.3484271943176052, "grad_norm": 2.453125, "learning_rate": 2.6317068078731126e-06, "loss": 0.9396637, "memory(GiB)": 369.42, "step": 53155, "train_speed(iter/s)": 0.200504 }, { "acc": 0.76894231, "epoch": 1.3485540334855404, "grad_norm": 2.140625, "learning_rate": 2.630783328156973e-06, "loss": 0.89101467, "memory(GiB)": 369.42, "step": 53160, "train_speed(iter/s)": 0.200508 }, { "acc": 0.74942875, "epoch": 1.3486808726534754, "grad_norm": 2.3125, "learning_rate": 2.629859952649562e-06, "loss": 0.96687775, "memory(GiB)": 369.42, "step": 53165, "train_speed(iter/s)": 0.200512 }, { "acc": 0.76559324, "epoch": 1.3488077118214106, "grad_norm": 2.296875, "learning_rate": 2.628936681391494e-06, "loss": 0.9282608, "memory(GiB)": 369.42, "step": 53170, "train_speed(iter/s)": 0.200514 }, { "acc": 0.74494095, "epoch": 1.3489345509893456, "grad_norm": 2.359375, "learning_rate": 2.628013514423375e-06, "loss": 1.03675203, "memory(GiB)": 369.42, "step": 53175, "train_speed(iter/s)": 0.200517 }, { "acc": 0.74083333, "epoch": 1.3490613901572805, "grad_norm": 2.265625, "learning_rate": 2.6270904517858102e-06, "loss": 1.02719812, "memory(GiB)": 369.42, "step": 53180, "train_speed(iter/s)": 0.200518 }, { "acc": 0.7542346, "epoch": 1.3491882293252155, "grad_norm": 2.46875, "learning_rate": 2.6261674935194036e-06, "loss": 1.02322292, "memory(GiB)": 369.42, "step": 53185, "train_speed(iter/s)": 0.20052 }, { "acc": 0.76001658, "epoch": 1.3493150684931507, "grad_norm": 2.1875, "learning_rate": 2.6252446396647503e-06, "loss": 0.9342433, "memory(GiB)": 369.42, "step": 53190, "train_speed(iter/s)": 0.200523 }, { "acc": 0.74218063, "epoch": 1.3494419076610857, "grad_norm": 2.203125, "learning_rate": 2.6243218902624367e-06, "loss": 1.03645134, "memory(GiB)": 369.42, "step": 53195, "train_speed(iter/s)": 0.200524 }, { "acc": 0.75325093, "epoch": 1.3495687468290207, "grad_norm": 2.265625, "learning_rate": 2.6233992453530555e-06, "loss": 0.94894371, "memory(GiB)": 369.42, "step": 53200, "train_speed(iter/s)": 0.200527 }, { "acc": 0.76012478, "epoch": 1.349695585996956, "grad_norm": 1.9296875, "learning_rate": 2.6224767049771856e-06, "loss": 0.93709459, "memory(GiB)": 369.42, "step": 53205, "train_speed(iter/s)": 0.200529 }, { "acc": 0.73202438, "epoch": 1.349822425164891, "grad_norm": 2.75, "learning_rate": 2.621554269175405e-06, "loss": 1.04104557, "memory(GiB)": 369.42, "step": 53210, "train_speed(iter/s)": 0.200532 }, { "acc": 0.73469195, "epoch": 1.349949264332826, "grad_norm": 2.25, "learning_rate": 2.620631937988287e-06, "loss": 1.05146923, "memory(GiB)": 369.42, "step": 53215, "train_speed(iter/s)": 0.200534 }, { "acc": 0.73988132, "epoch": 1.350076103500761, "grad_norm": 2.234375, "learning_rate": 2.6197097114564e-06, "loss": 0.98829155, "memory(GiB)": 369.42, "step": 53220, "train_speed(iter/s)": 0.200536 }, { "acc": 0.74370089, "epoch": 1.3502029426686961, "grad_norm": 2.21875, "learning_rate": 2.618787589620306e-06, "loss": 1.0500536, "memory(GiB)": 369.42, "step": 53225, "train_speed(iter/s)": 0.200539 }, { "acc": 0.76008215, "epoch": 1.350329781836631, "grad_norm": 2.453125, "learning_rate": 2.6178655725205653e-06, "loss": 0.96697121, "memory(GiB)": 369.42, "step": 53230, "train_speed(iter/s)": 0.200542 }, { "acc": 0.75914869, "epoch": 1.3504566210045663, "grad_norm": 2.171875, "learning_rate": 2.6169436601977325e-06, "loss": 0.94116955, "memory(GiB)": 369.42, "step": 53235, "train_speed(iter/s)": 0.200544 }, { "acc": 0.74249883, "epoch": 1.3505834601725013, "grad_norm": 1.96875, "learning_rate": 2.6160218526923576e-06, "loss": 1.00021038, "memory(GiB)": 369.42, "step": 53240, "train_speed(iter/s)": 0.200548 }, { "acc": 0.73486938, "epoch": 1.3507102993404363, "grad_norm": 2.40625, "learning_rate": 2.6151001500449847e-06, "loss": 1.04343481, "memory(GiB)": 369.42, "step": 53245, "train_speed(iter/s)": 0.200551 }, { "acc": 0.7524384, "epoch": 1.3508371385083713, "grad_norm": 1.9375, "learning_rate": 2.614178552296155e-06, "loss": 1.01506882, "memory(GiB)": 369.42, "step": 53250, "train_speed(iter/s)": 0.200553 }, { "acc": 0.75351653, "epoch": 1.3509639776763065, "grad_norm": 2.28125, "learning_rate": 2.6132570594864047e-06, "loss": 0.97350712, "memory(GiB)": 369.42, "step": 53255, "train_speed(iter/s)": 0.200557 }, { "acc": 0.76293712, "epoch": 1.3510908168442415, "grad_norm": 1.9375, "learning_rate": 2.612335671656265e-06, "loss": 1.00966091, "memory(GiB)": 369.42, "step": 53260, "train_speed(iter/s)": 0.200559 }, { "acc": 0.77232475, "epoch": 1.3512176560121767, "grad_norm": 2.078125, "learning_rate": 2.6114143888462607e-06, "loss": 0.96649141, "memory(GiB)": 369.42, "step": 53265, "train_speed(iter/s)": 0.200563 }, { "acc": 0.75384941, "epoch": 1.3513444951801117, "grad_norm": 2.5, "learning_rate": 2.6104932110969195e-06, "loss": 0.99861193, "memory(GiB)": 369.42, "step": 53270, "train_speed(iter/s)": 0.200565 }, { "acc": 0.75609946, "epoch": 1.3514713343480467, "grad_norm": 1.9375, "learning_rate": 2.609572138448753e-06, "loss": 0.94980316, "memory(GiB)": 369.42, "step": 53275, "train_speed(iter/s)": 0.200568 }, { "acc": 0.75306883, "epoch": 1.3515981735159817, "grad_norm": 2.15625, "learning_rate": 2.608651170942277e-06, "loss": 0.99896736, "memory(GiB)": 369.42, "step": 53280, "train_speed(iter/s)": 0.200571 }, { "acc": 0.75548935, "epoch": 1.3517250126839169, "grad_norm": 2.03125, "learning_rate": 2.607730308617997e-06, "loss": 0.99626989, "memory(GiB)": 369.42, "step": 53285, "train_speed(iter/s)": 0.200574 }, { "acc": 0.76693697, "epoch": 1.3518518518518519, "grad_norm": 2.3125, "learning_rate": 2.6068095515164226e-06, "loss": 0.93983431, "memory(GiB)": 369.42, "step": 53290, "train_speed(iter/s)": 0.200576 }, { "acc": 0.74598808, "epoch": 1.3519786910197868, "grad_norm": 2.28125, "learning_rate": 2.605888899678047e-06, "loss": 1.00672445, "memory(GiB)": 369.42, "step": 53295, "train_speed(iter/s)": 0.200578 }, { "acc": 0.75526247, "epoch": 1.352105530187722, "grad_norm": 2.4375, "learning_rate": 2.6049683531433645e-06, "loss": 0.96990738, "memory(GiB)": 369.42, "step": 53300, "train_speed(iter/s)": 0.200582 }, { "acc": 0.75410566, "epoch": 1.352232369355657, "grad_norm": 2.34375, "learning_rate": 2.6040479119528683e-06, "loss": 1.01279545, "memory(GiB)": 369.42, "step": 53305, "train_speed(iter/s)": 0.200584 }, { "acc": 0.75088253, "epoch": 1.352359208523592, "grad_norm": 2.0, "learning_rate": 2.6031275761470447e-06, "loss": 1.01412334, "memory(GiB)": 369.42, "step": 53310, "train_speed(iter/s)": 0.200587 }, { "acc": 0.73523235, "epoch": 1.352486047691527, "grad_norm": 2.265625, "learning_rate": 2.60220734576637e-06, "loss": 1.03369455, "memory(GiB)": 369.42, "step": 53315, "train_speed(iter/s)": 0.200591 }, { "acc": 0.73010778, "epoch": 1.3526128868594622, "grad_norm": 2.21875, "learning_rate": 2.601287220851319e-06, "loss": 1.10826321, "memory(GiB)": 369.42, "step": 53320, "train_speed(iter/s)": 0.200592 }, { "acc": 0.76593709, "epoch": 1.3527397260273972, "grad_norm": 2.828125, "learning_rate": 2.6003672014423677e-06, "loss": 0.9870348, "memory(GiB)": 369.42, "step": 53325, "train_speed(iter/s)": 0.200595 }, { "acc": 0.75842338, "epoch": 1.3528665651953324, "grad_norm": 1.9765625, "learning_rate": 2.5994472875799827e-06, "loss": 0.98208542, "memory(GiB)": 369.42, "step": 53330, "train_speed(iter/s)": 0.200598 }, { "acc": 0.74831686, "epoch": 1.3529934043632674, "grad_norm": 1.7421875, "learning_rate": 2.598527479304619e-06, "loss": 0.98852453, "memory(GiB)": 369.42, "step": 53335, "train_speed(iter/s)": 0.200601 }, { "acc": 0.76350498, "epoch": 1.3531202435312024, "grad_norm": 1.859375, "learning_rate": 2.597607776656741e-06, "loss": 0.96097965, "memory(GiB)": 369.42, "step": 53340, "train_speed(iter/s)": 0.200604 }, { "acc": 0.75283365, "epoch": 1.3532470826991374, "grad_norm": 2.296875, "learning_rate": 2.5966881796767984e-06, "loss": 0.98846531, "memory(GiB)": 369.42, "step": 53345, "train_speed(iter/s)": 0.200606 }, { "acc": 0.74545259, "epoch": 1.3533739218670726, "grad_norm": 3.203125, "learning_rate": 2.5957686884052423e-06, "loss": 1.00834074, "memory(GiB)": 369.42, "step": 53350, "train_speed(iter/s)": 0.200608 }, { "acc": 0.74761982, "epoch": 1.3535007610350076, "grad_norm": 2.421875, "learning_rate": 2.5948493028825093e-06, "loss": 1.02534428, "memory(GiB)": 369.42, "step": 53355, "train_speed(iter/s)": 0.200612 }, { "acc": 0.75219135, "epoch": 1.3536276002029426, "grad_norm": 2.1875, "learning_rate": 2.593930023149044e-06, "loss": 0.98525276, "memory(GiB)": 369.42, "step": 53360, "train_speed(iter/s)": 0.200613 }, { "acc": 0.74663959, "epoch": 1.3537544393708778, "grad_norm": 2.09375, "learning_rate": 2.593010849245279e-06, "loss": 0.95932388, "memory(GiB)": 369.42, "step": 53365, "train_speed(iter/s)": 0.200616 }, { "acc": 0.74500132, "epoch": 1.3538812785388128, "grad_norm": 2.171875, "learning_rate": 2.592091781211643e-06, "loss": 0.98282738, "memory(GiB)": 369.42, "step": 53370, "train_speed(iter/s)": 0.200618 }, { "acc": 0.76484375, "epoch": 1.3540081177067478, "grad_norm": 2.359375, "learning_rate": 2.591172819088561e-06, "loss": 0.90868292, "memory(GiB)": 369.42, "step": 53375, "train_speed(iter/s)": 0.20062 }, { "acc": 0.74080243, "epoch": 1.3541349568746828, "grad_norm": 1.7109375, "learning_rate": 2.590253962916453e-06, "loss": 1.01888504, "memory(GiB)": 369.42, "step": 53380, "train_speed(iter/s)": 0.200622 }, { "acc": 0.74477048, "epoch": 1.354261796042618, "grad_norm": 2.0625, "learning_rate": 2.5893352127357347e-06, "loss": 0.95631771, "memory(GiB)": 369.42, "step": 53385, "train_speed(iter/s)": 0.200625 }, { "acc": 0.75619135, "epoch": 1.354388635210553, "grad_norm": 2.046875, "learning_rate": 2.5884165685868164e-06, "loss": 0.94003906, "memory(GiB)": 369.42, "step": 53390, "train_speed(iter/s)": 0.200628 }, { "acc": 0.73963213, "epoch": 1.3545154743784882, "grad_norm": 2.328125, "learning_rate": 2.5874980305101045e-06, "loss": 1.0347003, "memory(GiB)": 369.42, "step": 53395, "train_speed(iter/s)": 0.200631 }, { "acc": 0.7513319, "epoch": 1.3546423135464232, "grad_norm": 2.296875, "learning_rate": 2.586579598546e-06, "loss": 1.00502071, "memory(GiB)": 369.42, "step": 53400, "train_speed(iter/s)": 0.200634 }, { "acc": 0.75852938, "epoch": 1.3547691527143582, "grad_norm": 2.375, "learning_rate": 2.5856612727348995e-06, "loss": 0.92574844, "memory(GiB)": 369.42, "step": 53405, "train_speed(iter/s)": 0.200636 }, { "acc": 0.75581026, "epoch": 1.3548959918822931, "grad_norm": 2.296875, "learning_rate": 2.584743053117196e-06, "loss": 0.96565094, "memory(GiB)": 369.42, "step": 53410, "train_speed(iter/s)": 0.200639 }, { "acc": 0.75913734, "epoch": 1.3550228310502284, "grad_norm": 1.90625, "learning_rate": 2.583824939733277e-06, "loss": 0.89913635, "memory(GiB)": 369.42, "step": 53415, "train_speed(iter/s)": 0.200641 }, { "acc": 0.74046021, "epoch": 1.3551496702181633, "grad_norm": 2.140625, "learning_rate": 2.5829069326235234e-06, "loss": 1.02391586, "memory(GiB)": 369.42, "step": 53420, "train_speed(iter/s)": 0.200644 }, { "acc": 0.74662757, "epoch": 1.3552765093860986, "grad_norm": 2.265625, "learning_rate": 2.5819890318283137e-06, "loss": 0.96114187, "memory(GiB)": 369.42, "step": 53425, "train_speed(iter/s)": 0.200647 }, { "acc": 0.75539584, "epoch": 1.3554033485540335, "grad_norm": 2.125, "learning_rate": 2.5810712373880253e-06, "loss": 0.99485579, "memory(GiB)": 369.42, "step": 53430, "train_speed(iter/s)": 0.200652 }, { "acc": 0.76003461, "epoch": 1.3555301877219685, "grad_norm": 1.8515625, "learning_rate": 2.5801535493430215e-06, "loss": 0.95601845, "memory(GiB)": 369.42, "step": 53435, "train_speed(iter/s)": 0.200653 }, { "acc": 0.76948552, "epoch": 1.3556570268899035, "grad_norm": 2.078125, "learning_rate": 2.5792359677336685e-06, "loss": 0.93121748, "memory(GiB)": 369.42, "step": 53440, "train_speed(iter/s)": 0.200654 }, { "acc": 0.75872722, "epoch": 1.3557838660578387, "grad_norm": 2.171875, "learning_rate": 2.5783184926003237e-06, "loss": 0.96837139, "memory(GiB)": 369.42, "step": 53445, "train_speed(iter/s)": 0.200658 }, { "acc": 0.74084206, "epoch": 1.3559107052257737, "grad_norm": 2.140625, "learning_rate": 2.5774011239833473e-06, "loss": 1.01649828, "memory(GiB)": 369.42, "step": 53450, "train_speed(iter/s)": 0.20066 }, { "acc": 0.75357742, "epoch": 1.3560375443937087, "grad_norm": 2.453125, "learning_rate": 2.5764838619230843e-06, "loss": 0.98872108, "memory(GiB)": 369.42, "step": 53455, "train_speed(iter/s)": 0.200664 }, { "acc": 0.76695838, "epoch": 1.356164383561644, "grad_norm": 2.140625, "learning_rate": 2.575566706459879e-06, "loss": 0.96195221, "memory(GiB)": 369.42, "step": 53460, "train_speed(iter/s)": 0.200666 }, { "acc": 0.74736099, "epoch": 1.356291222729579, "grad_norm": 2.1875, "learning_rate": 2.574649657634076e-06, "loss": 1.01822643, "memory(GiB)": 369.42, "step": 53465, "train_speed(iter/s)": 0.200668 }, { "acc": 0.75505896, "epoch": 1.356418061897514, "grad_norm": 2.546875, "learning_rate": 2.5737327154860116e-06, "loss": 1.02434921, "memory(GiB)": 369.42, "step": 53470, "train_speed(iter/s)": 0.200672 }, { "acc": 0.73882012, "epoch": 1.356544901065449, "grad_norm": 2.21875, "learning_rate": 2.572815880056011e-06, "loss": 1.04650116, "memory(GiB)": 369.42, "step": 53475, "train_speed(iter/s)": 0.200675 }, { "acc": 0.75696964, "epoch": 1.356671740233384, "grad_norm": 2.3125, "learning_rate": 2.571899151384406e-06, "loss": 0.92553463, "memory(GiB)": 369.42, "step": 53480, "train_speed(iter/s)": 0.200677 }, { "acc": 0.74714265, "epoch": 1.356798579401319, "grad_norm": 2.125, "learning_rate": 2.5709825295115178e-06, "loss": 1.00616817, "memory(GiB)": 369.42, "step": 53485, "train_speed(iter/s)": 0.200675 }, { "acc": 0.74463148, "epoch": 1.3569254185692543, "grad_norm": 2.078125, "learning_rate": 2.5700660144776647e-06, "loss": 1.02809906, "memory(GiB)": 369.42, "step": 53490, "train_speed(iter/s)": 0.200679 }, { "acc": 0.74556866, "epoch": 1.3570522577371893, "grad_norm": 2.359375, "learning_rate": 2.5691496063231527e-06, "loss": 0.9932272, "memory(GiB)": 369.42, "step": 53495, "train_speed(iter/s)": 0.200683 }, { "acc": 0.75447712, "epoch": 1.3571790969051243, "grad_norm": 2.265625, "learning_rate": 2.568233305088296e-06, "loss": 0.94050531, "memory(GiB)": 369.42, "step": 53500, "train_speed(iter/s)": 0.200684 }, { "acc": 0.75367136, "epoch": 1.3573059360730593, "grad_norm": 2.0625, "learning_rate": 2.5673171108133956e-06, "loss": 0.99405346, "memory(GiB)": 369.42, "step": 53505, "train_speed(iter/s)": 0.200686 }, { "acc": 0.74164524, "epoch": 1.3574327752409945, "grad_norm": 1.8671875, "learning_rate": 2.5664010235387503e-06, "loss": 0.96981583, "memory(GiB)": 369.42, "step": 53510, "train_speed(iter/s)": 0.200689 }, { "acc": 0.75867076, "epoch": 1.3575596144089295, "grad_norm": 2.765625, "learning_rate": 2.565485043304653e-06, "loss": 0.9717886, "memory(GiB)": 369.42, "step": 53515, "train_speed(iter/s)": 0.200692 }, { "acc": 0.74611616, "epoch": 1.3576864535768645, "grad_norm": 2.15625, "learning_rate": 2.564569170151392e-06, "loss": 0.97665863, "memory(GiB)": 369.42, "step": 53520, "train_speed(iter/s)": 0.200695 }, { "acc": 0.75670824, "epoch": 1.3578132927447997, "grad_norm": 1.9921875, "learning_rate": 2.5636534041192534e-06, "loss": 0.91305141, "memory(GiB)": 369.42, "step": 53525, "train_speed(iter/s)": 0.200698 }, { "acc": 0.74992003, "epoch": 1.3579401319127347, "grad_norm": 2.09375, "learning_rate": 2.5627377452485153e-06, "loss": 0.98304815, "memory(GiB)": 369.42, "step": 53530, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75901971, "epoch": 1.3580669710806696, "grad_norm": 1.9296875, "learning_rate": 2.561822193579453e-06, "loss": 0.92868872, "memory(GiB)": 369.42, "step": 53535, "train_speed(iter/s)": 0.200704 }, { "acc": 0.76593575, "epoch": 1.3581938102486046, "grad_norm": 2.265625, "learning_rate": 2.560906749152335e-06, "loss": 0.99530296, "memory(GiB)": 369.42, "step": 53540, "train_speed(iter/s)": 0.200706 }, { "acc": 0.74835315, "epoch": 1.3583206494165398, "grad_norm": 2.234375, "learning_rate": 2.55999141200743e-06, "loss": 0.99240208, "memory(GiB)": 369.42, "step": 53545, "train_speed(iter/s)": 0.200709 }, { "acc": 0.75961742, "epoch": 1.3584474885844748, "grad_norm": 1.8984375, "learning_rate": 2.5590761821849954e-06, "loss": 0.89109516, "memory(GiB)": 369.42, "step": 53550, "train_speed(iter/s)": 0.20071 }, { "acc": 0.75877485, "epoch": 1.35857432775241, "grad_norm": 2.328125, "learning_rate": 2.5581610597252883e-06, "loss": 0.97093124, "memory(GiB)": 369.42, "step": 53555, "train_speed(iter/s)": 0.200713 }, { "acc": 0.74920597, "epoch": 1.358701166920345, "grad_norm": 2.09375, "learning_rate": 2.5572460446685593e-06, "loss": 0.94317684, "memory(GiB)": 369.42, "step": 53560, "train_speed(iter/s)": 0.200716 }, { "acc": 0.74656649, "epoch": 1.35882800608828, "grad_norm": 2.03125, "learning_rate": 2.5563311370550535e-06, "loss": 0.97903309, "memory(GiB)": 369.42, "step": 53565, "train_speed(iter/s)": 0.200718 }, { "acc": 0.74625025, "epoch": 1.358954845256215, "grad_norm": 2.28125, "learning_rate": 2.5554163369250194e-06, "loss": 1.04030495, "memory(GiB)": 369.42, "step": 53570, "train_speed(iter/s)": 0.200719 }, { "acc": 0.76125073, "epoch": 1.3590816844241502, "grad_norm": 2.25, "learning_rate": 2.5545016443186867e-06, "loss": 0.95456905, "memory(GiB)": 369.42, "step": 53575, "train_speed(iter/s)": 0.200722 }, { "acc": 0.75374289, "epoch": 1.3592085235920852, "grad_norm": 2.296875, "learning_rate": 2.55358705927629e-06, "loss": 0.96583004, "memory(GiB)": 369.42, "step": 53580, "train_speed(iter/s)": 0.200724 }, { "acc": 0.74522085, "epoch": 1.3593353627600204, "grad_norm": 2.03125, "learning_rate": 2.552672581838055e-06, "loss": 0.98505106, "memory(GiB)": 369.42, "step": 53585, "train_speed(iter/s)": 0.200726 }, { "acc": 0.77363062, "epoch": 1.3594622019279554, "grad_norm": 2.5625, "learning_rate": 2.5517582120442095e-06, "loss": 0.94169769, "memory(GiB)": 369.42, "step": 53590, "train_speed(iter/s)": 0.200728 }, { "acc": 0.74273372, "epoch": 1.3595890410958904, "grad_norm": 2.125, "learning_rate": 2.5508439499349675e-06, "loss": 1.01056824, "memory(GiB)": 369.42, "step": 53595, "train_speed(iter/s)": 0.200731 }, { "acc": 0.73477211, "epoch": 1.3597158802638254, "grad_norm": 2.0625, "learning_rate": 2.549929795550541e-06, "loss": 1.01450853, "memory(GiB)": 369.42, "step": 53600, "train_speed(iter/s)": 0.200735 }, { "acc": 0.76178942, "epoch": 1.3598427194317606, "grad_norm": 2.828125, "learning_rate": 2.549015748931143e-06, "loss": 0.94737473, "memory(GiB)": 369.42, "step": 53605, "train_speed(iter/s)": 0.200738 }, { "acc": 0.75313778, "epoch": 1.3599695585996956, "grad_norm": 2.203125, "learning_rate": 2.5481018101169763e-06, "loss": 1.01606712, "memory(GiB)": 369.42, "step": 53610, "train_speed(iter/s)": 0.200742 }, { "acc": 0.76231384, "epoch": 1.3600963977676306, "grad_norm": 2.171875, "learning_rate": 2.547187979148238e-06, "loss": 0.94782858, "memory(GiB)": 369.42, "step": 53615, "train_speed(iter/s)": 0.200745 }, { "acc": 0.76495132, "epoch": 1.3602232369355658, "grad_norm": 1.828125, "learning_rate": 2.546274256065121e-06, "loss": 0.96951656, "memory(GiB)": 369.42, "step": 53620, "train_speed(iter/s)": 0.200749 }, { "acc": 0.75706272, "epoch": 1.3603500761035008, "grad_norm": 2.109375, "learning_rate": 2.545360640907819e-06, "loss": 0.9520937, "memory(GiB)": 369.42, "step": 53625, "train_speed(iter/s)": 0.20075 }, { "acc": 0.7422822, "epoch": 1.3604769152714358, "grad_norm": 2.125, "learning_rate": 2.544447133716518e-06, "loss": 0.97173157, "memory(GiB)": 369.42, "step": 53630, "train_speed(iter/s)": 0.200752 }, { "acc": 0.74236255, "epoch": 1.3606037544393708, "grad_norm": 2.046875, "learning_rate": 2.5435337345313904e-06, "loss": 1.00366783, "memory(GiB)": 369.42, "step": 53635, "train_speed(iter/s)": 0.200755 }, { "acc": 0.74878759, "epoch": 1.360730593607306, "grad_norm": 2.3125, "learning_rate": 2.5426204433926194e-06, "loss": 0.97762156, "memory(GiB)": 369.42, "step": 53640, "train_speed(iter/s)": 0.200758 }, { "acc": 0.75052104, "epoch": 1.360857432775241, "grad_norm": 1.9765625, "learning_rate": 2.541707260340372e-06, "loss": 1.01185703, "memory(GiB)": 369.42, "step": 53645, "train_speed(iter/s)": 0.20076 }, { "acc": 0.75653162, "epoch": 1.3609842719431762, "grad_norm": 1.9765625, "learning_rate": 2.5407941854148156e-06, "loss": 0.99045296, "memory(GiB)": 369.42, "step": 53650, "train_speed(iter/s)": 0.200762 }, { "acc": 0.73909426, "epoch": 1.3611111111111112, "grad_norm": 2.125, "learning_rate": 2.5398812186561095e-06, "loss": 1.04436579, "memory(GiB)": 369.42, "step": 53655, "train_speed(iter/s)": 0.200766 }, { "acc": 0.73877745, "epoch": 1.3612379502790461, "grad_norm": 1.90625, "learning_rate": 2.5389683601044114e-06, "loss": 0.98961735, "memory(GiB)": 369.42, "step": 53660, "train_speed(iter/s)": 0.200768 }, { "acc": 0.75638199, "epoch": 1.3613647894469811, "grad_norm": 2.53125, "learning_rate": 2.538055609799873e-06, "loss": 0.97272243, "memory(GiB)": 369.42, "step": 53665, "train_speed(iter/s)": 0.200772 }, { "acc": 0.74928293, "epoch": 1.3614916286149163, "grad_norm": 2.25, "learning_rate": 2.5371429677826397e-06, "loss": 0.97983551, "memory(GiB)": 369.42, "step": 53670, "train_speed(iter/s)": 0.200775 }, { "acc": 0.76667895, "epoch": 1.3616184677828513, "grad_norm": 2.375, "learning_rate": 2.5362304340928556e-06, "loss": 0.96829567, "memory(GiB)": 369.42, "step": 53675, "train_speed(iter/s)": 0.200778 }, { "acc": 0.73868895, "epoch": 1.3617453069507863, "grad_norm": 2.359375, "learning_rate": 2.535318008770656e-06, "loss": 1.04181213, "memory(GiB)": 369.42, "step": 53680, "train_speed(iter/s)": 0.200781 }, { "acc": 0.75555472, "epoch": 1.3618721461187215, "grad_norm": 2.34375, "learning_rate": 2.534405691856175e-06, "loss": 0.97338638, "memory(GiB)": 369.42, "step": 53685, "train_speed(iter/s)": 0.200783 }, { "acc": 0.76127939, "epoch": 1.3619989852866565, "grad_norm": 1.8671875, "learning_rate": 2.5334934833895396e-06, "loss": 0.93929787, "memory(GiB)": 369.42, "step": 53690, "train_speed(iter/s)": 0.200785 }, { "acc": 0.74598436, "epoch": 1.3621258244545915, "grad_norm": 2.046875, "learning_rate": 2.5325813834108724e-06, "loss": 1.00505667, "memory(GiB)": 369.42, "step": 53695, "train_speed(iter/s)": 0.200787 }, { "acc": 0.76242628, "epoch": 1.3622526636225265, "grad_norm": 2.25, "learning_rate": 2.531669391960293e-06, "loss": 0.97416143, "memory(GiB)": 369.42, "step": 53700, "train_speed(iter/s)": 0.200788 }, { "acc": 0.75477819, "epoch": 1.3623795027904617, "grad_norm": 1.9296875, "learning_rate": 2.5307575090779125e-06, "loss": 1.00781078, "memory(GiB)": 369.42, "step": 53705, "train_speed(iter/s)": 0.20079 }, { "acc": 0.74826193, "epoch": 1.3625063419583967, "grad_norm": 2.21875, "learning_rate": 2.529845734803844e-06, "loss": 0.97314539, "memory(GiB)": 369.42, "step": 53710, "train_speed(iter/s)": 0.200792 }, { "acc": 0.75692239, "epoch": 1.362633181126332, "grad_norm": 2.171875, "learning_rate": 2.5289340691781872e-06, "loss": 1.02386494, "memory(GiB)": 369.42, "step": 53715, "train_speed(iter/s)": 0.200796 }, { "acc": 0.76101294, "epoch": 1.362760020294267, "grad_norm": 2.21875, "learning_rate": 2.528022512241042e-06, "loss": 0.93894501, "memory(GiB)": 369.42, "step": 53720, "train_speed(iter/s)": 0.2008 }, { "acc": 0.76097746, "epoch": 1.362886859462202, "grad_norm": 2.109375, "learning_rate": 2.5271110640325013e-06, "loss": 0.96245117, "memory(GiB)": 369.42, "step": 53725, "train_speed(iter/s)": 0.200803 }, { "acc": 0.75086002, "epoch": 1.3630136986301369, "grad_norm": 2.015625, "learning_rate": 2.5261997245926612e-06, "loss": 0.97485332, "memory(GiB)": 369.42, "step": 53730, "train_speed(iter/s)": 0.200806 }, { "acc": 0.74679117, "epoch": 1.363140537798072, "grad_norm": 2.171875, "learning_rate": 2.5252884939615995e-06, "loss": 0.9766243, "memory(GiB)": 369.42, "step": 53735, "train_speed(iter/s)": 0.200808 }, { "acc": 0.74797897, "epoch": 1.363267376966007, "grad_norm": 1.9140625, "learning_rate": 2.5243773721793973e-06, "loss": 0.97170486, "memory(GiB)": 369.42, "step": 53740, "train_speed(iter/s)": 0.20081 }, { "acc": 0.75772047, "epoch": 1.3633942161339423, "grad_norm": 2.34375, "learning_rate": 2.5234663592861325e-06, "loss": 0.94349384, "memory(GiB)": 369.42, "step": 53745, "train_speed(iter/s)": 0.200812 }, { "acc": 0.74640889, "epoch": 1.3635210553018773, "grad_norm": 2.375, "learning_rate": 2.522555455321876e-06, "loss": 0.96481628, "memory(GiB)": 369.42, "step": 53750, "train_speed(iter/s)": 0.200814 }, { "acc": 0.75189333, "epoch": 1.3636478944698123, "grad_norm": 2.375, "learning_rate": 2.52164466032669e-06, "loss": 0.96030788, "memory(GiB)": 369.42, "step": 53755, "train_speed(iter/s)": 0.200817 }, { "acc": 0.75698738, "epoch": 1.3637747336377473, "grad_norm": 1.84375, "learning_rate": 2.5207339743406344e-06, "loss": 0.98310385, "memory(GiB)": 369.42, "step": 53760, "train_speed(iter/s)": 0.20082 }, { "acc": 0.74363613, "epoch": 1.3639015728056825, "grad_norm": 1.65625, "learning_rate": 2.5198233974037705e-06, "loss": 1.00839577, "memory(GiB)": 369.42, "step": 53765, "train_speed(iter/s)": 0.200823 }, { "acc": 0.75032778, "epoch": 1.3640284119736175, "grad_norm": 2.4375, "learning_rate": 2.5189129295561486e-06, "loss": 1.00905991, "memory(GiB)": 369.42, "step": 53770, "train_speed(iter/s)": 0.200825 }, { "acc": 0.74919519, "epoch": 1.3641552511415524, "grad_norm": 1.984375, "learning_rate": 2.518002570837809e-06, "loss": 1.00143423, "memory(GiB)": 369.42, "step": 53775, "train_speed(iter/s)": 0.200828 }, { "acc": 0.73984175, "epoch": 1.3642820903094877, "grad_norm": 2.453125, "learning_rate": 2.5170923212887997e-06, "loss": 1.06367655, "memory(GiB)": 369.42, "step": 53780, "train_speed(iter/s)": 0.200832 }, { "acc": 0.75881739, "epoch": 1.3644089294774226, "grad_norm": 2.296875, "learning_rate": 2.5161821809491554e-06, "loss": 0.95646601, "memory(GiB)": 369.42, "step": 53785, "train_speed(iter/s)": 0.200836 }, { "acc": 0.7559082, "epoch": 1.3645357686453576, "grad_norm": 2.03125, "learning_rate": 2.5152721498589104e-06, "loss": 0.98669863, "memory(GiB)": 369.42, "step": 53790, "train_speed(iter/s)": 0.200839 }, { "acc": 0.75207925, "epoch": 1.3646626078132926, "grad_norm": 2.8125, "learning_rate": 2.514362228058086e-06, "loss": 0.98590336, "memory(GiB)": 369.42, "step": 53795, "train_speed(iter/s)": 0.200842 }, { "acc": 0.74511333, "epoch": 1.3647894469812278, "grad_norm": 2.390625, "learning_rate": 2.51345241558671e-06, "loss": 1.03752317, "memory(GiB)": 369.42, "step": 53800, "train_speed(iter/s)": 0.200845 }, { "acc": 0.7504262, "epoch": 1.3649162861491628, "grad_norm": 2.265625, "learning_rate": 2.5125427124847985e-06, "loss": 1.00305672, "memory(GiB)": 369.42, "step": 53805, "train_speed(iter/s)": 0.200845 }, { "acc": 0.75667372, "epoch": 1.365043125317098, "grad_norm": 2.34375, "learning_rate": 2.5116331187923645e-06, "loss": 0.94959555, "memory(GiB)": 369.42, "step": 53810, "train_speed(iter/s)": 0.200845 }, { "acc": 0.75935144, "epoch": 1.365169964485033, "grad_norm": 2.328125, "learning_rate": 2.510723634549415e-06, "loss": 0.97419901, "memory(GiB)": 369.42, "step": 53815, "train_speed(iter/s)": 0.200849 }, { "acc": 0.75911083, "epoch": 1.365296803652968, "grad_norm": 2.359375, "learning_rate": 2.509814259795954e-06, "loss": 0.98353767, "memory(GiB)": 369.42, "step": 53820, "train_speed(iter/s)": 0.200852 }, { "acc": 0.75976963, "epoch": 1.365423642820903, "grad_norm": 2.328125, "learning_rate": 2.50890499457198e-06, "loss": 0.99268475, "memory(GiB)": 369.42, "step": 53825, "train_speed(iter/s)": 0.200855 }, { "acc": 0.76494236, "epoch": 1.3655504819888382, "grad_norm": 1.96875, "learning_rate": 2.5079958389174865e-06, "loss": 0.92287674, "memory(GiB)": 369.42, "step": 53830, "train_speed(iter/s)": 0.200855 }, { "acc": 0.74738646, "epoch": 1.3656773211567732, "grad_norm": 2.1875, "learning_rate": 2.5070867928724618e-06, "loss": 1.06508989, "memory(GiB)": 369.42, "step": 53835, "train_speed(iter/s)": 0.200858 }, { "acc": 0.75493937, "epoch": 1.3658041603247082, "grad_norm": 2.0625, "learning_rate": 2.50617785647689e-06, "loss": 0.93888283, "memory(GiB)": 369.42, "step": 53840, "train_speed(iter/s)": 0.200862 }, { "acc": 0.74462795, "epoch": 1.3659309994926434, "grad_norm": 2.328125, "learning_rate": 2.5052690297707506e-06, "loss": 0.98189831, "memory(GiB)": 369.42, "step": 53845, "train_speed(iter/s)": 0.200864 }, { "acc": 0.76065626, "epoch": 1.3660578386605784, "grad_norm": 2.1875, "learning_rate": 2.5043603127940164e-06, "loss": 0.9309721, "memory(GiB)": 369.42, "step": 53850, "train_speed(iter/s)": 0.200867 }, { "acc": 0.74687147, "epoch": 1.3661846778285134, "grad_norm": 2.203125, "learning_rate": 2.503451705586659e-06, "loss": 0.97971945, "memory(GiB)": 369.42, "step": 53855, "train_speed(iter/s)": 0.200869 }, { "acc": 0.76787643, "epoch": 1.3663115169964484, "grad_norm": 1.96875, "learning_rate": 2.5025432081886412e-06, "loss": 0.95282269, "memory(GiB)": 369.42, "step": 53860, "train_speed(iter/s)": 0.200872 }, { "acc": 0.74840555, "epoch": 1.3664383561643836, "grad_norm": 2.28125, "learning_rate": 2.5016348206399215e-06, "loss": 0.94479027, "memory(GiB)": 369.42, "step": 53865, "train_speed(iter/s)": 0.200875 }, { "acc": 0.74654789, "epoch": 1.3665651953323186, "grad_norm": 2.703125, "learning_rate": 2.500726542980461e-06, "loss": 1.02846127, "memory(GiB)": 369.42, "step": 53870, "train_speed(iter/s)": 0.200878 }, { "acc": 0.74663811, "epoch": 1.3666920345002538, "grad_norm": 1.890625, "learning_rate": 2.499818375250204e-06, "loss": 0.9942028, "memory(GiB)": 369.42, "step": 53875, "train_speed(iter/s)": 0.20088 }, { "acc": 0.74319592, "epoch": 1.3668188736681888, "grad_norm": 2.796875, "learning_rate": 2.4989103174890946e-06, "loss": 1.00068092, "memory(GiB)": 369.42, "step": 53880, "train_speed(iter/s)": 0.200882 }, { "acc": 0.75328417, "epoch": 1.3669457128361238, "grad_norm": 2.0, "learning_rate": 2.498002369737078e-06, "loss": 1.01132679, "memory(GiB)": 369.42, "step": 53885, "train_speed(iter/s)": 0.200884 }, { "acc": 0.75768123, "epoch": 1.3670725520040587, "grad_norm": 2.59375, "learning_rate": 2.49709453203409e-06, "loss": 0.95722084, "memory(GiB)": 369.42, "step": 53890, "train_speed(iter/s)": 0.200888 }, { "acc": 0.75099897, "epoch": 1.367199391171994, "grad_norm": 1.9296875, "learning_rate": 2.496186804420057e-06, "loss": 1.02011051, "memory(GiB)": 369.42, "step": 53895, "train_speed(iter/s)": 0.20089 }, { "acc": 0.75826139, "epoch": 1.367326230339929, "grad_norm": 1.8359375, "learning_rate": 2.4952791869349056e-06, "loss": 0.9755291, "memory(GiB)": 369.42, "step": 53900, "train_speed(iter/s)": 0.200892 }, { "acc": 0.75302696, "epoch": 1.3674530695078642, "grad_norm": 3.1875, "learning_rate": 2.4943716796185603e-06, "loss": 1.01661921, "memory(GiB)": 369.42, "step": 53905, "train_speed(iter/s)": 0.200895 }, { "acc": 0.74735146, "epoch": 1.3675799086757991, "grad_norm": 2.4375, "learning_rate": 2.493464282510937e-06, "loss": 1.05138283, "memory(GiB)": 369.42, "step": 53910, "train_speed(iter/s)": 0.200898 }, { "acc": 0.76874633, "epoch": 1.3677067478437341, "grad_norm": 2.078125, "learning_rate": 2.4925569956519414e-06, "loss": 0.94755735, "memory(GiB)": 369.42, "step": 53915, "train_speed(iter/s)": 0.2009 }, { "acc": 0.74665709, "epoch": 1.3678335870116691, "grad_norm": 2.09375, "learning_rate": 2.491649819081486e-06, "loss": 1.02579441, "memory(GiB)": 369.42, "step": 53920, "train_speed(iter/s)": 0.200903 }, { "acc": 0.76229477, "epoch": 1.3679604261796043, "grad_norm": 2.0625, "learning_rate": 2.490742752839471e-06, "loss": 1.00113573, "memory(GiB)": 369.42, "step": 53925, "train_speed(iter/s)": 0.200905 }, { "acc": 0.75443916, "epoch": 1.3680872653475393, "grad_norm": 2.296875, "learning_rate": 2.4898357969657943e-06, "loss": 0.94761906, "memory(GiB)": 369.42, "step": 53930, "train_speed(iter/s)": 0.200908 }, { "acc": 0.75987911, "epoch": 1.3682141045154743, "grad_norm": 2.09375, "learning_rate": 2.4889289515003425e-06, "loss": 0.97334566, "memory(GiB)": 369.42, "step": 53935, "train_speed(iter/s)": 0.20091 }, { "acc": 0.75994549, "epoch": 1.3683409436834095, "grad_norm": 2.296875, "learning_rate": 2.4880222164830085e-06, "loss": 0.98673878, "memory(GiB)": 369.42, "step": 53940, "train_speed(iter/s)": 0.200914 }, { "acc": 0.75458727, "epoch": 1.3684677828513445, "grad_norm": 1.9296875, "learning_rate": 2.4871155919536725e-06, "loss": 0.97607574, "memory(GiB)": 369.42, "step": 53945, "train_speed(iter/s)": 0.200915 }, { "acc": 0.75836158, "epoch": 1.3685946220192795, "grad_norm": 2.953125, "learning_rate": 2.486209077952212e-06, "loss": 0.98923721, "memory(GiB)": 369.42, "step": 53950, "train_speed(iter/s)": 0.200916 }, { "acc": 0.75737514, "epoch": 1.3687214611872145, "grad_norm": 2.46875, "learning_rate": 2.4853026745185e-06, "loss": 0.98315239, "memory(GiB)": 369.42, "step": 53955, "train_speed(iter/s)": 0.200919 }, { "acc": 0.7550561, "epoch": 1.3688483003551497, "grad_norm": 1.984375, "learning_rate": 2.4843963816924035e-06, "loss": 0.96667738, "memory(GiB)": 369.42, "step": 53960, "train_speed(iter/s)": 0.200922 }, { "acc": 0.75936775, "epoch": 1.3689751395230847, "grad_norm": 1.953125, "learning_rate": 2.483490199513785e-06, "loss": 0.95221653, "memory(GiB)": 369.42, "step": 53965, "train_speed(iter/s)": 0.200923 }, { "acc": 0.76158352, "epoch": 1.36910197869102, "grad_norm": 2.59375, "learning_rate": 2.4825841280225033e-06, "loss": 0.98944283, "memory(GiB)": 369.42, "step": 53970, "train_speed(iter/s)": 0.200926 }, { "acc": 0.74966555, "epoch": 1.369228817858955, "grad_norm": 2.234375, "learning_rate": 2.4816781672584107e-06, "loss": 0.97987747, "memory(GiB)": 369.42, "step": 53975, "train_speed(iter/s)": 0.200929 }, { "acc": 0.74859962, "epoch": 1.3693556570268899, "grad_norm": 2.203125, "learning_rate": 2.480772317261356e-06, "loss": 1.01613998, "memory(GiB)": 369.42, "step": 53980, "train_speed(iter/s)": 0.200931 }, { "acc": 0.75720339, "epoch": 1.3694824961948249, "grad_norm": 2.28125, "learning_rate": 2.479866578071183e-06, "loss": 0.99502268, "memory(GiB)": 369.42, "step": 53985, "train_speed(iter/s)": 0.200933 }, { "acc": 0.75064797, "epoch": 1.36960933536276, "grad_norm": 2.703125, "learning_rate": 2.4789609497277284e-06, "loss": 0.98358479, "memory(GiB)": 369.42, "step": 53990, "train_speed(iter/s)": 0.200937 }, { "acc": 0.75575914, "epoch": 1.369736174530695, "grad_norm": 2.46875, "learning_rate": 2.478055432270828e-06, "loss": 0.97523365, "memory(GiB)": 369.42, "step": 53995, "train_speed(iter/s)": 0.20094 }, { "acc": 0.77387238, "epoch": 1.36986301369863, "grad_norm": 1.7734375, "learning_rate": 2.4771500257403086e-06, "loss": 0.87723074, "memory(GiB)": 369.42, "step": 54000, "train_speed(iter/s)": 0.200941 }, { "epoch": 1.36986301369863, "eval_acc": 0.7379320601735322, "eval_loss": 0.969586193561554, "eval_runtime": 386.4947, "eval_samples_per_second": 16.481, "eval_steps_per_second": 8.241, "step": 54000 }, { "acc": 0.7492753, "epoch": 1.3699898528665653, "grad_norm": 1.953125, "learning_rate": 2.476244730175993e-06, "loss": 0.98907394, "memory(GiB)": 369.42, "step": 54005, "train_speed(iter/s)": 0.200411 }, { "acc": 0.75304594, "epoch": 1.3701166920345003, "grad_norm": 2.203125, "learning_rate": 2.4753395456177056e-06, "loss": 0.99537487, "memory(GiB)": 369.42, "step": 54010, "train_speed(iter/s)": 0.200414 }, { "acc": 0.74262204, "epoch": 1.3702435312024352, "grad_norm": 1.8828125, "learning_rate": 2.474434472105255e-06, "loss": 1.00180845, "memory(GiB)": 369.42, "step": 54015, "train_speed(iter/s)": 0.200417 }, { "acc": 0.75044212, "epoch": 1.3703703703703702, "grad_norm": 2.53125, "learning_rate": 2.473529509678452e-06, "loss": 0.94717436, "memory(GiB)": 369.42, "step": 54020, "train_speed(iter/s)": 0.200421 }, { "acc": 0.7438055, "epoch": 1.3704972095383054, "grad_norm": 2.4375, "learning_rate": 2.4726246583770996e-06, "loss": 1.04781055, "memory(GiB)": 369.42, "step": 54025, "train_speed(iter/s)": 0.200423 }, { "acc": 0.74880247, "epoch": 1.3706240487062404, "grad_norm": 1.890625, "learning_rate": 2.4717199182410025e-06, "loss": 0.9485527, "memory(GiB)": 369.42, "step": 54030, "train_speed(iter/s)": 0.200426 }, { "acc": 0.76199274, "epoch": 1.3707508878741756, "grad_norm": 2.390625, "learning_rate": 2.4708152893099493e-06, "loss": 0.97809763, "memory(GiB)": 369.42, "step": 54035, "train_speed(iter/s)": 0.200429 }, { "acc": 0.76308413, "epoch": 1.3708777270421106, "grad_norm": 2.625, "learning_rate": 2.4699107716237293e-06, "loss": 0.95214348, "memory(GiB)": 369.42, "step": 54040, "train_speed(iter/s)": 0.200431 }, { "acc": 0.75191798, "epoch": 1.3710045662100456, "grad_norm": 2.0, "learning_rate": 2.469006365222132e-06, "loss": 0.97211666, "memory(GiB)": 369.42, "step": 54045, "train_speed(iter/s)": 0.200433 }, { "acc": 0.74464312, "epoch": 1.3711314053779806, "grad_norm": 2.234375, "learning_rate": 2.4681020701449365e-06, "loss": 1.01323109, "memory(GiB)": 369.42, "step": 54050, "train_speed(iter/s)": 0.200436 }, { "acc": 0.74793334, "epoch": 1.3712582445459158, "grad_norm": 2.203125, "learning_rate": 2.4671978864319123e-06, "loss": 0.99121351, "memory(GiB)": 369.42, "step": 54055, "train_speed(iter/s)": 0.200439 }, { "acc": 0.74874029, "epoch": 1.3713850837138508, "grad_norm": 2.34375, "learning_rate": 2.466293814122835e-06, "loss": 1.00054379, "memory(GiB)": 369.42, "step": 54060, "train_speed(iter/s)": 0.200442 }, { "acc": 0.75819597, "epoch": 1.371511922881786, "grad_norm": 2.734375, "learning_rate": 2.4653898532574684e-06, "loss": 0.95437603, "memory(GiB)": 369.42, "step": 54065, "train_speed(iter/s)": 0.200445 }, { "acc": 0.75174618, "epoch": 1.371638762049721, "grad_norm": 2.203125, "learning_rate": 2.4644860038755737e-06, "loss": 0.97800322, "memory(GiB)": 369.42, "step": 54070, "train_speed(iter/s)": 0.200447 }, { "acc": 0.74676433, "epoch": 1.371765601217656, "grad_norm": 2.359375, "learning_rate": 2.4635822660169007e-06, "loss": 1.02718191, "memory(GiB)": 369.42, "step": 54075, "train_speed(iter/s)": 0.20045 }, { "acc": 0.74554377, "epoch": 1.371892440385591, "grad_norm": 2.53125, "learning_rate": 2.4626786397212065e-06, "loss": 1.04660759, "memory(GiB)": 369.42, "step": 54080, "train_speed(iter/s)": 0.200453 }, { "acc": 0.76459503, "epoch": 1.3720192795535262, "grad_norm": 2.5, "learning_rate": 2.461775125028234e-06, "loss": 0.97741394, "memory(GiB)": 369.42, "step": 54085, "train_speed(iter/s)": 0.200455 }, { "acc": 0.73682747, "epoch": 1.3721461187214612, "grad_norm": 2.421875, "learning_rate": 2.4608717219777236e-06, "loss": 1.0142458, "memory(GiB)": 369.42, "step": 54090, "train_speed(iter/s)": 0.200457 }, { "acc": 0.75520477, "epoch": 1.3722729578893962, "grad_norm": 3.09375, "learning_rate": 2.459968430609411e-06, "loss": 0.95671005, "memory(GiB)": 369.42, "step": 54095, "train_speed(iter/s)": 0.200461 }, { "acc": 0.7552865, "epoch": 1.3723997970573314, "grad_norm": 2.15625, "learning_rate": 2.459065250963028e-06, "loss": 0.96041718, "memory(GiB)": 369.42, "step": 54100, "train_speed(iter/s)": 0.200464 }, { "acc": 0.76810441, "epoch": 1.3725266362252664, "grad_norm": 2.234375, "learning_rate": 2.458162183078299e-06, "loss": 0.93894405, "memory(GiB)": 369.42, "step": 54105, "train_speed(iter/s)": 0.200466 }, { "acc": 0.76507044, "epoch": 1.3726534753932014, "grad_norm": 2.1875, "learning_rate": 2.4572592269949464e-06, "loss": 0.93491163, "memory(GiB)": 369.42, "step": 54110, "train_speed(iter/s)": 0.200469 }, { "acc": 0.75901756, "epoch": 1.3727803145611364, "grad_norm": 2.265625, "learning_rate": 2.4563563827526848e-06, "loss": 0.99561558, "memory(GiB)": 369.42, "step": 54115, "train_speed(iter/s)": 0.20047 }, { "acc": 0.74916058, "epoch": 1.3729071537290716, "grad_norm": 2.3125, "learning_rate": 2.455453650391226e-06, "loss": 0.9764245, "memory(GiB)": 369.42, "step": 54120, "train_speed(iter/s)": 0.200473 }, { "acc": 0.75658073, "epoch": 1.3730339928970066, "grad_norm": 2.3125, "learning_rate": 2.454551029950277e-06, "loss": 0.98678713, "memory(GiB)": 369.42, "step": 54125, "train_speed(iter/s)": 0.200476 }, { "acc": 0.77063351, "epoch": 1.3731608320649418, "grad_norm": 2.15625, "learning_rate": 2.4536485214695377e-06, "loss": 0.90549984, "memory(GiB)": 369.42, "step": 54130, "train_speed(iter/s)": 0.20048 }, { "acc": 0.7477057, "epoch": 1.3732876712328768, "grad_norm": 1.7578125, "learning_rate": 2.4527461249887054e-06, "loss": 0.96954269, "memory(GiB)": 369.42, "step": 54135, "train_speed(iter/s)": 0.200481 }, { "acc": 0.75091352, "epoch": 1.3734145104008117, "grad_norm": 2.109375, "learning_rate": 2.451843840547471e-06, "loss": 0.97208767, "memory(GiB)": 369.42, "step": 54140, "train_speed(iter/s)": 0.200484 }, { "acc": 0.74822969, "epoch": 1.3735413495687467, "grad_norm": 2.171875, "learning_rate": 2.4509416681855193e-06, "loss": 1.00985107, "memory(GiB)": 369.42, "step": 54145, "train_speed(iter/s)": 0.200487 }, { "acc": 0.74958639, "epoch": 1.373668188736682, "grad_norm": 2.390625, "learning_rate": 2.4500396079425377e-06, "loss": 1.02776642, "memory(GiB)": 369.42, "step": 54150, "train_speed(iter/s)": 0.200491 }, { "acc": 0.75353212, "epoch": 1.373795027904617, "grad_norm": 2.625, "learning_rate": 2.4491376598581967e-06, "loss": 1.02134342, "memory(GiB)": 369.42, "step": 54155, "train_speed(iter/s)": 0.200494 }, { "acc": 0.75369968, "epoch": 1.373921867072552, "grad_norm": 2.484375, "learning_rate": 2.4482358239721704e-06, "loss": 1.01048813, "memory(GiB)": 369.42, "step": 54160, "train_speed(iter/s)": 0.200496 }, { "acc": 0.75395937, "epoch": 1.3740487062404871, "grad_norm": 2.3125, "learning_rate": 2.4473341003241234e-06, "loss": 0.98922453, "memory(GiB)": 369.42, "step": 54165, "train_speed(iter/s)": 0.200499 }, { "acc": 0.76005421, "epoch": 1.3741755454084221, "grad_norm": 2.265625, "learning_rate": 2.446432488953724e-06, "loss": 0.87748032, "memory(GiB)": 369.42, "step": 54170, "train_speed(iter/s)": 0.200501 }, { "acc": 0.74605474, "epoch": 1.3743023845763571, "grad_norm": 2.515625, "learning_rate": 2.445530989900622e-06, "loss": 0.98725491, "memory(GiB)": 369.42, "step": 54175, "train_speed(iter/s)": 0.200503 }, { "acc": 0.75158644, "epoch": 1.374429223744292, "grad_norm": 2.375, "learning_rate": 2.4446296032044697e-06, "loss": 1.00159054, "memory(GiB)": 369.42, "step": 54180, "train_speed(iter/s)": 0.200506 }, { "acc": 0.75421433, "epoch": 1.3745560629122273, "grad_norm": 2.25, "learning_rate": 2.443728328904919e-06, "loss": 0.98883915, "memory(GiB)": 369.42, "step": 54185, "train_speed(iter/s)": 0.200509 }, { "acc": 0.74591451, "epoch": 1.3746829020801623, "grad_norm": 2.484375, "learning_rate": 2.442827167041611e-06, "loss": 1.00888052, "memory(GiB)": 369.42, "step": 54190, "train_speed(iter/s)": 0.200511 }, { "acc": 0.75785661, "epoch": 1.3748097412480975, "grad_norm": 2.421875, "learning_rate": 2.441926117654179e-06, "loss": 0.99965782, "memory(GiB)": 369.42, "step": 54195, "train_speed(iter/s)": 0.200514 }, { "acc": 0.75315504, "epoch": 1.3749365804160325, "grad_norm": 2.4375, "learning_rate": 2.4410251807822555e-06, "loss": 1.0315794, "memory(GiB)": 369.42, "step": 54200, "train_speed(iter/s)": 0.200517 }, { "acc": 0.75880761, "epoch": 1.3750634195839675, "grad_norm": 2.09375, "learning_rate": 2.4401243564654713e-06, "loss": 0.97684622, "memory(GiB)": 369.42, "step": 54205, "train_speed(iter/s)": 0.200521 }, { "acc": 0.76806083, "epoch": 1.3751902587519025, "grad_norm": 2.0625, "learning_rate": 2.4392236447434494e-06, "loss": 0.94446859, "memory(GiB)": 369.42, "step": 54210, "train_speed(iter/s)": 0.200523 }, { "acc": 0.7587266, "epoch": 1.3753170979198377, "grad_norm": 2.21875, "learning_rate": 2.4383230456558005e-06, "loss": 0.94311876, "memory(GiB)": 369.42, "step": 54215, "train_speed(iter/s)": 0.200525 }, { "acc": 0.75680494, "epoch": 1.3754439370877727, "grad_norm": 2.40625, "learning_rate": 2.437422559242143e-06, "loss": 0.9782547, "memory(GiB)": 369.42, "step": 54220, "train_speed(iter/s)": 0.200527 }, { "acc": 0.75528593, "epoch": 1.375570776255708, "grad_norm": 2.046875, "learning_rate": 2.4365221855420822e-06, "loss": 1.02595158, "memory(GiB)": 369.42, "step": 54225, "train_speed(iter/s)": 0.200532 }, { "acc": 0.7569087, "epoch": 1.3756976154236429, "grad_norm": 2.03125, "learning_rate": 2.435621924595221e-06, "loss": 0.94959679, "memory(GiB)": 369.42, "step": 54230, "train_speed(iter/s)": 0.200534 }, { "acc": 0.7599514, "epoch": 1.3758244545915779, "grad_norm": 2.15625, "learning_rate": 2.4347217764411567e-06, "loss": 0.98901215, "memory(GiB)": 369.42, "step": 54235, "train_speed(iter/s)": 0.200537 }, { "acc": 0.75838056, "epoch": 1.3759512937595129, "grad_norm": 2.25, "learning_rate": 2.433821741119482e-06, "loss": 0.95615005, "memory(GiB)": 369.42, "step": 54240, "train_speed(iter/s)": 0.200539 }, { "acc": 0.73953314, "epoch": 1.376078132927448, "grad_norm": 2.296875, "learning_rate": 2.432921818669784e-06, "loss": 0.97915516, "memory(GiB)": 369.42, "step": 54245, "train_speed(iter/s)": 0.200541 }, { "acc": 0.75766187, "epoch": 1.376204972095383, "grad_norm": 2.140625, "learning_rate": 2.432022009131646e-06, "loss": 0.96579561, "memory(GiB)": 369.42, "step": 54250, "train_speed(iter/s)": 0.200543 }, { "acc": 0.74233885, "epoch": 1.376331811263318, "grad_norm": 2.1875, "learning_rate": 2.4311223125446447e-06, "loss": 1.0557827, "memory(GiB)": 369.42, "step": 54255, "train_speed(iter/s)": 0.200547 }, { "acc": 0.73867989, "epoch": 1.3764586504312533, "grad_norm": 2.0, "learning_rate": 2.4302227289483537e-06, "loss": 1.07078886, "memory(GiB)": 369.42, "step": 54260, "train_speed(iter/s)": 0.200549 }, { "acc": 0.7581687, "epoch": 1.3765854895991883, "grad_norm": 2.296875, "learning_rate": 2.42932325838234e-06, "loss": 1.01248522, "memory(GiB)": 369.42, "step": 54265, "train_speed(iter/s)": 0.200552 }, { "acc": 0.75109196, "epoch": 1.3767123287671232, "grad_norm": 2.15625, "learning_rate": 2.4284239008861665e-06, "loss": 0.99330692, "memory(GiB)": 369.42, "step": 54270, "train_speed(iter/s)": 0.200556 }, { "acc": 0.75187769, "epoch": 1.3768391679350582, "grad_norm": 2.28125, "learning_rate": 2.4275246564993917e-06, "loss": 1.00931549, "memory(GiB)": 369.42, "step": 54275, "train_speed(iter/s)": 0.200557 }, { "acc": 0.76472039, "epoch": 1.3769660071029934, "grad_norm": 2.171875, "learning_rate": 2.426625525261567e-06, "loss": 0.92604179, "memory(GiB)": 369.42, "step": 54280, "train_speed(iter/s)": 0.200557 }, { "acc": 0.75326338, "epoch": 1.3770928462709284, "grad_norm": 1.6484375, "learning_rate": 2.425726507212242e-06, "loss": 0.98229847, "memory(GiB)": 369.42, "step": 54285, "train_speed(iter/s)": 0.20056 }, { "acc": 0.75423899, "epoch": 1.3772196854388636, "grad_norm": 2.0, "learning_rate": 2.424827602390958e-06, "loss": 0.99380484, "memory(GiB)": 369.42, "step": 54290, "train_speed(iter/s)": 0.200564 }, { "acc": 0.74935102, "epoch": 1.3773465246067986, "grad_norm": 1.9921875, "learning_rate": 2.4239288108372534e-06, "loss": 0.98878384, "memory(GiB)": 369.42, "step": 54295, "train_speed(iter/s)": 0.200565 }, { "acc": 0.74778652, "epoch": 1.3774733637747336, "grad_norm": 2.59375, "learning_rate": 2.4230301325906606e-06, "loss": 0.99971333, "memory(GiB)": 369.42, "step": 54300, "train_speed(iter/s)": 0.200569 }, { "acc": 0.74757152, "epoch": 1.3776002029426686, "grad_norm": 1.984375, "learning_rate": 2.4221315676907066e-06, "loss": 1.03179417, "memory(GiB)": 369.42, "step": 54305, "train_speed(iter/s)": 0.200572 }, { "acc": 0.74452791, "epoch": 1.3777270421106038, "grad_norm": 2.25, "learning_rate": 2.4212331161769194e-06, "loss": 1.02822533, "memory(GiB)": 369.42, "step": 54310, "train_speed(iter/s)": 0.200576 }, { "acc": 0.74865694, "epoch": 1.3778538812785388, "grad_norm": 2.03125, "learning_rate": 2.420334778088811e-06, "loss": 0.97971096, "memory(GiB)": 369.42, "step": 54315, "train_speed(iter/s)": 0.200577 }, { "acc": 0.75418243, "epoch": 1.3779807204464738, "grad_norm": 2.0625, "learning_rate": 2.4194365534658944e-06, "loss": 1.00289793, "memory(GiB)": 369.42, "step": 54320, "train_speed(iter/s)": 0.20058 }, { "acc": 0.74451385, "epoch": 1.378107559614409, "grad_norm": 2.0625, "learning_rate": 2.4185384423476817e-06, "loss": 1.05839262, "memory(GiB)": 369.42, "step": 54325, "train_speed(iter/s)": 0.200583 }, { "acc": 0.7447444, "epoch": 1.378234398782344, "grad_norm": 2.1875, "learning_rate": 2.4176404447736758e-06, "loss": 1.02666073, "memory(GiB)": 369.42, "step": 54330, "train_speed(iter/s)": 0.200585 }, { "acc": 0.74978008, "epoch": 1.378361237950279, "grad_norm": 2.5, "learning_rate": 2.41674256078337e-06, "loss": 1.01042461, "memory(GiB)": 369.42, "step": 54335, "train_speed(iter/s)": 0.200587 }, { "acc": 0.74304452, "epoch": 1.378488077118214, "grad_norm": 2.234375, "learning_rate": 2.4158447904162585e-06, "loss": 1.05492268, "memory(GiB)": 369.42, "step": 54340, "train_speed(iter/s)": 0.20059 }, { "acc": 0.75029263, "epoch": 1.3786149162861492, "grad_norm": 2.25, "learning_rate": 2.414947133711832e-06, "loss": 1.01188583, "memory(GiB)": 369.42, "step": 54345, "train_speed(iter/s)": 0.200593 }, { "acc": 0.74501395, "epoch": 1.3787417554540842, "grad_norm": 2.046875, "learning_rate": 2.414049590709574e-06, "loss": 0.99802513, "memory(GiB)": 369.42, "step": 54350, "train_speed(iter/s)": 0.200595 }, { "acc": 0.7348011, "epoch": 1.3788685946220194, "grad_norm": 2.109375, "learning_rate": 2.4131521614489567e-06, "loss": 1.02812138, "memory(GiB)": 369.42, "step": 54355, "train_speed(iter/s)": 0.200599 }, { "acc": 0.7596446, "epoch": 1.3789954337899544, "grad_norm": 1.953125, "learning_rate": 2.412254845969459e-06, "loss": 0.9416153, "memory(GiB)": 369.42, "step": 54360, "train_speed(iter/s)": 0.200602 }, { "acc": 0.76332235, "epoch": 1.3791222729578894, "grad_norm": 1.8359375, "learning_rate": 2.4113576443105464e-06, "loss": 0.9557272, "memory(GiB)": 369.42, "step": 54365, "train_speed(iter/s)": 0.200605 }, { "acc": 0.74562545, "epoch": 1.3792491121258244, "grad_norm": 2.109375, "learning_rate": 2.410460556511684e-06, "loss": 0.94933586, "memory(GiB)": 369.42, "step": 54370, "train_speed(iter/s)": 0.200607 }, { "acc": 0.74712534, "epoch": 1.3793759512937596, "grad_norm": 2.125, "learning_rate": 2.4095635826123235e-06, "loss": 0.96145515, "memory(GiB)": 369.42, "step": 54375, "train_speed(iter/s)": 0.20061 }, { "acc": 0.75110397, "epoch": 1.3795027904616946, "grad_norm": 2.03125, "learning_rate": 2.4086667226519245e-06, "loss": 0.99198093, "memory(GiB)": 369.42, "step": 54380, "train_speed(iter/s)": 0.200613 }, { "acc": 0.74813313, "epoch": 1.3796296296296298, "grad_norm": 2.375, "learning_rate": 2.4077699766699323e-06, "loss": 1.01391678, "memory(GiB)": 369.42, "step": 54385, "train_speed(iter/s)": 0.200616 }, { "acc": 0.76622314, "epoch": 1.3797564687975648, "grad_norm": 1.859375, "learning_rate": 2.4068733447057903e-06, "loss": 0.95050812, "memory(GiB)": 369.42, "step": 54390, "train_speed(iter/s)": 0.20062 }, { "acc": 0.73433323, "epoch": 1.3798833079654997, "grad_norm": 2.140625, "learning_rate": 2.405976826798936e-06, "loss": 1.05225449, "memory(GiB)": 369.42, "step": 54395, "train_speed(iter/s)": 0.200623 }, { "acc": 0.76356683, "epoch": 1.3800101471334347, "grad_norm": 1.8671875, "learning_rate": 2.405080422988802e-06, "loss": 0.9892333, "memory(GiB)": 369.42, "step": 54400, "train_speed(iter/s)": 0.200625 }, { "acc": 0.74736381, "epoch": 1.38013698630137, "grad_norm": 2.09375, "learning_rate": 2.404184133314817e-06, "loss": 1.05205936, "memory(GiB)": 369.42, "step": 54405, "train_speed(iter/s)": 0.200628 }, { "acc": 0.75420771, "epoch": 1.380263825469305, "grad_norm": 2.375, "learning_rate": 2.4032879578164027e-06, "loss": 1.00132904, "memory(GiB)": 369.42, "step": 54410, "train_speed(iter/s)": 0.200629 }, { "acc": 0.75337386, "epoch": 1.38039066463724, "grad_norm": 2.21875, "learning_rate": 2.402391896532978e-06, "loss": 0.97720804, "memory(GiB)": 369.42, "step": 54415, "train_speed(iter/s)": 0.200631 }, { "acc": 0.73475351, "epoch": 1.3805175038051751, "grad_norm": 2.125, "learning_rate": 2.4014959495039548e-06, "loss": 1.07849112, "memory(GiB)": 369.42, "step": 54420, "train_speed(iter/s)": 0.200633 }, { "acc": 0.75252542, "epoch": 1.3806443429731101, "grad_norm": 2.078125, "learning_rate": 2.4006001167687416e-06, "loss": 0.99708633, "memory(GiB)": 369.42, "step": 54425, "train_speed(iter/s)": 0.200635 }, { "acc": 0.74804292, "epoch": 1.380771182141045, "grad_norm": 1.9921875, "learning_rate": 2.39970439836674e-06, "loss": 0.97713671, "memory(GiB)": 369.42, "step": 54430, "train_speed(iter/s)": 0.200638 }, { "acc": 0.74302678, "epoch": 1.38089802130898, "grad_norm": 2.015625, "learning_rate": 2.3988087943373497e-06, "loss": 1.04001694, "memory(GiB)": 369.42, "step": 54435, "train_speed(iter/s)": 0.200641 }, { "acc": 0.7571516, "epoch": 1.3810248604769153, "grad_norm": 1.984375, "learning_rate": 2.397913304719961e-06, "loss": 0.9407629, "memory(GiB)": 369.42, "step": 54440, "train_speed(iter/s)": 0.200644 }, { "acc": 0.75638161, "epoch": 1.3811516996448503, "grad_norm": 2.359375, "learning_rate": 2.397017929553961e-06, "loss": 0.94913807, "memory(GiB)": 369.42, "step": 54445, "train_speed(iter/s)": 0.200647 }, { "acc": 0.75939445, "epoch": 1.3812785388127855, "grad_norm": 2.140625, "learning_rate": 2.396122668878738e-06, "loss": 0.98420467, "memory(GiB)": 369.42, "step": 54450, "train_speed(iter/s)": 0.200651 }, { "acc": 0.73381901, "epoch": 1.3814053779807205, "grad_norm": 2.21875, "learning_rate": 2.3952275227336636e-06, "loss": 1.08585472, "memory(GiB)": 369.42, "step": 54455, "train_speed(iter/s)": 0.200654 }, { "acc": 0.75285053, "epoch": 1.3815322171486555, "grad_norm": 2.171875, "learning_rate": 2.3943324911581117e-06, "loss": 0.95551739, "memory(GiB)": 369.42, "step": 54460, "train_speed(iter/s)": 0.200657 }, { "acc": 0.75629072, "epoch": 1.3816590563165905, "grad_norm": 2.046875, "learning_rate": 2.393437574191449e-06, "loss": 0.96345491, "memory(GiB)": 369.42, "step": 54465, "train_speed(iter/s)": 0.200659 }, { "acc": 0.75367217, "epoch": 1.3817858954845257, "grad_norm": 2.09375, "learning_rate": 2.3925427718730426e-06, "loss": 0.96899357, "memory(GiB)": 369.42, "step": 54470, "train_speed(iter/s)": 0.200662 }, { "acc": 0.76422138, "epoch": 1.3819127346524607, "grad_norm": 2.34375, "learning_rate": 2.391648084242245e-06, "loss": 0.93145657, "memory(GiB)": 369.42, "step": 54475, "train_speed(iter/s)": 0.200664 }, { "acc": 0.74838028, "epoch": 1.3820395738203957, "grad_norm": 2.078125, "learning_rate": 2.3907535113384084e-06, "loss": 1.07544174, "memory(GiB)": 369.42, "step": 54480, "train_speed(iter/s)": 0.200666 }, { "acc": 0.74724765, "epoch": 1.3821664129883309, "grad_norm": 2.265625, "learning_rate": 2.389859053200883e-06, "loss": 0.99062653, "memory(GiB)": 369.42, "step": 54485, "train_speed(iter/s)": 0.200668 }, { "acc": 0.75368004, "epoch": 1.3822932521562659, "grad_norm": 2.1875, "learning_rate": 2.3889647098690127e-06, "loss": 1.00345421, "memory(GiB)": 369.42, "step": 54490, "train_speed(iter/s)": 0.20067 }, { "acc": 0.75394192, "epoch": 1.3824200913242009, "grad_norm": 2.0625, "learning_rate": 2.3880704813821275e-06, "loss": 0.98627977, "memory(GiB)": 369.42, "step": 54495, "train_speed(iter/s)": 0.200673 }, { "acc": 0.74635844, "epoch": 1.3825469304921358, "grad_norm": 1.9609375, "learning_rate": 2.3871763677795656e-06, "loss": 1.03111706, "memory(GiB)": 369.42, "step": 54500, "train_speed(iter/s)": 0.200676 }, { "acc": 0.75408759, "epoch": 1.382673769660071, "grad_norm": 2.328125, "learning_rate": 2.386282369100653e-06, "loss": 0.93707848, "memory(GiB)": 369.42, "step": 54505, "train_speed(iter/s)": 0.20068 }, { "acc": 0.75371523, "epoch": 1.382800608828006, "grad_norm": 2.15625, "learning_rate": 2.385388485384713e-06, "loss": 1.03901291, "memory(GiB)": 369.42, "step": 54510, "train_speed(iter/s)": 0.200684 }, { "acc": 0.76988888, "epoch": 1.3829274479959413, "grad_norm": 2.09375, "learning_rate": 2.384494716671057e-06, "loss": 0.96784544, "memory(GiB)": 369.42, "step": 54515, "train_speed(iter/s)": 0.200684 }, { "acc": 0.77149038, "epoch": 1.3830542871638762, "grad_norm": 2.265625, "learning_rate": 2.3836010629990027e-06, "loss": 0.90793343, "memory(GiB)": 369.42, "step": 54520, "train_speed(iter/s)": 0.200688 }, { "acc": 0.75370893, "epoch": 1.3831811263318112, "grad_norm": 2.421875, "learning_rate": 2.382707524407855e-06, "loss": 0.98695002, "memory(GiB)": 369.42, "step": 54525, "train_speed(iter/s)": 0.20069 }, { "acc": 0.74900451, "epoch": 1.3833079654997462, "grad_norm": 2.109375, "learning_rate": 2.3818141009369155e-06, "loss": 0.9959033, "memory(GiB)": 369.42, "step": 54530, "train_speed(iter/s)": 0.200694 }, { "acc": 0.75953388, "epoch": 1.3834348046676814, "grad_norm": 1.9296875, "learning_rate": 2.3809207926254813e-06, "loss": 1.0085825, "memory(GiB)": 369.42, "step": 54535, "train_speed(iter/s)": 0.200697 }, { "acc": 0.76194673, "epoch": 1.3835616438356164, "grad_norm": 2.125, "learning_rate": 2.380027599512844e-06, "loss": 0.96320858, "memory(GiB)": 369.42, "step": 54540, "train_speed(iter/s)": 0.200701 }, { "acc": 0.74547987, "epoch": 1.3836884830035516, "grad_norm": 2.53125, "learning_rate": 2.3791345216382906e-06, "loss": 0.98370638, "memory(GiB)": 369.42, "step": 54545, "train_speed(iter/s)": 0.200704 }, { "acc": 0.7419611, "epoch": 1.3838153221714866, "grad_norm": 2.3125, "learning_rate": 2.378241559041102e-06, "loss": 1.02246504, "memory(GiB)": 369.42, "step": 54550, "train_speed(iter/s)": 0.200707 }, { "acc": 0.7458704, "epoch": 1.3839421613394216, "grad_norm": 2.3125, "learning_rate": 2.377348711760555e-06, "loss": 1.04750767, "memory(GiB)": 369.42, "step": 54555, "train_speed(iter/s)": 0.20071 }, { "acc": 0.75232916, "epoch": 1.3840690005073566, "grad_norm": 2.296875, "learning_rate": 2.3764559798359204e-06, "loss": 0.9839119, "memory(GiB)": 369.42, "step": 54560, "train_speed(iter/s)": 0.200714 }, { "acc": 0.75568275, "epoch": 1.3841958396752918, "grad_norm": 1.9765625, "learning_rate": 2.3755633633064658e-06, "loss": 0.94458408, "memory(GiB)": 369.42, "step": 54565, "train_speed(iter/s)": 0.200715 }, { "acc": 0.73411322, "epoch": 1.3843226788432268, "grad_norm": 1.828125, "learning_rate": 2.374670862211451e-06, "loss": 1.06939602, "memory(GiB)": 369.42, "step": 54570, "train_speed(iter/s)": 0.200719 }, { "acc": 0.7616662, "epoch": 1.3844495180111618, "grad_norm": 2.3125, "learning_rate": 2.373778476590134e-06, "loss": 0.94277172, "memory(GiB)": 369.42, "step": 54575, "train_speed(iter/s)": 0.200723 }, { "acc": 0.75943289, "epoch": 1.384576357179097, "grad_norm": 2.109375, "learning_rate": 2.372886206481764e-06, "loss": 0.95413189, "memory(GiB)": 369.42, "step": 54580, "train_speed(iter/s)": 0.200725 }, { "acc": 0.74116936, "epoch": 1.384703196347032, "grad_norm": 1.6796875, "learning_rate": 2.3719940519255864e-06, "loss": 1.00098085, "memory(GiB)": 369.42, "step": 54585, "train_speed(iter/s)": 0.200728 }, { "acc": 0.75063534, "epoch": 1.384830035514967, "grad_norm": 2.234375, "learning_rate": 2.371102012960847e-06, "loss": 1.03484402, "memory(GiB)": 369.42, "step": 54590, "train_speed(iter/s)": 0.200731 }, { "acc": 0.75862446, "epoch": 1.384956874682902, "grad_norm": 2.96875, "learning_rate": 2.3702100896267767e-06, "loss": 0.94700174, "memory(GiB)": 369.42, "step": 54595, "train_speed(iter/s)": 0.200734 }, { "acc": 0.76177483, "epoch": 1.3850837138508372, "grad_norm": 1.7421875, "learning_rate": 2.3693182819626077e-06, "loss": 0.99908772, "memory(GiB)": 369.42, "step": 54600, "train_speed(iter/s)": 0.200736 }, { "acc": 0.74970045, "epoch": 1.3852105530187722, "grad_norm": 1.984375, "learning_rate": 2.3684265900075637e-06, "loss": 0.95953245, "memory(GiB)": 369.42, "step": 54605, "train_speed(iter/s)": 0.200739 }, { "acc": 0.75453205, "epoch": 1.3853373921867074, "grad_norm": 2.03125, "learning_rate": 2.3675350138008714e-06, "loss": 0.9999052, "memory(GiB)": 369.42, "step": 54610, "train_speed(iter/s)": 0.200742 }, { "acc": 0.75701299, "epoch": 1.3854642313546424, "grad_norm": 2.09375, "learning_rate": 2.3666435533817406e-06, "loss": 0.99862213, "memory(GiB)": 369.42, "step": 54615, "train_speed(iter/s)": 0.200741 }, { "acc": 0.75596275, "epoch": 1.3855910705225774, "grad_norm": 2.625, "learning_rate": 2.3657522087893806e-06, "loss": 0.98601665, "memory(GiB)": 369.42, "step": 54620, "train_speed(iter/s)": 0.200744 }, { "acc": 0.75673456, "epoch": 1.3857179096905123, "grad_norm": 1.90625, "learning_rate": 2.3648609800630022e-06, "loss": 0.98742647, "memory(GiB)": 369.42, "step": 54625, "train_speed(iter/s)": 0.200747 }, { "acc": 0.75105877, "epoch": 1.3858447488584476, "grad_norm": 1.9140625, "learning_rate": 2.363969867241805e-06, "loss": 1.02008162, "memory(GiB)": 369.42, "step": 54630, "train_speed(iter/s)": 0.200751 }, { "acc": 0.73725505, "epoch": 1.3859715880263825, "grad_norm": 2.203125, "learning_rate": 2.36307887036498e-06, "loss": 1.03408813, "memory(GiB)": 369.42, "step": 54635, "train_speed(iter/s)": 0.200753 }, { "acc": 0.75259924, "epoch": 1.3860984271943175, "grad_norm": 2.078125, "learning_rate": 2.3621879894717177e-06, "loss": 0.9539423, "memory(GiB)": 369.42, "step": 54640, "train_speed(iter/s)": 0.200755 }, { "acc": 0.77405024, "epoch": 1.3862252663622527, "grad_norm": 2.296875, "learning_rate": 2.361297224601206e-06, "loss": 0.93062611, "memory(GiB)": 369.42, "step": 54645, "train_speed(iter/s)": 0.200758 }, { "acc": 0.76653271, "epoch": 1.3863521055301877, "grad_norm": 2.1875, "learning_rate": 2.360406575792625e-06, "loss": 0.94958544, "memory(GiB)": 369.42, "step": 54650, "train_speed(iter/s)": 0.20076 }, { "acc": 0.76093321, "epoch": 1.3864789446981227, "grad_norm": 1.90625, "learning_rate": 2.3595160430851445e-06, "loss": 0.9388422, "memory(GiB)": 369.42, "step": 54655, "train_speed(iter/s)": 0.200763 }, { "acc": 0.74434843, "epoch": 1.3866057838660577, "grad_norm": 2.546875, "learning_rate": 2.3586256265179392e-06, "loss": 0.99974461, "memory(GiB)": 369.42, "step": 54660, "train_speed(iter/s)": 0.200767 }, { "acc": 0.7555356, "epoch": 1.386732623033993, "grad_norm": 2.453125, "learning_rate": 2.3577353261301715e-06, "loss": 0.92930164, "memory(GiB)": 369.42, "step": 54665, "train_speed(iter/s)": 0.200762 }, { "acc": 0.75050168, "epoch": 1.386859462201928, "grad_norm": 2.4375, "learning_rate": 2.356845141961001e-06, "loss": 1.04247961, "memory(GiB)": 369.42, "step": 54670, "train_speed(iter/s)": 0.200764 }, { "acc": 0.74864311, "epoch": 1.3869863013698631, "grad_norm": 2.234375, "learning_rate": 2.355955074049582e-06, "loss": 0.95129375, "memory(GiB)": 369.42, "step": 54675, "train_speed(iter/s)": 0.200768 }, { "acc": 0.75494633, "epoch": 1.387113140537798, "grad_norm": 2.171875, "learning_rate": 2.355065122435064e-06, "loss": 0.98650494, "memory(GiB)": 369.42, "step": 54680, "train_speed(iter/s)": 0.20077 }, { "acc": 0.74948215, "epoch": 1.387239979705733, "grad_norm": 2.296875, "learning_rate": 2.3541752871565902e-06, "loss": 0.98578653, "memory(GiB)": 369.42, "step": 54685, "train_speed(iter/s)": 0.200773 }, { "acc": 0.75769615, "epoch": 1.387366818873668, "grad_norm": 1.796875, "learning_rate": 2.3532855682533003e-06, "loss": 0.97114754, "memory(GiB)": 369.42, "step": 54690, "train_speed(iter/s)": 0.200775 }, { "acc": 0.75458741, "epoch": 1.3874936580416033, "grad_norm": 2.40625, "learning_rate": 2.352395965764328e-06, "loss": 0.99544544, "memory(GiB)": 369.42, "step": 54695, "train_speed(iter/s)": 0.200779 }, { "acc": 0.74769373, "epoch": 1.3876204972095383, "grad_norm": 1.96875, "learning_rate": 2.3515064797288013e-06, "loss": 0.97921314, "memory(GiB)": 369.42, "step": 54700, "train_speed(iter/s)": 0.200782 }, { "acc": 0.74908943, "epoch": 1.3877473363774735, "grad_norm": 1.859375, "learning_rate": 2.350617110185845e-06, "loss": 1.02831841, "memory(GiB)": 369.42, "step": 54705, "train_speed(iter/s)": 0.200783 }, { "acc": 0.76285782, "epoch": 1.3878741755454085, "grad_norm": 1.9453125, "learning_rate": 2.3497278571745763e-06, "loss": 0.95420094, "memory(GiB)": 369.42, "step": 54710, "train_speed(iter/s)": 0.200787 }, { "acc": 0.75867605, "epoch": 1.3880010147133435, "grad_norm": 2.140625, "learning_rate": 2.348838720734109e-06, "loss": 0.93149929, "memory(GiB)": 369.42, "step": 54715, "train_speed(iter/s)": 0.20079 }, { "acc": 0.76399965, "epoch": 1.3881278538812785, "grad_norm": 2.328125, "learning_rate": 2.347949700903552e-06, "loss": 0.9960022, "memory(GiB)": 369.42, "step": 54720, "train_speed(iter/s)": 0.200792 }, { "acc": 0.74583216, "epoch": 1.3882546930492137, "grad_norm": 2.234375, "learning_rate": 2.3470607977220066e-06, "loss": 0.96257801, "memory(GiB)": 369.42, "step": 54725, "train_speed(iter/s)": 0.200795 }, { "acc": 0.75448809, "epoch": 1.3883815322171487, "grad_norm": 1.984375, "learning_rate": 2.346172011228573e-06, "loss": 0.99789238, "memory(GiB)": 369.42, "step": 54730, "train_speed(iter/s)": 0.200798 }, { "acc": 0.76856794, "epoch": 1.3885083713850837, "grad_norm": 1.9375, "learning_rate": 2.345283341462342e-06, "loss": 0.9311491, "memory(GiB)": 369.42, "step": 54735, "train_speed(iter/s)": 0.200801 }, { "acc": 0.74092894, "epoch": 1.3886352105530189, "grad_norm": 2.171875, "learning_rate": 2.3443947884624026e-06, "loss": 1.04284859, "memory(GiB)": 369.42, "step": 54740, "train_speed(iter/s)": 0.200803 }, { "acc": 0.75276127, "epoch": 1.3887620497209539, "grad_norm": 2.21875, "learning_rate": 2.3435063522678346e-06, "loss": 0.95211716, "memory(GiB)": 369.42, "step": 54745, "train_speed(iter/s)": 0.200806 }, { "acc": 0.74642916, "epoch": 1.3888888888888888, "grad_norm": 1.875, "learning_rate": 2.3426180329177217e-06, "loss": 0.98652735, "memory(GiB)": 369.42, "step": 54750, "train_speed(iter/s)": 0.200807 }, { "acc": 0.7440815, "epoch": 1.3890157280568238, "grad_norm": 2.046875, "learning_rate": 2.3417298304511297e-06, "loss": 0.99599361, "memory(GiB)": 369.42, "step": 54755, "train_speed(iter/s)": 0.20081 }, { "acc": 0.737784, "epoch": 1.389142567224759, "grad_norm": 2.265625, "learning_rate": 2.340841744907127e-06, "loss": 1.06873856, "memory(GiB)": 369.42, "step": 54760, "train_speed(iter/s)": 0.200814 }, { "acc": 0.75822449, "epoch": 1.389269406392694, "grad_norm": 2.3125, "learning_rate": 2.3399537763247783e-06, "loss": 0.96012774, "memory(GiB)": 369.42, "step": 54765, "train_speed(iter/s)": 0.200818 }, { "acc": 0.75144033, "epoch": 1.3893962455606292, "grad_norm": 1.9921875, "learning_rate": 2.3390659247431404e-06, "loss": 0.98363724, "memory(GiB)": 369.42, "step": 54770, "train_speed(iter/s)": 0.200821 }, { "acc": 0.76045532, "epoch": 1.3895230847285642, "grad_norm": 2.4375, "learning_rate": 2.338178190201261e-06, "loss": 0.94370823, "memory(GiB)": 369.42, "step": 54775, "train_speed(iter/s)": 0.200822 }, { "acc": 0.7557972, "epoch": 1.3896499238964992, "grad_norm": 2.125, "learning_rate": 2.3372905727381877e-06, "loss": 0.97226276, "memory(GiB)": 369.42, "step": 54780, "train_speed(iter/s)": 0.200825 }, { "acc": 0.75726357, "epoch": 1.3897767630644342, "grad_norm": 2.109375, "learning_rate": 2.3364030723929647e-06, "loss": 0.93285561, "memory(GiB)": 369.42, "step": 54785, "train_speed(iter/s)": 0.200827 }, { "acc": 0.76718016, "epoch": 1.3899036022323694, "grad_norm": 2.1875, "learning_rate": 2.335515689204629e-06, "loss": 0.94396343, "memory(GiB)": 369.42, "step": 54790, "train_speed(iter/s)": 0.20083 }, { "acc": 0.75779982, "epoch": 1.3900304414003044, "grad_norm": 2.234375, "learning_rate": 2.334628423212206e-06, "loss": 0.98195438, "memory(GiB)": 369.42, "step": 54795, "train_speed(iter/s)": 0.20083 }, { "acc": 0.75296612, "epoch": 1.3901572805682394, "grad_norm": 2.4375, "learning_rate": 2.3337412744547256e-06, "loss": 0.98675404, "memory(GiB)": 369.42, "step": 54800, "train_speed(iter/s)": 0.200833 }, { "acc": 0.76305656, "epoch": 1.3902841197361746, "grad_norm": 2.109375, "learning_rate": 2.332854242971209e-06, "loss": 0.96858749, "memory(GiB)": 369.42, "step": 54805, "train_speed(iter/s)": 0.200836 }, { "acc": 0.74930959, "epoch": 1.3904109589041096, "grad_norm": 2.953125, "learning_rate": 2.331967328800672e-06, "loss": 0.98185787, "memory(GiB)": 369.42, "step": 54810, "train_speed(iter/s)": 0.200839 }, { "acc": 0.75868626, "epoch": 1.3905377980720446, "grad_norm": 2.34375, "learning_rate": 2.33108053198212e-06, "loss": 0.91900873, "memory(GiB)": 369.42, "step": 54815, "train_speed(iter/s)": 0.20084 }, { "acc": 0.74936485, "epoch": 1.3906646372399796, "grad_norm": 2.25, "learning_rate": 2.330193852554564e-06, "loss": 0.97512074, "memory(GiB)": 369.42, "step": 54820, "train_speed(iter/s)": 0.200842 }, { "acc": 0.75388994, "epoch": 1.3907914764079148, "grad_norm": 2.703125, "learning_rate": 2.3293072905570024e-06, "loss": 0.99163685, "memory(GiB)": 369.42, "step": 54825, "train_speed(iter/s)": 0.200844 }, { "acc": 0.75337267, "epoch": 1.3909183155758498, "grad_norm": 1.7578125, "learning_rate": 2.3284208460284303e-06, "loss": 1.01475792, "memory(GiB)": 369.42, "step": 54830, "train_speed(iter/s)": 0.200845 }, { "acc": 0.76548071, "epoch": 1.391045154743785, "grad_norm": 2.59375, "learning_rate": 2.3275345190078364e-06, "loss": 1.01750145, "memory(GiB)": 369.42, "step": 54835, "train_speed(iter/s)": 0.200849 }, { "acc": 0.73765373, "epoch": 1.39117199391172, "grad_norm": 1.984375, "learning_rate": 2.3266483095342064e-06, "loss": 1.02872458, "memory(GiB)": 369.42, "step": 54840, "train_speed(iter/s)": 0.200852 }, { "acc": 0.75994701, "epoch": 1.391298833079655, "grad_norm": 2.171875, "learning_rate": 2.3257622176465194e-06, "loss": 0.96171589, "memory(GiB)": 369.42, "step": 54845, "train_speed(iter/s)": 0.200854 }, { "acc": 0.74422517, "epoch": 1.39142567224759, "grad_norm": 2.5625, "learning_rate": 2.3248762433837494e-06, "loss": 1.04084339, "memory(GiB)": 369.42, "step": 54850, "train_speed(iter/s)": 0.200857 }, { "acc": 0.74709835, "epoch": 1.3915525114155252, "grad_norm": 1.9609375, "learning_rate": 2.323990386784867e-06, "loss": 0.97708712, "memory(GiB)": 369.42, "step": 54855, "train_speed(iter/s)": 0.20086 }, { "acc": 0.75496349, "epoch": 1.3916793505834602, "grad_norm": 2.40625, "learning_rate": 2.3231046478888335e-06, "loss": 0.98851509, "memory(GiB)": 369.42, "step": 54860, "train_speed(iter/s)": 0.200864 }, { "acc": 0.75483918, "epoch": 1.3918061897513954, "grad_norm": 2.359375, "learning_rate": 2.3222190267346094e-06, "loss": 0.96693134, "memory(GiB)": 369.42, "step": 54865, "train_speed(iter/s)": 0.200868 }, { "acc": 0.75403571, "epoch": 1.3919330289193304, "grad_norm": 2.359375, "learning_rate": 2.3213335233611484e-06, "loss": 0.95358372, "memory(GiB)": 369.42, "step": 54870, "train_speed(iter/s)": 0.200871 }, { "acc": 0.75003052, "epoch": 1.3920598680872653, "grad_norm": 2.609375, "learning_rate": 2.320448137807398e-06, "loss": 0.97341537, "memory(GiB)": 369.42, "step": 54875, "train_speed(iter/s)": 0.200874 }, { "acc": 0.74403124, "epoch": 1.3921867072552003, "grad_norm": 2.046875, "learning_rate": 2.3195628701123017e-06, "loss": 1.02543535, "memory(GiB)": 369.42, "step": 54880, "train_speed(iter/s)": 0.200878 }, { "acc": 0.75291348, "epoch": 1.3923135464231355, "grad_norm": 2.078125, "learning_rate": 2.3186777203147964e-06, "loss": 0.97260189, "memory(GiB)": 369.42, "step": 54885, "train_speed(iter/s)": 0.200878 }, { "acc": 0.75203266, "epoch": 1.3924403855910705, "grad_norm": 2.03125, "learning_rate": 2.3177926884538193e-06, "loss": 1.03132095, "memory(GiB)": 369.42, "step": 54890, "train_speed(iter/s)": 0.200881 }, { "acc": 0.75382338, "epoch": 1.3925672247590055, "grad_norm": 2.40625, "learning_rate": 2.3169077745682933e-06, "loss": 0.96502142, "memory(GiB)": 369.42, "step": 54895, "train_speed(iter/s)": 0.200885 }, { "acc": 0.75238314, "epoch": 1.3926940639269407, "grad_norm": 2.375, "learning_rate": 2.316022978697143e-06, "loss": 0.97875452, "memory(GiB)": 369.42, "step": 54900, "train_speed(iter/s)": 0.200887 }, { "acc": 0.75309572, "epoch": 1.3928209030948757, "grad_norm": 1.875, "learning_rate": 2.3151383008792826e-06, "loss": 0.9937294, "memory(GiB)": 369.42, "step": 54905, "train_speed(iter/s)": 0.20089 }, { "acc": 0.75221667, "epoch": 1.3929477422628107, "grad_norm": 2.28125, "learning_rate": 2.314253741153631e-06, "loss": 0.98100119, "memory(GiB)": 369.42, "step": 54910, "train_speed(iter/s)": 0.200892 }, { "acc": 0.7489439, "epoch": 1.3930745814307457, "grad_norm": 2.046875, "learning_rate": 2.313369299559088e-06, "loss": 0.92350655, "memory(GiB)": 369.42, "step": 54915, "train_speed(iter/s)": 0.200894 }, { "acc": 0.76265717, "epoch": 1.393201420598681, "grad_norm": 1.90625, "learning_rate": 2.3124849761345576e-06, "loss": 0.87913132, "memory(GiB)": 369.42, "step": 54920, "train_speed(iter/s)": 0.200897 }, { "acc": 0.7601994, "epoch": 1.393328259766616, "grad_norm": 1.84375, "learning_rate": 2.311600770918938e-06, "loss": 0.96985855, "memory(GiB)": 369.42, "step": 54925, "train_speed(iter/s)": 0.200901 }, { "acc": 0.76144018, "epoch": 1.393455098934551, "grad_norm": 1.921875, "learning_rate": 2.310716683951122e-06, "loss": 0.96706228, "memory(GiB)": 369.42, "step": 54930, "train_speed(iter/s)": 0.200902 }, { "acc": 0.75607271, "epoch": 1.393581938102486, "grad_norm": 2.734375, "learning_rate": 2.3098327152699884e-06, "loss": 1.03327827, "memory(GiB)": 369.42, "step": 54935, "train_speed(iter/s)": 0.200907 }, { "acc": 0.75730238, "epoch": 1.393708777270421, "grad_norm": 1.984375, "learning_rate": 2.308948864914425e-06, "loss": 0.98852654, "memory(GiB)": 369.42, "step": 54940, "train_speed(iter/s)": 0.200909 }, { "acc": 0.74447527, "epoch": 1.393835616438356, "grad_norm": 2.28125, "learning_rate": 2.308065132923305e-06, "loss": 1.02359905, "memory(GiB)": 369.42, "step": 54945, "train_speed(iter/s)": 0.200912 }, { "acc": 0.75464454, "epoch": 1.3939624556062913, "grad_norm": 2.09375, "learning_rate": 2.3071815193355005e-06, "loss": 0.93432989, "memory(GiB)": 369.42, "step": 54950, "train_speed(iter/s)": 0.200915 }, { "acc": 0.75547915, "epoch": 1.3940892947742263, "grad_norm": 2.015625, "learning_rate": 2.3062980241898725e-06, "loss": 0.97249298, "memory(GiB)": 369.42, "step": 54955, "train_speed(iter/s)": 0.200918 }, { "acc": 0.747188, "epoch": 1.3942161339421613, "grad_norm": 2.46875, "learning_rate": 2.3054146475252852e-06, "loss": 0.99501553, "memory(GiB)": 369.42, "step": 54960, "train_speed(iter/s)": 0.200922 }, { "acc": 0.76989317, "epoch": 1.3943429731100965, "grad_norm": 2.09375, "learning_rate": 2.3045313893805926e-06, "loss": 0.89393673, "memory(GiB)": 369.42, "step": 54965, "train_speed(iter/s)": 0.200926 }, { "acc": 0.74901142, "epoch": 1.3944698122780315, "grad_norm": 2.859375, "learning_rate": 2.303648249794644e-06, "loss": 1.02827702, "memory(GiB)": 369.42, "step": 54970, "train_speed(iter/s)": 0.200928 }, { "acc": 0.75716033, "epoch": 1.3945966514459665, "grad_norm": 2.03125, "learning_rate": 2.302765228806283e-06, "loss": 0.95053406, "memory(GiB)": 369.42, "step": 54975, "train_speed(iter/s)": 0.200931 }, { "acc": 0.74886374, "epoch": 1.3947234906139014, "grad_norm": 2.25, "learning_rate": 2.30188232645435e-06, "loss": 1.04083519, "memory(GiB)": 369.42, "step": 54980, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74179034, "epoch": 1.3948503297818367, "grad_norm": 2.265625, "learning_rate": 2.300999542777678e-06, "loss": 1.02708282, "memory(GiB)": 369.42, "step": 54985, "train_speed(iter/s)": 0.200937 }, { "acc": 0.74495878, "epoch": 1.3949771689497716, "grad_norm": 2.28125, "learning_rate": 2.300116877815097e-06, "loss": 0.95411081, "memory(GiB)": 369.42, "step": 54990, "train_speed(iter/s)": 0.20094 }, { "acc": 0.74693317, "epoch": 1.3951040081177069, "grad_norm": 2.265625, "learning_rate": 2.2992343316054296e-06, "loss": 0.99067268, "memory(GiB)": 369.42, "step": 54995, "train_speed(iter/s)": 0.200943 }, { "acc": 0.75410395, "epoch": 1.3952308472856418, "grad_norm": 1.953125, "learning_rate": 2.298351904187494e-06, "loss": 0.95803242, "memory(GiB)": 369.42, "step": 55000, "train_speed(iter/s)": 0.200946 }, { "epoch": 1.3952308472856418, "eval_acc": 0.7379947252699508, "eval_loss": 0.9696020483970642, "eval_runtime": 384.5222, "eval_samples_per_second": 16.566, "eval_steps_per_second": 8.283, "step": 55000 }, { "acc": 0.75087328, "epoch": 1.3953576864535768, "grad_norm": 2.09375, "learning_rate": 2.297469595600104e-06, "loss": 0.93983564, "memory(GiB)": 369.42, "step": 55005, "train_speed(iter/s)": 0.200427 }, { "acc": 0.74331326, "epoch": 1.3954845256215118, "grad_norm": 2.125, "learning_rate": 2.2965874058820668e-06, "loss": 1.00409603, "memory(GiB)": 369.42, "step": 55010, "train_speed(iter/s)": 0.200429 }, { "acc": 0.75489287, "epoch": 1.395611364789447, "grad_norm": 2.859375, "learning_rate": 2.2957053350721857e-06, "loss": 0.96888885, "memory(GiB)": 369.42, "step": 55015, "train_speed(iter/s)": 0.200431 }, { "acc": 0.75011711, "epoch": 1.395738203957382, "grad_norm": 2.25, "learning_rate": 2.294823383209258e-06, "loss": 0.98783998, "memory(GiB)": 369.42, "step": 55020, "train_speed(iter/s)": 0.200434 }, { "acc": 0.75774803, "epoch": 1.3958650431253172, "grad_norm": 2.203125, "learning_rate": 2.2939415503320733e-06, "loss": 0.9869462, "memory(GiB)": 369.42, "step": 55025, "train_speed(iter/s)": 0.200437 }, { "acc": 0.73299589, "epoch": 1.3959918822932522, "grad_norm": 1.984375, "learning_rate": 2.293059836479425e-06, "loss": 1.04033022, "memory(GiB)": 369.42, "step": 55030, "train_speed(iter/s)": 0.200436 }, { "acc": 0.7570117, "epoch": 1.3961187214611872, "grad_norm": 1.9765625, "learning_rate": 2.2921782416900883e-06, "loss": 0.94745007, "memory(GiB)": 369.42, "step": 55035, "train_speed(iter/s)": 0.200439 }, { "acc": 0.76268549, "epoch": 1.3962455606291222, "grad_norm": 2.203125, "learning_rate": 2.2912967660028425e-06, "loss": 0.95451107, "memory(GiB)": 369.42, "step": 55040, "train_speed(iter/s)": 0.20044 }, { "acc": 0.76175361, "epoch": 1.3963723997970574, "grad_norm": 1.7109375, "learning_rate": 2.2904154094564568e-06, "loss": 0.92628908, "memory(GiB)": 369.42, "step": 55045, "train_speed(iter/s)": 0.200444 }, { "acc": 0.74744515, "epoch": 1.3964992389649924, "grad_norm": 2.234375, "learning_rate": 2.2895341720897018e-06, "loss": 0.98181171, "memory(GiB)": 369.42, "step": 55050, "train_speed(iter/s)": 0.200446 }, { "acc": 0.74080744, "epoch": 1.3966260781329274, "grad_norm": 2.421875, "learning_rate": 2.2886530539413336e-06, "loss": 1.03763428, "memory(GiB)": 369.42, "step": 55055, "train_speed(iter/s)": 0.200449 }, { "acc": 0.75235109, "epoch": 1.3967529173008626, "grad_norm": 2.09375, "learning_rate": 2.2877720550501082e-06, "loss": 1.0021431, "memory(GiB)": 369.42, "step": 55060, "train_speed(iter/s)": 0.200452 }, { "acc": 0.77099404, "epoch": 1.3968797564687976, "grad_norm": 2.375, "learning_rate": 2.2868911754547783e-06, "loss": 0.92937298, "memory(GiB)": 369.42, "step": 55065, "train_speed(iter/s)": 0.200455 }, { "acc": 0.75583725, "epoch": 1.3970065956367326, "grad_norm": 2.328125, "learning_rate": 2.28601041519409e-06, "loss": 0.9751668, "memory(GiB)": 369.42, "step": 55070, "train_speed(iter/s)": 0.200457 }, { "acc": 0.7447648, "epoch": 1.3971334348046676, "grad_norm": 2.484375, "learning_rate": 2.2851297743067786e-06, "loss": 0.99040956, "memory(GiB)": 369.42, "step": 55075, "train_speed(iter/s)": 0.20046 }, { "acc": 0.75426369, "epoch": 1.3972602739726028, "grad_norm": 2.15625, "learning_rate": 2.2842492528315784e-06, "loss": 0.93828335, "memory(GiB)": 369.42, "step": 55080, "train_speed(iter/s)": 0.200463 }, { "acc": 0.75137272, "epoch": 1.3973871131405378, "grad_norm": 1.796875, "learning_rate": 2.283368850807223e-06, "loss": 0.9442873, "memory(GiB)": 369.42, "step": 55085, "train_speed(iter/s)": 0.200466 }, { "acc": 0.76033325, "epoch": 1.397513952308473, "grad_norm": 2.21875, "learning_rate": 2.282488568272437e-06, "loss": 0.95065441, "memory(GiB)": 369.42, "step": 55090, "train_speed(iter/s)": 0.200469 }, { "acc": 0.74390068, "epoch": 1.397640791476408, "grad_norm": 2.1875, "learning_rate": 2.2816084052659326e-06, "loss": 1.01843185, "memory(GiB)": 369.42, "step": 55095, "train_speed(iter/s)": 0.200469 }, { "acc": 0.75001497, "epoch": 1.397767630644343, "grad_norm": 2.25, "learning_rate": 2.2807283618264288e-06, "loss": 1.01687069, "memory(GiB)": 369.42, "step": 55100, "train_speed(iter/s)": 0.20047 }, { "acc": 0.7580194, "epoch": 1.397894469812278, "grad_norm": 2.265625, "learning_rate": 2.2798484379926324e-06, "loss": 0.94580612, "memory(GiB)": 369.42, "step": 55105, "train_speed(iter/s)": 0.200471 }, { "acc": 0.76073103, "epoch": 1.3980213089802132, "grad_norm": 2.3125, "learning_rate": 2.278968633803246e-06, "loss": 0.93407211, "memory(GiB)": 369.42, "step": 55110, "train_speed(iter/s)": 0.200474 }, { "acc": 0.75225544, "epoch": 1.3981481481481481, "grad_norm": 2.0625, "learning_rate": 2.2780889492969684e-06, "loss": 0.9680809, "memory(GiB)": 369.42, "step": 55115, "train_speed(iter/s)": 0.200477 }, { "acc": 0.76102209, "epoch": 1.3982749873160831, "grad_norm": 2.328125, "learning_rate": 2.277209384512491e-06, "loss": 0.93639908, "memory(GiB)": 369.42, "step": 55120, "train_speed(iter/s)": 0.200479 }, { "acc": 0.74599595, "epoch": 1.3984018264840183, "grad_norm": 2.984375, "learning_rate": 2.2763299394885013e-06, "loss": 0.98807201, "memory(GiB)": 369.42, "step": 55125, "train_speed(iter/s)": 0.200483 }, { "acc": 0.75051112, "epoch": 1.3985286656519533, "grad_norm": 2.015625, "learning_rate": 2.2754506142636808e-06, "loss": 1.01150017, "memory(GiB)": 369.42, "step": 55130, "train_speed(iter/s)": 0.200486 }, { "acc": 0.76320963, "epoch": 1.3986555048198883, "grad_norm": 2.078125, "learning_rate": 2.274571408876707e-06, "loss": 0.96957512, "memory(GiB)": 369.42, "step": 55135, "train_speed(iter/s)": 0.200489 }, { "acc": 0.76537018, "epoch": 1.3987823439878233, "grad_norm": 2.453125, "learning_rate": 2.2736923233662504e-06, "loss": 0.98872166, "memory(GiB)": 369.42, "step": 55140, "train_speed(iter/s)": 0.200491 }, { "acc": 0.75199947, "epoch": 1.3989091831557585, "grad_norm": 1.84375, "learning_rate": 2.2728133577709776e-06, "loss": 1.03076763, "memory(GiB)": 369.42, "step": 55145, "train_speed(iter/s)": 0.200494 }, { "acc": 0.75882726, "epoch": 1.3990360223236935, "grad_norm": 2.390625, "learning_rate": 2.2719345121295495e-06, "loss": 0.99601936, "memory(GiB)": 369.42, "step": 55150, "train_speed(iter/s)": 0.200497 }, { "acc": 0.73663788, "epoch": 1.3991628614916287, "grad_norm": 2.125, "learning_rate": 2.2710557864806214e-06, "loss": 1.05550709, "memory(GiB)": 369.42, "step": 55155, "train_speed(iter/s)": 0.2005 }, { "acc": 0.75864205, "epoch": 1.3992897006595637, "grad_norm": 2.3125, "learning_rate": 2.2701771808628438e-06, "loss": 0.9809124, "memory(GiB)": 369.42, "step": 55160, "train_speed(iter/s)": 0.200502 }, { "acc": 0.75346079, "epoch": 1.3994165398274987, "grad_norm": 2.09375, "learning_rate": 2.269298695314861e-06, "loss": 1.01406164, "memory(GiB)": 369.42, "step": 55165, "train_speed(iter/s)": 0.200506 }, { "acc": 0.75622883, "epoch": 1.3995433789954337, "grad_norm": 2.140625, "learning_rate": 2.268420329875314e-06, "loss": 0.97726336, "memory(GiB)": 369.42, "step": 55170, "train_speed(iter/s)": 0.200509 }, { "acc": 0.75347867, "epoch": 1.399670218163369, "grad_norm": 2.109375, "learning_rate": 2.2675420845828363e-06, "loss": 0.9479867, "memory(GiB)": 369.42, "step": 55175, "train_speed(iter/s)": 0.200511 }, { "acc": 0.75287566, "epoch": 1.3997970573313039, "grad_norm": 2.3125, "learning_rate": 2.266663959476057e-06, "loss": 0.94258938, "memory(GiB)": 369.42, "step": 55180, "train_speed(iter/s)": 0.200514 }, { "acc": 0.75580473, "epoch": 1.399923896499239, "grad_norm": 2.359375, "learning_rate": 2.265785954593598e-06, "loss": 0.98294373, "memory(GiB)": 369.42, "step": 55185, "train_speed(iter/s)": 0.200518 }, { "acc": 0.74154673, "epoch": 1.400050735667174, "grad_norm": 2.34375, "learning_rate": 2.264908069974085e-06, "loss": 0.96910715, "memory(GiB)": 369.42, "step": 55190, "train_speed(iter/s)": 0.200522 }, { "acc": 0.76297884, "epoch": 1.400177574835109, "grad_norm": 1.75, "learning_rate": 2.2640303056561236e-06, "loss": 0.93572702, "memory(GiB)": 369.42, "step": 55195, "train_speed(iter/s)": 0.200523 }, { "acc": 0.74069729, "epoch": 1.400304414003044, "grad_norm": 3.015625, "learning_rate": 2.2631526616783234e-06, "loss": 0.96577253, "memory(GiB)": 369.42, "step": 55200, "train_speed(iter/s)": 0.200528 }, { "acc": 0.75733919, "epoch": 1.4004312531709793, "grad_norm": 2.15625, "learning_rate": 2.2622751380792896e-06, "loss": 0.98787518, "memory(GiB)": 369.42, "step": 55205, "train_speed(iter/s)": 0.200528 }, { "acc": 0.74057307, "epoch": 1.4005580923389143, "grad_norm": 2.171875, "learning_rate": 2.26139773489762e-06, "loss": 0.99208736, "memory(GiB)": 369.42, "step": 55210, "train_speed(iter/s)": 0.200531 }, { "acc": 0.75250359, "epoch": 1.4006849315068493, "grad_norm": 2.0625, "learning_rate": 2.260520452171904e-06, "loss": 0.965938, "memory(GiB)": 369.42, "step": 55215, "train_speed(iter/s)": 0.200533 }, { "acc": 0.74805603, "epoch": 1.4008117706747845, "grad_norm": 1.9609375, "learning_rate": 2.259643289940727e-06, "loss": 0.94533339, "memory(GiB)": 369.42, "step": 55220, "train_speed(iter/s)": 0.200536 }, { "acc": 0.74941969, "epoch": 1.4009386098427195, "grad_norm": 2.046875, "learning_rate": 2.2587662482426748e-06, "loss": 0.99194679, "memory(GiB)": 369.42, "step": 55225, "train_speed(iter/s)": 0.20054 }, { "acc": 0.74845428, "epoch": 1.4010654490106544, "grad_norm": 2.78125, "learning_rate": 2.2578893271163234e-06, "loss": 1.03111687, "memory(GiB)": 369.42, "step": 55230, "train_speed(iter/s)": 0.200543 }, { "acc": 0.75493526, "epoch": 1.4011922881785894, "grad_norm": 2.390625, "learning_rate": 2.2570125266002385e-06, "loss": 0.986096, "memory(GiB)": 369.42, "step": 55235, "train_speed(iter/s)": 0.200545 }, { "acc": 0.75501881, "epoch": 1.4013191273465246, "grad_norm": 2.265625, "learning_rate": 2.2561358467329907e-06, "loss": 0.97618694, "memory(GiB)": 369.42, "step": 55240, "train_speed(iter/s)": 0.200548 }, { "acc": 0.74768171, "epoch": 1.4014459665144596, "grad_norm": 2.15625, "learning_rate": 2.2552592875531397e-06, "loss": 0.98289919, "memory(GiB)": 369.42, "step": 55245, "train_speed(iter/s)": 0.200549 }, { "acc": 0.75089664, "epoch": 1.4015728056823948, "grad_norm": 2.078125, "learning_rate": 2.254382849099241e-06, "loss": 0.96329746, "memory(GiB)": 369.42, "step": 55250, "train_speed(iter/s)": 0.200552 }, { "acc": 0.75911093, "epoch": 1.4016996448503298, "grad_norm": 2.1875, "learning_rate": 2.253506531409839e-06, "loss": 0.93457222, "memory(GiB)": 369.42, "step": 55255, "train_speed(iter/s)": 0.200555 }, { "acc": 0.74600496, "epoch": 1.4018264840182648, "grad_norm": 2.4375, "learning_rate": 2.252630334523484e-06, "loss": 1.01289959, "memory(GiB)": 369.42, "step": 55260, "train_speed(iter/s)": 0.200558 }, { "acc": 0.74615107, "epoch": 1.4019533231861998, "grad_norm": 2.484375, "learning_rate": 2.2517542584787134e-06, "loss": 1.03927345, "memory(GiB)": 369.42, "step": 55265, "train_speed(iter/s)": 0.200559 }, { "acc": 0.74907541, "epoch": 1.402080162354135, "grad_norm": 2.140625, "learning_rate": 2.2508783033140596e-06, "loss": 1.00410099, "memory(GiB)": 369.42, "step": 55270, "train_speed(iter/s)": 0.200563 }, { "acc": 0.74219127, "epoch": 1.40220700152207, "grad_norm": 2.5, "learning_rate": 2.2500024690680528e-06, "loss": 1.05241928, "memory(GiB)": 369.42, "step": 55275, "train_speed(iter/s)": 0.200567 }, { "acc": 0.76688404, "epoch": 1.402333840690005, "grad_norm": 2.328125, "learning_rate": 2.249126755779215e-06, "loss": 0.92448692, "memory(GiB)": 369.42, "step": 55280, "train_speed(iter/s)": 0.200569 }, { "acc": 0.74555044, "epoch": 1.4024606798579402, "grad_norm": 2.140625, "learning_rate": 2.2482511634860645e-06, "loss": 0.99689875, "memory(GiB)": 369.42, "step": 55285, "train_speed(iter/s)": 0.200572 }, { "acc": 0.74835434, "epoch": 1.4025875190258752, "grad_norm": 2.390625, "learning_rate": 2.247375692227113e-06, "loss": 0.98508015, "memory(GiB)": 369.42, "step": 55290, "train_speed(iter/s)": 0.200575 }, { "acc": 0.74741054, "epoch": 1.4027143581938102, "grad_norm": 2.40625, "learning_rate": 2.2465003420408683e-06, "loss": 1.02581635, "memory(GiB)": 369.42, "step": 55295, "train_speed(iter/s)": 0.200579 }, { "acc": 0.75342207, "epoch": 1.4028411973617452, "grad_norm": 2.171875, "learning_rate": 2.2456251129658325e-06, "loss": 0.99342823, "memory(GiB)": 369.42, "step": 55300, "train_speed(iter/s)": 0.200581 }, { "acc": 0.7392952, "epoch": 1.4029680365296804, "grad_norm": 2.203125, "learning_rate": 2.2447500050405008e-06, "loss": 1.02082062, "memory(GiB)": 369.42, "step": 55305, "train_speed(iter/s)": 0.200582 }, { "acc": 0.76227179, "epoch": 1.4030948756976154, "grad_norm": 2.453125, "learning_rate": 2.2438750183033657e-06, "loss": 0.96890469, "memory(GiB)": 369.42, "step": 55310, "train_speed(iter/s)": 0.200585 }, { "acc": 0.76353559, "epoch": 1.4032217148655506, "grad_norm": 2.046875, "learning_rate": 2.2430001527929123e-06, "loss": 0.95699234, "memory(GiB)": 369.42, "step": 55315, "train_speed(iter/s)": 0.200588 }, { "acc": 0.75782223, "epoch": 1.4033485540334856, "grad_norm": 2.625, "learning_rate": 2.242125408547622e-06, "loss": 1.03686905, "memory(GiB)": 369.42, "step": 55320, "train_speed(iter/s)": 0.200591 }, { "acc": 0.7494453, "epoch": 1.4034753932014206, "grad_norm": 2.375, "learning_rate": 2.2412507856059667e-06, "loss": 1.02036514, "memory(GiB)": 369.42, "step": 55325, "train_speed(iter/s)": 0.200594 }, { "acc": 0.75065937, "epoch": 1.4036022323693556, "grad_norm": 2.203125, "learning_rate": 2.2403762840064223e-06, "loss": 0.9812109, "memory(GiB)": 369.42, "step": 55330, "train_speed(iter/s)": 0.200599 }, { "acc": 0.76818461, "epoch": 1.4037290715372908, "grad_norm": 1.7890625, "learning_rate": 2.239501903787448e-06, "loss": 0.9753561, "memory(GiB)": 369.42, "step": 55335, "train_speed(iter/s)": 0.200602 }, { "acc": 0.73481255, "epoch": 1.4038559107052258, "grad_norm": 2.953125, "learning_rate": 2.2386276449875057e-06, "loss": 1.0271225, "memory(GiB)": 369.42, "step": 55340, "train_speed(iter/s)": 0.200605 }, { "acc": 0.74559097, "epoch": 1.403982749873161, "grad_norm": 2.421875, "learning_rate": 2.2377535076450452e-06, "loss": 1.00890026, "memory(GiB)": 369.42, "step": 55345, "train_speed(iter/s)": 0.200609 }, { "acc": 0.75475841, "epoch": 1.404109589041096, "grad_norm": 2.28125, "learning_rate": 2.236879491798522e-06, "loss": 0.95113287, "memory(GiB)": 369.42, "step": 55350, "train_speed(iter/s)": 0.200612 }, { "acc": 0.7590879, "epoch": 1.404236428209031, "grad_norm": 2.421875, "learning_rate": 2.2360055974863737e-06, "loss": 0.99836273, "memory(GiB)": 369.42, "step": 55355, "train_speed(iter/s)": 0.200615 }, { "acc": 0.75368109, "epoch": 1.404363267376966, "grad_norm": 2.03125, "learning_rate": 2.2351318247470376e-06, "loss": 0.98124418, "memory(GiB)": 369.42, "step": 55360, "train_speed(iter/s)": 0.200617 }, { "acc": 0.74347782, "epoch": 1.4044901065449011, "grad_norm": 2.1875, "learning_rate": 2.2342581736189496e-06, "loss": 1.00211811, "memory(GiB)": 369.42, "step": 55365, "train_speed(iter/s)": 0.200618 }, { "acc": 0.7539093, "epoch": 1.4046169457128361, "grad_norm": 1.9609375, "learning_rate": 2.233384644140537e-06, "loss": 0.98172121, "memory(GiB)": 369.42, "step": 55370, "train_speed(iter/s)": 0.20062 }, { "acc": 0.76892357, "epoch": 1.4047437848807711, "grad_norm": 2.171875, "learning_rate": 2.2325112363502167e-06, "loss": 0.9244545, "memory(GiB)": 369.42, "step": 55375, "train_speed(iter/s)": 0.200623 }, { "acc": 0.75758519, "epoch": 1.4048706240487063, "grad_norm": 1.8828125, "learning_rate": 2.23163795028641e-06, "loss": 0.96336288, "memory(GiB)": 369.42, "step": 55380, "train_speed(iter/s)": 0.200624 }, { "acc": 0.74669142, "epoch": 1.4049974632166413, "grad_norm": 2.0, "learning_rate": 2.230764785987526e-06, "loss": 0.97443848, "memory(GiB)": 369.42, "step": 55385, "train_speed(iter/s)": 0.200628 }, { "acc": 0.74961581, "epoch": 1.4051243023845763, "grad_norm": 2.703125, "learning_rate": 2.229891743491972e-06, "loss": 0.95840101, "memory(GiB)": 369.42, "step": 55390, "train_speed(iter/s)": 0.200629 }, { "acc": 0.75022783, "epoch": 1.4052511415525113, "grad_norm": 2.109375, "learning_rate": 2.2290188228381434e-06, "loss": 0.95854101, "memory(GiB)": 369.42, "step": 55395, "train_speed(iter/s)": 0.200631 }, { "acc": 0.75643415, "epoch": 1.4053779807204465, "grad_norm": 1.8671875, "learning_rate": 2.2281460240644397e-06, "loss": 0.95234051, "memory(GiB)": 369.42, "step": 55400, "train_speed(iter/s)": 0.200632 }, { "acc": 0.74076014, "epoch": 1.4055048198883815, "grad_norm": 2.28125, "learning_rate": 2.22727334720925e-06, "loss": 1.01009626, "memory(GiB)": 369.42, "step": 55405, "train_speed(iter/s)": 0.200635 }, { "acc": 0.75877514, "epoch": 1.4056316590563167, "grad_norm": 1.890625, "learning_rate": 2.2264007923109575e-06, "loss": 0.96023502, "memory(GiB)": 369.42, "step": 55410, "train_speed(iter/s)": 0.200636 }, { "acc": 0.75475473, "epoch": 1.4057584982242517, "grad_norm": 2.671875, "learning_rate": 2.225528359407942e-06, "loss": 0.98271933, "memory(GiB)": 369.42, "step": 55415, "train_speed(iter/s)": 0.200639 }, { "acc": 0.75054798, "epoch": 1.4058853373921867, "grad_norm": 2.28125, "learning_rate": 2.2246560485385756e-06, "loss": 0.95394363, "memory(GiB)": 369.42, "step": 55420, "train_speed(iter/s)": 0.200643 }, { "acc": 0.74893351, "epoch": 1.4060121765601217, "grad_norm": 2.09375, "learning_rate": 2.2237838597412277e-06, "loss": 1.01725521, "memory(GiB)": 369.42, "step": 55425, "train_speed(iter/s)": 0.200646 }, { "acc": 0.7621829, "epoch": 1.4061390157280569, "grad_norm": 1.9921875, "learning_rate": 2.22291179305426e-06, "loss": 0.89885082, "memory(GiB)": 369.42, "step": 55430, "train_speed(iter/s)": 0.200648 }, { "acc": 0.76143208, "epoch": 1.4062658548959919, "grad_norm": 2.234375, "learning_rate": 2.222039848516031e-06, "loss": 0.96002045, "memory(GiB)": 369.42, "step": 55435, "train_speed(iter/s)": 0.200649 }, { "acc": 0.74270172, "epoch": 1.4063926940639269, "grad_norm": 2.15625, "learning_rate": 2.2211680261648918e-06, "loss": 0.98720188, "memory(GiB)": 369.42, "step": 55440, "train_speed(iter/s)": 0.200651 }, { "acc": 0.75370579, "epoch": 1.406519533231862, "grad_norm": 2.0625, "learning_rate": 2.22029632603919e-06, "loss": 0.96433353, "memory(GiB)": 369.42, "step": 55445, "train_speed(iter/s)": 0.200653 }, { "acc": 0.75831442, "epoch": 1.406646372399797, "grad_norm": 2.203125, "learning_rate": 2.2194247481772652e-06, "loss": 0.96918659, "memory(GiB)": 369.42, "step": 55450, "train_speed(iter/s)": 0.200657 }, { "acc": 0.75273366, "epoch": 1.406773211567732, "grad_norm": 2.15625, "learning_rate": 2.218553292617455e-06, "loss": 0.97819567, "memory(GiB)": 369.42, "step": 55455, "train_speed(iter/s)": 0.20066 }, { "acc": 0.75708761, "epoch": 1.406900050735667, "grad_norm": 1.859375, "learning_rate": 2.2176819593980892e-06, "loss": 0.94809513, "memory(GiB)": 369.42, "step": 55460, "train_speed(iter/s)": 0.200661 }, { "acc": 0.7480854, "epoch": 1.4070268899036023, "grad_norm": 2.78125, "learning_rate": 2.2168107485574914e-06, "loss": 1.05049591, "memory(GiB)": 369.42, "step": 55465, "train_speed(iter/s)": 0.200664 }, { "acc": 0.75177412, "epoch": 1.4071537290715372, "grad_norm": 2.234375, "learning_rate": 2.215939660133986e-06, "loss": 0.97548904, "memory(GiB)": 369.42, "step": 55470, "train_speed(iter/s)": 0.200668 }, { "acc": 0.76103325, "epoch": 1.4072805682394725, "grad_norm": 2.203125, "learning_rate": 2.215068694165883e-06, "loss": 0.95538883, "memory(GiB)": 369.42, "step": 55475, "train_speed(iter/s)": 0.200671 }, { "acc": 0.75027895, "epoch": 1.4074074074074074, "grad_norm": 2.1875, "learning_rate": 2.2141978506914922e-06, "loss": 0.9335125, "memory(GiB)": 369.42, "step": 55480, "train_speed(iter/s)": 0.200674 }, { "acc": 0.73592753, "epoch": 1.4075342465753424, "grad_norm": 2.078125, "learning_rate": 2.2133271297491165e-06, "loss": 1.094487, "memory(GiB)": 369.42, "step": 55485, "train_speed(iter/s)": 0.200677 }, { "acc": 0.75575838, "epoch": 1.4076610857432774, "grad_norm": 2.171875, "learning_rate": 2.2124565313770584e-06, "loss": 0.99114027, "memory(GiB)": 369.42, "step": 55490, "train_speed(iter/s)": 0.200679 }, { "acc": 0.76473083, "epoch": 1.4077879249112126, "grad_norm": 1.9296875, "learning_rate": 2.211586055613606e-06, "loss": 0.93352737, "memory(GiB)": 369.42, "step": 55495, "train_speed(iter/s)": 0.200681 }, { "acc": 0.74315662, "epoch": 1.4079147640791476, "grad_norm": 1.890625, "learning_rate": 2.210715702497046e-06, "loss": 0.99302845, "memory(GiB)": 369.42, "step": 55500, "train_speed(iter/s)": 0.200684 }, { "acc": 0.75079317, "epoch": 1.4080416032470828, "grad_norm": 2.3125, "learning_rate": 2.2098454720656647e-06, "loss": 1.01637659, "memory(GiB)": 369.42, "step": 55505, "train_speed(iter/s)": 0.200687 }, { "acc": 0.76238852, "epoch": 1.4081684424150178, "grad_norm": 2.625, "learning_rate": 2.2089753643577384e-06, "loss": 0.96059017, "memory(GiB)": 369.42, "step": 55510, "train_speed(iter/s)": 0.20069 }, { "acc": 0.75076966, "epoch": 1.4082952815829528, "grad_norm": 2.140625, "learning_rate": 2.208105379411535e-06, "loss": 0.96654387, "memory(GiB)": 369.42, "step": 55515, "train_speed(iter/s)": 0.200693 }, { "acc": 0.75493007, "epoch": 1.4084221207508878, "grad_norm": 2.3125, "learning_rate": 2.2072355172653197e-06, "loss": 0.94983959, "memory(GiB)": 369.42, "step": 55520, "train_speed(iter/s)": 0.200695 }, { "acc": 0.74957762, "epoch": 1.408548959918823, "grad_norm": 2.484375, "learning_rate": 2.2063657779573573e-06, "loss": 0.99757023, "memory(GiB)": 369.42, "step": 55525, "train_speed(iter/s)": 0.200698 }, { "acc": 0.75507793, "epoch": 1.408675799086758, "grad_norm": 2.09375, "learning_rate": 2.2054961615259023e-06, "loss": 0.96374531, "memory(GiB)": 369.42, "step": 55530, "train_speed(iter/s)": 0.200699 }, { "acc": 0.76954737, "epoch": 1.408802638254693, "grad_norm": 1.9921875, "learning_rate": 2.2046266680091994e-06, "loss": 0.90866919, "memory(GiB)": 369.42, "step": 55535, "train_speed(iter/s)": 0.200704 }, { "acc": 0.74915953, "epoch": 1.4089294774226282, "grad_norm": 2.046875, "learning_rate": 2.2037572974454974e-06, "loss": 1.03640957, "memory(GiB)": 369.42, "step": 55540, "train_speed(iter/s)": 0.200707 }, { "acc": 0.76081839, "epoch": 1.4090563165905632, "grad_norm": 2.265625, "learning_rate": 2.202888049873034e-06, "loss": 0.96088018, "memory(GiB)": 369.42, "step": 55545, "train_speed(iter/s)": 0.20071 }, { "acc": 0.74671211, "epoch": 1.4091831557584982, "grad_norm": 2.40625, "learning_rate": 2.2020189253300428e-06, "loss": 1.09487715, "memory(GiB)": 369.42, "step": 55550, "train_speed(iter/s)": 0.200712 }, { "acc": 0.74699392, "epoch": 1.4093099949264332, "grad_norm": 2.09375, "learning_rate": 2.2011499238547506e-06, "loss": 1.01560459, "memory(GiB)": 369.42, "step": 55555, "train_speed(iter/s)": 0.200716 }, { "acc": 0.74299564, "epoch": 1.4094368340943684, "grad_norm": 2.53125, "learning_rate": 2.2002810454853813e-06, "loss": 1.03066769, "memory(GiB)": 369.42, "step": 55560, "train_speed(iter/s)": 0.20072 }, { "acc": 0.7480134, "epoch": 1.4095636732623034, "grad_norm": 2.40625, "learning_rate": 2.1994122902601513e-06, "loss": 0.98144846, "memory(GiB)": 369.42, "step": 55565, "train_speed(iter/s)": 0.200722 }, { "acc": 0.74386449, "epoch": 1.4096905124302386, "grad_norm": 2.359375, "learning_rate": 2.1985436582172724e-06, "loss": 0.98094559, "memory(GiB)": 369.42, "step": 55570, "train_speed(iter/s)": 0.200719 }, { "acc": 0.75017781, "epoch": 1.4098173515981736, "grad_norm": 3.078125, "learning_rate": 2.1976751493949512e-06, "loss": 1.02262659, "memory(GiB)": 369.42, "step": 55575, "train_speed(iter/s)": 0.200722 }, { "acc": 0.74802761, "epoch": 1.4099441907661086, "grad_norm": 2.3125, "learning_rate": 2.196806763831388e-06, "loss": 0.97047386, "memory(GiB)": 369.42, "step": 55580, "train_speed(iter/s)": 0.200724 }, { "acc": 0.75908976, "epoch": 1.4100710299340435, "grad_norm": 2.15625, "learning_rate": 2.1959385015647794e-06, "loss": 0.99240656, "memory(GiB)": 369.42, "step": 55585, "train_speed(iter/s)": 0.200727 }, { "acc": 0.7461185, "epoch": 1.4101978691019788, "grad_norm": 2.09375, "learning_rate": 2.195070362633314e-06, "loss": 1.01381569, "memory(GiB)": 369.42, "step": 55590, "train_speed(iter/s)": 0.20073 }, { "acc": 0.74221249, "epoch": 1.4103247082699137, "grad_norm": 2.703125, "learning_rate": 2.194202347075178e-06, "loss": 1.02066822, "memory(GiB)": 369.42, "step": 55595, "train_speed(iter/s)": 0.200734 }, { "acc": 0.75041437, "epoch": 1.4104515474378487, "grad_norm": 2.1875, "learning_rate": 2.1933344549285485e-06, "loss": 0.99638443, "memory(GiB)": 369.42, "step": 55600, "train_speed(iter/s)": 0.200736 }, { "acc": 0.74256158, "epoch": 1.410578386605784, "grad_norm": 2.78125, "learning_rate": 2.1924666862316015e-06, "loss": 1.02968311, "memory(GiB)": 369.42, "step": 55605, "train_speed(iter/s)": 0.200739 }, { "acc": 0.75601807, "epoch": 1.410705225773719, "grad_norm": 2.265625, "learning_rate": 2.191599041022504e-06, "loss": 0.97388649, "memory(GiB)": 369.42, "step": 55610, "train_speed(iter/s)": 0.200743 }, { "acc": 0.74947486, "epoch": 1.410832064941654, "grad_norm": 1.96875, "learning_rate": 2.19073151933942e-06, "loss": 0.92829981, "memory(GiB)": 369.42, "step": 55615, "train_speed(iter/s)": 0.200744 }, { "acc": 0.75896935, "epoch": 1.410958904109589, "grad_norm": 2.21875, "learning_rate": 2.1898641212205053e-06, "loss": 0.93651342, "memory(GiB)": 369.42, "step": 55620, "train_speed(iter/s)": 0.200748 }, { "acc": 0.75134335, "epoch": 1.4110857432775241, "grad_norm": 2.078125, "learning_rate": 2.1889968467039114e-06, "loss": 1.01075764, "memory(GiB)": 369.42, "step": 55625, "train_speed(iter/s)": 0.200751 }, { "acc": 0.74788361, "epoch": 1.411212582445459, "grad_norm": 2.1875, "learning_rate": 2.1881296958277897e-06, "loss": 0.97678261, "memory(GiB)": 369.42, "step": 55630, "train_speed(iter/s)": 0.200752 }, { "acc": 0.75786281, "epoch": 1.4113394216133943, "grad_norm": 2.390625, "learning_rate": 2.1872626686302767e-06, "loss": 0.95749235, "memory(GiB)": 369.42, "step": 55635, "train_speed(iter/s)": 0.200755 }, { "acc": 0.75848017, "epoch": 1.4114662607813293, "grad_norm": 2.375, "learning_rate": 2.186395765149508e-06, "loss": 0.92764797, "memory(GiB)": 369.42, "step": 55640, "train_speed(iter/s)": 0.200758 }, { "acc": 0.74223518, "epoch": 1.4115930999492643, "grad_norm": 2.0625, "learning_rate": 2.1855289854236165e-06, "loss": 1.03346519, "memory(GiB)": 369.42, "step": 55645, "train_speed(iter/s)": 0.200762 }, { "acc": 0.74672642, "epoch": 1.4117199391171993, "grad_norm": 1.9921875, "learning_rate": 2.184662329490728e-06, "loss": 1.02104645, "memory(GiB)": 369.42, "step": 55650, "train_speed(iter/s)": 0.200765 }, { "acc": 0.75327411, "epoch": 1.4118467782851345, "grad_norm": 2.390625, "learning_rate": 2.1837957973889584e-06, "loss": 0.96720371, "memory(GiB)": 369.42, "step": 55655, "train_speed(iter/s)": 0.200768 }, { "acc": 0.73786249, "epoch": 1.4119736174530695, "grad_norm": 2.40625, "learning_rate": 2.1829293891564212e-06, "loss": 1.02433052, "memory(GiB)": 369.42, "step": 55660, "train_speed(iter/s)": 0.200771 }, { "acc": 0.74759531, "epoch": 1.4121004566210047, "grad_norm": 2.03125, "learning_rate": 2.182063104831228e-06, "loss": 0.98501339, "memory(GiB)": 369.42, "step": 55665, "train_speed(iter/s)": 0.200774 }, { "acc": 0.74915524, "epoch": 1.4122272957889397, "grad_norm": 2.03125, "learning_rate": 2.181196944451483e-06, "loss": 1.00536041, "memory(GiB)": 369.42, "step": 55670, "train_speed(iter/s)": 0.200776 }, { "acc": 0.75137749, "epoch": 1.4123541349568747, "grad_norm": 2.484375, "learning_rate": 2.180330908055278e-06, "loss": 1.0382843, "memory(GiB)": 369.42, "step": 55675, "train_speed(iter/s)": 0.200779 }, { "acc": 0.75023966, "epoch": 1.4124809741248097, "grad_norm": 2.0625, "learning_rate": 2.1794649956807094e-06, "loss": 0.96797056, "memory(GiB)": 369.42, "step": 55680, "train_speed(iter/s)": 0.200782 }, { "acc": 0.76864691, "epoch": 1.4126078132927449, "grad_norm": 2.140625, "learning_rate": 2.178599207365864e-06, "loss": 0.91076603, "memory(GiB)": 369.42, "step": 55685, "train_speed(iter/s)": 0.200784 }, { "acc": 0.75563822, "epoch": 1.4127346524606799, "grad_norm": 2.03125, "learning_rate": 2.1777335431488234e-06, "loss": 0.97062969, "memory(GiB)": 369.42, "step": 55690, "train_speed(iter/s)": 0.200787 }, { "acc": 0.75800695, "epoch": 1.4128614916286149, "grad_norm": 2.328125, "learning_rate": 2.176868003067659e-06, "loss": 1.01210709, "memory(GiB)": 369.42, "step": 55695, "train_speed(iter/s)": 0.20079 }, { "acc": 0.74805908, "epoch": 1.41298833079655, "grad_norm": 2.234375, "learning_rate": 2.1760025871604456e-06, "loss": 0.98663158, "memory(GiB)": 369.42, "step": 55700, "train_speed(iter/s)": 0.200793 }, { "acc": 0.75715151, "epoch": 1.413115169964485, "grad_norm": 2.515625, "learning_rate": 2.175137295465247e-06, "loss": 0.95774345, "memory(GiB)": 369.42, "step": 55705, "train_speed(iter/s)": 0.200797 }, { "acc": 0.74766903, "epoch": 1.41324200913242, "grad_norm": 1.8671875, "learning_rate": 2.174272128020122e-06, "loss": 1.03710918, "memory(GiB)": 369.42, "step": 55710, "train_speed(iter/s)": 0.200799 }, { "acc": 0.76205978, "epoch": 1.413368848300355, "grad_norm": 2.390625, "learning_rate": 2.1734070848631245e-06, "loss": 0.97513504, "memory(GiB)": 369.42, "step": 55715, "train_speed(iter/s)": 0.200802 }, { "acc": 0.75678988, "epoch": 1.4134956874682902, "grad_norm": 2.46875, "learning_rate": 2.172542166032303e-06, "loss": 0.9058363, "memory(GiB)": 369.42, "step": 55720, "train_speed(iter/s)": 0.200805 }, { "acc": 0.75497808, "epoch": 1.4136225266362252, "grad_norm": 2.046875, "learning_rate": 2.171677371565701e-06, "loss": 0.9843811, "memory(GiB)": 369.42, "step": 55725, "train_speed(iter/s)": 0.200807 }, { "acc": 0.74728155, "epoch": 1.4137493658041604, "grad_norm": 2.4375, "learning_rate": 2.1708127015013565e-06, "loss": 1.02606373, "memory(GiB)": 369.42, "step": 55730, "train_speed(iter/s)": 0.200809 }, { "acc": 0.74393024, "epoch": 1.4138762049720954, "grad_norm": 2.21875, "learning_rate": 2.1699481558773e-06, "loss": 0.95655403, "memory(GiB)": 369.42, "step": 55735, "train_speed(iter/s)": 0.200813 }, { "acc": 0.75357199, "epoch": 1.4140030441400304, "grad_norm": 1.703125, "learning_rate": 2.1690837347315597e-06, "loss": 0.99066887, "memory(GiB)": 369.42, "step": 55740, "train_speed(iter/s)": 0.200816 }, { "acc": 0.75362329, "epoch": 1.4141298833079654, "grad_norm": 2.453125, "learning_rate": 2.168219438102155e-06, "loss": 0.97270584, "memory(GiB)": 369.42, "step": 55745, "train_speed(iter/s)": 0.200818 }, { "acc": 0.74546666, "epoch": 1.4142567224759006, "grad_norm": 2.578125, "learning_rate": 2.167355266027103e-06, "loss": 0.97145615, "memory(GiB)": 369.42, "step": 55750, "train_speed(iter/s)": 0.20082 }, { "acc": 0.75915265, "epoch": 1.4143835616438356, "grad_norm": 2.03125, "learning_rate": 2.1664912185444127e-06, "loss": 0.9315115, "memory(GiB)": 369.42, "step": 55755, "train_speed(iter/s)": 0.200823 }, { "acc": 0.7432806, "epoch": 1.4145104008117706, "grad_norm": 2.0, "learning_rate": 2.16562729569209e-06, "loss": 1.08626804, "memory(GiB)": 369.42, "step": 55760, "train_speed(iter/s)": 0.200826 }, { "acc": 0.76102123, "epoch": 1.4146372399797058, "grad_norm": 2.90625, "learning_rate": 2.164763497508131e-06, "loss": 0.94415569, "memory(GiB)": 369.42, "step": 55765, "train_speed(iter/s)": 0.200828 }, { "acc": 0.76451454, "epoch": 1.4147640791476408, "grad_norm": 2.09375, "learning_rate": 2.1638998240305355e-06, "loss": 0.96965027, "memory(GiB)": 369.42, "step": 55770, "train_speed(iter/s)": 0.200831 }, { "acc": 0.74452624, "epoch": 1.4148909183155758, "grad_norm": 2.40625, "learning_rate": 2.163036275297286e-06, "loss": 1.00254116, "memory(GiB)": 369.42, "step": 55775, "train_speed(iter/s)": 0.200829 }, { "acc": 0.74667025, "epoch": 1.4150177574835108, "grad_norm": 2.171875, "learning_rate": 2.162172851346368e-06, "loss": 0.9842083, "memory(GiB)": 369.42, "step": 55780, "train_speed(iter/s)": 0.200831 }, { "acc": 0.76139479, "epoch": 1.415144596651446, "grad_norm": 2.421875, "learning_rate": 2.1613095522157557e-06, "loss": 0.94075127, "memory(GiB)": 369.42, "step": 55785, "train_speed(iter/s)": 0.200834 }, { "acc": 0.75195327, "epoch": 1.415271435819381, "grad_norm": 2.171875, "learning_rate": 2.1604463779434267e-06, "loss": 0.96406145, "memory(GiB)": 369.42, "step": 55790, "train_speed(iter/s)": 0.200837 }, { "acc": 0.74843659, "epoch": 1.4153982749873162, "grad_norm": 2.171875, "learning_rate": 2.159583328567342e-06, "loss": 0.99698668, "memory(GiB)": 369.42, "step": 55795, "train_speed(iter/s)": 0.20084 }, { "acc": 0.7570425, "epoch": 1.4155251141552512, "grad_norm": 2.125, "learning_rate": 2.158720404125462e-06, "loss": 0.98519487, "memory(GiB)": 369.42, "step": 55800, "train_speed(iter/s)": 0.200842 }, { "acc": 0.75750256, "epoch": 1.4156519533231862, "grad_norm": 2.109375, "learning_rate": 2.1578576046557463e-06, "loss": 0.96652327, "memory(GiB)": 369.42, "step": 55805, "train_speed(iter/s)": 0.200845 }, { "acc": 0.74316683, "epoch": 1.4157787924911212, "grad_norm": 2.140625, "learning_rate": 2.156994930196144e-06, "loss": 0.9679039, "memory(GiB)": 369.42, "step": 55810, "train_speed(iter/s)": 0.200848 }, { "acc": 0.75639029, "epoch": 1.4159056316590564, "grad_norm": 2.21875, "learning_rate": 2.156132380784594e-06, "loss": 0.92240295, "memory(GiB)": 369.42, "step": 55815, "train_speed(iter/s)": 0.200849 }, { "acc": 0.75004578, "epoch": 1.4160324708269914, "grad_norm": 2.5625, "learning_rate": 2.155269956459041e-06, "loss": 0.96572266, "memory(GiB)": 369.42, "step": 55820, "train_speed(iter/s)": 0.200853 }, { "acc": 0.7581821, "epoch": 1.4161593099949266, "grad_norm": 2.375, "learning_rate": 2.1544076572574156e-06, "loss": 0.98561172, "memory(GiB)": 369.42, "step": 55825, "train_speed(iter/s)": 0.200855 }, { "acc": 0.75259695, "epoch": 1.4162861491628616, "grad_norm": 1.7890625, "learning_rate": 2.1535454832176482e-06, "loss": 0.96201668, "memory(GiB)": 369.42, "step": 55830, "train_speed(iter/s)": 0.200858 }, { "acc": 0.74125376, "epoch": 1.4164129883307965, "grad_norm": 2.1875, "learning_rate": 2.1526834343776556e-06, "loss": 1.04629736, "memory(GiB)": 369.42, "step": 55835, "train_speed(iter/s)": 0.200862 }, { "acc": 0.73985567, "epoch": 1.4165398274987315, "grad_norm": 3.078125, "learning_rate": 2.1518215107753593e-06, "loss": 1.03834057, "memory(GiB)": 369.42, "step": 55840, "train_speed(iter/s)": 0.200865 }, { "acc": 0.73581467, "epoch": 1.4166666666666667, "grad_norm": 2.09375, "learning_rate": 2.1509597124486693e-06, "loss": 1.03009815, "memory(GiB)": 369.42, "step": 55845, "train_speed(iter/s)": 0.200868 }, { "acc": 0.75899992, "epoch": 1.4167935058346017, "grad_norm": 1.90625, "learning_rate": 2.1500980394354907e-06, "loss": 0.98580532, "memory(GiB)": 369.42, "step": 55850, "train_speed(iter/s)": 0.200872 }, { "acc": 0.74767642, "epoch": 1.4169203450025367, "grad_norm": 2.09375, "learning_rate": 2.1492364917737252e-06, "loss": 0.98605576, "memory(GiB)": 369.42, "step": 55855, "train_speed(iter/s)": 0.200874 }, { "acc": 0.75030441, "epoch": 1.417047184170472, "grad_norm": 2.375, "learning_rate": 2.148375069501266e-06, "loss": 0.99690132, "memory(GiB)": 369.42, "step": 55860, "train_speed(iter/s)": 0.200878 }, { "acc": 0.75522184, "epoch": 1.417174023338407, "grad_norm": 1.609375, "learning_rate": 2.147513772656003e-06, "loss": 0.96120949, "memory(GiB)": 369.42, "step": 55865, "train_speed(iter/s)": 0.200881 }, { "acc": 0.74514737, "epoch": 1.417300862506342, "grad_norm": 2.796875, "learning_rate": 2.1466526012758194e-06, "loss": 0.98488255, "memory(GiB)": 369.42, "step": 55870, "train_speed(iter/s)": 0.200884 }, { "acc": 0.75352325, "epoch": 1.417427701674277, "grad_norm": 2.0, "learning_rate": 2.145791555398594e-06, "loss": 0.94988632, "memory(GiB)": 369.42, "step": 55875, "train_speed(iter/s)": 0.200886 }, { "acc": 0.76636558, "epoch": 1.417554540842212, "grad_norm": 2.265625, "learning_rate": 2.144930635062199e-06, "loss": 0.89909306, "memory(GiB)": 369.42, "step": 55880, "train_speed(iter/s)": 0.20089 }, { "acc": 0.7353363, "epoch": 1.417681380010147, "grad_norm": 2.46875, "learning_rate": 2.144069840304502e-06, "loss": 1.02269917, "memory(GiB)": 369.42, "step": 55885, "train_speed(iter/s)": 0.200894 }, { "acc": 0.74954939, "epoch": 1.4178082191780823, "grad_norm": 2.3125, "learning_rate": 2.1432091711633634e-06, "loss": 1.00917702, "memory(GiB)": 369.42, "step": 55890, "train_speed(iter/s)": 0.200896 }, { "acc": 0.7484376, "epoch": 1.4179350583460173, "grad_norm": 2.171875, "learning_rate": 2.142348627676641e-06, "loss": 1.0045042, "memory(GiB)": 369.42, "step": 55895, "train_speed(iter/s)": 0.200898 }, { "acc": 0.75866694, "epoch": 1.4180618975139523, "grad_norm": 2.171875, "learning_rate": 2.1414882098821836e-06, "loss": 0.95985317, "memory(GiB)": 369.42, "step": 55900, "train_speed(iter/s)": 0.200901 }, { "acc": 0.76249714, "epoch": 1.4181887366818873, "grad_norm": 2.1875, "learning_rate": 2.1406279178178355e-06, "loss": 0.99280758, "memory(GiB)": 369.42, "step": 55905, "train_speed(iter/s)": 0.200904 }, { "acc": 0.75280056, "epoch": 1.4183155758498225, "grad_norm": 2.359375, "learning_rate": 2.1397677515214422e-06, "loss": 0.99823084, "memory(GiB)": 369.42, "step": 55910, "train_speed(iter/s)": 0.200906 }, { "acc": 0.74759331, "epoch": 1.4184424150177575, "grad_norm": 2.734375, "learning_rate": 2.1389077110308304e-06, "loss": 1.02475586, "memory(GiB)": 369.42, "step": 55915, "train_speed(iter/s)": 0.200909 }, { "acc": 0.74920025, "epoch": 1.4185692541856925, "grad_norm": 2.546875, "learning_rate": 2.138047796383832e-06, "loss": 0.99394035, "memory(GiB)": 369.42, "step": 55920, "train_speed(iter/s)": 0.200912 }, { "acc": 0.74664788, "epoch": 1.4186960933536277, "grad_norm": 2.84375, "learning_rate": 2.1371880076182666e-06, "loss": 1.02259274, "memory(GiB)": 369.42, "step": 55925, "train_speed(iter/s)": 0.200915 }, { "acc": 0.74402108, "epoch": 1.4188229325215627, "grad_norm": 2.4375, "learning_rate": 2.1363283447719584e-06, "loss": 1.02433853, "memory(GiB)": 369.42, "step": 55930, "train_speed(iter/s)": 0.200917 }, { "acc": 0.76057673, "epoch": 1.4189497716894977, "grad_norm": 2.671875, "learning_rate": 2.135468807882713e-06, "loss": 0.95914049, "memory(GiB)": 369.42, "step": 55935, "train_speed(iter/s)": 0.200921 }, { "acc": 0.74791355, "epoch": 1.4190766108574326, "grad_norm": 2.46875, "learning_rate": 2.1346093969883367e-06, "loss": 0.971945, "memory(GiB)": 369.42, "step": 55940, "train_speed(iter/s)": 0.200923 }, { "acc": 0.75171499, "epoch": 1.4192034500253679, "grad_norm": 2.09375, "learning_rate": 2.1337501121266345e-06, "loss": 0.95195522, "memory(GiB)": 369.42, "step": 55945, "train_speed(iter/s)": 0.200927 }, { "acc": 0.73687611, "epoch": 1.4193302891933028, "grad_norm": 2.765625, "learning_rate": 2.132890953335401e-06, "loss": 1.0563982, "memory(GiB)": 369.42, "step": 55950, "train_speed(iter/s)": 0.20093 }, { "acc": 0.74272642, "epoch": 1.419457128361238, "grad_norm": 2.21875, "learning_rate": 2.1320319206524215e-06, "loss": 1.03937874, "memory(GiB)": 369.42, "step": 55955, "train_speed(iter/s)": 0.200934 }, { "acc": 0.75573702, "epoch": 1.419583967529173, "grad_norm": 1.984375, "learning_rate": 2.1311730141154813e-06, "loss": 0.99418354, "memory(GiB)": 369.42, "step": 55960, "train_speed(iter/s)": 0.200935 }, { "acc": 0.74540534, "epoch": 1.419710806697108, "grad_norm": 1.9921875, "learning_rate": 2.1303142337623623e-06, "loss": 1.0269516, "memory(GiB)": 369.42, "step": 55965, "train_speed(iter/s)": 0.200937 }, { "acc": 0.75048652, "epoch": 1.419837645865043, "grad_norm": 2.53125, "learning_rate": 2.1294555796308375e-06, "loss": 0.95389309, "memory(GiB)": 369.42, "step": 55970, "train_speed(iter/s)": 0.200939 }, { "acc": 0.75696287, "epoch": 1.4199644850329782, "grad_norm": 2.234375, "learning_rate": 2.1285970517586686e-06, "loss": 1.02576866, "memory(GiB)": 369.42, "step": 55975, "train_speed(iter/s)": 0.200942 }, { "acc": 0.7386961, "epoch": 1.4200913242009132, "grad_norm": 2.109375, "learning_rate": 2.127738650183623e-06, "loss": 1.00384445, "memory(GiB)": 369.42, "step": 55980, "train_speed(iter/s)": 0.200945 }, { "acc": 0.7491776, "epoch": 1.4202181633688484, "grad_norm": 2.078125, "learning_rate": 2.1268803749434546e-06, "loss": 0.99269791, "memory(GiB)": 369.42, "step": 55985, "train_speed(iter/s)": 0.200948 }, { "acc": 0.73392916, "epoch": 1.4203450025367834, "grad_norm": 2.390625, "learning_rate": 2.1260222260759158e-06, "loss": 0.98603363, "memory(GiB)": 369.42, "step": 55990, "train_speed(iter/s)": 0.20095 }, { "acc": 0.75764489, "epoch": 1.4204718417047184, "grad_norm": 2.0, "learning_rate": 2.1251642036187502e-06, "loss": 0.97847357, "memory(GiB)": 369.42, "step": 55995, "train_speed(iter/s)": 0.200953 }, { "acc": 0.75978255, "epoch": 1.4205986808726534, "grad_norm": 2.109375, "learning_rate": 2.124306307609699e-06, "loss": 0.99053936, "memory(GiB)": 369.42, "step": 56000, "train_speed(iter/s)": 0.200955 }, { "epoch": 1.4205986808726534, "eval_acc": 0.7379320601735322, "eval_loss": 0.9695500135421753, "eval_runtime": 385.8523, "eval_samples_per_second": 16.509, "eval_steps_per_second": 8.254, "step": 56000 }, { "acc": 0.7306385, "epoch": 1.4207255200405886, "grad_norm": 2.21875, "learning_rate": 2.123448538086495e-06, "loss": 1.0425334, "memory(GiB)": 369.42, "step": 56005, "train_speed(iter/s)": 0.200442 }, { "acc": 0.74788537, "epoch": 1.4208523592085236, "grad_norm": 2.5625, "learning_rate": 2.122590895086867e-06, "loss": 1.05821505, "memory(GiB)": 369.42, "step": 56010, "train_speed(iter/s)": 0.200444 }, { "acc": 0.74828672, "epoch": 1.4209791983764586, "grad_norm": 2.203125, "learning_rate": 2.1217333786485385e-06, "loss": 0.97012463, "memory(GiB)": 369.42, "step": 56015, "train_speed(iter/s)": 0.200448 }, { "acc": 0.7486784, "epoch": 1.4211060375443938, "grad_norm": 2.171875, "learning_rate": 2.120875988809226e-06, "loss": 1.01030941, "memory(GiB)": 369.42, "step": 56020, "train_speed(iter/s)": 0.20045 }, { "acc": 0.75951662, "epoch": 1.4212328767123288, "grad_norm": 2.328125, "learning_rate": 2.1200187256066425e-06, "loss": 0.94662952, "memory(GiB)": 369.42, "step": 56025, "train_speed(iter/s)": 0.200453 }, { "acc": 0.75371561, "epoch": 1.4213597158802638, "grad_norm": 2.5625, "learning_rate": 2.119161589078493e-06, "loss": 1.00246296, "memory(GiB)": 369.42, "step": 56030, "train_speed(iter/s)": 0.200455 }, { "acc": 0.73938866, "epoch": 1.4214865550481988, "grad_norm": 2.234375, "learning_rate": 2.118304579262479e-06, "loss": 1.05409775, "memory(GiB)": 369.42, "step": 56035, "train_speed(iter/s)": 0.200457 }, { "acc": 0.76211319, "epoch": 1.421613394216134, "grad_norm": 1.8125, "learning_rate": 2.1174476961962957e-06, "loss": 0.985989, "memory(GiB)": 369.42, "step": 56040, "train_speed(iter/s)": 0.200459 }, { "acc": 0.74453363, "epoch": 1.421740233384069, "grad_norm": 2.328125, "learning_rate": 2.1165909399176328e-06, "loss": 1.02758446, "memory(GiB)": 369.42, "step": 56045, "train_speed(iter/s)": 0.200462 }, { "acc": 0.74173412, "epoch": 1.4218670725520042, "grad_norm": 2.15625, "learning_rate": 2.1157343104641733e-06, "loss": 1.04431133, "memory(GiB)": 369.42, "step": 56050, "train_speed(iter/s)": 0.200466 }, { "acc": 0.75282946, "epoch": 1.4219939117199392, "grad_norm": 2.125, "learning_rate": 2.114877807873596e-06, "loss": 0.92140121, "memory(GiB)": 369.42, "step": 56055, "train_speed(iter/s)": 0.200468 }, { "acc": 0.74343481, "epoch": 1.4221207508878742, "grad_norm": 2.015625, "learning_rate": 2.114021432183574e-06, "loss": 0.96533136, "memory(GiB)": 369.42, "step": 56060, "train_speed(iter/s)": 0.20047 }, { "acc": 0.74606729, "epoch": 1.4222475900558091, "grad_norm": 2.09375, "learning_rate": 2.113165183431773e-06, "loss": 1.00927916, "memory(GiB)": 369.42, "step": 56065, "train_speed(iter/s)": 0.200472 }, { "acc": 0.76798196, "epoch": 1.4223744292237444, "grad_norm": 2.71875, "learning_rate": 2.112309061655859e-06, "loss": 0.95896978, "memory(GiB)": 369.42, "step": 56070, "train_speed(iter/s)": 0.200475 }, { "acc": 0.77304201, "epoch": 1.4225012683916793, "grad_norm": 2.328125, "learning_rate": 2.1114530668934836e-06, "loss": 0.94811478, "memory(GiB)": 369.42, "step": 56075, "train_speed(iter/s)": 0.200476 }, { "acc": 0.75032148, "epoch": 1.4226281075596143, "grad_norm": 2.265625, "learning_rate": 2.1105971991822966e-06, "loss": 0.98908062, "memory(GiB)": 369.42, "step": 56080, "train_speed(iter/s)": 0.200479 }, { "acc": 0.74523501, "epoch": 1.4227549467275495, "grad_norm": 2.40625, "learning_rate": 2.1097414585599474e-06, "loss": 1.0068593, "memory(GiB)": 369.42, "step": 56085, "train_speed(iter/s)": 0.200482 }, { "acc": 0.7530757, "epoch": 1.4228817858954845, "grad_norm": 2.453125, "learning_rate": 2.1088858450640743e-06, "loss": 1.01048565, "memory(GiB)": 369.42, "step": 56090, "train_speed(iter/s)": 0.200485 }, { "acc": 0.75464587, "epoch": 1.4230086250634195, "grad_norm": 2.484375, "learning_rate": 2.108030358732308e-06, "loss": 0.9334631, "memory(GiB)": 369.42, "step": 56095, "train_speed(iter/s)": 0.200488 }, { "acc": 0.75195603, "epoch": 1.4231354642313545, "grad_norm": 2.171875, "learning_rate": 2.107174999602277e-06, "loss": 1.01578588, "memory(GiB)": 369.42, "step": 56100, "train_speed(iter/s)": 0.200492 }, { "acc": 0.7628581, "epoch": 1.4232623033992897, "grad_norm": 2.421875, "learning_rate": 2.106319767711606e-06, "loss": 0.95361938, "memory(GiB)": 369.42, "step": 56105, "train_speed(iter/s)": 0.200494 }, { "acc": 0.74133806, "epoch": 1.4233891425672247, "grad_norm": 2.4375, "learning_rate": 2.105464663097913e-06, "loss": 1.01436024, "memory(GiB)": 369.42, "step": 56110, "train_speed(iter/s)": 0.200497 }, { "acc": 0.76270761, "epoch": 1.42351598173516, "grad_norm": 2.390625, "learning_rate": 2.1046096857988033e-06, "loss": 0.95536308, "memory(GiB)": 369.42, "step": 56115, "train_speed(iter/s)": 0.200498 }, { "acc": 0.74573035, "epoch": 1.423642820903095, "grad_norm": 1.8984375, "learning_rate": 2.103754835851889e-06, "loss": 0.98845158, "memory(GiB)": 369.42, "step": 56120, "train_speed(iter/s)": 0.200501 }, { "acc": 0.7601131, "epoch": 1.42376966007103, "grad_norm": 2.015625, "learning_rate": 2.102900113294768e-06, "loss": 0.96562843, "memory(GiB)": 369.42, "step": 56125, "train_speed(iter/s)": 0.200504 }, { "acc": 0.75132608, "epoch": 1.4238964992389649, "grad_norm": 2.15625, "learning_rate": 2.1020455181650356e-06, "loss": 0.96866016, "memory(GiB)": 369.42, "step": 56130, "train_speed(iter/s)": 0.200507 }, { "acc": 0.75003185, "epoch": 1.4240233384069, "grad_norm": 2.40625, "learning_rate": 2.101191050500277e-06, "loss": 0.99468651, "memory(GiB)": 369.42, "step": 56135, "train_speed(iter/s)": 0.200509 }, { "acc": 0.74384351, "epoch": 1.424150177574835, "grad_norm": 2.390625, "learning_rate": 2.1003367103380797e-06, "loss": 0.98082485, "memory(GiB)": 369.42, "step": 56140, "train_speed(iter/s)": 0.200513 }, { "acc": 0.75132713, "epoch": 1.4242770167427703, "grad_norm": 2.546875, "learning_rate": 2.09948249771602e-06, "loss": 1.04879665, "memory(GiB)": 369.42, "step": 56145, "train_speed(iter/s)": 0.200514 }, { "acc": 0.74874682, "epoch": 1.4244038559107053, "grad_norm": 2.8125, "learning_rate": 2.09862841267167e-06, "loss": 0.93269863, "memory(GiB)": 369.42, "step": 56150, "train_speed(iter/s)": 0.200516 }, { "acc": 0.74939508, "epoch": 1.4245306950786403, "grad_norm": 2.484375, "learning_rate": 2.097774455242596e-06, "loss": 1.01138725, "memory(GiB)": 369.42, "step": 56155, "train_speed(iter/s)": 0.200517 }, { "acc": 0.75226898, "epoch": 1.4246575342465753, "grad_norm": 2.015625, "learning_rate": 2.096920625466359e-06, "loss": 0.96342564, "memory(GiB)": 369.42, "step": 56160, "train_speed(iter/s)": 0.200519 }, { "acc": 0.7430275, "epoch": 1.4247843734145105, "grad_norm": 2.203125, "learning_rate": 2.0960669233805136e-06, "loss": 1.03885508, "memory(GiB)": 369.42, "step": 56165, "train_speed(iter/s)": 0.200521 }, { "acc": 0.75391736, "epoch": 1.4249112125824455, "grad_norm": 1.8125, "learning_rate": 2.0952133490226106e-06, "loss": 1.03104191, "memory(GiB)": 369.42, "step": 56170, "train_speed(iter/s)": 0.200525 }, { "acc": 0.74377995, "epoch": 1.4250380517503805, "grad_norm": 2.375, "learning_rate": 2.0943599024301935e-06, "loss": 1.01597328, "memory(GiB)": 369.42, "step": 56175, "train_speed(iter/s)": 0.200528 }, { "acc": 0.75147419, "epoch": 1.4251648909183157, "grad_norm": 2.4375, "learning_rate": 2.093506583640801e-06, "loss": 0.97345695, "memory(GiB)": 369.42, "step": 56180, "train_speed(iter/s)": 0.200529 }, { "acc": 0.75239086, "epoch": 1.4252917300862507, "grad_norm": 2.203125, "learning_rate": 2.092653392691965e-06, "loss": 0.94930182, "memory(GiB)": 369.42, "step": 56185, "train_speed(iter/s)": 0.200531 }, { "acc": 0.7507093, "epoch": 1.4254185692541856, "grad_norm": 2.015625, "learning_rate": 2.091800329621213e-06, "loss": 0.98469639, "memory(GiB)": 369.42, "step": 56190, "train_speed(iter/s)": 0.200534 }, { "acc": 0.76425085, "epoch": 1.4255454084221206, "grad_norm": 1.8671875, "learning_rate": 2.0909473944660667e-06, "loss": 0.92561035, "memory(GiB)": 369.42, "step": 56195, "train_speed(iter/s)": 0.200536 }, { "acc": 0.74939928, "epoch": 1.4256722475900558, "grad_norm": 1.9921875, "learning_rate": 2.0900945872640427e-06, "loss": 1.03291979, "memory(GiB)": 369.42, "step": 56200, "train_speed(iter/s)": 0.200539 }, { "acc": 0.75782323, "epoch": 1.4257990867579908, "grad_norm": 1.9296875, "learning_rate": 2.0892419080526484e-06, "loss": 0.95777617, "memory(GiB)": 369.42, "step": 56205, "train_speed(iter/s)": 0.20054 }, { "acc": 0.76287408, "epoch": 1.425925925925926, "grad_norm": 2.203125, "learning_rate": 2.0883893568693935e-06, "loss": 0.95731602, "memory(GiB)": 369.42, "step": 56210, "train_speed(iter/s)": 0.200543 }, { "acc": 0.76550636, "epoch": 1.426052765093861, "grad_norm": 2.171875, "learning_rate": 2.087536933751773e-06, "loss": 0.99937258, "memory(GiB)": 369.42, "step": 56215, "train_speed(iter/s)": 0.200546 }, { "acc": 0.76079054, "epoch": 1.426179604261796, "grad_norm": 2.703125, "learning_rate": 2.0866846387372814e-06, "loss": 0.98982792, "memory(GiB)": 369.42, "step": 56220, "train_speed(iter/s)": 0.200548 }, { "acc": 0.73647037, "epoch": 1.426306443429731, "grad_norm": 2.21875, "learning_rate": 2.085832471863404e-06, "loss": 1.06646461, "memory(GiB)": 369.42, "step": 56225, "train_speed(iter/s)": 0.20055 }, { "acc": 0.74056139, "epoch": 1.4264332825976662, "grad_norm": 2.265625, "learning_rate": 2.08498043316763e-06, "loss": 1.02948914, "memory(GiB)": 369.42, "step": 56230, "train_speed(iter/s)": 0.200553 }, { "acc": 0.76004734, "epoch": 1.4265601217656012, "grad_norm": 2.40625, "learning_rate": 2.0841285226874282e-06, "loss": 0.99381323, "memory(GiB)": 369.42, "step": 56235, "train_speed(iter/s)": 0.200556 }, { "acc": 0.7423027, "epoch": 1.4266869609335362, "grad_norm": 2.140625, "learning_rate": 2.083276740460271e-06, "loss": 0.98055382, "memory(GiB)": 369.42, "step": 56240, "train_speed(iter/s)": 0.200559 }, { "acc": 0.75725679, "epoch": 1.4268138001014714, "grad_norm": 1.9921875, "learning_rate": 2.082425086523627e-06, "loss": 0.92349052, "memory(GiB)": 369.42, "step": 56245, "train_speed(iter/s)": 0.200561 }, { "acc": 0.75433168, "epoch": 1.4269406392694064, "grad_norm": 2.15625, "learning_rate": 2.0815735609149556e-06, "loss": 0.96171436, "memory(GiB)": 369.42, "step": 56250, "train_speed(iter/s)": 0.200565 }, { "acc": 0.76863146, "epoch": 1.4270674784373414, "grad_norm": 2.0, "learning_rate": 2.080722163671705e-06, "loss": 0.94195881, "memory(GiB)": 369.42, "step": 56255, "train_speed(iter/s)": 0.200568 }, { "acc": 0.75755491, "epoch": 1.4271943176052764, "grad_norm": 2.140625, "learning_rate": 2.079870894831329e-06, "loss": 0.98284435, "memory(GiB)": 369.42, "step": 56260, "train_speed(iter/s)": 0.200569 }, { "acc": 0.74973526, "epoch": 1.4273211567732116, "grad_norm": 2.09375, "learning_rate": 2.0790197544312683e-06, "loss": 1.04633827, "memory(GiB)": 369.42, "step": 56265, "train_speed(iter/s)": 0.200571 }, { "acc": 0.74249196, "epoch": 1.4274479959411466, "grad_norm": 2.046875, "learning_rate": 2.0781687425089624e-06, "loss": 1.00692406, "memory(GiB)": 369.42, "step": 56270, "train_speed(iter/s)": 0.200573 }, { "acc": 0.75093699, "epoch": 1.4275748351090818, "grad_norm": 2.40625, "learning_rate": 2.0773178591018363e-06, "loss": 1.00777664, "memory(GiB)": 369.42, "step": 56275, "train_speed(iter/s)": 0.200575 }, { "acc": 0.76517096, "epoch": 1.4277016742770168, "grad_norm": 2.765625, "learning_rate": 2.076467104247322e-06, "loss": 0.93706656, "memory(GiB)": 369.42, "step": 56280, "train_speed(iter/s)": 0.200579 }, { "acc": 0.76189489, "epoch": 1.4278285134449518, "grad_norm": 2.03125, "learning_rate": 2.0756164779828365e-06, "loss": 0.93687115, "memory(GiB)": 369.42, "step": 56285, "train_speed(iter/s)": 0.200583 }, { "acc": 0.75266724, "epoch": 1.4279553526128868, "grad_norm": 1.9921875, "learning_rate": 2.0747659803457943e-06, "loss": 0.96503191, "memory(GiB)": 369.42, "step": 56290, "train_speed(iter/s)": 0.200586 }, { "acc": 0.76516852, "epoch": 1.428082191780822, "grad_norm": 2.046875, "learning_rate": 2.073915611373605e-06, "loss": 0.94437304, "memory(GiB)": 369.42, "step": 56295, "train_speed(iter/s)": 0.200589 }, { "acc": 0.7461998, "epoch": 1.428209030948757, "grad_norm": 1.828125, "learning_rate": 2.0730653711036713e-06, "loss": 0.99663486, "memory(GiB)": 369.42, "step": 56300, "train_speed(iter/s)": 0.200592 }, { "acc": 0.76612816, "epoch": 1.4283358701166922, "grad_norm": 2.203125, "learning_rate": 2.0722152595733903e-06, "loss": 0.92917061, "memory(GiB)": 369.42, "step": 56305, "train_speed(iter/s)": 0.200596 }, { "acc": 0.76063976, "epoch": 1.4284627092846272, "grad_norm": 2.015625, "learning_rate": 2.0713652768201536e-06, "loss": 0.9393055, "memory(GiB)": 369.42, "step": 56310, "train_speed(iter/s)": 0.200595 }, { "acc": 0.74820266, "epoch": 1.4285895484525621, "grad_norm": 2.859375, "learning_rate": 2.0705154228813477e-06, "loss": 0.99679184, "memory(GiB)": 369.42, "step": 56315, "train_speed(iter/s)": 0.200599 }, { "acc": 0.75940523, "epoch": 1.4287163876204971, "grad_norm": 2.046875, "learning_rate": 2.0696656977943524e-06, "loss": 0.94665298, "memory(GiB)": 369.42, "step": 56320, "train_speed(iter/s)": 0.200601 }, { "acc": 0.74465981, "epoch": 1.4288432267884323, "grad_norm": 2.140625, "learning_rate": 2.068816101596543e-06, "loss": 1.00097561, "memory(GiB)": 369.42, "step": 56325, "train_speed(iter/s)": 0.200603 }, { "acc": 0.76347589, "epoch": 1.4289700659563673, "grad_norm": 1.9140625, "learning_rate": 2.067966634325288e-06, "loss": 0.91834717, "memory(GiB)": 369.42, "step": 56330, "train_speed(iter/s)": 0.200607 }, { "acc": 0.75563593, "epoch": 1.4290969051243023, "grad_norm": 2.25, "learning_rate": 2.0671172960179513e-06, "loss": 0.96974659, "memory(GiB)": 369.42, "step": 56335, "train_speed(iter/s)": 0.20061 }, { "acc": 0.76121964, "epoch": 1.4292237442922375, "grad_norm": 1.9765625, "learning_rate": 2.06626808671189e-06, "loss": 0.92570477, "memory(GiB)": 369.42, "step": 56340, "train_speed(iter/s)": 0.200613 }, { "acc": 0.74700222, "epoch": 1.4293505834601725, "grad_norm": 2.5, "learning_rate": 2.065419006444455e-06, "loss": 0.9934207, "memory(GiB)": 369.42, "step": 56345, "train_speed(iter/s)": 0.200616 }, { "acc": 0.75807505, "epoch": 1.4294774226281075, "grad_norm": 2.078125, "learning_rate": 2.0645700552529973e-06, "loss": 0.95983448, "memory(GiB)": 369.42, "step": 56350, "train_speed(iter/s)": 0.20062 }, { "acc": 0.73725262, "epoch": 1.4296042617960425, "grad_norm": 2.109375, "learning_rate": 2.063721233174853e-06, "loss": 1.04833746, "memory(GiB)": 369.42, "step": 56355, "train_speed(iter/s)": 0.200623 }, { "acc": 0.75871706, "epoch": 1.4297311009639777, "grad_norm": 2.21875, "learning_rate": 2.0628725402473574e-06, "loss": 0.91478615, "memory(GiB)": 369.42, "step": 56360, "train_speed(iter/s)": 0.200625 }, { "acc": 0.7581068, "epoch": 1.4298579401319127, "grad_norm": 2.234375, "learning_rate": 2.0620239765078404e-06, "loss": 0.96323843, "memory(GiB)": 369.42, "step": 56365, "train_speed(iter/s)": 0.200627 }, { "acc": 0.74705925, "epoch": 1.429984779299848, "grad_norm": 1.9921875, "learning_rate": 2.0611755419936285e-06, "loss": 0.99311285, "memory(GiB)": 369.42, "step": 56370, "train_speed(iter/s)": 0.20063 }, { "acc": 0.75317602, "epoch": 1.430111618467783, "grad_norm": 1.8984375, "learning_rate": 2.0603272367420357e-06, "loss": 0.94042587, "memory(GiB)": 369.42, "step": 56375, "train_speed(iter/s)": 0.200632 }, { "acc": 0.76565824, "epoch": 1.4302384576357179, "grad_norm": 2.375, "learning_rate": 2.0594790607903743e-06, "loss": 0.94953537, "memory(GiB)": 369.42, "step": 56380, "train_speed(iter/s)": 0.200637 }, { "acc": 0.75717182, "epoch": 1.4303652968036529, "grad_norm": 1.984375, "learning_rate": 2.0586310141759534e-06, "loss": 0.97611103, "memory(GiB)": 369.42, "step": 56385, "train_speed(iter/s)": 0.200639 }, { "acc": 0.74193296, "epoch": 1.430492135971588, "grad_norm": 2.046875, "learning_rate": 2.0577830969360753e-06, "loss": 1.00202007, "memory(GiB)": 369.42, "step": 56390, "train_speed(iter/s)": 0.200643 }, { "acc": 0.76221495, "epoch": 1.430618975139523, "grad_norm": 2.28125, "learning_rate": 2.0569353091080304e-06, "loss": 0.96013832, "memory(GiB)": 369.42, "step": 56395, "train_speed(iter/s)": 0.200646 }, { "acc": 0.76192007, "epoch": 1.430745814307458, "grad_norm": 2.296875, "learning_rate": 2.056087650729109e-06, "loss": 0.94062347, "memory(GiB)": 369.42, "step": 56400, "train_speed(iter/s)": 0.200648 }, { "acc": 0.75201492, "epoch": 1.4308726534753933, "grad_norm": 2.109375, "learning_rate": 2.0552401218365975e-06, "loss": 1.01870499, "memory(GiB)": 369.42, "step": 56405, "train_speed(iter/s)": 0.20065 }, { "acc": 0.74936218, "epoch": 1.4309994926433283, "grad_norm": 1.8359375, "learning_rate": 2.054392722467775e-06, "loss": 0.97223349, "memory(GiB)": 369.42, "step": 56410, "train_speed(iter/s)": 0.200654 }, { "acc": 0.73430643, "epoch": 1.4311263318112633, "grad_norm": 2.140625, "learning_rate": 2.0535454526599086e-06, "loss": 1.04760551, "memory(GiB)": 369.42, "step": 56415, "train_speed(iter/s)": 0.200657 }, { "acc": 0.75382099, "epoch": 1.4312531709791982, "grad_norm": 2.421875, "learning_rate": 2.0526983124502692e-06, "loss": 0.96417542, "memory(GiB)": 369.42, "step": 56420, "train_speed(iter/s)": 0.200658 }, { "acc": 0.74673347, "epoch": 1.4313800101471335, "grad_norm": 2.46875, "learning_rate": 2.051851301876117e-06, "loss": 1.01457119, "memory(GiB)": 369.42, "step": 56425, "train_speed(iter/s)": 0.20066 }, { "acc": 0.73782301, "epoch": 1.4315068493150684, "grad_norm": 2.5, "learning_rate": 2.0510044209747078e-06, "loss": 1.06099911, "memory(GiB)": 369.42, "step": 56430, "train_speed(iter/s)": 0.200663 }, { "acc": 0.74596806, "epoch": 1.4316336884830037, "grad_norm": 2.25, "learning_rate": 2.05015766978329e-06, "loss": 0.97858009, "memory(GiB)": 369.42, "step": 56435, "train_speed(iter/s)": 0.200667 }, { "acc": 0.7449017, "epoch": 1.4317605276509386, "grad_norm": 2.015625, "learning_rate": 2.049311048339108e-06, "loss": 1.00034542, "memory(GiB)": 369.42, "step": 56440, "train_speed(iter/s)": 0.20067 }, { "acc": 0.75546951, "epoch": 1.4318873668188736, "grad_norm": 2.03125, "learning_rate": 2.0484645566793993e-06, "loss": 1.00597, "memory(GiB)": 369.42, "step": 56445, "train_speed(iter/s)": 0.200672 }, { "acc": 0.7400898, "epoch": 1.4320142059868086, "grad_norm": 2.203125, "learning_rate": 2.0476181948413975e-06, "loss": 1.04919672, "memory(GiB)": 369.42, "step": 56450, "train_speed(iter/s)": 0.200676 }, { "acc": 0.74956665, "epoch": 1.4321410451547438, "grad_norm": 2.359375, "learning_rate": 2.0467719628623293e-06, "loss": 0.97518826, "memory(GiB)": 369.42, "step": 56455, "train_speed(iter/s)": 0.200679 }, { "acc": 0.76100793, "epoch": 1.4322678843226788, "grad_norm": 2.140625, "learning_rate": 2.045925860779415e-06, "loss": 0.96724758, "memory(GiB)": 369.42, "step": 56460, "train_speed(iter/s)": 0.200683 }, { "acc": 0.74460063, "epoch": 1.432394723490614, "grad_norm": 2.0625, "learning_rate": 2.0450798886298707e-06, "loss": 0.98822117, "memory(GiB)": 369.42, "step": 56465, "train_speed(iter/s)": 0.200685 }, { "acc": 0.75005937, "epoch": 1.432521562658549, "grad_norm": 1.9765625, "learning_rate": 2.044234046450905e-06, "loss": 0.9804224, "memory(GiB)": 369.42, "step": 56470, "train_speed(iter/s)": 0.200687 }, { "acc": 0.75875635, "epoch": 1.432648401826484, "grad_norm": 1.96875, "learning_rate": 2.0433883342797233e-06, "loss": 0.96904449, "memory(GiB)": 369.42, "step": 56475, "train_speed(iter/s)": 0.200689 }, { "acc": 0.76360703, "epoch": 1.432775240994419, "grad_norm": 2.0625, "learning_rate": 2.042542752153522e-06, "loss": 0.96735058, "memory(GiB)": 369.42, "step": 56480, "train_speed(iter/s)": 0.200691 }, { "acc": 0.75338407, "epoch": 1.4329020801623542, "grad_norm": 2.71875, "learning_rate": 2.0416973001094953e-06, "loss": 1.00469398, "memory(GiB)": 369.42, "step": 56485, "train_speed(iter/s)": 0.200694 }, { "acc": 0.75608702, "epoch": 1.4330289193302892, "grad_norm": 2.09375, "learning_rate": 2.0408519781848292e-06, "loss": 0.96932755, "memory(GiB)": 369.42, "step": 56490, "train_speed(iter/s)": 0.200696 }, { "acc": 0.76153269, "epoch": 1.4331557584982242, "grad_norm": 1.796875, "learning_rate": 2.0400067864167044e-06, "loss": 0.93598585, "memory(GiB)": 369.42, "step": 56495, "train_speed(iter/s)": 0.200698 }, { "acc": 0.76468134, "epoch": 1.4332825976661594, "grad_norm": 2.171875, "learning_rate": 2.0391617248422967e-06, "loss": 0.95332603, "memory(GiB)": 369.42, "step": 56500, "train_speed(iter/s)": 0.200701 }, { "acc": 0.73522501, "epoch": 1.4334094368340944, "grad_norm": 2.578125, "learning_rate": 2.038316793498774e-06, "loss": 1.05672493, "memory(GiB)": 369.42, "step": 56505, "train_speed(iter/s)": 0.200702 }, { "acc": 0.75040708, "epoch": 1.4335362760020294, "grad_norm": 2.828125, "learning_rate": 2.037471992423305e-06, "loss": 0.98290129, "memory(GiB)": 369.42, "step": 56510, "train_speed(iter/s)": 0.200705 }, { "acc": 0.73358502, "epoch": 1.4336631151699644, "grad_norm": 2.421875, "learning_rate": 2.036627321653043e-06, "loss": 1.04289761, "memory(GiB)": 369.42, "step": 56515, "train_speed(iter/s)": 0.200709 }, { "acc": 0.74712191, "epoch": 1.4337899543378996, "grad_norm": 2.0625, "learning_rate": 2.0357827812251403e-06, "loss": 1.0248065, "memory(GiB)": 369.42, "step": 56520, "train_speed(iter/s)": 0.200708 }, { "acc": 0.75680056, "epoch": 1.4339167935058346, "grad_norm": 2.296875, "learning_rate": 2.0349383711767463e-06, "loss": 0.97148867, "memory(GiB)": 369.42, "step": 56525, "train_speed(iter/s)": 0.200712 }, { "acc": 0.75311537, "epoch": 1.4340436326737698, "grad_norm": 2.171875, "learning_rate": 2.0340940915450026e-06, "loss": 0.99122963, "memory(GiB)": 369.42, "step": 56530, "train_speed(iter/s)": 0.200715 }, { "acc": 0.74963923, "epoch": 1.4341704718417048, "grad_norm": 2.1875, "learning_rate": 2.033249942367041e-06, "loss": 0.99631958, "memory(GiB)": 369.42, "step": 56535, "train_speed(iter/s)": 0.200719 }, { "acc": 0.76341534, "epoch": 1.4342973110096398, "grad_norm": 2.140625, "learning_rate": 2.032405923679991e-06, "loss": 0.93011284, "memory(GiB)": 369.42, "step": 56540, "train_speed(iter/s)": 0.20072 }, { "acc": 0.74607229, "epoch": 1.4344241501775747, "grad_norm": 1.890625, "learning_rate": 2.0315620355209792e-06, "loss": 1.03981915, "memory(GiB)": 369.42, "step": 56545, "train_speed(iter/s)": 0.200722 }, { "acc": 0.75401955, "epoch": 1.43455098934551, "grad_norm": 3.0625, "learning_rate": 2.030718277927124e-06, "loss": 0.9978303, "memory(GiB)": 369.42, "step": 56550, "train_speed(iter/s)": 0.200725 }, { "acc": 0.7617167, "epoch": 1.434677828513445, "grad_norm": 2.015625, "learning_rate": 2.0298746509355326e-06, "loss": 0.94138927, "memory(GiB)": 369.42, "step": 56555, "train_speed(iter/s)": 0.200728 }, { "acc": 0.75824246, "epoch": 1.43480466768138, "grad_norm": 1.890625, "learning_rate": 2.0290311545833166e-06, "loss": 0.9785944, "memory(GiB)": 369.42, "step": 56560, "train_speed(iter/s)": 0.200732 }, { "acc": 0.75708709, "epoch": 1.4349315068493151, "grad_norm": 2.046875, "learning_rate": 2.028187788907574e-06, "loss": 1.02085152, "memory(GiB)": 369.42, "step": 56565, "train_speed(iter/s)": 0.200735 }, { "acc": 0.76106749, "epoch": 1.4350583460172501, "grad_norm": 1.8125, "learning_rate": 2.027344553945403e-06, "loss": 0.98012028, "memory(GiB)": 369.42, "step": 56570, "train_speed(iter/s)": 0.200736 }, { "acc": 0.75240736, "epoch": 1.4351851851851851, "grad_norm": 1.9375, "learning_rate": 2.0265014497338868e-06, "loss": 0.93475962, "memory(GiB)": 369.42, "step": 56575, "train_speed(iter/s)": 0.200738 }, { "acc": 0.75036807, "epoch": 1.43531202435312, "grad_norm": 2.5, "learning_rate": 2.0256584763101145e-06, "loss": 0.96295633, "memory(GiB)": 369.42, "step": 56580, "train_speed(iter/s)": 0.200741 }, { "acc": 0.75088224, "epoch": 1.4354388635210553, "grad_norm": 2.03125, "learning_rate": 2.024815633711162e-06, "loss": 1.01603794, "memory(GiB)": 369.42, "step": 56585, "train_speed(iter/s)": 0.200742 }, { "acc": 0.75367565, "epoch": 1.4355657026889903, "grad_norm": 2.140625, "learning_rate": 2.0239729219741005e-06, "loss": 0.97036629, "memory(GiB)": 369.42, "step": 56590, "train_speed(iter/s)": 0.200743 }, { "acc": 0.75454268, "epoch": 1.4356925418569255, "grad_norm": 2.3125, "learning_rate": 2.0231303411359975e-06, "loss": 0.97205982, "memory(GiB)": 369.42, "step": 56595, "train_speed(iter/s)": 0.200746 }, { "acc": 0.74163475, "epoch": 1.4358193810248605, "grad_norm": 2.625, "learning_rate": 2.0222878912339127e-06, "loss": 1.03047285, "memory(GiB)": 369.42, "step": 56600, "train_speed(iter/s)": 0.200749 }, { "acc": 0.76244855, "epoch": 1.4359462201927955, "grad_norm": 2.296875, "learning_rate": 2.021445572304901e-06, "loss": 1.00649338, "memory(GiB)": 369.42, "step": 56605, "train_speed(iter/s)": 0.200752 }, { "acc": 0.75502815, "epoch": 1.4360730593607305, "grad_norm": 2.578125, "learning_rate": 2.0206033843860113e-06, "loss": 0.94494076, "memory(GiB)": 369.42, "step": 56610, "train_speed(iter/s)": 0.200754 }, { "acc": 0.7518929, "epoch": 1.4361998985286657, "grad_norm": 2.078125, "learning_rate": 2.0197613275142868e-06, "loss": 0.97980919, "memory(GiB)": 369.42, "step": 56615, "train_speed(iter/s)": 0.200757 }, { "acc": 0.75548763, "epoch": 1.4363267376966007, "grad_norm": 1.96875, "learning_rate": 2.018919401726765e-06, "loss": 0.944911, "memory(GiB)": 369.42, "step": 56620, "train_speed(iter/s)": 0.20076 }, { "acc": 0.75144949, "epoch": 1.436453576864536, "grad_norm": 1.8671875, "learning_rate": 2.0180776070604773e-06, "loss": 0.98545856, "memory(GiB)": 369.42, "step": 56625, "train_speed(iter/s)": 0.200764 }, { "acc": 0.75618849, "epoch": 1.4365804160324709, "grad_norm": 2.109375, "learning_rate": 2.0172359435524497e-06, "loss": 0.98431129, "memory(GiB)": 369.42, "step": 56630, "train_speed(iter/s)": 0.200767 }, { "acc": 0.75159545, "epoch": 1.4367072552004059, "grad_norm": 2.4375, "learning_rate": 2.0163944112397027e-06, "loss": 0.9596673, "memory(GiB)": 369.42, "step": 56635, "train_speed(iter/s)": 0.200768 }, { "acc": 0.7446167, "epoch": 1.4368340943683409, "grad_norm": 2.140625, "learning_rate": 2.01555301015925e-06, "loss": 1.01364222, "memory(GiB)": 369.42, "step": 56640, "train_speed(iter/s)": 0.20077 }, { "acc": 0.74944963, "epoch": 1.436960933536276, "grad_norm": 1.9140625, "learning_rate": 2.0147117403480994e-06, "loss": 1.00281134, "memory(GiB)": 369.42, "step": 56645, "train_speed(iter/s)": 0.200774 }, { "acc": 0.75429506, "epoch": 1.437087772704211, "grad_norm": 1.9296875, "learning_rate": 2.0138706018432576e-06, "loss": 0.94681616, "memory(GiB)": 369.42, "step": 56650, "train_speed(iter/s)": 0.200778 }, { "acc": 0.74394045, "epoch": 1.437214611872146, "grad_norm": 1.734375, "learning_rate": 2.0130295946817176e-06, "loss": 0.98064976, "memory(GiB)": 369.42, "step": 56655, "train_speed(iter/s)": 0.200781 }, { "acc": 0.75448136, "epoch": 1.4373414510400813, "grad_norm": 2.03125, "learning_rate": 2.0121887189004713e-06, "loss": 0.97250872, "memory(GiB)": 369.42, "step": 56660, "train_speed(iter/s)": 0.200784 }, { "acc": 0.75255771, "epoch": 1.4374682902080163, "grad_norm": 1.8828125, "learning_rate": 2.0113479745365033e-06, "loss": 0.96982555, "memory(GiB)": 369.42, "step": 56665, "train_speed(iter/s)": 0.200785 }, { "acc": 0.74930148, "epoch": 1.4375951293759512, "grad_norm": 2.21875, "learning_rate": 2.0105073616267984e-06, "loss": 1.01725397, "memory(GiB)": 369.42, "step": 56670, "train_speed(iter/s)": 0.200787 }, { "acc": 0.74866943, "epoch": 1.4377219685438862, "grad_norm": 2.53125, "learning_rate": 2.0096668802083254e-06, "loss": 1.02179585, "memory(GiB)": 369.42, "step": 56675, "train_speed(iter/s)": 0.200789 }, { "acc": 0.75691557, "epoch": 1.4378488077118214, "grad_norm": 2.046875, "learning_rate": 2.0088265303180516e-06, "loss": 0.97311268, "memory(GiB)": 369.42, "step": 56680, "train_speed(iter/s)": 0.20079 }, { "acc": 0.75236969, "epoch": 1.4379756468797564, "grad_norm": 2.453125, "learning_rate": 2.0079863119929434e-06, "loss": 0.99495935, "memory(GiB)": 369.42, "step": 56685, "train_speed(iter/s)": 0.200792 }, { "acc": 0.7495131, "epoch": 1.4381024860476916, "grad_norm": 2.09375, "learning_rate": 2.0071462252699575e-06, "loss": 1.04902306, "memory(GiB)": 369.42, "step": 56690, "train_speed(iter/s)": 0.200796 }, { "acc": 0.7479208, "epoch": 1.4382293252156266, "grad_norm": 2.109375, "learning_rate": 2.006306270186039e-06, "loss": 0.96758833, "memory(GiB)": 369.42, "step": 56695, "train_speed(iter/s)": 0.2008 }, { "acc": 0.74852748, "epoch": 1.4383561643835616, "grad_norm": 1.890625, "learning_rate": 2.005466446778139e-06, "loss": 0.95960331, "memory(GiB)": 369.42, "step": 56700, "train_speed(iter/s)": 0.200801 }, { "acc": 0.74515505, "epoch": 1.4384830035514966, "grad_norm": 2.078125, "learning_rate": 2.0046267550831935e-06, "loss": 1.04126577, "memory(GiB)": 369.42, "step": 56705, "train_speed(iter/s)": 0.200805 }, { "acc": 0.74876776, "epoch": 1.4386098427194318, "grad_norm": 2.109375, "learning_rate": 2.003787195138139e-06, "loss": 1.01829958, "memory(GiB)": 369.42, "step": 56710, "train_speed(iter/s)": 0.200808 }, { "acc": 0.75234413, "epoch": 1.4387366818873668, "grad_norm": 2.046875, "learning_rate": 2.002947766979897e-06, "loss": 1.00277386, "memory(GiB)": 369.42, "step": 56715, "train_speed(iter/s)": 0.200811 }, { "acc": 0.75509319, "epoch": 1.4388635210553018, "grad_norm": 2.09375, "learning_rate": 2.0021084706453945e-06, "loss": 0.96560516, "memory(GiB)": 369.42, "step": 56720, "train_speed(iter/s)": 0.200813 }, { "acc": 0.75110965, "epoch": 1.438990360223237, "grad_norm": 2.09375, "learning_rate": 2.0012693061715467e-06, "loss": 0.93773565, "memory(GiB)": 369.42, "step": 56725, "train_speed(iter/s)": 0.200816 }, { "acc": 0.76846037, "epoch": 1.439117199391172, "grad_norm": 2.515625, "learning_rate": 2.000430273595263e-06, "loss": 0.92154074, "memory(GiB)": 369.42, "step": 56730, "train_speed(iter/s)": 0.200818 }, { "acc": 0.7412077, "epoch": 1.439244038559107, "grad_norm": 2.40625, "learning_rate": 1.9995913729534477e-06, "loss": 1.00273094, "memory(GiB)": 369.42, "step": 56735, "train_speed(iter/s)": 0.20082 }, { "acc": 0.73848472, "epoch": 1.439370877727042, "grad_norm": 1.9609375, "learning_rate": 1.9987526042830003e-06, "loss": 1.0219738, "memory(GiB)": 369.42, "step": 56740, "train_speed(iter/s)": 0.200822 }, { "acc": 0.758465, "epoch": 1.4394977168949772, "grad_norm": 1.8984375, "learning_rate": 1.9979139676208124e-06, "loss": 0.94367342, "memory(GiB)": 369.42, "step": 56745, "train_speed(iter/s)": 0.200824 }, { "acc": 0.74335456, "epoch": 1.4396245560629122, "grad_norm": 2.359375, "learning_rate": 1.9970754630037718e-06, "loss": 0.97762909, "memory(GiB)": 369.42, "step": 56750, "train_speed(iter/s)": 0.200826 }, { "acc": 0.76342773, "epoch": 1.4397513952308474, "grad_norm": 1.8984375, "learning_rate": 1.9962370904687596e-06, "loss": 0.92182121, "memory(GiB)": 369.42, "step": 56755, "train_speed(iter/s)": 0.200829 }, { "acc": 0.7455493, "epoch": 1.4398782343987824, "grad_norm": 2.765625, "learning_rate": 1.9953988500526506e-06, "loss": 1.00921574, "memory(GiB)": 369.42, "step": 56760, "train_speed(iter/s)": 0.200832 }, { "acc": 0.76159263, "epoch": 1.4400050735667174, "grad_norm": 2.03125, "learning_rate": 1.994560741792315e-06, "loss": 0.95608501, "memory(GiB)": 369.42, "step": 56765, "train_speed(iter/s)": 0.200834 }, { "acc": 0.75053425, "epoch": 1.4401319127346524, "grad_norm": 2.265625, "learning_rate": 1.993722765724616e-06, "loss": 0.97910166, "memory(GiB)": 369.42, "step": 56770, "train_speed(iter/s)": 0.200837 }, { "acc": 0.74014282, "epoch": 1.4402587519025876, "grad_norm": 2.328125, "learning_rate": 1.992884921886412e-06, "loss": 1.02811031, "memory(GiB)": 369.42, "step": 56775, "train_speed(iter/s)": 0.20084 }, { "acc": 0.74875369, "epoch": 1.4403855910705226, "grad_norm": 2.21875, "learning_rate": 1.9920472103145555e-06, "loss": 1.01226206, "memory(GiB)": 369.42, "step": 56780, "train_speed(iter/s)": 0.200841 }, { "acc": 0.74780664, "epoch": 1.4405124302384578, "grad_norm": 1.9453125, "learning_rate": 1.99120963104589e-06, "loss": 1.00512028, "memory(GiB)": 369.42, "step": 56785, "train_speed(iter/s)": 0.200844 }, { "acc": 0.75174832, "epoch": 1.4406392694063928, "grad_norm": 2.328125, "learning_rate": 1.990372184117262e-06, "loss": 1.01607189, "memory(GiB)": 369.42, "step": 56790, "train_speed(iter/s)": 0.200847 }, { "acc": 0.73811417, "epoch": 1.4407661085743277, "grad_norm": 2.265625, "learning_rate": 1.9895348695655e-06, "loss": 0.98721676, "memory(GiB)": 369.42, "step": 56795, "train_speed(iter/s)": 0.20085 }, { "acc": 0.76525507, "epoch": 1.4408929477422627, "grad_norm": 2.0, "learning_rate": 1.9886976874274356e-06, "loss": 0.92031431, "memory(GiB)": 369.42, "step": 56800, "train_speed(iter/s)": 0.200853 }, { "acc": 0.76413736, "epoch": 1.441019786910198, "grad_norm": 2.125, "learning_rate": 1.9878606377398895e-06, "loss": 0.90269413, "memory(GiB)": 369.42, "step": 56805, "train_speed(iter/s)": 0.200855 }, { "acc": 0.75346274, "epoch": 1.441146626078133, "grad_norm": 1.703125, "learning_rate": 1.9870237205396844e-06, "loss": 0.96557112, "memory(GiB)": 369.42, "step": 56810, "train_speed(iter/s)": 0.200857 }, { "acc": 0.74942684, "epoch": 1.441273465246068, "grad_norm": 2.296875, "learning_rate": 1.986186935863626e-06, "loss": 0.99192352, "memory(GiB)": 369.42, "step": 56815, "train_speed(iter/s)": 0.20086 }, { "acc": 0.75583401, "epoch": 1.4414003044140031, "grad_norm": 2.1875, "learning_rate": 1.9853502837485207e-06, "loss": 0.97535038, "memory(GiB)": 369.42, "step": 56820, "train_speed(iter/s)": 0.200863 }, { "acc": 0.75255733, "epoch": 1.4415271435819381, "grad_norm": 2.078125, "learning_rate": 1.9845137642311707e-06, "loss": 0.96242599, "memory(GiB)": 369.42, "step": 56825, "train_speed(iter/s)": 0.200865 }, { "acc": 0.76480312, "epoch": 1.441653982749873, "grad_norm": 2.4375, "learning_rate": 1.9836773773483704e-06, "loss": 0.96256142, "memory(GiB)": 369.42, "step": 56830, "train_speed(iter/s)": 0.200869 }, { "acc": 0.74578276, "epoch": 1.441780821917808, "grad_norm": 2.21875, "learning_rate": 1.982841123136904e-06, "loss": 1.01592884, "memory(GiB)": 369.42, "step": 56835, "train_speed(iter/s)": 0.200871 }, { "acc": 0.73954954, "epoch": 1.4419076610857433, "grad_norm": 2.03125, "learning_rate": 1.982005001633554e-06, "loss": 0.99346886, "memory(GiB)": 369.42, "step": 56840, "train_speed(iter/s)": 0.200875 }, { "acc": 0.73023896, "epoch": 1.4420345002536783, "grad_norm": 2.171875, "learning_rate": 1.9811690128751002e-06, "loss": 1.07905273, "memory(GiB)": 369.42, "step": 56845, "train_speed(iter/s)": 0.200877 }, { "acc": 0.75108423, "epoch": 1.4421613394216135, "grad_norm": 1.9609375, "learning_rate": 1.980333156898313e-06, "loss": 0.96395473, "memory(GiB)": 369.42, "step": 56850, "train_speed(iter/s)": 0.200879 }, { "acc": 0.74603992, "epoch": 1.4422881785895485, "grad_norm": 2.0, "learning_rate": 1.979497433739952e-06, "loss": 1.02380161, "memory(GiB)": 369.42, "step": 56855, "train_speed(iter/s)": 0.200882 }, { "acc": 0.7589767, "epoch": 1.4424150177574835, "grad_norm": 2.234375, "learning_rate": 1.9786618434367814e-06, "loss": 0.93732929, "memory(GiB)": 369.42, "step": 56860, "train_speed(iter/s)": 0.200885 }, { "acc": 0.74322872, "epoch": 1.4425418569254185, "grad_norm": 2.40625, "learning_rate": 1.977826386025552e-06, "loss": 0.98833275, "memory(GiB)": 369.42, "step": 56865, "train_speed(iter/s)": 0.200888 }, { "acc": 0.74984975, "epoch": 1.4426686960933537, "grad_norm": 2.0625, "learning_rate": 1.976991061543011e-06, "loss": 0.99744244, "memory(GiB)": 369.42, "step": 56870, "train_speed(iter/s)": 0.20089 }, { "acc": 0.74191885, "epoch": 1.4427955352612887, "grad_norm": 2.421875, "learning_rate": 1.9761558700259e-06, "loss": 1.02540236, "memory(GiB)": 369.42, "step": 56875, "train_speed(iter/s)": 0.200893 }, { "acc": 0.76674671, "epoch": 1.4429223744292237, "grad_norm": 2.296875, "learning_rate": 1.9753208115109546e-06, "loss": 0.9562212, "memory(GiB)": 369.42, "step": 56880, "train_speed(iter/s)": 0.200897 }, { "acc": 0.75838099, "epoch": 1.4430492135971589, "grad_norm": 2.09375, "learning_rate": 1.9744858860349043e-06, "loss": 0.99442835, "memory(GiB)": 369.42, "step": 56885, "train_speed(iter/s)": 0.200899 }, { "acc": 0.75228949, "epoch": 1.4431760527650939, "grad_norm": 1.9609375, "learning_rate": 1.9736510936344723e-06, "loss": 0.96893005, "memory(GiB)": 369.42, "step": 56890, "train_speed(iter/s)": 0.200903 }, { "acc": 0.74369545, "epoch": 1.4433028919330289, "grad_norm": 2.21875, "learning_rate": 1.9728164343463764e-06, "loss": 1.05990763, "memory(GiB)": 369.42, "step": 56895, "train_speed(iter/s)": 0.200905 }, { "acc": 0.76163826, "epoch": 1.4434297311009638, "grad_norm": 2.546875, "learning_rate": 1.97198190820733e-06, "loss": 0.9721036, "memory(GiB)": 369.42, "step": 56900, "train_speed(iter/s)": 0.200909 }, { "acc": 0.74025788, "epoch": 1.443556570268899, "grad_norm": 1.8046875, "learning_rate": 1.9711475152540376e-06, "loss": 1.02260704, "memory(GiB)": 369.42, "step": 56905, "train_speed(iter/s)": 0.200912 }, { "acc": 0.74586825, "epoch": 1.443683409436834, "grad_norm": 2.140625, "learning_rate": 1.9703132555232007e-06, "loss": 1.01475716, "memory(GiB)": 369.42, "step": 56910, "train_speed(iter/s)": 0.200915 }, { "acc": 0.76162319, "epoch": 1.4438102486047693, "grad_norm": 2.25, "learning_rate": 1.9694791290515135e-06, "loss": 0.95996628, "memory(GiB)": 369.42, "step": 56915, "train_speed(iter/s)": 0.200918 }, { "acc": 0.75830994, "epoch": 1.4439370877727042, "grad_norm": 2.34375, "learning_rate": 1.968645135875665e-06, "loss": 1.01628103, "memory(GiB)": 369.42, "step": 56920, "train_speed(iter/s)": 0.200919 }, { "acc": 0.74679971, "epoch": 1.4440639269406392, "grad_norm": 2.09375, "learning_rate": 1.967811276032335e-06, "loss": 0.97950459, "memory(GiB)": 369.42, "step": 56925, "train_speed(iter/s)": 0.200922 }, { "acc": 0.74593182, "epoch": 1.4441907661085742, "grad_norm": 2.25, "learning_rate": 1.966977549558206e-06, "loss": 1.01445236, "memory(GiB)": 369.42, "step": 56930, "train_speed(iter/s)": 0.200925 }, { "acc": 0.75944271, "epoch": 1.4443176052765094, "grad_norm": 1.765625, "learning_rate": 1.966143956489945e-06, "loss": 0.98523293, "memory(GiB)": 369.42, "step": 56935, "train_speed(iter/s)": 0.200926 }, { "acc": 0.76024117, "epoch": 1.4444444444444444, "grad_norm": 2.125, "learning_rate": 1.965310496864217e-06, "loss": 1.00467215, "memory(GiB)": 369.42, "step": 56940, "train_speed(iter/s)": 0.200929 }, { "acc": 0.75818224, "epoch": 1.4445712836123796, "grad_norm": 2.28125, "learning_rate": 1.9644771707176813e-06, "loss": 0.97352867, "memory(GiB)": 369.42, "step": 56945, "train_speed(iter/s)": 0.200931 }, { "acc": 0.75191355, "epoch": 1.4446981227803146, "grad_norm": 1.8203125, "learning_rate": 1.963643978086996e-06, "loss": 0.97768593, "memory(GiB)": 369.42, "step": 56950, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74925704, "epoch": 1.4448249619482496, "grad_norm": 2.375, "learning_rate": 1.9628109190088023e-06, "loss": 0.95994959, "memory(GiB)": 369.42, "step": 56955, "train_speed(iter/s)": 0.200935 }, { "acc": 0.7582345, "epoch": 1.4449518011161846, "grad_norm": 2.390625, "learning_rate": 1.961977993519743e-06, "loss": 1.00972691, "memory(GiB)": 369.42, "step": 56960, "train_speed(iter/s)": 0.200937 }, { "acc": 0.75337434, "epoch": 1.4450786402841198, "grad_norm": 2.09375, "learning_rate": 1.9611452016564574e-06, "loss": 0.95218716, "memory(GiB)": 369.42, "step": 56965, "train_speed(iter/s)": 0.200941 }, { "acc": 0.74664068, "epoch": 1.4452054794520548, "grad_norm": 2.21875, "learning_rate": 1.960312543455575e-06, "loss": 1.05662632, "memory(GiB)": 369.42, "step": 56970, "train_speed(iter/s)": 0.200943 }, { "acc": 0.7458519, "epoch": 1.4453323186199898, "grad_norm": 2.359375, "learning_rate": 1.959480018953716e-06, "loss": 0.98832092, "memory(GiB)": 369.42, "step": 56975, "train_speed(iter/s)": 0.200947 }, { "acc": 0.75043964, "epoch": 1.445459157787925, "grad_norm": 1.609375, "learning_rate": 1.9586476281874994e-06, "loss": 1.00769854, "memory(GiB)": 369.42, "step": 56980, "train_speed(iter/s)": 0.200949 }, { "acc": 0.73383951, "epoch": 1.44558599695586, "grad_norm": 2.5625, "learning_rate": 1.9578153711935403e-06, "loss": 1.03528099, "memory(GiB)": 369.42, "step": 56985, "train_speed(iter/s)": 0.200952 }, { "acc": 0.72885036, "epoch": 1.445712836123795, "grad_norm": 2.4375, "learning_rate": 1.9569832480084456e-06, "loss": 1.0432004, "memory(GiB)": 369.42, "step": 56990, "train_speed(iter/s)": 0.200955 }, { "acc": 0.74181056, "epoch": 1.44583967529173, "grad_norm": 3.015625, "learning_rate": 1.9561512586688096e-06, "loss": 1.03628197, "memory(GiB)": 369.42, "step": 56995, "train_speed(iter/s)": 0.200958 }, { "acc": 0.75020428, "epoch": 1.4459665144596652, "grad_norm": 2.234375, "learning_rate": 1.9553194032112334e-06, "loss": 0.97038565, "memory(GiB)": 369.42, "step": 57000, "train_speed(iter/s)": 0.20096 }, { "epoch": 1.4459665144596652, "eval_acc": 0.7379516952370767, "eval_loss": 0.9695073366165161, "eval_runtime": 385.084, "eval_samples_per_second": 16.542, "eval_steps_per_second": 8.271, "step": 57000 }, { "acc": 0.75120678, "epoch": 1.4460933536276002, "grad_norm": 2.140625, "learning_rate": 1.954487681672303e-06, "loss": 1.00188198, "memory(GiB)": 369.42, "step": 57005, "train_speed(iter/s)": 0.200458 }, { "acc": 0.76427546, "epoch": 1.4462201927955354, "grad_norm": 3.078125, "learning_rate": 1.9536560940886033e-06, "loss": 0.95818615, "memory(GiB)": 369.42, "step": 57010, "train_speed(iter/s)": 0.200461 }, { "acc": 0.7421865, "epoch": 1.4463470319634704, "grad_norm": 2.234375, "learning_rate": 1.9528246404967067e-06, "loss": 1.01549368, "memory(GiB)": 369.42, "step": 57015, "train_speed(iter/s)": 0.200464 }, { "acc": 0.76008511, "epoch": 1.4464738711314054, "grad_norm": 2.375, "learning_rate": 1.951993320933188e-06, "loss": 0.99566326, "memory(GiB)": 369.42, "step": 57020, "train_speed(iter/s)": 0.200466 }, { "acc": 0.74930964, "epoch": 1.4466007102993403, "grad_norm": 2.640625, "learning_rate": 1.951162135434612e-06, "loss": 1.06720877, "memory(GiB)": 369.42, "step": 57025, "train_speed(iter/s)": 0.200469 }, { "acc": 0.7578661, "epoch": 1.4467275494672756, "grad_norm": 2.40625, "learning_rate": 1.9503310840375374e-06, "loss": 0.97132645, "memory(GiB)": 369.42, "step": 57030, "train_speed(iter/s)": 0.200472 }, { "acc": 0.74397192, "epoch": 1.4468543886352105, "grad_norm": 2.4375, "learning_rate": 1.949500166778517e-06, "loss": 1.04936275, "memory(GiB)": 369.42, "step": 57035, "train_speed(iter/s)": 0.200474 }, { "acc": 0.75592523, "epoch": 1.4469812278031455, "grad_norm": 2.15625, "learning_rate": 1.948669383694099e-06, "loss": 0.98113976, "memory(GiB)": 369.42, "step": 57040, "train_speed(iter/s)": 0.200477 }, { "acc": 0.75863142, "epoch": 1.4471080669710807, "grad_norm": 2.03125, "learning_rate": 1.947838734820825e-06, "loss": 0.966255, "memory(GiB)": 369.42, "step": 57045, "train_speed(iter/s)": 0.20048 }, { "acc": 0.75620661, "epoch": 1.4472349061390157, "grad_norm": 2.328125, "learning_rate": 1.94700822019523e-06, "loss": 0.94692755, "memory(GiB)": 369.42, "step": 57050, "train_speed(iter/s)": 0.200481 }, { "acc": 0.76334515, "epoch": 1.4473617453069507, "grad_norm": 2.375, "learning_rate": 1.9461778398538447e-06, "loss": 0.93089972, "memory(GiB)": 369.42, "step": 57055, "train_speed(iter/s)": 0.200484 }, { "acc": 0.72993064, "epoch": 1.4474885844748857, "grad_norm": 2.375, "learning_rate": 1.945347593833191e-06, "loss": 1.09968157, "memory(GiB)": 369.42, "step": 57060, "train_speed(iter/s)": 0.200487 }, { "acc": 0.73840232, "epoch": 1.447615423642821, "grad_norm": 2.171875, "learning_rate": 1.9445174821697893e-06, "loss": 1.04674473, "memory(GiB)": 369.42, "step": 57065, "train_speed(iter/s)": 0.200489 }, { "acc": 0.73332005, "epoch": 1.447742262810756, "grad_norm": 1.96875, "learning_rate": 1.94368750490015e-06, "loss": 1.05319118, "memory(GiB)": 369.42, "step": 57070, "train_speed(iter/s)": 0.200493 }, { "acc": 0.76567545, "epoch": 1.4478691019786911, "grad_norm": 2.390625, "learning_rate": 1.94285766206078e-06, "loss": 0.95137053, "memory(GiB)": 369.42, "step": 57075, "train_speed(iter/s)": 0.200495 }, { "acc": 0.74672861, "epoch": 1.447995941146626, "grad_norm": 2.609375, "learning_rate": 1.9420279536881794e-06, "loss": 1.01918125, "memory(GiB)": 369.42, "step": 57080, "train_speed(iter/s)": 0.200498 }, { "acc": 0.74914203, "epoch": 1.448122780314561, "grad_norm": 1.9296875, "learning_rate": 1.9411983798188398e-06, "loss": 1.02384396, "memory(GiB)": 369.42, "step": 57085, "train_speed(iter/s)": 0.200501 }, { "acc": 0.74771786, "epoch": 1.448249619482496, "grad_norm": 2.046875, "learning_rate": 1.940368940489256e-06, "loss": 0.98731594, "memory(GiB)": 369.42, "step": 57090, "train_speed(iter/s)": 0.200504 }, { "acc": 0.75618272, "epoch": 1.4483764586504313, "grad_norm": 2.546875, "learning_rate": 1.939539635735905e-06, "loss": 0.96148882, "memory(GiB)": 369.42, "step": 57095, "train_speed(iter/s)": 0.200507 }, { "acc": 0.75970783, "epoch": 1.4485032978183663, "grad_norm": 2.140625, "learning_rate": 1.9387104655952625e-06, "loss": 0.97535505, "memory(GiB)": 369.42, "step": 57100, "train_speed(iter/s)": 0.200509 }, { "acc": 0.74523897, "epoch": 1.4486301369863015, "grad_norm": 2.21875, "learning_rate": 1.9378814301038033e-06, "loss": 0.9785759, "memory(GiB)": 369.42, "step": 57105, "train_speed(iter/s)": 0.20051 }, { "acc": 0.75158672, "epoch": 1.4487569761542365, "grad_norm": 2.125, "learning_rate": 1.937052529297992e-06, "loss": 0.96165695, "memory(GiB)": 369.42, "step": 57110, "train_speed(iter/s)": 0.200513 }, { "acc": 0.76151562, "epoch": 1.4488838153221715, "grad_norm": 2.25, "learning_rate": 1.9362237632142838e-06, "loss": 0.94843788, "memory(GiB)": 369.42, "step": 57115, "train_speed(iter/s)": 0.200516 }, { "acc": 0.76068258, "epoch": 1.4490106544901065, "grad_norm": 1.953125, "learning_rate": 1.9353951318891313e-06, "loss": 0.92521572, "memory(GiB)": 369.42, "step": 57120, "train_speed(iter/s)": 0.200517 }, { "acc": 0.76945429, "epoch": 1.4491374936580417, "grad_norm": 2.328125, "learning_rate": 1.9345666353589855e-06, "loss": 0.9704772, "memory(GiB)": 369.42, "step": 57125, "train_speed(iter/s)": 0.20052 }, { "acc": 0.75636187, "epoch": 1.4492643328259767, "grad_norm": 2.53125, "learning_rate": 1.9337382736602868e-06, "loss": 0.95531874, "memory(GiB)": 369.42, "step": 57130, "train_speed(iter/s)": 0.200521 }, { "acc": 0.75017672, "epoch": 1.4493911719939117, "grad_norm": 2.375, "learning_rate": 1.9329100468294646e-06, "loss": 0.98349504, "memory(GiB)": 369.42, "step": 57135, "train_speed(iter/s)": 0.200524 }, { "acc": 0.75712299, "epoch": 1.4495180111618469, "grad_norm": 1.9140625, "learning_rate": 1.9320819549029546e-06, "loss": 0.94896917, "memory(GiB)": 369.42, "step": 57140, "train_speed(iter/s)": 0.200527 }, { "acc": 0.75287018, "epoch": 1.4496448503297819, "grad_norm": 1.921875, "learning_rate": 1.9312539979171774e-06, "loss": 1.00339785, "memory(GiB)": 369.42, "step": 57145, "train_speed(iter/s)": 0.200529 }, { "acc": 0.7398138, "epoch": 1.4497716894977168, "grad_norm": 1.96875, "learning_rate": 1.9304261759085525e-06, "loss": 0.98522892, "memory(GiB)": 369.42, "step": 57150, "train_speed(iter/s)": 0.200531 }, { "acc": 0.75370669, "epoch": 1.4498985286656518, "grad_norm": 2.40625, "learning_rate": 1.929598488913485e-06, "loss": 0.97950535, "memory(GiB)": 369.42, "step": 57155, "train_speed(iter/s)": 0.200534 }, { "acc": 0.75971308, "epoch": 1.450025367833587, "grad_norm": 1.8671875, "learning_rate": 1.928770936968386e-06, "loss": 0.93774776, "memory(GiB)": 369.42, "step": 57160, "train_speed(iter/s)": 0.200535 }, { "acc": 0.75880246, "epoch": 1.450152207001522, "grad_norm": 2.015625, "learning_rate": 1.927943520109653e-06, "loss": 0.95304108, "memory(GiB)": 369.42, "step": 57165, "train_speed(iter/s)": 0.200538 }, { "acc": 0.75092897, "epoch": 1.4502790461694572, "grad_norm": 2.46875, "learning_rate": 1.9271162383736804e-06, "loss": 1.01613045, "memory(GiB)": 369.42, "step": 57170, "train_speed(iter/s)": 0.200541 }, { "acc": 0.76342845, "epoch": 1.4504058853373922, "grad_norm": 2.546875, "learning_rate": 1.9262890917968547e-06, "loss": 0.95746803, "memory(GiB)": 369.42, "step": 57175, "train_speed(iter/s)": 0.200543 }, { "acc": 0.73856754, "epoch": 1.4505327245053272, "grad_norm": 2.34375, "learning_rate": 1.925462080415558e-06, "loss": 1.07482491, "memory(GiB)": 369.42, "step": 57180, "train_speed(iter/s)": 0.200545 }, { "acc": 0.74521418, "epoch": 1.4506595636732622, "grad_norm": 2.234375, "learning_rate": 1.924635204266166e-06, "loss": 1.02991285, "memory(GiB)": 369.42, "step": 57185, "train_speed(iter/s)": 0.200547 }, { "acc": 0.76569233, "epoch": 1.4507864028411974, "grad_norm": 1.828125, "learning_rate": 1.923808463385048e-06, "loss": 0.94459877, "memory(GiB)": 369.42, "step": 57190, "train_speed(iter/s)": 0.20055 }, { "acc": 0.74695754, "epoch": 1.4509132420091324, "grad_norm": 1.9765625, "learning_rate": 1.922981857808568e-06, "loss": 1.01652107, "memory(GiB)": 369.42, "step": 57195, "train_speed(iter/s)": 0.200554 }, { "acc": 0.74625993, "epoch": 1.4510400811770674, "grad_norm": 1.8515625, "learning_rate": 1.9221553875730835e-06, "loss": 1.02790756, "memory(GiB)": 369.42, "step": 57200, "train_speed(iter/s)": 0.200556 }, { "acc": 0.7532608, "epoch": 1.4511669203450026, "grad_norm": 1.984375, "learning_rate": 1.921329052714947e-06, "loss": 0.9803544, "memory(GiB)": 369.42, "step": 57205, "train_speed(iter/s)": 0.200559 }, { "acc": 0.7600193, "epoch": 1.4512937595129376, "grad_norm": 2.609375, "learning_rate": 1.920502853270504e-06, "loss": 0.9801899, "memory(GiB)": 369.42, "step": 57210, "train_speed(iter/s)": 0.200562 }, { "acc": 0.74770722, "epoch": 1.4514205986808726, "grad_norm": 2.125, "learning_rate": 1.919676789276094e-06, "loss": 0.97807541, "memory(GiB)": 369.42, "step": 57215, "train_speed(iter/s)": 0.200565 }, { "acc": 0.74137511, "epoch": 1.4515474378488076, "grad_norm": 2.5, "learning_rate": 1.918850860768052e-06, "loss": 1.04533739, "memory(GiB)": 369.42, "step": 57220, "train_speed(iter/s)": 0.200565 }, { "acc": 0.75667596, "epoch": 1.4516742770167428, "grad_norm": 2.390625, "learning_rate": 1.918025067782704e-06, "loss": 0.95155487, "memory(GiB)": 369.42, "step": 57225, "train_speed(iter/s)": 0.200567 }, { "acc": 0.7469923, "epoch": 1.4518011161846778, "grad_norm": 2.171875, "learning_rate": 1.9171994103563766e-06, "loss": 1.0093956, "memory(GiB)": 369.42, "step": 57230, "train_speed(iter/s)": 0.200569 }, { "acc": 0.7348556, "epoch": 1.451927955352613, "grad_norm": 2.265625, "learning_rate": 1.916373888525381e-06, "loss": 1.05298939, "memory(GiB)": 369.42, "step": 57235, "train_speed(iter/s)": 0.200572 }, { "acc": 0.74012527, "epoch": 1.452054794520548, "grad_norm": 2.53125, "learning_rate": 1.9155485023260294e-06, "loss": 1.02140026, "memory(GiB)": 369.42, "step": 57240, "train_speed(iter/s)": 0.200576 }, { "acc": 0.74533854, "epoch": 1.452181633688483, "grad_norm": 1.96875, "learning_rate": 1.914723251794624e-06, "loss": 0.99974728, "memory(GiB)": 369.42, "step": 57245, "train_speed(iter/s)": 0.200576 }, { "acc": 0.74904251, "epoch": 1.452308472856418, "grad_norm": 2.359375, "learning_rate": 1.9138981369674688e-06, "loss": 1.02276459, "memory(GiB)": 369.42, "step": 57250, "train_speed(iter/s)": 0.200578 }, { "acc": 0.73674841, "epoch": 1.4524353120243532, "grad_norm": 2.03125, "learning_rate": 1.9130731578808493e-06, "loss": 1.04060993, "memory(GiB)": 369.42, "step": 57255, "train_speed(iter/s)": 0.200581 }, { "acc": 0.74590664, "epoch": 1.4525621511922882, "grad_norm": 2.09375, "learning_rate": 1.912248314571053e-06, "loss": 1.00560694, "memory(GiB)": 369.42, "step": 57260, "train_speed(iter/s)": 0.200583 }, { "acc": 0.75964251, "epoch": 1.4526889903602234, "grad_norm": 2.3125, "learning_rate": 1.9114236070743638e-06, "loss": 0.9820116, "memory(GiB)": 369.42, "step": 57265, "train_speed(iter/s)": 0.200584 }, { "acc": 0.75738802, "epoch": 1.4528158295281584, "grad_norm": 2.171875, "learning_rate": 1.910599035427055e-06, "loss": 1.00068932, "memory(GiB)": 369.42, "step": 57270, "train_speed(iter/s)": 0.200588 }, { "acc": 0.75182829, "epoch": 1.4529426686960933, "grad_norm": 2.5, "learning_rate": 1.909774599665392e-06, "loss": 0.95256424, "memory(GiB)": 369.42, "step": 57275, "train_speed(iter/s)": 0.200591 }, { "acc": 0.7627964, "epoch": 1.4530695078640283, "grad_norm": 2.75, "learning_rate": 1.9089502998256382e-06, "loss": 0.9330102, "memory(GiB)": 369.42, "step": 57280, "train_speed(iter/s)": 0.200594 }, { "acc": 0.75644555, "epoch": 1.4531963470319635, "grad_norm": 2.390625, "learning_rate": 1.9081261359440517e-06, "loss": 0.96621361, "memory(GiB)": 369.42, "step": 57285, "train_speed(iter/s)": 0.200595 }, { "acc": 0.76753464, "epoch": 1.4533231861998985, "grad_norm": 2.421875, "learning_rate": 1.9073021080568837e-06, "loss": 0.9587328, "memory(GiB)": 369.42, "step": 57290, "train_speed(iter/s)": 0.200599 }, { "acc": 0.75796461, "epoch": 1.4534500253678335, "grad_norm": 2.546875, "learning_rate": 1.9064782162003737e-06, "loss": 0.94276237, "memory(GiB)": 369.42, "step": 57295, "train_speed(iter/s)": 0.200603 }, { "acc": 0.75446672, "epoch": 1.4535768645357687, "grad_norm": 2.03125, "learning_rate": 1.9056544604107646e-06, "loss": 0.96283569, "memory(GiB)": 369.42, "step": 57300, "train_speed(iter/s)": 0.200606 }, { "acc": 0.74658995, "epoch": 1.4537037037037037, "grad_norm": 2.578125, "learning_rate": 1.9048308407242882e-06, "loss": 0.96123619, "memory(GiB)": 369.42, "step": 57305, "train_speed(iter/s)": 0.200609 }, { "acc": 0.75008111, "epoch": 1.4538305428716387, "grad_norm": 2.65625, "learning_rate": 1.90400735717717e-06, "loss": 1.00619593, "memory(GiB)": 369.42, "step": 57310, "train_speed(iter/s)": 0.200611 }, { "acc": 0.74549179, "epoch": 1.4539573820395737, "grad_norm": 2.125, "learning_rate": 1.903184009805631e-06, "loss": 0.97299385, "memory(GiB)": 369.42, "step": 57315, "train_speed(iter/s)": 0.200613 }, { "acc": 0.74745383, "epoch": 1.454084221207509, "grad_norm": 2.0625, "learning_rate": 1.9023607986458854e-06, "loss": 1.05226822, "memory(GiB)": 369.42, "step": 57320, "train_speed(iter/s)": 0.200616 }, { "acc": 0.75237484, "epoch": 1.454211060375444, "grad_norm": 2.109375, "learning_rate": 1.901537723734142e-06, "loss": 1.01282177, "memory(GiB)": 369.42, "step": 57325, "train_speed(iter/s)": 0.200618 }, { "acc": 0.75202427, "epoch": 1.454337899543379, "grad_norm": 1.8203125, "learning_rate": 1.9007147851066031e-06, "loss": 0.96880693, "memory(GiB)": 369.42, "step": 57330, "train_speed(iter/s)": 0.200621 }, { "acc": 0.76625633, "epoch": 1.454464738711314, "grad_norm": 1.7890625, "learning_rate": 1.8998919827994654e-06, "loss": 0.90015488, "memory(GiB)": 369.42, "step": 57335, "train_speed(iter/s)": 0.200624 }, { "acc": 0.75747123, "epoch": 1.454591577879249, "grad_norm": 2.203125, "learning_rate": 1.899069316848919e-06, "loss": 1.0144393, "memory(GiB)": 369.42, "step": 57340, "train_speed(iter/s)": 0.200626 }, { "acc": 0.75327034, "epoch": 1.454718417047184, "grad_norm": 1.8046875, "learning_rate": 1.8982467872911486e-06, "loss": 1.04383373, "memory(GiB)": 369.42, "step": 57345, "train_speed(iter/s)": 0.200629 }, { "acc": 0.74308014, "epoch": 1.4548452562151193, "grad_norm": 1.953125, "learning_rate": 1.8974243941623332e-06, "loss": 1.00535526, "memory(GiB)": 369.42, "step": 57350, "train_speed(iter/s)": 0.200631 }, { "acc": 0.73632693, "epoch": 1.4549720953830543, "grad_norm": 2.359375, "learning_rate": 1.896602137498645e-06, "loss": 1.03445969, "memory(GiB)": 369.42, "step": 57355, "train_speed(iter/s)": 0.200634 }, { "acc": 0.74867392, "epoch": 1.4550989345509893, "grad_norm": 2.265625, "learning_rate": 1.89578001733625e-06, "loss": 0.99080791, "memory(GiB)": 369.42, "step": 57360, "train_speed(iter/s)": 0.200636 }, { "acc": 0.76016054, "epoch": 1.4552257737189245, "grad_norm": 1.9296875, "learning_rate": 1.8949580337113078e-06, "loss": 0.94852905, "memory(GiB)": 369.42, "step": 57365, "train_speed(iter/s)": 0.200638 }, { "acc": 0.73514132, "epoch": 1.4553526128868595, "grad_norm": 2.234375, "learning_rate": 1.8941361866599778e-06, "loss": 1.03768578, "memory(GiB)": 369.42, "step": 57370, "train_speed(iter/s)": 0.200639 }, { "acc": 0.74055543, "epoch": 1.4554794520547945, "grad_norm": 1.9296875, "learning_rate": 1.893314476218403e-06, "loss": 1.01553993, "memory(GiB)": 369.42, "step": 57375, "train_speed(iter/s)": 0.200643 }, { "acc": 0.7474154, "epoch": 1.4556062912227294, "grad_norm": 2.0625, "learning_rate": 1.8924929024227279e-06, "loss": 0.98504028, "memory(GiB)": 369.42, "step": 57380, "train_speed(iter/s)": 0.200646 }, { "acc": 0.74815373, "epoch": 1.4557331303906647, "grad_norm": 2.390625, "learning_rate": 1.8916714653090874e-06, "loss": 1.02948761, "memory(GiB)": 369.42, "step": 57385, "train_speed(iter/s)": 0.200649 }, { "acc": 0.74353094, "epoch": 1.4558599695585996, "grad_norm": 2.078125, "learning_rate": 1.8908501649136174e-06, "loss": 1.02798634, "memory(GiB)": 369.42, "step": 57390, "train_speed(iter/s)": 0.200651 }, { "acc": 0.76208954, "epoch": 1.4559868087265349, "grad_norm": 2.21875, "learning_rate": 1.8900290012724358e-06, "loss": 0.96345978, "memory(GiB)": 369.42, "step": 57395, "train_speed(iter/s)": 0.200654 }, { "acc": 0.75379906, "epoch": 1.4561136478944698, "grad_norm": 2.515625, "learning_rate": 1.889207974421663e-06, "loss": 0.93561535, "memory(GiB)": 369.42, "step": 57400, "train_speed(iter/s)": 0.200656 }, { "acc": 0.76414437, "epoch": 1.4562404870624048, "grad_norm": 2.828125, "learning_rate": 1.8883870843974134e-06, "loss": 0.95430727, "memory(GiB)": 369.42, "step": 57405, "train_speed(iter/s)": 0.200657 }, { "acc": 0.72934923, "epoch": 1.4563673262303398, "grad_norm": 2.4375, "learning_rate": 1.887566331235794e-06, "loss": 1.09583645, "memory(GiB)": 369.42, "step": 57410, "train_speed(iter/s)": 0.200659 }, { "acc": 0.73227363, "epoch": 1.456494165398275, "grad_norm": 2.515625, "learning_rate": 1.8867457149729013e-06, "loss": 1.11407471, "memory(GiB)": 369.42, "step": 57415, "train_speed(iter/s)": 0.200663 }, { "acc": 0.74988403, "epoch": 1.45662100456621, "grad_norm": 1.9609375, "learning_rate": 1.8859252356448305e-06, "loss": 1.00082121, "memory(GiB)": 369.42, "step": 57420, "train_speed(iter/s)": 0.200664 }, { "acc": 0.77365942, "epoch": 1.4567478437341452, "grad_norm": 1.859375, "learning_rate": 1.8851048932876725e-06, "loss": 0.89856873, "memory(GiB)": 369.42, "step": 57425, "train_speed(iter/s)": 0.200666 }, { "acc": 0.76531782, "epoch": 1.4568746829020802, "grad_norm": 2.125, "learning_rate": 1.8842846879375104e-06, "loss": 0.94815559, "memory(GiB)": 369.42, "step": 57430, "train_speed(iter/s)": 0.20067 }, { "acc": 0.75892673, "epoch": 1.4570015220700152, "grad_norm": 2.421875, "learning_rate": 1.8834646196304146e-06, "loss": 0.95228157, "memory(GiB)": 369.42, "step": 57435, "train_speed(iter/s)": 0.200672 }, { "acc": 0.74655943, "epoch": 1.4571283612379502, "grad_norm": 2.578125, "learning_rate": 1.8826446884024612e-06, "loss": 1.03990383, "memory(GiB)": 369.42, "step": 57440, "train_speed(iter/s)": 0.200674 }, { "acc": 0.75117698, "epoch": 1.4572552004058854, "grad_norm": 1.8828125, "learning_rate": 1.8818248942897122e-06, "loss": 0.97113867, "memory(GiB)": 369.42, "step": 57445, "train_speed(iter/s)": 0.200672 }, { "acc": 0.74867959, "epoch": 1.4573820395738204, "grad_norm": 2.25, "learning_rate": 1.8810052373282277e-06, "loss": 0.99196777, "memory(GiB)": 369.42, "step": 57450, "train_speed(iter/s)": 0.200675 }, { "acc": 0.74228826, "epoch": 1.4575088787417554, "grad_norm": 2.25, "learning_rate": 1.880185717554055e-06, "loss": 1.01849098, "memory(GiB)": 369.42, "step": 57455, "train_speed(iter/s)": 0.200678 }, { "acc": 0.74127731, "epoch": 1.4576357179096906, "grad_norm": 2.1875, "learning_rate": 1.879366335003245e-06, "loss": 1.02735014, "memory(GiB)": 369.42, "step": 57460, "train_speed(iter/s)": 0.200679 }, { "acc": 0.75148611, "epoch": 1.4577625570776256, "grad_norm": 2.46875, "learning_rate": 1.8785470897118362e-06, "loss": 1.00816631, "memory(GiB)": 369.42, "step": 57465, "train_speed(iter/s)": 0.200682 }, { "acc": 0.75594654, "epoch": 1.4578893962455606, "grad_norm": 2.25, "learning_rate": 1.8777279817158627e-06, "loss": 0.97508392, "memory(GiB)": 369.42, "step": 57470, "train_speed(iter/s)": 0.200685 }, { "acc": 0.75294127, "epoch": 1.4580162354134956, "grad_norm": 2.25, "learning_rate": 1.8769090110513522e-06, "loss": 0.97307606, "memory(GiB)": 369.42, "step": 57475, "train_speed(iter/s)": 0.200688 }, { "acc": 0.74372511, "epoch": 1.4581430745814308, "grad_norm": 2.3125, "learning_rate": 1.8760901777543273e-06, "loss": 1.00061874, "memory(GiB)": 369.42, "step": 57480, "train_speed(iter/s)": 0.20069 }, { "acc": 0.75180016, "epoch": 1.4582699137493658, "grad_norm": 2.234375, "learning_rate": 1.8752714818608036e-06, "loss": 0.97644072, "memory(GiB)": 369.42, "step": 57485, "train_speed(iter/s)": 0.200693 }, { "acc": 0.76035128, "epoch": 1.458396752917301, "grad_norm": 2.0625, "learning_rate": 1.874452923406791e-06, "loss": 0.96723137, "memory(GiB)": 369.42, "step": 57490, "train_speed(iter/s)": 0.200695 }, { "acc": 0.74764948, "epoch": 1.458523592085236, "grad_norm": 2.125, "learning_rate": 1.8736345024282937e-06, "loss": 1.0125824, "memory(GiB)": 369.42, "step": 57495, "train_speed(iter/s)": 0.200698 }, { "acc": 0.75547638, "epoch": 1.458650431253171, "grad_norm": 2.375, "learning_rate": 1.8728162189613085e-06, "loss": 0.96307449, "memory(GiB)": 369.42, "step": 57500, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75762663, "epoch": 1.458777270421106, "grad_norm": 2.296875, "learning_rate": 1.8719980730418285e-06, "loss": 0.9898737, "memory(GiB)": 369.42, "step": 57505, "train_speed(iter/s)": 0.200703 }, { "acc": 0.75287299, "epoch": 1.4589041095890412, "grad_norm": 2.828125, "learning_rate": 1.8711800647058388e-06, "loss": 0.99223347, "memory(GiB)": 369.42, "step": 57510, "train_speed(iter/s)": 0.200707 }, { "acc": 0.73922071, "epoch": 1.4590309487569761, "grad_norm": 2.3125, "learning_rate": 1.8703621939893185e-06, "loss": 1.07940044, "memory(GiB)": 369.42, "step": 57515, "train_speed(iter/s)": 0.20071 }, { "acc": 0.75383997, "epoch": 1.4591577879249111, "grad_norm": 2.375, "learning_rate": 1.869544460928242e-06, "loss": 0.98224792, "memory(GiB)": 369.42, "step": 57520, "train_speed(iter/s)": 0.200713 }, { "acc": 0.75704145, "epoch": 1.4592846270928463, "grad_norm": 2.125, "learning_rate": 1.868726865558575e-06, "loss": 0.96115789, "memory(GiB)": 369.42, "step": 57525, "train_speed(iter/s)": 0.200714 }, { "acc": 0.7523303, "epoch": 1.4594114662607813, "grad_norm": 1.8359375, "learning_rate": 1.8679094079162835e-06, "loss": 0.96745176, "memory(GiB)": 369.42, "step": 57530, "train_speed(iter/s)": 0.200717 }, { "acc": 0.75030823, "epoch": 1.4595383054287163, "grad_norm": 2.390625, "learning_rate": 1.867092088037319e-06, "loss": 0.96301842, "memory(GiB)": 369.42, "step": 57535, "train_speed(iter/s)": 0.200719 }, { "acc": 0.74523468, "epoch": 1.4596651445966513, "grad_norm": 2.296875, "learning_rate": 1.8662749059576296e-06, "loss": 1.05154238, "memory(GiB)": 369.42, "step": 57540, "train_speed(iter/s)": 0.200722 }, { "acc": 0.75787354, "epoch": 1.4597919837645865, "grad_norm": 2.5, "learning_rate": 1.865457861713163e-06, "loss": 1.02116938, "memory(GiB)": 369.42, "step": 57545, "train_speed(iter/s)": 0.200723 }, { "acc": 0.75737262, "epoch": 1.4599188229325215, "grad_norm": 2.0, "learning_rate": 1.8646409553398558e-06, "loss": 0.95039644, "memory(GiB)": 369.42, "step": 57550, "train_speed(iter/s)": 0.200724 }, { "acc": 0.75341988, "epoch": 1.4600456621004567, "grad_norm": 2.0, "learning_rate": 1.8638241868736367e-06, "loss": 0.96838436, "memory(GiB)": 369.42, "step": 57555, "train_speed(iter/s)": 0.200726 }, { "acc": 0.75222082, "epoch": 1.4601725012683917, "grad_norm": 2.421875, "learning_rate": 1.8630075563504297e-06, "loss": 1.04320841, "memory(GiB)": 369.42, "step": 57560, "train_speed(iter/s)": 0.200728 }, { "acc": 0.75032892, "epoch": 1.4602993404363267, "grad_norm": 1.96875, "learning_rate": 1.8621910638061575e-06, "loss": 0.99188919, "memory(GiB)": 369.42, "step": 57565, "train_speed(iter/s)": 0.20073 }, { "acc": 0.74122839, "epoch": 1.4604261796042617, "grad_norm": 2.21875, "learning_rate": 1.8613747092767336e-06, "loss": 0.99831467, "memory(GiB)": 369.42, "step": 57570, "train_speed(iter/s)": 0.200733 }, { "acc": 0.75490952, "epoch": 1.460553018772197, "grad_norm": 2.109375, "learning_rate": 1.8605584927980596e-06, "loss": 1.02215385, "memory(GiB)": 369.42, "step": 57575, "train_speed(iter/s)": 0.200735 }, { "acc": 0.75081186, "epoch": 1.4606798579401319, "grad_norm": 4.6875, "learning_rate": 1.859742414406041e-06, "loss": 0.93166733, "memory(GiB)": 369.42, "step": 57580, "train_speed(iter/s)": 0.200736 }, { "acc": 0.76713595, "epoch": 1.460806697108067, "grad_norm": 2.109375, "learning_rate": 1.8589264741365714e-06, "loss": 0.87655048, "memory(GiB)": 369.42, "step": 57585, "train_speed(iter/s)": 0.200739 }, { "acc": 0.73974457, "epoch": 1.460933536276002, "grad_norm": 1.78125, "learning_rate": 1.8581106720255414e-06, "loss": 1.05681057, "memory(GiB)": 369.42, "step": 57590, "train_speed(iter/s)": 0.200742 }, { "acc": 0.75485754, "epoch": 1.461060375443937, "grad_norm": 2.390625, "learning_rate": 1.8572950081088282e-06, "loss": 0.95398769, "memory(GiB)": 369.42, "step": 57595, "train_speed(iter/s)": 0.200745 }, { "acc": 0.74419222, "epoch": 1.461187214611872, "grad_norm": 2.25, "learning_rate": 1.856479482422313e-06, "loss": 1.00028553, "memory(GiB)": 369.42, "step": 57600, "train_speed(iter/s)": 0.200748 }, { "acc": 0.75714064, "epoch": 1.4613140537798073, "grad_norm": 2.4375, "learning_rate": 1.8556640950018651e-06, "loss": 0.9692729, "memory(GiB)": 369.42, "step": 57605, "train_speed(iter/s)": 0.200751 }, { "acc": 0.76151743, "epoch": 1.4614408929477423, "grad_norm": 2.515625, "learning_rate": 1.8548488458833485e-06, "loss": 0.96463394, "memory(GiB)": 369.42, "step": 57610, "train_speed(iter/s)": 0.200751 }, { "acc": 0.75445824, "epoch": 1.4615677321156773, "grad_norm": 1.8671875, "learning_rate": 1.854033735102622e-06, "loss": 0.96957321, "memory(GiB)": 369.42, "step": 57615, "train_speed(iter/s)": 0.200754 }, { "acc": 0.76161222, "epoch": 1.4616945712836125, "grad_norm": 2.53125, "learning_rate": 1.8532187626955377e-06, "loss": 0.9776207, "memory(GiB)": 369.42, "step": 57620, "train_speed(iter/s)": 0.200757 }, { "acc": 0.75257969, "epoch": 1.4618214104515475, "grad_norm": 1.8671875, "learning_rate": 1.8524039286979417e-06, "loss": 0.98928013, "memory(GiB)": 369.42, "step": 57625, "train_speed(iter/s)": 0.200759 }, { "acc": 0.75734844, "epoch": 1.4619482496194824, "grad_norm": 2.171875, "learning_rate": 1.8515892331456736e-06, "loss": 1.00906506, "memory(GiB)": 369.42, "step": 57630, "train_speed(iter/s)": 0.200762 }, { "acc": 0.77920756, "epoch": 1.4620750887874174, "grad_norm": 2.15625, "learning_rate": 1.850774676074568e-06, "loss": 0.89346952, "memory(GiB)": 369.42, "step": 57635, "train_speed(iter/s)": 0.200763 }, { "acc": 0.74027128, "epoch": 1.4622019279553526, "grad_norm": 2.078125, "learning_rate": 1.8499602575204522e-06, "loss": 1.05902462, "memory(GiB)": 369.42, "step": 57640, "train_speed(iter/s)": 0.200764 }, { "acc": 0.74871693, "epoch": 1.4623287671232876, "grad_norm": 1.8984375, "learning_rate": 1.8491459775191484e-06, "loss": 0.9875782, "memory(GiB)": 369.42, "step": 57645, "train_speed(iter/s)": 0.200767 }, { "acc": 0.74005213, "epoch": 1.4624556062912228, "grad_norm": 2.28125, "learning_rate": 1.8483318361064716e-06, "loss": 1.00663052, "memory(GiB)": 369.42, "step": 57650, "train_speed(iter/s)": 0.20077 }, { "acc": 0.75825276, "epoch": 1.4625824454591578, "grad_norm": 2.0625, "learning_rate": 1.847517833318232e-06, "loss": 0.9320466, "memory(GiB)": 369.42, "step": 57655, "train_speed(iter/s)": 0.200773 }, { "acc": 0.76032963, "epoch": 1.4627092846270928, "grad_norm": 2.1875, "learning_rate": 1.8467039691902334e-06, "loss": 0.9080862, "memory(GiB)": 369.42, "step": 57660, "train_speed(iter/s)": 0.200776 }, { "acc": 0.76207666, "epoch": 1.4628361237950278, "grad_norm": 2.15625, "learning_rate": 1.8458902437582705e-06, "loss": 0.91928768, "memory(GiB)": 369.42, "step": 57665, "train_speed(iter/s)": 0.200778 }, { "acc": 0.75635915, "epoch": 1.462962962962963, "grad_norm": 2.015625, "learning_rate": 1.8450766570581402e-06, "loss": 0.99684954, "memory(GiB)": 369.42, "step": 57670, "train_speed(iter/s)": 0.20078 }, { "acc": 0.74446259, "epoch": 1.463089802130898, "grad_norm": 2.15625, "learning_rate": 1.8442632091256223e-06, "loss": 1.01498184, "memory(GiB)": 369.42, "step": 57675, "train_speed(iter/s)": 0.200784 }, { "acc": 0.74606261, "epoch": 1.463216641298833, "grad_norm": 1.8984375, "learning_rate": 1.8434498999964983e-06, "loss": 1.01067753, "memory(GiB)": 369.42, "step": 57680, "train_speed(iter/s)": 0.200783 }, { "acc": 0.75924473, "epoch": 1.4633434804667682, "grad_norm": 2.203125, "learning_rate": 1.8426367297065384e-06, "loss": 0.9906827, "memory(GiB)": 369.42, "step": 57685, "train_speed(iter/s)": 0.200786 }, { "acc": 0.73794031, "epoch": 1.4634703196347032, "grad_norm": 2.0625, "learning_rate": 1.841823698291516e-06, "loss": 1.03532791, "memory(GiB)": 369.42, "step": 57690, "train_speed(iter/s)": 0.200789 }, { "acc": 0.75089006, "epoch": 1.4635971588026382, "grad_norm": 2.15625, "learning_rate": 1.8410108057871851e-06, "loss": 0.97730885, "memory(GiB)": 369.42, "step": 57695, "train_speed(iter/s)": 0.200792 }, { "acc": 0.75149059, "epoch": 1.4637239979705732, "grad_norm": 2.390625, "learning_rate": 1.8401980522293017e-06, "loss": 0.97894869, "memory(GiB)": 369.42, "step": 57700, "train_speed(iter/s)": 0.200793 }, { "acc": 0.74484305, "epoch": 1.4638508371385084, "grad_norm": 2.84375, "learning_rate": 1.839385437653617e-06, "loss": 0.98610344, "memory(GiB)": 369.42, "step": 57705, "train_speed(iter/s)": 0.200794 }, { "acc": 0.74374695, "epoch": 1.4639776763064434, "grad_norm": 1.6875, "learning_rate": 1.8385729620958731e-06, "loss": 1.02799768, "memory(GiB)": 369.42, "step": 57710, "train_speed(iter/s)": 0.200797 }, { "acc": 0.76072206, "epoch": 1.4641045154743786, "grad_norm": 2.609375, "learning_rate": 1.8377606255918024e-06, "loss": 0.97977867, "memory(GiB)": 369.42, "step": 57715, "train_speed(iter/s)": 0.200799 }, { "acc": 0.75329638, "epoch": 1.4642313546423136, "grad_norm": 2.1875, "learning_rate": 1.8369484281771388e-06, "loss": 0.99445162, "memory(GiB)": 369.42, "step": 57720, "train_speed(iter/s)": 0.200802 }, { "acc": 0.75818911, "epoch": 1.4643581938102486, "grad_norm": 1.9296875, "learning_rate": 1.836136369887606e-06, "loss": 0.96525135, "memory(GiB)": 369.42, "step": 57725, "train_speed(iter/s)": 0.200804 }, { "acc": 0.74967337, "epoch": 1.4644850329781836, "grad_norm": 3.296875, "learning_rate": 1.8353244507589225e-06, "loss": 0.97668533, "memory(GiB)": 369.42, "step": 57730, "train_speed(iter/s)": 0.200807 }, { "acc": 0.74413805, "epoch": 1.4646118721461188, "grad_norm": 2.0625, "learning_rate": 1.8345126708267958e-06, "loss": 1.01246929, "memory(GiB)": 369.42, "step": 57735, "train_speed(iter/s)": 0.20081 }, { "acc": 0.76181302, "epoch": 1.4647387113140538, "grad_norm": 2.421875, "learning_rate": 1.8337010301269364e-06, "loss": 0.94426899, "memory(GiB)": 369.42, "step": 57740, "train_speed(iter/s)": 0.200811 }, { "acc": 0.75122881, "epoch": 1.464865550481989, "grad_norm": 2.703125, "learning_rate": 1.8328895286950422e-06, "loss": 1.01217766, "memory(GiB)": 369.42, "step": 57745, "train_speed(iter/s)": 0.200815 }, { "acc": 0.74893899, "epoch": 1.464992389649924, "grad_norm": 2.0625, "learning_rate": 1.8320781665668063e-06, "loss": 0.99424, "memory(GiB)": 369.42, "step": 57750, "train_speed(iter/s)": 0.200817 }, { "acc": 0.73387132, "epoch": 1.465119228817859, "grad_norm": 2.171875, "learning_rate": 1.8312669437779167e-06, "loss": 1.05294285, "memory(GiB)": 369.42, "step": 57755, "train_speed(iter/s)": 0.20082 }, { "acc": 0.73986573, "epoch": 1.465246067985794, "grad_norm": 2.375, "learning_rate": 1.8304558603640544e-06, "loss": 1.00887432, "memory(GiB)": 369.42, "step": 57760, "train_speed(iter/s)": 0.200821 }, { "acc": 0.75339737, "epoch": 1.4653729071537291, "grad_norm": 1.96875, "learning_rate": 1.8296449163608942e-06, "loss": 1.01880589, "memory(GiB)": 369.42, "step": 57765, "train_speed(iter/s)": 0.200824 }, { "acc": 0.76447153, "epoch": 1.4654997463216641, "grad_norm": 1.7265625, "learning_rate": 1.8288341118041052e-06, "loss": 0.90166016, "memory(GiB)": 369.42, "step": 57770, "train_speed(iter/s)": 0.200826 }, { "acc": 0.75920143, "epoch": 1.4656265854895991, "grad_norm": 2.4375, "learning_rate": 1.82802344672935e-06, "loss": 0.99643707, "memory(GiB)": 369.42, "step": 57775, "train_speed(iter/s)": 0.20083 }, { "acc": 0.74310098, "epoch": 1.4657534246575343, "grad_norm": 1.8671875, "learning_rate": 1.8272129211722855e-06, "loss": 1.04379663, "memory(GiB)": 369.42, "step": 57780, "train_speed(iter/s)": 0.200832 }, { "acc": 0.75854712, "epoch": 1.4658802638254693, "grad_norm": 2.390625, "learning_rate": 1.8264025351685627e-06, "loss": 1.0124073, "memory(GiB)": 369.42, "step": 57785, "train_speed(iter/s)": 0.200834 }, { "acc": 0.75515213, "epoch": 1.4660071029934043, "grad_norm": 2.15625, "learning_rate": 1.8255922887538251e-06, "loss": 0.95952969, "memory(GiB)": 369.42, "step": 57790, "train_speed(iter/s)": 0.200837 }, { "acc": 0.75616302, "epoch": 1.4661339421613393, "grad_norm": 2.546875, "learning_rate": 1.8247821819637112e-06, "loss": 0.99798355, "memory(GiB)": 369.42, "step": 57795, "train_speed(iter/s)": 0.200841 }, { "acc": 0.75162163, "epoch": 1.4662607813292745, "grad_norm": 2.5, "learning_rate": 1.8239722148338534e-06, "loss": 0.95892601, "memory(GiB)": 369.42, "step": 57800, "train_speed(iter/s)": 0.200843 }, { "acc": 0.74338417, "epoch": 1.4663876204972095, "grad_norm": 2.0625, "learning_rate": 1.823162387399876e-06, "loss": 1.02563877, "memory(GiB)": 369.42, "step": 57805, "train_speed(iter/s)": 0.200845 }, { "acc": 0.76730385, "epoch": 1.4665144596651447, "grad_norm": 2.296875, "learning_rate": 1.822352699697404e-06, "loss": 0.95621891, "memory(GiB)": 369.42, "step": 57810, "train_speed(iter/s)": 0.200849 }, { "acc": 0.75960031, "epoch": 1.4666412988330797, "grad_norm": 1.9453125, "learning_rate": 1.8215431517620452e-06, "loss": 0.96888304, "memory(GiB)": 369.42, "step": 57815, "train_speed(iter/s)": 0.200852 }, { "acc": 0.73809619, "epoch": 1.4667681380010147, "grad_norm": 2.328125, "learning_rate": 1.8207337436294097e-06, "loss": 1.12690392, "memory(GiB)": 369.42, "step": 57820, "train_speed(iter/s)": 0.200853 }, { "acc": 0.73781767, "epoch": 1.4668949771689497, "grad_norm": 2.34375, "learning_rate": 1.819924475335097e-06, "loss": 1.01250668, "memory(GiB)": 369.42, "step": 57825, "train_speed(iter/s)": 0.200856 }, { "acc": 0.74751563, "epoch": 1.4670218163368849, "grad_norm": 1.9609375, "learning_rate": 1.8191153469147065e-06, "loss": 1.00415955, "memory(GiB)": 369.42, "step": 57830, "train_speed(iter/s)": 0.200859 }, { "acc": 0.73983812, "epoch": 1.4671486555048199, "grad_norm": 2.140625, "learning_rate": 1.8183063584038236e-06, "loss": 1.04040718, "memory(GiB)": 369.42, "step": 57835, "train_speed(iter/s)": 0.200862 }, { "acc": 0.75562677, "epoch": 1.4672754946727549, "grad_norm": 2.1875, "learning_rate": 1.8174975098380304e-06, "loss": 1.00390053, "memory(GiB)": 369.42, "step": 57840, "train_speed(iter/s)": 0.200866 }, { "acc": 0.7496645, "epoch": 1.46740233384069, "grad_norm": 2.125, "learning_rate": 1.8166888012529078e-06, "loss": 1.02894554, "memory(GiB)": 369.42, "step": 57845, "train_speed(iter/s)": 0.200867 }, { "acc": 0.74839087, "epoch": 1.467529173008625, "grad_norm": 2.296875, "learning_rate": 1.8158802326840252e-06, "loss": 0.98134022, "memory(GiB)": 369.42, "step": 57850, "train_speed(iter/s)": 0.200869 }, { "acc": 0.76409125, "epoch": 1.46765601217656, "grad_norm": 1.7734375, "learning_rate": 1.8150718041669447e-06, "loss": 0.92982216, "memory(GiB)": 369.42, "step": 57855, "train_speed(iter/s)": 0.200871 }, { "acc": 0.75009775, "epoch": 1.467782851344495, "grad_norm": 2.25, "learning_rate": 1.814263515737224e-06, "loss": 1.00381117, "memory(GiB)": 369.42, "step": 57860, "train_speed(iter/s)": 0.200874 }, { "acc": 0.75084929, "epoch": 1.4679096905124303, "grad_norm": 1.734375, "learning_rate": 1.813455367430419e-06, "loss": 0.92643633, "memory(GiB)": 369.42, "step": 57865, "train_speed(iter/s)": 0.200877 }, { "acc": 0.74595385, "epoch": 1.4680365296803652, "grad_norm": 2.21875, "learning_rate": 1.812647359282076e-06, "loss": 1.06213741, "memory(GiB)": 369.42, "step": 57870, "train_speed(iter/s)": 0.200879 }, { "acc": 0.73874483, "epoch": 1.4681633688483005, "grad_norm": 2.109375, "learning_rate": 1.8118394913277287e-06, "loss": 0.99679794, "memory(GiB)": 369.42, "step": 57875, "train_speed(iter/s)": 0.200882 }, { "acc": 0.74956546, "epoch": 1.4682902080162354, "grad_norm": 2.375, "learning_rate": 1.8110317636029162e-06, "loss": 0.98583393, "memory(GiB)": 369.42, "step": 57880, "train_speed(iter/s)": 0.200884 }, { "acc": 0.75739231, "epoch": 1.4684170471841704, "grad_norm": 2.390625, "learning_rate": 1.810224176143165e-06, "loss": 0.99678297, "memory(GiB)": 369.42, "step": 57885, "train_speed(iter/s)": 0.200887 }, { "acc": 0.74531164, "epoch": 1.4685438863521054, "grad_norm": 2.15625, "learning_rate": 1.8094167289839953e-06, "loss": 0.90548058, "memory(GiB)": 369.42, "step": 57890, "train_speed(iter/s)": 0.200889 }, { "acc": 0.7413126, "epoch": 1.4686707255200406, "grad_norm": 2.40625, "learning_rate": 1.808609422160923e-06, "loss": 1.05731411, "memory(GiB)": 369.42, "step": 57895, "train_speed(iter/s)": 0.200891 }, { "acc": 0.76297426, "epoch": 1.4687975646879756, "grad_norm": 2.40625, "learning_rate": 1.8078022557094571e-06, "loss": 0.99150162, "memory(GiB)": 369.42, "step": 57900, "train_speed(iter/s)": 0.200894 }, { "acc": 0.75044107, "epoch": 1.4689244038559108, "grad_norm": 2.53125, "learning_rate": 1.8069952296651e-06, "loss": 1.03257923, "memory(GiB)": 369.42, "step": 57905, "train_speed(iter/s)": 0.200897 }, { "acc": 0.74739037, "epoch": 1.4690512430238458, "grad_norm": 2.890625, "learning_rate": 1.8061883440633481e-06, "loss": 1.01414452, "memory(GiB)": 369.42, "step": 57910, "train_speed(iter/s)": 0.2009 }, { "acc": 0.76154261, "epoch": 1.4691780821917808, "grad_norm": 2.265625, "learning_rate": 1.8053815989396927e-06, "loss": 0.98190002, "memory(GiB)": 369.42, "step": 57915, "train_speed(iter/s)": 0.200902 }, { "acc": 0.76739802, "epoch": 1.4693049213597158, "grad_norm": 2.15625, "learning_rate": 1.8045749943296171e-06, "loss": 0.92266903, "memory(GiB)": 369.42, "step": 57920, "train_speed(iter/s)": 0.200905 }, { "acc": 0.75203457, "epoch": 1.469431760527651, "grad_norm": 2.265625, "learning_rate": 1.8037685302686003e-06, "loss": 1.01186752, "memory(GiB)": 369.42, "step": 57925, "train_speed(iter/s)": 0.200907 }, { "acc": 0.74670334, "epoch": 1.469558599695586, "grad_norm": 2.28125, "learning_rate": 1.8029622067921133e-06, "loss": 1.03595228, "memory(GiB)": 369.42, "step": 57930, "train_speed(iter/s)": 0.200909 }, { "acc": 0.76481133, "epoch": 1.469685438863521, "grad_norm": 2.34375, "learning_rate": 1.8021560239356223e-06, "loss": 0.95131664, "memory(GiB)": 369.42, "step": 57935, "train_speed(iter/s)": 0.200912 }, { "acc": 0.76101208, "epoch": 1.4698122780314562, "grad_norm": 1.9296875, "learning_rate": 1.8013499817345865e-06, "loss": 0.98135586, "memory(GiB)": 369.42, "step": 57940, "train_speed(iter/s)": 0.200914 }, { "acc": 0.75782471, "epoch": 1.4699391171993912, "grad_norm": 2.078125, "learning_rate": 1.8005440802244595e-06, "loss": 0.99773264, "memory(GiB)": 369.42, "step": 57945, "train_speed(iter/s)": 0.200917 }, { "acc": 0.75471611, "epoch": 1.4700659563673262, "grad_norm": 1.8671875, "learning_rate": 1.7997383194406887e-06, "loss": 0.94976425, "memory(GiB)": 369.42, "step": 57950, "train_speed(iter/s)": 0.20092 }, { "acc": 0.7518012, "epoch": 1.4701927955352612, "grad_norm": 1.96875, "learning_rate": 1.7989326994187146e-06, "loss": 1.01799202, "memory(GiB)": 369.42, "step": 57955, "train_speed(iter/s)": 0.200919 }, { "acc": 0.75105457, "epoch": 1.4703196347031964, "grad_norm": 2.328125, "learning_rate": 1.798127220193972e-06, "loss": 1.00058632, "memory(GiB)": 369.42, "step": 57960, "train_speed(iter/s)": 0.200919 }, { "acc": 0.73766203, "epoch": 1.4704464738711314, "grad_norm": 2.484375, "learning_rate": 1.7973218818018878e-06, "loss": 1.02329922, "memory(GiB)": 369.42, "step": 57965, "train_speed(iter/s)": 0.200922 }, { "acc": 0.75210314, "epoch": 1.4705733130390666, "grad_norm": 2.125, "learning_rate": 1.7965166842778897e-06, "loss": 0.97249241, "memory(GiB)": 369.42, "step": 57970, "train_speed(iter/s)": 0.200925 }, { "acc": 0.75799532, "epoch": 1.4707001522070016, "grad_norm": 1.859375, "learning_rate": 1.7957116276573888e-06, "loss": 0.97541046, "memory(GiB)": 369.42, "step": 57975, "train_speed(iter/s)": 0.200928 }, { "acc": 0.7612752, "epoch": 1.4708269913749366, "grad_norm": 2.203125, "learning_rate": 1.7949067119757951e-06, "loss": 0.91139135, "memory(GiB)": 369.42, "step": 57980, "train_speed(iter/s)": 0.20093 }, { "acc": 0.74241943, "epoch": 1.4709538305428715, "grad_norm": 2.25, "learning_rate": 1.7941019372685154e-06, "loss": 1.01660461, "memory(GiB)": 369.42, "step": 57985, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74598455, "epoch": 1.4710806697108068, "grad_norm": 2.515625, "learning_rate": 1.7932973035709471e-06, "loss": 0.91355038, "memory(GiB)": 369.42, "step": 57990, "train_speed(iter/s)": 0.200936 }, { "acc": 0.74718075, "epoch": 1.4712075088787417, "grad_norm": 2.40625, "learning_rate": 1.792492810918479e-06, "loss": 0.98654776, "memory(GiB)": 369.42, "step": 57995, "train_speed(iter/s)": 0.200938 }, { "acc": 0.7284565, "epoch": 1.4713343480466767, "grad_norm": 1.9453125, "learning_rate": 1.7916884593464957e-06, "loss": 1.02991943, "memory(GiB)": 369.42, "step": 58000, "train_speed(iter/s)": 0.200941 }, { "epoch": 1.4713343480466767, "eval_acc": 0.7379153494811539, "eval_loss": 0.9695245623588562, "eval_runtime": 384.8996, "eval_samples_per_second": 16.55, "eval_steps_per_second": 8.275, "step": 58000 }, { "acc": 0.74831066, "epoch": 1.471461187214612, "grad_norm": 2.234375, "learning_rate": 1.79088424889038e-06, "loss": 1.0157383, "memory(GiB)": 369.42, "step": 58005, "train_speed(iter/s)": 0.200448 }, { "acc": 0.75623226, "epoch": 1.471588026382547, "grad_norm": 2.671875, "learning_rate": 1.7900801795855043e-06, "loss": 0.9625289, "memory(GiB)": 369.42, "step": 58010, "train_speed(iter/s)": 0.200451 }, { "acc": 0.75242896, "epoch": 1.471714865550482, "grad_norm": 2.421875, "learning_rate": 1.7892762514672303e-06, "loss": 0.95668898, "memory(GiB)": 369.42, "step": 58015, "train_speed(iter/s)": 0.200454 }, { "acc": 0.75797319, "epoch": 1.471841704718417, "grad_norm": 2.234375, "learning_rate": 1.7884724645709228e-06, "loss": 0.96399364, "memory(GiB)": 369.42, "step": 58020, "train_speed(iter/s)": 0.200457 }, { "acc": 0.75433679, "epoch": 1.4719685438863521, "grad_norm": 1.9921875, "learning_rate": 1.7876688189319353e-06, "loss": 0.98500137, "memory(GiB)": 369.42, "step": 58025, "train_speed(iter/s)": 0.200459 }, { "acc": 0.75165882, "epoch": 1.472095383054287, "grad_norm": 1.8203125, "learning_rate": 1.7868653145856163e-06, "loss": 0.95573845, "memory(GiB)": 369.42, "step": 58030, "train_speed(iter/s)": 0.200462 }, { "acc": 0.76557832, "epoch": 1.4722222222222223, "grad_norm": 2.296875, "learning_rate": 1.7860619515673034e-06, "loss": 0.89192715, "memory(GiB)": 369.42, "step": 58035, "train_speed(iter/s)": 0.200465 }, { "acc": 0.76136169, "epoch": 1.4723490613901573, "grad_norm": 1.984375, "learning_rate": 1.785258729912337e-06, "loss": 0.9784132, "memory(GiB)": 369.42, "step": 58040, "train_speed(iter/s)": 0.200467 }, { "acc": 0.73509531, "epoch": 1.4724759005580923, "grad_norm": 2.28125, "learning_rate": 1.784455649656044e-06, "loss": 1.06883202, "memory(GiB)": 369.42, "step": 58045, "train_speed(iter/s)": 0.20047 }, { "acc": 0.75428324, "epoch": 1.4726027397260273, "grad_norm": 1.8984375, "learning_rate": 1.7836527108337482e-06, "loss": 1.00431347, "memory(GiB)": 369.42, "step": 58050, "train_speed(iter/s)": 0.200472 }, { "acc": 0.76420412, "epoch": 1.4727295788939625, "grad_norm": 1.734375, "learning_rate": 1.782849913480766e-06, "loss": 0.97465019, "memory(GiB)": 369.42, "step": 58055, "train_speed(iter/s)": 0.200475 }, { "acc": 0.7495944, "epoch": 1.4728564180618975, "grad_norm": 1.984375, "learning_rate": 1.7820472576324078e-06, "loss": 0.98928204, "memory(GiB)": 369.42, "step": 58060, "train_speed(iter/s)": 0.200477 }, { "acc": 0.76301527, "epoch": 1.4729832572298327, "grad_norm": 2.390625, "learning_rate": 1.7812447433239789e-06, "loss": 0.94967632, "memory(GiB)": 369.42, "step": 58065, "train_speed(iter/s)": 0.200479 }, { "acc": 0.74923477, "epoch": 1.4731100963977677, "grad_norm": 2.078125, "learning_rate": 1.7804423705907764e-06, "loss": 1.07938747, "memory(GiB)": 369.42, "step": 58070, "train_speed(iter/s)": 0.200481 }, { "acc": 0.75484457, "epoch": 1.4732369355657027, "grad_norm": 1.9921875, "learning_rate": 1.779640139468093e-06, "loss": 0.95872087, "memory(GiB)": 369.42, "step": 58075, "train_speed(iter/s)": 0.200483 }, { "acc": 0.75007486, "epoch": 1.4733637747336377, "grad_norm": 1.984375, "learning_rate": 1.778838049991214e-06, "loss": 0.98195724, "memory(GiB)": 369.42, "step": 58080, "train_speed(iter/s)": 0.200487 }, { "acc": 0.75479875, "epoch": 1.4734906139015729, "grad_norm": 1.9609375, "learning_rate": 1.778036102195419e-06, "loss": 0.97595701, "memory(GiB)": 369.42, "step": 58085, "train_speed(iter/s)": 0.20049 }, { "acc": 0.7516777, "epoch": 1.4736174530695079, "grad_norm": 2.09375, "learning_rate": 1.7772342961159817e-06, "loss": 1.00753937, "memory(GiB)": 369.42, "step": 58090, "train_speed(iter/s)": 0.200493 }, { "acc": 0.75890379, "epoch": 1.4737442922374429, "grad_norm": 2.265625, "learning_rate": 1.7764326317881681e-06, "loss": 0.95565052, "memory(GiB)": 369.42, "step": 58095, "train_speed(iter/s)": 0.200497 }, { "acc": 0.75102749, "epoch": 1.473871131405378, "grad_norm": 2.609375, "learning_rate": 1.77563110924724e-06, "loss": 1.00836983, "memory(GiB)": 369.42, "step": 58100, "train_speed(iter/s)": 0.2005 }, { "acc": 0.74702816, "epoch": 1.473997970573313, "grad_norm": 2.21875, "learning_rate": 1.7748297285284494e-06, "loss": 0.98832722, "memory(GiB)": 369.42, "step": 58105, "train_speed(iter/s)": 0.200502 }, { "acc": 0.75957732, "epoch": 1.474124809741248, "grad_norm": 2.46875, "learning_rate": 1.7740284896670507e-06, "loss": 0.93738785, "memory(GiB)": 369.42, "step": 58110, "train_speed(iter/s)": 0.200505 }, { "acc": 0.75613909, "epoch": 1.474251648909183, "grad_norm": 2.140625, "learning_rate": 1.7732273926982796e-06, "loss": 0.9552352, "memory(GiB)": 369.42, "step": 58115, "train_speed(iter/s)": 0.200507 }, { "acc": 0.7528152, "epoch": 1.4743784880771182, "grad_norm": 1.9453125, "learning_rate": 1.7724264376573747e-06, "loss": 0.97016144, "memory(GiB)": 369.42, "step": 58120, "train_speed(iter/s)": 0.20051 }, { "acc": 0.74372687, "epoch": 1.4745053272450532, "grad_norm": 2.171875, "learning_rate": 1.7716256245795631e-06, "loss": 1.03050661, "memory(GiB)": 369.42, "step": 58125, "train_speed(iter/s)": 0.200512 }, { "acc": 0.75531387, "epoch": 1.4746321664129884, "grad_norm": 2.09375, "learning_rate": 1.7708249535000737e-06, "loss": 0.96223373, "memory(GiB)": 369.42, "step": 58130, "train_speed(iter/s)": 0.200516 }, { "acc": 0.74243555, "epoch": 1.4747590055809234, "grad_norm": 1.984375, "learning_rate": 1.7700244244541182e-06, "loss": 1.00353441, "memory(GiB)": 369.42, "step": 58135, "train_speed(iter/s)": 0.200519 }, { "acc": 0.76470661, "epoch": 1.4748858447488584, "grad_norm": 2.171875, "learning_rate": 1.7692240374769081e-06, "loss": 0.97963037, "memory(GiB)": 369.42, "step": 58140, "train_speed(iter/s)": 0.200521 }, { "acc": 0.75194921, "epoch": 1.4750126839167934, "grad_norm": 2.140625, "learning_rate": 1.7684237926036507e-06, "loss": 1.02393875, "memory(GiB)": 369.42, "step": 58145, "train_speed(iter/s)": 0.200523 }, { "acc": 0.74503293, "epoch": 1.4751395230847286, "grad_norm": 1.9609375, "learning_rate": 1.7676236898695442e-06, "loss": 1.03119783, "memory(GiB)": 369.42, "step": 58150, "train_speed(iter/s)": 0.200527 }, { "acc": 0.75367031, "epoch": 1.4752663622526636, "grad_norm": 2.140625, "learning_rate": 1.7668237293097762e-06, "loss": 1.0191721, "memory(GiB)": 369.42, "step": 58155, "train_speed(iter/s)": 0.200529 }, { "acc": 0.76367254, "epoch": 1.4753932014205986, "grad_norm": 2.125, "learning_rate": 1.7660239109595374e-06, "loss": 0.9556572, "memory(GiB)": 369.42, "step": 58160, "train_speed(iter/s)": 0.200531 }, { "acc": 0.72709079, "epoch": 1.4755200405885338, "grad_norm": 2.6875, "learning_rate": 1.7652242348540056e-06, "loss": 1.07014122, "memory(GiB)": 369.42, "step": 58165, "train_speed(iter/s)": 0.200532 }, { "acc": 0.75729074, "epoch": 1.4756468797564688, "grad_norm": 2.15625, "learning_rate": 1.764424701028356e-06, "loss": 0.92999659, "memory(GiB)": 369.42, "step": 58170, "train_speed(iter/s)": 0.200535 }, { "acc": 0.74226279, "epoch": 1.4757737189244038, "grad_norm": 2.21875, "learning_rate": 1.7636253095177507e-06, "loss": 1.057024, "memory(GiB)": 369.42, "step": 58175, "train_speed(iter/s)": 0.200537 }, { "acc": 0.73839316, "epoch": 1.4759005580923388, "grad_norm": 2.0, "learning_rate": 1.762826060357355e-06, "loss": 0.95610123, "memory(GiB)": 369.42, "step": 58180, "train_speed(iter/s)": 0.20054 }, { "acc": 0.76378212, "epoch": 1.476027397260274, "grad_norm": 2.046875, "learning_rate": 1.762026953582322e-06, "loss": 0.94072151, "memory(GiB)": 369.42, "step": 58185, "train_speed(iter/s)": 0.200544 }, { "acc": 0.75720949, "epoch": 1.476154236428209, "grad_norm": 2.453125, "learning_rate": 1.7612279892278006e-06, "loss": 0.95449085, "memory(GiB)": 369.42, "step": 58190, "train_speed(iter/s)": 0.200542 }, { "acc": 0.75293007, "epoch": 1.4762810755961442, "grad_norm": 2.078125, "learning_rate": 1.7604291673289314e-06, "loss": 1.00442753, "memory(GiB)": 369.42, "step": 58195, "train_speed(iter/s)": 0.200545 }, { "acc": 0.75341191, "epoch": 1.4764079147640792, "grad_norm": 2.109375, "learning_rate": 1.759630487920852e-06, "loss": 0.92873907, "memory(GiB)": 369.42, "step": 58200, "train_speed(iter/s)": 0.200547 }, { "acc": 0.75477753, "epoch": 1.4765347539320142, "grad_norm": 2.0, "learning_rate": 1.7588319510386903e-06, "loss": 0.96297712, "memory(GiB)": 369.42, "step": 58205, "train_speed(iter/s)": 0.20055 }, { "acc": 0.75126286, "epoch": 1.4766615930999492, "grad_norm": 2.0, "learning_rate": 1.7580335567175704e-06, "loss": 0.94073887, "memory(GiB)": 369.42, "step": 58210, "train_speed(iter/s)": 0.200552 }, { "acc": 0.73664961, "epoch": 1.4767884322678844, "grad_norm": 1.953125, "learning_rate": 1.7572353049926094e-06, "loss": 1.08010521, "memory(GiB)": 369.42, "step": 58215, "train_speed(iter/s)": 0.200555 }, { "acc": 0.76357107, "epoch": 1.4769152714358194, "grad_norm": 2.015625, "learning_rate": 1.7564371958989173e-06, "loss": 0.91525412, "memory(GiB)": 369.42, "step": 58220, "train_speed(iter/s)": 0.200557 }, { "acc": 0.75890222, "epoch": 1.4770421106037546, "grad_norm": 2.390625, "learning_rate": 1.7556392294715984e-06, "loss": 1.00414391, "memory(GiB)": 369.42, "step": 58225, "train_speed(iter/s)": 0.20056 }, { "acc": 0.73669405, "epoch": 1.4771689497716896, "grad_norm": 2.21875, "learning_rate": 1.7548414057457518e-06, "loss": 1.00287647, "memory(GiB)": 369.42, "step": 58230, "train_speed(iter/s)": 0.200562 }, { "acc": 0.74484482, "epoch": 1.4772957889396245, "grad_norm": 1.921875, "learning_rate": 1.7540437247564685e-06, "loss": 0.9878334, "memory(GiB)": 369.42, "step": 58235, "train_speed(iter/s)": 0.200564 }, { "acc": 0.76719484, "epoch": 1.4774226281075595, "grad_norm": 1.828125, "learning_rate": 1.7532461865388345e-06, "loss": 0.92032909, "memory(GiB)": 369.42, "step": 58240, "train_speed(iter/s)": 0.200567 }, { "acc": 0.75055428, "epoch": 1.4775494672754947, "grad_norm": 2.0625, "learning_rate": 1.752448791127927e-06, "loss": 0.97960758, "memory(GiB)": 369.42, "step": 58245, "train_speed(iter/s)": 0.20057 }, { "acc": 0.74927073, "epoch": 1.4776763064434297, "grad_norm": 2.328125, "learning_rate": 1.7516515385588245e-06, "loss": 1.02100058, "memory(GiB)": 369.42, "step": 58250, "train_speed(iter/s)": 0.200573 }, { "acc": 0.75019145, "epoch": 1.4778031456113647, "grad_norm": 2.34375, "learning_rate": 1.7508544288665885e-06, "loss": 0.9871109, "memory(GiB)": 369.42, "step": 58255, "train_speed(iter/s)": 0.200576 }, { "acc": 0.76016531, "epoch": 1.4779299847793, "grad_norm": 2.015625, "learning_rate": 1.750057462086281e-06, "loss": 0.95831852, "memory(GiB)": 369.42, "step": 58260, "train_speed(iter/s)": 0.200579 }, { "acc": 0.75921235, "epoch": 1.478056823947235, "grad_norm": 2.0, "learning_rate": 1.7492606382529542e-06, "loss": 0.96902065, "memory(GiB)": 369.42, "step": 58265, "train_speed(iter/s)": 0.200582 }, { "acc": 0.75584164, "epoch": 1.47818366311517, "grad_norm": 2.296875, "learning_rate": 1.748463957401662e-06, "loss": 1.0363121, "memory(GiB)": 369.42, "step": 58270, "train_speed(iter/s)": 0.200585 }, { "acc": 0.75234232, "epoch": 1.478310502283105, "grad_norm": 2.171875, "learning_rate": 1.7476674195674404e-06, "loss": 0.98968172, "memory(GiB)": 369.42, "step": 58275, "train_speed(iter/s)": 0.200588 }, { "acc": 0.75556016, "epoch": 1.47843734145104, "grad_norm": 2.484375, "learning_rate": 1.7468710247853244e-06, "loss": 1.02889404, "memory(GiB)": 369.42, "step": 58280, "train_speed(iter/s)": 0.200592 }, { "acc": 0.75034556, "epoch": 1.478564180618975, "grad_norm": 2.015625, "learning_rate": 1.7460747730903466e-06, "loss": 0.95617199, "memory(GiB)": 369.42, "step": 58285, "train_speed(iter/s)": 0.200594 }, { "acc": 0.75530796, "epoch": 1.4786910197869103, "grad_norm": 2.46875, "learning_rate": 1.7452786645175297e-06, "loss": 0.97252121, "memory(GiB)": 369.42, "step": 58290, "train_speed(iter/s)": 0.200597 }, { "acc": 0.74147711, "epoch": 1.4788178589548453, "grad_norm": 2.078125, "learning_rate": 1.7444826991018864e-06, "loss": 1.06710272, "memory(GiB)": 369.42, "step": 58295, "train_speed(iter/s)": 0.200598 }, { "acc": 0.74801826, "epoch": 1.4789446981227803, "grad_norm": 2.1875, "learning_rate": 1.7436868768784276e-06, "loss": 0.97747192, "memory(GiB)": 369.42, "step": 58300, "train_speed(iter/s)": 0.200601 }, { "acc": 0.75853739, "epoch": 1.4790715372907153, "grad_norm": 2.59375, "learning_rate": 1.7428911978821594e-06, "loss": 0.97784967, "memory(GiB)": 369.42, "step": 58305, "train_speed(iter/s)": 0.200604 }, { "acc": 0.74973822, "epoch": 1.4791983764586505, "grad_norm": 2.28125, "learning_rate": 1.7420956621480806e-06, "loss": 1.00261097, "memory(GiB)": 369.42, "step": 58310, "train_speed(iter/s)": 0.200607 }, { "acc": 0.7594008, "epoch": 1.4793252156265855, "grad_norm": 1.8671875, "learning_rate": 1.7413002697111765e-06, "loss": 0.95121937, "memory(GiB)": 369.42, "step": 58315, "train_speed(iter/s)": 0.200609 }, { "acc": 0.76215963, "epoch": 1.4794520547945205, "grad_norm": 2.375, "learning_rate": 1.7405050206064372e-06, "loss": 0.97995882, "memory(GiB)": 369.42, "step": 58320, "train_speed(iter/s)": 0.200612 }, { "acc": 0.75075169, "epoch": 1.4795788939624557, "grad_norm": 1.90625, "learning_rate": 1.73970991486884e-06, "loss": 0.99197369, "memory(GiB)": 369.42, "step": 58325, "train_speed(iter/s)": 0.200615 }, { "acc": 0.74512763, "epoch": 1.4797057331303907, "grad_norm": 2.53125, "learning_rate": 1.7389149525333565e-06, "loss": 1.03464012, "memory(GiB)": 369.42, "step": 58330, "train_speed(iter/s)": 0.200617 }, { "acc": 0.731392, "epoch": 1.4798325722983257, "grad_norm": 2.03125, "learning_rate": 1.7381201336349535e-06, "loss": 0.98739691, "memory(GiB)": 369.42, "step": 58335, "train_speed(iter/s)": 0.200618 }, { "acc": 0.75458765, "epoch": 1.4799594114662606, "grad_norm": 2.3125, "learning_rate": 1.7373254582085896e-06, "loss": 0.98355885, "memory(GiB)": 369.42, "step": 58340, "train_speed(iter/s)": 0.20062 }, { "acc": 0.75778112, "epoch": 1.4800862506341959, "grad_norm": 2.375, "learning_rate": 1.7365309262892194e-06, "loss": 0.93061867, "memory(GiB)": 369.42, "step": 58345, "train_speed(iter/s)": 0.200623 }, { "acc": 0.73402505, "epoch": 1.4802130898021308, "grad_norm": 2.625, "learning_rate": 1.735736537911789e-06, "loss": 1.05320015, "memory(GiB)": 369.42, "step": 58350, "train_speed(iter/s)": 0.200627 }, { "acc": 0.73784637, "epoch": 1.480339928970066, "grad_norm": 1.9140625, "learning_rate": 1.7349422931112403e-06, "loss": 1.0193162, "memory(GiB)": 369.42, "step": 58355, "train_speed(iter/s)": 0.20063 }, { "acc": 0.75778913, "epoch": 1.480466768138001, "grad_norm": 1.84375, "learning_rate": 1.7341481919225062e-06, "loss": 0.96876945, "memory(GiB)": 369.42, "step": 58360, "train_speed(iter/s)": 0.200633 }, { "acc": 0.75128775, "epoch": 1.480593607305936, "grad_norm": 1.7890625, "learning_rate": 1.733354234380516e-06, "loss": 0.95374146, "memory(GiB)": 369.42, "step": 58365, "train_speed(iter/s)": 0.200635 }, { "acc": 0.7544363, "epoch": 1.480720446473871, "grad_norm": 2.109375, "learning_rate": 1.7325604205201912e-06, "loss": 0.92039814, "memory(GiB)": 369.42, "step": 58370, "train_speed(iter/s)": 0.200636 }, { "acc": 0.7550231, "epoch": 1.4808472856418062, "grad_norm": 1.890625, "learning_rate": 1.7317667503764468e-06, "loss": 0.93746443, "memory(GiB)": 369.42, "step": 58375, "train_speed(iter/s)": 0.200638 }, { "acc": 0.73674307, "epoch": 1.4809741248097412, "grad_norm": 2.203125, "learning_rate": 1.7309732239841926e-06, "loss": 1.09814434, "memory(GiB)": 369.42, "step": 58380, "train_speed(iter/s)": 0.200642 }, { "acc": 0.75589857, "epoch": 1.4811009639776764, "grad_norm": 2.078125, "learning_rate": 1.730179841378331e-06, "loss": 0.95944958, "memory(GiB)": 369.42, "step": 58385, "train_speed(iter/s)": 0.200644 }, { "acc": 0.75551691, "epoch": 1.4812278031456114, "grad_norm": 1.96875, "learning_rate": 1.7293866025937589e-06, "loss": 0.98189764, "memory(GiB)": 369.42, "step": 58390, "train_speed(iter/s)": 0.200647 }, { "acc": 0.75232754, "epoch": 1.4813546423135464, "grad_norm": 2.015625, "learning_rate": 1.7285935076653659e-06, "loss": 0.95563755, "memory(GiB)": 369.42, "step": 58395, "train_speed(iter/s)": 0.20065 }, { "acc": 0.75808687, "epoch": 1.4814814814814814, "grad_norm": 2.40625, "learning_rate": 1.7278005566280365e-06, "loss": 0.95908585, "memory(GiB)": 369.42, "step": 58400, "train_speed(iter/s)": 0.200652 }, { "acc": 0.73971696, "epoch": 1.4816083206494166, "grad_norm": 2.21875, "learning_rate": 1.727007749516646e-06, "loss": 1.02736435, "memory(GiB)": 369.42, "step": 58405, "train_speed(iter/s)": 0.200655 }, { "acc": 0.74523854, "epoch": 1.4817351598173516, "grad_norm": 2.03125, "learning_rate": 1.7262150863660709e-06, "loss": 1.05676041, "memory(GiB)": 369.42, "step": 58410, "train_speed(iter/s)": 0.200659 }, { "acc": 0.75461483, "epoch": 1.4818619989852866, "grad_norm": 1.9609375, "learning_rate": 1.7254225672111713e-06, "loss": 0.99459534, "memory(GiB)": 369.42, "step": 58415, "train_speed(iter/s)": 0.200661 }, { "acc": 0.74397316, "epoch": 1.4819888381532218, "grad_norm": 2.578125, "learning_rate": 1.7246301920868052e-06, "loss": 1.00716896, "memory(GiB)": 369.42, "step": 58420, "train_speed(iter/s)": 0.200665 }, { "acc": 0.7516047, "epoch": 1.4821156773211568, "grad_norm": 2.265625, "learning_rate": 1.723837961027829e-06, "loss": 0.88896065, "memory(GiB)": 369.42, "step": 58425, "train_speed(iter/s)": 0.200667 }, { "acc": 0.75399532, "epoch": 1.4822425164890918, "grad_norm": 2.375, "learning_rate": 1.723045874069087e-06, "loss": 0.98475475, "memory(GiB)": 369.42, "step": 58430, "train_speed(iter/s)": 0.200669 }, { "acc": 0.75820055, "epoch": 1.4823693556570268, "grad_norm": 1.9609375, "learning_rate": 1.7222539312454167e-06, "loss": 0.98496437, "memory(GiB)": 369.42, "step": 58435, "train_speed(iter/s)": 0.200673 }, { "acc": 0.75677848, "epoch": 1.482496194824962, "grad_norm": 2.46875, "learning_rate": 1.7214621325916515e-06, "loss": 1.02521458, "memory(GiB)": 369.42, "step": 58440, "train_speed(iter/s)": 0.200677 }, { "acc": 0.76092386, "epoch": 1.482623033992897, "grad_norm": 2.953125, "learning_rate": 1.7206704781426204e-06, "loss": 0.96561737, "memory(GiB)": 369.42, "step": 58445, "train_speed(iter/s)": 0.200678 }, { "acc": 0.77452097, "epoch": 1.4827498731608322, "grad_norm": 2.140625, "learning_rate": 1.7198789679331445e-06, "loss": 0.92243595, "memory(GiB)": 369.42, "step": 58450, "train_speed(iter/s)": 0.200679 }, { "acc": 0.76462498, "epoch": 1.4828767123287672, "grad_norm": 1.6328125, "learning_rate": 1.7190876019980329e-06, "loss": 0.91987877, "memory(GiB)": 369.42, "step": 58455, "train_speed(iter/s)": 0.200681 }, { "acc": 0.74812927, "epoch": 1.4830035514967022, "grad_norm": 2.078125, "learning_rate": 1.7182963803720987e-06, "loss": 1.00669441, "memory(GiB)": 369.42, "step": 58460, "train_speed(iter/s)": 0.200683 }, { "acc": 0.73678446, "epoch": 1.4831303906646371, "grad_norm": 2.421875, "learning_rate": 1.7175053030901418e-06, "loss": 1.03831186, "memory(GiB)": 369.42, "step": 58465, "train_speed(iter/s)": 0.200685 }, { "acc": 0.75167742, "epoch": 1.4832572298325724, "grad_norm": 2.25, "learning_rate": 1.7167143701869582e-06, "loss": 0.99664307, "memory(GiB)": 369.42, "step": 58470, "train_speed(iter/s)": 0.200687 }, { "acc": 0.75170784, "epoch": 1.4833840690005073, "grad_norm": 2.40625, "learning_rate": 1.7159235816973318e-06, "loss": 0.94716215, "memory(GiB)": 369.42, "step": 58475, "train_speed(iter/s)": 0.200691 }, { "acc": 0.75768862, "epoch": 1.4835109081684423, "grad_norm": 1.9296875, "learning_rate": 1.7151329376560506e-06, "loss": 1.00603333, "memory(GiB)": 369.42, "step": 58480, "train_speed(iter/s)": 0.200693 }, { "acc": 0.74098244, "epoch": 1.4836377473363775, "grad_norm": 1.75, "learning_rate": 1.7143424380978885e-06, "loss": 0.98882399, "memory(GiB)": 369.42, "step": 58485, "train_speed(iter/s)": 0.200695 }, { "acc": 0.74643602, "epoch": 1.4837645865043125, "grad_norm": 2.359375, "learning_rate": 1.7135520830576157e-06, "loss": 1.0198391, "memory(GiB)": 369.42, "step": 58490, "train_speed(iter/s)": 0.200698 }, { "acc": 0.74931302, "epoch": 1.4838914256722475, "grad_norm": 2.09375, "learning_rate": 1.712761872569995e-06, "loss": 0.98913975, "memory(GiB)": 369.42, "step": 58495, "train_speed(iter/s)": 0.2007 }, { "acc": 0.75178938, "epoch": 1.4840182648401825, "grad_norm": 2.03125, "learning_rate": 1.7119718066697838e-06, "loss": 0.99554005, "memory(GiB)": 369.42, "step": 58500, "train_speed(iter/s)": 0.200702 }, { "acc": 0.74241896, "epoch": 1.4841451040081177, "grad_norm": 2.078125, "learning_rate": 1.7111818853917323e-06, "loss": 0.98751326, "memory(GiB)": 369.42, "step": 58505, "train_speed(iter/s)": 0.200703 }, { "acc": 0.75413256, "epoch": 1.4842719431760527, "grad_norm": 2.078125, "learning_rate": 1.710392108770585e-06, "loss": 1.04098358, "memory(GiB)": 369.42, "step": 58510, "train_speed(iter/s)": 0.200706 }, { "acc": 0.75487852, "epoch": 1.484398782343988, "grad_norm": 2.0, "learning_rate": 1.7096024768410796e-06, "loss": 1.01136818, "memory(GiB)": 369.42, "step": 58515, "train_speed(iter/s)": 0.200708 }, { "acc": 0.73895173, "epoch": 1.484525621511923, "grad_norm": 1.765625, "learning_rate": 1.7088129896379484e-06, "loss": 1.04480896, "memory(GiB)": 369.42, "step": 58520, "train_speed(iter/s)": 0.200711 }, { "acc": 0.74876919, "epoch": 1.484652460679858, "grad_norm": 2.015625, "learning_rate": 1.7080236471959155e-06, "loss": 0.97951736, "memory(GiB)": 369.42, "step": 58525, "train_speed(iter/s)": 0.200712 }, { "acc": 0.74523802, "epoch": 1.4847792998477929, "grad_norm": 2.296875, "learning_rate": 1.7072344495497007e-06, "loss": 1.03208733, "memory(GiB)": 369.42, "step": 58530, "train_speed(iter/s)": 0.200714 }, { "acc": 0.7763762, "epoch": 1.484906139015728, "grad_norm": 1.9765625, "learning_rate": 1.7064453967340155e-06, "loss": 0.938344, "memory(GiB)": 369.42, "step": 58535, "train_speed(iter/s)": 0.200716 }, { "acc": 0.74672456, "epoch": 1.485032978183663, "grad_norm": 2.21875, "learning_rate": 1.7056564887835664e-06, "loss": 0.98181076, "memory(GiB)": 369.42, "step": 58540, "train_speed(iter/s)": 0.200719 }, { "acc": 0.73342237, "epoch": 1.4851598173515983, "grad_norm": 2.203125, "learning_rate": 1.704867725733052e-06, "loss": 1.06446562, "memory(GiB)": 369.42, "step": 58545, "train_speed(iter/s)": 0.200723 }, { "acc": 0.75174217, "epoch": 1.4852866565195333, "grad_norm": 2.296875, "learning_rate": 1.7040791076171692e-06, "loss": 0.99664288, "memory(GiB)": 369.42, "step": 58550, "train_speed(iter/s)": 0.200726 }, { "acc": 0.75601864, "epoch": 1.4854134956874683, "grad_norm": 1.9296875, "learning_rate": 1.7032906344706017e-06, "loss": 0.95727386, "memory(GiB)": 369.42, "step": 58555, "train_speed(iter/s)": 0.20073 }, { "acc": 0.7406765, "epoch": 1.4855403348554033, "grad_norm": 2.03125, "learning_rate": 1.7025023063280306e-06, "loss": 1.01109562, "memory(GiB)": 369.42, "step": 58560, "train_speed(iter/s)": 0.200733 }, { "acc": 0.74323883, "epoch": 1.4856671740233385, "grad_norm": 1.96875, "learning_rate": 1.701714123224128e-06, "loss": 1.01704597, "memory(GiB)": 369.42, "step": 58565, "train_speed(iter/s)": 0.200736 }, { "acc": 0.74594517, "epoch": 1.4857940131912735, "grad_norm": 2.03125, "learning_rate": 1.7009260851935684e-06, "loss": 0.98653669, "memory(GiB)": 369.42, "step": 58570, "train_speed(iter/s)": 0.200738 }, { "acc": 0.73880196, "epoch": 1.4859208523592085, "grad_norm": 1.9609375, "learning_rate": 1.7001381922710064e-06, "loss": 1.05495968, "memory(GiB)": 369.42, "step": 58575, "train_speed(iter/s)": 0.200741 }, { "acc": 0.74588628, "epoch": 1.4860476915271437, "grad_norm": 2.296875, "learning_rate": 1.699350444491098e-06, "loss": 0.94948368, "memory(GiB)": 369.42, "step": 58580, "train_speed(iter/s)": 0.200742 }, { "acc": 0.74390726, "epoch": 1.4861745306950787, "grad_norm": 1.9296875, "learning_rate": 1.6985628418884953e-06, "loss": 1.01838074, "memory(GiB)": 369.42, "step": 58585, "train_speed(iter/s)": 0.200745 }, { "acc": 0.75159369, "epoch": 1.4863013698630136, "grad_norm": 2.28125, "learning_rate": 1.6977753844978406e-06, "loss": 1.0155014, "memory(GiB)": 369.42, "step": 58590, "train_speed(iter/s)": 0.200747 }, { "acc": 0.7516324, "epoch": 1.4864282090309486, "grad_norm": 2.28125, "learning_rate": 1.696988072353764e-06, "loss": 0.96600246, "memory(GiB)": 369.42, "step": 58595, "train_speed(iter/s)": 0.200749 }, { "acc": 0.75264297, "epoch": 1.4865550481988838, "grad_norm": 1.9921875, "learning_rate": 1.6962009054909007e-06, "loss": 0.97331142, "memory(GiB)": 369.42, "step": 58600, "train_speed(iter/s)": 0.200752 }, { "acc": 0.75891905, "epoch": 1.4866818873668188, "grad_norm": 1.9296875, "learning_rate": 1.6954138839438723e-06, "loss": 0.96418133, "memory(GiB)": 369.42, "step": 58605, "train_speed(iter/s)": 0.200754 }, { "acc": 0.74591837, "epoch": 1.486808726534754, "grad_norm": 2.515625, "learning_rate": 1.6946270077472966e-06, "loss": 1.04938679, "memory(GiB)": 369.42, "step": 58610, "train_speed(iter/s)": 0.200756 }, { "acc": 0.75529513, "epoch": 1.486935565702689, "grad_norm": 2.078125, "learning_rate": 1.6938402769357787e-06, "loss": 0.94603977, "memory(GiB)": 369.42, "step": 58615, "train_speed(iter/s)": 0.20076 }, { "acc": 0.75233488, "epoch": 1.487062404870624, "grad_norm": 2.453125, "learning_rate": 1.6930536915439288e-06, "loss": 0.96463203, "memory(GiB)": 369.42, "step": 58620, "train_speed(iter/s)": 0.200761 }, { "acc": 0.75572672, "epoch": 1.487189244038559, "grad_norm": 2.46875, "learning_rate": 1.6922672516063415e-06, "loss": 0.98055153, "memory(GiB)": 369.42, "step": 58625, "train_speed(iter/s)": 0.200763 }, { "acc": 0.74345484, "epoch": 1.4873160832064942, "grad_norm": 1.9140625, "learning_rate": 1.6914809571576086e-06, "loss": 1.00705729, "memory(GiB)": 369.42, "step": 58630, "train_speed(iter/s)": 0.200766 }, { "acc": 0.74449935, "epoch": 1.4874429223744292, "grad_norm": 2.046875, "learning_rate": 1.6906948082323149e-06, "loss": 1.03131809, "memory(GiB)": 369.42, "step": 58635, "train_speed(iter/s)": 0.200768 }, { "acc": 0.75213208, "epoch": 1.4875697615423642, "grad_norm": 2.140625, "learning_rate": 1.689908804865038e-06, "loss": 1.00945435, "memory(GiB)": 369.42, "step": 58640, "train_speed(iter/s)": 0.200771 }, { "acc": 0.74548936, "epoch": 1.4876966007102994, "grad_norm": 1.9375, "learning_rate": 1.6891229470903509e-06, "loss": 1.0108264, "memory(GiB)": 369.42, "step": 58645, "train_speed(iter/s)": 0.200773 }, { "acc": 0.75024056, "epoch": 1.4878234398782344, "grad_norm": 2.390625, "learning_rate": 1.6883372349428184e-06, "loss": 1.01042156, "memory(GiB)": 369.42, "step": 58650, "train_speed(iter/s)": 0.200777 }, { "acc": 0.76389427, "epoch": 1.4879502790461694, "grad_norm": 2.484375, "learning_rate": 1.6875516684569999e-06, "loss": 0.94888, "memory(GiB)": 369.42, "step": 58655, "train_speed(iter/s)": 0.200779 }, { "acc": 0.75416374, "epoch": 1.4880771182141044, "grad_norm": 2.15625, "learning_rate": 1.686766247667448e-06, "loss": 0.99466667, "memory(GiB)": 369.42, "step": 58660, "train_speed(iter/s)": 0.200782 }, { "acc": 0.75000715, "epoch": 1.4882039573820396, "grad_norm": 2.34375, "learning_rate": 1.685980972608709e-06, "loss": 0.97826262, "memory(GiB)": 369.42, "step": 58665, "train_speed(iter/s)": 0.200784 }, { "acc": 0.74723563, "epoch": 1.4883307965499746, "grad_norm": 1.9609375, "learning_rate": 1.6851958433153227e-06, "loss": 0.96642437, "memory(GiB)": 369.42, "step": 58670, "train_speed(iter/s)": 0.200786 }, { "acc": 0.74387465, "epoch": 1.4884576357179098, "grad_norm": 2.390625, "learning_rate": 1.684410859821823e-06, "loss": 1.03701077, "memory(GiB)": 369.42, "step": 58675, "train_speed(iter/s)": 0.200788 }, { "acc": 0.76799402, "epoch": 1.4885844748858448, "grad_norm": 1.984375, "learning_rate": 1.6836260221627364e-06, "loss": 0.91552258, "memory(GiB)": 369.42, "step": 58680, "train_speed(iter/s)": 0.20079 }, { "acc": 0.75427933, "epoch": 1.4887113140537798, "grad_norm": 1.84375, "learning_rate": 1.682841330372582e-06, "loss": 0.97283125, "memory(GiB)": 369.42, "step": 58685, "train_speed(iter/s)": 0.200792 }, { "acc": 0.75912275, "epoch": 1.4888381532217148, "grad_norm": 1.75, "learning_rate": 1.6820567844858798e-06, "loss": 0.93208809, "memory(GiB)": 369.42, "step": 58690, "train_speed(iter/s)": 0.200795 }, { "acc": 0.75405254, "epoch": 1.48896499238965, "grad_norm": 2.15625, "learning_rate": 1.681272384537132e-06, "loss": 0.97389889, "memory(GiB)": 369.42, "step": 58695, "train_speed(iter/s)": 0.200796 }, { "acc": 0.75687685, "epoch": 1.489091831557585, "grad_norm": 1.8984375, "learning_rate": 1.6804881305608423e-06, "loss": 0.93660183, "memory(GiB)": 369.42, "step": 58700, "train_speed(iter/s)": 0.200798 }, { "acc": 0.74815226, "epoch": 1.4892186707255202, "grad_norm": 2.0, "learning_rate": 1.679704022591503e-06, "loss": 1.00203934, "memory(GiB)": 369.42, "step": 58705, "train_speed(iter/s)": 0.200801 }, { "acc": 0.75338602, "epoch": 1.4893455098934552, "grad_norm": 2.09375, "learning_rate": 1.678920060663608e-06, "loss": 0.98525171, "memory(GiB)": 369.42, "step": 58710, "train_speed(iter/s)": 0.200802 }, { "acc": 0.75009069, "epoch": 1.4894723490613901, "grad_norm": 2.109375, "learning_rate": 1.6781362448116344e-06, "loss": 0.95747204, "memory(GiB)": 369.42, "step": 58715, "train_speed(iter/s)": 0.200805 }, { "acc": 0.75418634, "epoch": 1.4895991882293251, "grad_norm": 1.921875, "learning_rate": 1.6773525750700586e-06, "loss": 0.97515602, "memory(GiB)": 369.42, "step": 58720, "train_speed(iter/s)": 0.200808 }, { "acc": 0.75903053, "epoch": 1.4897260273972603, "grad_norm": 2.375, "learning_rate": 1.676569051473353e-06, "loss": 0.9593811, "memory(GiB)": 369.42, "step": 58725, "train_speed(iter/s)": 0.200811 }, { "acc": 0.74528217, "epoch": 1.4898528665651953, "grad_norm": 2.15625, "learning_rate": 1.6757856740559796e-06, "loss": 0.98819942, "memory(GiB)": 369.42, "step": 58730, "train_speed(iter/s)": 0.200814 }, { "acc": 0.7602592, "epoch": 1.4899797057331303, "grad_norm": 1.8359375, "learning_rate": 1.6750024428523926e-06, "loss": 0.93074331, "memory(GiB)": 369.42, "step": 58735, "train_speed(iter/s)": 0.200818 }, { "acc": 0.75562167, "epoch": 1.4901065449010655, "grad_norm": 1.8671875, "learning_rate": 1.6742193578970418e-06, "loss": 0.99389877, "memory(GiB)": 369.42, "step": 58740, "train_speed(iter/s)": 0.200821 }, { "acc": 0.75550442, "epoch": 1.4902333840690005, "grad_norm": 2.484375, "learning_rate": 1.673436419224373e-06, "loss": 1.01734695, "memory(GiB)": 369.42, "step": 58745, "train_speed(iter/s)": 0.200823 }, { "acc": 0.74218636, "epoch": 1.4903602232369355, "grad_norm": 2.296875, "learning_rate": 1.6726536268688248e-06, "loss": 1.0215127, "memory(GiB)": 369.42, "step": 58750, "train_speed(iter/s)": 0.200827 }, { "acc": 0.75741591, "epoch": 1.4904870624048705, "grad_norm": 2.3125, "learning_rate": 1.671870980864822e-06, "loss": 0.97290125, "memory(GiB)": 369.42, "step": 58755, "train_speed(iter/s)": 0.200829 }, { "acc": 0.75551381, "epoch": 1.4906139015728057, "grad_norm": 2.234375, "learning_rate": 1.6710884812467943e-06, "loss": 0.91216335, "memory(GiB)": 369.42, "step": 58760, "train_speed(iter/s)": 0.20083 }, { "acc": 0.75586023, "epoch": 1.4907407407407407, "grad_norm": 2.046875, "learning_rate": 1.6703061280491579e-06, "loss": 1.00092144, "memory(GiB)": 369.42, "step": 58765, "train_speed(iter/s)": 0.200833 }, { "acc": 0.74995055, "epoch": 1.490867579908676, "grad_norm": 2.140625, "learning_rate": 1.6695239213063237e-06, "loss": 0.97323456, "memory(GiB)": 369.42, "step": 58770, "train_speed(iter/s)": 0.200836 }, { "acc": 0.7579587, "epoch": 1.490994419076611, "grad_norm": 2.390625, "learning_rate": 1.6687418610526972e-06, "loss": 0.97189598, "memory(GiB)": 369.42, "step": 58775, "train_speed(iter/s)": 0.200838 }, { "acc": 0.75806036, "epoch": 1.4911212582445459, "grad_norm": 2.40625, "learning_rate": 1.6679599473226766e-06, "loss": 0.94405937, "memory(GiB)": 369.42, "step": 58780, "train_speed(iter/s)": 0.20084 }, { "acc": 0.75254507, "epoch": 1.4912480974124809, "grad_norm": 2.34375, "learning_rate": 1.6671781801506536e-06, "loss": 1.04242668, "memory(GiB)": 369.42, "step": 58785, "train_speed(iter/s)": 0.200843 }, { "acc": 0.75355864, "epoch": 1.491374936580416, "grad_norm": 1.8671875, "learning_rate": 1.6663965595710147e-06, "loss": 0.97029419, "memory(GiB)": 369.42, "step": 58790, "train_speed(iter/s)": 0.200845 }, { "acc": 0.76483197, "epoch": 1.491501775748351, "grad_norm": 2.34375, "learning_rate": 1.6656150856181386e-06, "loss": 0.97020998, "memory(GiB)": 369.42, "step": 58795, "train_speed(iter/s)": 0.20085 }, { "acc": 0.73515615, "epoch": 1.491628614916286, "grad_norm": 2.046875, "learning_rate": 1.6648337583263974e-06, "loss": 1.03553162, "memory(GiB)": 369.42, "step": 58800, "train_speed(iter/s)": 0.200853 }, { "acc": 0.75155716, "epoch": 1.4917554540842213, "grad_norm": 2.0625, "learning_rate": 1.6640525777301586e-06, "loss": 1.00711479, "memory(GiB)": 369.42, "step": 58805, "train_speed(iter/s)": 0.200855 }, { "acc": 0.77074404, "epoch": 1.4918822932521563, "grad_norm": 2.40625, "learning_rate": 1.663271543863781e-06, "loss": 0.9421629, "memory(GiB)": 369.42, "step": 58810, "train_speed(iter/s)": 0.200859 }, { "acc": 0.75898595, "epoch": 1.4920091324200913, "grad_norm": 2.265625, "learning_rate": 1.6624906567616183e-06, "loss": 1.00253124, "memory(GiB)": 369.42, "step": 58815, "train_speed(iter/s)": 0.20086 }, { "acc": 0.75535889, "epoch": 1.4921359715880262, "grad_norm": 2.09375, "learning_rate": 1.6617099164580175e-06, "loss": 0.97405434, "memory(GiB)": 369.42, "step": 58820, "train_speed(iter/s)": 0.200863 }, { "acc": 0.75385103, "epoch": 1.4922628107559615, "grad_norm": 2.34375, "learning_rate": 1.660929322987319e-06, "loss": 0.95108242, "memory(GiB)": 369.42, "step": 58825, "train_speed(iter/s)": 0.200865 }, { "acc": 0.74425693, "epoch": 1.4923896499238964, "grad_norm": 2.671875, "learning_rate": 1.660148876383857e-06, "loss": 1.00967522, "memory(GiB)": 369.42, "step": 58830, "train_speed(iter/s)": 0.200868 }, { "acc": 0.75861268, "epoch": 1.4925164890918317, "grad_norm": 2.21875, "learning_rate": 1.6593685766819584e-06, "loss": 0.99017963, "memory(GiB)": 369.42, "step": 58835, "train_speed(iter/s)": 0.200871 }, { "acc": 0.74810734, "epoch": 1.4926433282597666, "grad_norm": 2.03125, "learning_rate": 1.658588423915945e-06, "loss": 0.94569063, "memory(GiB)": 369.42, "step": 58840, "train_speed(iter/s)": 0.200874 }, { "acc": 0.75725489, "epoch": 1.4927701674277016, "grad_norm": 1.8984375, "learning_rate": 1.6578084181201293e-06, "loss": 0.9947854, "memory(GiB)": 369.42, "step": 58845, "train_speed(iter/s)": 0.200877 }, { "acc": 0.74755616, "epoch": 1.4928970065956366, "grad_norm": 2.578125, "learning_rate": 1.6570285593288242e-06, "loss": 0.99491062, "memory(GiB)": 369.42, "step": 58850, "train_speed(iter/s)": 0.200878 }, { "acc": 0.74210315, "epoch": 1.4930238457635718, "grad_norm": 2.296875, "learning_rate": 1.6562488475763267e-06, "loss": 1.03115139, "memory(GiB)": 369.42, "step": 58855, "train_speed(iter/s)": 0.20088 }, { "acc": 0.74407158, "epoch": 1.4931506849315068, "grad_norm": 2.71875, "learning_rate": 1.6554692828969321e-06, "loss": 0.97581921, "memory(GiB)": 369.42, "step": 58860, "train_speed(iter/s)": 0.20088 }, { "acc": 0.74371028, "epoch": 1.493277524099442, "grad_norm": 2.578125, "learning_rate": 1.6546898653249326e-06, "loss": 1.01785355, "memory(GiB)": 369.42, "step": 58865, "train_speed(iter/s)": 0.200882 }, { "acc": 0.74466429, "epoch": 1.493404363267377, "grad_norm": 2.015625, "learning_rate": 1.65391059489461e-06, "loss": 1.00736752, "memory(GiB)": 369.42, "step": 58870, "train_speed(iter/s)": 0.200886 }, { "acc": 0.73800821, "epoch": 1.493531202435312, "grad_norm": 1.8671875, "learning_rate": 1.6531314716402369e-06, "loss": 1.04110928, "memory(GiB)": 369.42, "step": 58875, "train_speed(iter/s)": 0.200889 }, { "acc": 0.74789643, "epoch": 1.493658041603247, "grad_norm": 2.28125, "learning_rate": 1.652352495596083e-06, "loss": 0.98609753, "memory(GiB)": 369.42, "step": 58880, "train_speed(iter/s)": 0.200891 }, { "acc": 0.75199356, "epoch": 1.4937848807711822, "grad_norm": 2.0625, "learning_rate": 1.6515736667964144e-06, "loss": 0.98916664, "memory(GiB)": 369.42, "step": 58885, "train_speed(iter/s)": 0.200894 }, { "acc": 0.75518007, "epoch": 1.4939117199391172, "grad_norm": 2.484375, "learning_rate": 1.6507949852754867e-06, "loss": 1.04629745, "memory(GiB)": 369.42, "step": 58890, "train_speed(iter/s)": 0.200898 }, { "acc": 0.74355102, "epoch": 1.4940385591070522, "grad_norm": 1.8828125, "learning_rate": 1.6500164510675453e-06, "loss": 1.00220356, "memory(GiB)": 369.42, "step": 58895, "train_speed(iter/s)": 0.200899 }, { "acc": 0.76461077, "epoch": 1.4941653982749874, "grad_norm": 3.75, "learning_rate": 1.649238064206839e-06, "loss": 0.95019321, "memory(GiB)": 369.42, "step": 58900, "train_speed(iter/s)": 0.200901 }, { "acc": 0.74971342, "epoch": 1.4942922374429224, "grad_norm": 2.0, "learning_rate": 1.6484598247276023e-06, "loss": 1.00043125, "memory(GiB)": 369.42, "step": 58905, "train_speed(iter/s)": 0.200904 }, { "acc": 0.74318943, "epoch": 1.4944190766108574, "grad_norm": 2.203125, "learning_rate": 1.6476817326640682e-06, "loss": 1.00118752, "memory(GiB)": 369.42, "step": 58910, "train_speed(iter/s)": 0.200907 }, { "acc": 0.75871158, "epoch": 1.4945459157787924, "grad_norm": 2.375, "learning_rate": 1.646903788050455e-06, "loss": 0.96958904, "memory(GiB)": 369.42, "step": 58915, "train_speed(iter/s)": 0.200908 }, { "acc": 0.74765768, "epoch": 1.4946727549467276, "grad_norm": 1.953125, "learning_rate": 1.6461259909209853e-06, "loss": 0.96195354, "memory(GiB)": 369.42, "step": 58920, "train_speed(iter/s)": 0.20091 }, { "acc": 0.75741835, "epoch": 1.4947995941146626, "grad_norm": 1.953125, "learning_rate": 1.6453483413098687e-06, "loss": 0.98444443, "memory(GiB)": 369.42, "step": 58925, "train_speed(iter/s)": 0.200913 }, { "acc": 0.74658933, "epoch": 1.4949264332825978, "grad_norm": 2.078125, "learning_rate": 1.6445708392513093e-06, "loss": 1.03476057, "memory(GiB)": 369.42, "step": 58930, "train_speed(iter/s)": 0.200916 }, { "acc": 0.74638405, "epoch": 1.4950532724505328, "grad_norm": 2.109375, "learning_rate": 1.643793484779505e-06, "loss": 1.01506958, "memory(GiB)": 369.42, "step": 58935, "train_speed(iter/s)": 0.200918 }, { "acc": 0.76374054, "epoch": 1.4951801116184678, "grad_norm": 2.640625, "learning_rate": 1.6430162779286484e-06, "loss": 0.95745306, "memory(GiB)": 369.42, "step": 58940, "train_speed(iter/s)": 0.200921 }, { "acc": 0.76026158, "epoch": 1.4953069507864027, "grad_norm": 1.8671875, "learning_rate": 1.6422392187329233e-06, "loss": 0.94339952, "memory(GiB)": 369.42, "step": 58945, "train_speed(iter/s)": 0.200923 }, { "acc": 0.74421868, "epoch": 1.495433789954338, "grad_norm": 2.671875, "learning_rate": 1.6414623072265085e-06, "loss": 1.02812681, "memory(GiB)": 369.42, "step": 58950, "train_speed(iter/s)": 0.200925 }, { "acc": 0.76079326, "epoch": 1.495560629122273, "grad_norm": 2.234375, "learning_rate": 1.6406855434435765e-06, "loss": 0.97053909, "memory(GiB)": 369.42, "step": 58955, "train_speed(iter/s)": 0.200927 }, { "acc": 0.73818355, "epoch": 1.495687468290208, "grad_norm": 1.953125, "learning_rate": 1.6399089274182922e-06, "loss": 1.01040134, "memory(GiB)": 369.42, "step": 58960, "train_speed(iter/s)": 0.200929 }, { "acc": 0.75288439, "epoch": 1.4958143074581431, "grad_norm": 1.8203125, "learning_rate": 1.6391324591848156e-06, "loss": 0.96663103, "memory(GiB)": 369.42, "step": 58965, "train_speed(iter/s)": 0.200932 }, { "acc": 0.75806065, "epoch": 1.4959411466260781, "grad_norm": 1.96875, "learning_rate": 1.6383561387772984e-06, "loss": 0.96388264, "memory(GiB)": 369.42, "step": 58970, "train_speed(iter/s)": 0.200934 }, { "acc": 0.74934068, "epoch": 1.4960679857940131, "grad_norm": 1.984375, "learning_rate": 1.6375799662298868e-06, "loss": 0.99920101, "memory(GiB)": 369.42, "step": 58975, "train_speed(iter/s)": 0.200937 }, { "acc": 0.74766178, "epoch": 1.4961948249619481, "grad_norm": 2.40625, "learning_rate": 1.6368039415767201e-06, "loss": 1.01629496, "memory(GiB)": 369.42, "step": 58980, "train_speed(iter/s)": 0.20094 }, { "acc": 0.74622774, "epoch": 1.4963216641298833, "grad_norm": 2.171875, "learning_rate": 1.6360280648519305e-06, "loss": 1.00173149, "memory(GiB)": 369.42, "step": 58985, "train_speed(iter/s)": 0.200943 }, { "acc": 0.76260986, "epoch": 1.4964485032978183, "grad_norm": 2.125, "learning_rate": 1.6352523360896488e-06, "loss": 0.9693572, "memory(GiB)": 369.42, "step": 58990, "train_speed(iter/s)": 0.200944 }, { "acc": 0.74560127, "epoch": 1.4965753424657535, "grad_norm": 1.9921875, "learning_rate": 1.63447675532399e-06, "loss": 1.03157568, "memory(GiB)": 369.42, "step": 58995, "train_speed(iter/s)": 0.200946 }, { "acc": 0.75622473, "epoch": 1.4967021816336885, "grad_norm": 2.859375, "learning_rate": 1.6337013225890698e-06, "loss": 1.04165726, "memory(GiB)": 369.42, "step": 59000, "train_speed(iter/s)": 0.200949 }, { "epoch": 1.4967021816336885, "eval_acc": 0.7379884587603089, "eval_loss": 0.9695234894752502, "eval_runtime": 384.7904, "eval_samples_per_second": 16.554, "eval_steps_per_second": 8.277, "step": 59000 }, { "acc": 0.76077824, "epoch": 1.4968290208016235, "grad_norm": 1.828125, "learning_rate": 1.6329260379189932e-06, "loss": 0.9556469, "memory(GiB)": 369.42, "step": 59005, "train_speed(iter/s)": 0.200463 }, { "acc": 0.75116806, "epoch": 1.4969558599695585, "grad_norm": 2.140625, "learning_rate": 1.6321509013478653e-06, "loss": 0.9601593, "memory(GiB)": 369.42, "step": 59010, "train_speed(iter/s)": 0.200465 }, { "acc": 0.74758739, "epoch": 1.4970826991374937, "grad_norm": 2.046875, "learning_rate": 1.6313759129097757e-06, "loss": 1.03289051, "memory(GiB)": 369.42, "step": 59015, "train_speed(iter/s)": 0.200468 }, { "acc": 0.73857937, "epoch": 1.4972095383054287, "grad_norm": 2.09375, "learning_rate": 1.6306010726388117e-06, "loss": 1.02325745, "memory(GiB)": 369.42, "step": 59020, "train_speed(iter/s)": 0.20047 }, { "acc": 0.7489615, "epoch": 1.497336377473364, "grad_norm": 2.6875, "learning_rate": 1.6298263805690573e-06, "loss": 1.04287491, "memory(GiB)": 369.42, "step": 59025, "train_speed(iter/s)": 0.200474 }, { "acc": 0.75581384, "epoch": 1.4974632166412989, "grad_norm": 2.015625, "learning_rate": 1.629051836734587e-06, "loss": 0.94833641, "memory(GiB)": 369.42, "step": 59030, "train_speed(iter/s)": 0.200476 }, { "acc": 0.77291346, "epoch": 1.4975900558092339, "grad_norm": 2.265625, "learning_rate": 1.6282774411694641e-06, "loss": 0.9341651, "memory(GiB)": 369.42, "step": 59035, "train_speed(iter/s)": 0.200479 }, { "acc": 0.75034981, "epoch": 1.4977168949771689, "grad_norm": 2.53125, "learning_rate": 1.6275031939077545e-06, "loss": 0.97648268, "memory(GiB)": 369.42, "step": 59040, "train_speed(iter/s)": 0.200481 }, { "acc": 0.7576385, "epoch": 1.497843734145104, "grad_norm": 2.09375, "learning_rate": 1.6267290949835119e-06, "loss": 0.95611467, "memory(GiB)": 369.42, "step": 59045, "train_speed(iter/s)": 0.200482 }, { "acc": 0.74709463, "epoch": 1.497970573313039, "grad_norm": 2.046875, "learning_rate": 1.6259551444307852e-06, "loss": 0.97755547, "memory(GiB)": 369.42, "step": 59050, "train_speed(iter/s)": 0.200484 }, { "acc": 0.73499289, "epoch": 1.498097412480974, "grad_norm": 2.21875, "learning_rate": 1.6251813422836127e-06, "loss": 1.00284061, "memory(GiB)": 369.42, "step": 59055, "train_speed(iter/s)": 0.200487 }, { "acc": 0.75728321, "epoch": 1.4982242516489093, "grad_norm": 2.421875, "learning_rate": 1.6244076885760334e-06, "loss": 0.96515064, "memory(GiB)": 369.42, "step": 59060, "train_speed(iter/s)": 0.20049 }, { "acc": 0.74555883, "epoch": 1.4983510908168443, "grad_norm": 2.21875, "learning_rate": 1.6236341833420755e-06, "loss": 1.0465353, "memory(GiB)": 369.42, "step": 59065, "train_speed(iter/s)": 0.200492 }, { "acc": 0.74851217, "epoch": 1.4984779299847792, "grad_norm": 2.203125, "learning_rate": 1.6228608266157596e-06, "loss": 0.97922955, "memory(GiB)": 369.42, "step": 59070, "train_speed(iter/s)": 0.200495 }, { "acc": 0.76280193, "epoch": 1.4986047691527142, "grad_norm": 2.125, "learning_rate": 1.6220876184311034e-06, "loss": 0.93743992, "memory(GiB)": 369.42, "step": 59075, "train_speed(iter/s)": 0.200497 }, { "acc": 0.75010738, "epoch": 1.4987316083206494, "grad_norm": 2.546875, "learning_rate": 1.6213145588221146e-06, "loss": 0.95043421, "memory(GiB)": 369.42, "step": 59080, "train_speed(iter/s)": 0.200498 }, { "acc": 0.74627619, "epoch": 1.4988584474885844, "grad_norm": 2.40625, "learning_rate": 1.620541647822796e-06, "loss": 0.96973629, "memory(GiB)": 369.42, "step": 59085, "train_speed(iter/s)": 0.2005 }, { "acc": 0.75108633, "epoch": 1.4989852866565196, "grad_norm": 2.484375, "learning_rate": 1.6197688854671444e-06, "loss": 1.02813492, "memory(GiB)": 369.42, "step": 59090, "train_speed(iter/s)": 0.200502 }, { "acc": 0.76336689, "epoch": 1.4991121258244546, "grad_norm": 2.046875, "learning_rate": 1.6189962717891484e-06, "loss": 0.93606834, "memory(GiB)": 369.42, "step": 59095, "train_speed(iter/s)": 0.200504 }, { "acc": 0.74174414, "epoch": 1.4992389649923896, "grad_norm": 2.25, "learning_rate": 1.6182238068227917e-06, "loss": 1.07546654, "memory(GiB)": 369.42, "step": 59100, "train_speed(iter/s)": 0.200508 }, { "acc": 0.74702845, "epoch": 1.4993658041603246, "grad_norm": 2.09375, "learning_rate": 1.6174514906020505e-06, "loss": 0.97255564, "memory(GiB)": 369.42, "step": 59105, "train_speed(iter/s)": 0.200511 }, { "acc": 0.76214814, "epoch": 1.4994926433282598, "grad_norm": 1.8046875, "learning_rate": 1.6166793231608952e-06, "loss": 0.93033638, "memory(GiB)": 369.42, "step": 59110, "train_speed(iter/s)": 0.200512 }, { "acc": 0.74546075, "epoch": 1.4996194824961948, "grad_norm": 2.203125, "learning_rate": 1.615907304533288e-06, "loss": 1.03358355, "memory(GiB)": 369.42, "step": 59115, "train_speed(iter/s)": 0.200511 }, { "acc": 0.75318146, "epoch": 1.4997463216641298, "grad_norm": 2.0, "learning_rate": 1.6151354347531868e-06, "loss": 1.00870132, "memory(GiB)": 369.42, "step": 59120, "train_speed(iter/s)": 0.200514 }, { "acc": 0.76071014, "epoch": 1.499873160832065, "grad_norm": 2.46875, "learning_rate": 1.61436371385454e-06, "loss": 0.98755474, "memory(GiB)": 369.42, "step": 59125, "train_speed(iter/s)": 0.200516 }, { "acc": 0.74935083, "epoch": 1.5, "grad_norm": 2.359375, "learning_rate": 1.6135921418712959e-06, "loss": 0.99438553, "memory(GiB)": 369.42, "step": 59130, "train_speed(iter/s)": 0.200518 }, { "acc": 0.75999575, "epoch": 1.500126839167935, "grad_norm": 3.640625, "learning_rate": 1.6128207188373867e-06, "loss": 0.97425508, "memory(GiB)": 369.42, "step": 59135, "train_speed(iter/s)": 0.200519 }, { "acc": 0.74761019, "epoch": 1.50025367833587, "grad_norm": 1.859375, "learning_rate": 1.6120494447867451e-06, "loss": 1.01178799, "memory(GiB)": 369.42, "step": 59140, "train_speed(iter/s)": 0.200522 }, { "acc": 0.75897226, "epoch": 1.5003805175038052, "grad_norm": 3.1875, "learning_rate": 1.6112783197532932e-06, "loss": 0.92842922, "memory(GiB)": 369.42, "step": 59145, "train_speed(iter/s)": 0.200524 }, { "acc": 0.74311528, "epoch": 1.5005073566717404, "grad_norm": 2.265625, "learning_rate": 1.6105073437709545e-06, "loss": 0.9646121, "memory(GiB)": 369.42, "step": 59150, "train_speed(iter/s)": 0.200527 }, { "acc": 0.72937574, "epoch": 1.5006341958396754, "grad_norm": 2.25, "learning_rate": 1.6097365168736335e-06, "loss": 1.05689278, "memory(GiB)": 369.42, "step": 59155, "train_speed(iter/s)": 0.20053 }, { "acc": 0.75057302, "epoch": 1.5007610350076104, "grad_norm": 2.0625, "learning_rate": 1.6089658390952351e-06, "loss": 0.97541752, "memory(GiB)": 369.42, "step": 59160, "train_speed(iter/s)": 0.200532 }, { "acc": 0.75976081, "epoch": 1.5008878741755454, "grad_norm": 2.0625, "learning_rate": 1.6081953104696612e-06, "loss": 0.98844204, "memory(GiB)": 369.42, "step": 59165, "train_speed(iter/s)": 0.200535 }, { "acc": 0.75525379, "epoch": 1.5010147133434804, "grad_norm": 2.109375, "learning_rate": 1.6074249310308021e-06, "loss": 1.01247826, "memory(GiB)": 369.42, "step": 59170, "train_speed(iter/s)": 0.200538 }, { "acc": 0.75818548, "epoch": 1.5011415525114156, "grad_norm": 2.234375, "learning_rate": 1.6066547008125399e-06, "loss": 0.96015167, "memory(GiB)": 369.42, "step": 59175, "train_speed(iter/s)": 0.200541 }, { "acc": 0.74223275, "epoch": 1.5012683916793506, "grad_norm": 2.25, "learning_rate": 1.6058846198487522e-06, "loss": 0.99780712, "memory(GiB)": 369.42, "step": 59180, "train_speed(iter/s)": 0.200543 }, { "acc": 0.73642607, "epoch": 1.5013952308472858, "grad_norm": 2.078125, "learning_rate": 1.6051146881733142e-06, "loss": 0.99072437, "memory(GiB)": 369.42, "step": 59185, "train_speed(iter/s)": 0.200546 }, { "acc": 0.755584, "epoch": 1.5015220700152208, "grad_norm": 2.78125, "learning_rate": 1.6043449058200916e-06, "loss": 0.99250164, "memory(GiB)": 369.42, "step": 59190, "train_speed(iter/s)": 0.200549 }, { "acc": 0.74764438, "epoch": 1.5016489091831557, "grad_norm": 2.125, "learning_rate": 1.6035752728229364e-06, "loss": 0.96084995, "memory(GiB)": 369.42, "step": 59195, "train_speed(iter/s)": 0.200551 }, { "acc": 0.75310745, "epoch": 1.5017757483510907, "grad_norm": 1.9453125, "learning_rate": 1.6028057892157067e-06, "loss": 1.02103701, "memory(GiB)": 369.42, "step": 59200, "train_speed(iter/s)": 0.200554 }, { "acc": 0.75007067, "epoch": 1.5019025875190257, "grad_norm": 2.09375, "learning_rate": 1.602036455032246e-06, "loss": 0.97053795, "memory(GiB)": 369.42, "step": 59205, "train_speed(iter/s)": 0.200556 }, { "acc": 0.75033646, "epoch": 1.502029426686961, "grad_norm": 2.34375, "learning_rate": 1.6012672703063925e-06, "loss": 1.00354366, "memory(GiB)": 369.42, "step": 59210, "train_speed(iter/s)": 0.200558 }, { "acc": 0.74538021, "epoch": 1.5021562658548961, "grad_norm": 2.515625, "learning_rate": 1.600498235071979e-06, "loss": 1.03516426, "memory(GiB)": 369.42, "step": 59215, "train_speed(iter/s)": 0.200561 }, { "acc": 0.73008003, "epoch": 1.5022831050228311, "grad_norm": 2.109375, "learning_rate": 1.5997293493628301e-06, "loss": 1.00858183, "memory(GiB)": 369.42, "step": 59220, "train_speed(iter/s)": 0.200564 }, { "acc": 0.75153208, "epoch": 1.5024099441907661, "grad_norm": 2.359375, "learning_rate": 1.598960613212766e-06, "loss": 0.98669376, "memory(GiB)": 369.42, "step": 59225, "train_speed(iter/s)": 0.200567 }, { "acc": 0.76054564, "epoch": 1.5025367833587011, "grad_norm": 2.0625, "learning_rate": 1.598192026655599e-06, "loss": 1.01188831, "memory(GiB)": 369.42, "step": 59230, "train_speed(iter/s)": 0.200569 }, { "acc": 0.75188136, "epoch": 1.502663622526636, "grad_norm": 2.046875, "learning_rate": 1.5974235897251344e-06, "loss": 0.96397486, "memory(GiB)": 369.42, "step": 59235, "train_speed(iter/s)": 0.200569 }, { "acc": 0.74986973, "epoch": 1.5027904616945713, "grad_norm": 2.75, "learning_rate": 1.5966553024551717e-06, "loss": 1.01125679, "memory(GiB)": 369.42, "step": 59240, "train_speed(iter/s)": 0.200572 }, { "acc": 0.74719038, "epoch": 1.5029173008625063, "grad_norm": 2.15625, "learning_rate": 1.5958871648795032e-06, "loss": 0.98115721, "memory(GiB)": 369.42, "step": 59245, "train_speed(iter/s)": 0.200575 }, { "acc": 0.75992785, "epoch": 1.5030441400304415, "grad_norm": 2.109375, "learning_rate": 1.5951191770319164e-06, "loss": 0.94256344, "memory(GiB)": 369.42, "step": 59250, "train_speed(iter/s)": 0.200578 }, { "acc": 0.76049261, "epoch": 1.5031709791983765, "grad_norm": 2.34375, "learning_rate": 1.594351338946189e-06, "loss": 0.98509865, "memory(GiB)": 369.42, "step": 59255, "train_speed(iter/s)": 0.20058 }, { "acc": 0.75168495, "epoch": 1.5032978183663115, "grad_norm": 2.40625, "learning_rate": 1.5935836506560953e-06, "loss": 0.9855505, "memory(GiB)": 369.42, "step": 59260, "train_speed(iter/s)": 0.200583 }, { "acc": 0.761373, "epoch": 1.5034246575342465, "grad_norm": 1.859375, "learning_rate": 1.5928161121954012e-06, "loss": 0.93731461, "memory(GiB)": 369.42, "step": 59265, "train_speed(iter/s)": 0.200585 }, { "acc": 0.76079092, "epoch": 1.5035514967021817, "grad_norm": 2.015625, "learning_rate": 1.592048723597866e-06, "loss": 0.95133171, "memory(GiB)": 369.42, "step": 59270, "train_speed(iter/s)": 0.200586 }, { "acc": 0.77298317, "epoch": 1.5036783358701167, "grad_norm": 2.15625, "learning_rate": 1.591281484897244e-06, "loss": 0.9397007, "memory(GiB)": 369.42, "step": 59275, "train_speed(iter/s)": 0.200589 }, { "acc": 0.75557728, "epoch": 1.5038051750380519, "grad_norm": 2.28125, "learning_rate": 1.5905143961272807e-06, "loss": 0.98395805, "memory(GiB)": 369.42, "step": 59280, "train_speed(iter/s)": 0.200592 }, { "acc": 0.75934315, "epoch": 1.5039320142059869, "grad_norm": 2.421875, "learning_rate": 1.5897474573217153e-06, "loss": 0.90704966, "memory(GiB)": 369.42, "step": 59285, "train_speed(iter/s)": 0.200593 }, { "acc": 0.74531364, "epoch": 1.5040588533739219, "grad_norm": 2.609375, "learning_rate": 1.588980668514285e-06, "loss": 1.01013393, "memory(GiB)": 369.42, "step": 59290, "train_speed(iter/s)": 0.200597 }, { "acc": 0.74727407, "epoch": 1.5041856925418569, "grad_norm": 1.9609375, "learning_rate": 1.5882140297387127e-06, "loss": 0.9526288, "memory(GiB)": 369.42, "step": 59295, "train_speed(iter/s)": 0.200598 }, { "acc": 0.74989281, "epoch": 1.5043125317097918, "grad_norm": 2.109375, "learning_rate": 1.5874475410287189e-06, "loss": 1.01630859, "memory(GiB)": 369.42, "step": 59300, "train_speed(iter/s)": 0.200599 }, { "acc": 0.75282116, "epoch": 1.504439370877727, "grad_norm": 2.078125, "learning_rate": 1.586681202418019e-06, "loss": 1.01397057, "memory(GiB)": 369.42, "step": 59305, "train_speed(iter/s)": 0.200601 }, { "acc": 0.75330639, "epoch": 1.5045662100456623, "grad_norm": 2.34375, "learning_rate": 1.5859150139403212e-06, "loss": 1.03061161, "memory(GiB)": 369.42, "step": 59310, "train_speed(iter/s)": 0.200603 }, { "acc": 0.75409861, "epoch": 1.5046930492135973, "grad_norm": 2.125, "learning_rate": 1.585148975629322e-06, "loss": 0.97027512, "memory(GiB)": 369.42, "step": 59315, "train_speed(iter/s)": 0.200605 }, { "acc": 0.74623675, "epoch": 1.5048198883815322, "grad_norm": 2.421875, "learning_rate": 1.5843830875187155e-06, "loss": 0.99088802, "memory(GiB)": 369.42, "step": 59320, "train_speed(iter/s)": 0.200608 }, { "acc": 0.75168104, "epoch": 1.5049467275494672, "grad_norm": 2.3125, "learning_rate": 1.583617349642192e-06, "loss": 1.03901424, "memory(GiB)": 369.42, "step": 59325, "train_speed(iter/s)": 0.200611 }, { "acc": 0.74822874, "epoch": 1.5050735667174022, "grad_norm": 1.9296875, "learning_rate": 1.5828517620334322e-06, "loss": 1.02455406, "memory(GiB)": 369.42, "step": 59330, "train_speed(iter/s)": 0.200613 }, { "acc": 0.76260495, "epoch": 1.5052004058853374, "grad_norm": 2.125, "learning_rate": 1.5820863247261054e-06, "loss": 0.95999451, "memory(GiB)": 369.42, "step": 59335, "train_speed(iter/s)": 0.200616 }, { "acc": 0.75283532, "epoch": 1.5053272450532724, "grad_norm": 2.25, "learning_rate": 1.5813210377538834e-06, "loss": 1.01447468, "memory(GiB)": 369.42, "step": 59340, "train_speed(iter/s)": 0.200617 }, { "acc": 0.75350685, "epoch": 1.5054540842212076, "grad_norm": 1.9140625, "learning_rate": 1.5805559011504252e-06, "loss": 0.98678989, "memory(GiB)": 369.42, "step": 59345, "train_speed(iter/s)": 0.200619 }, { "acc": 0.75576172, "epoch": 1.5055809233891426, "grad_norm": 2.046875, "learning_rate": 1.5797909149493873e-06, "loss": 0.91079407, "memory(GiB)": 369.42, "step": 59350, "train_speed(iter/s)": 0.200622 }, { "acc": 0.73420835, "epoch": 1.5057077625570776, "grad_norm": 2.09375, "learning_rate": 1.5790260791844114e-06, "loss": 1.04566622, "memory(GiB)": 369.42, "step": 59355, "train_speed(iter/s)": 0.200625 }, { "acc": 0.75785484, "epoch": 1.5058346017250126, "grad_norm": 1.953125, "learning_rate": 1.5782613938891438e-06, "loss": 0.97434759, "memory(GiB)": 369.42, "step": 59360, "train_speed(iter/s)": 0.200627 }, { "acc": 0.76839867, "epoch": 1.5059614408929476, "grad_norm": 2.703125, "learning_rate": 1.5774968590972172e-06, "loss": 0.9672596, "memory(GiB)": 369.42, "step": 59365, "train_speed(iter/s)": 0.200631 }, { "acc": 0.75829453, "epoch": 1.5060882800608828, "grad_norm": 2.125, "learning_rate": 1.5767324748422592e-06, "loss": 0.98031006, "memory(GiB)": 369.42, "step": 59370, "train_speed(iter/s)": 0.200633 }, { "acc": 0.75652199, "epoch": 1.506215119228818, "grad_norm": 2.390625, "learning_rate": 1.5759682411578909e-06, "loss": 0.96041126, "memory(GiB)": 369.42, "step": 59375, "train_speed(iter/s)": 0.200635 }, { "acc": 0.75615711, "epoch": 1.506341958396753, "grad_norm": 2.15625, "learning_rate": 1.575204158077726e-06, "loss": 0.99898376, "memory(GiB)": 369.42, "step": 59380, "train_speed(iter/s)": 0.200638 }, { "acc": 0.76126294, "epoch": 1.506468797564688, "grad_norm": 2.21875, "learning_rate": 1.574440225635373e-06, "loss": 0.97048225, "memory(GiB)": 369.42, "step": 59385, "train_speed(iter/s)": 0.200641 }, { "acc": 0.75120735, "epoch": 1.506595636732623, "grad_norm": 1.921875, "learning_rate": 1.5736764438644332e-06, "loss": 0.99955387, "memory(GiB)": 369.42, "step": 59390, "train_speed(iter/s)": 0.200643 }, { "acc": 0.75671682, "epoch": 1.506722475900558, "grad_norm": 2.421875, "learning_rate": 1.5729128127985004e-06, "loss": 0.95423336, "memory(GiB)": 369.42, "step": 59395, "train_speed(iter/s)": 0.200647 }, { "acc": 0.73659582, "epoch": 1.5068493150684932, "grad_norm": 2.03125, "learning_rate": 1.5721493324711633e-06, "loss": 1.01718979, "memory(GiB)": 369.42, "step": 59400, "train_speed(iter/s)": 0.20065 }, { "acc": 0.74538269, "epoch": 1.5069761542364282, "grad_norm": 2.015625, "learning_rate": 1.5713860029160028e-06, "loss": 0.99395332, "memory(GiB)": 369.42, "step": 59405, "train_speed(iter/s)": 0.200652 }, { "acc": 0.74602585, "epoch": 1.5071029934043634, "grad_norm": 2.3125, "learning_rate": 1.5706228241665932e-06, "loss": 1.00536995, "memory(GiB)": 369.42, "step": 59410, "train_speed(iter/s)": 0.200654 }, { "acc": 0.76310167, "epoch": 1.5072298325722984, "grad_norm": 1.765625, "learning_rate": 1.5698597962565032e-06, "loss": 0.9675787, "memory(GiB)": 369.42, "step": 59415, "train_speed(iter/s)": 0.200657 }, { "acc": 0.74489336, "epoch": 1.5073566717402334, "grad_norm": 2.328125, "learning_rate": 1.5690969192192933e-06, "loss": 0.95430603, "memory(GiB)": 369.42, "step": 59420, "train_speed(iter/s)": 0.20066 }, { "acc": 0.75268226, "epoch": 1.5074835109081683, "grad_norm": 1.7890625, "learning_rate": 1.5683341930885183e-06, "loss": 0.97480602, "memory(GiB)": 369.42, "step": 59425, "train_speed(iter/s)": 0.200661 }, { "acc": 0.74295101, "epoch": 1.5076103500761036, "grad_norm": 2.265625, "learning_rate": 1.567571617897729e-06, "loss": 1.01046467, "memory(GiB)": 369.42, "step": 59430, "train_speed(iter/s)": 0.200665 }, { "acc": 0.75986929, "epoch": 1.5077371892440385, "grad_norm": 1.8515625, "learning_rate": 1.566809193680463e-06, "loss": 0.95184212, "memory(GiB)": 369.42, "step": 59435, "train_speed(iter/s)": 0.200668 }, { "acc": 0.74959497, "epoch": 1.5078640284119738, "grad_norm": 2.515625, "learning_rate": 1.566046920470257e-06, "loss": 1.04611845, "memory(GiB)": 369.42, "step": 59440, "train_speed(iter/s)": 0.200672 }, { "acc": 0.76406689, "epoch": 1.5079908675799087, "grad_norm": 1.890625, "learning_rate": 1.5652847983006376e-06, "loss": 0.97078381, "memory(GiB)": 369.42, "step": 59445, "train_speed(iter/s)": 0.200674 }, { "acc": 0.75166521, "epoch": 1.5081177067478437, "grad_norm": 1.8671875, "learning_rate": 1.564522827205131e-06, "loss": 0.96485119, "memory(GiB)": 369.42, "step": 59450, "train_speed(iter/s)": 0.200677 }, { "acc": 0.74544935, "epoch": 1.5082445459157787, "grad_norm": 2.15625, "learning_rate": 1.5637610072172464e-06, "loss": 0.98062038, "memory(GiB)": 369.42, "step": 59455, "train_speed(iter/s)": 0.20068 }, { "acc": 0.75128307, "epoch": 1.5083713850837137, "grad_norm": 2.171875, "learning_rate": 1.5629993383704933e-06, "loss": 1.03801823, "memory(GiB)": 369.42, "step": 59460, "train_speed(iter/s)": 0.200682 }, { "acc": 0.75190191, "epoch": 1.508498224251649, "grad_norm": 2.3125, "learning_rate": 1.5622378206983764e-06, "loss": 1.00313292, "memory(GiB)": 369.42, "step": 59465, "train_speed(iter/s)": 0.200684 }, { "acc": 0.75624604, "epoch": 1.5086250634195841, "grad_norm": 2.140625, "learning_rate": 1.5614764542343896e-06, "loss": 0.97307415, "memory(GiB)": 369.42, "step": 59470, "train_speed(iter/s)": 0.200687 }, { "acc": 0.7467154, "epoch": 1.5087519025875191, "grad_norm": 2.1875, "learning_rate": 1.5607152390120173e-06, "loss": 0.984165, "memory(GiB)": 369.42, "step": 59475, "train_speed(iter/s)": 0.20069 }, { "acc": 0.73974357, "epoch": 1.5088787417554541, "grad_norm": 2.0, "learning_rate": 1.5599541750647457e-06, "loss": 1.02399397, "memory(GiB)": 369.42, "step": 59480, "train_speed(iter/s)": 0.200693 }, { "acc": 0.74805012, "epoch": 1.509005580923389, "grad_norm": 2.1875, "learning_rate": 1.559193262426048e-06, "loss": 0.97617245, "memory(GiB)": 369.42, "step": 59485, "train_speed(iter/s)": 0.200696 }, { "acc": 0.75098462, "epoch": 1.509132420091324, "grad_norm": 1.859375, "learning_rate": 1.5584325011293943e-06, "loss": 0.96744137, "memory(GiB)": 369.42, "step": 59490, "train_speed(iter/s)": 0.200697 }, { "acc": 0.7437356, "epoch": 1.5092592592592593, "grad_norm": 2.21875, "learning_rate": 1.5576718912082417e-06, "loss": 1.04115696, "memory(GiB)": 369.42, "step": 59495, "train_speed(iter/s)": 0.2007 }, { "acc": 0.75538206, "epoch": 1.5093860984271943, "grad_norm": 2.203125, "learning_rate": 1.5569114326960494e-06, "loss": 1.00119658, "memory(GiB)": 369.42, "step": 59500, "train_speed(iter/s)": 0.200701 }, { "acc": 0.75945759, "epoch": 1.5095129375951295, "grad_norm": 2.140625, "learning_rate": 1.5561511256262651e-06, "loss": 0.95243721, "memory(GiB)": 369.42, "step": 59505, "train_speed(iter/s)": 0.200704 }, { "acc": 0.75554428, "epoch": 1.5096397767630645, "grad_norm": 1.953125, "learning_rate": 1.55539097003233e-06, "loss": 0.98355618, "memory(GiB)": 369.42, "step": 59510, "train_speed(iter/s)": 0.200708 }, { "acc": 0.75521669, "epoch": 1.5097666159309995, "grad_norm": 2.078125, "learning_rate": 1.5546309659476788e-06, "loss": 0.98082962, "memory(GiB)": 369.42, "step": 59515, "train_speed(iter/s)": 0.200711 }, { "acc": 0.76732874, "epoch": 1.5098934550989345, "grad_norm": 1.9765625, "learning_rate": 1.55387111340574e-06, "loss": 0.95528736, "memory(GiB)": 369.42, "step": 59520, "train_speed(iter/s)": 0.200713 }, { "acc": 0.75793858, "epoch": 1.5100202942668695, "grad_norm": 2.25, "learning_rate": 1.553111412439936e-06, "loss": 0.98728008, "memory(GiB)": 369.42, "step": 59525, "train_speed(iter/s)": 0.200715 }, { "acc": 0.74315972, "epoch": 1.5101471334348047, "grad_norm": 2.140625, "learning_rate": 1.5523518630836809e-06, "loss": 1.05203266, "memory(GiB)": 369.42, "step": 59530, "train_speed(iter/s)": 0.200717 }, { "acc": 0.76428108, "epoch": 1.5102739726027399, "grad_norm": 2.40625, "learning_rate": 1.551592465370384e-06, "loss": 0.94708529, "memory(GiB)": 369.42, "step": 59535, "train_speed(iter/s)": 0.20072 }, { "acc": 0.73829708, "epoch": 1.5104008117706749, "grad_norm": 2.28125, "learning_rate": 1.5508332193334457e-06, "loss": 0.95586758, "memory(GiB)": 369.42, "step": 59540, "train_speed(iter/s)": 0.200722 }, { "acc": 0.74245501, "epoch": 1.5105276509386099, "grad_norm": 2.046875, "learning_rate": 1.5500741250062628e-06, "loss": 1.05161705, "memory(GiB)": 369.42, "step": 59545, "train_speed(iter/s)": 0.200725 }, { "acc": 0.77009153, "epoch": 1.5106544901065448, "grad_norm": 2.25, "learning_rate": 1.549315182422222e-06, "loss": 0.95166264, "memory(GiB)": 369.42, "step": 59550, "train_speed(iter/s)": 0.200728 }, { "acc": 0.76143084, "epoch": 1.5107813292744798, "grad_norm": 1.8203125, "learning_rate": 1.5485563916147062e-06, "loss": 1.01877613, "memory(GiB)": 369.42, "step": 59555, "train_speed(iter/s)": 0.20073 }, { "acc": 0.73811407, "epoch": 1.510908168442415, "grad_norm": 2.21875, "learning_rate": 1.5477977526170895e-06, "loss": 0.99868412, "memory(GiB)": 369.42, "step": 59560, "train_speed(iter/s)": 0.200733 }, { "acc": 0.74986286, "epoch": 1.51103500761035, "grad_norm": 2.359375, "learning_rate": 1.5470392654627392e-06, "loss": 1.0498909, "memory(GiB)": 369.42, "step": 59565, "train_speed(iter/s)": 0.200736 }, { "acc": 0.75295086, "epoch": 1.5111618467782852, "grad_norm": 2.09375, "learning_rate": 1.5462809301850212e-06, "loss": 0.99598217, "memory(GiB)": 369.42, "step": 59570, "train_speed(iter/s)": 0.200739 }, { "acc": 0.75155058, "epoch": 1.5112886859462202, "grad_norm": 2.046875, "learning_rate": 1.5455227468172862e-06, "loss": 1.00456028, "memory(GiB)": 369.42, "step": 59575, "train_speed(iter/s)": 0.200742 }, { "acc": 0.74349537, "epoch": 1.5114155251141552, "grad_norm": 2.46875, "learning_rate": 1.5447647153928842e-06, "loss": 1.04095993, "memory(GiB)": 369.42, "step": 59580, "train_speed(iter/s)": 0.200745 }, { "acc": 0.74872584, "epoch": 1.5115423642820902, "grad_norm": 2.5, "learning_rate": 1.5440068359451548e-06, "loss": 1.00010891, "memory(GiB)": 369.42, "step": 59585, "train_speed(iter/s)": 0.200748 }, { "acc": 0.75541372, "epoch": 1.5116692034500254, "grad_norm": 3.4375, "learning_rate": 1.5432491085074381e-06, "loss": 1.01363106, "memory(GiB)": 369.42, "step": 59590, "train_speed(iter/s)": 0.200751 }, { "acc": 0.73850026, "epoch": 1.5117960426179604, "grad_norm": 1.9296875, "learning_rate": 1.5424915331130568e-06, "loss": 1.02836809, "memory(GiB)": 369.42, "step": 59595, "train_speed(iter/s)": 0.200752 }, { "acc": 0.7641758, "epoch": 1.5119228817858956, "grad_norm": 2.609375, "learning_rate": 1.5417341097953332e-06, "loss": 0.89804344, "memory(GiB)": 369.42, "step": 59600, "train_speed(iter/s)": 0.200754 }, { "acc": 0.76754837, "epoch": 1.5120497209538306, "grad_norm": 2.6875, "learning_rate": 1.540976838587585e-06, "loss": 1.01317177, "memory(GiB)": 369.42, "step": 59605, "train_speed(iter/s)": 0.200757 }, { "acc": 0.7529047, "epoch": 1.5121765601217656, "grad_norm": 2.078125, "learning_rate": 1.5402197195231205e-06, "loss": 0.97823696, "memory(GiB)": 369.42, "step": 59610, "train_speed(iter/s)": 0.20076 }, { "acc": 0.75560322, "epoch": 1.5123033992897006, "grad_norm": 2.03125, "learning_rate": 1.5394627526352379e-06, "loss": 0.96446581, "memory(GiB)": 369.42, "step": 59615, "train_speed(iter/s)": 0.200763 }, { "acc": 0.75406055, "epoch": 1.5124302384576356, "grad_norm": 2.3125, "learning_rate": 1.5387059379572322e-06, "loss": 0.98231354, "memory(GiB)": 369.42, "step": 59620, "train_speed(iter/s)": 0.200766 }, { "acc": 0.77153211, "epoch": 1.5125570776255708, "grad_norm": 2.359375, "learning_rate": 1.537949275522394e-06, "loss": 0.94888124, "memory(GiB)": 369.42, "step": 59625, "train_speed(iter/s)": 0.200768 }, { "acc": 0.75099168, "epoch": 1.512683916793506, "grad_norm": 2.609375, "learning_rate": 1.5371927653640056e-06, "loss": 1.00397625, "memory(GiB)": 369.42, "step": 59630, "train_speed(iter/s)": 0.200771 }, { "acc": 0.75203319, "epoch": 1.512810755961441, "grad_norm": 1.953125, "learning_rate": 1.5364364075153366e-06, "loss": 0.94125738, "memory(GiB)": 369.42, "step": 59635, "train_speed(iter/s)": 0.200774 }, { "acc": 0.75654631, "epoch": 1.512937595129376, "grad_norm": 2.09375, "learning_rate": 1.5356802020096595e-06, "loss": 0.97099571, "memory(GiB)": 369.42, "step": 59640, "train_speed(iter/s)": 0.200776 }, { "acc": 0.75663109, "epoch": 1.513064434297311, "grad_norm": 1.9921875, "learning_rate": 1.5349241488802346e-06, "loss": 0.95764236, "memory(GiB)": 369.42, "step": 59645, "train_speed(iter/s)": 0.200776 }, { "acc": 0.75029516, "epoch": 1.513191273465246, "grad_norm": 2.125, "learning_rate": 1.5341682481603155e-06, "loss": 1.0001152, "memory(GiB)": 369.42, "step": 59650, "train_speed(iter/s)": 0.200779 }, { "acc": 0.75355148, "epoch": 1.5133181126331812, "grad_norm": 2.328125, "learning_rate": 1.5334124998831512e-06, "loss": 0.98203564, "memory(GiB)": 369.42, "step": 59655, "train_speed(iter/s)": 0.20078 }, { "acc": 0.73361607, "epoch": 1.5134449518011162, "grad_norm": 1.9140625, "learning_rate": 1.532656904081982e-06, "loss": 1.03742027, "memory(GiB)": 369.42, "step": 59660, "train_speed(iter/s)": 0.200783 }, { "acc": 0.76102104, "epoch": 1.5135717909690514, "grad_norm": 2.171875, "learning_rate": 1.5319014607900428e-06, "loss": 0.93081083, "memory(GiB)": 369.42, "step": 59665, "train_speed(iter/s)": 0.200785 }, { "acc": 0.75911913, "epoch": 1.5136986301369864, "grad_norm": 2.1875, "learning_rate": 1.5311461700405617e-06, "loss": 0.97561035, "memory(GiB)": 369.42, "step": 59670, "train_speed(iter/s)": 0.200786 }, { "acc": 0.75183744, "epoch": 1.5138254693049213, "grad_norm": 2.515625, "learning_rate": 1.5303910318667586e-06, "loss": 1.00039291, "memory(GiB)": 369.42, "step": 59675, "train_speed(iter/s)": 0.200788 }, { "acc": 0.75114965, "epoch": 1.5139523084728563, "grad_norm": 2.09375, "learning_rate": 1.529636046301849e-06, "loss": 1.00607719, "memory(GiB)": 369.42, "step": 59680, "train_speed(iter/s)": 0.200791 }, { "acc": 0.73936367, "epoch": 1.5140791476407913, "grad_norm": 2.375, "learning_rate": 1.5288812133790405e-06, "loss": 1.0300705, "memory(GiB)": 369.42, "step": 59685, "train_speed(iter/s)": 0.200794 }, { "acc": 0.73833432, "epoch": 1.5142059868087265, "grad_norm": 2.140625, "learning_rate": 1.5281265331315332e-06, "loss": 0.99805479, "memory(GiB)": 369.42, "step": 59690, "train_speed(iter/s)": 0.200797 }, { "acc": 0.75572548, "epoch": 1.5143328259766617, "grad_norm": 2.046875, "learning_rate": 1.5273720055925217e-06, "loss": 0.97665386, "memory(GiB)": 369.42, "step": 59695, "train_speed(iter/s)": 0.2008 }, { "acc": 0.752316, "epoch": 1.5144596651445967, "grad_norm": 2.0, "learning_rate": 1.5266176307951936e-06, "loss": 1.02114964, "memory(GiB)": 369.42, "step": 59700, "train_speed(iter/s)": 0.200803 }, { "acc": 0.74340076, "epoch": 1.5145865043125317, "grad_norm": 1.8984375, "learning_rate": 1.5258634087727298e-06, "loss": 0.95831795, "memory(GiB)": 369.42, "step": 59705, "train_speed(iter/s)": 0.200805 }, { "acc": 0.75488825, "epoch": 1.5147133434804667, "grad_norm": 2.0625, "learning_rate": 1.5251093395583045e-06, "loss": 0.99153891, "memory(GiB)": 369.42, "step": 59710, "train_speed(iter/s)": 0.200808 }, { "acc": 0.74910698, "epoch": 1.5148401826484017, "grad_norm": 2.03125, "learning_rate": 1.5243554231850843e-06, "loss": 1.01382141, "memory(GiB)": 369.42, "step": 59715, "train_speed(iter/s)": 0.20081 }, { "acc": 0.74416223, "epoch": 1.514967021816337, "grad_norm": 2.015625, "learning_rate": 1.5236016596862302e-06, "loss": 0.99026747, "memory(GiB)": 369.42, "step": 59720, "train_speed(iter/s)": 0.200812 }, { "acc": 0.74744797, "epoch": 1.515093860984272, "grad_norm": 1.828125, "learning_rate": 1.5228480490948943e-06, "loss": 0.94247055, "memory(GiB)": 369.42, "step": 59725, "train_speed(iter/s)": 0.200812 }, { "acc": 0.75584192, "epoch": 1.5152207001522071, "grad_norm": 2.140625, "learning_rate": 1.5220945914442292e-06, "loss": 1.01087294, "memory(GiB)": 369.42, "step": 59730, "train_speed(iter/s)": 0.200815 }, { "acc": 0.74506044, "epoch": 1.515347539320142, "grad_norm": 2.0625, "learning_rate": 1.52134128676737e-06, "loss": 0.94177799, "memory(GiB)": 369.42, "step": 59735, "train_speed(iter/s)": 0.200818 }, { "acc": 0.74904184, "epoch": 1.515474378488077, "grad_norm": 2.5, "learning_rate": 1.5205881350974504e-06, "loss": 1.04935417, "memory(GiB)": 369.42, "step": 59740, "train_speed(iter/s)": 0.20082 }, { "acc": 0.74581509, "epoch": 1.515601217656012, "grad_norm": 2.75, "learning_rate": 1.5198351364676012e-06, "loss": 0.99257717, "memory(GiB)": 369.42, "step": 59745, "train_speed(iter/s)": 0.200823 }, { "acc": 0.75051126, "epoch": 1.5157280568239473, "grad_norm": 2.140625, "learning_rate": 1.5190822909109415e-06, "loss": 1.0019083, "memory(GiB)": 369.42, "step": 59750, "train_speed(iter/s)": 0.200823 }, { "acc": 0.74918871, "epoch": 1.5158548959918823, "grad_norm": 1.84375, "learning_rate": 1.5183295984605824e-06, "loss": 1.03790493, "memory(GiB)": 369.42, "step": 59755, "train_speed(iter/s)": 0.200826 }, { "acc": 0.76141047, "epoch": 1.5159817351598175, "grad_norm": 2.015625, "learning_rate": 1.5175770591496303e-06, "loss": 0.96075859, "memory(GiB)": 369.42, "step": 59760, "train_speed(iter/s)": 0.200829 }, { "acc": 0.75067253, "epoch": 1.5161085743277525, "grad_norm": 1.8671875, "learning_rate": 1.5168246730111892e-06, "loss": 1.01064272, "memory(GiB)": 369.42, "step": 59765, "train_speed(iter/s)": 0.20083 }, { "acc": 0.76228485, "epoch": 1.5162354134956875, "grad_norm": 2.0625, "learning_rate": 1.5160724400783511e-06, "loss": 0.98419361, "memory(GiB)": 369.42, "step": 59770, "train_speed(iter/s)": 0.200833 }, { "acc": 0.74263487, "epoch": 1.5163622526636225, "grad_norm": 2.0, "learning_rate": 1.5153203603841992e-06, "loss": 0.99718828, "memory(GiB)": 369.42, "step": 59775, "train_speed(iter/s)": 0.200835 }, { "acc": 0.74447398, "epoch": 1.5164890918315574, "grad_norm": 2.078125, "learning_rate": 1.5145684339618172e-06, "loss": 1.05038548, "memory(GiB)": 369.42, "step": 59780, "train_speed(iter/s)": 0.200839 }, { "acc": 0.76620436, "epoch": 1.5166159309994927, "grad_norm": 2.203125, "learning_rate": 1.5138166608442768e-06, "loss": 0.92161312, "memory(GiB)": 369.42, "step": 59785, "train_speed(iter/s)": 0.20084 }, { "acc": 0.75103865, "epoch": 1.5167427701674279, "grad_norm": 2.375, "learning_rate": 1.5130650410646452e-06, "loss": 0.96007748, "memory(GiB)": 369.42, "step": 59790, "train_speed(iter/s)": 0.200842 }, { "acc": 0.75962052, "epoch": 1.5168696093353629, "grad_norm": 1.765625, "learning_rate": 1.5123135746559792e-06, "loss": 0.97549353, "memory(GiB)": 369.42, "step": 59795, "train_speed(iter/s)": 0.200843 }, { "acc": 0.7638607, "epoch": 1.5169964485032978, "grad_norm": 2.703125, "learning_rate": 1.5115622616513343e-06, "loss": 0.97424345, "memory(GiB)": 369.42, "step": 59800, "train_speed(iter/s)": 0.200845 }, { "acc": 0.74146652, "epoch": 1.5171232876712328, "grad_norm": 2.015625, "learning_rate": 1.5108111020837564e-06, "loss": 1.08046494, "memory(GiB)": 369.42, "step": 59805, "train_speed(iter/s)": 0.200848 }, { "acc": 0.75586576, "epoch": 1.5172501268391678, "grad_norm": 2.0, "learning_rate": 1.5100600959862838e-06, "loss": 0.94106359, "memory(GiB)": 369.42, "step": 59810, "train_speed(iter/s)": 0.200851 }, { "acc": 0.74796605, "epoch": 1.517376966007103, "grad_norm": 2.203125, "learning_rate": 1.5093092433919497e-06, "loss": 0.99498634, "memory(GiB)": 369.42, "step": 59815, "train_speed(iter/s)": 0.200853 }, { "acc": 0.75561953, "epoch": 1.517503805175038, "grad_norm": 2.046875, "learning_rate": 1.5085585443337803e-06, "loss": 0.93709259, "memory(GiB)": 369.42, "step": 59820, "train_speed(iter/s)": 0.200857 }, { "acc": 0.74833117, "epoch": 1.5176306443429732, "grad_norm": 2.4375, "learning_rate": 1.507807998844794e-06, "loss": 1.05497055, "memory(GiB)": 369.42, "step": 59825, "train_speed(iter/s)": 0.20086 }, { "acc": 0.74988909, "epoch": 1.5177574835109082, "grad_norm": 2.21875, "learning_rate": 1.5070576069580039e-06, "loss": 0.95696144, "memory(GiB)": 369.42, "step": 59830, "train_speed(iter/s)": 0.200862 }, { "acc": 0.75948582, "epoch": 1.5178843226788432, "grad_norm": 1.84375, "learning_rate": 1.5063073687064144e-06, "loss": 1.00317316, "memory(GiB)": 369.42, "step": 59835, "train_speed(iter/s)": 0.200864 }, { "acc": 0.76483517, "epoch": 1.5180111618467782, "grad_norm": 2.21875, "learning_rate": 1.5055572841230253e-06, "loss": 0.97637911, "memory(GiB)": 369.42, "step": 59840, "train_speed(iter/s)": 0.200866 }, { "acc": 0.76252398, "epoch": 1.5181380010147132, "grad_norm": 2.015625, "learning_rate": 1.5048073532408287e-06, "loss": 0.99154291, "memory(GiB)": 369.42, "step": 59845, "train_speed(iter/s)": 0.200868 }, { "acc": 0.76154938, "epoch": 1.5182648401826484, "grad_norm": 1.84375, "learning_rate": 1.5040575760928094e-06, "loss": 0.96218033, "memory(GiB)": 369.42, "step": 59850, "train_speed(iter/s)": 0.200871 }, { "acc": 0.747508, "epoch": 1.5183916793505836, "grad_norm": 2.140625, "learning_rate": 1.5033079527119466e-06, "loss": 0.9900054, "memory(GiB)": 369.42, "step": 59855, "train_speed(iter/s)": 0.200873 }, { "acc": 0.76400442, "epoch": 1.5185185185185186, "grad_norm": 2.234375, "learning_rate": 1.5025584831312112e-06, "loss": 0.95809803, "memory(GiB)": 369.42, "step": 59860, "train_speed(iter/s)": 0.200871 }, { "acc": 0.74177179, "epoch": 1.5186453576864536, "grad_norm": 2.484375, "learning_rate": 1.5018091673835667e-06, "loss": 1.01613922, "memory(GiB)": 369.42, "step": 59865, "train_speed(iter/s)": 0.200874 }, { "acc": 0.75711966, "epoch": 1.5187721968543886, "grad_norm": 2.078125, "learning_rate": 1.501060005501977e-06, "loss": 0.97298565, "memory(GiB)": 369.42, "step": 59870, "train_speed(iter/s)": 0.200877 }, { "acc": 0.75756407, "epoch": 1.5188990360223236, "grad_norm": 2.34375, "learning_rate": 1.500310997519388e-06, "loss": 0.99389696, "memory(GiB)": 369.42, "step": 59875, "train_speed(iter/s)": 0.20088 }, { "acc": 0.75404758, "epoch": 1.5190258751902588, "grad_norm": 2.125, "learning_rate": 1.4995621434687468e-06, "loss": 1.00685101, "memory(GiB)": 369.42, "step": 59880, "train_speed(iter/s)": 0.200882 }, { "acc": 0.73921566, "epoch": 1.5191527143581938, "grad_norm": 2.21875, "learning_rate": 1.4988134433829892e-06, "loss": 1.0339015, "memory(GiB)": 369.42, "step": 59885, "train_speed(iter/s)": 0.200884 }, { "acc": 0.74163656, "epoch": 1.519279553526129, "grad_norm": 2.03125, "learning_rate": 1.4980648972950507e-06, "loss": 0.99315357, "memory(GiB)": 369.42, "step": 59890, "train_speed(iter/s)": 0.200887 }, { "acc": 0.75584412, "epoch": 1.519406392694064, "grad_norm": 2.515625, "learning_rate": 1.4973165052378518e-06, "loss": 0.95993242, "memory(GiB)": 369.42, "step": 59895, "train_speed(iter/s)": 0.20089 }, { "acc": 0.75045156, "epoch": 1.519533231861999, "grad_norm": 2.234375, "learning_rate": 1.49656826724431e-06, "loss": 1.00922394, "memory(GiB)": 369.42, "step": 59900, "train_speed(iter/s)": 0.200892 }, { "acc": 0.75563536, "epoch": 1.519660071029934, "grad_norm": 2.046875, "learning_rate": 1.4958201833473386e-06, "loss": 0.96443043, "memory(GiB)": 369.42, "step": 59905, "train_speed(iter/s)": 0.200894 }, { "acc": 0.74078064, "epoch": 1.5197869101978692, "grad_norm": 2.015625, "learning_rate": 1.4950722535798423e-06, "loss": 1.02323771, "memory(GiB)": 369.42, "step": 59910, "train_speed(iter/s)": 0.200896 }, { "acc": 0.74311523, "epoch": 1.5199137493658041, "grad_norm": 2.015625, "learning_rate": 1.4943244779747134e-06, "loss": 1.01173534, "memory(GiB)": 369.42, "step": 59915, "train_speed(iter/s)": 0.200898 }, { "acc": 0.74826641, "epoch": 1.5200405885337394, "grad_norm": 1.8984375, "learning_rate": 1.4935768565648478e-06, "loss": 0.96277523, "memory(GiB)": 369.42, "step": 59920, "train_speed(iter/s)": 0.200901 }, { "acc": 0.75028496, "epoch": 1.5201674277016743, "grad_norm": 2.703125, "learning_rate": 1.4928293893831265e-06, "loss": 1.02654324, "memory(GiB)": 369.42, "step": 59925, "train_speed(iter/s)": 0.200904 }, { "acc": 0.75096231, "epoch": 1.5202942668696093, "grad_norm": 2.453125, "learning_rate": 1.4920820764624288e-06, "loss": 0.98135834, "memory(GiB)": 369.42, "step": 59930, "train_speed(iter/s)": 0.200906 }, { "acc": 0.75031929, "epoch": 1.5204211060375443, "grad_norm": 2.28125, "learning_rate": 1.4913349178356202e-06, "loss": 1.00255499, "memory(GiB)": 369.42, "step": 59935, "train_speed(iter/s)": 0.200909 }, { "acc": 0.74498463, "epoch": 1.5205479452054793, "grad_norm": 2.421875, "learning_rate": 1.4905879135355684e-06, "loss": 1.03437748, "memory(GiB)": 369.42, "step": 59940, "train_speed(iter/s)": 0.200911 }, { "acc": 0.74654684, "epoch": 1.5206747843734145, "grad_norm": 2.453125, "learning_rate": 1.4898410635951282e-06, "loss": 0.97446442, "memory(GiB)": 369.42, "step": 59945, "train_speed(iter/s)": 0.200912 }, { "acc": 0.76036701, "epoch": 1.5208016235413497, "grad_norm": 2.3125, "learning_rate": 1.4890943680471503e-06, "loss": 0.96972656, "memory(GiB)": 369.42, "step": 59950, "train_speed(iter/s)": 0.200915 }, { "acc": 0.74606042, "epoch": 1.5209284627092847, "grad_norm": 2.21875, "learning_rate": 1.4883478269244766e-06, "loss": 0.99836359, "memory(GiB)": 369.42, "step": 59955, "train_speed(iter/s)": 0.200919 }, { "acc": 0.75278897, "epoch": 1.5210553018772197, "grad_norm": 2.015625, "learning_rate": 1.4876014402599443e-06, "loss": 1.04748993, "memory(GiB)": 369.42, "step": 59960, "train_speed(iter/s)": 0.20092 }, { "acc": 0.74607611, "epoch": 1.5211821410451547, "grad_norm": 2.125, "learning_rate": 1.4868552080863824e-06, "loss": 1.00015392, "memory(GiB)": 369.42, "step": 59965, "train_speed(iter/s)": 0.200924 }, { "acc": 0.76058064, "epoch": 1.5213089802130897, "grad_norm": 2.796875, "learning_rate": 1.4861091304366139e-06, "loss": 0.94582682, "memory(GiB)": 369.42, "step": 59970, "train_speed(iter/s)": 0.200925 }, { "acc": 0.7656013, "epoch": 1.521435819381025, "grad_norm": 2.171875, "learning_rate": 1.4853632073434533e-06, "loss": 0.88302555, "memory(GiB)": 369.42, "step": 59975, "train_speed(iter/s)": 0.200927 }, { "acc": 0.74521875, "epoch": 1.52156265854896, "grad_norm": 3.109375, "learning_rate": 1.484617438839711e-06, "loss": 1.0397171, "memory(GiB)": 369.42, "step": 59980, "train_speed(iter/s)": 0.20093 }, { "acc": 0.74989557, "epoch": 1.521689497716895, "grad_norm": 2.671875, "learning_rate": 1.483871824958189e-06, "loss": 0.97214594, "memory(GiB)": 369.42, "step": 59985, "train_speed(iter/s)": 0.200932 }, { "acc": 0.74756689, "epoch": 1.52181633688483, "grad_norm": 2.1875, "learning_rate": 1.483126365731682e-06, "loss": 0.96643867, "memory(GiB)": 369.42, "step": 59990, "train_speed(iter/s)": 0.200935 }, { "acc": 0.7606535, "epoch": 1.521943176052765, "grad_norm": 2.0, "learning_rate": 1.4823810611929795e-06, "loss": 0.92127304, "memory(GiB)": 369.42, "step": 59995, "train_speed(iter/s)": 0.200938 }, { "acc": 0.75976729, "epoch": 1.5220700152207, "grad_norm": 1.84375, "learning_rate": 1.481635911374863e-06, "loss": 0.94213638, "memory(GiB)": 369.42, "step": 60000, "train_speed(iter/s)": 0.200938 }, { "epoch": 1.5220700152207, "eval_acc": 0.7379759257410252, "eval_loss": 0.9695476293563843, "eval_runtime": 384.8828, "eval_samples_per_second": 16.55, "eval_steps_per_second": 8.275, "step": 60000 }, { "acc": 0.75604625, "epoch": 1.522196854388635, "grad_norm": 2.3125, "learning_rate": 1.480890916310106e-06, "loss": 0.98380737, "memory(GiB)": 369.42, "step": 60005, "train_speed(iter/s)": 0.200463 }, { "acc": 0.76353674, "epoch": 1.5223236935565703, "grad_norm": 2.25, "learning_rate": 1.4801460760314811e-06, "loss": 0.9461628, "memory(GiB)": 369.42, "step": 60010, "train_speed(iter/s)": 0.200465 }, { "acc": 0.74540057, "epoch": 1.5224505327245055, "grad_norm": 2.53125, "learning_rate": 1.4794013905717453e-06, "loss": 1.01768074, "memory(GiB)": 369.42, "step": 60015, "train_speed(iter/s)": 0.200468 }, { "acc": 0.75111055, "epoch": 1.5225773718924405, "grad_norm": 1.890625, "learning_rate": 1.4786568599636548e-06, "loss": 0.98036423, "memory(GiB)": 369.42, "step": 60020, "train_speed(iter/s)": 0.20047 }, { "acc": 0.75655642, "epoch": 1.5227042110603755, "grad_norm": 1.9453125, "learning_rate": 1.4779124842399556e-06, "loss": 1.00042858, "memory(GiB)": 369.42, "step": 60025, "train_speed(iter/s)": 0.200473 }, { "acc": 0.74939375, "epoch": 1.5228310502283104, "grad_norm": 2.359375, "learning_rate": 1.4771682634333933e-06, "loss": 0.96194296, "memory(GiB)": 369.42, "step": 60030, "train_speed(iter/s)": 0.200475 }, { "acc": 0.76066294, "epoch": 1.5229578893962454, "grad_norm": 2.609375, "learning_rate": 1.4764241975766975e-06, "loss": 0.93056908, "memory(GiB)": 369.42, "step": 60035, "train_speed(iter/s)": 0.200477 }, { "acc": 0.75990191, "epoch": 1.5230847285641806, "grad_norm": 3.234375, "learning_rate": 1.475680286702596e-06, "loss": 0.9351469, "memory(GiB)": 369.42, "step": 60040, "train_speed(iter/s)": 0.200478 }, { "acc": 0.75459518, "epoch": 1.5232115677321156, "grad_norm": 1.9609375, "learning_rate": 1.474936530843812e-06, "loss": 0.92768307, "memory(GiB)": 369.42, "step": 60045, "train_speed(iter/s)": 0.20048 }, { "acc": 0.7549789, "epoch": 1.5233384069000508, "grad_norm": 2.28125, "learning_rate": 1.4741929300330588e-06, "loss": 0.9730341, "memory(GiB)": 369.42, "step": 60050, "train_speed(iter/s)": 0.200483 }, { "acc": 0.75381427, "epoch": 1.5234652460679858, "grad_norm": 2.078125, "learning_rate": 1.4734494843030405e-06, "loss": 0.96462545, "memory(GiB)": 369.42, "step": 60055, "train_speed(iter/s)": 0.200486 }, { "acc": 0.74525518, "epoch": 1.5235920852359208, "grad_norm": 2.0625, "learning_rate": 1.4727061936864573e-06, "loss": 1.01023588, "memory(GiB)": 369.42, "step": 60060, "train_speed(iter/s)": 0.200488 }, { "acc": 0.73960571, "epoch": 1.5237189244038558, "grad_norm": 2.1875, "learning_rate": 1.4719630582160056e-06, "loss": 1.05094929, "memory(GiB)": 369.42, "step": 60065, "train_speed(iter/s)": 0.20049 }, { "acc": 0.75099821, "epoch": 1.523845763571791, "grad_norm": 2.078125, "learning_rate": 1.4712200779243718e-06, "loss": 0.99296494, "memory(GiB)": 369.42, "step": 60070, "train_speed(iter/s)": 0.200493 }, { "acc": 0.75698996, "epoch": 1.523972602739726, "grad_norm": 2.28125, "learning_rate": 1.4704772528442308e-06, "loss": 1.00396891, "memory(GiB)": 369.42, "step": 60075, "train_speed(iter/s)": 0.200495 }, { "acc": 0.75690317, "epoch": 1.5240994419076612, "grad_norm": 1.953125, "learning_rate": 1.46973458300826e-06, "loss": 0.89989386, "memory(GiB)": 369.42, "step": 60080, "train_speed(iter/s)": 0.200498 }, { "acc": 0.7514617, "epoch": 1.5242262810755962, "grad_norm": 2.25, "learning_rate": 1.4689920684491232e-06, "loss": 0.99493141, "memory(GiB)": 369.42, "step": 60085, "train_speed(iter/s)": 0.200501 }, { "acc": 0.75628319, "epoch": 1.5243531202435312, "grad_norm": 2.3125, "learning_rate": 1.4682497091994807e-06, "loss": 0.9888134, "memory(GiB)": 369.42, "step": 60090, "train_speed(iter/s)": 0.200502 }, { "acc": 0.74684629, "epoch": 1.5244799594114662, "grad_norm": 2.140625, "learning_rate": 1.467507505291984e-06, "loss": 0.98946018, "memory(GiB)": 369.42, "step": 60095, "train_speed(iter/s)": 0.200505 }, { "acc": 0.73320408, "epoch": 1.5246067985794012, "grad_norm": 2.328125, "learning_rate": 1.4667654567592781e-06, "loss": 1.07187462, "memory(GiB)": 369.42, "step": 60100, "train_speed(iter/s)": 0.200507 }, { "acc": 0.7427207, "epoch": 1.5247336377473364, "grad_norm": 2.15625, "learning_rate": 1.4660235636340025e-06, "loss": 1.01088314, "memory(GiB)": 369.42, "step": 60105, "train_speed(iter/s)": 0.20051 }, { "acc": 0.74734774, "epoch": 1.5248604769152716, "grad_norm": 2.5, "learning_rate": 1.465281825948789e-06, "loss": 0.98382301, "memory(GiB)": 369.42, "step": 60110, "train_speed(iter/s)": 0.200513 }, { "acc": 0.75127354, "epoch": 1.5249873160832066, "grad_norm": 2.140625, "learning_rate": 1.464540243736262e-06, "loss": 0.9878933, "memory(GiB)": 369.42, "step": 60115, "train_speed(iter/s)": 0.200515 }, { "acc": 0.75556641, "epoch": 1.5251141552511416, "grad_norm": 2.25, "learning_rate": 1.4637988170290396e-06, "loss": 0.94343777, "memory(GiB)": 369.42, "step": 60120, "train_speed(iter/s)": 0.200517 }, { "acc": 0.76515341, "epoch": 1.5252409944190766, "grad_norm": 2.046875, "learning_rate": 1.4630575458597334e-06, "loss": 0.93649178, "memory(GiB)": 369.42, "step": 60125, "train_speed(iter/s)": 0.200519 }, { "acc": 0.75317445, "epoch": 1.5253678335870116, "grad_norm": 2.484375, "learning_rate": 1.4623164302609472e-06, "loss": 0.93528566, "memory(GiB)": 369.42, "step": 60130, "train_speed(iter/s)": 0.200521 }, { "acc": 0.74207258, "epoch": 1.5254946727549468, "grad_norm": 2.3125, "learning_rate": 1.4615754702652796e-06, "loss": 1.06559467, "memory(GiB)": 369.42, "step": 60135, "train_speed(iter/s)": 0.200522 }, { "acc": 0.76757617, "epoch": 1.5256215119228818, "grad_norm": 1.984375, "learning_rate": 1.4608346659053208e-06, "loss": 0.92353926, "memory(GiB)": 369.42, "step": 60140, "train_speed(iter/s)": 0.200525 }, { "acc": 0.75003195, "epoch": 1.525748351090817, "grad_norm": 2.109375, "learning_rate": 1.4600940172136541e-06, "loss": 0.98843966, "memory(GiB)": 369.42, "step": 60145, "train_speed(iter/s)": 0.200528 }, { "acc": 0.75531511, "epoch": 1.525875190258752, "grad_norm": 2.4375, "learning_rate": 1.4593535242228575e-06, "loss": 0.95945721, "memory(GiB)": 369.42, "step": 60150, "train_speed(iter/s)": 0.200531 }, { "acc": 0.74550934, "epoch": 1.526002029426687, "grad_norm": 3.640625, "learning_rate": 1.4586131869655001e-06, "loss": 0.98386841, "memory(GiB)": 369.42, "step": 60155, "train_speed(iter/s)": 0.200533 }, { "acc": 0.73985472, "epoch": 1.526128868594622, "grad_norm": 2.21875, "learning_rate": 1.4578730054741462e-06, "loss": 1.0363677, "memory(GiB)": 369.42, "step": 60160, "train_speed(iter/s)": 0.200534 }, { "acc": 0.74863873, "epoch": 1.526255707762557, "grad_norm": 2.234375, "learning_rate": 1.4571329797813511e-06, "loss": 0.97506075, "memory(GiB)": 369.42, "step": 60165, "train_speed(iter/s)": 0.200536 }, { "acc": 0.76534142, "epoch": 1.5263825469304921, "grad_norm": 2.390625, "learning_rate": 1.4563931099196678e-06, "loss": 0.92285805, "memory(GiB)": 369.42, "step": 60170, "train_speed(iter/s)": 0.200539 }, { "acc": 0.75136976, "epoch": 1.5265093860984273, "grad_norm": 2.078125, "learning_rate": 1.455653395921635e-06, "loss": 0.95753345, "memory(GiB)": 369.42, "step": 60175, "train_speed(iter/s)": 0.20054 }, { "acc": 0.75880737, "epoch": 1.5266362252663623, "grad_norm": 2.734375, "learning_rate": 1.4549138378197891e-06, "loss": 1.00951328, "memory(GiB)": 369.42, "step": 60180, "train_speed(iter/s)": 0.200541 }, { "acc": 0.7450388, "epoch": 1.5267630644342973, "grad_norm": 2.140625, "learning_rate": 1.4541744356466615e-06, "loss": 0.99101992, "memory(GiB)": 369.42, "step": 60185, "train_speed(iter/s)": 0.200544 }, { "acc": 0.74799471, "epoch": 1.5268899036022323, "grad_norm": 1.8359375, "learning_rate": 1.4534351894347748e-06, "loss": 0.91675568, "memory(GiB)": 369.42, "step": 60190, "train_speed(iter/s)": 0.200547 }, { "acc": 0.7560914, "epoch": 1.5270167427701673, "grad_norm": 2.21875, "learning_rate": 1.4526960992166412e-06, "loss": 0.97928944, "memory(GiB)": 369.42, "step": 60195, "train_speed(iter/s)": 0.200549 }, { "acc": 0.75593843, "epoch": 1.5271435819381025, "grad_norm": 1.9296875, "learning_rate": 1.4519571650247687e-06, "loss": 0.96324425, "memory(GiB)": 369.42, "step": 60200, "train_speed(iter/s)": 0.200552 }, { "acc": 0.75797672, "epoch": 1.5272704211060375, "grad_norm": 1.7890625, "learning_rate": 1.4512183868916629e-06, "loss": 0.98589344, "memory(GiB)": 369.42, "step": 60205, "train_speed(iter/s)": 0.200554 }, { "acc": 0.75779896, "epoch": 1.5273972602739727, "grad_norm": 2.234375, "learning_rate": 1.4504797648498186e-06, "loss": 0.99757042, "memory(GiB)": 369.42, "step": 60210, "train_speed(iter/s)": 0.200557 }, { "acc": 0.75882559, "epoch": 1.5275240994419077, "grad_norm": 2.109375, "learning_rate": 1.4497412989317184e-06, "loss": 0.94707584, "memory(GiB)": 369.42, "step": 60215, "train_speed(iter/s)": 0.20056 }, { "acc": 0.75503626, "epoch": 1.5276509386098427, "grad_norm": 2.1875, "learning_rate": 1.4490029891698476e-06, "loss": 1.01495514, "memory(GiB)": 369.42, "step": 60220, "train_speed(iter/s)": 0.200562 }, { "acc": 0.75047817, "epoch": 1.5277777777777777, "grad_norm": 2.109375, "learning_rate": 1.44826483559668e-06, "loss": 0.94109955, "memory(GiB)": 369.42, "step": 60225, "train_speed(iter/s)": 0.200565 }, { "acc": 0.76524091, "epoch": 1.527904616945713, "grad_norm": 2.09375, "learning_rate": 1.4475268382446833e-06, "loss": 0.93286676, "memory(GiB)": 369.42, "step": 60230, "train_speed(iter/s)": 0.200568 }, { "acc": 0.75304823, "epoch": 1.5280314561136479, "grad_norm": 2.296875, "learning_rate": 1.4467889971463144e-06, "loss": 0.96585569, "memory(GiB)": 369.42, "step": 60235, "train_speed(iter/s)": 0.200571 }, { "acc": 0.75737119, "epoch": 1.528158295281583, "grad_norm": 1.90625, "learning_rate": 1.4460513123340308e-06, "loss": 0.9676343, "memory(GiB)": 369.42, "step": 60240, "train_speed(iter/s)": 0.200574 }, { "acc": 0.76228614, "epoch": 1.528285134449518, "grad_norm": 2.46875, "learning_rate": 1.4453137838402775e-06, "loss": 0.97228184, "memory(GiB)": 369.42, "step": 60245, "train_speed(iter/s)": 0.200578 }, { "acc": 0.76379471, "epoch": 1.528411973617453, "grad_norm": 2.125, "learning_rate": 1.4445764116974948e-06, "loss": 0.94603176, "memory(GiB)": 369.42, "step": 60250, "train_speed(iter/s)": 0.200582 }, { "acc": 0.7566443, "epoch": 1.528538812785388, "grad_norm": 2.21875, "learning_rate": 1.4438391959381149e-06, "loss": 0.95265579, "memory(GiB)": 369.42, "step": 60255, "train_speed(iter/s)": 0.200585 }, { "acc": 0.75157776, "epoch": 1.528665651953323, "grad_norm": 2.1875, "learning_rate": 1.4431021365945647e-06, "loss": 0.97608566, "memory(GiB)": 369.42, "step": 60260, "train_speed(iter/s)": 0.200588 }, { "acc": 0.74274864, "epoch": 1.5287924911212583, "grad_norm": 2.21875, "learning_rate": 1.4423652336992627e-06, "loss": 0.99958534, "memory(GiB)": 369.42, "step": 60265, "train_speed(iter/s)": 0.200591 }, { "acc": 0.75607615, "epoch": 1.5289193302891935, "grad_norm": 2.59375, "learning_rate": 1.4416284872846215e-06, "loss": 0.98364315, "memory(GiB)": 369.42, "step": 60270, "train_speed(iter/s)": 0.200594 }, { "acc": 0.77124915, "epoch": 1.5290461694571285, "grad_norm": 2.21875, "learning_rate": 1.440891897383046e-06, "loss": 0.88488026, "memory(GiB)": 369.42, "step": 60275, "train_speed(iter/s)": 0.200597 }, { "acc": 0.76859241, "epoch": 1.5291730086250634, "grad_norm": 2.46875, "learning_rate": 1.4401554640269354e-06, "loss": 0.88463764, "memory(GiB)": 369.42, "step": 60280, "train_speed(iter/s)": 0.200598 }, { "acc": 0.74858394, "epoch": 1.5292998477929984, "grad_norm": 2.296875, "learning_rate": 1.4394191872486812e-06, "loss": 0.98398838, "memory(GiB)": 369.42, "step": 60285, "train_speed(iter/s)": 0.200598 }, { "acc": 0.74190078, "epoch": 1.5294266869609334, "grad_norm": 2.421875, "learning_rate": 1.4386830670806684e-06, "loss": 1.02483273, "memory(GiB)": 369.42, "step": 60290, "train_speed(iter/s)": 0.200601 }, { "acc": 0.7567174, "epoch": 1.5295535261288686, "grad_norm": 2.171875, "learning_rate": 1.4379471035552738e-06, "loss": 0.96410151, "memory(GiB)": 369.42, "step": 60295, "train_speed(iter/s)": 0.200602 }, { "acc": 0.75490637, "epoch": 1.5296803652968036, "grad_norm": 2.3125, "learning_rate": 1.437211296704869e-06, "loss": 1.00773106, "memory(GiB)": 369.42, "step": 60300, "train_speed(iter/s)": 0.200606 }, { "acc": 0.74808512, "epoch": 1.5298072044647388, "grad_norm": 1.8359375, "learning_rate": 1.4364756465618167e-06, "loss": 1.02131195, "memory(GiB)": 369.42, "step": 60305, "train_speed(iter/s)": 0.200608 }, { "acc": 0.76010494, "epoch": 1.5299340436326738, "grad_norm": 2.140625, "learning_rate": 1.4357401531584792e-06, "loss": 0.94486485, "memory(GiB)": 369.42, "step": 60310, "train_speed(iter/s)": 0.200611 }, { "acc": 0.75862684, "epoch": 1.5300608828006088, "grad_norm": 2.25, "learning_rate": 1.4350048165272006e-06, "loss": 0.93054247, "memory(GiB)": 369.42, "step": 60315, "train_speed(iter/s)": 0.200614 }, { "acc": 0.74572239, "epoch": 1.5301877219685438, "grad_norm": 2.53125, "learning_rate": 1.4342696367003272e-06, "loss": 1.03017569, "memory(GiB)": 369.42, "step": 60320, "train_speed(iter/s)": 0.200617 }, { "acc": 0.75368257, "epoch": 1.5303145611364788, "grad_norm": 2.578125, "learning_rate": 1.433534613710193e-06, "loss": 0.94918346, "memory(GiB)": 369.42, "step": 60325, "train_speed(iter/s)": 0.20062 }, { "acc": 0.75458636, "epoch": 1.530441400304414, "grad_norm": 2.1875, "learning_rate": 1.4327997475891331e-06, "loss": 1.01875706, "memory(GiB)": 369.42, "step": 60330, "train_speed(iter/s)": 0.200622 }, { "acc": 0.7626811, "epoch": 1.5305682394723492, "grad_norm": 2.171875, "learning_rate": 1.432065038369465e-06, "loss": 0.94787741, "memory(GiB)": 369.42, "step": 60335, "train_speed(iter/s)": 0.200624 }, { "acc": 0.76306114, "epoch": 1.5306950786402842, "grad_norm": 1.9921875, "learning_rate": 1.4313304860835048e-06, "loss": 0.91970892, "memory(GiB)": 369.42, "step": 60340, "train_speed(iter/s)": 0.200627 }, { "acc": 0.74257669, "epoch": 1.5308219178082192, "grad_norm": 2.109375, "learning_rate": 1.4305960907635641e-06, "loss": 1.01169672, "memory(GiB)": 369.42, "step": 60345, "train_speed(iter/s)": 0.20063 }, { "acc": 0.76177788, "epoch": 1.5309487569761542, "grad_norm": 2.0, "learning_rate": 1.4298618524419455e-06, "loss": 0.9848877, "memory(GiB)": 369.42, "step": 60350, "train_speed(iter/s)": 0.200632 }, { "acc": 0.75054569, "epoch": 1.5310755961440892, "grad_norm": 2.046875, "learning_rate": 1.4291277711509388e-06, "loss": 1.01922455, "memory(GiB)": 369.42, "step": 60355, "train_speed(iter/s)": 0.200636 }, { "acc": 0.7630641, "epoch": 1.5312024353120244, "grad_norm": 2.03125, "learning_rate": 1.428393846922837e-06, "loss": 0.86773033, "memory(GiB)": 369.42, "step": 60360, "train_speed(iter/s)": 0.200637 }, { "acc": 0.76795125, "epoch": 1.5313292744799594, "grad_norm": 1.9375, "learning_rate": 1.4276600797899199e-06, "loss": 0.92821569, "memory(GiB)": 369.42, "step": 60365, "train_speed(iter/s)": 0.200641 }, { "acc": 0.75697079, "epoch": 1.5314561136478946, "grad_norm": 2.21875, "learning_rate": 1.426926469784463e-06, "loss": 0.93545189, "memory(GiB)": 369.42, "step": 60370, "train_speed(iter/s)": 0.200644 }, { "acc": 0.75309978, "epoch": 1.5315829528158296, "grad_norm": 2.421875, "learning_rate": 1.42619301693873e-06, "loss": 1.01114149, "memory(GiB)": 369.42, "step": 60375, "train_speed(iter/s)": 0.200648 }, { "acc": 0.75903172, "epoch": 1.5317097919837646, "grad_norm": 2.140625, "learning_rate": 1.4254597212849858e-06, "loss": 0.97217636, "memory(GiB)": 369.42, "step": 60380, "train_speed(iter/s)": 0.200651 }, { "acc": 0.74617443, "epoch": 1.5318366311516995, "grad_norm": 2.328125, "learning_rate": 1.4247265828554819e-06, "loss": 0.99483042, "memory(GiB)": 369.42, "step": 60385, "train_speed(iter/s)": 0.200653 }, { "acc": 0.74509335, "epoch": 1.5319634703196348, "grad_norm": 2.578125, "learning_rate": 1.423993601682465e-06, "loss": 1.01581745, "memory(GiB)": 369.42, "step": 60390, "train_speed(iter/s)": 0.200656 }, { "acc": 0.74519758, "epoch": 1.5320903094875697, "grad_norm": 2.109375, "learning_rate": 1.423260777798176e-06, "loss": 0.97786055, "memory(GiB)": 369.42, "step": 60395, "train_speed(iter/s)": 0.200659 }, { "acc": 0.74749279, "epoch": 1.532217148655505, "grad_norm": 2.6875, "learning_rate": 1.4225281112348466e-06, "loss": 1.02136917, "memory(GiB)": 369.42, "step": 60400, "train_speed(iter/s)": 0.200662 }, { "acc": 0.73952017, "epoch": 1.53234398782344, "grad_norm": 2.359375, "learning_rate": 1.421795602024703e-06, "loss": 1.04005051, "memory(GiB)": 369.42, "step": 60405, "train_speed(iter/s)": 0.200664 }, { "acc": 0.75897989, "epoch": 1.532470826991375, "grad_norm": 2.25, "learning_rate": 1.4210632501999643e-06, "loss": 0.98680468, "memory(GiB)": 369.42, "step": 60410, "train_speed(iter/s)": 0.200665 }, { "acc": 0.74801779, "epoch": 1.53259766615931, "grad_norm": 2.078125, "learning_rate": 1.4203310557928428e-06, "loss": 0.98121452, "memory(GiB)": 369.42, "step": 60415, "train_speed(iter/s)": 0.200668 }, { "acc": 0.73651361, "epoch": 1.532724505327245, "grad_norm": 2.21875, "learning_rate": 1.4195990188355435e-06, "loss": 1.07286301, "memory(GiB)": 369.42, "step": 60420, "train_speed(iter/s)": 0.200671 }, { "acc": 0.75606213, "epoch": 1.5328513444951801, "grad_norm": 2.6875, "learning_rate": 1.418867139360265e-06, "loss": 0.97037153, "memory(GiB)": 369.42, "step": 60425, "train_speed(iter/s)": 0.200673 }, { "acc": 0.76049223, "epoch": 1.5329781836631153, "grad_norm": 2.453125, "learning_rate": 1.418135417399198e-06, "loss": 0.96474819, "memory(GiB)": 369.42, "step": 60430, "train_speed(iter/s)": 0.200675 }, { "acc": 0.74202833, "epoch": 1.5331050228310503, "grad_norm": 2.703125, "learning_rate": 1.4174038529845273e-06, "loss": 1.06578484, "memory(GiB)": 369.42, "step": 60435, "train_speed(iter/s)": 0.200679 }, { "acc": 0.74522381, "epoch": 1.5332318619989853, "grad_norm": 2.453125, "learning_rate": 1.4166724461484304e-06, "loss": 1.01010408, "memory(GiB)": 369.42, "step": 60440, "train_speed(iter/s)": 0.200681 }, { "acc": 0.7428153, "epoch": 1.5333587011669203, "grad_norm": 1.828125, "learning_rate": 1.4159411969230758e-06, "loss": 1.03186264, "memory(GiB)": 369.42, "step": 60445, "train_speed(iter/s)": 0.200685 }, { "acc": 0.74824877, "epoch": 1.5334855403348553, "grad_norm": 1.953125, "learning_rate": 1.4152101053406325e-06, "loss": 0.95919132, "memory(GiB)": 369.42, "step": 60450, "train_speed(iter/s)": 0.200686 }, { "acc": 0.75634432, "epoch": 1.5336123795027905, "grad_norm": 2.0, "learning_rate": 1.4144791714332517e-06, "loss": 0.94557571, "memory(GiB)": 369.42, "step": 60455, "train_speed(iter/s)": 0.200688 }, { "acc": 0.7516407, "epoch": 1.5337392186707255, "grad_norm": 2.109375, "learning_rate": 1.4137483952330855e-06, "loss": 0.98852291, "memory(GiB)": 369.42, "step": 60460, "train_speed(iter/s)": 0.200689 }, { "acc": 0.76176023, "epoch": 1.5338660578386607, "grad_norm": 2.71875, "learning_rate": 1.4130177767722753e-06, "loss": 0.95988169, "memory(GiB)": 369.42, "step": 60465, "train_speed(iter/s)": 0.200691 }, { "acc": 0.75627561, "epoch": 1.5339928970065957, "grad_norm": 2.1875, "learning_rate": 1.4122873160829603e-06, "loss": 0.97512894, "memory(GiB)": 369.42, "step": 60470, "train_speed(iter/s)": 0.200692 }, { "acc": 0.74720516, "epoch": 1.5341197361745307, "grad_norm": 1.828125, "learning_rate": 1.4115570131972655e-06, "loss": 1.00542688, "memory(GiB)": 369.42, "step": 60475, "train_speed(iter/s)": 0.200695 }, { "acc": 0.75811491, "epoch": 1.5342465753424657, "grad_norm": 1.984375, "learning_rate": 1.4108268681473136e-06, "loss": 0.98266449, "memory(GiB)": 369.42, "step": 60480, "train_speed(iter/s)": 0.200697 }, { "acc": 0.75084844, "epoch": 1.5343734145104007, "grad_norm": 3.28125, "learning_rate": 1.4100968809652215e-06, "loss": 1.00983896, "memory(GiB)": 369.42, "step": 60485, "train_speed(iter/s)": 0.200698 }, { "acc": 0.74700069, "epoch": 1.5345002536783359, "grad_norm": 2.65625, "learning_rate": 1.4093670516830982e-06, "loss": 1.00649052, "memory(GiB)": 369.42, "step": 60490, "train_speed(iter/s)": 0.200701 }, { "acc": 0.74544191, "epoch": 1.534627092846271, "grad_norm": 2.28125, "learning_rate": 1.4086373803330417e-06, "loss": 1.01832256, "memory(GiB)": 369.42, "step": 60495, "train_speed(iter/s)": 0.200703 }, { "acc": 0.74579978, "epoch": 1.534753932014206, "grad_norm": 2.28125, "learning_rate": 1.4079078669471457e-06, "loss": 0.98527756, "memory(GiB)": 369.42, "step": 60500, "train_speed(iter/s)": 0.200705 }, { "acc": 0.76158595, "epoch": 1.534880771182141, "grad_norm": 2.203125, "learning_rate": 1.4071785115575005e-06, "loss": 0.9348917, "memory(GiB)": 369.42, "step": 60505, "train_speed(iter/s)": 0.200708 }, { "acc": 0.76131029, "epoch": 1.535007610350076, "grad_norm": 2.109375, "learning_rate": 1.4064493141961872e-06, "loss": 0.91261492, "memory(GiB)": 369.42, "step": 60510, "train_speed(iter/s)": 0.200708 }, { "acc": 0.76336412, "epoch": 1.535134449518011, "grad_norm": 2.546875, "learning_rate": 1.4057202748952736e-06, "loss": 0.94866819, "memory(GiB)": 369.42, "step": 60515, "train_speed(iter/s)": 0.200711 }, { "acc": 0.75189862, "epoch": 1.5352612886859462, "grad_norm": 2.609375, "learning_rate": 1.4049913936868314e-06, "loss": 0.96162424, "memory(GiB)": 369.42, "step": 60520, "train_speed(iter/s)": 0.200714 }, { "acc": 0.7412034, "epoch": 1.5353881278538812, "grad_norm": 2.34375, "learning_rate": 1.4042626706029184e-06, "loss": 1.01254215, "memory(GiB)": 369.42, "step": 60525, "train_speed(iter/s)": 0.200717 }, { "acc": 0.72873192, "epoch": 1.5355149670218164, "grad_norm": 2.3125, "learning_rate": 1.4035341056755864e-06, "loss": 1.05253429, "memory(GiB)": 369.42, "step": 60530, "train_speed(iter/s)": 0.200718 }, { "acc": 0.76657877, "epoch": 1.5356418061897514, "grad_norm": 2.71875, "learning_rate": 1.402805698936882e-06, "loss": 0.94662094, "memory(GiB)": 369.42, "step": 60535, "train_speed(iter/s)": 0.200721 }, { "acc": 0.7486208, "epoch": 1.5357686453576864, "grad_norm": 2.0625, "learning_rate": 1.4020774504188428e-06, "loss": 1.01966248, "memory(GiB)": 369.42, "step": 60540, "train_speed(iter/s)": 0.200723 }, { "acc": 0.7505682, "epoch": 1.5358954845256214, "grad_norm": 2.265625, "learning_rate": 1.4013493601535016e-06, "loss": 1.01525421, "memory(GiB)": 369.42, "step": 60545, "train_speed(iter/s)": 0.200725 }, { "acc": 0.74374857, "epoch": 1.5360223236935566, "grad_norm": 2.984375, "learning_rate": 1.400621428172882e-06, "loss": 1.06155987, "memory(GiB)": 369.42, "step": 60550, "train_speed(iter/s)": 0.200727 }, { "acc": 0.75203476, "epoch": 1.5361491628614916, "grad_norm": 2.515625, "learning_rate": 1.399893654509002e-06, "loss": 1.01347942, "memory(GiB)": 369.42, "step": 60555, "train_speed(iter/s)": 0.200729 }, { "acc": 0.75256319, "epoch": 1.5362760020294268, "grad_norm": 2.0, "learning_rate": 1.3991660391938721e-06, "loss": 0.98665619, "memory(GiB)": 369.42, "step": 60560, "train_speed(iter/s)": 0.200733 }, { "acc": 0.75031123, "epoch": 1.5364028411973618, "grad_norm": 2.390625, "learning_rate": 1.398438582259497e-06, "loss": 1.00546103, "memory(GiB)": 369.42, "step": 60565, "train_speed(iter/s)": 0.200735 }, { "acc": 0.74242182, "epoch": 1.5365296803652968, "grad_norm": 2.234375, "learning_rate": 1.3977112837378726e-06, "loss": 0.99081211, "memory(GiB)": 369.42, "step": 60570, "train_speed(iter/s)": 0.200736 }, { "acc": 0.73970156, "epoch": 1.5366565195332318, "grad_norm": 2.109375, "learning_rate": 1.3969841436609888e-06, "loss": 1.02352581, "memory(GiB)": 369.42, "step": 60575, "train_speed(iter/s)": 0.200739 }, { "acc": 0.75025682, "epoch": 1.5367833587011668, "grad_norm": 1.9375, "learning_rate": 1.396257162060829e-06, "loss": 0.95478401, "memory(GiB)": 369.42, "step": 60580, "train_speed(iter/s)": 0.200741 }, { "acc": 0.74189129, "epoch": 1.536910197869102, "grad_norm": 1.8125, "learning_rate": 1.395530338969367e-06, "loss": 1.03002434, "memory(GiB)": 369.42, "step": 60585, "train_speed(iter/s)": 0.200744 }, { "acc": 0.74759464, "epoch": 1.5370370370370372, "grad_norm": 2.25, "learning_rate": 1.3948036744185767e-06, "loss": 0.96787586, "memory(GiB)": 369.42, "step": 60590, "train_speed(iter/s)": 0.200747 }, { "acc": 0.7540451, "epoch": 1.5371638762049722, "grad_norm": 2.484375, "learning_rate": 1.3940771684404153e-06, "loss": 0.94461193, "memory(GiB)": 369.42, "step": 60595, "train_speed(iter/s)": 0.20075 }, { "acc": 0.76063781, "epoch": 1.5372907153729072, "grad_norm": 1.8671875, "learning_rate": 1.393350821066839e-06, "loss": 0.89607239, "memory(GiB)": 369.42, "step": 60600, "train_speed(iter/s)": 0.200751 }, { "acc": 0.7660181, "epoch": 1.5374175545408422, "grad_norm": 2.15625, "learning_rate": 1.3926246323297948e-06, "loss": 0.96952381, "memory(GiB)": 369.42, "step": 60605, "train_speed(iter/s)": 0.20075 }, { "acc": 0.75757518, "epoch": 1.5375443937087772, "grad_norm": 2.203125, "learning_rate": 1.3918986022612285e-06, "loss": 0.93901691, "memory(GiB)": 369.42, "step": 60610, "train_speed(iter/s)": 0.200752 }, { "acc": 0.75729256, "epoch": 1.5376712328767124, "grad_norm": 2.234375, "learning_rate": 1.3911727308930684e-06, "loss": 0.97916336, "memory(GiB)": 369.42, "step": 60615, "train_speed(iter/s)": 0.200753 }, { "acc": 0.75100574, "epoch": 1.5377980720446474, "grad_norm": 2.015625, "learning_rate": 1.3904470182572428e-06, "loss": 1.00283289, "memory(GiB)": 369.42, "step": 60620, "train_speed(iter/s)": 0.200755 }, { "acc": 0.74635725, "epoch": 1.5379249112125826, "grad_norm": 2.359375, "learning_rate": 1.3897214643856744e-06, "loss": 1.07257986, "memory(GiB)": 369.42, "step": 60625, "train_speed(iter/s)": 0.200757 }, { "acc": 0.7663228, "epoch": 1.5380517503805176, "grad_norm": 2.359375, "learning_rate": 1.388996069310276e-06, "loss": 0.95357933, "memory(GiB)": 369.42, "step": 60630, "train_speed(iter/s)": 0.20076 }, { "acc": 0.75048866, "epoch": 1.5381785895484525, "grad_norm": 2.25, "learning_rate": 1.3882708330629514e-06, "loss": 1.04183264, "memory(GiB)": 369.42, "step": 60635, "train_speed(iter/s)": 0.200763 }, { "acc": 0.74248915, "epoch": 1.5383054287163875, "grad_norm": 2.09375, "learning_rate": 1.3875457556755989e-06, "loss": 1.02874489, "memory(GiB)": 369.42, "step": 60640, "train_speed(iter/s)": 0.200766 }, { "acc": 0.75165091, "epoch": 1.5384322678843225, "grad_norm": 2.296875, "learning_rate": 1.386820837180114e-06, "loss": 0.97092133, "memory(GiB)": 369.42, "step": 60645, "train_speed(iter/s)": 0.200769 }, { "acc": 0.75659342, "epoch": 1.5385591070522577, "grad_norm": 2.3125, "learning_rate": 1.386096077608382e-06, "loss": 0.93328514, "memory(GiB)": 369.42, "step": 60650, "train_speed(iter/s)": 0.200771 }, { "acc": 0.75728397, "epoch": 1.538685946220193, "grad_norm": 2.0625, "learning_rate": 1.385371476992276e-06, "loss": 0.94976177, "memory(GiB)": 369.42, "step": 60655, "train_speed(iter/s)": 0.200773 }, { "acc": 0.73885484, "epoch": 1.538812785388128, "grad_norm": 2.359375, "learning_rate": 1.3846470353636726e-06, "loss": 1.01770287, "memory(GiB)": 369.42, "step": 60660, "train_speed(iter/s)": 0.200777 }, { "acc": 0.74587679, "epoch": 1.538939624556063, "grad_norm": 1.8359375, "learning_rate": 1.3839227527544336e-06, "loss": 0.97347431, "memory(GiB)": 369.42, "step": 60665, "train_speed(iter/s)": 0.20078 }, { "acc": 0.76020689, "epoch": 1.539066463723998, "grad_norm": 1.859375, "learning_rate": 1.3831986291964184e-06, "loss": 0.95799999, "memory(GiB)": 369.42, "step": 60670, "train_speed(iter/s)": 0.200782 }, { "acc": 0.74898338, "epoch": 1.539193302891933, "grad_norm": 2.296875, "learning_rate": 1.382474664721472e-06, "loss": 1.02519798, "memory(GiB)": 369.42, "step": 60675, "train_speed(iter/s)": 0.200784 }, { "acc": 0.76696577, "epoch": 1.5393201420598681, "grad_norm": 2.09375, "learning_rate": 1.3817508593614425e-06, "loss": 0.88427286, "memory(GiB)": 369.42, "step": 60680, "train_speed(iter/s)": 0.200787 }, { "acc": 0.75337906, "epoch": 1.539446981227803, "grad_norm": 2.0625, "learning_rate": 1.381027213148165e-06, "loss": 1.00215168, "memory(GiB)": 369.42, "step": 60685, "train_speed(iter/s)": 0.20079 }, { "acc": 0.74809055, "epoch": 1.5395738203957383, "grad_norm": 2.359375, "learning_rate": 1.3803037261134678e-06, "loss": 0.97019835, "memory(GiB)": 369.42, "step": 60690, "train_speed(iter/s)": 0.200793 }, { "acc": 0.75836906, "epoch": 1.5397006595636733, "grad_norm": 2.390625, "learning_rate": 1.3795803982891736e-06, "loss": 1.00434074, "memory(GiB)": 369.42, "step": 60695, "train_speed(iter/s)": 0.200795 }, { "acc": 0.7507267, "epoch": 1.5398274987316083, "grad_norm": 2.140625, "learning_rate": 1.3788572297070974e-06, "loss": 0.94860191, "memory(GiB)": 369.42, "step": 60700, "train_speed(iter/s)": 0.200797 }, { "acc": 0.74913845, "epoch": 1.5399543378995433, "grad_norm": 1.8359375, "learning_rate": 1.3781342203990478e-06, "loss": 0.98818626, "memory(GiB)": 369.42, "step": 60705, "train_speed(iter/s)": 0.200801 }, { "acc": 0.74928665, "epoch": 1.5400811770674785, "grad_norm": 2.515625, "learning_rate": 1.3774113703968255e-06, "loss": 1.01211586, "memory(GiB)": 369.42, "step": 60710, "train_speed(iter/s)": 0.200804 }, { "acc": 0.75094538, "epoch": 1.5402080162354135, "grad_norm": 2.109375, "learning_rate": 1.3766886797322248e-06, "loss": 0.9778265, "memory(GiB)": 369.42, "step": 60715, "train_speed(iter/s)": 0.200807 }, { "acc": 0.76479044, "epoch": 1.5403348554033487, "grad_norm": 1.5390625, "learning_rate": 1.3759661484370324e-06, "loss": 0.91748018, "memory(GiB)": 369.42, "step": 60720, "train_speed(iter/s)": 0.200809 }, { "acc": 0.76514149, "epoch": 1.5404616945712837, "grad_norm": 2.359375, "learning_rate": 1.3752437765430294e-06, "loss": 0.96334209, "memory(GiB)": 369.42, "step": 60725, "train_speed(iter/s)": 0.200812 }, { "acc": 0.74919138, "epoch": 1.5405885337392187, "grad_norm": 2.03125, "learning_rate": 1.3745215640819886e-06, "loss": 1.00165691, "memory(GiB)": 369.42, "step": 60730, "train_speed(iter/s)": 0.200815 }, { "acc": 0.76113558, "epoch": 1.5407153729071537, "grad_norm": 2.34375, "learning_rate": 1.373799511085676e-06, "loss": 0.93581543, "memory(GiB)": 369.42, "step": 60735, "train_speed(iter/s)": 0.200817 }, { "acc": 0.74517879, "epoch": 1.5408422120750886, "grad_norm": 2.15625, "learning_rate": 1.3730776175858506e-06, "loss": 1.01072006, "memory(GiB)": 369.42, "step": 60740, "train_speed(iter/s)": 0.200819 }, { "acc": 0.76708193, "epoch": 1.5409690512430239, "grad_norm": 2.234375, "learning_rate": 1.3723558836142631e-06, "loss": 0.95039177, "memory(GiB)": 369.42, "step": 60745, "train_speed(iter/s)": 0.200821 }, { "acc": 0.75742226, "epoch": 1.541095890410959, "grad_norm": 1.734375, "learning_rate": 1.371634309202663e-06, "loss": 0.94465294, "memory(GiB)": 369.42, "step": 60750, "train_speed(iter/s)": 0.200825 }, { "acc": 0.74925785, "epoch": 1.541222729578894, "grad_norm": 2.390625, "learning_rate": 1.3709128943827842e-06, "loss": 0.97220354, "memory(GiB)": 369.42, "step": 60755, "train_speed(iter/s)": 0.200828 }, { "acc": 0.73982348, "epoch": 1.541349568746829, "grad_norm": 2.375, "learning_rate": 1.3701916391863573e-06, "loss": 1.00606031, "memory(GiB)": 369.42, "step": 60760, "train_speed(iter/s)": 0.200832 }, { "acc": 0.73849134, "epoch": 1.541476407914764, "grad_norm": 2.546875, "learning_rate": 1.3694705436451093e-06, "loss": 1.0231658, "memory(GiB)": 369.42, "step": 60765, "train_speed(iter/s)": 0.200833 }, { "acc": 0.76219583, "epoch": 1.541603247082699, "grad_norm": 2.234375, "learning_rate": 1.368749607790758e-06, "loss": 0.92472801, "memory(GiB)": 369.42, "step": 60770, "train_speed(iter/s)": 0.200836 }, { "acc": 0.75675511, "epoch": 1.5417300862506342, "grad_norm": 2.171875, "learning_rate": 1.3680288316550095e-06, "loss": 0.93398104, "memory(GiB)": 369.42, "step": 60775, "train_speed(iter/s)": 0.200839 }, { "acc": 0.74469433, "epoch": 1.5418569254185692, "grad_norm": 2.71875, "learning_rate": 1.3673082152695672e-06, "loss": 1.02382221, "memory(GiB)": 369.42, "step": 60780, "train_speed(iter/s)": 0.200842 }, { "acc": 0.73008204, "epoch": 1.5419837645865044, "grad_norm": 1.984375, "learning_rate": 1.3665877586661296e-06, "loss": 1.04543915, "memory(GiB)": 369.42, "step": 60785, "train_speed(iter/s)": 0.200843 }, { "acc": 0.75996256, "epoch": 1.5421106037544394, "grad_norm": 2.0, "learning_rate": 1.3658674618763862e-06, "loss": 0.94133768, "memory(GiB)": 369.42, "step": 60790, "train_speed(iter/s)": 0.200845 }, { "acc": 0.74124212, "epoch": 1.5422374429223744, "grad_norm": 2.328125, "learning_rate": 1.365147324932014e-06, "loss": 0.98146324, "memory(GiB)": 369.42, "step": 60795, "train_speed(iter/s)": 0.200846 }, { "acc": 0.74345341, "epoch": 1.5423642820903094, "grad_norm": 2.546875, "learning_rate": 1.3644273478646925e-06, "loss": 1.01802473, "memory(GiB)": 369.42, "step": 60800, "train_speed(iter/s)": 0.200849 }, { "acc": 0.75004964, "epoch": 1.5424911212582444, "grad_norm": 2.359375, "learning_rate": 1.3637075307060877e-06, "loss": 1.0390686, "memory(GiB)": 369.42, "step": 60805, "train_speed(iter/s)": 0.200851 }, { "acc": 0.75560384, "epoch": 1.5426179604261796, "grad_norm": 1.796875, "learning_rate": 1.362987873487862e-06, "loss": 0.98225012, "memory(GiB)": 369.42, "step": 60810, "train_speed(iter/s)": 0.200854 }, { "acc": 0.74744005, "epoch": 1.5427447995941148, "grad_norm": 2.171875, "learning_rate": 1.362268376241665e-06, "loss": 0.96369982, "memory(GiB)": 369.42, "step": 60815, "train_speed(iter/s)": 0.200856 }, { "acc": 0.72533665, "epoch": 1.5428716387620498, "grad_norm": 2.890625, "learning_rate": 1.3615490389991476e-06, "loss": 1.08754692, "memory(GiB)": 369.42, "step": 60820, "train_speed(iter/s)": 0.200858 }, { "acc": 0.75736494, "epoch": 1.5429984779299848, "grad_norm": 2.0625, "learning_rate": 1.360829861791948e-06, "loss": 0.96432991, "memory(GiB)": 369.42, "step": 60825, "train_speed(iter/s)": 0.200861 }, { "acc": 0.74689808, "epoch": 1.5431253170979198, "grad_norm": 2.34375, "learning_rate": 1.3601108446516985e-06, "loss": 0.9807683, "memory(GiB)": 369.42, "step": 60830, "train_speed(iter/s)": 0.200863 }, { "acc": 0.7623661, "epoch": 1.5432521562658548, "grad_norm": 2.234375, "learning_rate": 1.3593919876100254e-06, "loss": 0.93714142, "memory(GiB)": 369.42, "step": 60835, "train_speed(iter/s)": 0.200866 }, { "acc": 0.74785643, "epoch": 1.54337899543379, "grad_norm": 2.265625, "learning_rate": 1.3586732906985467e-06, "loss": 1.0046669, "memory(GiB)": 369.42, "step": 60840, "train_speed(iter/s)": 0.200869 }, { "acc": 0.75187931, "epoch": 1.543505834601725, "grad_norm": 2.203125, "learning_rate": 1.357954753948874e-06, "loss": 0.9617712, "memory(GiB)": 369.42, "step": 60845, "train_speed(iter/s)": 0.200872 }, { "acc": 0.77003894, "epoch": 1.5436326737696602, "grad_norm": 2.078125, "learning_rate": 1.3572363773926117e-06, "loss": 0.94211006, "memory(GiB)": 369.42, "step": 60850, "train_speed(iter/s)": 0.200873 }, { "acc": 0.75740051, "epoch": 1.5437595129375952, "grad_norm": 2.515625, "learning_rate": 1.3565181610613571e-06, "loss": 0.97285233, "memory(GiB)": 369.42, "step": 60855, "train_speed(iter/s)": 0.200874 }, { "acc": 0.75190425, "epoch": 1.5438863521055302, "grad_norm": 2.359375, "learning_rate": 1.3558001049867008e-06, "loss": 0.99402599, "memory(GiB)": 369.42, "step": 60860, "train_speed(iter/s)": 0.200877 }, { "acc": 0.76067719, "epoch": 1.5440131912734651, "grad_norm": 2.015625, "learning_rate": 1.3550822092002264e-06, "loss": 0.9749033, "memory(GiB)": 369.42, "step": 60865, "train_speed(iter/s)": 0.20088 }, { "acc": 0.7663887, "epoch": 1.5441400304414004, "grad_norm": 1.8125, "learning_rate": 1.3543644737335099e-06, "loss": 0.90934849, "memory(GiB)": 369.42, "step": 60870, "train_speed(iter/s)": 0.20088 }, { "acc": 0.73678713, "epoch": 1.5442668696093353, "grad_norm": 2.390625, "learning_rate": 1.35364689861812e-06, "loss": 1.04421463, "memory(GiB)": 369.42, "step": 60875, "train_speed(iter/s)": 0.200884 }, { "acc": 0.76307526, "epoch": 1.5443937087772706, "grad_norm": 2.375, "learning_rate": 1.3529294838856194e-06, "loss": 0.97604599, "memory(GiB)": 369.42, "step": 60880, "train_speed(iter/s)": 0.200887 }, { "acc": 0.74818325, "epoch": 1.5445205479452055, "grad_norm": 2.21875, "learning_rate": 1.3522122295675616e-06, "loss": 0.97321119, "memory(GiB)": 369.42, "step": 60885, "train_speed(iter/s)": 0.20089 }, { "acc": 0.7527544, "epoch": 1.5446473871131405, "grad_norm": 2.609375, "learning_rate": 1.351495135695499e-06, "loss": 0.98043804, "memory(GiB)": 369.42, "step": 60890, "train_speed(iter/s)": 0.200893 }, { "acc": 0.75038519, "epoch": 1.5447742262810755, "grad_norm": 1.8984375, "learning_rate": 1.3507782023009692e-06, "loss": 0.98143654, "memory(GiB)": 369.42, "step": 60895, "train_speed(iter/s)": 0.200894 }, { "acc": 0.74814167, "epoch": 1.5449010654490105, "grad_norm": 2.09375, "learning_rate": 1.3500614294155056e-06, "loss": 0.97935371, "memory(GiB)": 369.42, "step": 60900, "train_speed(iter/s)": 0.200895 }, { "acc": 0.75799675, "epoch": 1.5450279046169457, "grad_norm": 2.109375, "learning_rate": 1.3493448170706347e-06, "loss": 0.98648357, "memory(GiB)": 369.42, "step": 60905, "train_speed(iter/s)": 0.200896 }, { "acc": 0.75448608, "epoch": 1.545154743784881, "grad_norm": 1.984375, "learning_rate": 1.348628365297881e-06, "loss": 1.02672367, "memory(GiB)": 369.42, "step": 60910, "train_speed(iter/s)": 0.200899 }, { "acc": 0.74797859, "epoch": 1.545281582952816, "grad_norm": 2.0625, "learning_rate": 1.3479120741287526e-06, "loss": 0.96805305, "memory(GiB)": 369.42, "step": 60915, "train_speed(iter/s)": 0.200902 }, { "acc": 0.75172749, "epoch": 1.545408422120751, "grad_norm": 1.96875, "learning_rate": 1.3471959435947552e-06, "loss": 1.03360958, "memory(GiB)": 369.42, "step": 60920, "train_speed(iter/s)": 0.200904 }, { "acc": 0.73998394, "epoch": 1.545535261288686, "grad_norm": 2.15625, "learning_rate": 1.3464799737273898e-06, "loss": 1.01746998, "memory(GiB)": 369.42, "step": 60925, "train_speed(iter/s)": 0.200905 }, { "acc": 0.750704, "epoch": 1.545662100456621, "grad_norm": 2.515625, "learning_rate": 1.3457641645581487e-06, "loss": 0.99029999, "memory(GiB)": 369.42, "step": 60930, "train_speed(iter/s)": 0.200909 }, { "acc": 0.73973374, "epoch": 1.545788939624556, "grad_norm": 2.28125, "learning_rate": 1.3450485161185133e-06, "loss": 1.03114529, "memory(GiB)": 369.42, "step": 60935, "train_speed(iter/s)": 0.20091 }, { "acc": 0.75258555, "epoch": 1.545915778792491, "grad_norm": 2.765625, "learning_rate": 1.344333028439961e-06, "loss": 0.96073093, "memory(GiB)": 369.42, "step": 60940, "train_speed(iter/s)": 0.200913 }, { "acc": 0.76620817, "epoch": 1.5460426179604263, "grad_norm": 2.40625, "learning_rate": 1.3436177015539647e-06, "loss": 0.94156494, "memory(GiB)": 369.42, "step": 60945, "train_speed(iter/s)": 0.200914 }, { "acc": 0.77045913, "epoch": 1.5461694571283613, "grad_norm": 2.203125, "learning_rate": 1.3429025354919877e-06, "loss": 0.94006071, "memory(GiB)": 369.42, "step": 60950, "train_speed(iter/s)": 0.200916 }, { "acc": 0.74224424, "epoch": 1.5462962962962963, "grad_norm": 2.1875, "learning_rate": 1.3421875302854826e-06, "loss": 1.00845299, "memory(GiB)": 369.42, "step": 60955, "train_speed(iter/s)": 0.20092 }, { "acc": 0.74390292, "epoch": 1.5464231354642313, "grad_norm": 2.109375, "learning_rate": 1.3414726859659016e-06, "loss": 0.96057587, "memory(GiB)": 369.42, "step": 60960, "train_speed(iter/s)": 0.200923 }, { "acc": 0.74178877, "epoch": 1.5465499746321663, "grad_norm": 2.125, "learning_rate": 1.3407580025646866e-06, "loss": 1.02656021, "memory(GiB)": 369.42, "step": 60965, "train_speed(iter/s)": 0.200925 }, { "acc": 0.75639992, "epoch": 1.5466768138001015, "grad_norm": 2.765625, "learning_rate": 1.3400434801132716e-06, "loss": 1.00917492, "memory(GiB)": 369.42, "step": 60970, "train_speed(iter/s)": 0.200928 }, { "acc": 0.74601421, "epoch": 1.5468036529680367, "grad_norm": 2.59375, "learning_rate": 1.3393291186430852e-06, "loss": 0.97277889, "memory(GiB)": 369.42, "step": 60975, "train_speed(iter/s)": 0.20093 }, { "acc": 0.7416564, "epoch": 1.5469304921359717, "grad_norm": 3.140625, "learning_rate": 1.338614918185548e-06, "loss": 1.00755653, "memory(GiB)": 369.42, "step": 60980, "train_speed(iter/s)": 0.200933 }, { "acc": 0.75258489, "epoch": 1.5470573313039067, "grad_norm": 1.9765625, "learning_rate": 1.3379008787720732e-06, "loss": 1.00158358, "memory(GiB)": 369.42, "step": 60985, "train_speed(iter/s)": 0.200934 }, { "acc": 0.75903664, "epoch": 1.5471841704718416, "grad_norm": 2.28125, "learning_rate": 1.3371870004340681e-06, "loss": 1.02068748, "memory(GiB)": 369.42, "step": 60990, "train_speed(iter/s)": 0.200937 }, { "acc": 0.75959191, "epoch": 1.5473110096397766, "grad_norm": 2.140625, "learning_rate": 1.3364732832029315e-06, "loss": 0.95933037, "memory(GiB)": 369.42, "step": 60995, "train_speed(iter/s)": 0.200939 }, { "acc": 0.76168113, "epoch": 1.5474378488077118, "grad_norm": 2.15625, "learning_rate": 1.335759727110057e-06, "loss": 0.94601574, "memory(GiB)": 369.42, "step": 61000, "train_speed(iter/s)": 0.200943 }, { "epoch": 1.5474378488077118, "eval_acc": 0.7380210446104466, "eval_loss": 0.9695082306861877, "eval_runtime": 384.1921, "eval_samples_per_second": 16.58, "eval_steps_per_second": 8.29, "step": 61000 }, { "acc": 0.75061674, "epoch": 1.5475646879756468, "grad_norm": 2.765625, "learning_rate": 1.335046332186829e-06, "loss": 1.02164841, "memory(GiB)": 369.42, "step": 61005, "train_speed(iter/s)": 0.200474 }, { "acc": 0.75559177, "epoch": 1.547691527143582, "grad_norm": 2.3125, "learning_rate": 1.3343330984646262e-06, "loss": 0.95766859, "memory(GiB)": 369.42, "step": 61010, "train_speed(iter/s)": 0.200477 }, { "acc": 0.76546793, "epoch": 1.547818366311517, "grad_norm": 2.078125, "learning_rate": 1.33362002597482e-06, "loss": 0.98544445, "memory(GiB)": 369.42, "step": 61015, "train_speed(iter/s)": 0.200479 }, { "acc": 0.74502459, "epoch": 1.547945205479452, "grad_norm": 2.09375, "learning_rate": 1.3329071147487743e-06, "loss": 0.9867672, "memory(GiB)": 369.42, "step": 61020, "train_speed(iter/s)": 0.20048 }, { "acc": 0.75563669, "epoch": 1.548072044647387, "grad_norm": 2.234375, "learning_rate": 1.3321943648178442e-06, "loss": 0.93115063, "memory(GiB)": 369.42, "step": 61025, "train_speed(iter/s)": 0.200483 }, { "acc": 0.74853606, "epoch": 1.5481988838153222, "grad_norm": 1.9609375, "learning_rate": 1.3314817762133848e-06, "loss": 0.99733105, "memory(GiB)": 369.42, "step": 61030, "train_speed(iter/s)": 0.200485 }, { "acc": 0.75589819, "epoch": 1.5483257229832572, "grad_norm": 2.234375, "learning_rate": 1.330769348966734e-06, "loss": 1.01866703, "memory(GiB)": 369.42, "step": 61035, "train_speed(iter/s)": 0.200487 }, { "acc": 0.75719066, "epoch": 1.5484525621511924, "grad_norm": 2.21875, "learning_rate": 1.3300570831092292e-06, "loss": 0.97826366, "memory(GiB)": 369.42, "step": 61040, "train_speed(iter/s)": 0.200486 }, { "acc": 0.758603, "epoch": 1.5485794013191274, "grad_norm": 1.9140625, "learning_rate": 1.3293449786721973e-06, "loss": 0.92966795, "memory(GiB)": 369.42, "step": 61045, "train_speed(iter/s)": 0.200488 }, { "acc": 0.74274611, "epoch": 1.5487062404870624, "grad_norm": 2.296875, "learning_rate": 1.3286330356869648e-06, "loss": 1.01605911, "memory(GiB)": 369.42, "step": 61050, "train_speed(iter/s)": 0.20049 }, { "acc": 0.73620596, "epoch": 1.5488330796549974, "grad_norm": 2.234375, "learning_rate": 1.3279212541848413e-06, "loss": 1.04619875, "memory(GiB)": 369.42, "step": 61055, "train_speed(iter/s)": 0.200493 }, { "acc": 0.75078363, "epoch": 1.5489599188229324, "grad_norm": 2.640625, "learning_rate": 1.3272096341971342e-06, "loss": 1.00829945, "memory(GiB)": 369.42, "step": 61060, "train_speed(iter/s)": 0.200497 }, { "acc": 0.7436018, "epoch": 1.5490867579908676, "grad_norm": 2.390625, "learning_rate": 1.326498175755147e-06, "loss": 1.00143623, "memory(GiB)": 369.42, "step": 61065, "train_speed(iter/s)": 0.200499 }, { "acc": 0.74738998, "epoch": 1.5492135971588028, "grad_norm": 2.953125, "learning_rate": 1.3257868788901722e-06, "loss": 1.02815399, "memory(GiB)": 369.42, "step": 61070, "train_speed(iter/s)": 0.200501 }, { "acc": 0.75504251, "epoch": 1.5493404363267378, "grad_norm": 2.09375, "learning_rate": 1.3250757436334932e-06, "loss": 0.95357494, "memory(GiB)": 369.42, "step": 61075, "train_speed(iter/s)": 0.200503 }, { "acc": 0.75551462, "epoch": 1.5494672754946728, "grad_norm": 2.265625, "learning_rate": 1.3243647700163887e-06, "loss": 0.93908501, "memory(GiB)": 369.42, "step": 61080, "train_speed(iter/s)": 0.200505 }, { "acc": 0.75073876, "epoch": 1.5495941146626078, "grad_norm": 1.9609375, "learning_rate": 1.323653958070134e-06, "loss": 1.00255671, "memory(GiB)": 369.42, "step": 61085, "train_speed(iter/s)": 0.200509 }, { "acc": 0.75215688, "epoch": 1.5497209538305428, "grad_norm": 2.390625, "learning_rate": 1.3229433078259928e-06, "loss": 0.97639503, "memory(GiB)": 369.42, "step": 61090, "train_speed(iter/s)": 0.200511 }, { "acc": 0.74664955, "epoch": 1.549847792998478, "grad_norm": 1.84375, "learning_rate": 1.3222328193152195e-06, "loss": 0.95007181, "memory(GiB)": 369.42, "step": 61095, "train_speed(iter/s)": 0.200513 }, { "acc": 0.75901723, "epoch": 1.549974632166413, "grad_norm": 2.5, "learning_rate": 1.3215224925690683e-06, "loss": 0.99861765, "memory(GiB)": 369.42, "step": 61100, "train_speed(iter/s)": 0.200515 }, { "acc": 0.77350836, "epoch": 1.5501014713343482, "grad_norm": 1.828125, "learning_rate": 1.3208123276187807e-06, "loss": 0.90308495, "memory(GiB)": 369.42, "step": 61105, "train_speed(iter/s)": 0.200518 }, { "acc": 0.74690409, "epoch": 1.5502283105022832, "grad_norm": 1.8984375, "learning_rate": 1.3201023244955952e-06, "loss": 1.0023757, "memory(GiB)": 369.42, "step": 61110, "train_speed(iter/s)": 0.200519 }, { "acc": 0.76304216, "epoch": 1.5503551496702181, "grad_norm": 1.96875, "learning_rate": 1.319392483230736e-06, "loss": 0.97036438, "memory(GiB)": 369.42, "step": 61115, "train_speed(iter/s)": 0.200522 }, { "acc": 0.73526859, "epoch": 1.5504819888381531, "grad_norm": 2.21875, "learning_rate": 1.318682803855429e-06, "loss": 1.05599079, "memory(GiB)": 369.42, "step": 61120, "train_speed(iter/s)": 0.200525 }, { "acc": 0.75498447, "epoch": 1.5506088280060881, "grad_norm": 2.4375, "learning_rate": 1.3179732864008888e-06, "loss": 0.9927309, "memory(GiB)": 369.42, "step": 61125, "train_speed(iter/s)": 0.200527 }, { "acc": 0.75059547, "epoch": 1.5507356671740233, "grad_norm": 2.25, "learning_rate": 1.3172639308983226e-06, "loss": 0.96467819, "memory(GiB)": 369.42, "step": 61130, "train_speed(iter/s)": 0.200529 }, { "acc": 0.75153031, "epoch": 1.5508625063419585, "grad_norm": 2.015625, "learning_rate": 1.3165547373789306e-06, "loss": 0.95423965, "memory(GiB)": 369.42, "step": 61135, "train_speed(iter/s)": 0.200532 }, { "acc": 0.75410795, "epoch": 1.5509893455098935, "grad_norm": 1.96875, "learning_rate": 1.3158457058739066e-06, "loss": 0.96426306, "memory(GiB)": 369.42, "step": 61140, "train_speed(iter/s)": 0.200533 }, { "acc": 0.75392776, "epoch": 1.5511161846778285, "grad_norm": 2.375, "learning_rate": 1.3151368364144373e-06, "loss": 1.02012596, "memory(GiB)": 369.42, "step": 61145, "train_speed(iter/s)": 0.200536 }, { "acc": 0.75457149, "epoch": 1.5512430238457635, "grad_norm": 2.140625, "learning_rate": 1.3144281290317012e-06, "loss": 0.97920494, "memory(GiB)": 369.42, "step": 61150, "train_speed(iter/s)": 0.200538 }, { "acc": 0.75151062, "epoch": 1.5513698630136985, "grad_norm": 2.515625, "learning_rate": 1.3137195837568716e-06, "loss": 0.95366688, "memory(GiB)": 369.42, "step": 61155, "train_speed(iter/s)": 0.20054 }, { "acc": 0.75766726, "epoch": 1.5514967021816337, "grad_norm": 2.21875, "learning_rate": 1.313011200621112e-06, "loss": 0.99649458, "memory(GiB)": 369.42, "step": 61160, "train_speed(iter/s)": 0.200543 }, { "acc": 0.75069261, "epoch": 1.5516235413495687, "grad_norm": 1.765625, "learning_rate": 1.312302979655582e-06, "loss": 0.97330551, "memory(GiB)": 369.42, "step": 61165, "train_speed(iter/s)": 0.200546 }, { "acc": 0.75094304, "epoch": 1.551750380517504, "grad_norm": 2.421875, "learning_rate": 1.3115949208914302e-06, "loss": 0.97296848, "memory(GiB)": 369.42, "step": 61170, "train_speed(iter/s)": 0.200549 }, { "acc": 0.75380583, "epoch": 1.551877219685439, "grad_norm": 1.8828125, "learning_rate": 1.3108870243598022e-06, "loss": 0.98096542, "memory(GiB)": 369.42, "step": 61175, "train_speed(iter/s)": 0.200552 }, { "acc": 0.75220423, "epoch": 1.552004058853374, "grad_norm": 2.15625, "learning_rate": 1.310179290091833e-06, "loss": 1.01732578, "memory(GiB)": 369.42, "step": 61180, "train_speed(iter/s)": 0.200555 }, { "acc": 0.7656136, "epoch": 1.5521308980213089, "grad_norm": 2.234375, "learning_rate": 1.3094717181186518e-06, "loss": 0.97071781, "memory(GiB)": 369.42, "step": 61185, "train_speed(iter/s)": 0.200557 }, { "acc": 0.74443607, "epoch": 1.552257737189244, "grad_norm": 2.765625, "learning_rate": 1.3087643084713836e-06, "loss": 0.99563351, "memory(GiB)": 369.42, "step": 61190, "train_speed(iter/s)": 0.200561 }, { "acc": 0.76740017, "epoch": 1.552384576357179, "grad_norm": 2.75, "learning_rate": 1.30805706118114e-06, "loss": 0.95067158, "memory(GiB)": 369.42, "step": 61195, "train_speed(iter/s)": 0.200563 }, { "acc": 0.74353414, "epoch": 1.5525114155251143, "grad_norm": 2.484375, "learning_rate": 1.3073499762790287e-06, "loss": 1.00025692, "memory(GiB)": 369.42, "step": 61200, "train_speed(iter/s)": 0.200565 }, { "acc": 0.75754032, "epoch": 1.5526382546930493, "grad_norm": 1.9140625, "learning_rate": 1.306643053796154e-06, "loss": 0.96900988, "memory(GiB)": 369.42, "step": 61205, "train_speed(iter/s)": 0.200568 }, { "acc": 0.75679584, "epoch": 1.5527650938609843, "grad_norm": 2.578125, "learning_rate": 1.3059362937636084e-06, "loss": 1.0075552, "memory(GiB)": 369.42, "step": 61210, "train_speed(iter/s)": 0.200571 }, { "acc": 0.7636858, "epoch": 1.5528919330289193, "grad_norm": 1.9296875, "learning_rate": 1.3052296962124756e-06, "loss": 0.90282001, "memory(GiB)": 369.42, "step": 61215, "train_speed(iter/s)": 0.200573 }, { "acc": 0.7423943, "epoch": 1.5530187721968542, "grad_norm": 2.421875, "learning_rate": 1.3045232611738357e-06, "loss": 1.00973701, "memory(GiB)": 369.42, "step": 61220, "train_speed(iter/s)": 0.200575 }, { "acc": 0.74613705, "epoch": 1.5531456113647895, "grad_norm": 2.15625, "learning_rate": 1.3038169886787632e-06, "loss": 0.98388596, "memory(GiB)": 369.42, "step": 61225, "train_speed(iter/s)": 0.200578 }, { "acc": 0.76969948, "epoch": 1.5532724505327247, "grad_norm": 3.375, "learning_rate": 1.3031108787583235e-06, "loss": 0.96623135, "memory(GiB)": 369.42, "step": 61230, "train_speed(iter/s)": 0.20058 }, { "acc": 0.74065108, "epoch": 1.5533992897006597, "grad_norm": 2.046875, "learning_rate": 1.3024049314435694e-06, "loss": 0.9805069, "memory(GiB)": 369.42, "step": 61235, "train_speed(iter/s)": 0.200582 }, { "acc": 0.74413471, "epoch": 1.5535261288685946, "grad_norm": 2.21875, "learning_rate": 1.301699146765557e-06, "loss": 1.00373268, "memory(GiB)": 369.42, "step": 61240, "train_speed(iter/s)": 0.200584 }, { "acc": 0.76019373, "epoch": 1.5536529680365296, "grad_norm": 1.9375, "learning_rate": 1.3009935247553274e-06, "loss": 0.99786654, "memory(GiB)": 369.42, "step": 61245, "train_speed(iter/s)": 0.200586 }, { "acc": 0.74132938, "epoch": 1.5537798072044646, "grad_norm": 2.140625, "learning_rate": 1.3002880654439192e-06, "loss": 1.01941166, "memory(GiB)": 369.42, "step": 61250, "train_speed(iter/s)": 0.200588 }, { "acc": 0.743962, "epoch": 1.5539066463723998, "grad_norm": 1.9609375, "learning_rate": 1.2995827688623568e-06, "loss": 1.02968769, "memory(GiB)": 369.42, "step": 61255, "train_speed(iter/s)": 0.200591 }, { "acc": 0.76021156, "epoch": 1.5540334855403348, "grad_norm": 1.8359375, "learning_rate": 1.298877635041667e-06, "loss": 0.92100506, "memory(GiB)": 369.42, "step": 61260, "train_speed(iter/s)": 0.200593 }, { "acc": 0.73936319, "epoch": 1.55416032470827, "grad_norm": 2.09375, "learning_rate": 1.2981726640128633e-06, "loss": 1.02694893, "memory(GiB)": 369.42, "step": 61265, "train_speed(iter/s)": 0.200596 }, { "acc": 0.76039443, "epoch": 1.554287163876205, "grad_norm": 1.84375, "learning_rate": 1.297467855806953e-06, "loss": 0.92557831, "memory(GiB)": 369.42, "step": 61270, "train_speed(iter/s)": 0.200598 }, { "acc": 0.75387192, "epoch": 1.55441400304414, "grad_norm": 2.328125, "learning_rate": 1.2967632104549371e-06, "loss": 0.98438683, "memory(GiB)": 369.42, "step": 61275, "train_speed(iter/s)": 0.2006 }, { "acc": 0.75214214, "epoch": 1.554540842212075, "grad_norm": 2.0, "learning_rate": 1.296058727987809e-06, "loss": 0.9816803, "memory(GiB)": 369.42, "step": 61280, "train_speed(iter/s)": 0.200603 }, { "acc": 0.74891605, "epoch": 1.55466768138001, "grad_norm": 1.984375, "learning_rate": 1.295354408436555e-06, "loss": 1.04683628, "memory(GiB)": 369.42, "step": 61285, "train_speed(iter/s)": 0.200605 }, { "acc": 0.7591938, "epoch": 1.5547945205479452, "grad_norm": 2.421875, "learning_rate": 1.294650251832154e-06, "loss": 0.95636473, "memory(GiB)": 369.42, "step": 61290, "train_speed(iter/s)": 0.200607 }, { "acc": 0.75086527, "epoch": 1.5549213597158804, "grad_norm": 2.484375, "learning_rate": 1.2939462582055784e-06, "loss": 0.96566648, "memory(GiB)": 369.42, "step": 61295, "train_speed(iter/s)": 0.200609 }, { "acc": 0.7554461, "epoch": 1.5550481988838154, "grad_norm": 2.09375, "learning_rate": 1.2932424275877926e-06, "loss": 0.94938889, "memory(GiB)": 369.42, "step": 61300, "train_speed(iter/s)": 0.200611 }, { "acc": 0.75583677, "epoch": 1.5551750380517504, "grad_norm": 2.484375, "learning_rate": 1.2925387600097543e-06, "loss": 0.9830327, "memory(GiB)": 369.42, "step": 61305, "train_speed(iter/s)": 0.200614 }, { "acc": 0.74961157, "epoch": 1.5553018772196854, "grad_norm": 1.7265625, "learning_rate": 1.291835255502414e-06, "loss": 0.95781174, "memory(GiB)": 369.42, "step": 61310, "train_speed(iter/s)": 0.200616 }, { "acc": 0.74087825, "epoch": 1.5554287163876204, "grad_norm": 1.8671875, "learning_rate": 1.2911319140967148e-06, "loss": 0.96721601, "memory(GiB)": 369.42, "step": 61315, "train_speed(iter/s)": 0.200619 }, { "acc": 0.75336781, "epoch": 1.5555555555555556, "grad_norm": 2.21875, "learning_rate": 1.290428735823593e-06, "loss": 1.01021957, "memory(GiB)": 369.42, "step": 61320, "train_speed(iter/s)": 0.20062 }, { "acc": 0.73796625, "epoch": 1.5556823947234906, "grad_norm": 2.234375, "learning_rate": 1.2897257207139758e-06, "loss": 1.04680576, "memory(GiB)": 369.42, "step": 61325, "train_speed(iter/s)": 0.200623 }, { "acc": 0.75287609, "epoch": 1.5558092338914258, "grad_norm": 2.0, "learning_rate": 1.28902286879879e-06, "loss": 0.92381077, "memory(GiB)": 369.42, "step": 61330, "train_speed(iter/s)": 0.200626 }, { "acc": 0.74543571, "epoch": 1.5559360730593608, "grad_norm": 2.328125, "learning_rate": 1.2883201801089445e-06, "loss": 1.01535101, "memory(GiB)": 369.42, "step": 61335, "train_speed(iter/s)": 0.200627 }, { "acc": 0.73544836, "epoch": 1.5560629122272958, "grad_norm": 2.125, "learning_rate": 1.2876176546753494e-06, "loss": 1.04972706, "memory(GiB)": 369.42, "step": 61340, "train_speed(iter/s)": 0.200629 }, { "acc": 0.75231185, "epoch": 1.5561897513952307, "grad_norm": 2.109375, "learning_rate": 1.286915292528903e-06, "loss": 1.02124004, "memory(GiB)": 369.42, "step": 61345, "train_speed(iter/s)": 0.200632 }, { "acc": 0.74991994, "epoch": 1.556316590563166, "grad_norm": 2.359375, "learning_rate": 1.286213093700503e-06, "loss": 0.98109207, "memory(GiB)": 369.42, "step": 61350, "train_speed(iter/s)": 0.200634 }, { "acc": 0.75850773, "epoch": 1.556443429731101, "grad_norm": 2.3125, "learning_rate": 1.28551105822103e-06, "loss": 0.95358105, "memory(GiB)": 369.42, "step": 61355, "train_speed(iter/s)": 0.200634 }, { "acc": 0.75774698, "epoch": 1.5565702688990362, "grad_norm": 2.265625, "learning_rate": 1.2848091861213636e-06, "loss": 0.96658611, "memory(GiB)": 369.42, "step": 61360, "train_speed(iter/s)": 0.200637 }, { "acc": 0.73639069, "epoch": 1.5566971080669711, "grad_norm": 2.375, "learning_rate": 1.2841074774323775e-06, "loss": 1.08227072, "memory(GiB)": 369.42, "step": 61365, "train_speed(iter/s)": 0.200639 }, { "acc": 0.74302845, "epoch": 1.5568239472349061, "grad_norm": 1.8515625, "learning_rate": 1.2834059321849363e-06, "loss": 1.02471218, "memory(GiB)": 369.42, "step": 61370, "train_speed(iter/s)": 0.200642 }, { "acc": 0.77206326, "epoch": 1.5569507864028411, "grad_norm": 2.203125, "learning_rate": 1.2827045504098928e-06, "loss": 0.97759266, "memory(GiB)": 369.42, "step": 61375, "train_speed(iter/s)": 0.200643 }, { "acc": 0.75652285, "epoch": 1.5570776255707761, "grad_norm": 2.34375, "learning_rate": 1.2820033321381009e-06, "loss": 0.95899944, "memory(GiB)": 369.42, "step": 61380, "train_speed(iter/s)": 0.200643 }, { "acc": 0.76397743, "epoch": 1.5572044647387113, "grad_norm": 2.296875, "learning_rate": 1.2813022774004024e-06, "loss": 0.94040289, "memory(GiB)": 369.42, "step": 61385, "train_speed(iter/s)": 0.200645 }, { "acc": 0.76007314, "epoch": 1.5573313039066465, "grad_norm": 2.15625, "learning_rate": 1.280601386227634e-06, "loss": 1.00151329, "memory(GiB)": 369.42, "step": 61390, "train_speed(iter/s)": 0.200648 }, { "acc": 0.74949808, "epoch": 1.5574581430745815, "grad_norm": 2.25, "learning_rate": 1.279900658650619e-06, "loss": 1.00302048, "memory(GiB)": 369.42, "step": 61395, "train_speed(iter/s)": 0.20065 }, { "acc": 0.74222932, "epoch": 1.5575849822425165, "grad_norm": 2.40625, "learning_rate": 1.2792000947001842e-06, "loss": 1.00405693, "memory(GiB)": 369.42, "step": 61400, "train_speed(iter/s)": 0.200652 }, { "acc": 0.74968376, "epoch": 1.5577118214104515, "grad_norm": 1.9296875, "learning_rate": 1.2784996944071415e-06, "loss": 1.02017899, "memory(GiB)": 369.42, "step": 61405, "train_speed(iter/s)": 0.200653 }, { "acc": 0.74529333, "epoch": 1.5578386605783865, "grad_norm": 2.234375, "learning_rate": 1.2777994578022972e-06, "loss": 1.00980196, "memory(GiB)": 369.42, "step": 61410, "train_speed(iter/s)": 0.200655 }, { "acc": 0.74634852, "epoch": 1.5579654997463217, "grad_norm": 2.546875, "learning_rate": 1.2770993849164514e-06, "loss": 1.02123499, "memory(GiB)": 369.42, "step": 61415, "train_speed(iter/s)": 0.200657 }, { "acc": 0.7343173, "epoch": 1.5580923389142567, "grad_norm": 2.40625, "learning_rate": 1.276399475780396e-06, "loss": 1.0763155, "memory(GiB)": 369.42, "step": 61420, "train_speed(iter/s)": 0.20066 }, { "acc": 0.74970827, "epoch": 1.558219178082192, "grad_norm": 2.0625, "learning_rate": 1.2756997304249164e-06, "loss": 1.02187595, "memory(GiB)": 369.42, "step": 61425, "train_speed(iter/s)": 0.200663 }, { "acc": 0.75577455, "epoch": 1.558346017250127, "grad_norm": 2.296875, "learning_rate": 1.2750001488807906e-06, "loss": 0.95550213, "memory(GiB)": 369.42, "step": 61430, "train_speed(iter/s)": 0.200665 }, { "acc": 0.75706835, "epoch": 1.5584728564180619, "grad_norm": 2.609375, "learning_rate": 1.2743007311787892e-06, "loss": 1.02164087, "memory(GiB)": 369.42, "step": 61435, "train_speed(iter/s)": 0.200668 }, { "acc": 0.754986, "epoch": 1.5585996955859969, "grad_norm": 2.3125, "learning_rate": 1.2736014773496757e-06, "loss": 0.95564327, "memory(GiB)": 369.42, "step": 61440, "train_speed(iter/s)": 0.200671 }, { "acc": 0.76998444, "epoch": 1.5587265347539319, "grad_norm": 2.125, "learning_rate": 1.2729023874242064e-06, "loss": 0.90879574, "memory(GiB)": 369.42, "step": 61445, "train_speed(iter/s)": 0.200672 }, { "acc": 0.74123931, "epoch": 1.558853373921867, "grad_norm": 2.5, "learning_rate": 1.2722034614331303e-06, "loss": 1.06932659, "memory(GiB)": 369.42, "step": 61450, "train_speed(iter/s)": 0.200676 }, { "acc": 0.73714809, "epoch": 1.5589802130898023, "grad_norm": 2.28125, "learning_rate": 1.2715046994071889e-06, "loss": 1.00295544, "memory(GiB)": 369.42, "step": 61455, "train_speed(iter/s)": 0.200678 }, { "acc": 0.74077468, "epoch": 1.5591070522577373, "grad_norm": 2.265625, "learning_rate": 1.2708061013771179e-06, "loss": 1.00695763, "memory(GiB)": 369.42, "step": 61460, "train_speed(iter/s)": 0.200681 }, { "acc": 0.75357008, "epoch": 1.5592338914256723, "grad_norm": 1.8828125, "learning_rate": 1.2701076673736428e-06, "loss": 0.9481617, "memory(GiB)": 369.42, "step": 61465, "train_speed(iter/s)": 0.200683 }, { "acc": 0.74140081, "epoch": 1.5593607305936072, "grad_norm": 2.15625, "learning_rate": 1.269409397427488e-06, "loss": 0.99734478, "memory(GiB)": 369.42, "step": 61470, "train_speed(iter/s)": 0.200685 }, { "acc": 0.75038576, "epoch": 1.5594875697615422, "grad_norm": 2.21875, "learning_rate": 1.2687112915693622e-06, "loss": 0.94465961, "memory(GiB)": 369.42, "step": 61475, "train_speed(iter/s)": 0.200686 }, { "acc": 0.74215655, "epoch": 1.5596144089294774, "grad_norm": 2.015625, "learning_rate": 1.2680133498299729e-06, "loss": 1.04604645, "memory(GiB)": 369.42, "step": 61480, "train_speed(iter/s)": 0.200686 }, { "acc": 0.74568143, "epoch": 1.5597412480974124, "grad_norm": 2.15625, "learning_rate": 1.2673155722400177e-06, "loss": 1.01604023, "memory(GiB)": 369.42, "step": 61485, "train_speed(iter/s)": 0.200689 }, { "acc": 0.74547515, "epoch": 1.5598680872653476, "grad_norm": 2.25, "learning_rate": 1.2666179588301908e-06, "loss": 1.03348236, "memory(GiB)": 369.42, "step": 61490, "train_speed(iter/s)": 0.200692 }, { "acc": 0.75786324, "epoch": 1.5599949264332826, "grad_norm": 2.0625, "learning_rate": 1.2659205096311738e-06, "loss": 0.95807304, "memory(GiB)": 369.42, "step": 61495, "train_speed(iter/s)": 0.200695 }, { "acc": 0.74570589, "epoch": 1.5601217656012176, "grad_norm": 2.125, "learning_rate": 1.2652232246736423e-06, "loss": 0.97257004, "memory(GiB)": 369.42, "step": 61500, "train_speed(iter/s)": 0.200698 }, { "acc": 0.75400772, "epoch": 1.5602486047691526, "grad_norm": 2.359375, "learning_rate": 1.2645261039882694e-06, "loss": 1.050424, "memory(GiB)": 369.42, "step": 61505, "train_speed(iter/s)": 0.200699 }, { "acc": 0.7442698, "epoch": 1.5603754439370878, "grad_norm": 2.1875, "learning_rate": 1.263829147605718e-06, "loss": 1.00061836, "memory(GiB)": 369.42, "step": 61510, "train_speed(iter/s)": 0.200702 }, { "acc": 0.7601687, "epoch": 1.5605022831050228, "grad_norm": 2.21875, "learning_rate": 1.26313235555664e-06, "loss": 0.97140446, "memory(GiB)": 369.42, "step": 61515, "train_speed(iter/s)": 0.200705 }, { "acc": 0.76673918, "epoch": 1.560629122272958, "grad_norm": 2.28125, "learning_rate": 1.2624357278716832e-06, "loss": 0.96723928, "memory(GiB)": 369.42, "step": 61520, "train_speed(iter/s)": 0.200708 }, { "acc": 0.74142985, "epoch": 1.560755961440893, "grad_norm": 2.0625, "learning_rate": 1.2617392645814913e-06, "loss": 0.9944932, "memory(GiB)": 369.42, "step": 61525, "train_speed(iter/s)": 0.200711 }, { "acc": 0.74053411, "epoch": 1.560882800608828, "grad_norm": 2.125, "learning_rate": 1.2610429657166983e-06, "loss": 1.05729446, "memory(GiB)": 369.42, "step": 61530, "train_speed(iter/s)": 0.200714 }, { "acc": 0.76577263, "epoch": 1.561009639776763, "grad_norm": 1.90625, "learning_rate": 1.2603468313079265e-06, "loss": 0.92260742, "memory(GiB)": 369.42, "step": 61535, "train_speed(iter/s)": 0.200716 }, { "acc": 0.76054745, "epoch": 1.561136478944698, "grad_norm": 2.09375, "learning_rate": 1.2596508613857982e-06, "loss": 0.9269002, "memory(GiB)": 369.42, "step": 61540, "train_speed(iter/s)": 0.200719 }, { "acc": 0.74246178, "epoch": 1.5612633181126332, "grad_norm": 2.15625, "learning_rate": 1.258955055980925e-06, "loss": 1.00511341, "memory(GiB)": 369.42, "step": 61545, "train_speed(iter/s)": 0.200723 }, { "acc": 0.74996543, "epoch": 1.5613901572805684, "grad_norm": 2.09375, "learning_rate": 1.258259415123911e-06, "loss": 0.97431479, "memory(GiB)": 369.42, "step": 61550, "train_speed(iter/s)": 0.200725 }, { "acc": 0.76638021, "epoch": 1.5615169964485034, "grad_norm": 1.96875, "learning_rate": 1.2575639388453532e-06, "loss": 0.92841187, "memory(GiB)": 369.42, "step": 61555, "train_speed(iter/s)": 0.200729 }, { "acc": 0.75235319, "epoch": 1.5616438356164384, "grad_norm": 2.25, "learning_rate": 1.2568686271758423e-06, "loss": 0.96403913, "memory(GiB)": 369.42, "step": 61560, "train_speed(iter/s)": 0.200731 }, { "acc": 0.75115986, "epoch": 1.5617706747843734, "grad_norm": 1.9453125, "learning_rate": 1.2561734801459612e-06, "loss": 1.00285835, "memory(GiB)": 369.42, "step": 61565, "train_speed(iter/s)": 0.200733 }, { "acc": 0.75559883, "epoch": 1.5618975139523084, "grad_norm": 2.203125, "learning_rate": 1.2554784977862856e-06, "loss": 0.95059681, "memory(GiB)": 369.42, "step": 61570, "train_speed(iter/s)": 0.200736 }, { "acc": 0.75535293, "epoch": 1.5620243531202436, "grad_norm": 2.203125, "learning_rate": 1.2547836801273833e-06, "loss": 1.00327282, "memory(GiB)": 369.42, "step": 61575, "train_speed(iter/s)": 0.200738 }, { "acc": 0.74758167, "epoch": 1.5621511922881786, "grad_norm": 2.34375, "learning_rate": 1.2540890271998162e-06, "loss": 0.99624748, "memory(GiB)": 369.42, "step": 61580, "train_speed(iter/s)": 0.200739 }, { "acc": 0.73542395, "epoch": 1.5622780314561138, "grad_norm": 1.890625, "learning_rate": 1.2533945390341379e-06, "loss": 1.07597542, "memory(GiB)": 369.42, "step": 61585, "train_speed(iter/s)": 0.20074 }, { "acc": 0.74879875, "epoch": 1.5624048706240488, "grad_norm": 2.125, "learning_rate": 1.2527002156608946e-06, "loss": 0.98743849, "memory(GiB)": 369.42, "step": 61590, "train_speed(iter/s)": 0.200743 }, { "acc": 0.75783005, "epoch": 1.5625317097919837, "grad_norm": 2.390625, "learning_rate": 1.2520060571106275e-06, "loss": 0.97351742, "memory(GiB)": 369.42, "step": 61595, "train_speed(iter/s)": 0.200745 }, { "acc": 0.74232273, "epoch": 1.5626585489599187, "grad_norm": 2.109375, "learning_rate": 1.2513120634138665e-06, "loss": 1.04324799, "memory(GiB)": 369.42, "step": 61600, "train_speed(iter/s)": 0.200746 }, { "acc": 0.75398655, "epoch": 1.5627853881278537, "grad_norm": 2.390625, "learning_rate": 1.250618234601138e-06, "loss": 0.93580723, "memory(GiB)": 369.42, "step": 61605, "train_speed(iter/s)": 0.200748 }, { "acc": 0.75304489, "epoch": 1.562912227295789, "grad_norm": 2.109375, "learning_rate": 1.2499245707029595e-06, "loss": 0.93689194, "memory(GiB)": 369.42, "step": 61610, "train_speed(iter/s)": 0.200751 }, { "acc": 0.74546905, "epoch": 1.5630390664637241, "grad_norm": 2.3125, "learning_rate": 1.2492310717498412e-06, "loss": 0.98425436, "memory(GiB)": 369.42, "step": 61615, "train_speed(iter/s)": 0.200753 }, { "acc": 0.76957922, "epoch": 1.5631659056316591, "grad_norm": 2.171875, "learning_rate": 1.2485377377722863e-06, "loss": 0.99021759, "memory(GiB)": 369.42, "step": 61620, "train_speed(iter/s)": 0.200756 }, { "acc": 0.76612768, "epoch": 1.5632927447995941, "grad_norm": 3.359375, "learning_rate": 1.2478445688007894e-06, "loss": 0.95046682, "memory(GiB)": 369.42, "step": 61625, "train_speed(iter/s)": 0.200759 }, { "acc": 0.75737915, "epoch": 1.5634195839675291, "grad_norm": 2.234375, "learning_rate": 1.2471515648658434e-06, "loss": 0.93713331, "memory(GiB)": 369.42, "step": 61630, "train_speed(iter/s)": 0.200762 }, { "acc": 0.75905037, "epoch": 1.563546423135464, "grad_norm": 2.3125, "learning_rate": 1.2464587259979254e-06, "loss": 1.00295305, "memory(GiB)": 369.42, "step": 61635, "train_speed(iter/s)": 0.200765 }, { "acc": 0.74716902, "epoch": 1.5636732623033993, "grad_norm": 3.0625, "learning_rate": 1.2457660522275095e-06, "loss": 1.02386436, "memory(GiB)": 369.42, "step": 61640, "train_speed(iter/s)": 0.200768 }, { "acc": 0.7536696, "epoch": 1.5638001014713343, "grad_norm": 2.734375, "learning_rate": 1.2450735435850654e-06, "loss": 0.99869204, "memory(GiB)": 369.42, "step": 61645, "train_speed(iter/s)": 0.20077 }, { "acc": 0.74643283, "epoch": 1.5639269406392695, "grad_norm": 2.3125, "learning_rate": 1.244381200101053e-06, "loss": 1.01457939, "memory(GiB)": 369.42, "step": 61650, "train_speed(iter/s)": 0.200774 }, { "acc": 0.74611993, "epoch": 1.5640537798072045, "grad_norm": 2.3125, "learning_rate": 1.2436890218059217e-06, "loss": 1.02442045, "memory(GiB)": 369.42, "step": 61655, "train_speed(iter/s)": 0.200776 }, { "acc": 0.75048594, "epoch": 1.5641806189751395, "grad_norm": 2.03125, "learning_rate": 1.2429970087301163e-06, "loss": 0.95192833, "memory(GiB)": 369.42, "step": 61660, "train_speed(iter/s)": 0.200776 }, { "acc": 0.74513121, "epoch": 1.5643074581430745, "grad_norm": 2.421875, "learning_rate": 1.2423051609040777e-06, "loss": 1.0316946, "memory(GiB)": 369.42, "step": 61665, "train_speed(iter/s)": 0.200779 }, { "acc": 0.73562145, "epoch": 1.5644342973110097, "grad_norm": 2.296875, "learning_rate": 1.2416134783582368e-06, "loss": 1.02911358, "memory(GiB)": 369.42, "step": 61670, "train_speed(iter/s)": 0.200781 }, { "acc": 0.74976177, "epoch": 1.5645611364789447, "grad_norm": 2.03125, "learning_rate": 1.2409219611230116e-06, "loss": 1.01076107, "memory(GiB)": 369.42, "step": 61675, "train_speed(iter/s)": 0.200783 }, { "acc": 0.75765781, "epoch": 1.56468797564688, "grad_norm": 2.046875, "learning_rate": 1.2402306092288236e-06, "loss": 0.94128056, "memory(GiB)": 369.42, "step": 61680, "train_speed(iter/s)": 0.200786 }, { "acc": 0.75839105, "epoch": 1.5648148148148149, "grad_norm": 2.515625, "learning_rate": 1.2395394227060793e-06, "loss": 1.00547791, "memory(GiB)": 369.42, "step": 61685, "train_speed(iter/s)": 0.200787 }, { "acc": 0.74134865, "epoch": 1.5649416539827499, "grad_norm": 2.15625, "learning_rate": 1.238848401585182e-06, "loss": 0.97275677, "memory(GiB)": 369.42, "step": 61690, "train_speed(iter/s)": 0.20079 }, { "acc": 0.73976178, "epoch": 1.5650684931506849, "grad_norm": 2.25, "learning_rate": 1.2381575458965218e-06, "loss": 0.97785625, "memory(GiB)": 369.42, "step": 61695, "train_speed(iter/s)": 0.200793 }, { "acc": 0.75431786, "epoch": 1.5651953323186198, "grad_norm": 2.375, "learning_rate": 1.2374668556704888e-06, "loss": 0.9837697, "memory(GiB)": 369.42, "step": 61700, "train_speed(iter/s)": 0.200796 }, { "acc": 0.76139584, "epoch": 1.565322171486555, "grad_norm": 2.40625, "learning_rate": 1.2367763309374625e-06, "loss": 0.92958794, "memory(GiB)": 369.42, "step": 61705, "train_speed(iter/s)": 0.200799 }, { "acc": 0.75468206, "epoch": 1.5654490106544903, "grad_norm": 1.8515625, "learning_rate": 1.2360859717278145e-06, "loss": 0.95923071, "memory(GiB)": 369.42, "step": 61710, "train_speed(iter/s)": 0.200801 }, { "acc": 0.73847466, "epoch": 1.5655758498224253, "grad_norm": 2.171875, "learning_rate": 1.2353957780719106e-06, "loss": 0.99224987, "memory(GiB)": 369.42, "step": 61715, "train_speed(iter/s)": 0.200801 }, { "acc": 0.73725448, "epoch": 1.5657026889903602, "grad_norm": 2.46875, "learning_rate": 1.2347057500001075e-06, "loss": 1.06018019, "memory(GiB)": 369.42, "step": 61720, "train_speed(iter/s)": 0.200804 }, { "acc": 0.75594854, "epoch": 1.5658295281582952, "grad_norm": 2.109375, "learning_rate": 1.2340158875427566e-06, "loss": 0.98801899, "memory(GiB)": 369.42, "step": 61725, "train_speed(iter/s)": 0.200807 }, { "acc": 0.750949, "epoch": 1.5659563673262302, "grad_norm": 2.34375, "learning_rate": 1.2333261907302013e-06, "loss": 0.98749285, "memory(GiB)": 369.42, "step": 61730, "train_speed(iter/s)": 0.20081 }, { "acc": 0.76470728, "epoch": 1.5660832064941654, "grad_norm": 1.921875, "learning_rate": 1.2326366595927763e-06, "loss": 0.96464491, "memory(GiB)": 369.42, "step": 61735, "train_speed(iter/s)": 0.200813 }, { "acc": 0.75267363, "epoch": 1.5662100456621004, "grad_norm": 1.984375, "learning_rate": 1.2319472941608118e-06, "loss": 1.00549355, "memory(GiB)": 369.42, "step": 61740, "train_speed(iter/s)": 0.200815 }, { "acc": 0.7541461, "epoch": 1.5663368848300356, "grad_norm": 2.453125, "learning_rate": 1.231258094464628e-06, "loss": 1.01324348, "memory(GiB)": 369.42, "step": 61745, "train_speed(iter/s)": 0.200818 }, { "acc": 0.74633646, "epoch": 1.5664637239979706, "grad_norm": 2.375, "learning_rate": 1.23056906053454e-06, "loss": 0.9956522, "memory(GiB)": 369.42, "step": 61750, "train_speed(iter/s)": 0.20082 }, { "acc": 0.74102964, "epoch": 1.5665905631659056, "grad_norm": 2.265625, "learning_rate": 1.2298801924008535e-06, "loss": 1.02769146, "memory(GiB)": 369.42, "step": 61755, "train_speed(iter/s)": 0.200821 }, { "acc": 0.74559517, "epoch": 1.5667174023338406, "grad_norm": 1.9375, "learning_rate": 1.2291914900938685e-06, "loss": 0.9885231, "memory(GiB)": 369.42, "step": 61760, "train_speed(iter/s)": 0.200823 }, { "acc": 0.74995232, "epoch": 1.5668442415017756, "grad_norm": 2.421875, "learning_rate": 1.2285029536438759e-06, "loss": 0.96651382, "memory(GiB)": 369.42, "step": 61765, "train_speed(iter/s)": 0.200826 }, { "acc": 0.74107056, "epoch": 1.5669710806697108, "grad_norm": 1.75, "learning_rate": 1.227814583081165e-06, "loss": 1.01510401, "memory(GiB)": 369.42, "step": 61770, "train_speed(iter/s)": 0.200828 }, { "acc": 0.7616683, "epoch": 1.567097919837646, "grad_norm": 2.953125, "learning_rate": 1.2271263784360088e-06, "loss": 0.96492939, "memory(GiB)": 369.42, "step": 61775, "train_speed(iter/s)": 0.20083 }, { "acc": 0.75314803, "epoch": 1.567224759005581, "grad_norm": 2.296875, "learning_rate": 1.2264383397386787e-06, "loss": 1.022649, "memory(GiB)": 369.42, "step": 61780, "train_speed(iter/s)": 0.200833 }, { "acc": 0.75694466, "epoch": 1.567351598173516, "grad_norm": 2.46875, "learning_rate": 1.225750467019437e-06, "loss": 1.02338152, "memory(GiB)": 369.42, "step": 61785, "train_speed(iter/s)": 0.200834 }, { "acc": 0.74490919, "epoch": 1.567478437341451, "grad_norm": 2.046875, "learning_rate": 1.2250627603085435e-06, "loss": 1.06877728, "memory(GiB)": 369.42, "step": 61790, "train_speed(iter/s)": 0.200836 }, { "acc": 0.75871687, "epoch": 1.567605276509386, "grad_norm": 2.796875, "learning_rate": 1.2243752196362423e-06, "loss": 0.97964287, "memory(GiB)": 369.42, "step": 61795, "train_speed(iter/s)": 0.200838 }, { "acc": 0.74575586, "epoch": 1.5677321156773212, "grad_norm": 2.078125, "learning_rate": 1.2236878450327743e-06, "loss": 1.0045969, "memory(GiB)": 369.42, "step": 61800, "train_speed(iter/s)": 0.200841 }, { "acc": 0.75335479, "epoch": 1.5678589548452562, "grad_norm": 2.546875, "learning_rate": 1.223000636528377e-06, "loss": 0.99722099, "memory(GiB)": 369.42, "step": 61805, "train_speed(iter/s)": 0.200843 }, { "acc": 0.7625277, "epoch": 1.5679857940131914, "grad_norm": 1.8671875, "learning_rate": 1.2223135941532754e-06, "loss": 0.93645172, "memory(GiB)": 369.42, "step": 61810, "train_speed(iter/s)": 0.200846 }, { "acc": 0.75620508, "epoch": 1.5681126331811264, "grad_norm": 2.28125, "learning_rate": 1.2216267179376857e-06, "loss": 0.98650227, "memory(GiB)": 369.42, "step": 61815, "train_speed(iter/s)": 0.200847 }, { "acc": 0.7574491, "epoch": 1.5682394723490614, "grad_norm": 2.25, "learning_rate": 1.2209400079118233e-06, "loss": 0.98450699, "memory(GiB)": 369.42, "step": 61820, "train_speed(iter/s)": 0.200849 }, { "acc": 0.75403595, "epoch": 1.5683663115169963, "grad_norm": 1.890625, "learning_rate": 1.2202534641058916e-06, "loss": 0.99183607, "memory(GiB)": 369.42, "step": 61825, "train_speed(iter/s)": 0.20085 }, { "acc": 0.76725264, "epoch": 1.5684931506849316, "grad_norm": 2.125, "learning_rate": 1.2195670865500896e-06, "loss": 0.88212738, "memory(GiB)": 369.42, "step": 61830, "train_speed(iter/s)": 0.200852 }, { "acc": 0.75488467, "epoch": 1.5686199898528665, "grad_norm": 2.25, "learning_rate": 1.2188808752746022e-06, "loss": 1.02915039, "memory(GiB)": 369.42, "step": 61835, "train_speed(iter/s)": 0.200855 }, { "acc": 0.76303091, "epoch": 1.5687468290208018, "grad_norm": 2.5625, "learning_rate": 1.2181948303096176e-06, "loss": 0.98994751, "memory(GiB)": 369.42, "step": 61840, "train_speed(iter/s)": 0.200857 }, { "acc": 0.7503191, "epoch": 1.5688736681887367, "grad_norm": 2.0625, "learning_rate": 1.2175089516853083e-06, "loss": 0.96463261, "memory(GiB)": 369.42, "step": 61845, "train_speed(iter/s)": 0.200858 }, { "acc": 0.76232357, "epoch": 1.5690005073566717, "grad_norm": 1.859375, "learning_rate": 1.216823239431843e-06, "loss": 0.98570738, "memory(GiB)": 369.42, "step": 61850, "train_speed(iter/s)": 0.200861 }, { "acc": 0.76219506, "epoch": 1.5691273465246067, "grad_norm": 2.765625, "learning_rate": 1.2161376935793827e-06, "loss": 0.96447258, "memory(GiB)": 369.42, "step": 61855, "train_speed(iter/s)": 0.200864 }, { "acc": 0.74627762, "epoch": 1.5692541856925417, "grad_norm": 2.296875, "learning_rate": 1.21545231415808e-06, "loss": 1.00192261, "memory(GiB)": 369.42, "step": 61860, "train_speed(iter/s)": 0.200866 }, { "acc": 0.74571581, "epoch": 1.569381024860477, "grad_norm": 2.078125, "learning_rate": 1.2147671011980816e-06, "loss": 1.03519268, "memory(GiB)": 369.42, "step": 61865, "train_speed(iter/s)": 0.20087 }, { "acc": 0.75500922, "epoch": 1.5695078640284121, "grad_norm": 2.234375, "learning_rate": 1.2140820547295256e-06, "loss": 0.95098791, "memory(GiB)": 369.42, "step": 61870, "train_speed(iter/s)": 0.200873 }, { "acc": 0.74724293, "epoch": 1.5696347031963471, "grad_norm": 2.0625, "learning_rate": 1.2133971747825435e-06, "loss": 0.96093445, "memory(GiB)": 369.42, "step": 61875, "train_speed(iter/s)": 0.200876 }, { "acc": 0.75726118, "epoch": 1.5697615423642821, "grad_norm": 2.578125, "learning_rate": 1.2127124613872603e-06, "loss": 1.01743622, "memory(GiB)": 369.42, "step": 61880, "train_speed(iter/s)": 0.200878 }, { "acc": 0.74396358, "epoch": 1.569888381532217, "grad_norm": 2.4375, "learning_rate": 1.2120279145737918e-06, "loss": 0.98419123, "memory(GiB)": 369.42, "step": 61885, "train_speed(iter/s)": 0.20088 }, { "acc": 0.76325731, "epoch": 1.570015220700152, "grad_norm": 2.15625, "learning_rate": 1.2113435343722474e-06, "loss": 0.93211002, "memory(GiB)": 369.42, "step": 61890, "train_speed(iter/s)": 0.200882 }, { "acc": 0.75258598, "epoch": 1.5701420598680873, "grad_norm": 1.96875, "learning_rate": 1.21065932081273e-06, "loss": 0.95750694, "memory(GiB)": 369.42, "step": 61895, "train_speed(iter/s)": 0.200883 }, { "acc": 0.74728308, "epoch": 1.5702688990360223, "grad_norm": 1.953125, "learning_rate": 1.2099752739253334e-06, "loss": 0.99747658, "memory(GiB)": 369.42, "step": 61900, "train_speed(iter/s)": 0.200886 }, { "acc": 0.75718107, "epoch": 1.5703957382039575, "grad_norm": 2.21875, "learning_rate": 1.209291393740144e-06, "loss": 1.01069937, "memory(GiB)": 369.42, "step": 61905, "train_speed(iter/s)": 0.200889 }, { "acc": 0.74199991, "epoch": 1.5705225773718925, "grad_norm": 2.171875, "learning_rate": 1.2086076802872472e-06, "loss": 1.00227146, "memory(GiB)": 369.42, "step": 61910, "train_speed(iter/s)": 0.20089 }, { "acc": 0.74439478, "epoch": 1.5706494165398275, "grad_norm": 2.203125, "learning_rate": 1.2079241335967096e-06, "loss": 1.01734028, "memory(GiB)": 369.42, "step": 61915, "train_speed(iter/s)": 0.200893 }, { "acc": 0.73360376, "epoch": 1.5707762557077625, "grad_norm": 2.171875, "learning_rate": 1.207240753698599e-06, "loss": 1.04237108, "memory(GiB)": 369.42, "step": 61920, "train_speed(iter/s)": 0.200895 }, { "acc": 0.77251663, "epoch": 1.5709030948756975, "grad_norm": 2.296875, "learning_rate": 1.2065575406229723e-06, "loss": 0.88449097, "memory(GiB)": 369.42, "step": 61925, "train_speed(iter/s)": 0.200898 }, { "acc": 0.76680799, "epoch": 1.5710299340436327, "grad_norm": 2.328125, "learning_rate": 1.2058744943998847e-06, "loss": 0.96432104, "memory(GiB)": 369.42, "step": 61930, "train_speed(iter/s)": 0.2009 }, { "acc": 0.75627022, "epoch": 1.5711567732115679, "grad_norm": 2.546875, "learning_rate": 1.2051916150593746e-06, "loss": 0.99565716, "memory(GiB)": 369.42, "step": 61935, "train_speed(iter/s)": 0.200903 }, { "acc": 0.74355164, "epoch": 1.5712836123795029, "grad_norm": 2.0625, "learning_rate": 1.2045089026314783e-06, "loss": 0.98800163, "memory(GiB)": 369.42, "step": 61940, "train_speed(iter/s)": 0.200905 }, { "acc": 0.74728112, "epoch": 1.5714104515474379, "grad_norm": 2.140625, "learning_rate": 1.2038263571462278e-06, "loss": 0.96964598, "memory(GiB)": 369.42, "step": 61945, "train_speed(iter/s)": 0.200907 }, { "acc": 0.75590525, "epoch": 1.5715372907153728, "grad_norm": 3.046875, "learning_rate": 1.203143978633644e-06, "loss": 0.99737158, "memory(GiB)": 369.42, "step": 61950, "train_speed(iter/s)": 0.20091 }, { "acc": 0.75294905, "epoch": 1.5716641298833078, "grad_norm": 2.40625, "learning_rate": 1.2024617671237388e-06, "loss": 0.92519741, "memory(GiB)": 369.42, "step": 61955, "train_speed(iter/s)": 0.200911 }, { "acc": 0.74247055, "epoch": 1.571790969051243, "grad_norm": 1.578125, "learning_rate": 1.2017797226465178e-06, "loss": 0.99150791, "memory(GiB)": 369.42, "step": 61960, "train_speed(iter/s)": 0.200912 }, { "acc": 0.73412657, "epoch": 1.571917808219178, "grad_norm": 1.96875, "learning_rate": 1.2010978452319843e-06, "loss": 1.00438461, "memory(GiB)": 369.42, "step": 61965, "train_speed(iter/s)": 0.200915 }, { "acc": 0.75111971, "epoch": 1.5720446473871132, "grad_norm": 2.140625, "learning_rate": 1.2004161349101295e-06, "loss": 0.97882271, "memory(GiB)": 369.42, "step": 61970, "train_speed(iter/s)": 0.200918 }, { "acc": 0.76177316, "epoch": 1.5721714865550482, "grad_norm": 2.828125, "learning_rate": 1.1997345917109348e-06, "loss": 0.96944962, "memory(GiB)": 369.42, "step": 61975, "train_speed(iter/s)": 0.200921 }, { "acc": 0.75221357, "epoch": 1.5722983257229832, "grad_norm": 1.9921875, "learning_rate": 1.1990532156643808e-06, "loss": 0.99835682, "memory(GiB)": 369.42, "step": 61980, "train_speed(iter/s)": 0.200924 }, { "acc": 0.75356717, "epoch": 1.5724251648909182, "grad_norm": 2.140625, "learning_rate": 1.198372006800436e-06, "loss": 0.99387121, "memory(GiB)": 369.42, "step": 61985, "train_speed(iter/s)": 0.200927 }, { "acc": 0.74902945, "epoch": 1.5725520040588534, "grad_norm": 2.125, "learning_rate": 1.1976909651490637e-06, "loss": 0.98822174, "memory(GiB)": 369.42, "step": 61990, "train_speed(iter/s)": 0.200929 }, { "acc": 0.75781174, "epoch": 1.5726788432267884, "grad_norm": 1.703125, "learning_rate": 1.1970100907402188e-06, "loss": 0.96552105, "memory(GiB)": 369.42, "step": 61995, "train_speed(iter/s)": 0.200931 }, { "acc": 0.76263742, "epoch": 1.5728056823947236, "grad_norm": 2.671875, "learning_rate": 1.1963293836038492e-06, "loss": 0.97611961, "memory(GiB)": 369.42, "step": 62000, "train_speed(iter/s)": 0.200934 }, { "epoch": 1.5728056823947236, "eval_acc": 0.7380060049873062, "eval_loss": 0.9694408178329468, "eval_runtime": 385.1394, "eval_samples_per_second": 16.539, "eval_steps_per_second": 8.27, "step": 62000 } ], "logging_steps": 5, "max_steps": 78840, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0361927255155802e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }